MediaWiki master
trackBlobs.php
Go to the documentation of this file.
1<?php
29
30// @codeCoverageIgnoreStart
31require_once __DIR__ . '/../Maintenance.php';
32// @codeCoverageIgnoreEnd
33
/**
 * Maintenance script that records which external storage (ES) blobs are
 * referenced by the wiki.
 *
 * It rebuilds the blob_tracking table from two passes (revisions, then text
 * rows that the revision pass did not reach) and, when the gmp extension is
 * loaded, fills blob_orphans with blob IDs that exist on a scanned cluster
 * but were not seen by either pass.
 */
class TrackBlobs extends Maintenance {
	/** @var string[] Cluster names to scan, taken from the script arguments */
	public $clusters;

	/** @var IExpression|null Lazily built condition matching text rows that point into $clusters */
	public $textClause;

	/** @var bool Whether the orphan blob pass runs; only switched on when gmp is loaded */
	public $doBlobOrphans = false;

	/** @var array Per-cluster GMP bitfields; bit N is set once blob ID N has been tracked */
	public $trackedBlobs = [];

	/** @var int Maximum number of rows fetched per query and inserted per statement */
	public $batchSize = 1000;

	/** @var int Number of batches between two progress reports */
	public $reportingInterval = 10;

	public function __construct() {
		parent::__construct();

		$this->addArg( 'cluster', 'cluster(s) to scan', true, true );

		$this->addDescription(
			'Adds blobs from a given ES cluster to the blob_tracking table. ' .
			'Automatically deletes the tracking table and starts from the start again when restarted.'
		);
	}

	/**
	 * Run all passes in order: integrity check, table reset, revision pass,
	 * orphan text pass and, if gmp is available, the orphan blob pass.
	 */
	public function execute() {
		$this->clusters = $this->parameters->getArgs();
		if ( extension_loaded( 'gmp' ) ) {
			$this->doBlobOrphans = true;
			foreach ( $this->clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			// Keep the flag explicit so later checks never depend on an unset property
			$this->doBlobOrphans = false;
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}

		$this->checkIntegrity();
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
		$this->output( "All done.\n" );
	}

	/**
	 * Abort with exit status 1 if the text table still holds non-external
	 * HistoryBlobStub objects.
	 */
	private function checkIntegrity() {
		echo "Doing integrity check...\n";
		$dbr = $this->getReplicaDB();

		// Scan for HistoryBlobStub objects in the text table (T22757)
		$exists = (bool)$dbr->newSelectQueryBuilder()
			->select( '1' )
			->from( 'text' )
			->where(
				'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
				'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'' )
			->caller( __METHOD__ )->fetchField();

		if ( $exists ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
				"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
				"to fix this.\n";
			exit( 1 );
		}

		echo "Integrity check OK\n";
	}

	/**
	 * Drop blob_tracking and blob_orphans if blob_tracking already exists,
	 * then source blob_tracking.sql from this directory.
	 */
	private function initTrackingTable() {
		$dbw = $this->getDB( DB_PRIMARY );
		if ( $dbw->tableExists( 'blob_tracking', __METHOD__ ) ) {
			$dbw->query( 'DROP TABLE ' . $dbw->tableName( 'blob_tracking' ), __METHOD__ );
			$dbw->query( 'DROP TABLE ' . $dbw->tableName( 'blob_orphans' ), __METHOD__ );
		}
		$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
	}

	/**
	 * Build (once) and return the condition "old_text starts with
	 * DB://<cluster>/" OR-ed over all requested clusters.
	 *
	 * @return IExpression
	 */
	private function getTextClause(): IExpression {
		if ( !$this->textClause ) {
			$dbr = $this->getReplicaDB();
			$conds = [];
			foreach ( $this->clusters as $cluster ) {
				$conds[] = $dbr->expr(
					'old_text',
					IExpression::LIKE,
					new LikeValue( "DB://$cluster/", $dbr->anyString() )
				);
			}
			$this->textClause = $dbr->orExpr( $conds );
		}

		return $this->textClause;
	}

	/**
	 * Build the condition "old_flags contains the substring 'external'".
	 *
	 * @param mixed $dbr Replica connection used to build the expression
	 * @return mixed Expression object as returned by $dbr->expr()
	 */
	private function getExternalFlagExpr( $dbr ) {
		return $dbr->expr(
			'old_flags',
			IExpression::LIKE,
			new LikeValue( $dbr->anyString(), 'external', $dbr->anyString() )
		);
	}

	/**
	 * Split a "DB://<cluster>/<id>[/<hash>]" pointer into its parts.
	 *
	 * @param string $text Content of text.old_text
	 * @return array|false Array with keys 'cluster' (string), 'id' (int) and
	 *  'hash' (string, or null when the pointer has no hash part), or false
	 *  when $text does not match the expected format
	 */
	private function interpretPointer( string $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}

		return [
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			// Group 3 is absent from $m when the optional hash part did not match
			'hash' => $m[3] ?? null
		];
	}

	/**
	 * Insert a batch of rows into a table, skipping empty batches.
	 *
	 * A batch can end up empty when every selected row was rejected as
	 * invalid; in that case no insert statement is built at all.
	 *
	 * @param mixed $dbw Primary connection to insert with
	 * @param string $table Target table name
	 * @param array[] $rows Rows to insert
	 * @param string $fname Caller name recorded with the query
	 */
	private function insertRows( $dbw, string $table, array $rows, string $fname ): void {
		if ( !$rows ) {
			return;
		}
		$dbw->newInsertQueryBuilder()
			->insertInto( $table )
			->rows( $rows )
			->caller( $fname )->execute();
	}

	/**
	 * Scan revisions whose main slot content lives in the text table ("tt:"
	 * addresses) and points to one of the requested clusters, and add one
	 * blob_tracking row per revision.
	 */
	private function trackRevisions() {
		$dbw = $this->getPrimaryDB();
		$dbr = $this->getReplicaDB();

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = (int)$dbr->newSelectQueryBuilder()
			->select( 'MAX(rev_id)' )
			->from( 'revision' )
			->caller( __METHOD__ )->fetchField();
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		$slotRoleStore = $this->getServiceContainer()->getSlotRoleStore();
		$conds = [
			'slot_role_id' => $slotRoleStore->getId( SlotRecord::MAIN ),
			'SUBSTRING(content_address, 1, 3)=' . $dbr->addQuotes( 'tt:' ),
			$textClause,
			$this->getExternalFlagExpr( $dbr ),
		];

		while ( true ) {
			// Page through the revisions in rev_id order, resuming after the last ID seen
			$res = $dbr->newSelectQueryBuilder()
				->select( [ 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ] )
				->from( 'revision' )
				->join( 'slots', null, 'rev_id=slot_revision_id' )
				->join( 'content', null, 'content_id=slot_content_id' )
				->join( 'text', null, 'SUBSTRING(content_address, 4)=old_id' )
				->where( $dbr->expr( 'rev_id', '>', $startId ) )
				->andWhere( $conds )
				->orderBy( 'rev_id' )
				->limit( $this->batchSize )
				->caller( __METHOD__ )->fetchResultSet();
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = [];
			foreach ( $res as $row ) {
				// Advance the cursor even for rejected rows so the loop always progresses
				$startId = (int)$row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = [
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				];
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			$this->insertRows( $dbw, 'blob_tracking', $insertBatch, __METHOD__ );
			$rowsInserted += count( $insertBatch );

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				$this->waitForReplication();
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for external pointers into the requested clusters
	 * that have no blob_tracking row yet, and add them with bt_page and
	 * bt_rev_id set to 0.
	 */
	private function trackOrphanText() {
		# Wait until the blob_tracking table is available in the replica DB
		$dbw = $this->getPrimaryDB();
		$dbr = $this->getReplicaDB();
		$this->getServiceContainer()->getDBLoadBalancerFactory()->waitForReplication( [ 'timeout' => 100_000 ] );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = (int)$dbr->newSelectQueryBuilder()
			->select( 'MAX(old_id)' )
			->from( 'text' )
			->caller( __METHOD__ )->fetchField();
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			$res = $dbr->newSelectQueryBuilder()
				->select( [ 'old_id', 'old_flags', 'old_text' ] )
				->from( 'text' )
				->leftJoin( 'blob_tracking', null, 'bt_text_id=old_id' )
				->where( [
					$dbr->expr( 'old_id', '>', $startId ),
					$textClause,
					$this->getExternalFlagExpr( $dbr ),
					// Left join found no tracking row for this text ID
					'bt_text_id' => null,
				] )
				->orderBy( 'old_id' )
				->limit( $this->batchSize )
				->caller( __METHOD__ )->fetchResultSet();

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = [];
			foreach ( $res as $row ) {
				// Advance the cursor even for rejected rows so the loop always progresses
				$startId = (int)$row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query\n";
					continue;
				}

				$insertBatch[] = [
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				];
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			$this->insertRows( $dbw, 'blob_tracking', $insertBatch, __METHOD__ );

			$rowsInserted += count( $insertBatch );
			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				$this->waitForReplication();
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * For each requested cluster, compare the blob IDs that exist in the
	 * cluster's blobs table with the IDs tracked by the earlier passes and
	 * record the untracked ones in blob_orphans.
	 *
	 * Requires the gmp extension; prints a message and returns otherwise.
	 */
	private function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";

			return;
		}

		$dbw = $this->getPrimaryDB();
		$lbFactory = $this->getServiceContainer()->getDBLoadBalancerFactory();
		$dbStore = $this->getServiceContainer()->getExternalStoreFactory()->getStore( 'DB' );
		'@phan-var ExternalStoreDB $dbStore';

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = $lbFactory->getExternalLB( $cluster );
			try {
				$extDB = $lb->getMaintenanceConnectionRef( DB_REPLICA );
			} catch ( DBConnectionError $e ) {
				if ( strpos( $e->getMessage(), 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			$table = $dbStore->getTable( $cluster );
			if ( !$extDB->tableExists( $table, __METHOD__ ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			$endId = (int)$extDB->newSelectQueryBuilder()
				->select( 'MAX(blob_id)' )
				->from( $table )
				->caller( __METHOD__ )->fetchField();

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->newSelectQueryBuilder()
					->select( [ 'blob_id' ] )
					->from( $table )
					->where( $extDB->expr( 'blob_id', '>', $startId ) )
					->orderBy( 'blob_id' )
					->limit( $this->batchSize )
					->caller( __METHOD__ )->fetchResultSet();

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					$startId = (int)$row->blob_id;
					gmp_setbit( $actualBlobs, $startId );
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = [];
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// gmp_scan1() gives the index of the next set bit at or after $id, or -1 if none
				$id = gmp_scan1( $orphans, $id );
				if ( $id === -1 ) {
					break;
				}
				$insertBatch[] = [
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				];
				// Flush as soon as a full batch has accumulated
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$this->insertRows( $dbw, 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = [];
				}

				++$id;
				++$numOrphans;
			}
			// Write whatever is left over from the last partial batch
			$this->insertRows( $dbw, 'blob_orphans', $insertBatch, __METHOD__ );
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}
429
430// @codeCoverageIgnoreStart
// Name TrackBlobs as the maintenance class for this file, then include the
// runner script referenced by RUN_MAINTENANCE_IF_MAIN.
// NOTE(review): presumably the runner only executes the class when this file
// is the script being invoked directly — confirm against Maintenance.php.
431$maintClass = TrackBlobs::class;
432require_once RUN_MAINTENANCE_IF_MAIN;
433// @codeCoverageIgnoreEnd
if(!defined('MW_SETUP_CALLBACK'))
Definition WebStart.php:81
External storage in a SQL database.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
output( $out, $channel=null)
Throw some output to the user.
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
addDescription( $text)
Set the description text.
Value object representing a content slot associated with a page revision.
int $reportingInterval
IExpression null $textClause
execute()
Do the actual work.
bool $doBlobOrphans
__construct()
Default constructor.
string[] $clusters
array $trackedBlobs
Content of like value.
Definition LikeValue.php:14
const DB_REPLICA
Definition defines.php:26
const DB_PRIMARY
Definition defines.php:28
$maintClass