MediaWiki REL1_40
findBadBlobs.php
Go to the documentation of this file.
1<?php
31
32require_once __DIR__ . '/Maintenance.php';
33
40
/** @var RevisionStore|null Revision lookup service; assigned by initializeServices() */
44 private $revisionStore;
45
/** @var BlobStore|null Blob access service; assigned by initializeServices() */
49 private $blobStore;
50
/** @var LoadBalancer|null Source of database connections; assigned by initializeServices() */
54 private $loadBalancer;
55
/** @var LBFactory|null Load balancer factory; assigned by initializeServices() */
59 private $lbFactory;
60
/**
 * Sets the default batch size and registers the script description and
 * its command line options.
 */
public function __construct() {
	parent::__construct();
	$this->setBatchSize( 1000 );

	$description = 'Find and mark bad content blobs. Marked blobs will be read as empty. '
		. 'Use --scan-from to find revisions with bad blobs, use --mark to mark them.';
	$this->addDescription( $description );

	// Option name => help text, in registration order.
	$optionHelp = [
		'scan-from' => 'Start scanning revisions at the given date. '
			. 'Format: Anything supported by MediaWiki, e.g. YYYYMMDDHHMMSS or YYYY-MM-DDTHH:MM:SS',
		'revisions' => 'A list of revision IDs to process, separated by comma or '
			. 'colon or whitespace. Revisions belonging to deleted pages will work. '
			. 'If set to "-" IDs are read from stdin, one per line.',
		'limit' => 'Maximum number of revisions for --scan-from to scan. '
			. 'Default: 1000',
		'mark' => 'Mark the blob as "known bad", to avoid errors when '
			. 'attempting to read it. The value given is the reason for marking the blob as bad, '
			. 'typically a ticket ID. Requires --revisions to also be set.',
	];

	// Every option is optional (third argument) and takes a value (fourth argument).
	foreach ( $optionHelp as $optionName => $helpText ) {
		$this->addOption( $optionName, $helpText, false, true );
	}
}
79
/**
 * Sets up the services this script works with.
 *
 * For each service, an explicitly passed instance wins; otherwise a
 * previously assigned instance is kept; otherwise the instance from
 * MediaWikiServices is used.
 *
 * @param RevisionStore|null $revisionStore
 * @param BlobStore|null $blobStore
 * @param LoadBalancer|null $loadBalancer
 * @param LBFactory|null $lbFactory
 */
public function initializeServices(
	?RevisionStore $revisionStore = null,
	?BlobStore $blobStore = null,
	?LoadBalancer $loadBalancer = null,
	?LBFactory $lbFactory = null
) {
	$services = MediaWikiServices::getInstance();

	if ( $revisionStore !== null ) {
		$this->revisionStore = $revisionStore;
	} elseif ( $this->revisionStore === null ) {
		$this->revisionStore = $services->getRevisionStore();
	}

	if ( $blobStore !== null ) {
		$this->blobStore = $blobStore;
	} elseif ( $this->blobStore === null ) {
		$this->blobStore = $services->getBlobStore();
	}

	if ( $loadBalancer !== null ) {
		$this->loadBalancer = $loadBalancer;
	} elseif ( $this->loadBalancer === null ) {
		$this->loadBalancer = $services->getDBLoadBalancer();
	}

	if ( $lbFactory !== null ) {
		$this->lbFactory = $lbFactory;
	} elseif ( $this->lbFactory === null ) {
		$this->lbFactory = $services->getDBLoadBalancerFactory();
	}
}
93
/**
 * Reads --scan-from and converts it to TS_MW format.
 *
 * Values shorter than 14 characters, and values wfTimestamp() cannot
 * convert, are reported through fatalError().
 *
 * @return string|false The converted timestamp
 */
private function getStartTimestamp() {
	$requested = $this->getOption( 'scan-from' );

	// Anything shorter than 14 characters cannot be precise to the second.
	$tooShort = strlen( $requested ) < 14;
	if ( $tooShort ) {
		$this->fatalError( 'Bad timestamp: ' . $requested
			. ', please provide time and date down to the second.' );
	}

	$normalized = wfTimestamp( TS_MW, $requested );
	if ( $normalized ) {
		return $normalized;
	}

	$this->fatalError( 'Bad timestamp: ' . $requested );

	return $normalized;
}
111
/**
 * Returns the revision IDs given via --revisions.
 *
 * When the option value is "-", the list is read from stdin instead;
 * empty stdin input yields an empty list.
 *
 * @return int[]
 */
private function getRevisionIds() {
	$spec = $this->getOption( 'revisions' );

	if ( $spec !== '-' ) {
		return $this->parseIntList( $spec );
	}

	$piped = stream_get_contents( STDIN );

	return $piped ? $this->parseIntList( $piped ) : [];
}
128
/**
 * Entry point: validates the option combination, runs the requested scan
 * and reports how many bad blobs were found or marked.
 *
 * --revisions and --scan-from are mutually exclusive, and --mark only
 * works together with --revisions.
 */
public function execute() {
	$this->initializeServices();

	if ( $this->hasOption( 'revisions' ) ) {
		if ( $this->hasOption( 'scan-from' ) ) {
			$this->fatalError( 'Cannot use --revisions together with --scan-from' );
		}

		$ids = $this->getRevisionIds();

		$count = $this->scanRevisionsById( $ids );
	} elseif ( $this->hasOption( 'scan-from' ) ) {
		if ( $this->hasOption( 'mark' ) ) {
			$this->fatalError( 'Cannot use --mark with --scan-from, '
				. 'use --revisions to specify revisions to mark.' );
		}

		$fromTimestamp = $this->getStartTimestamp();

		// The option arrives as a string. Reject non-numeric input up front and
		// normalize to an integer, so the batch size arithmetic and the SQL LIMIT
		// derived from it always see a whole number.
		$limitOpt = $this->getOption( 'limit', 1000 );
		if ( !is_numeric( $limitOpt ) ) {
			$this->fatalError( 'Bad limit: ' . $limitOpt . ', please provide a number.' );
		}
		$total = (int)$limitOpt;

		$count = $this->scanRevisionsByTimestamp( $fromTimestamp, $total );

		$this->output( "The range of archive rows scanned is based on the range of revision IDs "
			. "scanned in the revision table.\n" );
	} else {
		if ( $this->hasOption( 'mark' ) ) {
			$this->fatalError( 'The --mark must be used together with --revisions' );
		} else {
			$this->fatalError( 'Must specify one of --revisions or --scan-from' );
		}
	}

	if ( $this->hasOption( 'mark' ) ) {
		$this->output( "Marked $count bad revisions.\n" );
	} else {
		$this->output( "Found $count bad revisions.\n" );

		if ( $count > 0 ) {
			// Hint on how to feed the findings back into a --revisions run.
			$this->output( "On a unix/linux environment, you can use grep and cut to get a list of IDs\n" );
			$this->output( "that can then be used with the --revisions option. E.g.\n" );
			$this->output( " grep '! Found bad blob' | cut -s -f 3\n" );
		}
	}
}
176
/**
 * Scans the revision table in timestamp order, then scans the archive
 * table over the range of revision IDs covered by the first scan.
 *
 * @param string $fromTimestamp Timestamp at which to start scanning
 * @param int|string $total Maximum number of rows to scan per table (numeric)
 *
 * @return int Sum of the checkRevision() results over all scanned revisions
 */
private function scanRevisionsByTimestamp( $fromTimestamp, $total ) {
	$count = 0;
	// Lowest and highest revision ID seen; they bound the archive scan below.
	$lastRevId = 0;
	$firstRevId = 0;
	// Paging cursor: timestamp and ID of the last row of the previous batch.
	$lastTimestamp = $fromTimestamp;
	$cursorRevId = 0;
	$revisionRowsScanned = 0;
	$archiveRowsScanned = 0;

	$this->output( "Scanning revisions table, "
		. "$total rows starting at rev_timestamp $fromTimestamp\n" );

	while ( $revisionRowsScanned < $total ) {
		$batchSize = min( $total - $revisionRowsScanned, $this->getBatchSize() );
		$revisions = $this->loadRevisionsByTimestamp( $cursorRevId, $lastTimestamp, $batchSize );
		if ( !$revisions ) {
			break;
		}

		foreach ( $revisions as $rev ) {
			// we are sorting by timestamp, so we may encounter revision IDs out of sequence
			$firstRevId = $firstRevId ? min( $firstRevId, $rev->getId() ) : $rev->getId();
			$lastRevId = max( $lastRevId, $rev->getId() );

			$count += $this->checkRevision( $rev );
		}

		// Rows are ordered by (rev_timestamp, rev_id), so the next batch must continue
		// after the last row's own timestamp and ID. Using the highest ID seen so far
		// instead would skip rows that share the last timestamp but have a lower ID.
		$lastTimestamp = $rev->getTimestamp();
		$cursorRevId = $rev->getId();
		$batchSize = count( $revisions );
		$revisionRowsScanned += $batchSize;
		$this->output(
			"\t- Scanned a batch of $batchSize revisions, "
			. "up to revision $lastRevId ($lastTimestamp)\n"
		);

		$this->waitForReplication();
	}

	// NOTE: the archive table isn't indexed by timestamp, so the best we can do is use the
	// revision ID just before the first revision ID we found above as the starting point
	// of the scan, and scan up to one revision after the last revision ID we found above.
	// If $firstRevId is 0, the loop body above didn't execute,
	// so we should skip the one below as well.
	$fromArchived = $this->getNextRevision( $firstRevId, '<', 'DESC' );
	$maxArchived = $this->getNextRevision( $lastRevId, '>', 'ASC' );
	$maxArchived = $maxArchived ?: PHP_INT_MAX;

	$this->output( "Scanning archive table by ar_rev_id, $fromArchived to $maxArchived\n" );
	// Stop explicitly once $total archive rows were scanned, rather than
	// issuing one more query with a batch size of zero.
	while (
		$firstRevId > 0
		&& $fromArchived < $maxArchived
		&& $archiveRowsScanned < $total
	) {
		$batchSize = min( $total - $archiveRowsScanned, $this->getBatchSize() );
		$revisions = $this->loadArchiveByRevisionId( $fromArchived, $maxArchived, $batchSize );
		if ( !$revisions ) {
			break;
		}

		foreach ( $revisions as $rev ) {
			$count += $this->checkRevision( $rev );
		}

		// Archive rows are ordered by ar_rev_id, so continue after the last one.
		$fromArchived = $rev->getId();
		$batchSize = count( $revisions );
		$archiveRowsScanned += $batchSize;
		// NOTE(review): $lastTimestamp still holds the value from the revision table
		// scan above; it is not the timestamp of the archived revision.
		$this->output(
			"\t- Scanned a batch of $batchSize archived revisions, "
			. "up to revision $fromArchived ($lastTimestamp)\n"
		);

		$this->waitForReplication();
	}

	return $count;
}
253
/**
 * Loads a batch of revisions from the revision table, ordered by
 * rev_timestamp and rev_id, starting after the given timestamp/ID pair.
 *
 * @param int $afterId Revision ID part of the starting point
 * @param string $fromTimestamp Timestamp part of the starting point
 * @param int $batchSize Maximum number of rows to load
 *
 * @return RevisionStoreRecord[] Entries of the batch result, with empty entries removed
 */
private function loadRevisionsByTimestamp( int $afterId, string $fromTimestamp, $batchSize ) {
	$dbr = $this->loadBalancer->getConnectionRef( DB_REPLICA );
	$info = $this->revisionStore->getQueryInfo();

	$res = $dbr->newSelectQueryBuilder()
		->select( $info['fields'] )
		->tables( $info['tables'] )
		->where(
			$dbr->buildComparison( '>', [
				'rev_timestamp' => $fromTimestamp,
				'rev_id' => $afterId,
			] )
		)
		->joinConds( $info['joins'] )
		->useIndex( [ 'revision' => 'rev_timestamp' ] )
		->orderBy( [ 'rev_timestamp', 'rev_id' ] )
		->limit( $batchSize )
		->caller( __METHOD__ )
		->fetchResultSet();

	$status = $this->revisionStore->newRevisionsFromBatch( $res, [ 'slots' => true ] );
	$this->handleStatus( $status );

	// Drop the empty entries from the batch result.
	$loaded = array_filter( $status->value );

	'@phan-var RevisionStoreRecord[] $loaded';
	return $loaded;
}
285
/**
 * Loads a batch of archived revisions ordered by ar_rev_id, from the
 * range $afterId (exclusive) to $uptoId (inclusive).
 *
 * @param int $afterId Load rows with an ar_rev_id greater than this
 * @param int $uptoId Load rows with an ar_rev_id up to and including this
 * @param int $batchSize Maximum number of rows to load
 *
 * @return RevisionArchiveRecord[] Entries of the batch result, with empty entries removed
 */
private function loadArchiveByRevisionId( int $afterId, int $uptoId, $batchSize ) {
	$dbr = $this->loadBalancer->getConnectionRef( DB_REPLICA );
	$info = $this->revisionStore->getArchiveQueryInfo();

	// Both bounds are declared int, so interpolating them is safe.
	$idRange = [ "ar_rev_id > $afterId", "ar_rev_id <= $uptoId" ];
	$batchOptions = [ 'archive' => true, 'slots' => true ];

	$res = $dbr->newSelectQueryBuilder()
		->select( $info['fields'] )
		->tables( $info['tables'] )
		->where( $idRange )
		->joinConds( $info['joins'] )
		->orderBy( 'ar_rev_id' )
		->limit( $batchSize )
		->caller( __METHOD__ )
		->fetchResultSet();

	$status = $this->revisionStore->newRevisionsFromBatch( $res, $batchOptions );
	$this->handleStatus( $status );

	// Drop the empty entries from the batch result.
	$loaded = array_filter( $status->value );

	'@phan-var RevisionArchiveRecord[] $loaded';
	return $loaded;
}
316
/**
 * Returns the ID of the nearest revision table row relative to $revId,
 * in the given direction.
 *
 * @param int $revId Revision ID to compare against
 * @param string $comp Comparison operator, the callers in this class pass '<' or '>'
 * @param string $dir Sort direction, 'ASC' or 'DESC'
 *
 * @return int The revision ID found; the query result is cast to int, so a
 *   lookup without a match presumably yields 0 (callers treat 0 as "none")
 */
private function getNextRevision( int $revId, string $comp, string $dir ) {
	$db = $this->loadBalancer->getConnectionRef( DB_REPLICA );

	// Let the database layer assemble the condition rather than interpolating
	// the operator into raw SQL, as loadRevisionsByTimestamp() already does.
	$next = $db->newSelectQueryBuilder()
		->select( 'rev_id' )
		->from( 'revision' )
		->where( $db->buildComparison( $comp, [ 'rev_id' => $revId ] ) )
		->orderBy( [ "rev_id" ], $dir )
		->caller( __METHOD__ )
		->fetchField();

	return (int)$next;
}
337
/**
 * Checks the revisions with the given IDs, processing them in chunks of
 * the configured batch size.
 *
 * @param int[] $ids Revision IDs to check
 *
 * @return int Sum of the checkRevision() results over all loaded revisions
 */
private function scanRevisionsById( array $ids ) {
	$total = count( $ids );
	$this->output( "Scanning $total ids\n" );

	$badCount = 0;
	$chunks = array_chunk( $ids, $this->getBatchSize() );

	foreach ( $chunks as $chunk ) {
		$loaded = $this->loadRevisionsById( $chunk );

		// Only report chunks for which at least one revision could be loaded.
		if ( $loaded ) {
			foreach ( $loaded as $revision ) {
				$badCount += $this->checkRevision( $revision );
			}

			$batchSize = count( $loaded );
			$this->output( "\t- Scanned a batch of $batchSize revisions\n" );
		}
	}

	return $badCount;
}
367
/**
 * Loads the revisions with the given IDs from the revision table, and
 * looks up any IDs not found there in the archive table.
 *
 * @param int[] $ids Revision IDs to load, must not be empty
 *
 * @return RevisionRecord[] Loaded revisions; the keys are used as revision IDs
 *   when determining which IDs are still missing
 */
private function loadRevisionsById( array $ids ) {
	$db = $this->loadBalancer->getConnectionRef( DB_REPLICA );
	$queryInfo = $this->revisionStore->getQueryInfo();

	$rows = $db->newSelectQueryBuilder()
		->select( $queryInfo['fields'] )
		->tables( $queryInfo['tables'] )
		->where( [ 'rev_id' => $ids ] )
		->joinConds( $queryInfo['joins'] )
		->caller( __METHOD__ )
		->fetchResultSet();

	$result = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );

	$this->handleStatus( $result );

	$revisions = array_filter( $result->value );
	'@phan-var RevisionRecord[] $revisions';

	// If not all revisions were found, check the archive table.
	// Decide based on the IDs that are actually missing rather than by comparing
	// counts: when $ids contains duplicates the counts differ even though nothing
	// is missing, and an empty ID list must not be used as a query condition.
	$remainingIds = array_diff( $ids, array_keys( $revisions ) );

	if ( $remainingIds ) {
		$archiveQueryInfo = $this->revisionStore->getArchiveQueryInfo();

		$rows = $db->newSelectQueryBuilder()
			->select( $archiveQueryInfo['fields'] )
			->tables( $archiveQueryInfo['tables'] )
			->where( [ 'ar_rev_id' => $remainingIds ] )
			->joinConds( $archiveQueryInfo['joins'] )
			->caller( __METHOD__ )
			->fetchResultSet();

		$archiveResult = $this->revisionStore->newRevisionsFromBatch(
			$rows,
			[ 'slots' => true, 'archive' => true ]
		);

		$this->handleStatus( $archiveResult );

		// don't use array_merge, since it will re-index
		$revisions += array_filter( $archiveResult->value );
	}

	return $revisions;
}
418
/**
 * Checks all slots of the given revision.
 *
 * When --mark was given and no slot turned out to be bad, a note is
 * written to the output.
 *
 * @param RevisionRecord $rev
 *
 * @return int Sum of the checkSlot() results for the revision's slots
 */
private function checkRevision( RevisionRecord $rev ) {
	$badSlots = 0;
	$slots = $rev->getSlots()->getSlots();

	foreach ( $slots as $slotRecord ) {
		$badSlots += $this->checkSlot( $rev, $slotRecord );
	}

	if ( $badSlots !== 0 ) {
		return $badSlots;
	}

	if ( $this->hasOption( 'mark' ) ) {
		$this->output( "\t# No bad blob found on revision {$rev->getId()}, skipped!\n" );
	}

	return $badSlots;
}
436
/**
 * Tries to load the blob behind the given slot and reports a failure.
 *
 * When --mark was given, a blob that fails to load is additionally
 * marked as bad via markBlob().
 *
 * @param RevisionRecord $rev Revision the slot belongs to, used for reporting
 * @param SlotRecord $slot Slot to check
 *
 * @return int 1 if loading the blob threw an exception, 0 otherwise
 */
private function checkSlot( RevisionRecord $rev, SlotRecord $slot ) {
	$address = $slot->getAddress();
	$failure = null;

	try {
		$this->blobStore->getBlob( $address );
	} catch ( Exception $ex ) {
		$failure = $ex;
	}

	if ( $failure === null ) {
		// The blob could be loaded, nothing to do.
		return 0;
	}

	$error = $failure->getMessage();
	$type = get_class( $failure );

	// NOTE: the revision ID is repeated at the end of the line, in a column of its
	// own, so it can be extracted with the "cut" shell command.
	$this->output( "\t! Found bad blob on revision {$rev->getId()} "
		. "from {$rev->getTimestamp()} ({$slot->getRole()} slot): "
		. "content_id={$slot->getContentId()}, address=<{$slot->getAddress()}>, "
		. "error='$error', type='$type'. ID:\t{$rev->getId()}\n" );

	if ( $this->hasOption( 'mark' ) ) {
		$newAddress = $this->markBlob( $slot, $error );
		$this->output( "\tChanged address to <$newAddress>\n" );
	}

	return 1;
}
469
/**
 * Marks the content of the given slot as bad by replacing its address in
 * the content table with a "bad:" address.
 *
 * The new address consists of the "bad:" prefix, the URL-encoded original
 * address (or "empty" if there was none) and, as a query string, the reason
 * given via --mark and the error message, if any. It is cut to 255 bytes.
 *
 * @param SlotRecord $slot Slot whose content row is updated
 * @param string|null $error Error message encountered when loading the blob
 *
 * @return string The new address written to the content table
 */
private function markBlob( SlotRecord $slot, ?string $error = null ) {
	$args = [];

	if ( $this->hasOption( 'mark' ) ) {
		$args['reason'] = $this->getOption( 'mark' );
	}

	if ( $error ) {
		$args['error'] = $error;
	}

	$address = $slot->getAddress() ?: 'empty';
	$badAddress = 'bad:' . urlencode( $address );

	if ( $args ) {
		$badAddress .= '?' . wfArrayToCgi( $args );
	}

	// Cut the address to at most 255 bytes.
	// NOTE(review): presumably the size of the content_address column - confirm.
	$badAddress = substr( $badAddress, 0, 255 );

	$dbw = $this->loadBalancer->getConnectionRef( DB_PRIMARY );
	$dbw->update(
		'content',
		[ 'content_address' => $badAddress ],
		[ 'content_id' => $slot->getContentId() ],
		__METHOD__
	);

	return $badAddress;
}
506
/**
 * Reports problems recorded in the given status.
 *
 * A status that is not OK is passed to fatalError(); a status that is OK
 * but not good is reported via error(). A good status is ignored.
 *
 * @param StatusValue $status
 */
private function handleStatus( StatusValue $status ) {
	// Render the message lazily, it is only needed when something went wrong.
	$describe = static function () use ( $status ) {
		return Status::wrap( $status )->getMessage( false, false, 'en' )->text();
	};

	if ( !$status->isOK() ) {
		$this->fatalError( $describe() );
	}

	if ( $status->isGood() ) {
		return;
	}

	$this->error( "\t! " . $describe() );
}
519
520}
521
// Name of the class implementing this script.
// NOTE(review): presumably read by the file required below - confirm.
522$maintClass = FindBadBlobs::class;
// NOTE(review): going by the constant's name, this runs the script only when
// this file is the entry point - confirm against the Maintenance.php bootstrap.
523require_once RUN_MAINTENANCE_IF_MAIN;
wfArrayToCgi( $array1, $array2=null, $prefix='')
This function takes one or two arrays as input, and returns a CGI-style string, e....
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Maintenance script for finding and marking bad content blobs.
__construct()
Default constructor.
initializeServices(?RevisionStore $revisionStore=null, ?BlobStore $blobStore=null, ?LoadBalancer $loadBalancer=null, ?LBFactory $lbFactory=null)
execute()
Do the actual work. All child classes will need to implement this. Returns bool|null|void: true for success,...
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
error( $err, $die=0)
Throw an error to the user.
output( $out, $channel=null)
Throw some output to the user.
waitForReplication()
Wait for replica DBs to catch up.
hasOption( $name)
Checks to see if a particular option was set.
getBatchSize()
Returns batch size.
parseIntList( $text)
Utility function to parse a string (perhaps from a command line option) into a list of integers (perh...
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
Service locator for MediaWiki core services.
A RevisionRecord representing a revision of a deleted page persisted in the archive table.
Page revision base class.
getSlots()
Returns the slots defined for this revision.
A RevisionRecord representing an existing revision persisted in the revision table.
Service for looking up page revisions.
Value object representing a content slot associated with a page revision.
getAddress()
Returns the address of this slot's content.
getContentId()
Returns the ID of the content meta data row associated with the slot.
Generic operation result class. Has warning/error list, boolean status and arbitrary value.
isOK()
Returns whether the operation completed.
isGood()
Returns whether the operation completed and didn't have any error or warnings.
$maintClass
Service for loading and storing data blobs.
Definition BlobStore.php:33
const DB_REPLICA
Definition defines.php:26
const DB_PRIMARY
Definition defines.php:28