MediaWiki master
findBadBlobs.php
Go to the documentation of this file.
1<?php
29
30require_once __DIR__ . '/Maintenance.php';
31
38
39 private RevisionStore $revisionStore;
40 private BlobStore $blobStore;
41
/**
 * Set the default batch size, the help text and the command line options
 * understood by this script.
 */
public function __construct() {
	parent::__construct();

	$this->setBatchSize( 1000 );

	$description = 'Find and mark bad content blobs. Marked blobs will be read as empty. '
		. 'Use --scan-from to find revisions with bad blobs, use --mark to mark them.';
	$this->addDescription( $description );

	// Option name => help text. All options are optional and take an argument.
	$optionHelp = [
		'scan-from' => 'Start scanning revisions at the given date. '
			. 'Format: Anything supported by MediaWiki, e.g. YYYYMMDDHHMMSS or YYYY-MM-DDTHH:MM:SS',
		'revisions' => 'A list of revision IDs to process, separated by comma or '
			. 'colon or whitespace. Revisions belonging to deleted pages will work. '
			. 'If set to "-" IDs are read from stdin, one per line.',
		'limit' => 'Maximum number of revisions for --scan-from to scan. '
			. 'Default: 1000',
		'mark' => 'Mark the blob as "known bad", to avoid errors when '
			. 'attempting to read it. The value given is the reason for marking the blob as bad, '
			. 'typically a ticket ID. Requires --revisions to also be set.',
	];

	foreach ( $optionHelp as $optionName => $helpText ) {
		$this->addOption( $optionName, $helpText, false, true );
	}
}
60
/**
 * Read the --scan-from option and convert it to the TS_MW format.
 *
 * The script is terminated via fatalError() when the value is shorter than
 * 14 characters or when wfTimestamp() cannot convert it.
 *
 * @return string The converted timestamp
 */
private function getStartTimestamp() {
	$rawTimestamp = $this->getOption( 'scan-from' );

	// Anything shorter than 14 characters cannot spell out date and time to the second.
	if ( strlen( $rawTimestamp ) < 14 ) {
		$this->fatalError(
			"Bad timestamp: {$rawTimestamp}, please provide time and date down to the second."
		);
	}

	$converted = wfTimestamp( TS_MW, $rawTimestamp );
	if ( !$converted ) {
		$this->fatalError( "Bad timestamp: {$rawTimestamp}" );
	}

	return $converted;
}
78
/**
 * Collect the revision IDs given via --revisions.
 *
 * When the option value is "-", the list is read from stdin instead.
 *
 * @return int[] The parsed IDs; empty when stdin provided nothing
 */
private function getRevisionIds() {
	$listText = $this->getOption( 'revisions' );

	if ( $listText !== '-' ) {
		return $this->parseIntList( $listText );
	}

	// "-" means: take the list from standard input.
	$listText = stream_get_contents( STDIN );

	return $listText ? $this->parseIntList( $listText ) : [];
}
95
/**
 * Entry point: validate the option combination, run the requested scan and
 * print a summary.
 *
 * Modes:
 *  - --revisions [--mark]: check (and optionally mark) the listed revisions.
 *  - --scan-from [--limit]: scan revisions starting at a timestamp; marking is
 *    not allowed in this mode.
 *
 * Any invalid combination terminates the script via fatalError().
 */
public function execute() {
	$services = $this->getServiceContainer();
	$this->revisionStore = $services->getRevisionStore();
	$this->blobStore = $services->getBlobStore();
	$this->setDBProvider( $services->getConnectionProvider() );

	if ( $this->hasOption( 'revisions' ) ) {
		if ( $this->hasOption( 'scan-from' ) ) {
			$this->fatalError( 'Cannot use --revisions together with --scan-from' );
		}

		$ids = $this->getRevisionIds();

		$count = $this->scanRevisionsById( $ids );
	} elseif ( $this->hasOption( 'scan-from' ) ) {
		if ( $this->hasOption( 'mark' ) ) {
			$this->fatalError( 'Cannot use --mark with --scan-from, '
				. 'use --revisions to specify revisions to mark.' );
		}

		$fromTimestamp = $this->getStartTimestamp();

		// Command line values arrive as strings. Reject anything that is not a
		// number up front instead of failing later inside the scan loop's
		// arithmetic, and hand a real integer to the scanner.
		$limit = $this->getOption( 'limit', 1000 );
		if ( !is_numeric( $limit ) ) {
			$this->fatalError( 'Bad limit: ' . $limit . ', please provide a number.' );
		}
		$total = (int)$limit;

		$count = $this->scanRevisionsByTimestamp( $fromTimestamp, $total );

		$this->output( "The range of archive rows scanned is based on the range of revision IDs "
			. "scanned in the revision table.\n" );
	} else {
		if ( $this->hasOption( 'mark' ) ) {
			$this->fatalError( 'The --mark must be used together with --revisions' );
		} else {
			$this->fatalError( 'Must specify one of --revisions or --scan-from' );
		}
	}

	if ( $this->hasOption( 'mark' ) ) {
		$this->output( "Marked $count bad revisions.\n" );
	} else {
		$this->output( "Found $count bad revisions.\n" );

		if ( $count > 0 ) {
			$this->output( "On a unix/linux environment, you can use grep and cut to get a list of IDs\n" );
			$this->output( "that can then be used with the --revisions option. E.g.\n" );
			$this->output( " grep '! Found bad blob' | cut -s -f 3\n" );
		}
	}
}
146
/**
 * Scan up to $total rows of the revision table, starting at $fromTimestamp,
 * then scan the archive rows whose ar_rev_id falls into the range of revision
 * IDs covered by the first scan (again up to $total rows).
 *
 * @param string $fromTimestamp Timestamp to start scanning at (inclusive)
 * @param int $total Maximum number of rows to scan per table
 *
 * @return int Number of bad blobs found, as counted by checkRevision()
 */
private function scanRevisionsByTimestamp( $fromTimestamp, $total ) {
	$count = 0;
	$firstRevId = 0;
	$lastRevId = 0;
	// ID of the last row of the previous batch. This is the paging cursor and
	// is deliberately kept separate from $lastRevId (the maximum ID seen):
	// rows are ordered by (rev_timestamp, rev_id), and since IDs may be out
	// of sequence relative to timestamps, paging by the maximum ID could skip
	// rows that share the last timestamp but have a smaller ID.
	$cursorRevId = 0;
	$lastTimestamp = $fromTimestamp;
	$revisionRowsScanned = 0;
	$archiveRowsScanned = 0;

	$this->output( "Scanning revisions table, "
		. "$total rows starting at rev_timestamp $fromTimestamp\n" );

	while ( $revisionRowsScanned < $total ) {
		$batchSize = min( $total - $revisionRowsScanned, $this->getBatchSize() );
		$revisions = $this->loadRevisionsByTimestamp( $cursorRevId, $lastTimestamp, $batchSize );
		if ( !$revisions ) {
			break;
		}

		foreach ( $revisions as $rev ) {
			// we are sorting by timestamp, so we may encounter revision IDs out of sequence
			$firstRevId = $firstRevId ? min( $firstRevId, $rev->getId() ) : $rev->getId();
			$lastRevId = max( $lastRevId, $rev->getId() );

			$count += $this->checkRevision( $rev );
		}

		// $rev is the last record of the batch; the next batch continues right after it.
		// NOTE(review): assumes the records come back in query order — confirm.
		$lastTimestamp = $rev->getTimestamp();
		$cursorRevId = (int)$rev->getId();
		$batchSize = count( $revisions );
		$revisionRowsScanned += $batchSize;
		$this->output(
			"\t- Scanned a batch of $batchSize revisions, "
			. "up to revision $lastRevId ($lastTimestamp)\n"
		);

		$this->waitForReplication();
	}

	// If $firstRevId is 0, the loop body above didn't execute, so there is no
	// ID range to base the archive scan on: skip it entirely.
	if ( $firstRevId <= 0 ) {
		return $count;
	}

	// NOTE: the archive table isn't indexed by timestamp, so the best we can do is use the
	// revision ID just before the first revision ID we found above as the starting point
	// of the scan, and scan up to one revision after the last revision ID we found above.
	$fromArchived = $this->getNextRevision( $firstRevId, '<', 'DESC' );
	$maxArchived = $this->getNextRevision( $lastRevId, '>', 'ASC' );
	$maxArchived = $maxArchived ?: PHP_INT_MAX;

	$this->output( "Scanning archive table by ar_rev_id, $fromArchived to $maxArchived\n" );
	// Stop once the row budget is used up instead of issuing a final query with a zero limit.
	while ( $archiveRowsScanned < $total && $fromArchived < $maxArchived ) {
		$batchSize = min( $total - $archiveRowsScanned, $this->getBatchSize() );
		$revisions = $this->loadArchiveByRevisionId( $fromArchived, $maxArchived, $batchSize );
		if ( !$revisions ) {
			break;
		}

		foreach ( $revisions as $rev ) {
			$count += $this->checkRevision( $rev );
		}

		// Continue after the last archived revision of this batch, and report
		// that revision's own timestamp rather than the one left over from the
		// revision table scan.
		$fromArchived = (int)$rev->getId();
		$archivedTimestamp = $rev->getTimestamp();
		$batchSize = count( $revisions );
		$archiveRowsScanned += $batchSize;
		$this->output(
			"\t- Scanned a batch of $batchSize archived revisions, "
			. "up to revision $fromArchived ($archivedTimestamp)\n"
		);

		$this->waitForReplication();
	}

	return $count;
}
223
/**
 * Load one batch of revisions from the revision table, ordered by
 * (rev_timestamp, rev_id) and starting after the given timestamp/ID pair.
 *
 * @param int $afterId Revision ID to continue after within $fromTimestamp
 * @param string $fromTimestamp Timestamp to continue from
 * @param int $batchSize Maximum number of rows to load
 *
 * @return RevisionStoreRecord[] Loaded revisions; entries that evaluate to
 *  false in the batch result are dropped
 */
private function loadRevisionsByTimestamp( int $afterId, string $fromTimestamp, $batchSize ) {
	$dbr = $this->getReplicaDB();

	// Comparison over the (rev_timestamp, rev_id) pair, matching the sort order below.
	$afterCursor = $dbr->buildComparison( '>', [
		'rev_timestamp' => $fromTimestamp,
		'rev_id' => $afterId,
	] );

	$res = $this->revisionStore->newSelectQueryBuilder( $dbr )
		->joinComment()
		->where( $afterCursor )
		->useIndex( [ 'revision' => 'rev_timestamp' ] )
		->orderBy( [ 'rev_timestamp', 'rev_id' ] )
		->limit( $batchSize )
		->caller( __METHOD__ )
		->fetchResultSet();

	$status = $this->revisionStore->newRevisionsFromBatch( $res, [ 'slots' => true ] );
	$this->handleStatus( $status );

	$loaded = array_filter( $status->value );
	'@phan-var RevisionStoreRecord[] $loaded';

	return $loaded;
}
251
/**
 * Load one batch of archived revisions with $afterId < ar_rev_id <= $uptoId,
 * ordered by ar_rev_id.
 *
 * @param int $afterId Exclusive lower bound for ar_rev_id
 * @param int $uptoId Inclusive upper bound for ar_rev_id
 * @param int $batchSize Maximum number of rows to load
 *
 * @return RevisionArchiveRecord[] Loaded revisions; entries that evaluate to
 *  false in the batch result are dropped
 */
private function loadArchiveByRevisionId( int $afterId, int $uptoId, $batchSize ) {
	$dbr = $this->getReplicaDB();

	// Both bounds are typed as int, so interpolating them into the condition is safe.
	$idRange = [ "ar_rev_id > $afterId", "ar_rev_id <= $uptoId" ];

	$res = $this->revisionStore->newArchiveSelectQueryBuilder( $dbr )
		->joinComment()
		->where( $idRange )
		->orderBy( 'ar_rev_id' )
		->limit( $batchSize )
		->caller( __METHOD__ )
		->fetchResultSet();

	$loadOptions = [ 'archive' => true, 'slots' => true ];
	$status = $this->revisionStore->newRevisionsFromBatch( $res, $loadOptions );
	$this->handleStatus( $status );

	$loaded = array_filter( $status->value );
	'@phan-var RevisionArchiveRecord[] $loaded';

	return $loaded;
}
278
/**
 * Find the revision ID adjacent to $revId in the revision table, in the
 * direction given by $comp and $dir.
 *
 * @param int $revId Revision ID to start from
 * @param string $comp Comparison operator placed between rev_id and $revId
 * @param string $dir Sort direction for rev_id
 *
 * @return int The ID found, cast to int
 */
private function getNextRevision( int $revId, string $comp, string $dir ) {
	$neighbourId = $this->getReplicaDB()->newSelectQueryBuilder()
		->select( 'rev_id' )
		->from( 'revision' )
		->where( "rev_id $comp $revId" )
		->orderBy( [ 'rev_id' ], $dir )
		->caller( __METHOD__ )
		->fetchField();

	// NOTE(review): presumably a non-match yields false, which the cast turns into 0 — confirm.
	return (int)$neighbourId;
}
299
/**
 * Check the revisions with the given IDs, processing them in chunks of the
 * configured batch size.
 *
 * @param int[] $ids Revision IDs to check
 *
 * @return int Number of bad blobs found, as counted by checkRevision()
 */
private function scanRevisionsById( array $ids ) {
	$idCount = count( $ids );
	$this->output( "Scanning {$idCount} ids\n" );

	$badCount = 0;
	$chunks = array_chunk( $ids, $this->getBatchSize() );

	foreach ( $chunks as $chunk ) {
		$loaded = $this->loadRevisionsById( $chunk );

		// Nothing could be loaded for this chunk, so there is nothing to report.
		if ( $loaded ) {
			foreach ( $loaded as $revision ) {
				$badCount += $this->checkRevision( $revision );
			}

			$loadedCount = count( $loaded );
			$this->output( "\t- Scanned a batch of {$loadedCount} revisions\n" );
		}
	}

	return $badCount;
}
329
/**
 * Load the revisions with the given IDs from the revision table, falling back
 * to the archive table for any ID that was not found there.
 *
 * @param int[] $ids Revision IDs to load; must not be empty
 *
 * @return RevisionRecord[] Loaded revisions; the archive lookup below relies
 *  on the keys being revision IDs
 */
private function loadRevisionsById( array $ids ) {
	$db = $this->getReplicaDB();
	$queryBuilder = $this->revisionStore->newSelectQueryBuilder( $db );

	$rows = $queryBuilder
		->joinComment()
		->where( [ 'rev_id' => $ids ] )
		->caller( __METHOD__ )->fetchResultSet();

	$result = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );

	$this->handleStatus( $result );

	$revisions = array_filter( $result->value );
	// The list starts out with revision table records and may receive archive records below.
	'@phan-var RevisionRecord[] $revisions';

	// If not all revisions were found, check the archive table. Decide this
	// based on the IDs that are actually missing rather than on a count
	// comparison: duplicate IDs in the input inflate count( $ids ), which
	// would otherwise lead to an archive query with an empty ID list.
	$missingIds = array_diff( $ids, array_keys( $revisions ) );
	if ( $missingIds ) {
		$rows = $this->revisionStore->newArchiveSelectQueryBuilder( $db )
			->joinComment()
			->where( [ 'ar_rev_id' => $missingIds ] )
			->caller( __METHOD__ )->fetchResultSet();

		$archiveResult = $this->revisionStore->newRevisionsFromBatch(
			$rows,
			[ 'slots' => true, 'archive' => true ]
		);

		$this->handleStatus( $archiveResult );

		// don't use array_merge, since it will re-index
		$revisions += array_filter( $archiveResult->value );
	}

	return $revisions;
}
371
/**
 * Check every slot of the given revision for a bad blob.
 *
 * In --mark mode, a note is printed for revisions that have no bad blob.
 *
 * @param RevisionRecord $rev Revision to check
 *
 * @return int Number of slots of this revision that have a bad blob
 */
private function checkRevision( RevisionRecord $rev ) {
	$badSlots = 0;
	$slotRecords = $rev->getSlots()->getSlots();

	foreach ( $slotRecords as $slotRecord ) {
		$badSlots += $this->checkSlot( $rev, $slotRecord );
	}

	// When marking, tell the operator which of the requested revisions were left untouched.
	$nothingToMark = $badSlots === 0 && $this->hasOption( 'mark' );
	if ( $nothingToMark ) {
		$this->output( "\t# No bad blob found on revision {$rev->getId()}, skipped!\n" );
	}

	return $badSlots;
}
389
/**
 * Try to read the blob of a single slot; report the slot, and mark it when
 * --mark is set, if reading throws an Exception.
 *
 * @param RevisionRecord $rev Revision the slot belongs to
 * @param SlotRecord $slot Slot to check
 *
 * @return int 1 if the blob could not be read, 0 otherwise
 */
private function checkSlot( RevisionRecord $rev, SlotRecord $slot ) {
	$blobAddress = $slot->getAddress();

	try {
		$this->blobStore->getBlob( $blobAddress );
	} catch ( Exception $failure ) {
		$revId = $rev->getId();
		$message = $failure->getMessage();
		$class = get_class( $failure );

		// NOTE: the revision ID is repeated at the end, in a separate tab-delimited
		// column, for easy processing via the "cut" shell command.
		$this->output( "\t! Found bad blob on revision {$revId} "
			. "from {$rev->getTimestamp()} ({$slot->getRole()} slot): "
			. "content_id={$slot->getContentId()}, address=<{$blobAddress}>, "
			. "error='$message', type='$class'. ID:\t{$revId}\n" );

		if ( $this->hasOption( 'mark' ) ) {
			$markedAddress = $this->markBlob( $slot, $message );
			$this->output( "\tChanged address to <$markedAddress>\n" );
		}

		return 1;
	}

	// The blob was readable, nothing to do.
	return 0;
}
422
/**
 * Mark the blob of the given slot as bad by rewriting content_address in the
 * content table to a "bad:" address.
 *
 * The new address consists of the URL-encoded old address (or "empty") and,
 * as a query string, the reason given via --mark and the error message.
 *
 * @param SlotRecord $slot Slot whose content row is updated
 * @param string|null $error Error message seen when reading the blob, if any
 *
 * @return string The new address written to the database
 */
private function markBlob( SlotRecord $slot, ?string $error = null ) {
	$args = [];

	if ( $this->hasOption( 'mark' ) ) {
		$args['reason'] = $this->getOption( 'mark' );
	}

	// Compare explicitly: a plain truthiness check would drop the message "0".
	if ( $error !== null && $error !== '' ) {
		$args['error'] = $error;
	}

	$address = $slot->getAddress() ?: 'empty';
	$badAddress = 'bad:' . urlencode( $address );

	if ( $args ) {
		$badAddress .= '?' . wfArrayToCgi( $args );
	}

	// NOTE(review): presumably 255 is the size limit of content_address — confirm.
	// The cut is byte-wise and may end in the middle of an encoded sequence.
	$badAddress = substr( $badAddress, 0, 255 );

	$dbw = $this->getPrimaryDB();
	$dbw->newUpdateQueryBuilder()
		->update( 'content' )
		->set( [ 'content_address' => $badAddress ] )
		->where( [ 'content_id' => $slot->getContentId() ] )
		->caller( __METHOD__ )->execute();

	return $badAddress;
}
458
/**
 * Report problems recorded in a status object: a status that is not OK
 * is passed to fatalError(), any other status that is not good is passed to error().
 *
 * @param StatusValue $status Status to inspect
 */
private function handleStatus( StatusValue $status ) {
	// A good status has neither errors nor warnings, so there is nothing to report.
	if ( $status->isGood() ) {
		return;
	}

	$text = Status::wrap( $status )->getMessage( false, false, 'en' )->text();

	if ( !$status->isOK() ) {
		$this->fatalError( $text );
	}

	$this->error( "\t! " . $text );
}
471
472}
473
474$maintClass = FindBadBlobs::class;
475require_once RUN_MAINTENANCE_IF_MAIN;
wfArrayToCgi( $array1, $array2=null, $prefix='')
This function takes one or two arrays as input, and returns a CGI-style string, e....
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Maintenance script for finding and marking bad content blobs.
__construct()
Default constructor.
execute()
Do the actual work. All child classes will need to implement this. Returns bool|null|void: true for success,...
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
error( $err, $die=0)
Throw an error to the user.
output( $out, $channel=null)
Throw some output to the user.
waitForReplication()
Wait for replica DBs to catch up.
hasOption( $name)
Checks to see if a particular option was set.
getServiceContainer()
Returns the main service container.
getBatchSize()
Returns batch size.
parseIntList( $text)
Utility function to parse a string (perhaps from a command line option) into a list of integers (perh...
addDescription( $text)
Set the description text.
setDBProvider(IConnectionProvider $dbProvider)
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
A RevisionRecord representing a revision of a deleted page persisted in the archive table.
Page revision base class.
getSlots()
Returns the slots defined for this revision.
A RevisionRecord representing an existing revision persisted in the revision table.
Service for looking up page revisions.
Value object representing a content slot associated with a page revision.
getAddress()
Returns the address of this slot's content.
getContentId()
Returns the ID of the content meta data row associated with the slot.
Generic operation result class Has warning/error list, boolean status and arbitrary value.
Definition Status.php:54
Generic operation result class Has warning/error list, boolean status and arbitrary value.
isOK()
Returns whether the operation completed.
isGood()
Returns whether the operation completed and didn't have any error or warnings.
$maintClass
Service for loading and storing data blobs.
Definition BlobStore.php:33