MediaWiki master
findBadBlobs.php
Go to the documentation of this file.
1<?php
15use Wikimedia\Timestamp\TimestampFormat as TS;
16
17// @codeCoverageIgnoreStart
18require_once __DIR__ . '/Maintenance.php';
19// @codeCoverageIgnoreEnd
20
27
/** @var RevisionStore Revision lookup service; assigned at the start of execute(). */
private RevisionStore $revisionStore;

/** @var BlobStore Blob loading service; assigned at the start of execute(). */
private BlobStore $blobStore;
30
/**
 * Register the script description, the default batch size and the command line options.
 */
public function __construct() {
	parent::__construct();

	$this->setBatchSize( 1000 );
	$this->addDescription(
		'Find and mark bad content blobs. Marked blobs will be read as empty. '
		. 'Use --scan-from to find revisions with bad blobs, use --mark to mark them.'
	);

	// Shared by both timestamp options.
	$timestampFormatHint = 'Format: Anything supported by MediaWiki, '
		. 'e.g. YYYYMMDDHHMMSS or YYYY-MM-DDTHH:MM:SS';

	// Option name => help text, in registration order.
	// Every option is optional and takes an argument.
	$optionHelp = [
		'scan-from' => 'Start scanning revisions at the given date. ' . $timestampFormatHint,
		'scan-to' => 'End of scan date range. ' . $timestampFormatHint,
		'revisions' => 'A list of revision IDs to process, separated by comma or '
			. 'colon or whitespace. Revisions belonging to deleted pages will work. '
			. 'If set to "-" IDs are read from stdin, one per line.',
		'limit' => 'Maximum number of revisions for --scan-from to scan. '
			. 'Default: 1000',
		'mark' => 'Mark the blob as "known bad", to avoid errors when '
			. 'attempting to read it. The value given is the reason for marking the blob as bad, '
			. 'typically a ticket ID. Requires --revisions to also be set.',
	];

	foreach ( $optionHelp as $optionName => $helpText ) {
		$this->addOption( $optionName, $helpText, false, true );
	}
}
52
/**
 * Read and validate the --scan-from option.
 *
 * Terminates the script via fatalError() if the value is shorter than 14
 * characters or cannot be converted by wfTimestamp().
 *
 * @return string The start timestamp in TS::MW format
 */
private function getStartTimestamp(): string {
	$tsOpt = $this->getOption( 'scan-from' );
	// Anything shorter than 14 characters cannot carry date and time down to the second.
	if ( strlen( $tsOpt ) < 14 ) {
		$this->fatalError( 'Bad timestamp: ' . $tsOpt
			. ', please provide time and date down to the second.' );
	}

	$ts = wfTimestamp( TS::MW, $tsOpt );
	if ( !$ts ) {
		$this->fatalError( 'Bad timestamp: ' . $tsOpt );
	}

	return $ts;
}
70
/**
 * Read and validate the --scan-to option.
 *
 * Terminates the script via fatalError() if the value is shorter than 14
 * characters or cannot be converted by wfTimestamp().
 *
 * @return string The end timestamp in TS::MW format
 */
private function getEndTimestamp(): string {
	$rawValue = $this->getOption( 'scan-to' );

	$hasSecondsPrecision = strlen( $rawValue ) >= 14;
	if ( !$hasSecondsPrecision ) {
		$this->fatalError(
			"Bad timestamp: {$rawValue}, please provide time and date down to the second."
		);
	}

	$normalized = wfTimestamp( TS::MW, $rawValue );
	if ( !$normalized ) {
		$this->fatalError( "Bad timestamp: {$rawValue}" );
	}

	return $normalized;
}
85
/**
 * Get the revision IDs given via --revisions, reading them from stdin
 * when the option value is "-".
 *
 * @return int[] Result of parseIntList(), or an empty array if stdin yielded nothing
 */
private function getRevisionIds() {
	$idList = $this->getOption( 'revisions' );

	if ( $idList !== '-' ) {
		return $this->parseIntList( $idList );
	}

	// "-" means the list is piped in on standard input.
	$idList = stream_get_contents( STDIN );

	return $idList ? $this->parseIntList( $idList ) : [];
}
102
/**
 * Entry point: validate the option combination, run the requested scan
 * and print a summary.
 */
public function execute() {
	$services = $this->getServiceContainer();
	$this->revisionStore = $services->getRevisionStore();
	$this->blobStore = $services->getBlobStore();

	$byIds = $this->hasOption( 'revisions' );
	$fromGiven = $this->hasOption( 'scan-from' );
	$toGiven = $this->hasOption( 'scan-to' );
	$marking = $this->hasOption( 'mark' );

	if ( $byIds ) {
		// Explicit ID lists and timestamp ranges are mutually exclusive.
		if ( $fromGiven || $toGiven ) {
			$this->fatalError( 'Cannot use --revisions together with --scan-from or --scan-to' );
		}

		$count = $this->scanRevisionsById( $this->getRevisionIds() );
	} elseif ( $fromGiven ) {
		// Scanning by timestamp is read-only; marking needs explicit IDs.
		if ( $marking ) {
			$this->fatalError( 'Cannot use --mark with --scan-from, '
				. 'use --revisions to specify revisions to mark.' );
		}

		if ( $toGiven && $this->hasOption( 'limit' ) ) {
			$this->fatalError( 'Cannot use --limit with --scan-to' );
		}

		$count = $this->scanRevisionsByTimestamp();
		$this->output( "The range of archive rows scanned is based on the range of revision IDs "
			. "scanned in the revision table.\n" );
	} else {
		$this->fatalError(
			$marking
				? 'The --mark must be used together with --revisions'
				: 'Must specify one of --revisions or --scan-from'
		);
	}

	if ( $marking ) {
		$this->output( "Marked $count bad revisions.\n" );
		return;
	}

	$this->output( "Found $count bad revisions.\n" );

	if ( $count > 0 ) {
		// Hint on how to turn the scan output into input for --revisions.
		$hintLines = [
			"On a unix/linux environment, you can use grep and cut to list of IDs\n",
			"that can then be used with the --revisions option. E.g.\n",
			" grep '! Found bad blob' | cut -s -f 3\n",
		];
		foreach ( $hintLines as $hintLine ) {
			$this->output( $hintLine );
		}
	}
}
152
/**
 * Scan the revision table by timestamp, then scan the archive table over the
 * range of revision IDs that the first scan covered.
 *
 * The revision scan is bounded either by --scan-to or by --limit (default 1000).
 *
 * @return int Number of bad blobs found (checkRevision() counts one per bad slot)
 */
private function scanRevisionsByTimestamp() {
	$fromTimestamp = $this->getStartTimestamp();
	if ( $this->getOption( 'scan-to' ) ) {
		$toTimestamp = $this->getEndTimestamp();
		// The end timestamp bounds the scan, so there is no row budget.
		$total = INF;
		$msg = "Scanning revisions table, "
			. "starting at rev_timestamp $fromTimestamp until $toTimestamp\n";
	} else {
		$toTimestamp = null;
		$total = $this->getOption( 'limit', 1000 );
		$msg = "Scanning revisions table, "
			. "$total rows starting at rev_timestamp $fromTimestamp\n";
	}

	$count = 0;
	// Lowest and highest revision ID seen; they delimit the archive scan below.
	$firstRevId = 0;
	$lastRevId = 0;
	// Pagination cursor: timestamp and ID of the last row of the previous batch.
	$lastTimestamp = $fromTimestamp;
	$cursorRevId = 0;
	$revisionRowsScanned = 0;
	$archiveRowsScanned = 0;

	$this->output( $msg );

	while ( $revisionRowsScanned < $total ) {
		$batchSize = min( $total - $revisionRowsScanned, $this->getBatchSize() );
		$revisions = $this->loadRevisionsByTimestamp(
			$cursorRevId, $lastTimestamp, $batchSize, $toTimestamp
		);
		if ( !$revisions ) {
			break;
		}

		foreach ( $revisions as $rev ) {
			// we are sorting by timestamp, so we may encounter revision IDs out of sequence
			$firstRevId = $firstRevId ? min( $firstRevId, $rev->getId() ) : $rev->getId();
			$lastRevId = max( $lastRevId, $rev->getId() );

			$count += $this->checkRevision( $rev );
		}

		// Continue after the last row of this batch. The cursor ID must belong to the
		// same row as the cursor timestamp. Pairing the timestamp with the highest ID
		// seen so far would skip unseen revisions that share this timestamp but have
		// a lower ID. Assumes the records come back in query order.
		$lastTimestamp = $rev->getTimestamp();
		$cursorRevId = $rev->getId();

		$batchSize = count( $revisions );
		$revisionRowsScanned += $batchSize;
		$this->output(
			"\t- Scanned a batch of $batchSize revisions, "
			. "up to revision $lastRevId ($lastTimestamp)\n"
		);

		$this->waitForReplication();
	}

	// NOTE: the archive table isn't indexed by timestamp, so the best we can do is use the
	// revision ID just before the first revision ID we found above as the starting point
	// of the scan, and scan up to one revision after the last revision ID we found above.
	// If $firstRevId is 0, the loop body above didn't execute,
	// so we should skip the one below as well.
	$fromArchived = $this->getNextRevision( $firstRevId, '<', 'DESC' );
	$maxArchived = $this->getNextRevision( $lastRevId, '>', 'ASC' );
	$maxArchived = $maxArchived ?: PHP_INT_MAX;

	$this->output( "Scanning archive table by ar_rev_id, $fromArchived to $maxArchived\n" );
	// Stop once the row budget is used up instead of requesting a zero-size batch.
	while ( $firstRevId > 0 && $fromArchived < $maxArchived && $archiveRowsScanned < $total ) {
		$batchSize = min( $total - $archiveRowsScanned, $this->getBatchSize() );
		$revisions = $this->loadArchiveByRevisionId( $fromArchived, $maxArchived, $batchSize );
		if ( !$revisions ) {
			break;
		}

		foreach ( $revisions as $rev ) {
			$count += $this->checkRevision( $rev );
		}
		// Rows are ordered by ar_rev_id, so the last record is the new lower bound.
		$fromArchived = $rev->getId();
		$batchSize = count( $revisions );
		$archiveRowsScanned += $batchSize;
		// NOTE(review): $lastTimestamp here is still the last timestamp of the
		// revision table scan, not a timestamp of the archived rows.
		$this->output(
			"\t- Scanned a batch of $batchSize archived revisions, "
			. "up to revision $fromArchived ($lastTimestamp)\n"
		);

		$this->waitForReplication();
	}

	return $count;
}
238
/**
 * Load one batch of revisions from the revision table, ordered by
 * rev_timestamp and rev_id, starting after the given timestamp/ID pair.
 *
 * @param int $afterId Revision ID component of the starting point
 * @param string $fromTimestamp Timestamp component of the starting point
 * @param int|float $batchSize Maximum number of rows to load
 * @param string|null $toTimestamp Exclusive upper timestamp bound; ignored if empty
 *
 * @return RevisionStoreRecord[] Loaded records, with empty entries removed
 */
private function loadRevisionsByTimestamp( int $afterId, string $fromTimestamp, $batchSize, $toTimestamp ) {
	$db = $this->getReplicaDB();

	$conditions = [
		$db->buildComparison( '>', [
			'rev_timestamp' => $fromTimestamp,
			'rev_id' => $afterId,
		] ),
	];
	if ( $toTimestamp ) {
		$conditions[] = $db->expr( 'rev_timestamp', '<', $toTimestamp );
	}

	$rows = $this->revisionStore->newSelectQueryBuilder( $db )
		->joinComment()
		->where( $conditions )
		->useIndex( [ 'revision' => 'rev_timestamp' ] )
		->orderBy( [ 'rev_timestamp', 'rev_id' ] )
		->limit( $batchSize )
		->caller( __METHOD__ )
		->fetchResultSet();

	$status = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );
	$this->handleStatus( $status );

	// Drop entries for which no record could be constructed.
	$records = array_filter( $status->value );

	'@phan-var RevisionStoreRecord[] $records';
	return $records;
}
272
/**
 * Load one batch of archived revisions with $afterId < ar_rev_id <= $uptoId,
 * ordered by ar_rev_id.
 *
 * @param int $afterId Exclusive lower bound for ar_rev_id
 * @param int $uptoId Inclusive upper bound for ar_rev_id
 * @param int|float $batchSize Maximum number of rows to load
 *
 * @return RevisionArchiveRecord[] Loaded records, with empty entries removed
 */
private function loadArchiveByRevisionId( int $afterId, int $uptoId, $batchSize ) {
	$db = $this->getReplicaDB();

	$idRange = [
		$db->expr( 'ar_rev_id', '>', $afterId ),
		$db->expr( 'ar_rev_id', '<=', $uptoId ),
	];

	$archiveQuery = $this->revisionStore->newArchiveSelectQueryBuilder( $db )
		->joinComment()
		->where( $idRange )
		->orderBy( 'ar_rev_id' )
		->limit( $batchSize )
		->caller( __METHOD__ );
	$rows = $archiveQuery->fetchResultSet();

	$loadOptions = [ 'archive' => true, 'slots' => true ];
	$status = $this->revisionStore->newRevisionsFromBatch( $rows, $loadOptions );
	$this->handleStatus( $status );

	// Drop entries for which no record could be constructed.
	$records = array_filter( $status->value );

	'@phan-var RevisionArchiveRecord[] $records';
	return $records;
}
299
/**
 * Find the closest revision ID in the revision table on one side of $revId.
 *
 * @param int $revId Revision ID to compare against
 * @param string $comp Comparison operator applied to rev_id, e.g. '<' or '>'
 * @param string $dir Sort direction, 'ASC' or 'DESC'
 *
 * @return int The first rev_id matching the comparison in the given order,
 *  or 0 if the query yields no value
 */
private function getNextRevision( int $revId, string $comp, string $dir ) {
	$db = $this->getReplicaDB();
	$next = $db->newSelectQueryBuilder()
		->select( 'rev_id' )
		->from( 'revision' )
		// Build the condition as an expression object, like the other queries
		// in this class, instead of interpolating into a raw SQL string.
		->where( $db->expr( 'rev_id', $comp, $revId ) )
		->orderBy( [ "rev_id" ], $dir )
		->caller( __METHOD__ )
		->fetchField();
	return (int)$next;
}
320
/**
 * Check the revisions with the given IDs, one batch at a time.
 *
 * @param int[] $ids Revision IDs to check
 *
 * @return int Number of bad blobs found (checkRevision() counts one per bad slot)
 */
private function scanRevisionsById( array $ids ) {
	$badBlobs = 0;
	$idCount = count( $ids );

	$this->output( "Scanning $idCount ids\n" );

	$batches = array_chunk( $ids, $this->getBatchSize() );
	foreach ( $batches as $batchIds ) {
		$records = $this->loadRevisionsById( $batchIds );

		// A batch that yielded no records produces no progress line.
		if ( $records ) {
			foreach ( $records as $record ) {
				$badBlobs += $this->checkRevision( $record );
			}

			$loaded = count( $records );
			$this->output( "\t- Scanned a batch of $loaded revisions\n" );
		}
	}

	return $badBlobs;
}
350
/**
 * Load the revisions with the given IDs from the revision table, falling back
 * to the archive table for any ID that was not found there.
 *
 * @param int[] $ids Revision IDs to load; must not be empty
 *
 * @return RevisionRecord[] Loaded records; the array_keys() lookup below relies
 *  on them being keyed by revision ID
 */
private function loadRevisionsById( array $ids ) {
	$db = $this->getReplicaDB();
	$queryBuilder = $this->revisionStore->newSelectQueryBuilder( $db );

	$rows = $queryBuilder
		->joinComment()
		->where( [ 'rev_id' => $ids ] )
		->caller( __METHOD__ )->fetchResultSet();

	$result = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );

	$this->handleStatus( $result );

	$revisions = array_filter( $result->value );
	'@phan-var RevisionArchiveRecord[] $revisions';

	// If not all revisions were found, check the archive table.
	// Decide based on the IDs that are actually missing rather than on a count
	// comparison: with duplicate IDs in the input the counts differ although
	// nothing is missing, and an empty ID list must not reach the query.
	$missingIds = array_diff( $ids, array_keys( $revisions ) );
	if ( $missingIds ) {
		$rows = $this->revisionStore->newArchiveSelectQueryBuilder( $db )
			->joinComment()
			->where( [ 'ar_rev_id' => $missingIds ] )
			->caller( __METHOD__ )->fetchResultSet();

		$archiveResult = $this->revisionStore->newRevisionsFromBatch(
			$rows,
			[ 'slots' => true, 'archive' => true ]
		);

		$this->handleStatus( $archiveResult );

		// don't use array_merge, since it will re-index
		$revisions += array_filter( $archiveResult->value );
	}

	return $revisions;
}
392
/**
 * Check every slot of the given revision for a bad blob.
 *
 * @param RevisionRecord $rev
 *
 * @return int Number of slots of this revision that have a bad blob
 */
private function checkRevision( RevisionRecord $rev ) {
	$badSlots = 0;
	$slots = $rev->getSlots()->getSlots();
	foreach ( $slots as $slotRecord ) {
		$badSlots += $this->checkSlot( $rev, $slotRecord );
	}

	if ( $badSlots !== 0 || !$this->hasOption( 'mark' ) ) {
		return $badSlots;
	}

	// In --mark mode, report revisions that turned out to have nothing to mark.
	$this->output( "\t# No bad blob found on revision {$rev->getId()}, skipped!\n" );

	return $badSlots;
}
410
/**
 * Check whether the blob of the given slot can be loaded and is valid UTF-8.
 * Bad blobs are reported, and marked as bad if --mark is set.
 *
 * @param RevisionRecord $rev Revision the slot belongs to; used for reporting
 * @param SlotRecord $slot Slot to check
 *
 * @return int 1 if the blob is bad, 0 otherwise
 */
private function checkSlot( RevisionRecord $rev, SlotRecord $slot ) {
	$address = $slot->getAddress();

	try {
		$blob = $this->blobStore->getBlob( $address );
		// Name the encoding explicitly: without it the check depends on the
		// configured default encoding, while the report below says UTF-8.
		if ( mb_check_encoding( $blob, 'UTF-8' ) ) {
			// nothing to do
			return 0;
		}

		$type = 'invalid-utf-8';
		$error = 'Invalid UTF-8';
	} catch ( Exception $ex ) {
		// A blob that cannot be loaded is reported with the exception's details.
		$error = $ex->getMessage();
		$type = get_class( $ex );
	}

	// NOTE: output the revision ID again at the end in a separate column for easy processing
	// via the "cut" shell command.
	$this->output( "\t! Found bad blob on revision {$rev->getId()} "
		. "from {$rev->getTimestamp()} ({$slot->getRole()} slot): "
		. "content_id={$slot->getContentId()}, address=<{$slot->getAddress()}>, "
		. "error='$error', type='$type'. ID:\t{$rev->getId()}\n" );

	if ( $this->hasOption( 'mark' ) ) {
		$newAddress = $this->markBlob( $slot, $error );
		$this->output( "\tChanged address to <$newAddress>\n" );
	}

	return 1;
}
448
/**
 * Mark the blob of the given slot as bad by replacing its content address
 * with a "bad:" address that embeds the old address, the --mark reason
 * and the error, if any.
 *
 * @param SlotRecord $slot Slot whose content row is updated
 * @param string|null $error Error text to record in the new address
 *
 * @return string The new address written to the content table
 */
private function markBlob( SlotRecord $slot, ?string $error = null ) {
	$params = [];
	if ( $this->hasOption( 'mark' ) ) {
		$params['reason'] = $this->getOption( 'mark' );
	}
	if ( $error ) {
		$params['error'] = $error;
	}

	$oldAddress = $slot->getAddress();
	if ( !$oldAddress ) {
		$oldAddress = 'empty';
	}

	$markedAddress = 'bad:' . urlencode( $oldAddress );
	if ( $params ) {
		$markedAddress .= '?' . wfArrayToCgi( $params );
	}

	// Keep the address within 255 bytes.
	$markedAddress = substr( $markedAddress, 0, 255 );

	$update = $this->getPrimaryDB()->newUpdateQueryBuilder()
		->update( 'content' )
		->set( [ 'content_address' => $markedAddress ] )
		->where( [ 'content_id' => $slot->getContentId() ] )
		->caller( __METHOD__ );
	$update->execute();

	return $markedAddress;
}
484
/**
 * Report problems recorded in the given status: a status that is not OK
 * is passed to fatalError(), one that is OK but not good is passed to error().
 *
 * @param StatusValue $status
 */
private function handleStatus( StatusValue $status ) {
	if ( !$status->isOK() ) {
		$this->fatalError( $status );
	}

	if ( $status->isGood() ) {
		return;
	}

	$this->error( $status );
}
493
494}
495
496// @codeCoverageIgnoreStart
497$maintClass = FindBadBlobs::class;
498require_once RUN_MAINTENANCE_IF_MAIN;
499// @codeCoverageIgnoreEnd
wfTimestamp( $outputtype=TS::UNIX, $ts=0)
Get a timestamp string in one of various formats.
wfArrayToCgi( $array1, $array2=null, $prefix='')
This function takes one or two arrays as input, and returns a CGI-style string, e....
Maintenance script for finding and marking bad content blobs.
__construct()
Default constructor.
execute()
Do the actual work. All child classes will need to implement this. Returns bool|null|void: true for success,...
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
addDescription( $text)
Set the description text.
A RevisionRecord representing a revision of a deleted page persisted in the archive table.
Page revision base class.
getSlots()
Returns the slots defined for this revision.
A RevisionRecord representing an existing revision persisted in the revision table.
Service for looking up page revisions.
Value object representing a content slot associated with a page revision.
getAddress()
Returns the address of this slot's content.
getContentId()
Returns the ID of the content meta data row associated with the slot.
Generic operation result class Has warning/error list, boolean status and arbitrary value.
isOK()
Returns whether the operation completed.
isGood()
Returns whether the operation completed and didn't have any error or warnings.
$maintClass
Service for loading and storing data blobs.
Definition BlobStore.php:19