MediaWiki master
findBadBlobs.php
Go to the documentation of this file.
1<?php
15
16// @codeCoverageIgnoreStart
17require_once __DIR__ . '/Maintenance.php';
18// @codeCoverageIgnoreEnd
19
26
27 private RevisionStore $revisionStore;
28 private BlobStore $blobStore;
29
/**
 * Set the default batch size and register the description and the
 * command line options of this script.
 */
public function __construct() {
	parent::__construct();

	// Help text shared by the two date range options.
	$timestampFormatHelp =
		'Format: Anything supported by MediaWiki, e.g. YYYYMMDDHHMMSS or YYYY-MM-DDTHH:MM:SS';

	$this->setBatchSize( 1000 );

	$this->addDescription(
		'Find and mark bad content blobs. Marked blobs will be read as empty. ' .
		'Use --scan-from to find revisions with bad blobs, use --mark to mark them.'
	);

	// All options are optional and take an argument.
	$this->addOption(
		'scan-from',
		'Start scanning revisions at the given date. ' . $timestampFormatHelp,
		false,
		true
	);
	$this->addOption(
		'scan-to',
		'End of scan date range. ' . $timestampFormatHelp,
		false,
		true
	);
	$this->addOption(
		'revisions',
		'A list of revision IDs to process, separated by comma or ' .
			'colon or whitespace. Revisions belonging to deleted pages will work. ' .
			'If set to "-" IDs are read from stdin, one per line.',
		false,
		true
	);
	$this->addOption(
		'limit',
		'Maximum number of revisions for --scan-from to scan. ' .
			'Default: 1000',
		false,
		true
	);
	$this->addOption(
		'mark',
		'Mark the blob as "known bad", to avoid errors when ' .
			'attempting to read it. The value given is the reason for marking the blob as bad, ' .
			'typically a ticket ID. Requires --revisions to also be set.',
		false,
		true
	);
}
51
/**
 * Read the --scan-from option as a TS_MW timestamp.
 *
 * Terminates the script via fatalError() if the value is not usable.
 *
 * @return string
 */
private function getStartTimestamp(): string {
	return $this->getTimestampFromOption( 'scan-from' );
}

/**
 * Read the --scan-to option as a TS_MW timestamp.
 *
 * Terminates the script via fatalError() if the value is not usable.
 *
 * @return string
 */
private function getEndTimestamp(): string {
	return $this->getTimestampFromOption( 'scan-to' );
}

/**
 * Validate a timestamp given on the command line and convert it to TS_MW.
 *
 * @param string $name Name of the option holding the timestamp
 * @return string
 */
private function getTimestampFromOption( string $name ): string {
	// Cast so that a missing option is reported as a bad timestamp
	// instead of handing null to strlen().
	$tsOpt = (string)$this->getOption( $name, '' );

	// Anything shorter than 14 characters cannot spell out date and time
	// down to the second.
	if ( strlen( $tsOpt ) < 14 ) {
		$this->fatalError( 'Bad timestamp: ' . $tsOpt
			. ', please provide time and date down to the second.' );
	}

	$ts = wfTimestamp( TS_MW, $tsOpt );
	if ( !$ts ) {
		$this->fatalError( 'Bad timestamp: ' . $tsOpt );
	}

	return $ts;
}
84
/**
 * Get the revision IDs requested via --revisions.
 *
 * The value "-" means that the list is read from standard input instead
 * of the option value itself.
 *
 * @return array The result of parseIntList(), or an empty array if
 *  standard input provided nothing.
 */
private function getRevisionIds() {
	$list = $this->getOption( 'revisions' );

	if ( $list !== '-' ) {
		return $this->parseIntList( $list );
	}

	$input = stream_get_contents( STDIN );

	return $input ? $this->parseIntList( $input ) : [];
}
101
/**
 * Run the script: validate the combination of options, scan either the
 * revisions given by --revisions or the range starting at --scan-from,
 * and report how many bad revisions were found or marked.
 */
public function execute() {
	$services = $this->getServiceContainer();
	$this->revisionStore = $services->getRevisionStore();
	$this->blobStore = $services->getBlobStore();

	$marking = $this->hasOption( 'mark' );
	$scanning = $this->hasOption( 'scan-from' );

	if ( $this->hasOption( 'revisions' ) ) {
		// Explicit revision IDs exclude any kind of range scan.
		if ( $scanning || $this->hasOption( 'scan-to' ) ) {
			$this->fatalError( 'Cannot use --revisions together with --scan-from or --scan-to' );
		}

		$count = $this->scanRevisionsById( $this->getRevisionIds() );
	} elseif ( !$scanning ) {
		// Neither mode was selected; pick the more helpful of the two messages.
		$this->fatalError(
			$marking
				? 'The --mark must be used together with --revisions'
				: 'Must specify one of --revisions or --scan-from'
		);
	} else {
		if ( $marking ) {
			$this->fatalError( 'Cannot use --mark with --scan-from, '
				. 'use --revisions to specify revisions to mark.' );
		}

		if ( $this->hasOption( 'scan-to' ) && $this->hasOption( 'limit' ) ) {
			$this->fatalError( 'Cannot use --limit with --scan-to' );
		}

		$count = $this->scanRevisionsByTimestamp();
		$this->output( "The range of archive rows scanned is based on the range of revision IDs "
			. "scanned in the revision table.\n" );
	}

	if ( $marking ) {
		$this->output( "Marked $count bad revisions.\n" );
		return;
	}

	$this->output( "Found $count bad revisions.\n" );

	if ( $count > 0 ) {
		// Hint at how to turn the report into input for --revisions.
		$this->output( "On a unix/linux environment, you can use grep and cut to list of IDs\n" );
		$this->output( "that can then be used with the --revisions option. E.g.\n" );
		$this->output( " grep '! Found bad blob' | cut -s -f 3\n" );
	}
}
151
/**
 * Scan the revision table in (rev_timestamp, rev_id) order, starting at
 * --scan-from, then scan the archive rows whose ar_rev_id lies in the range
 * of revision IDs covered by the first scan.
 *
 * Without --scan-to, at most --limit (default 1000) rows are scanned per
 * table. With --scan-to, the scan is bounded by the end timestamp instead.
 *
 * @return int The sum of the checkRevision() results for all scanned rows
 */
private function scanRevisionsByTimestamp() {
	$fromTimestamp = $this->getStartTimestamp();
	$toTimestamp = $this->getOption( 'scan-to' ) ? $this->getEndTimestamp() : null;

	// Option values arrive as strings; the value is used in arithmetic below.
	$total = (int)$this->getOption( 'limit', 1000 );
	// The row limit only applies to scans that are not bounded by --scan-to.
	$limited = $toTimestamp === null;

	$count = 0;
	$lastRevId = 0;
	$firstRevId = 0;
	// ID of the last row of the previous batch. Together with $lastTimestamp
	// this forms the pagination cursor.
	$cursorRevId = 0;
	$lastTimestamp = $fromTimestamp;
	$revisionRowsScanned = 0;
	$archiveRowsScanned = 0;

	$this->output( "Scanning revisions table, "
		. "$total rows starting at rev_timestamp $fromTimestamp\n" );

	while ( !$limited || $revisionRowsScanned < $total ) {
		// Only cap the batch by the remaining row budget when the limit applies.
		// Otherwise the batch size would drop to zero after $total rows and
		// end a --scan-to scan before the end timestamp is reached.
		$batchSize = $limited
			? min( $total - $revisionRowsScanned, $this->getBatchSize() )
			: $this->getBatchSize();
		$revisions = $this->loadRevisionsByTimestamp(
			$cursorRevId,
			$lastTimestamp,
			$batchSize,
			$toTimestamp
		);
		if ( !$revisions ) {
			break;
		}

		foreach ( $revisions as $rev ) {
			// we are sorting by timestamp, so we may encounter revision IDs out of sequence
			$firstRevId = $firstRevId ? min( $firstRevId, $rev->getId() ) : $rev->getId();
			$lastRevId = max( $lastRevId, $rev->getId() );

			$count += $this->checkRevision( $rev );
		}

		// Continue after the last row of this batch. The cursor must use that
		// row's own ID, not the highest ID seen so far: pairing the last
		// timestamp with a higher ID from an earlier row would skip rows that
		// share the last timestamp.
		$cursorRevId = $rev->getId();
		$lastTimestamp = $rev->getTimestamp();
		$batchSize = count( $revisions );
		$revisionRowsScanned += $batchSize;
		$this->output(
			"\t- Scanned a batch of $batchSize revisions, "
			. "up to revision $lastRevId ($lastTimestamp)\n"
		);

		$this->waitForReplication();
	}

	// NOTE: the archive table isn't indexed by timestamp, so the best we can do is use the
	// revision ID just before the first revision ID we found above as the starting point
	// of the scan, and scan up to one revision after the last revision ID we found above.
	// If $firstRevId is 0, the loop body above didn't execute,
	// so we should skip the one below as well.
	$fromArchived = $this->getNextRevision( $firstRevId, '<', 'DESC' );
	$maxArchived = $this->getNextRevision( $lastRevId, '>', 'ASC' );
	$maxArchived = $maxArchived ?: PHP_INT_MAX;

	$this->output( "Scanning archive table by ar_rev_id, $fromArchived to $maxArchived\n" );
	while (
		$firstRevId > 0
		&& $fromArchived < $maxArchived
		&& ( !$limited || $archiveRowsScanned < $total )
	) {
		$batchSize = $limited
			? min( $total - $archiveRowsScanned, $this->getBatchSize() )
			: $this->getBatchSize();
		$revisions = $this->loadArchiveByRevisionId( $fromArchived, $maxArchived, $batchSize );
		if ( !$revisions ) {
			break;
		}

		foreach ( $revisions as $rev ) {
			$count += $this->checkRevision( $rev );
		}

		// Rows are ordered by ar_rev_id, so the last row carries the highest ID.
		$fromArchived = $rev->getId();
		$batchSize = count( $revisions );
		$archiveRowsScanned += $batchSize;
		$this->output(
			"\t- Scanned a batch of $batchSize archived revisions, "
			. "up to revision $fromArchived ($lastTimestamp)\n"
		);

		$this->waitForReplication();
	}

	return $count;
}
233
/**
 * Load one batch of revisions from the revision table, ordered by
 * rev_timestamp and rev_id, starting after the given cursor position.
 *
 * @param int $afterId Revision ID part of the cursor
 * @param string $fromTimestamp Timestamp part of the cursor
 * @param int $batchSize Maximum number of rows to load
 * @param string|null $toTimestamp If set, only rows with a rev_timestamp
 *  before this value are loaded
 * @return array The non-empty entries of the newRevisionsFromBatch() result
 */
private function loadRevisionsByTimestamp( int $afterId, string $fromTimestamp, $batchSize, $toTimestamp ) {
	$dbr = $this->getReplicaDB();

	// Resume after the (timestamp, id) pair of the cursor.
	$conditions = [
		$dbr->buildComparison( '>', [
			'rev_timestamp' => $fromTimestamp,
			'rev_id' => $afterId,
		] ),
	];
	if ( $toTimestamp ) {
		$conditions[] = $dbr->expr( 'rev_timestamp', '<', $toTimestamp );
	}

	$rows = $this->revisionStore->newSelectQueryBuilder( $dbr )
		->joinComment()
		->where( $conditions )
		->useIndex( [ 'revision' => 'rev_timestamp' ] )
		->orderBy( [ 'rev_timestamp', 'rev_id' ] )
		->limit( $batchSize )
		->caller( __METHOD__ )
		->fetchResultSet();

	$status = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );
	$this->handleStatus( $status );

	// Drop empty entries from the batch result.
	$records = array_filter( $status->value );

	'@phan-var RevisionStoreRecord[] $records';
	return $records;
}
267
/**
 * Load one batch of archived revisions with $afterId < ar_rev_id <= $uptoId,
 * ordered by ar_rev_id.
 *
 * @param int $afterId Exclusive lower bound for ar_rev_id
 * @param int $uptoId Inclusive upper bound for ar_rev_id
 * @param int $batchSize Maximum number of rows to load
 * @return array The non-empty entries of the newRevisionsFromBatch() result
 */
private function loadArchiveByRevisionId( int $afterId, int $uptoId, $batchSize ) {
	$dbr = $this->getReplicaDB();

	$idRange = [
		$dbr->expr( 'ar_rev_id', '>', $afterId ),
		$dbr->expr( 'ar_rev_id', '<=', $uptoId ),
	];

	$query = $this->revisionStore->newArchiveSelectQueryBuilder( $dbr )
		->joinComment()
		->where( $idRange )
		->orderBy( 'ar_rev_id' )
		->limit( $batchSize )
		->caller( __METHOD__ );

	$status = $this->revisionStore->newRevisionsFromBatch(
		$query->fetchResultSet(),
		[ 'archive' => true, 'slots' => true ]
	);
	$this->handleStatus( $status );

	// Drop empty entries from the batch result.
	$records = array_filter( $status->value );

	'@phan-var RevisionArchiveRecord[] $records';
	return $records;
}
294
/**
 * Find the revision ID adjacent to the given one in the revision table.
 *
 * @param int $revId The revision ID to start from
 * @param string $comp Comparison operator applied to rev_id, e.g. '<' or '>'
 * @param string $dir Sort direction, 'ASC' or 'DESC'
 * @return int The first rev_id matching the comparison in the given
 *  direction, or 0 if the query yields nothing
 */
private function getNextRevision( int $revId, string $comp, string $dir ) {
	$db = $this->getReplicaDB();
	$next = $db->newSelectQueryBuilder()
		->select( 'rev_id' )
		->from( 'revision' )
		// Build the condition with expr(), like the other queries in this
		// class, instead of interpolating raw SQL.
		->where( $db->expr( 'rev_id', $comp, $revId ) )
		->orderBy( [ "rev_id" ], $dir )
		->caller( __METHOD__ )
		->fetchField();

	// A failed lookup is cast to 0, which callers treat as "no such revision".
	return (int)$next;
}
315
/**
 * Check the revisions with the given IDs, one batch at a time.
 *
 * @param array $ids Revision IDs to check
 * @return int The sum of the checkRevision() results
 */
private function scanRevisionsById( array $ids ) {
	$idCount = count( $ids );
	$this->output( "Scanning $idCount ids\n" );

	$badCount = 0;
	$batches = array_chunk( $ids, $this->getBatchSize() );

	foreach ( $batches as $batchIds ) {
		$revisions = $this->loadRevisionsById( $batchIds );

		// Batches for which nothing could be loaded are passed over silently.
		if ( $revisions ) {
			foreach ( $revisions as $revision ) {
				$badCount += $this->checkRevision( $revision );
			}

			$loaded = count( $revisions );
			$this->output( "\t- Scanned a batch of $loaded revisions\n" );
		}
	}

	return $badCount;
}
345
/**
 * Load the revisions with the given IDs from the revision table, falling
 * back to the archive table for IDs that were not found there.
 *
 * @param array $ids Revision IDs to load
 * @return array The loaded revision records; the code below relies on the
 *  newRevisionsFromBatch() result being keyed by revision ID
 */
private function loadRevisionsById( array $ids ) {
	$db = $this->getReplicaDB();
	$queryBuilder = $this->revisionStore->newSelectQueryBuilder( $db );

	$rows = $queryBuilder
		->joinComment()
		->where( [ 'rev_id' => $ids ] )
		->caller( __METHOD__ )->fetchResultSet();

	$result = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );

	$this->handleStatus( $result );

	$revisions = array_filter( $result->value );
	'@phan-var RevisionArchiveRecord[] $revisions';

	// If not all revisions were found, check the archive table.
	// Decide based on the IDs that are actually missing rather than on counts:
	// with duplicate IDs in the input the counts differ even though nothing is
	// missing, and an empty ID list must not be used as a query condition.
	$missingIds = array_values( array_diff( $ids, array_keys( $revisions ) ) );
	if ( $missingIds ) {
		$rows = $this->revisionStore->newArchiveSelectQueryBuilder( $db )
			->joinComment()
			->where( [ 'ar_rev_id' => $missingIds ] )
			->caller( __METHOD__ )->fetchResultSet();

		$archiveResult = $this->revisionStore->newRevisionsFromBatch(
			$rows,
			[ 'slots' => true, 'archive' => true ]
		);

		$this->handleStatus( $archiveResult );

		// don't use array_merge, since it will re-index
		$revisions += array_filter( $archiveResult->value );
	}

	return $revisions;
}
387
/**
 * Check every slot of the given revision.
 *
 * @param RevisionRecord $rev
 * @return int The sum of the checkSlot() results for all slots
 */
private function checkRevision( RevisionRecord $rev ) {
	$badSlots = 0;
	$slots = $rev->getSlots()->getSlots();

	foreach ( $slots as $slot ) {
		$badSlots += $this->checkSlot( $rev, $slot );
	}

	if ( $badSlots !== 0 ) {
		return $badSlots;
	}

	// When marking, tell the operator that this revision was left alone.
	if ( $this->hasOption( 'mark' ) ) {
		$this->output( "\t# No bad blob found on revision {$rev->getId()}, skipped!\n" );
	}

	return $badSlots;
}
405
/**
 * Check whether the blob of the given slot can be loaded and is valid UTF-8.
 * Bad blobs are reported and, if --mark is set, marked as bad.
 *
 * @param RevisionRecord $rev The revision the slot belongs to
 * @param SlotRecord $slot The slot to check
 * @return int 1 if the blob is bad, 0 otherwise
 */
private function checkSlot( RevisionRecord $rev, SlotRecord $slot ) {
	$address = $slot->getAddress();

	try {
		$blob = $this->blobStore->getBlob( $address );
		// Name the encoding explicitly: without the second argument the check
		// runs against the mbstring internal encoding, which is configurable,
		// while the error reported below specifically says UTF-8.
		if ( mb_check_encoding( $blob, 'UTF-8' ) ) {
			// nothing to do
			return 0;
		}

		$type = 'invalid-utf-8';
		$error = 'Invalid UTF-8';
	} catch ( Exception $ex ) {
		// Loading failed; report the exception class and message.
		$error = $ex->getMessage();
		$type = get_class( $ex );
	}

	// NOTE: output the revision ID again at the end in a separate column for easy processing
	// via the "cut" shell command.
	$this->output( "\t! Found bad blob on revision {$rev->getId()} "
		. "from {$rev->getTimestamp()} ({$slot->getRole()} slot): "
		. "content_id={$slot->getContentId()}, address=<{$address}>, "
		. "error='$error', type='$type'. ID:\t{$rev->getId()}\n" );

	if ( $this->hasOption( 'mark' ) ) {
		$newAddress = $this->markBlob( $slot, $error );
		$this->output( "\tChanged address to <$newAddress>\n" );
	}

	return 1;
}
443
/**
 * Mark the content row of the given slot as bad by replacing its address
 * with a "bad:" address that embeds the original address, the reason given
 * via --mark and the error, if any.
 *
 * @param SlotRecord $slot The slot whose content row is updated
 * @param string|null $error Error to record in the new address
 * @return string The new address written to the content table
 */
private function markBlob( SlotRecord $slot, ?string $error = null ) {
	$params = [];
	if ( $this->hasOption( 'mark' ) ) {
		$params['reason'] = $this->getOption( 'mark' );
	}
	if ( $error ) {
		$params['error'] = $error;
	}

	$originalAddress = $slot->getAddress() ?: 'empty';
	$queryString = $params ? '?' . wfArrayToCgi( $params ) : '';

	// The resulting address is cut off at 255 bytes.
	$badAddress = substr( 'bad:' . urlencode( $originalAddress ) . $queryString, 0, 255 );

	$this->getPrimaryDB()
		->newUpdateQueryBuilder()
		->update( 'content' )
		->set( [ 'content_address' => $badAddress ] )
		->where( [ 'content_id' => $slot->getContentId() ] )
		->caller( __METHOD__ )
		->execute();

	return $badAddress;
}
479
/**
 * Report problems recorded in the given status: a status that is not OK
 * is passed to fatalError(), a status that is not good is passed to error().
 *
 * @param StatusValue $status
 */
private function handleStatus( StatusValue $status ) {
	$failed = !$status->isOK();
	if ( $failed ) {
		$this->fatalError( $status );
	}

	if ( $status->isGood() ) {
		return;
	}

	$this->error( $status );
}
488
489}
490
491// @codeCoverageIgnoreStart
492$maintClass = FindBadBlobs::class;
493require_once RUN_MAINTENANCE_IF_MAIN;
494// @codeCoverageIgnoreEnd
wfArrayToCgi( $array1, $array2=null, $prefix='')
This function takes one or two arrays as input, and returns a CGI-style string, e....
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Maintenance script for finding and marking bad content blobs.
__construct()
Default constructor.
execute()
Do the actual work. All child classes will need to implement this. Returns bool|null|void: true for success,...
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
addDescription( $text)
Set the description text.
A RevisionRecord representing a revision of a deleted page persisted in the archive table.
Page revision base class.
getSlots()
Returns the slots defined for this revision.
A RevisionRecord representing an existing revision persisted in the revision table.
Service for looking up page revisions.
Value object representing a content slot associated with a page revision.
getAddress()
Returns the address of this slot's content.
getContentId()
Returns the ID of the content meta data row associated with the slot.
Generic operation result class Has warning/error list, boolean status and arbitrary value.
isOK()
Returns whether the operation completed.
isGood()
Returns whether the operation completed and didn't have any error or warnings.
$maintClass
Service for loading and storing data blobs.
Definition BlobStore.php:19