MediaWiki REL1_37
findBadBlobs.php
Go to the documentation of this file.
1<?php
31
32require_once __DIR__ . '/Maintenance.php';
33
40
/**
 * @var RevisionStore|null Revision lookup service; assigned by initializeServices().
 */
private $revisionStore;

/**
 * @var BlobStore|null Blob storage service used to probe slot content; assigned by
 *  initializeServices().
 */
private $blobStore;

/**
 * @var LoadBalancer|null Load balancer providing the replica and primary connections
 *  used by this script; assigned by initializeServices().
 */
private $loadBalancer;

/**
 * @var LBFactory Load balancer factory; assigned by initializeServices().
 */
private $lbFactory;
60
/**
 * Set up the default batch size, the script description and the supported options.
 */
public function __construct() {
	parent::__construct();

	$this->setBatchSize( 1000 );

	$description = 'Find and mark bad content blobs. Marked blobs will be read as empty. '
		. 'Use --scan-from to find revisions with bad blobs, use --mark to mark them.';
	$this->addDescription( $description );

	// Option name => help text. Every option is optional and takes an argument.
	// The registration order below is kept as-is.
	$optionHelp = [
		'scan-from' => 'Start scanning revisions at the given date. '
			. 'Format: Anything supported by MediaWiki, e.g. YYYYMMDDHHMMSS or YYYY-MM-DDTHH:MM:SS',
		'revisions' => 'A list of revision IDs to process, separated by comma or '
			. 'colon or whitespace. Revisions belonging to deleted pages will work. '
			. 'If set to "-" IDs are read from stdin, one per line.',
		'limit' => 'Maximum number of revisions for --scan-from to scan. '
			. 'Default: 1000',
		'mark' => 'Mark the blob as "known bad", to avoid errors when '
			. 'attempting to read it. The value given is the reason for marking the blob as bad, '
			. 'typically a ticket ID. Requires --revisions to also be set.',
	];

	foreach ( $optionHelp as $optionName => $helpText ) {
		$this->addOption( $optionName, $helpText, false, true );
	}
}
79
/**
 * Set the services used by this script.
 *
 * For each service, an explicitly passed instance wins, then a previously
 * initialized instance, and finally the instance from MediaWikiServices.
 *
 * @param RevisionStore|null $revisionStore
 * @param BlobStore|null $blobStore
 * @param LoadBalancer|null $loadBalancer
 * @param LBFactory|null $lbFactory
 */
public function initializeServices(
	?RevisionStore $revisionStore = null,
	?BlobStore $blobStore = null,
	?LoadBalancer $loadBalancer = null,
	?LBFactory $lbFactory = null
) {
	$services = MediaWikiServices::getInstance();

	$this->revisionStore = $revisionStore ?? $this->revisionStore ?? $services->getRevisionStore();
	$this->blobStore = $blobStore ?? $this->blobStore ?? $services->getBlobStore();
	$this->loadBalancer = $loadBalancer ?? $this->loadBalancer ?? $services->getDBLoadBalancer();
	$this->lbFactory = $lbFactory ?? $this->lbFactory ?? $services->getDBLoadBalancerFactory();
}
93
/**
 * Read and validate the --scan-from option.
 *
 * Terminates the script via fatalError() if the value is shorter than
 * 14 characters or cannot be converted by wfTimestamp().
 *
 * @return string The --scan-from value converted with wfTimestamp( TS_MW, ... )
 */
private function getStartTimestamp() {
	$requested = $this->getOption( 'scan-from' );

	// Anything shorter than 14 characters cannot carry a date and time down to the second.
	if ( strlen( $requested ) < 14 ) {
		$this->fatalError(
			"Bad timestamp: {$requested}, please provide time and date down to the second."
		);
	}

	$normalized = wfTimestamp( TS_MW, $requested );
	if ( !$normalized ) {
		$this->fatalError( "Bad timestamp: {$requested}" );
	}

	return $normalized;
}
111
/**
 * Get the revision IDs given via --revisions.
 *
 * If the option value is "-", the list is read from stdin instead.
 *
 * @return int[] The parsed list; empty if stdin yielded nothing
 */
private function getRevisionIds() {
	$listText = $this->getOption( 'revisions' );

	if ( $listText !== '-' ) {
		return $this->parseIntList( $listText );
	}

	$listText = stream_get_contents( STDIN );

	return $listText ? $this->parseIntList( $listText ) : [];
}
128
/**
 * Entry point: validate the option combination, run the requested scan
 * and print a summary.
 *
 * --revisions and --scan-from are mutually exclusive, and --mark is only
 * accepted together with --revisions.
 */
public function execute() {
	$this->initializeServices();

	$byId = $this->hasOption( 'revisions' );
	$byTimestamp = $this->hasOption( 'scan-from' );
	$marking = $this->hasOption( 'mark' );

	if ( $byId ) {
		if ( $byTimestamp ) {
			$this->fatalError( 'Cannot use --revisions together with --scan-from' );
		}

		$count = $this->scanRevisionsById( $this->getRevisionIds() );
	} elseif ( $byTimestamp ) {
		if ( $marking ) {
			$this->fatalError( 'Cannot use --mark with --scan-from, '
				. 'use --revisions to specify revisions to mark.' );
		}

		$count = $this->scanRevisionsByTimestamp(
			$this->getStartTimestamp(),
			$this->getOption( 'limit', 1000 )
		);

		$this->output( "The range of archive rows scanned is based on the range of revision IDs "
			. "scanned in the revision table.\n" );
	} else {
		// Neither scan mode was requested; pick the more helpful of the two messages.
		$this->fatalError(
			$marking
				? 'The --mark must be used together with --revisions'
				: 'Must specify one of --revisions or --scan-from'
		);
	}

	if ( $marking ) {
		$this->output( "Marked $count bad revisions.\n" );
		return;
	}

	$this->output( "Found $count bad revisions.\n" );

	if ( $count > 0 ) {
		// Hint at how to turn the report into input for a later --revisions run.
		$this->output( "On a unix/linux environment, you can use grep and cut to list of IDs\n" );
		$this->output( "that can then be used with the --revisions option. E.g.\n" );
		$this->output( " grep '! Found bad blob' | cut -s -f 3\n" );
	}
}
176
/**
 * Scan the revision table starting at the given timestamp, then scan the
 * archive table over the range of revision IDs encountered in the first pass.
 *
 * @param string $fromTimestamp Timestamp at which to start scanning
 * @param int|string $total Maximum number of rows to scan per table
 *  (execute() passes the raw --limit option value)
 *
 * @return int Sum of the checkRevision() results for all scanned revisions
 */
private function scanRevisionsByTimestamp( $fromTimestamp, $total ) {
	$count = 0;
	// Highest and lowest revision ID seen; they delimit the archive scan below.
	$lastRevId = 0;
	$firstRevId = 0;
	// Paging cursor: timestamp and ID of the last row of the previous batch.
	// The ID must belong to the SAME row as the timestamp. Using the highest ID
	// seen so far would skip rows that share the last timestamp but have a
	// lower ID than some earlier, out-of-sequence revision.
	$lastTimestamp = $fromTimestamp;
	$cursorRevId = 0;
	$revisionRowsScanned = 0;
	$archiveRowsScanned = 0;

	$this->output( "Scanning revisions table, "
		. "$total rows starting at rev_timestamp $fromTimestamp\n" );

	while ( $revisionRowsScanned < $total ) {
		$batchSize = min( $total - $revisionRowsScanned, $this->getBatchSize() );
		$revisions = $this->loadRevisionsByTimestamp( $cursorRevId, $lastTimestamp, $batchSize );
		if ( !$revisions ) {
			break;
		}

		foreach ( $revisions as $rev ) {
			// we are sorting by timestamp, so we may encounter revision IDs out of sequence
			$firstRevId = $firstRevId ? min( $firstRevId, $rev->getId() ) : $rev->getId();
			$lastRevId = max( $lastRevId, $rev->getId() );

			$count += $this->checkRevision( $rev );
		}

		// $rev is the last revision of the batch, which is ordered by (rev_timestamp, rev_id).
		$lastTimestamp = $rev->getTimestamp();
		$cursorRevId = (int)$rev->getId();
		$batchSize = count( $revisions );
		$revisionRowsScanned += $batchSize;
		$this->output(
			"\t- Scanned a batch of $batchSize revisions, "
			. "up to revision $lastRevId ($lastTimestamp)\n"
		);

		$this->waitForReplication();
	}

	// NOTE: the archive table isn't indexed by timestamp, so the best we can do is use the
	// revision ID just before the first revision ID we found above as the starting point
	// of the scan, and scan up to one revision after the last revision ID we found above.
	// If $firstRevId is 0, the loop body above didn't execute,
	// so we should skip the one below as well.
	$fromArchived = $this->getNextRevision( $firstRevId, '<', 'DESC' );
	$maxArchived = $this->getNextRevision( $lastRevId, '>', 'ASC' );
	$maxArchived = $maxArchived ?: PHP_INT_MAX;

	$this->output( "Scanning archive table by ar_rev_id, $fromArchived to $maxArchived\n" );
	// Stop explicitly once the row limit is reached, rather than relying on a
	// zero-sized batch coming back empty.
	while ( $firstRevId > 0 && $fromArchived < $maxArchived && $archiveRowsScanned < $total ) {
		$batchSize = min( $total - $archiveRowsScanned, $this->getBatchSize() );
		$revisions = $this->loadArchiveByRevisionId( $fromArchived, $maxArchived, $batchSize );
		if ( !$revisions ) {
			break;
		}

		foreach ( $revisions as $rev ) {
			$count += $this->checkRevision( $rev );
		}

		// Batches are ordered by ar_rev_id, so the last record carries the highest ID.
		$fromArchived = $rev->getId();
		$batchSize = count( $revisions );
		$archiveRowsScanned += $batchSize;
		// NOTE(review): $lastTimestamp still holds the value from the revision table
		// scan above; it is not the timestamp of the archived revision.
		$this->output(
			"\t- Scanned a batch of $batchSize archived revisions, "
			. "up to revision $fromArchived ($lastTimestamp)\n"
		);

		$this->waitForReplication();
	}

	return $count;
}
253
/**
 * Load one batch of revisions from the revision table, ordered by
 * (rev_timestamp, rev_id) and starting after the given position.
 *
 * @param int $afterId Only applies to rows whose timestamp equals $fromTimestamp:
 *  such rows are returned only if their rev_id is greater than this
 * @param string $fromTimestamp Rows with a later rev_timestamp are always returned
 * @param int $batchSize Value used for the query's LIMIT
 *
 * @return RevisionStoreRecord[] Records built by newRevisionsFromBatch(),
 *  with empty entries filtered out
 */
private function loadRevisionsByTimestamp( int $afterId, string $fromTimestamp, $batchSize ) {
	$dbr = $this->loadBalancer->getConnectionRef( DB_REPLICA );
	[ 'tables' => $tables, 'fields' => $fields, 'joins' => $joinConds ] =
		$this->revisionStore->getQueryInfo();

	$ts = $dbr->addQuotes( $fromTimestamp );
	$where = "rev_timestamp > $ts OR (rev_timestamp = $ts AND rev_id > $afterId )";
	$queryOptions = [
		'USE INDEX' => [ 'revision' => 'rev_timestamp' ],
		'ORDER BY' => 'rev_timestamp, rev_id',
		'LIMIT' => $batchSize,
	];

	$rows = $dbr->select( $tables, $fields, $where, __METHOD__, $queryOptions, $joinConds );

	$status = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );
	$this->handleStatus( $status );

	// Drop empty entries from the batch result.
	$records = array_filter( $status->value );

	'@phan-var RevisionStoreRecord[] $records';
	return $records;
}
286
/**
 * Load one batch of archived revisions with $afterId < ar_rev_id <= $uptoId,
 * ordered by ar_rev_id.
 *
 * @param int $afterId Exclusive lower bound for ar_rev_id
 * @param int $uptoId Inclusive upper bound for ar_rev_id
 * @param int $batchSize Value used for the query's LIMIT
 *
 * @return RevisionArchiveRecord[] Records built by newRevisionsFromBatch(),
 *  with empty entries filtered out
 */
private function loadArchiveByRevisionId( int $afterId, int $uptoId, $batchSize ) {
	$dbr = $this->loadBalancer->getConnectionRef( DB_REPLICA );
	[ 'tables' => $tables, 'fields' => $fields, 'joins' => $joinConds ] =
		$this->revisionStore->getArchiveQueryInfo();

	$idRange = [ "ar_rev_id > $afterId", "ar_rev_id <= $uptoId" ];
	$queryOptions = [ 'LIMIT' => $batchSize, 'ORDER BY' => 'ar_rev_id' ];

	$rows = $dbr->select( $tables, $fields, $idRange, __METHOD__, $queryOptions, $joinConds );

	$batchOptions = [ 'archive' => true, 'slots' => true ];
	$status = $this->revisionStore->newRevisionsFromBatch( $rows, $batchOptions );
	$this->handleStatus( $status );

	// Drop empty entries from the batch result.
	$records = array_filter( $status->value );

	'@phan-var RevisionArchiveRecord[] $records';
	return $records;
}
316
/**
 * Returns the revision ID next to $revId, according to $comp and $dir.
 *
 * @param int $revId Revision ID to compare against
 * @param string $comp SQL comparison operator, inserted verbatim into the condition
 * @param string $dir Sort direction for rev_id, inserted verbatim into ORDER BY
 *
 * @return int The selected rev_id cast to int
 */
private function getNextRevision( int $revId, string $comp, string $dir ) {
	$dbr = $this->loadBalancer->getConnectionRef( DB_REPLICA );

	$condition = "rev_id $comp $revId";
	$queryOptions = [ 'ORDER BY' => "rev_id $dir" ];

	return (int)$dbr->selectField( 'revision', 'rev_id', $condition, __METHOD__, $queryOptions );
}
337
/**
 * Check the revisions with the given IDs, processing them in chunks of
 * the configured batch size.
 *
 * @param int[] $ids Revision IDs to check
 *
 * @return int Sum of the checkRevision() results for all loaded revisions
 */
private function scanRevisionsById( array $ids ) {
	$total = count( $ids );
	$this->output( "Scanning $total ids\n" );

	$badBlobs = 0;
	$chunks = array_chunk( $ids, $this->getBatchSize() );

	foreach ( $chunks as $chunk ) {
		$revisions = $this->loadRevisionsById( $chunk );

		// Only chunks that yielded at least one revision are checked and reported.
		if ( $revisions ) {
			foreach ( $revisions as $revision ) {
				$badBlobs += $this->checkRevision( $revision );
			}

			$loaded = count( $revisions );
			$this->output( "\t- Scanned a batch of $loaded revisions\n" );
		}
	}

	return $badBlobs;
}
367
/**
 * Load the revisions with the given IDs from the revision table, falling back
 * to the archive table for any ID not found there.
 *
 * @param int[] $ids Revision IDs to load; duplicates are ignored
 *
 * @return RevisionRecord[] Loaded records. The array keys are compared against
 *  $ids, i.e. the result is expected to be keyed by revision ID.
 */
private function loadRevisionsById( array $ids ) {
	// Duplicates add nothing to the IN list. They also used to make the
	// "was everything found?" check below give the wrong answer.
	$ids = array_unique( $ids );
	if ( !$ids ) {
		// An empty IN list is rejected by the database layer.
		return [];
	}

	$db = $this->loadBalancer->getConnectionRef( DB_REPLICA );
	$queryInfo = $this->revisionStore->getQueryInfo();

	$rows = $db->select(
		$queryInfo['tables'],
		$queryInfo['fields'],
		[
			'rev_id ' => $ids,
		],
		__METHOD__,
		[],
		$queryInfo['joins']
	);

	$result = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );

	$this->handleStatus( $result );

	$revisions = array_filter( $result->value );
	'@phan-var RevisionRecord[] $revisions';

	// if not all revisions were found, check the archive table.
	// Decide this from the list of missing IDs itself, not by comparing counts,
	// so the archive query is never issued with an empty ID list.
	$remainingIds = array_diff( $ids, array_keys( $revisions ) );
	if ( $remainingIds ) {
		$archiveQueryInfo = $this->revisionStore->getArchiveQueryInfo();

		$rows = $db->select(
			$archiveQueryInfo['tables'],
			$archiveQueryInfo['fields'],
			[
				'ar_rev_id ' => $remainingIds,
			],
			__METHOD__,
			[],
			$archiveQueryInfo['joins']
		);

		$archiveResult = $this->revisionStore->newRevisionsFromBatch(
			$rows,
			[ 'slots' => true, 'archive' => true ]
		);

		$this->handleStatus( $archiveResult );

		// don't use array_merge, since it will re-index
		$revisions += array_filter( $archiveResult->value );
	}

	return $revisions;
}
424
/**
 * Check every slot of the given revision.
 *
 * When --mark is set and no bad blob was found, a "skipped" line is printed.
 *
 * @param RevisionRecord $rev
 *
 * @return int Sum of the checkSlot() results over the revision's slots
 */
private function checkRevision( RevisionRecord $rev ) {
	$perSlot = array_map(
		function ( SlotRecord $slot ) use ( $rev ) {
			return $this->checkSlot( $rev, $slot );
		},
		$rev->getSlots()->getSlots()
	);
	$badSlots = array_sum( $perSlot );

	if ( $badSlots === 0 && $this->hasOption( 'mark' ) ) {
		$this->output( "\t# No bad blob found on revision {$rev->getId()}, skipped!\n" );
	}

	return $badSlots;
}
442
/**
 * Try to read the blob behind the given slot, and report it if reading throws.
 *
 * When --mark is set, a blob that failed to load is also marked via markBlob().
 *
 * @param RevisionRecord $rev Revision the slot belongs to; used for reporting
 * @param SlotRecord $slot
 *
 * @return int 0 if the blob could be read, 1 otherwise
 */
private function checkSlot( RevisionRecord $rev, SlotRecord $slot ) {
	$address = $slot->getAddress();

	try {
		$this->blobStore->getBlob( $address );
		// The blob is readable, there is nothing to report.
		return 0;
	} catch ( Exception $failure ) {
		$error = $failure->getMessage();
		$type = get_class( $failure );
	}

	// NOTE: output the revision ID again at the end in a separate column for easy processing
	// via the "cut" shell command.
	$report = sprintf(
		"\t! Found bad blob on revision %s from %s (%s slot): "
			. "content_id=%s, address=<%s>, error='%s', type='%s'. ID:\t%s\n",
		$rev->getId(),
		$rev->getTimestamp(),
		$slot->getRole(),
		$slot->getContentId(),
		$slot->getAddress(),
		$error,
		$type,
		$rev->getId()
	);
	$this->output( $report );

	if ( $this->hasOption( 'mark' ) ) {
		$markedAddress = $this->markBlob( $rev, $slot, $error );
		$this->output( "\tChanged address to <$markedAddress>\n" );
	}

	return 1;
}
477
/**
 * Mark the content row of the given slot as bad by rewriting its address.
 *
 * The new address has the form "bad:<url-encoded old address>", optionally
 * followed by "?" and a query string carrying the --mark reason and the error
 * message. It is cut to at most 255 bytes before being written to
 * content.content_address on the primary database.
 *
 * @param RevisionRecord $rev Not used by this method
 * @param SlotRecord $slot Slot whose content row is updated
 * @param string|null $error Error message to record in the new address, if any
 *
 * @return string The new address that was written
 */
private function markBlob( RevisionRecord $rev, SlotRecord $slot, ?string $error = null ) {
	$args = [];

	if ( $this->hasOption( 'mark' ) ) {
		$args['reason'] = $this->getOption( 'mark' );
	}

	if ( $error ) {
		$args['error'] = $error;
	}

	// A slot without an address is recorded with the placeholder "empty".
	$address = $slot->getAddress() ?: 'empty';
	$badAddress = 'bad:' . urlencode( $address );

	if ( $args ) {
		$badAddress .= '?' . wfArrayToCgi( $args );
	}

	// NOTE(review): presumably 255 is the size of the content_address column - confirm.
	// The cut is byte-wise and may fall inside a percent-encoded sequence.
	$badAddress = substr( $badAddress, 0, 255 );

	$dbw = $this->loadBalancer->getConnectionRef( DB_PRIMARY );
	$dbw->update(
		'content',
		[ 'content_address' => $badAddress ],
		[ 'content_id' => $slot->getContentId() ],
		__METHOD__
	);

	return $badAddress;
}
515
/**
 * Report problems recorded in the given status.
 *
 * A status that is not OK is passed to fatalError(); a status that is OK but
 * not good is reported via error(). A good status produces no output.
 *
 * @param StatusValue $status
 */
private function handleStatus( StatusValue $status ) {
	if ( $status->isGood() ) {
		return;
	}

	$message = Status::wrap( $status )->getMessage( false, false, 'en' )->text();

	if ( !$status->isOK() ) {
		$this->fatalError( $message );
	}

	$this->error( "\t! " . $message );
}
528
529}
530
// Name the maintenance class defined in this file, then include the file
// referenced by RUN_MAINTENANCE_IF_MAIN.
// NOTE(review): presumably that include runs the script when this file is the
// entry point - confirm against Maintenance.php.
531$maintClass = FindBadBlobs::class;
532require_once RUN_MAINTENANCE_IF_MAIN;
wfArrayToCgi( $array1, $array2=null, $prefix='')
This function takes one or two arrays as input, and returns a CGI-style string, e....
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Maintenance script for finding and marking bad content blobs.
BlobStore null $blobStore
LBFactory $lbFactory
__construct()
Default constructor.
scanRevisionsById(array $ids)
LoadBalancer null $loadBalancer
loadRevisionsById(array $ids)
initializeServices(?RevisionStore $revisionStore=null, ?BlobStore $blobStore=null, ?LoadBalancer $loadBalancer=null, ?LBFactory $lbFactory=null)
checkSlot(RevisionRecord $rev, SlotRecord $slot)
checkRevision(RevisionRecord $rev)
handleStatus(StatusValue $status)
RevisionStore null $revisionStore
loadRevisionsByTimestamp(int $afterId, string $fromTimestamp, $batchSize)
getNextRevision(int $revId, string $comp, string $dir)
Returns the revision ID next to $revId, according to $comp and $dir.
loadArchiveByRevisionId(int $afterId, int $uptoId, $batchSize)
scanRevisionsByTimestamp( $fromTimestamp, $total)
execute()
Do the actual work. All child classes will need to implement this. Returns bool|null|void: true for success,...
markBlob(RevisionRecord $rev, SlotRecord $slot, string $error=null)
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
error( $err, $die=0)
Throw an error to the user.
output( $out, $channel=null)
Throw some output to the user.
waitForReplication()
Wait for replica DBs to catch up.
hasOption( $name)
Checks to see if a particular option was set.
getBatchSize()
Returns batch size.
parseIntList( $text)
Utility function to parse a string (perhaps from a command line option) into a list of integers (perh...
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
MediaWikiServices is the service locator for the application scope of MediaWiki.
A RevisionRecord representing a revision of a deleted page persisted in the archive table.
Page revision base class.
getSlots()
Returns the slots defined for this revision.
A RevisionRecord representing an existing revision persisted in the revision table.
Service for looking up page revisions.
Value object representing a content slot associated with a page revision.
getAddress()
Returns the address of this slot's content.
getContentId()
Returns the ID of the content meta data row associated with the slot.
Generic operation result class Has warning/error list, boolean status and arbitrary value.
isOK()
Returns whether the operation completed.
isGood()
Returns whether the operation completed and didn't have any error or warnings.
An interface for generating database load balancers.
Definition LBFactory.php:42
Database connection, tracking, load balancing, and transaction manager for a cluster.
$maintClass
Service for loading and storing data blobs.
Definition BlobStore.php:35
if( $line===false) $args
Definition mcc.php:124
const DB_REPLICA
Definition defines.php:25
const DB_PRIMARY
Definition defines.php:27