MediaWiki  master
findBadBlobs.php
Go to the documentation of this file.
1 <?php
31 
32 require_once __DIR__ . '/Maintenance.php';
33 
/**
 * Maintenance script for finding and marking bad content blobs.
 *
 * Two modes are supported:
 *  - --scan-from: walk the revision table (and the matching range of the archive
 *    table) starting at a timestamp and report revisions whose blobs cannot be loaded.
 *  - --revisions: check an explicit list of revision IDs and, if --mark is given,
 *    rewrite the content address of every unreadable blob to a "bad:" address.
 */
class FindBadBlobs extends Maintenance {

	/**
	 * @var RevisionStore|null Set by initializeServices()
	 */
	private $revisionStore;

	/**
	 * @var BlobStore|null Set by initializeServices()
	 */
	private $blobStore;

	/**
	 * @var LoadBalancer|null Set by initializeServices()
	 */
	private $loadBalancer;

	/**
	 * @var LBFactory|null Set by initializeServices()
	 */
	private $lbFactory;

	/**
	 * Declares the command line options and the default batch size.
	 */
	public function __construct() {
		parent::__construct();

		$this->setBatchSize( 1000 );
		$this->addDescription( 'Find and mark bad content blobs. Marked blobs will be read as empty. '
			. 'Use --scan-from to find revisions with bad blobs, use --mark to mark them.' );
		$this->addOption( 'scan-from', 'Start scanning revisions at the given date. '
			. 'Format: Anything supported by MediaWiki, e.g. YYYYMMDDHHMMSS or YYYY-MM-DDTHH:MM:SS',
			false, true );
		$this->addOption( 'revisions', 'A list of revision IDs to process, separated by comma or '
			. 'colon or whitespace. Revisions belonging to deleted pages will work. '
			. 'If set to "-" IDs are read from stdin, one per line.', false, true );
		$this->addOption( 'limit', 'Maximum number of revisions for --scan-from to scan. '
			. 'Default: 1000', false, true );
		$this->addOption( 'mark', 'Mark the blob as "known bad", to avoid errors when '
			. 'attempting to read it. The value given is the reason for marking the blob as bad, '
			. 'typically a ticket ID. Requires --revisions to also be set.', false, true );
	}

	/**
	 * Sets up the services this script works with.
	 *
	 * For each service the first non-null candidate wins: the argument passed in,
	 * then a value set by an earlier call, then the instance provided by
	 * MediaWikiServices.
	 *
	 * @param RevisionStore|null $revisionStore
	 * @param BlobStore|null $blobStore
	 * @param LoadBalancer|null $loadBalancer
	 * @param LBFactory|null $lbFactory
	 */
	public function initializeServices(
		?RevisionStore $revisionStore = null,
		?BlobStore $blobStore = null,
		?LoadBalancer $loadBalancer = null,
		?LBFactory $lbFactory = null
	) {
		$services = MediaWikiServices::getInstance();

		$this->revisionStore = $revisionStore ?? $this->revisionStore ?? $services->getRevisionStore();
		$this->blobStore = $blobStore ?? $this->blobStore ?? $services->getBlobStore();
		$this->loadBalancer = $loadBalancer ?? $this->loadBalancer ?? $services->getDBLoadBalancer();
		$this->lbFactory = $lbFactory ?? $this->lbFactory ?? $services->getDBLoadBalancerFactory();
	}

	/**
	 * Reads --scan-from and converts it to a TS_MW timestamp.
	 *
	 * Aborts via fatalError() if the value is shorter than 14 characters or is
	 * not accepted by wfTimestamp().
	 *
	 * @return string Timestamp in TS_MW format
	 */
	private function getStartTimestamp() {
		$tsOpt = $this->getOption( 'scan-from' );
		if ( strlen( $tsOpt ) < 14 ) {
			$this->fatalError( 'Bad timestamp: ' . $tsOpt
				. ', please provide time and date down to the second.' );
		}

		$ts = wfTimestamp( TS_MW, $tsOpt );
		if ( !$ts ) {
			$this->fatalError( 'Bad timestamp: ' . $tsOpt );
		}

		return $ts;
	}

	/**
	 * Returns the revision IDs given via --revisions.
	 *
	 * If the option value is "-", the list is read from stdin instead; empty
	 * input on stdin yields an empty list.
	 *
	 * @return int[]
	 */
	private function getRevisionIds() {
		$opt = $this->getOption( 'revisions' );

		if ( $opt === '-' ) {
			$opt = stream_get_contents( STDIN );

			if ( !$opt ) {
				return [];
			}
		}

		return $this->parseIntList( $opt );
	}

	/**
	 * Entry point: validates the option combination, runs the requested scan and
	 * prints a summary.
	 *
	 * --revisions and --scan-from are mutually exclusive, --mark requires
	 * --revisions, and at least one of --revisions or --scan-from must be given.
	 *
	 * @inheritDoc
	 */
	public function execute() {
		$this->initializeServices();

		if ( $this->hasOption( 'revisions' ) ) {
			if ( $this->hasOption( 'scan-from' ) ) {
				$this->fatalError( 'Cannot use --revisions together with --scan-from' );
			}

			$ids = $this->getRevisionIds();

			$count = $this->scanRevisionsById( $ids );
		} elseif ( $this->hasOption( 'scan-from' ) ) {
			if ( $this->hasOption( 'mark' ) ) {
				$this->fatalError( 'Cannot use --mark with --scan-from, '
					. 'use --revisions to specify revisions to mark.' );
			}

			$fromTimestamp = $this->getStartTimestamp();
			// Option values arrive as strings; the scan does arithmetic on the limit.
			$total = (int)$this->getOption( 'limit', 1000 );

			$count = $this->scanRevisionsByTimestamp( $fromTimestamp, $total );

			$this->output( "The range of archive rows scanned is based on the range of revision IDs "
				. "scanned in the revision table.\n" );
		} else {
			if ( $this->hasOption( 'mark' ) ) {
				$this->fatalError( 'The --mark must be used together with --revisions' );
			} else {
				$this->fatalError( 'Must specify one of --revisions or --scan-from' );
			}
		}

		if ( $this->hasOption( 'mark' ) ) {
			$this->output( "Marked $count bad revisions.\n" );
		} else {
			$this->output( "Found $count bad revisions.\n" );

			if ( $count > 0 ) {
				$this->output( "On a unix/linux environment, you can use grep and cut to list of IDs\n" );
				$this->output( "that can then be used with the --revisions option. E.g.\n" );
				$this->output( " grep '! Found bad blob' | cut -s -f 3\n" );
			}
		}
	}

	/**
	 * Scans up to $total rows of the revision table in timestamp order, then
	 * scans the archive rows whose ar_rev_id falls into the ID range covered by
	 * the first scan (also limited to $total rows).
	 *
	 * @param string $fromTimestamp Timestamp to start scanning at
	 * @param int $total Maximum number of rows to scan per table
	 *
	 * @return int Number of bad blobs found
	 */
	private function scanRevisionsByTimestamp( $fromTimestamp, $total ) {
		$count = 0;
		// Highest and lowest revision ID seen; they delimit the archive scan below.
		$lastRevId = 0;
		$firstRevId = 0;
		// Paging cursor: (timestamp, rev_id) of the last row of the previous batch.
		// The ID must be the one of that row rather than the highest ID seen so far,
		// otherwise rows sharing the cursor timestamp but having a lower ID than the
		// running maximum would be skipped by the next batch.
		$lastTimestamp = $fromTimestamp;
		$cursorRevId = 0;
		$revisionRowsScanned = 0;
		$archiveRowsScanned = 0;

		$this->output( "Scanning revisions table, "
			. "$total rows starting at rev_timestamp $fromTimestamp\n" );

		while ( $revisionRowsScanned < $total ) {
			$batchSize = min( $total - $revisionRowsScanned, $this->getBatchSize() );
			$revisions = $this->loadRevisionsByTimestamp( $cursorRevId, $lastTimestamp, $batchSize );
			if ( !$revisions ) {
				break;
			}

			foreach ( $revisions as $rev ) {
				// we are sorting by timestamp, so we may encounter revision IDs out of sequence
				$firstRevId = $firstRevId ? min( $firstRevId, $rev->getId() ) : $rev->getId();
				$lastRevId = max( $lastRevId, $rev->getId() );

				$count += $this->checkRevision( $rev );
			}

			// $rev is the last record of the batch, i.e. the one that sorts last.
			$lastTimestamp = $rev->getTimestamp();
			$cursorRevId = $rev->getId();
			$batchSize = count( $revisions );
			$revisionRowsScanned += $batchSize;
			$this->output(
				"\t- Scanned a batch of $batchSize revisions, "
				. "up to revision $lastRevId ($lastTimestamp)\n"
			);

			$this->waitForReplication();
		}

		// NOTE: the archive table isn't indexed by timestamp, so the best we can do is use the
		// revision ID just before the first revision ID we found above as the starting point
		// of the scan, and scan up to one revision after the last revision ID we found above.
		// If $firstRevId is 0, the loop body above didn't execute,
		// so we should skip the one below as well.
		$fromArchived = $this->getNextRevision( $firstRevId, '<', 'DESC' );
		$maxArchived = $this->getNextRevision( $lastRevId, '>', 'ASC' );
		$maxArchived = $maxArchived ?: PHP_INT_MAX;

		$this->output( "Scanning archive table by ar_rev_id, $fromArchived to $maxArchived\n" );
		// Stop as soon as the row limit is used up instead of issuing a query with LIMIT 0.
		while (
			$firstRevId > 0
			&& $fromArchived < $maxArchived
			&& $archiveRowsScanned < $total
		) {
			$batchSize = min( $total - $archiveRowsScanned, $this->getBatchSize() );
			$revisions = $this->loadArchiveByRevisionId( $fromArchived, $maxArchived, $batchSize );
			if ( !$revisions ) {
				break;
			}

			foreach ( $revisions as $rev ) {
				$count += $this->checkRevision( $rev );
			}
			$fromArchived = $rev->getId();
			$batchSize = count( $revisions );
			$archiveRowsScanned += $batchSize;
			// NOTE(review): $lastTimestamp still holds the timestamp of the last row read
			// from the revision table, not of the archived revision named in this message.
			$this->output(
				"\t- Scanned a batch of $batchSize archived revisions, "
				. "up to revision $fromArchived ($lastTimestamp)\n"
			);

			$this->waitForReplication();
		}

		return $count;
	}

	/**
	 * Loads a batch of revisions from the revision table, ordered by
	 * (rev_timestamp, rev_id), starting right after the given cursor position.
	 *
	 * @param int $afterId Only applies to rows with rev_timestamp equal to
	 *        $fromTimestamp: those must have a rev_id greater than this
	 * @param string $fromTimestamp Rows with a later rev_timestamp are always included
	 * @param int $batchSize Maximum number of rows to load
	 *
	 * @return RevisionStoreRecord[] Records that could not be constructed are left out
	 */
	private function loadRevisionsByTimestamp( int $afterId, string $fromTimestamp, $batchSize ) {
		$db = $this->loadBalancer->getConnectionRef( DB_REPLICA );
		$queryInfo = $this->revisionStore->getQueryInfo();
		$quotedTimestamp = $db->addQuotes( $fromTimestamp );
		$rows = $db->newSelectQueryBuilder()
			->select( $queryInfo['fields'] )
			->tables( $queryInfo['tables'] )
			->where( "rev_timestamp > $quotedTimestamp OR "
				. "(rev_timestamp = $quotedTimestamp AND rev_id > $afterId )" )
			->joinConds( $queryInfo['joins'] )
			->useIndex( [ 'revision' => 'rev_timestamp' ] )
			->orderBy( [ 'rev_timestamp', 'rev_id' ] )
			->limit( $batchSize )
			->caller( __METHOD__ )
			->fetchResultSet();
		$result = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );
		$this->handleStatus( $result );

		// Drop empty entries from the result value.
		$records = array_filter( $result->value );

		'@phan-var RevisionStoreRecord[] $records';
		return $records;
	}

	/**
	 * Loads a batch of archived revisions with $afterId < ar_rev_id <= $uptoId,
	 * ordered by ar_rev_id.
	 *
	 * @param int $afterId Exclusive lower bound for ar_rev_id
	 * @param int $uptoId Inclusive upper bound for ar_rev_id
	 * @param int $batchSize Maximum number of rows to load
	 *
	 * @return RevisionArchiveRecord[] Records that could not be constructed are left out
	 */
	private function loadArchiveByRevisionId( int $afterId, int $uptoId, $batchSize ) {
		$db = $this->loadBalancer->getConnectionRef( DB_REPLICA );
		$queryInfo = $this->revisionStore->getArchiveQueryInfo();
		$rows = $db->newSelectQueryBuilder()
			->select( $queryInfo['fields'] )
			->tables( $queryInfo['tables'] )
			->where( [ "ar_rev_id > $afterId", "ar_rev_id <= $uptoId" ] )
			->joinConds( $queryInfo['joins'] )
			->orderBy( 'ar_rev_id' )
			->limit( $batchSize )
			->caller( __METHOD__ )
			->fetchResultSet();
		$result = $this->revisionStore->newRevisionsFromBatch(
			$rows,
			[ 'archive' => true, 'slots' => true ]
		);
		$this->handleStatus( $result );

		// Drop empty entries from the result value.
		$records = array_filter( $result->value );

		'@phan-var RevisionArchiveRecord[] $records';
		return $records;
	}

	/**
	 * Finds the ID of the closest revision table row on one side of $revId.
	 *
	 * @param int $revId Reference revision ID
	 * @param string $comp SQL comparison operator applied as "rev_id $comp $revId";
	 *        inserted into the query verbatim, so it must never come from user input
	 * @param string $dir Sort direction for rev_id, 'ASC' or 'DESC'
	 *
	 * @return int The rev_id found, or 0 if no row matches
	 */
	private function getNextRevision( int $revId, string $comp, string $dir ) {
		$db = $this->loadBalancer->getConnectionRef( DB_REPLICA );
		$next = $db->newSelectQueryBuilder()
			->select( 'rev_id' )
			->from( 'revision' )
			->where( "rev_id $comp $revId" )
			->orderBy( [ "rev_id" ], $dir )
			->caller( __METHOD__ )
			->fetchField();
		return (int)$next;
	}

	/**
	 * Checks the given revisions, processing them in chunks of the batch size.
	 *
	 * @param int[] $ids Revision IDs to check
	 *
	 * @return int Number of bad blobs found
	 */
	private function scanRevisionsById( array $ids ) {
		$count = 0;
		$total = count( $ids );

		$this->output( "Scanning $total ids\n" );

		foreach ( array_chunk( $ids, $this->getBatchSize() ) as $batch ) {
			$revisions = $this->loadRevisionsById( $batch );

			if ( !$revisions ) {
				continue;
			}

			foreach ( $revisions as $rev ) {
				$count += $this->checkRevision( $rev );
			}

			$batchSize = count( $revisions );
			$this->output( "\t- Scanned a batch of $batchSize revisions\n" );
		}

		return $count;
	}

	/**
	 * Loads the revisions with the given IDs from the revision table, falling
	 * back to the archive table for IDs that were not found there.
	 *
	 * @param int[] $ids Revision IDs to load
	 *
	 * @return RevisionRecord[] Keyed by revision ID; IDs that could not be loaded
	 *         from either table are left out
	 */
	private function loadRevisionsById( array $ids ) {
		$db = $this->loadBalancer->getConnectionRef( DB_REPLICA );
		$queryInfo = $this->revisionStore->getQueryInfo();

		$rows = $db->newSelectQueryBuilder()
			->select( $queryInfo['fields'] )
			->tables( $queryInfo['tables'] )
			->where( [ 'rev_id' => $ids ] )
			->joinConds( $queryInfo['joins'] )
			->caller( __METHOD__ )
			->fetchResultSet();

		$result = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );

		$this->handleStatus( $result );

		$revisions = array_filter( $result->value );
		'@phan-var RevisionRecord[] $revisions';

		// If not all revisions were found, check the archive table.
		// Decide based on the IDs that are actually missing rather than on a count
		// comparison: if $ids contains duplicates, the counts differ although nothing
		// is missing, and an empty ID list must not be used as a query condition.
		$remainingIds = array_diff( $ids, array_keys( $revisions ) );
		if ( $remainingIds ) {
			$archiveQueryInfo = $this->revisionStore->getArchiveQueryInfo();

			$rows = $db->newSelectQueryBuilder()
				->select( $archiveQueryInfo['fields'] )
				->tables( $archiveQueryInfo['tables'] )
				->where( [ 'ar_rev_id' => $remainingIds ] )
				->joinConds( $archiveQueryInfo['joins'] )
				->caller( __METHOD__ )
				->fetchResultSet();

			$archiveResult = $this->revisionStore->newRevisionsFromBatch(
				$rows,
				[ 'slots' => true, 'archive' => true ]
			);

			$this->handleStatus( $archiveResult );

			// don't use array_merge, since it will re-index
			$revisions += array_filter( $archiveResult->value );
		}

		return $revisions;
	}

	/**
	 * Checks every slot of the given revision.
	 *
	 * In --mark mode, a note is printed for revisions that turn out to have no
	 * bad blob.
	 *
	 * @param RevisionRecord $rev
	 *
	 * @return int Number of bad blobs found on this revision
	 */
	private function checkRevision( RevisionRecord $rev ) {
		$count = 0;
		foreach ( $rev->getSlots()->getSlots() as $slot ) {
			$count += $this->checkSlot( $rev, $slot );
		}

		if ( $count === 0 && $this->hasOption( 'mark' ) ) {
			$this->output( "\t# No bad blob found on revision {$rev->getId()}, skipped!\n" );
		}

		return $count;
	}

	/**
	 * Tries to load the blob of the given slot. If that throws, the failure is
	 * reported and, in --mark mode, the blob is marked as bad.
	 *
	 * @param RevisionRecord $rev Revision the slot belongs to, used for reporting
	 * @param SlotRecord $slot
	 *
	 * @return int 1 if the blob could not be loaded, 0 otherwise
	 */
	private function checkSlot( RevisionRecord $rev, SlotRecord $slot ) {
		$address = $slot->getAddress();

		try {
			$this->blobStore->getBlob( $address );
			// nothing to do
			return 0;
		} catch ( Exception $ex ) {
			$error = $ex->getMessage();
			$type = get_class( $ex );
		}

		// NOTE: output the revision ID again at the end in a separate column for easy processing
		// via the "cut" shell command.
		$this->output( "\t! Found bad blob on revision {$rev->getId()} "
			. "from {$rev->getTimestamp()} ({$slot->getRole()} slot): "
			. "content_id={$slot->getContentId()}, address=<{$slot->getAddress()}>, "
			. "error='$error', type='$type'. ID:\t{$rev->getId()}\n" );

		if ( $this->hasOption( 'mark' ) ) {
			$newAddress = $this->markBlob( $rev, $slot, $error );
			$this->output( "\tChanged address to <$newAddress>\n" );
		}

		return 1;
	}

	/**
	 * Marks the blob of the given slot as bad by rewriting the content_address
	 * of its row in the content table.
	 *
	 * The new address has the form "bad:<url-encoded old address>", optionally
	 * followed by a query string carrying the --mark reason and the error
	 * message, and is cut off after 255 bytes.
	 *
	 * @param RevisionRecord $rev Not used by this method
	 * @param SlotRecord $slot Slot whose content row gets updated
	 * @param string|null $error Error message to record in the new address
	 *
	 * @return string The new address that was written
	 */
	private function markBlob( RevisionRecord $rev, SlotRecord $slot, ?string $error = null ) {
		$args = [];

		if ( $this->hasOption( 'mark' ) ) {
			$args['reason'] = $this->getOption( 'mark' );
		}

		if ( $error ) {
			$args['error'] = $error;
		}

		$address = $slot->getAddress() ?: 'empty';
		$badAddress = 'bad:' . urlencode( $address );

		if ( $args ) {
			$badAddress .= '?' . wfArrayToCgi( $args );
		}

		// NOTE(review): presumably 255 is the capacity of the content_address column - confirm.
		$badAddress = substr( $badAddress, 0, 255 );

		$dbw = $this->loadBalancer->getConnectionRef( DB_PRIMARY );
		$dbw->update(
			'content',
			[ 'content_address' => $badAddress ],
			[ 'content_id' => $slot->getContentId() ],
			__METHOD__
		);

		return $badAddress;
	}

	/**
	 * Reports problems recorded in the given status: a status that is not OK
	 * aborts the script via fatalError(), a status that is OK but not good is
	 * reported via error() and the script carries on.
	 *
	 * @param StatusValue $status
	 */
	private function handleStatus( StatusValue $status ) {
		if ( !$status->isOK() ) {
			$this->fatalError(
				Status::wrap( $status )->getMessage( false, false, 'en' )->text()
			);
		}
		if ( !$status->isGood() ) {
			$this->error(
				"\t! " . Status::wrap( $status )->getMessage( false, false, 'en' )->text()
			);
		}
	}

}
523 
// Name the maintenance class defined in this file, then include the file named by
// RUN_MAINTENANCE_IF_MAIN (presumably runs the script when this file is the entry
// point - confirm in Maintenance.php).
524 $maintClass = FindBadBlobs::class;
525 require_once RUN_MAINTENANCE_IF_MAIN;
wfArrayToCgi( $array1, $array2=null, $prefix='')
This function takes one or two arrays as input, and returns a CGI-style string, e....
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Maintenance script for finding and marking bad content blobs.
__construct()
Default constructor.
initializeServices(?RevisionStore $revisionStore=null, ?BlobStore $blobStore=null, ?LoadBalancer $loadBalancer=null, ?LBFactory $lbFactory=null)
execute()
Do the actual work. All child classes will need to implement this. Returns bool|null|void: True for success,...
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
Definition: Maintenance.php:66
error( $err, $die=0)
Throw an error to the user.
output( $out, $channel=null)
Throw some output to the user.
waitForReplication()
Wait for replica DBs to catch up.
hasOption( $name)
Checks to see if a particular option was set.
getBatchSize()
Returns batch size.
parseIntList( $text)
Utility function to parse a string (perhaps from a command line option) into a list of integers (perh...
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
setBatchSize( $s=0)
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
Service locator for MediaWiki core services.
A RevisionRecord representing a revision of a deleted page persisted in the archive table.
Page revision base class.
getSlots()
Returns the slots defined for this revision.
A RevisionRecord representing an existing revision persisted in the revision table.
Service for looking up page revisions.
Value object representing a content slot associated with a page revision.
Definition: SlotRecord.php:40
getAddress()
Returns the address of this slot's content.
Definition: SlotRecord.php:517
getContentId()
Returns the ID of the content meta data row associated with the slot.
Definition: SlotRecord.php:531
Generic operation result class Has warning/error list, boolean status and arbitrary value.
Definition: StatusValue.php:46
isOK()
Returns whether the operation completed.
isGood()
Returns whether the operation completed and didn't have any error or warnings.
static wrap( $sv)
Succinct helper method to wrap a StatusValue.
Definition: Status.php:62
$maintClass
Service for loading and storing data blobs.
Definition: BlobStore.php:35
if( $line===false) $args
Definition: mcc.php:124
const DB_REPLICA
Definition: defines.php:26
const DB_PRIMARY
Definition: defines.php:28