MediaWiki  master
findBadBlobs.php
Go to the documentation of this file.
1 <?php
31 
32 require_once __DIR__ . '/cleanupTable.inc';
33 
39 class FindBadBlobs extends Maintenance {
40 
/** @var RevisionStore|null Supplies query info and builds revision records; assigned in initializeServices(). */
44  private $revisionStore;
45 
/** @var BlobStore|null Used to try loading each slot's blob; assigned in initializeServices(). */
49  private $blobStore;
50 
/** @var LoadBalancer|null Source of the replica and master DB connections; assigned in initializeServices(). */
54  private $loadBalancer;
55 
/** @var LBFactory Used to wait for replication between batches; assigned in initializeServices(). */
59  private $lbFactory;
60 
61  public function __construct() {
62  parent::__construct();
63 
64  $this->setBatchSize( 1000 );
65  $this->addDescription( 'Find and mark bad content blobs. '
66  . 'Use --scan-from to find revisions with bad blobs, use --mark to mark them.' );
67  $this->addOption( 'scan-from', 'Start scanning revisions at the given date. '
68  . 'Format: Anything supported by MediaWiki, e.g. YYYYMMDDHHMMSS or YYYY-MM-DD_HH:MM:SS',
69  false, true );
70  $this->addOption( 'revisions', 'A list of revision IDs to process, separated by comma or '
71  . 'colon or whitespace. Revisions belonging to deleted pages will work. '
72  . 'If set to "-" IDs are read from stdin, one per line.', false, true );
73  $this->addOption( 'limit', 'Maximum number of revisions for --scan-from to scan. '
74  . 'Default: 1000', false, true );
75  $this->addOption( 'mark', 'Mark the blob as "known bad", to avoid errors when '
76  . 'attempting to read it. The value given is the reason for marking the blob as bad, '
77  . 'typically a ticket ID. Requires --revisions to also be set.', false, true );
78  }
79 
80  public function initializeServices(
82  ?BlobStore $blobStore = null,
84  ?LBFactory $lbFactory = null
85  ) {
86  $services = MediaWikiServices::getInstance();
87 
88  $this->revisionStore = $revisionStore ?? $this->revisionStore ?? $services->getRevisionStore();
89  $this->blobStore = $blobStore ?? $this->blobStore ?? $services->getBlobStore();
90  $this->loadBalancer = $loadBalancer ?? $this->loadBalancer ?? $services->getDBLoadBalancer();
91  $this->lbFactory = $lbFactory ?? $this->lbFactory ?? $services->getDBLoadBalancerFactory();
92  }
93 
97  private function getStartTimestamp() {
98  $tsOpt = $this->getOption( 'scan-from' );
99  if ( strlen( $tsOpt ) < 14 ) {
100  $this->fatalError( 'Bad timestamp: ' . $tsOpt
101  . ', please provide time and date down to the second.' );
102  }
103 
104  $ts = wfTimestamp( TS_MW, $tsOpt );
105  if ( !$ts ) {
106  $this->fatalError( 'Bad timestamp: ' . $tsOpt );
107  }
108 
109  return $ts;
110  }
111 
115  private function getRevisionIds() {
116  $opt = $this->getOption( 'revisions' );
117 
118  if ( $opt === '-' ) {
119  $opt = stream_get_contents( STDIN );
120 
121  if ( !$opt ) {
122  return [];
123  }
124  }
125 
126  return $this->normalizeIds( $opt );
127  }
128 
134  private function normalizeIds( $text ) {
135  $ids = preg_split( '/[\s,;:]+/', $text );
136  return array_map( function ( $id ) {
137  return (int)$id;
138  }, $ids );
139  }
140 
144  public function execute() {
145  $this->initializeServices();
146 
147  if ( $this->hasOption( 'revisions' ) ) {
148  if ( $this->hasOption( 'scan-from' ) ) {
149  $this->fatalError( 'Cannot use --revisions together with --scan-from' );
150  }
151 
152  $ids = $this->getRevisionIds();
153 
154  $count = $this->scanRevisionsById( $ids );
155  } elseif ( $this->hasOption( 'scan-from' ) ) {
156  if ( $this->hasOption( 'mark' ) ) {
157  $this->fatalError( 'Cannot use --mark with --scan-from, '
158  . 'use --revisions to specify revisions to mark.' );
159  }
160 
161  $fromTimestamp = $this->getStartTimestamp();
162  $total = $this->getOption( 'limit', 1000 );
163 
164  $count = $this->scanRevisionsByTimestamp( $fromTimestamp, $total );
165 
166  $this->output( "The range of archive rows scanned is based on the range of revision IDs "
167  . "scanned in the revision table.\n" );
168  } else {
169  if ( $this->hasOption( 'mark' ) ) {
170  $this->fatalError( 'The --mark must be used together with --revisions' );
171  } else {
172  $this->fatalError( 'Must specify one of --revisions or --scan-from' );
173  }
174  }
175 
176  if ( $this->hasOption( 'mark' ) ) {
177  $this->output( "Marked $count bad revisions.\n" );
178  } else {
179  $this->output( "Found $count bad revisions.\n" );
180 
181  if ( $count > 0 ) {
182  $this->output( "On a unix/linux environment, you can use grep and cut to list of IDs\n" );
183  $this->output( "that can then be used with the --revisions option. E.g.\n" );
184  $this->output( " grep '!.*Bad blob address' | cut -s -f 3\n" );
185  }
186  }
187  }
188 
195  private function scanRevisionsByTimestamp( $fromTimestamp, $total ) {
196  $count = 0;
197  $lastRevId = 0;
198  $firstRevId = 0;
199  $lastTimestamp = $fromTimestamp;
200  $revisionRowsScanned = 0;
201  $archiveRowsScanned = 0;
202 
203  $this->output( "Scanning revisions table, "
204  . "$total rows starting at rev_timestamp $fromTimestamp\n" );
205 
206  while ( $revisionRowsScanned < $total ) {
207  $batchSize = min( $total - $revisionRowsScanned, $this->getBatchSize() );
208  $revisions = $this->loadRevisionsByTimestamp( $lastRevId, $lastTimestamp, $batchSize );
209  if ( !$revisions ) {
210  break;
211  }
212 
213  foreach ( $revisions as $rev ) {
214  // we are sorting by timestamp, so we may encounter revision IDs out of sequence
215  $firstRevId = $firstRevId ? min( $firstRevId, $rev->getId() ) : $rev->getId();
216  $lastRevId = max( $lastRevId, $rev->getId() );
217 
218  $count += $this->checkRevision( $rev );
219  }
220 
221  $lastTimestamp = $rev->getTimestamp();
222  $batchSize = count( $revisions );
223  $revisionRowsScanned += $batchSize;
224  $this->output(
225  "\t- Scanned a batch of $batchSize revisions, "
226  . "up to revision $lastRevId ($lastTimestamp)\n"
227  );
228 
229  $this->waitForReplication();
230  }
231 
232  // NOTE: the archive table isn't indexed by timestamp, so the best we can do is use the
233  // revision ID just before the first revision ID we found above as the starting point
234  // of the scan, and scan up to on revision after the last revision ID we found above.
235  // If $firstRevId is 0, the loop body above didn't execute,
236  // so we should skip the one below as well.
237  $fromArchived = $this->getNextRevision( $firstRevId, '<', 'DESC' );
238  $maxArchived = $this->getNextRevision( $lastRevId, '>', 'ASC' );
239  $maxArchived = $maxArchived ?: PHP_INT_MAX;
240 
241  $this->output( "Scanning archive table by ar_rev_id, $fromArchived to $maxArchived\n" );
242  while ( $firstRevId > 0 && $fromArchived < $maxArchived ) {
243  $batchSize = min( $total - $archiveRowsScanned, $this->getBatchSize() );
244  $revisions = $this->loadArchiveByRevisionId( $fromArchived, $maxArchived, $batchSize );
245  if ( !$revisions ) {
246  break;
247  }
249  foreach ( $revisions as $rev ) {
250  $count += $this->checkRevision( $rev );
251  }
252  $fromArchived = $rev->getId();
253  $batchSize = count( $revisions );
254  $archiveRowsScanned += $batchSize;
255  $this->output(
256  "\t- Scanned a batch of $batchSize archived revisions, "
257  . "up to revision $fromArchived ($lastTimestamp)\n"
258  );
259 
260  $this->waitForReplication();
261  }
262 
263  return $count;
264  }
265 
273  private function loadRevisionsByTimestamp( int $afterId, string $fromTimestamp, $batchSize ) {
274  $db = $this->loadBalancer->getConnectionRef( DB_REPLICA );
275  $queryInfo = $this->revisionStore->getQueryInfo();
276  $quotedTimestamp = $db->addQuotes( $fromTimestamp );
277  $rows = $db->select(
278  $queryInfo['tables'],
279  $queryInfo['fields'],
280  "rev_timestamp > $quotedTimestamp OR "
281  . "(rev_timestamp = $quotedTimestamp AND rev_id > $afterId )",
282  __METHOD__,
283  [
284  'USE INDEX' => [ 'revision' => 'rev_timestamp' ],
285  'ORDER BY' => 'rev_timestamp, rev_id',
286  'LIMIT' => $batchSize,
287  ],
288  $queryInfo['joins']
289  );
290  $result = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );
291  $this->handleStatus( $result );
292 
293  $records = array_filter( $result->value );
294 
295  '@phan-var RevisionStoreRecord[] $records';
296  return $records;
297  }
298 
306  private function loadArchiveByRevisionId( int $afterId, int $uptoId, $batchSize ) {
307  $db = $this->loadBalancer->getConnectionRef( DB_REPLICA );
308  $queryInfo = $this->revisionStore->getArchiveQueryInfo();
309  $rows = $db->select(
310  $queryInfo['tables'],
311  $queryInfo['fields'],
312  [ "ar_rev_id > $afterId", "ar_rev_id <= $uptoId" ],
313  __METHOD__,
314  [ 'LIMIT' => $batchSize, 'ORDER BY' => 'ar_rev_id' ],
315  $queryInfo['joins']
316  );
317  $result = $this->revisionStore->newRevisionsFromBatch(
318  $rows,
319  [ 'archive' => true, 'slots' => true ]
320  );
321  $this->handleStatus( $result );
322 
323  $records = array_filter( $result->value );
324 
325  '@phan-var RevisionArchiveRecord[] $records';
326  return $records;
327  }
328 
338  private function getNextRevision( int $revId, string $comp, string $dir ) {
339  $db = $this->loadBalancer->getConnectionRef( DB_REPLICA );
340  $next = $db->selectField(
341  'revision',
342  'rev_id',
343  "rev_id $comp $revId",
344  __METHOD__,
345  [ 'ORDER BY' => "rev_id $dir" ]
346  );
347  return (int)$next;
348  }
349 
355  private function scanRevisionsById( array $ids ) {
356  $count = 0;
357  $total = count( $ids );
358 
359  $this->output( "Scanning $total ids\n" );
360 
361  foreach ( array_chunk( $ids, $this->getBatchSize() ) as $batch ) {
362  $revisions = $this->loadRevisionsById( $batch );
363 
364  if ( !$revisions ) {
365  continue;
366  }
367 
369  foreach ( $revisions as $rev ) {
370  $count += $this->checkRevision( $rev );
371  }
372 
373  $batchSize = count( $revisions );
374  $this->output( "\t- Scanned a batch of $batchSize revisions\n" );
375  }
376 
377  return $count;
378  }
379 
385  private function loadRevisionsById( array $ids ) {
386  $db = $this->loadBalancer->getConnectionRef( DB_REPLICA );
387  $queryInfo = $this->revisionStore->getQueryInfo();
388 
389  $rows = $db->select(
390  $queryInfo['tables'],
391  $queryInfo['fields'],
392  [
393  'rev_id ' => $ids,
394  ],
395  __METHOD__,
396  [],
397  $queryInfo['joins']
398  );
399 
400  $result = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );
401 
402  $this->handleStatus( $result );
403 
404  $revisions = array_filter( $result->value );
405  '@phan-var RevisionArchiveRecord[] $revisions';
406 
407  // if not all revisions were found, check the archive table.
408  if ( count( $revisions ) < count( $ids ) ) {
409  $archiveQueryInfo = $this->revisionStore->getArchiveQueryInfo();
410  $remainingIds = array_diff( $ids, array_keys( $revisions ) );
411 
412  $rows = $db->select(
413  $archiveQueryInfo['tables'],
414  $archiveQueryInfo['fields'],
415  [
416  'ar_rev_id ' => $remainingIds,
417  ],
418  __METHOD__,
419  [],
420  $archiveQueryInfo['joins']
421  );
422 
423  $archiveResult = $this->revisionStore->newRevisionsFromBatch(
424  $rows,
425  [ 'slots' => true, 'archive' => true ]
426  );
427 
428  $this->handleStatus( $archiveResult );
429 
430  // don't use array_merge, since it will re-index
431  $revisions += array_filter( $archiveResult->value );
432  }
433 
434  return $revisions;
435  }
436 
442  private function checkRevision( RevisionRecord $rev ) {
443  $count = 0;
444  foreach ( $rev->getSlots()->getSlots() as $slot ) {
445  $count += $this->checkSlot( $rev, $slot );
446  }
447 
448  if ( $count === 0 && $this->hasOption( 'mark' ) ) {
449  $this->output( "\t# No bad blob found on revision {$rev->getId()}, skipped!\n" );
450  }
451 
452  return $count;
453  }
454 
461  private function checkSlot( RevisionRecord $rev, SlotRecord $slot ) {
462  $address = $slot->getAddress();
463  $error = null;
464  $type = null;
465 
466  try {
467  $this->blobStore->getBlob( $address );
468  // nothing to do
469  return 0;
470  } catch ( Exception $ex ) {
471  $error = $ex->getMessage();
472  $type = get_class( $ex );
473  }
474 
475  // NOTE: output the revision ID again at the end in a separate column for easy processing
476  // via the "cut" shell command.
477  $this->output( "\t! Found bad blob on revision {$rev->getId()} ({$slot->getRole()} slot): "
478  . "content_id={$slot->getContentId()}, address=<{$slot->getAddress()}>, "
479  . "error='$error', type='$type'. ID:\t{$rev->getId()}\n" );
480 
481  if ( $this->hasOption( 'mark' ) ) {
482  $newAddress = $this->markBlob( $rev, $slot, $error );
483  $this->output( "\tChanged address to <$newAddress>\n" );
484  }
485 
486  return 1;
487  }
488 
496  private function markBlob( RevisionRecord $rev, SlotRecord $slot, string $error = null ) {
497  $args = [];
498 
499  if ( $this->hasOption( 'mark' ) ) {
500  $args['reason'] = $this->getOption( 'mark' );
501  }
502 
503  if ( $error ) {
504  $args['error'] = $error;
505  }
506 
507  $address = $slot->getAddress() ?: 'empty';
508  $badAddress = 'bad:' . urlencode( $address );
509 
510  if ( $args ) {
511  $badAddress .= '?' . wfArrayToCgi( $args );
512  }
513 
514  $badAddress = substr( $badAddress, 0, 255 );
515 
516  $dbw = $this->loadBalancer->getConnectionRef( DB_MASTER );
517  $dbw->update(
518  'content',
519  [ 'content_address' => $badAddress ],
520  [ 'content_id' => $slot->getContentId() ],
521  __METHOD__
522  );
523 
524  return $badAddress;
525  }
526 
527  private function waitForReplication() {
528  return $this->lbFactory->waitForReplication();
529  }
530 
531  private function handleStatus( StatusValue $status ) {
532  if ( !$status->isOK() ) {
533  $this->fatalError(
534  Status::wrap( $status )->getMessage( false, false, 'en' )->text()
535  );
536  }
537  if ( !$status->isGood() ) {
538  $this->error(
539  "\t! " . Status::wrap( $status )->getMessage( false, false, 'en' )->text()
540  );
541  }
542  }
543 
544 }
545 
// Store this script's class name in $maintClass, then include the file named by
// the RUN_MAINTENANCE_IF_MAIN constant (presumably the runner that reads
// $maintClass - confirm in Maintenance.php).
546 $maintClass = FindBadBlobs::class;
547 require_once RUN_MAINTENANCE_IF_MAIN;
RUN_MAINTENANCE_IF_MAIN
const RUN_MAINTENANCE_IF_MAIN
Definition: Maintenance.php:38
FindBadBlobs\markBlob
markBlob(RevisionRecord $rev, SlotRecord $slot, string $error=null)
Definition: findBadBlobs.php:496
FindBadBlobs\normalizeIds
normalizeIds( $text)
Definition: findBadBlobs.php:134
FindBadBlobs\getStartTimestamp
getStartTimestamp()
Definition: findBadBlobs.php:97
StatusValue
Generic operation result class Has warning/error list, boolean status and arbitrary value.
Definition: StatusValue.php:43
Revision\RevisionRecord
Page revision base class.
Definition: RevisionRecord.php:46
FindBadBlobs\loadRevisionsById
loadRevisionsById(array $ids)
Definition: findBadBlobs.php:385
FindBadBlobs\checkRevision
checkRevision(RevisionRecord $rev)
Definition: findBadBlobs.php:442
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:154
Maintenance\fatalError
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
Definition: Maintenance.php:487
Revision\RevisionStore
Service for looking up page revisions.
Definition: RevisionStore.php:81
Maintenance\addDescription
addDescription( $text)
Set the description text.
Definition: Maintenance.php:327
FindBadBlobs\loadArchiveByRevisionId
loadArchiveByRevisionId(int $afterId, int $uptoId, $batchSize)
Definition: findBadBlobs.php:306
FindBadBlobs\$blobStore
BlobStore null $blobStore
Definition: findBadBlobs.php:49
wfTimestamp
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Definition: GlobalFunctions.php:1808
FindBadBlobs\__construct
__construct()
Default constructor.
Definition: findBadBlobs.php:61
Maintenance
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
Definition: Maintenance.php:55
FindBadBlobs\scanRevisionsByTimestamp
scanRevisionsByTimestamp( $fromTimestamp, $total)
Definition: findBadBlobs.php:195
FindBadBlobs\loadRevisionsByTimestamp
loadRevisionsByTimestamp(int $afterId, string $fromTimestamp, $batchSize)
Definition: findBadBlobs.php:273
FindBadBlobs\$revisionStore
RevisionStore null $revisionStore
Definition: findBadBlobs.php:44
StatusValue\isGood
isGood()
Returns whether the operation completed and didn't have any error or warnings.
Definition: StatusValue.php:122
FindBadBlobs\execute
execute()
Do the actual work. All child classes will need to implement this. Returns bool|null|void: true for success,...
Definition: findBadBlobs.php:144
FindBadBlobs\checkSlot
checkSlot(RevisionRecord $rev, SlotRecord $slot)
Definition: findBadBlobs.php:461
FindBadBlobs\getNextRevision
getNextRevision(int $revId, string $comp, string $dir)
Returns the revision ID next to $revId, according to $comp and $dir.
Definition: findBadBlobs.php:338
Status\wrap
static wrap( $sv)
Succinct helper method to wrap a StatusValue.
Definition: Status.php:62
StatusValue\isOK
isOK()
Returns whether the operation completed.
Definition: StatusValue.php:131
Maintenance\addOption
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
Definition: Maintenance.php:245
FindBadBlobs\scanRevisionsById
scanRevisionsById(array $ids)
Definition: findBadBlobs.php:355
$args
if( $line===false) $args
Definition: mcc.php:124
FindBadBlobs\$loadBalancer
LoadBalancer null $loadBalancer
Definition: findBadBlobs.php:54
FindBadBlobs
Maintenance script for finding and marking bad content blobs.
Definition: findBadBlobs.php:39
DB_REPLICA
const DB_REPLICA
Definition: defines.php:25
Revision\SlotRecord\getAddress
getAddress()
Returns the address of this slot's content.
Definition: SlotRecord.php:499
DB_MASTER
const DB_MASTER
Definition: defines.php:26
FindBadBlobs\handleStatus
handleStatus(StatusValue $status)
Definition: findBadBlobs.php:531
Wikimedia\Rdbms\LoadBalancer
Database connection, tracking, load balancing, and transaction manager for a cluster.
Definition: LoadBalancer.php:42
Revision\RevisionArchiveRecord
A RevisionRecord representing a revision of a deleted page persisted in the archive table.
Definition: RevisionArchiveRecord.php:41
FindBadBlobs\waitForReplication
waitForReplication()
Definition: findBadBlobs.php:527
Revision\RevisionStoreRecord
A RevisionRecord representing an existing revision persisted in the revision table.
Definition: RevisionStoreRecord.php:40
Revision\RevisionRecord\getSlots
getSlots()
Returns the slots defined for this revision.
Definition: RevisionRecord.php:233
MediaWiki\Storage\BlobStore
Service for loading and storing data blobs.
Definition: BlobStore.php:35
Maintenance\getOption
getOption( $name, $default=null)
Get an option, or return the default.
Definition: Maintenance.php:281
FindBadBlobs\getRevisionIds
getRevisionIds()
Definition: findBadBlobs.php:115
Wikimedia\Rdbms\LBFactory
An interface for generating database load balancers.
Definition: LBFactory.php:41
Maintenance\getBatchSize
getBatchSize()
Returns batch size.
Definition: Maintenance.php:366
FindBadBlobs\initializeServices
initializeServices(?RevisionStore $revisionStore=null, ?BlobStore $blobStore=null, ?LoadBalancer $loadBalancer=null, ?LBFactory $lbFactory=null)
Definition: findBadBlobs.php:80
FindBadBlobs\$lbFactory
LBFactory $lbFactory
Definition: findBadBlobs.php:59
Maintenance\error
error( $err, $die=0)
Throw an error to the user.
Definition: Maintenance.php:463
Maintenance\output
output( $out, $channel=null)
Throw some output to the user.
Definition: Maintenance.php:434
$maintClass
$maintClass
Definition: findBadBlobs.php:546
Maintenance\hasOption
hasOption( $name)
Checks to see if a particular option exists.
Definition: Maintenance.php:266
Revision\SlotRecord\getContentId
getContentId()
Returns the ID of the content meta data row associated with the slot.
Definition: SlotRecord.php:513
Revision\SlotRecord
Value object representing a content slot associated with a page revision.
Definition: SlotRecord.php:39
Maintenance\setBatchSize
setBatchSize( $s=0)
Set the batch size.
Definition: Maintenance.php:374
wfArrayToCgi
wfArrayToCgi( $array1, $array2=null, $prefix='')
This function takes one or two arrays as input, and returns a CGI-style string, e....
Definition: GlobalFunctions.php:346
$type
$type
Definition: testCompression.php:52