MediaWiki  master
findBadBlobs.php
Go to the documentation of this file.
1 <?php
32 
33 require_once __DIR__ . '/cleanupTable.inc';
34 
40 class FindBadBlobs extends Maintenance {
41 
45  private $revisionStore;
46 
50  private $blobStore;
51 
55  private $loadBalancer;
56 
60  private $lbFactory;
61 
/**
 * Sets the default batch size, the script description and the command line
 * options this script accepts.
 */
public function __construct() {
    parent::__construct();

    $this->setBatchSize( 1000 );
    $this->addDescription( 'Scan for bad content blobs' );

    // Option name => help text. Registration order is kept as listed here.
    $optionHelp = [
        'from-date' => 'Start scanning revisions at the given date. '
            . 'Format: Anything supported by MediaWiki, e.g. YYYYMMDDHHMMSS or YYYY-MM-DD_HH:MM:SS',
        'revisions' => 'A list of revision IDs to scan, separated by comma or colon '
            . 'or whitespace. Revisions belonging to deleted pages will work. '
            . 'If set to "-" IDs are read from stdin, one per line.',
        'limit' => 'Maximum number of revisions to scan. Default: 1000',
        'mark' => 'Mark the blob as "known bad", to avoid errors when '
            . 'attempting to read it. The value given is the reason for marking the blob as bad, '
            . 'typically a ticket ID',
    ];

    foreach ( $optionHelp as $optionName => $helpText ) {
        // Third argument: not required. Fourth argument: the option takes a value.
        $this->addOption( $optionName, $helpText, false, true );
    }
}
78 
/**
 * Sets up the services used by this script.
 *
 * For each service the first non-null candidate wins, in this order: the
 * argument passed in, the instance already stored on this object, and finally
 * the instance provided by MediaWikiServices. Passing explicit instances lets
 * callers inject replacements before execute() runs.
 *
 * The parameter list declares all four services that the body reads; without
 * the $revisionStore and $loadBalancer parameters those variables would be
 * undefined inside the method.
 *
 * @param RevisionStore|null $revisionStore
 * @param BlobStore|null $blobStore
 * @param LoadBalancer|null $loadBalancer
 * @param LBFactory|null $lbFactory
 */
public function initializeServices(
    ?RevisionStore $revisionStore = null,
    ?BlobStore $blobStore = null,
    ?LoadBalancer $loadBalancer = null,
    ?LBFactory $lbFactory = null
) {
    $services = MediaWikiServices::getInstance();

    // "??" short-circuits, so the service container is only consulted when
    // neither the argument nor the stored property provides an instance.
    $this->revisionStore = $revisionStore ?? $this->revisionStore ?? $services->getRevisionStore();
    $this->blobStore = $blobStore ?? $this->blobStore ?? $services->getBlobStore();
    $this->loadBalancer = $loadBalancer ?? $this->loadBalancer ?? $services->getDBLoadBalancer();
    $this->lbFactory = $lbFactory ?? $this->lbFactory ?? $services->getDBLoadBalancerFactory();
}
92 
/**
 * Reads the --from-date option and converts it to the TS_MW format.
 *
 * The script is terminated through fatalError() when the value is shorter
 * than 14 characters or when wfTimestamp() cannot convert it.
 *
 * @return string The converted timestamp
 */
private function getStartTimestamp() {
    $rawDate = $this->getOption( 'from-date' );

    // Values shorter than 14 characters are rejected because the scan needs
    // a start time that is precise down to the second.
    $isTooShort = strlen( $rawDate ) < 14;
    if ( $isTooShort ) {
        $this->fatalError( 'Bad timestamp: ' . $rawDate
            . ', please provide time and date down to the second.' );
    }

    $converted = wfTimestamp( TS_MW, $rawDate );
    if ( !$converted ) {
        $this->fatalError( 'Bad timestamp: ' . $rawDate );
    }

    return $converted;
}
110 
/**
 * Returns the revision IDs requested through the --revisions option.
 *
 * The special value "-" means the list is read from standard input instead
 * of from the option value itself.
 *
 * @return int[]
 */
private function getRevisionIds() {
    $spec = $this->getOption( 'revisions' );

    if ( $spec !== '-' ) {
        return $this->normalizeIds( $spec );
    }

    // Read everything available on stdin; nothing usable means no IDs at all.
    $piped = stream_get_contents( STDIN );

    return $piped ? $this->normalizeIds( $piped ) : [];
}
127 
/**
 * Splits a list of revision IDs into integers.
 *
 * IDs may be separated by whitespace, commas, semicolons or colons. Empty
 * tokens (caused by leading or trailing separators, such as the final newline
 * of input read from stdin) are skipped, and tokens that do not yield a
 * non-zero integer are dropped, so the result never contains the bogus ID 0.
 *
 * @param string $text
 *
 * @return int[] Sequentially indexed list of revision IDs
 */
private function normalizeIds( $text ) {
    // PREG_SPLIT_NO_EMPTY keeps "1\n2\n" from producing a trailing "" token,
    // which the integer cast below would otherwise turn into revision ID 0.
    $tokens = preg_split( '/[\s,;:]+/', (string)$text, -1, PREG_SPLIT_NO_EMPTY );

    $ids = array_map( 'intval', $tokens );

    // array_filter() removes zeros (non-numeric tokens); array_values()
    // restores a gap-free list after filtering.
    return array_values( array_filter( $ids ) );
}
139 
/**
 * Entry point of the script.
 *
 * Scans either the revisions listed via --revisions or, failing that, the
 * revisions starting at --from-date (limited by --limit, default 1000), then
 * prints a summary line. If neither option is given the script is terminated
 * through fatalError().
 */
public function execute() {
    $this->initializeServices();

    if ( $this->hasOption( 'revisions' ) ) {
        // --revisions takes precedence over --from-date.
        $count = $this->scanRevisionsById( $this->getRevisionIds() );
    } elseif ( !$this->hasOption( 'from-date' ) ) {
        $this->fatalError( 'Must specify either --revisions or --from-date' );
    } else {
        $count = $this->scanRevisionsByTimestamp(
            $this->getStartTimestamp(),
            $this->getOption( 'limit', 1000 )
        );

        $this->output( "The range of archive rows scanned is based on the range of revision IDs "
            . "scanned in the revision table.\n" );
    }

    // The wording depends on whether bad blobs were only reported or also marked.
    $summary = $this->hasOption( 'mark' )
        ? "Marked $count bad revisions\n"
        : "Found $count bad revisions\n";

    $this->output( $summary );
}
168 
/**
 * Scans revisions starting at a given timestamp, then the archive rows that
 * fall into the matching revision ID range.
 *
 * The revision table is read in batches ordered by (rev_timestamp, rev_id)
 * until $total rows were scanned or no more rows are returned. Afterwards the
 * archive table is read in batches by ar_rev_id, again limited to $total rows.
 *
 * @param string $fromTimestamp Timestamp to start the scan at
 * @param int $total Maximum number of rows to scan per table
 *
 * @return int The sum of the checkRevision() results, i.e. the number of bad slots found
 */
private function scanRevisionsByTimestamp( $fromTimestamp, $total ) {
    $count = 0;
    $lastRevId = 0;
    $firstRevId = 0;
    $lastTimestamp = $fromTimestamp;
    $revisionRowsScanned = 0;
    $archiveRowsScanned = 0;

    $this->output( "Scanning revisions table, "
        . "$total rows starting at rev_timestamp $fromTimestamp\n" );

    while ( $revisionRowsScanned < $total ) {
        // Never request more rows than are left within the overall limit.
        $batchSize = min( $total - $revisionRowsScanned, $this->getBatchSize() );
        $revisions = $this->loadRevisionsByTimestamp( $lastRevId, $lastTimestamp, $batchSize );
        if ( !$revisions ) {
            break;
        }

        foreach ( $revisions as $rev ) {
            // we are sorting by timestamp, so we may encounter revision IDs out of sequence
            $firstRevId = $firstRevId ? min( $firstRevId, $rev->getId() ) : $rev->getId();
            $lastRevId = max( $lastRevId, $rev->getId() );

            $count += $this->checkRevision( $rev );
        }

        // $rev is the last revision of the batch; its timestamp is where the
        // next batch continues.
        $lastTimestamp = $rev->getTimestamp();
        $batchSize = count( $revisions );
        $revisionRowsScanned += $batchSize;
        $this->output(
            "\t- Scanned a batch of $batchSize revisions, "
            . "up to revision $lastRevId ($lastTimestamp)\n"
        );

        $this->waitForReplication();
    }

    // NOTE: the archive table isn't indexed by timestamp, so the best we can do is use the
    // revision ID just before the first revision ID we found above as the starting point
    // of the scan, and scan up to one revision after the last revision ID we found above.
    // If $firstRevId is 0, the loop body above didn't execute,
    // so we should skip the one below as well.
    $fromArchived = $this->getNextRevision( $firstRevId, '<', 'DESC' );
    $maxArchived = $this->getNextRevision( $lastRevId, '>', 'ASC' );
    $maxArchived = $maxArchived ?: PHP_INT_MAX;

    $this->output( "Scanning archive table by ar_rev_id, $fromArchived to $maxArchived\n" );

    // The row limit is part of the loop condition, so the loop ends as soon as
    // $total archive rows were scanned instead of issuing one more query with
    // a batch size of 0 and relying on its empty result to break out.
    while ( $firstRevId > 0 && $fromArchived < $maxArchived && $archiveRowsScanned < $total ) {
        $batchSize = min( $total - $archiveRowsScanned, $this->getBatchSize() );
        $revisions = $this->loadArchiveByRevisionId( $fromArchived, $maxArchived, $batchSize );
        if ( !$revisions ) {
            break;
        }

        foreach ( $revisions as $rev ) {
            $count += $this->checkRevision( $rev );
        }

        // Continue after the last archived revision of this batch.
        $fromArchived = $rev->getId();
        $batchSize = count( $revisions );
        $archiveRowsScanned += $batchSize;
        // NOTE(review): $lastTimestamp still holds the timestamp of the last row read
        // from the revision table; it is not updated from the archived rows.
        $this->output(
            "\t- Scanned a batch of $batchSize archived revisions, "
            . "up to revision $fromArchived ($lastTimestamp)\n"
        );

        $this->waitForReplication();
    }

    return $count;
}
245 
/**
 * Loads one batch of revisions from the revision table, ordered by
 * (rev_timestamp, rev_id).
 *
 * A row qualifies when its timestamp is later than $fromTimestamp, or equal
 * to it with a revision ID greater than $afterId.
 *
 * @param int $afterId
 * @param string $fromTimestamp
 * @param int $batchSize Maximum number of rows to load
 *
 * @return RevisionStoreRecord[]
 */
private function loadRevisionsByTimestamp( int $afterId, string $fromTimestamp, $batchSize ) {
    $db = $this->loadBalancer->getConnectionRef( DB_REPLICA );
    $queryInfo = $this->revisionStore->getQueryInfo();
    $quotedTimestamp = $db->addQuotes( $fromTimestamp );

    $where = "rev_timestamp > $quotedTimestamp OR "
        . "(rev_timestamp = $quotedTimestamp AND rev_id > $afterId )";

    $options = [
        'USE INDEX' => [ 'revision' => 'rev_timestamp' ],
        'ORDER BY' => 'rev_timestamp, rev_id',
        'LIMIT' => $batchSize,
    ];

    $rows = $db->select(
        $queryInfo['tables'],
        $queryInfo['fields'],
        $where,
        __METHOD__,
        $options,
        $queryInfo['joins']
    );

    $status = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );
    $this->handleStatus( $status );

    // array_filter() removes the falsy entries from the result value.
    $records = array_filter( $status->value );

    '@phan-var RevisionStoreRecord[] $records';
    return $records;
}
278 
/**
 * Loads one batch of archived revisions whose ar_rev_id is greater than
 * $afterId and at most $uptoId, ordered by ar_rev_id.
 *
 * @param int $afterId
 * @param int $uptoId
 * @param int $batchSize Maximum number of rows to load
 *
 * @return RevisionArchiveRecord[]
 */
private function loadArchiveByRevisionId( int $afterId, int $uptoId, $batchSize ) {
    $db = $this->loadBalancer->getConnectionRef( DB_REPLICA );
    $queryInfo = $this->revisionStore->getArchiveQueryInfo();

    $idRange = [ "ar_rev_id > $afterId", "ar_rev_id <= $uptoId" ];
    $options = [ 'LIMIT' => $batchSize, 'ORDER BY' => 'ar_rev_id' ];

    $rows = $db->select(
        $queryInfo['tables'],
        $queryInfo['fields'],
        $idRange,
        __METHOD__,
        $options,
        $queryInfo['joins']
    );

    $batchFlags = [ 'archive' => true, 'slots' => true ];
    $status = $this->revisionStore->newRevisionsFromBatch( $rows, $batchFlags );
    $this->handleStatus( $status );

    // array_filter() removes the falsy entries from the result value.
    $records = array_filter( $status->value );

    '@phan-var RevisionArchiveRecord[] $records';
    return $records;
}
308 
/**
 * Returns the revision ID next to $revId, according to $comp and $dir.
 *
 * @param int $revId
 * @param string $comp Comparison operator placed between rev_id and $revId, e.g. '<' or '>'
 * @param string $dir Sort direction applied to rev_id, 'ASC' or 'DESC'
 *
 * @return int The selected field cast to int; callers treat 0 as "none found"
 */
private function getNextRevision( int $revId, string $comp, string $dir ) {
    $condition = "rev_id $comp $revId";
    $ordering = [ 'ORDER BY' => "rev_id $dir" ];

    $db = $this->loadBalancer->getConnectionRef( DB_REPLICA );

    return (int)$db->selectField( 'revision', 'rev_id', $condition, __METHOD__, $ordering );
}
329 
/**
 * Checks the given revisions, processing them in chunks of the configured
 * batch size.
 *
 * @param int[] $ids Revision IDs to check
 *
 * @return int The sum of the checkRevision() results
 */
private function scanRevisionsById( array $ids ) {
    $total = count( $ids );
    $this->output( "Scanning $total ids\n" );

    $count = 0;
    $batches = array_chunk( $ids, $this->getBatchSize() );

    foreach ( $batches as $batch ) {
        $revisions = $this->loadRevisionsById( $batch );

        // A batch for which nothing could be loaded produces no progress line.
        if ( $revisions ) {
            foreach ( $revisions as $rev ) {
                $count += $this->checkRevision( $rev );
            }

            $batchSize = count( $revisions );
            $this->output( "\t- Scanned a batch of $batchSize revisions\n" );
        }
    }

    return $count;
}
359 
/**
 * Loads the revisions with the given IDs, looking in the revision table first
 * and in the archive table for every ID that was not found there.
 *
 * @param int[] $ids Revision IDs to load
 *
 * @return RevisionRecord[] Keys are kept as delivered by the revision store.
 *  The archive lookup assumes these keys are revision IDs (it diffs $ids
 *  against array_keys()) — TODO confirm against RevisionStore.
 */
private function loadRevisionsById( array $ids ) {
    // Nothing to look up; an empty ID list must not reach the query below.
    if ( !$ids ) {
        return [];
    }

    $db = $this->loadBalancer->getConnectionRef( DB_REPLICA );
    $queryInfo = $this->revisionStore->getQueryInfo();

    $rows = $db->select(
        $queryInfo['tables'],
        $queryInfo['fields'],
        [
            'rev_id ' => $ids,
        ],
        __METHOD__,
        [],
        $queryInfo['joins']
    );

    $result = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );

    $this->handleStatus( $result );

    $revisions = array_filter( $result->value );
    // The list holds records from the revision table here and may additionally
    // receive archive records below, hence the common base type.
    '@phan-var RevisionRecord[] $revisions';

    // If not all revisions were found, check the archive table. The decision is
    // based on the IDs that are actually missing rather than on comparing counts:
    // with duplicate IDs in $ids the counts differ although nothing is missing,
    // and the archive query would then be built with an empty ID list.
    $remainingIds = array_diff( $ids, array_keys( $revisions ) );

    if ( $remainingIds ) {
        $archiveQueryInfo = $this->revisionStore->getArchiveQueryInfo();

        $rows = $db->select(
            $archiveQueryInfo['tables'],
            $archiveQueryInfo['fields'],
            [
                'ar_rev_id ' => $remainingIds,
            ],
            __METHOD__,
            [],
            $archiveQueryInfo['joins']
        );

        $archiveResult = $this->revisionStore->newRevisionsFromBatch(
            $rows,
            [ 'slots' => true, 'archive' => true ]
        );

        $this->handleStatus( $archiveResult );

        // don't use array_merge, since it will re-index
        $revisions = $revisions + array_filter( $archiveResult->value );
    }

    return $revisions;
}
416 
/**
 * Checks every slot of the given revision.
 *
 * @param RevisionRecord $rev
 *
 * @return int The sum of the checkSlot() results for all slots of the revision
 */
private function checkRevision( RevisionRecord $rev ) {
    $badSlots = 0;
    $slotRecords = $rev->getSlots()->getSlots();

    foreach ( $slotRecords as $slotRecord ) {
        $badSlots += $this->checkSlot( $rev, $slotRecord );
    }

    return $badSlots;
}
430 
/**
 * Tries to load the blob of a single slot and reports it if that fails.
 *
 * A blob counts as bad when loading it raises a BlobAccessException or an
 * ExternalStoreException. When the --mark option is set, the bad blob is
 * also marked via markBlob().
 *
 * @param RevisionRecord $rev
 * @param SlotRecord $slot
 *
 * @return int 1 if the blob is bad, 0 if it could be loaded
 */
private function checkSlot( RevisionRecord $rev, SlotRecord $slot ) {
    $address = $slot->getAddress();
    $error = null;

    try {
        $this->blobStore->getBlob( $address );
        $readable = true;
    } catch ( BlobAccessException | ExternalStoreException $ex ) {
        $readable = false;
        $error = $ex->getMessage();
    }

    if ( $readable ) {
        // nothing to do
        return 0;
    }

    $report = "\t! Found bad blob on revision {$rev->getId()} ({$slot->getRole()} slot): "
        . "content_id={$slot->getContentId()}, address=<{$slot->getAddress()}>, error='$error'\n";
    $this->output( $report );

    if ( $this->hasOption( 'mark' ) ) {
        $newAddress = $this->markBlob( $rev, $slot, $error );
        $this->output( "\tChanged address to <$newAddress>\n" );
    }

    return 1;
}
461 
/**
 * Marks the blob of a slot as bad by rewriting its address in the content table.
 *
 * The new address has the form "bad:<url-encoded old address>", followed by a
 * query string carrying the reason (value of the --mark option) and the error
 * message when available. The result is cut to 255 bytes before it is written
 * (presumably the size limit of the content_address field — TODO confirm).
 *
 * @param RevisionRecord $rev Not used by this method at present
 * @param SlotRecord $slot The slot whose content address is rewritten
 * @param string|null $error Error message recorded in the new address, if any
 *
 * @return string The address that was written
 */
private function markBlob( RevisionRecord $rev, SlotRecord $slot, ?string $error = null ) {
    $args = [];

    if ( $this->hasOption( 'mark' ) ) {
        $args['reason'] = $this->getOption( 'mark' );
    }

    // Explicit comparison instead of a truthiness test, so that a message
    // consisting of "0" is recorded as well.
    if ( $error !== null && $error !== '' ) {
        $args['error'] = $error;
    }

    // A slot without an address is recorded with the placeholder "empty".
    $address = $slot->getAddress() ?: 'empty';
    $badAddress = 'bad:' . urlencode( $address );

    if ( $args ) {
        $badAddress .= '?' . wfArrayToCgi( $args );
    }

    $badAddress = substr( $badAddress, 0, 255 );

    // This is the only write of the script, hence the DB_MASTER connection.
    $dbw = $this->loadBalancer->getConnectionRef( DB_MASTER );
    $dbw->update(
        'content',
        [ 'content_address' => $badAddress ],
        [ 'content_id' => $slot->getContentId() ],
        __METHOD__
    );

    return $badAddress;
}
499 
/**
 * Delegates to the waitForReplication() method of the load balancer factory.
 *
 * @return mixed Whatever the factory's waitForReplication() returns
 */
private function waitForReplication() {
    $factory = $this->lbFactory;

    return $factory->waitForReplication();
}
503 
/**
 * Reports problems recorded in a status object.
 *
 * A status that is not OK terminates the script through fatalError(); a
 * status that is OK but not good is reported through error().
 *
 * @param StatusValue $status
 */
private function handleStatus( StatusValue $status ) {
    // Renders the status as English text; only evaluated when there is
    // something to report.
    $describe = static function ( StatusValue $sv ) {
        return Status::wrap( $sv )->getMessage( false, false, 'en' )->text();
    };

    if ( !$status->isOK() ) {
        $this->fatalError( $describe( $status ) );
    }

    if ( !$status->isGood() ) {
        $this->error( "\t! " . $describe( $status ) );
    }
}
516 
517 }
518 
519 $maintClass = FindBadBlobs::class;
520 require_once RUN_MAINTENANCE_IF_MAIN;
RUN_MAINTENANCE_IF_MAIN
const RUN_MAINTENANCE_IF_MAIN
Definition: Maintenance.php:38
FindBadBlobs\markBlob
markBlob(RevisionRecord $rev, SlotRecord $slot, string $error=null)
Definition: findBadBlobs.php:469
FindBadBlobs\normalizeIds
normalizeIds( $text)
Definition: findBadBlobs.php:133
MediaWiki\Storage\BlobAccessException
Exception representing a failure to access a data blob.
Definition: BlobAccessException.php:32
FindBadBlobs\getStartTimestamp
getStartTimestamp()
Definition: findBadBlobs.php:96
StatusValue
Generic operation result class. Has warning/error list, boolean status and arbitrary value.
Definition: StatusValue.php:42
Revision\RevisionRecord
Page revision base class.
Definition: RevisionRecord.php:46
FindBadBlobs\loadRevisionsById
loadRevisionsById(array $ids)
Definition: findBadBlobs.php:365
FindBadBlobs\checkRevision
checkRevision(RevisionRecord $rev)
Definition: findBadBlobs.php:422
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:149
Maintenance\fatalError
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
Definition: Maintenance.php:480
Revision\RevisionStore
Service for looking up page revisions.
Definition: RevisionStore.php:80
Maintenance\addDescription
addDescription( $text)
Set the description text.
Definition: Maintenance.php:323
FindBadBlobs\loadArchiveByRevisionId
loadArchiveByRevisionId(int $afterId, int $uptoId, $batchSize)
Definition: findBadBlobs.php:286
FindBadBlobs\$blobStore
BlobStore null $blobStore
Definition: findBadBlobs.php:50
wfTimestamp
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Definition: GlobalFunctions.php:1806
FindBadBlobs\__construct
__construct()
Default constructor.
Definition: findBadBlobs.php:62
Maintenance
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
Definition: Maintenance.php:53
FindBadBlobs\scanRevisionsByTimestamp
scanRevisionsByTimestamp( $fromTimestamp, $total)
Definition: findBadBlobs.php:175
FindBadBlobs\loadRevisionsByTimestamp
loadRevisionsByTimestamp(int $afterId, string $fromTimestamp, $batchSize)
Definition: findBadBlobs.php:253
FindBadBlobs\$revisionStore
RevisionStore null $revisionStore
Definition: findBadBlobs.php:45
StatusValue\isGood
isGood()
Returns whether the operation completed and didn't have any error or warnings.
Definition: StatusValue.php:121
FindBadBlobs\execute
execute()
Do the actual work. All child classes will need to implement this. bool|null|void True for success,...
Definition: findBadBlobs.php:143
FindBadBlobs\checkSlot
checkSlot(RevisionRecord $rev, SlotRecord $slot)
Definition: findBadBlobs.php:437
FindBadBlobs\getNextRevision
getNextRevision(int $revId, string $comp, string $dir)
Returns the revision ID next to $revId, according to $comp and $dir.
Definition: findBadBlobs.php:318
Status\wrap
static wrap( $sv)
Succinct helper method to wrap a StatusValue.
Definition: Status.php:60
StatusValue\isOK
isOK()
Returns whether the operation completed.
Definition: StatusValue.php:130
Maintenance\addOption
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
Definition: Maintenance.php:241
FindBadBlobs\scanRevisionsById
scanRevisionsById(array $ids)
Definition: findBadBlobs.php:335
$args
if( $line===false) $args
Definition: mcc.php:124
FindBadBlobs\$loadBalancer
LoadBalancer null $loadBalancer
Definition: findBadBlobs.php:55
FindBadBlobs
Maintenance script for finding and marking bad content blobs.
Definition: findBadBlobs.php:40
DB_REPLICA
const DB_REPLICA
Definition: defines.php:25
Revision\SlotRecord\getAddress
getAddress()
Returns the address of this slot's content.
Definition: SlotRecord.php:499
DB_MASTER
const DB_MASTER
Definition: defines.php:26
FindBadBlobs\handleStatus
handleStatus(StatusValue $status)
Definition: findBadBlobs.php:504
Wikimedia\Rdbms\LoadBalancer
Database connection, tracking, load balancing, and transaction manager for a cluster.
Definition: LoadBalancer.php:42
Revision\RevisionArchiveRecord
A RevisionRecord representing a revision of a deleted page persisted in the archive table.
Definition: RevisionArchiveRecord.php:41
FindBadBlobs\waitForReplication
waitForReplication()
Definition: findBadBlobs.php:500
Revision\RevisionStoreRecord
A RevisionRecord representing an existing revision persisted in the revision table.
Definition: RevisionStoreRecord.php:40
Revision\RevisionRecord\getSlots
getSlots()
Returns the slots defined for this revision.
Definition: RevisionRecord.php:233
MediaWiki\Storage\BlobStore
Service for loading and storing data blobs.
Definition: BlobStore.php:35
Maintenance\getOption
getOption( $name, $default=null)
Get an option, or return the default.
Definition: Maintenance.php:277
FindBadBlobs\getRevisionIds
getRevisionIds()
Definition: findBadBlobs.php:114
Wikimedia\Rdbms\LBFactory
An interface for generating database load balancers.
Definition: LBFactory.php:41
Maintenance\getBatchSize
getBatchSize()
Returns batch size.
Definition: Maintenance.php:362
FindBadBlobs\initializeServices
initializeServices(?RevisionStore $revisionStore=null, ?BlobStore $blobStore=null, ?LoadBalancer $loadBalancer=null, ?LBFactory $lbFactory=null)
Definition: findBadBlobs.php:79
FindBadBlobs\$lbFactory
LBFactory $lbFactory
Definition: findBadBlobs.php:60
Maintenance\error
error( $err, $die=0)
Throw an error to the user.
Definition: Maintenance.php:457
Maintenance\output
output( $out, $channel=null)
Throw some output to the user.
Definition: Maintenance.php:429
$maintClass
$maintClass
Definition: findBadBlobs.php:519
Maintenance\hasOption
hasOption( $name)
Checks to see if a particular option exists.
Definition: Maintenance.php:262
Revision\SlotRecord\getContentId
getContentId()
Returns the ID of the content meta data row associated with the slot.
Definition: SlotRecord.php:513
ExternalStoreException
Definition: ExternalStoreException.php:3
Revision\SlotRecord
Value object representing a content slot associated with a page revision.
Definition: SlotRecord.php:39
Maintenance\setBatchSize
setBatchSize( $s=0)
Set the batch size.
Definition: Maintenance.php:370
wfArrayToCgi
wfArrayToCgi( $array1, $array2=null, $prefix='')
This function takes one or two arrays as input, and returns a CGI-style string, e....
Definition: GlobalFunctions.php:347