32require_once __DIR__ .
'/Maintenance.php';
62 parent::__construct();
65 $this->
addDescription(
'Find and mark bad content blobs. Marked blobs will be read as empty. '
66 .
'Use --scan-from to find revisions with bad blobs, use --mark to mark them.' );
67 $this->
addOption(
'scan-from',
'Start scanning revisions at the given date. '
68 .
'Format: Anything supported by MediaWiki, e.g. YYYYMMDDHHMMSS or YYYY-MM-DDTHH:MM:SS',
70 $this->
addOption(
'revisions',
'A list of revision IDs to process, separated by comma or '
71 .
'colon or whitespace. Revisions belonging to deleted pages will work. '
72 .
'If set to "-" IDs are read from stdin, one per line.',
false,
true );
73 $this->
addOption(
'limit',
'Maximum number of revisions for --scan-from to scan. '
74 .
'Default: 1000',
false,
true );
75 $this->
addOption(
'mark',
'Mark the blob as "known bad", to avoid errors when '
76 .
'attempting to read it. The value given is the reason for marking the blob as bad, '
77 .
'typically a ticket ID. Requires --revisions to also be set.',
false,
true );
86 $services = MediaWikiServices::getInstance();
88 $this->revisionStore =
$revisionStore ?? $this->revisionStore ?? $services->getRevisionStore();
89 $this->blobStore =
$blobStore ?? $this->blobStore ?? $services->getBlobStore();
90 $this->loadBalancer =
$loadBalancer ?? $this->loadBalancer ?? $services->getDBLoadBalancer();
91 $this->lbFactory =
$lbFactory ?? $this->lbFactory ?? $services->getDBLoadBalancerFactory();
99 if ( strlen( $tsOpt ) < 14 ) {
101 .
', please provide time and date down to the second.' );
106 $this->
fatalError(
'Bad timestamp: ' . $tsOpt );
118 if ( $opt ===
'-' ) {
119 $opt = stream_get_contents( STDIN );
137 $this->
fatalError(
'Cannot use --revisions together with --scan-from' );
143 } elseif ( $this->
hasOption(
'scan-from' ) ) {
145 $this->
fatalError(
'Cannot use --mark with --scan-from, '
146 .
'use --revisions to specify revisions to mark.' );
150 $total = $this->
getOption(
'limit', 1000 );
154 $this->
output(
"The range of archive rows scanned is based on the range of revision IDs "
155 .
"scanned in the revision table.\n" );
158 $this->
fatalError(
'The --mark must be used together with --revisions' );
160 $this->
fatalError(
'Must specify one of --revisions or --scan-from' );
165 $this->
output(
"Marked $count bad revisions.\n" );
167 $this->
output(
"Found $count bad revisions.\n" );
170 $this->
output(
"On a unix/linux environment, you can use grep and cut to list of IDs\n" );
171 $this->
output(
"that can then be used with the --revisions option. E.g.\n" );
172 $this->
output(
" grep '! Found bad blob' | cut -s -f 3\n" );
187 $lastTimestamp = $fromTimestamp;
188 $revisionRowsScanned = 0;
189 $archiveRowsScanned = 0;
191 $this->
output(
"Scanning revisions table, "
192 .
"$total rows starting at rev_timestamp $fromTimestamp\n" );
194 while ( $revisionRowsScanned < $total ) {
195 $batchSize = min( $total - $revisionRowsScanned, $this->
getBatchSize() );
201 foreach ( $revisions as $rev ) {
203 $firstRevId = $firstRevId ? min( $firstRevId, $rev->getId() ) : $rev->getId();
204 $lastRevId = max( $lastRevId, $rev->getId() );
209 $lastTimestamp = $rev->getTimestamp();
210 $batchSize = count( $revisions );
211 $revisionRowsScanned += $batchSize;
213 "\t- Scanned a batch of $batchSize revisions, "
214 .
"up to revision $lastRevId ($lastTimestamp)\n"
227 $maxArchived = $maxArchived ?: PHP_INT_MAX;
229 $this->
output(
"Scanning archive table by ar_rev_id, $fromArchived to $maxArchived\n" );
230 while ( $firstRevId > 0 && $fromArchived < $maxArchived ) {
231 $batchSize = min( $total - $archiveRowsScanned, $this->
getBatchSize() );
237 foreach ( $revisions as $rev ) {
240 $fromArchived = $rev->getId();
241 $batchSize = count( $revisions );
242 $archiveRowsScanned += $batchSize;
244 "\t- Scanned a batch of $batchSize archived revisions, "
245 .
"up to revision $fromArchived ($lastTimestamp)\n"
262 $db = $this->loadBalancer->getConnectionRef(
DB_REPLICA );
263 $queryInfo = $this->revisionStore->getQueryInfo();
264 $quotedTimestamp = $db->addQuotes( $fromTimestamp );
266 $queryInfo[
'tables'],
267 $queryInfo[
'fields'],
268 "rev_timestamp > $quotedTimestamp OR "
269 .
"(rev_timestamp = $quotedTimestamp AND rev_id > $afterId )",
272 'USE INDEX' => [
'revision' =>
'rev_timestamp' ],
273 'ORDER BY' =>
'rev_timestamp, rev_id',
274 'LIMIT' => $batchSize,
278 $result = $this->revisionStore->newRevisionsFromBatch( $rows, [
'slots' =>
true ] );
281 $records = array_filter( $result->value );
283 '@phan-var RevisionStoreRecord[] $records';
295 $db = $this->loadBalancer->getConnectionRef(
DB_REPLICA );
296 $queryInfo = $this->revisionStore->getArchiveQueryInfo();
298 $queryInfo[
'tables'],
299 $queryInfo[
'fields'],
300 [
"ar_rev_id > $afterId",
"ar_rev_id <= $uptoId" ],
302 [
'LIMIT' => $batchSize,
'ORDER BY' =>
'ar_rev_id' ],
305 $result = $this->revisionStore->newRevisionsFromBatch(
307 [
'archive' =>
true,
'slots' =>
true ]
311 $records = array_filter( $result->value );
313 '@phan-var RevisionArchiveRecord[] $records';
327 $db = $this->loadBalancer->getConnectionRef(
DB_REPLICA );
328 $next = $db->selectField(
331 "rev_id $comp $revId",
333 [
'ORDER BY' =>
"rev_id $dir" ]
345 $total = count( $ids );
347 $this->
output(
"Scanning $total ids\n" );
349 foreach ( array_chunk( $ids, $this->
getBatchSize() ) as $batch ) {
357 foreach ( $revisions as $rev ) {
361 $batchSize = count( $revisions );
362 $this->
output(
"\t- Scanned a batch of $batchSize revisions\n" );
374 $db = $this->loadBalancer->getConnectionRef(
DB_REPLICA );
375 $queryInfo = $this->revisionStore->getQueryInfo();
378 $queryInfo[
'tables'],
379 $queryInfo[
'fields'],
388 $result = $this->revisionStore->newRevisionsFromBatch( $rows, [
'slots' =>
true ] );
392 $revisions = array_filter( $result->value );
393 '@phan-var RevisionArchiveRecord[] $revisions';
396 if ( count( $revisions ) < count( $ids ) ) {
397 $archiveQueryInfo = $this->revisionStore->getArchiveQueryInfo();
398 $remainingIds = array_diff( $ids, array_keys( $revisions ) );
401 $archiveQueryInfo[
'tables'],
402 $archiveQueryInfo[
'fields'],
404 'ar_rev_id ' => $remainingIds,
408 $archiveQueryInfo[
'joins']
411 $archiveResult = $this->revisionStore->newRevisionsFromBatch(
413 [
'slots' =>
true,
'archive' =>
true ]
419 $revisions += array_filter( $archiveResult->value );
432 foreach ( $rev->
getSlots()->getSlots() as $slot ) {
433 $count += $this->
checkSlot( $rev, $slot );
436 if ( $count === 0 && $this->
hasOption(
'mark' ) ) {
437 $this->
output(
"\t# No bad blob found on revision {$rev->getId()}, skipped!\n" );
455 $this->blobStore->getBlob( $address );
458 }
catch ( Exception $ex ) {
459 $error = $ex->getMessage();
460 $type = get_class( $ex );
465 $this->
output(
"\t! Found bad blob on revision {$rev->getId()} "
466 .
"from {$rev->getTimestamp()} ({$slot->getRole()} slot): "
467 .
"content_id={$slot->getContentId()}, address=<{$slot->getAddress()}>, "
468 .
"error='$error', type='$type'. ID:\t{$rev->getId()}\n" );
471 $newAddress = $this->
markBlob( $rev, $slot, $error );
472 $this->
output(
"\tChanged address to <$newAddress>\n" );
493 $args[
'error'] = $error;
497 $badAddress =
'bad:' . urlencode( $address );
503 $badAddress = substr( $badAddress, 0, 255 );
505 $dbw = $this->loadBalancer->getConnectionRef(
DB_PRIMARY );
508 [
'content_address' => $badAddress ],
517 if ( !$status->
isOK() ) {
519 Status::wrap( $status )->getMessage(
false,
false,
'en' )->text()
522 if ( !$status->
isGood() ) {
524 "\t! " . Status::wrap( $status )->getMessage(
false,
false,
'en' )->text()
532require_once RUN_MAINTENANCE_IF_MAIN;
wfArrayToCgi( $array1, $array2=null, $prefix='')
This function takes one or two arrays as input, and returns a CGI-style string, e....
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
Maintenance script for finding and marking bad content blobs.
BlobStore null $blobStore
__construct()
Default constructor.
scanRevisionsById(array $ids)
LoadBalancer null $loadBalancer
loadRevisionsById(array $ids)
initializeServices(?RevisionStore $revisionStore=null, ?BlobStore $blobStore=null, ?LoadBalancer $loadBalancer=null, ?LBFactory $lbFactory=null)
checkSlot(RevisionRecord $rev, SlotRecord $slot)
checkRevision(RevisionRecord $rev)
handleStatus(StatusValue $status)
RevisionStore null $revisionStore
loadRevisionsByTimestamp(int $afterId, string $fromTimestamp, $batchSize)
getNextRevision(int $revId, string $comp, string $dir)
Returns the revision ID next to $revId, according to $comp and $dir.
loadArchiveByRevisionId(int $afterId, int $uptoId, $batchSize)
scanRevisionsByTimestamp( $fromTimestamp, $total)
execute()
Do the actual work. All child classes will need to implement this. Returns bool|null|void: True for success,...
markBlob(RevisionRecord $rev, SlotRecord $slot, string $error=null)
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
error( $err, $die=0)
Throw an error to the user.
output( $out, $channel=null)
Throw some output to the user.
waitForReplication()
Wait for replica DBs to catch up.
hasOption( $name)
Checks to see if a particular option was set.
getBatchSize()
Returns batch size.
parseIntList( $text)
Utility function to parse a string (perhaps from a command line option) into a list of integers (perh...
addDescription( $text)
Set the description text.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
Generic operation result class Has warning/error list, boolean status and arbitrary value.
isOK()
Returns whether the operation completed.
isGood()
Returns whether the operation completed and didn't have any error or warnings.