// Call the parent constructor first, then register this script's description
// and command line options.
32 parent::__construct();
35 $this->
addDescription(
'Find and mark bad content blobs. Marked blobs will be read as empty. '
36 .
'Use --scan-from to find revisions with bad blobs, use --mark to mark them.' );
// --scan-from: start of the timestamp range to scan.
37 $this->
addOption(
'scan-from',
'Start scanning revisions at the given date. '
38 .
'Format: Anything supported by MediaWiki, e.g. YYYYMMDDHHMMSS or YYYY-MM-DDTHH:MM:SS',
// --scan-to: end of the timestamp range to scan.
40 $this->
addOption(
'scan-to',
'End of scan date range. '
41 .
'Format: Anything supported by MediaWiki, e.g. YYYYMMDDHHMMSS or YYYY-MM-DDTHH:MM:SS',
// --revisions: explicit list of revision IDs to process; the value "-" means
// the IDs are read from stdin (see getRevisionIds()).
// NOTE(review): the trailing (false, true) arguments presumably mean
// "not required, takes a value" — confirm against addOption()'s signature.
43 $this->
addOption(
'revisions',
'A list of revision IDs to process, separated by comma or '
44 .
'colon or whitespace. Revisions belonging to deleted pages will work. '
45 .
'If set to "-" IDs are read from stdin, one per line.',
false,
true );
// --limit: upper bound on the number of revisions scanned in --scan-from mode.
// The default of 1000 is applied where the option is read, in
// scanRevisionsByTimestamp().
46 $this->
addOption(
'limit',
'Maximum number of revisions for --scan-from to scan. '
47 .
'Default: 1000',
false,
true );
// --mark: the value is the reason recorded when a blob is marked as bad.
// execute() rejects --mark unless --revisions is also given.
48 $this->
addOption(
'mark',
'Mark the blob as "known bad", to avoid errors when '
49 .
'attempting to read it. The value given is the reason for marking the blob as bad, '
50 .
'typically a ticket ID. Requires --revisions to also be set.',
false,
true );
/**
 * Validate the timestamp that marks the start of the scan range.
 *
 * Values shorter than 14 characters are rejected with a message asking for
 * a date and time precise to the second. A fatal "Bad timestamp" error is
 * reported for values that cannot be accepted.
 *
 * NOTE(review): the line that assigns $tsOpt (presumably from --scan-from),
 * the condition guarding the "Bad timestamp" error and the return statement
 * are not visible here — confirm before relying on the returned format.
 */
56 private function getStartTimestamp() {
// 14 characters is the length of a YYYYMMDDHHMMSS timestamp.
58 if ( strlen( $tsOpt ) < 14 ) {
60 .
', please provide time and date down to the second.' );
65 $this->
fatalError(
'Bad timestamp: ' . $tsOpt );
/**
 * Validate the timestamp that marks the end of the scan range.
 *
 * Applies the same checks as getStartTimestamp(): values shorter than
 * 14 characters are rejected, and a fatal "Bad timestamp" error is reported
 * for values that cannot be accepted.
 *
 * NOTE(review): the line that assigns $tsOpt (presumably from --scan-to) and
 * the return statement are not visible here — confirm.
 *
 * @return string
 */
71 private function getEndTimestamp(): string {
// 14 characters is the length of a YYYYMMDDHHMMSS timestamp.
73 if ( strlen( $tsOpt ) < 14 ) {
75 .
', please provide time and date down to the second.' );
80 $this->
fatalError(
'Bad timestamp: ' . $tsOpt );
/**
 * Read the revision IDs given via --revisions.
 *
 * The raw option value is handed to parseIntList(), and the parsed list is
 * returned.
 *
 * @return mixed Whatever parseIntList() returns; execute() passes it on to
 *  scanRevisionsById(), which requires an array.
 */
89 private function getRevisionIds() {
90 $opt = $this->getOption(
'revisions' );
// Replace the option value with the full contents of stdin.
// NOTE(review): the guarding condition is not visible here; per the option's
// help text this applies when the value is "-" — confirm.
93 $opt = stream_get_contents( STDIN );
100 return $this->parseIntList( $opt );
// Script entry logic: pick the operating mode from the options, run the scan,
// then print a summary.
// Fetch the services used by the rest of the script.
107 $services = $this->getServiceContainer();
108 $this->revisionStore = $services->getRevisionStore();
109 $this->blobStore = $services->getBlobStore();
// Mode 1: explicit revision IDs. This cannot be combined with the
// timestamp-range options.
111 if ( $this->hasOption(
'revisions' ) ) {
112 if ( $this->hasOption(
'scan-from' ) || $this->hasOption(
'scan-to' ) ) {
113 $this->fatalError(
'Cannot use --revisions together with --scan-from or --scan-to' );
116 $ids = $this->getRevisionIds();
118 $count = $this->scanRevisionsById( $ids );
// Mode 2: scan a timestamp range. Marking is refused in this mode, so a scan
// never modifies data; --limit and --scan-to are mutually exclusive.
119 } elseif ( $this->hasOption(
'scan-from' ) ) {
120 if ( $this->hasOption(
'mark' ) ) {
121 $this->fatalError(
'Cannot use --mark with --scan-from, '
122 .
'use --revisions to specify revisions to mark.' );
125 if ( $this->hasOption(
'scan-to' ) && $this->hasOption(
'limit' ) ) {
126 $this->fatalError(
'Cannot use --limit with --scan-to' );
129 $count = $this->scanRevisionsByTimestamp();
130 $this->output(
"The range of archive rows scanned is based on the range of revision IDs "
131 .
"scanned in the revision table.\n" );
// No mode selected: report the most specific error available.
133 if ( $this->hasOption(
'mark' ) ) {
134 $this->fatalError(
'The --mark must be used together with --revisions' );
136 $this->fatalError(
'Must specify one of --revisions or --scan-from' );
// Summary: with --mark the count is reported as marked revisions, otherwise
// as found revisions.
140 if ( $this->hasOption(
'mark' ) ) {
141 $this->output(
"Marked $count bad revisions.\n" );
143 $this->output(
"Found $count bad revisions.\n" );
// Hint on turning the scan output into an ID list for --revisions. The cut
// command takes field 3, which matches the tab-separated ID that checkSlot()
// appends to each "Found bad blob" line.
146 $this->output(
"On a unix/linux environment, you can use grep and cut to list of IDs\n" );
147 $this->output(
"that can then be used with the --revisions option. E.g.\n" );
148 $this->output(
" grep '! Found bad blob' | cut -s -f 3\n" );
/**
 * Scan revisions by timestamp range, then scan the archive table for the
 * range of revision IDs that the first pass touched.
 *
 * Both passes run in batches, add the result of checkRevision() to a
 * running count and wait for replication after each batch.
 *
 * NOTE(review): the initialisation of $count, $firstRevId, $lastRevId and
 * of $total / $toTimestamp in the respective other branch, the handling of
 * empty batches and the return statement are not visible here — confirm.
 * execute() uses the return value as the number of bad revisions.
 */
156 private function scanRevisionsByTimestamp() {
157 $fromTimestamp = $this->getStartTimestamp();
// With --scan-to the scan is bounded by an end timestamp; otherwise it is
// bounded by a row count, --limit, which defaults to 1000.
158 if ( $this->getOption(
'scan-to' ) ) {
159 $toTimestamp = $this->getEndTimestamp();
161 $msg =
"Scanning revisions table, "
162 .
"starting at rev_timestamp $fromTimestamp until $toTimestamp\n";
165 $total = $this->getOption(
'limit', 1000 );
166 $msg =
"Scanning revisions table, "
167 .
"$total rows starting at rev_timestamp $fromTimestamp\n";
// Paging state: each batch continues after the last (timestamp, revision ID)
// pair seen.
173 $lastTimestamp = $fromTimestamp;
174 $revisionRowsScanned = 0;
175 $archiveRowsScanned = 0;
177 $this->output( $msg );
// Pass 1: the revision table.
179 while ( $revisionRowsScanned < $total ) {
// Never request more rows than remain under $total.
180 $batchSize = min( $total - $revisionRowsScanned, $this->getBatchSize() );
181 $revisions = $this->loadRevisionsByTimestamp( $lastRevId, $lastTimestamp, $batchSize, $toTimestamp );
186 foreach ( $revisions as $rev ) {
// Track the lowest and highest revision ID seen; they bound the archive scan
// below. A falsy $firstRevId is treated as "not set yet".
188 $firstRevId = $firstRevId ? min( $firstRevId, $rev->getId() ) : $rev->getId();
189 $lastRevId = max( $lastRevId, $rev->getId() );
191 $count += $this->checkRevision( $rev );
// Continue the next batch from the last revision processed. From here on
// $batchSize holds the number of rows actually returned.
194 $lastTimestamp = $rev->getTimestamp();
195 $batchSize = count( $revisions );
196 $revisionRowsScanned += $batchSize;
198 "\t- Scanned a batch of $batchSize revisions, "
199 .
"up to revision $lastRevId ($lastTimestamp)\n"
202 $this->waitForReplication();
// Pass 2: the archive table. The range is widened to the neighbouring
// revision IDs just outside [$firstRevId, $lastRevId], as returned by
// getNextRevision(). If there is no revision above $lastRevId, the upper
// bound is left open by using PHP_INT_MAX.
210 $fromArchived = $this->getNextRevision( $firstRevId,
'<',
'DESC' );
211 $maxArchived = $this->getNextRevision( $lastRevId,
'>',
'ASC' );
212 $maxArchived = $maxArchived ?: PHP_INT_MAX;
214 $this->output(
"Scanning archive table by ar_rev_id, $fromArchived to $maxArchived\n" );
// The archive scan is skipped when the revision pass found nothing
// ($firstRevId is not positive).
215 while ( $firstRevId > 0 && $fromArchived < $maxArchived ) {
216 $batchSize = min( $total - $archiveRowsScanned, $this->getBatchSize() );
217 $revisions = $this->loadArchiveByRevisionId( $fromArchived, $maxArchived, $batchSize );
222 foreach ( $revisions as $rev ) {
223 $count += $this->checkRevision( $rev );
// Continue the next batch after the last archived revision ID processed.
225 $fromArchived = $rev->getId();
226 $batchSize = count( $revisions );
227 $archiveRowsScanned += $batchSize;
// NOTE(review): $lastTimestamp is not updated by the visible lines of this
// loop, so the value printed here appears to be the last timestamp from the
// revision-table pass — confirm whether that is intended.
229 "\t- Scanned a batch of $batchSize archived revisions, "
230 .
"up to revision $fromArchived ($lastTimestamp)\n"
233 $this->waitForReplication();
/**
 * Load one batch of revisions from the revision table, paging by
 * (rev_timestamp, rev_id).
 *
 * @param int $afterId Revision ID of the last row already processed
 * @param string $fromTimestamp Timestamp of the last row already processed
 * @param int $batchSize Maximum number of rows to load
 * @param string|null $toTimestamp Exclusive upper bound on rev_timestamp;
 *  no upper bound is applied when this is falsy
 *
 * NOTE(review): the return statement is not visible here; callers iterate
 * over the result and count() it, so it is presumably $records — confirm.
 */
247 private function loadRevisionsByTimestamp(
int $afterId,
string $fromTimestamp, $batchSize, $toTimestamp ) {
248 $db = $this->getReplicaDB();
249 $queryBuilder = $this->revisionStore->newSelectQueryBuilder( $db )
// Only rows after the (timestamp, ID) position of the previous batch.
// NOTE(review): presumably buildComparison() compares the two fields as a
// tuple, in the order listed — confirm.
251 ->where( $db->buildComparison(
'>', [
252 'rev_timestamp' => $fromTimestamp,
253 'rev_id' => $afterId,
// The sort order matches the paging condition above, so consecutive batches
// continue where the previous one ended.
255 ->useIndex( [
'revision' =>
'rev_timestamp' ] )
256 ->orderBy( [
'rev_timestamp',
'rev_id' ] )
257 ->limit( $batchSize );
// Optional end of range; the comparison is strict, so $toTimestamp itself is
// excluded.
259 if ( $toTimestamp ) {
260 $queryBuilder->where( $db->expr(
'rev_timestamp',
'<', $toTimestamp ) );
263 $rows = $queryBuilder->caller( __METHOD__ )->fetchResultSet();
// Build revision records, including their slots, for the whole batch at once.
264 $result = $this->revisionStore->newRevisionsFromBatch( $rows, [
'slots' =>
true ] );
265 $this->handleStatus( $result );
// Drop falsy entries from the result value (array_filter keeps the keys).
267 $records = array_filter( $result->value );
// Type hint for static analysis only; this string statement has no runtime
// effect.
269 '@phan-var RevisionStoreRecord[] $records';
/**
 * Load one batch of archived revisions whose ar_rev_id lies in the range
 * ( $afterId, $uptoId ], ordered by ar_rev_id.
 *
 * @param int $afterId Exclusive lower bound on ar_rev_id
 * @param int $uptoId Inclusive upper bound on ar_rev_id
 * @param int $batchSize Maximum number of rows to load
 *
 * NOTE(review): the return statement is not visible here; callers iterate
 * over the result and count() it, so it is presumably $records — confirm.
 */
280 private function loadArchiveByRevisionId(
int $afterId,
int $uptoId, $batchSize ) {
281 $db = $this->getReplicaDB();
282 $rows = $this->revisionStore->newArchiveSelectQueryBuilder( $db )
284 ->where( [ $db->expr(
'ar_rev_id',
'>', $afterId ), $db->expr(
'ar_rev_id',
'<=', $uptoId ) ] )
285 ->orderBy(
'ar_rev_id' )
286 ->limit( $batchSize )
287 ->caller( __METHOD__ )->fetchResultSet();
// The 'archive' flag tells the revision store these are archive rows; slots
// are loaded along with the records.
288 $result = $this->revisionStore->newRevisionsFromBatch(
290 [
'archive' =>
true,
'slots' =>
true ]
292 $this->handleStatus( $result );
// Drop falsy entries from the result value (array_filter keeps the keys).
294 $records = array_filter( $result->value );
// Type hint for static analysis only; this string statement has no runtime
// effect.
296 '@phan-var RevisionArchiveRecord[] $records';
/**
 * Look up a revision ID adjacent to $revId, in the direction given by
 * $comp and $dir.
 *
 * @param int $revId Revision ID to start from
 * @param string $comp SQL comparison operator applied to rev_id; the callers
 *  in scanRevisionsByTimestamp() pass '<' or '>'
 * @param string $dir Sort direction for rev_id; the same callers pass
 *  'DESC' or 'ASC'
 *
 * NOTE(review): the selected field, the table and the return statement are
 * not visible here. scanRevisionsByTimestamp() treats a falsy result as
 * "no such revision" — confirm.
 */
309 private function getNextRevision(
int $revId,
string $comp,
string $dir ) {
310 $db = $this->getReplicaDB();
311 $next = $db->newSelectQueryBuilder()
// $comp is interpolated into the SQL condition as-is. That is only safe
// because it is an internal parameter that always receives a literal
// operator; $revId is constrained to int by the signature.
314 ->where(
"rev_id $comp $revId" )
315 ->orderBy( [
"rev_id" ], $dir )
316 ->caller( __METHOD__ )
/**
 * Check the revisions with the given IDs, processing them in chunks of the
 * configured batch size.
 *
 * @param array $ids Revision IDs to check
 *
 * NOTE(review): the initialisation of $count and the return statement are
 * not visible here; execute() uses the return value as the number of bad
 * revisions — confirm.
 */
326 private function scanRevisionsById( array $ids ) {
328 $total = count( $ids );
330 $this->output(
"Scanning $total ids\n" );
332 foreach ( array_chunk( $ids, $this->getBatchSize() ) as $batch ) {
333 $revisions = $this->loadRevisionsById( $batch );
340 foreach ( $revisions as $rev ) {
341 $count += $this->checkRevision( $rev );
// Report how many revisions were loaded, which can be fewer than the number
// of IDs in the chunk.
344 $batchSize = count( $revisions );
345 $this->output(
"\t- Scanned a batch of $batchSize revisions\n" );
/**
 * Load revision records for the given IDs, first from the revision table
 * and then, for any IDs not found there, from the archive table.
 *
 * @param array $ids Revision IDs to load
 *
 * NOTE(review): the return statement is not visible here;
 * scanRevisionsById() iterates over the result and count()s it, so it is
 * presumably $revisions — confirm.
 */
356 private function loadRevisionsById( array $ids ) {
357 $db = $this->getReplicaDB();
358 $queryBuilder = $this->revisionStore->newSelectQueryBuilder( $db );
360 $rows = $queryBuilder
362 ->where( [
'rev_id' => $ids ] )
363 ->caller( __METHOD__ )->fetchResultSet();
365 $result = $this->revisionStore->newRevisionsFromBatch( $rows, [
'slots' =>
true ] );
367 $this->handleStatus( $result );
// Drop falsy entries from the result value (array_filter keeps the keys).
369 $revisions = array_filter( $result->value );
// Type hint for static analysis only; no runtime effect.
// NOTE(review): this names RevisionArchiveRecord although the rows come from
// the revision table; loadRevisionsByTimestamp() annotates the equivalent
// result as RevisionStoreRecord[] — confirm which is intended.
370 '@phan-var RevisionArchiveRecord[] $revisions';
// Fewer records than requested IDs: look for the rest in the archive table.
373 if ( count( $revisions ) < count( $ids ) ) {
374 $rows = $this->revisionStore->newArchiveSelectQueryBuilder( $db )
// The missing IDs are those that are not keys of $revisions; this relies on
// the batch result being keyed by revision ID.
376 ->where( [
'ar_rev_id' => array_diff( $ids, array_keys( $revisions ) ) ] )
377 ->caller( __METHOD__ )->fetchResultSet();
379 $archiveResult = $this->revisionStore->newRevisionsFromBatch(
381 [
'slots' =>
true,
'archive' =>
true ]
384 $this->handleStatus( $archiveResult );
// Array union: keeps the entries already loaded and adds the archived ones
// under their own keys.
387 $revisions += array_filter( $archiveResult->value );
// Check every slot of the revision and add up the results of checkSlot().
// NOTE(review): the enclosing function header, the initialisation of $count
// and the return statement are not visible here; callers add the result to
// their bad-revision count.
400 foreach ( $rev->
getSlots()->getSlots() as $slot ) {
401 $count += $this->checkSlot( $rev, $slot );
// In --mark mode, say explicitly when a requested revision had nothing to
// mark.
404 if ( $count === 0 && $this->hasOption(
'mark' ) ) {
405 $this->output(
"\t# No bad blob found on revision {$rev->getId()}, skipped!\n" );
// Try to load the slot's blob. A blob counts as bad when loading throws or
// when its content fails the encoding check.
// NOTE(review): the enclosing function header, the assignment of $address,
// the opening of the try block and the return statements are not visible
// here.
421 $blob = $this->blobStore->getBlob( $address );
// No encoding argument is passed, so mbstring's internal encoding is used;
// the error text below assumes that encoding is UTF-8.
422 if ( mb_check_encoding( $blob ) ) {
426 $type =
'invalid-utf-8';
427 $error =
'Invalid UTF-8';
// Loading failed: report the exception message as the error and the
// exception class as the type.
429 }
catch ( Exception $ex ) {
430 $error = $ex->getMessage();
431 $type = get_class( $ex );
// Report the bad blob. The revision ID is repeated after a tab at the end of
// the line, which is the field the "cut -s -f 3" hint printed by execute()
// extracts.
436 $this->output(
"\t! Found bad blob on revision {$rev->getId()} "
437 .
"from {$rev->getTimestamp()} ({$slot->getRole()} slot): "
438 .
"content_id={$slot->getContentId()}, address=<{$slot->getAddress()}>, "
439 .
"error='$error', type='$type'. ID:\t{$rev->getId()}\n" );
// In --mark mode, rewrite the slot's content address and report the new one.
441 if ( $this->hasOption(
'mark' ) ) {
442 $newAddress = $this->markBlob( $slot, $error );
443 $this->output(
"\tChanged address to <$newAddress>\n" );
/**
 * Mark the blob of the given slot as bad by replacing its content address
 * in the content table with a "bad:" address.
 *
 * @param SlotRecord $slot Slot whose content address is rewritten
 * @param string|null $error Error text to record with the bad address
 *
 * NOTE(review): the initialisation of $args and $address, the way $args is
 * appended to the address, the WHERE condition of the update and the return
 * statement are not visible here. checkSlot() prints the return value as
 * the new address — confirm.
 */
455 private function markBlob(
SlotRecord $slot, ?
string $error =
null ) {
// Record the reason given on the command line.
458 if ( $this->hasOption(
'mark' ) ) {
459 $args[
'reason'] = $this->getOption(
'mark' );
463 $args[
'error'] = $error;
// The original address is URL-encoded and kept behind the "bad:" prefix.
467 $badAddress =
'bad:' . urlencode( $address );
// Truncate to 255 bytes.
// NOTE(review): presumably the size limit of the content_address column —
// confirm.
473 $badAddress = substr( $badAddress, 0, 255 );
// Writes go to the primary database.
475 $dbw = $this->getPrimaryDB();
476 $dbw->newUpdateQueryBuilder()
477 ->update(
'content' )
478 ->set( [
'content_address' => $badAddress ] )
480 ->caller( __METHOD__ )->execute();
/**
 * React to the status returned by a batch load: a status that is not OK is
 * passed to fatalError(), and a status that is not good is passed to
 * error().
 *
 * @param StatusValue $status Status to inspect
 */
485 private function handleStatus(
StatusValue $status ) {
// Not OK: hand the status to fatalError().
486 if ( !$status->
isOK() ) {
487 $this->fatalError( $status );
// Not good: hand the status to error().
// NOTE(review): presumably this covers statuses that are OK but carry
// warnings, after which the script continues — confirm.
489 if ( !$status->
isGood() ) {
490 $this->error( $status );