Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 216 |
|
0.00% |
0 / 14 |
CRAP | |
0.00% |
0 / 1 |
FindBadBlobs | |
0.00% |
0 / 216 |
|
0.00% |
0 / 14 |
2450 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
2 | |||
getStartTimestamp | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
getRevisionIds | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
execute | |
0.00% |
0 / 27 |
|
0.00% |
0 / 1 |
72 | |||
scanRevisionsByTimestamp | |
0.00% |
0 / 45 |
|
0.00% |
0 / 1 |
110 | |||
loadRevisionsByTimestamp | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
2 | |||
loadArchiveByRevisionId | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
2 | |||
getNextRevision | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
scanRevisionsById | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
20 | |||
loadRevisionsById | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
6 | |||
checkRevision | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
20 | |||
checkSlot | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
12 | |||
markBlob | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
30 | |||
handleStatus | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | * @file |
19 | * @ingroup Maintenance |
20 | */ |
21 | |
22 | use MediaWiki\Maintenance\Maintenance; |
23 | use MediaWiki\Revision\RevisionArchiveRecord; |
24 | use MediaWiki\Revision\RevisionRecord; |
25 | use MediaWiki\Revision\RevisionStore; |
26 | use MediaWiki\Revision\RevisionStoreRecord; |
27 | use MediaWiki\Revision\SlotRecord; |
28 | use MediaWiki\Storage\BlobStore; |
29 | |
30 | // @codeCoverageIgnoreStart |
31 | require_once __DIR__ . '/Maintenance.php'; |
32 | // @codeCoverageIgnoreEnd |
33 | |
34 | /** |
35 | * Maintenance script for finding and marking bad content blobs. |
36 | * |
37 | * @ingroup Maintenance |
38 | */ |
39 | class FindBadBlobs extends Maintenance { |
40 | |
41 | private RevisionStore $revisionStore; |
42 | private BlobStore $blobStore; |
43 | |
/**
 * Sets the default batch size and registers the script description
 * and its command line options.
 */
public function __construct() {
	parent::__construct();

	$this->setBatchSize( 1000 );

	$description = 'Find and mark bad content blobs. Marked blobs will be read as empty. '
		. 'Use --scan-from to find revisions with bad blobs, use --mark to mark them.';
	$this->addDescription( $description );

	// Option name => help text, listed in registration order.
	$optionHelp = [
		'scan-from' => 'Start scanning revisions at the given date. '
			. 'Format: Anything supported by MediaWiki, e.g. YYYYMMDDHHMMSS or YYYY-MM-DDTHH:MM:SS',
		'revisions' => 'A list of revision IDs to process, separated by comma or '
			. 'colon or whitespace. Revisions belonging to deleted pages will work. '
			. 'If set to "-" IDs are read from stdin, one per line.',
		'limit' => 'Maximum number of revisions for --scan-from to scan. '
			. 'Default: 1000',
		'mark' => 'Mark the blob as "known bad", to avoid errors when '
			. 'attempting to read it. The value given is the reason for marking the blob as bad, '
			. 'typically a ticket ID. Requires --revisions to also be set.',
	];

	foreach ( $optionHelp as $optionName => $helpText ) {
		// Every option is registered with the same flags (false, true);
		// presumably "not required" and "takes a value" - confirm against Maintenance::addOption().
		$this->addOption( $optionName, $helpText, false, true );
	}
}
62 | |
/**
 * Reads the --scan-from option and converts it with wfTimestamp( TS_MW, ... ).
 *
 * Values shorter than 14 characters, and values wfTimestamp() cannot convert,
 * are reported through fatalError().
 *
 * @return string
 */
private function getStartTimestamp() {
	$rawValue = $this->getOption( 'scan-from' );

	// 14 characters is the length of YYYYMMDDHHMMSS, the shortest accepted input.
	if ( strlen( $rawValue ) < 14 ) {
		$this->fatalError(
			"Bad timestamp: {$rawValue}, please provide time and date down to the second."
		);
	}

	$converted = wfTimestamp( TS_MW, $rawValue );
	if ( !$converted ) {
		$this->fatalError( "Bad timestamp: {$rawValue}" );
	}

	return $converted;
}
80 | |
/**
 * Returns the revision IDs given via --revisions.
 *
 * When the option value is "-", the list is read from STDIN instead;
 * empty input then yields an empty list.
 *
 * @return int[]
 */
private function getRevisionIds() {
	$list = $this->getOption( 'revisions' );

	if ( $list !== '-' ) {
		return $this->parseIntList( $list );
	}

	$list = stream_get_contents( STDIN );

	return $list ? $this->parseIntList( $list ) : [];
}
97 | |
/**
 * Validates the option combination, runs the requested scan
 * (--revisions or --scan-from) and prints a summary.
 *
 * --revisions and --scan-from are mutually exclusive, and --mark is
 * only accepted together with --revisions.
 *
 * @inheritDoc
 */
public function execute() {
	$services = $this->getServiceContainer();
	$this->revisionStore = $services->getRevisionStore();
	$this->blobStore = $services->getBlobStore();

	if ( $this->hasOption( 'revisions' ) ) {
		if ( $this->hasOption( 'scan-from' ) ) {
			$this->fatalError( 'Cannot use --revisions together with --scan-from' );
		}

		$ids = $this->getRevisionIds();

		$count = $this->scanRevisionsById( $ids );
	} elseif ( $this->hasOption( 'scan-from' ) ) {
		if ( $this->hasOption( 'mark' ) ) {
			$this->fatalError( 'Cannot use --mark with --scan-from, '
				. 'use --revisions to specify revisions to mark.' );
		}

		$fromTimestamp = $this->getStartTimestamp();

		// Command line values arrive as strings. Validate and convert here so
		// that scanRevisionsByTimestamp() really receives the int it documents,
		// instead of failing later in arithmetic or silently scanning nothing.
		$limit = $this->getOption( 'limit', 1000 );
		if ( !is_numeric( $limit ) || (int)$limit < 1 ) {
			$this->fatalError( "Bad limit: {$limit}, please provide a positive integer." );
		}
		$total = (int)$limit;

		$count = $this->scanRevisionsByTimestamp( $fromTimestamp, $total );

		$this->output( "The range of archive rows scanned is based on the range of revision IDs "
			. "scanned in the revision table.\n" );
	} else {
		if ( $this->hasOption( 'mark' ) ) {
			$this->fatalError( 'The --mark must be used together with --revisions' );
		} else {
			$this->fatalError( 'Must specify one of --revisions or --scan-from' );
		}
	}

	if ( $this->hasOption( 'mark' ) ) {
		$this->output( "Marked $count bad revisions.\n" );
	} else {
		$this->output( "Found $count bad revisions.\n" );

		if ( $count > 0 ) {
			// Hint on how to turn this script's output into input for --revisions.
			$this->output( "On a unix/linux environment, you can use grep and cut to list of IDs\n" );
			$this->output( "that can then be used with the --revisions option. E.g.\n" );
			$this->output( "  grep '! Found bad blob' | cut -s -f 3\n" );
		}
	}
}
147 | |
/**
 * Scans up to $total rows of the revision table, ordered by timestamp and
 * starting at $fromTimestamp, then scans the archive table for the revision
 * ID range that was covered, checking every revision found for bad blobs.
 *
 * @param string $fromTimestamp Timestamp to start scanning the revision table at
 * @param int $total Maximum number of rows to scan, applied to each table separately
 *
 * @return int Sum of the values returned by checkRevision()
 */
private function scanRevisionsByTimestamp( $fromTimestamp, $total ) {
	$count = 0;
	$lastRevId = 0;
	$firstRevId = 0;
	// Paging cursor: (timestamp, rev_id) of the last row of the previous batch.
	// The ID part must be the ID of that row, NOT the highest ID seen so far:
	// rows are sorted by timestamp, so a higher ID may already have been seen
	// at an earlier timestamp, and using it would skip rows that share the
	// last timestamp but have a lower ID.
	$cursorRevId = 0;
	$lastTimestamp = $fromTimestamp;
	$revisionRowsScanned = 0;
	$archiveRowsScanned = 0;

	$this->output( "Scanning revisions table, "
		. "$total rows starting at rev_timestamp $fromTimestamp\n" );

	while ( $revisionRowsScanned < $total ) {
		$batchSize = min( $total - $revisionRowsScanned, $this->getBatchSize() );
		$revisions = $this->loadRevisionsByTimestamp( $cursorRevId, $lastTimestamp, $batchSize );
		if ( !$revisions ) {
			break;
		}

		foreach ( $revisions as $rev ) {
			// we are sorting by timestamp, so we may encounter revision IDs out of sequence
			$revId = $rev->getId();
			$firstRevId = $firstRevId ? min( $firstRevId, $revId ) : $revId;
			$lastRevId = max( $lastRevId, $revId );

			$count += $this->checkRevision( $rev );
		}

		// $rev is now the last record of the batch; this assumes the records
		// come back in query order (timestamp, then ID).
		$lastTimestamp = $rev->getTimestamp();
		$cursorRevId = (int)$rev->getId();
		$batchSize = count( $revisions );
		$revisionRowsScanned += $batchSize;
		$this->output(
			"\t- Scanned a batch of $batchSize revisions, "
			. "up to revision $lastRevId ($lastTimestamp)\n"
		);

		$this->waitForReplication();
	}

	// NOTE: the archive table isn't indexed by timestamp, so the best we can do is use the
	// revision ID just before the first revision ID we found above as the starting point
	// of the scan, and scan up to one revision after the last revision ID we found above.
	// If $firstRevId is 0, the loop body above didn't execute,
	// so we should skip the one below as well.
	$fromArchived = $this->getNextRevision( $firstRevId, '<', 'DESC' );
	$maxArchived = $this->getNextRevision( $lastRevId, '>', 'ASC' );
	$maxArchived = $maxArchived ?: PHP_INT_MAX;

	$this->output( "Scanning archive table by ar_rev_id, $fromArchived to $maxArchived\n" );
	// Stop explicitly once $total archive rows were scanned, rather than
	// issuing one more query with a batch size of zero.
	while ( $firstRevId > 0 && $fromArchived < $maxArchived && $archiveRowsScanned < $total ) {
		$batchSize = min( $total - $archiveRowsScanned, $this->getBatchSize() );
		$revisions = $this->loadArchiveByRevisionId( $fromArchived, $maxArchived, $batchSize );
		if ( !$revisions ) {
			break;
		}
		/** @var RevisionRecord $rev */
		foreach ( $revisions as $rev ) {
			$count += $this->checkRevision( $rev );
		}
		// Continue after the last archived revision of this batch, and report
		// that revision's own timestamp rather than the one left over from
		// the revision table scan above.
		$fromArchived = (int)$rev->getId();
		$archivedTimestamp = $rev->getTimestamp();
		$batchSize = count( $revisions );
		$archiveRowsScanned += $batchSize;
		$this->output(
			"\t- Scanned a batch of $batchSize archived revisions, "
			. "up to revision $fromArchived ($archivedTimestamp)\n"
		);

		$this->waitForReplication();
	}

	return $count;
}
224 | |
/**
 * Loads one batch of revisions from the revision table, ordered by
 * rev_timestamp and rev_id, restricted by a '>' comparison on the pair
 * ( $fromTimestamp, $afterId ).
 *
 * Entries of the batch result that are empty are dropped.
 *
 * @param int $afterId rev_id part of the comparison
 * @param string $fromTimestamp rev_timestamp part of the comparison
 * @param int $batchSize Maximum number of rows to load
 *
 * @return RevisionStoreRecord[]
 */
private function loadRevisionsByTimestamp( int $afterId, string $fromTimestamp, $batchSize ) {
	$dbr = $this->getReplicaDB();

	$afterCursor = $dbr->buildComparison( '>', [
		'rev_timestamp' => $fromTimestamp,
		'rev_id' => $afterId,
	] );

	$res = $this->revisionStore->newSelectQueryBuilder( $dbr )
		->joinComment()
		->where( $afterCursor )
		->useIndex( [ 'revision' => 'rev_timestamp' ] )
		->orderBy( [ 'rev_timestamp', 'rev_id' ] )
		->limit( $batchSize )
		->caller( __METHOD__ )
		->fetchResultSet();

	$status = $this->revisionStore->newRevisionsFromBatch( $res, [ 'slots' => true ] );
	$this->handleStatus( $status );

	$records = array_filter( $status->value );

	'@phan-var RevisionStoreRecord[] $records';
	return $records;
}
252 | |
/**
 * Loads one batch of archived revisions with
 * $afterId < ar_rev_id <= $uptoId, ordered by ar_rev_id.
 *
 * Entries of the batch result that are empty are dropped.
 *
 * @param int $afterId Exclusive lower bound for ar_rev_id
 * @param int $uptoId Inclusive upper bound for ar_rev_id
 * @param int $batchSize Maximum number of rows to load
 *
 * @return RevisionArchiveRecord[]
 */
private function loadArchiveByRevisionId( int $afterId, int $uptoId, $batchSize ) {
	$dbr = $this->getReplicaDB();

	$idRange = [
		$dbr->expr( 'ar_rev_id', '>', $afterId ),
		$dbr->expr( 'ar_rev_id', '<=', $uptoId ),
	];

	$res = $this->revisionStore->newArchiveSelectQueryBuilder( $dbr )
		->joinComment()
		->where( $idRange )
		->orderBy( 'ar_rev_id' )
		->limit( $batchSize )
		->caller( __METHOD__ )
		->fetchResultSet();

	$batchOptions = [ 'archive' => true, 'slots' => true ];
	$status = $this->revisionStore->newRevisionsFromBatch( $res, $batchOptions );
	$this->handleStatus( $status );

	$records = array_filter( $status->value );

	'@phan-var RevisionArchiveRecord[] $records';
	return $records;
}
279 | |
/**
 * Returns the revision ID next to $revId in the revision table,
 * according to $comp and $dir.
 *
 * Use '<' with 'DESC' to get the closest lower ID, and '>' with 'ASC'
 * to get the closest higher ID.
 *
 * @param int $revId
 * @param string $comp the comparator, either '<' or '>', to go with $dir
 * @param string $dir the sort direction to go with $comp, either 'ASC' or 'DESC'
 *
 * @return int The neighbouring revision ID; 0 when the query yields no
 *  value, since the fetched field is cast to int
 */
private function getNextRevision( int $revId, string $comp, string $dir ) {
	$db = $this->getReplicaDB();
	$next = $db->newSelectQueryBuilder()
		->select( 'rev_id' )
		->from( 'revision' )
		// Build the condition with expr(), like loadArchiveByRevisionId() does,
		// rather than interpolating the comparator and value into raw SQL.
		->where( $db->expr( 'rev_id', $comp, $revId ) )
		->orderBy( [ "rev_id" ], $dir )
		->caller( __METHOD__ )
		->fetchField();
	return (int)$next;
}
300 | |
/**
 * Checks the revisions with the given IDs for bad blobs, processing
 * the IDs in chunks of the configured batch size.
 *
 * @param array $ids Revision IDs to check
 *
 * @return int Sum of the values returned by checkRevision()
 */
private function scanRevisionsById( array $ids ) {
	$this->output( 'Scanning ' . count( $ids ) . " ids\n" );

	$badCount = 0;
	$chunks = array_chunk( $ids, $this->getBatchSize() );

	foreach ( $chunks as $chunk ) {
		$records = $this->loadRevisionsById( $chunk );

		// Chunks for which nothing could be loaded are not reported.
		if ( $records ) {
			/** @var RevisionRecord $record */
			foreach ( $records as $record ) {
				$badCount += $this->checkRevision( $record );
			}

			$this->output( "\t- Scanned a batch of " . count( $records ) . " revisions\n" );
		}
	}

	return $badCount;
}
330 | |
/**
 * Loads the revisions with the given IDs from the revision table; IDs not
 * found there are then looked up in the archive table.
 *
 * The result is assumed to be keyed by revision ID, which is what the
 * lookup of missing IDs below relies on.
 *
 * @param int[] $ids
 *
 * @return RevisionRecord[]
 */
private function loadRevisionsById( array $ids ) {
	// Nothing to look up; also avoids building a condition on an empty list.
	if ( !$ids ) {
		return [];
	}

	$db = $this->getReplicaDB();
	$queryBuilder = $this->revisionStore->newSelectQueryBuilder( $db );

	$rows = $queryBuilder
		->joinComment()
		->where( [ 'rev_id' => $ids ] )
		->caller( __METHOD__ )->fetchResultSet();

	$result = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );

	$this->handleStatus( $result );

	$revisions = array_filter( $result->value );
	'@phan-var RevisionRecord[] $revisions';

	// If not all revisions were found, check the archive table.
	// Decide this on the set of missing IDs rather than by comparing counts:
	// when $ids contains duplicates the counts differ although nothing is
	// missing, and the archive query would get an empty list of IDs.
	$missingIds = array_diff( $ids, array_keys( $revisions ) );
	if ( $missingIds ) {
		$rows = $this->revisionStore->newArchiveSelectQueryBuilder( $db )
			->joinComment()
			->where( [ 'ar_rev_id' => $missingIds ] )
			->caller( __METHOD__ )->fetchResultSet();

		$archiveResult = $this->revisionStore->newRevisionsFromBatch(
			$rows,
			[ 'slots' => true, 'archive' => true ]
		);

		$this->handleStatus( $archiveResult );

		// don't use array_merge, since it will re-index
		$revisions += array_filter( $archiveResult->value );
	}

	return $revisions;
}
372 | |
/**
 * Checks every slot of the given revision and adds up the results
 * of checkSlot().
 *
 * When --mark is set and no bad blob was found, a note is printed
 * saying that the revision was skipped.
 *
 * @param RevisionRecord $rev
 *
 * @return int Sum of the values returned by checkSlot()
 */
private function checkRevision( RevisionRecord $rev ) {
	$badSlots = 0;
	$slotRecords = $rev->getSlots()->getSlots();

	foreach ( $slotRecords as $slotRecord ) {
		$badSlots += $this->checkSlot( $rev, $slotRecord );
	}

	if ( $badSlots !== 0 || !$this->hasOption( 'mark' ) ) {
		return $badSlots;
	}

	$revId = $rev->getId();
	$this->output( "\t# No bad blob found on revision {$revId}, skipped!\n" );

	return $badSlots;
}
390 | |
/**
 * Tries to load the blob of the given slot. If loading throws, the
 * problem is reported and, when --mark is set, the blob is marked as bad.
 *
 * @param RevisionRecord $rev Revision the slot belongs to, used for reporting
 * @param SlotRecord $slot
 *
 * @return int 0 if the blob could be loaded, 1 otherwise
 */
private function checkSlot( RevisionRecord $rev, SlotRecord $slot ) {
	$address = $slot->getAddress();

	try {
		$this->blobStore->getBlob( $address );
		// nothing to do
		return 0;
	} catch ( Exception $failure ) {
		// Fall through to the reporting below.
	}

	$error = $failure->getMessage();
	$type = get_class( $failure );
	$revId = $rev->getId();

	// NOTE: the revision ID is repeated at the end, in a tab separated column of
	// its own, so it can easily be extracted with the "cut" shell command.
	$this->output( sprintf(
		"\t! Found bad blob on revision %s from %s (%s slot): "
			. "content_id=%s, address=<%s>, error='%s', type='%s'. ID:\t%s\n",
		$revId,
		$rev->getTimestamp(),
		$slot->getRole(),
		$slot->getContentId(),
		$address,
		$error,
		$type,
		$revId
	) );

	if ( $this->hasOption( 'mark' ) ) {
		$newAddress = $this->markBlob( $slot, $error );
		$this->output( "\tChanged address to <$newAddress>\n" );
	}

	return 1;
}
423 | |
/**
 * Rewrites the content address of the given slot to a "bad:" address
 * and stores it in the content table.
 *
 * The new address is "bad:" followed by the URL-encoded old address
 * ("empty" if there was none), plus a query string holding the --mark
 * value as "reason" and the given error message as "error", where
 * available. It is cut off at 255 bytes.
 *
 * @param SlotRecord $slot
 * @param string|null $error Error message to record in the new address
 *
 * @return false|string The new address, as written to the database
 */
private function markBlob( SlotRecord $slot, ?string $error = null ) {
	$query = [];

	if ( $this->hasOption( 'mark' ) ) {
		$query['reason'] = $this->getOption( 'mark' );
	}

	if ( $error ) {
		$query['error'] = $error;
	}

	$oldAddress = $slot->getAddress() ?: 'empty';
	$queryString = $query ? '?' . wfArrayToCgi( $query ) : '';

	// Cut off at 255 bytes; presumably the size of the content_address
	// column - confirm against the schema.
	$markedAddress = substr( 'bad:' . urlencode( $oldAddress ) . $queryString, 0, 255 );

	$update = $this->getPrimaryDB()->newUpdateQueryBuilder()
		->update( 'content' )
		->set( [ 'content_address' => $markedAddress ] )
		->where( [ 'content_id' => $slot->getContentId() ] )
		->caller( __METHOD__ );
	$update->execute();

	return $markedAddress;
}
459 | |
/**
 * Reports problems recorded in the given status.
 *
 * A status that is not OK is passed to fatalError(); a status that is
 * not good is passed to error().
 *
 * @param StatusValue $status
 */
private function handleStatus( StatusValue $status ) {
	if ( !$status->isOK() ) {
		$this->fatalError( $status );
	}

	if ( $status->isGood() ) {
		return;
	}

	$this->error( $status );
}
468 | |
469 | } |
470 | |
471 | // @codeCoverageIgnoreStart |
472 | $maintClass = FindBadBlobs::class; |
473 | require_once RUN_MAINTENANCE_IF_MAIN; |
474 | // @codeCoverageIgnoreEnd |