Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 224 |
|
0.00% |
0 / 14 |
CRAP | |
0.00% |
0 / 1 |
FindBadBlobs | |
0.00% |
0 / 221 |
|
0.00% |
0 / 14 |
2450 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
2 | |||
getStartTimestamp | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
getRevisionIds | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
execute | |
0.00% |
0 / 28 |
|
0.00% |
0 / 1 |
72 | |||
scanRevisionsByTimestamp | |
0.00% |
0 / 45 |
|
0.00% |
0 / 1 |
110 | |||
loadRevisionsByTimestamp | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
2 | |||
loadArchiveByRevisionId | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
2 | |||
getNextRevision | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
scanRevisionsById | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
20 | |||
loadRevisionsById | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
6 | |||
checkRevision | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
20 | |||
checkSlot | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
12 | |||
markBlob | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
30 | |||
handleStatus | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | * @file |
19 | * @ingroup Maintenance |
20 | */ |
21 | |
22 | use MediaWiki\Revision\RevisionArchiveRecord; |
23 | use MediaWiki\Revision\RevisionRecord; |
24 | use MediaWiki\Revision\RevisionStore; |
25 | use MediaWiki\Revision\RevisionStoreRecord; |
26 | use MediaWiki\Revision\SlotRecord; |
27 | use MediaWiki\Status\Status; |
28 | use MediaWiki\Storage\BlobStore; |
29 | |
30 | require_once __DIR__ . '/Maintenance.php'; |
31 | |
32 | /** |
33 | * Maintenance script for finding and marking bad content blobs. |
34 | * |
35 | * @ingroup Maintenance |
36 | */ |
37 | class FindBadBlobs extends Maintenance { |
38 | |
39 | private RevisionStore $revisionStore; |
40 | private BlobStore $blobStore; |
41 | |
/**
 * Sets the default batch size, the script description, and registers the
 * command line options. Every option is optional and takes a value.
 */
public function __construct() {
	parent::__construct();

	$this->setBatchSize( 1000 );
	$this->addDescription(
		'Find and mark bad content blobs. Marked blobs will be read as empty. '
		. 'Use --scan-from to find revisions with bad blobs, use --mark to mark them.'
	);

	// Option name => help text, in registration order.
	$valueOptions = [
		'scan-from' => 'Start scanning revisions at the given date. '
			. 'Format: Anything supported by MediaWiki, e.g. YYYYMMDDHHMMSS or YYYY-MM-DDTHH:MM:SS',
		'revisions' => 'A list of revision IDs to process, separated by comma or '
			. 'colon or whitespace. Revisions belonging to deleted pages will work. '
			. 'If set to "-" IDs are read from stdin, one per line.',
		'limit' => 'Maximum number of revisions for --scan-from to scan. '
			. 'Default: 1000',
		'mark' => 'Mark the blob as "known bad", to avoid errors when '
			. 'attempting to read it. The value given is the reason for marking the blob as bad, '
			. 'typically a ticket ID. Requires --revisions to also be set.',
	];

	foreach ( $valueOptions as $optionName => $helpText ) {
		// not required, takes an argument
		$this->addOption( $optionName, $helpText, false, true );
	}
}
60 | |
/**
 * Reads the --scan-from option and converts it to TS_MW format.
 *
 * Calls fatalError() when the value is shorter than 14 characters, or when
 * wfTimestamp() returns a falsy result for it.
 *
 * @return string
 */
private function getStartTimestamp() {
	$rawValue = $this->getOption( 'scan-from' );
	$complaint = 'Bad timestamp: ' . $rawValue;

	// Anything shorter than 14 characters cannot specify the time down to the second.
	if ( strlen( $rawValue ) < 14 ) {
		$this->fatalError( $complaint . ', please provide time and date down to the second.' );
	}

	$converted = wfTimestamp( TS_MW, $rawValue );
	if ( !$converted ) {
		$this->fatalError( $complaint );
	}

	return $converted;
}
78 | |
/**
 * Returns the revision IDs given via --revisions.
 *
 * The special value "-" makes the list be read from stdin instead; empty
 * stdin input yields an empty list.
 *
 * @return int[]
 */
private function getRevisionIds() {
	$listed = $this->getOption( 'revisions' );

	if ( $listed !== '-' ) {
		return $this->parseIntList( $listed );
	}

	$piped = stream_get_contents( STDIN );

	return $piped ? $this->parseIntList( $piped ) : [];
}
95 | |
/**
 * Validates the option combination, runs the requested scan, and reports
 * how many bad revisions were found or marked.
 *
 * - --revisions scans an explicit list of revision IDs (and marks bad blobs
 *   if --mark is also given); it cannot be combined with --scan-from.
 * - --scan-from scans revisions starting at a timestamp, limited by --limit;
 *   it cannot be combined with --mark.
 * - Any other combination aborts via fatalError().
 *
 * @inheritDoc
 */
public function execute() {
	$services = $this->getServiceContainer();
	$this->revisionStore = $services->getRevisionStore();
	$this->blobStore = $services->getBlobStore();
	$this->setDBProvider( $services->getConnectionProvider() );

	if ( $this->hasOption( 'revisions' ) ) {
		if ( $this->hasOption( 'scan-from' ) ) {
			$this->fatalError( 'Cannot use --revisions together with --scan-from' );
		}

		$ids = $this->getRevisionIds();

		$count = $this->scanRevisionsById( $ids );
	} elseif ( $this->hasOption( 'scan-from' ) ) {
		if ( $this->hasOption( 'mark' ) ) {
			$this->fatalError( 'Cannot use --mark with --scan-from, '
				. 'use --revisions to specify revisions to mark.' );
		}

		$fromTimestamp = $this->getStartTimestamp();

		// Option values arrive as strings from the command line. Reject anything
		// non-numeric up front, instead of failing later in the arithmetic of the
		// scan loop, and hand the scanner a real integer.
		$limit = $this->getOption( 'limit', 1000 );
		if ( !is_numeric( $limit ) ) {
			$this->fatalError( 'Bad limit: ' . $limit . ', please provide a number.' );
		}
		$total = (int)$limit;

		$count = $this->scanRevisionsByTimestamp( $fromTimestamp, $total );

		$this->output( "The range of archive rows scanned is based on the range of revision IDs "
			. "scanned in the revision table.\n" );
	} else {
		if ( $this->hasOption( 'mark' ) ) {
			$this->fatalError( 'The --mark must be used together with --revisions' );
		} else {
			$this->fatalError( 'Must specify one of --revisions or --scan-from' );
		}
	}

	if ( $this->hasOption( 'mark' ) ) {
		$this->output( "Marked $count bad revisions.\n" );
	} else {
		$this->output( "Found $count bad revisions.\n" );

		if ( $count > 0 ) {
			// Hint at how to feed the findings back into a --revisions --mark run.
			$this->output( "On a unix/linux environment, you can use grep and cut to list of IDs\n" );
			$this->output( "that can then be used with the --revisions option. E.g.\n" );
			$this->output( " grep '! Found bad blob' | cut -s -f 3\n" );
		}
	}
}
146 | |
/**
 * Scans up to $total rows of the revision table, starting at $fromTimestamp,
 * and then up to $total rows of the archive table covering the same range of
 * revision IDs. Every revision is checked for bad blobs.
 *
 * @param string $fromTimestamp Timestamp to start scanning at
 * @param int $total Maximum number of rows to scan per table
 *
 * @return int The number of bad blobs found
 */
private function scanRevisionsByTimestamp( $fromTimestamp, $total ) {
	$count = 0;
	// Smallest and largest revision ID encountered; they delimit the archive scan.
	$firstRevId = 0;
	$lastRevId = 0;
	// Pagination cursor: timestamp and ID of the last row of the previous batch.
	// This must be tracked separately from $lastRevId. $lastRevId is the maximum
	// ID seen so far and may be larger than the ID of the last row; using it as
	// the cursor would skip rows that share the last row's timestamp but have
	// a smaller ID.
	$cursorTimestamp = $fromTimestamp;
	$cursorRevId = 0;
	$revisionRowsScanned = 0;
	$archiveRowsScanned = 0;

	$this->output( "Scanning revisions table, "
		. "$total rows starting at rev_timestamp $fromTimestamp\n" );

	while ( $revisionRowsScanned < $total ) {
		$batchSize = min( $total - $revisionRowsScanned, $this->getBatchSize() );
		$revisions = $this->loadRevisionsByTimestamp( $cursorRevId, $cursorTimestamp, $batchSize );
		if ( !$revisions ) {
			break;
		}

		foreach ( $revisions as $rev ) {
			// we are sorting by timestamp, so we may encounter revision IDs out of sequence
			$firstRevId = $firstRevId ? min( $firstRevId, $rev->getId() ) : $rev->getId();
			$lastRevId = max( $lastRevId, $rev->getId() );

			$count += $this->checkRevision( $rev );
		}

		// $rev is now the last revision of the batch; continue right after it.
		$cursorTimestamp = $rev->getTimestamp();
		$cursorRevId = $rev->getId();
		$batchSize = count( $revisions );
		$revisionRowsScanned += $batchSize;
		$this->output(
			"\t- Scanned a batch of $batchSize revisions, "
			. "up to revision $lastRevId ($cursorTimestamp)\n"
		);

		$this->waitForReplication();
	}

	// NOTE: the archive table isn't indexed by timestamp, so the best we can do is use the
	// revision ID just before the first revision ID we found above as the starting point
	// of the scan, and scan up to one revision after the last revision ID we found above.
	// If $firstRevId is 0, the loop body above didn't execute,
	// so we should skip the one below as well.
	$fromArchived = $this->getNextRevision( $firstRevId, '<', 'DESC' );
	$maxArchived = $this->getNextRevision( $lastRevId, '>', 'ASC' );
	$maxArchived = $maxArchived ?: PHP_INT_MAX;

	$this->output( "Scanning archive table by ar_rev_id, $fromArchived to $maxArchived\n" );
	// Stop once $total archive rows were scanned, rather than issuing one more
	// query with a batch size of zero.
	while ( $firstRevId > 0 && $fromArchived < $maxArchived && $archiveRowsScanned < $total ) {
		$batchSize = min( $total - $archiveRowsScanned, $this->getBatchSize() );
		$revisions = $this->loadArchiveByRevisionId( $fromArchived, $maxArchived, $batchSize );
		if ( !$revisions ) {
			break;
		}
		/** @var RevisionRecord $rev */
		foreach ( $revisions as $rev ) {
			$count += $this->checkRevision( $rev );
		}
		// $rev is the last archived revision of the batch. Report its own timestamp,
		// not the one left over from the scan of the revision table.
		$fromArchived = $rev->getId();
		$archivedTimestamp = $rev->getTimestamp();
		$batchSize = count( $revisions );
		$archiveRowsScanned += $batchSize;
		$this->output(
			"\t- Scanned a batch of $batchSize archived revisions, "
			. "up to revision $fromArchived ($archivedTimestamp)\n"
		);

		$this->waitForReplication();
	}

	return $count;
}
223 | |
/**
 * Loads one batch of revisions from the revision table, ordered by
 * rev_timestamp and rev_id, selecting rows that compare greater than the
 * given ( $fromTimestamp, $afterId ) pair.
 *
 * A batch status that is not OK aborts the script; see handleStatus().
 *
 * @param int $afterId
 * @param string $fromTimestamp
 * @param int $batchSize Maximum number of rows to load
 *
 * @return RevisionStoreRecord[]
 */
private function loadRevisionsByTimestamp( int $afterId, string $fromTimestamp, $batchSize ) {
	$dbr = $this->getReplicaDB();

	$afterCursor = $dbr->buildComparison( '>', [
		'rev_timestamp' => $fromTimestamp,
		'rev_id' => $afterId,
	] );

	$res = $this->revisionStore->newSelectQueryBuilder( $dbr )
		->joinComment()
		->where( $afterCursor )
		->useIndex( [ 'revision' => 'rev_timestamp' ] )
		->orderBy( [ 'rev_timestamp', 'rev_id' ] )
		->limit( $batchSize )
		->caller( __METHOD__ )
		->fetchResultSet();

	$status = $this->revisionStore->newRevisionsFromBatch( $res, [ 'slots' => true ] );
	$this->handleStatus( $status );

	// Drop empty entries from the batch result.
	$records = array_filter( $status->value );
	'@phan-var RevisionStoreRecord[] $records';

	return $records;
}
251 | |
/**
 * Loads one batch of archived revisions with ar_rev_id greater than $afterId
 * and no greater than $uptoId, ordered by ar_rev_id.
 *
 * A batch status that is not OK aborts the script; see handleStatus().
 *
 * @param int $afterId Exclusive lower bound
 * @param int $uptoId Inclusive upper bound
 * @param int $batchSize Maximum number of rows to load
 *
 * @return RevisionArchiveRecord[]
 */
private function loadArchiveByRevisionId( int $afterId, int $uptoId, $batchSize ) {
	$dbr = $this->getReplicaDB();

	// Both bounds are ints (enforced by the signature), so interpolation is safe here.
	$idRange = [
		"ar_rev_id > $afterId",
		"ar_rev_id <= $uptoId",
	];

	$res = $this->revisionStore->newArchiveSelectQueryBuilder( $dbr )
		->joinComment()
		->where( $idRange )
		->orderBy( 'ar_rev_id' )
		->limit( $batchSize )
		->caller( __METHOD__ )
		->fetchResultSet();

	$loadOptions = [ 'archive' => true, 'slots' => true ];
	$status = $this->revisionStore->newRevisionsFromBatch( $res, $loadOptions );
	$this->handleStatus( $status );

	// Drop empty entries from the batch result.
	$records = array_filter( $status->value );
	'@phan-var RevisionArchiveRecord[] $records';

	return $records;
}
278 | |
/**
 * Returns the revision ID next to $revId in the revision table, according to
 * $comp and $dir. Use '<' with 'DESC' for the closest smaller ID, and '>'
 * with 'ASC' for the closest larger ID.
 *
 * @param int $revId
 * @param string $comp the comparator, either '<' or '>', to go with $dir
 * @param string $dir the sort direction to go with $comp, either 'ASC' or 'DESC'
 *
 * @return int The neighbouring revision ID, or 0 if there is none
 */
private function getNextRevision( int $revId, string $comp, string $dir ) {
	$db = $this->getReplicaDB();
	$next = $db->newSelectQueryBuilder()
		->select( 'rev_id' )
		->from( 'revision' )
		// Let the database layer build the condition instead of interpolating
		// the comparator into raw SQL.
		->where( $db->buildComparison( $comp, [ 'rev_id' => $revId ] ) )
		->orderBy( [ 'rev_id' ], $dir )
		->caller( __METHOD__ )
		->fetchField();

	// fetchField() yields false when no row matches, which casts to 0.
	return (int)$next;
}
299 | |
/**
 * Checks the revisions with the given IDs for bad blobs, processing the IDs
 * in chunks of the configured batch size.
 *
 * @param array $ids
 *
 * @return int The number of bad blobs found
 */
private function scanRevisionsById( array $ids ) {
	$total = count( $ids );
	$this->output( "Scanning $total ids\n" );

	$badCount = 0;
	$chunks = array_chunk( $ids, $this->getBatchSize() );

	foreach ( $chunks as $chunk ) {
		$records = $this->loadRevisionsById( $chunk );
		$batchSize = count( $records );

		// Nothing could be loaded for this chunk, so there is nothing to report.
		if ( $batchSize === 0 ) {
			continue;
		}

		/** @var RevisionRecord $record */
		foreach ( $records as $record ) {
			$badCount += $this->checkRevision( $record );
		}

		$this->output( "\t- Scanned a batch of $batchSize revisions\n" );
	}

	return $badCount;
}
329 | |
/**
 * Loads the revisions with the given IDs. IDs not found in the revision
 * table are looked up in the archive table.
 *
 * A batch status that is not OK aborts the script; see handleStatus().
 *
 * @param int[] $ids
 *
 * @return RevisionRecord[] Keyed the same way as the batch results
 */
private function loadRevisionsById( array $ids ) {
	// An empty ID list cannot be turned into a valid IN() condition.
	if ( !$ids ) {
		return [];
	}

	$db = $this->getReplicaDB();
	$queryBuilder = $this->revisionStore->newSelectQueryBuilder( $db );

	$rows = $queryBuilder
		->joinComment()
		->where( [ 'rev_id' => $ids ] )
		->caller( __METHOD__ )->fetchResultSet();

	$result = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );

	$this->handleStatus( $result );

	$revisions = array_filter( $result->value );
	'@phan-var RevisionRecord[] $revisions';

	// If not all revisions were found, check the archive table.
	// Decide this based on the IDs that are actually missing rather than by
	// comparing counts: with duplicate IDs in the input, the counts differ even
	// though nothing is missing, and the archive query would get an empty list.
	$missingIds = array_values( array_diff( $ids, array_keys( $revisions ) ) );
	if ( $missingIds ) {
		$rows = $this->revisionStore->newArchiveSelectQueryBuilder( $db )
			->joinComment()
			->where( [ 'ar_rev_id' => $missingIds ] )
			->caller( __METHOD__ )->fetchResultSet();

		$archiveResult = $this->revisionStore->newRevisionsFromBatch(
			$rows,
			[ 'slots' => true, 'archive' => true ]
		);

		$this->handleStatus( $archiveResult );

		// don't use array_merge, since it will re-index
		$revisions += array_filter( $archiveResult->value );
	}

	return $revisions;
}
371 | |
/**
 * Checks every slot of the given revision and counts the bad blobs.
 *
 * When --mark is set and the revision has no bad blob, a note about the
 * skipped revision is printed.
 *
 * @param RevisionRecord $rev
 *
 * @return int The number of bad blobs found on the revision
 */
private function checkRevision( RevisionRecord $rev ) {
	$badSlots = 0;
	$slots = $rev->getSlots()->getSlots();

	foreach ( $slots as $slotRecord ) {
		$badSlots += $this->checkSlot( $rev, $slotRecord );
	}

	// Only a clean revision in marking mode warrants the extra note.
	if ( $badSlots !== 0 || !$this->hasOption( 'mark' ) ) {
		return $badSlots;
	}

	$this->output( "\t# No bad blob found on revision {$rev->getId()}, skipped!\n" );

	return $badSlots;
}
389 | |
/**
 * Tries to load the blob of the given slot. If loading throws, the bad blob
 * is reported and, when --mark is set, marked as bad.
 *
 * @param RevisionRecord $rev
 * @param SlotRecord $slot
 *
 * @return int 1 if the blob is bad, 0 otherwise
 */
private function checkSlot( RevisionRecord $rev, SlotRecord $slot ) {
	$blobAddress = $slot->getAddress();
	$failure = null;

	try {
		$this->blobStore->getBlob( $blobAddress );
	} catch ( Exception $ex ) {
		$failure = $ex;
	}

	if ( $failure === null ) {
		// the blob loaded fine, nothing to do
		return 0;
	}

	$error = $failure->getMessage();
	$type = get_class( $failure );

	// NOTE: the revision ID is repeated at the end of the line, in a tab separated
	// column of its own, so it can easily be extracted with the "cut" shell command.
	$report = "\t! Found bad blob on revision {$rev->getId()} "
		. "from {$rev->getTimestamp()} ({$slot->getRole()} slot): "
		. "content_id={$slot->getContentId()}, address=<{$slot->getAddress()}>, "
		. "error='$error', type='$type'. ID:\t{$rev->getId()}\n";
	$this->output( $report );

	if ( $this->hasOption( 'mark' ) ) {
		$markedAddress = $this->markBlob( $slot, $error );
		$this->output( "\tChanged address to <$markedAddress>\n" );
	}

	return 1;
}
422 | |
/**
 * Marks the blob of the given slot as bad, by replacing the content address
 * in the content table with an address of the form
 * "bad:<url-encoded old address>?<parameters>".
 *
 * The parameters carry the reason given via --mark and the error message,
 * if present. The new address is cut off after 255 bytes.
 *
 * @param SlotRecord $slot
 * @param string|null $error The error encountered when loading the blob, if any
 *
 * @return string The new address written to the database
 */
private function markBlob( SlotRecord $slot, ?string $error = null ) {
	$args = [];

	if ( $this->hasOption( 'mark' ) ) {
		$args['reason'] = $this->getOption( 'mark' );
	}

	if ( $error ) {
		$args['error'] = $error;
	}

	// A slot without an address is recorded with the placeholder "empty".
	$address = $slot->getAddress() ?: 'empty';
	$badAddress = 'bad:' . urlencode( $address );

	if ( $args ) {
		$badAddress .= '?' . wfArrayToCgi( $args );
	}

	// NOTE(review): presumably 255 is the size limit of content_address; confirm against the schema.
	$badAddress = substr( $badAddress, 0, 255 );

	$dbw = $this->getPrimaryDB();
	$dbw->newUpdateQueryBuilder()
		->update( 'content' )
		->set( [ 'content_address' => $badAddress ] )
		->where( [ 'content_id' => $slot->getContentId() ] )
		->caller( __METHOD__ )->execute();

	return $badAddress;
}
458 | |
/**
 * Reports problems recorded in the given status: a status that is not OK is
 * passed to fatalError(), any status that is not good is reported as an error.
 *
 * @param StatusValue $status
 */
private function handleStatus( StatusValue $status ) {
	if ( $status->isOK() && $status->isGood() ) {
		return;
	}

	// The same English message text is used for both kinds of report.
	$text = Status::wrap( $status )->getMessage( false, false, 'en' )->text();

	if ( !$status->isOK() ) {
		$this->fatalError( $text );
	}

	if ( !$status->isGood() ) {
		$this->error( "\t! " . $text );
	}
}
471 | |
472 | } |
473 | |
474 | $maintClass = FindBadBlobs::class; |
475 | require_once RUN_MAINTENANCE_IF_MAIN; |