MediaWiki master
checkStorage.php
Go to the documentation of this file.
1<?php
28
29// @codeCoverageIgnoreStart
30require_once __DIR__ . '/../Maintenance.php';
31// @codeCoverageIgnoreEnd
32
33// ----------------------------------------------------------------------------------
34
42 private const CONCAT_HEADER = 'O:27:"concatenatedgziphistoryblob"';
43
44 public array $oldIdMap;
45 public array $errors;
46
48 public $dbStore = null;
49
/**
 * Register the command-line interface of this script: one optional
 * switch and one optional positional argument.
 */
public function __construct() {
	parent::__construct();

	// --fix: attempt repairs instead of only reporting
	$this->addOption( 'fix', 'Fix errors if possible' );

	// Positional argument; the trailing false marks it as not required
	$this->addArg( 'xml', 'Path to an XML dump', false );
}
56
/**
 * Script entry point: read the command line and hand over to check().
 *
 * The "xml" argument defaults to false when it was not supplied.
 */
public function execute() {
	$this->check(
		$this->hasOption( 'fix' ),
		$this->getArg( 'xml', false )
	);
}
62
65 'restore text' => 'Damaged text, need to be restored from a backup',
66 'restore revision' => 'Damaged revision row, need to be restored from a backup',
67 'unfixable' => 'Unexpected errors with no automated fixing method',
68 'fixed' => 'Errors already fixed',
69 'fixable' => 'Errors which would already be fixed if --fix was specified',
70 ];
71
/**
 * Walk the revision ID space in chunks and verify the text storage behind each slot.
 *
 * For every chunk this maps text row IDs to the revisions using them, validates the
 * old_flags of those text rows, checks external store URLs and the blobs they point
 * to, and checks locally stored objects and the concat blobs referenced by their
 * stubs. Problems are printed as they are found and collected in $this->errors; a
 * summary plus flag and object statistics are printed at the end.
 *
 * @param bool $fix Repair what can be repaired automatically (old_flags of "0", and
 *  damaged text when an XML dump is given)
 * @param string|false $xml Path to an XML dump used to restore damaged text
 */
public function check( $fix = false, $xml = '' ) {
	$dbr = $this->getReplicaDB();
	if ( $fix ) {
		print "Checking, will fix errors if possible...\n";
	} else {
		print "Checking...\n";
	}
	$maxRevId = $dbr->newSelectQueryBuilder()
		->select( 'MAX(rev_id)' )
		->from( 'revision' )
		->caller( __METHOD__ )->fetchField();
	$chunkSize = 1000;
	$flagStats = [];
	$objectStats = [];
	$knownFlags = [ 'external', 'gzip', 'object', 'utf-8' ];
	$this->errors = [
		'restore text' => [],
		'restore revision' => [],
		'unfixable' => [],
		'fixed' => [],
		'fixable' => [],
	];

	// Chunks are the inclusive ranges [$chunkStart, $chunkEnd], so the bound has to be
	// "<=": with "<" the newest revision is skipped whenever it is the first ID of a
	// chunk (e.g. a maximum rev_id of 1 or 1001).
	for ( $chunkStart = 1; $chunkStart <= $maxRevId; $chunkStart += $chunkSize ) {
		$chunkEnd = $chunkStart + $chunkSize - 1;

		// The map is rebuilt per chunk; addError() reads it to translate text IDs
		// back into revision IDs.
		$this->oldIdMap = [];
		$dbr->ping();

		// Fetch the content addresses of all slots in this chunk
		$res = $dbr->newSelectQueryBuilder()
			->select( [ 'slot_revision_id', 'content_address' ] )
			->from( 'slots' )
			->join( 'content', null, 'content_id = slot_content_id' )
			->where( [
				$dbr->expr( 'slot_revision_id', '>=', $chunkStart ),
				$dbr->expr( 'slot_revision_id', '<=', $chunkEnd ),
			] )
			->caller( __METHOD__ )->fetchResultSet();
		$blobStore = $this->getServiceContainer()->getBlobStore();
		'@phan-var \MediaWiki\Storage\SqlBlobStore $blobStore';
		foreach ( $res as $row ) {
			$textId = $blobStore->getTextIdFromAddress( $row->content_address );
			if ( $textId ) {
				if ( !isset( $this->oldIdMap[$textId] ) ) {
					$this->oldIdMap[ $textId ] = [ $row->slot_revision_id ];
				} elseif ( !in_array( $row->slot_revision_id, $this->oldIdMap[$textId] ) ) {
					$this->oldIdMap[ $textId ][] = $row->slot_revision_id;
				}
			}
		}

		if ( !count( $this->oldIdMap ) ) {
			continue;
		}

		// Fetch old_flags. Every text ID starts out as "missing" and is struck off
		// the list once its row has been seen.
		$missingTextRows = $this->oldIdMap;
		$externalRevs = [];
		$objectRevs = [];
		$res = $dbr->newSelectQueryBuilder()
			->select( [ 'old_id', 'old_flags' ] )
			->from( 'text' )
			->where( [ 'old_id' => array_keys( $this->oldIdMap ) ] )
			->caller( __METHOD__ )->fetchResultSet();
		foreach ( $res as $row ) {
			$flags = $row->old_flags;
			$id = $row->old_id;

			// Create flagStats row if it doesn't exist
			$flagStats += [ $flags => 0 ];
			// Increment counter
			$flagStats[$flags]++;

			// Not missing
			unset( $missingTextRows[$row->old_id] );

			// Check for external or object
			if ( $flags == '' ) {
				$flagArray = [];
			} else {
				$flagArray = explode( ',', $flags );
			}
			if ( in_array( 'external', $flagArray ) ) {
				$externalRevs[] = $id;
			} elseif ( in_array( 'object', $flagArray ) ) {
				$objectRevs[] = $id;
			}

			// Check for unrecognised flags
			if ( $flags == '0' ) {
				// This is a known bug from 2004
				// It's safe to just erase the old_flags field
				if ( $fix ) {
					$this->addError( 'fixed', "Warning: old_flags set to 0", $id );
					$dbw = $this->getPrimaryDB();
					$dbw->ping();
					$dbw->newUpdateQueryBuilder()
						->update( 'text' )
						->set( [ 'old_flags' => '' ] )
						->where( [ 'old_id' => $id ] )
						->caller( __METHOD__ )
						->execute();
					echo "Fixed\n";
				} else {
					$this->addError( 'fixable', "Warning: old_flags set to 0", $id );
				}
			} elseif ( count( array_diff( $flagArray, $knownFlags ) ) ) {
				$this->addError( 'unfixable', "Error: invalid flags field \"$flags\"", $id );
			}
		}

		// Output errors for any missing text rows
		foreach ( $missingTextRows as $oldId => $revIds ) {
			$this->addError( 'restore revision', "Error: missing text row", $oldId );
		}

		// Verify external revisions. Both maps have the shape
		// cluster => blob ID => list of text row IDs.
		$externalConcatBlobs = [];
		$externalNormalBlobs = [];
		if ( count( $externalRevs ) ) {
			$res = $dbr->newSelectQueryBuilder()
				->select( [ 'old_id', 'old_flags', 'old_text' ] )
				->from( 'text' )
				->where( [ 'old_id' => $externalRevs ] )
				->caller( __METHOD__ )->fetchResultSet();
			foreach ( $res as $row ) {
				$urlParts = explode( '://', $row->old_text, 2 );
				if ( count( $urlParts ) !== 2 || $urlParts[1] == '' ) {
					$this->addError( 'restore text', "Error: invalid URL \"{$row->old_text}\"", $row->old_id );
					continue;
				}
				[ $proto, ] = $urlParts;
				if ( $proto != 'DB' ) {
					$this->addError(
						'restore text',
						"Error: invalid external protocol \"$proto\"",
						$row->old_id );
					continue;
				}
				// Split on "/": index 2 is used as the cluster and index 3 as the
				// blob ID; a further component marks a two-part (concat blob) URL.
				$path = explode( '/', $row->old_text );
				$cluster = $path[2];
				$id = $path[3];
				if ( isset( $path[4] ) ) {
					$externalConcatBlobs[$cluster][$id][] = $row->old_id;
				} else {
					$externalNormalBlobs[$cluster][$id][] = $row->old_id;
				}
			}
		}

		// Check external concat blobs for the right header
		$this->checkExternalConcatBlobs( $externalConcatBlobs );

		// Check external normal blobs for existence
		if ( count( $externalNormalBlobs ) ) {
			if ( $this->dbStore === null ) {
				$esFactory = $this->getServiceContainer()->getExternalStoreFactory();
				$this->dbStore = $esFactory->getStore( 'DB' );
			}
			// Iterate the *normal* blobs here: the concat blobs were already handled
			// by checkExternalConcatBlobs() above, and looping over them again left
			// one-part URLs entirely unverified.
			foreach ( $externalNormalBlobs as $cluster => $xBlobIds ) {
				$blobIds = array_keys( $xBlobIds );
				$extDb = $this->dbStore->getReplica( $cluster );
				$blobsTable = $this->dbStore->getTable( $cluster );
				$res = $extDb->newSelectQueryBuilder()
					->select( [ 'blob_id' ] )
					->from( $blobsTable )
					->where( [ 'blob_id' => $blobIds ] )
					->caller( __METHOD__ )->fetchResultSet();
				foreach ( $res as $row ) {
					unset( $xBlobIds[$row->blob_id] );
				}
				// Print errors for missing blobs rows
				foreach ( $xBlobIds as $blobId => $oldIds ) {
					$this->addError(
						'restore text',
						"Error: missing target $blobId for one-part ES URL",
						$oldIds );
				}
			}
		}

		// Check local objects
		$dbr->ping();
		$concatBlobs = [];
		$curIds = [];
		if ( count( $objectRevs ) ) {
			$headerLength = 300;
			$res = $dbr->newSelectQueryBuilder()
				->select( [ 'old_id', 'old_flags', "LEFT(old_text, $headerLength) AS header" ] )
				->from( 'text' )
				->where( [ 'old_id' => $objectRevs ] )
				->caller( __METHOD__ )->fetchResultSet();
			foreach ( $res as $row ) {
				$oldId = $row->old_id;
				$matches = [];
				if ( !preg_match( '/^O:(\d+):"(\w+)"/', $row->header, $matches ) ) {
					$this->addError( 'restore text', "Error: invalid object header", $oldId );
					continue;
				}

				$className = strtolower( $matches[2] );
				if ( strlen( $className ) != $matches[1] ) {
					$this->addError(
						'restore text',
						"Error: invalid object header, wrong class name length",
						$oldId
					);
					continue;
				}

				$objectStats += [ $className => 0 ];
				$objectStats[$className]++;

				switch ( $className ) {
					case 'concatenatedgziphistoryblob':
						// Good
						break;
					case 'historyblobstub':
					case 'historyblobcurstub':
						// A header that fills the whole LEFT() window may have been
						// cut off, so it cannot be unserialized reliably.
						if ( strlen( $row->header ) == $headerLength ) {
							$this->addError( 'unfixable', "Error: overlong stub header", $oldId );
							break;
						}
						// NOTE(review): unserialize() runs on data read from the text
						// table; this is only safe as long as that table is trusted.
						$stubObj = unserialize( $row->header );
						if ( !is_object( $stubObj ) ) {
							$this->addError( 'restore text', "Error: unable to unserialize stub object", $oldId );
							break;
						}
						if ( $className == 'historyblobstub' ) {
							$concatBlobs[$stubObj->getLocation()][] = $oldId;
						} else {
							$curIds[$stubObj->mCurId][] = $oldId;
						}
						break;
					default:
						$this->addError( 'unfixable', "Error: unrecognised object class \"$className\"", $oldId );
				}
			}
		}

		// Check local concat blob validity
		$externalConcatBlobs = [];
		if ( count( $concatBlobs ) ) {
			$headerLength = 300;
			$res = $dbr->newSelectQueryBuilder()
				->select( [ 'old_id', 'old_flags', "LEFT(old_text, $headerLength) AS header" ] )
				->from( 'text' )
				->where( [ 'old_id' => array_keys( $concatBlobs ) ] )
				->caller( __METHOD__ )->fetchResultSet();
			foreach ( $res as $row ) {
				$flags = explode( ',', $row->old_flags );
				if ( in_array( 'external', $flags ) ) {
					// Concat blob is in external storage?
					if ( in_array( 'object', $flags ) ) {
						$urlParts = explode( '/', $row->header );
						if ( $urlParts[0] != 'DB:' ) {
							$this->addError(
								'unfixable',
								"Error: unrecognised external storage type \"{$urlParts[0]}\"",
								$row->old_id
							);
						} else {
							$cluster = $urlParts[2];
							$id = $urlParts[3];
							if ( !isset( $externalConcatBlobs[$cluster][$id] ) ) {
								$externalConcatBlobs[$cluster][$id] = [];
							}
							$externalConcatBlobs[$cluster][$id] = array_merge(
								$externalConcatBlobs[$cluster][$id], $concatBlobs[$row->old_id]
							);
						}
					} else {
						$this->addError(
							'unfixable',
							"Error: invalid flags \"{$row->old_flags}\" on concat bulk row {$row->old_id}",
							$concatBlobs[$row->old_id] );
					}
				} elseif ( strcasecmp(
					substr( $row->header, 0, strlen( self::CONCAT_HEADER ) ),
					self::CONCAT_HEADER
				) ) {
					$this->addError(
						'restore text',
						"Error: Incorrect object header for concat bulk row {$row->old_id}",
						$concatBlobs[$row->old_id]
					);
				}

				unset( $concatBlobs[$row->old_id] );
			}
		}

		// Check targets of unresolved stubs
		$this->checkExternalConcatBlobs( $externalConcatBlobs );
		// next chunk
	}

	print "\n\nErrors:\n";
	foreach ( $this->errors as $name => $errors ) {
		if ( count( $errors ) ) {
			$description = $this->errorDescriptions[$name];
			echo "$description: " . implode( ',', array_keys( $errors ) ) . "\n";
		}
	}

	if ( count( $this->errors['restore text'] ) && $fix ) {
		if ( (string)$xml !== '' ) {
			$this->restoreText( array_keys( $this->errors['restore text'] ), $xml );
		} else {
			echo "Can't fix text, no XML backup specified\n";
		}
	}

	// The loops below do not run for empty statistics, so $total is never zero
	// when it is used as a divisor.
	print "\nFlag statistics:\n";
	$total = array_sum( $flagStats );
	foreach ( $flagStats as $flag => $count ) {
		printf( "%-30s %10d %5.2f%%\n", $flag, $count, $count / $total * 100 );
	}
	print "\nLocal object statistics:\n";
	$total = array_sum( $objectStats );
	foreach ( $objectStats as $className => $count ) {
		printf( "%-30s %10d %5.2f%%\n", $className, $count, $count / $total * 100 );
	}
}
402
/**
 * Print an error message and record the affected revisions under the given category.
 *
 * Text row IDs are translated to revision IDs through $this->oldIdMap, so every
 * ID passed in is expected to be a key of that map.
 *
 * @param string $type Key of $this->errors to file the revisions under
 * @param string $msg Message prefix to print
 * @param int|string|array $ids A single text row ID, or a list of them
 */
private function addError( string $type, string $msg, $ids ) {
	// A list holding exactly one ID is reported like a plain ID
	if ( is_array( $ids ) && count( $ids ) == 1 ) {
		$ids = reset( $ids );
	}

	if ( !is_array( $ids ) ) {
		$id = $ids;
		$revIds = $this->oldIdMap[$id];
		if ( count( $revIds ) != 1 ) {
			print "$msg in old_id $id, revisions " . implode( ', ', $revIds ) . "\n";
		} else {
			print "$msg in old_id $id, rev_id {$revIds[0]}\n";
		}
	} else {
		// Gather the distinct revisions behind all of the text rows
		$revIds = [];
		foreach ( $ids as $textId ) {
			$merged = array_merge( $revIds, $this->oldIdMap[$textId] );
			$revIds = array_unique( $merged );
		}
		print "$msg in text rows " . implode( ', ', $ids ) .
			", revisions " . implode( ', ', $revIds ) . "\n";
	}

	// Keyed by revision ID; "+=" leaves revisions that are already recorded untouched
	$this->errors[$type] += array_fill_keys( $revIds, true );
}
430
/**
 * Verify that the targets of two-part external store URLs exist and begin
 * with the serialized concat blob header.
 *
 * @param array $externalConcatBlobs Map of cluster => blob ID => text row IDs
 */
private function checkExternalConcatBlobs( array $externalConcatBlobs ) {
	if ( count( $externalConcatBlobs ) === 0 ) {
		return;
	}

	// Create the DB external store on first use
	$this->dbStore ??= $this->getServiceContainer()
		->getExternalStoreFactory()
		->getStore( 'DB' );

	// Only as many leading bytes as the expected header are fetched per blob
	$headerLength = strlen( self::CONCAT_HEADER );

	foreach ( $externalConcatBlobs as $cluster => $oldIds ) {
		$extDb = $this->dbStore->getReplica( $cluster );
		$blobsTable = $this->dbStore->getTable( $cluster );
		$res = $extDb->newSelectQueryBuilder()
			->select( [ 'blob_id', "LEFT(blob_text, $headerLength) AS header" ] )
			->from( $blobsTable )
			->where( [ 'blob_id' => array_keys( $oldIds ) ] )
			->caller( __METHOD__ )->fetchResultSet();

		foreach ( $res as $row ) {
			$headerMatches = strcasecmp( $row->header, self::CONCAT_HEADER ) === 0;
			if ( !$headerMatches ) {
				$this->addError(
					'restore text',
					"Error: invalid header on target $cluster/{$row->blob_id} of two-part ES URL",
					$oldIds[$row->blob_id]
				);
			}
			// The blob exists, so it is no longer a candidate for "missing"
			unset( $oldIds[$row->blob_id] );
		}

		// Whatever is left was not returned by the query: report it as missing
		foreach ( $oldIds as $blobId => $oldIds2 ) {
			$this->addError(
				'restore text',
				"Error: missing target $cluster/$blobId for two-part ES URL",
				$oldIds2
			);
		}
	}
}
472
/**
 * Restore the text of damaged revisions from an XML dump.
 *
 * The revision IDs are written to a list file, mwdumper filters the dump down
 * to those revisions, and the filtered dump is then fed through the importer
 * with importRevision() as the per-revision callback.
 *
 * @param array $revIds IDs of the revisions to restore
 * @param string $xml Path to the XML dump
 */
private function restoreText( array $revIds, string $xml ) {
	global $wgDBname;
	$tmpDir = wfTempDir();

	if ( count( $revIds ) === 0 ) {
		return;
	}

	print "Restoring text from XML backup...\n";

	// Both scratch files live in the temp directory and are named after the wiki DB
	$revFileName = "$tmpDir/broken-revlist-$wgDBname";
	$filteredXmlFileName = "$tmpDir/filtered-$wgDBname.xml";

	// Write revision list, one ID per line
	$bytesWritten = file_put_contents( $revFileName, implode( "\n", $revIds ) );
	if ( !$bytesWritten ) {
		echo "Error writing revision list, can't restore text\n";

		return;
	}

	// Run mwdumper
	echo "Filtering XML dump...\n";
	$exitStatus = 0;
	$command = 'mwdumper ' . Shell::escape(
		"--output=file:$filteredXmlFileName",
		"--filter=revlist:$revFileName",
		$xml
	);
	// phpcs:ignore MediaWiki.Usage.ForbiddenFunctions.passthru
	passthru( $command, $exitStatus );

	if ( $exitStatus ) {
		echo "mwdumper died with exit status $exitStatus\n";

		return;
	}

	$handle = fopen( $filteredXmlFileName, 'r' );
	if ( !$handle ) {
		echo "Unable to open filtered XML file\n";

		return;
	}

	// Ping both connections before the import starts
	$replica = $this->getReplicaDB();
	$primary = $this->getPrimaryDB();
	$replica->ping();
	$primary->ping();

	$dumpSource = new ImportStreamSource( $handle );
	$scriptUser = User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [ 'steal' => true ] );
	$importerFactory = $this->getServiceContainer()->getWikiImporterFactory();
	$importer = $importerFactory->getWikiImporter( $dumpSource, new UltimateAuthority( $scriptUser ) );

	// Each revision found in the filtered dump is handed to importRevision()
	$importer->setRevisionCallback( [ $this, 'importRevision' ] );
	$importer->setNoticeCallback( static function ( $msg, $params ) {
		echo wfMessage( $msg, $params )->text() . "\n";
	} );
	$importer->doImport();
}
534
/**
 * Importer revision callback: overwrite the text row of a damaged revision with
 * the text found in the XML dump.
 *
 * Revisions without content, with blank text, without an ID, or without a
 * matching slot/text row are reported and left alone.
 *
 * @param object $revision Revision read from the dump; getID() and getContent()
 *  are called on it
 */
public function importRevision( $revision ) {
	$id = $revision->getID();
	$content = $revision->getContent();
	// Normalise a falsy ID to '' so that it interpolates cleanly into the messages
	$id = $id ?: '';

	if ( $content === null ) {
		echo "Revision $id is broken, we have no content available\n";

		return;
	}

	$text = $content->serialize();
	if ( $text === '' ) {
		// This is what happens if the revision was broken at the time the
		// dump was made. Unfortunately, it also happens if the revision was
		// legitimately blank, so there's no way to tell the difference. To
		// be safe, we'll skip it and leave it broken

		echo "Revision $id is blank in the dump, may have been broken before export\n";

		return;
	}

	if ( !$id ) {
		// No ID, can't import
		echo "No id tag in revision, can't import\n";

		return;
	}

	// Find text row again
	$dbr = $this->getReplicaDB();
	$row = $dbr->newSelectQueryBuilder()
		->select( [ 'content_address' ] )
		->from( 'slots' )
		->join( 'content', null, 'content_id = slot_content_id' )
		->where( [ 'slot_revision_id' => $id ] )
		->caller( __METHOD__ )->fetchRow();

	$blobStore = $this->getServiceContainer()
		->getBlobStoreFactory()
		->newSqlBlobStore();

	// fetchRow() yields false when the revision has no slot row; treat that the
	// same way as an address that does not resolve to a text row instead of
	// reading a property of false.
	$oldId = $row ? $blobStore->getTextIdFromAddress( $row->content_address ) : null;

	if ( !$oldId ) {
		echo "Missing revision row for rev_id $id\n";
		return;
	}

	// Compress the text
	// NOTE(review): $text is written to old_text below together with the returned
	// flags, so compressData() presumably rewrites $text in place - confirm.
	$flags = $blobStore->compressData( $text );

	// Update the text row
	$dbw = $this->getPrimaryDB();
	$dbw->newUpdateQueryBuilder()
		->update( 'text' )
		->set( [ 'old_flags' => $flags, 'old_text' => $text ] )
		->where( [ 'old_id' => $oldId ] )
		->caller( __METHOD__ )
		->execute();

	// Remove it from the unfixed list and add it to the fixed list
	unset( $this->errors['restore text'][$id] );
	$this->errors['fixed'][$id] = true;
}
603
604}
605
606// @codeCoverageIgnoreStart
607$maintClass = CheckStorage::class;
608require_once RUN_MAINTENANCE_IF_MAIN;
609// @codeCoverageIgnoreEnd
wfTempDir()
Tries to get the system directory for temporary files.
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
$maintClass
Maintenance script to do various checks on external storage.
importRevision( $revision)
check( $fix=false, $xml='')
__construct()
Default constructor.
string[] $errorDescriptions
execute()
Do the actual work.
ExternalStoreDB $dbStore
External storage in a SQL database.
Imports an XML dump from a file (either from file upload, files on disk, or HTTP)
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
getArg( $argId=0, $default=null)
Get an argument.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
hasOption( $name)
Checks to see if a particular option was set.
getServiceContainer()
Returns the main service container.
Represents an authority that has all permissions.
Executes shell commands.
Definition Shell.php:46
User class for the MediaWiki software.
Definition User.php:121
$wgDBname
Config variable stub for the DBname setting, for use by phpdoc and IDEs.
$source