MediaWiki  master
checkStorage.php
Go to the documentation of this file.
1 <?php
25 
26 require_once __DIR__ . '/../Maintenance.php';
27 
28 // ----------------------------------------------------------------------------------
29 
36 class CheckStorage extends Maintenance {
/**
 * Serialized-object header that a concatenated gzip history blob is expected
 * to start with; blob headers are compared against it case-insensitively.
 */
private const CONCAT_HEADER = 'O:27:"concatenatedgziphistoryblob"';

/**
 * @var array Map of text row id => list of revision ids using that row,
 *  rebuilt for every chunk processed by check()
 */
public $oldIdMap;

/**
 * @var array Map of error category => [ revision id => true ], filled by addError()
 */
public $errors;

/**
 * @var ExternalStoreDB|null Lazily created 'DB' external store, null until first needed
 */
public $dbStore = null;
43 
/**
 * Set up the script's command line: a --fix switch and an optional
 * positional argument naming an XML dump.
 */
public function __construct() {
	parent::__construct();

	// The dump path is optional; it is only consulted when text has to be restored.
	$xmlIsRequired = false;

	$this->addOption( 'fix', 'Fix errors if possible' );
	$this->addArg( 'xml', 'Path to an XML dump', $xmlIsRequired );
}
50 
/**
 * Script entry point: read the command-line settings and hand them to check().
 */
public function execute() {
	$this->check(
		$this->hasOption( 'fix' ),
		$this->getArg( 'xml', false )
	);
}
56 
/**
 * Human-readable description for each error category.
 *
 * The keys are the same category names that check() initialises in
 * $this->errors; check() prints the matching description in its final
 * "Errors:" summary for every category that collected at least one revision.
 *
 * @var string[]
 */
57  public $errorDescriptions = [
58  'restore text' => 'Damaged text, need to be restored from a backup',
59  'restore revision' => 'Damaged revision row, need to be restored from a backup',
60  'unfixable' => 'Unexpected errors with no automated fixing method',
61  'fixed' => 'Errors already fixed',
62  'fixable' => 'Errors which would already be fixed if --fix was specified',
63  ];
64 
/**
 * Walk the whole revision id range in fixed-size chunks, verify that the text
 * storage behind each revision is intact, and print a report.
 *
 * For every chunk this:
 *  - maps text row ids to the revisions whose slots point at them,
 *  - validates old_flags (clearing the legacy "0" value when $fix is set),
 *  - reports text rows that are missing altogether,
 *  - validates DB:// external storage URLs and checks that their targets exist,
 *  - validates the headers of serialized local objects (concat blobs and stubs).
 *
 * After the last chunk it prints the collected errors per category, tries to
 * restore damaged text from the XML dump (only when $fix is set and a dump
 * path was given), and prints flag and object-class statistics.
 *
 * @param bool $fix Whether to repair what can be repaired automatically
 * @param string|false $xml Path to an XML dump used to restore damaged text
 */
public function check( $fix = false, $xml = '' ) {
	$dbr = wfGetDB( DB_REPLICA );
	if ( $fix ) {
		print "Checking, will fix errors if possible...\n";
	} else {
		print "Checking...\n";
	}
	$maxRevId = $dbr->newSelectQueryBuilder()
		->select( 'MAX(rev_id)' )
		->from( 'revision' )
		->caller( __METHOD__ )->fetchField();
	$chunkSize = 1000;
	$flagStats = [];
	$objectStats = [];
	$knownFlags = [ 'external', 'gzip', 'object', 'utf-8' ];
	$this->errors = [
		'restore text' => [],
		'restore revision' => [],
		'unfixable' => [],
		'fixed' => [],
		'fixable' => [],
	];

	// The bound is inclusive: with "<" the final chunk was skipped whenever the
	// highest revision id happened to be the first id of a chunk (1, 1001, ...).
	for ( $chunkStart = 1; $chunkStart <= $maxRevId; $chunkStart += $chunkSize ) {
		$chunkEnd = $chunkStart + $chunkSize - 1;

		$this->oldIdMap = [];
		$dbr->ping();

		// Fetch revision rows
		$res = $dbr->newSelectQueryBuilder()
			->select( [ 'slot_revision_id', 'content_address' ] )
			->from( 'slots' )
			->join( 'content', null, 'content_id = slot_content_id' )
			->where( [ "slot_revision_id BETWEEN $chunkStart AND $chunkEnd" ] )
			->caller( __METHOD__ )->fetchResultSet();
		$blobStore = $this->getServiceContainer()->getBlobStore();
		'@phan-var \MediaWiki\Storage\SqlBlobStore $blobStore';
		foreach ( $res as $row ) {
			$textId = $blobStore->getTextIdFromAddress( $row->content_address );
			if ( $textId ) {
				if ( !isset( $this->oldIdMap[$textId] ) ) {
					$this->oldIdMap[ $textId ] = [ $row->slot_revision_id ];
				} elseif ( !in_array( $row->slot_revision_id, $this->oldIdMap[$textId] ) ) {
					$this->oldIdMap[ $textId ][] = $row->slot_revision_id;
				}
			}
		}

		if ( !count( $this->oldIdMap ) ) {
			continue;
		}

		// Fetch old_flags
		// Every text id starts out as "missing" and is struck off once its row is seen.
		$missingTextRows = $this->oldIdMap;
		$externalRevs = [];
		$objectRevs = [];
		$res = $dbr->newSelectQueryBuilder()
			->select( [ 'old_id', 'old_flags' ] )
			->from( 'text' )
			->where( [ 'old_id' => array_keys( $this->oldIdMap ) ] )
			->caller( __METHOD__ )->fetchResultSet();
		foreach ( $res as $row ) {
			$flags = $row->old_flags;
			$id = $row->old_id;

			// Create flagStats row if it doesn't exist
			$flagStats += [ $flags => 0 ];
			// Increment counter
			$flagStats[$flags]++;

			// Not missing
			unset( $missingTextRows[$row->old_id] );

			// Check for external or object
			if ( $flags == '' ) {
				$flagArray = [];
			} else {
				$flagArray = explode( ',', $flags );
			}
			if ( in_array( 'external', $flagArray ) ) {
				$externalRevs[] = $id;
			} elseif ( in_array( 'object', $flagArray ) ) {
				$objectRevs[] = $id;
			}

			// Check for unrecognised flags
			if ( $flags == '0' ) {
				// This is a known bug from 2004
				// It's safe to just erase the old_flags field
				if ( $fix ) {
					$this->addError( 'fixed', "Warning: old_flags set to 0", $id );
					$dbw = wfGetDB( DB_PRIMARY );
					$dbw->ping();
					$dbw->update( 'text', [ 'old_flags' => '' ],
						[ 'old_id' => $id ], __METHOD__ );
					echo "Fixed\n";
				} else {
					$this->addError( 'fixable', "Warning: old_flags set to 0", $id );
				}
			} elseif ( count( array_diff( $flagArray, $knownFlags ) ) ) {
				$this->addError( 'unfixable', "Error: invalid flags field \"$flags\"", $id );
			}
		}

		// Output errors for any missing text rows
		foreach ( array_keys( $missingTextRows ) as $oldId ) {
			$this->addError( 'restore revision', "Error: missing text row", $oldId );
		}

		// Verify external revisions
		// Both maps are cluster => blob id => list of text row ids.
		$externalConcatBlobs = [];
		$externalNormalBlobs = [];
		if ( count( $externalRevs ) ) {
			$res = $dbr->newSelectQueryBuilder()
				->select( [ 'old_id', 'old_flags', 'old_text' ] )
				->from( 'text' )
				->where( [ 'old_id' => $externalRevs ] )
				->caller( __METHOD__ )->fetchResultSet();
			foreach ( $res as $row ) {
				$urlParts = explode( '://', $row->old_text, 2 );
				if ( count( $urlParts ) !== 2 || $urlParts[1] == '' ) {
					$this->addError( 'restore text', "Error: invalid URL \"{$row->old_text}\"", $row->old_id );
					continue;
				}
				[ $proto, ] = $urlParts;
				if ( $proto != 'DB' ) {
					$this->addError(
						'restore text',
						"Error: invalid external protocol \"$proto\"",
						$row->old_id );
					continue;
				}
				// "DB://cluster/id[/item]" splits into [ 'DB:', '', cluster, id, item? ]
				$path = explode( '/', $row->old_text );
				$cluster = $path[2];
				$id = $path[3];
				if ( isset( $path[4] ) ) {
					$externalConcatBlobs[$cluster][$id][] = $row->old_id;
				} else {
					$externalNormalBlobs[$cluster][$id][] = $row->old_id;
				}
			}
		}

		// Check external concat blobs for the right header
		$this->checkExternalConcatBlobs( $externalConcatBlobs );

		// Check external normal blobs for existence
		if ( count( $externalNormalBlobs ) ) {
			if ( $this->dbStore === null ) {
				$esFactory = $this->getServiceContainer()->getExternalStoreFactory();
				$this->dbStore = $esFactory->getStore( 'DB' );
			}
			// Iterate the *normal* blobs here: looping over $externalConcatBlobs
			// (as before) meant one-part URLs were never actually verified.
			foreach ( $externalNormalBlobs as $cluster => $xBlobIds ) {
				$blobIds = array_keys( $xBlobIds );
				$extDb = $this->dbStore->getReplica( $cluster );
				$blobsTable = $this->dbStore->getTable( $extDb );
				$res = $extDb->newSelectQueryBuilder()
					->select( [ 'blob_id' ] )
					->from( $blobsTable )
					->where( [ 'blob_id' => $blobIds ] )
					->caller( __METHOD__ )->fetchResultSet();
				foreach ( $res as $row ) {
					unset( $xBlobIds[$row->blob_id] );
				}
				// Print errors for missing blobs rows
				foreach ( $xBlobIds as $blobId => $oldId ) {
					$this->addError(
						'restore text',
						"Error: missing target $blobId for one-part ES URL",
						$oldId );
				}
			}
		}

		// Check local objects
		$dbr->ping();
		$concatBlobs = [];
		$curIds = [];
		if ( count( $objectRevs ) ) {
			// Only the first bytes are fetched; that is enough for the class
			// header and for a complete stub object.
			$headerLength = 300;
			$res = $dbr->newSelectQueryBuilder()
				->select( [ 'old_id', 'old_flags', "LEFT(old_text, $headerLength) AS header" ] )
				->from( 'text' )
				->where( [ 'old_id' => $objectRevs ] )
				->caller( __METHOD__ )->fetchResultSet();
			foreach ( $res as $row ) {
				$oldId = $row->old_id;
				$matches = [];
				if ( !preg_match( '/^O:(\d+):"(\w+)"/', $row->header, $matches ) ) {
					$this->addError( 'restore text', "Error: invalid object header", $oldId );
					continue;
				}

				$className = strtolower( $matches[2] );
				if ( strlen( $className ) != $matches[1] ) {
					$this->addError(
						'restore text',
						"Error: invalid object header, wrong class name length",
						$oldId
					);
					continue;
				}

				$objectStats += [ $className => 0 ];
				$objectStats[$className]++;

				switch ( $className ) {
					case 'concatenatedgziphistoryblob':
						// Good
						break;
					case 'historyblobstub':
					case 'historyblobcurstub':
						// A header that fills the whole fetched prefix was cut off
						// by LEFT() and cannot be unserialized.
						if ( strlen( $row->header ) == $headerLength ) {
							$this->addError( 'unfixable', "Error: overlong stub header", $oldId );
							break;
						}
						// NOTE(review): unserialize() runs on data read from the text
						// table; consider restricting it with 'allowed_classes'.
						$stubObj = unserialize( $row->header );
						if ( !is_object( $stubObj ) ) {
							$this->addError( 'restore text', "Error: unable to unserialize stub object", $oldId );
							break;
						}
						if ( $className == 'historyblobstub' ) {
							$concatBlobs[$stubObj->getLocation()][] = $oldId;
						} else {
							$curIds[$stubObj->mCurId][] = $oldId;
						}
						break;
					default:
						$this->addError( 'unfixable', "Error: unrecognised object class \"$className\"", $oldId );
				}
			}
		}

		// Check local concat blob validity
		$externalConcatBlobs = [];
		if ( count( $concatBlobs ) ) {
			$headerLength = 300;
			$res = $dbr->newSelectQueryBuilder()
				->select( [ 'old_id', 'old_flags', "LEFT(old_text, $headerLength) AS header" ] )
				->from( 'text' )
				->where( [ 'old_id' => array_keys( $concatBlobs ) ] )
				->caller( __METHOD__ )->fetchResultSet();
			foreach ( $res as $row ) {
				$flags = explode( ',', $row->old_flags );
				if ( in_array( 'external', $flags ) ) {
					// Concat blob is in external storage?
					if ( in_array( 'object', $flags ) ) {
						$urlParts = explode( '/', $row->header );
						if ( $urlParts[0] != 'DB:' ) {
							// The closing quote was missing from this message.
							$this->addError(
								'unfixable',
								"Error: unrecognised external storage type \"{$urlParts[0]}\"",
								$row->old_id
							);
						} else {
							$cluster = $urlParts[2];
							$id = $urlParts[3];
							if ( !isset( $externalConcatBlobs[$cluster][$id] ) ) {
								$externalConcatBlobs[$cluster][$id] = [];
							}
							$externalConcatBlobs[$cluster][$id] = array_merge(
								$externalConcatBlobs[$cluster][$id], $concatBlobs[$row->old_id]
							);
						}
					} else {
						$this->addError(
							'unfixable',
							"Error: invalid flags \"{$row->old_flags}\" on concat bulk row {$row->old_id}",
							$concatBlobs[$row->old_id] );
					}
				} elseif ( strcasecmp(
					substr( $row->header, 0, strlen( self::CONCAT_HEADER ) ),
					self::CONCAT_HEADER
				) ) {
					$this->addError(
						'restore text',
						"Error: Incorrect object header for concat bulk row {$row->old_id}",
						$concatBlobs[$row->old_id]
					);
				}

				unset( $concatBlobs[$row->old_id] );
			}
		}

		// Check targets of unresolved stubs
		$this->checkExternalConcatBlobs( $externalConcatBlobs );
		// next chunk
	}

	print "\n\nErrors:\n";
	foreach ( $this->errors as $name => $errors ) {
		if ( count( $errors ) ) {
			$description = $this->errorDescriptions[$name];
			echo "$description: " . implode( ',', array_keys( $errors ) ) . "\n";
		}
	}

	if ( count( $this->errors['restore text'] ) && $fix ) {
		if ( (string)$xml !== '' ) {
			$this->restoreText( array_keys( $this->errors['restore text'] ), $xml );
		} else {
			echo "Can't fix text, no XML backup specified\n";
		}
	}

	print "\nFlag statistics:\n";
	$total = array_sum( $flagStats );
	foreach ( $flagStats as $flag => $count ) {
		printf( "%-30s %10d %5.2f%%\n", $flag, $count, $count / $total * 100 );
	}
	print "\nLocal object statistics:\n";
	$total = array_sum( $objectStats );
	foreach ( $objectStats as $className => $count ) {
		printf( "%-30s %10d %5.2f%%\n", $className, $count, $count / $total * 100 );
	}
}
388 
/**
 * Print one problem and record every affected revision under an error category.
 *
 * Text row ids are translated to revision ids through $this->oldIdMap; the
 * revision ids then become keys of $this->errors[$type].
 *
 * @param string $type Error category, a key of $this->errors
 * @param string $msg Message prefix to print
 * @param int|string|array $ids A single text row id, or a list of them
 */
private function addError( $type, $msg, $ids ) {
	// A one-element list is reported exactly like a bare id.
	if ( is_array( $ids ) && count( $ids ) == 1 ) {
		$ids = reset( $ids );
	}

	if ( !is_array( $ids ) ) {
		$id = $ids;
		$revIds = $this->oldIdMap[$id];
		if ( count( $revIds ) != 1 ) {
			print "$msg in old_id $id, revisions " . implode( ', ', $revIds ) . "\n";
		} else {
			print "$msg in old_id $id, rev_id {$revIds[0]}\n";
		}
	} else {
		// Gather the revisions of all listed text rows, without duplicates.
		$revIds = [];
		foreach ( $ids as $id ) {
			$revIds = array_unique( array_merge( $revIds, $this->oldIdMap[$id] ) );
		}
		print "$msg in text rows " . implode( ', ', $ids ) .
			", revisions " . implode( ', ', $revIds ) . "\n";
	}

	$this->errors[$type] += array_fill_keys( $revIds, true );
}
411 
/**
 * Verify that the external storage targets of two-part ES URLs exist and
 * start with the concat blob header.
 *
 * A 'restore text' error is recorded for every target whose header differs
 * from CONCAT_HEADER and for every target that is not found at all.
 *
 * @param array $externalConcatBlobs Map of cluster => blob id => list of text row ids
 */
private function checkExternalConcatBlobs( $externalConcatBlobs ) {
	if ( count( $externalConcatBlobs ) === 0 ) {
		return;
	}

	// Create the 'DB' external store on first use and keep it for later calls.
	if ( $this->dbStore === null ) {
		$this->dbStore = $this->getServiceContainer()
			->getExternalStoreFactory()
			->getStore( 'DB' );
	}

	// Only as many bytes as the expected header has are fetched per blob.
	$headerLength = strlen( self::CONCAT_HEADER );

	foreach ( $externalConcatBlobs as $cluster => $pending ) {
		$replica = $this->dbStore->getReplica( $cluster );
		$table = $this->dbStore->getTable( $replica );
		$found = $replica->newSelectQueryBuilder()
			->select( [ 'blob_id', "LEFT(blob_text, $headerLength) AS header" ] )
			->from( $table )
			->where( [ 'blob_id' => array_keys( $pending ) ] )
			->caller( __METHOD__ )->fetchResultSet();

		foreach ( $found as $row ) {
			$headerMatches = strcasecmp( $row->header, self::CONCAT_HEADER ) === 0;
			if ( !$headerMatches ) {
				$this->addError(
					'restore text',
					"Error: invalid header on target $cluster/{$row->blob_id} of two-part ES URL",
					$pending[$row->blob_id]
				);
			}
			// Whatever its header, this blob exists.
			unset( $pending[$row->blob_id] );
		}

		// Print errors for missing blobs rows
		foreach ( $pending as $blobId => $textIds ) {
			$this->addError(
				'restore text',
				"Error: missing target $cluster/$blobId for two-part ES URL",
				$textIds
			);
		}
	}
}
453 
/**
 * Restore the text of damaged revisions from an XML dump.
 *
 * Writes the revision ids to a temporary list file, runs the external
 * `mwdumper` program to filter the dump down to those revisions, then feeds
 * the filtered dump through the wiki importer, which calls importRevision()
 * for each revision it finds.
 *
 * @param array $revIds Ids of the revisions whose text should be restored
 * @param string $xml Path to the XML dump
 */
private function restoreText( $revIds, $xml ) {
	global $wgDBname;
	$tmpDir = wfTempDir();

	if ( !count( $revIds ) ) {
		return;
	}

	print "Restoring text from XML backup...\n";

	$revFileName = "$tmpDir/broken-revlist-$wgDBname";
	$filteredXmlFileName = "$tmpDir/filtered-$wgDBname.xml";

	// Write revision list
	if ( !file_put_contents( $revFileName, implode( "\n", $revIds ) ) ) {
		echo "Error writing revision list, can't restore text\n";

		return;
	}

	// Run mwdumper
	echo "Filtering XML dump...\n";
	$exitStatus = 0;
	// phpcs:ignore MediaWiki.Usage.ForbiddenFunctions.passthru
	passthru( 'mwdumper ' .
		Shell::escape(
			"--output=file:$filteredXmlFileName",
			"--filter=revlist:$revFileName",
			$xml
		), $exitStatus
	);

	if ( $exitStatus ) {
		echo "mwdumper died with exit status $exitStatus\n";

		return;
	}

	$file = fopen( $filteredXmlFileName, 'r' );
	if ( !$file ) {
		echo "Unable to open filtered XML file\n";

		return;
	}

	// NOTE(review): presumably this revives connections that may have timed
	// out while mwdumper was running - confirm.
	$dbr = wfGetDB( DB_REPLICA );
	$dbw = wfGetDB( DB_PRIMARY );
	$dbr->ping();
	$dbw->ping();

	// The importer needs an import source; wrap the filtered dump's handle.
	// Without this, $source was undefined when passed to getWikiImporter().
	$source = new ImportStreamSource( $file );
	$importer = $this->getServiceContainer()
		->getWikiImporterFactory()
		->getWikiImporter( $source );
	$importer->setRevisionCallback( [ $this, 'importRevision' ] );
	$importer->setNoticeCallback( static function ( $msg, $params ) {
		echo wfMessage( $msg, $params )->text() . "\n";
	} );
	$importer->doImport();
}
514 
/**
 * Importer callback: overwrite the text row of one damaged revision with the
 * content found in the dump.
 *
 * Revisions without content, with blank content, or without an id are
 * skipped with a message. On success the revision is moved from the
 * 'restore text' error list to the 'fixed' list.
 *
 * @param object $revision Revision handed over by the importer; getID() and
 *  getContent() are called on it
 */
public function importRevision( $revision ) {
	$id = $revision->getID();
	$content = $revision->getContent();
	$id = $id ?: '';

	if ( $content === null ) {
		echo "Revision $id is broken, we have no content available\n";

		return;
	}

	$text = $content->serialize();
	if ( $text === '' ) {
		// This is what happens if the revision was broken at the time the
		// dump was made. Unfortunately, it also happens if the revision was
		// legitimately blank, so there's no way to tell the difference. To
		// be safe, we'll skip it and leave it broken

		echo "Revision $id is blank in the dump, may have been broken before export\n";

		return;
	}

	if ( !$id ) {
		// No ID, can't import
		echo "No id tag in revision, can't import\n";

		return;
	}

	// Find text row again
	$dbr = wfGetDB( DB_REPLICA );
	$res = $dbr->newSelectQueryBuilder()
		->select( [ 'content_address' ] )
		->from( 'slots' )
		->join( 'content', null, 'content_id = slot_content_id' )
		->where( [ 'slot_revision_id' => $id ] )
		->caller( __METHOD__ )->fetchRow();

	// No slot row at all: report it instead of dereferencing a non-object below.
	if ( !$res ) {
		echo "Missing revision row for rev_id $id\n";
		return;
	}

	$blobStore = $this->getServiceContainer()
		->getBlobStoreFactory()
		->newSqlBlobStore();
	$oldId = $blobStore->getTextIdFromAddress( $res->content_address );

	if ( !$oldId ) {
		echo "Missing revision row for rev_id $id\n";
		return;
	}

	// Compress the text
	// NOTE(review): $text is presumably modified in place (taken by reference)
	// while the matching flags are returned - confirm against SqlBlobStore.
	$flags = $blobStore->compressData( $text );

	// Update the text row
	$dbw = wfGetDB( DB_PRIMARY );
	$dbw->update( 'text',
		[ 'old_flags' => $flags, 'old_text' => $text ],
		[ 'old_id' => $oldId ],
		__METHOD__, [ 'LIMIT' => 1 ]
	);

	// Remove it from the unfixed list and add it to the fixed list
	unset( $this->errors['restore text'][$id] );
	$this->errors['fixed'][$id] = true;
}
582 
583 }
584 
585 $maintClass = CheckStorage::class;
586 require_once RUN_MAINTENANCE_IF_MAIN;
wfTempDir()
Tries to get the system directory for temporary files.
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
$matches
$maintClass
Maintenance script to do various checks on external storage.
importRevision( $revision)
check( $fix=false, $xml='')
__construct()
Default constructor.
execute()
Do the actual work.
ExternalStoreDB $dbStore
Imports an XML dump from a file (either from file upload, files on disk, or HTTP)
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: Maintenance.php:66
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
hasOption( $name)
Checks to see if a particular option was set.
getServiceContainer()
Returns the main service container.
getArg( $argId=0, $default=null)
Get an argument.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
Executes shell commands.
Definition: Shell.php:46
$wgDBname
Config variable stub for the DBname setting, for use by phpdoc and IDEs.
$source
const DB_REPLICA
Definition: defines.php:26
const DB_PRIMARY
Definition: defines.php:28
$content
Definition: router.php:76
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Item class for a filearchive table row.
Definition: router.php:42