MediaWiki  1.34.0
checkStorage.php
Go to the documentation of this file.
1 <?php
27 
// Command-line entry point: this only runs when the file is executed without
// MEDIAWIKI already being defined (i.e. not loaded from inside MediaWiki).
if ( !defined( 'MEDIAWIKI' ) ) {
	// Declared before commandLine.inc is loaded; presumably that include reads
	// it so that --fix is treated as a flag taking no value (TODO confirm).
	$optionsWithoutArgs = [ 'fix' ];
	require_once __DIR__ . '/../commandLine.inc';

	// --fix enables the automated repairs; the first positional argument, if
	// present, is the XML dump used to restore damaged text.
	$fix = isset( $options['fix'] );
	$xml = $args[0] ?? false;

	$cs = new CheckStorage();
	$cs->check( $fix, $xml );
}
37 
38 // ----------------------------------------------------------------------------------
39 
46 class CheckStorage {
47  const CONCAT_HEADER = 'O:27:"concatenatedgziphistoryblob"';
48  public $oldIdMap, $errors;
50  public $dbStore = null;
51 
52  public $errorDescriptions = [
53  'restore text' => 'Damaged text, need to be restored from a backup',
54  'restore revision' => 'Damaged revision row, need to be restored from a backup',
55  'unfixable' => 'Unexpected errors with no automated fixing method',
56  'fixed' => 'Errors already fixed',
57  'fixable' => 'Errors which would already be fixed if --fix was specified',
58  ];
59 
/**
 * Scan all revisions in chunks and report problems with their text storage.
 *
 * For every chunk of revision IDs this builds a map from text row ID to the
 * revisions using it, then checks old_flags, external store URLs, serialized
 * object headers and the targets of history blob stubs. Findings are printed
 * as they are found and collected in $this->errors; a summary plus flag and
 * object statistics are printed at the end.
 *
 * @param bool $fix Apply automated fixes (resetting old_flags "0", and
 *   restoring damaged text from $xml when one is given)
 * @param string|bool $xml XML dump handed on to restoreText(); an empty string
 *   or false means that no dump is available
 */
function check( $fix = false, $xml = '' ) {
	global $wgMultiContentRevisionSchemaMigrationStage;

	$dbr = wfGetDB( DB_REPLICA );
	if ( $fix ) {
		print "Checking, will fix errors if possible...\n";
	} else {
		print "Checking...\n";
	}
	// MAX() yields NULL on an empty table; the cast turns that into 0 so the
	// loop below simply does not run.
	$maxRevId = (int)$dbr->selectField( 'revision', 'MAX(rev_id)', '', __METHOD__ );
	$chunkSize = 1000;
	// Number of bytes of old_text fetched when only the object header is needed
	$headerLength = 300;
	$flagStats = [];
	$objectStats = [];
	$knownFlags = [ 'external', 'gzip', 'object', 'utf-8' ];
	$this->errors = [
		'restore text' => [],
		'restore revision' => [],
		'unfixable' => [],
		'fixed' => [],
		'fixable' => [],
	];

	// The bound is inclusive: with "<" the final chunk was skipped whenever
	// the highest rev_id was itself a chunk start (1, 1001, 2001, ...).
	for ( $chunkStart = 1; $chunkStart <= $maxRevId; $chunkStart += $chunkSize ) {
		$chunkEnd = $chunkStart + $chunkSize - 1;

		$this->oldIdMap = [];
		$dbr->ping();

		// Fetch revision rows and map each text row ID to the revisions using it
		if ( $wgMultiContentRevisionSchemaMigrationStage & SCHEMA_COMPAT_READ_OLD ) {
			$res = $dbr->select( 'revision', [ 'rev_id', 'rev_text_id' ],
				[ "rev_id BETWEEN $chunkStart AND $chunkEnd" ], __METHOD__ );
			foreach ( $res as $row ) {
				if ( !in_array( $row->rev_id, $this->oldIdMap[$row->rev_text_id] ?? [] ) ) {
					$this->oldIdMap[$row->rev_text_id][] = $row->rev_id;
				}
			}
		} else {
			$res = $dbr->select(
				[ 'slots', 'content' ],
				[ 'slot_revision_id', 'content_address' ],
				[ "slot_revision_id BETWEEN $chunkStart AND $chunkEnd" ],
				__METHOD__,
				[],
				[ 'content' => [ 'INNER JOIN', [ 'content_id = slot_content_id' ] ] ]
			);
			$blobStore = MediaWikiServices::getInstance()->getBlobStore();
			'@phan-var \MediaWiki\Storage\SqlBlobStore $blobStore';
			foreach ( $res as $row ) {
				$textId = $blobStore->getTextIdFromAddress( $row->content_address );
				// Addresses that do not resolve to a text row ID are skipped
				if ( !$textId ) {
					continue;
				}
				if ( !in_array( $row->slot_revision_id, $this->oldIdMap[$textId] ?? [] ) ) {
					$this->oldIdMap[$textId][] = $row->slot_revision_id;
				}
			}
		}

		if ( !count( $this->oldIdMap ) ) {
			continue;
		}

		// Fetch old_flags
		$missingTextRows = $this->oldIdMap;
		$externalRevs = [];
		$objectRevs = [];
		$res = $dbr->select(
			'text',
			[ 'old_id', 'old_flags' ],
			[ 'old_id' => array_keys( $this->oldIdMap ) ],
			__METHOD__
		);
		foreach ( $res as $row ) {
			$flags = $row->old_flags;
			$id = $row->old_id;

			// Count how often each distinct flags string occurs
			$flagStats[$flags] = ( $flagStats[$flags] ?? 0 ) + 1;

			// Not missing
			unset( $missingTextRows[$row->old_id] );

			// Check for external or object
			$flagArray = $flags == '' ? [] : explode( ',', $flags );
			if ( in_array( 'external', $flagArray ) ) {
				$externalRevs[] = $id;
			} elseif ( in_array( 'object', $flagArray ) ) {
				$objectRevs[] = $id;
			}

			// Check for unrecognised flags
			if ( $flags == '0' ) {
				// This is a known bug from 2004
				// It's safe to just erase the old_flags field
				if ( $fix ) {
					$this->addError( 'fixed', "Warning: old_flags set to 0", $id );
					$dbw = wfGetDB( DB_MASTER );
					$dbw->ping();
					$dbw->update( 'text', [ 'old_flags' => '' ],
						[ 'old_id' => $id ], __METHOD__ );
					echo "Fixed\n";
				} else {
					$this->addError( 'fixable', "Warning: old_flags set to 0", $id );
				}
			} elseif ( count( array_diff( $flagArray, $knownFlags ) ) ) {
				$this->addError( 'unfixable', "Error: invalid flags field \"$flags\"", $id );
			}
		}

		// Output errors for any missing text rows
		foreach ( array_keys( $missingTextRows ) as $oldId ) {
			$this->addError( 'restore revision', "Error: missing text row", $oldId );
		}

		// Verify external revisions. Both maps have the shape
		// cluster => [ blob ID => list of text row IDs ].
		$externalConcatBlobs = [];
		$externalNormalBlobs = [];
		if ( count( $externalRevs ) ) {
			$res = $dbr->select(
				'text',
				[ 'old_id', 'old_flags', 'old_text' ],
				[ 'old_id' => $externalRevs ],
				__METHOD__
			);
			foreach ( $res as $row ) {
				$urlParts = explode( '://', $row->old_text, 2 );
				if ( count( $urlParts ) !== 2 || $urlParts[1] == '' ) {
					$this->addError( 'restore text', "Error: invalid URL \"{$row->old_text}\"", $row->old_id );
					continue;
				}
				$proto = $urlParts[0];
				if ( $proto != 'DB' ) {
					$this->addError(
						'restore text',
						"Error: invalid external protocol \"$proto\"",
						$row->old_id );
					continue;
				}
				// The URL is split on "/": index 2 is the cluster, index 3 the
				// blob ID, and a further segment marks a two-part (concat) URL.
				$path = explode( '/', $row->old_text );
				$cluster = $path[2];
				$id = $path[3];
				if ( isset( $path[4] ) ) {
					$externalConcatBlobs[$cluster][$id][] = $row->old_id;
				} else {
					$externalNormalBlobs[$cluster][$id][] = $row->old_id;
				}
			}
		}

		// Check external concat blobs for the right header
		$this->checkExternalConcatBlobs( $externalConcatBlobs );

		// Check external normal blobs for existence
		if ( count( $externalNormalBlobs ) ) {
			if ( $this->dbStore === null ) {
				$esFactory = MediaWikiServices::getInstance()->getExternalStoreFactory();
				$this->dbStore = $esFactory->getStore( 'DB' );
			}
			// Iterate the *normal* blob map here; looping over the concat map
			// meant one-part URLs were never actually verified.
			foreach ( $externalNormalBlobs as $cluster => $xBlobIds ) {
				$blobIds = array_keys( $xBlobIds );
				$extDb = $this->dbStore->getReplica( $cluster );
				$blobsTable = $this->dbStore->getTable( $extDb );
				$res = $extDb->select( $blobsTable,
					[ 'blob_id' ],
					[ 'blob_id' => $blobIds ],
					__METHOD__
				);
				foreach ( $res as $row ) {
					unset( $xBlobIds[$row->blob_id] );
				}
				// Print errors for missing blobs rows
				foreach ( $xBlobIds as $blobId => $oldIds ) {
					$this->addError(
						'restore text',
						"Error: missing target $blobId for one-part ES URL",
						$oldIds );
				}
			}
		}

		// Check local objects
		$dbr->ping();
		$concatBlobs = [];
		$curIds = [];
		if ( count( $objectRevs ) ) {
			$res = $dbr->select(
				'text',
				[ 'old_id', 'old_flags', "LEFT(old_text, $headerLength) AS header" ],
				[ 'old_id' => $objectRevs ],
				__METHOD__
			);
			foreach ( $res as $row ) {
				$oldId = $row->old_id;
				$matches = [];
				if ( !preg_match( '/^O:(\d+):"(\w+)"/', $row->header, $matches ) ) {
					$this->addError( 'restore text', "Error: invalid object header", $oldId );
					continue;
				}

				$className = strtolower( $matches[2] );
				if ( strlen( $className ) != $matches[1] ) {
					$this->addError(
						'restore text',
						"Error: invalid object header, wrong class name length",
						$oldId
					);
					continue;
				}

				$objectStats[$className] = ( $objectStats[$className] ?? 0 ) + 1;

				switch ( $className ) {
					case 'concatenatedgziphistoryblob':
						// Good
						break;
					case 'historyblobstub':
					case 'historyblobcurstub':
						// A header filling the whole prefix may have been cut
						// off, so it cannot be unserialized reliably.
						if ( strlen( $row->header ) == $headerLength ) {
							$this->addError( 'unfixable', "Error: overlong stub header", $oldId );
							break;
						}
						// NOTE(review): unserialize() on database content can
						// instantiate arbitrary classes; consider restricting
						// it with the allowed_classes option.
						$stubObj = unserialize( $row->header );
						if ( !is_object( $stubObj ) ) {
							$this->addError( 'restore text', "Error: unable to unserialize stub object", $oldId );
							break;
						}
						if ( $className == 'historyblobstub' ) {
							$concatBlobs[$stubObj->mOldId][] = $oldId;
						} else {
							$curIds[$stubObj->mCurId][] = $oldId;
						}
						break;
					default:
						$this->addError( 'unfixable', "Error: unrecognised object class \"$className\"", $oldId );
				}
			}
		}

		// Check local concat blob validity
		$externalConcatBlobs = [];
		if ( count( $concatBlobs ) ) {
			$res = $dbr->select(
				'text',
				[ 'old_id', 'old_flags', "LEFT(old_text, $headerLength) AS header" ],
				[ 'old_id' => array_keys( $concatBlobs ) ],
				__METHOD__
			);
			foreach ( $res as $row ) {
				$blobFlags = explode( ',', $row->old_flags );
				if ( in_array( 'external', $blobFlags ) ) {
					// Concat blob is in external storage?
					if ( in_array( 'object', $blobFlags ) ) {
						$urlParts = explode( '/', $row->header );
						if ( $urlParts[0] != 'DB:' ) {
							// The closing quote was missing from this message
							$this->addError(
								'unfixable',
								"Error: unrecognised external storage type \"{$urlParts[0]}\"",
								$row->old_id
							);
						} else {
							$cluster = $urlParts[2];
							$id = $urlParts[3];
							$externalConcatBlobs[$cluster][$id] = array_merge(
								$externalConcatBlobs[$cluster][$id] ?? [],
								$concatBlobs[$row->old_id]
							);
						}
					} else {
						$this->addError(
							'unfixable',
							"Error: invalid flags \"{$row->old_flags}\" on concat bulk row {$row->old_id}",
							$concatBlobs[$row->old_id] );
					}
				} elseif ( strcasecmp(
					substr( $row->header, 0, strlen( self::CONCAT_HEADER ) ),
					self::CONCAT_HEADER
				) ) {
					$this->addError(
						'restore text',
						"Error: Incorrect object header for concat bulk row {$row->old_id}",
						$concatBlobs[$row->old_id]
					);
				} // else good

				unset( $concatBlobs[$row->old_id] );
			}
			// NOTE(review): entries still left in $concatBlobs here are stubs
			// whose target text row was not returned by the query; they are
			// not reported anywhere.
		}

		// Check targets of unresolved stubs
		$this->checkExternalConcatBlobs( $externalConcatBlobs );
		// next chunk
	}

	print "\n\nErrors:\n";
	foreach ( $this->errors as $name => $errors ) {
		if ( count( $errors ) ) {
			$description = $this->errorDescriptions[$name];
			echo "$description: " . implode( ',', array_keys( $errors ) ) . "\n";
		}
	}

	if ( count( $this->errors['restore text'] ) && $fix ) {
		if ( (string)$xml !== '' ) {
			$this->restoreText( array_keys( $this->errors['restore text'] ), $xml );
		} else {
			echo "Can't fix text, no XML backup specified\n";
		}
	}

	print "\nFlag statistics:\n";
	$total = array_sum( $flagStats );
	foreach ( $flagStats as $flag => $count ) {
		printf( "%-30s %10d %5.2f%%\n", $flag, $count, $count / $total * 100 );
	}
	print "\nLocal object statistics:\n";
	$total = array_sum( $objectStats );
	foreach ( $objectStats as $className => $count ) {
		printf( "%-30s %10d %5.2f%%\n", $className, $count, $count / $total * 100 );
	}
}
400 
/**
 * Print an error message and record the affected revisions under an error type.
 *
 * Text row IDs are translated to revision IDs through $this->oldIdMap; the
 * revision IDs become keys of $this->errors[$type].
 *
 * @param string $type Key of $this->errors to record the revisions under
 * @param string $msg Message prefix to print
 * @param int|string|array $ids A single text row ID, or a list of them
 */
function addError( $type, $msg, $ids ) {
	// A one-element list is reported in the same format as a single ID
	if ( is_array( $ids ) && count( $ids ) == 1 ) {
		$ids = reset( $ids );
	}

	if ( is_array( $ids ) ) {
		$revIds = [];
		foreach ( $ids as $id ) {
			// IDs unknown to the map contribute no revisions instead of
			// passing null to array_merge().
			$revIds = array_merge( $revIds, $this->oldIdMap[$id] ?? [] );
		}
		// Deduplicate once, after collecting everything
		$revIds = array_unique( $revIds );
		print "$msg in text rows " . implode( ', ', $ids ) .
			", revisions " . implode( ', ', $revIds ) . "\n";
	} else {
		$id = $ids;
		$revIds = $this->oldIdMap[$id] ?? [];
		if ( count( $revIds ) == 1 ) {
			print "$msg in old_id $id, rev_id {$revIds[0]}\n";
		} else {
			print "$msg in old_id $id, revisions " . implode( ', ', $revIds ) . "\n";
		}
	}

	// Array union keeps entries already recorded for this type
	$this->errors[$type] += array_flip( $revIds );
}
423 
/**
 * Verify that the targets of two-part external store URLs exist and start
 * with the concatenated history blob header.
 *
 * Blobs with a wrong header and blobs that are missing altogether are both
 * reported as 'restore text' errors.
 *
 * @param array $externalConcatBlobs Map of
 *   cluster => [ blob ID => list of text row IDs pointing at that blob ]
 */
function checkExternalConcatBlobs( $externalConcatBlobs ) {
	if ( !count( $externalConcatBlobs ) ) {
		return;
	}

	// The external store is created lazily and reused across calls
	if ( $this->dbStore === null ) {
		$esFactory = MediaWikiServices::getInstance()->getExternalStoreFactory();
		$this->dbStore = $esFactory->getStore( 'DB' );
	}

	// Only as many bytes as the expected header are fetched from each blob
	$headerLength = strlen( self::CONCAT_HEADER );

	foreach ( $externalConcatBlobs as $cluster => $oldIds ) {
		$blobIds = array_keys( $oldIds );
		// Plain assignment: an object handle does not need "=&"
		$extDb = $this->dbStore->getReplica( $cluster );
		$blobsTable = $this->dbStore->getTable( $extDb );
		$res = $extDb->select( $blobsTable,
			[ 'blob_id', "LEFT(blob_text, $headerLength) AS header" ],
			[ 'blob_id' => $blobIds ],
			__METHOD__
		);
		foreach ( $res as $row ) {
			// strcasecmp() returns non-zero when the headers differ
			if ( strcasecmp( $row->header, self::CONCAT_HEADER ) ) {
				$this->addError(
					'restore text',
					"Error: invalid header on target $cluster/{$row->blob_id} of two-part ES URL",
					$oldIds[$row->blob_id]
				);
			}
			// Found, so not missing
			unset( $oldIds[$row->blob_id] );
		}

		// Print errors for missing blobs rows
		foreach ( $oldIds as $blobId => $oldIds2 ) {
			$this->addError(
				'restore text',
				"Error: missing target $cluster/$blobId for two-part ES URL",
				$oldIds2
			);
		}
	}
}
465 
/**
 * Restore the text of the given revisions from an XML dump.
 *
 * The revision IDs are written to a temporary list file, mwdumper filters the
 * dump down to those revisions, and the filtered dump is then fed through
 * WikiImporter with importRevision() as the per-revision callback.
 *
 * @param array $revIds Revision IDs whose text should be restored
 * @param string $xml XML dump, passed to mwdumper as its input argument
 */
function restoreText( $revIds, $xml ) {
	global $wgDBname;

	if ( !count( $revIds ) ) {
		return;
	}

	print "Restoring text from XML backup...\n";

	$tmpDir = wfTempDir();
	$revFileName = "$tmpDir/broken-revlist-$wgDBname";
	$filteredXmlFileName = "$tmpDir/filtered-$wgDBname.xml";

	// Write revision list
	if ( file_put_contents( $revFileName, implode( "\n", $revIds ) ) === false ) {
		echo "Error writing revision list, can't restore text\n";

		return;
	}

	// Run mwdumper
	echo "Filtering XML dump...\n";
	$exitStatus = 0;
	passthru( 'mwdumper ' .
		Shell::escape(
			"--output=file:$filteredXmlFileName",
			"--filter=revlist:$revFileName",
			$xml
		), $exitStatus
	);

	if ( $exitStatus ) {
		echo "mwdumper died with exit status $exitStatus\n";

		return;
	}

	$file = fopen( $filteredXmlFileName, 'r' );
	if ( !$file ) {
		echo "Unable to open filtered XML file\n";

		return;
	}

	// Ping both connections after the external mwdumper run
	$dbr = wfGetDB( DB_REPLICA );
	$dbw = wfGetDB( DB_MASTER );
	$dbr->ping();
	$dbw->ping();

	// Wrap the opened dump so that the importer has a source to read from
	$source = new ImportStreamSource( $file );
	$importer = new WikiImporter(
		$source,
		MediaWikiServices::getInstance()->getMainConfig()
	);
	$importer->setRevisionCallback( [ $this, 'importRevision' ] );
	$importer->setNoticeCallback( function ( $msg, $params ) {
		echo wfMessage( $msg, $params )->text() . "\n";
	} );
	$importer->doImport();
}
526 
/**
 * Revision callback for the importer: overwrite the text row of one revision
 * with the content found in the dump.
 *
 * Revisions without content, with blank text, without an ID, or without a
 * resolvable text row are reported and skipped. On success the revision is
 * moved from the 'restore text' error list to the 'fixed' list.
 *
 * @param object &$revision Imported revision; must provide getID() and getContent()
 * @param object &$importer Importer invoking the callback (unused)
 */
function importRevision( &$revision, &$importer ) {
	global $wgMultiContentRevisionSchemaMigrationStage;

	$id = $revision->getID();
	$content = $revision->getContent( RevisionRecord::RAW );
	$id = $id ?: '';

	if ( $content === null ) {
		echo "Revision $id is broken, we have no content available\n";

		return;
	}

	$text = $content->serialize();
	if ( $text === '' ) {
		// This is what happens if the revision was broken at the time the
		// dump was made. Unfortunately, it also happens if the revision was
		// legitimately blank, so there's no way to tell the difference. To
		// be safe, we'll skip it and leave it broken

		echo "Revision $id is blank in the dump, may have been broken before export\n";

		return;
	}

	if ( !$id ) {
		// No ID, can't import
		echo "No id tag in revision, can't import\n";

		return;
	}

	// Find text row again
	$dbr = wfGetDB( DB_REPLICA );
	if ( $wgMultiContentRevisionSchemaMigrationStage & SCHEMA_COMPAT_READ_OLD ) {
		$oldId = $dbr->selectField( 'revision', 'rev_text_id', [ 'rev_id' => $id ], __METHOD__ );
	} else {
		$res = $dbr->selectRow(
			[ 'slots', 'content' ],
			[ 'content_address' ],
			[ 'slot_revision_id' => $id ],
			__METHOD__,
			[],
			[ 'content' => [ 'INNER JOIN', [ 'content_id = slot_content_id' ] ] ]
		);
		// selectRow() yields false when no slot row matches; treat that like
		// an unresolvable text row instead of reading a property of false.
		$oldId = false;
		if ( $res ) {
			// @phan-suppress-next-line PhanAccessMethodInternal
			$blobStore = MediaWikiServices::getInstance()
				->getBlobStoreFactory()
				->newSqlBlobStore();
			$oldId = $blobStore->getTextIdFromAddress( $res->content_address );
		}
	}

	if ( !$oldId ) {
		echo "Missing revision row for rev_id $id\n";
		return;
	}

	// Compress the text; $text is passed by reference and the returned flags
	// are stored alongside it.
	$flags = Revision::compressRevisionText( $text );

	// Update the text row
	$dbw = wfGetDB( DB_MASTER );
	$dbw->update( 'text',
		[ 'old_flags' => $flags, 'old_text' => $text ],
		[ 'old_id' => $oldId ],
		__METHOD__, [ 'LIMIT' => 1 ]
	);

	// Remove it from the unfixed list and add it to the fixed list
	unset( $this->errors['restore text'][$id] );
	$this->errors['fixed'][$id] = true;
}
598 }
MediaWiki\Shell\Shell
Executes shell commands.
Definition: Shell.php:44
WikiImporter
XML file reader for the page data importer.
Definition: WikiImporter.php:35
CheckStorage
Maintenance script to do various checks on external storage.
Definition: checkStorage.php:46
Revision\RevisionRecord
Page revision base class.
Definition: RevisionRecord.php:46
ExternalStoreDB
DB accessible external objects.
Definition: ExternalStoreDB.php:39
$optionsWithoutArgs
global $optionsWithoutArgs
Definition: commandLine.inc:24
$wgDBname
$wgDBname
Current wiki database name.
Definition: DefaultSettings.php:1893
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:117
CheckStorage\check
check( $fix=false, $xml='')
Definition: checkStorage.php:60
$file
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Item class for a filearchive table row.
Definition: router.php:42
$wgMultiContentRevisionSchemaMigrationStage
int $wgMultiContentRevisionSchemaMigrationStage
RevisionStore table schema migration stage (content, slots, content_models & slot_roles tables).
Definition: DefaultSettings.php:9003
wfMessage
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Definition: GlobalFunctions.php:1264
$res
$res
Definition: testCompression.php:52
ImportStreamSource
Imports a XML dump from a file (either from file upload, files on disk, or HTTP)
Definition: ImportStreamSource.php:32
CheckStorage\importRevision
importRevision(&$revision, &$importer)
Definition: checkStorage.php:527
CheckStorage\$errorDescriptions
$errorDescriptions
Definition: checkStorage.php:52
$dbr
$dbr
Definition: testCompression.php:50
CheckStorage\$oldIdMap
$oldIdMap
Definition: checkStorage.php:48
wfGetDB
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
Definition: GlobalFunctions.php:2575
$matches
$matches
Definition: NoLocalSettings.php:24
Revision\compressRevisionText
static compressRevisionText(&$text)
If $wgCompressRevisions is enabled, we will compress data.
Definition: Revision.php:926
CheckStorage\addError
addError( $type, $msg, $ids)
Definition: checkStorage.php:401
DB_REPLICA
const DB_REPLICA
Definition: defines.php:25
DB_MASTER
const DB_MASTER
Definition: defines.php:26
CheckStorage\CONCAT_HEADER
const CONCAT_HEADER
Definition: checkStorage.php:47
$content
$content
Definition: router.php:78
CheckStorage\restoreText
restoreText( $revIds, $xml)
Definition: checkStorage.php:466
unserialize
unserialize( $serialized)
Definition: ApiMessageTrait.php:146
$args
if( $line===false) $args
Definition: cdb.php:64
wfTempDir
wfTempDir()
Tries to get the system directory for temporary files.
Definition: GlobalFunctions.php:1947
CheckStorage\$errors
$errors
Definition: checkStorage.php:48
$path
$path
Definition: NoLocalSettings.php:25
$source
$source
Definition: mwdoc-filter.php:34
CheckStorage\$dbStore
ExternalStoreDB $dbStore
Definition: checkStorage.php:50
SCHEMA_COMPAT_READ_OLD
const SCHEMA_COMPAT_READ_OLD
Definition: Defines.php:265
CheckStorage\checkExternalConcatBlobs
checkExternalConcatBlobs( $externalConcatBlobs)
Definition: checkStorage.php:424
$type
$type
Definition: testCompression.php:48