MediaWiki  master
fixT22757.php
Go to the documentation of this file.
1 <?php
24 require_once __DIR__ . '/../Maintenance.php';
25 
31 class FixT22757 extends Maintenance {
32  public $batchSize = 10000;
33  public $mapCache = [];
34  public $mapCacheSize = 0;
35  public $maxMapCacheSize = 1000000;
36 
/**
 * Set up the script: register its description and its two command-line
 * options ('dry-run' and 'start').
 */
function __construct() {
	parent::__construct();

	$this->addDescription(
		'Script to fix T22757 assuming that blob_tracking is intact'
	);

	// Plain flag, no argument.
	$this->addOption( 'dry-run', 'Report only' );

	// Not required, but takes an argument when given.
	$this->addOption(
		'start',
		'old_id to start at',
		false,
		true
	);
}
43 
/**
 * Walk the text table in batches looking for serialized HistoryBlobStub rows,
 * and repoint each one that can be matched to a blob_tracking row directly at
 * the tracked external blob.
 *
 * For every stub found, the secondary text row it refers to (mOldId) is
 * looked up in blob_tracking:
 *  - no tracking row: the secondary row is inspected and the stub is counted
 *    as either good (still loadable) or unrecoverable;
 *  - tracking row present: the text is fetched from the tracked location and
 *    its md5 is compared with the stub's hash; on success the text row is
 *    rewritten to an external URL and a new blob_tracking row is inserted
 *    (both writes are skipped when 'dry-run' is set).
 *
 * Progress and a final summary are printed to standard output.
 */
function execute() {
	$replicaDb = $this->getDB( DB_REPLICA );
	$writeDb = $this->getDB( DB_MASTER );

	$dryRun = $this->getOption( 'dry-run' );
	if ( $dryRun ) {
		print "Dry run only.\n";
	}

	$startId = $this->getOption( 'start', 0 );
	$numGood = $numFixed = $numBad = 0;

	$totalRevs = $replicaDb->selectField( 'text', 'MAX(old_id)', '', __METHOD__ );

	// In MySQL 4.1+, the binary field old_text has a non-working LOWER() function
	$lowerLeft = 'LOWER(CONVERT(LEFT(old_text,22) USING latin1))';

	// Conditions that are the same for every batch; the old_id lower bound
	// is prepended per iteration.
	$stubFilter = [
		'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\'',
		"$lowerLeft = 'o:15:\"historyblobstub\"'",
	];

	for ( ; ; ) {
		print "ID: $startId / $totalRevs\r";

		$stubRows = $replicaDb->select(
			'text',
			[ 'old_id', 'old_flags', 'old_text' ],
			array_merge( [ 'old_id > ' . intval( $startId ) ], $stubFilter ),
			__METHOD__,
			[
				'ORDER BY' => 'old_id',
				'LIMIT' => $this->batchSize,
			]
		);

		if ( !$stubRows->numRows() ) {
			break;
		}

		$secondaryIds = [];
		$stubs = [];

		foreach ( $stubRows as $row ) {
			// Remember how far we got so the next batch starts after this row
			$startId = $row->old_id;

			// Basic sanity checks
			$obj = unserialize( $row->old_text );
			if ( $obj === false ) {
				print "{$row->old_id}: unrecoverable: cannot unserialize\n";
				++$numBad;
				continue;
			}

			if ( !is_object( $obj ) ) {
				print "{$row->old_id}: unrecoverable: unserialized to type " .
					gettype( $obj ) . ", possible double-serialization\n";
				++$numBad;
				continue;
			}

			$className = get_class( $obj );
			if ( strtolower( $className ) !== 'historyblobstub' ) {
				print "{$row->old_id}: unrecoverable: unexpected object class " .
					$className . "\n";
				++$numBad;
				continue;
			}

			// Process flags: anything without a UTF-8 marker is legacy-encoded
			$flagList = explode( ',', $row->old_flags );
			$hasUtf8Flag = in_array( 'utf-8', $flagList ) || in_array( 'utf8', $flagList );

			// Queue the stub for future batch processing
			$targetId = intval( $obj->mOldId );
			$secondaryIds[] = $targetId;
			$stubs[$row->old_id] = [
				'legacyEncoding' => !$hasUtf8Flag,
				'secondaryId' => $targetId,
				'hash' => $obj->mHash,
			];
		}

		$secondaryIds = array_unique( $secondaryIds );

		if ( count( $secondaryIds ) === 0 ) {
			// Every row in this batch failed the sanity checks
			continue;
		}

		// Run the batch query on blob_tracking
		$trackingRows = $replicaDb->select(
			'blob_tracking',
			'*',
			[
				'bt_text_id' => $secondaryIds,
			],
			__METHOD__
		);
		$trackedBlobs = [];
		foreach ( $trackingRows as $trackingRow ) {
			$trackedBlobs[$trackingRow->bt_text_id] = $trackingRow;
		}

		// Process the stubs
		foreach ( $stubs as $primaryId => $stub ) {
			$secondaryId = $stub['secondaryId'];
			$trackRow = $trackedBlobs[$secondaryId] ?? null;

			if ( $trackRow === null ) {
				// No tracked blob. Work out what went wrong
				$secondaryRow = $replicaDb->selectRow(
					'text',
					[ 'old_flags', 'old_text' ],
					[ 'old_id' => $secondaryId ],
					__METHOD__
				);

				if ( $secondaryRow && $this->isUnbrokenStub( $stub, $secondaryRow ) ) {
					// Not broken yet, and not in the tracked clusters so it won't get
					// broken by the current RCT run.
					++$numGood;
				} else {
					if ( !$secondaryRow ) {
						print "$primaryId: unrecoverable: secondary row is missing\n";
					} elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
						print "$primaryId: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
					} else {
						print "$primaryId: unrecoverable: miscellaneous corruption of secondary row\n";
					}
					++$numBad;
				}

				unset( $stubs[$primaryId] );
				continue;
			}

			// Check that the specified text really is available in the tracked source row
			$url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
			$text = ExternalStore::fetchFromURL( $url );
			if ( $text === false ) {
				print "$primaryId: unrecoverable: source text missing\n";
				++$numBad;
				unset( $stubs[$primaryId] );
				continue;
			}
			if ( md5( $text ) !== $stub['hash'] ) {
				print "$primaryId: unrecoverable: content hashes do not match\n";
				++$numBad;
				unset( $stubs[$primaryId] );
				continue;
			}

			// Find the page_id and rev_id
			// The page is probably the same as the page of the secondary row
			$pageId = intval( $trackRow->bt_page );
			$revId = $pageId ? $this->findTextIdInPage( $pageId, $primaryId ) : 0;
			if ( !$revId ) {
				// Either no page was tracked, or the text row is actually an orphan
				$pageId = 0;
				$revId = 0;
			}

			if ( $stub['legacyEncoding'] ) {
				$newFlags = 'external';
			} else {
				$newFlags = 'external,utf-8';
			}

			if ( !$dryRun ) {
				// Reset the text row to point to the original copy
				$this->beginTransaction( $writeDb, __METHOD__ );
				$writeDb->update(
					'text',
					// SET
					[
						'old_flags' => $newFlags,
						'old_text' => $url
					],
					// WHERE
					[ 'old_id' => $primaryId ],
					__METHOD__
				);

				// Add a blob_tracking row so that the new reference can be recompressed
				// without needing to run trackBlobs.php again
				$writeDb->insert(
					'blob_tracking',
					[
						'bt_page' => $pageId,
						'bt_rev_id' => $revId,
						'bt_text_id' => $primaryId,
						'bt_cluster' => $trackRow->bt_cluster,
						'bt_blob_id' => $trackRow->bt_blob_id,
						'bt_cgz_hash' => $stub['hash'],
						'bt_new_url' => null,
						'bt_moved' => 0,
					],
					__METHOD__
				);
				$this->commitTransaction( $writeDb, __METHOD__ );
			}

			// Reported (and counted) in dry-run mode as well
			print "$primaryId: resolved to $url\n";
			++$numFixed;
		}
	}

	print "\n";
	print "Fixed: $numFixed\n";
	print "Unrecoverable: $numBad\n";
	print "Good stubs: $numGood\n";
}
255 
/**
 * Find the revision of a page that uses a given text row.
 *
 * @param int $pageId Page whose text-to-revision map is consulted
 * @param int|string $textId Text row ID to look up
 * @return mixed|null The rev_id recorded for $textId in that page's map, or
 *  null if the map has no entry for it
 */
function findTextIdInPage( $pageId, $textId ) {
	$revisionByText = $this->getRevTextMap( $pageId );

	if ( isset( $revisionByText[$textId] ) ) {
		return $revisionByText[$textId];
	}

	return null;
}
260 
/**
 * Get the map of text row IDs to revision IDs for one page.
 *
 * The map is loaded from the revision table the first time a page is
 * requested and kept in $this->mapCache. Before a new page is loaded, cached
 * maps are evicted from the front of the cache (earliest inserted first)
 * until the total number of cached entries no longer exceeds
 * $this->maxMapCacheSize.
 *
 * @param int $pageId
 * @return array Map of rev_text_id => rev_id for the page
 */
function getRevTextMap( $pageId ) {
	if ( !isset( $this->mapCache[$pageId] ) ) {
		// Limit cache size. The non-empty check guarantees key() below never
		// returns null, even if the size counter has drifted.
		while ( $this->mapCacheSize > $this->maxMapCacheSize && $this->mapCache ) {
			// Always evict the first element rather than depending on
			// wherever the array's internal pointer currently is.
			reset( $this->mapCache );
			$key = key( $this->mapCache );
			$this->mapCacheSize -= count( $this->mapCache[$key] );
			unset( $this->mapCache[$key] );
		}
		if ( !$this->mapCache ) {
			// Nothing is cached, so the running total must be zero
			$this->mapCacheSize = 0;
		}

		$dbr = $this->getDB( DB_REPLICA );
		$map = [];
		$res = $dbr->select( 'revision',
			[ 'rev_id', 'rev_text_id' ],
			[ 'rev_page' => $pageId ],
			__METHOD__
		);
		foreach ( $res as $row ) {
			$map[$row->rev_text_id] = $row->rev_id;
		}
		$this->mapCache[$pageId] = $map;
		$this->mapCacheSize += count( $map );
	}

	return $this->mapCache[$pageId];
}
286 
/**
 * Check whether the item a stub refers to can still be loaded from its
 * secondary text row.
 *
 * This is based on part of HistoryBlobStub::getText().
 *
 * Returns false, instead of raising warnings or errors, when the row is not
 * flagged as an object, the external URL is malformed or cannot be fetched,
 * the data cannot be inflated or unserialized, or the unserialized object
 * does not provide uncompress() and getItem().
 *
 * @param array $stub Stub description; only $stub['hash'] is read here
 * @param object $secondaryRow Row with old_flags and old_text properties
 * @return bool True if getItem() returned something other than false for
 *  the stub's hash
 */
function isUnbrokenStub( $stub, $secondaryRow ) {
	$flags = explode( ',', $secondaryRow->old_flags );

	// A row without the 'object' flag can never hold a blob object, so
	// decide that before doing any external fetch.
	if ( !in_array( 'object', $flags, true ) ) {
		return false;
	}

	$text = $secondaryRow->old_text;
	if ( in_array( 'external', $flags, true ) ) {
		$url = $text;
		// The URL must have a non-empty part after "://"
		$urlParts = explode( '://', $url, 2 );
		if ( ( $urlParts[1] ?? '' ) === '' ) {
			return false;
		}
		$text = ExternalStore::fetchFromURL( $url );
		if ( $text === false ) {
			return false;
		}
	}

	if ( in_array( 'gzip', $flags, true ) ) {
		$text = gzinflate( $text );
		if ( $text === false ) {
			return false;
		}
	}

	$obj = unserialize( $text );

	if ( is_string( $obj ) ) {
		// Correct for old double-serialization bug.
		$obj = unserialize( $obj );
	}

	// Only call the blob methods on objects that actually provide them; an
	// unrelated or incomplete class would otherwise cause a fatal error.
	if (
		!is_object( $obj ) ||
		!method_exists( $obj, 'uncompress' ) ||
		!method_exists( $obj, 'getItem' )
	) {
		return false;
	}

	$obj->uncompress();
	$text = $obj->getItem( $stub['hash'] );

	return $text !== false;
}
332 }
333 
334 $maintClass = FixT22757::class;
335 require_once RUN_MAINTENANCE_IF_MAIN;
commitTransaction(IDatabase $dbw, $fname)
Commit the transaction on a DB handle and wait for replica DBs to catch up.
const RUN_MAINTENANCE_IF_MAIN
Definition: Maintenance.php:39
getOption( $name, $default=null)
Get an option, or return the default.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
Definition: Maintenance.php:82
isUnbrokenStub( $stub, $secondaryRow)
This is based on part of HistoryBlobStub::getText().
Definition: fixT22757.php:294
const DB_MASTER
Definition: defines.php:26
findTextIdInPage( $pageId, $textId)
Definition: fixT22757.php:256
addDescription( $text)
Set the description text.
unserialize( $serialized)
$maintClass
Definition: fixT22757.php:334
getRevTextMap( $pageId)
Definition: fixT22757.php:261
__construct()
Definition: fixT22757.php:37
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
const DB_REPLICA
Definition: defines.php:25
$maxMapCacheSize
Definition: fixT22757.php:35
getDB( $db, $groups=[], $dbDomain=false)
Returns a database to be used by current maintenance script.
Maintenance script to fix T22757.
Definition: fixT22757.php:31
static fetchFromURL( $url, array $params=[])
Fetch data from given URL.
beginTransaction(IDatabase $dbw, $fname)
Begin a transaction on a DB.