MediaWiki  1.28.0
trackBlobs.php
Go to the documentation of this file.
1 <?php
require __DIR__ . '/../commandLine.inc';

// $args is expected to be populated by commandLine.inc with the positional
// command-line arguments; at least one cluster name is required.
if ( count( $args ) < 1 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n";
	echo "Adds blobs from a given ES cluster to the blob_tracking table\n";
	echo "Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}

// Every positional argument is treated as the name of a cluster to scan.
$tracker = new TrackBlobs( $args );
$tracker->run();
echo "All done.\n";
37 
/**
 * Populates the blob_tracking table (and, when the gmp extension is loaded,
 * the blob_orphans table) for a given list of external storage clusters.
 *
 * The passes are run in order by run(): an integrity check, re-creation of the
 * tracking tables, a scan of the revision table, a scan of the text table for
 * rows not yet tracked, and finally a scan of each cluster's blobs table for
 * blobs that no tracked text row points at.
 */
class TrackBlobs {
	/** @var string[] Cluster names to scan, as passed to the constructor */
	public $clusters;

	/** @var string|null SQL condition on old_text, lazily built and cached by getTextClause() */
	public $textClause;

	/** @var bool True when the gmp extension is loaded and the orphan blob pass can run */
	public $doBlobOrphans = false;

	/** @var array Map of cluster name => GMP bitfield; bit N is set once blob ID N has been tracked */
	public $trackedBlobs = [];

	/** @var int Maximum number of rows fetched per SELECT batch */
	public $batchSize = 1000;

	/** @var int Number of batches between progress reports */
	public $reportingInterval = 10;

	/**
	 * @param string[] $clusters Names of the clusters to track
	 */
	public function __construct( $clusters ) {
		$this->clusters = $clusters;
		if ( extension_loaded( 'gmp' ) ) {
			$this->doBlobOrphans = true;
			// One empty bitfield per cluster; bits are set as blob IDs are seen.
			foreach ( $clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}
	}

	/**
	 * Run every pass in order. The orphan blob pass is skipped when the gmp
	 * extension is not available.
	 */
	public function run() {
		$this->checkIntegrity();
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Refuse to continue if the text or archive tables contain rows this
	 * script cannot handle. Prints a message and exits with status 1 when a
	 * problem row is found.
	 */
	public function checkIntegrity() {
		echo "Doing integrity check...\n";
		$dbr = wfGetDB( DB_REPLICA );

		// Scan for HistoryBlobStub objects in the text table (bug 20757)

		$exists = $dbr->selectField( 'text', 1,
			'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
			'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
			__METHOD__
		);

		if ( $exists ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
				"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
				"to fix this.\n";
			exit( 1 );
		}

		// Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
		$flags = $dbr->selectField( 'archive', 'ar_flags',
			'ar_flags LIKE \'%external%\' OR (' .
			'ar_flags LIKE \'%object%\' ' .
			'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
			__METHOD__
		);

		// Only call strpos() on an actual string; a non-string result (no
		// matching row) falls through to the "OK" message below.
		if ( is_string( $flags ) && strpos( $flags, 'external' ) !== false ) {
			echo "Integrity check failed: found external storage pointers in your archive table.\n" .
				"Run normaliseArchiveTable.php to fix this.\n";
			exit( 1 );
		} elseif ( $flags ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
				"These objects are probably already broken, continuing would make them\n" .
				"unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
			exit( 1 );
		}

		echo "Integrity check OK\n";
	}

	/**
	 * Drop any existing tracking tables and re-create them from
	 * blob_tracking.sql.
	 */
	public function initTrackingTable() {
		$dbw = wfGetDB( DB_MASTER );
		// Check each table on its own so that a half-initialised state (only
		// one of the two tables present) neither fails on a DROP of a missing
		// table nor leaves a stale table behind.
		foreach ( [ 'blob_tracking', 'blob_orphans' ] as $table ) {
			if ( $dbw->tableExists( $table ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ) );
			}
		}
		// NOTE(review): blob_tracking.sql presumably creates both tables — confirm.
		$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
	}

	/**
	 * Build (once) and return the SQL condition matching old_text values that
	 * start with "DB://<cluster>/" for any of the configured clusters.
	 *
	 * @return string Per-cluster LIKE conditions joined with " OR "
	 */
	public function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_REPLICA );
			$conds = [];
			foreach ( $this->clusters as $cluster ) {
				$conds[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
			}
			$this->textClause = implode( ' OR ', $conds );
		}

		return $this->textClause;
	}

	/**
	 * Split a "DB://<cluster>/<id>[/<hash>]" pointer into its parts.
	 *
	 * @param string $text Pointer text, as stored in old_text
	 * @return array|bool Array with keys 'cluster' (string), 'id' (int) and
	 *   'hash' (hex string, or null when absent); false if $text does not
	 *   match the pointer format
	 */
	public function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}

		return [
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			// The hash group is absent from $m when the pointer has no third segment.
			'hash' => isset( $m[3] ) ? $m[3] : null
		];
	}

	/**
	 * Scan the revision table for rows stored in the specified clusters and
	 * record each one in blob_tracking.
	 */
	public function trackRevisions() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_REPLICA );

		$textClause = $this->getTextClause();
		$startId = 0;
		// Only used for the progress display.
		$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		while ( true ) {
			$res = $dbr->select( [ 'revision', 'text' ],
				[ 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ],
				[
					'rev_id > ' . $dbr->addQuotes( $startId ),
					'rev_text_id=old_id',
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				],
				__METHOD__,
				[
					'ORDER BY' => 'rev_id',
					'LIMIT' => $this->batchSize
				]
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = [];
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that end up being skipped.
				$startId = $row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = [
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				];
				if ( $this->doBlobOrphans ) {
					// Remember the blob as referenced for findOrphanBlobs().
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// Every row of the batch may have been skipped as invalid.
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for rows in the specified clusters that have no
	 * blob_tracking row yet, and record them with bt_page and bt_rev_id set
	 * to 0.
	 */
	public function trackOrphanText() {
		# Wait until the blob_tracking table is available in the replica DB
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_REPLICA );
		$pos = $dbw->getMasterPos();
		$dbr->masterPosWait( $pos, 100000 );

		$textClause = $this->getTextClause();
		$startId = 0;
		// Only used for the progress display.
		$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			// LEFT JOIN + "bt_text_id IS NULL" keeps only text rows that are
			// not already present in blob_tracking.
			$res = $dbr->select( [ 'text', 'blob_tracking' ],
				[ 'old_id', 'old_flags', 'old_text' ],
				[
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
					'bt_text_id IS NULL'
				],
				__METHOD__,
				[
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				],
				[ 'blob_tracking' => [ 'LEFT JOIN', 'bt_text_id=old_id' ] ]
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = [];
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that end up being skipped.
				$startId = $row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters ) ) {
					echo "Invalid cluster returned in SQL query\n";
					continue;
				}

				$insertBatch[] = [
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				];
				if ( $this->doBlobOrphans ) {
					// Remember the blob as referenced for findOrphanBlobs().
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// Every row of the batch may have been skipped as invalid.
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * Scan each cluster's blobs table for rows whose blob ID was not recorded
	 * by trackRevisions() or trackOrphanText(), and write them to
	 * blob_orphans.
	 *
	 * Requires the gmp extension; prints a message and returns if it is not
	 * loaded. Clusters that cannot be connected to, or that have no blobs
	 * table, are reported and skipped.
	 */
	public function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";

			return;
		}

		$dbw = wfGetDB( DB_MASTER );

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = wfGetLBFactory()->getExternalLB( $cluster );
			try {
				$extDB = $lb->getConnection( DB_REPLICA );
			} catch ( DBConnectionError $e ) {
				if ( strpos( $e->error, 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			// Fall back to the name 'blobs' when no 'blobs table' LB info is set.
			$table = $extDB->getLBInfo( 'blobs table' );
			if ( $table === null ) {
				$table = 'blobs';
			}
			if ( !$extDB->tableExists( $table ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			// Only used for the progress display.
			$endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->select( $table,
					[ 'blob_id' ],
					[ 'blob_id > ' . $extDB->addQuotes( $startId ) ],
					__METHOD__,
					[ 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' ]
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					gmp_setbit( $actualBlobs, $row->blob_id );
					// Rows are ordered by blob_id, so the last one seen is the
					// starting point of the next batch.
					$startId = $row->blob_id;
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = [];
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// Index of the next set bit at or after $id, or -1 when none is left.
				$id = gmp_scan1( $orphans, $id );
				if ( $id == -1 ) {
					break;
				}
				$insertBatch[] = [
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				];
				// Flush as soon as the batch is full, so no insert exceeds batchSize rows.
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = [];
				}

				++$id;
				++$numOrphans;
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}
wfGetDB($db, $groups=[], $wiki=false)
Get a Database object.
wfWaitForSlaves($ifWritesSince=null, $wiki=false, $cluster=false, $timeout=null)
Waits for the replica DBs to catch up to the master position.
div flags Integer display flags(NO_ACTION_LINK, NO_EXTRA_USER_LINKS) 'LogException'returning false will NOT prevent logging $e
Definition: hooks.txt:2102
checkIntegrity()
Definition: trackBlobs.php:68
it s the revision text itself In either if gzip is the revision text is gzipped $flags
Definition: hooks.txt:2703
trackRevisions()
Scan the revision table for rows stored in the specified clusters.
Definition: trackBlobs.php:148
const DB_MASTER
Definition: defines.php:23
if($line===false) $args
Definition: cdb.php:64
trackOrphanText()
Scan the text table for orphan text Orphan text here does not imply DB corruption – deleted text tra...
Definition: trackBlobs.php:221
$res
Definition: database.txt:21
interpretPointer($text)
Definition: trackBlobs.php:133
__construct($clusters)
Definition: trackBlobs.php:46
if(count($args)< 1) $tracker
Definition: trackBlobs.php:34
findOrphanBlobs()
Scan the blobs table for rows not registered in blob_tracking (and thus not registered in the text ta...
Definition: trackBlobs.php:307
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
wfGetLBFactory()
Get the load balancer factory object.
initTrackingTable()
Definition: trackBlobs.php:109
const DB_REPLICA
Definition: defines.php:22