MediaWiki  1.23.5
trackBlobs.php
Go to the documentation of this file.
<?php
/**
 * Command-line entry point: adds blobs from the given external storage (ES)
 * clusters to the blob_tracking table.
 *
 * Usage: php trackBlobs.php <cluster> [... <cluster>]
 *
 * $args is not defined in this file; it is presumably populated by
 * commandLine.inc with the positional command-line arguments.
 */

require __DIR__ . '/../commandLine.inc';

if ( count( $args ) < 1 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n";
	echo "Adds blobs from a given ES cluster to the blob_tracking table\n";
	echo "Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}

// Every positional argument is a cluster name. The tracker has to be
// instantiated here, otherwise the run() call below operates on an
// undefined variable.
$tracker = new TrackBlobs( $args );
$tracker->run();
echo "All done.\n";
37 
38 class TrackBlobs {
/** @var array Cluster names to scan, as handed to the constructor */
public $clusters;

/** @var string|null SQL condition cached by getTextClause(); empty until first use */
public $textClause;

/** @var bool Whether run() performs the orphan blob pass; the constructor enables it when gmp is loaded */
public $doBlobOrphans = false;

/** @var array GMP bitfields keyed by cluster name; bit N is set once blob ID N has been tracked */
public $trackedBlobs = array();

/** @var int Maximum number of rows fetched per SELECT batch */
public $batchSize = 1000;

/** @var int Number of batches between two progress reports */
public $reportingInterval = 10;
45 
/**
 * Remember the clusters to scan and, when the gmp extension is available,
 * prepare one empty bitfield per cluster for tracking blob IDs.
 *
 * @param array $clusters Names of the ES clusters to scan
 */
function __construct( $clusters ) {
	$this->clusters = $clusters;

	// Always assign the flag: run() reads it unconditionally, so leaving it
	// unset on the no-gmp path would mean reading an undefined property.
	$this->doBlobOrphans = extension_loaded( 'gmp' );

	if ( $this->doBlobOrphans ) {
		foreach ( $clusters as $cluster ) {
			// One separate bitfield per cluster; gmp_setbit() later modifies it in place.
			$this->trackedBlobs[$cluster] = gmp_init( 0 );
		}
	} else {
		echo "Warning: the gmp extension is needed to find orphan blobs\n";
	}
}
57 
/**
 * Execute all passes in order: integrity check, (re)creation of the
 * tracking tables, revision tracking, orphan text tracking and, if
 * enabled, the orphan blob search.
 */
function run() {
	$this->checkIntegrity();
	$this->initTrackingTable();
	$this->trackRevisions();
	$this->trackOrphanText();

	// The orphan blob pass is optional; the constructor enables it only
	// when the gmp extension is loaded.
	if ( !$this->doBlobOrphans ) {
		return;
	}
	$this->findOrphanBlobs();
}
67 
/**
 * Refuse to continue when the text or archive tables contain data this
 * script could damage. Prints a diagnostic and terminates the process
 * with exit status 1 on failure; prints a confirmation otherwise.
 */
function checkIntegrity() {
	echo "Doing integrity check...\n";
	$db = wfGetDB( DB_SLAVE );

	// Scan for HistoryBlobStub objects in the text table (bug 20757)
	$textConds = 'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
		'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'';
	$stubInText = $db->selectField( 'text', 1, $textConds, __METHOD__ );

	if ( $stubInText ) {
		echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n",
			"This script could destroy these objects if it continued. Run resolveStubs.php\n",
			"to fix this.\n";
		exit( 1 );
	}

	// Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
	$archiveConds = 'ar_flags LIKE \'%external%\' OR (' .
		'ar_flags LIKE \'%object%\' ' .
		'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )';
	$archiveFlags = $db->selectField( 'archive', 'ar_flags', $archiveConds, __METHOD__ );

	// The external-pointer case is tested first; any other non-empty flags
	// value returned by the query is reported as a HistoryBlobStub match.
	$hasExternal = strpos( $archiveFlags, 'external' ) !== false;
	if ( $hasExternal ) {
		echo "Integrity check failed: found external storage pointers in your archive table.\n",
			"Run normaliseArchiveTable.php to fix this.\n";
		exit( 1 );
	}
	if ( $archiveFlags ) {
		echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n",
			"These objects are probably already broken, continuing would make them\n",
			"unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
		exit( 1 );
	}

	echo "Integrity check OK\n";
}
108 
/**
 * Drop any tracking tables left over from a previous run, then source
 * blob_tracking.sql (presumably the schema for both tables; its contents
 * are not visible from this file).
 */
function initTrackingTable() {
	$dbw = wfGetDB( DB_MASTER );

	// Check each table on its own. Dropping blob_orphans merely because
	// blob_tracking exists fails when only one of the two is present, and
	// a lone blob_orphans table would otherwise never be removed.
	foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
		if ( $dbw->tableExists( $table ) ) {
			$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
		}
	}

	$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
}
117 
/**
 * Build (once) and return the SQL condition matching text rows whose
 * old_text starts with a "DB://<cluster>/" prefix for any configured
 * cluster. The result is cached in $this->textClause.
 *
 * @return string LIKE conditions joined with OR
 */
function getTextClause() {
	if ( $this->textClause ) {
		return $this->textClause;
	}

	$db = wfGetDB( DB_SLAVE );
	$likes = array();
	foreach ( $this->clusters as $name ) {
		$likes[] = 'old_text' . $db->buildLike( "DB://$name/", $db->anyString() );
	}
	$this->textClause = implode( ' OR ', $likes );

	return $this->textClause;
}
131 
/**
 * Split a "DB://<cluster>/<id>[/<hash>]" pointer into its components.
 *
 * @param string $text Pointer text to parse
 * @return array|bool Array with keys 'cluster' (string), 'id' (int) and
 *   'hash' (hex string, or null when the pointer has no hash part);
 *   false if $text does not match the pointer format
 */
function interpretPointer( $text ) {
	$matched = preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $parts );
	if ( !$matched ) {
		return false;
	}

	// The third capture group is absent from $parts when the pointer
	// carries no hash component.
	$hash = null;
	if ( isset( $parts[3] ) ) {
		$hash = $parts[3];
	}

	return array(
		'cluster' => $parts[1],
		'id' => intval( $parts[2] ),
		'hash' => $hash
	);
}
142 
/**
 * Scan the revision table for rows stored in the specified clusters and
 * record each of them in blob_tracking. When orphan blob detection is
 * enabled, the blob ID is also marked in the per-cluster bitfield.
 */
function trackRevisions() {
	$master = wfGetDB( DB_MASTER );
	$slave = wfGetDB( DB_SLAVE );

	$clusterCond = $this->getTextClause();
	// Loop-invariant condition: only rows flagged as external storage.
	$externalCond = 'old_flags ' .
		$slave->buildLike( $slave->anyString(), 'external', $slave->anyString() );
	$fields = array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' );
	$options = array(
		'ORDER BY' => 'rev_id',
		'LIMIT' => $this->batchSize
	);

	$startId = 0;
	$endId = $slave->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
	$sinceReport = 0;
	$total = 0;

	echo "Finding revisions...\n";

	for ( ; ; ) {
		$conds = array(
			'rev_id > ' . $slave->addQuotes( $startId ),
			'rev_text_id=old_id',
			$clusterCond,
			$externalCond,
		);
		$res = $slave->select( array( 'revision', 'text' ), $fields, $conds, __METHOD__, $options );
		if ( !$res->numRows() ) {
			break;
		}

		$rows = array();
		foreach ( $res as $row ) {
			// Advance the cursor even for rows that end up being skipped.
			$startId = $row->rev_id;
			$pointer = $this->interpretPointer( $row->old_text );

			if ( !$pointer ) {
				echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
			} elseif ( !in_array( $pointer['cluster'], $this->clusters ) ) {
				echo "Invalid cluster returned in SQL query: {$pointer['cluster']}\n";
			} else {
				$rows[] = array(
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $pointer['cluster'],
					'bt_blob_id' => $pointer['id'],
					'bt_cgz_hash' => $pointer['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$pointer['cluster']], $pointer['id'] );
				}
			}
		}
		$master->insert( 'blob_tracking', $rows, __METHOD__ );
		$total += count( $rows );

		// Report progress and wait for the slaves every reportingInterval batches.
		$sinceReport++;
		if ( $sinceReport >= $this->reportingInterval ) {
			$sinceReport = 0;
			echo "$startId / $endId\n";
			wfWaitForSlaves();
		}
	}

	echo "Found $total revisions\n";
}
213 
/**
 * Scan the text table for orphan text: rows that point into one of the
 * configured clusters but have no blob_tracking entry yet. Each such row
 * is inserted into blob_tracking with bt_page and bt_rev_id set to 0.
 * When orphan blob detection is enabled, the blob ID is also marked in
 * the per-cluster bitfield.
 */
function trackOrphanText() {
	# Wait until the blob_tracking table is available in the slave
	$dbw = wfGetDB( DB_MASTER );
	$dbr = wfGetDB( DB_SLAVE );
	$pos = $dbw->getMasterPos();
	// NOTE(review): getMasterPos() presumably returns false when the master
	// exposes no replication position; in that case there is nothing to
	// wait for, and passing a non-position on would fail. TODO confirm.
	if ( $pos ) {
		$dbr->masterPosWait( $pos, 100000 );
	}

	// getTextClause() takes no parameters; it reads $this->clusters itself.
	$textClause = $this->getTextClause();
	$startId = 0;
	$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
	$rowsInserted = 0;
	$batchesDone = 0;

	echo "Finding orphan text...\n";

	# Scan the text table for orphan text
	while ( true ) {
		$res = $dbr->select( array( 'text', 'blob_tracking' ),
			array( 'old_id', 'old_flags', 'old_text' ),
			array(
				'old_id>' . $dbr->addQuotes( $startId ),
				$textClause,
				'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				// Unmatched side of the LEFT JOIN: text rows not tracked so far.
				'bt_text_id IS NULL'
			),
			__METHOD__,
			array(
				'ORDER BY' => 'old_id',
				'LIMIT' => $this->batchSize
			),
			array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
		);

		if ( !$res->numRows() ) {
			break;
		}

		$insertBatch = array();
		foreach ( $res as $row ) {
			// Advance the cursor even for rows that end up being skipped.
			$startId = $row->old_id;
			$info = $this->interpretPointer( $row->old_text );
			if ( !$info ) {
				echo "Invalid DB:// URL in old_id {$row->old_id}\n";
				continue;
			}
			if ( !in_array( $info['cluster'], $this->clusters ) ) {
				echo "Invalid cluster returned in SQL query\n";
				continue;
			}

			$insertBatch[] = array(
				'bt_page' => 0,
				'bt_rev_id' => 0,
				'bt_text_id' => $row->old_id,
				'bt_cluster' => $info['cluster'],
				'bt_blob_id' => $info['id'],
				'bt_cgz_hash' => $info['hash']
			);
			if ( $this->doBlobOrphans ) {
				gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
			}
		}
		$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );

		$rowsInserted += count( $insertBatch );
		++$batchesDone;
		// Report progress and wait for the slaves every reportingInterval batches.
		if ( $batchesDone >= $this->reportingInterval ) {
			$batchesDone = 0;
			echo "$startId / $endId\n";
			wfWaitForSlaves();
		}
	}
	echo "Found $rowsInserted orphan text rows\n";
}
297 
/**
 * Scan the blobs table of every configured cluster for rows that were not
 * marked as tracked by the earlier passes, and record them in
 * blob_orphans. Requires the gmp extension for the bitfield arithmetic;
 * clusters that cannot be reached or have no blobs table are skipped with
 * a message.
 */
function findOrphanBlobs() {
	if ( !extension_loaded( 'gmp' ) ) {
		echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
		return;
	}

	$dbw = wfGetDB( DB_MASTER );

	foreach ( $this->clusters as $cluster ) {
		echo "Searching for orphan blobs in $cluster...\n";
		$lb = wfGetLBFactory()->getExternalLB( $cluster );
		try {
			$extDB = $lb->getConnection( DB_SLAVE );
		} catch ( DBConnectionError $e ) {
			if ( strpos( $e->error, 'Unknown database' ) !== false ) {
				echo "No database on $cluster\n";
			} else {
				echo "Error on $cluster: " . $e->getMessage() . "\n";
			}
			continue;
		}
		// Fall back to the table name 'blobs' when the connection carries
		// no 'blobs table' setting.
		$table = $extDB->getLBInfo( 'blobs table' );
		if ( is_null( $table ) ) {
			$table = 'blobs';
		}
		if ( !$extDB->tableExists( $table ) ) {
			echo "No blobs table on cluster $cluster\n";
			continue;
		}
		$startId = 0;
		$batchesDone = 0;
		$actualBlobs = gmp_init( 0 );
		$endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

		// Build a bitmap of actual blob rows
		while ( true ) {
			$res = $extDB->select( $table,
				array( 'blob_id' ),
				array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
				__METHOD__,
				array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
			);

			if ( !$res->numRows() ) {
				break;
			}

			foreach ( $res as $row ) {
				gmp_setbit( $actualBlobs, $row->blob_id );
				// Advance the paging cursor here rather than reading the
				// loop variable after the loop has finished.
				$startId = $row->blob_id;
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
			}
		}

		// Find actual blobs that weren't tracked by the previous passes
		// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
		$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

		// Traverse the orphan list
		$insertBatch = array();
		$id = 0;
		$numOrphans = 0;
		while ( true ) {
			// Next set bit at or after $id; -1 means there are none left.
			$id = gmp_scan1( $orphans, $id );
			if ( $id == -1 ) {
				break;
			}
			$insertBatch[] = array(
				'bo_cluster' => $cluster,
				'bo_blob_id' => $id
			);
			// Flush as soon as the batch holds batchSize rows; a strict
			// greater-than test would let it grow to batchSize + 1.
			if ( count( $insertBatch ) >= $this->batchSize ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
				$insertBatch = array();
			}

			++$id;
			++$numOrphans;
		}
		// Write whatever remains after the last full batch.
		if ( $insertBatch ) {
			$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
		}
		echo "Found $numOrphans orphan(s) in $cluster\n";
	}
}
395 }
TrackBlobs\checkIntegrity
checkIntegrity()
Definition: trackBlobs.php:68
TrackBlobs\$trackedBlobs
$trackedBlobs
Definition: trackBlobs.php:41
DB_MASTER
const DB_MASTER
Definition: Defines.php:56
TrackBlobs\__construct
__construct( $clusters)
Definition: trackBlobs.php:46
php
skin txt MediaWiki includes four core it has been set as the default in MediaWiki since the replacing Monobook it had been been the default skin since before being replaced by Vector largely rewritten in while keeping its appearance Several legacy skins were removed in the as the burden of supporting them became too heavy to bear Those in etc for skin dependent CSS etc for skin dependent JavaScript These can also be customised on a per user by etc This feature has led to a wide variety of user styles becoming that gallery is a good place to ending in php
Definition: skin.txt:62
wfGetDB
& wfGetDB( $db, $groups=array(), $wiki=false)
Get a Database object.
Definition: GlobalFunctions.php:3659
TrackBlobs\findOrphanBlobs
findOrphanBlobs()
Scan the blobs table for rows not registered in blob_tracking (and thus not registered in the text ta...
Definition: trackBlobs.php:305
$flags
it s the revision text itself In either if gzip is the revision text is gzipped $flags
Definition: hooks.txt:2113
$dbr
$dbr
Definition: testCompression.php:48
$lb
if( $wgAPIRequestLog) $lb
Definition: api.php:126
TrackBlobs\$clusters
$clusters
Definition: trackBlobs.php:39
TrackBlobs\interpretPointer
interpretPointer( $text)
Definition: trackBlobs.php:132
TrackBlobs\$batchSize
$batchSize
Definition: trackBlobs.php:43
TrackBlobs\trackOrphanText
trackOrphanText()
Scan the text table for orphan text Orphan text here does not imply DB corruption – deleted text trac...
Definition: trackBlobs.php:219
array
the array() calling protocol came about after MediaWiki 1.4rc1.
List of Api Query prop modules.
DBConnectionError
Definition: DatabaseError.php:98
wfWaitForSlaves
wfWaitForSlaves( $maxLag=false, $wiki=false, $cluster=false)
Modern version of wfWaitForSlaves().
Definition: GlobalFunctions.php:3804
TrackBlobs\trackRevisions
trackRevisions()
Scan the revision table for rows stored in the specified clusters.
Definition: trackBlobs.php:146
TrackBlobs\run
run()
Definition: trackBlobs.php:58
TrackBlobs\$textClause
$textClause
Definition: trackBlobs.php:39
TrackBlobs\getTextClause
getTextClause()
Definition: trackBlobs.php:118
$args
if( $line===false) $args
Definition: cdb.php:62
DB_SLAVE
const DB_SLAVE
Definition: Defines.php:55
wfGetLBFactory
& wfGetLBFactory()
Get the load balancer factory object.
Definition: GlobalFunctions.php:3678
TrackBlobs\$doBlobOrphans
$doBlobOrphans
Definition: trackBlobs.php:40
as
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
TrackBlobs
Definition: trackBlobs.php:38
TrackBlobs\$reportingInterval
$reportingInterval
Definition: trackBlobs.php:44
$e
if( $useReadline) $e
Definition: eval.php:66
$tracker
if(count( $args)< 1) $tracker
Definition: trackBlobs.php:34
$res
$res
Definition: database.txt:21
TrackBlobs\initTrackingTable
initTrackingTable()
Definition: trackBlobs.php:109