MediaWiki  master
trackBlobs.php
Go to the documentation of this file.
1 <?php
27 
require __DIR__ . '/../CommandLineInc.php';

// At least one external storage cluster name must be given on the command line.
if ( count( $args ) < 1 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n"
		. "Adds blobs from a given ES cluster to the blob_tracking table\n"
		. "Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}

// Every positional argument is treated as a cluster name.
$tracker = new TrackBlobs( $args );
$tracker->run();

echo "All done.\n";
40 
/**
 * Populates the blob_tracking table (and, when the gmp extension is
 * available, the blob_orphans table) for a set of external storage clusters.
 *
 * The work is done in passes, see run():
 *  - revisions whose main slot text is a DB:// pointer into one of the clusters,
 *  - text rows with such a pointer that no tracked revision refers to,
 *  - blob rows on the clusters that none of the previous passes saw.
 */
class TrackBlobs {
	/** @var string[] Names of the external storage clusters to scan */
	public $clusters = [];

	/** @var string|null Cached SQL condition built by getTextClause() */
	public $textClause;

	/** @var bool Whether the orphan blob pass runs; set when the gmp extension is loaded */
	public $doBlobOrphans = false;

	/** @var array Map of cluster name to a GMP bitfield with one bit per tracked blob ID */
	public $trackedBlobs = [];

	/** @var int Maximum number of rows fetched per query */
	public $batchSize = 1000;

	/** @var int Number of batches between two progress reports */
	public $reportingInterval = 10;

	/**
	 * @param string[] $clusters Names of the external storage clusters to scan
	 */
	public function __construct( $clusters ) {
		$this->clusters = $clusters;
		if ( extension_loaded( 'gmp' ) ) {
			$this->doBlobOrphans = true;
			foreach ( $clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}
	}

	/**
	 * Run the integrity check, recreate the tracking tables and execute all
	 * tracking passes in order.
	 */
	public function run() {
		$this->checkIntegrity();
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Abort the script (exit status 1) when the text table still contains
	 * serialized HistoryBlobStub objects.
	 */
	private function checkIntegrity() {
		echo "Doing integrity check...\n";
		$dbr = wfGetDB( DB_REPLICA );

		// Scan for HistoryBlobStub objects in the text table (T22757)

		$exists = (bool)$dbr->selectField( 'text', '1',
			'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
			'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
			__METHOD__
		);

		if ( $exists ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
				"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
				"to fix this.\n";
			exit( 1 );
		}

		echo "Integrity check OK\n";
	}

	/**
	 * Drop any existing tracking tables and recreate them from blob_tracking.sql.
	 */
	private function initTrackingTable() {
		$dbw = wfGetDB( DB_PRIMARY );
		// Check each table on its own: after an interrupted run only one of the
		// two may exist, and an unconditional DROP of the missing one would fail.
		// NOTE(review): assumes blob_tracking.sql creates both tables - confirm.
		foreach ( [ 'blob_tracking', 'blob_orphans' ] as $table ) {
			if ( $dbw->tableExists( $table, __METHOD__ ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
			}
		}
		$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
	}

	/**
	 * Get the SQL condition that matches old_text values starting with a
	 * DB:// pointer into any of the configured clusters. Built once, then cached.
	 *
	 * @return string
	 */
	private function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_REPLICA );
			$alternatives = [];
			foreach ( $this->clusters as $cluster ) {
				$alternatives[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
			}
			$this->textClause = implode( ' OR ', $alternatives );
		}

		return $this->textClause;
	}

	/**
	 * Split a DB://<cluster>/<id>[/<hex hash>] pointer into its parts.
	 *
	 * @param string $text
	 * @return array|false Array with keys 'cluster', 'id' (int) and 'hash'
	 *  (string, or null when the pointer has no hash part); false when $text
	 *  is not a well-formed pointer
	 */
	private function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}

		return [
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			'hash' => $m[3] ?? null
		];
	}

	/**
	 * Turn one selected row into a blob_tracking row and mark its blob as
	 * tracked. Prints a diagnostic and returns null for rows that cannot be used.
	 *
	 * @param stdClass $row Result row providing at least old_id and old_text
	 * @param int|string $pageId Value for bt_page
	 * @param int|string $revId Value for bt_rev_id
	 * @param string $origin Description of the row used in diagnostics, e.g. "rev_id 12"
	 * @return array|null
	 */
	private function makeTrackingRow( $row, $pageId, $revId, $origin ) {
		$info = $this->interpretPointer( $row->old_text );
		if ( !$info ) {
			echo "Invalid DB:// URL in $origin\n";
			return null;
		}
		if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
			echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
			return null;
		}
		if ( $this->doBlobOrphans ) {
			gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
		}

		return [
			'bt_page' => $pageId,
			'bt_rev_id' => $revId,
			'bt_text_id' => $row->old_id,
			'bt_cluster' => $info['cluster'],
			'bt_blob_id' => $info['id'],
			'bt_cgz_hash' => $info['hash']
		];
	}

	/**
	 * Scan the revision table in rev_id order and add a blob_tracking row for
	 * every revision whose main slot points at a text row holding a pointer
	 * into one of the configured clusters.
	 */
	private function trackRevisions() {
		$dbw = wfGetDB( DB_PRIMARY );
		$dbr = wfGetDB( DB_REPLICA );

		$startId = 0;
		// Only used for the progress output
		$endId = (int)$dbr->selectField( 'revision', 'MAX(rev_id)', '', __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		$slotRoleStore = MediaWikiServices::getInstance()->getSlotRoleStore();
		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();

		$tables = [ 'revision', 'slots', 'content', 'text' ];
		$fields = [ 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ];
		$options = [
			'ORDER BY' => 'rev_id',
			'LIMIT' => $this->batchSize
		];
		$conds = [
			// Join revision -> main slot -> content -> text ("tt:<old_id>" addresses)
			'rev_id=slot_revision_id',
			'slot_role_id=' . $slotRoleStore->getId( SlotRecord::MAIN ),
			'content_id=slot_content_id',
			'SUBSTRING(content_address, 1, 3)=' . $dbr->addQuotes( 'tt:' ),
			'SUBSTRING(content_address, 4)=old_id',
			// Restrict to external text stored on the requested clusters
			$this->getTextClause(),
			'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
		];

		while ( true ) {
			$res = $dbr->select( $tables,
				$fields,
				array_merge( [
					'rev_id > ' . $dbr->addQuotes( $startId ),
				], $conds ),
				__METHOD__,
				$options
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = [];
			foreach ( $res as $row ) {
				// Advance the cursor even for rejected rows so the scan always progresses
				$startId = (int)$row->rev_id;
				$trackingRow = $this->makeTrackingRow(
					$row, $row->rev_page, $row->rev_id, "rev_id {$row->rev_id}"
				);
				if ( $trackingRow !== null ) {
					$insertBatch[] = $trackingRow;
				}
			}
			// Every row of a batch may have been rejected; nothing to write then
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				$lbFactory->waitForReplication();
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table in old_id order for rows that point into one of the
	 * configured clusters but have no blob_tracking row yet, and add them with
	 * bt_page and bt_rev_id set to 0.
	 */
	private function trackOrphanText() {
		# Wait until the blob_tracking table is available in the replica DB
		$dbw = wfGetDB( DB_PRIMARY );
		$dbr = wfGetDB( DB_REPLICA );
		$pos = $dbw->getPrimaryPos();
		// NOTE(review): getPrimaryPos() is assumed to return false when there is
		// no replication position to wait for - confirm against the Rdbms API.
		if ( $pos ) {
			$dbr->primaryPosWait( $pos, 100000 );
		}

		$textClause = $this->getTextClause();
		$startId = 0;
		// Only used for the progress output
		$endId = (int)$dbr->selectField( 'text', 'MAX(old_id)', '', __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;
		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			$res = $dbr->select( [ 'text', 'blob_tracking' ],
				[ 'old_id', 'old_flags', 'old_text' ],
				[
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
					// Keep only text rows without a matching blob_tracking row
					'bt_text_id IS NULL'
				],
				__METHOD__,
				[
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				],
				[ 'blob_tracking' => [ 'LEFT JOIN', 'bt_text_id=old_id' ] ]
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = [];
			foreach ( $res as $row ) {
				// Advance the cursor even for rejected rows so the scan always progresses
				$startId = (int)$row->old_id;
				$trackingRow = $this->makeTrackingRow( $row, 0, 0, "old_id {$row->old_id}" );
				if ( $trackingRow !== null ) {
					$insertBatch[] = $trackingRow;
				}
			}
			// Every row of a batch may have been rejected; nothing to write then
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				$lbFactory->waitForReplication();
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * For each cluster, build a bitfield of the blob IDs that actually exist
	 * in its blobs table and write every ID that was not marked as tracked by
	 * the earlier passes to the blob_orphans table.
	 */
	private function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";

			return;
		}

		$dbw = wfGetDB( DB_PRIMARY );
		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = $lbFactory->getExternalLB( $cluster );
			try {
				$extDB = $lb->getMaintenanceConnectionRef( DB_REPLICA );
			} catch ( DBConnectionError $e ) {
				if ( strpos( $e->getMessage(), 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			$table = $extDB->getLBInfo( 'blobs table' ) ?? 'blobs';
			if ( !$extDB->tableExists( $table, __METHOD__ ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			// Only used for the progress output
			$endId = (int)$extDB->selectField( $table, 'MAX(blob_id)', '', __METHOD__ );

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->select( $table,
					[ 'blob_id' ],
					[ 'blob_id > ' . $extDB->addQuotes( $startId ) ],
					__METHOD__,
					[ 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' ]
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					// gmp_setbit() needs an integer bit index; cast the column value once
					$startId = (int)$row->blob_id;
					gmp_setbit( $actualBlobs, $startId );
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = [];
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// Index of the next set bit at or after $id, -1 when there is none
				$id = gmp_scan1( $orphans, $id );
				if ( $id === -1 ) {
					break;
				}
				$insertBatch[] = [
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				];
				// Flush as soon as the batch holds batchSize rows
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = [];
				}

				++$id;
				++$numOrphans;
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
Service locator for MediaWiki core services.
Value object representing a content slot associated with a page revision.
Definition: SlotRecord.php:40
__construct( $clusters)
Definition: trackBlobs.php:49
const DB_REPLICA
Definition: defines.php:26
const DB_PRIMARY
Definition: defines.php:28
if(count( $args)< 1) $tracker
Definition: trackBlobs.php:37