MediaWiki REL1_30
trackBlobs.php
Go to the documentation of this file.
1<?php
26
27require __DIR__ . '/../commandLine.inc';
28
// At least one cluster name is required on the command line.
if ( count( $args ) < 1 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n";
	echo "Adds blobs from a given ES cluster to the blob_tracking table\n";
	echo "Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}

// Each command-line argument is handed to the tracker as one cluster name.
// NOTE(review): TrackBlobs is expected to be the class whose methods follow in
// this file -- confirm against the class declaration.
$tracker = new TrackBlobs( $args );
$tracker->run();
echo "All done.\n";
39
// Map of cluster name => GMP bitfield; the constructor seeds each entry with
// gmp_init( 0 ) and the tracking passes set one bit per blob ID they record.
43 public $trackedBlobs = [];
44
// Row limit used for each SELECT batch in the scan loops.
45 public $batchSize = 1000;
// Number of batches processed between "current / max" progress lines.
46 public $reportingInterval = 10;
47
/**
 * Remember the clusters to scan and, if GMP is available, set up one empty
 * bitfield per cluster so that orphan blobs can be detected later.
 *
 * @param array $clusters Names of the clusters to scan
 */
function __construct( $clusters ) {
	$this->clusters = $clusters;

	if ( !extension_loaded( 'gmp' ) ) {
		echo "Warning: the gmp extension is needed to find orphan blobs\n";

		return;
	}

	$this->doBlobOrphans = true;
	foreach ( $clusters as $name ) {
		// Bits are set in this field as blob IDs are recorded by the tracking passes.
		$this->trackedBlobs[$name] = gmp_init( 0 );
	}
}
59
/**
 * Run every pass in order: integrity check, tracking table (re)creation,
 * revision scan, orphan text scan and, when enabled by the constructor,
 * the orphan blob scan.
 */
function run() {
	$this->checkIntegrity();
	$this->initTrackingTable();
	$this->trackRevisions();
	$this->trackOrphanText();

	if ( !$this->doBlobOrphans ) {
		return;
	}

	$this->findOrphanBlobs();
}
69
/**
 * Refuse to continue if the text or archive tables contain data this script
 * could damage. Prints a diagnostic and exits with status 1 on failure.
 */
function checkIntegrity() {
	echo "Doing integrity check...\n";
	// Read-only checks, so a replica connection is sufficient.
	$dbr = wfGetDB( DB_REPLICA );

	// Scan for HistoryBlobStub objects in the text table (T22757)

	$exists = $dbr->selectField( 'text', 1,
		'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
		'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
		__METHOD__
	);

	if ( $exists ) {
		echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
			"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
			"to fix this.\n";
		exit( 1 );
	}

	// Scan the archive table for HistoryBlobStub objects or external flags (T24624)
	$flags = $dbr->selectField( 'archive', 'ar_flags',
		'ar_flags LIKE \'%external%\' OR (' .
		'ar_flags LIKE \'%object%\' ' .
		'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
		__METHOD__
	);

	// A returned row matched one of the two conditions above; the flags value
	// tells us which one, and therefore which repair script to recommend.
	if ( strpos( $flags, 'external' ) !== false ) {
		echo "Integrity check failed: found external storage pointers in your archive table.\n" .
			"Run normaliseArchiveTable.php to fix this.\n";
		exit( 1 );
	} elseif ( $flags ) {
		echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
			"These objects are probably already broken, continuing would make them\n" .
			"unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
		exit( 1 );
	}

	echo "Integrity check OK\n";
}
110
/**
 * Drop any existing blob_tracking / blob_orphans tables and recreate them
 * from blob_tracking.sql, so that every run starts from scratch.
 */
function initTrackingTable() {
	$dbw = wfGetDB( DB_MASTER );

	// Check each table on its own: a previous run may have left only one of
	// them behind, and an unconditional DROP of a missing table would fail.
	foreach ( [ 'blob_tracking', 'blob_orphans' ] as $table ) {
		if ( $dbw->tableExists( $table ) ) {
			$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ) );
		}
	}

	$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
}
119
/**
 * Build (once) and return the SQL condition matching text rows whose old_text
 * starts with "DB://<cluster>/" for any of the configured clusters.
 *
 * @return string Per-cluster LIKE conditions joined with " OR "
 */
function getTextClause() {
	if ( !$this->textClause ) {
		// Only used for escaping/LIKE construction, so a replica handle is fine.
		$dbr = wfGetDB( DB_REPLICA );

		$conds = [];
		foreach ( $this->clusters as $cluster ) {
			$conds[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
		}
		$this->textClause = implode( ' OR ', $conds );
	}

	return $this->textClause;
}
134
/**
 * Split a "DB://cluster/id" or "DB://cluster/id/hash" pointer into its parts.
 *
 * @param string $text Pointer text to parse
 * @return array|bool False if $text is not a well-formed pointer, otherwise an
 *   array with keys 'cluster' (string), 'id' (int) and 'hash' (hex string, or
 *   null when the pointer has no hash component)
 */
function interpretPointer( $text ) {
	$parts = [];
	$matched = preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $parts );
	if ( !$matched ) {
		return false;
	}

	// The third group is absent from the match array when there is no hash part.
	$hash = null;
	if ( isset( $parts[3] ) ) {
		$hash = $parts[3];
	}

	return [
		'cluster' => $parts[1],
		'id' => intval( $parts[2] ),
		'hash' => $hash
	];
}
146
/**
 * Scan the revision table for rows stored in the specified clusters and
 * record one blob_tracking row for each of them.
 */
function trackRevisions() {
	$dbw = wfGetDB( DB_MASTER );
	$dbr = wfGetDB( DB_REPLICA );

	$textClause = $this->getTextClause();
	$startId = 0;
	// Only used for the progress report.
	$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
	$batchesDone = 0;
	$rowsInserted = 0;

	echo "Finding revisions...\n";

	while ( true ) {
		$res = $dbr->select( [ 'revision', 'text' ],
			[ 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ],
			[
				'rev_id > ' . $dbr->addQuotes( $startId ),
				'rev_text_id=old_id',
				// Restrict the scan to pointers into the requested clusters
				$textClause,
				'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
			],
			__METHOD__,
			[
				'ORDER BY' => 'rev_id',
				'LIMIT' => $this->batchSize
			]
		);
		if ( !$res->numRows() ) {
			break;
		}

		$insertBatch = [];
		foreach ( $res as $row ) {
			// Advance the cursor even for rows that are skipped below
			$startId = $row->rev_id;
			$info = $this->interpretPointer( $row->old_text );
			if ( !$info ) {
				echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
				continue;
			}
			if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
				echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
				continue;
			}
			$insertBatch[] = [
				'bt_page' => $row->rev_page,
				'bt_rev_id' => $row->rev_id,
				'bt_text_id' => $row->old_id,
				'bt_cluster' => $info['cluster'],
				'bt_blob_id' => $info['id'],
				'bt_cgz_hash' => $info['hash']
			];
			if ( $this->doBlobOrphans ) {
				// Remember the blob as referenced so it is not reported as an orphan
				gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
			}
		}
		$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
		$rowsInserted += count( $insertBatch );

		++$batchesDone;
		if ( $batchesDone >= $this->reportingInterval ) {
			$batchesDone = 0;
			echo "$startId / $endId\n";
			// Let the replicas catch up with the inserts before continuing
			wfWaitForSlaves();
		}
	}
	echo "Found $rowsInserted revisions\n";
}
217
/**
 * Scan the text table for external-storage rows in the specified clusters
 * that have no blob_tracking row yet, and record them with bt_page and
 * bt_rev_id set to 0.
 */
function trackOrphanText() {
	# Wait until the blob_tracking table is available in the replica DB
	$dbw = wfGetDB( DB_MASTER );
	$dbr = wfGetDB( DB_REPLICA );
	$pos = $dbw->getMasterPos();
	$dbr->masterPosWait( $pos, 100000 );

	$textClause = $this->getTextClause();
	$startId = 0;
	// Only used for the progress report.
	$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
	$rowsInserted = 0;
	$batchesDone = 0;

	echo "Finding orphan text...\n";

	# Scan the text table for orphan text
	while ( true ) {
		$res = $dbr->select( [ 'text', 'blob_tracking' ],
			[ 'old_id', 'old_flags', 'old_text' ],
			[
				'old_id>' . $dbr->addQuotes( $startId ),
				// Restrict the scan to pointers into the requested clusters
				$textClause,
				'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				// The LEFT JOIN found no tracking row for this text row
				'bt_text_id IS NULL'
			],
			__METHOD__,
			[
				'ORDER BY' => 'old_id',
				'LIMIT' => $this->batchSize
			],
			[ 'blob_tracking' => [ 'LEFT JOIN', 'bt_text_id=old_id' ] ]
		);

		if ( !$res->numRows() ) {
			break;
		}

		$insertBatch = [];
		foreach ( $res as $row ) {
			// Advance the cursor even for rows that are skipped below
			$startId = $row->old_id;
			$info = $this->interpretPointer( $row->old_text );
			if ( !$info ) {
				echo "Invalid DB:// URL in old_id {$row->old_id}\n";
				continue;
			}
			if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
				echo "Invalid cluster returned in SQL query\n";
				continue;
			}

			$insertBatch[] = [
				'bt_page' => 0,
				'bt_rev_id' => 0,
				'bt_text_id' => $row->old_id,
				'bt_cluster' => $info['cluster'],
				'bt_blob_id' => $info['id'],
				'bt_cgz_hash' => $info['hash']
			];
			if ( $this->doBlobOrphans ) {
				// Remember the blob as referenced so it is not reported as an orphan
				gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
			}
		}
		$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );

		$rowsInserted += count( $insertBatch );
		++$batchesDone;
		if ( $batchesDone >= $this->reportingInterval ) {
			$batchesDone = 0;
			echo "$startId / $endId\n";
			// Let the replicas catch up with the inserts before continuing
			wfWaitForSlaves();
		}
	}
	echo "Found $rowsInserted orphan text rows\n";
}
301
/**
 * For each cluster, compare the blob IDs present in its blobs table with the
 * blob IDs recorded by the tracking passes, and insert every untracked ID
 * into blob_orphans. Requires the GMP extension for the bitfields.
 */
function findOrphanBlobs() {
	if ( !extension_loaded( 'gmp' ) ) {
		echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";

		return;
	}

	$dbw = wfGetDB( DB_MASTER );

	foreach ( $this->clusters as $cluster ) {
		echo "Searching for orphan blobs in $cluster...\n";
		$lb = wfGetLBFactory()->getExternalLB( $cluster );
		try {
			$extDB = $lb->getConnection( DB_REPLICA );
		} catch ( DBConnectionError $e ) {
			$message = strpos( $e->error, 'Unknown database' ) !== false
				? "No database on $cluster\n"
				: "Error on $cluster: " . $e->getMessage() . "\n";
			echo $message;
			continue;
		}

		// Fall back to the default table name when the connection defines none
		$table = $extDB->getLBInfo( 'blobs table' );
		if ( $table === null ) {
			$table = 'blobs';
		}
		if ( !$extDB->tableExists( $table ) ) {
			echo "No blobs table on cluster $cluster\n";
			continue;
		}

		$lastId = 0;
		$batchesDone = 0;
		$actualBlobs = gmp_init( 0 );
		// Only used for the progress report
		$maxId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

		// Build a bitmap of actual blob rows
		while ( true ) {
			$res = $extDB->select( $table,
				[ 'blob_id' ],
				[ 'blob_id > ' . $extDB->addQuotes( $lastId ) ],
				__METHOD__,
				[ 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' ]
			);

			if ( !$res->numRows() ) {
				break;
			}

			foreach ( $res as $blobRow ) {
				gmp_setbit( $actualBlobs, $blobRow->blob_id );
				// Ends up holding the last ID of the batch, where the next batch resumes
				$lastId = $blobRow->blob_id;
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$lastId / $maxId\n";
			}
		}

		// Find actual blobs that weren't tracked by the previous passes
		// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
		$untracked = gmp_com( $this->trackedBlobs[$cluster] );
		$orphans = gmp_and( $actualBlobs, $untracked );

		// Walk the set bits of the orphan bitmap; gmp_scan1() yields -1 when none remain
		$pending = [];
		$numOrphans = 0;
		for (
			$bit = gmp_scan1( $orphans, 0 );
			$bit != -1;
			$bit = gmp_scan1( $orphans, $bit + 1 )
		) {
			$pending[] = [
				'bo_cluster' => $cluster,
				'bo_blob_id' => $bit
			];
			if ( count( $pending ) > $this->batchSize ) {
				$dbw->insert( 'blob_orphans', $pending, __METHOD__ );
				$pending = [];
			}

			++$numOrphans;
		}
		// Flush whatever is left over from the last partial batch
		if ( $pending ) {
			$dbw->insert( 'blob_orphans', $pending, __METHOD__ );
		}
		echo "Found $numOrphans orphan(s) in $cluster\n";
	}
}
400}
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for use
wfWaitForSlaves( $ifWritesSince=null, $wiki=false, $cluster=false, $timeout=null)
Waits for the replica DBs to catch up to the master position.
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
wfGetLBFactory()
Get the load balancer factory object.
if( $line===false) $args
Definition cdb.php:63
interpretPointer( $text)
findOrphanBlobs()
Scan the blobs table for rows not registered in blob_tracking (and thus not registered in the text table).
trackOrphanText()
Scan the text table for orphan text. Orphan text here does not imply DB corruption – deleted text tracked by the archive table counts as orphan for our purposes.
trackRevisions()
Scan the revision table for rows stored in the specified clusters.
__construct( $clusters)
$res
Definition database.txt:21
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
it s the revision text itself In either if gzip is the revision text is gzipped $flags
Definition hooks.txt:2805
returning false will NOT prevent logging $e
Definition hooks.txt:2146
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition injection.txt:37
const DB_REPLICA
Definition defines.php:25
const DB_MASTER
Definition defines.php:26
if(count( $args)< 1) $tracker