MediaWiki REL1_32
trackBlobs.php
Go to the documentation of this file.
1<?php
27
require __DIR__ . '/../commandLine.inc';

// At least one external storage cluster name must be given on the command line.
if ( count( $args ) < 1 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n";
	echo "Adds blobs from a given ES cluster to the blob_tracking table\n";
	echo "Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}

// The tracker has to be created before run() can be called on it; every
// command-line argument is passed through as a cluster name.
// NOTE(review): the class name TrackBlobs is assumed from the script name - confirm
// against the class declaration that owns the methods below.
$tracker = new TrackBlobs( $args );
$tracker->run();
echo "All done.\n";
40
/**
 * Records which text rows point at which external storage blobs.
 *
 * The class body continues with the methods that follow and is terminated by
 * the final closing brace of the file.
 * NOTE(review): the class name is assumed from the script name - confirm.
 */
class TrackBlobs {
	/** Cluster names to scan; assigned by the constructor. */
	public $clusters;

	/** SQL condition cached by getTextClause(); empty until first use. */
	public $textClause;

	/** Whether run() performs the orphan blob pass; the constructor enables it when gmp is loaded. */
	public $doBlobOrphans = false;

	/** GMP bitfields keyed by cluster name; a set bit marks a blob ID referenced by a text row. */
	public $trackedBlobs = [];

	/** Maximum number of rows requested per query. */
	public $batchSize = 1000;

	/** Number of batches processed between two progress lines. */
	public $reportingInterval = 10;
48
/**
 * Remember the clusters to scan and prepare one empty bitfield per cluster.
 *
 * Without the gmp extension only a warning is printed and the orphan blob
 * pass is left disabled.
 *
 * @param string[] $clusters Names of the external storage clusters
 */
function __construct( $clusters ) {
	$this->clusters = $clusters;

	if ( !extension_loaded( 'gmp' ) ) {
		echo "Warning: the gmp extension is needed to find orphan blobs\n";

		return;
	}

	$this->doBlobOrphans = true;
	// Each cluster needs its own GMP object because the bitfields are mutated in place.
	foreach ( $clusters as $clusterName ) {
		$this->trackedBlobs[$clusterName] = gmp_init( 0 );
	}
}
60
/**
 * Execute every pass in order: integrity check, tracking table reset,
 * revision scan, orphan text scan and, when enabled, the orphan blob scan.
 */
function run() {
	$this->checkIntegrity();
	$this->initTrackingTable();
	$this->trackRevisions();
	$this->trackOrphanText();

	if ( !$this->doBlobOrphans ) {
		return;
	}

	$this->findOrphanBlobs();
}
70
/**
 * Refuse to continue when the text table still contains HistoryBlobStub
 * objects stored inline.
 *
 * Prints a diagnostic and terminates the process with exit status 1 when
 * such a row is found; otherwise prints a confirmation and returns.
 */
function checkIntegrity() {
	echo "Doing integrity check...\n";
	// The query below needs a database handle; this is a read-only check, so
	// a replica connection is used.
	$dbr = wfGetDB( DB_REPLICA );

	// Scan for HistoryBlobStub objects in the text table (T22757)

	$exists = $dbr->selectField( 'text', 1,
		'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
		'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
		__METHOD__
	);

	if ( $exists ) {
		echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
			"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
			"to fix this.\n";
		exit( 1 );
	}

	echo "Integrity check OK\n";
}
92
/**
 * Drop any tracking tables left over from an earlier run and recreate them
 * from blob_tracking.sql.
 *
 * Each table is checked on its own so that a partially initialised schema
 * (only one of the two tables present) neither makes DROP TABLE fail nor
 * leaves a stale table behind.
 * NOTE(review): presumably blob_tracking.sql creates both blob_tracking and
 * blob_orphans - confirm against the SQL file.
 */
function initTrackingTable() {
	$dbw = wfGetDB( DB_MASTER );

	foreach ( [ 'blob_tracking', 'blob_orphans' ] as $table ) {
		if ( $dbw->tableExists( $table ) ) {
			$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ) );
		}
	}

	$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
}
101
/**
 * Build, once, the SQL condition matching text rows whose old_text starts
 * with "DB://<cluster>/" for any of the configured clusters.
 *
 * The result is cached in $this->textClause and reused on later calls.
 *
 * @return string LIKE conditions joined with OR; empty when no clusters are configured
 */
function getTextClause() {
	if ( !$this->textClause ) {
		// buildLike() needs a connection to escape the pattern correctly.
		$dbr = wfGetDB( DB_REPLICA );

		$conds = [];
		foreach ( $this->clusters as $cluster ) {
			$conds[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
		}
		$this->textClause = implode( ' OR ', $conds );
	}

	return $this->textClause;
}
116
/**
 * Split a "DB://cluster/id" or "DB://cluster/id/hash" pointer into its parts.
 *
 * @param string $text Pointer text as stored in old_text
 * @return array|bool Array with keys 'cluster' (string), 'id' (int) and
 *   'hash' (string, or null when the pointer has no hash part); false when
 *   the text does not have the expected form
 */
function interpretPointer( $text ) {
	$parts = [];
	$matched = preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $parts );
	if ( !$matched ) {
		return false;
	}

	// The third group is absent from the match array when no hash was given.
	$hash = isset( $parts[3] ) ? $parts[3] : null;

	return [
		'cluster' => $parts[1],
		'id' => (int)$parts[2],
		'hash' => $hash
	];
}
128
/**
 * Scan the revision table for rows stored in the specified clusters and
 * record each one in blob_tracking.
 *
 * Rows are read from a replica in batches of $this->batchSize ordered by
 * rev_id and written to the master. When the orphan blob pass is enabled the
 * referenced blob IDs are also marked in the per-cluster bitfields.
 */
function trackRevisions() {
	$dbw = wfGetDB( DB_MASTER );
	$dbr = wfGetDB( DB_REPLICA );

	$textClause = $this->getTextClause();
	$startId = 0;
	$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', '', __METHOD__ );
	$batchesDone = 0;
	$rowsInserted = 0;

	echo "Finding revisions...\n";

	while ( true ) {
		$res = $dbr->select( [ 'revision', 'text' ],
			[ 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ],
			[
				'rev_id > ' . $dbr->addQuotes( $startId ),
				'rev_text_id=old_id',
				// Only pointers into the requested clusters are of interest.
				$textClause,
				'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
			],
			__METHOD__,
			[
				'ORDER BY' => 'rev_id',
				'LIMIT' => $this->batchSize
			]
		);
		if ( !$res->numRows() ) {
			break;
		}

		$insertBatch = [];
		foreach ( $res as $row ) {
			// Advance the cursor even for rows that end up being skipped.
			$startId = $row->rev_id;
			$info = $this->interpretPointer( $row->old_text );
			if ( !$info ) {
				echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
				continue;
			}
			if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
				echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
				continue;
			}
			$insertBatch[] = [
				'bt_page' => $row->rev_page,
				'bt_rev_id' => $row->rev_id,
				'bt_text_id' => $row->old_id,
				'bt_cluster' => $info['cluster'],
				'bt_blob_id' => $info['id'],
				'bt_cgz_hash' => $info['hash']
			];
			if ( $this->doBlobOrphans ) {
				gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
			}
		}
		// A batch may consist solely of skipped rows.
		if ( $insertBatch ) {
			$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
		}
		$rowsInserted += count( $insertBatch );

		++$batchesDone;
		if ( $batchesDone >= $this->reportingInterval ) {
			$batchesDone = 0;
			echo "$startId / $endId\n";
			// Let the replica DBs catch up with the writes made so far.
			wfWaitForSlaves();
		}
	}
	echo "Found $rowsInserted revisions\n";
}
199
/**
 * Scan the text table for external rows in the specified clusters that have
 * no blob_tracking entry yet, and record them with page and revision ID 0.
 *
 * Rows are read from a replica in batches of $this->batchSize ordered by
 * old_id and written to the master. When the orphan blob pass is enabled the
 * referenced blob IDs are also marked in the per-cluster bitfields.
 */
function trackOrphanText() {
	# Wait until the blob_tracking table is available in the replica DB
	$dbw = wfGetDB( DB_MASTER );
	$dbr = wfGetDB( DB_REPLICA );
	$pos = $dbw->getMasterPos();
	$dbr->masterPosWait( $pos, 100000 );

	$textClause = $this->getTextClause();
	$startId = 0;
	$endId = $dbr->selectField( 'text', 'MAX(old_id)', '', __METHOD__ );
	$rowsInserted = 0;
	$batchesDone = 0;

	echo "Finding orphan text...\n";

	# Scan the text table for orphan text
	while ( true ) {
		$res = $dbr->select( [ 'text', 'blob_tracking' ],
			[ 'old_id', 'old_flags', 'old_text' ],
			[
				'old_id>' . $dbr->addQuotes( $startId ),
				// Only pointers into the requested clusters are of interest.
				$textClause,
				'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				// The LEFT JOIN yields NULL here for text rows not tracked so far.
				'bt_text_id IS NULL'
			],
			__METHOD__,
			[
				'ORDER BY' => 'old_id',
				'LIMIT' => $this->batchSize
			],
			[ 'blob_tracking' => [ 'LEFT JOIN', 'bt_text_id=old_id' ] ]
		);

		if ( !$res->numRows() ) {
			break;
		}

		$insertBatch = [];
		foreach ( $res as $row ) {
			// Advance the cursor even for rows that end up being skipped.
			$startId = $row->old_id;
			$info = $this->interpretPointer( $row->old_text );
			if ( !$info ) {
				echo "Invalid DB:// URL in old_id {$row->old_id}\n";
				continue;
			}
			if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
				echo "Invalid cluster returned in SQL query\n";
				continue;
			}

			$insertBatch[] = [
				'bt_page' => 0,
				'bt_rev_id' => 0,
				'bt_text_id' => $row->old_id,
				'bt_cluster' => $info['cluster'],
				'bt_blob_id' => $info['id'],
				'bt_cgz_hash' => $info['hash']
			];
			if ( $this->doBlobOrphans ) {
				gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
			}
		}
		// A batch may consist solely of skipped rows.
		if ( $insertBatch ) {
			$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
		}

		$rowsInserted += count( $insertBatch );
		++$batchesDone;
		if ( $batchesDone >= $this->reportingInterval ) {
			$batchesDone = 0;
			echo "$startId / $endId\n";
			// Let the replica DBs catch up with the writes made so far.
			wfWaitForSlaves();
		}
	}
	echo "Found $rowsInserted orphan text rows\n";
}
283
/**
 * Scan the blobs table of every cluster for rows not registered in
 * blob_tracking and record them in blob_orphans.
 *
 * For each cluster a bitfield of existing blob IDs is built and the IDs
 * marked during the earlier passes are subtracted from it. Clusters that
 * cannot be connected to, or that have no blobs table, are reported and
 * skipped. Does nothing except print a message when gmp is not loaded.
 */
function findOrphanBlobs() {
	if ( !extension_loaded( 'gmp' ) ) {
		echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";

		return;
	}

	$dbw = wfGetDB( DB_MASTER );
	// The factory does not depend on the cluster, so look it up once.
	$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();

	foreach ( $this->clusters as $cluster ) {
		echo "Searching for orphan blobs in $cluster...\n";
		$lb = $lbFactory->getExternalLB( $cluster );
		try {
			$extDB = $lb->getConnection( DB_REPLICA );
		} catch ( DBConnectionError $e ) {
			// Inspect the exception message instead of a public "error"
			// property, which the exception is not guaranteed to expose.
			if ( strpos( $e->getMessage(), 'Unknown database' ) !== false ) {
				echo "No database on $cluster\n";
			} else {
				echo "Error on $cluster: " . $e->getMessage() . "\n";
			}
			continue;
		}
		// Fall back to the default table name when the LB info does not name one.
		$table = $extDB->getLBInfo( 'blobs table' );
		if ( $table === null ) {
			$table = 'blobs';
		}
		if ( !$extDB->tableExists( $table ) ) {
			echo "No blobs table on cluster $cluster\n";
			continue;
		}
		$startId = 0;
		$batchesDone = 0;
		$actualBlobs = gmp_init( 0 );
		$endId = $extDB->selectField( $table, 'MAX(blob_id)', '', __METHOD__ );

		// Build a bitmap of actual blob rows
		while ( true ) {
			$res = $extDB->select( $table,
				[ 'blob_id' ],
				[ 'blob_id > ' . $extDB->addQuotes( $startId ) ],
				__METHOD__,
				[ 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' ]
			);

			if ( !$res->numRows() ) {
				break;
			}

			foreach ( $res as $row ) {
				gmp_setbit( $actualBlobs, (int)$row->blob_id );
				// Rows arrive ordered by blob_id, so the last one seen is the cursor.
				$startId = $row->blob_id;
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
			}
		}

		// Find actual blobs that weren't tracked by the previous passes
		// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
		$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

		// Traverse the orphan list
		$insertBatch = [];
		$id = 0;
		$numOrphans = 0;
		while ( true ) {
			// gmp_scan1() returns the index of the next set bit at or after $id, or -1.
			$id = gmp_scan1( $orphans, $id );
			if ( $id === -1 ) {
				break;
			}
			$insertBatch[] = [
				'bo_cluster' => $cluster,
				'bo_blob_id' => $id
			];
			// Flush as soon as a full batch has accumulated.
			if ( count( $insertBatch ) >= $this->batchSize ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
				$insertBatch = [];
			}

			++$id;
			++$numOrphans;
		}
		if ( $insertBatch ) {
			$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
		}
		echo "Found $numOrphans orphan(s) in $cluster\n";
	}
}
383}
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for use
wfWaitForSlaves( $ifWritesSince=null, $wiki=false, $cluster=false, $timeout=null)
Waits for the replica DBs to catch up to the master position.
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
if( $line===false) $args
Definition cdb.php:64
MediaWikiServices is the service locator for the application scope of MediaWiki.
interpretPointer( $text)
findOrphanBlobs()
Scan the blobs table for rows not registered in blob_tracking (and thus not registered in the text ta...
initTrackingTable()
trackOrphanText()
Scan the text table for orphan text Orphan text here does not imply DB corruption – deleted text trac...
trackRevisions()
Scan the revision table for rows stored in the specified clusters.
__construct( $clusters)
$res
Definition database.txt:21
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
returning false will NOT prevent logging $e
Definition hooks.txt:2226
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition injection.txt:37
const DB_REPLICA
Definition defines.php:25
const DB_MASTER
Definition defines.php:26
if(count( $args)< 1) $tracker