MediaWiki REL1_28
trackBlobs.php
Go to the documentation of this file.
<?php
/**
 * Command-line entry point: adds blobs from the given external storage
 * cluster(s) to the blob_tracking table.
 *
 * Usage: php trackBlobs.php <cluster> [... <cluster>]
 */
require __DIR__ . '/../commandLine.inc';

if ( count( $args ) < 1 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n";
	echo "Adds blobs from a given ES cluster to the blob_tracking table\n";
	echo "Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}

// The tracker must be instantiated before it can be run; every remaining
// command-line argument is treated as a cluster name.
// NOTE(review): class name TrackBlobs is assumed from the file name - confirm.
$tracker = new TrackBlobs( $args );
$tracker->run();
echo "All done.\n";
37
/**
 * @var array Map of cluster name to a GMP bitmap; a set bit means the blob
 *  with that ID was seen while scanning. Filled in only when gmp is loaded.
 */
public $trackedBlobs = [];

/** @var int Row limit for each batched SELECT (also the orphan insert batch threshold) */
public $batchSize = 1000;

/** @var int Number of batches processed between two progress reports */
public $reportingInterval = 10;
45
/**
 * Remember the clusters to track and, when the gmp extension is available,
 * set up one empty bitmap per cluster for the orphan blob search.
 *
 * @param array $clusters Cluster names, as they appear in DB://<cluster>/ pointers
 */
function __construct( $clusters ) {
	$this->clusters = $clusters;

	// Give the flag a definite boolean value in both cases, so that run()
	// never reads a property that was not assigned.
	$this->doBlobOrphans = extension_loaded( 'gmp' );

	if ( !$this->doBlobOrphans ) {
		echo "Warning: the gmp extension is needed to find orphan blobs\n";

		return;
	}

	foreach ( $clusters as $cluster ) {
		$this->trackedBlobs[$cluster] = gmp_init( 0 );
	}
}
57
/**
 * Execute every tracking pass in order: integrity check, (re)creation of the
 * tracking tables, revision scan, orphan text scan and, if enabled, the
 * orphan blob search.
 */
function run() {
	$this->checkIntegrity();
	$this->initTrackingTable();
	$this->trackRevisions();
	$this->trackOrphanText();

	// The orphan blob pass is optional; it is switched on by the constructor
	if ( !$this->doBlobOrphans ) {
		return;
	}
	$this->findOrphanBlobs();
}
67
/**
 * Refuse to continue if the text or archive tables contain data that this
 * script could damage. Exits the process with status 1 on any failure.
 */
function checkIntegrity() {
	echo "Doing integrity check...\n";
	// Read-only checks, so a replica connection is sufficient
	$dbr = wfGetDB( DB_REPLICA );

	// Scan for HistoryBlobStub objects in the text table (bug 20757)

	$exists = $dbr->selectField( 'text', 1,
		'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
		'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
		__METHOD__
	);

	if ( $exists ) {
		echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
			"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
			"to fix this.\n";
		exit( 1 );
	}

	// Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
	$flags = $dbr->selectField( 'archive', 'ar_flags',
		'ar_flags LIKE \'%external%\' OR (' .
		'ar_flags LIKE \'%object%\' ' .
		'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
		__METHOD__
	);

	// The query matches either condition; the returned flags tell which one hit
	if ( strpos( $flags, 'external' ) !== false ) {
		echo "Integrity check failed: found external storage pointers in your archive table.\n" .
			"Run normaliseArchiveTable.php to fix this.\n";
		exit( 1 );
	} elseif ( $flags ) {
		echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
			"These objects are probably already broken, continuing would make them\n" .
			"unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
		exit( 1 );
	}

	echo "Integrity check OK\n";
}
108
/**
 * Drop any tracking tables left over from a previous run and recreate them
 * from blob_tracking.sql.
 */
function initTrackingTable() {
	$dbw = wfGetDB( DB_MASTER );

	// Test each table on its own: a previous run may have left only one of
	// them behind, and dropping a table that does not exist is an error.
	foreach ( [ 'blob_tracking', 'blob_orphans' ] as $table ) {
		if ( $dbw->tableExists( $table ) ) {
			$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
		}
	}

	$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
}
117
/**
 * Build (once) and return the SQL condition matching old_text values that
 * point into any of the tracked clusters, i.e. that start with
 * "DB://<cluster>/". The per-cluster LIKE conditions are joined with OR.
 *
 * @return string SQL fragment; empty if there are no clusters
 */
function getTextClause() {
	// empty() also covers the first call, when the property is not set yet
	if ( empty( $this->textClause ) ) {
		$dbr = wfGetDB( DB_REPLICA );

		$conds = [];
		foreach ( $this->clusters as $cluster ) {
			$conds[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
		}
		$this->textClause = implode( ' OR ', $conds );
	}

	return $this->textClause;
}
132
/**
 * Split an external storage pointer of the form DB://<cluster>/<id>[/<hash>]
 * into its components.
 *
 * @param string $text Pointer text to parse
 * @return array|bool False if $text is not a valid pointer, otherwise an
 *  array with the keys 'cluster' (string), 'id' (int) and 'hash'
 *  (string, or null when the pointer has no hash part)
 */
function interpretPointer( $text ) {
	$parts = [];
	$matched = preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $parts );
	if ( !$matched ) {
		return false;
	}

	// The hash group is absent from the match array when no hash was given
	$hash = null;
	if ( isset( $parts[3] ) ) {
		$hash = $parts[3];
	}

	return [
		'cluster' => $parts[1],
		'id' => (int)$parts[2],
		'hash' => $hash
	];
}
144
/**
 * Scan the revision table for rows stored in the specified clusters.
 *
 * Walks revision joined to text in rev_id order, batchSize rows at a time,
 * and inserts one blob_tracking row per valid external storage pointer.
 * When orphan blob detection is enabled, the blob ID is also marked in the
 * per-cluster bitmap.
 */
function trackRevisions() {
	$dbw = wfGetDB( DB_MASTER );
	$dbr = wfGetDB( DB_REPLICA );

	$textClause = $this->getTextClause();
	$startId = 0;
	$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
	$batchesDone = 0;
	$rowsInserted = 0;

	echo "Finding revisions...\n";

	while ( true ) {
		$res = $dbr->select( [ 'revision', 'text' ],
			[ 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ],
			[
				'rev_id > ' . $dbr->addQuotes( $startId ),
				'rev_text_id=old_id',
				// Restrict to pointers into the tracked clusters; without this
				// every external row would be fetched and then rejected below
				$textClause,
				'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
			],
			__METHOD__,
			[
				'ORDER BY' => 'rev_id',
				'LIMIT' => $this->batchSize
			]
		);
		if ( !$res->numRows() ) {
			break;
		}

		$insertBatch = [];
		foreach ( $res as $row ) {
			// Advance the cursor even for rows that get skipped below
			$startId = $row->rev_id;
			$info = $this->interpretPointer( $row->old_text );
			if ( !$info ) {
				echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
				continue;
			}
			if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
				echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
				continue;
			}
			$insertBatch[] = [
				'bt_page' => $row->rev_page,
				'bt_rev_id' => $row->rev_id,
				'bt_text_id' => $row->old_id,
				'bt_cluster' => $info['cluster'],
				'bt_blob_id' => $info['id'],
				'bt_cgz_hash' => $info['hash']
			];
			if ( $this->doBlobOrphans ) {
				gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
			}
		}
		// Every row of the batch may have been rejected
		if ( $insertBatch ) {
			$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			$rowsInserted += count( $insertBatch );
		}

		++$batchesDone;
		if ( $batchesDone >= $this->reportingInterval ) {
			$batchesDone = 0;
			echo "$startId / $endId\n";
			// Let the replicas catch up before writing more batches to the master
			wfWaitForSlaves();
		}
	}
	echo "Found $rowsInserted revisions\n";
}
215
/**
 * Scan the text table for orphan text, i.e. external storage pointers in
 * the tracked clusters that have no blob_tracking row yet.
 *
 * Such rows are inserted into blob_tracking with bt_page and bt_rev_id set
 * to 0. When orphan blob detection is enabled, the blob ID is also marked
 * in the per-cluster bitmap.
 */
function trackOrphanText() {
	# Wait until the blob_tracking table is available in the replica DB
	$dbw = wfGetDB( DB_MASTER );
	$dbr = wfGetDB( DB_REPLICA );
	$pos = $dbw->getMasterPos();
	$dbr->masterPosWait( $pos, 100000 );

	$textClause = $this->getTextClause();
	$startId = 0;
	$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
	$rowsInserted = 0;
	$batchesDone = 0;

	echo "Finding orphan text...\n";

	# Scan the text table for orphan text
	while ( true ) {
		$res = $dbr->select( [ 'text', 'blob_tracking' ],
			[ 'old_id', 'old_flags', 'old_text' ],
			[
				'old_id>' . $dbr->addQuotes( $startId ),
				// Restrict to pointers into the tracked clusters
				$textClause,
				'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				// With the LEFT JOIN below, NULL means "not tracked yet"
				'bt_text_id IS NULL'
			],
			__METHOD__,
			[
				'ORDER BY' => 'old_id',
				'LIMIT' => $this->batchSize
			],
			[ 'blob_tracking' => [ 'LEFT JOIN', 'bt_text_id=old_id' ] ]
		);

		if ( !$res->numRows() ) {
			break;
		}

		$insertBatch = [];
		foreach ( $res as $row ) {
			// Advance the cursor even for rows that get skipped below
			$startId = $row->old_id;
			$info = $this->interpretPointer( $row->old_text );
			if ( !$info ) {
				echo "Invalid DB:// URL in old_id {$row->old_id}\n";
				continue;
			}
			if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
				echo "Invalid cluster returned in SQL query\n";
				continue;
			}

			$insertBatch[] = [
				'bt_page' => 0,
				'bt_rev_id' => 0,
				'bt_text_id' => $row->old_id,
				'bt_cluster' => $info['cluster'],
				'bt_blob_id' => $info['id'],
				'bt_cgz_hash' => $info['hash']
			];
			if ( $this->doBlobOrphans ) {
				gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
			}
		}
		// Every row of the batch may have been rejected
		if ( $insertBatch ) {
			$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			$rowsInserted += count( $insertBatch );
		}

		++$batchesDone;
		if ( $batchesDone >= $this->reportingInterval ) {
			$batchesDone = 0;
			echo "$startId / $endId\n";
			// Let the replicas catch up before writing more batches to the master
			wfWaitForSlaves();
		}
	}
	echo "Found $rowsInserted orphan text rows\n";
}
299
/**
 * Scan the blobs table of each cluster for rows not registered in
 * blob_tracking, and record them in blob_orphans.
 *
 * For each cluster a bitmap of the blob IDs that actually exist is built
 * and compared against the bitmap of IDs marked in $this->trackedBlobs;
 * IDs present in the former but not the latter are the orphans.
 * Requires the gmp extension.
 */
function findOrphanBlobs() {
	if ( !extension_loaded( 'gmp' ) ) {
		echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";

		return;
	}

	$dbw = wfGetDB( DB_MASTER );

	foreach ( $this->clusters as $cluster ) {
		echo "Searching for orphan blobs in $cluster...\n";
		$loadBalancer = wfGetLBFactory()->getExternalLB( $cluster );
		try {
			$extDB = $loadBalancer->getConnection( DB_REPLICA );
		} catch ( DBConnectionError $e ) {
			// A cluster that cannot be reached is reported and skipped
			$noDatabase = strpos( $e->error, 'Unknown database' ) !== false;
			if ( $noDatabase ) {
				echo "No database on $cluster\n";
			} else {
				echo "Error on $cluster: " . $e->getMessage() . "\n";
			}
			continue;
		}

		// Use the table name configured for the connection, else "blobs"
		$table = $extDB->getLBInfo( 'blobs table' );
		if ( $table === null ) {
			$table = 'blobs';
		}
		if ( !$extDB->tableExists( $table ) ) {
			echo "No blobs table on cluster $cluster\n";
			continue;
		}

		$lastId = 0;
		$batchCount = 0;
		$existing = gmp_init( 0 );
		$maxId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

		// Build a bitmap of actual blob rows
		for ( ; ; ) {
			$rows = $extDB->select( $table,
				[ 'blob_id' ],
				[ 'blob_id > ' . $extDB->addQuotes( $lastId ) ],
				__METHOD__,
				[ 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' ]
			);

			if ( !$rows->numRows() ) {
				break;
			}

			foreach ( $rows as $blobRow ) {
				gmp_setbit( $existing, $blobRow->blob_id );
				// Rows arrive in blob_id order, so this ends on the batch maximum
				$lastId = $blobRow->blob_id;
			}

			++$batchCount;
			if ( $batchCount >= $this->reportingInterval ) {
				$batchCount = 0;
				echo "$lastId / $maxId\n";
			}
		}

		// Find actual blobs that weren't tracked by the previous passes
		// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
		$untracked = gmp_com( $this->trackedBlobs[$cluster] );
		$orphans = gmp_and( $existing, $untracked );

		// Traverse the orphan list
		$pending = [];
		$numOrphans = 0;
		$bit = gmp_scan1( $orphans, 0 );
		// gmp_scan1() yields -1 once no further set bit exists
		while ( $bit != -1 ) {
			$pending[] = [
				'bo_cluster' => $cluster,
				'bo_blob_id' => $bit
			];
			if ( count( $pending ) > $this->batchSize ) {
				$dbw->insert( 'blob_orphans', $pending, __METHOD__ );
				$pending = [];
			}

			++$numOrphans;
			// Resume the search just past the bit that was found
			$bit = gmp_scan1( $orphans, $bit + 1 );
		}
		if ( $pending ) {
			$dbw->insert( 'blob_orphans', $pending, __METHOD__ );
		}
		echo "Found $numOrphans orphan(s) in $cluster\n";
	}
}
398}
wfWaitForSlaves( $ifWritesSince=null, $wiki=false, $cluster=false, $timeout=null)
Waits for the replica DBs to catch up to the master position.
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
wfGetLBFactory()
Get the load balancer factory object.
if( $line===false) $args
Definition cdb.php:64
interpretPointer( $text)
findOrphanBlobs()
Scan the blobs table for rows not registered in blob_tracking (and thus not registered in the text ta...
trackOrphanText()
Scan the text table for orphan text Orphan text here does not imply DB corruption – deleted text trac...
trackRevisions()
Scan the revision table for rows stored in the specified clusters.
__construct( $clusters)
$res
Definition database.txt:21
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
it s the revision text itself In either if gzip is the revision text is gzipped $flags
Definition hooks.txt:2710
returning false will NOT prevent logging $e
Definition hooks.txt:2110
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition injection.txt:37
const DB_REPLICA
Definition defines.php:22
const DB_MASTER
Definition defines.php:23
if(count( $args)< 1) $tracker