MediaWiki 1.41.2
trackBlobs.php
Go to the documentation of this file.
1<?php
26
27require_once __DIR__ . '/../Maintenance.php';
28
/**
 * Maintenance script that records which external-storage (ES) blobs are
 * referenced from the wiki.
 *
 * Passes, in order: an integrity check of the text table, recreation of the
 * tracking tables, a scan of revisions, a scan of text rows not matched by the
 * revision scan, and (only when the gmp extension is loaded) a scan of each
 * cluster's blobs table for blobs that none of the earlier passes referenced.
 *
 * Every run drops the existing tracking tables and starts from scratch.
 */
class TrackBlobs extends Maintenance {
	/** @var string[] Cluster names passed as command-line arguments */
	public $clusters = [];

	/** @var string Cached SQL condition from getTextClause(); empty until first built */
	public $textClause = '';

	/** @var bool Whether the orphan-blob pass runs; set in execute() from gmp availability */
	public $doBlobOrphans = false;

	/** @var array Cluster name => GMP bitfield with bit N set when blob ID N is tracked */
	public $trackedBlobs = [];

	/** @var int Row limit per SELECT, and flush threshold for orphan inserts */
	public $batchSize = 1000;

	/** @var int Number of batches processed between progress lines */
	public $reportingInterval = 10;

	public function __construct() {
		parent::__construct();

		// Required, and may be given multiple times.
		$this->addArg( 'cluster', 'cluster(s) to scan', true, true );

		$this->addDescription(
			'Adds blobs from a given ES cluster to the blob_tracking table. ' .
			'Automatically deletes the tracking table and starts from the start again when restarted.'
		);
	}

	/**
	 * Run all passes in sequence.
	 */
	public function execute() {
		$this->clusters = $this->parameters->getArgs();

		// The orphan-blob pass stores blob IDs in GMP bitfields, so it is only
		// enabled when the extension is present.
		$this->doBlobOrphans = extension_loaded( 'gmp' );
		if ( $this->doBlobOrphans ) {
			foreach ( $this->clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}

		$this->checkIntegrity();
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
		$this->output( "All done.\n" );
	}

	/**
	 * Abort with exit status 1 if the text table holds non-external rows whose
	 * serialized object text starts with a HistoryBlobStub header.
	 */
	private function checkIntegrity() {
		echo "Doing integrity check...\n";
		$dbr = wfGetDB( DB_REPLICA );

		// Scan for HistoryBlobStub objects in the text table (T22757)

		$exists = (bool)$dbr->newSelectQueryBuilder()
			->select( '1' )
			->from( 'text' )
			->where(
				'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
				'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'' )
			->caller( __METHOD__ )->fetchField();

		if ( $exists ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
				"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
				"to fix this.\n";
			exit( 1 );
		}

		echo "Integrity check OK\n";
	}

	/**
	 * Drop any existing tracking tables and recreate them from blob_tracking.sql.
	 */
	private function initTrackingTable() {
		$dbw = wfGetDB( DB_PRIMARY );

		// Check each table on its own: a previous run may have left only one of
		// them behind, and an unconditional DROP of a missing table would fail.
		foreach ( [ 'blob_tracking', 'blob_orphans' ] as $table ) {
			if ( $dbw->tableExists( $table, __METHOD__ ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
			}
		}

		// NOTE(review): presumably this file creates both tables - confirm.
		$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
	}

	/**
	 * Build (once) and return an SQL condition matching old_text values that
	 * start with "DB://<cluster>/" for any of the requested clusters.
	 *
	 * @return string OR-joined list of LIKE conditions
	 */
	private function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_REPLICA );
			$likes = [];
			foreach ( $this->clusters as $cluster ) {
				$likes[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
			}
			$this->textClause = implode( ' OR ', $likes );
		}

		return $this->textClause;
	}

	/**
	 * Split a "DB://cluster/id" or "DB://cluster/id/hash" pointer into its parts.
	 *
	 * @param string $text Pointer text as stored in old_text
	 * @return array|false Array with keys 'cluster' (string), 'id' (int) and
	 *  'hash' (hex string, or null when the pointer has no hash part);
	 *  false when $text does not match the pointer syntax
	 */
	private function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}

		return [
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			// The third group is absent from $m when the pointer has no hash.
			'hash' => $m[3] ?? null
		];
	}

	/**
	 * Scan main-slot revisions whose content lives in the text table as an
	 * external pointer to one of the requested clusters, and add one
	 * blob_tracking row per revision.
	 */
	private function trackRevisions() {
		$dbw = wfGetDB( DB_PRIMARY );
		$dbr = wfGetDB( DB_REPLICA );

		$textClause = $this->getTextClause();
		$startId = 0;
		// Only used for the progress display.
		$endId = (int)$dbr->newSelectQueryBuilder()
			->select( 'MAX(rev_id)' )
			->from( 'revision' )
			->caller( __METHOD__ )->fetchField();
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		$slotRoleStore = $this->getServiceContainer()->getSlotRoleStore();
		$conds = [
			'slot_role_id=' . $slotRoleStore->getId( SlotRecord::MAIN ),
			'SUBSTRING(content_address, 1, 3)=' . $dbr->addQuotes( 'tt:' ),
			// Restrict the scan to pointers into the requested clusters; without
			// this every external row is fetched and then rejected below.
			$textClause,
			'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
		];
		$lbFactory = $this->getServiceContainer()->getDBLoadBalancerFactory();

		while ( true ) {
			$res = $dbr->newSelectQueryBuilder()
				->select( [ 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ] )
				->from( 'revision' )
				->join( 'slots', null, 'rev_id=slot_revision_id' )
				->join( 'content', null, 'content_id=slot_content_id' )
				->join( 'text', null, 'SUBSTRING(content_address, 4)=old_id' )
				->where( [ 'rev_id > ' . $dbr->addQuotes( $startId ) ] )
				->andWhere( $conds )
				->orderBy( 'rev_id' )
				->limit( $this->batchSize )
				->caller( __METHOD__ )->fetchResultSet();
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = [];
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that are skipped below.
				$startId = (int)$row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = [
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				];
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// Every row of a batch may have been skipped.
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				$lbFactory->waitForReplication();
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for external pointers into the requested clusters
	 * that have no blob_tracking row yet, and add them with bt_page and
	 * bt_rev_id set to 0.
	 */
	private function trackOrphanText() {
		# Wait until the blob_tracking table is available in the replica DB
		$dbw = wfGetDB( DB_PRIMARY );
		$dbr = wfGetDB( DB_REPLICA );
		$pos = $dbw->getPrimaryPos();
		$dbr->primaryPosWait( $pos, 100000 );

		$textClause = $this->getTextClause();
		$startId = 0;
		// Only used for the progress display.
		$endId = (int)$dbr->newSelectQueryBuilder()
			->select( 'MAX(old_id)' )
			->from( 'text' )
			->caller( __METHOD__ )->fetchField();
		$rowsInserted = 0;
		$batchesDone = 0;
		$lbFactory = $this->getServiceContainer()->getDBLoadBalancerFactory();

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			$res = $dbr->newSelectQueryBuilder()
				->select( [ 'old_id', 'old_flags', 'old_text' ] )
				->from( 'text' )
				->leftJoin( 'blob_tracking', null, 'bt_text_id=old_id' )
				->where( [
					'old_id>' . $dbr->addQuotes( $startId ),
					// Restrict the scan to pointers into the requested clusters.
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
					// Keep only text rows with no matching blob_tracking row.
					'bt_text_id' => null,
				] )
				->orderBy( 'old_id' )
				->limit( $this->batchSize )
				->caller( __METHOD__ )->fetchResultSet();

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = [];
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that are skipped below.
				$startId = (int)$row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query\n";
					continue;
				}

				$insertBatch[] = [
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				];
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// Every row of a batch may have been skipped.
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				$lbFactory->waitForReplication();
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * For each requested cluster, compare the blob IDs present in its blobs
	 * table with the IDs tracked by the earlier passes, and record the
	 * untracked ones in blob_orphans.
	 *
	 * Clusters that cannot be connected to, or that have no blobs table, are
	 * reported and skipped.
	 */
	private function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";

			return;
		}

		$dbw = wfGetDB( DB_PRIMARY );
		$lbFactory = $this->getServiceContainer()->getDBLoadBalancerFactory();

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = $lbFactory->getExternalLB( $cluster );
			try {
				$extDB = $lb->getMaintenanceConnectionRef( DB_REPLICA );
			} catch ( DBConnectionError $e ) {
				if ( strpos( $e->getMessage(), 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			$table = $extDB->getLBInfo( 'blobs table' ) ?? 'blobs';
			if ( !$extDB->tableExists( $table, __METHOD__ ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			// Only used for the progress display.
			$endId = (int)$extDB->newSelectQueryBuilder()
				->select( 'MAX(blob_id)' )
				->from( $table )
				->caller( __METHOD__ )->fetchField();

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->newSelectQueryBuilder()
					->select( [ 'blob_id' ] )
					->from( $table )
					->where( [ 'blob_id > ' . $extDB->addQuotes( $startId ) ] )
					->orderBy( 'blob_id' )
					->limit( $this->batchSize )
					->caller( __METHOD__ )->fetchResultSet();

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					$startId = (int)$row->blob_id;
					gmp_setbit( $actualBlobs, $startId );
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = [];
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// Next set bit at or after $id; -1 when there are no more.
				$id = gmp_scan1( $orphans, $id );
				if ( $id === -1 ) {
					break;
				}
				$insertBatch[] = [
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				];
				// Flush once the batch holds exactly batchSize rows.
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = [];
				}

				++$id;
				++$numOrphans;
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}
395
// Name the script class, then include the maintenance runner.
// NOTE(review): presumably the runner instantiates $maintClass only when this
// file is the entry point - confirm against RUN_MAINTENANCE_IF_MAIN.
$maintClass = TrackBlobs::class;
require_once RUN_MAINTENANCE_IF_MAIN;
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
output( $out, $channel=null)
Throw some output to the user.
getServiceContainer()
Returns the main service container.
addDescription( $text)
Set the description text.
Value object representing a content slot associated with a page revision.
execute()
Do the actual work.
__construct()
Default constructor.
const DB_REPLICA
Definition defines.php:26
const DB_PRIMARY
Definition defines.php:28
$maintClass