MediaWiki master
trackBlobs.php
Go to the documentation of this file.
1<?php
29
30require_once __DIR__ . '/../Maintenance.php';
31
32class TrackBlobs extends Maintenance {
/** @var string[] Cluster names to scan; assigned from the command-line arguments in execute() */
public $clusters;
/** @var OrExpressionGroup|null Cached condition built by getTextClause(); null until first use */
public $textClause = null;
/** @var bool Whether the orphan-blob pass runs; execute() enables it when the gmp extension is loaded */
public $doBlobOrphans = false;
/** @var GMP[] Per-cluster bitfields keyed by cluster name; bit N is set once blob ID N has been tracked */
public $trackedBlobs = [];

/** @var int Row limit applied to each batched SELECT */
public $batchSize = 1000;
/** @var int Number of batches processed between two progress lines */
public $reportingInterval = 10;
40
/**
 * Sets up the script: one required, repeatable "cluster" argument and the
 * help text describing what the script does.
 */
public function __construct() {
	parent::__construct();

	$helpText = 'Adds blobs from a given ES cluster to the blob_tracking table. ' .
		'Automatically deletes the tracking table and starts from the start again when restarted.';

	// Third argument: required; fourth: the argument may be given multiple times
	$this->addArg( 'cluster', 'cluster(s) to scan', true, true );

	$this->addDescription( $helpText );
}
51
/**
 * Runs the whole tracking job for the clusters named on the command line.
 *
 * Steps, in order: integrity check, (re)creation of the tracking table,
 * tracking of revisions, tracking of orphan text rows and, only when the
 * gmp extension is loaded, the search for orphan blobs.
 */
public function execute() {
	$this->clusters = $this->parameters->getArgs();

	// Assign the flag in both cases so that the later reads of
	// $this->doBlobOrphans never hit an undefined property when gmp is missing.
	$this->doBlobOrphans = extension_loaded( 'gmp' );
	if ( $this->doBlobOrphans ) {
		// One empty bitfield per cluster; bits are set as blobs get tracked
		foreach ( $this->clusters as $cluster ) {
			$this->trackedBlobs[$cluster] = gmp_init( 0 );
		}
	} else {
		echo "Warning: the gmp extension is needed to find orphan blobs\n";
	}

	$this->checkIntegrity();
	$this->initTrackingTable();
	$this->trackRevisions();
	$this->trackOrphanText();
	if ( $this->doBlobOrphans ) {
		$this->findOrphanBlobs();
	}
	$this->output( "All done.\n" );
}
72
/**
 * Refuses to continue when the text table holds HistoryBlobStub objects (T22757).
 *
 * Looks for one non-external "object" row whose serialized text starts with
 * a HistoryBlobStub header. If such a row exists, a message is printed and
 * the process exits with status 1.
 */
private function checkIntegrity() {
	echo "Doing integrity check...\n";

	$stubCondition =
		'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
		'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'';

	$found = $this->getReplicaDB()->newSelectQueryBuilder()
		->select( '1' )
		->from( 'text' )
		->where( $stubCondition )
		->caller( __METHOD__ )
		->fetchField();

	if ( !$found ) {
		echo "Integrity check OK\n";

		return;
	}

	echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
		"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
		"to fix this.\n";
	exit( 1 );
}
96
/**
 * Drops any tracking tables left over from a previous run and recreates
 * them by sourcing blob_tracking.sql from this script's directory.
 *
 * Each table is checked on its own: a run interrupted between the two DROP
 * statements can leave only one of them behind, and an unconditional DROP of
 * a missing table would fail.
 */
private function initTrackingTable() {
	$dbw = $this->getDB( DB_PRIMARY );

	foreach ( [ 'blob_tracking', 'blob_orphans' ] as $table ) {
		if ( $dbw->tableExists( $table, __METHOD__ ) ) {
			$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
		}
	}

	// NOTE(review): presumably this file creates both tables dropped above - confirm
	$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
}
105
/**
 * Returns the condition matching text rows that point into a scanned cluster.
 *
 * The condition is an OR of one "old_text LIKE 'DB://<cluster>/%'" expression
 * per entry of $this->clusters. It is built on the first call and then kept
 * in $this->textClause.
 *
 * @return OrExpressionGroup
 */
private function getTextClause() {
	// isset() instead of a plain read: nothing has been stored before the
	// first call, and reading a missing property raises a warning.
	if ( !isset( $this->textClause ) ) {
		$dbr = $this->getReplicaDB();
		$perCluster = [];
		foreach ( $this->clusters as $cluster ) {
			$perCluster[] = $dbr->expr(
				'old_text',
				IExpression::LIKE,
				new LikeValue( "DB://$cluster/", $dbr->anyString() )
			);
		}
		$this->textClause = new OrExpressionGroup( ...$perCluster );
	}

	return $this->textClause;
}
122
/**
 * Breaks a pointer of the form DB://cluster/id or DB://cluster/id/hash
 * into its components.
 *
 * @param string $text Pointer text to parse
 * @return array|false Array with keys 'cluster' (string), 'id' (int) and
 *  'hash' (hex string, or null when absent); false if $text does not match
 */
private function interpretPointer( $text ) {
	$matched = preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $parts );
	if ( $matched ) {
		return [
			'cluster' => $parts[1],
			'id' => (int)$parts[2],
			// The hash group is absent from $parts when the pointer has no third segment
			'hash' => $parts[3] ?? null,
		];
	}

	return false;
}
134
/**
 * Records every revision whose main-slot text is an external pointer into
 * one of the scanned clusters.
 *
 * Reads revision joined to slots, content and text in rev_id order, at most
 * $this->batchSize rows per query, and inserts one blob_tracking row per
 * pointer that parses and names a requested cluster. When orphan-blob
 * detection is enabled, the blob ID is also flagged in that cluster's
 * bitfield. Progress is printed, and replication waited for, every
 * $this->reportingInterval batches.
 */
private function trackRevisions() {
	$dbw = $this->getPrimaryDB();
	$dbr = $this->getReplicaDB();

	$startId = 0;
	$endId = (int)$dbr->newSelectQueryBuilder()
		->select( 'MAX(rev_id)' )
		->from( 'revision' )
		->caller( __METHOD__ )->fetchField();
	$batchesDone = 0;
	$rowsInserted = 0;

	echo "Finding revisions...\n";

	$slotRoleStore = $this->getServiceContainer()->getSlotRoleStore();
	$conds = [
		'slot_role_id=' . $slotRoleStore->getId( SlotRecord::MAIN ),
		'SUBSTRING(content_address, 1, 3)=' . $dbr->addQuotes( 'tt:' ),
		// Restrict the scan to pointers into the requested clusters; the
		// cluster check inside the loop is only a safety net for this filter.
		$this->getTextClause(),
		$dbr->expr(
			'old_flags',
			IExpression::LIKE,
			new LikeValue( $dbr->anyString(), 'external', $dbr->anyString() )
		),
	];

	while ( true ) {
		$res = $dbr->newSelectQueryBuilder()
			->select( [ 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ] )
			->from( 'revision' )
			->join( 'slots', null, 'rev_id=slot_revision_id' )
			->join( 'content', null, 'content_id=slot_content_id' )
			->join( 'text', null, 'SUBSTRING(content_address, 4)=old_id' )
			->where( $dbr->expr( 'rev_id', '>', $startId ) )
			->andWhere( $conds )
			->orderBy( 'rev_id' )
			->limit( $this->batchSize )
			->caller( __METHOD__ )->fetchResultSet();
		if ( !$res->numRows() ) {
			break;
		}

		$insertBatch = [];
		foreach ( $res as $row ) {
			// Advance the cursor even for rows that end up being skipped
			$startId = (int)$row->rev_id;
			$info = $this->interpretPointer( $row->old_text );
			if ( !$info ) {
				echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
				continue;
			}
			if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
				echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
				continue;
			}
			$insertBatch[] = [
				'bt_page' => $row->rev_page,
				'bt_rev_id' => $row->rev_id,
				'bt_text_id' => $row->old_id,
				'bt_cluster' => $info['cluster'],
				'bt_blob_id' => $info['id'],
				'bt_cgz_hash' => $info['hash']
			];
			if ( $this->doBlobOrphans ) {
				gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
			}
		}

		// Every row of the batch may have been skipped; do not issue an
		// INSERT with no rows.
		if ( $insertBatch ) {
			$dbw->newInsertQueryBuilder()
				->insertInto( 'blob_tracking' )
				->rows( $insertBatch )
				->caller( __METHOD__ )->execute();
			$rowsInserted += count( $insertBatch );
		}

		++$batchesDone;
		if ( $batchesDone >= $this->reportingInterval ) {
			$batchesDone = 0;
			echo "$startId / $endId\n";
			$this->waitForReplication();
		}
	}
	echo "Found $rowsInserted revisions\n";
}
223
/**
 * Records external text rows that point into a scanned cluster but have no
 * blob_tracking row yet.
 *
 * Reads the text table left-joined to blob_tracking in old_id order, at most
 * $this->batchSize rows per query, keeping rows without a blob_tracking
 * match. Each pointer that parses and names a requested cluster is inserted
 * into blob_tracking with bt_page and bt_rev_id set to 0. When orphan-blob
 * detection is enabled, the blob ID is also flagged in that cluster's
 * bitfield.
 */
private function trackOrphanText() {
	# Wait until the blob_tracking table is available in the replica DB
	$dbw = $this->getPrimaryDB();
	$dbr = $this->getReplicaDB();
	$this->getServiceContainer()->getDBLoadBalancerFactory()->waitForReplication( [ 'timeout' => 100_000 ] );

	$textClause = $this->getTextClause();
	$startId = 0;
	$endId = (int)$dbr->newSelectQueryBuilder()
		->select( 'MAX(old_id)' )
		->from( 'text' )
		->caller( __METHOD__ )->fetchField();
	$rowsInserted = 0;
	$batchesDone = 0;

	echo "Finding orphan text...\n";

	# Scan the text table for orphan text
	while ( true ) {
		$res = $dbr->newSelectQueryBuilder()
			->select( [ 'old_id', 'old_flags', 'old_text' ] )
			->from( 'text' )
			->leftJoin( 'blob_tracking', null, 'bt_text_id=old_id' )
			->where( [
				$dbr->expr( 'old_id', '>', $startId ),
				// Restrict the scan to pointers into the requested clusters; the
				// cluster check inside the loop is only a safety net for this filter.
				$textClause,
				$dbr->expr(
					'old_flags',
					IExpression::LIKE,
					new LikeValue( $dbr->anyString(), 'external', $dbr->anyString() )
				),
				// No matching blob_tracking row: not tracked yet
				'bt_text_id' => null,
			] )
			->orderBy( 'old_id' )
			->limit( $this->batchSize )
			->caller( __METHOD__ )->fetchResultSet();

		if ( !$res->numRows() ) {
			break;
		}

		$insertBatch = [];
		foreach ( $res as $row ) {
			// Advance the cursor even for rows that end up being skipped
			$startId = (int)$row->old_id;
			$info = $this->interpretPointer( $row->old_text );
			if ( !$info ) {
				echo "Invalid DB:// URL in old_id {$row->old_id}\n";
				continue;
			}
			if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
				echo "Invalid cluster returned in SQL query\n";
				continue;
			}

			$insertBatch[] = [
				'bt_page' => 0,
				'bt_rev_id' => 0,
				'bt_text_id' => $row->old_id,
				'bt_cluster' => $info['cluster'],
				'bt_blob_id' => $info['id'],
				'bt_cgz_hash' => $info['hash']
			];
			if ( $this->doBlobOrphans ) {
				gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
			}
		}

		// Every row of the batch may have been skipped; do not issue an
		// INSERT with no rows.
		if ( $insertBatch ) {
			$dbw->newInsertQueryBuilder()
				->insertInto( 'blob_tracking' )
				->rows( $insertBatch )
				->caller( __METHOD__ )->execute();
			$rowsInserted += count( $insertBatch );
		}

		++$batchesDone;
		if ( $batchesDone >= $this->reportingInterval ) {
			$batchesDone = 0;
			echo "$startId / $endId\n";
			$this->waitForReplication();
		}
	}
	echo "Found $rowsInserted orphan text rows\n";
}
310
/**
 * Finds blobs that exist on each scanned cluster but were not flagged by the
 * earlier tracking passes, and records them in blob_orphans.
 *
 * For every cluster, a bitfield of the blob IDs actually present in its
 * blobs table is built in batches, the tracked IDs are masked out, and each
 * remaining set bit is written as a (bo_cluster, bo_blob_id) row. Clusters
 * that cannot be connected to, or that have no blobs table, are reported and
 * skipped. Does nothing beyond printing a message when the gmp extension is
 * not loaded.
 */
private function findOrphanBlobs() {
	if ( !extension_loaded( 'gmp' ) ) {
		echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";

		return;
	}

	$dbw = $this->getPrimaryDB();
	$lbFactory = $this->getServiceContainer()->getDBLoadBalancerFactory();

	foreach ( $this->clusters as $cluster ) {
		echo "Searching for orphan blobs in $cluster...\n";
		$lb = $lbFactory->getExternalLB( $cluster );
		try {
			$extDB = $lb->getMaintenanceConnectionRef( DB_REPLICA );
		} catch ( DBConnectionError $e ) {
			if ( strpos( $e->getMessage(), 'Unknown database' ) !== false ) {
				echo "No database on $cluster\n";
			} else {
				echo "Error on $cluster: " . $e->getMessage() . "\n";
			}
			continue;
		}
		$table = $extDB->getLBInfo( 'blobs table' ) ?? 'blobs';
		if ( !$extDB->tableExists( $table, __METHOD__ ) ) {
			echo "No blobs table on cluster $cluster\n";
			continue;
		}
		$startId = 0;
		$batchesDone = 0;
		$actualBlobs = gmp_init( 0 );
		$endId = (int)$extDB->newSelectQueryBuilder()
			->select( 'MAX(blob_id)' )
			->from( $table )
			->caller( __METHOD__ )->fetchField();

		// Build a bitmap of actual blob rows
		while ( true ) {
			$res = $extDB->newSelectQueryBuilder()
				->select( [ 'blob_id' ] )
				->from( $table )
				->where( $extDB->expr( 'blob_id', '>', $startId ) )
				->orderBy( 'blob_id' )
				->limit( $this->batchSize )
				->caller( __METHOD__ )->fetchResultSet();

			if ( !$res->numRows() ) {
				break;
			}

			foreach ( $res as $row ) {
				gmp_setbit( $actualBlobs, $row->blob_id );
				$startId = (int)$row->blob_id;
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
			}
		}

		// Find actual blobs that weren't tracked by the previous passes
		// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
		$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

		// Traverse the orphan list
		$insertBatch = [];
		$id = 0;
		$numOrphans = 0;
		while ( true ) {
			// Index of the next set bit at or after $id, or -1 when none is left
			$id = gmp_scan1( $orphans, $id );
			if ( $id === -1 ) {
				break;
			}
			$insertBatch[] = [
				'bo_cluster' => $cluster,
				'bo_blob_id' => $id
			];
			// Flush once the batch holds exactly batchSize rows
			if ( count( $insertBatch ) >= $this->batchSize ) {
				$dbw->newInsertQueryBuilder()
					->insertInto( 'blob_orphans' )
					->rows( $insertBatch )
					->caller( __METHOD__ )->execute();
				$insertBatch = [];
			}

			++$id;
			++$numOrphans;
		}
		// Write whatever is left over from the last partial batch
		if ( $insertBatch ) {
			$dbw->newInsertQueryBuilder()
				->insertInto( 'blob_orphans' )
				->rows( $insertBatch )
				->caller( __METHOD__ )->execute();
		}
		echo "Found $numOrphans orphan(s) in $cluster\n";
	}
}
417}
418
419$maintClass = TrackBlobs::class;
420require_once RUN_MAINTENANCE_IF_MAIN;
getDB()
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
output( $out, $channel=null)
Throw some output to the user.
waitForReplication()
Wait for replica DBs to catch up.
getServiceContainer()
Returns the main service container.
addDescription( $text)
Set the description text.
Value object representing a content slot associated with a page revision.
execute()
Do the actual work.
__construct()
Default constructor.
Content of like value.
Definition LikeValue.php:14
Representing a group of expressions chained via OR.
const DB_REPLICA
Definition defines.php:26
const DB_PRIMARY
Definition defines.php:28
$maintClass