50 parent::__construct();
52 $this->
addArg(
'cluster',
'cluster(s) to scan',
true,
true );
55 'Adds blobs from a given ES cluster to the blob_tracking table. ' .
56 'Automatically deletes the tracking table and starts from the start again when restarted.'
// Body of the script's main entry point; its method header lies outside this excerpt.
// The clusters to scan are whatever $this->parameters->getArgs() returns.
61 $this->clusters = $this->parameters->getArgs();
// Orphan-blob detection keeps blob IDs in GMP bitfields, so it is enabled only
// when the gmp extension is loaded.
62 if ( extension_loaded(
'gmp' ) ) {
63 $this->doBlobOrphans =
true;
// Give every cluster an empty (zero) GMP number to use as its bitfield.
64 foreach ( $this->clusters as $cluster ) {
65 $this->trackedBlobs[$cluster] = gmp_init( 0 );
// NOTE(review): presumably the else branch of the gmp check; the lines in between are not visible here.
68 echo
"Warning: the gmp extension is needed to find orphan blobs\n";
// Run the phases in order: integrity check, (re)creation of the tracking
// table, revision scan, orphan-text scan.
71 $this->checkIntegrity();
72 $this->initTrackingTable();
73 $this->trackRevisions();
74 $this->trackOrphanText();
// The orphan-blob search runs only when it was enabled above.
75 if ( $this->doBlobOrphans ) {
76 $this->findOrphanBlobs();
// NOTE(review): this is the only visible message sent through $this->output(); the others use echo.
78 $this->
output(
"All done.\n" );
/**
 * Look for HistoryBlobStub objects before the script modifies anything.
 *
 * Prints a progress message, runs one existence query and reports the
 * outcome. The failure message states that continuing could destroy such
 * objects and points the operator to resolveStubs.php.
 */
81 private function checkIntegrity() {
82 echo
"Doing integrity check...\n";
// $exists is the fetched field cast to bool. The visible conditions select rows
// whose old_flags contain 'object' but not 'external', and whose first 22
// characters of old_text, lower-cased, equal o:15:"historyblobstub" (the start
// of a PHP-serialized HistoryBlobStub).
// NOTE(review): the selected field and table are on lines not visible here.
87 $exists = (bool)$dbr->newSelectQueryBuilder()
91 'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
92 'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'' )
93 ->caller( __METHOD__ )->fetchField();
// NOTE(review): presumably guarded by a test on $exists and followed by an exit; those lines are not visible here.
96 echo
"Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
97 "This script could destroy these objects if it continued. Run resolveStubs.php\n" .
102 echo
"Integrity check OK\n";
/**
 * Drop any existing tracking tables and load the schema from blob_tracking.sql.
 *
 * Only the existence of blob_tracking is tested; when it exists, both
 * blob_tracking and blob_orphans are dropped.
 */
105 private function initTrackingTable() {
// NOTE(review): the assignment of $dbw is on a line not visible here.
107 if ( $dbw->tableExists(
'blob_tracking', __METHOD__ ) ) {
108 $dbw->query(
'DROP TABLE ' . $dbw->tableName(
'blob_tracking' ), __METHOD__ );
// NOTE(review): blob_orphans is dropped without its own existence check.
109 $dbw->query(
'DROP TABLE ' . $dbw->tableName(
'blob_orphans' ), __METHOD__ );
// Source the schema file that sits in the same directory as this script.
// NOTE(review): presumably runs whether or not the tables existed; the closing brace of the if is not visible here.
111 $dbw->sourceFile( __DIR__ .
'/blob_tracking.sql' );
// Body of the text-clause builder (its method header is not visible here;
// presumably getTextClause(), which the tracking methods call).
// The clause is built once and cached in $this->textClause.
115 if ( !$this->textClause ) {
// One condition per cluster, matching values that start with "DB://" followed
// by the cluster name and a slash.
// NOTE(review): the field being compared is on a line not visible here.
118 foreach ( $this->clusters as $cluster ) {
119 $conds[] = $dbr->expr(
122 new LikeValue(
"DB://$cluster/", $dbr->anyString() )
// The per-cluster conditions are combined with OR.
125 $this->textClause = $dbr->orExpr( $conds );
/**
 * Parse an external-store pointer of the form DB://cluster/id or
 * DB://cluster/id/hash, where id is decimal and hash is hexadecimal.
 *
 * The visible part of the result is an array with 'id' (integer) and 'hash'
 * (the hex string, or null when the pointer has no hash part).
 * NOTE(review): the 'cluster' entry that callers read and the value returned
 * for a non-matching pointer are on lines not visible here.
 *
 * @param string $text Pointer text to parse
 */
132 private function interpretPointer(
string $text ) {
// The pattern is anchored at both ends, so surrounding text makes the match fail.
133 if ( !preg_match(
'!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
139 'id' => intval( $m[2] ),
// preg_match() omits trailing groups that did not participate in the match,
// so $m[3] is unset for pointers without a hash.
140 'hash' => $m[3] ?? null
/**
 * Scan revisions in batches and record their external-store pointers in
 * the blob_tracking table.
 *
 * Rows are read through the replica connection and inserted through the
 * primary connection. When orphan-blob detection is enabled, every blob ID
 * seen is also marked in the per-cluster GMP bitfield.
 */
147 private function trackRevisions() {
148 $dbw = $this->getPrimaryDB();
149 $dbr = $this->getReplicaDB();
151 $textClause = $this->getTextClause();
// Highest rev_id; in the visible code it is only used in the progress output.
153 $endId = (int)$dbr->newSelectQueryBuilder()
154 ->select(
'MAX(rev_id)' )
156 ->caller( __METHOD__ )->fetchField();
160 echo
"Finding revisions...\n";
// LIKE pattern matching any value that contains 'external'.
// NOTE(review): presumably applied to old_flags; the field name is on a line not visible here.
167 new LikeValue( $dbr->anyString(),
'external', $dbr->anyString() )
170 $slotRoleStore = $this->getServiceContainer()->getSlotRoleStore();
// Restrict the scan to main-slot content whose address starts with 'tt:'.
172 $conds = array_merge( [
173 'slot_role_id' => $slotRoleStore->getId( SlotRecord::MAIN ),
174 'SUBSTRING(content_address, 1, 3)=' . $dbr->addQuotes(
'tt:' ),
// Fetch the next batch of revisions with rev_id above $startId, in rev_id
// order. The text row is located by comparing old_id with the part of
// content_address that follows the 3-character 'tt:' prefix.
178 $res = $dbr->newSelectQueryBuilder()
179 ->select( [
'rev_id',
'rev_page',
'old_id',
'old_flags',
'old_text' ] )
181 ->join(
'slots',
null,
'rev_id=slot_revision_id' )
182 ->join(
'content',
null,
'content_id=slot_content_id' )
183 ->join(
'text',
null,
'SUBSTRING(content_address, 4)=old_id' )
184 ->where( $dbr->expr(
'rev_id',
'>', $startId ) )
186 ->orderBy(
'rev_id' )
187 ->limit( $this->batchSize )
188 ->caller( __METHOD__ )->fetchResultSet();
// NOTE(review): an empty batch presumably ends the scan; the branch body is not visible here.
189 if ( !$res->numRows() ) {
194 foreach ( $res as $row ) {
// Remember the last rev_id seen so the next batch starts after it.
195 $startId = (int)$row->rev_id;
196 $info = $this->interpretPointer( $row->old_text );
// NOTE(review): presumably printed when interpretPointer() could not parse the text; the test is not visible here.
198 echo
"Invalid DB:// URL in rev_id {$row->rev_id}\n";
// Report pointers whose cluster is not one of the clusters being scanned.
201 if ( !in_array( $info[
'cluster'], $this->clusters ) ) {
202 echo
"Invalid cluster returned in SQL query: {$info['cluster']}\n";
// Field values for one blob_tracking row.
206 'bt_page' => $row->rev_page,
207 'bt_rev_id' => $row->rev_id,
208 'bt_text_id' => $row->old_id,
209 'bt_cluster' => $info[
'cluster'],
210 'bt_blob_id' => $info[
'id'],
211 'bt_cgz_hash' => $info[
'hash']
// Mark this blob ID as referenced in the cluster's bitfield.
213 if ( $this->doBlobOrphans ) {
214 gmp_setbit( $this->trackedBlobs[$info[
'cluster']], $info[
'id'] );
// Write the accumulated rows through the primary connection.
217 $dbw->newInsertQueryBuilder()
218 ->insertInto(
'blob_tracking' )
219 ->rows( $insertBatch )
220 ->caller( __METHOD__ )->execute();
221 $rowsInserted += count( $insertBatch );
// Once enough batches have been processed, print progress and wait for
// replication before continuing.
224 if ( $batchesDone >= $this->reportingInterval ) {
226 echo
"$startId / $endId\n";
227 $this->waitForReplication();
230 echo
"Found $rowsInserted revisions\n";
/**
 * Scan the text table in batches for rows that have no blob_tracking entry
 * and record their external-store pointers in blob_tracking.
 *
 * Rows are read through the replica connection and inserted through the
 * primary connection. When orphan-blob detection is enabled, every blob ID
 * seen is also marked in the per-cluster GMP bitfield.
 */
238 private function trackOrphanText() {
239 # Wait until the blob_tracking table is available in the replica DB
240 $dbw = $this->getPrimaryDB();
241 $dbr = $this->getReplicaDB();
// NOTE(review): the unit of the 'timeout' value is not shown here; confirm against waitForReplication().
242 $this->getServiceContainer()->getDBLoadBalancerFactory()->waitForReplication( [
'timeout' => 100_000 ] );
244 $textClause = $this->getTextClause();
// Highest old_id; in the visible code it is only used in the progress output.
246 $endId = (int)$dbr->newSelectQueryBuilder()
247 ->select(
'MAX(old_id)' )
249 ->caller( __METHOD__ )->fetchField();
253 echo
"Finding orphan text...\n";
255 # Scan the text table for orphan text
// Fetch the next batch of rows with old_id above $startId, in old_id order.
// The left join combined with the 'bt_text_id' => null condition keeps only
// rows that have no matching blob_tracking entry.
257 $res = $dbr->newSelectQueryBuilder()
258 ->select( [
'old_id',
'old_flags',
'old_text' ] )
260 ->leftJoin(
'blob_tracking',
null,
'bt_text_id=old_id' )
262 $dbr->expr(
'old_id',
'>', $startId ),
// LIKE pattern matching any value that contains 'external'.
// NOTE(review): presumably applied to old_flags; the field name is on a line not visible here.
267 new LikeValue( $dbr->anyString(),
'external', $dbr->anyString() )
269 'bt_text_id' =>
null,
271 ->orderBy(
'old_id' )
272 ->limit( $this->batchSize )
273 ->caller( __METHOD__ )->fetchResultSet();
// NOTE(review): an empty batch presumably ends the scan; the branch body is not visible here.
275 if ( !$res->numRows() ) {
280 foreach ( $res as $row ) {
// Remember the last old_id seen so the next batch starts after it.
281 $startId = (int)$row->old_id;
282 $info = $this->interpretPointer( $row->old_text );
// NOTE(review): presumably printed when interpretPointer() could not parse the text; the test is not visible here.
284 echo
"Invalid DB:// URL in old_id {$row->old_id}\n";
// Report pointers whose cluster is not one of the clusters being scanned.
287 if ( !in_array( $info[
'cluster'], $this->clusters ) ) {
288 echo
"Invalid cluster returned in SQL query\n";
// Field values for one blob_tracking row.
295 'bt_text_id' => $row->old_id,
296 'bt_cluster' => $info[
'cluster'],
297 'bt_blob_id' => $info[
'id'],
298 'bt_cgz_hash' => $info[
'hash']
// Mark this blob ID as referenced in the cluster's bitfield.
300 if ( $this->doBlobOrphans ) {
301 gmp_setbit( $this->trackedBlobs[$info[
'cluster']], $info[
'id'] );
// Write the accumulated rows through the primary connection.
304 $dbw->newInsertQueryBuilder()
305 ->insertInto(
'blob_tracking' )
306 ->rows( $insertBatch )
307 ->caller( __METHOD__ )->execute();
309 $rowsInserted += count( $insertBatch );
// Once enough batches have been processed, print progress and wait for
// replication before continuing.
311 if ( $batchesDone >= $this->reportingInterval ) {
313 echo
"$startId / $endId\n";
314 $this->waitForReplication();
317 echo
"Found $rowsInserted orphan text rows\n";
/**
 * For each cluster, find blob IDs that exist in the cluster's blobs table
 * but were not marked by the tracking passes, and record them in the
 * blob_orphans table.
 *
 * Requires the gmp extension: both the existing and the tracked blob IDs are
 * held as bits in GMP numbers.
 */
327 private function findOrphanBlobs() {
// Without GMP there are no bitfields to compare, so only a message is printed.
328 if ( !extension_loaded(
'gmp' ) ) {
329 echo
"Can't find orphan blobs, need bitfield support provided by GMP.\n";
334 $dbw = $this->getPrimaryDB();
335 $lbFactory = $this->getServiceContainer()->getDBLoadBalancerFactory();
336 $dbStore = $this->getServiceContainer()->getExternalStoreFactory()->getStore(
'DB' );
// String-literal statement carrying a Phan type annotation for $dbStore; it does nothing at runtime.
337 '@phan-var ExternalStoreDB $dbStore';
339 foreach ( $this->clusters as $cluster ) {
340 echo
"Searching for orphan blobs in $cluster...\n";
341 $lb = $lbFactory->getExternalLB( $cluster );
// Read the cluster's blobs through a replica connection.
343 $extDB = $lb->getMaintenanceConnectionRef(
DB_REPLICA );
// NOTE(review): $e presumably comes from a catch clause not visible here.
// A missing database gets its own message; any other error is printed with its text.
345 if ( strpos( $e->getMessage(),
'Unknown database' ) !==
false ) {
346 echo
"No database on $cluster\n";
348 echo
"Error on $cluster: " . $e->getMessage() .
"\n";
352 $table = $dbStore->getTable( $cluster );
353 if ( !$extDB->tableExists( $table, __METHOD__ ) ) {
354 echo
"No blobs table on cluster $cluster\n";
// Bitfield with one bit per blob ID found in this cluster.
359 $actualBlobs = gmp_init( 0 );
// Highest blob_id; in the visible code it is only used in the progress output.
360 $endId = (int)$extDB->newSelectQueryBuilder()
361 ->select(
'MAX(blob_id)' )
363 ->caller( __METHOD__ )->fetchField();
// Fetch the next batch of blob IDs above $startId, in blob_id order.
367 $res = $extDB->newSelectQueryBuilder()
368 ->select( [
'blob_id' ] )
370 ->where( $extDB->expr(
'blob_id',
'>', $startId ) )
371 ->orderBy(
'blob_id' )
372 ->limit( $this->batchSize )
373 ->caller( __METHOD__ )->fetchResultSet();
// NOTE(review): an empty batch presumably ends the scan; the branch body is not visible here.
375 if ( !$res->numRows() ) {
379 foreach ( $res as $row ) {
// NOTE(review): blob_id is passed to gmp_setbit() as returned by the row; it is cast to int only for $startId.
380 gmp_setbit( $actualBlobs, $row->blob_id );
381 $startId = (int)$row->blob_id;
385 if ( $batchesDone >= $this->reportingInterval ) {
387 echo
"$startId / $endId\n";
// Orphans are the bits set in $actualBlobs but clear in the tracked bitfield:
// blobs that exist on the cluster but were never marked as referenced.
393 $orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );
// gmp_scan1() returns the index of the next set bit at or after $id.
400 $id = gmp_scan1( $orphans, $id );
405 'bo_cluster' => $cluster,
// The strict > comparison means a flush happens once the batch holds more than batchSize rows.
408 if ( count( $insertBatch ) > $this->batchSize ) {
409 $dbw->newInsertQueryBuilder()
410 ->insertInto(
'blob_orphans' )
411 ->rows( $insertBatch )
412 ->caller( __METHOD__ )->execute();
// Insert whatever rows remain in the batch.
419 if ( $insertBatch ) {
420 $dbw->newInsertQueryBuilder()
421 ->insertInto(
'blob_orphans' )
422 ->rows( $insertBatch )
423 ->caller( __METHOD__ )->execute();
425 echo
"Found $numOrphans orphan(s) in $cluster\n";