Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 259 |
|
0.00% |
0 / 9 |
CRAP | |
0.00% |
0 / 1 |
TrackBlobs | |
0.00% |
0 / 259 |
|
0.00% |
0 / 9 |
1980 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
20 | |||
checkIntegrity | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
6 | |||
initTrackingTable | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
getTextClause | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
12 | |||
interpretPointer | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
trackRevisions | |
0.00% |
0 / 69 |
|
0.00% |
0 / 1 |
72 | |||
trackOrphanText | |
0.00% |
0 / 63 |
|
0.00% |
0 / 1 |
72 | |||
findOrphanBlobs | |
0.00% |
0 / 70 |
|
0.00% |
0 / 1 |
210 |
1 | <?php |
2 | /** |
3 | * Adds blobs from a given external storage cluster to the blob_tracking table. |
4 | * |
5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation; either version 2 of the License, or |
8 | * (at your option) any later version. |
9 | * |
10 | * This program is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | * GNU General Public License for more details. |
14 | * |
15 | * You should have received a copy of the GNU General Public License along |
16 | * with this program; if not, write to the Free Software Foundation, Inc., |
17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
18 | * http://www.gnu.org/copyleft/gpl.html |
19 | * |
20 | * @file |
21 | * @ingroup Maintenance |
22 | */ |
23 | |
24 | use MediaWiki\Revision\SlotRecord; |
25 | use Wikimedia\Rdbms\DBConnectionError; |
26 | use Wikimedia\Rdbms\IExpression; |
27 | use Wikimedia\Rdbms\LikeValue; |
28 | |
29 | // @codeCoverageIgnoreStart |
30 | require_once __DIR__ . '/../Maintenance.php'; |
31 | // @codeCoverageIgnoreEnd |
32 | |
/**
 * Maintenance script that rebuilds the blob_tracking table for one or more
 * external storage clusters, and (when the gmp extension is available) records
 * blobs that exist on those clusters but are not referenced from the text table.
 *
 * The tracking tables are dropped and recreated on every run.
 */
class TrackBlobs extends Maintenance {
	/** @var string[] Cluster names given on the command line */
	public $clusters;
	/** @var IExpression|null Lazily built condition matching old_text pointers into $clusters */
	public $textClause;
	/** @var bool Whether orphan blobs are searched for; true only when gmp is loaded */
	public $doBlobOrphans;
	/** @var array Map of cluster name to a GMP bitfield with one bit set per tracked blob ID */
	public $trackedBlobs = [];

	/** @var int Maximum number of rows selected per query */
	public $batchSize = 1000;
	/** @var int Number of batches between progress reports */
	public $reportingInterval = 10;

	public function __construct() {
		parent::__construct();

		$this->addArg( 'cluster', 'cluster(s) to scan', true, true );

		$this->addDescription(
			'Adds blobs from a given ES cluster to the blob_tracking table. ' .
			'Automatically deletes the tracking table and starts from the start again when restarted.'
		);
	}

	/**
	 * Run all passes: integrity check, table reset, revision scan, orphan text
	 * scan and, if gmp is available, the orphan blob scan.
	 */
	public function execute() {
		$this->clusters = $this->parameters->getArgs();
		// Always store a real boolean so the flag matches its documented type.
		$this->doBlobOrphans = extension_loaded( 'gmp' );
		if ( $this->doBlobOrphans ) {
			foreach ( $this->clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}

		$this->checkIntegrity();
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
		$this->output( "All done.\n" );
	}

	/**
	 * Abort with exit status 1 if the text table still holds locally stored
	 * HistoryBlobStub objects.
	 */
	private function checkIntegrity() {
		echo "Doing integrity check...\n";
		$dbr = $this->getReplicaDB();

		// Scan for HistoryBlobStub objects in the text table (T22757)

		$exists = (bool)$dbr->newSelectQueryBuilder()
			->select( '1' )
			->from( 'text' )
			->where(
				'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
				'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'' )
			->caller( __METHOD__ )->fetchField();

		if ( $exists ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
				"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
				"to fix this.\n";
			exit( 1 );
		}

		echo "Integrity check OK\n";
	}

	/**
	 * Drop any existing tracking tables and recreate them from blob_tracking.sql.
	 */
	private function initTrackingTable() {
		$dbw = $this->getDB( DB_PRIMARY );
		// Check each table on its own: after an interrupted run only one of the
		// two may exist, and an unconditional DROP of a missing table would fail.
		foreach ( [ 'blob_tracking', 'blob_orphans' ] as $table ) {
			if ( $dbw->tableExists( $table, __METHOD__ ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
			}
		}
		// NOTE(review): blob_tracking.sql is presumed to create both tables -- confirm.
		$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
	}

	/**
	 * Build (once) and return the condition matching text rows whose old_text
	 * starts with "DB://<cluster>/" for any of the requested clusters.
	 *
	 * @return IExpression
	 */
	private function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = $this->getReplicaDB();
			$conds = [];
			foreach ( $this->clusters as $cluster ) {
				$conds[] = $dbr->expr(
					'old_text',
					IExpression::LIKE,
					new LikeValue( "DB://$cluster/", $dbr->anyString() )
				);
			}
			$this->textClause = $dbr->orExpr( $conds );
		}

		return $this->textClause;
	}

	/**
	 * Split an external storage pointer of the form DB://cluster/id[/hash].
	 *
	 * @param string $text Pointer as stored in old_text
	 * @return array|false Array with keys 'cluster', 'id' (int) and 'hash'
	 *  (string, or null when the pointer has no hash part); false if $text does
	 *  not match the expected form
	 */
	private function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}

		return [
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			'hash' => $m[3] ?? null
		];
	}

	/**
	 * Scan the revision table for rows stored in the specified clusters
	 */
	private function trackRevisions() {
		$dbw = $this->getPrimaryDB();
		$dbr = $this->getReplicaDB();

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = (int)$dbr->newSelectQueryBuilder()
			->select( 'MAX(rev_id)' )
			->from( 'revision' )
			->caller( __METHOD__ )->fetchField();
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		$conds = [
			$textClause,
			$dbr->expr(
				'old_flags',
				IExpression::LIKE,
				new LikeValue( $dbr->anyString(), 'external', $dbr->anyString() )
			)
		];
		$slotRoleStore = $this->getServiceContainer()->getSlotRoleStore();

		$conds = array_merge( [
			'slot_role_id' => $slotRoleStore->getId( SlotRecord::MAIN ),
			'SUBSTRING(content_address, 1, 3)=' . $dbr->addQuotes( 'tt:' ),
		], $conds );

		while ( true ) {
			$res = $dbr->newSelectQueryBuilder()
				->select( [ 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ] )
				->from( 'revision' )
				->join( 'slots', null, 'rev_id=slot_revision_id' )
				->join( 'content', null, 'content_id=slot_content_id' )
				->join( 'text', null, 'SUBSTRING(content_address, 4)=old_id' )
				->where( $dbr->expr( 'rev_id', '>', $startId ) )
				->andWhere( $conds )
				->orderBy( 'rev_id' )
				->limit( $this->batchSize )
				->caller( __METHOD__ )->fetchResultSet();
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = [];
			foreach ( $res as $row ) {
				// Advance the cursor even for rejected rows so the scan always progresses.
				$startId = (int)$row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = [
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				];
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// Every row of the batch may have been rejected above; the insert
			// builder does not accept an empty row set.
			if ( $insertBatch ) {
				$dbw->newInsertQueryBuilder()
					->insertInto( 'blob_tracking' )
					->rows( $insertBatch )
					->caller( __METHOD__ )->execute();
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				$this->waitForReplication();
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for orphan text
	 * Orphan text here does not imply DB corruption -- deleted text tracked by the
	 * archive table counts as orphan for our purposes.
	 */
	private function trackOrphanText() {
		# Wait until the blob_tracking table is available in the replica DB
		$dbw = $this->getPrimaryDB();
		$dbr = $this->getReplicaDB();
		$this->getServiceContainer()->getDBLoadBalancerFactory()->waitForReplication( [ 'timeout' => 100_000 ] );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = (int)$dbr->newSelectQueryBuilder()
			->select( 'MAX(old_id)' )
			->from( 'text' )
			->caller( __METHOD__ )->fetchField();
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			$res = $dbr->newSelectQueryBuilder()
				->select( [ 'old_id', 'old_flags', 'old_text' ] )
				->from( 'text' )
				->leftJoin( 'blob_tracking', null, 'bt_text_id=old_id' )
				->where( [
					$dbr->expr( 'old_id', '>', $startId ),
					$textClause,
					$dbr->expr(
						'old_flags',
						IExpression::LIKE,
						new LikeValue( $dbr->anyString(), 'external', $dbr->anyString() )
					),
					// Only text rows the revision pass did not already record
					'bt_text_id' => null,
				] )
				->orderBy( 'old_id' )
				->limit( $this->batchSize )
				->caller( __METHOD__ )->fetchResultSet();

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = [];
			foreach ( $res as $row ) {
				// Advance the cursor even for rejected rows so the scan always progresses.
				$startId = (int)$row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query\n";
					continue;
				}

				$insertBatch[] = [
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				];
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// Every row of the batch may have been rejected above; the insert
			// builder does not accept an empty row set.
			if ( $insertBatch ) {
				$dbw->newInsertQueryBuilder()
					->insertInto( 'blob_tracking' )
					->rows( $insertBatch )
					->caller( __METHOD__ )->execute();
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				$this->waitForReplication();
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * Scan the blobs table for rows not registered in blob_tracking (and thus not
	 * registered in the text table).
	 *
	 * Orphan blobs are indicative of DB corruption. They are inaccessible and
	 * should probably be deleted.
	 */
	private function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";

			return;
		}

		$dbw = $this->getPrimaryDB();
		$lbFactory = $this->getServiceContainer()->getDBLoadBalancerFactory();
		$dbStore = $this->getServiceContainer()->getExternalStoreFactory()->getStore( 'DB' );
		'@phan-var ExternalStoreDB $dbStore'; /** @var ExternalStoreDB $dbStore */

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = $lbFactory->getExternalLB( $cluster );
			try {
				$extDB = $lb->getMaintenanceConnectionRef( DB_REPLICA );
			} catch ( DBConnectionError $e ) {
				if ( strpos( $e->getMessage(), 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			$table = $dbStore->getTable( $cluster );
			if ( !$extDB->tableExists( $table, __METHOD__ ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			$endId = (int)$extDB->newSelectQueryBuilder()
				->select( 'MAX(blob_id)' )
				->from( $table )
				->caller( __METHOD__ )->fetchField();

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->newSelectQueryBuilder()
					->select( [ 'blob_id' ] )
					->from( $table )
					->where( $extDB->expr( 'blob_id', '>', $startId ) )
					->orderBy( 'blob_id' )
					->limit( $this->batchSize )
					->caller( __METHOD__ )->fetchResultSet();

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					// Cast once: result fields are not guaranteed to be ints, and
					// gmp_setbit() takes an integer bit index.
					$startId = (int)$row->blob_id;
					gmp_setbit( $actualBlobs, $startId );
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = [];
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// gmp_scan1() returns the index of the next set bit at or after
				// $id, or -1 when there is none.
				$id = gmp_scan1( $orphans, $id );
				if ( $id === -1 ) {
					break;
				}
				$insertBatch[] = [
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				];
				// Flush once the batch holds exactly batchSize rows.
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->newInsertQueryBuilder()
						->insertInto( 'blob_orphans' )
						->rows( $insertBatch )
						->caller( __METHOD__ )->execute();
					$insertBatch = [];
				}

				++$id;
				++$numOrphans;
			}
			if ( $insertBatch ) {
				$dbw->newInsertQueryBuilder()
					->insertInto( 'blob_orphans' )
					->rows( $insertBatch )
					->caller( __METHOD__ )->execute();
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}
427 | |
428 | // @codeCoverageIgnoreStart |
429 | $maintClass = TrackBlobs::class; |
430 | require_once RUN_MAINTENANCE_IF_MAIN; |
431 | // @codeCoverageIgnoreEnd |