Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 264 |
|
0.00% |
0 / 5 |
CRAP | |
0.00% |
0 / 1 |
CompressOld | |
0.00% |
0 / 264 |
|
0.00% |
0 / 5 |
1980 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 38 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
56 | |||
compressOldPages | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
12 | |||
compressPage | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
42 | |||
compressWithConcat | |
0.00% |
0 / 156 |
|
0.00% |
0 / 1 |
756 |
1 | <?php |
2 | /** |
3 | * Compress the text of a wiki. |
4 | * |
5 | * Usage: |
6 | * |
7 | * Non-wikimedia |
8 | * php compressOld.php [options...] |
9 | * |
10 | * Wikimedia |
11 | * php compressOld.php <database> [options...] |
12 | * |
13 | * Options are: |
14 | * -t <type> set compression type to either: |
15 | * gzip: compress revisions independently |
16 | * concat: concatenate revisions and compress in chunks (default) |
17 | * -c <chunk-size> maximum number of revisions in a concat chunk |
18 | * -b <begin-date> earliest date to check for uncompressed revisions |
19 | * -e <end-date> latest revision date to compress |
20 | * -s <startid> the id to start from (referring to the text table for |
21 | * type gzip, and to the page table for type concat) |
22 | * -n <endid> the page_id to stop at (only when using concat compression type) |
23 | * --extdb <cluster> store specified revisions in an external cluster (untested) |
24 | * |
25 | * This program is free software; you can redistribute it and/or modify |
26 | * it under the terms of the GNU General Public License as published by |
27 | * the Free Software Foundation; either version 2 of the License, or |
28 | * (at your option) any later version. |
29 | * |
30 | * This program is distributed in the hope that it will be useful, |
31 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
32 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
33 | * GNU General Public License for more details. |
34 | * |
35 | * You should have received a copy of the GNU General Public License along |
36 | * with this program; if not, write to the Free Software Foundation, Inc., |
37 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
38 | * http://www.gnu.org/copyleft/gpl.html |
39 | * |
40 | * @file |
41 | * @ingroup Maintenance ExternalStorage |
42 | */ |
43 | use MediaWiki\Revision\SlotRecord; |
44 | use MediaWiki\Title\Title; |
45 | use Wikimedia\Rdbms\IExpression; |
46 | use Wikimedia\Rdbms\LikeValue; |
47 | |
48 | // @codeCoverageIgnoreStart |
49 | require_once __DIR__ . '/../Maintenance.php'; |
50 | // @codeCoverageIgnoreEnd |
51 | |
/**
 * Maintenance script that compresses the text of a wiki.
 *
 * Two strategies are supported, selected with --type:
 *  - gzip:   every row of the text table is deflated independently.
 *  - concat: the old revisions of each page are concatenated into
 *            ConcatenatedGzipHistoryBlob chunks (the default).
 *
 * @ingroup Maintenance ExternalStorage
 */
class CompressOld extends Maintenance {
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Compress the text of a wiki' );
		$this->addOption( 'type', 'Set compression type to either: gzip|concat', false, true, 't' );
		$this->addOption(
			'chunksize',
			'Maximum number of revisions in a concat chunk',
			false,
			true,
			'c'
		);
		$this->addOption(
			'begin-date',
			'Earliest date to check for uncompressed revisions',
			false,
			true,
			'b'
		);
		$this->addOption( 'end-date', 'Latest revision date to compress', false, true, 'e' );
		$this->addOption(
			'startid',
			'The id to start from (gzip -> text table, concat -> page table)',
			false,
			true,
			's'
		);
		$this->addOption(
			'extdb',
			'Store specified revisions in an external cluster (untested)',
			false,
			true
		);
		$this->addOption(
			'endid',
			'The page_id to stop at (only when using concat compression type)',
			false,
			true,
			'n'
		);
	}

	/**
	 * Read the command line options and run the selected compression strategy.
	 */
	public function execute() {
		global $wgDBname;
		if ( !function_exists( "gzdeflate" ) ) {
			$this->fatalError( "You must enable zlib support in PHP to compress old revisions!\n" .
				"Please see https://www.php.net/manual/en/ref.zlib.php\n" );
		}

		$type = $this->getOption( 'type', 'concat' );
		$chunkSize = $this->getOption( 'chunksize', 20 );
		$startId = $this->getOption( 'startid', 0 );
		$beginDate = $this->getOption( 'begin-date', '' );
		$endDate = $this->getOption( 'end-date', '' );
		$extDB = $this->getOption( 'extdb', '' );
		$endId = $this->getOption( 'endid', false );

		if ( !in_array( $type, [ 'concat', 'gzip' ], true ) ) {
			// Abort here: merely reporting the error would fall through to the
			// gzip branch below and rewrite the text table for an unknown type.
			$this->fatalError( "Type \"{$type}\" not supported" );
		}

		$banner = $extDB != ''
			? "Compressing database {$wgDBname} to external cluster {$extDB}\n"
			: "Compressing database {$wgDBname}\n";
		$this->output( $banner . str_repeat( '-', 76 ) . "\n\n" );

		$success = true;
		if ( $type == 'concat' ) {
			$success = $this->compressWithConcat( $startId, $chunkSize, $beginDate,
				$endDate, $extDB, $endId );
		} else {
			$this->compressOldPages( $startId, $extDB );
		}

		if ( $success ) {
			$this->output( "Done.\n" );
		}
	}

	/**
	 * Fetch the text row-by-row to 'compressPage' function for compression.
	 *
	 * Rows are read in batches of 50, ordered by old_id, until a batch comes
	 * back empty.
	 *
	 * @param int $start First old_id to look at
	 * @param string $extdb External cluster name, or '' to store locally
	 */
	private function compressOldPages( $start = 0, $extdb = '' ) {
		$chunksize = 50;
		$this->output( "Starting from old_id $start...\n" );
		$dbw = $this->getPrimaryDB();
		do {
			$res = $dbw->newSelectQueryBuilder()
				->select( [ 'old_id', 'old_flags', 'old_text' ] )
				->forUpdate()
				->from( 'text' )
				// $start originates from the command line; build the condition
				// through the expression API instead of interpolating raw SQL.
				->where( $dbw->expr( 'old_id', '>=', (int)$start ) )
				->orderBy( 'old_id' )
				->limit( $chunksize )
				->caller( __METHOD__ )->fetchResultSet();

			if ( $res->numRows() == 0 ) {
				break;
			}

			$last = $start;

			foreach ( $res as $row ) {
				$this->compressPage( $row, $extdb );
				$last = $row->old_id;
			}

			$start = $last + 1; # Deletion may leave long empty stretches
			$this->output( "$start...\n" );
		} while ( true );
	}

	/**
	 * Compress the text in gzip format.
	 *
	 * Rows whose flags already contain 'gzip' or 'object' are left untouched.
	 *
	 * @param stdClass $row Row with old_id, old_flags and old_text
	 * @param string $extdb External cluster name, or '' to store locally
	 * @return bool True if the row was rewritten, false if it was skipped or
	 *  the compressed data could not be produced/stored
	 */
	private function compressPage( $row, $extdb ) {
		if ( strpos( $row->old_flags, 'gzip' ) !== false
			|| strpos( $row->old_flags, 'object' ) !== false
		) {
			# Already compressed
			return false;
		}
		$dbw = $this->getPrimaryDB();
		$flags = $row->old_flags ? "{$row->old_flags},gzip" : "gzip";
		$compress = gzdeflate( $row->old_text );
		if ( $compress === false ) {
			// Never write a failed result over the original text.
			$this->error( "Unable to compress text in old_id {$row->old_id}" );

			return false;
		}

		# Store in external storage if required
		if ( $extdb !== '' ) {
			$esFactory = $this->getServiceContainer()->getExternalStoreFactory();
			/** @var ExternalStoreDB $storeObj */
			$storeObj = $esFactory->getStore( 'DB' );
			$compress = $storeObj->store( $extdb, $compress );
			if ( $compress === false ) {
				$this->error( "Unable to store object" );

				return false;
			}
		}

		# Update text row
		$dbw->newUpdateQueryBuilder()
			->update( 'text' )
			->set( [
				'old_flags' => $flags,
				'old_text' => $compress
			] )
			->where( [
				'old_id' => $row->old_id
			] )
			->caller( __METHOD__ )
			->execute();

		return true;
	}

	/**
	 * Compress the text in chunks after concatenating the revisions.
	 *
	 * For every page id in [$startId, $maxPageId] the non-current revisions
	 * whose text is stored in the text table (and is neither an 'object' nor
	 * an 'external' row) are grouped into chunks of at most $maxChunkSize
	 * revisions. Each chunk is written inside its own transaction.
	 *
	 * @param int $startId First page_id to process
	 * @param int $maxChunkSize Maximum number of revisions per chunk (>= 1)
	 * @param string $beginDate 14-digit timestamp, or '' for no lower bound
	 * @param string $endDate 14-digit timestamp, or '' for no upper bound
	 * @param string $extdb External cluster name, or '' to store locally
	 * @param bool|int $maxPageId Last page_id to process; false means max(page_id)
	 * @return bool False on invalid input or external storage failure
	 */
	private function compressWithConcat( $startId, $maxChunkSize, $beginDate,
		$endDate, $extdb = "", $maxPageId = false
	) {
		// A chunk size below 1 would never advance the chunk loop, and a
		// non-numeric one would break the arithmetic below.
		$requestedChunkSize = $maxChunkSize;
		$maxChunkSize = (int)$maxChunkSize;
		if ( $maxChunkSize < 1 ) {
			$this->error( "Invalid chunk size \"$requestedChunkSize\"\n" );

			return false;
		}

		$dbr = $this->getReplicaDB();
		$dbw = $this->getPrimaryDB();

		# Set up external storage
		if ( $extdb != '' ) {
			$esFactory = $this->getServiceContainer()->getExternalStoreFactory();
			/** @var ExternalStoreDB $storeObj */
			$storeObj = $esFactory->getStore( 'DB' );
		}

		$blobStore = $this->getServiceContainer()
			->getBlobStoreFactory()
			->newSqlBlobStore();

		# Get all articles by page_id
		if ( !$maxPageId ) {
			$maxPageId = $dbr->newSelectQueryBuilder()
				->select( 'max(page_id)' )
				->from( 'page' )
				->caller( __METHOD__ )->fetchField();
		}
		$this->output( "Starting from $startId of $maxPageId\n" );
		$pageConds = [];

		# For each article, get a list of revisions which fit the criteria

		# No recompression, use a condition on old_flags
		# Don't compress object type entities, because that might produce data loss when
		# overwriting bulk storage concat rows. Don't compress external references, because
		# the script doesn't yet delete rows from external storage.
		$slotRoleStore = $this->getServiceContainer()->getSlotRoleStore();
		$queryBuilderTemplate = $dbw->newSelectQueryBuilder()
			->select( [ 'rev_id', 'old_id', 'old_flags', 'old_text' ] )
			->forUpdate()
			->from( 'revision' )
			->join( 'slots', null, 'rev_id=slot_revision_id' )
			->join( 'content', null, 'content_id=slot_content_id' )
			->join( 'text', null, 'SUBSTRING(content_address, 4)=old_id' )
			->where(
				$dbr->expr(
					'old_flags',
					IExpression::NOT_LIKE,
					new LikeValue( $dbr->anyString(), 'object', $dbr->anyString() )
				)->and(
					'old_flags',
					IExpression::NOT_LIKE,
					new LikeValue( $dbr->anyString(), 'external', $dbr->anyString() )
				)
			)
			->andWhere( [
				'slot_role_id' => $slotRoleStore->getId( SlotRecord::MAIN ),
				'SUBSTRING(content_address, 1, 3)=' . $dbr->addQuotes( 'tt:' ),
			] );

		if ( $beginDate ) {
			if ( !preg_match( '/^\d{14}$/', $beginDate ) ) {
				$this->error( "Invalid begin date \"$beginDate\"\n" );

				return false;
			}
			$queryBuilderTemplate->andWhere( $dbr->expr( 'rev_timestamp', '>', $beginDate ) );
		}
		if ( $endDate ) {
			if ( !preg_match( '/^\d{14}$/', $endDate ) ) {
				$this->error( "Invalid end date \"$endDate\"\n" );

				return false;
			}
			$queryBuilderTemplate->andWhere( $dbr->expr( 'rev_timestamp', '<', $endDate ) );
		}

		for ( $pageId = $startId; $pageId <= $maxPageId; $pageId++ ) {
			$this->waitForReplication();

			# Wake up
			$dbr->ping();

			# Get the page row
			$pageRow = $dbr->newSelectQueryBuilder()
				->select( [ 'page_id', 'page_namespace', 'page_title', 'rev_timestamp' ] )
				->from( 'page' )
				->straightJoin( 'revision', null, 'page_latest = rev_id' )
				->where( $pageConds )
				->andWhere( [ 'page_id' => $pageId ] )
				->caller( __METHOD__ )->fetchRow();
			if ( $pageRow === false ) {
				continue;
			}

			# Display progress
			$titleObj = Title::makeTitle( $pageRow->page_namespace, $pageRow->page_title );
			$this->output( "$pageId\t" . $titleObj->getPrefixedDBkey() . " " );

			# Load revisions
			$queryBuilder = clone $queryBuilderTemplate;
			$revRes = $queryBuilder->where(
				[
					'rev_page' => $pageRow->page_id,
					// Don't operate on the current revision
					// Use < instead of <> in case the current revision has changed
					// since the page select, which wasn't locking
					$dbr->expr( 'rev_timestamp', '<', (int)$pageRow->rev_timestamp ),
				] )
				->caller( __METHOD__ )->fetchResultSet();

			$revs = [];
			foreach ( $revRes as $revRow ) {
				$revs[] = $revRow;
			}
			$revCount = count( $revs );

			if ( $revCount < 2 ) {
				# No revisions matching, no further processing
				$this->output( "\n" );
				continue;
			}

			# For each chunk
			$i = 0;
			while ( $i < $revCount ) {
				$thisChunkSize = min( $maxChunkSize, $revCount - $i );

				$chunk = new ConcatenatedGzipHistoryBlob();
				$stubs = [];
				$this->beginTransaction( $dbw, __METHOD__ );
				$usedChunk = false;
				$headUnreadable = false;
				$primaryOldid = $revs[$i]->old_id;

				# Get the text of each revision and add it to the object
				for ( $j = 0; $j < $thisChunkSize && $chunk->isHappy(); $j++ ) {
					$oldid = $revs[$i + $j]->old_id;

					# Get text. We do not need the full `extractBlob` since the query is built
					# to fetch non-externalstore blobs.
					$text = $blobStore->decompressData(
						$revs[$i + $j]->old_text,
						explode( ',', $revs[$i + $j]->old_flags )
					);

					if ( $text === false ) {
						$this->error( "\nError, unable to get text in old_id $oldid" );
						if ( $extdb == "" && $j == 0 ) {
							// This row would become the chunk's container and be
							// overwritten; leave the unreadable row untouched.
							$headUnreadable = true;
							break;
						}
						// Unreadable member revision: leave its row as it is.
						$stubs[$j] = false;
						$this->output( 'x' );
						continue;
					}

					if ( $extdb == "" && $j == 0 ) {
						$chunk->setText( $text );
						$this->output( '.' );
					} else {
						# Don't make a stub if it's going to be longer than the article
						# Stubs are typically about 100 bytes
						if ( strlen( $text ) < 120 ) {
							$stub = false;
							$this->output( 'x' );
						} else {
							$stub = new HistoryBlobStub( $chunk->addItem( $text ) );
							$stub->setLocation( $primaryOldid );
							$stub->setReferrer( $oldid );
							$this->output( '.' );
							$usedChunk = true;
						}
						$stubs[$j] = $stub;
					}
				}

				if ( $headUnreadable ) {
					// Nothing has been written in this transaction; retry with
					// the next revision as the chunk head.
					$this->rollbackTransaction( $dbw, __METHOD__ );
					$this->output( "/" );
					$i++;
					continue;
				}

				// The loop may have stopped early because the chunk filled up.
				$thisChunkSize = $j;

				# If we couldn't actually use any stubs because the pages were too small, do nothing
				if ( $usedChunk ) {
					if ( $extdb != "" ) {
						# Move blob objects to External Storage
						// @phan-suppress-next-line PhanPossiblyUndeclaredVariable storeObj is set when used
						$stored = $storeObj->store( $extdb, serialize( $chunk ) );
						if ( $stored === false ) {
							$this->error( "Unable to store object" );
							// Do not leave the chunk transaction open on the way out.
							$this->rollbackTransaction( $dbw, __METHOD__ );

							return false;
						}
						# Store External Storage URLs instead of Stub placeholders
						foreach ( $stubs as $stub ) {
							if ( $stub === false ) {
								continue;
							}
							# $stored should provide base path to a BLOB
							$url = $stored . "/" . $stub->getHash();
							$dbw->newUpdateQueryBuilder()
								->update( 'text' )
								->set( [
									'old_text' => $url,
									'old_flags' => 'external,utf-8',
								] )
								->where( [
									'old_id' => $stub->getReferrer(),
								] )
								->caller( __METHOD__ )
								->execute();
						}
					} else {
						# Store the main object locally
						$dbw->newUpdateQueryBuilder()
							->update( 'text' )
							->set( [
								'old_text' => serialize( $chunk ),
								'old_flags' => 'object,utf-8',
							] )
							->where( [
								'old_id' => $primaryOldid
							] )
							->caller( __METHOD__ )
							->execute();

						# Store the stub objects
						for ( $j = 1; $j < $thisChunkSize; $j++ ) {
							# Skip if not compressing and don't overwrite the first revision
							if ( $stubs[$j] !== false && $revs[$i + $j]->old_id != $primaryOldid ) {
								$dbw->newUpdateQueryBuilder()
									->update( 'text' )
									->set( [
										'old_text' => serialize( $stubs[$j] ),
										'old_flags' => 'object,utf-8',
									] )
									->where( [
										'old_id' => $revs[$i + $j]->old_id
									] )
									->caller( __METHOD__ )
									->execute();
							}
						}
					}
				}
				# Done, next
				$this->output( "/" );
				$this->commitTransaction( $dbw, __METHOD__ );
				$i += $thisChunkSize;
			}
			$this->output( "\n" );
		}

		return true;
	}
}
489 | |
490 | // @codeCoverageIgnoreStart |
// Script bootstrap: names CompressOld as the maintenance class, then includes the file
// referenced by RUN_MAINTENANCE_IF_MAIN (presumably the runner that instantiates
// $maintClass when this file is invoked directly -- defined outside this file, confirm there).
491 | $maintClass = CompressOld::class; |
492 | require_once RUN_MAINTENANCE_IF_MAIN; |
493 | // @codeCoverageIgnoreEnd |