Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 259 |
|
0.00% |
0 / 5 |
CRAP | |
0.00% |
0 / 1 |
CompressOld | |
0.00% |
0 / 256 |
|
0.00% |
0 / 5 |
1980 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 38 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 25 |
|
0.00% |
0 / 1 |
56 | |||
compressOldPages | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
12 | |||
compressPage | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
42 | |||
compressWithConcat | |
0.00% |
0 / 150 |
|
0.00% |
0 / 1 |
756 |
1 | <?php |
2 | /** |
3 | * Compress the text of a wiki. |
4 | * |
5 | * Usage: |
6 | * |
7 | * Non-wikimedia |
8 | * php compressOld.php [options...] |
9 | * |
10 | * Wikimedia |
11 | * php compressOld.php <database> [options...] |
12 | * |
13 | * Options are: |
14 | * -t <type> set compression type to either: |
15 | * gzip: compress revisions independently |
16 | * concat: concatenate revisions and compress in chunks (default) |
17 | * -c <chunk-size> maximum number of revisions in a concat chunk |
18 | * -b <begin-date> earliest date to check for uncompressed revisions |
19 | * -e <end-date> latest revision date to compress |
20 | * -s <startid> the id to start from (referring to the text table for |
21 | * type gzip, and to the page table for type concat) |
22 | * -n <endid> the page_id to stop at (only when using concat compression type) |
23 | * --extdb <cluster> store specified revisions in an external cluster (untested) |
24 | * |
25 | * This program is free software; you can redistribute it and/or modify |
26 | * it under the terms of the GNU General Public License as published by |
27 | * the Free Software Foundation; either version 2 of the License, or |
28 | * (at your option) any later version. |
29 | * |
30 | * This program is distributed in the hope that it will be useful, |
31 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
32 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
33 | * GNU General Public License for more details. |
34 | * |
35 | * You should have received a copy of the GNU General Public License along |
36 | * with this program; if not, write to the Free Software Foundation, Inc., |
37 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
38 | * http://www.gnu.org/copyleft/gpl.html |
39 | * |
40 | * @file |
41 | * @ingroup Maintenance ExternalStorage |
42 | */ |
43 | use MediaWiki\Revision\SlotRecord; |
44 | use MediaWiki\Title\Title; |
45 | use Wikimedia\Rdbms\IExpression; |
46 | use Wikimedia\Rdbms\LikeValue; |
47 | |
48 | require_once __DIR__ . '/../Maintenance.php'; |
49 | |
50 | /** |
51 | * Maintenance script that compresses the text of a wiki. |
52 | * |
53 | * @ingroup Maintenance ExternalStorage |
54 | */ |
class CompressOld extends Maintenance {
	/**
	 * Register the script description and every command-line option it accepts.
	 *
	 * All options are optional and all of them take an argument.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Compress the text of a wiki' );
		$this->addOption( 'type', 'Set compression type to either: gzip|concat', false, true, 't' );
		$this->addOption(
			'chunksize',
			'Maximum number of revisions in a concat chunk',
			false,
			true,
			'c'
		);
		$this->addOption(
			'begin-date',
			'Earliest date to check for uncompressed revisions',
			false,
			true,
			'b'
		);
		$this->addOption( 'end-date', 'Latest revision date to compress', false, true, 'e' );
		$this->addOption(
			'startid',
			'The id to start from (gzip -> text table, concat -> page table)',
			false,
			true,
			's'
		);
		$this->addOption(
			'extdb',
			'Store specified revisions in an external cluster (untested)',
			false,
			true
		);
		$this->addOption(
			'endid',
			'The page_id to stop at (only when using concat compression type)',
			false,
			true,
			'n'
		);
	}

	/**
	 * Read and validate the options, then run the selected compression mode.
	 *
	 * Aborts via fatalError() when zlib is missing, when the compression type is
	 * not one of "concat"/"gzip", or when a concat run is given a chunk size
	 * below 1. "Done." is only printed when the selected mode reported success.
	 */
	public function execute() {
		global $wgDBname;
		if ( !function_exists( "gzdeflate" ) ) {
			$this->fatalError( "You must enable zlib support in PHP to compress old revisions!\n" .
				"Please see https://www.php.net/manual/en/ref.zlib.php\n" );
		}

		$type = $this->getOption( 'type', 'concat' );
		// Option values arrive as strings; normalise the numeric ones once here.
		$chunkSize = (int)$this->getOption( 'chunksize', 20 );
		$startId = (int)$this->getOption( 'startid', 0 );
		$beginDate = $this->getOption( 'begin-date', '' );
		$endDate = $this->getOption( 'end-date', '' );
		$extDB = $this->getOption( 'extdb', '' );
		$endId = $this->getOption( 'endid', false );

		if ( $type != 'concat' && $type != 'gzip' ) {
			// Must abort: merely logging would fall through to the gzip branch below
			// and compress the wiki with a mode the user never asked for.
			$this->fatalError( "Type \"{$type}\" not supported" );
		}

		if ( $type == 'concat' && $chunkSize < 1 ) {
			// A chunk size below 1 would make the chunk loop advance by zero forever.
			$this->fatalError( "Chunk size must be at least 1" );
		}

		if ( $extDB != '' ) {
			$this->output( "Compressing database {$wgDBname} to external cluster {$extDB}\n"
				. str_repeat( '-', 76 ) . "\n\n" );
		} else {
			$this->output( "Compressing database {$wgDBname}\n"
				. str_repeat( '-', 76 ) . "\n\n" );
		}

		$success = true;
		if ( $type == 'concat' ) {
			$success = $this->compressWithConcat( $startId, $chunkSize, $beginDate,
				$endDate, $extDB, $endId );
		} else {
			$this->compressOldPages( $startId, $extDB );
		}

		if ( $success ) {
			$this->output( "Done.\n" );
		}
	}

	/**
	 * Walk the text table in old_id order, in batches of 50 rows, and hand every
	 * row to compressPage().
	 *
	 * @param int $start First old_id to look at
	 * @param string $extdb External cluster name, or '' to keep the data in the text table
	 */
	private function compressOldPages( $start = 0, $extdb = '' ) {
		$chunksize = 50;
		$this->output( "Starting from old_id $start...\n" );
		$dbw = $this->getPrimaryDB();
		do {
			$res = $dbw->newSelectQueryBuilder()
				->select( [ 'old_id', 'old_flags', 'old_text' ] )
				->forUpdate()
				->from( 'text' )
				// Built with expr() and an int cast rather than interpolated into raw SQL
				->where( $dbw->expr( 'old_id', '>=', (int)$start ) )
				->orderBy( 'old_id' )
				->limit( $chunksize )
				->caller( __METHOD__ )->fetchResultSet();

			if ( $res->numRows() == 0 ) {
				break;
			}

			$last = $start;

			foreach ( $res as $row ) {
				$this->compressPage( $row, $extdb );
				$last = $row->old_id;
			}

			# Resume after the last id actually seen; deletion may leave long empty stretches
			$start = $last + 1;
			$this->output( "$start...\n" );
		} while ( true );
	}

	/**
	 * Compress the text of a single text-table row with gzdeflate() and write it back.
	 *
	 * Rows whose flags already contain "gzip", "object" or "external" are left alone.
	 *
	 * @param stdClass $row Row with old_id, old_flags and old_text
	 * @param string $extdb External cluster name, or '' to store the result in the text table
	 * @return bool True if the row was updated, false if it was skipped or an error occurred
	 */
	private function compressPage( $row, $extdb ) {
		if ( strpos( $row->old_flags, 'gzip' ) !== false
			|| strpos( $row->old_flags, 'object' ) !== false
			// For consistency with the concat mode, which also refuses to touch external
			// references: old_text of such a row is presumably a pointer, not the text.
			|| strpos( $row->old_flags, 'external' ) !== false
		) {
			return false;
		}
		$dbw = $this->getPrimaryDB();
		$flags = $row->old_flags ? "{$row->old_flags},gzip" : "gzip";
		$compress = gzdeflate( $row->old_text );
		if ( $compress === false ) {
			// Never overwrite the stored text with the result of a failed compression
			$this->error( "Unable to compress text in old_id {$row->old_id}" );

			return false;
		}

		# Store in external storage if required
		if ( $extdb !== '' ) {
			$esFactory = $this->getServiceContainer()->getExternalStoreFactory();
			/** @var ExternalStoreDB $storeObj */
			$storeObj = $esFactory->getStore( 'DB' );
			# From here on $compress holds whatever store() returned instead of the data
			$compress = $storeObj->store( $extdb, $compress );
			if ( $compress === false ) {
				$this->error( "Unable to store object" );

				return false;
			}
		}

		# Update text row
		$dbw->update( 'text',
			[ /* SET */
				'old_flags' => $flags,
				'old_text' => $compress
			], [ /* WHERE */
				'old_id' => $row->old_id
			], __METHOD__,
			[ 'LIMIT' => 1 ]
		);

		return true;
	}

	/**
	 * Compress the text in chunks after concatenating the revisions.
	 *
	 * For every page id in the requested range, the non-current main-slot revisions
	 * stored in the text table are loaded, grouped into chunks of at most
	 * $maxChunkSize revisions and added to one ConcatenatedGzipHistoryBlob per chunk.
	 * Depending on $extdb, the blob is either written to the first text row of the
	 * chunk (the other rows become HistoryBlobStub objects pointing at it) or pushed
	 * to external storage (the rows then hold the returned location plus item hash).
	 *
	 * @param int $startId First page_id to process
	 * @param int $maxChunkSize Maximum number of revisions per chunk; expected to be >= 1
	 * @param string $beginDate 14-digit timestamp; only revisions after it are considered ('' = no bound)
	 * @param string $endDate 14-digit timestamp; only revisions before it are considered ('' = no bound)
	 * @param string $extdb External cluster name, or '' to store the blobs in the text table
	 * @param bool|int $maxPageId Last page_id to process, or false for the highest existing one
	 * @return bool False on an invalid date or an external storage failure, true otherwise
	 */
	private function compressWithConcat( $startId, $maxChunkSize, $beginDate,
		$endDate, $extdb = "", $maxPageId = false
	) {
		$dbr = $this->getReplicaDB();
		$dbw = $this->getPrimaryDB();

		# Set up external storage
		if ( $extdb != '' ) {
			$esFactory = $this->getServiceContainer()->getExternalStoreFactory();
			/** @var ExternalStoreDB $storeObj */
			$storeObj = $esFactory->getStore( 'DB' );
		}

		$blobStore = $this->getServiceContainer()
			->getBlobStoreFactory()
			->newSqlBlobStore();

		# Get all articles by page_id
		if ( !$maxPageId ) {
			$maxPageId = $dbr->newSelectQueryBuilder()
				->select( 'max(page_id)' )
				->from( 'page' )
				->caller( __METHOD__ )->fetchField();
		}
		$this->output( "Starting from $startId of $maxPageId\n" );

		# For each article, get a list of revisions which fit the criteria

		# No recompression, use a condition on old_flags
		# Don't compress object type entities, because that might produce data loss when
		# overwriting bulk storage concat rows. Don't compress external references, because
		# the script doesn't yet delete rows from external storage.
		$slotRoleStore = $this->getServiceContainer()->getSlotRoleStore();
		# Only content addresses of the form "tt:<old_id>" are joined to the text table
		$queryBuilderTemplate = $dbw->newSelectQueryBuilder()
			->select( [ 'rev_id', 'old_id', 'old_flags', 'old_text' ] )
			->forUpdate()
			->from( 'revision' )
			->join( 'slots', null, 'rev_id=slot_revision_id' )
			->join( 'content', null, 'content_id=slot_content_id' )
			->join( 'text', null, 'SUBSTRING(content_address, 4)=old_id' )
			->where(
				$dbr->expr(
					'old_flags',
					IExpression::NOT_LIKE,
					new LikeValue( $dbr->anyString(), 'object', $dbr->anyString() )
				)->and(
					'old_flags',
					IExpression::NOT_LIKE,
					new LikeValue( $dbr->anyString(), 'external', $dbr->anyString() )
				)
			)
			->andWhere( [
				'slot_role_id' => $slotRoleStore->getId( SlotRecord::MAIN ),
				'SUBSTRING(content_address, 1, 3)=' . $dbr->addQuotes( 'tt:' ),
			] );

		if ( $beginDate ) {
			if ( !preg_match( '/^\d{14}$/', $beginDate ) ) {
				$this->error( "Invalid begin date \"$beginDate\"\n" );

				return false;
			}
			$queryBuilderTemplate->andWhere( $dbr->expr( 'rev_timestamp', '>', $beginDate ) );
		}
		if ( $endDate ) {
			if ( !preg_match( '/^\d{14}$/', $endDate ) ) {
				$this->error( "Invalid end date \"$endDate\"\n" );

				return false;
			}
			$queryBuilderTemplate->andWhere( $dbr->expr( 'rev_timestamp', '<', $endDate ) );
		}

		for ( $pageId = $startId; $pageId <= $maxPageId; $pageId++ ) {
			$this->waitForReplication();

			# Wake up
			$dbr->ping();

			# Get the page row together with the timestamp of its current revision
			$pageRow = $dbr->newSelectQueryBuilder()
				->select( [ 'page_id', 'page_namespace', 'page_title', 'rev_timestamp' ] )
				->from( 'page' )
				->straightJoin( 'revision', null, 'page_latest = rev_id' )
				->where( [ 'page_id' => $pageId ] )
				->caller( __METHOD__ )->fetchRow();
			if ( $pageRow === false ) {
				# No page with this id
				continue;
			}

			# Display progress
			$titleObj = Title::makeTitle( $pageRow->page_namespace, $pageRow->page_title );
			$this->output( "$pageId\t" . $titleObj->getPrefixedDBkey() . " " );

			# Load revisions; the template is cloned so the per-page conditions don't accumulate
			$queryBuilder = clone $queryBuilderTemplate;
			$revRes = $queryBuilder->where(
				[
					'rev_page' => $pageRow->page_id,
					// Don't operate on the current revision
					// Use < instead of <> in case the current revision has changed
					// since the page select, which wasn't locking
					'rev_timestamp < ' . (int)$pageRow->rev_timestamp
				] )
				->caller( __METHOD__ )->fetchResultSet();

			$revs = [];
			foreach ( $revRes as $revRow ) {
				$revs[] = $revRow;
			}

			if ( count( $revs ) < 2 ) {
				# Fewer than two matching revisions, nothing to concatenate
				$this->output( "\n" );
				continue;
			}

			# For each chunk
			$i = 0;
			while ( $i < count( $revs ) ) {
				$thisChunkSize = min( $maxChunkSize, count( $revs ) - $i );

				$chunk = new ConcatenatedGzipHistoryBlob();
				$stubs = [];
				$this->beginTransaction( $dbw, __METHOD__ );
				$usedChunk = false;
				$primaryOldid = $revs[$i]->old_id;

				# Get the text of each revision and add it to the object, stopping early
				# once the blob no longer reports isHappy()
				$processed = 0;
				for ( $j = 0; $j < $thisChunkSize && $chunk->isHappy(); $j++ ) {
					$processed = $j + 1;
					$oldid = $revs[$i + $j]->old_id;

					# Get text. We do not need the full `extractBlob` since the query is built
					# to fetch non-externalstore blobs.
					$text = $blobStore->decompressData(
						$revs[$i + $j]->old_text,
						explode( ',', $revs[$i + $j]->old_flags )
					);

					if ( $text === false ) {
						$this->error( "\nError, unable to get text in old_id $oldid" );
						# Leave the unreadable row exactly as it is
						$stubs[$j] = false;
						if ( $extdb == "" && $j == 0 ) {
							# This row would have become the chunk's primary row; building a
							# blob on top of it would overwrite it. Give up on this chunk and
							# start the next one right after this revision.
							break;
						}
						continue;
					}

					if ( $extdb == "" && $j == 0 ) {
						$chunk->setText( $text );
						$this->output( '.' );
					} else {
						# Don't make a stub if it's going to be longer than the article
						# Stubs are typically about 100 bytes
						if ( strlen( $text ) < 120 ) {
							$stub = false;
							$this->output( 'x' );
						} else {
							$stub = new HistoryBlobStub( $chunk->addItem( $text ) );
							$stub->setLocation( $primaryOldid );
							$stub->setReferrer( $oldid );
							$this->output( '.' );
							$usedChunk = true;
						}
						$stubs[$j] = $stub;
					}
				}
				# Number of revisions actually looked at in this chunk
				$thisChunkSize = $processed;

				# If we couldn't actually use any stubs because the pages were too small, do nothing
				if ( $usedChunk ) {
					if ( $extdb != "" ) {
						# Move blob objects to External Storage
						// @phan-suppress-next-line PhanPossiblyUndeclaredVariable storeObj is set when used
						$stored = $storeObj->store( $extdb, serialize( $chunk ) );
						if ( $stored === false ) {
							$this->error( "Unable to store object" );
							# Don't leave the transaction opened for this chunk dangling
							$this->rollbackTransaction( $dbw, __METHOD__ );

							return false;
						}
						# Store External Storage URLs instead of Stub placeholders
						foreach ( $stubs as $stub ) {
							if ( $stub === false ) {
								continue;
							}
							# $stored should provide base path to a BLOB
							$url = $stored . "/" . $stub->getHash();
							$dbw->update( 'text',
								[ /* SET */
									'old_text' => $url,
									'old_flags' => 'external,utf-8',
								], [ /* WHERE */
									'old_id' => $stub->getReferrer(),
								],
								__METHOD__
							);
						}
					} else {
						# Store the main object locally
						$dbw->update( 'text',
							[ /* SET */
								'old_text' => serialize( $chunk ),
								'old_flags' => 'object,utf-8',
							], [ /* WHERE */
								'old_id' => $primaryOldid
							],
							__METHOD__
						);

						# Store the stub objects
						for ( $j = 1; $j < $thisChunkSize; $j++ ) {
							# Skip if not compressing and don't overwrite the first revision
							if ( $stubs[$j] !== false && $revs[$i + $j]->old_id != $primaryOldid ) {
								$dbw->update( 'text',
									[ /* SET */
										'old_text' => serialize( $stubs[$j] ),
										'old_flags' => 'object,utf-8',
									], [ /* WHERE */
										'old_id' => $revs[$i + $j]->old_id
									],
									__METHOD__
								);
							}
						}
					}
				}
				# Done, next
				$this->output( "/" );
				$this->commitTransaction( $dbw, __METHOD__ );
				$i += $thisChunkSize;
			}
			$this->output( "\n" );
		}

		return true;
	}
}
479 | |
480 | $maintClass = CompressOld::class; |
481 | require_once RUN_MAINTENANCE_IF_MAIN; |