MediaWiki  1.23.13
compressOld.php
Go to the documentation of this file.
1 <?php
44 require_once __DIR__ . '/../Maintenance.php';
45 
/**
 * Maintenance script that compresses the text of a wiki.
 *
 * Two strategies are available, selected with --type:
 *  - "gzip":   every row of the text table is deflated on its own.
 *  - "concat": old revisions of a page are packed together, chunk by chunk,
 *              into ConcatenatedGzipHistoryBlob objects; the other rows of a
 *              chunk are replaced by HistoryBlobStub pointers.
 *
 * With --extdb the compressed data is handed to ExternalStoreDB and the text
 * row only keeps the address returned by the store.
 */
class CompressOld extends Maintenance {
	/** Load style: fetch every text row with its own SELECT. */
	const LS_INDIVIDUAL = 0;
	/** Load style: fetch the text rows together with the revision rows. */
	const LS_CHUNKED = 1;

	/**
	 * Declare the description and the command line options of the script.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Compress the text of a wiki';
		$this->addOption( 'type', 'Set compression type to either: gzip|concat', false, true, 't' );
		$this->addOption( 'chunksize', 'Maximum number of revisions in a concat chunk', false, true, 'c' );
		$this->addOption( 'begin-date', 'Earliest date to check for uncompressed revisions', false, true, 'b' );
		$this->addOption( 'end-date', 'Latest revision date to compress', false, true, 'e' );
		$this->addOption( 'startid', 'The id to start from (gzip -> text table, concat -> page table)', false, true, 's' );
		$this->addOption( 'extdb', 'Store specified revisions in an external cluster (untested)', false, true );
		$this->addOption( 'endid', 'The page_id to stop at (only when using concat compression type)', false, true, 'n' );
	}

	/**
	 * Read the options, print a banner and run the selected compression.
	 *
	 * Aborts when zlib is unavailable or when --type is neither "concat"
	 * nor "gzip".
	 */
	public function execute() {
		# Only needed for the banner below, but it has to be imported:
		# without this the interpolated name is an undefined local.
		global $wgDBname;

		if ( !function_exists( "gzdeflate" ) ) {
			$this->error( "You must enable zlib support in PHP to compress old revisions!\n" .
				"Please see http://www.php.net/manual/en/ref.zlib.php\n", true );
		}

		$type = $this->getOption( 'type', 'concat' );
		$chunkSize = $this->getOption( 'chunksize', 20 );
		$startId = $this->getOption( 'startid', 0 );
		$beginDate = $this->getOption( 'begin-date', '' );
		$endDate = $this->getOption( 'end-date', '' );
		$extDB = $this->getOption( 'extdb', '' );
		$endId = $this->getOption( 'endid', false );

		if ( $type != 'concat' && $type != 'gzip' ) {
			# Fatal: otherwise the dispatch below would fall into its else
			# branch and gzip the text table for any unknown type.
			$this->error( "Type \"{$type}\" not supported", true );
		}

		if ( $extDB != '' ) {
			$this->output( "Compressing database {$wgDBname} to external cluster {$extDB}\n"
				. str_repeat( '-', 76 ) . "\n\n" );
		} else {
			$this->output( "Compressing database {$wgDBname}\n"
				. str_repeat( '-', 76 ) . "\n\n" );
		}

		$success = true;
		if ( $type == 'concat' ) {
			$success = $this->compressWithConcat( $startId, $chunkSize, $beginDate,
				$endDate, $extDB, $endId );
		} else {
			$this->compressOldPages( $startId, $extDB );
		}

		if ( $success ) {
			$this->output( "Done.\n" );
		}
	}

	/**
	 * Gzip the rows of the text table in batches of 50, in old_id order.
	 *
	 * @param int $start First old_id to look at
	 * @param string $extdb External cluster name, or '' to keep the data local
	 */
	private function compressOldPages( $start = 0, $extdb = '' ) {
		$chunksize = 50;
		# The id arrives from the command line as a string; force it to an
		# integer before it is interpolated into the WHERE clause.
		$start = intval( $start );
		$this->output( "Starting from old_id $start...\n" );
		$dbw = wfGetDB( DB_MASTER );
		do {
			$res = $dbw->select( 'text', array( 'old_id', 'old_flags', 'old_text' ),
				"old_id>=$start", __METHOD__,
				array( 'ORDER BY' => 'old_id', 'LIMIT' => $chunksize, 'FOR UPDATE' ) );
			if ( $res->numRows() == 0 ) {
				break;
			}
			$last = $start;
			foreach ( $res as $row ) {
				# print " {$row->old_id} - {$row->old_namespace}:{$row->old_title}\n";
				$this->compressPage( $row, $extdb );
				$last = $row->old_id;
			}
			$start = $last + 1; # Deletion may leave long empty stretches
			$this->output( "$start...\n" );
		} while ( true );
	}

	/**
	 * Deflate a single text row and write it back.
	 *
	 * Rows whose flags already contain "gzip", "object" or "external" are
	 * left untouched.
	 *
	 * @param stdClass $row Row with old_id, old_flags and old_text
	 * @param string $extdb External cluster name, or '' to keep the data local
	 * @return bool True if the row was updated, false if it was skipped or
	 *   the external store refused the object
	 */
	private function compressPage( $row, $extdb ) {
		# "external" rows hold an address rather than text; deflating that
		# address would make the blob unreachable. The concat path excludes
		# such rows for the same reason.
		if ( false !== strpos( $row->old_flags, 'gzip' )
			|| false !== strpos( $row->old_flags, 'object' )
			|| false !== strpos( $row->old_flags, 'external' )
		) {
			#print "Already compressed row {$row->old_id}\n";
			return false;
		}
		$dbw = wfGetDB( DB_MASTER );
		$flags = $row->old_flags ? "{$row->old_flags},gzip" : "gzip";
		$compress = gzdeflate( $row->old_text );

		# Store in external storage if required
		if ( $extdb !== '' ) {
			$storeObj = new ExternalStoreDB;
			$compress = $storeObj->store( $extdb, $compress );
			if ( $compress === false ) {
				$this->error( "Unable to store object" );
				return false;
			}
			# old_text now carries what store() returned instead of the
			# deflated data, so flag the row the way the concat path does.
			$flags .= ',external';
		}

		# Update text row
		$dbw->update( 'text',
			array( /* SET */
				'old_flags' => $flags,
				'old_text' => $compress
			), array( /* WHERE */
				'old_id' => $row->old_id
			), __METHOD__,
			array( 'LIMIT' => 1 )
		);
		return true;
	}

	/**
	 * Pack the old revisions of every page into concatenated gzip blobs.
	 *
	 * Pages are visited by page_id from $startId up to $maxPageId. For each
	 * page the non-current revisions matching the date and flag conditions
	 * are split into chunks; each chunk is written inside its own
	 * transaction.
	 *
	 * @param int $startId First page_id to process
	 * @param int $maxChunkSize Maximum number of revisions per chunk (>= 1)
	 * @param string $beginDate 14-digit timestamp; only later revisions are
	 *   considered. Empty string for no lower bound.
	 * @param string $endDate 14-digit timestamp; only earlier revisions are
	 *   considered. Empty string for no upper bound.
	 * @param string $extdb External cluster name, or '' to keep the data local
	 * @param int|bool $maxPageId Last page_id to process, or false to use
	 *   the highest page_id in the page table
	 * @return bool False on an invalid argument or a failed external store,
	 *   true otherwise
	 */
	private function compressWithConcat( $startId, $maxChunkSize, $beginDate,
		$endDate, $extdb = "", $maxPageId = false
	) {
		$loadStyle = self::LS_CHUNKED;

		# A chunk size below 1 would never advance the chunk loop below
		if ( intval( $maxChunkSize ) < 1 ) {
			$this->error( "Invalid chunk size \"$maxChunkSize\"\n" );
			return false;
		}
		$maxChunkSize = intval( $maxChunkSize );

		$dbr = wfGetDB( DB_SLAVE );
		$dbw = wfGetDB( DB_MASTER );

		# Set up external storage
		if ( $extdb != '' ) {
			$storeObj = new ExternalStoreDB;
		}

		# Get all articles by page_id
		if ( !$maxPageId ) {
			$maxPageId = $dbr->selectField( 'page', 'max(page_id)', '', __METHOD__ );
		}
		$this->output( "Starting from $startId of $maxPageId\n" );
		$pageConds = array();

		/*
		if ( $exclude_ns0 ) {
			print "Excluding main namespace\n";
			$pageConds[] = 'page_namespace<>0';
		}
		if ( $queryExtra ) {
			$pageConds[] = $queryExtra;
		}
		*/

		# For each article, get a list of revisions which fit the criteria

		# No recompression, use a condition on old_flags
		# Don't compress object type entities, because that might produce data loss when
		# overwriting bulk storage concat rows. Don't compress external references, because
		# the script doesn't yet delete rows from external storage.
		$conds = array(
			'old_flags NOT ' . $dbr->buildLike( $dbr->anyString(), 'object', $dbr->anyString() )
			. ' AND old_flags NOT '
			. $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ) );

		if ( $beginDate ) {
			if ( !preg_match( '/^\d{14}$/', $beginDate ) ) {
				$this->error( "Invalid begin date \"$beginDate\"\n" );
				return false;
			}
			$conds[] = "rev_timestamp>'" . $beginDate . "'";
		}
		if ( $endDate ) {
			if ( !preg_match( '/^\d{14}$/', $endDate ) ) {
				$this->error( "Invalid end date \"$endDate\"\n" );
				return false;
			}
			$conds[] = "rev_timestamp<'" . $endDate . "'";
		}
		if ( $loadStyle == self::LS_CHUNKED ) {
			# Join text to revision so that one locking query loads everything
			$tables = array( 'revision', 'text' );
			$fields = array( 'rev_id', 'rev_text_id', 'old_flags', 'old_text' );
			$conds[] = 'rev_text_id=old_id';
			$revLoadOptions = 'FOR UPDATE';
		} else {
			$tables = array( 'revision' );
			$fields = array( 'rev_id', 'rev_text_id' );
			$revLoadOptions = array();
		}

		# Don't work with current revisions
		# Don't lock the page table for update either -- TS 2006-04-04
		#$tables[] = 'page';
		#$conds[] = 'page_id=rev_page AND rev_id != page_latest';

		for ( $pageId = $startId; $pageId <= $maxPageId; $pageId++ ) {
			wfWaitForSlaves();

			# Wake up
			$dbr->ping();

			# Get the page row
			$pageRes = $dbr->select( 'page',
				array( 'page_id', 'page_namespace', 'page_title', 'page_latest' ),
				$pageConds + array( 'page_id' => $pageId ), __METHOD__ );
			if ( $pageRes->numRows() == 0 ) {
				continue;
			}
			$pageRow = $dbr->fetchObject( $pageRes );

			# Display progress
			$titleObj = Title::makeTitle( $pageRow->page_namespace, $pageRow->page_title );
			$this->output( "$pageId\t" . $titleObj->getPrefixedDBkey() . " " );

			# Load revisions
			$revRes = $dbw->select( $tables, $fields,
				array_merge( array(
					'rev_page' => $pageRow->page_id,
					# Don't operate on the current revision
					# Use < instead of <> in case the current revision has changed
					# since the page select, which wasn't locking
					'rev_id < ' . $pageRow->page_latest
				), $conds ),
				__METHOD__,
				$revLoadOptions
			);
			$revs = array();
			foreach ( $revRes as $revRow ) {
				$revs[] = $revRow;
			}

			if ( count( $revs ) < 2 ) {
				# No revisions matching, no further processing
				$this->output( "\n" );
				continue;
			}

			# For each chunk
			$i = 0;
			while ( $i < count( $revs ) ) {
				if ( $i < count( $revs ) - $maxChunkSize ) {
					$thisChunkSize = $maxChunkSize;
				} else {
					$thisChunkSize = count( $revs ) - $i;
				}

				$chunk = new ConcatenatedGzipHistoryBlob();
				$stubs = array();
				$dbw->begin( __METHOD__ );
				$usedChunk = false;
				# The first text row of the chunk is the one that will hold the blob
				$primaryOldid = $revs[$i]->rev_text_id;

				# Get the text of each revision and add it to the object
				for ( $j = 0; $j < $thisChunkSize && $chunk->isHappy(); $j++ ) {
					$oldid = $revs[$i + $j]->rev_text_id;

					# Get text
					if ( $loadStyle == self::LS_INDIVIDUAL ) {
						$textRow = $dbw->selectRow( 'text',
							array( 'old_flags', 'old_text' ),
							array( 'old_id' => $oldid ),
							__METHOD__,
							'FOR UPDATE'
						);
						$text = Revision::getRevisionText( $textRow );
					} else {
						$text = Revision::getRevisionText( $revs[$i + $j] );
					}

					if ( $text === false ) {
						$this->error( "\nError, unable to get text in old_id $oldid" );
						#$dbw->delete( 'old', array( 'old_id' => $oldid ) );
					}

					if ( $extdb == "" && $j == 0 ) {
						$chunk->setText( $text );
						$this->output( '.' );
					} else {
						# Don't make a stub if it's going to be longer than the article
						# Stubs are typically about 100 bytes
						if ( strlen( $text ) < 120 ) {
							$stub = false;
							$this->output( 'x' );
						} else {
							$stub = new HistoryBlobStub( $chunk->addItem( $text ) );
							$stub->setLocation( $primaryOldid );
							$stub->setReferrer( $oldid );
							$this->output( '.' );
							$usedChunk = true;
						}
						$stubs[$j] = $stub;
					}
				}
				# The loop may have stopped early because the blob reported
				# it was no longer happy; only advance by what was consumed.
				$thisChunkSize = $j;

				# If we couldn't actually use any stubs because the pages were too small, do nothing
				if ( $usedChunk ) {
					if ( $extdb != "" ) {
						# Move blob objects to External Storage
						$stored = $storeObj->store( $extdb, serialize( $chunk ) );
						if ( $stored === false ) {
							$this->error( "Unable to store object" );
							return false;
						}
						# Store External Storage URLs instead of Stub placeholders
						foreach ( $stubs as $stub ) {
							if ( $stub === false ) {
								continue;
							}
							# $stored should provide base path to a BLOB
							$url = $stored . "/" . $stub->getHash();
							$dbw->update( 'text',
								array( /* SET */
									'old_text' => $url,
									'old_flags' => 'external,utf-8',
								), array( /* WHERE */
									'old_id' => $stub->getReferrer(),
								), __METHOD__
							);
						}
					} else {
						# Store the main object locally
						$dbw->update( 'text',
							array( /* SET */
								'old_text' => serialize( $chunk ),
								'old_flags' => 'object,utf-8',
							), array( /* WHERE */
								'old_id' => $primaryOldid
							), __METHOD__
						);

						# Store the stub objects
						for ( $j = 1; $j < $thisChunkSize; $j++ ) {
							# Skip if not compressing and don't overwrite the first revision
							if ( $stubs[$j] !== false && $revs[$i + $j]->rev_text_id != $primaryOldid ) {
								$dbw->update( 'text',
									array( /* SET */
										'old_text' => serialize( $stubs[$j] ),
										'old_flags' => 'object,utf-8',
									), array( /* WHERE */
										'old_id' => $revs[$i + $j]->rev_text_id
									), __METHOD__
								);
							}
						}
					}
				}
				# Done, next
				$this->output( "/" );
				$dbw->commit( __METHOD__ );
				$i += $thisChunkSize;
				wfWaitForSlaves();
			}
			$this->output( "\n" );
		}
		return true;
	}

}
413 
// Name of the maintenance class defined in this file.
// NOTE(review): presumably read by the script pulled in through
// RUN_MAINTENANCE_IF_MAIN to decide which class to run -- confirm in Maintenance.php.
414 $maintClass = 'CompressOld';
415 require_once RUN_MAINTENANCE_IF_MAIN;
Title\makeTitle
static & makeTitle( $ns, $title, $fragment='', $interwiki='')
Create a new Title from a namespace index and a DB key.
Definition: Title.php:398
DB_MASTER
const DB_MASTER
Definition: Defines.php:56
php
skin txt MediaWiki includes four core it has been set as the default in MediaWiki since the replacing Monobook it had been been the default skin since before being replaced by Vector largely rewritten in while keeping its appearance Several legacy skins were removed in the as the burden of supporting them became too heavy to bear Those in etc for skin dependent CSS etc for skin dependent JavaScript These can also be customised on a per user by etc This feature has led to a wide variety of user styles becoming that gallery is a good place to ending in php
Definition: skin.txt:62
ExternalStoreDB
DB accessible external objects.
Definition: ExternalStoreDB.php:31
$tables
namespace and then decline to actually register it RecentChangesLinked and Watchlist RecentChangesLinked and Watchlist e g Watchlist & $tables
Definition: hooks.txt:815
CompressOld\LS_INDIVIDUAL
const LS_INDIVIDUAL
Definition: compressOld.php:55
HistoryBlobStub
Pointer object for an item within a CGZ blob stored in the text table.
Definition: HistoryBlob.php:191
CompressOld\compressWithConcat
compressWithConcat( $startId, $maxChunkSize, $beginDate, $endDate, $extdb="", $maxPageId=false)
Definition: compressOld.php:179
$last
$last
Definition: profileinfo.php:365
wfGetDB
& wfGetDB( $db, $groups=array(), $wiki=false)
Get a Database object.
Definition: GlobalFunctions.php:3706
ConcatenatedGzipHistoryBlob
Concatenated gzip (CGZ) storage Improves compression ratio by concatenating like objects before gzipp...
Definition: HistoryBlob.php:74
Maintenance\addOption
addOption( $name, $description, $required=false, $withArg=false, $shortName=false)
Add a parameter to the script.
Definition: Maintenance.php:169
RUN_MAINTENANCE_IF_MAIN
require_once RUN_MAINTENANCE_IF_MAIN
Definition: maintenance.txt:50
CompressOld\__construct
__construct()
Default constructor.
Definition: compressOld.php:58
Revision\getRevisionText
static getRevisionText( $row, $prefix='old_', $wiki=false)
Get revision text associated with an old or archive row $row is usually an object from wfFetchRow(),...
Definition: Revision.php:1212
$maintClass
$maintClass
Definition: compressOld.php:414
Maintenance
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
Definition: maintenance.txt:39
$flags
it s the revision text itself In either if gzip is the revision text is gzipped $flags
Definition: hooks.txt:2118
CompressOld\execute
execute()
Do the actual work.
Definition: compressOld.php:70
$dbr
$dbr
Definition: testCompression.php:48
ExternalStoreDB\store
store( $cluster, $data)
Definition: ExternalStoreDB.php:89
$success
$success
Definition: Utf8Test.php:91
$wgDBname
controlled by $wgMainCacheType controlled by $wgParserCacheType controlled by $wgMessageCacheType If you set CACHE_NONE to one of the three control default value for MediaWiki still create a but requests to it are no ops and we always fall through to the database If the cache daemon can t be it should also disable itself fairly smoothly By $wgMemc is used but when it is $parserMemc or $messageMemc this is mentioned $wgDBname
Definition: memcached.txt:96
CompressOld\compressOldPages
compressOldPages( $start=0, $extdb='')
Definition: compressOld.php:111
CompressOld\compressPage
compressPage( $row, $extdb)
Definition: compressOld.php:138
array
the array() calling protocol came about after MediaWiki 1.4rc1.
List of Api Query prop modules.
global
when a variable name is used in a it is silently declared as a new masking the global
Definition: design.txt:93
wfWaitForSlaves
wfWaitForSlaves( $maxLag=false, $wiki=false, $cluster=false)
Modern version of wfWaitForSlaves().
Definition: GlobalFunctions.php:3851
CompressOld
Maintenance script that compresses the text of a wiki.
Definition: compressOld.php:51
HistoryBlobStub\setLocation
setLocation( $id)
Sets the location (old_id) of the main object to which this object points.
Definition: HistoryBlob.php:214
DB_SLAVE
const DB_SLAVE
Definition: Defines.php:55
Maintenance\getOption
getOption( $name, $default=null)
Get an option, or return the default.
Definition: Maintenance.php:191
as
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
Maintenance\error
error( $err, $die=0)
Throw an error to the user.
Definition: Maintenance.php:333
Maintenance\output
output( $out, $channel=null)
Throw some output to the user.
Definition: Maintenance.php:314
CompressOld\LS_CHUNKED
const LS_CHUNKED
Definition: compressOld.php:56
$res
$res
Definition: database.txt:21
$type
$type
Definition: testCompression.php:46