MediaWiki  master
BackupDumper.php
Go to the documentation of this file.
1 <?php
28 require_once __DIR__ . '/../Maintenance.php';
29 require_once __DIR__ . '/../../includes/export/WikiExporter.php';
30 
34 
39 abstract class BackupDumper extends Maintenance {
40  public $reporting = true;
41  public $pages = null; // all pages
42  public $skipHeader = false; // don't output <mediawiki> and <siteinfo>
43  public $skipFooter = false; // don't output </mediawiki>
44  public $startId = 0;
45  public $endId = 0;
46  public $revStartId = 0;
47  public $revEndId = 0;
48  public $dumpUploads = false;
49  public $dumpUploadFileContents = false;
50  public $orderRevs = false;
51  public $limitNamespaces = [];
52 
53  protected $reportingInterval = 100;
54  protected $pageCount = 0;
55  protected $revCount = 0;
56  protected $schemaVersion = null; // use default
57  protected $server = null; // use default
58  protected $sink = null; // Output filters
59  protected $lastTime = 0;
60  protected $pageCountLast = 0;
61  protected $revCountLast = 0;
62 
63  protected $outputTypes = [];
64  protected $filterTypes = [];
65 
66  protected $ID = 0;
67 
75  protected $forcedDb = null;
76 
78  protected $lb;
79 
/**
 * Register the built-in output sinks and filters and declare the command-line options.
 *
 * @param array|null $args Optional command-line arguments. When given (truthy), they are
 *   loaded and processed right away so that dump() can be called directly instead of
 *   execute().
 */
public function __construct( $args = null ) {
	parent::__construct();
	$this->stderr = fopen( "php://stderr", "wt" );

	// Built-in output plugins, registered in this exact order.
	$builtinOutputs = [
		'file' => DumpFileOutput::class,
		'gzip' => DumpGZipOutput::class,
		'bzip2' => DumpBZip2Output::class,
		'dbzip2' => DumpDBZip2Output::class,
		'lbzip2' => DumpLBZip2Output::class,
		'7zip' => Dump7ZipOutput::class,
	];
	foreach ( $builtinOutputs as $outputName => $outputClass ) {
		$this->registerOutput( $outputName, $outputClass );
	}

	// Built-in filter plugins.
	$builtinFilters = [
		'latest' => DumpLatestFilter::class,
		'notalk' => DumpNotalkFilter::class,
		'namespace' => DumpNamespaceFilter::class,
	];
	foreach ( $builtinFilters as $filterName => $filterClass ) {
		$this->registerFilter( $filterName, $filterClass );
	}

	// --plugin, --output and --filter may each be given several times.
	$this->addOption(
		'plugin',
		'Load a dump plugin class. Specify as <class>[:<file>].',
		false, true, false, true
	);
	$this->addOption(
		'output',
		'Begin a filtered output stream; Specify as <type>:<file>. ' .
			'<type>s: file, gzip, bzip2, 7zip, dbzip2, lbzip2',
		false, true, false, true
	);
	$this->addOption(
		'filter',
		'Add a filter on an output branch. Specify as ' .
			'<type>[:<options>]. <types>s: latest, notalk, namespace',
		false, true, false, true
	);

	// Single-occurrence options that take an argument.
	$this->addOption(
		'report',
		'Report position and speed after every n pages processed. ' .
			'Default: 100.',
		false, true
	);
	$this->addOption(
		'schema-version',
		'Schema version to use for output. ' .
			'Default: ' . WikiExporter::schemaVersion(),
		false, true
	);
	$this->addOption( 'server', 'Force reading from MySQL server', false, true );
	$this->addOption(
		'7ziplevel',
		'7zip compression level for all 7zip outputs. Used for ' .
			'-mx option to 7za command.',
		false, true
	);

	if ( !$args ) {
		return;
	}

	// Load and process the arguments now, so that dump() can be called
	// directly instead of execute().
	$this->loadWithArgv( $args );
	$this->processOptions();
}
121 
/**
 * Register an output sink class under a type name usable in --output.
 *
 * An existing registration with the same name is overwritten.
 *
 * @param string $name Type name, as used in the <type> part of --output
 * @param string $class Name of the class to instantiate for that type
 */
public function registerOutput( $name, $class ) {
	$this->outputTypes[$name] = $class;
}
129 
/**
 * Register a filter class under a type name usable in --filter.
 *
 * An existing registration with the same name is overwritten.
 *
 * @param string $name Type name, as used in the <type> part of --filter
 * @param string $class Name of the class to instantiate for that type
 */
public function registerFilter( $name, $class ) {
	$this->filterTypes[$name] = $class;
}
137 
/**
 * Load a plugin and let it register itself with this dumper.
 *
 * @param string $class Plugin class; its 'register' method is called with this dumper
 * @param string $file Optional file to require_once first; skipped when loosely equal to ''
 */
public function loadPlugin( $class, $file ) {
	// Loose comparison kept on purpose.
	$hasFile = ( $file != '' );
	if ( $hasFile ) {
		require_once $file;
	}

	$callback = [ $class, 'register' ];
	$callback( $this );
}
152 
/**
 * Placeholder entry point: this base class cannot be run directly.
 *
 * @throws MWException Always
 */
public function execute() {
	$message = 'execute() must be overridden in subclasses';
	throw new MWException( $message );
}
156 
/**
 * Process the command-line options and set $this->sink accordingly.
 *
 * --plugin, --output, --filter and --schema-version are handled in the order
 * they were passed, because each --filter wraps the most recent output sink.
 * --report and --server are read afterwards. If more than one output was
 * requested, the sinks are combined in a DumpMultiWriter.
 */
public function processOptions() {
	$sink = null;
	$sinks = [];

	$this->schemaVersion = WikiExporter::schemaVersion();

	// Iterate the options in the order they were given. This previously looped
	// over an undefined local variable.
	foreach ( $this->orderedOptions as $arg ) {
		list( $opt, $param ) = $arg;

		switch ( $opt ) {
			case 'plugin':
				// <class>[:<file>]
				$val = explode( ':', $param, 2 );
				$this->loadPlugin( $val[0], isset( $val[1] ) ? $val[1] : '' );

				break;
			case 'output':
				// <type>:<file>
				$split = explode( ':', $param, 2 );
				if ( count( $split ) !== 2 ) {
					$this->fatalError( 'Invalid output parameter' );
				}
				list( $type, $file ) = $split;
				// A new output starts a new branch; keep the finished one.
				if ( !is_null( $sink ) ) {
					$sinks[] = $sink;
				}
				if ( !isset( $this->outputTypes[$type] ) ) {
					$this->fatalError( "Unrecognized output sink type '$type'" );
				}
				$class = $this->outputTypes[$type];
				if ( $type === "7zip" ) {
					$sink = new $class( $file, intval( $this->getOption( '7ziplevel' ) ) );
				} else {
					$sink = new $class( $file );
				}

				break;
			case 'filter':
				// <type>[:<options>]; a filter without a preceding output
				// wraps a plain DumpOutput.
				if ( is_null( $sink ) ) {
					$sink = new DumpOutput();
				}

				$split = explode( ':', $param, 2 );
				$key = $split[0];

				if ( !isset( $this->filterTypes[$key] ) ) {
					$this->fatalError( "Unrecognized filter type '$key'" );
				}

				$type = $this->filterTypes[$key];

				// explode() with a limit of 2 yields one or two parts.
				if ( isset( $split[1] ) ) {
					$filter = new $type( $sink, $split[1] );
				} else {
					$filter = new $type( $sink );
				}

				// references are lame in php...
				unset( $sink );
				$sink = $filter;

				break;
			case 'schema-version':
				// Strict comparison: a loose in_array() compares numeric strings
				// numerically, so e.g. "0.1" would wrongly match "0.10".
				if ( !in_array( $param, XmlDumpWriter::$supportedSchemas, true ) ) {
					$this->fatalError(
						"Unsupported schema version $param. Supported versions: " .
						implode( ', ', XmlDumpWriter::$supportedSchemas )
					);
				}
				$this->schemaVersion = $param;
				break;
		}
	}

	if ( $this->hasOption( 'report' ) ) {
		$this->reportingInterval = intval( $this->getOption( 'report' ) );
	}

	if ( $this->hasOption( 'server' ) ) {
		$this->server = $this->getOption( 'server' );
	}

	// No --output and no --filter at all: fall back to a plain DumpOutput.
	if ( is_null( $sink ) ) {
		$sink = new DumpOutput();
	}
	$sinks[] = $sink;

	if ( count( $sinks ) > 1 ) {
		$this->sink = new DumpMultiWriter( $sinks );
	} else {
		$this->sink = $sink;
	}
}
257 
/**
 * Run the export and write it to the configured sink.
 *
 * @param int $history Export mode; tested against WikiExporter::LOGS as a bit flag
 * @param int $text Passed through to WikiExporter; defaults to WikiExporter::TEXT
 */
public function dump( $history, $text = WikiExporter::TEXT ) {
	// Notice messages will foul up the XML output even if they're
	// relatively harmless, so send them to stderr instead.
	if ( ini_get( 'display_errors' ) ) {
		ini_set( 'display_errors', 'stderr' );
	}

	$this->initProgress( $history );

	$exporter = new WikiExporter(
		$this->backupDb(),
		$history,
		$text,
		$this->limitNamespaces
	);
	$exporter->setSchemaVersion( $this->schemaVersion );
	$exporter->dumpUploads = $this->dumpUploads;
	$exporter->dumpUploadFileContents = $this->dumpUploadFileContents;

	// Wrap the sink so progress gets reported back to this dumper.
	$exporter->setOutputSink( new ExportProgressFilter( $this->sink, $this ) );

	if ( !$this->skipHeader ) {
		$exporter->openStream();
	}

	$hasIdRange = ( $this->startId || $this->endId );

	if ( $history & WikiExporter::LOGS ) {
		// Log item dumps: all or by range
		if ( $hasIdRange ) {
			$exporter->logsByRange( $this->startId, $this->endId );
		} else {
			$exporter->allLogs();
		}
	} elseif ( $this->pages !== null ) {
		// Dump of specific pages
		$exporter->pagesByName( $this->pages );
	} elseif ( $hasIdRange ) {
		// Page dump by page ID range
		$exporter->pagesByRange( $this->startId, $this->endId, $this->orderRevs );
	} elseif ( $this->revStartId || $this->revEndId ) {
		// Page dump by revision ID range
		$exporter->revsByRange( $this->revStartId, $this->revEndId );
	} else {
		$exporter->allPages();
	}

	if ( !$this->skipFooter ) {
		$exporter->closeStream();
	}

	$this->report( true );
}
306 
/**
 * Initialise the starting time and the maximum ID used for progress reports.
 *
 * The maximum is MAX(page_id) of the page table for WikiExporter::CURRENT
 * dumps, and MAX(rev_id) of the revision table otherwise.
 *
 * @param int $history Export mode; defaults to WikiExporter::FULL
 */
public function initProgress( $history = WikiExporter::FULL ) {
	$currentOnly = ( $history == WikiExporter::CURRENT );
	$table = $currentOnly ? 'page' : 'revision';
	$field = $currentOnly ? 'page_id' : 'rev_id';

	// Use the injected connection when there is one. Previously $dbr was only
	// assigned when no connection had been forced, leaving it undefined otherwise.
	$dbr = $this->forcedDb;
	if ( $dbr === null ) {
		$dbr = wfGetDB( DB_REPLICA );
	}

	$this->maxCount = $dbr->selectField( $table, "MAX($field)", '', __METHOD__ );
	$this->startTime = microtime( true );
	$this->lastTime = $this->startTime;
	$this->ID = getmypid();
}
326 
/**
 * Get the database connection to read the dump from.
 *
 * Returns the connection injected through setDB() when there is one.
 * Otherwise a new main load balancer is created, stored in $this->lb, and a
 * DB_REPLICA connection in the 'dump' group is taken from it.
 *
 * @return IDatabase
 */
public function backupDb() {
	$forced = $this->forcedDb;
	if ( $forced !== null ) {
		return $forced;
	}

	$this->lb = MediaWikiServices::getInstance()
		->getDBLoadBalancerFactory()
		->newMainLB();
	$connection = $this->lb->getConnection( DB_REPLICA, 'dump' );

	// Discourage the server from disconnecting us if it takes a long time
	// to read out the big ol' batch query.
	$sessionOptions = [ 'connTimeout' => 3600 * 24 ];
	$connection->setSessionOptions( $sessionOptions );

	return $connection;
}
348 
/**
 * Force the dump to use the provided database connection for database
 * operations, wherever possible.
 *
 * The connection is passed to the parent implementation and also remembered
 * in $this->forcedDb, which backupDb() returns in preference to opening its own.
 *
 * @param IDatabase|null $db Connection to use; null clears the forced connection
 */
public function setDB( IDatabase $db = null ) {
	parent::setDB( $db );
	$this->forcedDb = $db;
}
360 
/**
 * Close all connections of the load balancer created by backupDb(), if any.
 */
public function __destruct() {
	if ( !isset( $this->lb ) ) {
		return;
	}

	$this->lb->closeAll();
}
366 
/**
 * Get the database server to read from.
 *
 * @return string The server set via --server when truthy, otherwise $wgDBserver
 */
public function backupServer() {
	global $wgDBserver;

	if ( $this->server ) {
		return $this->server;
	}

	return $wgDBserver;
}
372 
/**
 * Count one more processed page.
 */
public function reportPage() {
	++$this->pageCount;
}
376 
/**
 * Count one more processed revision and emit a progress report if one is due.
 */
public function revCount() {
	++$this->revCount;
	$this->report();
}
381 
/**
 * Show a progress report when one is due.
 *
 * A periodic call ($final = false) reports whenever the revision count is a
 * multiple of the reporting interval. The final call reports only when the
 * count is NOT such a multiple, so the last state is not printed twice.
 *
 * An interval of zero (e.g. "--report 0" or a non-numeric value run through
 * intval()) used to trigger a modulo by zero. It now disables periodic
 * reports, and the final report is always shown.
 *
 * @param bool $final Whether this is the report at the end of the dump
 */
public function report( $final = false ) {
	// The % operator works on integers, so normalise the same way before testing.
	$interval = (int)$this->reportingInterval;

	if ( $interval === 0 ) {
		if ( $final ) {
			$this->showReport();
		}
		return;
	}

	if ( $final xor ( $this->revCount % $interval == 0 ) ) {
		$this->showReport();
	}
}
387 
/**
 * Write one progress line (counts, overall and current rates, ETA) through
 * progress(), then remember the current counters for the next report.
 *
 * Does nothing when reporting is disabled.
 */
public function showReport() {
	if ( !$this->reporting ) {
		return;
	}

	$now = wfTimestamp( TS_DB );
	$nowts = microtime( true );
	$deltaAll = $nowts - $this->startTime;
	$deltaPart = $nowts - $this->lastTime;
	$this->pageCountPart = $this->pageCount - $this->pageCountLast;
	$this->revCountPart = $this->revCount - $this->revCountLast;

	// Overall rates since the start of the dump.
	if ( $deltaAll ) {
		$pageRate = $this->pageCount / $deltaAll;
		$revRate = $this->revCount / $deltaAll;
	} else {
		$pageRate = '-';
		$revRate = '-';
	}

	// The ETA needs a non-zero share of the work done. With no revisions
	// processed yet, or no known maximum, the old code divided by zero.
	$etats = '-';
	if ( $deltaAll && $this->revCount > 0 && $this->maxCount > 0 ) {
		$portion = $this->revCount / $this->maxCount;
		$eta = $this->startTime + $deltaAll / $portion;
		$etats = wfTimestamp( TS_DB, intval( $eta ) );
	}

	// Rates since the previous report.
	if ( $deltaPart ) {
		$pageRatePart = $this->pageCountPart / $deltaPart;
		$revRatePart = $this->revCountPart / $deltaPart;
	} else {
		$pageRatePart = '-';
		$revRatePart = '-';
	}

	$this->progress( sprintf(
		"%s: %s (ID %d) %d pages (%0.1f|%0.1f/sec all|curr), "
			. "%d revs (%0.1f|%0.1f/sec all|curr), ETA %s [max %d]",
		$now, wfWikiID(), $this->ID, $this->pageCount, $pageRate,
		$pageRatePart, $this->revCount, $revRate, $revRatePart, $etats,
		$this->maxCount
	) );

	$this->lastTime = $nowts;
	$this->revCountLast = $this->revCount;
	// Previously never updated, so the "curr" page rate was computed from the
	// total page count instead of the pages since the last report.
	$this->pageCountLast = $this->pageCount;
}
426 
/**
 * Write a progress message, followed by a newline, to stderr.
 *
 * Does nothing when reporting is disabled.
 *
 * @param string $string Message to write
 */
public function progress( $string ) {
	if ( !$this->reporting ) {
		return;
	}

	fwrite( $this->stderr, $string . "\n" );
}
432 }
setDB(IDatabase $db=null)
Force the dump to use the provided database connection for database operations, wherever possible...
processOptions()
Processes arguments and sets $this->sink accordingly.
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
dump( $history, $text=WikiExporter::TEXT)
LoadBalancer $lb
setSessionOptions(array $options)
Override database's default behavior.
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for use
getOption( $name, $default=null)
Get an option, or return the default.
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Definition: router.php:42
progress( $string)
array $orderedOptions
Used to read the options in the order they were passed.
registerFilter( $name, $class)
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
Definition: maintenance.txt:39
__construct( $args=null)
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
$wgDBserver
Database host name or IP address.
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency MediaWikiServices
Definition: injection.txt:23
hasOption( $name)
Checks to see if a particular option exists.
loadWithArgv( $argv)
Load params and arguments from a given array of command-line arguments.
if( $line===false) $args
Definition: cdb.php:64
loadPlugin( $class, $file)
Load a plugin and register it.
initProgress( $history=WikiExporter::FULL)
Initialise starting time and maximum revision count.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped & $options
Definition: hooks.txt:1982
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not null
Definition: hooks.txt:780
$filter
wfWikiID()
Get an ASCII string identifying this wiki This is used as a prefix in memcached keys.
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
The ContentHandler facility adds support for arbitrary content types on wiki pages
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:38
you have access to all of the normal MediaWiki so you can get a DB use the etc For full docs on the Maintenance class
Definition: maintenance.txt:52
IDatabase null $forcedDb
The dependency-injected database to use.
report( $final=false)
static schemaVersion()
Returns the default export schema version, as defined by $wgXmlDumpSchemaVersion. ...
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such and we might be restricted by PHP settings such as safe mode or open_basedir We cannot assume that the software even has read access anywhere useful Many shared hosts run all users web applications under the same so they can t rely on Unix and must forbid reads to even standard directories like tmp lest users read each others files We cannot assume that the user has the ability to install or run any programs not written as web accessible PHP scripts Since anything that works on cheap shared hosting will work if you have shell or root access MediaWiki s design is based around catering to the lowest common denominator Although we support higher end setups as the way many things work by default is tailored toward shared hosting These defaults are unconventional from the point of view of and they certainly aren t ideal for someone who s installing MediaWiki as MediaWiki does not conform to normal Unix filesystem layout Hopefully we ll offer direct support for standard layouts in the but for now *any change to the location of files is unsupported *Moving things and leaving symlinks will *probably *not break but it is *strongly *advised not to try any more intrusive changes to get MediaWiki to conform more closely to your filesystem hierarchy Any such attempt will almost certainly result in unnecessary bugs The standard recommended location to install relative to the web is it should be possible to enable the appropriate rewrite rules by if you can reconfigure the web server
static string [] $supportedSchemas
the schema versions supported for output
Allows to change the fields on the form that will be generated $name
Definition: hooks.txt:271
const DB_REPLICA
Definition: defines.php:25
registerOutput( $name, $class)
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.