MediaWiki  master
WikiExporter.php
Go to the documentation of this file.
1 <?php
40 
44 class WikiExporter {
/** @var bool Return distinct author list (when not returning full history) */
public $list_authors = false;

/** @var bool Whether upload records are written before each page is closed */
public $dumpUploads = false;

/** @var bool Passed through to XmlDumpWriter::writeUploads() */
public $dumpUploadFileContents = false;

/** @var string XML fragment built by do_list_authors(), appended before the final page is closed */
public $author_list = "";

// History selection flags, tested bitwise in dumpFrom() and dumpPages().
public const FULL = 1;
public const CURRENT = 2;
public const STABLE = 4; // extension defined
public const LOGS = 8;
public const RANGE = 16;

// Text modes handed to XmlDumpWriter. They are the default for the
// constructor's $text parameter and are compared against in dumpPages().
/** Output serialized revision content. */
public const TEXT = XmlDumpWriter::WRITE_CONTENT;
/** Only output stubs for revision content. */
public const STUB = XmlDumpWriter::WRITE_STUB;

/** Maximum number of rows requested per database query. */
protected const BATCH_SIZE = 50000;

/** @var mixed Text mode; one of self::TEXT or self::STUB */
public $text;

/** @var DumpOutput */
public $sink;

/** @var XmlDumpWriter */
private $writer;

/** @var IDatabase */
protected $db;

/** @var array|int */
protected $history;

/** @var array|null */
protected $limitNamespaces;

/** @var RevisionStore */
private $revisionStore;

/** @var TitleParser */
private $titleParser;

/** @var HookRunner */
private $hookRunner;
94 
/**
 * Returns the default export schema version, as defined by the
 * XmlDumpSchemaVersion setting.
 *
 * @return mixed Value of the XmlDumpSchemaVersion configuration setting
 */
public static function schemaVersion() {
	$mainConfig = MediaWikiServices::getInstance()->getMainConfig();

	return $mainConfig->get( MainConfigNames::XmlDumpSchemaVersion );
}
103 
/**
 * @param IDatabase $db Database handle used for all export queries
 * @param HookContainer $hookContainer Wrapped in a HookRunner for the export hooks
 * @param RevisionStore $revisionStore
 * @param TitleParser $titleParser
 * @param int|array $history One of the history flag constants, or an array
 *   with 'dir', 'offset' and 'limit' keys (see dumpPages())
 * @param mixed $text One of self::TEXT or self::STUB
 * @param null|array $limitNamespaces Namespace numbers to restrict the page
 *   dump to, or null for no restriction
 */
public function __construct(
	$db,
	HookContainer $hookContainer,
	RevisionStore $revisionStore,
	TitleParser $titleParser,
	$history = self::CURRENT,
	$text = self::TEXT,
	$limitNamespaces = null
) {
	// Injected collaborators.
	$this->db = $db;
	$this->revisionStore = $revisionStore;
	$this->titleParser = $titleParser;
	$this->hookRunner = new HookRunner( $hookContainer );

	// Dump configuration.
	$this->history = $history;
	$this->text = $text;
	$this->limitNamespaces = $limitNamespaces;

	// TODO: add a $hookContainer parameter to XmlDumpWriter so that we can inject
	// and then be able to convert the factory test to a unit test
	$this->writer = new XmlDumpWriter( $text, self::schemaVersion() );
	// Default sink; callers may replace it via setOutputSink().
	$this->sink = new DumpOutput();
}
138 
/**
 * Replaces the XML writer with one that uses the given schema version,
 * keeping the text mode chosen at construction time.
 *
 * @param mixed $schemaVersion Schema version forwarded to XmlDumpWriter
 */
public function setSchemaVersion( $schemaVersion ) {
	$replacementWriter = new XmlDumpWriter( $this->text, $schemaVersion );
	$this->writer = $replacementWriter;
}
147 
/**
 * Set the DumpOutput or DumpFilter object which will receive the row
 * objects and XML output produced by this exporter.
 *
 * The sink is bound by reference, so the caller's variable and
 * $this->sink keep pointing at the same value.
 *
 * @param DumpOutput|DumpFilter &$sink
 */
public function setOutputSink( &$sink ) {
	$this->sink =& $sink;
}
158 
/**
 * Asks the writer for the stream opening and hands it to the sink.
 */
public function openStream() {
	$this->sink->writeOpenStream( $this->writer->openStream() );
}
163 
/**
 * Asks the writer for the stream closing and hands it to the sink.
 */
public function closeStream() {
	$this->sink->writeCloseStream( $this->writer->closeStream() );
}
168 
/**
 * Dumps records for everything in the database, without any additional
 * restriction. What gets dumped is decided by dumpFrom() from the
 * history setting given to the constructor.
 */
public function allPages() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
177 
/**
 * Dumps page and revision records for the pages whose id falls within
 * the given range.
 *
 * @param int $start Inclusive lower bound of the page id range
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 * @param bool $orderRevs When true the range is applied to rev_page instead
 *   of page_id; the flag is also forwarded to dumpFrom()
 */
public function pagesByRange( $start, $end, $orderRevs ) {
	// Both variants build the same condition, only the column differs.
	$column = $orderRevs ? 'rev_page' : 'page_id';

	$condition = $column . ' >= ' . intval( $start );
	if ( $end ) {
		$condition .= ' AND ' . $column . ' < ' . intval( $end );
	}

	$this->dumpFrom( $condition, $orderRevs );
}
200 
/**
 * Dumps page and revision records for the revisions whose id falls
 * within the given range.
 *
 * @param int $start Inclusive lower bound of the revision id range
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 */
public function revsByRange( $start, $end ) {
	$bounds = [ 'rev_id >= ' . intval( $start ) ];
	if ( $end ) {
		$bounds[] = 'rev_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $bounds ) );
}
215 
/**
 * Dumps the page identified by the given page object, matched on its
 * namespace number and DB key.
 *
 * @param PageIdentity $page
 */
public function pageByTitle( PageIdentity $page ) {
	$namespaceCond = 'page_namespace=' . $page->getNamespace();
	$titleCond = 'page_title=' . $this->db->addQuotes( $page->getDBkey() );

	$this->dumpFrom( $namespaceCond . ' AND ' . $titleCond );
}
224 
/**
 * Dumps the page with the given title text.
 *
 * The name is parsed with the injected TitleParser and the resulting
 * namespace and DB key are used as the selection condition.
 *
 * @param string $name Title text of the page to export
 * @throws MWException If the name cannot be parsed; the original
 *   MalformedTitleException is attached as the previous exception
 */
public function pageByName( $name ) {
	try {
		$link = $this->titleParser->parseTitle( $name );
		$this->dumpFrom(
			'page_namespace=' . $link->getNamespace() .
			' AND page_title=' . $this->db->addQuotes( $link->getDBkey() ) );
	} catch ( MalformedTitleException $ex ) {
		// Chain the parser's exception so the reason the title was
		// rejected is not lost to whoever handles the MWException.
		throw new MWException( "Can't export invalid title", 0, $ex );
	}
}
239 
/**
 * Dumps each of the named pages in turn, in the order given.
 *
 * @param iterable $names Title texts, each handed to pageByName()
 */
public function pagesByName( $names ) {
	foreach ( $names as $titleText ) {
		$this->pageByName( $titleText );
	}
}
248 
/**
 * Starts a dump without any additional restriction. Log entries are
 * only produced when the history setting includes self::LOGS; that
 * decision is made in dumpFrom().
 */
public function allLogs() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
252 
/**
 * Starts a dump restricted to the given log id range.
 *
 * @param int $start Inclusive lower bound of the log id range
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 */
public function logsByRange( $start, $end ) {
	$bounds = [ 'log_id >= ' . intval( $start ) ];
	if ( $end ) {
		$bounds[] = 'log_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $bounds ) );
}
264 
/**
 * Generates the distinct list of authors of an article and stores it,
 * as a <contributors> XML fragment, in $this->author_list.
 *
 * Not called by default; dumpPages() only calls it when
 * $this->list_authors is set.
 *
 * @param string $cond Extra condition added to the revision query
 */
protected function do_list_authors( $cond ) {
	$this->author_list = "<contributors>";

	$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
	$authorRows = $this->db->newSelectQueryBuilder()
		->select( [
			'rev_user_text' => $revQuery['fields']['rev_user_text'],
			'rev_user' => $revQuery['fields']['rev_user'],
		] )
		->tables( $revQuery['tables'] )
		->where( [
			// Leave out revisions whose rev_deleted has the DELETED_USER bit set.
			$this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0',
			$cond,
		] )
		->joinConds( $revQuery['joins'] )
		->distinct()
		->caller( __METHOD__ )
		->fetchResultSet();

	foreach ( $authorRows as $author ) {
		$escapedName = htmlspecialchars( $author->rev_user_text );
		$userId = (int)$author->rev_user;

		$this->author_list .= "<contributor>"
			. "<username>" . $escapedName . "</username>"
			. "<id>" . $userId . "</id>"
			. "</contributor>";
	}

	$this->author_list .= "</contributors>";
}
304 
/**
 * Dispatches to the log dump or the page dump, depending on the
 * history setting.
 *
 * @param string $cond Condition restricting what is dumped; '' for none
 * @param bool $orderRevs Forwarded to dumpPages()
 */
protected function dumpFrom( $cond = '', $orderRevs = false ) {
	// An array-valued history setting never selects the log dump.
	$wantsLogs = is_int( $this->history ) && ( $this->history & self::LOGS );

	if ( $wantsLogs ) {
		$this->dumpLogs( $cond );
		return;
	}

	$this->dumpPages( $cond, $orderRevs );
}
318 
/**
 * Dumps log entries in batches of BATCH_SIZE rows, ordered by log_id.
 *
 * @param string $cond Caller-specified condition; ignored when empty
 */
protected function dumpLogs( $cond ) {
	// Collect the optional filters: the clause hiding private logs comes
	// first, then any caller-specified condition. Empty values are left out.
	$where = [];
	foreach ( [ LogEventsList::getExcludeClause( $this->db ), $cond ] as $filter ) {
		if ( $filter ) {
			$where[] = $filter;
		}
	}

	$commentQuery = CommentStore::getStore()->getJoin( 'log_comment' );

	$tables = array_merge( [ 'logging', 'actor' ], $commentQuery['tables'] );
	$fields = [
		'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
		'log_title', 'log_params', 'log_deleted', 'actor_user', 'actor_name'
	] + $commentQuery['fields'];
	$joins = [
		'actor' => [ 'JOIN', 'actor_id=log_actor' ]
	] + $commentQuery['joins'];
	$options = [
		'ORDER BY' => 'log_id',
		'USE INDEX' => [ 'logging' => 'PRIMARY' ],
		'LIMIT' => self::BATCH_SIZE,
	];

	// Each batch resumes after the highest log_id written so far and the
	// loop ends with the first empty batch.
	$lastLogId = 0;
	do {
		$batchWhere = $where;
		$batchWhere[] = 'log_id > ' . intval( $lastLogId );

		$result = $this->db->select(
			$tables,
			$fields,
			$batchWhere,
			__METHOD__,
			$options,
			$joins
		);

		$gotRows = (bool)$result->numRows();
		if ( $gotRows ) {
			$lastLogId = $this->outputLogStream( $result );
			$this->reloadDBConfig();
		}
	} while ( $gotRows );
}
372 
/**
 * Dumps page, revision and slot records in batches of BATCH_SIZE rows.
 *
 * The query is assembled from the RevisionStore query info, adjusted
 * according to the history setting, offered to the ModifyExportQuery
 * hook, and then run repeatedly with a (rev_page, rev_id) cursor until
 * a batch comes back empty or the optional overall limit is reached.
 *
 * @param string $cond Condition restricting the pages/revisions; '' for none
 * @param bool $orderRevs Not read by this method body
 * @throws MWException When the history setting matches no known dump
 *   type, or when no hook handles a self::STABLE dump
 */
protected function dumpPages( $cond, $orderRevs ) {
	$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
	$slotQuery = $this->revisionStore->getSlotsQueryInfo( [ 'content' ] );

	// We want page primary rather than revision.
	// We also want to join in the slots and content tables.
	// NOTE: This means we may get multiple rows per revision, and more rows
	// than the batch size! Should be ok, since the max number of slots is
	// fixed and low (dozens at worst).
	$revTablesWithoutPage = array_diff( $revQuery['tables'], [ 'page' ] );
	$tables = array_merge( [ 'page' ], $revTablesWithoutPage );
	$missingSlotTables = array_diff( $slotQuery['tables'], $tables );
	$tables = array_merge( $tables, $missingSlotTables );

	$joinConds = $revQuery['joins'] + [
		'revision' => $revQuery['joins']['page'],
		'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
		'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
	];
	unset( $joinConds['page'] );

	$fields = array_merge( $revQuery['fields'], $slotQuery['fields'] );
	if ( $this->text != self::STUB ) {
		$fields['_load_content'] = '1';
	}

	$baseConds = $cond !== '' ? [ $cond ] : [];
	$options = [
		'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ],
		'USE INDEX' => [],
	];

	// Comparison used for the timestamp offset and for the rev_id cursor.
	$compareOp = '>';
	// Overall row limit; only set for array-valued history settings.
	$maxRowCount = null;

	if ( is_array( $this->history ) ) {
		// Time offset/limit for all pages/history: set the time order first.
		$ascending = $this->history['dir'] == 'asc';
		if ( !$ascending ) {
			$compareOp = '<';
		}
		$options['ORDER BY'] = $ascending ? 'rev_timestamp ASC' : 'rev_timestamp DESC';

		// Offset
		if ( !empty( $this->history['offset'] ) ) {
			$quotedOffset = $this->db->addQuotes(
				$this->db->timestamp( $this->history['offset'] ) );
			$baseConds[] = "rev_timestamp $compareOp " . $quotedOffset;
		}

		// Query limit
		if ( !empty( $this->history['limit'] ) ) {
			$maxRowCount = intval( $this->history['limit'] );
		}
	} elseif ( $this->history & self::FULL ) {
		// Full history dumps: query optimization for history stub dumps.
		if ( $this->text == self::STUB ) {
			$options[] = 'STRAIGHT_JOIN';
			unset( $joinConds['revision'] );
			$joinConds['page'] = [ 'JOIN', 'rev_page=page_id' ];
		}
	} elseif ( $this->history & self::CURRENT ) {
		// Latest revision dumps: list authors first, if so desired.
		if ( $this->list_authors && $cond != '' ) {
			$this->do_list_authors( $cond );
		}
		$joinConds['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		$options[ 'ORDER BY' ] = [ 'page_id ASC' ];
	} elseif ( $this->history & self::STABLE ) {
		// "Stable" revision dumps: default JOIN, to be overridden by a hook.
		$joinConds['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		// One, and only one hook should set this, and return false
		$unhandled = $this->hookRunner->onWikiExporter__dumpStableQuery(
			$tables, $options, $joinConds );
		if ( $unhandled ) {
			throw new MWException( __METHOD__ . " given invalid history dump type." );
		}
	} elseif ( $this->history & self::RANGE ) {
		// Dump of revisions within a specified range. Condition already set in revsByRange().
	} else {
		// Unknown history specification parameter?
		throw new MWException( __METHOD__ . " given invalid history dump type." );
	}

	$lastRow = null;
	$revPage = 0;
	$revId = 0;
	$rowCount = 0;

	$options['LIMIT'] = self::BATCH_SIZE;

	$this->hookRunner->onModifyExportQuery(
		$this->db, $tables, $cond, $options, $joinConds, $baseConds );

	do {
		$finished = false;

		// If necessary, impose the overall maximum and stop looping after this iteration.
		if ( !empty( $maxRowCount ) && $rowCount + self::BATCH_SIZE > $maxRowCount ) {
			$options['LIMIT'] = $maxRowCount - $rowCount;
			$finished = true;
		}

		// Resume after the last (rev_page, rev_id) pair written so far.
		$pageCursor = intval( $revPage );
		$batchConds = $baseConds;
		$batchConds[] = 'rev_page>' . $pageCursor . ' OR (rev_page=' .
			$pageCursor . ' AND rev_id' . $compareOp . intval( $revId ) . ')';

		// Do the query and process any results, remembering max ids for the next iteration.
		$result = $this->db->select(
			$tables,
			$fields,
			$batchConds,
			__METHOD__,
			$options,
			$joinConds
		);

		$fetched = $result->numRows();
		if ( $fetched > 0 ) {
			$lastRow = $this->outputPageStreamBatch( $result, $lastRow );
			$rowCount += $fetched;
			$revPage = $lastRow->rev_page;
			$revId = $lastRow->rev_id;
		} else {
			$finished = true;
		}

		if ( $finished ) {
			// Close off the final page element (if any).
			if ( $lastRow ) {
				$this->finishPageStreamOutput( $lastRow );
			}
		} else {
			$this->reloadDBConfig();
		}
	} while ( !$finished );
}
509 
/**
 * Runs through a query result set dumping page, revision, and slot records.
 *
 * The rows of one revision are grouped by getSlotRowBatch(). A page is
 * opened whenever the namespace or title differs from the previous row,
 * closing the previous page first. The last page is left open for the
 * next batch or for finishPageStreamOutput().
 *
 * @param IResultWrapper $results Rows to write; assumed to be the result of
 *   the query built in dumpPages() — TODO confirm type
 * @param stdClass|null $lastRow Last row handled by the previous batch, or
 *   null if no row has been handled yet
 * @return stdClass|null Last row handled, to be passed to the next call
 * @throws LogicException If a row is left over after the result is exhausted
 */
protected function outputPageStreamBatch( $results, $lastRow ) {
	$rowCarry = null;

	while ( $slotRows = $this->getSlotRowBatch( $results, $rowCarry ) ) {
		// All revision info is present in all slot rows.
		// Use the first slot row as the revision row.
		$revRow = $slotRows[0];

		$namespaceExcluded = $this->limitNamespaces &&
			!in_array( $revRow->page_namespace, $this->limitNamespaces );
		if ( $namespaceExcluded ) {
			// NOTE(review): the skipped row still becomes $lastRow, so the
			// next exported page writes a close-page for it — confirm this
			// is intended.
			$lastRow = $revRow;
			continue;
		}

		$startsNewPage = $lastRow === null ||
			$lastRow->page_namespace !== $revRow->page_namespace ||
			$lastRow->page_title !== $revRow->page_title;

		if ( $startsNewPage ) {
			if ( $lastRow !== null ) {
				// Finish the previous page before opening the next one.
				$closing = $this->dumpUploads
					? $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents )
					: '';
				$closing .= $this->writer->closePage();
				$this->sink->writeClosePage( $closing );
			}
			$opening = $this->writer->openPage( $revRow );
			$this->sink->writeOpenPage( $revRow, $opening );
		}

		try {
			$revisionXml = $this->writer->writeRevision( $revRow, $slotRows );
			$this->sink->writeRevision( $revRow, $revisionXml );
		} catch ( RevisionAccessException $ex ) {
			// A revision that cannot be written is reported and skipped.
			MWDebug::warning( 'Problem encountered retrieving rev and slot metadata for'
				. ' revision ' . $revRow->rev_id . ': ' . $ex->getMessage() );
		}

		$lastRow = $revRow;
	}

	// Sanity check: nothing may be left in the carry once the rows run out.
	if ( $rowCarry ) {
		throw new LogicException( 'Error while processing a stream of slot rows' );
	}

	// @phan-suppress-next-line PhanTypeMismatchReturnNullable False positive
	return $lastRow;
}
569 
/**
 * Returns all slot rows for a revision.
 *
 * Rows are read until the rev_id changes. The first row of the next
 * revision is handed back through $carry and becomes the first row of
 * the following call.
 *
 * @param IResultWrapper $results Row source; only fetchObject() is used
 * @param null|stdClass &$carry Row read ahead by the previous call, if any;
 *   set to the next read-ahead row when one is found
 * @return stdClass[] Rows sharing one rev_id; empty once the rows run out
 */
protected function getSlotRowBatch( $results, &$carry = null ) {
	$batch = [];
	$previous = null;

	// Start with the row that was read ahead last time.
	if ( $carry ) {
		$batch[] = $carry;
		$previous = $carry;
		$carry = null;
	}

	for ( $row = $results->fetchObject(); $row; $row = $results->fetchObject() ) {
		$belongsToNextRevision = $previous && $previous->rev_id !== $row->rev_id;
		if ( $belongsToNextRevision ) {
			$carry = $row;
			break;
		}

		$batch[] = $row;
		$previous = $row;
	}

	return $batch;
}
600 
/**
 * Final page stream output, after all batches are complete.
 *
 * Writes the optional uploads, the author list and the page closing to
 * the sink as one string.
 *
 * @param stdClass $lastRow The last row output from the previous call (or
 *   from dumpPages())
 */
protected function finishPageStreamOutput( $lastRow ) {
	$uploads = $this->dumpUploads
		? $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents )
		: '';

	$this->sink->writeClosePage(
		$uploads . $this->author_list . $this->writer->closePage()
	);
}
615 
/**
 * Writes every log row of the result set to the sink.
 *
 * @param iterable $resultset Log rows to write
 * @return mixed log_id of the last row written, or null if there was
 *   none; dumpLogs() uses it as the resume point for the next batch
 */
protected function outputLogStream( $resultset ) {
	$lastLogId = null;

	foreach ( $resultset as $logRow ) {
		$this->sink->writeLogItem( $logRow, $this->writer->writeLogItem( $logRow ) );
		$lastLogId = $logRow->log_id ?? null;
	}

	return $lastLogId;
}
627 
/**
 * Asks the DB load balancer factory to reconfigure itself. Called
 * between batches by dumpLogs() and dumpPages().
 */
private function reloadDBConfig() {
	$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
	$lbFactory->autoReconfigure();
}
638 }
static getStore()
static getExcludeClause( $db, $audience='public', Authority $performer=null)
SQL clause to skip forbidden log types for this user.
static warning( $msg, $callerOffset=1, $level=E_USER_NOTICE, $log='auto')
Adds a warning entry to the log.
Definition: MWDebug.php:183
MediaWiki exception.
Definition: MWException.php:29
MalformedTitleException is thrown when a TitleParser is unable to parse a title string.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
Definition: HookRunner.php:560
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Exception representing a failure to look up a revision.
Page revision base class.
Service for looking up page revisions.
revsByRange( $start, $end)
Dumps a series of page and revision records for those pages in the database with revisions falling wi...
__construct( $db, HookContainer $hookContainer, RevisionStore $revisionStore, TitleParser $titleParser, $history=self::CURRENT, $text=self::TEXT, $limitNamespaces=null)
dumpFrom( $cond='', $orderRevs=false)
pageByName( $name)
pagesByName( $names)
finishPageStreamOutput( $lastRow)
Final page stream output, after all batches are complete.
getSlotRowBatch( $results, &$carry=null)
Returns all slot rows for a revision.
setOutputSink(&$sink)
Set the DumpOutput or DumpFilter object which will receive various row objects and XML output for fil...
pagesByRange( $start, $end, $orderRevs)
Dumps a series of page and revision records for those pages in the database falling within the page_i...
outputPageStreamBatch( $results, $lastRow)
Runs through a query result set dumping page, revision, and slot records.
bool $dumpUploadFileContents
array|int $history
dumpLogs( $cond)
DumpOutput $sink
allPages()
Dumps a series of page and revision records for all pages in the database, either including complete ...
logsByRange( $start, $end)
do_list_authors( $cond)
Generates the distinct list of authors of an article. Not called by default (depends on $this->list_authors).
array|null $limitNamespaces
dumpPages( $cond, $orderRevs)
pageByTitle(PageIdentity $page)
const BATCH_SIZE
outputLogStream( $resultset)
IDatabase $db
setSchemaVersion( $schemaVersion)
static schemaVersion()
Returns the default export schema version, as defined by the XmlDumpSchemaVersion setting.
string $author_list
bool $list_authors
Return distinct author list (when not returning full history)
const WRITE_STUB
Only output stubs for revision content.
const WRITE_CONTENT
Output serialized revision content.
Interface for objects (potentially) representing an editable wiki page.
getNamespace()
Returns the page's namespace number.
getDBkey()
Get the page title in DB key form.
A title parser service for MediaWiki.
Definition: TitleParser.php:33
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:40
Result wrapper for grabbing data queried from an IDatabase object.
$revQuery