MediaWiki  master
WikiExporter.php
Go to the documentation of this file.
1 <?php
40 
44 class WikiExporter {
/** @var bool Return distinct author list (when not returning full history) */
public $list_authors = false;

/** @var bool When true, XmlDumpWriter::writeUploads() output is emitted before each page is closed */
public $dumpUploads = false;

/** @var bool Passed as the second argument to XmlDumpWriter::writeUploads() */
public $dumpUploadFileContents = false;

/** @var string Contributor XML built by do_list_authors(), appended when the last page is closed */
public $author_list = "";

// History selection flags, tested bitwise against $history.
public const FULL = 1;
public const CURRENT = 2;
public const STABLE = 4; // extension defined
public const LOGS = 8;
public const RANGE = 16;

// Content modes, compared against $text and handed to XmlDumpWriter.
// These two constants are referenced throughout the class (self::TEXT, self::STUB)
// and must be declared here for the class to load.
public const TEXT = XmlDumpWriter::WRITE_CONTENT;
public const STUB = XmlDumpWriter::WRITE_STUB;

/** Number of rows requested per query (used as the LIMIT of each batch) */
protected const BATCH_SIZE = 50000;

/** @var mixed Content mode, one of self::TEXT or self::STUB */
public $text;

/** @var DumpOutput Receiver of the generated XML fragments and row objects */
public $sink;

/** @var XmlDumpWriter */
private $writer;

/** @var IDatabase */
protected $db;

/** @var array|int Either a combination of the history flags above, or an array with 'dir', 'offset' and 'limit' keys */
protected $history;

/** @var array|null Namespaces to restrict page output to; null (or empty) for no restriction */
protected $limitNamespaces;

/** @var RevisionStore */
private $revisionStore;

/** @var TitleParser */
private $titleParser;

/** @var HookRunner */
private $hookRunner;
94 
/**
 * Returns the default export schema version, as defined by the
 * XmlDumpSchemaVersion setting.
 *
 * @return mixed Value of the XmlDumpSchemaVersion configuration variable
 */
public static function schemaVersion() {
	$mainConfig = MediaWikiServices::getInstance()->getMainConfig();

	return $mainConfig->get( MainConfigNames::XmlDumpSchemaVersion );
}
103 
/**
 * @param IDatabase $db Database handle used for all dump queries
 * @param HookContainer $hookContainer Wrapped in a HookRunner for the export hooks
 * @param RevisionStore $revisionStore Supplies the revision and slot query info
 * @param TitleParser $titleParser Used by pageByName() to parse page names
 * @param int|array $history One of the history flags (self::FULL, self::CURRENT, ...),
 *   or an array with 'dir', 'offset' and 'limit' keys
 * @param mixed $text Content mode, one of self::TEXT or self::STUB
 * @param array|null $limitNamespaces Namespaces to restrict page output to, or null
 */
public function __construct(
	$db,
	HookContainer $hookContainer,
	RevisionStore $revisionStore,
	TitleParser $titleParser,
	$history = self::CURRENT,
	$text = self::TEXT,
	$limitNamespaces = null
) {
	$this->db = $db;
	$this->history = $history;
	// TODO: add a $hookContainer parameter to XmlDumpWriter so that we can inject
	// and then be able to convert the factory test to a unit test
	$this->writer = new XmlDumpWriter( $text, self::schemaVersion() );
	$this->sink = new DumpOutput();
	$this->text = $text;
	$this->limitNamespaces = $limitNamespaces;
	$this->hookRunner = new HookRunner( $hookContainer );
	$this->revisionStore = $revisionStore;
	$this->titleParser = $titleParser;
}
138 
/**
 * Replaces the XML writer with one that targets the given schema version,
 * keeping the content mode chosen at construction time.
 *
 * @param mixed $schemaVersion Schema version handed to XmlDumpWriter
 */
public function setSchemaVersion( $schemaVersion ) {
	$replacementWriter = new XmlDumpWriter( $this->text, $schemaVersion );
	$this->writer = $replacementWriter;
}
147 
/**
 * Set the DumpOutput or DumpFilter object which will receive the row
 * objects and the XML output produced by this exporter.
 *
 * The sink is bound by reference, so the exporter keeps pointing at the
 * caller's variable rather than at a copy of its current value.
 *
 * @param DumpOutput &$sink
 */
public function setOutputSink( &$sink ) {
	$this->sink =& $sink;
}
158 
/**
 * Asks the writer for the opening fragment of the stream and forwards it to the sink.
 */
public function openStream() {
	$this->sink->writeOpenStream( $this->writer->openStream() );
}
163 
/**
 * Asks the writer for the closing fragment of the stream and forwards it to the sink.
 */
public function closeStream() {
	$this->sink->writeCloseStream( $this->writer->closeStream() );
}
168 
/**
 * Dumps records for all pages in the database by running the dump
 * without any caller-supplied condition.
 */
public function allPages() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
177 
/**
 * Dumps a series of page and revision records for those pages in the
 * database falling within the page_id range given.
 *
 * @param int $start Inclusive lower bound
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 * @param bool $orderRevs When true the range is applied to rev_page instead of
 *   page_id; the flag is also forwarded to dumpFrom()
 */
public function pagesByRange( $start, $end, $orderRevs ) {
	$column = $orderRevs ? 'rev_page' : 'page_id';

	$condition = $column . ' >= ' . intval( $start );
	if ( $end ) {
		$condition .= ' AND ' . $column . ' < ' . intval( $end );
	}

	$this->dumpFrom( $condition, $orderRevs );
}
200 
/**
 * Dumps a series of page and revision records for those pages in the
 * database with revisions falling within the rev_id range given.
 *
 * @param int $start Inclusive lower bound
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 */
public function revsByRange( $start, $end ) {
	$lowerBound = 'rev_id >= ' . intval( $start );
	$upperBound = $end ? ' AND rev_id < ' . intval( $end ) : '';

	$this->dumpFrom( $lowerBound . $upperBound );
}
215 
/**
 * Dumps the page identified by the given PageIdentity, matched on its
 * namespace and DB key.
 *
 * @param PageIdentity $page
 */
public function pageByTitle( PageIdentity $page ) {
	$namespace = $page->getNamespace();
	$quotedKey = $this->db->addQuotes( $page->getDBkey() );

	$this->dumpFrom( 'page_namespace=' . $namespace . ' AND page_title=' . $quotedKey );
}
224 
/**
 * Dumps the page with the given name, after parsing the name into a
 * namespace and DB key.
 *
 * @param string $name Page name as accepted by TitleParser::parseTitle()
 * @throws MWException If the name cannot be parsed as a title; the original
 *   MalformedTitleException is attached as the previous exception
 */
public function pageByName( $name ) {
	try {
		$link = $this->titleParser->parseTitle( $name );
		$this->dumpFrom(
			'page_namespace=' . $link->getNamespace() .
			' AND page_title=' . $this->db->addQuotes( $link->getDBkey() ) );
	} catch ( MalformedTitleException $ex ) {
		// Chain the parser's exception so the reason the title was rejected is
		// still available to whoever logs or inspects the failure.
		throw new MWException( "Can't export invalid title", 0, $ex );
	}
}
239 
/**
 * Dumps each of the named pages in turn, in the order given.
 *
 * @param iterable $names Page names, each handled by pageByName()
 */
public function pagesByName( $names ) {
	foreach ( $names as $pageName ) {
		$this->pageByName( $pageName );
	}
}
248 
/**
 * Runs the dump without any caller-supplied condition. Whether log entries
 * or pages are produced is decided by dumpFrom() from the history setting.
 */
public function allLogs() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
252 
/**
 * Runs the dump restricted to the given log_id range.
 *
 * @param int $start Inclusive lower bound
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 */
public function logsByRange( $start, $end ) {
	$lowerBound = 'log_id >= ' . intval( $start );
	$upperBound = $end ? ' AND log_id < ' . intval( $end ) : '';

	$this->dumpFrom( $lowerBound . $upperBound );
}
264 
/**
 * Generates the distinct list of authors of an article and stores it, as a
 * <contributors> XML fragment, in $this->author_list.
 *
 * Not called by default (depends on $this->list_authors).
 *
 * @param string $cond SQL condition selecting the revisions to consider
 */
protected function do_list_authors( $cond ) {
	$this->author_list = "<contributors>";

	$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
	$authorFields = [
		'rev_user_text' => $revQuery['fields']['rev_user_text'],
		'rev_user' => $revQuery['fields']['rev_user'],
	];
	// Leave out revisions whose user name has been hidden (rev_deleted).
	$userVisible = $this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0';

	$authors = $this->db->select(
		$revQuery['tables'],
		$authorFields,
		[ $userVisible, $cond ],
		__METHOD__,
		[ 'DISTINCT' ],
		$revQuery['joins']
	);

	foreach ( $authors as $author ) {
		$escapedName = htmlspecialchars( $author->rev_user_text );
		$userId = (int)$author->rev_user;
		$this->author_list .=
			"<contributor><username>{$escapedName}</username><id>{$userId}</id></contributor>";
	}

	$this->author_list .= "</contributors>";
}
304 
/**
 * Entry point shared by the public dump methods: produces a log dump when
 * the history setting is an integer with the LOGS flag set, and a page
 * dump in every other case.
 *
 * @param string $cond SQL condition restricting the dump; '' for none
 * @param bool $orderRevs Forwarded to dumpPages()
 */
protected function dumpFrom( $cond = '', $orderRevs = false ) {
	$logsRequested = is_int( $this->history ) && ( $this->history & self::LOGS );

	if ( $logsRequested ) {
		$this->dumpLogs( $cond );
		return;
	}

	$this->dumpPages( $cond, $orderRevs );
}
318 
/**
 * Dumps log entries in batches of BATCH_SIZE rows, ordered by log_id.
 *
 * @param string $cond Extra SQL condition; ignored when empty
 */
protected function dumpLogs( $cond ) {
	$baseConds = [];
	# Hide private logs
	$excludeClause = LogEventsList::getExcludeClause( $this->db );
	if ( $excludeClause ) {
		$baseConds[] = $excludeClause;
	}
	# Add on any caller specified conditions
	if ( $cond ) {
		$baseConds[] = $cond;
	}

	$commentQuery = CommentStore::getStore()->getJoin( 'log_comment' );

	$tables = array_merge( [ 'logging', 'actor' ], $commentQuery['tables'] );
	$fields = [
		'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
		'log_title', 'log_params', 'log_deleted', 'actor_user', 'actor_name'
	] + $commentQuery['fields'];
	$joins = [
		'actor' => [ 'JOIN', 'actor_id=log_actor' ]
	] + $commentQuery['joins'];
	$options = [
		'ORDER BY' => 'log_id',
		'USE INDEX' => [ 'logging' => 'PRIMARY' ],
		'LIMIT' => self::BATCH_SIZE,
	];

	// Page through the table: each batch starts after the last log_id
	// written by the previous one, and an empty batch ends the dump.
	$lastLogId = 0;
	do {
		$batch = $this->db->select(
			$tables,
			$fields,
			array_merge( $baseConds, [ 'log_id > ' . intval( $lastLogId ) ] ),
			__METHOD__,
			$options,
			$joins
		);

		$hasRows = $batch->numRows() > 0;
		if ( $hasRows ) {
			$lastLogId = $this->outputLogStream( $batch );
		}
	} while ( $hasRows );
}
372 
/**
 * Dumps page, revision and slot records in batches of BATCH_SIZE rows.
 *
 * The shape of the query depends on $this->history: an array selects a
 * time-ordered slice (keys 'dir', 'offset', 'limit'), otherwise the FULL,
 * CURRENT, STABLE and RANGE flags are tested in that order.
 *
 * @param string $cond SQL condition restricting the dump; '' for none
 * @param bool $orderRevs Accepted for interface compatibility; this method
 *   does not currently read it
 * @throws MWException If the history setting matches no known dump type, or if
 *   no hook handler took over the query for a STABLE dump
 */
protected function dumpPages( $cond, $orderRevs ) {
	$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
	$slotQuery = $this->revisionStore->getSlotsQueryInfo( [ 'content' ] );

	// We want page primary rather than revision.
	// We also want to join in the slots and content tables.
	// NOTE: This means we may get multiple rows per revision, and more rows
	// than the batch size! Should be ok, since the max number of slots is
	// fixed and low (dozens at worst).
	$tables = array_merge( [ 'page' ], array_diff( $revQuery['tables'], [ 'page' ] ) );
	$tables = array_merge( $tables, array_diff( $slotQuery['tables'], $tables ) );
	$join = $revQuery['joins'] + [
		'revision' => $revQuery['joins']['page'],
		'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
		'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
	];
	unset( $join['page'] );

	$fields = array_merge( $revQuery['fields'], $slotQuery['fields'] );

	if ( $this->text != self::STUB ) {
		$fields['_load_content'] = '1';
	}

	$conds = [];
	if ( $cond !== '' ) {
		$conds[] = $cond;
	}
	$opts = [ 'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ] ];
	$opts['USE INDEX'] = [];

	$op = '>';
	// Overall cap on the number of rows; 0 means "no cap".
	$maxRowCount = 0;
	if ( is_array( $this->history ) ) {
		# Time offset/limit for all pages/history...
		# Set time order
		// A missing 'dir' key is treated like any value other than 'asc'
		// (descending), without raising an undefined-index warning.
		if ( ( $this->history['dir'] ?? null ) == 'asc' ) {
			$opts['ORDER BY'] = 'rev_timestamp ASC';
		} else {
			$op = '<';
			$opts['ORDER BY'] = 'rev_timestamp DESC';
		}
		# Set offset
		if ( !empty( $this->history['offset'] ) ) {
			$conds[] = "rev_timestamp $op " .
				$this->db->addQuotes( $this->db->timestamp( $this->history['offset'] ) );
		}
		# Set query limit
		if ( !empty( $this->history['limit'] ) ) {
			$maxRowCount = intval( $this->history['limit'] );
		}
	} elseif ( $this->history & self::FULL ) {
		# Full history dumps...
		# query optimization for history stub dumps
		if ( $this->text == self::STUB ) {
			$opts[] = 'STRAIGHT_JOIN';
			unset( $join['revision'] );
			$join['page'] = [ 'JOIN', 'rev_page=page_id' ];
		}
	} elseif ( $this->history & self::CURRENT ) {
		# Latest revision dumps...
		if ( $this->list_authors && $cond != '' ) { // List authors, if so desired
			$this->do_list_authors( $cond );
		}
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		$opts[ 'ORDER BY' ] = [ 'page_id ASC' ];
	} elseif ( $this->history & self::STABLE ) {
		# "Stable" revision dumps...
		# Default JOIN, to be overridden...
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		# One, and only one hook should set this, and return false
		if ( $this->hookRunner->onWikiExporter__dumpStableQuery( $tables, $opts, $join ) ) {
			throw new MWException( __METHOD__ . " given invalid history dump type." );
		}
	} elseif ( $this->history & self::RANGE ) {
		# Dump of revisions within a specified range. Condition already set in revsByRange().
	} else {
		# Unknown history specification parameter?
		throw new MWException( __METHOD__ . " given invalid history dump type." );
	}

	$done = false;
	$lastRow = null;
	$revPage = 0;
	$revId = 0;
	$rowCount = 0;

	$opts['LIMIT'] = self::BATCH_SIZE;

	$this->hookRunner->onModifyExportQuery(
		$this->db, $tables, $cond, $opts, $join, $conds );

	while ( !$done ) {
		// If necessary, impose the overall maximum and stop looping after this iteration.
		// ">=" rather than ">": when the cap is an exact multiple of the batch size, the
		// batch that reaches it is the last one, instead of being followed by a LIMIT 0 query.
		if ( $maxRowCount && $rowCount + self::BATCH_SIZE >= $maxRowCount ) {
			$opts['LIMIT'] = $maxRowCount - $rowCount;
			$done = true;
		}

		// Resume after the last (rev_page, rev_id) pair that was written.
		// NOTE(review): when $this->history is an array the rows are ordered by
		// rev_timestamp while this resume condition is still keyed on rev_page/rev_id;
		// confirm this is intended for dumps spanning more than one batch.
		$queryConds = $conds;
		$queryConds[] = 'rev_page>' . intval( $revPage ) . ' OR (rev_page=' .
			intval( $revPage ) . ' AND rev_id' . $op . intval( $revId ) . ')';

		# Do the query and process any results, remembering max ids for the next iteration.
		$result = $this->db->select(
			$tables,
			$fields,
			$queryConds,
			__METHOD__,
			$opts,
			$join
		);
		if ( $result->numRows() > 0 ) {
			$lastRow = $this->outputPageStreamBatch( $result, $lastRow );
			$rowCount += $result->numRows();
			$revPage = $lastRow->rev_page;
			$revId = $lastRow->rev_id;
		} else {
			$done = true;
		}

		// If we are finished, close off final page element (if any).
		if ( $done && $lastRow ) {
			$this->finishPageStreamOutput( $lastRow );
		}
	}
}
506 
/**
 * Runs through a query result set dumping page, revision, and slot records.
 *
 * A page element is opened whenever the namespace or title differs from
 * the previously seen row, closing the previous page first. The page that
 * is open when the batch ends is left open so a following batch can
 * continue it.
 *
 * @param IResultWrapper $results Rows of one batch
 * @param stdClass|null $lastRow Last row seen in the previous batch, or null
 * @return stdClass|null First slot row of the last revision seen, or $lastRow
 *   unchanged if the batch yielded no revisions
 * @throws LogicException If a carried-over row is left unprocessed
 */
protected function outputPageStreamBatch( $results, $lastRow ) {
	$rowCarry = null;

	// An empty group of slot rows means the result set is exhausted.
	while ( $slotRows = $this->getSlotRowBatch( $results, $rowCarry ) ) {
		// All revision info is present in all slot rows.
		// Use the first slot row as the revision row.
		$revRow = $slotRows[0];

		$namespaceExcluded = $this->limitNamespaces &&
			!in_array( $revRow->page_namespace, $this->limitNamespaces );
		if ( $namespaceExcluded ) {
			// The skipped row is still remembered as the last row seen.
			$lastRow = $revRow;
			continue;
		}

		$startsNewPage = $lastRow === null
			|| $lastRow->page_namespace !== $revRow->page_namespace
			|| $lastRow->page_title !== $revRow->page_title;

		if ( $startsNewPage ) {
			if ( $lastRow !== null ) {
				$closing = $this->dumpUploads
					? $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents )
					: '';
				$closing .= $this->writer->closePage();
				$this->sink->writeClosePage( $closing );
			}
			$this->sink->writeOpenPage( $revRow, $this->writer->openPage( $revRow ) );
		}

		try {
			$revisionXml = $this->writer->writeRevision( $revRow, $slotRows );
			$this->sink->writeRevision( $revRow, $revisionXml );
		} catch ( RevisionAccessException $ex ) {
			// A revision that cannot be loaded is reported and skipped.
			MWDebug::warning( 'Problem encountered retrieving rev and slot metadata for revision '
				. $revRow->rev_id . ': ' . $ex->getMessage() );
		}

		$lastRow = $revRow;
	}

	if ( $rowCarry ) {
		throw new LogicException( 'Error while processing a stream of slot rows' );
	}

	// @phan-suppress-next-line PhanTypeMismatchReturnNullable False positive
	return $lastRow;
}
566 
/**
 * Returns all slot rows for a revision.
 *
 * Rows are read from $results until one with a different rev_id appears.
 * That row is handed back through $carry so the next call can start with it.
 *
 * @param IResultWrapper $results Result set to read from
 * @param stdClass|null &$carry In: a row left over from the previous call, if any.
 *   Out: the first row of the next revision, or null if none was read
 * @return stdClass[] Slot rows sharing one rev_id; empty when no rows remain
 */
protected function getSlotRowBatch( $results, &$carry = null ) {
	$slotRows = [];
	$previous = null;

	if ( $carry ) {
		$previous = $carry;
		$slotRows = [ $previous ];
		$carry = null;
	}

	for ( $row = $results->fetchObject(); $row; $row = $results->fetchObject() ) {
		$belongsToNextRevision = $previous && $previous->rev_id !== $row->rev_id;
		if ( $belongsToNextRevision ) {
			$carry = $row;
			break;
		}

		$slotRows[] = $row;
		$previous = $row;
	}

	return $slotRows;
}
597 
/**
 * Final page stream output, after all batches are complete.
 *
 * Writes, in this order: the upload records (only when $this->dumpUploads
 * is set), the collected author list, and the closing page fragment.
 *
 * @param stdClass $lastRow The last row output from the previous call (or null if none)
 */
protected function finishPageStreamOutput( $lastRow ) {
	$uploads = $this->dumpUploads
		? $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents )
		: '';

	$this->sink->writeClosePage(
		$uploads . $this->author_list . $this->writer->closePage()
	);
}
612 
/**
 * Writes every log row of the result set to the sink.
 *
 * @param IResultWrapper $resultset
 * @return int|null log_id of the last row written, or null if the result
 *   set held no rows
 */
protected function outputLogStream( $resultset ) {
	$lastLogId = null;

	foreach ( $resultset as $logRow ) {
		$this->sink->writeLogItem( $logRow, $this->writer->writeLogItem( $logRow ) );
		$lastLogId = $logRow->log_id ?? null;
	}

	return $lastLogId;
}
624 }
static getStore()
static getExcludeClause( $db, $audience='public', Authority $performer=null)
SQL clause to skip forbidden log types for this user.
static warning( $msg, $callerOffset=1, $level=E_USER_NOTICE, $log='auto')
Adds a warning entry to the log.
Definition: MWDebug.php:183
MediaWiki exception.
Definition: MWException.php:29
MalformedTitleException is thrown when a TitleParser is unable to parse a title string.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
Definition: HookRunner.php:562
A class containing constants representing the names of configuration variables.
MediaWikiServices is the service locator for the application scope of MediaWiki.
Exception representing a failure to look up a revision.
Page revision base class.
Service for looking up page revisions.
revsByRange( $start, $end)
Dumps a series of page and revision records for those pages in the database with revisions falling wi...
__construct( $db, HookContainer $hookContainer, RevisionStore $revisionStore, TitleParser $titleParser, $history=self::CURRENT, $text=self::TEXT, $limitNamespaces=null)
RevisionStore $revisionStore
dumpFrom( $cond='', $orderRevs=false)
XmlDumpWriter $writer
pageByName( $name)
pagesByName( $names)
finishPageStreamOutput( $lastRow)
Final page stream output, after all batches are complete.
getSlotRowBatch( $results, &$carry=null)
Returns all slot rows for a revision.
setOutputSink(&$sink)
Set the DumpOutput or DumpFilter object which will receive various row objects and XML output for fil...
pagesByRange( $start, $end, $orderRevs)
Dumps a series of page and revision records for those pages in the database falling within the page_i...
outputPageStreamBatch( $results, $lastRow)
Runs through a query result set dumping page, revision, and slot records.
HookRunner $hookRunner
bool $dumpUploadFileContents
array|int $history
dumpLogs( $cond)
DumpOutput $sink
allPages()
Dumps a series of page and revision records for all pages in the database, either including complete ...
logsByRange( $start, $end)
do_list_authors( $cond)
Generates the distinct list of authors of an article. Not called by default (depends on $this->list_au...
array|null $limitNamespaces
dumpPages( $cond, $orderRevs)
pageByTitle(PageIdentity $page)
const BATCH_SIZE
outputLogStream( $resultset)
IDatabase $db
setSchemaVersion( $schemaVersion)
TitleParser $titleParser
static schemaVersion()
Returns the default export schema version, as defined by the XmlDumpSchemaVersion setting.
string $author_list
bool $list_authors
Return distinct author list (when not returning full history)
const WRITE_STUB
Only output stubs for revision content.
const WRITE_CONTENT
Output serialized revision content.
Interface for objects (potentially) representing an editable wiki page.
getDBkey()
Get the page title in DB key form.
getNamespace()
Returns the page's namespace number.
A title parser service for MediaWiki.
Definition: TitleParser.php:33
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:39
Result wrapper for grabbing data queried from an IDatabase object.
$revQuery