MediaWiki  master
WikiExporter.php
Go to the documentation of this file.
1 <?php
41 
45 class WikiExporter {
/** @var bool Return distinct author list (when not returning full history) */
public $list_authors = false;

/** @var bool When true, the writer's writeUploads() output is added before each page is closed */
public $dumpUploads = false;

/** @var bool Forwarded as the second argument of the writer's writeUploads() */
public $dumpUploadFileContents = false;

/** @var string XML fragment built by do_list_authors(); appended when the final page is closed */
public $author_list = "";

// History selection flags, tested bitwise against $history in dumpFrom() and dumpPages().
public const FULL = 1;
public const CURRENT = 2;
public const STABLE = 4; // extension defined
public const LOGS = 8;
public const RANGE = 16;

// Content modes handed to XmlDumpWriter. The constructor defaults $text to self::TEXT and
// dumpPages() compares $this->text against self::STUB, so both must be declared here.
// NOTE(review): mapping restored from usage; confirm against XmlDumpWriter's constants.
public const TEXT = XmlDumpWriter::WRITE_CONTENT;
public const STUB = XmlDumpWriter::WRITE_STUB;

/** Maximum number of rows requested per query batch (used as the LIMIT option). */
protected const BATCH_SIZE = 50000;

/** @var int Content mode, one of self::TEXT or self::STUB; passed to XmlDumpWriter */
public $text;

/** @var DumpOutput Receives row objects and the XML produced by the writer */
public $sink;

/** @var XmlDumpWriter Produces the XML fragments that are forwarded to the sink */
private $writer;

/** @var IDatabase Database handle used for all export queries */
protected $db;

/** @var array|int Either a combination of the history flags above, or an array of options */
protected $history;

/** @var array|null Namespaces to export; when empty, no namespace filtering is applied */
protected $limitNamespaces;

/** @var RevisionStore */
private $revisionStore;

/** @var TitleParser */
private $titleParser;

/** @var HookRunner */
private $hookRunner;

/** @var CommentStore */
private $commentStore;
98 
/**
 * Returns the default export schema version, as defined by the
 * XmlDumpSchemaVersion setting.
 *
 * @return mixed Value of the XmlDumpSchemaVersion main config setting
 */
public static function schemaVersion() {
	$mainConfig = MediaWikiServices::getInstance()->getMainConfig();

	return $mainConfig->get( MainConfigNames::XmlDumpSchemaVersion );
}
107 
/**
 * Sets up the exporter with its collaborators, a writer using the default
 * schema version, and a plain DumpOutput as the initial sink.
 *
 * @param IDatabase $db Database handle used for the export queries
 * @param CommentStore $commentStore
 * @param HookContainer $hookContainer Wrapped in a HookRunner and handed to the writer
 * @param RevisionStore $revisionStore
 * @param TitleParser $titleParser
 * @param int|array $history One or more of the history flag constants, or an
 *   array read by dumpPages() for its 'dir', 'offset' and 'limit' keys
 * @param int $text One of self::TEXT or self::STUB
 * @param null|array $limitNamespaces Namespaces to restrict the dump to
 */
public function __construct(
	$db,
	CommentStore $commentStore,
	HookContainer $hookContainer,
	RevisionStore $revisionStore,
	TitleParser $titleParser,
	$history = self::CURRENT,
	$text = self::TEXT,
	$limitNamespaces = null
) {
	// Injected services.
	$this->db = $db;
	$this->commentStore = $commentStore;
	$this->revisionStore = $revisionStore;
	$this->titleParser = $titleParser;
	$this->hookRunner = new HookRunner( $hookContainer );

	// Dump options.
	$this->history = $history;
	$this->text = $text;
	$this->limitNamespaces = $limitNamespaces;

	// Output pipeline: the writer renders XML, the sink receives it.
	$this->writer = new XmlDumpWriter(
		$text,
		self::schemaVersion(),
		$hookContainer,
		$commentStore
	);
	$this->sink = new DumpOutput();
}
148 
/**
 * Replaces the writer with a new XmlDumpWriter for the given schema version,
 * keeping the content mode chosen at construction time.
 *
 * NOTE(review): unlike the constructor, no hook container or comment store is
 * passed to the new writer here - confirm XmlDumpWriter's defaults cover that.
 *
 * @param string $schemaVersion Schema version handed to XmlDumpWriter
 */
public function setSchemaVersion( $schemaVersion ) {
	$contentMode = $this->text;
	$this->writer = new XmlDumpWriter( $contentMode, $schemaVersion );
}
157 
/**
 * Sets the sink object that receives row objects and the XML output.
 *
 * The sink is bound by reference, so $this->sink and the caller's variable
 * refer to the same variable slot after this call.
 *
 * @param DumpOutput|DumpFilter &$sink
 */
public function setOutputSink( &$sink ) {
	$this->sink = &$sink;
}
168 
/**
 * Passes the writer's stream-opening output to the sink.
 */
public function openStream() {
	$this->sink->writeOpenStream( $this->writer->openStream() );
}
173 
/**
 * Passes the writer's stream-closing output to the sink.
 */
public function closeStream() {
	$this->sink->writeCloseStream( $this->writer->closeStream() );
}
178 
/**
 * Dumps page and revision records for all pages in the database by running
 * the dump with an empty condition.
 */
public function allPages() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
187 
/**
 * Dumps page and revision records for the pages falling within the given
 * page ID range.
 *
 * @param int $start Inclusive lower bound of the page ID range
 * @param int $end Exclusive upper bound; no upper bound is applied when falsy
 * @param bool $orderRevs When true the range is expressed on rev_page instead
 *   of page_id; the flag is also forwarded to dumpFrom()
 */
public function pagesByRange( $start, $end, $orderRevs ) {
	$column = $orderRevs ? 'rev_page' : 'page_id';

	$bounds = [ $column . ' >= ' . intval( $start ) ];
	if ( $end ) {
		$bounds[] = $column . ' < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $bounds ), $orderRevs );
}
210 
/**
 * Dumps page and revision records for revisions falling within the given
 * revision ID range.
 *
 * @param int $start Inclusive lower bound of the revision ID range
 * @param int $end Exclusive upper bound; no upper bound is applied when falsy
 */
public function revsByRange( $start, $end ) {
	$bounds = [ 'rev_id >= ' . intval( $start ) ];
	if ( $end ) {
		$bounds[] = 'rev_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $bounds ) );
}
225 
/**
 * Dumps the page identified by the given page object, matched on its
 * namespace and DB key.
 *
 * @param PageIdentity $page
 */
public function pageByTitle( PageIdentity $page ) {
	$namespaceCond = 'page_namespace=' . $page->getNamespace();
	// The DB key is a string, so it is quoted by the database layer.
	$titleCond = 'page_title=' . $this->db->addQuotes( $page->getDBkey() );

	$this->dumpFrom( $namespaceCond . ' AND ' . $titleCond );
}
234 
/**
 * Dumps the page with the given name, after parsing it with the title parser.
 *
 * @param string $name Page name to parse and export
 * @throws MWException If the name cannot be parsed into a valid title; the
 *   underlying MalformedTitleException is available via getPrevious()
 */
public function pageByName( $name ) {
	try {
		$link = $this->titleParser->parseTitle( $name );
		$this->dumpFrom(
			'page_namespace=' . $link->getNamespace() .
			' AND page_title=' . $this->db->addQuotes( $link->getDBkey() ) );
	} catch ( MalformedTitleException $ex ) {
		// Chain the parser's exception so the reason the title was rejected
		// is not lost; exception type and message stay the same for callers.
		throw new MWException( "Can't export invalid title", 0, $ex );
	}
}
249 
/**
 * Dumps each of the named pages in turn via pageByName().
 *
 * @param string[] $names Page names to export
 */
public function pagesByName( $names ) {
	foreach ( $names as $pageName ) {
		$this->pageByName( $pageName );
	}
}
258 
/**
 * Runs the dump with an empty condition. Whether log items or pages are
 * written is decided by dumpFrom() based on the history setting.
 */
public function allLogs() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
262 
/**
 * Runs the dump restricted to the given log ID range.
 *
 * @param int $start Inclusive lower bound of the log ID range
 * @param int $end Exclusive upper bound; no upper bound is applied when falsy
 */
public function logsByRange( $start, $end ) {
	$bounds = [ 'log_id >= ' . intval( $start ) ];
	if ( $end ) {
		$bounds[] = 'log_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $bounds ) );
}
274 
/**
 * Builds the distinct list of authors matching the condition and stores it,
 * as a <contributors> XML fragment, in $this->author_list.
 *
 * Called from dumpPages() only when $this->list_authors is set and a
 * condition is given.
 *
 * @param string $cond SQL condition selecting the revisions to consider
 */
protected function do_list_authors( $cond ) {
	$this->author_list = "<contributors>";

	$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );

	// Leave out revisions whose user name has been hidden (rev_deleted).
	$userNotHidden = $this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0';

	$authorRows = $this->db->newSelectQueryBuilder()
		->select( [
			'rev_user_text' => $revQuery['fields']['rev_user_text'],
			'rev_user' => $revQuery['fields']['rev_user'],
		] )
		->tables( $revQuery['tables'] )
		->where( [ $userNotHidden, $cond ] )
		->joinConds( $revQuery['joins'] )
		->distinct()
		->caller( __METHOD__ )
		->fetchResultSet();

	foreach ( $authorRows as $authorRow ) {
		$userName = htmlspecialchars( $authorRow->rev_user_text );
		$userId = (int)$authorRow->rev_user;
		$this->author_list .=
			"<contributor><username>{$userName}</username><id>{$userId}</id></contributor>";
	}

	$this->author_list .= "</contributors>";
}
314 
/**
 * Dispatches to the log dump or the page dump depending on the history setting.
 *
 * @param string $cond SQL condition restricting the dump; '' for none
 * @param bool $orderRevs Forwarded to dumpPages()
 */
protected function dumpFrom( $cond = '', $orderRevs = false ) {
	// Only an integer history value can carry the LOGS flag; an array never does.
	$wantsLogs = is_int( $this->history ) && ( $this->history & self::LOGS );

	if ( !$wantsLogs ) {
		$this->dumpPages( $cond, $orderRevs );
		return;
	}

	$this->dumpLogs( $cond );
}
328 
/**
 * Writes log items to the sink, fetching them in batches of BATCH_SIZE
 * ordered by log_id until a query returns no rows.
 *
 * @param string $cond Caller-specified SQL condition; ignored when empty
 */
protected function dumpLogs( $cond ) {
	// Conditions shared by every batch.
	$baseConds = [];
	// Hide private logs
	$privateLogFilter = LogEventsList::getExcludeClause( $this->db );
	if ( $privateLogFilter ) {
		$baseConds[] = $privateLogFilter;
	}
	// Add on any caller specified conditions
	if ( $cond ) {
		$baseConds[] = $cond;
	}

	$commentQuery = $this->commentStore->getJoin( 'log_comment' );

	$tables = array_merge( [ 'logging', 'actor' ], $commentQuery['tables'] );
	$fields = [
		'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
		'log_title', 'log_params', 'log_deleted', 'actor_user', 'actor_name'
	] + $commentQuery['fields'];
	$joins = [
		'actor' => [ 'JOIN', 'actor_id=log_actor' ]
	] + $commentQuery['joins'];
	$options = [
		'ORDER BY' => 'log_id',
		'USE INDEX' => [ 'logging' => 'PRIMARY' ],
		'LIMIT' => self::BATCH_SIZE,
	];

	// Page through the log table, resuming after the last log ID written.
	$lastLogId = 0;
	do {
		$batchConds = $baseConds;
		$batchConds[] = 'log_id > ' . intval( $lastLogId );

		$result = $this->db->select(
			$tables,
			$fields,
			$batchConds,
			__METHOD__,
			$options,
			$joins
		);

		$hasRows = (bool)$result->numRows();
		if ( $hasRows ) {
			$lastLogId = $this->outputLogStream( $result );
			$this->reloadDBConfig();
		}
	} while ( $hasRows );
}
382 
/**
 * Writes page, revision and slot records to the sink, querying in batches
 * of BATCH_SIZE rows and resuming each batch after the last
 * (rev_page, rev_id) pair that was output.
 *
 * @param string $cond SQL condition restricting the dump; '' for none
 * @param bool $orderRevs Not read by this method
 * @throws MWException If the history setting matches no known dump type, or
 *   the stable-dump hook does not take over the query
 */
protected function dumpPages( $cond, $orderRevs ) {
	$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
	$slotQuery = $this->revisionStore->getSlotsQueryInfo( [ 'content' ] );

	// We want page primary rather than revision.
	// We also want to join in the slots and content tables.
	// NOTE: This means we may get multiple rows per revision, and more rows
	// than the batch size! Should be ok, since the max number of slots is
	// fixed and low (dozens at worst).
	$tables = array_merge( [ 'page' ], array_diff( $revQuery['tables'], [ 'page' ] ) );
	$tables = array_merge( $tables, array_diff( $slotQuery['tables'], $tables ) );
	$join = $revQuery['joins'] + [
		'revision' => $revQuery['joins']['page'],
		'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
		'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
	];
	unset( $join['page'] );

	$fields = array_merge( $revQuery['fields'], $slotQuery['fields'] );
	if ( $this->text != self::STUB ) {
		$fields['_load_content'] = '1';
	}

	$conds = [];
	if ( $cond !== '' ) {
		$conds[] = $cond;
	}

	$opts = [
		'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ],
		'USE INDEX' => [],
	];

	// Comparison applied to rev_id when resuming within the same page.
	$revIdOp = '>';
	// Overall row cap; only set for array-style history with a limit.
	$maxRowCount = null;

	if ( is_array( $this->history ) ) {
		// Time offset/limit for all pages/history...
		$ascending = $this->history['dir'] == 'asc';
		if ( !$ascending ) {
			$revIdOp = '<';
		}
		$opts['ORDER BY'] = $ascending ? 'rev_timestamp ASC' : 'rev_timestamp DESC';

		// Set offset
		if ( !empty( $this->history['offset'] ) ) {
			$conds[] = "rev_timestamp $revIdOp " .
				$this->db->addQuotes( $this->db->timestamp( $this->history['offset'] ) );
		}
		// Set query limit
		if ( !empty( $this->history['limit'] ) ) {
			$maxRowCount = intval( $this->history['limit'] );
		}
	} elseif ( $this->history & self::FULL ) {
		// Full history dumps...
		// query optimization for history stub dumps
		if ( $this->text == self::STUB ) {
			$opts[] = 'STRAIGHT_JOIN';
			unset( $join['revision'] );
			$join['page'] = [ 'JOIN', 'rev_page=page_id' ];
		}
	} elseif ( $this->history & self::CURRENT ) {
		// Latest revision dumps...
		// List authors, if so desired
		if ( $this->list_authors && $cond != '' ) {
			$this->do_list_authors( $cond );
		}
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		$opts['ORDER BY'] = [ 'page_id ASC' ];
	} elseif ( $this->history & self::STABLE ) {
		// "Stable" revision dumps...
		// Default JOIN, to be overridden...
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		// One, and only one hook should set this, and return false
		if ( $this->hookRunner->onWikiExporter__dumpStableQuery( $tables, $opts, $join ) ) {
			throw new MWException( __METHOD__ . " given invalid history dump type." );
		}
	} elseif ( $this->history & self::RANGE ) {
		// Dump of revisions within a specified range. Condition already set in revsByRange().
	} else {
		// Unknown history specification parameter?
		throw new MWException( __METHOD__ . " given invalid history dump type." );
	}

	$opts['LIMIT'] = self::BATCH_SIZE;

	$this->hookRunner->onModifyExportQuery(
		$this->db, $tables, $cond, $opts, $join, $conds );

	// Paging state carried from one batch to the next.
	$finished = false;
	$lastRow = null;
	$lastPageId = 0;
	$lastRevId = 0;
	$rowCount = 0;

	do {
		// If necessary, impose the overall maximum and stop looping after this iteration.
		if ( !empty( $maxRowCount ) && $rowCount + self::BATCH_SIZE > $maxRowCount ) {
			$opts['LIMIT'] = $maxRowCount - $rowCount;
			$finished = true;
		}

		// Resume after the last (page, revision) pair seen so far.
		$pageId = intval( $lastPageId );
		$revId = intval( $lastRevId );
		$batchConds = $conds;
		$batchConds[] = "rev_page>{$pageId} OR (rev_page={$pageId} AND rev_id{$revIdOp}{$revId})";

		// Do the query and process any results, remembering max ids for the next iteration.
		$result = $this->db->select(
			$tables,
			$fields,
			$batchConds,
			__METHOD__,
			$opts,
			$join
		);

		$fetched = $result->numRows();
		if ( $fetched > 0 ) {
			$lastRow = $this->outputPageStreamBatch( $result, $lastRow );
			$rowCount += $fetched;
			$lastPageId = $lastRow->rev_page;
			$lastRevId = $lastRow->rev_id;
		} else {
			$finished = true;
		}

		if ( $finished ) {
			// Close off final page element (if any).
			if ( $lastRow ) {
				$this->finishPageStreamOutput( $lastRow );
			}
		} else {
			$this->reloadDBConfig();
		}
	} while ( !$finished );
}
519 
/**
 * Runs through a query result set dumping page, revision, and slot records.
 *
 * A page element is opened whenever the namespace/title differs from the
 * previously processed row, after closing the previous page if there was
 * one. The final page is left open for the caller to close.
 *
 * @param IResultWrapper $results Rows to output, grouped per revision by getSlotRowBatch()
 * @param stdClass|null $lastRow Last row of the previous batch, or null if none
 * @return stdClass|null The last revision row processed, or $lastRow unchanged
 *   when the result set yielded no rows
 * @throws LogicException If a carried-over slot row is left unprocessed
 */
protected function outputPageStreamBatch( $results, $lastRow ) {
	$carriedRow = null;

	while ( ( $slotRows = $this->getSlotRowBatch( $results, $carriedRow ) ) ) {
		// All revision info is present in all slot rows.
		// Use the first slot row as the revision row.
		$revRow = $slotRows[0];

		$namespaceExcluded = $this->limitNamespaces &&
			!in_array( $revRow->page_namespace, $this->limitNamespaces );

		if ( !$namespaceExcluded ) {
			$startsNewPage = $lastRow === null ||
				$lastRow->page_namespace !== $revRow->page_namespace ||
				$lastRow->page_title !== $revRow->page_title;

			if ( $startsNewPage ) {
				if ( $lastRow !== null ) {
					// Close the previous page, with its uploads if requested.
					$closing = $this->dumpUploads
						? $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents )
						: '';
					$closing .= $this->writer->closePage();
					$this->sink->writeClosePage( $closing );
				}
				$opening = $this->writer->openPage( $revRow );
				$this->sink->writeOpenPage( $revRow, $opening );
			}

			try {
				$revisionXml = $this->writer->writeRevision( $revRow, $slotRows );
				$this->sink->writeRevision( $revRow, $revisionXml );
			} catch ( RevisionAccessException $ex ) {
				// Log the problem and carry on with the next revision.
				MWDebug::warning( 'Problem encountered retrieving rev and slot metadata for'
					. ' revision ' . $revRow->rev_id . ': ' . $ex->getMessage() );
			}
		}

		// Rows from excluded namespaces are remembered as the last row too.
		$lastRow = $revRow;
	}

	if ( $carriedRow ) {
		throw new LogicException( 'Error while processing a stream of slot rows' );
	}

	// @phan-suppress-next-line PhanTypeMismatchReturnNullable False positive
	return $lastRow;
}
579 
/**
 * Returns all slot rows for a revision.
 *
 * Rows are read from the result set until the rev_id changes. The first row
 * of the following revision is handed back through $carry so that the next
 * call can start its batch with it.
 *
 * @param IResultWrapper $results Result set to read rows from
 * @param null|stdClass &$carry Row left over from the previous call; set to
 *   the first row of the next revision, or null if none was read
 * @return stdClass[] Rows sharing one rev_id; empty when nothing is left
 */
protected function getSlotRowBatch( $results, &$carry = null ) {
	$batch = [];
	$previous = null;

	// Start with the row that the previous call read ahead, if any.
	if ( $carry ) {
		$batch[] = $carry;
		$previous = $carry;
		$carry = null;
	}

	for ( $row = $results->fetchObject(); $row; $row = $results->fetchObject() ) {
		$belongsToNextRevision = $previous && $previous->rev_id !== $row->rev_id;
		if ( $belongsToNextRevision ) {
			$carry = $row;
			break;
		}
		$batch[] = $row;
		$previous = $row;
	}

	return $batch;
}
610 
/**
 * Final page stream output, after all batches are complete.
 *
 * Emits the uploads (if enabled), the collected author list and the closing
 * page markup as one string to the sink.
 *
 * @param stdClass $lastRow The last row output from the previous call (or null if none)
 */
protected function finishPageStreamOutput( $lastRow ) {
	$parts = [];
	if ( $this->dumpUploads ) {
		$parts[] = $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
	}
	$parts[] = $this->author_list;
	$parts[] = $this->writer->closePage();

	$this->sink->writeClosePage( implode( '', $parts ) );
}
625 
/**
 * Writes every log row of the result set to the sink.
 *
 * @param IResultWrapper $resultset Log rows to output
 * @return int|null log_id of the last row written, or null when the result
 *   set yielded no rows
 */
protected function outputLogStream( $resultset ) {
	$lastLogId = null;

	foreach ( $resultset as $logRow ) {
		$itemXml = $this->writer->writeLogItem( $logRow );
		$this->sink->writeLogItem( $logRow, $itemXml );
		$lastLogId = $logRow->log_id ?? null;
	}

	return $lastLogId;
}
637 
/**
 * Asks the DB load balancer factory to reconfigure itself. Called between
 * query batches by dumpLogs() and dumpPages().
 */
private function reloadDBConfig() {
	$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
	$lbFactory->autoReconfigure();
}
648 }
static getExcludeClause( $db, $audience='public', Authority $performer=null)
SQL clause to skip forbidden log types for this user.
static warning( $msg, $callerOffset=1, $level=E_USER_NOTICE, $log='auto')
Adds a warning entry to the log.
Definition: MWDebug.php:184
MediaWiki exception.
Definition: MWException.php:32
MalformedTitleException is thrown when a TitleParser is unable to parse a title string.
Handle database storage of comments such as edit summaries and log reasons.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
Definition: HookRunner.php:568
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Exception representing a failure to look up a revision.
Page revision base class.
Service for looking up page revisions.
revsByRange( $start, $end)
Dumps a series of page and revision records for those pages in the database with revisions falling wi...
dumpFrom( $cond='', $orderRevs=false)
pageByName( $name)
pagesByName( $names)
finishPageStreamOutput( $lastRow)
Final page stream output, after all batches are complete.
getSlotRowBatch( $results, &$carry=null)
Returns all slot rows for a revision.
setOutputSink(&$sink)
Set the DumpOutput or DumpFilter object which will receive various row objects and XML output for fil...
pagesByRange( $start, $end, $orderRevs)
Dumps a series of page and revision records for those pages in the database falling within the page_i...
outputPageStreamBatch( $results, $lastRow)
Runs through a query result set dumping page, revision, and slot records.
bool $dumpUploadFileContents
array|int $history
dumpLogs( $cond)
__construct( $db, CommentStore $commentStore, HookContainer $hookContainer, RevisionStore $revisionStore, TitleParser $titleParser, $history=self::CURRENT, $text=self::TEXT, $limitNamespaces=null)
DumpOutput $sink
allPages()
Dumps a series of page and revision records for all pages in the database, either including complete ...
logsByRange( $start, $end)
do_list_authors( $cond)
Generates the distinct list of authors of an article Not called by default (depends on $this->list_au...
array|null $limitNamespaces
dumpPages( $cond, $orderRevs)
pageByTitle(PageIdentity $page)
const BATCH_SIZE
outputLogStream( $resultset)
IDatabase $db
setSchemaVersion( $schemaVersion)
static schemaVersion()
Returns the default export schema version, as defined by the XmlDumpSchemaVersion setting.
string $author_list
bool $list_authors
Return distinct author list (when not returning full history)
const WRITE_STUB
Only output stubs for revision content.
const WRITE_CONTENT
Output serialized revision content.
Interface for objects (potentially) representing an editable wiki page.
getNamespace()
Returns the page's namespace number.
getDBkey()
Get the page title in DB key form.
A title parser service for MediaWiki.
Definition: TitleParser.php:33
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:36
Result wrapper for grabbing data queried from an IDatabase object.
$revQuery