MediaWiki  master
WikiExporter.php
Go to the documentation of this file.
1 <?php
43 
47 class WikiExporter {
/** @var bool Return distinct author list (when not returning full history) */
public $list_authors = false;

/** @var bool Whether upload records are written out when a page element is closed */
public $dumpUploads = false;

/** @var bool Passed to the writer's writeUploads() together with the page row */
public $dumpUploadFileContents = false;

/** @var string XML fragment built by do_list_authors() and appended by finishPageStreamOutput() */
public $author_list = "";

// History modes. They are tested as bit flags in dumpFrom() and dumpPages().
public const FULL = 1;
public const CURRENT = 2;
public const STABLE = 4; // extension defined
public const LOGS = 8;
public const RANGE = 16;

// Content modes accepted as $text. The constructor default (self::TEXT) and the
// stub checks in dumpPages() (self::STUB) rely on these being declared here.
public const TEXT = XmlDumpWriter::WRITE_CONTENT;
public const STUB = XmlDumpWriter::WRITE_STUB;

/** Row limit applied to each individual database query issued while dumping */
protected const BATCH_SIZE = 50000;

/** Content mode handed to XmlDumpWriter; one of self::TEXT or self::STUB */
public $text;

/** @var DumpOutput Receives the row objects and the XML produced by the writer */
public $sink;

/** @var XmlDumpWriter */
private $writer;

/** @var IReadableDatabase */
protected $db;

/** @var array|int A combination of the history mode constants, or an array with 'dir', 'offset' and 'limit' keys */
protected $history;

/** @var array|null When set, revisions of pages outside these namespaces are skipped */
protected $limitNamespaces;

/** @var RevisionStore */
private $revisionStore;

/** @var TitleParser */
private $titleParser;

/** @var HookRunner */
private $hookRunner;

/** @var CommentStore */
private $commentStore;
100 
/**
 * Returns the default export schema version, as defined by the
 * XmlDumpSchemaVersion setting.
 *
 * @return mixed Whatever the main config holds for XmlDumpSchemaVersion
 */
public static function schemaVersion() {
	$mainConfig = MediaWikiServices::getInstance()->getMainConfig();

	return $mainConfig->get( MainConfigNames::XmlDumpSchemaVersion );
}
109 
/**
 * Wires up the exporter: stores the injected services, creates an
 * XmlDumpWriter for the default schema version and starts out with a plain
 * DumpOutput as the sink.
 *
 * @param IReadableDatabase $db Connection used for every query
 * @param CommentStore $commentStore
 * @param HookContainer $hookContainer
 * @param RevisionStore $revisionStore
 * @param TitleParser $titleParser
 * @param array|int $history One of the history mode constants, or an array
 *  with 'dir', 'offset' and 'limit' keys
 * @param mixed $text Content mode: self::TEXT or self::STUB
 * @param array|null $limitNamespaces Namespaces to restrict the dump to, or
 *  null for no restriction
 */
public function __construct(
	$db,
	CommentStore $commentStore,
	HookContainer $hookContainer,
	RevisionStore $revisionStore,
	TitleParser $titleParser,
	$history = self::CURRENT,
	$text = self::TEXT,
	$limitNamespaces = null
) {
	// Injected services.
	$this->db = $db;
	$this->commentStore = $commentStore;
	$this->revisionStore = $revisionStore;
	$this->titleParser = $titleParser;
	$this->hookRunner = new HookRunner( $hookContainer );

	// Dump options.
	$this->history = $history;
	$this->text = $text;
	$this->limitNamespaces = $limitNamespaces;

	// Output pipeline: the writer produces XML, the sink receives it.
	$this->writer = new XmlDumpWriter(
		$text,
		self::schemaVersion(),
		$hookContainer,
		$commentStore
	);
	$this->sink = new DumpOutput();
}
150 
/**
 * Replaces the writer with a new XmlDumpWriter for the given schema version,
 * keeping the current content mode.
 *
 * NOTE(review): unlike the constructor, no hook container or comment store is
 * handed to the new writer - presumably XmlDumpWriter falls back to defaults;
 * confirm.
 *
 * @param mixed $schemaVersion Schema version understood by XmlDumpWriter
 */
public function setSchemaVersion( $schemaVersion ) {
	$contentMode = $this->text;
	$this->writer = new XmlDumpWriter( $contentMode, $schemaVersion );
}
159 
/**
 * Set the DumpOutput or DumpFilter object which will receive the row objects
 * and XML output produced during the dump.
 *
 * The sink is bound by reference, so $this->sink and the caller's variable
 * refer to the same slot afterwards.
 *
 * @param DumpOutput &$sink
 */
public function setOutputSink( &$sink ) {
	$this->sink = &$sink;
}
170 
/**
 * Asks the writer for the stream opening and passes it to the sink.
 */
public function openStream() {
	$this->sink->writeOpenStream( $this->writer->openStream() );
}
175 
/**
 * Asks the writer for the stream closing and passes it to the sink.
 */
public function closeStream() {
	$this->sink->writeCloseStream( $this->writer->closeStream() );
}
180 
/**
 * Dumps page and revision records for all pages in the database, i.e. with
 * no extra condition. Which revisions are included depends on the history
 * mode this exporter was constructed with.
 */
public function allPages() {
	// dumpFrom() defaults to the empty condition.
	$this->dumpFrom();
}
189 
/**
 * Dumps page and revision records for the pages whose ID lies in
 * [$start, $end). The bounds are applied to rev_page when $orderRevs is set
 * and to page_id otherwise.
 *
 * @param int $start Inclusive lower bound
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 * @param bool $orderRevs Selects the column to constrain; also forwarded to dumpFrom()
 */
public function pagesByRange( $start, $end, $orderRevs ) {
	// The bounds are the same either way; only the constrained column differs.
	$idColumn = $orderRevs ? 'rev_page' : 'page_id';

	$bounds = [ $idColumn . ' >= ' . intval( $start ) ];
	if ( $end ) {
		$bounds[] = $idColumn . ' < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $bounds ), $orderRevs );
}
212 
/**
 * Dumps page and revision records for the revisions whose rev_id lies in
 * [$start, $end).
 *
 * @param int $start Inclusive lower bound
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 */
public function revsByRange( $start, $end ) {
	$lowerBound = 'rev_id >= ' . intval( $start );
	$upperBound = $end ? ' AND rev_id < ' . intval( $end ) : '';

	$this->dumpFrom( $lowerBound . $upperBound );
}
227 
/**
 * Dumps the page identified by the given page object, matched on namespace
 * and DB key.
 *
 * @param PageIdentity $page
 */
public function pageByTitle( PageIdentity $page ) {
	$namespaceCond = 'page_namespace=' . $page->getNamespace();
	$titleCond = 'page_title=' . $this->db->addQuotes( $page->getDBkey() );

	$this->dumpFrom( $namespaceCond . ' AND ' . $titleCond );
}
236 
/**
 * Dumps the page with the given name, after parsing the name into a
 * namespace and DB key with the title parser.
 *
 * @param string $name Page name to parse
 * @throws MWException If the title parser rejects $name; the parser's
 *  exception is attached as the previous exception
 */
public function pageByName( $name ) {
	try {
		$link = $this->titleParser->parseTitle( $name );
		$this->dumpFrom(
			'page_namespace=' . $link->getNamespace() .
			' AND page_title=' . $this->db->addQuotes( $link->getDBkey() ) );
	} catch ( MalformedTitleException $ex ) {
		// Chain the original exception so the reason the title was rejected
		// is still available to whoever handles or logs this one.
		throw new MWException( "Can't export invalid title", 0, $ex );
	}
}
251 
/**
 * Dumps each of the named pages in turn, via pageByName().
 *
 * @param iterable $names Page names
 * @throws MWException As soon as one of the names is an invalid title
 */
public function pagesByName( $names ) {
	foreach ( $names as $pageName ) {
		$this->pageByName( $pageName );
	}
}
260 
/**
 * Starts a dump with no extra condition. dumpFrom() only routes this to the
 * log dump when the history mode has the LOGS flag set.
 */
public function allLogs() {
	// dumpFrom() defaults to the empty condition.
	$this->dumpFrom();
}
264 
/**
 * Starts a dump restricted to log_id values in [$start, $end).
 *
 * @param int $start Inclusive lower bound
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 */
public function logsByRange( $start, $end ) {
	$condition = 'log_id >= ' . intval( $start );

	if ( !$end ) {
		$this->dumpFrom( $condition );
		return;
	}

	$this->dumpFrom( $condition . ' AND log_id < ' . intval( $end ) );
}
276 
/**
 * Generates the distinct list of authors of an article and stores it, as a
 * <contributors> XML fragment, in $this->author_list.
 *
 * Only called from dumpPages() when $this->list_authors is set. Revisions
 * whose user is hidden (DELETED_USER bit set in rev_deleted) are left out.
 *
 * @param string|array $cond Condition selecting the revisions to inspect
 */
protected function do_list_authors( $cond ) {
	$this->author_list = "<contributors>";

	$res = $this->revisionStore->newSelectQueryBuilder( $this->db )
		->joinPage()
		->distinct()
		->where( $this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0' )
		->andWhere( $cond )
		->caller( __METHOD__ )->fetchResultSet();

	// DISTINCT only de-duplicates over the columns the query builder selects,
	// which are not controlled here. If those include per-revision columns,
	// the same author comes back once per revision, so make sure each
	// (id, name) pair is emitted only once.
	$seen = [];
	foreach ( $res as $row ) {
		$userId = (int)$row->rev_user;
		$key = $userId . '|' . $row->rev_user_text;
		if ( isset( $seen[$key] ) ) {
			continue;
		}
		$seen[$key] = true;

		$this->author_list .= "<contributor>" .
			"<username>" .
			htmlspecialchars( $row->rev_user_text ) .
			"</username>" .
			"<id>" .
			$userId .
			"</id>" .
			"</contributor>";
	}
	$this->author_list .= "</contributors>";
}
307 
/**
 * Entry point shared by all the public dump methods: routes the request to
 * the log dump or to the page dump.
 *
 * @param string $cond SQL condition restricting what is dumped; '' for none
 * @param bool $orderRevs Forwarded to dumpPages() only
 */
protected function dumpFrom( $cond = '', $orderRevs = false ) {
	// Logs are only dumped for an integer history mode with the LOGS bit set;
	// an array-style history specification always means pages.
	$wantsLogs = is_int( $this->history ) && ( $this->history & self::LOGS );

	if ( $wantsLogs ) {
		$this->dumpLogs( $cond );
		return;
	}

	$this->dumpPages( $cond, $orderRevs );
}
321 
/**
 * Dumps log entries matching $cond, reading them in batches of BATCH_SIZE
 * rows ordered by log_id and passing each batch to outputLogStream().
 *
 * @param string $cond Extra SQL condition; a falsy value means none
 */
protected function dumpLogs( $cond ) {
	// Conditions shared by every batch.
	$baseConds = [];
	# Hide private logs
	$excludeClause = LogEventsList::getExcludeClause( $this->db );
	if ( $excludeClause ) {
		$baseConds[] = $excludeClause;
	}
	# Add on any caller specified conditions
	if ( $cond ) {
		$baseConds[] = $cond;
	}

	$commentQuery = $this->commentStore->getJoin( 'log_comment' );

	$tables = array_merge(
		[ 'logging', 'actor' ], $commentQuery['tables']
	);
	$fields = [
		'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
		'log_title', 'log_params', 'log_deleted', 'actor_user', 'actor_name'
	] + $commentQuery['fields'];
	$joins = [
		'actor' => [ 'JOIN', 'actor_id=log_actor' ]
	] + $commentQuery['joins'];
	$options = [
		'ORDER BY' => 'log_id',
		'USE INDEX' => [ 'logging' => 'PRIMARY' ],
		'LIMIT' => self::BATCH_SIZE,
	];

	// Page through the table: each query continues after the last log_id
	// written so far, and the loop stops at the first empty batch.
	$lastLogId = 0;
	do {
		$result = $this->db->select(
			$tables,
			$fields,
			array_merge( $baseConds, [ 'log_id > ' . intval( $lastLogId ) ] ),
			__METHOD__,
			$options,
			$joins
		);

		$gotRows = (bool)$result->numRows();
		if ( $gotRows ) {
			$lastLogId = $this->outputLogStream( $result );
			$this->reloadDBConfig();
		}
	} while ( $gotRows );
}
375 
/**
 * Dumps page, revision and slot records matching $cond.
 *
 * Builds one query from the revision and slot query info, adjusts it for the
 * configured history mode, lets the ModifyExportQuery hook change it, and
 * then runs it in batches of BATCH_SIZE rows. Each batch is handed to
 * outputPageStreamBatch(); the last page element is closed once the final
 * batch has been processed.
 *
 * @param string $cond SQL condition restricting the dump; '' for none
 * @param bool $orderRevs Not read by this method
 * @throws MWException If $this->history matches none of the known modes, or
 *  if the STABLE mode is requested and the dumpStableQuery hook run returns
 *  true (i.e. nothing handled it)
 */
protected function dumpPages( $cond, $orderRevs ) {
	$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
	$slotQuery = $this->revisionStore->getSlotsQueryInfo( [ 'content' ] );

	// We want page primary rather than revision.
	// We also want to join in the slots and content tables.
	// NOTE: This means we may get multiple rows per revision, and more rows
	// than the batch size! Should be ok, since the max number of slots is
	// fixed and low (dozens at worst).
	$tables = array_merge( [ 'page' ], array_diff( $revQuery['tables'], [ 'page' ] ) );
	$tables = array_merge( $tables, array_diff( $slotQuery['tables'], $tables ) );
	$join = $revQuery['joins'] + [
		'revision' => $revQuery['joins']['page'],
		'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
		'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
	];
	unset( $join['page'] );

	$fields = array_merge( $revQuery['fields'], $slotQuery['fields'] );
	// Outside stub mode, add a constant '_load_content' column to every row.
	if ( $this->text != self::STUB ) {
		$fields['_load_content'] = '1';
	}

	$conds = $cond !== '' ? [ $cond ] : [];
	$opts = [
		'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ],
		'USE INDEX' => [],
	];

	// Comparison used on rev_id in the per-batch continuation condition.
	$op = '>';
	// Overall row cap; 0 means "no cap". Only the array-style history can set it.
	$maxRowCount = 0;

	if ( is_array( $this->history ) ) {
		# Time offset/limit for all pages/history...
		# Set time order
		$ascending = $this->history['dir'] == 'asc';
		if ( !$ascending ) {
			$op = '<';
		}
		$opts['ORDER BY'] = $ascending ? 'rev_timestamp ASC' : 'rev_timestamp DESC';
		# Set offset
		if ( !empty( $this->history['offset'] ) ) {
			$conds[] = "rev_timestamp $op " .
				$this->db->addQuotes( $this->db->timestamp( $this->history['offset'] ) );
		}
		# Set query limit
		if ( !empty( $this->history['limit'] ) ) {
			$maxRowCount = intval( $this->history['limit'] );
		}
	} elseif ( $this->history & self::FULL ) {
		# Full history dumps...
		# query optimization for history stub dumps
		if ( $this->text == self::STUB ) {
			$opts[] = 'STRAIGHT_JOIN';
			unset( $join['revision'] );
			$join['page'] = [ 'JOIN', 'rev_page=page_id' ];
		}
	} elseif ( $this->history & self::CURRENT ) {
		# Latest revision dumps...
		if ( $this->list_authors && $cond != '' ) { // List authors, if so desired
			$this->do_list_authors( $cond );
		}
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		$opts['ORDER BY'] = [ 'page_id ASC' ];
	} elseif ( $this->history & self::STABLE ) {
		# "Stable" revision dumps...
		# Default JOIN, to be overridden...
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		# One, and only one hook should set this, and return false
		if ( $this->hookRunner->onWikiExporter__dumpStableQuery( $tables, $opts, $join ) ) {
			throw new MWException( __METHOD__ . " given invalid history dump type." );
		}
	} elseif ( $this->history & self::RANGE ) {
		# Dump of revisions within a specified range. Condition already set in revsByRange().
	} else {
		# Unknown history specification parameter?
		throw new MWException( __METHOD__ . " given invalid history dump type." );
	}

	$opts['LIMIT'] = self::BATCH_SIZE;

	$this->hookRunner->onModifyExportQuery(
		$this->db, $tables, $cond, $opts, $join, $conds );

	// Batch state: where the previous batch ended and how much was read so far.
	$finished = false;
	$lastRow = null;
	$revPage = 0;
	$revId = 0;
	$rowCount = 0;

	do {
		// If necessary, impose the overall maximum and stop looping after this iteration.
		if ( $maxRowCount !== 0 && $rowCount + self::BATCH_SIZE > $maxRowCount ) {
			$opts['LIMIT'] = $maxRowCount - $rowCount;
			$finished = true;
		}

		// Continue after the last (page, revision) pair of the previous batch.
		$batchConds = $conds;
		$batchConds[] = sprintf(
			'rev_page>%d OR (rev_page=%d AND rev_id%s%d)',
			intval( $revPage ),
			intval( $revPage ),
			$op,
			intval( $revId )
		);

		# Do the query and process any results, remembering max ids for the next iteration.
		$result = $this->db->select(
			$tables,
			$fields,
			$batchConds,
			__METHOD__,
			$opts,
			$join
		);
		if ( $result->numRows() > 0 ) {
			$lastRow = $this->outputPageStreamBatch( $result, $lastRow );
			$rowCount += $result->numRows();
			$revPage = $lastRow->rev_page;
			$revId = $lastRow->rev_id;
		} else {
			$finished = true;
		}

		if ( !$finished ) {
			$this->reloadDBConfig();
		} elseif ( $lastRow ) {
			// We are finished: close off the final page element (if any).
			$this->finishPageStreamOutput( $lastRow );
		}
	} while ( !$finished );
}
512 
/**
 * Runs through a query result set dumping page, revision, and slot records.
 *
 * Rows are consumed one revision at a time (see getSlotRowBatch()). A page
 * element is opened whenever the namespace/title differs from the previous
 * row, after closing the previous one. The page that is open when the result
 * set ends is left open, so the caller must pass the returned row either to
 * the next call or to finishPageStreamOutput().
 *
 * NOTE(review): a revision skipped because of $this->limitNamespaces still
 * becomes $lastRow, so the next page change emits a close-page (and uploads)
 * for that skipped row - confirm this is intended.
 *
 * @param IResultWrapper $results Rows of one query batch
 * @param stdClass|null $lastRow Last row of the previous batch, or null for the first batch
 * @return stdClass|null The last revision row seen, or $lastRow if there were no rows
 * @throws LogicException If a row is still pending after the result set is exhausted
 */
protected function outputPageStreamBatch( $results, $lastRow ) {
	$rowCarry = null;

	// An empty batch means the result set is exhausted.
	while ( $slotRows = $this->getSlotRowBatch( $results, $rowCarry ) ) {
		// All revision info is present in all slot rows.
		// Use the first slot row as the revision row.
		$revRow = $slotRows[0];

		$inWantedNamespace = !$this->limitNamespaces ||
			in_array( $revRow->page_namespace, $this->limitNamespaces );

		if ( $inWantedNamespace ) {
			$samePage = $lastRow !== null &&
				$lastRow->page_namespace === $revRow->page_namespace &&
				$lastRow->page_title === $revRow->page_title;

			if ( !$samePage ) {
				// Close the previous page element before opening the new one.
				if ( $lastRow !== null ) {
					$output = '';
					if ( $this->dumpUploads ) {
						$output .= $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
					}
					$output .= $this->writer->closePage();
					$this->sink->writeClosePage( $output );
				}
				$output = $this->writer->openPage( $revRow );
				$this->sink->writeOpenPage( $revRow, $output );
			}

			try {
				$output = $this->writer->writeRevision( $revRow, $slotRows );
				$this->sink->writeRevision( $revRow, $output );
			} catch ( RevisionAccessException $ex ) {
				// Best effort: log the problem and carry on with the next revision.
				MWDebug::warning( 'Problem encountered retrieving rev and slot metadata for'
					. ' revision ' . $revRow->rev_id . ': ' . $ex->getMessage() );
			}
		}

		$lastRow = $revRow;
	}

	if ( $rowCarry ) {
		throw new LogicException( 'Error while processing a stream of slot rows' );
	}

	// @phan-suppress-next-line PhanTypeMismatchReturnNullable False positive
	return $lastRow;
}
572 
/**
 * Returns all slot rows for a revision.
 *
 * Reads rows from $results until the rev_id changes. The first row of the
 * next revision has already been fetched by then, so it is parked in $carry
 * and becomes the first row of the batch on the next call.
 *
 * @param IResultWrapper $results Result set to read from
 * @param stdClass|null &$carry Row left over from the previous call; set to
 *  the first row of the following revision, if one was read
 * @return stdClass[] Rows sharing one rev_id; empty when there are no rows left
 */
protected function getSlotRowBatch( $results, &$carry = null ) {
	$slotRows = [];
	$prev = null;

	// Start with the row left over from the previous call, if any.
	if ( $carry ) {
		$prev = $carry;
		$slotRows[] = $prev;
		$carry = null;
	}

	for ( $row = $results->fetchObject(); $row; $row = $results->fetchObject() ) {
		if ( $prev && $prev->rev_id !== $row->rev_id ) {
			// This row belongs to the next revision: keep it for the next call.
			$carry = $row;
			break;
		}
		$slotRows[] = $row;
		$prev = $row;
	}

	return $slotRows;
}
603 
/**
 * Final page stream output, after all batches are complete.
 *
 * Closes the last page element, preceded by its uploads (when enabled) and
 * by whatever is in $this->author_list.
 *
 * @param stdClass $lastRow The last row output from the previous call (or null if none)
 */
protected function finishPageStreamOutput( $lastRow ) {
	$uploads = $this->dumpUploads
		? $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents )
		: '';

	$closing = '' . $uploads . $this->author_list . $this->writer->closePage();
	$this->sink->writeClosePage( $closing );
}
618 
/**
 * Writes every log row of the result set to the sink.
 *
 * @param IResultWrapper $resultset
 * @return int|null log_id of the last row written, or null if there was none
 */
protected function outputLogStream( $resultset ) {
	$lastRow = null;

	foreach ( $resultset as $logRow ) {
		$xml = $this->writer->writeLogItem( $logRow );
		$this->sink->writeLogItem( $logRow, $xml );
		$lastRow = $logRow;
	}

	return $lastRow->log_id ?? null;
}
630 
/**
 * Asks the DB load balancer factory to reconfigure itself. Called between
 * query batches by dumpLogs() and dumpPages().
 */
private function reloadDBConfig() {
	$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
	$lbFactory->autoReconfigure();
}
641 }
static getExcludeClause( $db, $audience='public', Authority $performer=null)
SQL clause to skip forbidden log types for this user.
static warning( $msg, $callerOffset=1, $level=E_USER_NOTICE, $log='auto')
Adds a warning entry to the log.
Definition: MWDebug.php:187
MediaWiki exception.
Definition: MWException.php:33
Handle database storage of comments such as edit summaries and log reasons.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
Definition: HookRunner.php:568
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Exception representing a failure to look up a revision.
Page revision base class.
Service for looking up page revisions.
MalformedTitleException is thrown when a TitleParser is unable to parse a title string.
revsByRange( $start, $end)
Dumps a series of page and revision records for those pages in the database with revisions falling wi...
dumpFrom( $cond='', $orderRevs=false)
pageByName( $name)
pagesByName( $names)
finishPageStreamOutput( $lastRow)
Final page stream output, after all batches are complete.
IReadableDatabase $db
getSlotRowBatch( $results, &$carry=null)
Returns all slot rows for a revision.
setOutputSink(&$sink)
Set the DumpOutput or DumpFilter object which will receive various row objects and XML output for fil...
pagesByRange( $start, $end, $orderRevs)
Dumps a series of page and revision records for those pages in the database falling within the page_i...
outputPageStreamBatch( $results, $lastRow)
Runs through a query result set dumping page, revision, and slot records.
bool $dumpUploadFileContents
array|int $history
dumpLogs( $cond)
__construct( $db, CommentStore $commentStore, HookContainer $hookContainer, RevisionStore $revisionStore, TitleParser $titleParser, $history=self::CURRENT, $text=self::TEXT, $limitNamespaces=null)
DumpOutput $sink
allPages()
Dumps a series of page and revision records for all pages in the database, either including complete ...
logsByRange( $start, $end)
do_list_authors( $cond)
Generates the distinct list of authors of an article. Not called by default (depends on $this->list_au...
array|null $limitNamespaces
dumpPages( $cond, $orderRevs)
pageByTitle(PageIdentity $page)
const BATCH_SIZE
outputLogStream( $resultset)
setSchemaVersion( $schemaVersion)
static schemaVersion()
Returns the default export schema version, as defined by the XmlDumpSchemaVersion setting.
string $author_list
bool $list_authors
Return distinct author list (when not returning full history)
const WRITE_STUB
Only output stubs for revision content.
const WRITE_CONTENT
Output serialized revision content.
Interface for objects (potentially) representing an editable wiki page.
getNamespace()
Returns the page's namespace number.
getDBkey()
Get the page title in DB key form.
A title parser service for MediaWiki.
Definition: TitleParser.php:35
A database connection without write operations.
Result wrapper for grabbing data queried from an IDatabase object.