MediaWiki  master
WikiExporter.php
Go to the documentation of this file.
1 <?php
38 
42 class WikiExporter {
/** @var bool Return distinct author list (when not returning full history) */
public $list_authors = false;

/** @var bool Whether page output should include upload records (see writeUploads() calls) */
public $dumpUploads = false;

/** @var bool Passed to the writer together with $dumpUploads when upload records are written */
public $dumpUploadFileContents = false;

/** @var string XML fragment built by do_list_authors() and appended when the last page is closed */
public $author_list = "";

// History selection flags, tested bitwise against $history.
public const FULL = 1;
public const CURRENT = 2;
public const STABLE = 4; // extension defined
public const LOGS = 8;
public const RANGE = 16;

// Text modes, forwarded to XmlDumpWriter. These two constants are referenced
// throughout the class (self::TEXT, self::STUB) and must be declared here.
public const TEXT = XmlDumpWriter::WRITE_CONTENT;
public const STUB = XmlDumpWriter::WRITE_STUB;

/** Maximum number of rows requested per query while streaming pages or logs */
protected const BATCH_SIZE = 50000;

/** @var int One of self::TEXT or self::STUB */
public $text;

/** @var DumpOutput Receives row objects and the XML produced by the writer */
public $sink;

/** @var XmlDumpWriter */
private $writer;

/** @var IDatabase */
protected $db;

/** @var array|int Either a bit field of the history constants, or an array with 'dir', 'offset' and 'limit' */
protected $history;

/** @var array|null Namespace numbers to restrict page output to, or null for no restriction */
protected $limitNamespaces;

/** @var RevisionStore */
private $revisionStore;

/** @var TitleParser */
private $titleParser;

/** @var HookRunner */
private $hookRunner;
92 
/**
 * Returns the default export schema version, as defined by $wgXmlDumpSchemaVersion.
 *
 * The constructor uses this value to configure the XmlDumpWriter, so the
 * method must actually return the configured version rather than fall off
 * the end of an empty body (which would hand null to the writer).
 *
 * @return mixed The current value of the $wgXmlDumpSchemaVersion setting
 */
public static function schemaVersion() {
	global $wgXmlDumpSchemaVersion;

	return $wgXmlDumpSchemaVersion;
}
101 
/**
 * Sets up an exporter that writes to a plain DumpOutput sink by default.
 *
 * $revisionStore and $titleParser are stored on the instance below, so they
 * have to be constructor parameters; without them both properties would be
 * initialised from undefined variables.
 *
 * @param IDatabase $db Database handle used for all export queries
 * @param HookContainer $hookContainer Wrapped in a HookRunner for the export hooks
 * @param RevisionStore $revisionStore Provides the revision and slot query info
 * @param TitleParser $titleParser Used by pageByName() to parse title strings
 * @param int|array $history One of the history constants (FULL, CURRENT, STABLE,
 *   LOGS, RANGE), or an associative array with the keys 'dir', 'offset' and
 *   'limit' as read by dumpPages()
 * @param int $text One of self::TEXT or self::STUB
 * @param null|array $limitNamespaces Namespace numbers to limit page output to,
 *   or null for no restriction
 */
public function __construct(
	$db,
	HookContainer $hookContainer,
	RevisionStore $revisionStore,
	TitleParser $titleParser,
	$history = self::CURRENT,
	$text = self::TEXT,
	$limitNamespaces = null
) {
	$this->db = $db;
	$this->history = $history;
	// TODO: add a $hookContainer parameter to XmlDumpWriter so that we can inject
	// and then be able to convert the factory test to a unit test
	$this->writer = new XmlDumpWriter( $text, self::schemaVersion() );
	$this->sink = new DumpOutput();
	$this->text = $text;
	$this->limitNamespaces = $limitNamespaces;
	$this->hookRunner = new HookRunner( $hookContainer );
	$this->revisionStore = $revisionStore;
	$this->titleParser = $titleParser;
}
137 
/**
 * Switches the export to another schema version by replacing the writer
 * with a fresh XmlDumpWriter that keeps the current text mode.
 *
 * @param mixed $schemaVersion Schema version handed to XmlDumpWriter
 */
public function setSchemaVersion( $schemaVersion ) {
	$replacement = new XmlDumpWriter( $this->text, $schemaVersion );
	$this->writer = $replacement;
}
146 
/**
 * Set the DumpOutput or DumpFilter object which will receive various row
 * objects and XML output.
 *
 * The sink is bound by reference, so the exporter keeps pointing at the
 * caller's variable rather than at a copy of its current value.
 *
 * @param DumpOutput &$sink
 */
public function setOutputSink( &$sink ) {
	$this->sink = &$sink;
}
157 
/**
 * Asks the writer for the stream opening and forwards it to the sink.
 */
public function openStream() {
	$this->sink->writeOpenStream( $this->writer->openStream() );
}
162 
/**
 * Asks the writer for the stream closing and forwards it to the sink.
 */
public function closeStream() {
	$this->sink->writeCloseStream( $this->writer->closeStream() );
}
167 
/**
 * Dumps a series of page and revision records for all pages in the
 * database; no extra condition is applied, so the configured history
 * mode alone decides what is written.
 */
public function allPages() {
	// dumpFrom() defaults to an empty condition.
	$this->dumpFrom();
}
176 
/**
 * Dumps a series of page and revision records for those pages in the
 * database falling within the page_id range given.
 *
 * @param int $start Inclusive lower limit of the page id range
 * @param int $end Exclusive upper limit; a falsy value leaves the range open-ended
 * @param bool $orderRevs When true the range is expressed on rev_page instead
 *   of page_id; the flag is also passed on to dumpFrom()
 */
public function pagesByRange( $start, $end, $orderRevs ) {
	// Both variants build the same condition, only the column differs.
	$column = $orderRevs ? 'rev_page' : 'page_id';

	$condition = "$column >= " . intval( $start );
	if ( $end ) {
		$condition .= " AND $column < " . intval( $end );
	}

	$this->dumpFrom( $condition, $orderRevs );
}
199 
/**
 * Dumps page and revision records for revisions whose rev_id falls within
 * the given range.
 *
 * @param int $start Inclusive lower limit of the revision id range
 * @param int $end Exclusive upper limit; a falsy value leaves the range open-ended
 */
public function revsByRange( $start, $end ) {
	$parts = [ 'rev_id >= ' . intval( $start ) ];
	if ( $end ) {
		$parts[] = 'rev_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $parts ) );
}
214 
/**
 * Dumps the page identified by the given page identity, matched on
 * namespace and DB key.
 *
 * @param PageIdentity $page
 */
public function pageByTitle( PageIdentity $page ) {
	$namespace = $page->getNamespace();
	$quotedTitle = $this->db->addQuotes( $page->getDBkey() );

	$this->dumpFrom( "page_namespace={$namespace} AND page_title={$quotedTitle}" );
}
223 
/**
 * Dumps the page with the given title string.
 *
 * The name is parsed with the injected TitleParser and then matched on
 * namespace and DB key, like pageByTitle().
 *
 * @param string $name Title text to parse
 * @throws MWException If the title cannot be parsed; the underlying
 *   MalformedTitleException is attached as the previous exception so the
 *   reason for the failure is not lost.
 */
public function pageByName( $name ) {
	try {
		$link = $this->titleParser->parseTitle( $name );
		$this->dumpFrom(
			'page_namespace=' . $link->getNamespace() .
			' AND page_title=' . $this->db->addQuotes( $link->getDBkey() ) );
	} catch ( MalformedTitleException $ex ) {
		// Same exception type and message as before, now with the cause chained.
		throw new MWException( "Can't export invalid title", 0, $ex );
	}
}
238 
/**
 * Dumps each of the named pages in the order given.
 *
 * Every entry goes through pageByName(), so an exception thrown for one
 * title stops the remaining titles from being processed.
 *
 * @param iterable $names Title strings
 */
public function pagesByName( $names ) {
	foreach ( $names as $pageName ) {
		$this->pageByName( $pageName );
	}
}
247 
/**
 * Runs a dump without any extra condition. Whether log entries or pages
 * are written is decided by dumpFrom() from the configured history mode.
 */
public function allLogs() {
	// dumpFrom() defaults to an empty condition.
	$this->dumpFrom();
}
251 
/**
 * Dumps entries whose log_id falls within the given range.
 *
 * @param int $start Inclusive lower limit of the log id range
 * @param int $end Exclusive upper limit; a falsy value leaves the range open-ended
 */
public function logsByRange( $start, $end ) {
	$parts = [ 'log_id >= ' . intval( $start ) ];
	if ( $end ) {
		$parts[] = 'log_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $parts ) );
}
263 
/**
 * Generates the distinct list of authors of an article and stores it as an
 * XML fragment in $this->author_list.
 *
 * Only called from dumpPages() when $this->list_authors is set. Revisions
 * whose user is flagged as deleted (RevisionRecord::DELETED_USER) are left out.
 *
 * @param string $cond SQL condition selecting the revisions to inspect
 */
protected function do_list_authors( $cond ) {
	$this->author_list = "<contributors>";
	// rev_deleted

	$queryInfo = $this->revisionStore->getQueryInfo( [ 'page' ] );
	$fields = [
		'rev_user_text' => $queryInfo['fields']['rev_user_text'],
		'rev_user' => $queryInfo['fields']['rev_user'],
	];
	$conds = [
		$this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0',
		$cond,
	];

	$rows = $this->db->select(
		$queryInfo['tables'],
		$fields,
		$conds,
		__METHOD__,
		[ 'DISTINCT' ],
		$queryInfo['joins']
	);

	foreach ( $rows as $row ) {
		$username = htmlspecialchars( $row->rev_user_text );
		$userId = (int)$row->rev_user;
		$this->author_list .=
			"<contributor><username>{$username}</username><id>{$userId}</id></contributor>";
	}

	$this->author_list .= "</contributors>";
}
303 
/**
 * Dispatches a dump to the log exporter or the page exporter.
 *
 * $this->history may be an array of time offset/limit options (handled by
 * dumpPages()) instead of a bit field. Applying the bitwise AND to an array
 * throws a TypeError on PHP 8, so the LOGS flag is only tested when the
 * history setting is not an array; array settings always mean a page dump.
 *
 * @param string $cond SQL condition restricting the dump; '' for none
 * @param bool $orderRevs Forwarded to dumpPages()
 */
protected function dumpFrom( $cond = '', $orderRevs = false ) {
	if ( !is_array( $this->history ) && ( $this->history & self::LOGS ) ) {
		$this->dumpLogs( $cond );
	} else {
		$this->dumpPages( $cond, $orderRevs );
	}
}
317 
/**
 * Streams log entries to the sink in batches of at most BATCH_SIZE rows,
 * ordered by log_id. Each batch resumes after the last log_id written.
 *
 * @param string $cond Caller specified SQL condition; falsy for none
 */
protected function dumpLogs( $cond ) {
	$where = [];
	# Hide private logs
	$hideLogs = LogEventsList::getExcludeClause( $this->db );
	if ( $hideLogs ) {
		$where[] = $hideLogs;
	}
	# Add on any caller specified conditions
	if ( $cond ) {
		$where[] = $cond;
	}

	$commentQuery = CommentStore::getStore()->getJoin( 'log_comment' );

	$tables = array_merge( [ 'logging', 'actor' ], $commentQuery['tables'] );
	$fields = [
		'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
		'log_title', 'log_params', 'log_deleted', 'actor_user', 'actor_name'
	] + $commentQuery['fields'];
	$options = [
		'ORDER BY' => 'log_id',
		'USE INDEX' => [ 'logging' => 'PRIMARY' ],
		'LIMIT' => self::BATCH_SIZE,
	];
	$joins = [
		'actor' => [ 'JOIN', 'actor_id=log_actor' ]
	] + $commentQuery['joins'];

	$lastLogId = 0;
	do {
		$batch = $this->db->select(
			$tables,
			$fields,
			array_merge( $where, [ 'log_id > ' . intval( $lastLogId ) ] ),
			__METHOD__,
			$options,
			$joins
		);

		// An empty batch means everything has been written.
		$hasRows = (bool)$batch->numRows();
		if ( $hasRows ) {
			$lastLogId = $this->outputLogStream( $batch );
		}
	} while ( $hasRows );
}
371 
/**
 * Streams page, revision and slot rows to the sink in batches.
 *
 * The query is assembled from the revision and slot query info, adjusted
 * for the configured history mode, offered to the ModifyExportQuery hook,
 * and then run repeatedly with a (rev_page, rev_id) cursor until a batch
 * comes back empty or the optional overall row limit is reached.
 *
 * @param string $cond SQL condition restricting the dump; '' for none
 * @param bool $orderRevs Accepted for dumpFrom(); not consulted in this method
 * @throws MWException If the history setting matches no known dump type, or
 *   if no hook handles the STABLE query
 */
protected function dumpPages( $cond, $orderRevs ) {
	$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
	$slotQuery = $this->revisionStore->getSlotsQueryInfo( [ 'content' ] );

	// We want page primary rather than revision.
	// We also want to join in the slots and content tables.
	// NOTE: This means we may get multiple rows per revision, and more rows
	// than the batch size! Should be ok, since the max number of slots is
	// fixed and low (dozens at worst).
	$tables = array_merge( [ 'page' ], array_diff( $revQuery['tables'], [ 'page' ] ) );
	$tables = array_merge( $tables, array_diff( $slotQuery['tables'], $tables ) );
	$join = $revQuery['joins'] + [
		'revision' => $revQuery['joins']['page'],
		'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
		'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
	];
	unset( $join['page'] );

	$fields = array_merge( $revQuery['fields'], $slotQuery['fields'] );
	$fields[] = 'page_restrictions';

	if ( $this->text != self::STUB ) {
		$fields['_load_content'] = '1';
	}

	$conds = $cond !== '' ? [ $cond ] : [];
	$opts = [
		'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ],
		'USE INDEX' => [],
	];

	// Comparison operator for the rev_id part of the batch cursor.
	$op = '>';
	// Overall row cap; only the array form of the history setting can set it.
	$maxRowCount = null;

	if ( is_array( $this->history ) ) {
		# Time offset/limit for all pages/history...
		# Set time order
		$descending = $this->history['dir'] != 'asc';
		if ( $descending ) {
			$op = '<';
		}
		$opts['ORDER BY'] = 'rev_timestamp ' . ( $descending ? 'DESC' : 'ASC' );
		# Set offset
		if ( !empty( $this->history['offset'] ) ) {
			$quotedOffset = $this->db->addQuotes(
				$this->db->timestamp( $this->history['offset'] ) );
			$conds[] = "rev_timestamp $op " . $quotedOffset;
		}
		# Set query limit
		if ( !empty( $this->history['limit'] ) ) {
			$maxRowCount = intval( $this->history['limit'] );
		}
	} elseif ( $this->history & self::FULL ) {
		# Full history dumps...
		# query optimization for history stub dumps
		if ( $this->text == self::STUB ) {
			$opts[] = 'STRAIGHT_JOIN';
			$opts['USE INDEX']['revision'] = 'rev_page_id';
			unset( $join['revision'] );
			$join['page'] = [ 'JOIN', 'rev_page=page_id' ];
		}
	} elseif ( $this->history & self::CURRENT ) {
		# Latest revision dumps...
		if ( $this->list_authors && $cond != '' ) { // List authors, if so desired
			$this->do_list_authors( $cond );
		}
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		$opts[ 'ORDER BY' ] = [ 'page_id ASC' ];
	} elseif ( $this->history & self::STABLE ) {
		# "Stable" revision dumps...
		# Default JOIN, to be overridden...
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		# One, and only one hook should set this, and return false
		if ( $this->hookRunner->onWikiExporter__dumpStableQuery( $tables, $opts, $join ) ) {
			throw new MWException( __METHOD__ . " given invalid history dump type." );
		}
	} elseif ( $this->history & self::RANGE ) {
		# Dump of revisions within a specified range. Condition already set in revsByRange().
	} else {
		# Unknown history specification parameter?
		throw new MWException( __METHOD__ . " given invalid history dump type." );
	}

	$done = false;
	$lastRow = null;
	$revPage = 0;
	$revId = 0;
	$rowCount = 0;

	$opts['LIMIT'] = self::BATCH_SIZE;

	$this->hookRunner->onModifyExportQuery(
		$this->db, $tables, $cond, $opts, $join, $conds );

	while ( !$done ) {
		// If necessary, impose the overall maximum and stop looping after this iteration.
		if ( !empty( $maxRowCount ) && $rowCount + self::BATCH_SIZE > $maxRowCount ) {
			$opts['LIMIT'] = $maxRowCount - $rowCount;
			$done = true;
		}

		// Resume strictly after the last (rev_page, rev_id) pair already written.
		$cursorPage = intval( $revPage );
		$cursorRev = intval( $revId );
		$queryConds = $conds;
		$queryConds[] =
			"rev_page>{$cursorPage} OR (rev_page={$cursorPage} AND rev_id{$op}{$cursorRev})";

		# Do the query and process any results, remembering max ids for the next iteration.
		$result = $this->db->select(
			$tables,
			$fields,
			$queryConds,
			__METHOD__,
			$opts,
			$join
		);
		if ( $result->numRows() > 0 ) {
			$lastRow = $this->outputPageStreamBatch( $result, $lastRow );
			$rowCount += $result->numRows();
			$revPage = $lastRow->rev_page;
			$revId = $lastRow->rev_id;
		} else {
			$done = true;
		}

		// If we are finished, close off final page element (if any).
		if ( $done && $lastRow ) {
			$this->finishPageStreamOutput( $lastRow );
		}
	}
}
507 
/**
 * Runs through a query result set dumping page, revision, and slot records.
 *
 * Rows are consumed one revision at a time via getSlotRowBatch(). A page is
 * opened whenever the namespace or title differs from the previous row, and
 * the previously open page is closed first. The final page is left open so
 * that a following batch can continue it; dumpPages() closes it at the end.
 *
 * NOTE(review): revisions skipped by the namespace filter still replace
 * $lastRow (dumpPages() uses the returned row as its batch cursor), so a
 * skipped row can later trigger a page close for a page that was not opened
 * here - confirm whether sinks rely on this.
 *
 * @param IResultWrapper $results Rows of one query batch
 * @param stdClass|null $lastRow Last row handled by the previous batch, if any
 * @return stdClass|null Last revision row seen, including filtered ones
 * @throws LogicException If a carried-over slot row is left unprocessed
 */
protected function outputPageStreamBatch( $results, $lastRow ) {
	$rowCarry = null;

	while ( $slotRows = $this->getSlotRowBatch( $results, $rowCarry ) ) {
		// All revision info is present in all slot rows.
		// Use the first slot row as the revision row.
		$revRow = $slotRows[0];

		$namespaceAllowed = !$this->limitNamespaces ||
			in_array( $revRow->page_namespace, $this->limitNamespaces );
		if ( !$namespaceAllowed ) {
			$lastRow = $revRow;
			continue;
		}

		$startsNewPage = $lastRow === null ||
			$lastRow->page_namespace !== $revRow->page_namespace ||
			$lastRow->page_title !== $revRow->page_title;

		if ( $startsNewPage ) {
			if ( $lastRow !== null ) {
				// Finish the page the previous row belonged to.
				$closing = '';
				if ( $this->dumpUploads ) {
					$closing .= $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
				}
				$closing .= $this->writer->closePage();
				$this->sink->writeClosePage( $closing );
			}
			$opening = $this->writer->openPage( $revRow );
			$this->sink->writeOpenPage( $revRow, $opening );
		}

		try {
			$revisionXml = $this->writer->writeRevision( $revRow, $slotRows );
			$this->sink->writeRevision( $revRow, $revisionXml );
		} catch ( RevisionAccessException $ex ) {
			// Best effort: log the broken revision and carry on with the next one.
			MWDebug::warning( 'Problem encountered retrieving rev and slot metadata for'
				. ' revision ' . $revRow->rev_id . ': ' . $ex->getMessage() );
		}

		$lastRow = $revRow;
	}

	if ( $rowCarry ) {
		throw new LogicException( 'Error while processing a stream of slot rows' );
	}

	return $lastRow;
}
566 
/**
 * Returns all slot rows for a revision.
 *
 * Rows are read from the result set until the rev_id changes. The first row
 * of the next revision has then already been fetched, so it is handed back
 * through $carry and becomes the first row of the following call.
 *
 * @param IResultWrapper $results Result set to read from
 * @param null|stdClass &$carry Row left over from the previous call; set to
 *   the first row of the next revision, or null when the result set ran out
 * @return stdClass[] Rows sharing one rev_id; empty when nothing is left
 */
protected function getSlotRowBatch( $results, &$carry = null ) {
	$batch = [];
	$previous = null;

	if ( $carry ) {
		$previous = $carry;
		$batch[] = $previous;
		$carry = null;
	}

	for ( $row = $results->fetchObject(); $row; $row = $results->fetchObject() ) {
		$belongsToNextRevision = $previous && $previous->rev_id !== $row->rev_id;
		if ( $belongsToNextRevision ) {
			$carry = $row;
			break;
		}
		$batch[] = $row;
		$previous = $row;
	}

	return $batch;
}
597 
/**
 * Final page stream output, after all batches are complete.
 *
 * Writes the upload records (when enabled), the collected author list and
 * the page closing to the sink as one chunk.
 *
 * @param stdClass $lastRow The last row output from the previous call (or null if none)
 */
protected function finishPageStreamOutput( $lastRow ) {
	$closing = '';
	if ( $this->dumpUploads ) {
		$closing .= $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
	}
	$closing .= $this->author_list . $this->writer->closePage();

	$this->sink->writeClosePage( $closing );
}
612 
/**
 * Writes every log row of the result set to the sink.
 *
 * @param IResultWrapper $resultset Log rows to write
 * @return int|null log_id of the last row written, or null for an empty result set
 */
protected function outputLogStream( $resultset ) {
	$lastId = null;

	foreach ( $resultset as $row ) {
		$this->sink->writeLogItem( $row, $this->writer->writeLogItem( $row ) );
		$lastId = $row->log_id ?? null;
	}

	return $lastId;
}
624 }
Page\PageIdentity
Interface for objects (potentially) representing an editable wiki page.
Definition: PageIdentity.php:64
WikiExporter\schemaVersion
static schemaVersion()
Returns the default export schema version, as defined by $wgXmlDumpSchemaVersion.
Definition: WikiExporter.php:97
MediaWiki\Revision\RevisionAccessException
Exception representing a failure to look up a revision.
Definition: RevisionAccessException.php:37
MediaWiki\Revision\RevisionRecord
Page revision base class.
Definition: RevisionRecord.php:47
WikiExporter\revsByRange
revsByRange( $start, $end)
Dumps a series of page and revision records for those pages in the database with revisions falling wi...
Definition: WikiExporter.php:207
WikiExporter\$revisionStore
RevisionStore $revisionStore
Definition: WikiExporter.php:85
WikiExporter\CURRENT
const CURRENT
Definition: WikiExporter.php:56
MediaWiki\Revision\RevisionStore
Service for looking up page revisions.
Definition: RevisionStore.php:88
WikiExporter\finishPageStreamOutput
finishPageStreamOutput( $lastRow)
Final page stream output, after all batches are complete.
Definition: WikiExporter.php:603
WikiExporter\getSlotRowBatch
getSlotRowBatch( $results, &$carry=null)
Returns all slot rows for a revision.
Definition: WikiExporter.php:576
WikiExporter\$dumpUploadFileContents
bool $dumpUploadFileContents
Definition: WikiExporter.php:50
LogEventsList\getExcludeClause
static getExcludeClause( $db, $audience='public', Authority $performer=null)
SQL clause to skip forbidden log types for this user.
Definition: LogEventsList.php:786
WikiExporter\__construct
__construct( $db, HookContainer $hookContainer, RevisionStore $revisionStore, TitleParser $titleParser, $history=self::CURRENT, $text=self::TEXT, $limitNamespaces=null)
Definition: WikiExporter.php:116
$res
$res
Definition: testCompression.php:57
WikiExporter\allLogs
allLogs()
Definition: WikiExporter.php:248
$revQuery
$revQuery
Definition: testCompression.php:56
WikiExporter\dumpFrom
dumpFrom( $cond='', $orderRevs=false)
Definition: WikiExporter.php:310
WikiExporter\openStream
openStream()
Definition: WikiExporter.php:158
Wikimedia\Rdbms\IDatabase
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:38
WikiExporter\STUB
const STUB
Definition: WikiExporter.php:62
WikiExporter\$text
int $text
Definition: WikiExporter.php:67
WikiExporter\$list_authors
bool $list_authors
Return distinct author list (when not returning full history)
Definition: WikiExporter.php:44
MWException
MediaWiki exception.
Definition: MWException.php:29
WikiExporter\$history
array int $history
Definition: WikiExporter.php:79
XmlDumpWriter\WRITE_CONTENT
const WRITE_CONTENT
Output serialized revision content.
Definition: XmlDumpWriter.php:42
WikiExporter\pagesByName
pagesByName( $names)
Definition: WikiExporter.php:242
$wgXmlDumpSchemaVersion
$wgXmlDumpSchemaVersion
The schema to use per default when generating XML dumps.
Definition: DefaultSettings.php:8437
Wikimedia\Rdbms\IResultWrapper
Result wrapper for grabbing data queried from an IDatabase object.
Definition: IResultWrapper.php:26
WikiExporter\TEXT
const TEXT
Definition: WikiExporter.php:61
Page\PageReference\getNamespace
getNamespace()
Returns the page's namespace number.
WikiExporter\outputLogStream
outputLogStream( $resultset)
Definition: WikiExporter.php:617
WikiExporter\closeStream
closeStream()
Definition: WikiExporter.php:163
WikiExporter\$limitNamespaces
array null $limitNamespaces
Definition: WikiExporter.php:82
WikiExporter\allPages
allPages()
Dumps a series of page and revision records for all pages in the database, either including complete ...
Definition: WikiExporter.php:173
WikiExporter\$dumpUploads
bool $dumpUploads
Definition: WikiExporter.php:47
WikiExporter\$hookRunner
HookRunner $hookRunner
Definition: WikiExporter.php:91
WikiExporter\STABLE
const STABLE
Definition: WikiExporter.php:57
WikiExporter\setSchemaVersion
setSchemaVersion( $schemaVersion)
Definition: WikiExporter.php:143
TitleParser
A title parser service for MediaWiki.
Definition: TitleParser.php:33
DumpOutput
Definition: DumpOutput.php:29
WikiExporter
Definition: WikiExporter.php:42
WikiExporter\$author_list
string $author_list
Definition: WikiExporter.php:53
WikiExporter\dumpPages
dumpPages( $cond, $orderRevs)
Definition: WikiExporter.php:378
WikiExporter\pagesByRange
pagesByRange( $start, $end, $orderRevs)
Dumps a series of page and revision records for those pages in the database falling within the page_i...
Definition: WikiExporter.php:185
Page\PageReference\getDBkey
getDBkey()
Get the page title in DB key form.
WikiExporter\do_list_authors
do_list_authors( $cond)
Generates the distinct list of authors of an article Not called by default (depends on $this->list_au...
Definition: WikiExporter.php:271
WikiExporter\FULL
const FULL
Definition: WikiExporter.php:55
WikiExporter\pageByTitle
pageByTitle(PageIdentity $page)
Definition: WikiExporter.php:218
WikiExporter\$writer
XmlDumpWriter $writer
Definition: WikiExporter.php:73
WikiExporter\$titleParser
TitleParser $titleParser
Definition: WikiExporter.php:88
WikiExporter\RANGE
const RANGE
Definition: WikiExporter.php:59
XmlDumpWriter\WRITE_STUB
const WRITE_STUB
Only output subs for revision content.
Definition: XmlDumpWriter.php:45
WikiExporter\setOutputSink
setOutputSink(&$sink)
Set the DumpOutput or DumpFilter object which will receive various row objects and XML output for fil...
Definition: WikiExporter.php:154
MalformedTitleException
MalformedTitleException is thrown when a TitleParser is unable to parse a title string.
Definition: MalformedTitleException.php:26
WikiExporter\LOGS
const LOGS
Definition: WikiExporter.php:58
WikiExporter\BATCH_SIZE
const BATCH_SIZE
Definition: WikiExporter.php:64
XmlDumpWriter
Definition: XmlDumpWriter.php:39
WikiExporter\logsByRange
logsByRange( $start, $end)
Definition: WikiExporter.php:256
MediaWiki\HookContainer\HookContainer
HookContainer class.
Definition: HookContainer.php:45
WikiExporter\outputPageStreamBatch
outputPageStreamBatch( $results, $lastRow)
Runs through a query result set dumping page, revision, and slot records.
Definition: WikiExporter.php:517
MediaWiki\HookContainer\HookRunner
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
Definition: HookRunner.php:558
MWDebug\warning
static warning( $msg, $callerOffset=1, $level=E_USER_NOTICE, $log='auto')
Adds a warning entry to the log.
Definition: MWDebug.php:180
WikiExporter\dumpLogs
dumpLogs( $cond)
Definition: WikiExporter.php:322
WikiExporter\pageByName
pageByName( $name)
Definition: WikiExporter.php:228
WikiExporter\$db
IDatabase $db
Definition: WikiExporter.php:76
WikiExporter\$sink
DumpOutput $sink
Definition: WikiExporter.php:70
CommentStore\getStore
static getStore()
Definition: CommentStore.php:120