MediaWiki  master
WikiExporter.php
Go to the documentation of this file.
1 <?php
30 use MediaWiki\MediaWikiServices as MediaWikiServicesAlias;
34 
38 class WikiExporter {
40  public $list_authors = false;
41 
43  public $dumpUploads = false;
44 
46  public $dumpUploadFileContents = false;
47 
49  public $author_list = "";
50 
51  public const FULL = 1;
52  public const CURRENT = 2;
53  public const STABLE = 4; // extension defined
54  public const LOGS = 8;
55  public const RANGE = 16;
56 
59 
60  protected const BATCH_SIZE = 50000;
61 
63  public $text;
64 
66  public $sink;
67 
69  private $writer;
70 
72  protected $db;
73 
75  protected $history;
76 
78  protected $limitNamespaces;
79 
84  public static function schemaVersion() {
87  }
88 
/**
 * Set up an exporter with a fresh XML writer and a default output sink.
 *
 * @param IDatabase $db Database handle used for all export queries
 * @param array|int $history One of the history constants (self::FULL,
 *   self::CURRENT, self::STABLE, self::LOGS, self::RANGE), or an array with
 *   'dir', 'offset' and 'limit' keys as read by dumpPages()
 * @param int $text Text mode, compared against self::STUB when building queries
 * @param array|null $limitNamespaces Namespaces to restrict page output to,
 *   or null for no restriction
 */
public function __construct(
	$db,
	$history = self::CURRENT,
	$text = self::TEXT,
	$limitNamespaces = null
) {
	// Plain configuration values.
	$this->db = $db;
	$this->history = $history;
	$this->text = $text;
	$this->limitNamespaces = $limitNamespaces;

	// Collaborators: the writer renders XML, the sink receives it.
	$this->writer = new XmlDumpWriter( $text, self::schemaVersion() );
	$this->sink = new DumpOutput();
}
113 
/**
 * Replace the XML writer with one targeting the given schema version.
 * The text mode currently stored in $this->text is reused.
 *
 * @param mixed $schemaVersion Schema version, passed through to XmlDumpWriter
 */
public function setSchemaVersion( $schemaVersion ) {
	$replacementWriter = new XmlDumpWriter( $this->text, $schemaVersion );
	$this->writer = $replacementWriter;
}
122 
/**
 * Set the object that receives the row objects and XML produced by the
 * export (written to via writeOpenStream(), writeOpenPage(), etc.).
 *
 * The sink is bound by reference, so the caller's variable and
 * $this->sink stay aliased to each other.
 *
 * @param DumpOutput &$sink
 */
public function setOutputSink( &$sink ) {
	// Reference assignment is deliberate: do not replace with a plain copy.
	$this->sink =& $sink;
}
133 
/**
 * Emit the opening part of the dump: ask the writer for it and hand the
 * result to the sink.
 */
public function openStream() {
	$header = $this->writer->openStream();
	$this->sink->writeOpenStream( $header );
}
138 
/**
 * Emit the closing part of the dump: ask the writer for it and hand the
 * result to the sink.
 */
public function closeStream() {
	$footer = $this->writer->closeStream();
	$this->sink->writeCloseStream( $footer );
}
143 
/**
 * Dump page and revision records for all pages, i.e. without any
 * caller-supplied condition. What is dumped per page depends on the
 * history setting given to the constructor.
 */
public function allPages() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
152 
/**
 * Dump page and revision records for pages whose id lies in
 * [$start, $end).
 *
 * @param int $start Inclusive lower bound of the page id range
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 * @param bool $orderRevs When true the range is expressed on rev_page,
 *   otherwise on page_id; the flag is also forwarded to dumpFrom()
 */
public function pagesByRange( $start, $end, $orderRevs ) {
	// Both variants build the same condition, only the column differs.
	$column = $orderRevs ? 'rev_page' : 'page_id';

	$condition = $column . ' >= ' . intval( $start );
	if ( $end ) {
		$condition .= ' AND ' . $column . ' < ' . intval( $end );
	}

	$this->dumpFrom( $condition, $orderRevs );
}
175 
/**
 * Dump page and revision records for revisions whose id lies in
 * [$start, $end).
 *
 * @param int $start Inclusive lower bound of the revision id range
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 */
public function revsByRange( $start, $end ) {
	$bounds = [ 'rev_id >= ' . intval( $start ) ];
	if ( $end ) {
		$bounds[] = 'rev_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $bounds ) );
}
190 
/**
 * Dump the page identified by the given title object.
 *
 * @param Title $title Title whose namespace and DB key select the page
 */
public function pageByTitle( $title ) {
	$namespaceCond = 'page_namespace=' . $title->getNamespace();
	// The DB key is a string, so it goes through the database's quoting.
	$titleCond = 'page_title=' . $this->db->addQuotes( $title->getDBkey() );

	$this->dumpFrom( $namespaceCond . ' AND ' . $titleCond );
}
199 
/**
 * Dump the page with the given name.
 *
 * @param string $name Page name, parsed with Title::newFromText()
 * @throws MWException If the name does not yield a valid title
 */
public function pageByName( $name ) {
	$title = Title::newFromText( $name );

	// Guard: nothing can be exported for an unparsable title.
	if ( $title === null ) {
		throw new MWException( "Can't export invalid title" );
	}

	$this->pageByTitle( $title );
}
212 
/**
 * Dump several pages, one after another, in the order given.
 *
 * @param iterable $names Page names, each handled by pageByName()
 * @throws MWException Propagated from pageByName() for an invalid title;
 *   pages before the invalid one have already been dumped at that point
 */
public function pagesByName( $names ) {
	foreach ( $names as $pageName ) {
		$this->pageByName( $pageName );
	}
}
221 
/**
 * Dump without any caller-supplied condition. Log entries are produced
 * only when the history setting includes self::LOGS, since dumpFrom()
 * decides between the log and page paths.
 */
public function allLogs() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
225 
/**
 * Dump entries whose log id lies in [$start, $end).
 *
 * @param int $start Inclusive lower bound of the log id range
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 */
public function logsByRange( $start, $end ) {
	$bounds = [ 'log_id >= ' . intval( $start ) ];
	if ( $end ) {
		$bounds[] = 'log_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $bounds ) );
}
237 
/**
 * Build the <contributors> XML fragment listing the distinct authors of
 * the revisions matching $cond, and store it in $this->author_list.
 * Revisions whose user is hidden (RevisionRecord::DELETED_USER) are left out.
 *
 * Only called from dumpPages() when $this->list_authors is set.
 *
 * @param string $cond SQL condition selecting the revisions to consider
 */
protected function do_list_authors( $cond ) {
	// Opened before the query runs, so the property holds the start tag
	// from the first statement on.
	$this->author_list = "<contributors>";

	$queryInfo = MediaWikiServicesAlias::getInstance()
		->getRevisionStore()
		->getQueryInfo( [ 'page' ] );

	$selectedFields = [
		'rev_user_text' => $queryInfo['fields']['rev_user_text'],
		'rev_user' => $queryInfo['fields']['rev_user'],
	];
	$where = [
		// Skip revisions whose author has been suppressed.
		$this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0',
		$cond,
	];

	$authors = $this->db->select(
		$queryInfo['tables'],
		$selectedFields,
		$where,
		__METHOD__,
		[ 'DISTINCT' ],
		$queryInfo['joins']
	);

	foreach ( $authors as $author ) {
		$escapedName = htmlspecialchars( $author->rev_user_text );
		$userId = (int)$author->rev_user;
		$this->author_list .=
			"<contributor><username>{$escapedName}</username><id>{$userId}</id></contributor>";
	}

	$this->author_list .= "</contributors>";
}
279 
/**
 * Dispatch a dump request to the log or the page exporter, depending on
 * the history setting.
 *
 * @param string $cond SQL condition restricting what is dumped, '' for none
 * @param bool $orderRevs Forwarded to dumpPages()
 * @throws MWException Propagated from dumpPages()
 */
protected function dumpFrom( $cond = '', $orderRevs = false ) {
	// $this->history may be an array (offset/limit mode, see dumpPages()).
	// Applying a bitwise operator to an array throws a TypeError on PHP 8,
	// and an array never requests a log dump, so test the flag only for
	// non-array values.
	$wantsLogs = !is_array( $this->history ) && ( $this->history & self::LOGS );

	if ( $wantsLogs ) {
		$this->dumpLogs( $cond );
	} else {
		$this->dumpPages( $cond, $orderRevs );
	}
}
293 
/**
 * Export log entries in batches of self::BATCH_SIZE rows, ordered by
 * log_id, passing each batch to outputLogStream().
 *
 * @param string $cond Extra SQL condition; ignored when falsy
 */
protected function dumpLogs( $cond ) {
	// Conditions that apply to every batch.
	$baseConds = [];

	// Hide private logs.
	$excludeClause = LogEventsList::getExcludeClause( $this->db );
	if ( $excludeClause ) {
		$baseConds[] = $excludeClause;
	}
	// Add on any caller specified conditions.
	if ( $cond ) {
		$baseConds[] = $cond;
	}

	$commentJoin = CommentStore::getStore()->getJoin( 'log_comment' );
	$actorJoin = ActorMigration::newMigration()->getJoin( 'log_user' );

	$tables = array_merge(
		[ 'logging' ],
		$commentJoin['tables'],
		$actorJoin['tables'],
		[ 'user' ]
	);
	$fields = [
		'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
		'log_title', 'log_params', 'log_deleted', 'user_name'
	] + $commentJoin['fields'] + $actorJoin['fields'];
	$joinConds = [
		'user' => [ 'JOIN', 'user_id = ' . $actorJoin['fields']['log_user'] ]
	] + $commentJoin['joins'] + $actorJoin['joins'];
	$options = [
		'ORDER BY' => 'log_id',
		'USE INDEX' => [ 'logging' => 'PRIMARY' ],
		'LIMIT' => self::BATCH_SIZE,
	];

	// Keyset pagination: each batch starts after the last id already written.
	$lastLogId = 0;
	do {
		$batch = $this->db->select(
			$tables,
			$fields,
			array_merge( $baseConds, [ 'log_id > ' . intval( $lastLogId ) ] ),
			__METHOD__,
			$options,
			$joinConds
		);

		$gotRows = (bool)$batch->numRows();
		if ( $gotRows ) {
			$lastLogId = $this->outputLogStream( $batch );
		}
	} while ( $gotRows );
}
348 
/**
 * Export page, revision and slot rows in batches of self::BATCH_SIZE,
 * streaming each batch through outputPageStreamBatch() and closing the
 * last page with finishPageStreamOutput().
 *
 * The query shape depends on $this->history:
 *  - array: time-ordered dump driven by the 'dir', 'offset' and 'limit' keys
 *  - self::FULL: full history
 *  - self::CURRENT: latest revision of each page
 *  - self::STABLE: query must be supplied by a WikiExporter::dumpStableQuery hook
 *  - self::RANGE: revision range, condition already present in $cond
 *
 * @param string $cond Extra SQL condition, '' for none
 * @param bool $orderRevs Accepted for signature compatibility; not read here
 * @throws MWException If the history setting matches no known dump type, or
 *   if no hook handles a self::STABLE dump
 */
protected function dumpPages( $cond, $orderRevs ) {
	$revisionStore = MediaWikiServicesAlias::getInstance()->getRevisionStore();
	$revQuery = $revisionStore->getQueryInfo( [ 'page' ] );
	$slotQuery = $revisionStore->getSlotsQueryInfo( [ 'content' ] );

	// We want page primary rather than revision.
	// We also want to join in the slots and content tables.
	// NOTE: This means we may get multiple rows per revision, and more rows
	// than the batch size! Should be ok, since the max number of slots is
	// fixed and low (dozens at worst).
	$tables = array_merge( [ 'page' ], array_diff( $revQuery['tables'], [ 'page' ] ) );
	$tables = array_merge( $tables, array_diff( $slotQuery['tables'], $tables ) );
	$join = $revQuery['joins'] + [
		'revision' => $revQuery['joins']['page'],
		'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
		'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
	];
	unset( $join['page'] );

	$fields = array_merge( $revQuery['fields'], $slotQuery['fields'] );
	$fields[] = 'page_restrictions';

	if ( $this->text != self::STUB ) {
		$fields['_load_content'] = '1';
	}

	$conds = [];
	if ( $cond !== '' ) {
		$conds[] = $cond;
	}
	$opts = [ 'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ] ];
	$opts['USE INDEX'] = [];

	// Overall row cap; only the array form of $this->history can set it.
	$maxRowCount = null;

	// Comparison operator used on rev_id when paging within one page.
	$op = '>';
	if ( is_array( $this->history ) ) {
		# Time offset/limit for all pages/history...
		# Set time order. A missing 'dir' key is treated like any non-'asc'
		# value (descending) instead of raising an undefined-index warning.
		if ( ( $this->history['dir'] ?? null ) == 'asc' ) {
			$opts['ORDER BY'] = 'rev_timestamp ASC';
		} else {
			$op = '<';
			$opts['ORDER BY'] = 'rev_timestamp DESC';
		}
		# Set offset
		if ( !empty( $this->history['offset'] ) ) {
			$conds[] = "rev_timestamp $op " .
				$this->db->addQuotes( $this->db->timestamp( $this->history['offset'] ) );
		}
		# Set query limit
		if ( !empty( $this->history['limit'] ) ) {
			$maxRowCount = intval( $this->history['limit'] );
		}
	} elseif ( $this->history & self::FULL ) {
		# Full history dumps...
		# query optimization for history stub dumps
		if ( $this->text == self::STUB ) {
			$opts[] = 'STRAIGHT_JOIN';
			$opts['USE INDEX']['revision'] = 'rev_page_id';
			unset( $join['revision'] );
			$join['page'] = [ 'JOIN', 'rev_page=page_id' ];
		}
	} elseif ( $this->history & self::CURRENT ) {
		# Latest revision dumps...
		if ( $this->list_authors && $cond != '' ) { // List authors, if so desired
			$this->do_list_authors( $cond );
		}
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		$opts[ 'ORDER BY' ] = [ 'page_id ASC' ];
	} elseif ( $this->history & self::STABLE ) {
		# "Stable" revision dumps...
		# Default JOIN, to be overridden...
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		# One, and only one hook should set this, and return false
		if ( Hooks::run( 'WikiExporter::dumpStableQuery', [ &$tables, &$opts, &$join ] ) ) {
			throw new MWException( __METHOD__ . " given invalid history dump type." );
		}
	} elseif ( $this->history & self::RANGE ) {
		# Dump of revisions within a specified range. Condition already set in revsByRange().
	} else {
		# Unknown history specification parameter?
		throw new MWException( __METHOD__ . " given invalid history dump type." );
	}

	$done = false;
	$lastRow = null;
	$revPage = 0;
	$revId = 0;
	$rowCount = 0;

	$opts['LIMIT'] = self::BATCH_SIZE;

	// $cond has already been copied into $conds above and is not read again
	// below; it is still passed so the hook's argument list stays unchanged.
	Hooks::run( 'ModifyExportQuery',
		[ $this->db, &$tables, &$cond, &$opts, &$join, &$conds ] );

	while ( !$done ) {
		// If necessary, impose the overall maximum and stop looping after this iteration.
		if ( !empty( $maxRowCount ) && $rowCount + self::BATCH_SIZE > $maxRowCount ) {
			$opts['LIMIT'] = $maxRowCount - $rowCount;
			$done = true;
		}

		// Resume after the last (page, revision) pair already written.
		$queryConds = $conds;
		$queryConds[] = 'rev_page>' . intval( $revPage ) . ' OR (rev_page=' .
			intval( $revPage ) . ' AND rev_id' . $op . intval( $revId ) . ')';

		# Do the query and process any results, remembering max ids for the next iteration.
		$result = $this->db->select(
			$tables,
			$fields,
			$queryConds,
			__METHOD__,
			$opts,
			$join
		);
		if ( $result->numRows() > 0 ) {
			$lastRow = $this->outputPageStreamBatch( $result, $lastRow );
			$rowCount += $result->numRows();
			$revPage = $lastRow->rev_page;
			$revId = $lastRow->rev_id;
		} else {
			$done = true;
		}

		// If we are finished, close off final page element (if any).
		if ( $done && $lastRow ) {
			$this->finishPageStreamOutput( $lastRow );
		}
	}
}
488 
/**
 * Run through a query result set, dumping page, revision and slot records.
 *
 * Rows are consumed one revision at a time via getSlotRowBatch(). A page
 * element is opened whenever the namespace or title differs from the
 * previously seen row, closing the previous one first. The final page is
 * left open, because the next batch may continue it.
 *
 * @param IResultWrapper $results Rows of one query batch
 * @param object|null $lastRow Last row handled by the previous batch, or
 *   null when this is the first batch
 * @return object|null The last revision row seen, including rows skipped by
 *   the namespace filter; $lastRow unchanged if the result set was empty
 * @throws LogicException If a row is still pending after the result set has
 *   been exhausted
 */
protected function outputPageStreamBatch( $results, $lastRow ) {
	$pendingRow = null;

	while ( ( $slotRows = $this->getSlotRowBatch( $results, $pendingRow ) ) ) {
		// All revision info is present in all slot rows.
		// Use the first slot row as the revision row.
		$revRow = $slotRows[0];

		// Loose in_array() comparison, exactly as before.
		$filteredOut = $this->limitNamespaces &&
			!in_array( $revRow->page_namespace, $this->limitNamespaces );

		if ( !$filteredOut ) {
			$startsNewPage = $lastRow === null ||
				$lastRow->page_namespace !== $revRow->page_namespace ||
				$lastRow->page_title !== $revRow->page_title;

			if ( $startsNewPage ) {
				if ( $lastRow !== null ) {
					// Close the page that was open until now.
					$closing = '';
					if ( $this->dumpUploads ) {
						$closing .= $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
					}
					$closing .= $this->writer->closePage();
					$this->sink->writeClosePage( $closing );
				}
				$opening = $this->writer->openPage( $revRow );
				$this->sink->writeOpenPage( $revRow, $opening );
			}

			$revisionXml = $this->writer->writeRevision( $revRow, $slotRows );
			$this->sink->writeRevision( $revRow, $revisionXml );
		}

		// Filtered rows are remembered as well.
		$lastRow = $revRow;
	}

	if ( $pendingRow ) {
		throw new LogicException( 'Error while processing a stream of slot rows' );
	}

	return $lastRow;
}
542 
/**
 * Return all slot rows for one revision.
 *
 * Reads rows from $results until the rev_id changes. The first row of the
 * following revision cannot be pushed back, so it is handed to the caller
 * through $carry and becomes the first row of the next call's batch.
 *
 * @param IResultWrapper $results Result set to read from
 * @param object|null &$carry In: row left over from the previous call, if
 *   any. Out: first row of the next revision, or null if none was read.
 * @return object[] Rows sharing one rev_id; empty when nothing is left
 */
protected function getSlotRowBatch( $results, &$carry = null ) {
	$batch = [];
	$previous = null;

	// Start with the row left over from the previous call.
	if ( $carry ) {
		$batch[] = $carry;
		$previous = $carry;
		$carry = null;
	}

	for ( $row = $results->fetchObject(); $row; $row = $results->fetchObject() ) {
		$belongsToNextRevision = $previous && $previous->rev_id !== $row->rev_id;
		if ( $belongsToNextRevision ) {
			$carry = $row;
			break;
		}

		$batch[] = $row;
		$previous = $row;
	}

	return $batch;
}
573 
/**
 * Final page stream output, after all batches are complete: writes the
 * upload records (if enabled), the collected author list and the closing
 * page markup to the sink.
 *
 * @param object $lastRow The last row output from the previous call (or
 *   batch); passed to the writer when upload records are written
 */
protected function finishPageStreamOutput( $lastRow ) {
	$pieces = [];

	if ( $this->dumpUploads ) {
		$pieces[] = $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
	}
	// Empty string unless do_list_authors() has filled it in.
	$pieces[] = $this->author_list;
	$pieces[] = $this->writer->closePage();

	$this->sink->writeClosePage( implode( '', $pieces ) );
}
588 
/**
 * Write every log row of a result set to the sink.
 *
 * @param IResultWrapper $resultset Log rows of one batch
 * @return int|null log_id of the last row written, or null if the result
 *   set was empty or the last row carries no log_id
 */
protected function outputLogStream( $resultset ) {
	// Track the last row explicitly rather than relying on the loop
	// variable surviving the loop.
	$lastLogRow = null;

	foreach ( $resultset as $logRow ) {
		$itemXml = $this->writer->writeLogItem( $logRow );
		$this->sink->writeLogItem( $logRow, $itemXml );
		$lastLogRow = $logRow;
	}

	return $lastLogRow->log_id ?? null;
}
600 }
WikiExporter\schemaVersion
static schemaVersion()
Returns the default export schema version, as defined by $wgXmlDumpSchemaVersion.
Definition: WikiExporter.php:84
Title\newFromText
static newFromText( $text, $defaultNamespace=NS_MAIN)
Create a new Title from text, such as what one would find in a link.
Definition: Title.php:332
Revision\RevisionRecord
Page revision base class.
Definition: RevisionRecord.php:46
WikiExporter\revsByRange
revsByRange( $start, $end)
Dumps a series of page and revision records for those pages in the database with revisions falling wi...
Definition: WikiExporter.php:183
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:144
WikiExporter\CURRENT
const CURRENT
Definition: WikiExporter.php:52
WikiExporter\finishPageStreamOutput
finishPageStreamOutput( $lastRow)
Final page stream output, after all batches are complete.
Definition: WikiExporter.php:579
WikiExporter\getSlotRowBatch
getSlotRowBatch( $results, &$carry=null)
Returns all slot rows for a revision.
Definition: WikiExporter.php:552
WikiExporter\$dumpUploadFileContents
bool $dumpUploadFileContents
Definition: WikiExporter.php:46
$res
$res
Definition: testCompression.php:57
WikiExporter\allLogs
allLogs()
Definition: WikiExporter.php:222
$revQuery
$revQuery
Definition: testCompression.php:56
WikiExporter\dumpFrom
dumpFrom( $cond='', $orderRevs=false)
Definition: WikiExporter.php:286
ActorMigration\newMigration
static newMigration()
Static constructor.
Definition: ActorMigration.php:139
WikiExporter\openStream
openStream()
Definition: WikiExporter.php:134
Wikimedia\Rdbms\IDatabase
Basic database interface for live and lazy-loaded relation database handles.
Definition: IDatabase.php:38
WikiExporter\STUB
const STUB
Definition: WikiExporter.php:58
WikiExporter\$text
int $text
Definition: WikiExporter.php:63
WikiExporter\$list_authors
bool $list_authors
Return distinct author list (when not returning full history)
Definition: WikiExporter.php:40
MWException
MediaWiki exception.
Definition: MWException.php:26
WikiExporter\$history
array|int $history
Definition: WikiExporter.php:75
XmlDumpWriter\WRITE_CONTENT
const WRITE_CONTENT
Output serialized revision content.
Definition: XmlDumpWriter.php:40
WikiExporter\pagesByName
pagesByName( $names)
Definition: WikiExporter.php:216
$wgXmlDumpSchemaVersion
$wgXmlDumpSchemaVersion
The schema to use per default when generating XML dumps.
Definition: DefaultSettings.php:9391
Wikimedia\Rdbms\IResultWrapper
Result wrapper for grabbing data queried from an IDatabase object.
Definition: IResultWrapper.php:24
WikiExporter\TEXT
const TEXT
Definition: WikiExporter.php:57
WikiExporter\outputLogStream
outputLogStream( $resultset)
Definition: WikiExporter.php:593
WikiExporter\closeStream
closeStream()
Definition: WikiExporter.php:139
WikiExporter\$limitNamespaces
array|null $limitNamespaces
Definition: WikiExporter.php:78
WikiExporter\allPages
allPages()
Dumps a series of page and revision records for all pages in the database, either including complete ...
Definition: WikiExporter.php:149
WikiExporter\$dumpUploads
bool $dumpUploads
Definition: WikiExporter.php:43
$title
$title
Definition: testCompression.php:38
WikiExporter\STABLE
const STABLE
Definition: WikiExporter.php:53
WikiExporter\setSchemaVersion
setSchemaVersion( $schemaVersion)
Definition: WikiExporter.php:119
DumpOutput
Definition: DumpOutput.php:29
WikiExporter
Definition: WikiExporter.php:38
WikiExporter\$author_list
string $author_list
Definition: WikiExporter.php:49
WikiExporter\dumpPages
dumpPages( $cond, $orderRevs)
Definition: WikiExporter.php:355
WikiExporter\pagesByRange
pagesByRange( $start, $end, $orderRevs)
Dumps a series of page and revision records for those pages in the database falling within the page_i...
Definition: WikiExporter.php:161
LogEventsList\getExcludeClause
static getExcludeClause( $db, $audience='public', User $user=null)
SQL clause to skip forbidden log types for this user.
Definition: LogEventsList.php:792
WikiExporter\do_list_authors
do_list_authors( $cond)
Generates the distinct list of authors of an article Not called by default (depends on $this->list_au...
Definition: WikiExporter.php:245
WikiExporter\FULL
const FULL
Definition: WikiExporter.php:51
WikiExporter\__construct
__construct( $db, $history=self::CURRENT, $text=self::TEXT, $limitNamespaces=null)
Definition: WikiExporter.php:100
WikiExporter\$writer
XmlDumpWriter $writer
Definition: WikiExporter.php:69
WikiExporter\RANGE
const RANGE
Definition: WikiExporter.php:55
XmlDumpWriter\WRITE_STUB
const WRITE_STUB
Only output stubs for revision content.
Definition: XmlDumpWriter.php:43
WikiExporter\setOutputSink
setOutputSink(&$sink)
Set the DumpOutput or DumpFilter object which will receive various row objects and XML output for fil...
Definition: WikiExporter.php:130
WikiExporter\LOGS
const LOGS
Definition: WikiExporter.php:54
WikiExporter\BATCH_SIZE
const BATCH_SIZE
Definition: WikiExporter.php:60
XmlDumpWriter
Definition: XmlDumpWriter.php:37
WikiExporter\logsByRange
logsByRange( $start, $end)
Definition: WikiExporter.php:230
WikiExporter\pageByTitle
pageByTitle( $title)
Definition: WikiExporter.php:194
WikiExporter\outputPageStreamBatch
outputPageStreamBatch( $results, $lastRow)
Runs through a query result set dumping page, revision, and slot records.
Definition: WikiExporter.php:498
WikiExporter\dumpLogs
dumpLogs( $cond)
Definition: WikiExporter.php:298
WikiExporter\pageByName
pageByName( $name)
Definition: WikiExporter.php:204
WikiExporter\$db
IDatabase $db
Definition: WikiExporter.php:72
WikiExporter\$sink
DumpOutput $sink
Definition: WikiExporter.php:66
CommentStore\getStore
static getStore()
Definition: CommentStore.php:109
Hooks\run
static run( $event, array $args=[], $deprecatedVersion=null)
Call hook functions defined in Hooks::register and $wgHooks.
Definition: Hooks.php:133