MediaWiki 1.40.4
WikiExporter.php
Go to the documentation of this file.
1<?php
41
/** @var bool Return distinct author list (when not returning full history) */
public $list_authors = false;

/** @var bool Whether upload records are written when a page element is closed */
public $dumpUploads = false;

/** @var bool Passed to the writer's writeUploads() alongside the page row */
public $dumpUploadFileContents = false;

/** @var string XML fragment built by do_list_authors(), appended before the final page close */
public $author_list = "";

// History selection flags, tested with bitwise AND against $history.
public const FULL = 1;
public const CURRENT = 2;
public const STABLE = 4; // extension defined
public const LOGS = 8;
public const RANGE = 16;

// Text modes, forwarded to XmlDumpWriter.
public const TEXT = XmlDumpWriter::WRITE_CONTENT;
public const STUB = XmlDumpWriter::WRITE_STUB;

/** Maximum number of rows requested per query (used as the LIMIT of each batch) */
protected const BATCH_SIZE = 50000;

/** @var mixed Text mode given to the constructor (self::TEXT or self::STUB) */
public $text;

/** @var DumpOutput */
public $sink;

/** @var XmlDumpWriter */
private $writer;

/** @var IDatabase */
protected $db;

/** @var array|int */
protected $history;

/**
 * Declared explicitly because the constructor assigns it and
 * outputPageStreamBatch() reads it; relying on a dynamic property is
 * deprecated as of PHP 8.2.
 *
 * @var array|null
 */
protected $limitNamespaces;

/** @var RevisionStore */
private $revisionStore;

/** @var TitleParser */
private $titleParser;

/** @var HookRunner */
private $hookRunner;

/** @var CommentStore */
private $commentStore;
98
/**
 * Returns the default export schema version, as defined by the
 * XmlDumpSchemaVersion setting.
 *
 * @return mixed Value of the XmlDumpSchemaVersion main config setting
 */
public static function schemaVersion() {
	$mainConfig = MediaWikiServices::getInstance()->getMainConfig();

	return $mainConfig->get( MainConfigNames::XmlDumpSchemaVersion );
}
107
/**
 * Wires up the exporter: stores the injected services, creates the XML
 * writer for the requested text mode and installs a default DumpOutput sink.
 *
 * @param IDatabase $db Database handle used for all dump queries
 * @param CommentStore $commentStore
 * @param HookContainer $hookContainer Wrapped in a HookRunner and also handed to the writer
 * @param RevisionStore $revisionStore
 * @param TitleParser $titleParser
 * @param array|int $history Either a bitmask of the history constants
 *   (FULL, CURRENT, STABLE, LOGS, RANGE) or an array read by dumpPages()
 *   via its 'dir', 'offset' and 'limit' keys
 * @param mixed $text self::TEXT or self::STUB
 * @param array|null $limitNamespaces Namespaces to keep; a falsy value disables filtering
 */
public function __construct(
	$db,
	CommentStore $commentStore,
	HookContainer $hookContainer,
	RevisionStore $revisionStore,
	TitleParser $titleParser,
	$history = self::CURRENT,
	$text = self::TEXT,
	$limitNamespaces = null
) {
	// Injected services.
	$this->db = $db;
	$this->commentStore = $commentStore;
	$this->revisionStore = $revisionStore;
	$this->titleParser = $titleParser;
	$this->hookRunner = new HookRunner( $hookContainer );

	// Dump configuration.
	$this->history = $history;
	$this->text = $text;
	$this->limitNamespaces = $limitNamespaces;

	// Output pipeline: the writer produces XML, the sink receives it.
	$schemaVersion = self::schemaVersion();
	$this->writer = new XmlDumpWriter( $text, $schemaVersion, $hookContainer, $commentStore );
	$this->sink = new DumpOutput();
}
148
/**
 * Replaces the XML writer with a new one that uses the given schema version
 * and the text mode this exporter was constructed with.
 *
 * NOTE(review): unlike the constructor, no hook container or comment store
 * is passed to XmlDumpWriter here - confirm the writer falls back to
 * suitable defaults.
 *
 * @param mixed $schemaVersion Schema version handed to XmlDumpWriter
 */
public function setSchemaVersion( $schemaVersion ) {
	$textMode = $this->text;
	$this->writer = new XmlDumpWriter( $textMode, $schemaVersion );
}
157
/**
 * Set the DumpOutput or DumpFilter object which will receive the row
 * objects and XML output produced by this exporter.
 *
 * The sink is bound by reference, so $this->sink aliases the caller's
 * variable rather than holding a copy of it.
 *
 * @param DumpOutput &$sink
 */
public function setOutputSink( &$sink ) {
	$this->sink =& $sink;
}
168
/**
 * Asks the writer for the stream-opening output and forwards it to the sink.
 */
public function openStream() {
	$this->sink->writeOpenStream( $this->writer->openStream() );
}
173
/**
 * Asks the writer for the stream-closing output and forwards it to the sink.
 */
public function closeStream() {
	$this->sink->writeCloseStream( $this->writer->closeStream() );
}
178
/**
 * Dumps page and revision records for all pages in the database, by
 * running the dump with an empty condition.
 */
public function allPages() {
	$noRestriction = '';
	$this->dumpFrom( $noRestriction );
}
187
/**
 * Dumps page and revision records for pages whose ID lies in
 * [$start, $end). The upper bound is only applied when $end is truthy.
 *
 * @param int $start Inclusive lower bound (passed through intval())
 * @param int $end Exclusive upper bound (passed through intval()); falsy for none
 * @param bool $orderRevs When truthy the range is expressed on rev_page
 *   instead of page_id; the flag is also forwarded to dumpFrom()
 */
public function pagesByRange( $start, $end, $orderRevs ) {
	// Same range test either way; only the column it is applied to differs.
	$column = $orderRevs ? 'rev_page' : 'page_id';

	$condition = "$column >= " . intval( $start );
	if ( $end ) {
		$condition .= " AND $column < " . intval( $end );
	}

	$this->dumpFrom( $condition, $orderRevs );
}
210
/**
 * Dumps page and revision records for revisions whose rev_id lies in
 * [$start, $end). The upper bound is only applied when $end is truthy.
 *
 * @param int $start Inclusive lower bound (passed through intval())
 * @param int $end Exclusive upper bound (passed through intval()); falsy for none
 */
public function revsByRange( $start, $end ) {
	$bounds = [ 'rev_id >= ' . intval( $start ) ];
	if ( $end ) {
		$bounds[] = 'rev_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $bounds ) );
}
225
/**
 * Dumps the single page identified by the given page identity, matching on
 * its namespace number and DB key.
 *
 * @param PageIdentity $page
 */
public function pageByTitle( PageIdentity $page ) {
	$namespaceCond = 'page_namespace=' . $page->getNamespace();
	// The DB key is a string, so it goes through the database's quoting.
	$titleCond = 'page_title=' . $this->db->addQuotes( $page->getDBkey() );

	$this->dumpFrom( $namespaceCond . ' AND ' . $titleCond );
}
234
/**
 * Dumps the single page with the given title text.
 *
 * The name is parsed with the title parser and the page is then selected by
 * namespace number and DB key.
 *
 * @param string $name Title text to parse
 * @throws MWException If the title cannot be parsed; the original
 *   MalformedTitleException is attached as the previous exception so the
 *   reason for the parse failure is not lost
 */
public function pageByName( $name ) {
	try {
		$link = $this->titleParser->parseTitle( $name );
		$this->dumpFrom(
			'page_namespace=' . $link->getNamespace() .
			' AND page_title=' . $this->db->addQuotes( $link->getDBkey() ) );
	} catch ( MalformedTitleException $ex ) {
		// Same message and exception type as before; only the cause is chained.
		throw new MWException( "Can't export invalid title", 0, $ex );
	}
}
249
/**
 * Dumps each of the named pages in turn, in the order given.
 *
 * @param iterable $names Title texts, each handed to pageByName()
 */
public function pagesByName( $names ) {
	foreach ( $names as $titleText ) {
		$this->pageByName( $titleText );
	}
}
258
/**
 * Runs the dump with an empty condition. dumpFrom() routes this to the log
 * dump only when $history is an integer with the LOGS bit set.
 */
public function allLogs() {
	$noRestriction = '';
	$this->dumpFrom( $noRestriction );
}
262
/**
 * Runs the dump restricted to log_id values in [$start, $end). The upper
 * bound is only applied when $end is truthy.
 *
 * @param int $start Inclusive lower bound (passed through intval())
 * @param int $end Exclusive upper bound (passed through intval()); falsy for none
 */
public function logsByRange( $start, $end ) {
	$bounds = [ 'log_id >= ' . intval( $start ) ];
	if ( $end ) {
		$bounds[] = 'log_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $bounds ) );
}
274
/**
 * Generates the distinct list of authors matching the condition and stores
 * it in $this->author_list as a <contributors> XML fragment.
 *
 * Only called from dumpPages() for CURRENT dumps when $this->list_authors
 * is set and a condition is present.
 *
 * @param string $cond Condition added to the query's WHERE list
 */
protected function do_list_authors( $cond ) {
	$this->author_list = "<contributors>";

	$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
	$selectedFields = [
		'rev_user_text' => $revQuery['fields']['rev_user_text'],
		'rev_user' => $revQuery['fields']['rev_user'],
	];
	// Leave out revisions whose user has been revision-deleted.
	$userVisible = $this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0';

	$queryBuilder = $this->db->newSelectQueryBuilder()
		->select( $selectedFields )
		->tables( $revQuery['tables'] )
		->where( [ $userVisible, $cond ] )
		->joinConds( $revQuery['joins'] )
		->distinct()
		->caller( __METHOD__ );
	$authors = $queryBuilder->fetchResultSet();

	foreach ( $authors as $author ) {
		$escapedName = htmlspecialchars( $author->rev_user_text );
		$userId = (int)$author->rev_user;
		$this->author_list .=
			"<contributor><username>{$escapedName}</username><id>{$userId}</id></contributor>";
	}

	$this->author_list .= "</contributors>";
}
314
/**
 * Dispatches a dump request to the log dump or the page dump.
 *
 * The log dump is chosen only when $history is an integer with the LOGS
 * bit set; every other value (including array-valued history) goes to the
 * page dump.
 *
 * @param string $cond Condition restricting the dump; '' for none
 * @param bool $orderRevs Forwarded to dumpPages() only
 */
protected function dumpFrom( $cond = '', $orderRevs = false ) {
	$wantsLogs = is_int( $this->history ) && ( $this->history & self::LOGS );

	if ( !$wantsLogs ) {
		$this->dumpPages( $cond, $orderRevs );
		return;
	}

	$this->dumpLogs( $cond );
}
328
/**
 * Streams log entries to the sink in batches of at most BATCH_SIZE rows,
 * ordered by log_id, until a query comes back empty.
 *
 * @param string $cond Extra condition; ignored when falsy
 */
protected function dumpLogs( $cond ) {
	// Keep whichever of the two restrictions are truthy: the clause hiding
	// private logs first, then the caller-specified condition.
	$hideLogs = LogEventsList::getExcludeClause( $this->db );
	$baseConds = array_values( array_filter( [ $hideLogs, $cond ] ) );

	$commentQuery = $this->commentStore->getJoin( 'log_comment' );

	$tables = array_merge( [ 'logging', 'actor' ], $commentQuery['tables'] );

	$fields = [
		'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
		'log_title', 'log_params', 'log_deleted', 'actor_user', 'actor_name'
	];
	$fields += $commentQuery['fields'];

	$joins = [ 'actor' => [ 'JOIN', 'actor_id=log_actor' ] ];
	$joins += $commentQuery['joins'];

	$options = [
		'ORDER BY' => 'log_id',
		'USE INDEX' => [ 'logging' => 'PRIMARY' ],
		'LIMIT' => self::BATCH_SIZE,
	];

	// Each batch starts after the highest log_id written so far.
	$cursor = 0;
	for ( ; ; ) {
		$batchConds = array_merge( $baseConds, [ 'log_id > ' . intval( $cursor ) ] );
		$batch = $this->db->select( $tables, $fields, $batchConds, __METHOD__, $options, $joins );

		if ( !$batch->numRows() ) {
			break;
		}

		$cursor = $this->outputLogStream( $batch );
		$this->reloadDBConfig();
	}
}
382
/**
 * Streams page, revision and slot rows matching the condition to the sink.
 *
 * The query is built from the revision store's query info with 'page' as
 * the primary table, adjusted for the configured history mode, offered to
 * the ModifyExportQuery hook, and then run repeatedly with a
 * (rev_page, rev_id) cursor until a batch comes back empty or the
 * history 'limit' has been reached.
 *
 * @param string $cond Condition restricting the dump; '' for none
 * @param bool $orderRevs Not referenced in this method body
 * @throws MWException If $history selects no known dump type, or if no
 *   hook handles a STABLE dump
 */
protected function dumpPages( $cond, $orderRevs ) {
	$revisionInfo = $this->revisionStore->getQueryInfo( [ 'page' ] );
	$slotInfo = $this->revisionStore->getSlotsQueryInfo( [ 'content' ] );

	// We want page primary rather than revision, and the slots and content
	// tables joined in as well.
	// NOTE: This means we may get multiple rows per revision, and more rows
	// than the batch size! Should be ok, since the max number of slots is
	// fixed and low (dozens at worst).
	$revTablesWithoutPage = array_diff( $revisionInfo['tables'], [ 'page' ] );
	$tables = array_merge( [ 'page' ], $revTablesWithoutPage );
	$missingSlotTables = array_diff( $slotInfo['tables'], $tables );
	$tables = array_merge( $tables, $missingSlotTables );

	// Existing join entries win over the ones added here; the 'page' entry
	// is dropped because page is now the primary table.
	$joinConds = $revisionInfo['joins'];
	$joinConds += [
		'revision' => $revisionInfo['joins']['page'],
		'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
		'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
	];
	unset( $joinConds['page'] );

	$fields = array_merge( $revisionInfo['fields'], $slotInfo['fields'] );
	if ( $this->text != self::STUB ) {
		$fields['_load_content'] = '1';
	}

	$conds = $cond !== '' ? [ $cond ] : [];

	$options = [
		'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ],
		'USE INDEX' => [],
	];

	// Comparison operator used for the offset and for the rev_id cursor.
	$cursorOp = '>';
	// Overall row cap; only set by an array-valued history with a 'limit'.
	$maxRowCount = null;

	$history = $this->history;
	if ( is_array( $history ) ) {
		# Time offset/limit for all pages/history...
		$ascending = ( $history['dir'] == 'asc' );
		if ( !$ascending ) {
			$cursorOp = '<';
		}
		$options['ORDER BY'] = $ascending ? 'rev_timestamp ASC' : 'rev_timestamp DESC';

		if ( !empty( $history['offset'] ) ) {
			$quotedOffset = $this->db->addQuotes( $this->db->timestamp( $history['offset'] ) );
			$conds[] = "rev_timestamp $cursorOp " . $quotedOffset;
		}

		if ( !empty( $history['limit'] ) ) {
			$maxRowCount = intval( $history['limit'] );
		}
	} elseif ( $history & self::FULL ) {
		# Full history dumps...
		if ( $this->text == self::STUB ) {
			# Query optimization for history stub dumps
			$options[] = 'STRAIGHT_JOIN';
			unset( $joinConds['revision'] );
			$joinConds['page'] = [ 'JOIN', 'rev_page=page_id' ];
		}
	} elseif ( $history & self::CURRENT ) {
		# Latest revision dumps...
		if ( $this->list_authors && $cond != '' ) {
			// List authors, if so desired
			$this->do_list_authors( $cond );
		}
		$joinConds['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		$options['ORDER BY'] = [ 'page_id ASC' ];
	} elseif ( $history & self::STABLE ) {
		# "Stable" revision dumps...
		# Default JOIN, to be overridden...
		$joinConds['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		# One, and only one hook should set this, and return false
		$unhandled = $this->hookRunner->onWikiExporter__dumpStableQuery(
			$tables, $options, $joinConds );
		if ( $unhandled ) {
			throw new MWException( __METHOD__ . " given invalid history dump type." );
		}
	} elseif ( $history & self::RANGE ) {
		# Dump of revisions within a specified range. Condition already set in revsByRange().
	} else {
		# Unknown history specification parameter?
		throw new MWException( __METHOD__ . " given invalid history dump type." );
	}

	$lastRow = null;
	$pageCursor = 0;
	$revCursor = 0;
	$rowsSeen = 0;

	$options['LIMIT'] = self::BATCH_SIZE;

	$this->hookRunner->onModifyExportQuery(
		$this->db, $tables, $cond, $options, $joinConds, $conds );

	$finished = false;
	do {
		// If necessary, impose the overall maximum and stop looping after this iteration.
		if ( !empty( $maxRowCount ) && $rowsSeen + self::BATCH_SIZE > $maxRowCount ) {
			$options['LIMIT'] = $maxRowCount - $rowsSeen;
			$finished = true;
		}

		// Resume after the last (page, revision) pair already written.
		$pageAt = intval( $pageCursor );
		$revAt = intval( $revCursor );
		$batchConds = $conds;
		$batchConds[] = "rev_page>{$pageAt} OR (rev_page={$pageAt} AND rev_id{$cursorOp}{$revAt})";

		# Do the query and process any results, remembering max ids for the next iteration.
		$batch = $this->db->select( $tables, $fields, $batchConds, __METHOD__, $options, $joinConds );

		if ( $batch->numRows() > 0 ) {
			$lastRow = $this->outputPageStreamBatch( $batch, $lastRow );
			$rowsSeen += $batch->numRows();
			$pageCursor = $lastRow->rev_page;
			$revCursor = $lastRow->rev_id;
		} else {
			$finished = true;
		}

		if ( $finished ) {
			// Close off the final page element (if any).
			if ( $lastRow ) {
				$this->finishPageStreamOutput( $lastRow );
			}
		} else {
			$this->reloadDBConfig();
		}
	} while ( !$finished );
}
519
/**
 * Runs through a query result set dumping page, revision, and slot records.
 *
 * Rows are consumed one revision at a time. A page is opened whenever the
 * namespace/title differs from the previous row's, after closing the
 * previous page (if any). The final page is left open for the caller.
 *
 * NOTE(review): a revision filtered out by $limitNamespaces still becomes
 * $lastRow, so the next page change appears to emit a page close for a page
 * that was never opened here - confirm this is intended.
 *
 * @param IResultWrapper $results Rows to consume via getSlotRowBatch()
 * @param stdClass|null $lastRow Last row handled by the previous batch, or null
 * @return stdClass|null First slot row of the last revision seen; the
 *   unchanged $lastRow if the result set yielded no rows
 * @throws LogicException If a carried-over row is left unprocessed
 */
protected function outputPageStreamBatch( $results, $lastRow ) {
	$pendingRow = null;

	// An empty slot-row batch means the result set is exhausted.
	while ( $slotRows = $this->getSlotRowBatch( $results, $pendingRow ) ) {
		// All revision info is present in all slot rows, so the first slot
		// row doubles as the revision row.
		$revRow = $slotRows[0];

		$namespaceExcluded = $this->limitNamespaces &&
			!in_array( $revRow->page_namespace, $this->limitNamespaces );
		if ( $namespaceExcluded ) {
			$lastRow = $revRow;
			continue;
		}

		$continuesPage = $lastRow !== null &&
			$lastRow->page_namespace === $revRow->page_namespace &&
			$lastRow->page_title === $revRow->page_title;

		if ( !$continuesPage ) {
			if ( $lastRow !== null ) {
				// Finish the previous page before starting the next one.
				$closing = $this->dumpUploads
					? $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents )
					: '';
				$closing .= $this->writer->closePage();
				$this->sink->writeClosePage( $closing );
			}
			$opening = $this->writer->openPage( $revRow );
			$this->sink->writeOpenPage( $revRow, $opening );
		}

		try {
			$revisionXml = $this->writer->writeRevision( $revRow, $slotRows );
			$this->sink->writeRevision( $revRow, $revisionXml );
		} catch ( RevisionAccessException $ex ) {
			// The revision is skipped with a warning; the dump carries on.
			MWDebug::warning( 'Problem encountered retrieving rev and slot metadata for'
				. ' revision ' . $revRow->rev_id . ': ' . $ex->getMessage() );
		}

		$lastRow = $revRow;
	}

	if ( $pendingRow ) {
		throw new LogicException( 'Error while processing a stream of slot rows' );
	}

	// @phan-suppress-next-line PhanTypeMismatchReturnNullable False positive
	return $lastRow;
}
579
/**
 * Returns all slot rows for a revision.
 *
 * Rows are read until one with a different rev_id turns up. That row
 * belongs to the next revision, so it is handed back through $carry and
 * becomes the first row of the next call's batch.
 *
 * @param IResultWrapper $results Row source, read with fetchObject()
 * @param stdClass|null &$carry In: row left over from the previous call,
 *   if any. Out: first row of the next revision, or null
 * @return stdClass[] Rows sharing one rev_id; empty when nothing is left
 */
protected function getSlotRowBatch( $results, &$carry = null ) {
	$batch = [];
	$previous = null;

	if ( $carry ) {
		$previous = $carry;
		$batch[] = $previous;
		$carry = null;
	}

	for ( $row = $results->fetchObject(); $row; $row = $results->fetchObject() ) {
		if ( $previous && $previous->rev_id !== $row->rev_id ) {
			// First row of the next revision: keep it for the next call.
			$carry = $row;
			break;
		}
		$batch[] = $row;
		$previous = $row;
	}

	return $batch;
}
610
/**
 * Final page stream output, after all batches are complete.
 *
 * Writes, in order: the upload records (when $dumpUploads is set), the
 * collected author list, and the page close.
 *
 * @param stdClass $lastRow The last row output from the previous call (or null if none)
 */
protected function finishPageStreamOutput( $lastRow ) {
	$pieces = [];

	if ( $this->dumpUploads ) {
		$pieces[] = $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
	}
	$pieces[] = $this->author_list;
	$pieces[] = $this->writer->closePage();

	$this->sink->writeClosePage( implode( '', $pieces ) );
}
625
/**
 * Writes every log row in the result set to the sink.
 *
 * @param IResultWrapper $resultset
 * @return int|null log_id of the last row written, or null if the set was
 *   empty or that row has no log_id
 */
protected function outputLogStream( $resultset ) {
	$lastLogId = null;

	foreach ( $resultset as $logRow ) {
		$xml = $this->writer->writeLogItem( $logRow );
		$this->sink->writeLogItem( $logRow, $xml );
		$lastLogId = $logRow->log_id ?? null;
	}

	return $lastLogId;
}
637
/**
 * Asks the DB load balancer factory to auto-reconfigure itself. Called
 * between query batches by dumpLogs() and dumpPages().
 */
private function reloadDBConfig() {
	$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
	$lbFactory->autoReconfigure();
}
648}
MediaWiki exception.
MalformedTitleException is thrown when a TitleParser is unable to parse a title string.
Handle database storage of comments such as edit summaries and log reasons.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Exception representing a failure to look up a revision.
Page revision base class.
Service for looking up page revisions.
revsByRange( $start, $end)
Dumps a series of page and revision records for those pages in the database with revisions falling wi...
dumpFrom( $cond='', $orderRevs=false)
pageByName( $name)
pagesByName( $names)
finishPageStreamOutput( $lastRow)
Final page stream output, after all batches are complete.
getSlotRowBatch( $results, &$carry=null)
Returns all slot rows for a revision.
setOutputSink(&$sink)
Set the DumpOutput or DumpFilter object which will receive various row objects and XML output for fil...
pagesByRange( $start, $end, $orderRevs)
Dumps a series of page and revision records for those pages in the database falling within the page_i...
outputPageStreamBatch( $results, $lastRow)
Runs through a query result set dumping page, revision, and slot records.
bool $dumpUploadFileContents
array|int $history
__construct( $db, CommentStore $commentStore, HookContainer $hookContainer, RevisionStore $revisionStore, TitleParser $titleParser, $history=self::CURRENT, $text=self::TEXT, $limitNamespaces=null)
DumpOutput $sink
allPages()
Dumps a series of page and revision records for all pages in the database, either including complete ...
logsByRange( $start, $end)
do_list_authors( $cond)
Generates the distinct list of authors of an article. Not called by default (depends on $this->list_au...
array|null $limitNamespaces
dumpPages( $cond, $orderRevs)
pageByTitle(PageIdentity $page)
outputLogStream( $resultset)
IDatabase $db
setSchemaVersion( $schemaVersion)
static schemaVersion()
Returns the default export schema version, as defined by the XmlDumpSchemaVersion setting.
string $author_list
bool $list_authors
Return distinct author list (when not returning full history)
Interface for objects (potentially) representing an editable wiki page.
getNamespace()
Returns the page's namespace number.
getDBkey()
Get the page title in DB key form.
A title parser service for MediaWiki.
Basic database interface for live and lazy-loaded relation database handles.
Definition IDatabase.php:36
Result wrapper for grabbing data queried from an IDatabase object.