MediaWiki REL1_41
WikiExporter.php
Go to the documentation of this file.
1<?php
43
/** @var bool Return distinct author list (when not returning full history) */
public $list_authors = false;

/** @var bool Whether upload records are written out when a page element is closed */
public $dumpUploads = false;

/** @var bool Passed to the writer together with the upload records */
public $dumpUploadFileContents = false;

/** @var string XML fragment built by do_list_authors() and appended by finishPageStreamOutput() */
public $author_list = "";

// Values accepted for $history; dumpFrom() and dumpPages() test them as bit flags.
public const FULL = 1;
public const CURRENT = 2;
public const STABLE = 4; // extension defined
public const LOGS = 8;
public const RANGE = 16;

// Values accepted for $text; forwarded to XmlDumpWriter as the content mode.
public const TEXT = XmlDumpWriter::WRITE_CONTENT;
public const STUB = XmlDumpWriter::WRITE_STUB;

/** Maximum number of rows requested from the database per query. */
protected const BATCH_SIZE = 50000;

/** @var mixed One of self::TEXT or self::STUB */
public $text;

/** @var DumpOutput */
public $sink;

/** @var XmlDumpWriter */
private $writer;

/** @var IReadableDatabase */
protected $db;

/** @var array|int */
protected $history;

/**
 * Declared explicitly: the constructor assigns it and outputPageStreamBatch()
 * reads it, and undeclared (dynamic) properties are deprecated as of PHP 8.2.
 *
 * @var array|null
 */
protected $limitNamespaces;

/** @var RevisionStore */
private $revisionStore;

/** @var TitleParser */
private $titleParser;

/** @var HookRunner */
private $hookRunner;

/** @var CommentStore */
private $commentStore;
100
/**
 * Returns the default export schema version, as defined by the
 * XmlDumpSchemaVersion setting.
 *
 * @return mixed The configured value of MainConfigNames::XmlDumpSchemaVersion
 */
public static function schemaVersion() {
    $mainConfig = MediaWikiServices::getInstance()->getMainConfig();

    return $mainConfig->get( MainConfigNames::XmlDumpSchemaVersion );
}
109
/**
 * Wires up the exporter with its services and creates the XML writer and a
 * default output sink.
 *
 * @param IReadableDatabase $db Connection used for all export queries
 * @param CommentStore $commentStore
 * @param HookContainer $hookContainer
 * @param RevisionStore $revisionStore
 * @param TitleParser $titleParser
 * @param int|array $history One of the self::FULL, self::CURRENT, self::STABLE,
 *   self::LOGS or self::RANGE constants, or an array with the keys 'dir',
 *   'offset' and 'limit' as read by dumpPages()
 * @param mixed $text One of self::TEXT or self::STUB
 * @param null|array $limitNamespaces Namespace numbers to restrict the page
 *   output to; a falsy value disables the restriction
 */
public function __construct(
    $db,
    CommentStore $commentStore,
    HookContainer $hookContainer,
    RevisionStore $revisionStore,
    TitleParser $titleParser,
    $history = self::CURRENT,
    $text = self::TEXT,
    $limitNamespaces = null
) {
    // Plain configuration and injected services.
    $this->db = $db;
    $this->commentStore = $commentStore;
    $this->revisionStore = $revisionStore;
    $this->titleParser = $titleParser;
    $this->history = $history;
    $this->text = $text;
    $this->limitNamespaces = $limitNamespaces;

    // Collaborators created here; the sink can be swapped later via setOutputSink().
    $schemaVersion = self::schemaVersion();
    $this->writer = new XmlDumpWriter( $text, $schemaVersion, $hookContainer, $commentStore );
    $this->sink = new DumpOutput();
    $this->hookRunner = new HookRunner( $hookContainer );
}
150
/**
 * Replaces the XML writer with one that emits the given schema version,
 * keeping the content mode chosen at construction time.
 *
 * NOTE(review): unlike the constructor, no hook container or comment store is
 * handed to the new writer here - confirm XmlDumpWriter falls back to sensible
 * defaults when they are omitted.
 *
 * @param string $schemaVersion Schema version to use for the output
 */
public function setSchemaVersion( $schemaVersion ) {
    $contentMode = $this->text;
    $this->writer = new XmlDumpWriter( $contentMode, $schemaVersion );
}
159
/**
 * Set the DumpOutput or DumpFilter object which will receive the row objects
 * and XML output produced by this exporter.
 *
 * @param DumpOutput|DumpFilter &$sink Bound by reference, not copied
 */
public function setOutputSink( &$sink ) {
    // Reference assignment: $this->sink becomes an alias of the caller's variable.
    $this->sink =& $sink;
}
170
/**
 * Asks the writer for the stream opening output and forwards it to the sink.
 */
public function openStream() {
    $this->sink->writeOpenStream( $this->writer->openStream() );
}
175
/**
 * Asks the writer for the stream closing output and forwards it to the sink.
 */
public function closeStream() {
    $this->sink->writeCloseStream( $this->writer->closeStream() );
}
180
/**
 * Dumps page and revision records for all pages, i.e. runs the dump without
 * any caller-supplied condition.
 */
public function allPages() {
    $noCondition = '';
    $this->dumpFrom( $noCondition );
}
189
/**
 * Dumps page and revision records for the pages whose page ID falls within
 * the given range.
 *
 * @param int $start Inclusive lower page ID
 * @param int $end Exclusive upper page ID; a falsy value means no upper bound
 * @param bool $orderRevs When true the range is expressed on rev_page instead
 *   of page_id; the flag is also passed through to dumpFrom()
 */
public function pagesByRange( $start, $end, $orderRevs ) {
    // Both variants build the same condition, only the column differs.
    $column = $orderRevs ? 'rev_page' : 'page_id';

    $condition = $column . ' >= ' . intval( $start );
    if ( $end ) {
        $condition .= ' AND ' . $column . ' < ' . intval( $end );
    }

    $this->dumpFrom( $condition, $orderRevs );
}
212
/**
 * Dumps page and revision records for the revisions whose rev_id falls within
 * the given range.
 *
 * @param int $start Inclusive lower revision ID
 * @param int $end Exclusive upper revision ID; a falsy value means no upper bound
 */
public function revsByRange( $start, $end ) {
    $bounds = [ 'rev_id >= ' . intval( $start ) ];
    if ( $end ) {
        $bounds[] = 'rev_id < ' . intval( $end );
    }

    $this->dumpFrom( implode( ' AND ', $bounds ) );
}
227
/**
 * Dumps the single page identified by the given page identity.
 *
 * @param PageIdentity $page
 */
public function pageByTitle( PageIdentity $page ) {
    $namespaceCond = 'page_namespace=' . $page->getNamespace();
    // The DB key is quoted by the database layer before being embedded in SQL.
    $titleCond = 'page_title=' . $this->db->addQuotes( $page->getDBkey() );

    $this->dumpFrom( $namespaceCond . ' AND ' . $titleCond );
}
236
/**
 * Dumps the single page with the given title text.
 *
 * @param string $name Title text, parsed with the injected TitleParser
 * @throws MWException If the title cannot be parsed; the original
 *   MalformedTitleException is attached as the previous exception
 */
public function pageByName( $name ) {
    try {
        $link = $this->titleParser->parseTitle( $name );
        $this->dumpFrom(
            'page_namespace=' . $link->getNamespace() .
            ' AND page_title=' . $this->db->addQuotes( $link->getDBkey() ) );
    } catch ( MalformedTitleException $ex ) {
        // Chain the parser's exception so the reason the title was rejected
        // is not lost when the error is logged or inspected.
        throw new MWException( "Can't export invalid title", 0, $ex );
    }
}
251
/**
 * Dumps each of the named pages in turn, in the order given.
 *
 * @param iterable $names Title texts, each handed to pageByName()
 */
public function pagesByName( $names ) {
    foreach ( $names as $titleText ) {
        $this->pageByName( $titleText );
    }
}
260
/**
 * Runs the dump without any caller-supplied condition. Whether log entries or
 * pages are produced is decided by dumpFrom() based on the history setting.
 */
public function allLogs() {
    $noCondition = '';
    $this->dumpFrom( $noCondition );
}
264
/**
 * Runs the dump restricted to the given range of log IDs.
 *
 * @param int $start Inclusive lower log ID
 * @param int $end Exclusive upper log ID; a falsy value means no upper bound
 */
public function logsByRange( $start, $end ) {
    $bounds = [ 'log_id >= ' . intval( $start ) ];
    if ( $end ) {
        $bounds[] = 'log_id < ' . intval( $end );
    }

    $this->dumpFrom( implode( ' AND ', $bounds ) );
}
276
/**
 * Generates the distinct list of authors of an article and stores it as an
 * XML fragment in $this->author_list.
 *
 * Not called by default; dumpPages() only invokes it when $this->list_authors
 * is set and a condition was given.
 *
 * @param string $cond SQL condition selecting the page(s) to list authors for
 */
protected function do_list_authors( $cond ) {
    $this->author_list = "<contributors>";

    // Revisions whose user name is suppressed (rev_deleted) are excluded.
    $res = $this->revisionStore->newSelectQueryBuilder( $this->db )
        ->joinPage()
        ->distinct()
        ->where( $this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0' )
        ->andWhere( $cond )
        ->caller( __METHOD__ )->fetchResultSet();

    // DISTINCT applies to the whole selected row, so if the query builder
    // selects per-revision columns the same author can come back once per
    // revision. De-duplicate here so each contributor is listed only once;
    // this is a no-op when the rows are already unique per author.
    $seen = [];
    foreach ( $res as $row ) {
        $userId = (int)$row->rev_user;
        $key = $userId . "\n" . $row->rev_user_text;
        if ( isset( $seen[$key] ) ) {
            continue;
        }
        $seen[$key] = true;

        $this->author_list .= "<contributor>" .
            "<username>" .
            htmlspecialchars( $row->rev_user_text ) .
            "</username>" .
            "<id>" .
            $userId .
            "</id>" .
            "</contributor>";
    }
    $this->author_list .= "</contributors>";
}
307
/**
 * Entry point shared by all public dump methods: routes to the log dump or
 * the page dump depending on the history setting.
 *
 * @param string $cond SQL condition restricting the dump; '' for none
 * @param bool $orderRevs Passed through to dumpPages()
 */
protected function dumpFrom( $cond = '', $orderRevs = false ) {
    // An array-valued history (offset/limit/dir) always means a page dump.
    $wantsLogs = is_int( $this->history ) && ( $this->history & self::LOGS );

    if ( $wantsLogs ) {
        $this->dumpLogs( $cond );
        return;
    }

    $this->dumpPages( $cond, $orderRevs );
}
321
/**
 * Streams log entries to the sink in batches of at most self::BATCH_SIZE
 * rows, ordered by log_id.
 *
 * @param string $cond Caller-supplied SQL condition; a falsy value adds none
 */
protected function dumpLogs( $cond ) {
    // Conditions shared by every batch: the private-log filter, if any,
    // followed by the caller's condition, if any.
    $baseConds = [];
    $privateLogFilter = LogEventsList::getExcludeClause( $this->db );
    if ( $privateLogFilter ) {
        $baseConds[] = $privateLogFilter;
    }
    if ( $cond ) {
        $baseConds[] = $cond;
    }

    $commentQuery = $this->commentStore->getJoin( 'log_comment' );

    $tables = array_merge( [ 'logging', 'actor' ], $commentQuery['tables'] );
    $fields = [
        'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
        'log_title', 'log_params', 'log_deleted', 'actor_user', 'actor_name'
    ] + $commentQuery['fields'];
    $options = [
        'ORDER BY' => 'log_id',
        'USE INDEX' => [ 'logging' => 'PRIMARY' ],
        'LIMIT' => self::BATCH_SIZE,
    ];
    $joins = [
        'actor' => [ 'JOIN', 'actor_id=log_actor' ]
    ] + $commentQuery['joins'];

    // Keyset pagination: each batch resumes after the last log_id written.
    $lastLogId = 0;
    for ( ; ; ) {
        $batchConds = $baseConds;
        $batchConds[] = 'log_id > ' . intval( $lastLogId );

        $result = $this->db->select( $tables, $fields, $batchConds, __METHOD__, $options, $joins );
        if ( !$result->numRows() ) {
            break;
        }

        $lastLogId = $this->outputLogStream( $result );
        $this->reloadDBConfig();
    }
}
375
/**
 * Builds the page/revision/slot query for the configured history mode and
 * streams the result to the sink in batches of self::BATCH_SIZE rows.
 *
 * @param string $cond Caller-supplied SQL condition; '' for none
 * @param bool $orderRevs Accepted for interface compatibility; not read here
 * @throws MWException If the history setting matches no known dump type, or
 *   if no hook handled a self::STABLE dump
 */
protected function dumpPages( $cond, $orderRevs ) {
    $history = $this->history;

    $revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
    $slotQuery = $this->revisionStore->getSlotsQueryInfo( [ 'content' ] );

    // Put page first rather than revision, then add whatever slot/content
    // tables are not present yet.
    // NOTE: joining slots means there can be several rows per revision, and
    // therefore more rows than revisions in a batch. That is acceptable since
    // the number of slots per revision is small.
    $revTablesWithoutPage = array_diff( $revQuery['tables'], [ 'page' ] );
    $tables = array_merge( [ 'page' ], $revTablesWithoutPage );
    $missingSlotTables = array_diff( $slotQuery['tables'], $tables );
    $tables = array_merge( $tables, $missingSlotTables );

    // The join condition the revision query had on page moves to revision.
    $join = $revQuery['joins'] + [
        'revision' => $revQuery['joins']['page'],
        'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
        'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
    ];
    unset( $join['page'] );

    $fields = array_merge( $revQuery['fields'], $slotQuery['fields'] );
    $stubOnly = ( $this->text == self::STUB );
    if ( !$stubOnly ) {
        $fields['_load_content'] = '1';
    }

    $conds = ( $cond !== '' ) ? [ $cond ] : [];
    $opts = [
        'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ],
        'USE INDEX' => [],
    ];

    // Comparison operator applied to rev_id in the batch continuation condition.
    $revIdOp = '>';
    // Overall row cap; only set for array-style history with a 'limit'.
    $maxRowCount = null;

    if ( is_array( $history ) ) {
        // Time offset/limit for all pages/history: order by timestamp.
        if ( $history['dir'] == 'asc' ) {
            $opts['ORDER BY'] = 'rev_timestamp ASC';
        } else {
            $revIdOp = '<';
            $opts['ORDER BY'] = 'rev_timestamp DESC';
        }
        if ( !empty( $history['offset'] ) ) {
            $quotedOffset = $this->db->addQuotes( $this->db->timestamp( $history['offset'] ) );
            $conds[] = "rev_timestamp $revIdOp " . $quotedOffset;
        }
        if ( !empty( $history['limit'] ) ) {
            $maxRowCount = intval( $history['limit'] );
        }
    } elseif ( $history & self::FULL ) {
        // Full history dump; stub dumps get a query optimization.
        if ( $stubOnly ) {
            $opts[] = 'STRAIGHT_JOIN';
            unset( $join['revision'] );
            $join['page'] = [ 'JOIN', 'rev_page=page_id' ];
        }
    } elseif ( $history & self::CURRENT ) {
        // Latest revision only, optionally with the list of authors.
        if ( $this->list_authors && $cond != '' ) {
            $this->do_list_authors( $cond );
        }
        $join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
        $opts['ORDER BY'] = [ 'page_id ASC' ];
    } elseif ( $history & self::STABLE ) {
        // "Stable" revision dump: default join, expected to be overridden by
        // exactly one hook handler, which must return false.
        $join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
        if ( $this->hookRunner->onWikiExporter__dumpStableQuery( $tables, $opts, $join ) ) {
            throw new MWException( __METHOD__ . " given invalid history dump type." );
        }
    } elseif ( $history & self::RANGE ) {
        // Revision range dump: the condition was already built by revsByRange().
    } else {
        // Unknown history specification.
        throw new MWException( __METHOD__ . " given invalid history dump type." );
    }

    $opts['LIMIT'] = self::BATCH_SIZE;

    $this->hookRunner->onModifyExportQuery(
        $this->db, $tables, $cond, $opts, $join, $conds );

    $finished = false;
    $lastRow = null;
    $revPage = 0;
    $revId = 0;
    $rowCount = 0;

    do {
        // If the next full batch would exceed the overall maximum, shrink it
        // and make this the final iteration.
        if ( !empty( $maxRowCount ) && $rowCount + self::BATCH_SIZE > $maxRowCount ) {
            $opts['LIMIT'] = $maxRowCount - $rowCount;
            $finished = true;
        }

        // Resume after the last (page, revision) pair that was written.
        $pageCursor = intval( $revPage );
        $batchConds = $conds;
        $batchConds[] = 'rev_page>' . $pageCursor . ' OR (rev_page=' .
            $pageCursor . ' AND rev_id' . $revIdOp . intval( $revId ) . ')';

        $result = $this->db->select( $tables, $fields, $batchConds, __METHOD__, $opts, $join );
        $fetched = $result->numRows();

        if ( $fetched > 0 ) {
            $lastRow = $this->outputPageStreamBatch( $result, $lastRow );
            $rowCount += $fetched;
            $revPage = $lastRow->rev_page;
            $revId = $lastRow->rev_id;
        } else {
            $finished = true;
        }

        if ( $finished ) {
            // Close off the final page element, if any page was seen.
            if ( $lastRow ) {
                $this->finishPageStreamOutput( $lastRow );
            }
        } else {
            $this->reloadDBConfig();
        }
    } while ( !$finished );
}
512
/**
 * Runs through a query result set dumping page, revision, and slot records.
 *
 * A page element is opened whenever the namespace/title differs from the
 * previous row; the element left open at the end of the batch is closed by a
 * later batch or by finishPageStreamOutput().
 *
 * @param IResultWrapper $results Rows of the current batch
 * @param stdClass|null $lastRow Last revision row of the previous batch, or
 *   null for the first batch
 * @return stdClass|null Last revision row seen, used to resume the next batch
 * @throws LogicException If a slot row is left over after the result set ends
 */
protected function outputPageStreamBatch( $results, $lastRow ) {
    $rowCarry = null;

    while ( $slotRows = $this->getSlotRowBatch( $results, $rowCarry ) ) {
        // Every slot row repeats the revision columns, so the first one
        // doubles as the revision row.
        $revRow = $slotRows[0];

        // Loose in_array() on purpose: the original comparison is non-strict.
        // NOTE(review): a skipped row still becomes $lastRow, so the next page
        // change below closes a page that was never opened - confirm intended.
        $namespaceExcluded = $this->limitNamespaces &&
            !in_array( $revRow->page_namespace, $this->limitNamespaces );
        if ( $namespaceExcluded ) {
            $lastRow = $revRow;
            continue;
        }

        $samePage = $lastRow !== null &&
            $lastRow->page_namespace === $revRow->page_namespace &&
            $lastRow->page_title === $revRow->page_title;

        if ( !$samePage ) {
            if ( $lastRow !== null ) {
                // Finish the previous page before starting the new one.
                $closing = '';
                if ( $this->dumpUploads ) {
                    $closing .= $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
                }
                $closing .= $this->writer->closePage();
                $this->sink->writeClosePage( $closing );
            }
            $opening = $this->writer->openPage( $revRow );
            $this->sink->writeOpenPage( $revRow, $opening );
        }

        try {
            $revisionXml = $this->writer->writeRevision( $revRow, $slotRows );
            $this->sink->writeRevision( $revRow, $revisionXml );
        } catch ( RevisionAccessException $ex ) {
            // A broken revision is reported and skipped rather than aborting the dump.
            MWDebug::warning( 'Problem encountered retrieving rev and slot metadata for'
                . ' revision ' . $revRow->rev_id . ': ' . $ex->getMessage() );
        }

        $lastRow = $revRow;
    }

    if ( $rowCarry ) {
        throw new LogicException( 'Error while processing a stream of slot rows' );
    }

    // @phan-suppress-next-line PhanTypeMismatchReturnNullable False positive
    return $lastRow;
}
572
/**
 * Returns all slot rows for a revision.
 *
 * Rows are read until the rev_id changes. The first row of the following
 * revision has already been fetched at that point, so it is handed back
 * through $carry and must be passed in again on the next call.
 *
 * @param IResultWrapper $results Result set to read from
 * @param stdClass|null &$carry In: row left over from the previous call.
 *   Out: first row of the next revision, or null if none was read
 * @return stdClass[] Slot rows of one revision; empty when nothing is left
 */
protected function getSlotRowBatch( $results, &$carry = null ) {
    $batch = [];
    $previous = null;

    // Start with the row the previous call had to read ahead.
    if ( $carry ) {
        $previous = $carry;
        $batch[] = $previous;
        $carry = null;
    }

    for ( $row = $results->fetchObject(); $row; $row = $results->fetchObject() ) {
        $startsNextRevision = $previous && $previous->rev_id !== $row->rev_id;
        if ( $startsNextRevision ) {
            $carry = $row;
            break;
        }
        $batch[] = $row;
        $previous = $row;
    }

    return $batch;
}
603
/**
 * Final page stream output, after all batches are complete: writes the
 * upload records (if enabled), the author list and the page closing output.
 *
 * @param stdClass $lastRow Last revision row that was processed
 */
protected function finishPageStreamOutput( $lastRow ) {
    $uploads = '';
    if ( $this->dumpUploads ) {
        $uploads = $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
    }

    $this->sink->writeClosePage(
        $uploads . $this->author_list . $this->writer->closePage()
    );
}
618
/**
 * Writes every log row of the result set to the sink.
 *
 * @param IResultWrapper $resultset
 * @return int|null log_id of the last row, or null if there was no row
 */
protected function outputLogStream( $resultset ) {
    $lastLogId = null;

    foreach ( $resultset as $logRow ) {
        $this->sink->writeLogItem( $logRow, $this->writer->writeLogItem( $logRow ) );
        // Remember the ID so the caller can resume after this row.
        $lastLogId = $logRow->log_id ?? null;
    }

    return $lastLogId;
}
630
/**
 * Asks the load balancer factory to reconfigure itself; called between
 * batches by dumpLogs() and dumpPages().
 */
private function reloadDBConfig() {
    $lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
    $lbFactory->autoReconfigure();
}
641}
MediaWiki exception.
Handle database storage of comments such as edit summaries and log reasons.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Exception representing a failure to look up a revision.
Page revision base class.
Service for looking up page revisions.
MalformedTitleException is thrown when a TitleParser is unable to parse a title string.
revsByRange( $start, $end)
Dumps a series of page and revision records for those pages in the database with revisions falling wi...
dumpFrom( $cond='', $orderRevs=false)
pageByName( $name)
pagesByName( $names)
finishPageStreamOutput( $lastRow)
Final page stream output, after all batches are complete.
IReadableDatabase $db
getSlotRowBatch( $results, &$carry=null)
Returns all slot rows for a revision.
setOutputSink(&$sink)
Set the DumpOutput or DumpFilter object which will receive various row objects and XML output for fil...
pagesByRange( $start, $end, $orderRevs)
Dumps a series of page and revision records for those pages in the database falling within the page_i...
outputPageStreamBatch( $results, $lastRow)
Runs through a query result set dumping page, revision, and slot records.
bool $dumpUploadFileContents
array|int $history
__construct( $db, CommentStore $commentStore, HookContainer $hookContainer, RevisionStore $revisionStore, TitleParser $titleParser, $history=self::CURRENT, $text=self::TEXT, $limitNamespaces=null)
DumpOutput $sink
allPages()
Dumps a series of page and revision records for all pages in the database, either including complete ...
logsByRange( $start, $end)
do_list_authors( $cond)
Generates the distinct list of authors of an article. Not called by default (depends on $this->list_authors)...
array|null $limitNamespaces
dumpPages( $cond, $orderRevs)
pageByTitle(PageIdentity $page)
outputLogStream( $resultset)
setSchemaVersion( $schemaVersion)
static schemaVersion()
Returns the default export schema version, as defined by the XmlDumpSchemaVersion setting.
string $author_list
bool $list_authors
Return distinct author list (when not returning full history)
Interface for objects (potentially) representing an editable wiki page.
getNamespace()
Returns the page's namespace number.
getDBkey()
Get the page title in DB key form.
A title parser service for MediaWiki.
A database connection without write operations.
Result wrapper for grabbing data queried from an IDatabase object.