MediaWiki master
WikiExporter.php
Go to the documentation of this file.
1<?php
43
/** @var bool Return distinct author list (when not returning full history) */
public $list_authors = false;

/** @var bool Whether upload records are written when a page element is closed */
public $dumpUploads = false;

/** @var bool Passed through to the writer's writeUploads() alongside each row */
public $dumpUploadFileContents = false;

/** @var string XML fragment built by do_list_authors(), appended to the final page */
public $author_list = "";

// History selection flags, tested bitwise against $history.
public const FULL = 1;
public const CURRENT = 2;
public const STABLE = 4; // extension defined
public const LOGS = 8;
public const RANGE = 16;

// Text modes, forwarded to XmlDumpWriter.
public const TEXT = XmlDumpWriter::WRITE_CONTENT;
public const STUB = XmlDumpWriter::WRITE_STUB;

/** Maximum number of rows requested per database query. */
protected const BATCH_SIZE = 50000;

/** @var mixed Text mode given to the constructor (compared against self::STUB) */
public $text;

/** @var DumpOutput */
public $sink;

/** @var XmlDumpWriter */
private $writer;

/** @var IReadableDatabase */
protected $db;

/** @var array|int Either a combination of the flag constants, or an array with 'dir', 'offset', 'limit' */
protected $history;

/**
 * Declared explicitly: the constructor assigns it and outputPageStreamBatch()
 * reads it, and undeclared (dynamic) properties are deprecated as of PHP 8.2.
 *
 * @var array|null Namespaces to restrict the page dump to, or null for no restriction
 */
protected $limitNamespaces;

/** @var RevisionStore */
private $revisionStore;

/** @var TitleParser */
private $titleParser;

/** @var HookRunner */
private $hookRunner;

/** @var CommentStore */
private $commentStore;
100
/**
 * Returns the default export schema version, as defined by the
 * XmlDumpSchemaVersion setting.
 *
 * @return mixed Value of the XmlDumpSchemaVersion configuration variable
 */
public static function schemaVersion() {
	$config = MediaWikiServices::getInstance()->getMainConfig();

	return $config->get( MainConfigNames::XmlDumpSchemaVersion );
}
109
/**
 * Sets up an exporter that writes through a fresh XmlDumpWriter into a
 * default DumpOutput sink.
 *
 * @param IReadableDatabase $db Connection used for all dump queries
 * @param CommentStore $commentStore
 * @param HookContainer $hookContainer
 * @param RevisionStore $revisionStore
 * @param TitleParser $titleParser
 * @param array|int $history Flag constant(s) (self::FULL, self::CURRENT, ...)
 *   or an array with 'dir', 'offset' and 'limit' keys
 * @param mixed $text self::TEXT or self::STUB
 * @param array|null $limitNamespaces Namespaces to restrict the dump to
 */
public function __construct(
	$db,
	CommentStore $commentStore,
	HookContainer $hookContainer,
	RevisionStore $revisionStore,
	TitleParser $titleParser,
	$history = self::CURRENT,
	$text = self::TEXT,
	$limitNamespaces = null
) {
	// Plain configuration values.
	$this->db = $db;
	$this->history = $history;
	$this->text = $text;
	$this->limitNamespaces = $limitNamespaces;

	// Injected services kept for later queries.
	$this->commentStore = $commentStore;
	$this->revisionStore = $revisionStore;
	$this->titleParser = $titleParser;

	// Collaborators created here.
	$schemaVersion = self::schemaVersion();
	$this->writer = new XmlDumpWriter( $text, $schemaVersion, $hookContainer, $commentStore );
	$this->sink = new DumpOutput();
	$this->hookRunner = new HookRunner( $hookContainer );
}
150
/**
 * Replaces the XML writer with one producing the given schema version,
 * keeping the text mode chosen at construction time.
 *
 * NOTE(review): unlike the constructor, only two arguments are passed to
 * XmlDumpWriter here; the hook container and comment store are left to
 * whatever defaults XmlDumpWriter applies — confirm this is intended.
 *
 * @param mixed $schemaVersion Schema version handed to XmlDumpWriter
 */
public function setSchemaVersion( $schemaVersion ) {
	$replacementWriter = new XmlDumpWriter( $this->text, $schemaVersion );
	$this->writer = $replacementWriter;
}
159
/**
 * Set the DumpOutput or DumpFilter object which will receive the row
 * objects and XML output produced by this exporter.
 *
 * The sink is bound by reference, so the property aliases the caller's
 * variable rather than holding a copy of its value.
 *
 * @param DumpOutput|DumpFilter &$sink
 */
public function setOutputSink( &$sink ) {
	// Reference binding is deliberate: it must stay `=&`.
	$this->sink =& $sink;
}
170
/**
 * Asks the writer for the stream-opening output and forwards it to the sink.
 */
public function openStream() {
	$this->sink->writeOpenStream( $this->writer->openStream() );
}
175
/**
 * Asks the writer for the stream-closing output and forwards it to the sink.
 */
public function closeStream() {
	$this->sink->writeCloseStream( $this->writer->closeStream() );
}
180
/**
 * Dumps page and revision records for all pages in the database; which
 * revisions are included depends on the history setting given to the
 * constructor.
 */
public function allPages() {
	// An empty condition string means "no additional restriction".
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
189
/**
 * Dumps page and revision records for pages whose ID falls within the
 * given range.
 *
 * @param int $start Inclusive lower limit
 * @param int $end Exclusive upper limit; a falsy value means no upper limit
 * @param bool $orderRevs When true the range is expressed on rev_page,
 *   otherwise on page_id; the flag is also forwarded to dumpFrom()
 */
public function pagesByRange( $start, $end, $orderRevs ) {
	$column = $orderRevs ? 'rev_page' : 'page_id';

	$parts = [ $column . ' >= ' . intval( $start ) ];
	if ( $end ) {
		$parts[] = $column . ' < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $parts ), $orderRevs );
}
212
/**
 * Dumps page and revision records for revisions whose rev_id falls within
 * the given range.
 *
 * @param int $start Inclusive lower limit
 * @param int $end Exclusive upper limit; a falsy value means no upper limit
 */
public function revsByRange( $start, $end ) {
	$parts = [ 'rev_id >= ' . intval( $start ) ];
	if ( $end ) {
		$parts[] = 'rev_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $parts ) );
}
227
/**
 * Dumps the page identified by the given page identity, matched on
 * namespace and DB key.
 *
 * @param PageIdentity $page
 */
public function pageByTitle( PageIdentity $page ) {
	$namespaceCond = 'page_namespace=' . $page->getNamespace();
	$titleCond = 'page_title=' . $this->db->addQuotes( $page->getDBkey() );

	$this->dumpFrom( $namespaceCond . ' AND ' . $titleCond );
}
236
/**
 * Dumps the page with the given title text.
 *
 * @param string $name Title text, parsed with the injected TitleParser
 * @throws RuntimeException If the title cannot be parsed; the original
 *   MalformedTitleException is attached as the previous exception
 */
public function pageByName( $name ) {
	try {
		$link = $this->titleParser->parseTitle( $name );
		$this->dumpFrom(
			'page_namespace=' . $link->getNamespace() .
			' AND page_title=' . $this->db->addQuotes( $link->getDBkey() ) );
	} catch ( MalformedTitleException $ex ) {
		// Chain the parser's exception so the reason the title was rejected
		// is not lost when the error is reported further up.
		throw new RuntimeException( "Can't export invalid title", 0, $ex );
	}
}
250
/**
 * Dumps each of the named pages in turn via pageByName().
 *
 * @param iterable $names Title texts
 */
public function pagesByName( $names ) {
	foreach ( $names as $titleText ) {
		$this->pageByName( $titleText );
	}
}
259
/**
 * Dumps without any additional condition. Whether log entries or pages
 * are written is decided by dumpFrom() from the history setting.
 */
public function allLogs() {
	// An empty condition string means "no additional restriction".
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
263
/**
 * Dumps entries whose log_id falls within the given range.
 *
 * @param int $start Inclusive lower limit
 * @param int $end Exclusive upper limit; a falsy value means no upper limit
 */
public function logsByRange( $start, $end ) {
	$parts = [ 'log_id >= ' . intval( $start ) ];
	if ( $end ) {
		$parts[] = 'log_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $parts ) );
}
275
/**
 * Generates the distinct list of authors matching the condition and stores
 * it, as a <contributors> XML fragment, in $this->author_list.
 *
 * Not called by default; dumpPages() calls it only when $this->list_authors
 * is set and a condition was given. Revisions whose user is hidden
 * (RevisionRecord::DELETED_USER bit set in rev_deleted) are excluded.
 *
 * @param string $cond SQL condition selecting the pages/revisions of interest
 */
protected function do_list_authors( $cond ) {
	$res = $this->revisionStore->newSelectQueryBuilder( $this->db )
		->joinPage()
		->distinct()
		->where( $this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0' )
		->andWhere( $cond )
		->caller( __METHOD__ )->fetchResultSet();

	// DISTINCT works on whole result rows. NOTE(review): if the query
	// builder also selects per-revision columns (e.g. rev_id), rows for the
	// same user are not collapsed by the database — so de-duplicate here as
	// well. This is a no-op whenever the database already returned unique
	// users.
	$seen = [];
	$xml = "<contributors>";
	foreach ( $res as $row ) {
		$userId = (int)$row->rev_user;
		$key = $userId . ':' . $row->rev_user_text;
		if ( isset( $seen[$key] ) ) {
			continue;
		}
		$seen[$key] = true;

		$xml .= "<contributor>" .
			"<username>" .
			htmlspecialchars( $row->rev_user_text ) .
			"</username>" .
			"<id>" .
			$userId .
			"</id>" .
			"</contributor>";
	}

	// Publish only the finished fragment, so a failing query cannot leave an
	// unterminated "<contributors>" behind in the property.
	$this->author_list = $xml . "</contributors>";
}
306
/**
 * Dispatches to the log dump or the page dump depending on the history
 * setting: logs are dumped only when the setting is an integer with the
 * self::LOGS bit set.
 *
 * @param string $cond SQL condition, or '' for none
 * @param bool $orderRevs Forwarded to dumpPages()
 */
protected function dumpFrom( $cond = '', $orderRevs = false ) {
	$wantsLogs = is_int( $this->history ) && ( $this->history & self::LOGS );

	if ( !$wantsLogs ) {
		$this->dumpPages( $cond, $orderRevs );
		return;
	}

	$this->dumpLogs( $cond );
}
318
/**
 * Dumps log entries in batches of self::BATCH_SIZE, ordered by log_id,
 * until a query returns no more rows.
 *
 * @param string $cond Caller-specified SQL condition; ignored when falsy
 */
protected function dumpLogs( $cond ) {
	// Conditions shared by every batch: hide private logs, then apply the
	// caller's restriction.
	$baseConds = [];
	$privateLogFilter = LogEventsList::getExcludeClause( $this->db );
	if ( $privateLogFilter ) {
		$baseConds[] = $privateLogFilter;
	}
	if ( $cond ) {
		$baseConds[] = $cond;
	}

	$commentQuery = $this->commentStore->getJoin( 'log_comment' );

	$tables = array_merge( [ 'logging', 'actor' ], $commentQuery['tables'] );
	$fields = [
		'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
		'log_title', 'log_params', 'log_deleted', 'actor_user', 'actor_name'
	] + $commentQuery['fields'];
	$joins = [
		'actor' => [ 'JOIN', 'actor_id=log_actor' ]
	] + $commentQuery['joins'];
	$options = [
		'ORDER BY' => 'log_id',
		'USE INDEX' => [ 'logging' => 'PRIMARY' ],
		'LIMIT' => self::BATCH_SIZE,
	];

	// Page through the table, resuming after the last log_id written.
	$lastLogId = 0;
	do {
		$batchConds = $baseConds;
		$batchConds[] = 'log_id > ' . intval( $lastLogId );

		$result = $this->db->select(
			$tables,
			$fields,
			$batchConds,
			__METHOD__,
			$options,
			$joins
		);

		$gotRows = (bool)$result->numRows();
		if ( $gotRows ) {
			$lastLogId = $this->outputLogStream( $result );
			$this->reloadDBConfig();
		}
	} while ( $gotRows );
}
371
/**
 * Dumps page, revision and slot records in batches of self::BATCH_SIZE rows.
 *
 * The query is shaped by $this->history: an array (time offset/limit mode),
 * or one of the FULL / CURRENT / STABLE / RANGE flags.
 *
 * @param string $cond SQL condition, or '' for none
 * @param bool $orderRevs Not read by this method
 * @throws LogicException If no hook handles a STABLE dump
 * @throws UnexpectedValueException If the history setting matches no known type
 */
protected function dumpPages( $cond, $orderRevs ) {
	$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
	$slotQuery = $this->revisionStore->getSlotsQueryInfo( [ 'content' ] );

	// We want page primary rather than revision.
	// We also want to join in the slots and content tables.
	// NOTE: This means we may get multiple rows per revision, and more rows
	// than the batch size! Should be ok, since the max number of slots is
	// fixed and low (dozens at worst).
	$tables = array_merge( [ 'page' ], array_diff( $revQuery['tables'], [ 'page' ] ) );
	$tables = array_merge( $tables, array_diff( $slotQuery['tables'], $tables ) );
	$join = $revQuery['joins'] + [
		'revision' => $revQuery['joins']['page'],
		'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
		'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
	];
	unset( $join['page'] );

	$fields = array_merge( $revQuery['fields'], $slotQuery['fields'] );

	if ( $this->text != self::STUB ) {
		$fields['_load_content'] = '1';
	}

	$conds = [];
	if ( $cond !== '' ) {
		$conds[] = $cond;
	}
	$opts = [ 'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ] ];
	$opts['USE INDEX'] = [];

	// Overall row cap; only the array form of $this->history can set it.
	// Initialised explicitly so the batching loop never reads an undefined
	// variable.
	$maxRowCount = null;

	$op = '>';
	if ( is_array( $this->history ) ) {
		# Time offset/limit for all pages/history...
		# Set time order
		if ( $this->history['dir'] == 'asc' ) {
			$opts['ORDER BY'] = 'rev_timestamp ASC';
		} else {
			$op = '<';
			$opts['ORDER BY'] = 'rev_timestamp DESC';
		}
		# Set offset
		if ( !empty( $this->history['offset'] ) ) {
			$conds[] = "rev_timestamp $op " .
				$this->db->addQuotes( $this->db->timestamp( $this->history['offset'] ) );
		}
		# Set query limit
		if ( !empty( $this->history['limit'] ) ) {
			$maxRowCount = intval( $this->history['limit'] );
		}
	} elseif ( $this->history & self::FULL ) {
		# Full history dumps...
		# query optimization for history stub dumps
		if ( $this->text == self::STUB ) {
			$opts[] = 'STRAIGHT_JOIN';
			unset( $join['revision'] );
			$join['page'] = [ 'JOIN', 'rev_page=page_id' ];
		}
	} elseif ( $this->history & self::CURRENT ) {
		# Latest revision dumps...
		if ( $this->list_authors && $cond != '' ) { // List authors, if so desired
			$this->do_list_authors( $cond );
		}
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		$opts[ 'ORDER BY' ] = [ 'page_id ASC' ];
	} elseif ( $this->history & self::STABLE ) {
		# "Stable" revision dumps...
		# Default JOIN, to be overridden...
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		# One, and only one hook should set this, and return false
		if ( $this->hookRunner->onWikiExporter__dumpStableQuery( $tables, $opts, $join ) ) {
			throw new LogicException( __METHOD__ . " given invalid history dump type." );
		}
	} elseif ( $this->history & self::RANGE ) {
		# Dump of revisions within a specified range. Condition already set in revsByRange().
	} else {
		# Unknown history specification parameter?
		throw new UnexpectedValueException( __METHOD__ . " given invalid history dump type." );
	}

	$done = false;
	$lastRow = null;
	$revPage = 0;
	$revId = 0;
	$rowCount = 0;

	$opts['LIMIT'] = self::BATCH_SIZE;

	$this->hookRunner->onModifyExportQuery(
		$this->db, $tables, $cond, $opts, $join, $conds );

	while ( !$done ) {
		// If necessary, impose the overall maximum and stop looping after this iteration.
		// ">=" rather than ">": when exactly one full batch remains, this is
		// already the final iteration. With ">" an additional query with
		// LIMIT 0 was issued just to discover there was nothing left.
		if ( $maxRowCount && $rowCount + self::BATCH_SIZE >= $maxRowCount ) {
			$opts['LIMIT'] = $maxRowCount - $rowCount;
			$done = true;
		}

		// Resume strictly after the last (page, revision) pair written.
		$queryConds = $conds;
		$queryConds[] = 'rev_page>' . intval( $revPage ) . ' OR (rev_page=' .
			intval( $revPage ) . ' AND rev_id' . $op . intval( $revId ) . ')';

		# Do the query and process any results, remembering max ids for the next iteration.
		$result = $this->db->select(
			$tables,
			$fields,
			$queryConds,
			__METHOD__,
			$opts,
			$join
		);
		if ( $result->numRows() > 0 ) {
			$lastRow = $this->outputPageStreamBatch( $result, $lastRow );
			$rowCount += $result->numRows();
			$revPage = $lastRow->rev_page;
			$revId = $lastRow->rev_id;
		} else {
			$done = true;
		}

		// If we are finished, close off final page element (if any).
		if ( $done && $lastRow ) {
			$this->finishPageStreamOutput( $lastRow );
		}

		if ( !$done ) {
			$this->reloadDBConfig();
		}
	}
}
506
/**
 * Runs through a query result set dumping page, revision, and slot records.
 *
 * A page element is opened whenever the namespace or title differs from the
 * previously seen row (closing the previous page element first). The final
 * page element is left open; finishPageStreamOutput() closes it.
 *
 * @param mixed $results Result set; rows are pulled through getSlotRowBatch()
 * @param stdClass|null $lastRow Last row seen in the previous batch, if any
 * @return stdClass|null First slot row of the last revision seen
 * @throws LogicException If a carried-over slot row is left unprocessed
 */
protected function outputPageStreamBatch( $results, $lastRow ) {
	$carry = null;

	// An empty batch (falsy) means the result set is exhausted.
	while ( $slotRows = $this->getSlotRowBatch( $results, $carry ) ) {
		// All revision info is present in all slot rows.
		// Use the first slot row as the revision row.
		$revRow = $slotRows[0];

		// Whatever happens below, this revision becomes the "last row".
		$previousRow = $lastRow;
		$lastRow = $revRow;

		$filteredOut = $this->limitNamespaces &&
			!in_array( $revRow->page_namespace, $this->limitNamespaces );
		if ( $filteredOut ) {
			continue;
		}

		$startsNewPage = $previousRow === null ||
			$previousRow->page_namespace !== $revRow->page_namespace ||
			$previousRow->page_title !== $revRow->page_title;

		if ( $startsNewPage ) {
			if ( $previousRow !== null ) {
				$closing = '';
				if ( $this->dumpUploads ) {
					$closing .= $this->writer->writeUploads(
						$previousRow, $this->dumpUploadFileContents );
				}
				$closing .= $this->writer->closePage();
				$this->sink->writeClosePage( $closing );
			}
			$opening = $this->writer->openPage( $revRow );
			$this->sink->writeOpenPage( $revRow, $opening );
		}

		try {
			$revisionXml = $this->writer->writeRevision( $revRow, $slotRows );
			$this->sink->writeRevision( $revRow, $revisionXml );
		} catch ( RevisionAccessException $ex ) {
			MWDebug::warning( 'Problem encountered retrieving rev and slot metadata for'
				. ' revision ' . $revRow->rev_id . ': ' . $ex->getMessage() );
		}
	}

	if ( $carry ) {
		throw new LogicException( 'Error while processing a stream of slot rows' );
	}

	// @phan-suppress-next-line PhanTypeMismatchReturnNullable False positive
	return $lastRow;
}
566
/**
 * Returns all slot rows for a revision.
 *
 * Rows are read until one with a different rev_id turns up; that row is
 * handed back through $carry so the next call can start with it.
 *
 * @param mixed $results Result set providing fetchObject()
 * @param stdClass|null &$carry In: row left over from the previous call, if
 *   any. Out: first row of the next revision, or null when none was read.
 * @return stdClass[] Slot rows of one revision; empty when nothing is left
 */
protected function getSlotRowBatch( $results, &$carry = null ) {
	$batch = [];
	$previous = null;

	// Start with the row left over from the previous call.
	if ( $carry ) {
		$batch[] = $carry;
		$previous = $carry;
		$carry = null;
	}

	for ( $row = $results->fetchObject(); $row; $row = $results->fetchObject() ) {
		$belongsToNextRevision = $previous && $previous->rev_id !== $row->rev_id;
		if ( $belongsToNextRevision ) {
			$carry = $row;
			break;
		}

		$batch[] = $row;
		$previous = $row;
	}

	return $batch;
}
597
/**
 * Final page stream output, after all batches are complete: writes upload
 * records (if enabled), the author list, and the page-closing output.
 *
 * @param stdClass $lastRow The last row output from the previous call
 */
protected function finishPageStreamOutput( $lastRow ) {
	$closing = '';

	if ( $this->dumpUploads ) {
		$uploads = $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
		$closing .= $uploads;
	}

	$closing .= $this->author_list . $this->writer->closePage();

	$this->sink->writeClosePage( $closing );
}
612
/**
 * Writes every log row of the result set to the sink.
 *
 * @param iterable $resultset Log rows
 * @return mixed log_id of the last row written, or null if there was none
 */
protected function outputLogStream( $resultset ) {
	$lastLogId = null;

	foreach ( $resultset as $logRow ) {
		$this->sink->writeLogItem( $logRow, $this->writer->writeLogItem( $logRow ) );
		$lastLogId = $logRow->log_id ?? null;
	}

	return $lastLogId;
}
624
/**
 * Calls autoReconfigure() on the DB load balancer factory; invoked between
 * batches by dumpLogs() and dumpPages().
 */
private function reloadDBConfig() {
	$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
	$lbFactory->autoReconfigure();
}
635}
Handle database storage of comments such as edit summaries and log reasons.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Exception representing a failure to look up a revision.
Page revision base class.
Service for looking up page revisions.
MalformedTitleException is thrown when a TitleParser is unable to parse a title string.
revsByRange( $start, $end)
Dumps a series of page and revision records for those pages in the database with revisions falling wi...
dumpFrom( $cond='', $orderRevs=false)
pageByName( $name)
pagesByName( $names)
finishPageStreamOutput( $lastRow)
Final page stream output, after all batches are complete.
IReadableDatabase $db
getSlotRowBatch( $results, &$carry=null)
Returns all slot rows for a revision.
setOutputSink(&$sink)
Set the DumpOutput or DumpFilter object which will receive various row objects and XML output for fil...
pagesByRange( $start, $end, $orderRevs)
Dumps a series of page and revision records for those pages in the database falling within the page_i...
outputPageStreamBatch( $results, $lastRow)
Runs through a query result set dumping page, revision, and slot records.
bool $dumpUploadFileContents
array|int $history
__construct( $db, CommentStore $commentStore, HookContainer $hookContainer, RevisionStore $revisionStore, TitleParser $titleParser, $history=self::CURRENT, $text=self::TEXT, $limitNamespaces=null)
DumpOutput $sink
allPages()
Dumps a series of page and revision records for all pages in the database, either including complete ...
logsByRange( $start, $end)
do_list_authors( $cond)
Generates the distinct list of authors of an article. Not called by default (depends on $this->list_au...
array|null $limitNamespaces
dumpPages( $cond, $orderRevs)
pageByTitle(PageIdentity $page)
outputLogStream( $resultset)
setSchemaVersion( $schemaVersion)
static schemaVersion()
Returns the default export schema version, as defined by the XmlDumpSchemaVersion setting.
string $author_list
bool $list_authors
Return distinct author list (when not returning full history)
Interface for objects (potentially) representing an editable wiki page.
getNamespace()
Returns the page's namespace number.
getDBkey()
Get the page title in DB key form.
A title parser service for MediaWiki.
A database connection without write operations.
Result wrapper for grabbing data queried from an IDatabase object.