MediaWiki master
WikiExporter.php
Go to the documentation of this file.
1<?php
44
/**
 * Return distinct author list (when not returning full history).
 * Only consulted by dumpPages() for "current revision" dumps that have a condition.
 * @var bool
 */
50 public $list_authors = false;
51
/**
 * Whether upload records are written (via the writer's writeUploads()) when a page is closed.
 * @var bool
 */
53 public $dumpUploads = false;
54
/**
 * Passed as the second argument to the writer's writeUploads(); only read when $dumpUploads is set.
 * @var bool
 */
56 public $dumpUploadFileContents = false;
57
/**
 * XML "<contributors>" fragment built by do_list_authors() and appended
 * to the output by finishPageStreamOutput().
 * @var string
 */
59 public $author_list = "";
60
// History selection flags; dumpFrom() and dumpPages() test them against $history with bitwise AND.
61 public const FULL = 1;
62 public const CURRENT = 2;
63 public const STABLE = 4; // extension defined
64 public const LOGS = 8;
65 public const RANGE = 16;
66
// Content modes, aliased from XmlDumpWriter; stored in $text and handed to the writer.
67 public const TEXT = XmlDumpWriter::WRITE_CONTENT;
68 public const STUB = XmlDumpWriter::WRITE_STUB;
69
// Row LIMIT applied to each query issued by dumpLogs() and dumpPages().
70 protected const BATCH_SIZE = 10000;
71
/**
 * Content mode (self::TEXT or self::STUB) given to the constructor.
 * @var mixed
 */
73 public $text;
74
/**
 * Receiver of the generated XML; defaults to a fresh DumpOutput, replaceable via setOutputSink().
 * @var DumpOutput
 */
76 public $sink;
77
/**
 * Produces the XML fragments that are forwarded to $sink.
 * @var XmlDumpWriter
 */
79 private $writer;
80
/**
 * Database handle used for all queries and for SQL quoting.
 * @var IReadableDatabase
 */
82 protected $db;
83
/**
 * Either a combination of the flag constants above, or an array read
 * for its 'dir', 'offset' and 'limit' keys (see dumpPages()).
 * @var array|int
 */
85 protected $history;
86
89
/** @var RevisionStore */
91 private $revisionStore;
92
/** @var TitleParser */
94 private $titleParser;
95
/** @var HookRunner */
97 private $hookRunner;
98
/** @var CommentStore */
100 private $commentStore;
101
/**
 * Returns the default export schema version, as defined by the
 * XmlDumpSchemaVersion configuration setting.
 *
 * @return mixed Whatever value the main config holds for XmlDumpSchemaVersion.
 */
public static function schemaVersion() {
	$mainConfig = MediaWikiServices::getInstance()->getMainConfig();

	return $mainConfig->get( MainConfigNames::XmlDumpSchemaVersion );
}
110
/**
 * Sets up an exporter with a default DumpOutput sink and an XmlDumpWriter
 * for the default schema version.
 *
 * @param IReadableDatabase $db Connection used for all dump queries.
 * @param CommentStore $commentStore
 * @param HookContainer $hookContainer
 * @param RevisionStore $revisionStore
 * @param TitleParser $titleParser
 * @param int|array $history One of the history flag constants, or an array
 *   with 'dir', 'offset' and 'limit' keys.
 * @param int $text self::TEXT or self::STUB.
 * @param array|null $limitNamespaces When non-empty, only pages in these
 *   namespaces are written.
 */
public function __construct(
	$db,
	CommentStore $commentStore,
	HookContainer $hookContainer,
	RevisionStore $revisionStore,
	TitleParser $titleParser,
	$history = self::CURRENT,
	$text = self::TEXT,
	$limitNamespaces = null
) {
	// Injected collaborators.
	$this->db = $db;
	$this->commentStore = $commentStore;
	$this->revisionStore = $revisionStore;
	$this->titleParser = $titleParser;
	$this->hookRunner = new HookRunner( $hookContainer );

	// Export options.
	$this->history = $history;
	$this->text = $text;
	$this->limitNamespaces = $limitNamespaces;

	// Output pipeline: the writer renders XML, the sink receives it.
	$schemaVersion = self::schemaVersion();
	$this->writer = new XmlDumpWriter( $text, $schemaVersion, $hookContainer, $commentStore );
	$this->sink = new DumpOutput();
}
151
/**
 * Replaces the XML writer with one that targets the given schema version,
 * keeping the content mode chosen at construction time.
 *
 * NOTE(review): unlike the constructor, no hook container or comment store is
 * handed to the new writer here; presumably XmlDumpWriter supplies defaults
 * for them -- confirm.
 *
 * @param string $schemaVersion
 */
public function setSchemaVersion( $schemaVersion ) {
	$contentMode = $this->text;
	$this->writer = new XmlDumpWriter( $contentMode, $schemaVersion );
}
160
/**
 * Set the DumpOutput or DumpFilter object which will receive the row
 * objects and XML output produced by this exporter.
 *
 * The sink is bound by reference, so $this->sink aliases the caller's variable.
 *
 * @param DumpOutput &$sink
 */
public function setOutputSink( &$sink ) {
	$this->sink = &$sink;
}
171
/**
 * Asks the writer for the stream-opening output and forwards it to the sink.
 */
public function openStream() {
	$this->sink->writeOpenStream( $this->writer->openStream() );
}
176
/**
 * Asks the writer for the stream-closing output and forwards it to the sink.
 */
public function closeStream() {
	$this->sink->writeCloseStream( $this->writer->closeStream() );
}
181
/**
 * Dumps every page by calling dumpFrom() with an empty condition; which
 * revisions are included depends on the history setting given to the constructor.
 */
public function allPages() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
190
/**
 * Dumps page and revision records for the pages whose ID falls within the
 * half-open range [$start, $end).
 *
 * @param int $start Inclusive lower bound of the page ID range.
 * @param int $end Exclusive upper bound; a falsy value means no upper bound.
 * @param bool $orderRevs When true the range is applied to rev_page rather
 *   than page_id; the flag is also passed on to dumpFrom().
 */
public function pagesByRange( $start, $end, $orderRevs ) {
	// Same range test either way; only the column it is applied to differs.
	$column = $orderRevs ? 'rev_page' : 'page_id';

	$condition = $column . ' >= ' . intval( $start );
	if ( $end ) {
		$condition .= ' AND ' . $column . ' < ' . intval( $end );
	}

	$this->dumpFrom( $condition, $orderRevs );
}
213
/**
 * Dumps page and revision records for the revisions whose ID falls within
 * the half-open range [$start, $end).
 *
 * @param int $start Inclusive lower bound of the revision ID range.
 * @param int $end Exclusive upper bound; a falsy value means no upper bound.
 */
public function revsByRange( $start, $end ) {
	$lowerBound = 'rev_id >= ' . intval( $start );
	$upperBound = $end ? ' AND rev_id < ' . intval( $end ) : '';

	$this->dumpFrom( $lowerBound . $upperBound );
}
228
/**
 * Dumps the single page identified by the given page identity.
 *
 * @param PageIdentity $page
 */
public function pageByTitle( PageIdentity $page ) {
	$namespace = $page->getNamespace();
	$quotedTitle = $this->db->addQuotes( $page->getDBkey() );

	$this->dumpFrom( "page_namespace={$namespace} AND page_title={$quotedTitle}" );
}
237
/**
 * Dumps the single page with the given title text.
 *
 * @param string $name Title text; parsed with the injected TitleParser.
 * @throws RuntimeException If the title cannot be parsed. The original
 *   MalformedTitleException is attached as the previous exception.
 */
public function pageByName( $name ) {
	try {
		$link = $this->titleParser->parseTitle( $name );
		$this->dumpFrom(
			'page_namespace=' . $link->getNamespace() .
			' AND page_title=' . $this->db->addQuotes( $link->getDBkey() ) );
	} catch ( MalformedTitleException $ex ) {
		// Chain the parser's exception so the reason the title was rejected
		// is not lost to whoever handles or logs the RuntimeException.
		throw new RuntimeException( "Can't export invalid title", 0, $ex );
	}
}
251
/**
 * Dumps each of the named pages in turn, one pageByName() call per entry.
 *
 * @param iterable $names Title texts.
 */
public function pagesByName( $names ) {
	foreach ( $names as $titleText ) {
		$this->pageByName( $titleText );
	}
}
260
/**
 * Dumps everything by calling dumpFrom() with an empty condition.
 * dumpFrom() only produces log items when the history setting is an
 * integer that includes self::LOGS.
 */
public function allLogs() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
264
/**
 * Dumps the log items whose ID falls within the half-open range [$start, $end).
 *
 * @param int $start Inclusive lower bound of the log ID range.
 * @param int $end Exclusive upper bound; a falsy value means no upper bound.
 */
public function logsByRange( $start, $end ) {
	$lowerBound = 'log_id >= ' . intval( $start );
	$upperBound = $end ? ' AND log_id < ' . intval( $end ) : '';

	$this->dumpFrom( $lowerBound . $upperBound );
}
276
/**
 * Generates the distinct list of authors matching the condition and stores
 * it, as a "<contributors>" XML fragment, in $this->author_list.
 *
 * @param string $cond SQL condition selecting the revisions to consider.
 */
protected function do_list_authors( $cond ) {
	$this->author_list = "<contributors>";

	// Leave out revisions whose user name has been hidden (rev_deleted).
	$userNotHidden = $this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0';

	$rows = $this->revisionStore->newSelectQueryBuilder( $this->db )
		->joinPage()
		->distinct()
		->where( $userNotHidden )
		->andWhere( $cond )
		->caller( __METHOD__ )
		->fetchResultSet();

	foreach ( $rows as $row ) {
		$userName = htmlspecialchars( $row->rev_user_text );
		$userId = (int)$row->rev_user;

		$this->author_list .=
			"<contributor><username>{$userName}</username><id>{$userId}</id></contributor>";
	}

	$this->author_list .= "</contributors>";
}
307
/**
 * Routes a dump request to the log dumper or the page dumper.
 *
 * @param string $cond SQL condition, or '' for none.
 * @param bool $orderRevs Passed through to dumpPages(); not used for logs.
 */
protected function dumpFrom( $cond = '', $orderRevs = false ) {
	// Only an integer history setting can carry the LOGS flag.
	$wantsLogs = is_int( $this->history ) && ( $this->history & self::LOGS );

	if ( !$wantsLogs ) {
		$this->dumpPages( $cond, $orderRevs );
		return;
	}

	$this->dumpLogs( $cond );
}
319
/**
 * Streams log items to the sink in batches of self::BATCH_SIZE, ordered by
 * log_id, until a query comes back empty.
 *
 * @param string $cond Extra SQL condition; ignored when empty.
 */
protected function dumpLogs( $cond ) {
	// Hide private logs first, then add on any caller specified conditions.
	$where = [];
	foreach ( [ LogEventsList::getExcludeClause( $this->db ), $cond ] as $clause ) {
		if ( $clause ) {
			$where[] = $clause;
		}
	}

	$commentQuery = $this->commentStore->getJoin( 'log_comment' );

	$fields = [
		'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
		'log_title', 'log_params', 'log_deleted', 'actor_user', 'actor_name'
	];

	// Each batch resumes strictly after the highest log_id already written.
	$resumeAfterId = 0;
	do {
		$batch = $this->db->newSelectQueryBuilder()
			->select( $fields )
			->from( 'logging' )
			->join( 'actor', null, 'actor_id=log_actor' )
			->where( $where )
			->andWhere( $this->db->expr( 'log_id', '>', intval( $resumeAfterId ) ) )
			->orderBy( 'log_id' )
			->useIndex( [ 'logging' => 'PRIMARY' ] )
			->limit( self::BATCH_SIZE )
			->queryInfo( $commentQuery )
			->caller( __METHOD__ )
			->fetchResultSet();

		$hasRows = (bool)$batch->numRows();
		if ( $hasRows ) {
			$resumeAfterId = $this->outputLogStream( $batch );
			$this->reloadDBConfig();
		}
	} while ( $hasRows );
}
363
/**
 * Queries page, revision, slot and content rows in batches of
 * self::BATCH_SIZE and streams them to the sink as page/revision XML.
 *
 * Which revisions are selected depends on $this->history: an array
 * (time-ordered, with optional 'offset' and 'limit'), or one of the
 * FULL / CURRENT / STABLE / RANGE flags.
 *
 * NOTE(review): $orderRevs is accepted but never read in this method body.
 *
 * @param string $cond SQL condition; '' means no extra condition.
 * @param bool $orderRevs
 * @throws LogicException In STABLE mode when the dumpStableQuery hook run returns a truthy value.
 * @throws UnexpectedValueException When $this->history matches none of the known modes.
 */
368 protected function dumpPages( $cond, $orderRevs ) {
369 $revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
370 $slotQuery = $this->revisionStore->getSlotsQueryInfo( [ 'content' ] );
371
372 // We want page primary rather than revision.
373 // We also want to join in the slots and content tables.
374 // NOTE: This means we may get multiple rows per revision, and more rows
375 // than the batch size! Should be ok, since the max number of slots is
376 // fixed and low (dozens at worst).
377 $tables = array_merge( [ 'page' ], array_diff( $revQuery['tables'], [ 'page' ] ) );
378 $tables = array_merge( $tables, array_diff( $slotQuery['tables'], $tables ) );
// Array union keeps any join already defined by $revQuery; the original
// 'page' join condition is re-keyed under 'revision' and then dropped.
379 $join = $revQuery['joins'] + [
380 'revision' => $revQuery['joins']['page'],
381 'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
382 'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
383 ];
384 unset( $join['page'] );
385
386 $fields = array_merge( $revQuery['fields'], $slotQuery['fields'] );
387
// Extra constant field selected for every mode except STUB.
388 if ( $this->text != self::STUB ) {
389 $fields['_load_content'] = '1';
390 }
391
392 $conds = [];
393 if ( $cond !== '' ) {
394 $conds[] = $cond;
395 }
396 $opts = [ 'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ] ];
397 $opts['USE INDEX'] = [];
398
// Comparison used for both the timestamp offset and the rev_id resume condition below.
399 $op = '>';
400 if ( is_array( $this->history ) ) {
401 # Time offset/limit for all pages/history...
402 # Set time order
403 if ( $this->history['dir'] == 'asc' ) {
404 $opts['ORDER BY'] = 'rev_timestamp ASC';
405 } else {
406 $op = '<';
407 $opts['ORDER BY'] = 'rev_timestamp DESC';
408 }
409 # Set offset
410 if ( !empty( $this->history['offset'] ) ) {
411 $conds[] = "rev_timestamp $op " .
412 $this->db->addQuotes( $this->db->timestamp( $this->history['offset'] ) );
413 }
414 # Set query limit
415 if ( !empty( $this->history['limit'] ) ) {
416 $maxRowCount = intval( $this->history['limit'] );
417 }
418 } elseif ( $this->history & self::FULL ) {
419 # Full history dumps...
420 # query optimization for history stub dumps
421 if ( $this->text == self::STUB ) {
422 $opts[] = 'STRAIGHT_JOIN';
423 unset( $join['revision'] );
424 $join['page'] = [ 'JOIN', 'rev_page=page_id' ];
425 }
426 } elseif ( $this->history & self::CURRENT ) {
427 # Latest revision dumps...
428 if ( $this->list_authors && $cond != '' ) { // List authors, if so desired
429 $this->do_list_authors( $cond );
430 }
431 $join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
432 $opts[ 'ORDER BY' ] = [ 'page_id ASC' ];
433 } elseif ( $this->history & self::STABLE ) {
434 # "Stable" revision dumps...
435 # Default JOIN, to be overridden...
436 $join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
437 # One, and only one hook should set this, and return false
438 if ( $this->hookRunner->onWikiExporter__dumpStableQuery( $tables, $opts, $join ) ) {
439 throw new LogicException( __METHOD__ . " given invalid history dump type." );
440 }
441 } elseif ( $this->history & self::RANGE ) {
442 # Dump of revisions within a specified range. Condition already set in revsByRange().
443 } else {
444 # Unknown history specification parameter?
445 throw new UnexpectedValueException( __METHOD__ . " given invalid history dump type." );
446 }
447
// Loop state: $lastRow is the last revision row handled so far (across batches);
// $revPage/$revId are the position the next batch resumes after.
448 $done = false;
449 $lastRow = null;
450 $revPage = 0;
451 $revId = 0;
452 $rowCount = 0;
453
454 $opts['LIMIT'] = self::BATCH_SIZE;
455
456 $this->hookRunner->onModifyExportQuery(
457 $this->db, $tables, $cond, $opts, $join, $conds );
458
459 while ( !$done ) {
460 // If necessary, impose the overall maximum and stop looping after this iteration.
// $maxRowCount is only defined in array-history mode; !empty() also covers the unset case.
461 if ( !empty( $maxRowCount ) && $rowCount + self::BATCH_SIZE > $maxRowCount ) {
462 $opts['LIMIT'] = $maxRowCount - $rowCount;
463 $done = true;
464 }
465
466 # Do the query and process any results, remembering max ids for the next iteration.
// Resume condition: rev_page > last page seen, or same page with rev_id $op last rev_id.
// NOTE(review): in array-history mode ORDER BY is rev_timestamp while this resume
// condition still uses rev_page/rev_id -- confirm multi-batch results are as intended.
467 $result = $this->db->newSelectQueryBuilder()
468 ->tables( $tables )
469 ->fields( $fields )
470 ->where( $conds )
471 ->andWhere( $this->db->expr( 'rev_page', '>', intval( $revPage ) )->orExpr(
472 $this->db->expr( 'rev_page', '=', intval( $revPage ) )->and( 'rev_id', $op, intval( $revId ) )
473 ) )
474 ->caller( __METHOD__ )
475 ->options( $opts )
476 ->joinConds( $join )
477 ->fetchResultSet();
478 if ( $result->numRows() > 0 ) {
479 $lastRow = $this->outputPageStreamBatch( $result, $lastRow );
480 $rowCount += $result->numRows();
481 $revPage = $lastRow->rev_page;
482 $revId = $lastRow->rev_id;
483 } else {
484 $done = true;
485 }
486
487 // If we are finished, close off final page element (if any).
488 if ( $done && $lastRow ) {
489 $this->finishPageStreamOutput( $lastRow );
490 }
491
492 if ( !$done ) {
493 $this->reloadDBConfig();
494 }
495 }
496 }
497
/**
 * Runs through a query result set dumping page, revision, and slot records.
 *
 * A page element is opened whenever the namespace/title differs from the
 * previously handled row, closing the previous page first if there was one.
 * The final page is left open so that a following batch can continue it.
 *
 * @param object $results Result set offering fetchObject().
 * @param stdClass|null $lastRow Last revision row handled by the previous
 *   batch, or null on the first batch.
 * @return stdClass|null Last revision row handled in this batch, or the
 *   incoming $lastRow if the result set yielded no rows.
 * @throws LogicException If a slot row is left over after the stream ends.
 */
protected function outputPageStreamBatch( $results, $lastRow ) {
	$rowCarry = null;

	while ( true ) {
		$slotRows = $this->getSlotRowBatch( $results, $rowCarry );
		if ( !$slotRows ) {
			break;
		}

		// All revision info is present in all slot rows,
		// so the first slot row doubles as the revision row.
		$revRow = $slotRows[0];

		$namespaceWanted = !$this->limitNamespaces
			|| in_array( $revRow->page_namespace, $this->limitNamespaces );

		if ( $namespaceWanted ) {
			$startsNewPage = $lastRow === null
				|| $lastRow->page_namespace !== $revRow->page_namespace
				|| $lastRow->page_title !== $revRow->page_title;

			if ( $startsNewPage ) {
				if ( $lastRow !== null ) {
					// Finish the page the previous row belonged to.
					$closing = '';
					if ( $this->dumpUploads ) {
						$closing .= $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
					}
					$closing .= $this->writer->closePage();
					$this->sink->writeClosePage( $closing );
				}
				$opening = $this->writer->openPage( $revRow );
				$this->sink->writeOpenPage( $revRow, $opening );
			}

			try {
				$revisionXml = $this->writer->writeRevision( $revRow, $slotRows );
				$this->sink->writeRevision( $revRow, $revisionXml );
			} catch ( RevisionAccessException $ex ) {
				MWDebug::warning( 'Problem encountered retrieving rev and slot metadata for'
					. ' revision ' . $revRow->rev_id . ': ' . $ex->getMessage() );
			}
		}

		// Rows skipped by the namespace filter are still remembered as the last row.
		$lastRow = $revRow;
	}

	if ( $rowCarry ) {
		throw new LogicException( 'Error while processing a stream of slot rows' );
	}

	// @phan-suppress-next-line PhanTypeMismatchReturnNullable False positive
	return $lastRow;
}
557
/**
 * Returns all slot rows for a revision.
 *
 * Rows are read from the result set until one with a different rev_id shows
 * up; that row is handed back through $carry so the next call starts with it.
 *
 * @param object $results Result set offering fetchObject().
 * @param stdClass|null &$carry In: a row left over from the previous call,
 *   if any. Out: the first row of the next revision, or null when the
 *   result set ran out.
 * @return stdClass[] Slot rows of one revision; empty once nothing is left.
 */
protected function getSlotRowBatch( $results, &$carry = null ) {
	$batch = [];
	$previous = null;

	// Start with the row left over from the previous call, if there is one.
	if ( $carry ) {
		$previous = $carry;
		$batch[] = $previous;
		$carry = null;
	}

	// Reading further rows from the result set for the same rev id
	for ( ; ; ) {
		$row = $results->fetchObject();
		if ( !$row ) {
			break;
		}

		$belongsToNextRevision = $previous && $previous->rev_id !== $row->rev_id;
		if ( $belongsToNextRevision ) {
			$carry = $row;
			break;
		}

		$batch[] = $row;
		$previous = $row;
	}

	return $batch;
}
590
/**
 * Final page stream output, after all batches are complete: writes the
 * uploads (if enabled), the collected author list, and the page close.
 *
 * @param stdClass $lastRow The last revision row that was output.
 */
protected function finishPageStreamOutput( $lastRow ) {
	$uploads = $this->dumpUploads
		? $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents )
		: '';

	$closing = $uploads . $this->author_list;
	$closing .= $this->writer->closePage();

	$this->sink->writeClosePage( $closing );
}
605
/**
 * Writes every log row of the result set to the sink.
 *
 * @param iterable $resultset Log rows.
 * @return int|null log_id of the last row written, or null if there were
 *   no rows (or the last row has no log_id).
 */
protected function outputLogStream( $resultset ) {
	$lastLogId = null;

	foreach ( $resultset as $logRow ) {
		$xml = $this->writer->writeLogItem( $logRow );
		$this->sink->writeLogItem( $logRow, $xml );
		$lastLogId = $logRow->log_id ?? null;
	}

	return $lastLogId;
}
617
/**
 * Asks the DB load balancer factory to auto-reconfigure itself.
 * Called between batches by dumpLogs() and dumpPages().
 */
private function reloadDBConfig() {
	$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
	$lbFactory->autoReconfigure();
}
628}
Handle database storage of comments such as edit summaries and log reasons.
Debug toolbar.
Definition MWDebug.php:49
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Exception representing a failure to look up a revision.
Page revision base class.
Service for looking up page revisions.
MalformedTitleException is thrown when a TitleParser is unable to parse a title string.
revsByRange( $start, $end)
Dumps a series of page and revision records for those pages in the database with revisions falling wi...
dumpFrom( $cond='', $orderRevs=false)
pageByName( $name)
pagesByName( $names)
finishPageStreamOutput( $lastRow)
Final page stream output, after all batches are complete.
IReadableDatabase $db
getSlotRowBatch( $results, &$carry=null)
Returns all slot rows for a revision.
setOutputSink(&$sink)
Set the DumpOutput or DumpFilter object which will receive various row objects and XML output for fil...
pagesByRange( $start, $end, $orderRevs)
Dumps a series of page and revision records for those pages in the database falling within the page_i...
outputPageStreamBatch( $results, $lastRow)
Runs through a query result set dumping page, revision, and slot records.
bool $dumpUploadFileContents
array|int $history
__construct( $db, CommentStore $commentStore, HookContainer $hookContainer, RevisionStore $revisionStore, TitleParser $titleParser, $history=self::CURRENT, $text=self::TEXT, $limitNamespaces=null)
DumpOutput $sink
allPages()
Dumps a series of page and revision records for all pages in the database, either including complete ...
logsByRange( $start, $end)
do_list_authors( $cond)
Generates the distinct list of authors of an article. Not called by default (depends on $this->list_au...
array|null $limitNamespaces
dumpPages( $cond, $orderRevs)
pageByTitle(PageIdentity $page)
outputLogStream( $resultset)
setSchemaVersion( $schemaVersion)
static schemaVersion()
Returns the default export schema version, as defined by the XmlDumpSchemaVersion setting.
string $author_list
bool $list_authors
Return distinct author list (when not returning full history)
Interface for objects (potentially) representing an editable wiki page.
getNamespace()
Returns the page's namespace number.
getDBkey()
Get the page title in DB key form.
A title parser service for MediaWiki.
A database connection without write operations.
Result wrapper for grabbing data queried from an IDatabase object.