MediaWiki master
WikiExporter.php
Go to the documentation of this file.
1<?php
12namespace MediaWiki\Export;
13
14use LogicException;
28use RuntimeException;
29use UnexpectedValueException;
32
/** @var bool Return distinct author list (when not returning full history) */
public $list_authors = false;

/** @var bool When true, upload records are written out as each page is closed */
public $dumpUploads = false;

/** @var bool Forwarded to XmlDumpWriter::writeUploads() when $dumpUploads is set */
public $dumpUploadFileContents = false;

/** @var string XML fragment produced by do_list_authors(); appended when the last page is closed */
public $author_list = "";

// History selection flags; dumpFrom() and dumpPages() test them with bitwise AND.
public const FULL = 1;
public const CURRENT = 2;
public const STABLE = 4; // extension defined
public const LOGS = 8;
public const RANGE = 16;

// Text modes understood by XmlDumpWriter (referenced as self::TEXT / self::STUB below).
public const TEXT = XmlDumpWriter::WRITE_CONTENT;
public const STUB = XmlDumpWriter::WRITE_STUB;

/** Maximum number of rows requested per query by dumpLogs() and dumpPages() */
protected const BATCH_SIZE = 10000;

/** @var int Text mode, one of self::TEXT or self::STUB */
public $text;

/** @var DumpOutput Receives the row objects and XML fragments (a DumpOutput by default) */
public $sink;

/** @var XmlDumpWriter Produces the XML fragments handed to the sink */
private $writer;

/** @var mixed Database handle injected through the constructor */
protected $db;

/** @var array|int Either a bit field of the history flags above, or an array with 'dir', 'offset' and 'limit' keys */
protected $history;

/** @var array|null Namespace ids to restrict page output to; a falsy value disables the filter */
protected $limitNamespaces;

/** @var RevisionStore */
private $revisionStore;

/** @var TitleParser */
private $titleParser;

/** @var HookRunner */
private $hookRunner;

/** @var CommentStore */
private $commentStore;
93
/**
 * Returns the default export schema version, as defined by the
 * XmlDumpSchemaVersion setting.
 *
 * @return mixed Value of the XmlDumpSchemaVersion configuration setting
 */
public static function schemaVersion() {
	$config = MediaWikiServices::getInstance()->getMainConfig();

	// Fully qualified so this method does not depend on a `use` line for the constants class.
	return $config->get( \MediaWiki\MainConfigNames::XmlDumpSchemaVersion );
}
102
/**
 * Wires up the exporter with its collaborators and dump settings.
 *
 * The output sink defaults to a plain DumpOutput; replace it with
 * setOutputSink(). The writer is created with the default schema version
 * reported by schemaVersion().
 *
 * @param mixed $db Database handle used for all queries
 * @param CommentStore $commentStore
 * @param HookContainer $hookContainer
 * @param RevisionStore $revisionStore
 * @param TitleParser $titleParser
 * @param array|int $history One of the history constants, or an array with
 *   'dir', 'offset' and 'limit' keys (see dumpPages())
 * @param int $text One of self::TEXT or self::STUB
 * @param array|null $limitNamespaces Namespace ids to restrict page output to
 */
public function __construct(
	$db,
	CommentStore $commentStore,
	HookContainer $hookContainer,
	RevisionStore $revisionStore,
	TitleParser $titleParser,
	$history = self::CURRENT,
	$text = self::TEXT,
	$limitNamespaces = null
) {
	// Injected collaborators.
	$this->db = $db;
	$this->commentStore = $commentStore;
	$this->revisionStore = $revisionStore;
	$this->titleParser = $titleParser;
	$this->hookRunner = new HookRunner( $hookContainer );

	// Dump settings.
	$this->history = $history;
	$this->text = $text;
	$this->limitNamespaces = $limitNamespaces;

	// Output pipeline: the writer renders XML, the sink receives it.
	$this->sink = new DumpOutput();
	$this->writer = new XmlDumpWriter( $text, self::schemaVersion(), $hookContainer, $commentStore );
}
143
/**
 * Replaces the XML writer with a fresh one for the given schema version.
 *
 * The new writer keeps the current text mode. Unlike the writer created in
 * the constructor, it is built without an explicit hook container or
 * comment store.
 *
 * @param mixed $schemaVersion Schema version passed on to XmlDumpWriter
 */
public function setSchemaVersion( $schemaVersion ) {
	$contentMode = $this->text;
	$this->writer = new XmlDumpWriter( $contentMode, $schemaVersion );
}
152
/**
 * Sets the object that will receive the row objects and XML output.
 *
 * The sink is bound by reference, so the exporter and the caller keep
 * sharing the same variable.
 *
 * @param mixed &$sink Output sink (a DumpOutput is installed by default)
 */
public function setOutputSink( &$sink ) {
	$this->sink = &$sink;
}
163
/**
 * Asks the writer for the opening of the stream and passes it to the sink.
 */
public function openStream() {
	$this->sink->writeOpenStream( $this->writer->openStream() );
}
168
/**
 * Asks the writer for the closing of the stream and passes it to the sink.
 */
public function closeStream() {
	$this->sink->writeCloseStream( $this->writer->closeStream() );
}
173
/**
 * Dumps every page by calling dumpFrom() with an empty condition.
 */
public function allPages() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
182
/**
 * Dumps page and revision records for pages whose id falls within a range.
 *
 * @param int $start Inclusive lower bound (converted with intval())
 * @param int $end Exclusive upper bound; a falsy value leaves the range open
 * @param bool $orderRevs When true the range is applied to rev_page instead
 *   of page_id; the flag is also forwarded to dumpFrom()
 */
public function pagesByRange( $start, $end, $orderRevs ) {
	// Same range test either way; only the column it applies to differs.
	$column = $orderRevs ? 'rev_page' : 'page_id';

	$condition = $column . ' >= ' . intval( $start );
	if ( $end ) {
		$condition .= ' AND ' . $column . ' < ' . intval( $end );
	}

	$this->dumpFrom( $condition, $orderRevs );
}
205
/**
 * Dumps page and revision records for revisions whose rev_id falls within
 * a range.
 *
 * @param int $start Inclusive lower bound (converted with intval())
 * @param int $end Exclusive upper bound; a falsy value leaves the range open
 */
public function revsByRange( $start, $end ) {
	$lowerBound = sprintf( 'rev_id >= %d', intval( $start ) );
	$upperBound = $end ? sprintf( ' AND rev_id < %d', intval( $end ) ) : '';

	$this->dumpFrom( $lowerBound . $upperBound );
}
220
/**
 * Dumps the page identified by the given page identity.
 *
 * @param PageIdentity $page
 */
public function pageByTitle( PageIdentity $page ) {
	$namespace = $page->getNamespace();
	// The DB key is text, so it is quoted by the database layer.
	$quotedTitle = $this->db->addQuotes( $page->getDBkey() );

	$this->dumpFrom( 'page_namespace=' . $namespace . ' AND page_title=' . $quotedTitle );
}
226
/**
 * Dumps the page with the given title text.
 *
 * @param string $name Title text, parsed with the injected TitleParser
 * @throws RuntimeException If the title cannot be parsed; the original
 *   MalformedTitleException is attached as the previous exception
 */
public function pageByName( $name ) {
	try {
		$link = $this->titleParser->parseTitle( $name );
		$this->dumpFrom(
			'page_namespace=' . $link->getNamespace() .
			' AND page_title=' . $this->db->addQuotes( $link->getDBkey() ) );
	} catch ( MalformedTitleException $e ) {
		// Chain the cause so the parse failure is not lost to callers and logs.
		throw new RuntimeException( "Can't export invalid title", 0, $e );
	}
}
240
/**
 * Dumps each of the named pages in turn via pageByName().
 *
 * @param iterable $names Title texts
 */
public function pagesByName( $names ) {
	foreach ( $names as $titleText ) {
		$this->pageByName( $titleText );
	}
}
249
/**
 * Calls dumpFrom() with an empty condition; dumpFrom() produces log output
 * when the exporter's history setting includes self::LOGS.
 */
public function allLogs() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
253
/**
 * Dumps the log entries whose log_id falls within a range.
 *
 * @param int $start Inclusive lower bound (converted with intval())
 * @param int $end Exclusive upper bound; a falsy value leaves the range open
 */
public function logsByRange( $start, $end ) {
	$lowerBound = sprintf( 'log_id >= %d', intval( $start ) );
	$upperBound = $end ? sprintf( ' AND log_id < %d', intval( $end ) ) : '';

	$this->dumpFrom( $lowerBound . $upperBound );
}
265
/**
 * Generates the distinct list of authors for the revisions matching $cond
 * and stores it, as a <contributors> XML fragment, in $this->author_list.
 *
 * Revisions with the DELETED_USER bit set in rev_deleted are left out.
 * Called from dumpPages() only when $this->list_authors is set.
 *
 * @param mixed $cond Extra condition handed to the query builder
 */
protected function do_list_authors( $cond ) {
	$this->author_list = "<contributors>";

	// Only rows whose author has not been suppressed.
	$userVisible = $this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0';

	$rows = $this->revisionStore->newSelectQueryBuilder( $this->db )
		->joinPage()
		->distinct()
		->where( $userVisible )
		->andWhere( $cond )
		->caller( __METHOD__ )
		->fetchResultSet();

	foreach ( $rows as $row ) {
		$userName = htmlspecialchars( $row->rev_user_text );
		$userId = (int)$row->rev_user;
		$this->author_list .=
			"<contributor><username>{$userName}</username><id>{$userId}</id></contributor>";
	}

	$this->author_list .= "</contributors>";
}
296
/**
 * Dispatches to the log dump or the page dump depending on the history
 * setting.
 *
 * @param string $cond Condition restricting what is dumped; '' for none
 * @param bool $orderRevs Forwarded to dumpPages()
 */
protected function dumpFrom( $cond = '', $orderRevs = false ) {
	// An array-valued history setting can never request logs.
	$logsRequested = is_int( $this->history ) && ( $this->history & self::LOGS );

	if ( !$logsRequested ) {
		$this->dumpPages( $cond, $orderRevs );
		return;
	}

	$this->dumpLogs( $cond );
}
308
/**
 * Streams log entries to the sink in batches of self::BATCH_SIZE, ordered
 * by log_id.
 *
 * Each batch resumes after the highest log_id already written. The loop ends
 * with the first empty batch.
 *
 * @param string $cond Caller-supplied condition; ignored when falsy
 */
protected function dumpLogs( $cond ) {
	// The exclusion clause hiding private logs goes first, then the caller's condition.
	$where = [];
	$hideLogs = LogEventsList::getExcludeClause( $this->db );
	foreach ( [ $hideLogs, $cond ] as $fragment ) {
		if ( $fragment ) {
			$where[] = $fragment;
		}
	}

	$commentQuery = $this->commentStore->getJoin( 'log_comment' );
	$columns = [
		'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
		'log_title', 'log_params', 'log_deleted', 'actor_user', 'actor_name'
	];

	$lastLogId = 0;
	do {
		$result = $this->db->newSelectQueryBuilder()
			->select( $columns )
			->from( 'logging' )
			->join( 'actor', null, 'actor_id=log_actor' )
			->where( $where )
			->andWhere( $this->db->expr( 'log_id', '>', intval( $lastLogId ) ) )
			->orderBy( 'log_id' )
			->useIndex( [ 'logging' => 'PRIMARY' ] )
			->limit( self::BATCH_SIZE )
			->queryInfo( $commentQuery )
			->caller( __METHOD__ )
			->fetchResultSet();

		$gotRows = (bool)$result->numRows();
		if ( $gotRows ) {
			$lastLogId = $this->outputLogStream( $result );
			$this->reloadDBConfig();
		}
	} while ( $gotRows );
}
352
/**
 * Streams page, revision and slot rows to the sink in batches.
 *
 * The history setting decides which revisions are selected:
 *  - array: ordered by rev_timestamp, with optional 'offset' and 'limit'
 *  - self::FULL: all revisions
 *  - self::CURRENT: only the revision named by page_latest
 *  - self::STABLE: determined by the WikiExporter::dumpStableQuery hook
 *  - self::RANGE: the range already expressed in $cond
 *
 * @param string $cond Caller-supplied condition; '' for none
 * @param bool $orderRevs Accepted for interface compatibility; not read here
 * @throws LogicException If no hook handles a STABLE dump
 * @throws UnexpectedValueException If the history setting is not recognised
 */
protected function dumpPages( $cond, $orderRevs ) {
	$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
	$slotQuery = $this->revisionStore->getSlotsQueryInfo( [ 'content' ] );

	// We want page primary rather than revision, and the slots and content
	// tables joined in as well.
	// NOTE: This means we may get multiple rows per revision, and more rows
	// than the batch size! Should be ok, since the max number of slots is
	// fixed and low (dozens at worst).
	$tables = array_merge( [ 'page' ], array_diff( $revQuery['tables'], [ 'page' ] ) );
	$tables = array_merge( $tables, array_diff( $slotQuery['tables'], $tables ) );

	// The revision store's 'page' join condition is reused for 'revision'.
	$join = $revQuery['joins'];
	$join += [
		'revision' => $revQuery['joins']['page'],
		'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
		'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
	];
	unset( $join['page'] );

	$fields = array_merge( $revQuery['fields'], $slotQuery['fields'] );
	if ( $this->text != self::STUB ) {
		$fields['_load_content'] = '1';
	}

	$conds = $cond !== '' ? [ $cond ] : [];
	$opts = [
		'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ],
		'USE INDEX' => [],
	];

	$op = '>';
	$maxRowCount = null;
	$history = $this->history;

	if ( is_array( $history ) ) {
		# Time offset/limit for all pages/history...
		$ascending = $history['dir'] == 'asc';
		if ( !$ascending ) {
			$op = '<';
		}
		$opts['ORDER BY'] = $ascending ? 'rev_timestamp ASC' : 'rev_timestamp DESC';

		# Offset
		if ( !empty( $history['offset'] ) ) {
			$quotedOffset = $this->db->addQuotes( $this->db->timestamp( $history['offset'] ) );
			$conds[] = "rev_timestamp $op " . $quotedOffset;
		}

		# Overall row limit
		if ( !empty( $history['limit'] ) ) {
			$maxRowCount = intval( $history['limit'] );
		}
	} elseif ( $history & self::FULL ) {
		# Full history dumps; query optimization for history stub dumps
		if ( $this->text == self::STUB ) {
			$opts[] = 'STRAIGHT_JOIN';
			unset( $join['revision'] );
			$join['page'] = [ 'JOIN', 'rev_page=page_id' ];
		}
	} elseif ( $history & self::CURRENT ) {
		# Latest revision dumps; list authors first, if so desired
		if ( $this->list_authors && $cond != '' ) {
			$this->do_list_authors( $cond );
		}
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		$opts['ORDER BY'] = [ 'page_id ASC' ];
	} elseif ( $history & self::STABLE ) {
		# "Stable" revision dumps: default JOIN, to be overridden...
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		# One, and only one hook should set this, and return false
		$unhandled = $this->hookRunner->onWikiExporter__dumpStableQuery( $tables, $opts, $join );
		if ( $unhandled ) {
			throw new LogicException( __METHOD__ . " given invalid history dump type." );
		}
	} elseif ( $history & self::RANGE ) {
		# Dump of revisions within a specified range. Condition already set in revsByRange().
	} else {
		# Unknown history specification parameter?
		throw new UnexpectedValueException( __METHOD__ . " given invalid history dump type." );
	}

	$opts['LIMIT'] = self::BATCH_SIZE;

	$this->hookRunner->onModifyExportQuery(
		$this->db, $tables, $cond, $opts, $join, $conds );

	$lastRow = null;
	$lastPageId = 0;
	$lastRevId = 0;
	$rowsSeen = 0;
	$finished = false;

	do {
		// If necessary, impose the overall maximum and stop looping after this iteration.
		if ( !empty( $maxRowCount ) && $rowsSeen + self::BATCH_SIZE > $maxRowCount ) {
			$opts['LIMIT'] = $maxRowCount - $rowsSeen;
			$finished = true;
		}

		// Resume after the last (page, revision) pair already written.
		$resumeAfter = $this->db->expr( 'rev_page', '>', intval( $lastPageId ) )->orExpr(
			$this->db->expr( 'rev_page', '=', intval( $lastPageId ) )
				->and( 'rev_id', $op, intval( $lastRevId ) )
		);

		$result = $this->db->newSelectQueryBuilder()
			->tables( $tables )
			->fields( $fields )
			->where( $conds )
			->andWhere( $resumeAfter )
			->caller( __METHOD__ )
			->options( $opts )
			->joinConds( $join )
			->fetchResultSet();

		$batchRows = $result->numRows();
		if ( $batchRows > 0 ) {
			$lastRow = $this->outputPageStreamBatch( $result, $lastRow );
			$rowsSeen += $batchRows;
			$lastPageId = $lastRow->rev_page;
			$lastRevId = $lastRow->rev_id;
		} else {
			$finished = true;
		}

		if ( $finished ) {
			// Close off the final page element (if any).
			if ( $lastRow ) {
				$this->finishPageStreamOutput( $lastRow );
			}
		} else {
			$this->reloadDBConfig();
		}
	} while ( !$finished );
}
486
/**
 * Runs through a query result set dumping page, revision, and slot records.
 *
 * Rows are consumed one revision at a time via getSlotRowBatch(). A page is
 * opened when the namespace or title differs from the previous row, and the
 * previous page, if any, is closed first. The last page is left open so that
 * the next batch can continue it.
 *
 * @param mixed $results Result set; must support fetchObject()
 * @param \stdClass|null $lastRow Last row of the previous batch, or null
 * @return \stdClass|null Last row handled, including rows skipped by the
 *   namespace filter
 * @throws LogicException If a carried-over row is left unprocessed
 */
protected function outputPageStreamBatch( $results, $lastRow ) {
	$rowCarry = null;

	// An empty batch means the result set is exhausted.
	// phpcs:ignore Generic.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition
	while ( $slotRows = $this->getSlotRowBatch( $results, $rowCarry ) ) {
		// All revision info is present in all slot rows, so the first
		// slot row doubles as the revision row.
		$revRow = $slotRows[0];

		$namespaceExcluded = $this->limitNamespaces &&
			!in_array( $revRow->page_namespace, $this->limitNamespaces );

		if ( !$namespaceExcluded ) {
			$startsNewPage = $lastRow === null ||
				$lastRow->page_namespace !== $revRow->page_namespace ||
				$lastRow->page_title !== $revRow->page_title;

			if ( $startsNewPage ) {
				if ( $lastRow !== null ) {
					// Finish the page the previous row belonged to.
					$closing = '';
					if ( $this->dumpUploads ) {
						$closing .= $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
					}
					$closing .= $this->writer->closePage();
					$this->sink->writeClosePage( $closing );
				}
				$opening = $this->writer->openPage( $revRow );
				$this->sink->writeOpenPage( $revRow, $opening );
			}

			try {
				$revisionXml = $this->writer->writeRevision( $revRow, $slotRows );
				$this->sink->writeRevision( $revRow, $revisionXml );
			} catch ( RevisionAccessException $ex ) {
				MWDebug::warning( 'Problem encountered retrieving rev and slot metadata for'
					. ' revision ' . $revRow->rev_id . ': ' . $ex->getMessage() );
			}
		}

		// NOTE(review): a row skipped by the namespace filter still becomes
		// $lastRow, so the next included page appears to emit a close-page for
		// a page that was never opened - confirm against the sink in use.
		$lastRow = $revRow;
	}

	if ( $rowCarry ) {
		throw new LogicException( 'Error while processing a stream of slot rows' );
	}

	// @phan-suppress-next-line PhanTypeMismatchReturnNullable False positive
	return $lastRow;
}
546
/**
 * Returns all slot rows for a revision.
 *
 * Reads rows until the rev_id changes. The first row of the next revision
 * is handed back through $carry so the following call can start with it.
 *
 * @param mixed $results Result set; must support fetchObject()
 * @param \stdClass|null &$carry In: row left over from the previous call.
 *   Out: first row of the next revision, or null if the rows ran out
 * @return \stdClass[] Rows sharing one rev_id; empty once the result set is
 *   exhausted and nothing was carried over
 */
protected function getSlotRowBatch( $results, &$carry = null ) {
	$slotRows = [];

	if ( $carry ) {
		$slotRows = [ $carry ];
		$carry = null;
	}

	// Every collected row shares the first row's rev_id, so comparing
	// against the first row is the same as comparing against the previous one.
	for ( $row = $results->fetchObject(); $row; $row = $results->fetchObject() ) {
		if ( $slotRows && $slotRows[0]->rev_id !== $row->rev_id ) {
			$carry = $row;
			break;
		}
		$slotRows[] = $row;
	}

	return $slotRows;
}
579
/**
 * Final page stream output, after all batches are complete.
 *
 * Writes the uploads (when enabled), then the collected author list, then
 * the end of the page.
 *
 * @param \stdClass $lastRow Last row written by outputPageStreamBatch()
 */
protected function finishPageStreamOutput( $lastRow ) {
	$uploads = $this->dumpUploads
		? $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents )
		: '';

	$closing = '' . $uploads . $this->author_list . $this->writer->closePage();
	$this->sink->writeClosePage( $closing );
}
594
/**
 * Writes every log row of the result set to the sink.
 *
 * @param iterable $resultset Log rows
 * @return mixed The log_id of the last row written, or null if there were
 *   no rows
 */
protected function outputLogStream( $resultset ) {
	$lastWritten = null;

	foreach ( $resultset as $logRow ) {
		$this->sink->writeLogItem( $logRow, $this->writer->writeLogItem( $logRow ) );
		$lastWritten = $logRow;
	}

	return $lastWritten->log_id ?? null;
}
606
/**
 * Asks the load balancer factory to reconfigure itself; called between
 * batches by dumpLogs() and dumpPages().
 */
private function reloadDBConfig() {
	$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
	$lbFactory->autoReconfigure();
}
617}
618
// Register the un-namespaced name 'WikiExporter' as an alias of the namespaced class.
620class_alias( WikiExporter::class, 'WikiExporter' );
Handle database storage of comments such as edit summaries and log reasons.
Debug toolbar.
Definition MWDebug.php:35
finishPageStreamOutput( $lastRow)
Final page stream output, after all batches are complete.
dumpPages( $cond, $orderRevs)
do_list_authors( $cond)
Generates the distinct list of authors of an article. Not called by default (depends on $this->list_au...
static schemaVersion()
Returns the default export schema version, as defined by the XmlDumpSchemaVersion setting.
outputPageStreamBatch( $results, $lastRow)
Runs through a query result set dumping page, revision, and slot records.
pageByTitle(PageIdentity $page)
setSchemaVersion( $schemaVersion)
revsByRange( $start, $end)
Dumps a series of page and revision records for those pages in the database with revisions falling wi...
bool $list_authors
Return distinct author list (when not returning full history)
dumpFrom( $cond='', $orderRevs=false)
allPages()
Dumps a series of page and revision records for all pages in the database, either including complete ...
getSlotRowBatch( $results, &$carry=null)
Returns all slot rows for a revision.
setOutputSink(&$sink)
Set the DumpOutput or DumpFilter object which will receive various row objects and XML output for fil...
pagesByRange( $start, $end, $orderRevs)
Dumps a series of page and revision records for those pages in the database falling within the page_i...
__construct( $db, CommentStore $commentStore, HookContainer $hookContainer, RevisionStore $revisionStore, TitleParser $titleParser, $history=self::CURRENT, $text=self::TEXT, $limitNamespaces=null)
const WRITE_CONTENT
Output serialized revision content.
const WRITE_STUB
Only output stubs for revision content.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
A class containing constants representing the names of configuration variables.
const XmlDumpSchemaVersion
Name constant for the XmlDumpSchemaVersion setting, for use with Config::get()
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
Exception representing a failure to look up a revision.
Page revision base class.
Service for looking up page revisions.
MalformedTitleException is thrown when a TitleParser is unable to parse a title string.
A title parser service for MediaWiki.
Interface for objects (potentially) representing an editable wiki page.
getNamespace()
Returns the page's namespace number.
getDBkey()
Get the page title in DB key form.
A database connection without write operations.
Result wrapper for grabbing data queried from an IDatabase object.