MediaWiki REL1_39
WikiExporter.php
Go to the documentation of this file.
1<?php
40
/** @var bool Return distinct author list (when not returning full history) */
public $list_authors = false;

/** @var bool When true, upload records are written out whenever a page element is closed */
public $dumpUploads = false;

/** @var bool Forwarded to XmlDumpWriter::writeUploads() together with the page row */
public $dumpUploadFileContents = false;

/** @var string XML fragment built by do_list_authors() and emitted when the final page is closed */
public $author_list = "";

// History modes accepted as the $history constructor argument (tested as bit flags).
public const FULL = 1;
public const CURRENT = 2;
public const STABLE = 4; // extension defined
public const LOGS = 8;
public const RANGE = 16;

// Content modes accepted as the $text constructor argument.
public const TEXT = XmlDumpWriter::WRITE_CONTENT;
public const STUB = XmlDumpWriter::WRITE_STUB;

/** Maximum number of rows requested from the database per query */
protected const BATCH_SIZE = 50000;

/** @var mixed Content mode, one of self::TEXT or self::STUB */
public $text;

/** @var DumpOutput Receives the row objects and the XML generated for them */
public $sink;

/** @var XmlDumpWriter */
private $writer;

/** @var IDatabase */
protected $db;

/** @var array|int Either a combination of the history mode flags, or an offset/limit/dir array */
protected $history;

/**
 * Declared explicitly because the constructor assigns it; relying on a dynamic
 * property is deprecated as of PHP 8.2.
 *
 * @var array|null Namespaces to restrict the dump to, or null for no restriction
 */
protected $limitNamespaces;

/** @var RevisionStore */
private $revisionStore;

/** @var TitleParser */
private $titleParser;

/** @var HookRunner */
private $hookRunner;

94
/**
 * Returns the default export schema version, as defined by the
 * XmlDumpSchemaVersion setting.
 *
 * @return mixed The configured value of MainConfigNames::XmlDumpSchemaVersion
 */
public static function schemaVersion() {
	$mainConfig = MediaWikiServices::getInstance()->getMainConfig();

	return $mainConfig->get( MainConfigNames::XmlDumpSchemaVersion );
}
103
/**
 * @param IDatabase $db Database handle used for all export queries
 * @param HookContainer $hookContainer Wrapped in a HookRunner for the export hooks
 * @param RevisionStore $revisionStore Source of the revision and slot query info
 * @param TitleParser $titleParser Used by pageByName() to parse title strings
 * @param int|array $history One of the history mode constants (FULL, CURRENT, ...),
 *   or an array with 'dir', 'offset' and 'limit' keys
 * @param mixed $text One of self::TEXT or self::STUB
 * @param array|null $limitNamespaces Namespaces to restrict the dump to; revisions
 *   of pages in any other namespace are skipped. Null (or empty) disables the filter.
 */
public function __construct(
	$db,
	HookContainer $hookContainer,
	RevisionStore $revisionStore,
	TitleParser $titleParser,
	$history = self::CURRENT,
	$text = self::TEXT,
	$limitNamespaces = null
) {
	// Injected services.
	$this->db = $db;
	$this->revisionStore = $revisionStore;
	$this->titleParser = $titleParser;
	$this->hookRunner = new HookRunner( $hookContainer );

	// Dump options.
	$this->history = $history;
	$this->text = $text;
	$this->limitNamespaces = $limitNamespaces;

	// Output pipeline.
	// TODO: add a $hookContainer parameter to XmlDumpWriter so that we can inject
	// and then be able to convert the factory test to a unit test
	$this->writer = new XmlDumpWriter( $text, self::schemaVersion() );
	$this->sink = new DumpOutput();
}
138
/**
 * Replaces the XML writer with one that targets the given schema version,
 * keeping the content mode chosen at construction time.
 *
 * @param mixed $schemaVersion Schema version handed to XmlDumpWriter
 */
public function setSchemaVersion( $schemaVersion ) {
	$contentMode = $this->text;
	$this->writer = new XmlDumpWriter( $contentMode, $schemaVersion );
}
147
/**
 * Set the DumpOutput or DumpFilter object which will receive various row
 * objects and XML output.
 *
 * The sink is bound by reference, so the exporter keeps following the
 * caller's variable rather than a copy of its current value.
 *
 * @param DumpOutput &$sink
 */
public function setOutputSink( &$sink ) {
	$this->sink = &$sink;
}
158
/**
 * Writes the stream opening produced by the XML writer to the sink.
 */
public function openStream() {
	$header = $this->writer->openStream();
	$this->sink->writeOpenStream( $header );
}
163
/**
 * Writes the stream closing produced by the XML writer to the sink.
 */
public function closeStream() {
	$footer = $this->writer->closeStream();
	$this->sink->writeCloseStream( $footer );
}
168
/**
 * Dumps page and revision records for all pages, i.e. without any
 * additional filter condition.
 */
public function allPages() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
177
/**
 * Dumps page and revision records for the pages whose id falls within
 * the given range.
 *
 * @param int $start Inclusive lower bound of the page id range
 * @param int $end Exclusive upper bound; a falsy value leaves the range open-ended
 * @param bool $orderRevs When true the range is applied to rev_page instead of
 *   page_id; the flag is also forwarded to dumpFrom()
 */
public function pagesByRange( $start, $end, $orderRevs ) {
	$column = $orderRevs ? 'rev_page' : 'page_id';

	$bounds = [ $column . ' >= ' . intval( $start ) ];
	if ( $end ) {
		$bounds[] = $column . ' < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $bounds ), $orderRevs );
}
200
/**
 * Dumps page and revision records for revisions whose id falls within
 * the given range.
 *
 * @param int $start Inclusive lower bound of the rev_id range
 * @param int $end Exclusive upper bound; a falsy value leaves the range open-ended
 */
public function revsByRange( $start, $end ) {
	$lower = intval( $start );
	$condition = "rev_id >= $lower";

	if ( $end ) {
		$upper = intval( $end );
		$condition .= " AND rev_id < $upper";
	}

	$this->dumpFrom( $condition );
}
215
/**
 * Dumps the page identified by the given namespace and DB key.
 *
 * @param PageIdentity $page
 */
public function pageByTitle( PageIdentity $page ) {
	$namespaceCond = 'page_namespace=' . $page->getNamespace();
	$titleCond = 'page_title=' . $this->db->addQuotes( $page->getDBkey() );

	$this->dumpFrom( $namespaceCond . ' AND ' . $titleCond );
}
224
/**
 * Dumps the page whose title is given as a string.
 *
 * @param string $name Title text, parsed with the injected TitleParser
 * @throws MWException If the title cannot be parsed. The originating
 *   MalformedTitleException is attached as the previous exception so the
 *   reason for the failure is not lost.
 */
public function pageByName( $name ) {
	try {
		$link = $this->titleParser->parseTitle( $name );
		$this->dumpFrom(
			'page_namespace=' . $link->getNamespace() .
			' AND page_title=' . $this->db->addQuotes( $link->getDBkey() ) );
	} catch ( MalformedTitleException $ex ) {
		// Same message and exception type as before; only the cause is chained.
		throw new MWException( "Can't export invalid title", 0, $ex );
	}
}
239
/**
 * Dumps each of the named pages in turn, in the order given.
 *
 * @param string[] $names Title strings, each handled by pageByName()
 */
public function pagesByName( $names ) {
	foreach ( $names as $titleText ) {
		$this->pageByName( $titleText );
	}
}
248
/**
 * Starts a dump without any additional filter condition. Whether log
 * entries or pages are written is decided by dumpFrom() from the history mode.
 */
public function allLogs() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
252
/**
 * Starts a dump restricted to the given log_id range.
 *
 * @param int $start Inclusive lower bound of the log_id range
 * @param int $end Exclusive upper bound; a falsy value leaves the range open-ended
 */
public function logsByRange( $start, $end ) {
	$lower = intval( $start );
	$condition = "log_id >= $lower";

	if ( $end ) {
		$upper = intval( $end );
		$condition .= " AND log_id < $upper";
	}

	$this->dumpFrom( $condition );
}
264
/**
 * Generates the distinct list of authors for the revisions matching $cond
 * and stores it, as a <contributors> XML fragment, in $this->author_list.
 *
 * Revisions whose rev_deleted field has the DELETED_USER bit set are left out.
 *
 * @param string $cond Raw SQL condition selecting the page(s) of interest
 */
protected function do_list_authors( $cond ) {
	$this->author_list = "<contributors>";
	// rev_deleted

	$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );

	$fields = [
		'rev_user_text' => $revQuery['fields']['rev_user_text'],
		'rev_user' => $revQuery['fields']['rev_user'],
	];
	$conds = [
		$this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0',
		$cond,
	];

	$res = $this->db->select(
		$revQuery['tables'],
		$fields,
		$conds,
		__METHOD__,
		[ 'DISTINCT' ],
		$revQuery['joins']
	);

	foreach ( $res as $row ) {
		$userName = htmlspecialchars( $row->rev_user_text );
		$userId = (int)$row->rev_user;

		$this->author_list .=
			"<contributor><username>{$userName}</username><id>{$userId}</id></contributor>";
	}

	$this->author_list .= "</contributors>";
}
304
/**
 * Dispatches to the log dump or the page dump, depending on the history mode.
 *
 * @param string $cond Raw SQL condition, or '' for none
 * @param bool $orderRevs Forwarded to dumpPages(); ignored for log dumps
 */
protected function dumpFrom( $cond = '', $orderRevs = false ) {
	// An array-valued history (offset/limit spec) can never request logs.
	$wantsLogs = is_int( $this->history ) && ( $this->history & self::LOGS );

	if ( $wantsLogs ) {
		$this->dumpLogs( $cond );
		return;
	}

	$this->dumpPages( $cond, $orderRevs );
}
318
/**
 * Streams log entries matching the given condition to the sink, fetching
 * them in batches of BATCH_SIZE rows ordered by log_id.
 *
 * @param string $cond Raw SQL condition; a falsy value means no extra condition
 */
protected function dumpLogs( $cond ) {
	$baseConds = [];
	# Hide private logs
	$excludeClause = LogEventsList::getExcludeClause( $this->db );
	if ( $excludeClause ) {
		$baseConds[] = $excludeClause;
	}
	# Add on any caller specified conditions
	if ( $cond ) {
		$baseConds[] = $cond;
	}

	$commentQuery = CommentStore::getStore()->getJoin( 'log_comment' );

	$tables = array_merge( [ 'logging', 'actor' ], $commentQuery['tables'] );
	$fields = [
		'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
		'log_title', 'log_params', 'log_deleted', 'actor_user', 'actor_name'
	] + $commentQuery['fields'];
	$joins = [
		'actor' => [ 'JOIN', 'actor_id=log_actor' ]
	] + $commentQuery['joins'];
	$options = [
		'ORDER BY' => 'log_id',
		'USE INDEX' => [ 'logging' => 'PRIMARY' ],
		'LIMIT' => self::BATCH_SIZE,
	];

	// Page through the table: each batch starts after the last log_id already written.
	$lastLogId = 0;
	do {
		$batchConds = $baseConds;
		$batchConds[] = 'log_id > ' . intval( $lastLogId );

		$batch = $this->db->select(
			$tables,
			$fields,
			$batchConds,
			__METHOD__,
			$options,
			$joins
		);

		$hasRows = (bool)$batch->numRows();
		if ( $hasRows ) {
			$lastLogId = $this->outputLogStream( $batch );
			$this->reloadDBConfig();
		}
	} while ( $hasRows );
}
373
/**
 * Streams page, revision and slot rows matching the given condition to the
 * sink, fetching them in batches of BATCH_SIZE rows.
 *
 * The shape of the query depends on $this->history:
 *  - array: ordered by rev_timestamp according to 'dir', optionally bounded by
 *    'offset' (a timestamp) and capped at 'limit' rows overall;
 *  - FULL: all revisions (with a join-order optimisation for stub dumps);
 *  - CURRENT: only the latest revision of each page;
 *  - STABLE: query adjusted by the WikiExporter::dumpStableQuery hook;
 *  - RANGE: no adjustment, the condition was already built by revsByRange().
 *
 * @param string $cond Raw SQL condition, or '' for none
 * @param bool $orderRevs Accepted for interface compatibility; not read by this method
 * @throws MWException If $this->history matches no known mode, or if no hook
 *   handler takes over the STABLE query
 */
protected function dumpPages( $cond, $orderRevs ) {
	$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
	$slotQuery = $this->revisionStore->getSlotsQueryInfo( [ 'content' ] );

	// We want page primary rather than revision.
	// We also want to join in the slots and content tables.
	// NOTE: This means we may get multiple rows per revision, and more rows
	// than the batch size! Should be ok, since the max number of slots is
	// fixed and low (dozens at worst).
	$tables = array_merge( [ 'page' ], array_diff( $revQuery['tables'], [ 'page' ] ) );
	$tables = array_merge( $tables, array_diff( $slotQuery['tables'], $tables ) );
	$join = $revQuery['joins'] + [
		'revision' => $revQuery['joins']['page'],
		'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
		'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
	];
	unset( $join['page'] );

	$fields = array_merge( $revQuery['fields'], $slotQuery['fields'] );

	if ( $this->text != self::STUB ) {
		$fields['_load_content'] = '1';
	}

	$conds = [];
	if ( $cond !== '' ) {
		$conds[] = $cond;
	}
	$opts = [ 'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ] ];
	$opts['USE INDEX'] = [];

	$op = '>';
	// Overall row cap; stays null unless an array-style history supplies a limit.
	$maxRowCount = null;
	if ( is_array( $this->history ) ) {
		# Time offset/limit for all pages/history...
		# Set time order
		// A missing 'dir' key is treated like any non-'asc' value (descending),
		// without raising an undefined-index warning.
		if ( ( $this->history['dir'] ?? null ) == 'asc' ) {
			$opts['ORDER BY'] = 'rev_timestamp ASC';
		} else {
			$op = '<';
			$opts['ORDER BY'] = 'rev_timestamp DESC';
		}
		# Set offset
		if ( !empty( $this->history['offset'] ) ) {
			$conds[] = "rev_timestamp $op " .
				$this->db->addQuotes( $this->db->timestamp( $this->history['offset'] ) );
		}
		# Set query limit
		if ( !empty( $this->history['limit'] ) ) {
			$maxRowCount = intval( $this->history['limit'] );
		}
	} elseif ( $this->history & self::FULL ) {
		# Full history dumps...
		# query optimization for history stub dumps
		if ( $this->text == self::STUB ) {
			$opts[] = 'STRAIGHT_JOIN';
			unset( $join['revision'] );
			$join['page'] = [ 'JOIN', 'rev_page=page_id' ];
		}
	} elseif ( $this->history & self::CURRENT ) {
		# Latest revision dumps...
		if ( $this->list_authors && $cond != '' ) { // List authors, if so desired
			$this->do_list_authors( $cond );
		}
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		$opts[ 'ORDER BY' ] = [ 'page_id ASC' ];
	} elseif ( $this->history & self::STABLE ) {
		# "Stable" revision dumps...
		# Default JOIN, to be overridden...
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		# One, and only one hook should set this, and return false
		if ( $this->hookRunner->onWikiExporter__dumpStableQuery( $tables, $opts, $join ) ) {
			throw new MWException( __METHOD__ . " given invalid history dump type." );
		}
	} elseif ( $this->history & self::RANGE ) {
		# Dump of revisions within a specified range. Condition already set in revsByRange().
	} else {
		# Unknown history specification parameter?
		throw new MWException( __METHOD__ . " given invalid history dump type." );
	}

	$done = false;
	$lastRow = null;
	$revPage = 0;
	$revId = 0;
	$rowCount = 0;

	$opts['LIMIT'] = self::BATCH_SIZE;

	$this->hookRunner->onModifyExportQuery(
		$this->db, $tables, $cond, $opts, $join, $conds );

	while ( !$done ) {
		// If necessary, impose the overall maximum and stop looping after this iteration.
		// ">=" (rather than ">") also ends the loop when the remaining allowance is
		// exactly one batch, which avoids issuing a trailing query with LIMIT 0.
		if ( !empty( $maxRowCount ) && $rowCount + self::BATCH_SIZE >= $maxRowCount ) {
			$opts['LIMIT'] = $maxRowCount - $rowCount;
			$done = true;
		}

		// Resume after the last (rev_page, rev_id) pair written by the previous batch.
		// NOTE(review): this paging key is used even when ORDER BY is rev_timestamp
		// (array-style history) - confirm that is intended for multi-batch dumps.
		$queryConds = $conds;
		$queryConds[] = 'rev_page>' . intval( $revPage ) . ' OR (rev_page=' .
			intval( $revPage ) . ' AND rev_id' . $op . intval( $revId ) . ')';

		# Do the query and process any results, remembering max ids for the next iteration.
		$result = $this->db->select(
			$tables,
			$fields,
			$queryConds,
			__METHOD__,
			$opts,
			$join
		);
		if ( $result->numRows() > 0 ) {
			$lastRow = $this->outputPageStreamBatch( $result, $lastRow );
			$rowCount += $result->numRows();
			$revPage = $lastRow->rev_page;
			$revId = $lastRow->rev_id;
		} else {
			$done = true;
		}

		// If we are finished, close off final page element (if any).
		if ( $done && $lastRow ) {
			$this->finishPageStreamOutput( $lastRow );
		}

		if ( !$done ) {
			$this->reloadDBConfig();
		}
	}
}
511
/**
 * Runs through a query result set dumping page, revision, and slot records.
 *
 * The final page element is left open so that the next batch can continue
 * it; finishPageStreamOutput() closes it once all batches are complete.
 *
 * @param object $results Result set whose rows are consumed via getSlotRowBatch()
 * @param stdClass|null $lastRow Last revision row handled by the previous batch,
 *   or null when this is the first batch
 * @return stdClass|null Last revision row handled, including rows skipped by
 *   the namespace filter
 * @throws LogicException If a carried-over slot row is left unprocessed
 */
protected function outputPageStreamBatch( $results, $lastRow ) {
	$carriedRow = null;

	// An empty batch means the result set is exhausted.
	while ( $slotRows = $this->getSlotRowBatch( $results, $carriedRow ) ) {
		// All revision info is present in all slot rows.
		// Use the first slot row as the revision row.
		$revRow = $slotRows[0];

		$namespaceExcluded = $this->limitNamespaces &&
			!in_array( $revRow->page_namespace, $this->limitNamespaces );
		if ( $namespaceExcluded ) {
			$lastRow = $revRow;
			continue;
		}

		$startsNewPage = $lastRow === null ||
			$lastRow->page_namespace !== $revRow->page_namespace ||
			$lastRow->page_title !== $revRow->page_title;
		if ( $startsNewPage ) {
			if ( $lastRow !== null ) {
				// Close the element belonging to the previously seen row.
				$closing = '';
				if ( $this->dumpUploads ) {
					$closing .= $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
				}
				$closing .= $this->writer->closePage();
				$this->sink->writeClosePage( $closing );
			}
			$opening = $this->writer->openPage( $revRow );
			$this->sink->writeOpenPage( $revRow, $opening );
		}

		try {
			$revisionXml = $this->writer->writeRevision( $revRow, $slotRows );
			$this->sink->writeRevision( $revRow, $revisionXml );
		} catch ( RevisionAccessException $ex ) {
			// Log the problem and carry on with the next revision.
			MWDebug::warning( 'Problem encountered retrieving rev and slot metadata for'
				. ' revision ' . $revRow->rev_id . ': ' . $ex->getMessage() );
		}

		$lastRow = $revRow;
	}

	if ( $carriedRow ) {
		throw new LogicException( 'Error while processing a stream of slot rows' );
	}

	// @phan-suppress-next-line PhanTypeMismatchReturnNullable False positive
	return $lastRow;
}
571
/**
 * Returns all slot rows for a revision.
 *
 * Rows are read until one with a different rev_id shows up. That row is
 * handed back through $carry so the next call can start with it.
 *
 * @param object $results Result set providing fetchObject()
 * @param stdClass|null &$carry In: row left over from the previous call, if any.
 *   Out: first row of the following revision, or null when the result set ran out.
 * @return stdClass[] Slot rows of one revision; empty when nothing is left
 */
protected function getSlotRowBatch( $results, &$carry = null ) {
	$batch = [];
	$previous = null;

	if ( $carry ) {
		$previous = $carry;
		$batch[] = $previous;
		$carry = null;
	}

	for ( $row = $results->fetchObject(); $row; $row = $results->fetchObject() ) {
		if ( $previous && $previous->rev_id !== $row->rev_id ) {
			// First row of the next revision: keep it for the following call.
			$carry = $row;
			break;
		}
		$batch[] = $row;
		$previous = $row;
	}

	return $batch;
}
602
/**
 * Final page stream output, after all batches are complete.
 *
 * Writes the upload records (when enabled), the collected author list and
 * the closing of the page element to the sink.
 *
 * @param stdClass $lastRow The last row output from the previous call (or null if none)
 */
protected function finishPageStreamOutput( $lastRow ) {
	$uploads = '';
	if ( $this->dumpUploads ) {
		$uploads .= $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
	}

	$this->sink->writeClosePage(
		$uploads . $this->author_list . $this->writer->closePage()
	);
}
617
/**
 * Writes every log row of the result set to the sink.
 *
 * @param iterable $resultset Log rows to write
 * @return int|null log_id of the last row written, or null if there was no
 *   row or the last row carried no log_id
 */
protected function outputLogStream( $resultset ) {
	$lastLogId = null;

	foreach ( $resultset as $logRow ) {
		$xml = $this->writer->writeLogItem( $logRow );
		$this->sink->writeLogItem( $logRow, $xml );
		$lastLogId = $logRow->log_id ?? null;
	}

	return $lastLogId;
}
629
/**
 * Calls autoReconfigure() on the DB load balancer factory. The dump loops
 * invoke this between batches.
 */
private function reloadDBConfig() {
	$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
	$lbFactory->autoReconfigure();
}
640}
MediaWiki exception.
MalformedTitleException is thrown when a TitleParser is unable to parse a title string.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Exception representing a failure to look up a revision.
Page revision base class.
Service for looking up page revisions.
revsByRange( $start, $end)
Dumps a series of page and revision records for those pages in the database with revisions falling within the rev_id range given.
__construct( $db, HookContainer $hookContainer, RevisionStore $revisionStore, TitleParser $titleParser, $history=self::CURRENT, $text=self::TEXT, $limitNamespaces=null)
dumpFrom( $cond='', $orderRevs=false)
pageByName( $name)
pagesByName( $names)
finishPageStreamOutput( $lastRow)
Final page stream output, after all batches are complete.
getSlotRowBatch( $results, &$carry=null)
Returns all slot rows for a revision.
setOutputSink(&$sink)
Set the DumpOutput or DumpFilter object which will receive various row objects and XML output for fil...
pagesByRange( $start, $end, $orderRevs)
Dumps a series of page and revision records for those pages in the database falling within the page_id range given.
outputPageStreamBatch( $results, $lastRow)
Runs through a query result set dumping page, revision, and slot records.
bool $dumpUploadFileContents
array|int $history
DumpOutput $sink
allPages()
Dumps a series of page and revision records for all pages in the database, either including complete ...
logsByRange( $start, $end)
do_list_authors( $cond)
Generates the distinct list of authors of an article. Not called by default (depends on $this->list_authors).
array|null $limitNamespaces
dumpPages( $cond, $orderRevs)
pageByTitle(PageIdentity $page)
outputLogStream( $resultset)
IDatabase $db
setSchemaVersion( $schemaVersion)
static schemaVersion()
Returns the default export schema version, as defined by the XmlDumpSchemaVersion setting.
string $author_list
bool $list_authors
Return distinct author list (when not returning full history)
Interface for objects (potentially) representing an editable wiki page.
getNamespace()
Returns the page's namespace number.
getDBkey()
Get the page title in DB key form.
A title parser service for MediaWiki.
Basic database interface for live and lazy-loaded relation database handles.
Definition IDatabase.php:39
Result wrapper for grabbing data queried from an IDatabase object.