MediaWiki REL1_37
WikiExporter.php
Go to the documentation of this file.
1<?php
38
/** @var bool Return distinct author list (when not returning full history) */
44 public $list_authors = false;
45
/** @var bool When true, writeUploads() output is added before each page is closed */
47 public $dumpUploads = false;
48
// NOTE(review): the declaration of $dumpUploadFileContents (used by the page
// output methods) is not visible in this extract.
51
/** @var string XML "<contributors>" fragment built by do_list_authors() */
53 public $author_list = "";
54
// History selection flags; dumpFrom() and dumpPages() test them with bitwise AND.
55 public const FULL = 1;
56 public const CURRENT = 2;
57 public const STABLE = 4; // extension defined
58 public const LOGS = 8;
59 public const RANGE = 16;
60
// Content modes handed to XmlDumpWriter; dumpPages() only requests content
// loading when the mode is not STUB.
61 public const TEXT = XmlDumpWriter::WRITE_CONTENT;
62 public const STUB = XmlDumpWriter::WRITE_STUB;
63
// Row LIMIT applied to each batch query in dumpLogs() and dumpPages().
64 protected const BATCH_SIZE = 50000;
65
/** Content mode given to the constructor (self::TEXT or self::STUB) */
67 public $text;
68
/** @var DumpOutput Receives the XML fragments produced by the writer */
70 public $sink;
71
/** @var XmlDumpWriter */
73 private $writer;
74
/** @var IDatabase */
76 protected $db;
77
/** @var array|int History flag(s), or an array with 'dir', 'offset' and 'limit' keys */
79 protected $history;
80
// NOTE(review): the declarations of $limitNamespaces and $revisionStore (both
// assigned in the constructor) are not visible in this extract.
83
86
/** @var TitleParser */
88 private $titleParser;
89
/** @var HookRunner */
91 private $hookRunner;
92
/**
 * Returns the default export schema version, as defined by $wgXmlDumpSchemaVersion.
 *
 * NOTE(review): the statements of this method (listing lines 98-99) are not
 * present in this extract; the description above comes from the generated
 * documentation, not from visible code.
 */
97 public static function schemaVersion() {
100 }
101
/**
 * Create an exporter bound to a database handle and a dump mode.
 *
 * The writer is created with the default schema version and the sink starts
 * out as a plain DumpOutput; both can be replaced afterwards through
 * setSchemaVersion() and setOutputSink().
 *
 * @param IDatabase $db Handle all dump queries are run against
 * @param int|array $history One of the history flag constants, or an array
 *   with 'dir', 'offset' and 'limit' keys (see dumpPages())
 * @param mixed $text Content mode: self::TEXT or self::STUB
 * @param null|array $limitNamespaces Namespaces to restrict page output to,
 *   or null for no restriction
 */
public function __construct(
	$db,
	$history = self::CURRENT,
	$text = self::TEXT,
	$limitNamespaces = null
) {
	// Plain configuration values.
	$this->db = $db;
	$this->history = $history;
	$this->text = $text;
	$this->limitNamespaces = $limitNamespaces;

	// Output pipeline: XML writer feeding a default sink.
	$this->writer = new XmlDumpWriter( $text, self::schemaVersion() );
	$this->sink = new DumpOutput();

	// Collaborators pulled from the service container.
	$services = MediaWikiServices::getInstance();
	$hookContainer = $services->getHookContainer();
	$this->hookRunner = new HookRunner( $hookContainer );
	$this->revisionStore = $services->getRevisionStore();
	$this->titleParser = $services->getTitleParser();
}
130
/**
 * Replace the XML writer with one targeting the given schema version.
 *
 * The content mode chosen at construction time ($this->text) is carried over.
 *
 * @param string $schemaVersion Schema version handed to XmlDumpWriter
 */
public function setSchemaVersion( $schemaVersion ) {
	$replacement = new XmlDumpWriter( $this->text, $schemaVersion );
	$this->writer = $replacement;
}
139
/**
 * Set the DumpOutput or DumpFilter object that receives row objects and the
 * XML fragments generated for them.
 *
 * The sink is bound by reference, so $this->sink aliases the caller's variable.
 *
 * @param DumpOutput|DumpFilter &$sink
 */
public function setOutputSink( &$sink ) {
	$this->sink = &$sink;
}
150
/**
 * Emit the stream header: the writer produces it, the sink receives it.
 */
public function openStream() {
	$header = $this->writer->openStream();
	$sink = $this->sink;
	$sink->writeOpenStream( $header );
}
155
/**
 * Emit the stream footer: the writer produces it, the sink receives it.
 */
public function closeStream() {
	$footer = $this->writer->closeStream();
	$sink = $this->sink;
	$sink->writeCloseStream( $footer );
}
160
/**
 * Dump page and revision records for all pages in the database.
 *
 * Delegates to dumpFrom() with an empty condition, i.e. no extra filtering.
 */
public function allPages() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
169
/**
 * Dump pages whose ID lies in the half-open range [$start, $end).
 *
 * @param int $start Inclusive lower bound of the page ID
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 * @param bool $orderRevs When true the range is expressed on rev_page instead
 *   of page_id; the flag is also forwarded to dumpFrom()
 */
public function pagesByRange( $start, $end, $orderRevs ) {
	// Same range test either way; only the column it is applied to differs.
	$column = $orderRevs ? 'rev_page' : 'page_id';

	$condition = "{$column} >= " . intval( $start );
	if ( $end ) {
		$condition .= " AND {$column} < " . intval( $end );
	}

	$this->dumpFrom( $condition, $orderRevs );
}
192
/**
 * Dump revisions whose rev_id lies in the half-open range [$start, $end).
 *
 * @param int $start Inclusive lower bound of the revision ID
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 */
public function revsByRange( $start, $end ) {
	$clauses = [ 'rev_id >= ' . intval( $start ) ];
	if ( $end ) {
		$clauses[] = 'rev_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $clauses ) );
}
207
/**
 * Dump the single page identified by namespace and DB key of $page.
 *
 * @param PageIdentity $page
 */
public function pageByTitle( PageIdentity $page ) {
	$namespace = $page->getNamespace();
	// The DB key is quoted by the database layer before it enters the condition.
	$quotedKey = $this->db->addQuotes( $page->getDBkey() );

	$this->dumpFrom( "page_namespace={$namespace} AND page_title={$quotedKey}" );
}
216
/**
 * Dump the single page whose title is given as text.
 *
 * The text is parsed with the title parser; the resulting namespace and DB
 * key select the page to dump.
 *
 * @param string $name Title text to parse
 * @throws MWException If the title cannot be parsed; the original
 *   MalformedTitleException is attached as the previous exception
 */
public function pageByName( $name ) {
	try {
		$link = $this->titleParser->parseTitle( $name );
		$this->dumpFrom(
			'page_namespace=' . $link->getNamespace() .
			' AND page_title=' . $this->db->addQuotes( $link->getDBkey() ) );
	} catch ( MalformedTitleException $ex ) {
		// Chain the parser's exception so the reason the title was rejected
		// stays available to whoever logs or inspects the MWException.
		throw new MWException( "Can't export invalid title", 0, $ex );
	}
}
231
/**
 * Dump each of the named pages in turn, in the order given.
 *
 * @param iterable $names Title texts, each handed to pageByName()
 */
public function pagesByName( $names ) {
	foreach ( $names as $titleText ) {
		$this->pageByName( $titleText );
	}
}
240
/**
 * Dump without any extra condition.
 *
 * Delegates to dumpFrom(), which routes to the log dump only when the LOGS
 * flag is set in the history setting.
 */
public function allLogs() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
244
/**
 * Dump entries whose log_id lies in the half-open range [$start, $end).
 *
 * @param int $start Inclusive lower bound of the log ID
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 */
public function logsByRange( $start, $end ) {
	$clauses = [ 'log_id >= ' . intval( $start ) ];
	if ( $end ) {
		$clauses[] = 'log_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $clauses ) );
}
256
/**
 * Build the distinct list of authors for the pages matching $cond and store
 * it, as a "<contributors>" XML fragment, in $this->author_list.
 *
 * Not called by default: dumpPages() only invokes it when $this->list_authors
 * is set. Revisions whose user is flagged as deleted are left out.
 *
 * @param string $cond SQL condition selecting the pages of interest
 */
protected function do_list_authors( $cond ) {
	$this->author_list = "<contributors>";

	$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );

	// Only the two user columns are needed; DISTINCT collapses repeat authors.
	$wantedFields = [
		'rev_user_text' => $revQuery['fields']['rev_user_text'],
		'rev_user' => $revQuery['fields']['rev_user'],
	];
	// rev_deleted: skip revisions where the user name is suppressed.
	$userVisible = $this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0';

	$authors = $this->db->select(
		$revQuery['tables'],
		$wantedFields,
		[ $userVisible, $cond ],
		__METHOD__,
		[ 'DISTINCT' ],
		$revQuery['joins']
	);

	foreach ( $authors as $author ) {
		$userName = htmlspecialchars( $author->rev_user_text );
		$userId = (int)$author->rev_user;
		$this->author_list .=
			"<contributor><username>{$userName}</username><id>{$userId}</id></contributor>";
	}

	$this->author_list .= "</contributors>";
}
296
/**
 * Route a dump request to the log dumper or the page dumper.
 *
 * Logs are dumped only when the history setting is a flag value with the
 * LOGS bit set; every other setting, including the array form handled by
 * dumpPages(), goes to the page dumper.
 *
 * @param string $cond SQL condition restricting the dump, or '' for none
 * @param bool $orderRevs Forwarded to dumpPages(); not used for log dumps
 */
protected function dumpFrom( $cond = '', $orderRevs = false ) {
	// $this->history may be an array (time offset/limit form). Applying "&" to
	// an array throws a TypeError on PHP 8, so only test the flag on non-arrays.
	$wantsLogs = !is_array( $this->history ) && ( $this->history & self::LOGS );

	if ( $wantsLogs ) {
		$this->dumpLogs( $cond );
	} else {
		$this->dumpPages( $cond, $orderRevs );
	}
}
310
/**
 * Write the log rows matching $cond to the sink, fetching them in batches.
 *
 * Rows are read in ascending log_id order, at most self::BATCH_SIZE per
 * query; each query resumes after the last log_id of the previous batch and
 * the loop ends with the first empty result.
 *
 * @param string $cond Extra SQL condition from the caller, or '' for none
 */
protected function dumpLogs( $cond ) {
	// Conditions shared by every batch: hide private logs, then add any
	// caller-specified condition.
	$baseConds = [];
	$excludeClause = LogEventsList::getExcludeClause( $this->db );
	if ( $excludeClause ) {
		$baseConds[] = $excludeClause;
	}
	if ( $cond ) {
		$baseConds[] = $cond;
	}

	// Query pieces: logging joined with actor, plus whatever the comment
	// store needs for log_comment.
	$commentQuery = CommentStore::getStore()->getJoin( 'log_comment' );
	$tables = array_merge( [ 'logging', 'actor' ], $commentQuery['tables'] );
	$fields = [
		'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
		'log_title', 'log_params', 'log_deleted', 'actor_user', 'actor_name'
	] + $commentQuery['fields'];
	$joins = [
		'actor' => [ 'JOIN', 'actor_id=log_actor' ]
	] + $commentQuery['joins'];
	$options = [
		'ORDER BY' => 'log_id',
		'USE INDEX' => [ 'logging' => 'PRIMARY' ],
		'LIMIT' => self::BATCH_SIZE,
	];

	$lastLogId = 0;
	do {
		$batchConds = $baseConds;
		$batchConds[] = 'log_id > ' . intval( $lastLogId );

		$result = $this->db->select(
			$tables,
			$fields,
			$batchConds,
			__METHOD__,
			$options,
			$joins
		);

		$gotRows = (bool)$result->numRows();
		if ( $gotRows ) {
			// Remember where this batch ended so the next one continues after it.
			$lastLogId = $this->outputLogStream( $result );
		}
	} while ( $gotRows );
}
364
/**
 * Query page, revision, slot and content rows and stream them to the sink.
 *
 * The shape of the query (ordering, joins, index hints) is chosen from
 * $this->history. Rows are then fetched in batches of self::BATCH_SIZE, each
 * batch continuing after the (rev_page, rev_id) of the last row written, and
 * handed to outputPageStreamBatch(). Once no more rows are wanted, the last
 * open page is closed through finishPageStreamOutput().
 *
 * @param string $cond SQL condition restricting the dump, or '' for none
 * @param bool $orderRevs Not read by the code visible in this method
 * @throws MWException If $this->history matches no known dump type, or if the
 *   stable-dump hook returns true
 */
371 protected function dumpPages( $cond, $orderRevs ) {
372 $revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
373 $slotQuery = $this->revisionStore->getSlotsQueryInfo( [ 'content' ] );
374
375 // We want page primary rather than revision.
376 // We also want to join in the slots and content tables.
377 // NOTE: This means we may get multiple rows per revision, and more rows
378 // than the batch size! Should be ok, since the max number of slots is
379 // fixed and low (dozens at worst).
380 $tables = array_merge( [ 'page' ], array_diff( $revQuery['tables'], [ 'page' ] ) );
381 $tables = array_merge( $tables, array_diff( $slotQuery['tables'], $tables ) );
// The join condition that getQueryInfo() attached to 'page' is moved onto
// 'revision', because 'page' is now the first table.
382 $join = $revQuery['joins'] + [
383 'revision' => $revQuery['joins']['page'],
384 'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
385 'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
386 ];
387 unset( $join['page'] );
388
389 $fields = array_merge( $revQuery['fields'], $slotQuery['fields'] );
390 $fields[] = 'page_restrictions';
391
// Extra constant column added to every row unless this is a stub dump.
392 if ( $this->text != self::STUB ) {
393 $fields['_load_content'] = '1';
394 }
395
396 $conds = [];
397 if ( $cond !== '' ) {
398 $conds[] = $cond;
399 }
400 $opts = [ 'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ] ];
401 $opts['USE INDEX'] = [];
402
// $op is the comparison used on rev_timestamp for the offset and on rev_id
// in the batch continuation condition further down.
403 $op = '>';
404 if ( is_array( $this->history ) ) {
405 # Time offset/limit for all pages/history...
406 # Set time order
// NOTE(review): this branch orders by rev_timestamp while the batching loop
// below continues on rev_page/rev_id — confirm that results spanning more
// than one batch come out as intended.
407 if ( $this->history['dir'] == 'asc' ) {
408 $opts['ORDER BY'] = 'rev_timestamp ASC';
409 } else {
410 $op = '<';
411 $opts['ORDER BY'] = 'rev_timestamp DESC';
412 }
413 # Set offset
414 if ( !empty( $this->history['offset'] ) ) {
415 $conds[] = "rev_timestamp $op " .
416 $this->db->addQuotes( $this->db->timestamp( $this->history['offset'] ) );
417 }
418 # Set query limit
// $maxRowCount is only ever assigned here; the loop below tests it with
// !empty(), so it may stay undefined for every other history type.
419 if ( !empty( $this->history['limit'] ) ) {
420 $maxRowCount = intval( $this->history['limit'] );
421 }
422 } elseif ( $this->history & self::FULL ) {
423 # Full history dumps...
424 # query optimization for history stub dumps
425 if ( $this->text == self::STUB ) {
426 $opts[] = 'STRAIGHT_JOIN';
427 $opts['USE INDEX']['revision'] = 'rev_page_id';
428 unset( $join['revision'] );
429 $join['page'] = [ 'JOIN', 'rev_page=page_id' ];
430 }
431 } elseif ( $this->history & self::CURRENT ) {
432 # Latest revision dumps...
433 if ( $this->list_authors && $cond != '' ) { // List authors, if so desired
434 $this->do_list_authors( $cond );
435 }
436 $join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
437 $opts[ 'ORDER BY' ] = [ 'page_id ASC' ];
438 } elseif ( $this->history & self::STABLE ) {
439 # "Stable" revision dumps...
440 # Default JOIN, to be overridden...
441 $join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
442 # One, and only one hook should set this, and return false
443 if ( $this->hookRunner->onWikiExporter__dumpStableQuery( $tables, $opts, $join ) ) {
444 throw new MWException( __METHOD__ . " given invalid history dump type." );
445 }
446 } elseif ( $this->history & self::RANGE ) {
447 # Dump of revisions within a specified range. Condition already set in revsByRange().
448 } else {
449 # Unknown history specification parameter?
450 throw new MWException( __METHOD__ . " given invalid history dump type." );
451 }
452
453 $result = null; // Assuring $result is not undefined, if exception occurs early
// Batching state: $lastRow is the last row handed to the writer, $revPage and
// $revId the position the next batch continues from, $rowCount the number of
// result rows fetched so far.
454 $done = false;
455 $lastRow = null;
456 $revPage = 0;
457 $revId = 0;
458 $rowCount = 0;
459
460 $opts['LIMIT'] = self::BATCH_SIZE;
461
// Let hook handlers see the assembled query parts before the first batch runs.
462 $this->hookRunner->onModifyExportQuery(
463 $this->db, $tables, $cond, $opts, $join, $conds );
464
465 while ( !$done ) {
466 // If necessary, impose the overall maximum and stop looping after this iteration.
467 if ( !empty( $maxRowCount ) && $rowCount + self::BATCH_SIZE > $maxRowCount ) {
468 $opts['LIMIT'] = $maxRowCount - $rowCount;
469 $done = true;
470 }
471
// Continuation condition: rows on a later page, or on the same page with a
// rev_id beyond (per $op) the last one seen.
472 $queryConds = $conds;
473 $queryConds[] = 'rev_page>' . intval( $revPage ) . ' OR (rev_page=' .
474 intval( $revPage ) . ' AND rev_id' . $op . intval( $revId ) . ')';
475
476 # Do the query and process any results, remembering max ids for the next iteration.
477 $result = $this->db->select(
478 $tables,
479 $fields,
480 $queryConds,
481 __METHOD__,
482 $opts,
483 $join
484 );
485 if ( $result->numRows() > 0 ) {
486 $lastRow = $this->outputPageStreamBatch( $result, $lastRow );
487 $rowCount += $result->numRows();
488 $revPage = $lastRow->rev_page;
489 $revId = $lastRow->rev_id;
490 } else {
// An empty batch means there is nothing left to fetch.
491 $done = true;
492 }
493
494 // If we are finished, close off final page element (if any).
495 if ( $done && $lastRow ) {
496 $this->finishPageStreamOutput( $lastRow );
497 }
498 }
499 }
500
/**
 * Run through one query result, writing page, revision and slot records.
 *
 * The page opened last is deliberately left open when the result is
 * exhausted: the next batch may continue the same page. Closing the final
 * page is the job of finishPageStreamOutput().
 *
 * @param object $results Query result; consumed through getSlotRowBatch()
 * @param object|null $lastRow Last revision row handled by the previous
 *   batch, or null if nothing has been handled yet
 * @return object|null Last revision row handled, to be passed to the next call
 * @throws LogicException If a row is still carried over after the result is
 *   exhausted
 */
protected function outputPageStreamBatch( $results, $lastRow ) {
	$carry = null;

	// Each iteration handles all slot rows belonging to one revision.
	while ( $slotRows = $this->getSlotRowBatch( $results, $carry ) ) {
		// All revision info is present in all slot rows.
		// Use the first slot row as the revision row.
		$revRow = $slotRows[0];

		$namespaceWanted = !$this->limitNamespaces ||
			in_array( $revRow->page_namespace, $this->limitNamespaces );

		if ( $namespaceWanted ) {
			$startsNewPage = $lastRow === null ||
				$lastRow->page_namespace !== $revRow->page_namespace ||
				$lastRow->page_title !== $revRow->page_title;

			if ( $startsNewPage ) {
				if ( $lastRow !== null ) {
					// Close the previous page, with its uploads if requested.
					$closing = $this->dumpUploads
						? $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents )
						: '';
					$closing .= $this->writer->closePage();
					$this->sink->writeClosePage( $closing );
				}
				$opening = $this->writer->openPage( $revRow );
				$this->sink->writeOpenPage( $revRow, $opening );
			}

			try {
				$revisionXml = $this->writer->writeRevision( $revRow, $slotRows );
				$this->sink->writeRevision( $revRow, $revisionXml );
			} catch ( RevisionAccessException $ex ) {
				// A revision that cannot be loaded is reported and skipped.
				MWDebug::warning( 'Problem encountered retrieving rev and slot metadata for'
					. ' revision ' . $revRow->rev_id . ': ' . $ex->getMessage() );
			}
		}

		// Skipped revisions are remembered as the last row as well.
		$lastRow = $revRow;
	}

	if ( $carry ) {
		throw new LogicException( 'Error while processing a stream of slot rows' );
	}

	return $lastRow;
}
559
/**
 * Return all slot rows for one revision.
 *
 * Rows are fetched until one with a different rev_id shows up; that row
 * belongs to the next revision, so it is parked in $carry and becomes the
 * first row of the next call.
 *
 * @param object $results Result object providing fetchObject()
 * @param null|object &$carry Row left over from the previous call (in), and
 *   the first row of the following revision, if one was read (out)
 * @return object[] Rows sharing one rev_id; empty once the result is exhausted
 */
protected function getSlotRowBatch( $results, &$carry = null ) {
	$batch = [];
	$previous = null;

	// Start with the row the previous call had to read ahead, if any.
	if ( $carry ) {
		$batch[] = $carry;
		$previous = $carry;
		$carry = null;
	}

	for ( $row = $results->fetchObject(); $row; $row = $results->fetchObject() ) {
		if ( $previous && $previous->rev_id !== $row->rev_id ) {
			// First row of the next revision: hand it back through $carry.
			$carry = $row;
			break;
		}
		$batch[] = $row;
		$previous = $row;
	}

	return $batch;
}
590
/**
 * Final page stream output, after all batches are complete.
 *
 * Writes, in this order: the uploads of the last page (only when
 * $this->dumpUploads is set), the collected author list, and the page close.
 *
 * @param object $lastRow The last row output from the previous call (or null if none)
 */
protected function finishPageStreamOutput( $lastRow ) {
	$pieces = [];
	if ( $this->dumpUploads ) {
		$pieces[] = $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
	}
	$pieces[] = $this->author_list;
	$pieces[] = $this->writer->closePage();

	$this->sink->writeClosePage( implode( '', $pieces ) );
}
605
/**
 * Write every log row of a result to the sink.
 *
 * @param iterable $resultset Log rows to write
 * @return int|null log_id of the last row written, or null when the result
 *   held no rows
 */
protected function outputLogStream( $resultset ) {
	$finalRow = null;
	foreach ( $resultset as $logRow ) {
		$itemXml = $this->writer->writeLogItem( $logRow );
		$this->sink->writeLogItem( $logRow, $itemXml );
		$finalRow = $logRow;
	}

	return $finalRow->log_id ?? null;
}
617}
$wgXmlDumpSchemaVersion
The schema to use per default when generating XML dumps.
MediaWiki exception.
MalformedTitleException is thrown when a TitleParser is unable to parse a title string.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookContainer.
MediaWikiServices is the service locator for the application scope of MediaWiki.
Exception representing a failure to look up a revision.
Page revision base class.
Service for looking up page revisions.
revsByRange( $start, $end)
Dumps a series of page and revision records for those pages in the database with revisions falling wi...
RevisionStore $revisionStore
dumpFrom( $cond='', $orderRevs=false)
XmlDumpWriter $writer
pageByName( $name)
pagesByName( $names)
finishPageStreamOutput( $lastRow)
Final page stream output, after all batches are complete.
getSlotRowBatch( $results, &$carry=null)
Returns all slot rows for a revision.
setOutputSink(&$sink)
Set the DumpOutput or DumpFilter object which will receive various row objects and XML output for fil...
pagesByRange( $start, $end, $orderRevs)
Dumps a series of page and revision records for those pages in the database falling within the page_i...
__construct( $db, $history=self::CURRENT, $text=self::TEXT, $limitNamespaces=null)
outputPageStreamBatch( $results, $lastRow)
Runs through a query result set dumping page, revision, and slot records.
HookRunner $hookRunner
bool $dumpUploadFileContents
array|int $history
DumpOutput $sink
allPages()
Dumps a series of page and revision records for all pages in the database, either including complete ...
logsByRange( $start, $end)
do_list_authors( $cond)
Generates the distinct list of authors of an article. Not called by default (depends on $this->list_authors).
array|null $limitNamespaces
dumpPages( $cond, $orderRevs)
pageByTitle(PageIdentity $page)
outputLogStream( $resultset)
IDatabase $db
setSchemaVersion( $schemaVersion)
TitleParser $titleParser
static schemaVersion()
Returns the default export schema version, as defined by $wgXmlDumpSchemaVersion.
string $author_list
bool $list_authors
Return distinct author list (when not returning full history)
Interface for objects (potentially) representing an editable wiki page.
getNamespace()
Returns the page's namespace number.
getDBkey()
Get the page title in DB key form.
A title parser service for MediaWiki.
Basic database interface for live and lazy-loaded relation database handles.
Definition IDatabase.php:38
Result wrapper for grabbing data queried from an IDatabase object.