MediaWiki REL1_35
WikiExporter.php
Go to the documentation of this file.
1<?php
36
/** @var bool Return distinct author list (when not returning full history) */
42 public $list_authors = false;
43
/** @var bool When truthy, the writer's writeUploads() output is emitted before each page is closed */
45 public $dumpUploads = false;
46
49
/** @var string XML fragment built by do_list_authors() and appended when the final page is closed */
51 public $author_list = "";
52
// Dump-type flags. dumpFrom() and dumpPages() test them against $this->history with a bitwise AND.
// FULL: full history; CURRENT: latest revision of each page; LOGS: log entries instead of pages;
// RANGE: revisions within a range whose condition is set by revsByRange().
53 public const FULL = 1;
54 public const CURRENT = 2;
55 public const STABLE = 4; // extension defined
56 public const LOGS = 8;
57 public const RANGE = 16;
58
// Text modes handed to XmlDumpWriter; dumpPages() compares $this->text against STUB.
59 public const TEXT = XmlDumpWriter::WRITE_CONTENT;
60 public const STUB = XmlDumpWriter::WRITE_STUB;
61
// Value used as the LIMIT of each batched query in dumpLogs() and dumpPages().
62 protected const BATCH_SIZE = 50000;
63
/** @var int Text mode passed to the constructor (self::TEXT or self::STUB by default usage) */
65 public $text;
66
/** @var DumpOutput Receives row objects together with the XML produced by the writer */
68 public $sink;
69
/** @var XmlDumpWriter Produces the XML fragments that are forwarded to the sink */
71 private $writer;
72
/** @var IDatabase Database handle used for every dump query */
74 protected $db;
75
/** @var array|int Either a combination of the dump-type flags above, or an array read by
 *  dumpPages() for its 'dir', 'offset' and 'limit' keys */
77 protected $history;
78
81
84
/** @var HookRunner Used to run the export-related hooks */
86 private $hookRunner;
87
/**
 * Returns the default export schema version, as defined by $wgXmlDumpSchemaVersion.
 *
 * NOTE(review): the statements of this method (listing lines 93-94) are not present
 * in this extract; only the signature and closing brace are shown.
 */
92 public static function schemaVersion() {
95 }
96
/**
 * Create an exporter bound to a database handle and a dump mode.
 *
 * A default DumpOutput sink and an XmlDumpWriter using the default schema
 * version are created; both can be replaced afterwards through
 * setOutputSink() and setSchemaVersion().
 *
 * @param IDatabase $db Database handle used for all dump queries
 * @param int|array $history Dump-type flag (FULL, CURRENT, STABLE, LOGS, RANGE), or an
 *   array whose 'dir', 'offset' and 'limit' keys are read by dumpPages()
 * @param int $text Text mode handed to the XML writer (self::TEXT or self::STUB)
 * @param null|array $limitNamespaces When non-empty, only revisions whose page
 *   namespace is in this list are written
 */
public function __construct(
	$db,
	$history = self::CURRENT,
	$text = self::TEXT,
	$limitNamespaces = null
) {
	// Plain configuration supplied by the caller.
	$this->db = $db;
	$this->history = $history;
	$this->text = $text;
	$this->limitNamespaces = $limitNamespaces;

	// Default collaborators.
	$this->writer = new XmlDumpWriter( $text, self::schemaVersion() );
	$this->sink = new DumpOutput();

	// Services looked up from the global service container.
	$serviceContainer = MediaWikiServices::getInstance();
	$this->hookRunner = new HookRunner( $serviceContainer->getHookContainer() );
	$this->revisionStore = $serviceContainer->getRevisionStore();
}
124
/**
 * Replace the XML writer with one that emits the given schema version.
 *
 * The text mode chosen at construction time is kept.
 *
 * @param string $schemaVersion Schema version passed through to XmlDumpWriter
 */
public function setSchemaVersion( $schemaVersion ) {
	$replacementWriter = new XmlDumpWriter( $this->text, $schemaVersion );
	$this->writer = $replacementWriter;
}
133
/**
 * Set the object that receives row objects and the XML output generated for them.
 *
 * The sink is bound by reference, so $this->sink aliases the caller's variable.
 *
 * @param DumpOutput|DumpFilter &$sink
 */
public function setOutputSink( &$sink ) {
	$this->sink = &$sink;
}
144
/**
 * Ask the writer for the stream opening and hand it to the sink.
 */
public function openStream() {
	$this->sink->writeOpenStream( $this->writer->openStream() );
}
149
/**
 * Ask the writer for the stream closing and hand it to the sink.
 */
public function closeStream() {
	$this->sink->writeCloseStream( $this->writer->closeStream() );
}
154
/**
 * Dump every page, i.e. run dumpFrom() without any extra condition.
 */
public function allPages() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
163
/**
 * Dump the pages whose page ID lies in the half-open range [$start, $end).
 *
 * @param int $start Inclusive lower bound
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 * @param bool $orderRevs When true the range is applied to rev_page instead of
 *   page_id; the flag is also forwarded to dumpFrom()
 */
public function pagesByRange( $start, $end, $orderRevs ) {
	$column = $orderRevs ? 'rev_page' : 'page_id';

	$bounds = [ $column . ' >= ' . intval( $start ) ];
	if ( $end ) {
		$bounds[] = $column . ' < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $bounds ), $orderRevs );
}
186
/**
 * Dump the revisions whose rev_id lies in the half-open range [$start, $end).
 *
 * @param int $start Inclusive lower bound
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 */
public function revsByRange( $start, $end ) {
	$bounds = [ 'rev_id >= ' . intval( $start ) ];
	if ( $end ) {
		$bounds[] = 'rev_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $bounds ) );
}
201
/**
 * Dump the single page identified by a title object.
 *
 * @param Title $title Supplies the namespace and DB key used in the condition
 */
public function pageByTitle( $title ) {
	$namespaceCond = 'page_namespace=' . $title->getNamespace();
	// The DB key is a string and therefore goes through the database's quoting.
	$titleCond = 'page_title=' . $this->db->addQuotes( $title->getDBkey() );

	$this->dumpFrom( $namespaceCond . ' AND ' . $titleCond );
}
210
/**
 * Dump the single page identified by its title text.
 *
 * @param string $name Title text, parsed with Title::newFromText()
 * @throws MWException When the text does not yield a title
 */
public function pageByName( $name ) {
	$title = Title::newFromText( $name );
	if ( $title !== null ) {
		$this->pageByTitle( $title );
		return;
	}

	throw new MWException( "Can't export invalid title" );
}
223
/**
 * Dump several pages, one pageByName() call per entry and in the given order.
 *
 * @param string[] $names Title texts
 * @throws MWException Propagated from pageByName() for the first invalid title
 */
public function pagesByName( $names ) {
	foreach ( $names as $titleText ) {
		$this->pageByName( $titleText );
	}
}
232
/**
 * Run dumpFrom() without any extra condition.
 *
 * Whether log entries or pages are produced is decided by dumpFrom() from
 * the history setting, not by this method.
 */
public function allLogs() {
	$noCondition = '';
	$this->dumpFrom( $noCondition );
}
236
/**
 * Dump the entries whose log_id lies in the half-open range [$start, $end).
 *
 * @param int $start Inclusive lower bound
 * @param int $end Exclusive upper bound; a falsy value means no upper bound
 */
public function logsByRange( $start, $end ) {
	$bounds = [ 'log_id >= ' . intval( $start ) ];
	if ( $end ) {
		$bounds[] = 'log_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $bounds ) );
}
248
/**
 * Build the <contributors> XML fragment for the revisions matching $cond
 * and store it in $this->author_list.
 *
 * Revisions whose user is hidden (DELETED_USER bit set in rev_deleted) are
 * left out, and the query is DISTINCT so each author row appears once.
 *
 * @param string $cond SQL condition restricting the revisions considered
 */
protected function do_list_authors( $cond ) {
	// The property is reset before querying, so it holds the opening tag even
	// if the query below fails.
	$this->author_list = "<contributors>";

	$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
	$authorFields = [
		'rev_user_text' => $revQuery['fields']['rev_user_text'],
		'rev_user' => $revQuery['fields']['rev_user'],
	];
	$authorConds = [
		// rev_deleted: skip revisions whose user name is suppressed
		$this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0',
		$cond,
	];

	$authors = $this->db->select(
		$revQuery['tables'],
		$authorFields,
		$authorConds,
		__METHOD__,
		[ 'DISTINCT' ],
		$revQuery['joins']
	);

	foreach ( $authors as $author ) {
		$this->author_list .= sprintf(
			'<contributor><username>%s</username><id>%d</id></contributor>',
			htmlspecialchars( $author->rev_user_text ),
			(int)$author->rev_user
		);
	}

	$this->author_list .= "</contributors>";
}
288
/**
 * Route a dump request to the log dumper or the page dumper.
 *
 * Logs are dumped only when the history setting is a flag value that has
 * the LOGS bit set; every other setting, including the array form that
 * carries 'dir'/'offset'/'limit', goes to dumpPages().
 *
 * @param string $cond Extra SQL condition; the empty string means none
 * @param bool $orderRevs Forwarded to dumpPages(); not used for log dumps
 */
protected function dumpFrom( $cond = '', $orderRevs = false ) {
	// $this->history may be an array (see dumpPages()). Applying a bitwise
	// operator to an array throws a TypeError on PHP 8, so the flag test is
	// only evaluated for non-array values. On PHP 7 the array was coerced to
	// 0 or 1, which never has the LOGS bit set, so the routing is unchanged.
	$wantsLogs = !is_array( $this->history ) && ( $this->history & self::LOGS );

	if ( $wantsLogs ) {
		$this->dumpLogs( $cond );
	} else {
		$this->dumpPages( $cond, $orderRevs );
	}
}
302
/**
 * Dump log entries in ascending log_id order, BATCH_SIZE rows per query.
 *
 * Each query continues after the last log_id that was written; the loop
 * ends at the first query that returns no rows.
 *
 * @param string $cond Extra SQL condition; a falsy value means none
 */
protected function dumpLogs( $cond ) {
	// Private log types are hidden; caller conditions are added on top.
	// Falsy entries are dropped so only real conditions reach the query.
	$hideLogs = LogEventsList::getExcludeClause( $this->db );
	$where = array_values( array_filter( [ $hideLogs, $cond ] ) );

	$commentQuery = CommentStore::getStore()->getJoin( 'log_comment' );
	$actorQuery = ActorMigration::newMigration()->getJoin( 'log_user' );

	$tables = array_merge(
		[ 'logging' ],
		$commentQuery['tables'],
		$actorQuery['tables'],
		[ 'user' ]
	);

	$baseFields = [
		'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
		'log_title', 'log_params', 'log_deleted', 'user_name'
	];
	$fields = $baseFields + $commentQuery['fields'] + $actorQuery['fields'];

	$userJoin = [
		'user' => [ 'JOIN', 'user_id = ' . $actorQuery['fields']['log_user'] ]
	];
	$joins = $userJoin + $commentQuery['joins'] + $actorQuery['joins'];

	$options = [
		'ORDER BY' => 'log_id',
		'USE INDEX' => [ 'logging' => 'PRIMARY' ],
		'LIMIT' => self::BATCH_SIZE,
	];

	$lastLogId = 0;
	do {
		$batchConds = $where;
		$batchConds[] = 'log_id > ' . (int)$lastLogId;

		$result = $this->db->select(
			$tables,
			$fields,
			$batchConds,
			__METHOD__,
			$options,
			$joins
		);

		$gotRows = (bool)$result->numRows();
		if ( $gotRows ) {
			$lastLogId = $this->outputLogStream( $result );
		}
	} while ( $gotRows );
}
357
/**
 * Dump page and revision rows matching $cond, one batched query at a time.
 *
 * Builds a query over page, revision, slots and content, adjusts its
 * ordering, joins and conditions according to $this->history, and then
 * repeats the query with a continuation condition based on the rev_page
 * and rev_id of the last row written, until a query returns no rows or the
 * optional row limit from $this->history['limit'] is reached.
 *
 * @param string $cond Extra SQL condition; the empty string means none
 * @param bool $orderRevs Not referenced by the statements of this method
 * @throws MWException When $this->history matches no known dump type, or when
 *   the stable-dump hook call returns a truthy value
 */
364 protected function dumpPages( $cond, $orderRevs ) {
365 $revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
366 $slotQuery = $this->revisionStore->getSlotsQueryInfo( [ 'content' ] );
367
368 // We want page primary rather than revision.
369 // We also want to join in the slots and content tables.
370 // NOTE: This means we may get multiple rows per revision, and more rows
371 // than the batch size! Should be ok, since the max number of slots is
372 // fixed and low (dozens at worst).
373 $tables = array_merge( [ 'page' ], array_diff( $revQuery['tables'], [ 'page' ] ) );
374 $tables = array_merge( $tables, array_diff( $slotQuery['tables'], $tables ) );
375 $join = $revQuery['joins'] + [
376 'revision' => $revQuery['joins']['page'],
377 'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
378 'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
379 ];
380 unset( $join['page'] );
381
382 $fields = array_merge( $revQuery['fields'], $slotQuery['fields'] );
383 $fields[] = 'page_restrictions';
384
// Non-stub dumps add a constant '_load_content' column to every row.
// NOTE(review): presumably a marker read by the XML writer; confirm there.
385 if ( $this->text != self::STUB ) {
386 $fields['_load_content'] = '1';
387 }
388
389 $conds = [];
390 if ( $cond !== '' ) {
391 $conds[] = $cond;
392 }
393 $opts = [ 'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ] ];
394 $opts['USE INDEX'] = [];
395
// Comparison operator used for rev_timestamp offsets and for the rev_id part
// of the continuation condition; switched to '<' for descending dumps.
396 $op = '>';
397 if ( is_array( $this->history ) ) {
398 # Time offset/limit for all pages/history...
399 # Set time order
400 if ( $this->history['dir'] == 'asc' ) {
401 $opts['ORDER BY'] = 'rev_timestamp ASC';
402 } else {
403 $op = '<';
404 $opts['ORDER BY'] = 'rev_timestamp DESC';
405 }
406 # Set offset
407 if ( !empty( $this->history['offset'] ) ) {
408 $conds[] = "rev_timestamp $op " .
409 $this->db->addQuotes( $this->db->timestamp( $this->history['offset'] ) );
410 }
411 # Set query limit
412 if ( !empty( $this->history['limit'] ) ) {
413 $maxRowCount = intval( $this->history['limit'] );
414 }
415 } elseif ( $this->history & self::FULL ) {
416 # Full history dumps...
417 # query optimization for history stub dumps
418 if ( $this->text == self::STUB ) {
419 $opts[] = 'STRAIGHT_JOIN';
420 $opts['USE INDEX']['revision'] = 'rev_page_id';
421 unset( $join['revision'] );
422 $join['page'] = [ 'JOIN', 'rev_page=page_id' ];
423 }
424 } elseif ( $this->history & self::CURRENT ) {
425 # Latest revision dumps...
426 if ( $this->list_authors && $cond != '' ) { // List authors, if so desired
427 $this->do_list_authors( $cond );
428 }
429 $join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
430 $opts[ 'ORDER BY' ] = [ 'page_id ASC' ];
431 } elseif ( $this->history & self::STABLE ) {
432 # "Stable" revision dumps...
433 # Default JOIN, to be overridden...
434 $join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
435 # One, and only one hook should set this, and return false
436 if ( $this->hookRunner->onWikiExporter__dumpStableQuery( $tables, $opts, $join ) ) {
437 throw new MWException( __METHOD__ . " given invalid history dump type." );
438 }
439 } elseif ( $this->history & self::RANGE ) {
440 # Dump of revisions within a specified range. Condition already set in revsByRange().
441 } else {
442 # Unknown history specification parameter?
443 throw new MWException( __METHOD__ . " given invalid history dump type." );
444 }
445
446 $result = null; // Assuring $result is not undefined, if exception occurs early
447 $done = false;
448 $lastRow = null;
449 $revPage = 0;
450 $revId = 0;
451 $rowCount = 0;
452
453 $opts['LIMIT'] = self::BATCH_SIZE;
454
// NOTE(review): the hook presumably receives the query parts by reference so
// handlers can alter them; confirm against the hook interface.
455 $this->hookRunner->onModifyExportQuery(
456 $this->db, $tables, $cond, $opts, $join, $conds );
457
458 while ( !$done ) {
459 // If necessary, impose the overall maximum and stop looping after this iteration.
460 if ( !empty( $maxRowCount ) && $rowCount + self::BATCH_SIZE > $maxRowCount ) {
461 $opts['LIMIT'] = $maxRowCount - $rowCount;
462 $done = true;
463 }
464
// Continuation: only rows after the last (rev_page, rev_id) pair written so far.
465 $queryConds = $conds;
466 $queryConds[] = 'rev_page>' . intval( $revPage ) . ' OR (rev_page=' .
467 intval( $revPage ) . ' AND rev_id' . $op . intval( $revId ) . ')';
468
469 # Do the query and process any results, remembering max ids for the next iteration.
470 $result = $this->db->select(
471 $tables,
472 $fields,
473 $queryConds,
474 __METHOD__,
475 $opts,
476 $join
477 );
478 if ( $result->numRows() > 0 ) {
479 $lastRow = $this->outputPageStreamBatch( $result, $lastRow );
// numRows() counts result rows, which can exceed the number of revisions
// because each slot of a revision is a separate row (see the NOTE above).
480 $rowCount += $result->numRows();
481 $revPage = $lastRow->rev_page;
482 $revId = $lastRow->rev_id;
483 } else {
484 $done = true;
485 }
486
487 // If we are finished, close off final page element (if any).
488 if ( $done && $lastRow ) {
489 $this->finishPageStreamOutput( $lastRow );
490 }
491 }
492 }
493
/**
 * Runs through a query result set dumping page, revision, and slot records.
 *
 * Rows are consumed one revision at a time. A page element is opened when the
 * namespace or title differs from the previously seen row, after closing the
 * previous page if there was one. The final page is left open so that the
 * next batch, or finishPageStreamOutput(), can continue or close it.
 *
 * @param IResultWrapper $results Rows of one batch query
 * @param stdClass|null $lastRow Last row seen by the previous batch, or null for
 *   the first batch
 * @return stdClass|null Last revision row seen, including rows skipped by the
 *   namespace filter
 * @throws LogicException When a carried-over slot row is left unprocessed
 */
protected function outputPageStreamBatch( $results, $lastRow ) {
	$carriedRow = null;

	// An empty slot-row batch means the result set is exhausted.
	while ( $slotRows = $this->getSlotRowBatch( $results, $carriedRow ) ) {
		// All revision info is present in all slot rows, so the first slot
		// row doubles as the revision row.
		$revRow = $slotRows[0];

		$namespaceWanted = !$this->limitNamespaces ||
			in_array( $revRow->page_namespace, $this->limitNamespaces );

		if ( $namespaceWanted ) {
			$isNewPage = $lastRow === null ||
				$lastRow->page_namespace !== $revRow->page_namespace ||
				$lastRow->page_title !== $revRow->page_title;

			if ( $isNewPage ) {
				if ( $lastRow !== null ) {
					// Close the page the previous row belonged to.
					$closing = '';
					if ( $this->dumpUploads ) {
						$closing .= $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
					}
					$closing .= $this->writer->closePage();
					$this->sink->writeClosePage( $closing );
				}
				$opening = $this->writer->openPage( $revRow );
				$this->sink->writeOpenPage( $revRow, $opening );
			}

			$revisionXml = $this->writer->writeRevision( $revRow, $slotRows );
			$this->sink->writeRevision( $revRow, $revisionXml );
		}

		// Filtered-out rows still count as the last row seen.
		$lastRow = $revRow;
	}

	if ( $carriedRow ) {
		throw new LogicException( 'Error while processing a stream of slot rows' );
	}

	return $lastRow;
}
547
/**
 * Returns all slot rows for a revision.
 *
 * Rows are fetched until one with a different rev_id shows up. That row is
 * handed back through $carry so the next call can start with it.
 *
 * @param IResultWrapper $results Result set to read from
 * @param null|stdClass &$carry Row left over from the previous call; on return,
 *   the first row of the next revision, or null if none was read
 * @return stdClass[] Rows sharing one rev_id; empty once the result set is
 *   exhausted and nothing was carried over
 */
protected function getSlotRowBatch( $results, &$carry = null ) {
	$batch = [];
	$previous = null;

	// Start with the row left over from the previous call, if any.
	if ( $carry ) {
		$previous = $carry;
		$batch[] = $previous;
		$carry = null;
	}

	for ( $row = $results->fetchObject(); $row; $row = $results->fetchObject() ) {
		$belongsToNextRevision = $previous && $previous->rev_id !== $row->rev_id;
		if ( $belongsToNextRevision ) {
			$carry = $row;
			break;
		}

		$batch[] = $row;
		$previous = $row;
	}

	return $batch;
}
578
/**
 * Final page stream output, after all batches are complete.
 *
 * Writes, in this order, the uploads of the last page (when dumpUploads is
 * set), the collected author list, and the page closing.
 *
 * @param stdClass $lastRow The last row output from the previous call (or null if none)
 */
protected function finishPageStreamOutput( $lastRow ) {
	$uploadsXml = '';
	if ( $this->dumpUploads ) {
		$uploadsXml .= $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
	}

	$this->sink->writeClosePage(
		$uploadsXml . $this->author_list . $this->writer->closePage()
	);
}
593
/**
 * Write every log row of a result set to the sink.
 *
 * @param IResultWrapper $resultset Log rows of one batch query
 * @return int|null log_id of the last row written, or null when the result
 *   set is empty or the last row has no log_id
 */
protected function outputLogStream( $resultset ) {
	$lastLogId = null;

	foreach ( $resultset as $logRow ) {
		$itemXml = $this->writer->writeLogItem( $logRow );
		$this->sink->writeLogItem( $logRow, $itemXml );
		// Remembered per row so the value of the final row is what gets returned.
		$lastLogId = $logRow->log_id ?? null;
	}

	return $lastLogId;
}
605}
$wgXmlDumpSchemaVersion
The schema to use per default when generating XML dumps.
MediaWiki exception.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
MediaWikiServices is the service locator for the application scope of MediaWiki.
Page revision base class.
Service for looking up page revisions.
revsByRange( $start, $end)
Dumps a series of page and revision records for those pages in the database with revisions falling wi...
RevisionStore $revisionStore
dumpFrom( $cond='', $orderRevs=false)
XmlDumpWriter $writer
pageByName( $name)
pagesByName( $names)
finishPageStreamOutput( $lastRow)
Final page stream output, after all batches are complete.
getSlotRowBatch( $results, &$carry=null)
Returns all slot rows for a revision.
setOutputSink(&$sink)
Set the DumpOutput or DumpFilter object which will receive various row objects and XML output for fil...
pageByTitle( $title)
pagesByRange( $start, $end, $orderRevs)
Dumps a series of page and revision records for those pages in the database falling within the page_i...
__construct( $db, $history=self::CURRENT, $text=self::TEXT, $limitNamespaces=null)
outputPageStreamBatch( $results, $lastRow)
Runs through a query result set dumping page, revision, and slot records.
HookRunner $hookRunner
bool $dumpUploadFileContents
array|int $history
DumpOutput $sink
allPages()
Dumps a series of page and revision records for all pages in the database, either including complete ...
logsByRange( $start, $end)
do_list_authors( $cond)
Generates the distinct list of authors of an article. Not called by default (depends on $this->list_au...
array|null $limitNamespaces
dumpPages( $cond, $orderRevs)
outputLogStream( $resultset)
IDatabase $db
setSchemaVersion( $schemaVersion)
static schemaVersion()
Returns the default export schema version, as defined by $wgXmlDumpSchemaVersion.
string $author_list
bool $list_authors
Return distinct author list (when not returning full history)
Basic database interface for live and lazy-loaded relation database handles.
Definition IDatabase.php:38
Result wrapper for grabbing data queried from an IDatabase object.