MediaWiki REL1_34
WikiExporter.php
Go to the documentation of this file.
1<?php
30use MediaWiki\MediaWikiServices as MediaWikiServicesAlias;
34
/** @var bool Return distinct author list (when not returning full history) */
40 public $list_authors = false;
41
/** @var bool When true, upload records are written before each page is closed */
43 public $dumpUploads = false;
44
47
/** @var string XML fragment built by do_list_authors() and emitted by finishPageStreamOutput() */
49 public $author_list = "";
50
// History mode flags; dumpFrom() and dumpPages() test $this->history against them with "&".
51 const FULL = 1;
52 const CURRENT = 2;
53 const STABLE = 4; // extension defined
54 const LOGS = 8;
55 const RANGE = 16;
56
59
// Used as the LIMIT of every batch query issued by dumpLogs() and dumpPages().
60 const BATCH_SIZE = 50000;
61
/** @var mixed Content mode; passed to XmlDumpWriter and compared against self::STUB */
63 public $text;
64
/** @var DumpOutput Receiver of the row objects and generated XML */
66 public $sink;
67
/** @var XmlDumpWriter */
69 private $writer;
70
/** @var IDatabase */
72 protected $db;
73
/** @var array|int History mode flag, or an array with 'dir', 'offset' and 'limit' keys */
75 protected $history;
76
79
// Returns the default export schema version, as defined by $wgXmlDumpSchemaVersion.
// NOTE(review): the statements of this method (source lines 85-86) are not part of this listing.
84 public static function schemaVersion() {
87 }
88
/**
 * Set up an exporter with a default XML writer and a default output sink.
 *
 * @param IDatabase $db Database handle all dump queries are run against
 * @param int|array $history One of the history mode flags (self::FULL, self::CURRENT,
 *   self::STABLE, self::LOGS, self::RANGE), or an associative array with the keys
 *   'dir', 'offset' and 'limit' as read by dumpPages()
 * @param mixed $text Content mode (self::TEXT or self::STUB), forwarded to XmlDumpWriter
 * @param array|null $limitNamespaces When set, outputPageStreamBatch() only writes
 *   pages whose namespace is contained in this list
 */
public function __construct(
	$db,
	$history = self::CURRENT,
	$text = self::TEXT,
	$limitNamespaces = null
) {
	$this->db = $db;
	$this->history = $history;
	$this->text = $text;
	$this->limitNamespaces = $limitNamespaces;

	// The writer starts out with the default schema; setSchemaVersion() can replace it.
	$this->writer = new XmlDumpWriter( $text, self::schemaVersion() );
	$this->sink = new DumpOutput();
}
113
/**
 * Replace the XML writer with one targeting the given schema version.
 * The content mode stored in $this->text is reused for the new writer.
 *
 * @param mixed $schemaVersion Schema version handed to XmlDumpWriter
 */
public function setSchemaVersion( $schemaVersion ) {
	$contentMode = $this->text;
	$this->writer = new XmlDumpWriter( $contentMode, $schemaVersion );
}
122
/**
 * Set the DumpOutput or DumpFilter object which will receive the row
 * objects and XML output produced by this exporter.
 *
 * @param DumpOutput &$sink Bound by reference, so $this->sink aliases the caller's variable
 */
public function setOutputSink( &$sink ) {
	$this->sink = &$sink;
}
133
/**
 * Generate the stream header with the writer and hand it to the sink.
 */
public function openStream() {
	$this->sink->writeOpenStream( $this->writer->openStream() );
}
138
/**
 * Generate the stream footer with the writer and hand it to the sink.
 */
public function closeStream() {
	$this->sink->writeCloseStream( $this->writer->closeStream() );
}
143
/**
 * Dump page and revision records for all pages in the database.
 * Delegates to dumpFrom() with an empty condition.
 */
public function allPages() {
	$noRestriction = '';
	$this->dumpFrom( $noRestriction );
}
152
/**
 * Dump page and revision records for the pages whose id falls within a range.
 *
 * @param int $start Inclusive lower bound of the page id
 * @param int $end Exclusive upper bound of the page id; a falsy value means no upper bound
 * @param bool $orderRevs When truthy the range is expressed on rev_page instead of
 *   page_id; the flag is also forwarded to dumpFrom()
 */
public function pagesByRange( $start, $end, $orderRevs ) {
	// Both variants build the same range expression, only the column differs.
	$column = $orderRevs ? 'rev_page' : 'page_id';

	$condition = $column . ' >= ' . intval( $start );
	if ( $end ) {
		$condition .= ' AND ' . $column . ' < ' . intval( $end );
	}

	$this->dumpFrom( $condition, $orderRevs );
}
175
/**
 * Dump page and revision records for the revisions whose rev_id falls within a range.
 *
 * @param int $start Inclusive lower bound of rev_id
 * @param int $end Exclusive upper bound of rev_id; a falsy value means no upper bound
 */
public function revsByRange( $start, $end ) {
	$bounds = [ 'rev_id >= ' . intval( $start ) ];
	if ( $end ) {
		$bounds[] = 'rev_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $bounds ) );
}
190
/**
 * Dump the single page identified by a title object.
 *
 * @param Title $title Object providing getNamespace() and getDBkey()
 */
public function pageByTitle( $title ) {
	$namespaceCond = 'page_namespace=' . $title->getNamespace();
	// The DB key is quoted by the database layer before being embedded.
	$titleCond = 'page_title=' . $this->db->addQuotes( $title->getDBkey() );

	$this->dumpFrom( $namespaceCond . ' AND ' . $titleCond );
}
199
/**
 * Dump the page with the given name.
 *
 * @param string $name Page name, parsed with Title::newFromText()
 * @throws MWException If the name does not yield a title
 */
public function pageByName( $name ) {
	$title = Title::newFromText( $name );

	if ( $title === null ) {
		throw new MWException( "Can't export invalid title" );
	}

	$this->pageByTitle( $title );
}
212
/**
 * Dump each of the named pages in turn via pageByName().
 *
 * @param iterable $names Page names
 */
public function pagesByName( $names ) {
	foreach ( $names as $pageName ) {
		$this->pageByName( $pageName );
	}
}
221
/**
 * Dump without any restricting condition. Whether log entries or pages are
 * written is decided by dumpFrom() from $this->history.
 */
public function allLogs() {
	$noRestriction = '';
	$this->dumpFrom( $noRestriction );
}
225
/**
 * Dump the entries whose log_id falls within a range.
 *
 * @param int $start Inclusive lower bound of log_id
 * @param int $end Exclusive upper bound of log_id; a falsy value means no upper bound
 */
public function logsByRange( $start, $end ) {
	$bounds = [ 'log_id >= ' . intval( $start ) ];
	if ( $end ) {
		$bounds[] = 'log_id < ' . intval( $end );
	}

	$this->dumpFrom( implode( ' AND ', $bounds ) );
}
237
/**
 * Generate the distinct list of authors matching a condition and store it,
 * as a <contributors> XML fragment, in $this->author_list.
 * Only called by dumpPages() when $this->list_authors is set.
 *
 * @param string $cond SQL condition selecting the revisions to consider
 */
protected function do_list_authors( $cond ) {
	$this->author_list = "<contributors>";

	$revQuery = Revision::getQueryInfo( [ 'page' ] );
	$authorFields = [
		'rev_user_text' => $revQuery['fields']['rev_user_text'],
		'rev_user' => $revQuery['fields']['rev_user'],
	];
	// rev_deleted: leave out revisions whose user has been hidden
	$userVisible = $this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0';

	$authors = $this->db->select(
		$revQuery['tables'],
		$authorFields,
		[ $userVisible, $cond ],
		__METHOD__,
		[ 'DISTINCT' ],
		$revQuery['joins']
	);

	foreach ( $authors as $author ) {
		$userName = htmlspecialchars( $author->rev_user_text );
		$userId = (int)$author->rev_user;
		$this->author_list .=
			"<contributor><username>{$userName}</username><id>{$userId}</id></contributor>";
	}

	$this->author_list .= "</contributors>";
}
277
/**
 * Dispatch a dump request to the log dumper or the page dumper.
 *
 * @param string $cond SQL condition restricting the dump, '' for none
 * @param bool $orderRevs Forwarded to dumpPages()
 */
protected function dumpFrom( $cond = '', $orderRevs = false ) {
	// $this->history may be an array (offset/limit mode, handled by dumpPages()).
	// A bitwise AND with an array operand throws a TypeError on PHP 8, so the
	// LOGS flag is only tested when the history setting is not an array.
	$wantsLogs = !is_array( $this->history ) && ( $this->history & self::LOGS );

	if ( $wantsLogs ) {
		$this->dumpLogs( $cond );
	} else {
		$this->dumpPages( $cond, $orderRevs );
	}
}
291
/**
 * Write all log entries matching a condition to the sink, fetching them in
 * batches of self::BATCH_SIZE ordered by log_id.
 *
 * @param string $cond Additional SQL condition; ignored when empty
 */
protected function dumpLogs( $cond ) {
	$baseConds = [];
	# Hide private logs
	$privateLogFilter = LogEventsList::getExcludeClause( $this->db );
	if ( $privateLogFilter ) {
		$baseConds[] = $privateLogFilter;
	}
	# Add on any caller specified conditions
	if ( $cond ) {
		$baseConds[] = $cond;
	}

	$commentQuery = CommentStore::getStore()->getJoin( 'log_comment' );
	$actorQuery = ActorMigration::newMigration()->getJoin( 'log_user' );

	$tables = array_merge(
		[ 'logging' ],
		$commentQuery['tables'],
		$actorQuery['tables'],
		[ 'user' ]
	);
	$fields = [
		'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
		'log_title', 'log_params', 'log_deleted', 'user_name'
	] + $commentQuery['fields'] + $actorQuery['fields'];
	$joins = [
		'user' => [ 'JOIN', 'user_id = ' . $actorQuery['fields']['log_user'] ]
	] + $commentQuery['joins'] + $actorQuery['joins'];
	$options = [
		'ORDER BY' => 'log_id',
		'USE INDEX' => [ 'logging' => 'PRIMARY' ],
		'LIMIT' => self::BATCH_SIZE,
	];

	// Keyset pagination: every batch resumes after the last log_id written so far.
	$lastLogId = 0;
	do {
		$batchConds = array_merge( $baseConds, [ 'log_id > ' . intval( $lastLogId ) ] );
		$batch = $this->db->select( $tables, $fields, $batchConds, __METHOD__, $options, $joins );

		$gotRows = (bool)$batch->numRows();
		if ( $gotRows ) {
			$lastLogId = $this->outputLogStream( $batch );
		}
	} while ( $gotRows );
}
346
/**
 * Write page, revision and slot records matching a condition to the sink,
 * fetching them in batches of self::BATCH_SIZE.
 *
 * @param string $cond SQL condition restricting the dump, '' for none
 * @param bool $orderRevs Accepted for interface compatibility; not read by this method
 * @throws MWException If $this->history is not a known dump type, or if it is
 *   self::STABLE and no WikiExporter::dumpStableQuery handler took over the query
 */
protected function dumpPages( $cond, $orderRevs ) {
	$revisionStore = MediaWikiServicesAlias::getInstance()->getRevisionStore();
	$revQuery = $revisionStore->getQueryInfo( [ 'page' ] );
	$slotQuery = $revisionStore->getSlotsQueryInfo( [ 'content' ] );

	// We want page primary rather than revision.
	// We also want to join in the slots and content tables.
	// NOTE: This means we may get multiple rows per revision, and more rows
	// than the batch size! Should be ok, since the max number of slots is
	// fixed and low (dozens at worst).
	$tables = array_merge( [ 'page' ], array_diff( $revQuery['tables'], [ 'page' ] ) );
	$tables = array_merge( $tables, array_diff( $slotQuery['tables'], $tables ) );
	$join = $revQuery['joins'] + [
		'revision' => $revQuery['joins']['page'],
		'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
		'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
	];
	unset( $join['page'] );

	$fields = array_merge( $revQuery['fields'], $slotQuery['fields'] );
	$fields[] = 'page_restrictions';

	if ( $this->text != self::STUB ) {
		$fields['_load_content'] = '1';
	}

	// The caller supplied $cond is added further down, once the
	// ModifyExportQuery hook had the chance to change it.
	$conds = [];
	$opts = [ 'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ] ];
	$opts['USE INDEX'] = [];

	$maxRowCount = null;
	$op = '>';
	if ( is_array( $this->history ) ) {
		# Time offset/limit for all pages/history...
		# Set time order
		if ( $this->history['dir'] == 'asc' ) {
			$opts['ORDER BY'] = 'rev_timestamp ASC';
		} else {
			$op = '<';
			$opts['ORDER BY'] = 'rev_timestamp DESC';
		}
		# Set offset
		if ( !empty( $this->history['offset'] ) ) {
			$conds[] = "rev_timestamp $op " .
				$this->db->addQuotes( $this->db->timestamp( $this->history['offset'] ) );
		}
		# Set query limit
		if ( !empty( $this->history['limit'] ) ) {
			$maxRowCount = intval( $this->history['limit'] );
		}
	} elseif ( $this->history & self::FULL ) {
		# Full history dumps...
		# query optimization for history stub dumps
		if ( $this->text == self::STUB ) {
			$opts[] = 'STRAIGHT_JOIN';
			$opts['USE INDEX']['revision'] = 'rev_page_id';
			unset( $join['revision'] );
			$join['page'] = [ 'JOIN', 'rev_page=page_id' ];
		}
	} elseif ( $this->history & self::CURRENT ) {
		# Latest revision dumps...
		if ( $this->list_authors && $cond != '' ) { // List authors, if so desired
			$this->do_list_authors( $cond );
		}
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		$opts[ 'ORDER BY' ] = [ 'page_id ASC' ];
	} elseif ( $this->history & self::STABLE ) {
		# "Stable" revision dumps...
		# Default JOIN, to be overridden...
		$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		# One, and only one hook should set this, and return false
		if ( Hooks::run( 'WikiExporter::dumpStableQuery', [ &$tables, &$opts, &$join ] ) ) {
			throw new MWException( __METHOD__ . " given invalid history dump type." );
		}
	} elseif ( $this->history & self::RANGE ) {
		# Dump of revisions within a specified range. Condition already set in revsByRange().
	} else {
		# Unknown history specification parameter?
		throw new MWException( __METHOD__ . " given invalid history dump type." );
	}

	$done = false;
	$lastRow = null;
	$revPage = 0;
	$revId = 0;
	$rowCount = 0;

	$opts['LIMIT'] = self::BATCH_SIZE;

	Hooks::run( 'ModifyExportQuery',
		[ $this->db, &$tables, &$cond, &$opts, &$join ] );

	// $cond is passed to the hook by reference, so it must be read only now;
	// copying it into $conds before the hook ran would discard handler changes.
	// It is put first to keep the order of the generated conditions.
	if ( $cond !== '' ) {
		array_unshift( $conds, $cond );
	}

	while ( !$done ) {
		// If necessary, impose the overall maximum and stop looping after this iteration.
		if ( !empty( $maxRowCount ) && $rowCount + self::BATCH_SIZE > $maxRowCount ) {
			$opts['LIMIT'] = $maxRowCount - $rowCount;
			$done = true;
		}

		// Resume after the last (rev_page, rev_id) pair written by the previous batch.
		$queryConds = $conds;
		$queryConds[] = 'rev_page>' . intval( $revPage ) . ' OR (rev_page=' .
			intval( $revPage ) . ' AND rev_id' . $op . intval( $revId ) . ')';

		# Do the query and process any results, remembering max ids for the next iteration.
		$result = $this->db->select(
			$tables,
			$fields,
			$queryConds,
			__METHOD__,
			$opts,
			$join
		);
		if ( $result->numRows() > 0 ) {
			$lastRow = $this->outputPageStreamBatch( $result, $lastRow );
			$rowCount += $result->numRows();
			$revPage = $lastRow->rev_page;
			$revId = $lastRow->rev_id;
		} else {
			$done = true;
		}

		// If we are finished, close off final page element (if any).
		if ( $done && $lastRow ) {
			$this->finishPageStreamOutput( $lastRow );
		}
	}
}
486
/**
 * Run through a query result set dumping page, revision, and slot records.
 * The final page of the batch is left open; finishPageStreamOutput() closes it.
 *
 * @param object $results Result set whose rows are read through getSlotRowBatch()
 * @param object|null $lastRow Last row handled by the previous batch, or null
 * @return object|null The last revision row seen, written or skipped
 * @throws LogicException If a slot row is left over once the result set is exhausted
 */
protected function outputPageStreamBatch( $results, $lastRow ) {
	$pendingRow = null;

	for ( ; ; ) {
		$slotRows = $this->getSlotRowBatch( $results, $pendingRow );
		if ( !$slotRows ) {
			break;
		}

		// All revision info is present in all slot rows.
		// Use the first slot row as the revision row.
		$revRow = $slotRows[0];

		$namespaceWanted = !$this->limitNamespaces ||
			in_array( $revRow->page_namespace, $this->limitNamespaces );

		if ( $namespaceWanted ) {
			$startsNewPage = $lastRow === null ||
				$lastRow->page_namespace !== $revRow->page_namespace ||
				$lastRow->page_title !== $revRow->page_title;

			if ( $startsNewPage ) {
				if ( $lastRow !== null ) {
					// Finish the page the previous row belonged to.
					$closing = $this->dumpUploads
						? $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents )
						: '';
					$closing .= $this->writer->closePage();
					$this->sink->writeClosePage( $closing );
				}
				$this->sink->writeOpenPage( $revRow, $this->writer->openPage( $revRow ) );
			}

			$this->sink->writeRevision(
				$revRow,
				$this->writer->writeRevision( $revRow, $slotRows )
			);
		}

		// Rows of excluded namespaces are remembered as well, but never written.
		$lastRow = $revRow;
	}

	if ( $pendingRow ) {
		throw new LogicException( 'Error while processing a stream of slot rows' );
	}

	return $lastRow;
}
540
/**
 * Returns all slot rows for a revision: consecutive rows sharing one rev_id.
 *
 * @param object $results Result set read through fetchObject()
 * @param object|null &$carry On entry, a row left over from the previous call that
 *   starts this batch; on exit, the first row of the next revision if one was read
 * @return object[] Rows of one revision; empty when there is nothing left
 */
protected function getSlotRowBatch( $results, &$carry = null ) {
	$batch = [];
	$previous = null;

	if ( $carry ) {
		$previous = $carry;
		$batch[] = $previous;
		$carry = null;
	}

	while ( true ) {
		$current = $results->fetchObject();
		if ( !$current ) {
			break;
		}

		if ( $previous && $previous->rev_id !== $current->rev_id ) {
			// Belongs to the next revision: hand it back for the next call.
			$carry = $current;
			break;
		}

		$batch[] = $current;
		$previous = $current;
	}

	return $batch;
}
571
/**
 * Final page stream output, after all batches are complete.
 * Emits upload records (if enabled), the author list and the page close markup.
 *
 * @param object $lastRow The last row output from the previous call (or null if none)
 */
protected function finishPageStreamOutput( $lastRow ) {
	$uploads = $this->dumpUploads
		? $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents )
		: '';

	$this->sink->writeClosePage(
		$uploads . $this->author_list . $this->writer->closePage()
	);
}
586
/**
 * Write every log row of a result set to the sink.
 *
 * @param iterable $resultset Log rows
 * @return int|null log_id of the last row written, null if there was no row
 */
protected function outputLogStream( $resultset ) {
	$lastLogId = null;

	foreach ( $resultset as $logRow ) {
		$this->sink->writeLogItem( $logRow, $this->writer->writeLogItem( $logRow ) );
		$lastLogId = $logRow->log_id;
	}

	return $lastLogId;
}
598}
$wgXmlDumpSchemaVersion
The schema to use per default when generating XML dumps.
static getExcludeClause( $db, $audience='public', User $user=null)
SQL clause to skip forbidden log types for this user.
MediaWiki exception.
MediaWikiServices is the service locator for the application scope of MediaWiki.
static getInstance()
Returns the global default instance of the top level service locator.
Page revision base class.
static getQueryInfo( $options=[])
Return the tables, fields, and join conditions to be selected to create a new revision object.
Definition Revision.php:315
revsByRange( $start, $end)
Dumps a series of page and revision records for those pages in the database with revisions falling wi...
dumpFrom( $cond='', $orderRevs=false)
XmlDumpWriter $writer
pageByName( $name)
pagesByName( $names)
finishPageStreamOutput( $lastRow)
Final page stream output, after all batches are complete.
getSlotRowBatch( $results, &$carry=null)
Returns all slot rows for a revision.
setOutputSink(&$sink)
Set the DumpOutput or DumpFilter object which will receive various row objects and XML output for fil...
pageByTitle( $title)
pagesByRange( $start, $end, $orderRevs)
Dumps a series of page and revision records for those pages in the database falling within the page_i...
__construct( $db, $history=self::CURRENT, $text=self::TEXT, $limitNamespaces=null)
outputPageStreamBatch( $results, $lastRow)
Runs through a query result set dumping page, revision, and slot records.
bool $dumpUploadFileContents
array|int $history
DumpOutput $sink
allPages()
Dumps a series of page and revision records for all pages in the database, either including complete ...
logsByRange( $start, $end)
do_list_authors( $cond)
Generates the distinct list of authors of an article Not called by default (depends on $this->list_au...
array|null $limitNamespaces
dumpPages( $cond, $orderRevs)
outputLogStream( $resultset)
IDatabase $db
setSchemaVersion( $schemaVersion)
static schemaVersion()
Returns the default export schema version, as defined by $wgXmlDumpSchemaVersion.
string $author_list
bool $list_authors
Return distinct author list (when not returning full history)
const WRITE_STUB
Only output stubs for revision content.
const WRITE_CONTENT
Output serialized revision content.
Basic database interface for live and lazy-loaded relation database handles.
Definition IDatabase.php:38
Result wrapper for grabbing data queried from an IDatabase object.