MediaWiki REL1_34
XmlDumpWriter.php
Go to the documentation of this file.
1<?php
31use Wikimedia\Assert\Assert;
32
37
// NOTE(review): the enclosing class declaration is not visible in this listing;
// the members below belong to XmlDumpWriter.

/** Output serialized revision content. */
const WRITE_CONTENT = 0;

/** Only output stubs for revision content. */
const WRITE_STUB = 1;

/**
 * Only output stubs for revision content, indicating that the content has been
 * deleted/suppressed.
 *
 * NOTE(review): the declaration was missing from this listing although
 * writeRevision() and writeSlot() use it; the value 2 is assumed (it only
 * needs to differ from WRITE_CONTENT and WRITE_STUB) — confirm upstream.
 */
const WRITE_STUB_DELETED = 2;

/**
 * @var string[] the schema versions supported for output
 * @final
 */
public static $supportedSchemas = [
	XML_DUMP_SCHEMA_VERSION_10,
	XML_DUMP_SCHEMA_VERSION_11
];

/**
 * @var string which schema version the generated XML should comply to.
 * One of the values from self::$supportedSchemas.
 */
private $schemaVersion;

/**
 * @var Title|null Title of the currently processed page.
 */
private $currentTitle = null;

/**
 * @var int Whether to output revision content or just stubs.
 */
private $contentMode;
77
/**
 * XmlDumpWriter constructor.
 *
 * Both arguments are validated with Assert::parameter() before being stored.
 *
 * @param int $contentMode WRITE_CONTENT or WRITE_STUB.
 * @param string $schemaVersion which schema version the generated XML should comply to.
 *   Must be one of the values listed in self::$supportedSchemas.
 */
public function __construct(
	$contentMode = self::WRITE_CONTENT,
	$schemaVersion = XML_DUMP_SCHEMA_VERSION_11
) {
	Assert::parameter(
		in_array( $contentMode, [ self::WRITE_CONTENT, self::WRITE_STUB ] ),
		'$contentMode',
		'must be one of the following constants: WRITE_CONTENT or WRITE_STUB.'
	);

	Assert::parameter(
		in_array( $schemaVersion, self::$supportedSchemas ),
		'$schemaVersion',
		'must be one of the following schema versions: '
			. implode( ',', self::$supportedSchemas )
	);

	$this->contentMode = $contentMode;
	$this->schemaVersion = $schemaVersion;
}
106
/**
 * Opens the XML output stream's root "<mediawiki>" element.
 *
 * The namespace URI, schema location and "version" attribute are all derived
 * from the schema version chosen at construction time. The site info block
 * is appended after the root element.
 *
 * @return string
 */
function openStream() {
	// The schema version drives every version-dependent attribute below.
	$ver = $this->schemaVersion;
	return Xml::element( 'mediawiki', [
		'xmlns'              => "http://www.mediawiki.org/xml/export-$ver/",
		'xmlns:xsi'          => "http://www.w3.org/2001/XMLSchema-instance",
		/*
		 * When a new version of the schema is created, it needs staging on mediawiki.org.
		 * This requires a change in the operations/mediawiki-config git repo.
		 *
		 * Create a changeset like https://gerrit.wikimedia.org/r/#/c/149643/ in which
		 * you copy in the new xsd file.
		 *
		 * After it is reviewed, merged and deployed (sync-docroot), the index.html needs purging.
		 * echo "https://www.mediawiki.org/xml/index.html" | mwscript purgeList.php --wiki=aawiki
		 */
		'xsi:schemaLocation' => "http://www.mediawiki.org/xml/export-$ver/ " .
			"http://www.mediawiki.org/xml/export-$ver.xsd",
		'version' => $ver,
		'xml:lang' => MediaWikiServices::getInstance()->getContentLanguage()->getHtmlCode() ],
		// null content: presumably only the opening tag is emitted, since
		// closeStream() writes the matching end tag — confirm against Xml::element().
		null ) .
		"\n" .
		$this->siteInfo();
}
140
/**
 * Builds the "<siteinfo>" block from the individual site metadata elements.
 *
 * @return string
 */
function siteInfo() {
	$sections = [];
	$sections[] = $this->sitename();
	$sections[] = $this->dbname();
	$sections[] = $this->homelink();
	$sections[] = $this->generator();
	$sections[] = $this->caseSetting();
	$sections[] = $this->namespaces();

	$body = implode( "\n ", $sections );

	return " <siteinfo>\n {$body}\n </siteinfo>\n";
}
156
/**
 * Builds the "<sitename>" element from $wgSitename.
 *
 * @return string
 */
function sitename() {
	global $wgSitename;

	$element = Xml::element( 'sitename', [], $wgSitename );

	return $element;
}
164
/**
 * Builds the "<dbname>" element from $wgDBname.
 *
 * @return string
 */
function dbname() {
	global $wgDBname;

	$element = Xml::element( 'dbname', [], $wgDBname );

	return $element;
}
172
/**
 * Builds the "<generator>" element, naming MediaWiki and its version ($wgVersion).
 *
 * @return string
 */
function generator() {
	global $wgVersion;

	$label = "MediaWiki $wgVersion";

	return Xml::element( 'generator', [], $label );
}
180
/**
 * Builds the "<base>" element holding the canonical URL of the main page.
 *
 * @return string
 */
function homelink() {
	$mainPageUrl = Title::newMainPage()->getCanonicalURL();

	return Xml::element( 'base', [], $mainPageUrl );
}
187
/**
 * Builds the "<case>" element describing the wiki-wide title case handling,
 * based on $wgCapitalLinks.
 *
 * @return string
 */
function caseSetting() {
	global $wgCapitalLinks;

	// "case-insensitive" option is reserved for future
	if ( $wgCapitalLinks ) {
		$sensitivity = 'first-letter';
	} else {
		$sensitivity = 'case-sensitive';
	}

	return Xml::element( 'case', [], $sensitivity );
}
197
/**
 * Builds the "<namespaces>" block, with one "<namespace>" element per
 * formatted namespace of the content language. Each element carries the
 * namespace key and its case handling.
 *
 * @return string
 */
function namespaces() {
	$nsInfo = MediaWikiServices::getInstance()->getNamespaceInfo();
	$formatted = MediaWikiServices::getInstance()->getContentLanguage()->getFormattedNamespaces();

	$xml = "<namespaces>\n";
	foreach ( $formatted as $nsId => $nsName ) {
		if ( $nsInfo->isCapitalized( $nsId ) ) {
			$caseMode = 'first-letter';
		} else {
			$caseMode = 'case-sensitive';
		}

		$attribs = [
			'key' => $nsId,
			'case' => $caseMode,
		];
		$xml .= ' ' . Xml::element( 'namespace', $attribs, $nsName ) . "\n";
	}

	return $xml . " </namespaces>";
}
219
/**
 * Closes the output stream with the closing root element.
 *
 * @return string
 */
function closeStream() {
	$closingTag = '</mediawiki>';

	return $closingTag . "\n";
}
229
/**
 * Opens a "<page>" section on the output stream, with data
 * from the given database row.
 *
 * Remembers the page's Title as the current title, and lets the
 * XmlDumpWriterOpenPage hook modify the generated XML.
 *
 * @param object $row Page row; page_namespace, page_id, page_is_redirect and
 *   page_restrictions are read here.
 * @return string
 */
public function openPage( $row ) {
	$title = Title::newFromRow( $row );
	$this->currentTitle = $title;

	$xml = " <page>\n";
	$xml .= ' ' . Xml::elementClean( 'title', [], self::canonicalTitle( $title ) ) . "\n";
	$xml .= ' ' . Xml::element( 'ns', [], strval( $row->page_namespace ) ) . "\n";
	$xml .= ' ' . Xml::element( 'id', [], strval( $row->page_id ) ) . "\n";

	if ( $row->page_is_redirect ) {
		$target = WikiPage::factory( $title )->getRedirectTarget();
		// Only emit the element when the target resolves to a valid redirect target.
		if ( $target instanceof Title && $target->isValidRedirectTarget() ) {
			$redirectAttribs = [ 'title' => self::canonicalTitle( $target ) ];
			$xml .= ' ' . Xml::element( 'redirect', $redirectAttribs ) . "\n";
		}
	}

	if ( $row->page_restrictions != '' ) {
		$restrictions = strval( $row->page_restrictions );
		$xml .= ' ' . Xml::element( 'restrictions', [], $restrictions ) . "\n";
	}

	// The output buffer is passed by reference so that handlers can modify it.
	Hooks::run( 'XmlDumpWriterOpenPage', [ $this, &$xml, $row, $this->currentTitle ] );

	return $xml;
}
263
/**
 * Closes a "<page>" section on the output stream.
 *
 * Also drops the current title from the link cache, if one is set.
 *
 * @return string
 */
function closePage() {
	$title = $this->currentTitle;

	if ( $title !== null ) {
		// In rare cases, link cache has the same key for some pages which
		// might be read as part of the same batch. T220424 and T220316
		MediaWikiServices::getInstance()->getLinkCache()->clearLink( $title );
	}

	return " </page>\n";
}
279
/**
 * Fetches the revision store (service for looking up page revisions)
 * from the global service locator.
 *
 * @return RevisionStore
 */
private function getRevisionStore() {
	$services = MediaWikiServices::getInstance();

	return $services->getRevisionStore();
}
286
/**
 * Fetches the blob store from the global service locator.
 *
 * @return object the blob store returned by MediaWikiServices::getBlobStore()
 */
private function getBlobStore() {
	$services = MediaWikiServices::getInstance();

	return $services->getBlobStore();
}
293
/**
 * Invokes the given method on the given object, catching and logging any storage
 * related exceptions.
 *
 * $args used to carry a "[]" default while being followed by the required
 * $warning parameter. Such a default can never take effect and is deprecated
 * as of PHP 8.0, so it has been dropped. All callers in this class already
 * pass all four arguments.
 *
 * @param object $obj
 * @param string $method
 * @param array $args Positional arguments for the method
 * @param string $warning The warning to output in case of a storage related exception.
 *
 * @return mixed Returns the method's return value,
 *         or null in case of a suppressed-data or storage related exception.
 * @throws Exception Any exception that is not an MWException, RuntimeException
 *         or InvalidArgumentException is re-thrown.
 */
private function invokeLenient( $obj, $method, $args, $warning ) {
	try {
		return call_user_func_array( [ $obj, $method ], $args );
	} catch ( SuppressedDataException $ex ) {
		// Suppressed data is expected; stay silent.
		return null;
	} catch ( Exception $ex ) {
		if ( $ex instanceof MWException || $ex instanceof RuntimeException ||
			$ex instanceof InvalidArgumentException ) {
			MWDebug::warning( $warning . ': ' . $ex->getMessage() );
			return null;
		} else {
			throw $ex;
		}
	}
}
322
/**
 * Dumps a "<revision>" section on the output stream, with
 * data filled in from the given database row.
 *
 * @param object $row Revision row, handed to RevisionStore::newRevisionFromRowAndSlots()
 * @param null|object[] $slotRows Slot rows, handed to the same call
 *
 * @return string
 */
function writeRevision( $row, $slotRows = null ) {
	$rev = $this->getRevisionStore()->newRevisionFromRowAndSlots(
		$row,
		$slotRows,
		0,
		$this->currentTitle
	);

	$out = " <revision>\n";
	$out .= " " . Xml::element( 'id', null, strval( $rev->getId() ) ) . "\n";

	if ( $rev->getParentId() ) {
		$out .= " " . Xml::element( 'parentid', null, strval( $rev->getParentId() ) ) . "\n";
	}

	$out .= $this->writeTimestamp( $rev->getTimestamp() );

	if ( $rev->isDeleted( RevisionRecord::DELETED_USER ) ) {
		$out .= " " . Xml::element( 'contributor', [ 'deleted' => 'deleted' ] ) . "\n";
	} else {
		// empty values get written out as uid 0, see T224221
		$user = $rev->getUser();
		$out .= $this->writeContributor(
			$user ? $user->getId() : 0,
			$user ? $user->getName() : ''
		);
	}

	if ( $rev->isMinor() ) {
		$out .= " <minor/>\n";
	}
	if ( $rev->isDeleted( RevisionRecord::DELETED_COMMENT ) ) {
		$out .= " " . Xml::element( 'comment', [ 'deleted' => 'deleted' ] ) . "\n";
	} else {
		if ( $rev->getComment()->text != '' ) {
			$out .= " "
				. Xml::elementClean( 'comment', [], strval( $rev->getComment()->text ) )
				. "\n";
		}
	}

	// Suppressed text is always written as a "deleted" stub. Otherwise the
	// mode selected at construction time applies.
	$contentMode = $rev->isDeleted( RevisionRecord::DELETED_TEXT )
		? self::WRITE_STUB_DELETED
		: $this->contentMode;

	foreach ( $rev->getSlots()->getSlots() as $slot ) {
		$out .= $this->writeSlot( $slot, $contentMode );
	}

	if ( $rev->isDeleted( RevisionRecord::DELETED_TEXT ) ) {
		$out .= " <sha1/>\n";
	} else {
		$sha1 = $this->invokeLenient(
			$rev,
			'getSha1',
			[],
			'failed to determine sha1 for revision ' . $rev->getId()
		);
		$out .= " " . Xml::element( 'sha1', null, strval( $sha1 ) ) . "\n";
	}

	// Avoid PHP 7.1 warning from passing $this by reference
	$writer = $this;
	// The hook receives the serialized main slot text, or '' when no content
	// is written or it could not be loaded.
	$text = '';
	if ( $contentMode === self::WRITE_CONTENT ) {
		$content = $this->invokeLenient(
			$rev,
			'getContent',
			[ SlotRecord::MAIN, RevisionRecord::RAW ],
			'Failed to load main slot content of revision ' . $rev->getId()
		);

		$text = $content ? $content->serialize() : '';
	}
	Hooks::run( 'XmlDumpWriterWriteRevision', [ &$writer, &$out, $row, $text, $rev ] );

	$out .= " </revision>\n";

	return $out;
}
415
/**
 * Writes the XML for a single content slot of a revision.
 *
 * Non-main slots are only written for schema version 1.1 and later, wrapped
 * in a "<content>" element. Depending on $contentMode, the "<text>" element
 * carries the content, a "deleted" placeholder, or a stub reference.
 *
 * @param SlotRecord $slot
 * @param int $contentMode see the WRITE_XXX constants
 *
 * @return string
 */
private function writeSlot( SlotRecord $slot, $contentMode ) {
	$role = $slot->getRole();
	$isMain = $role === SlotRecord::MAIN;
	$isV11 = $this->schemaVersion >= XML_DUMP_SCHEMA_VERSION_11;

	if ( !$isMain && !$isV11 ) {
		// ignore extra slots
		return '';
	}

	$indent = ' ';
	$out = '';

	if ( !$isMain ) {
		// non-main slots are wrapped into an additional element.
		$out .= ' ' . Xml::openElement( 'content' ) . "\n";
		$indent .= ' ';
		$out .= $indent . Xml::element( 'role', null, strval( $role ) ) . "\n";
	}

	if ( $isV11 ) {
		$out .= $indent . Xml::element( 'origin', null, strval( $slot->getOrigin() ) ) . "\n";
	}

	$model = $slot->getModel();
	$format = ContentHandler::getForModelID( $model )->getDefaultFormat();

	// XXX: The content format is only relevant when actually outputting serialized content.
	// It should probably be an attribute on the text tag.
	$out .= $indent . Xml::element( 'model', null, strval( $model ) ) . "\n";
	$out .= $indent . Xml::element( 'format', null, strval( $format ) ) . "\n";

	// Shared tail of every warning message emitted for this slot.
	$slotLabel = 'slot ' . $role . ' of revision ' . $slot->getRevision();

	$size = $this->invokeLenient(
		$slot,
		'getSize',
		[],
		'failed to determine size for ' . $slotLabel
	);
	$textAttributes = [
		'xml:space' => 'preserve',
		'bytes' => $size ?: '0'
	];

	if ( $isV11 ) {
		$slotSha1 = $this->invokeLenient(
			$slot,
			'getSha1',
			[],
			'failed to determine sha1 for ' . $slotLabel
		);
		$textAttributes['sha1'] = $slotSha1 ?: '';
	}

	if ( $contentMode === self::WRITE_CONTENT ) {
		$content = $this->invokeLenient(
			$slot,
			'getContent',
			[],
			'failed to load content for ' . $slotLabel
		);

		if ( $content !== null ) {
			$out .= $this->writeText( $content, $textAttributes, $indent );
		} else {
			// Content unavailable: emit an empty <text> tag with the attributes only.
			$out .= $indent . Xml::element( 'text', $textAttributes ) . "\n";
		}
	} elseif ( $contentMode === self::WRITE_STUB_DELETED ) {
		// write <text> placeholder tag
		$textAttributes['deleted'] = 'deleted';
		$out .= $indent . Xml::element( 'text', $textAttributes ) . "\n";
	} else {
		// write <text> stub tag
		if ( $isV11 ) {
			$textAttributes['location'] = $slot->getAddress();
		}

		// Output the numerical text ID if possible, for backwards compatibility.
		// Note that this is currently the ONLY reason we have a BlobStore here at all.
		// When removing this line, check whether the BlobStore has become unused.
		try {
			// NOTE: this will only work for addresses of the form "tt:12345".
			// If we want to support other kinds of addresses in the future,
			// we will have to silently ignore failures here.
			// For now, this fails for "tt:0", which is present in the WMF production
			// database as of July 2019, due to data corruption.
			$textId = $this->getBlobStore()->getTextIdFromAddress( $slot->getAddress() );
		} catch ( InvalidArgumentException $ex ) {
			MWDebug::warning( 'Bad content address for ' . $slotLabel . ': ' . $ex->getMessage() );
			$textId = 0;
		}

		if ( $textId ) {
			$textAttributes['id'] = $textId;
		}

		$out .= $indent . Xml::element( 'text', $textAttributes ) . "\n";
	}

	if ( !$isMain ) {
		$out .= ' ' . Xml::closeElement( 'content' ) . "\n";
	}

	return $out;
}
528
/**
 * Writes a "<text>" element containing the given content.
 *
 * The "bytes" attribute is overwritten with the length of the data that is
 * actually written, after the content handler's export transform.
 *
 * @param Content $content
 * @param string[] $textAttributes
 * @param string $indent
 *
 * @return string
 */
private function writeText( Content $content, $textAttributes, $indent ) {
	$out = '';

	$contentHandler = $content->getContentHandler();
	$contentFormat = $contentHandler->getDefaultFormat();

	if ( $content instanceof TextContent ) {
		// HACK: For text based models, bypass the serialization step. This allows extensions (like Flow)
		// that use incompatible combinations of serialization format and content model.
		$data = $content->getNativeData();
	} else {
		$data = $content->serialize( $contentFormat );
	}

	$data = $contentHandler->exportTransform( $data, $contentFormat );
	$textAttributes['bytes'] = strlen( $data ); // make sure to use the actual size
	$out .= $indent . Xml::elementClean( 'text', $textAttributes, strval( $data ) ) . "\n";

	return $out;
}
556
/**
 * Dumps a "<logitem>" section on the output stream, with
 * data filled in from the given database row.
 *
 * Contributor, comment and action details are each replaced by a "deleted"
 * placeholder when the matching LogPage::DELETED_* bit is set in log_deleted.
 *
 * @param object $row Logging row
 * @return string
 */
function writeLogItem( $row ) {
	$deletedAttribs = [ 'deleted' => 'deleted' ];

	$xml = " <logitem>\n";
	$xml .= " " . Xml::element( 'id', null, strval( $row->log_id ) ) . "\n";
	$xml .= $this->writeTimestamp( $row->log_timestamp, " " );

	if ( $row->log_deleted & LogPage::DELETED_USER ) {
		$xml .= " " . Xml::element( 'contributor', $deletedAttribs ) . "\n";
	} else {
		$xml .= $this->writeContributor( $row->log_user, $row->user_name, " " );
	}

	if ( $row->log_deleted & LogPage::DELETED_COMMENT ) {
		$xml .= " " . Xml::element( 'comment', $deletedAttribs ) . "\n";
	} else {
		$commentText = CommentStore::getStore()->getComment( 'log_comment', $row )->text;
		// Empty comments are omitted entirely.
		if ( $commentText != '' ) {
			$xml .= " " . Xml::elementClean( 'comment', null, strval( $commentText ) ) . "\n";
		}
	}

	$xml .= " " . Xml::element( 'type', null, strval( $row->log_type ) ) . "\n";
	$xml .= " " . Xml::element( 'action', null, strval( $row->log_action ) ) . "\n";

	if ( $row->log_deleted & LogPage::DELETED_ACTION ) {
		$xml .= " " . Xml::element( 'text', $deletedAttribs ) . "\n";
	} else {
		$target = Title::makeTitle( $row->log_namespace, $row->log_title );
		$xml .= " " . Xml::elementClean( 'logtitle', null, self::canonicalTitle( $target ) ) . "\n";
		$xml .= " " . Xml::elementClean(
			'params',
			[ 'xml:space' => 'preserve' ],
			strval( $row->log_params )
		) . "\n";
	}

	return $xml . " </logitem>\n";
}
603
/**
 * Writes a "<timestamp>" element, with the timestamp converted to the
 * TS_ISO_8601 format.
 *
 * @param string $timestamp
 * @param string $indent Default to six spaces
 * @return string
 */
function writeTimestamp( $timestamp, $indent = " " ) {
	$element = Xml::element( 'timestamp', null, wfTimestamp( TS_ISO_8601, $timestamp ) );

	return $indent . $element . "\n";
}
613
/**
 * Writes a "<contributor>" element.
 *
 * A contributor with no user id and a name that is a valid IP is written as
 * "<ip>"; everyone else is written as "<username>" plus "<id>".
 *
 * @param int $id
 * @param string $text
 * @param string $indent Default to six spaces
 * @return string
 */
function writeContributor( $id, $text, $indent = " " ) {
	$inner = $indent . " ";
	$isAnonymousIp = !$id && IP::isValid( $text );

	$xml = $indent . "<contributor>\n";
	if ( $isAnonymousIp ) {
		$xml .= $inner . Xml::elementClean( 'ip', null, strval( $text ) ) . "\n";
	} else {
		$xml .= $inner . Xml::elementClean( 'username', null, strval( $text ) ) . "\n";
		$xml .= $inner . Xml::element( 'id', null, strval( $id ) ) . "\n";
	}

	return $xml . $indent . "</contributor>\n";
}
631
/**
 * Writes "<upload>" sections for every version of the file behind a page in
 * the file namespace: old versions first (history reversed), then the
 * current one. Returns an empty string for other pages or missing files.
 *
 * Warning! This data is potentially inconsistent.
 *
 * @param object $row Page row; page_namespace and page_title are read here.
 * @param bool $dumpContents
 * @return string
 */
function writeUploads( $row, $dumpContents = false ) {
	if ( $row->page_namespace != NS_FILE ) {
		return '';
	}

	$current = MediaWikiServices::getInstance()->getRepoGroup()->getLocalRepo()
		->newFile( $row->page_title );
	if ( !$current || !$current->exists() ) {
		return '';
	}

	$xml = '';
	foreach ( array_reverse( $current->getHistory() ) as $oldVersion ) {
		$xml .= $this->writeUpload( $oldVersion, $dumpContents );
	}

	return $xml . $this->writeUpload( $current, $dumpContents );
}
653
/**
 * Writes a single "<upload>" section describing one version of a file.
 *
 * @param File $file
 * @param bool $dumpContents Whether to embed the file contents, base64 encoded.
 * @return string
 */
function writeUpload( $file, $dumpContents = false ) {
	$archiveName = '';
	if ( $file->isOld() ) {
		'@phan-var OldLocalFile $file';
		$archiveName = " " .
			Xml::element( 'archivename', null, $file->getArchiveName() ) . "\n";
	}

	$contents = '';
	if ( $dumpContents ) {
		$backend = $file->getRepo()->getBackend();
		# Dump file as base64
		# Uses only XML-safe characters, so does not need escaping
		# @todo Too bad this loads the contents into memory (script might swap)
		$raw = $backend->getFileContents( [ 'src' => $file->getPath() ] );
		$contents = ' <contents encoding="base64">'
			. chunk_split( base64_encode( $raw ) )
			. " </contents>\n";
	}

	if ( $file->isDeleted( File::DELETED_COMMENT ) ) {
		$comment = Xml::element( 'comment', [ 'deleted' => 'deleted' ] );
	} else {
		$comment = Xml::elementClean( 'comment', null, strval( $file->getDescription() ) );
	}

	$xml = " <upload>\n";
	$xml .= $this->writeTimestamp( $file->getTimestamp() );
	$xml .= $this->writeContributor( $file->getUser( 'id' ), $file->getUser( 'text' ) );
	$xml .= " " . $comment . "\n";
	$xml .= " " . Xml::element( 'filename', null, $file->getName() ) . "\n";
	$xml .= $archiveName;
	$xml .= " " . Xml::element( 'src', null, $file->getCanonicalUrl() ) . "\n";
	$xml .= " " . Xml::element( 'size', null, $file->getSize() ) . "\n";
	$xml .= " " . Xml::element( 'sha1base36', null, $file->getSha1() ) . "\n";
	$xml .= " " . Xml::element( 'rel', null, $file->getRel() ) . "\n";
	$xml .= $contents;
	$xml .= " </upload>\n";

	return $xml;
}
698
/**
 * Return prefixed text form of title, but using the content language's
 * canonical namespace.
 *
 * External (interwiki) titles are returned as their prefixed text unchanged.
 *
 * @param Title $title
 * @return string
 */
public static function canonicalTitle( Title $title ) {
	if ( $title->isExternal() ) {
		return $title->getPrefixedText();
	}

	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	$nsText = $contLang->getFormattedNsText( $title->getNamespace() );

	// @todo Emit some kind of warning to the user if $title->getNamespace() !==
	// NS_MAIN and $prefix === '' (viz. pages in an unregistered namespace)

	if ( $nsText === '' ) {
		return $nsText . $title->getText();
	}

	return $nsText . ':' . $title->getText();
}
726}
$wgCapitalLinks
Set this to false to avoid forcing the first letter of links to capitals.
$wgDBname
Current wiki database name.
$wgSitename
Name of the site.
$wgVersion
MediaWiki version number.
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
if( $line===false) $args
Definition cdb.php:64
const DELETED_COMMENT
Definition File.php:64
const DELETED_USER
Definition LogPage.php:36
const DELETED_COMMENT
Definition LogPage.php:35
const DELETED_ACTION
Definition LogPage.php:34
MediaWiki exception.
MediaWikiServices is the service locator for the application scope of MediaWiki.
Page revision base class.
Service for looking up page revisions.
Value object representing a content slot associated with a page revision.
getRole()
Returns the role of the slot.
getAddress()
Returns the address of this slot's content.
getModel()
Returns the content model.
getOrigin()
Returns the revision ID of the revision that originated the slot's content.
getRevision()
Returns the ID of the revision this slot is associated with.
Exception raised in response to an audience check when attempting to access suppressed information wi...
Service for storing and loading Content objects.
Content object implementation for representing flat text.
Represents a title within MediaWiki.
Definition Title.php:42
isValidRedirectTarget()
Check if this Title is a valid redirect target.
Definition Title.php:4492
closeStream()
Closes the output stream with the closing root element.
__construct( $contentMode=self::WRITE_CONTENT, $schemaVersion=XML_DUMP_SCHEMA_VERSION_11)
XmlDumpWriter constructor.
static string[] $supportedSchemas
the schema versions supported for output @final
invokeLenient( $obj, $method, $args=[], $warning)
Invokes the given method on the given object, catching and logging any storage related exceptions.
const WRITE_STUB_DELETED
Only output stubs for revision content, indicating that the content has been deleted/suppressed.
static canonicalTitle(Title $title)
Return prefixed text form of title, but using the content language's canonical namespace.
int $contentMode
Whether to output revision content or just stubs.
const WRITE_STUB
Only output stubs for revision content.
string $schemaVersion
which schema version the generated XML should comply to.
writeUpload( $file, $dumpContents=false)
writeLogItem( $row)
Dumps a "<logitem>" section on the output stream, with data filled in from the given database row.
writeTimestamp( $timestamp, $indent=" ")
const WRITE_CONTENT
Output serialized revision content.
writeUploads( $row, $dumpContents=false)
Warning! This data is potentially inconsistent.
closePage()
Closes a "<page>" section on the output stream.
openStream()
Opens the XML output stream's root "<mediawiki>" element.
writeRevision( $row, $slotRows=null)
Dumps a "<revision>" section on the output stream, with data filled in from the given database row.
openPage( $row)
Opens a "<page>" section on the output stream, with data from the given database row.
writeText(Content $content, $textAttributes, $indent)
Title null $currentTitle
Title of the currently processed page.
writeSlot(SlotRecord $slot, $contentMode)
writeContributor( $id, $text, $indent=" ")
const NS_FILE
Definition Defines.php:75
const XML_DUMP_SCHEMA_VERSION_11
Definition Defines.php:318
const XML_DUMP_SCHEMA_VERSION_10
Definition Defines.php:317
Base interface for content objects.
Definition Content.php:34
$content
Definition router.php:78
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Item class for a filearchive table row.
Definition router.php:42