MediaWiki REL1_35
XmlDumpWriter.php
Go to the documentation of this file.
1<?php
33use Wikimedia\Assert\Assert;
34use Wikimedia\IPUtils;
35
40
42 public const WRITE_CONTENT = 0;
43
45 public const WRITE_STUB = 1;
46
51 private const WRITE_STUB_DELETED = 2;
52
57 public static $supportedSchemas = [
60 ];
61
68
74 private $currentTitle = null;
75
79 private $contentMode;
80
82 private $hookRunner;
83
/**
 * Set up a writer for the given content mode and export schema version.
 *
 * Both arguments are validated through Assert::parameter() before anything
 * is stored (presumably by throwing on failure -- see Wikimedia\Assert).
 *
 * @param int $contentMode Either self::WRITE_CONTENT or self::WRITE_STUB.
 * @param string $schemaVersion One of the entries of self::$supportedSchemas.
 */
public function __construct(
    $contentMode = self::WRITE_CONTENT,
    $schemaVersion = XML_DUMP_SCHEMA_VERSION_11
) {
    // WRITE_STUB_DELETED is private and deliberately not accepted here.
    $isPublicMode = $contentMode === self::WRITE_CONTENT
        || $contentMode === self::WRITE_STUB;
    Assert::parameter(
        $isPublicMode,
        '$contentMode',
        'must be one of the following constants: WRITE_CONTENT or WRITE_STUB.'
    );

    $isKnownSchema = in_array( $schemaVersion, self::$supportedSchemas, true );
    Assert::parameter(
        $isKnownSchema,
        '$schemaVersion',
        'must be one of the following schema versions: '
            . implode( ',', self::$supportedSchemas )
    );

    $this->contentMode = $contentMode;
    $this->schemaVersion = $schemaVersion;

    $hookContainer = MediaWikiServices::getInstance()->getHookContainer();
    $this->hookRunner = new HookRunner( $hookContainer );
}
111
/**
 * Build the opening of the dump: the root "<mediawiki>" element followed
 * by the "<siteinfo>" section.
 *
 * @return string
 */
public function openStream() {
    $version = $this->schemaVersion;
    $contentLanguage = MediaWikiServices::getInstance()->getContentLanguage();

    /*
     * When a new version of the schema is created, it needs staging on mediawiki.org.
     * This requires a change in the operations/mediawiki-config git repo.
     *
     * Create a changeset like https://gerrit.wikimedia.org/r/#/c/149643/ in which
     * you copy in the new xsd file.
     *
     * After it is reviewed, merged and deployed (sync-docroot), the index.html needs purging.
     * echo "https://www.mediawiki.org/xml/index.html" | mwscript purgeList.php --wiki=aawiki
     */
    $schemaLocation = "http://www.mediawiki.org/xml/export-$version/ " .
        "http://www.mediawiki.org/xml/export-$version.xsd";

    $rootAttribs = [
        'xmlns' => "http://www.mediawiki.org/xml/export-$version/",
        'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance",
        'xsi:schemaLocation' => $schemaLocation,
        'version' => $version,
        'xml:lang' => $contentLanguage->getHtmlCode(),
    ];

    // NOTE(review): null contents presumably make Xml::element() emit only the
    // start tag, since closeStream() writes the matching end tag -- confirm.
    $rootTag = Xml::element( 'mediawiki', $rootAttribs, null );

    return $rootTag . "\n" . $this->siteInfo();
}
145
/**
 * Assemble the "<siteinfo>" section from the individual site-level elements.
 *
 * @return string
 */
private function siteInfo() {
    // Order of the child elements is fixed: sitename, dbname, base,
    // generator, case, namespaces.
    $children = [];
    $children[] = $this->sitename();
    $children[] = $this->dbname();
    $children[] = $this->homelink();
    $children[] = $this->generator();
    $children[] = $this->caseSetting();
    $children[] = $this->namespaces();

    $body = implode( "\n ", $children );

    return " <siteinfo>\n " . $body . "\n </siteinfo>\n";
}
161
/**
 * Build the "<sitename>" element from the $wgSitename global.
 *
 * @return string
 */
private function sitename() {
    global $wgSitename;

    $element = Xml::element( 'sitename', [], $wgSitename );
    return $element;
}
169
/**
 * Build the "<dbname>" element from the $wgDBname global.
 *
 * @return string
 */
private function dbname() {
    global $wgDBname;

    $element = Xml::element( 'dbname', [], $wgDBname );
    return $element;
}
177
/**
 * Build the "<generator>" element naming the running MediaWiki version.
 *
 * @return string
 */
private function generator() {
    $label = 'MediaWiki ' . MW_VERSION;
    return Xml::element( 'generator', [], $label );
}
184
/**
 * Build the "<base>" element holding the canonical URL of the main page.
 *
 * @return string
 */
private function homelink() {
    $mainPageUrl = Title::newMainPage()->getCanonicalURL();
    return Xml::element( 'base', [], $mainPageUrl );
}
191
/**
 * Build the "<case>" element describing the wiki-wide title case handling,
 * derived from $wgCapitalLinks.
 *
 * @return string
 */
private function caseSetting() {
    global $wgCapitalLinks;

    // "case-insensitive" option is reserved for future
    if ( $wgCapitalLinks ) {
        $caseMode = 'first-letter';
    } else {
        $caseMode = 'case-sensitive';
    }

    return Xml::element( 'case', [], $caseMode );
}
201
/**
 * Build the "<namespaces>" element, with one "<namespace>" child per
 * namespace reported by the content language.
 *
 * @return string
 */
private function namespaces() {
    $services = MediaWikiServices::getInstance();
    $nsInfo = $services->getNamespaceInfo();
    $formattedNames = $services->getContentLanguage()->getFormattedNamespaces();

    $xml = "<namespaces>\n";
    foreach ( $formattedNames as $nsId => $nsName ) {
        // Case handling is reported per namespace, not just wiki-wide.
        $caseMode = $nsInfo->isCapitalized( $nsId )
            ? 'first-letter' : 'case-sensitive';
        $attribs = [
            'key' => $nsId,
            'case' => $caseMode,
        ];
        $xml .= ' ' . Xml::element( 'namespace', $attribs, $nsName ) . "\n";
    }

    return $xml . " </namespaces>";
}
223
/**
 * Produce the closing tag of the root "<mediawiki>" element.
 *
 * @return string
 */
public function closeStream() {
    $rootEndTag = "</mediawiki>\n";
    return $rootEndTag;
}
233
/**
 * Open a "<page>" section for the given page row and remember its title
 * as the current title for subsequent calls.
 *
 * Emits title, namespace and page id; a "<redirect>" element when the row
 * is flagged as a redirect and a valid target can be resolved; and a
 * "<restrictions>" element when the row carries restrictions. The
 * XmlDumpWriterOpenPage hook is run on the result before it is returned.
 *
 * @param stdClass $row Page row; page_namespace, page_id, page_is_redirect
 *   and page_restrictions are read here.
 * @return string
 */
public function openPage( $row ) {
    $title = Title::newFromRow( $row );
    $this->currentTitle = $title;

    $out = " <page>\n";
    $out .= ' ' . Xml::elementClean( 'title', [], self::canonicalTitle( $title ) ) . "\n";
    $out .= ' ' . Xml::element( 'ns', [], strval( $row->page_namespace ) ) . "\n";
    $out .= ' ' . Xml::element( 'id', [], strval( $row->page_id ) ) . "\n";

    if ( $row->page_is_redirect ) {
        $wikiPage = WikiPage::factory( $title );
        // Lookup failures are logged by invokeLenient() and yield null,
        // in which case no <redirect> element is written.
        $target = $this->invokeLenient(
            function () use ( $wikiPage ) {
                return $wikiPage->getRedirectTarget();
            },
            'Failed to get redirect target of page ' . $wikiPage->getId()
        );

        if ( $target instanceof Title && $target->isValidRedirectTarget() ) {
            $redirectAttribs = [ 'title' => self::canonicalTitle( $target ) ];
            $out .= ' ' . Xml::element( 'redirect', $redirectAttribs ) . "\n";
        }
    }

    $restrictions = $row->page_restrictions;
    if ( $restrictions != '' ) {
        $out .= ' ' . Xml::element( 'restrictions', [], strval( $restrictions ) ) . "\n";
    }

    // $out is handed to the hook so handlers can amend the page header.
    $this->hookRunner->onXmlDumpWriterOpenPage( $this, $out, $row, $this->currentTitle );

    return $out;
}
272
/**
 * Close the current "<page>" section.
 *
 * If a page has been opened, its title is also cleared from the link cache.
 *
 * @return string
 */
public function closePage() {
    $title = $this->currentTitle;

    if ( $title !== null ) {
        // In rare cases, link cache has the same key for some pages which
        // might be read as part of the same batch. T220424 and T220316
        MediaWikiServices::getInstance()->getLinkCache()->clearLink( $title );
    }

    return " </page>\n";
}
288
/**
 * Fetch the revision store from the global service container.
 *
 * @return RevisionStore
 */
private function getRevisionStore() {
    $services = MediaWikiServices::getInstance();
    return $services->getRevisionStore();
}
295
/**
 * Fetch the blob store from the global service container.
 *
 * @return BlobStore
 */
private function getBlobStore() {
    $services = MediaWikiServices::getInstance();
    return $services->getBlobStore();
}
302
/**
 * Run a callback and tolerate the failures expected when reading stored data.
 *
 * - SuppressedDataException: swallowed silently, null is returned.
 * - MWException, RuntimeException, InvalidArgumentException: logged through
 *   MWDebug::warning() with $warning as prefix, null is returned.
 * - Anything else propagates to the caller untouched.
 *
 * @param callable $callback Invoked without arguments.
 * @param string $warning Prefix for the logged warning message.
 * @return mixed|null The callback's return value, or null on a tolerated failure.
 */
private function invokeLenient( $callback, $warning ) {
    try {
        return $callback();
    } catch ( SuppressedDataException $ex ) {
        // Must stay ahead of the clause below so suppressed data is never logged.
        return null;
    } catch ( MWException | RuntimeException | InvalidArgumentException $ex ) {
        MWDebug::warning( $warning . ': ' . $ex->getMessage() );
        return null;
    }
}
329
/**
 * Produce a "<revision>" section for the given revision row.
 *
 * Output order: id, parentid (if any), timestamp, contributor, minor flag,
 * comment, the main slot, all other slots sorted by role, and sha1. The
 * XmlDumpWriterWriteRevision hook runs before the closing tag is appended.
 *
 * @param stdClass $row Revision row, passed on to the revision store and the hook.
 * @param null|stdClass[] $slotRows Slot rows passed on to the revision store.
 * @return string
 */
public function writeRevision( $row, $slotRows = null ) {
    $revision = $this->getRevisionStore()->newRevisionFromRowAndSlots(
        $row,
        $slotRows,
        0,
        $this->currentTitle
    );
    $revId = $revision->getId();

    $out = " <revision>\n";
    $out .= " " . Xml::element( 'id', null, strval( $revId ) ) . "\n";

    $parentId = $revision->getParentId();
    if ( $parentId ) {
        $out .= " " . Xml::element( 'parentid', null, strval( $parentId ) ) . "\n";
    }

    $out .= $this->writeTimestamp( $revision->getTimestamp() );

    if ( $revision->isDeleted( RevisionRecord::DELETED_USER ) ) {
        $out .= " " . Xml::element( 'contributor', [ 'deleted' => 'deleted' ] ) . "\n";
    } else {
        // empty values get written out as uid 0, see T224221
        $author = $revision->getUser();
        $authorId = 0;
        $authorName = '';
        if ( $author ) {
            $authorId = $author->getId();
            $authorName = $author->getName();
        }
        $out .= $this->writeContributor( $authorId, $authorName );
    }

    if ( $revision->isMinor() ) {
        $out .= " <minor/>\n";
    }

    if ( $revision->isDeleted( RevisionRecord::DELETED_COMMENT ) ) {
        $out .= " " . Xml::element( 'comment', [ 'deleted' => 'deleted' ] ) . "\n";
    } else {
        // The comment is only read when it is not flagged as deleted.
        $summary = $revision->getComment()->text;
        if ( $summary != '' ) {
            $out .= " "
                . Xml::elementClean( 'comment', [], strval( $summary ) )
                . "\n";
        }
    }

    // Deleted text always downgrades to a "deleted" stub, whatever mode
    // the writer was constructed with.
    $textIsDeleted = $revision->isDeleted( RevisionRecord::DELETED_TEXT );
    $effectiveMode = $textIsDeleted ? self::WRITE_STUB_DELETED : $this->contentMode;

    $slots = $revision->getSlots()->getSlots();

    // use predictable order, put main slot first
    ksort( $slots );
    $out .= $this->writeSlot( $slots[SlotRecord::MAIN], $effectiveMode );

    foreach ( $slots as $role => $slot ) {
        if ( $role !== SlotRecord::MAIN ) {
            $out .= $this->writeSlot( $slot, $effectiveMode );
        }
    }

    if ( $textIsDeleted ) {
        $out .= " <sha1/>\n";
    } else {
        $sha1 = $this->invokeLenient(
            function () use ( $revision ) {
                return $revision->getSha1();
            },
            'failed to determine sha1 for revision ' . $revId
        );
        $out .= " " . Xml::element( 'sha1', null, strval( $sha1 ) ) . "\n";
    }

    // Avoid PHP 7.1 warning from passing $this by reference
    $writer = $this;

    // The hook receives the serialized main slot text, which is only
    // loaded when full content is being written.
    $text = '';
    if ( $effectiveMode === self::WRITE_CONTENT ) {
        $mainContent = $this->invokeLenient(
            function () use ( $revision ) {
                return $revision->getContent( SlotRecord::MAIN, RevisionRecord::RAW );
            },
            'Failed to load main slot content of revision ' . $revId
        );

        if ( $mainContent ) {
            $text = $mainContent->serialize();
        }
    }
    $this->hookRunner->onXmlDumpWriterWriteRevision( $writer, $out, $row, $text, $revision );

    $out .= " </revision>\n";

    return $out;
}
430
/**
 * Produce the XML for a single slot of a revision.
 *
 * The main slot is written inline; any other slot is wrapped in a
 * "<content>" element with its "<role>" and is skipped entirely for schema
 * versions older than 0.11. Depending on $contentMode the "<text>" element
 * carries the full content, a "deleted" placeholder, or a stub pointing at
 * the stored content.
 *
 * @param SlotRecord $slot
 * @param int $contentMode One of the WRITE_* constants, including
 *   WRITE_STUB_DELETED.
 * @return string
 */
private function writeSlot( SlotRecord $slot, $contentMode ) {
    $isMain = $slot->getRole() === SlotRecord::MAIN;
    $isV11 = $this->schemaVersion >= XML_DUMP_SCHEMA_VERSION_11;

    // ignore extra slots
    if ( !$isMain && !$isV11 ) {
        return '';
    }

    $out = '';
    $indent = ' ';

    if ( !$isMain ) {
        // non-main slots are wrapped into an additional element.
        $out .= ' ' . Xml::openElement( 'content' ) . "\n";
        $indent .= ' ';
        $out .= $indent . Xml::element( 'role', null, strval( $slot->getRole() ) ) . "\n";
    }

    if ( $isV11 ) {
        $out .= $indent . Xml::element( 'origin', null, strval( $slot->getOrigin() ) ) . "\n";
    }

    $model = $slot->getModel();
    $handler = MediaWikiServices::getInstance()
        ->getContentHandlerFactory()
        ->getContentHandler( $model );
    $format = $handler->getDefaultFormat();

    // XXX: The content format is only relevant when actually outputting serialized content.
    // It should probably be an attribute on the text tag.
    $out .= $indent . Xml::element( 'model', null, strval( $model ) ) . "\n";
    $out .= $indent . Xml::element( 'format', null, strval( $format ) ) . "\n";

    // A size that cannot be determined (or is 0) is written as '0'.
    $size = $this->invokeLenient(
        function () use ( $slot ) {
            return $slot->getSize();
        },
        'failed to determine size for slot ' . $slot->getRole() . ' of revision '
            . $slot->getRevision()
    );
    $textAttributes = [ 'bytes' => $size ?: '0' ];

    if ( $isV11 ) {
        $slotSha1 = $this->invokeLenient(
            function () use ( $slot ) {
                return $slot->getSha1();
            },
            'failed to determine sha1 for slot ' . $slot->getRole() . ' of revision '
                . $slot->getRevision()
        );
        $textAttributes['sha1'] = $slotSha1 ?: '';
    }

    // Stays null unless writeText() supplies the element; every other path
    // ends in an empty <text> element built from $textAttributes below.
    $textXml = null;

    if ( $contentMode === self::WRITE_CONTENT ) {
        $content = $this->invokeLenient(
            function () use ( $slot ) {
                return $slot->getContent();
            },
            'failed to load content for slot ' . $slot->getRole() . ' of revision '
                . $slot->getRevision()
        );

        if ( $content !== null ) {
            $textXml = $this->writeText( $content, $textAttributes, $indent );
        }
    } elseif ( $contentMode === self::WRITE_STUB_DELETED ) {
        // write <text> placeholder tag
        $textAttributes['deleted'] = 'deleted';
    } else {
        // write <text> stub tag
        if ( $isV11 ) {
            $textAttributes['location'] = $slot->getAddress();
        }

        if ( $isMain ) {
            // Output the numerical text ID if possible, for backwards compatibility.
            // Note that this is currently the ONLY reason we have a BlobStore here at all.
            // When removing this line, check whether the BlobStore has become unused.
            try {
                // NOTE: this will only work for addresses of the form "tt:12345".
                // If we want to support other kinds of addresses in the future,
                // we will have to silently ignore failures here.
                // For now, this fails for "tt:0", which is present in the WMF production
                // database as of July 2019, due to data corruption.
                $textId = $this->getBlobStore()->getTextIdFromAddress( $slot->getAddress() );
            } catch ( InvalidArgumentException $ex ) {
                MWDebug::warning( 'Bad content address for slot ' . $slot->getRole()
                    . ' of revision ' . $slot->getRevision() . ': ' . $ex->getMessage() );
                $textId = 0;
            }

            if ( is_int( $textId ) ) {
                $textAttributes['id'] = $textId;
            }
        }
    }

    if ( $textXml === null ) {
        $textXml = $indent . Xml::element( 'text', $textAttributes ) . "\n";
    }
    $out .= $textXml;

    if ( !$isMain ) {
        $out .= ' ' . Xml::closeElement( 'content' ) . "\n";
    }

    return $out;
}
546
/**
 * Produce a "<text>" element holding the export form of the given content.
 *
 * The data is passed through the content handler's exportTransform() using
 * the handler's default format. The 'bytes' attribute is overwritten with
 * the length of the data actually written, and 'xml:space' is set to
 * 'preserve'.
 *
 * @param Content $content
 * @param string[] $textAttributes Attributes for the element; 'bytes' and
 *   'xml:space' are replaced here.
 * @param string $indent Prefix placed in front of the element.
 * @return string
 */
private function writeText( Content $content, $textAttributes, $indent ) {
    $contentHandler = $content->getContentHandler();
    $contentFormat = $contentHandler->getDefaultFormat();

    if ( $content instanceof TextContent ) {
        // HACK: For text based models, bypass the serialization step. This allows extensions (like Flow)
        // that use incompatible combinations of serialization format and content model.
        $data = $content->getText();
    } else {
        $data = $content->serialize( $contentFormat );
    }

    $data = $contentHandler->exportTransform( $data, $contentFormat );

    // make sure to use the actual size
    $textAttributes['bytes'] = strlen( $data );
    $textAttributes['xml:space'] = 'preserve';

    return $indent . Xml::elementClean( 'text', $textAttributes, strval( $data ) ) . "\n";
}
575
/**
 * Produce a "<logitem>" section from a logging table row.
 *
 * Contributor, comment and action details are each replaced by an element
 * carrying deleted="deleted" when the matching LogPage::DELETED_* bit is
 * set in log_deleted.
 *
 * @param stdClass $row Log row; log_id, log_timestamp, log_deleted, log_user,
 *   user_name, log_type, log_action, log_namespace, log_title and log_params
 *   are read here, and the row is handed to CommentStore for the comment.
 * @return string
 */
public function writeLogItem( $row ) {
    $hiddenAttribs = [ 'deleted' => 'deleted' ];

    $out = " <logitem>\n";
    $out .= " " . Xml::element( 'id', null, strval( $row->log_id ) ) . "\n";

    $out .= $this->writeTimestamp( $row->log_timestamp, " " );

    $deletedBits = $row->log_deleted;

    if ( $deletedBits & LogPage::DELETED_USER ) {
        $out .= " " . Xml::element( 'contributor', $hiddenAttribs ) . "\n";
    } else {
        $out .= $this->writeContributor( $row->log_user, $row->user_name, " " );
    }

    if ( $deletedBits & LogPage::DELETED_COMMENT ) {
        $out .= " " . Xml::element( 'comment', $hiddenAttribs ) . "\n";
    } else {
        $commentText = CommentStore::getStore()->getComment( 'log_comment', $row )->text;
        // An empty comment produces no element at all.
        if ( $commentText != '' ) {
            $out .= " " . Xml::elementClean( 'comment', null, strval( $commentText ) ) . "\n";
        }
    }

    $out .= " " . Xml::element( 'type', null, strval( $row->log_type ) ) . "\n";
    $out .= " " . Xml::element( 'action', null, strval( $row->log_action ) ) . "\n";

    if ( $deletedBits & LogPage::DELETED_ACTION ) {
        $out .= " " . Xml::element( 'text', $hiddenAttribs ) . "\n";
    } else {
        $target = Title::makeTitle( $row->log_namespace, $row->log_title );
        $out .= " " . Xml::elementClean( 'logtitle', null, self::canonicalTitle( $target ) ) . "\n";
        $out .= " " . Xml::elementClean(
            'params',
            [ 'xml:space' => 'preserve' ],
            strval( $row->log_params )
        ) . "\n";
    }

    $out .= " </logitem>\n";

    return $out;
}
621
/**
 * Produce a "<timestamp>" element with the timestamp in ISO 8601 form.
 *
 * @param string $timestamp Timestamp in a format accepted by wfTimestamp().
 * @param string $indent Prefix placed in front of the element.
 * @return string
 */
public function writeTimestamp( $timestamp, $indent = " " ) {
    $isoTimestamp = wfTimestamp( TS_ISO_8601, $timestamp );
    $element = Xml::element( 'timestamp', null, $isoTimestamp );

    return $indent . $element . "\n";
}
631
/**
 * Produce a "<contributor>" section.
 *
 * An "<ip>" child is written only when $id is empty and $text is a valid
 * IP address; in every other case "<username>" and "<id>" are written.
 *
 * @param int $id User id; a falsy value marks a contributor without an account.
 * @param string $text User name or IP address.
 * @param string $indent Prefix placed in front of every line.
 * @return string
 */
public function writeContributor( $id, $text, $indent = " " ) {
    // IPUtils::isValid() is only consulted when there is no user id.
    $isAnonymousIp = !$id && IPUtils::isValid( $text );

    $out = $indent . "<contributor>\n";
    if ( $isAnonymousIp ) {
        $out .= $indent . " " . Xml::elementClean( 'ip', null, strval( $text ) ) . "\n";
    } else {
        $out .= $indent . " " . Xml::elementClean( 'username', null, strval( $text ) ) . "\n";
        $out .= $indent . " " . Xml::element( 'id', null, strval( $id ) ) . "\n";
    }
    $out .= $indent . "</contributor>\n";

    return $out;
}
649
/**
 * Produce "<upload>" sections for every version of the file behind a
 * File-namespace page row: the history in reversed order first, then the
 * current version.
 *
 * @param stdClass $row Page row; page_namespace and page_title are read here.
 * @param bool $dumpContents Passed through to writeUpload() for every version.
 * @return string Empty when the row is not in NS_FILE or the file does not exist.
 */
public function writeUploads( $row, $dumpContents = false ) {
    if ( $row->page_namespace != NS_FILE ) {
        return '';
    }

    $localRepo = MediaWikiServices::getInstance()->getRepoGroup()->getLocalRepo();
    $currentFile = $localRepo->newFile( $row->page_title );
    if ( !$currentFile || !$currentFile->exists() ) {
        return '';
    }

    $xml = '';
    foreach ( array_reverse( $currentFile->getHistory() ) as $oldVersion ) {
        $xml .= $this->writeUpload( $oldVersion, $dumpContents );
    }

    return $xml . $this->writeUpload( $currentFile, $dumpContents );
}
671
/**
 * Produce an "<upload>" section for one version of a file.
 *
 * @param File $file Version to describe; when isOld() is true an
 *   "<archivename>" element is included.
 * @param bool $dumpContents Whether to embed the file contents, base64
 *   encoded, in a "<contents>" element.
 * @return string
 */
private function writeUpload( $file, $dumpContents = false ) {
    $archiveName = '';
    if ( $file->isOld() ) {
        '@phan-var OldLocalFile $file';
        $archiveName = " " .
            Xml::element( 'archivename', null, $file->getArchiveName() ) . "\n";
    }

    $contents = '';
    if ( $dumpContents ) {
        $be = $file->getRepo()->getBackend();
        $raw = $be->getFileContents( [ 'src' => $file->getPath() ] );
        if ( $raw === false ) {
            // A failed read used to be encoded silently as an empty payload.
            // The XML is unchanged, but the failure is now reported in the
            // same way invokeLenient() reports storage problems.
            MWDebug::warning( 'Failed to read contents of file ' . $file->getName() );
            $raw = '';
        }
        # Dump file as base64
        # Uses only XML-safe characters, so does not need escaping
        # @todo Too bad this loads the contents into memory (script might swap)
        $contents = ' <contents encoding="base64">' .
            chunk_split( base64_encode( $raw ) ) .
            " </contents>\n";
    }

    if ( $file->isDeleted( File::DELETED_COMMENT ) ) {
        $comment = Xml::element( 'comment', [ 'deleted' => 'deleted' ] );
    } else {
        $comment = Xml::elementClean( 'comment', null, strval( $file->getDescription() ) );
    }

    return " <upload>\n" .
        $this->writeTimestamp( $file->getTimestamp() ) .
        $this->writeContributor( $file->getUser( 'id' ), $file->getUser( 'text' ) ) .
        " " . $comment . "\n" .
        " " . Xml::element( 'filename', null, $file->getName() ) . "\n" .
        $archiveName .
        " " . Xml::element( 'src', null, $file->getCanonicalUrl() ) . "\n" .
        " " . Xml::element( 'size', null, $file->getSize() ) . "\n" .
        " " . Xml::element( 'sha1base36', null, $file->getSha1() ) . "\n" .
        " " . Xml::element( 'rel', null, $file->getRel() ) . "\n" .
        $contents .
        " </upload>\n";
}
716
/**
 * Return the prefixed text form of a title, using the namespace name
 * supplied by the content language.
 *
 * Titles for which isExternal() is true are returned through
 * getPrefixedText() unchanged.
 *
 * @param Title $title
 * @return string
 */
public static function canonicalTitle( Title $title ) {
    if ( $title->isExternal() ) {
        return $title->getPrefixedText();
    }

    $contentLanguage = MediaWikiServices::getInstance()->getContentLanguage();
    $nsText = $contentLanguage->getFormattedNsText( $title->getNamespace() );

    // @todo Emit some kind of warning to the user if $title->getNamespace() !==
    // NS_MAIN and $nsText === '' (viz. pages in an unregistered namespace)

    // An empty namespace name gets no ':' separator.
    $prefix = $nsText === '' ? '' : $nsText . ':';

    return $prefix . $title->getText();
}
744}
$wgCapitalLinks
Set this to false to avoid forcing the first letter of links to capitals.
$wgDBname
Current wiki database name.
$wgSitename
Name of the site.
const MW_VERSION
The running version of MediaWiki.
Definition Defines.php:40
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
const DELETED_USER
Definition LogPage.php:40
const DELETED_COMMENT
Definition LogPage.php:39
const DELETED_ACTION
Definition LogPage.php:38
MediaWiki exception.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
MediaWikiServices is the service locator for the application scope of MediaWiki.
Page revision base class.
Service for looking up page revisions.
Value object representing a content slot associated with a page revision.
getContent()
Returns the Content of the given slot.
getRole()
Returns the role of the slot.
getSha1()
Returns the SHA-1 hash of the slot's content.
getSize()
Returns the content size.
getAddress()
Returns the address of this slot's content.
getModel()
Returns the content model.
getOrigin()
Returns the revision ID of the revision that originated the slot's content.
getRevision()
Returns the ID of the revision this slot is associated with.
Exception raised in response to an audience check when attempting to access suppressed information wi...
Service for storing and loading Content objects.
Content object implementation for representing flat text.
Represents a title within MediaWiki.
Definition Title.php:42
isValidRedirectTarget()
Check if this Title is a valid redirect target.
Definition Title.php:4343
closeStream()
Closes the output stream with the closing root element.
__construct( $contentMode=self::WRITE_CONTENT, $schemaVersion=XML_DUMP_SCHEMA_VERSION_11)
static string[] $supportedSchemas
the schema versions supported for output @final
const WRITE_STUB_DELETED
Only output stubs for revision content, indicating that the content has been deleted/suppressed.
static canonicalTitle(Title $title)
Return prefixed text form of title, but using the content language's canonical namespace.
int $contentMode
Whether to output revision content or just stubs.
const WRITE_STUB
Only output stubs for revision content.
string $schemaVersion
which schema version the generated XML should comply to.
writeUpload( $file, $dumpContents=false)
invokeLenient( $callback, $warning)
Invokes the given callback, catching and logging any storage related exceptions.
HookRunner $hookRunner
writeLogItem( $row)
Dumps a "<logitem>" section on the output stream, with data filled in from the given database row.
writeTimestamp( $timestamp, $indent=" ")
const WRITE_CONTENT
Output serialized revision content.
writeUploads( $row, $dumpContents=false)
Warning! This data is potentially inconsistent.
closePage()
Closes a "<page>" section on the output stream.
openStream()
Opens the XML output stream's root "<mediawiki>" element.
writeRevision( $row, $slotRows=null)
Dumps a "<revision>" section on the output stream, with data filled in from the given database row.
openPage( $row)
Opens a "<page>" section on the output stream, with data from the given database row.
writeText(Content $content, $textAttributes, $indent)
Title null $currentTitle
Title of the currently processed page.
writeSlot(SlotRecord $slot, $contentMode)
writeContributor( $id, $text, $indent=" ")
const NS_FILE
Definition Defines.php:76
const XML_DUMP_SCHEMA_VERSION_11
Definition Defines.php:319
const XML_DUMP_SCHEMA_VERSION_10
Definition Defines.php:318
Base interface for content objects.
Definition Content.php:35
$content
Definition router.php:76
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Item class for a filearchive table row.
Definition router.php:42