MediaWiki  master
XmlDumpWriter.php
Go to the documentation of this file.
1 <?php
28 
/**
 * The schema versions supported for output.
 *
 * NOTE(review): the entry below was missing from the visible listing, which
 * left this list empty; XML_DUMP_SCHEMA_VERSION_10 is the schema constant
 * referenced by this file - confirm against Defines.php.
 *
 * @var string[]
 */
public static $supportedSchemas = [
	XML_DUMP_SCHEMA_VERSION_10,
];
40 
/**
 * Title of the currently processed page.
 *
 * Set by openPage() and read by closePage() and writeRevision().
 *
 * @var Title|null
 */
private $currentTitle = null;
47 
/**
 * Opens the XML output stream's root "<mediawiki>" element and appends
 * the <siteinfo> block produced by siteInfo().
 *
 * The export schema version is interpolated into the namespace URI, the
 * schema location and the "version" attribute.
 *
 * @return string
 */
function openStream() {
	// $ver must be defined before it is interpolated below; it is the default
	// export schema version.
	// NOTE(review): assumes the accessor lives on WikiExporter - confirm.
	$ver = WikiExporter::schemaVersion();
	return Xml::element( 'mediawiki', [
		'xmlns' => "http://www.mediawiki.org/xml/export-$ver/",
		'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance",
		/*
		 * When a new version of the schema is created, it needs staging on mediawiki.org.
		 * This requires a change in the operations/mediawiki-config git repo.
		 *
		 * Create a changeset like https://gerrit.wikimedia.org/r/#/c/149643/ in which
		 * you copy in the new xsd file.
		 *
		 * After it is reviewed, merged and deployed (sync-docroot), the index.html needs purging.
		 * echo "https://www.mediawiki.org/xml/index.html" | mwscript purgeList.php --wiki=aawiki
		 */
		'xsi:schemaLocation' => "http://www.mediawiki.org/xml/export-$ver/ " .
			"http://www.mediawiki.org/xml/export-$ver.xsd",
		'version' => $ver,
		'xml:lang' => MediaWikiServices::getInstance()->getContentLanguage()->getHtmlCode() ],
		null ) .
		"\n" .
		$this->siteInfo();
}
81 
/**
 * Builds the <siteinfo> block from the individual site-level elements
 * (site name, database name, base URL, generator, case setting, namespaces).
 *
 * @return string
 */
function siteInfo() {
	$opening = " <siteinfo>\n ";
	$closing = "\n </siteinfo>\n";

	// Each entry is one already-serialized child element.
	$children = implode( "\n ", [
		$this->sitename(),
		$this->dbname(),
		$this->homelink(),
		$this->generator(),
		$this->caseSetting(),
		$this->namespaces(),
	] );

	return $opening . $children . $closing;
}
97 
/**
 * Returns a <sitename> element holding the value of $wgSitename.
 *
 * @return string
 */
function sitename() {
	global $wgSitename;

	$noAttribs = [];
	return Xml::element( 'sitename', $noAttribs, $wgSitename );
}
105 
/**
 * Returns a <dbname> element holding the value of $wgDBname.
 *
 * @return string
 */
function dbname() {
	global $wgDBname;

	$noAttribs = [];
	return Xml::element( 'dbname', $noAttribs, $wgDBname );
}
113 
/**
 * Returns a <generator> element naming MediaWiki and the value of $wgVersion.
 *
 * @return string
 */
function generator() {
	global $wgVersion;

	$label = "MediaWiki $wgVersion";
	return Xml::element( 'generator', [], $label );
}
121 
/**
 * Returns a <base> element holding the canonical URL of the main page.
 *
 * @return string
 */
function homelink() {
	$mainPage = Title::newMainPage();
	$canonicalUrl = $mainPage->getCanonicalURL();

	return Xml::element( 'base', [], $canonicalUrl );
}
128 
/**
 * Returns a <case> element describing the wiki-wide title case setting,
 * derived from $wgCapitalLinks.
 *
 * @return string
 */
function caseSetting() {
	global $wgCapitalLinks;

	// "case-insensitive" option is reserved for future
	if ( $wgCapitalLinks ) {
		$mode = 'first-letter';
	} else {
		$mode = 'case-sensitive';
	}

	return Xml::element( 'case', [], $mode );
}
138 
/**
 * Builds the <namespaces> block: one <namespace> element per formatted
 * namespace of the content language, carrying its key and case setting.
 *
 * @return string
 */
function namespaces() {
	$services = MediaWikiServices::getInstance();
	$nsInfo = $services->getNamespaceInfo();
	$formatted = $services->getContentLanguage()->getFormattedNamespaces();

	$xml = "<namespaces>\n";
	foreach ( $formatted as $index => $name ) {
		$case = $nsInfo->isCapitalized( $index ) ? 'first-letter' : 'case-sensitive';
		$attribs = [
			'key' => $index,
			'case' => $case,
		];
		$xml .= ' ' . Xml::element( 'namespace', $attribs, $name ) . "\n";
	}

	return $xml . " </namespaces>";
}
160 
/**
 * Closes the output stream with the closing root element.
 *
 * @return string
 */
function closeStream() {
	$closingTag = "</mediawiki>\n";
	return $closingTag;
}
170 
/**
 * Opens a "<page>" section on the output stream, with data from the
 * given database row.
 *
 * Also records the page's Title in $this->currentTitle and lets the
 * XmlDumpWriterOpenPage hook modify the generated fragment.
 *
 * @param object $row Row providing page_namespace, page_title, page_id,
 *   page_is_redirect and page_restrictions
 * @return string
 */
public function openPage( $row ) {
	$this->currentTitle = Title::makeTitle( $row->page_namespace, $row->page_title );

	$out = " <page>\n"
		. ' ' . Xml::elementClean( 'title', [], self::canonicalTitle( $this->currentTitle ) ) . "\n"
		. ' ' . Xml::element( 'ns', [], strval( $row->page_namespace ) ) . "\n"
		. ' ' . Xml::element( 'id', [], strval( $row->page_id ) ) . "\n";

	if ( $row->page_is_redirect ) {
		$target = WikiPage::factory( $this->currentTitle )->getRedirectTarget();
		// Only emit a <redirect> element for a usable target.
		if ( $target instanceof Title && $target->isValidRedirectTarget() ) {
			$attribs = [ 'title' => self::canonicalTitle( $target ) ];
			$out .= ' ' . Xml::element( 'redirect', $attribs ) . "\n";
		}
	}

	if ( $row->page_restrictions != '' ) {
		$restrictions = strval( $row->page_restrictions );
		$out .= ' ' . Xml::element( 'restrictions', [], $restrictions ) . "\n";
	}

	Hooks::run( 'XmlDumpWriterOpenPage', [ $this, &$out, $row, $this->currentTitle ] );

	return $out;
}
204 
/**
 * Closes a "<page>" section on the output stream.
 *
 * If a page is currently open, its entry is cleared from the link cache first.
 *
 * @return string
 */
function closePage() {
	$title = $this->currentTitle;
	if ( $title !== null ) {
		// In rare cases, link cache has the same key for some pages which
		// might be read as part of the same batch. T220424 and T220316
		MediaWikiServices::getInstance()->getLinkCache()->clearLink( $title );
	}

	return " </page>\n";
}
220 
/**
 * Fetches the revision store from the global service container.
 *
 * @return mixed Whatever MediaWikiServices::getRevisionStore() returns
 */
private function getRevisionStore() {
	$services = MediaWikiServices::getInstance();
	return $services->getRevisionStore();
}
227 
/**
 * Fetches the blob store from the global service container.
 *
 * @return mixed Whatever MediaWikiServices::getBlobStore() returns
 */
private function getBlobStore() {
	$services = MediaWikiServices::getInstance();
	return $services->getBlobStore();
}
234 
/**
 * Dumps a "<revision>" section on the output stream, with data filled in
 * from the given database row.
 *
 * The <text> element depends on what the row carries:
 *  - text flagged as deleted: an empty element marked deleted="deleted";
 *  - old_text present: the row's text, passed through exportTransform();
 *  - _load_content present: the main slot's content is loaded and exported;
 *  - rev_text_id present: a stub referencing the text id (pre-MCR schema);
 *  - otherwise: a stub whose id is resolved from the main slot's address.
 *
 * @param object $row Revision row. rev_parent_id, rev_deleted,
 *   rev_minor_edit, rev_content_model, rev_content_format, rev_sha1,
 *   old_text, _load_content and rev_text_id are all optional.
 * @return string
 * @throws Exception Rethrown from content loading/transformation when it is
 *   neither an MWException nor a RuntimeException
 */
function writeRevision( $row ) {
	// rev_deleted is optional on the row. Normalise it once so every
	// visibility check below - including the sha1 one, which used to read
	// the property unguarded - behaves the same when it is absent.
	$deleted = isset( $row->rev_deleted ) ? $row->rev_deleted : 0;

	$out = " <revision>\n";
	$out .= " " . Xml::element( 'id', null, strval( $row->rev_id ) ) . "\n";
	if ( isset( $row->rev_parent_id ) && $row->rev_parent_id ) {
		$out .= " " . Xml::element( 'parentid', null, strval( $row->rev_parent_id ) ) . "\n";
	}

	$out .= $this->writeTimestamp( $row->rev_timestamp );

	if ( $deleted & Revision::DELETED_USER ) {
		$out .= " " . Xml::element( 'contributor', [ 'deleted' => 'deleted' ] ) . "\n";
	} else {
		// empty values get written out as uid 0, see T224221
		$out .= $this->writeContributor( $row->rev_user ?: 0, $row->rev_user_text );
	}

	if ( isset( $row->rev_minor_edit ) && $row->rev_minor_edit ) {
		$out .= " <minor/>\n";
	}
	if ( $deleted & Revision::DELETED_COMMENT ) {
		$out .= " " . Xml::element( 'comment', [ 'deleted' => 'deleted' ] ) . "\n";
	} else {
		$comment = CommentStore::getStore()->getComment( 'rev_comment', $row )->text;
		if ( $comment != '' ) {
			$out .= " " . Xml::elementClean( 'comment', [], strval( $comment ) ) . "\n";
		}
	}

	// TODO: rev_content_model no longer exists with MCR, see T174031
	// (isset() is already false for null, so no separate null check is needed)
	if ( isset( $row->rev_content_model ) ) {
		$content_model = strval( $row->rev_content_model );
	} else {
		// probably using $wgContentHandlerUseDB = false;
		$content_model = ContentHandler::getDefaultModelFor( $this->currentTitle );
	}

	$content_handler = ContentHandler::getForModelID( $content_model );

	// TODO: rev_content_format no longer exists with MCR, see T174031
	if ( isset( $row->rev_content_format ) ) {
		$content_format = strval( $row->rev_content_format );
	} else {
		// probably using $wgContentHandlerUseDB = false;
		$content_format = $content_handler->getDefaultFormat();
	}

	$out .= " " . Xml::element( 'model', null, strval( $content_model ) ) . "\n";
	$out .= " " . Xml::element( 'format', null, strval( $content_format ) ) . "\n";

	// $text stays empty for the deleted and stub cases; it is handed to the
	// XmlDumpWriterWriteRevision hook at the end.
	$text = '';
	if ( $deleted & Revision::DELETED_TEXT ) {
		$out .= " " . Xml::element( 'text', [ 'deleted' => 'deleted' ] ) . "\n";
	} elseif ( isset( $row->old_text ) ) {
		// Raw text from the database may have invalid chars
		$text = strval( Revision::getRevisionText( $row ) );
		try {
			$text = $content_handler->exportTransform( $text, $content_format );
		} catch ( Exception $ex ) {
			if ( $ex instanceof MWException || $ex instanceof RuntimeException ) {
				// leave text as is; that's the way it goes
				wfLogWarning( 'exportTransform failed on text for revid ' . $row->rev_id . "\n" );
			} else {
				throw $ex;
			}
		}
		$out .= " " . Xml::elementClean( 'text',
			[ 'xml:space' => 'preserve', 'bytes' => intval( $row->rev_len ) ],
			strval( $text ) ) . "\n";
	} elseif ( isset( $row->_load_content ) ) {
		// TODO: make this fully MCR aware, see T174031
		$rev = $this->getRevisionStore()->newRevisionFromRow( $row, 0, $this->currentTitle );
		$slot = $rev->getSlot( 'main' );
		try {
			$content = $slot->getContent();

			if ( $content instanceof TextContent ) {
				// HACK: For text based models, bypass the serialization step.
				// This allows extensions (like Flow) that use incompatible combinations
				// of serialization format and content model.
				$text = $content->getNativeData();
			} else {
				$text = $content->serialize( $content_format );
			}
			$text = $content_handler->exportTransform( $text, $content_format );
			$out .= " " . Xml::elementClean( 'text',
				[ 'xml:space' => 'preserve', 'bytes' => intval( $slot->getSize() ) ],
				strval( $text ) ) . "\n";
		} catch ( Exception $ex ) {
			if ( $ex instanceof MWException || $ex instanceof RuntimeException ) {
				// there's no provision in the schema for an attribute that will let
				// the user know this element was unavailable due to error; an empty
				// tag is the best we can do
				$out .= " " . Xml::element( 'text' ) . "\n";
				wfLogWarning( 'failed to load content for revid ' . $row->rev_id . "\n" );
			} else {
				throw $ex;
			}
		}
	} elseif ( isset( $row->rev_text_id ) ) {
		// Stub output for pre-MCR schema
		// TODO: MCR: rev_text_id only exists in the pre-MCR schema. Remove this when
		// we drop support for the old schema.
		$out .= " " . Xml::element( 'text',
			[ 'id' => $row->rev_text_id, 'bytes' => intval( $row->rev_len ) ],
			"" ) . "\n";
	} else {
		// Backwards-compatible stub output for MCR aware schema
		// TODO: MCR: emit content addresses instead of text ids, see T174031, T199121
		$rev = $this->getRevisionStore()->newRevisionFromRow( $row, 0, $this->currentTitle );
		$slot = $rev->getSlot( 'main' );

		// Note that this is currently the ONLY reason we have a BlobStore here at all.
		// When removing this line, check whether the BlobStore has become unused.
		$textId = $this->getBlobStore()->getTextIdFromAddress( $slot->getAddress() );
		$out .= " " . Xml::element( 'text',
			[ 'id' => $textId, 'bytes' => intval( $slot->getSize() ) ],
			"" ) . "\n";
	}

	// The hash is only published when present and the text is not deleted.
	if ( isset( $row->rev_sha1 )
		&& $row->rev_sha1
		&& !( $deleted & Revision::DELETED_TEXT )
	) {
		$out .= " " . Xml::element( 'sha1', null, strval( $row->rev_sha1 ) ) . "\n";
	} else {
		$out .= " <sha1/>\n";
	}

	// Avoid PHP 7.1 warning from passing $this by reference
	$writer = $this;
	Hooks::run( 'XmlDumpWriterWriteRevision', [ &$writer, &$out, $row, $text ] );

	$out .= " </revision>\n";

	return $out;
}
381 
/**
 * Dumps a "<logitem>" section on the output stream, with data filled in
 * from the given database row.
 *
 * Contributor, comment and action details are each replaced by an element
 * marked deleted="deleted" when the matching LogPage::DELETED_* bit is set
 * in log_deleted.
 *
 * @param object $row Log row providing log_id, log_timestamp, log_deleted,
 *   log_user, user_name, log_type, log_action, log_namespace, log_title,
 *   log_params and the log comment fields
 * @return string
 */
function writeLogItem( $row ) {
	$deletedMarker = [ 'deleted' => 'deleted' ];

	$xml = " <logitem>\n";
	$xml .= " " . Xml::element( 'id', null, strval( $row->log_id ) ) . "\n";

	$xml .= $this->writeTimestamp( $row->log_timestamp, " " );

	$xml .= ( $row->log_deleted & LogPage::DELETED_USER )
		? " " . Xml::element( 'contributor', $deletedMarker ) . "\n"
		: $this->writeContributor( $row->log_user, $row->user_name, " " );

	if ( !( $row->log_deleted & LogPage::DELETED_COMMENT ) ) {
		$commentText = CommentStore::getStore()->getComment( 'log_comment', $row )->text;
		// An empty comment produces no element at all.
		if ( $commentText != '' ) {
			$xml .= " " . Xml::elementClean( 'comment', null, strval( $commentText ) ) . "\n";
		}
	} else {
		$xml .= " " . Xml::element( 'comment', $deletedMarker ) . "\n";
	}

	$xml .= " " . Xml::element( 'type', null, strval( $row->log_type ) ) . "\n";
	$xml .= " " . Xml::element( 'action', null, strval( $row->log_action ) ) . "\n";

	if ( !( $row->log_deleted & LogPage::DELETED_ACTION ) ) {
		$target = Title::makeTitle( $row->log_namespace, $row->log_title );
		$xml .= " " . Xml::elementClean( 'logtitle', null, self::canonicalTitle( $target ) ) . "\n";
		$xml .= " " . Xml::elementClean( 'params',
			[ 'xml:space' => 'preserve' ],
			strval( $row->log_params ) ) . "\n";
	} else {
		$xml .= " " . Xml::element( 'text', $deletedMarker ) . "\n";
	}

	return $xml . " </logitem>\n";
}
428 
/**
 * Formats a <timestamp> element holding the given timestamp converted
 * to TS_ISO_8601, prefixed by $indent and followed by a newline.
 *
 * @param mixed $timestamp Timestamp in any format accepted by wfTimestamp()
 * @param string $indent Prefix placed before the element
 * @return string
 */
function writeTimestamp( $timestamp, $indent = " " ) {
	$iso = wfTimestamp( TS_ISO_8601, $timestamp );
	$element = Xml::element( 'timestamp', null, $iso );

	return $indent . $element . "\n";
}
438 
/**
 * Formats a <contributor> block.
 *
 * When $id is empty and $text is a valid IP address, the block contains an
 * <ip> element; in every other case it contains <username> and <id>.
 *
 * @param int|string $id User id; an empty value means no registered user
 * @param string $text User name or IP address
 * @param string $indent Prefix placed before each line of the block
 * @return string
 */
function writeContributor( $id, $text, $indent = " " ) {
	$childIndent = $indent . " ";
	$isBareIp = !$id && IP::isValid( $text );

	if ( $isBareIp ) {
		$children = $childIndent . Xml::elementClean( 'ip', null, strval( $text ) ) . "\n";
	} else {
		$children = $childIndent . Xml::elementClean( 'username', null, strval( $text ) ) . "\n";
		$children .= $childIndent . Xml::element( 'id', null, strval( $id ) ) . "\n";
	}

	return $indent . "<contributor>\n" . $children . $indent . "</contributor>\n";
}
456 
/**
 * Writes <upload> sections for a file page: every entry of the file's
 * history in reversed order, followed by the current version.
 *
 * Returns an empty string when the row is not in NS_FILE or the local
 * repo has no existing file for it.
 *
 * @param object $row Row providing page_namespace and page_title
 * @param bool $dumpContents Passed through to writeUpload()
 * @return string
 */
function writeUploads( $row, $dumpContents = false ) {
	if ( $row->page_namespace != NS_FILE ) {
		return '';
	}

	$localRepo = MediaWikiServices::getInstance()->getRepoGroup()->getLocalRepo();
	$current = $localRepo->newFile( $row->page_title );
	if ( !$current || !$current->exists() ) {
		return '';
	}

	$xml = '';
	foreach ( array_reverse( $current->getHistory() ) as $oldVersion ) {
		$xml .= $this->writeUpload( $oldVersion, $dumpContents );
	}

	return $xml . $this->writeUpload( $current, $dumpContents );
}
478 
/**
 * Formats a single <upload> section for one version of a file.
 *
 * @param File $file File version to describe
 * @param bool $dumpContents Whether to embed the file contents, base64
 *   encoded, in a <contents> element
 * @return string
 */
function writeUpload( $file, $dumpContents = false ) {
	// Only old versions carry an <archivename> element.
	$archiveName = '';
	if ( $file->isOld() ) {
		$archiveName = " " .
			Xml::element( 'archivename', null, $file->getArchiveName() ) . "\n";
	}

	$contents = '';
	if ( $dumpContents ) {
		$backend = $file->getRepo()->getBackend();
		# Dump file as base64
		# Uses only XML-safe characters, so does not need escaping
		# @todo Too bad this loads the contents into memory (script might swap)
		$raw = $backend->getFileContents( [ 'src' => $file->getPath() ] );
		$contents = ' <contents encoding="base64">' .
			chunk_split( base64_encode( $raw ) ) .
			" </contents>\n";
	}

	$comment = $file->isDeleted( File::DELETED_COMMENT )
		? Xml::element( 'comment', [ 'deleted' => 'deleted' ] )
		: Xml::elementClean( 'comment', null, strval( $file->getDescription() ) );

	// Pieces are listed in output order and joined without a separator.
	$pieces = [
		" <upload>\n",
		$this->writeTimestamp( $file->getTimestamp() ),
		$this->writeContributor( $file->getUser( 'id' ), $file->getUser( 'text' ) ),
		" " . $comment . "\n",
		" " . Xml::element( 'filename', null, $file->getName() ) . "\n",
		$archiveName,
		" " . Xml::element( 'src', null, $file->getCanonicalUrl() ) . "\n",
		" " . Xml::element( 'size', null, $file->getSize() ) . "\n",
		" " . Xml::element( 'sha1base36', null, $file->getSha1() ) . "\n",
		" " . Xml::element( 'rel', null, $file->getRel() ) . "\n",
		$contents,
		" </upload>\n",
	];

	return implode( '', $pieces );
}
521 
/**
 * Return prefixed text form of title, but using the content language's
 * canonical namespace.
 *
 * Interwiki titles are returned via getPrefixedText() unchanged.
 *
 * @param Title $title
 * @return string
 */
public static function canonicalTitle( Title $title ) {
	if ( $title->isExternal() ) {
		return $title->getPrefixedText();
	}

	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	$nsText = $contLang->getFormattedNsText( $title->getNamespace() );

	// @todo Emit some kind of warning to the user if $title->getNamespace() !==
	// NS_MAIN and $nsText === '' (viz. pages in an unregistered namespace)

	// An empty namespace text gets no colon separator.
	$prefix = ( $nsText !== '' ) ? $nsText . ':' : $nsText;

	return $prefix . $title->getText();
}
549 }
static factory(Title $title)
Create a WikiPage object of the appropriate class for the given title.
Definition: WikiPage.php:138
const DELETED_COMMENT
Definition: File.php:64
$wgVersion
MediaWiki version number.
$wgSitename
Name of the site.
getText()
Get the text form (spaces not underscores) of the main part.
Definition: Title.php:984
static newMainPage(MessageLocalizer $localizer=null)
Create a new Title for the Main Page.
Definition: Title.php:653
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for use
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Definition: router.php:42
const XML_DUMP_SCHEMA_VERSION_10
Definition: Defines.php:324
static getDefaultModelFor(Title $title)
Returns the name of the default content model to be used for the page with the given title...
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency MediaWikiServices
Definition: injection.txt:23
wfLogWarning( $msg, $callerOffset=1, $level=E_USER_WARNING)
Send a warning as a PHP error and the debug log.
closePage()
Closes a "<page>" section on the output stream.
static elementClean( $element, $attribs=[], $contents='')
Format an XML element as with self::element(), but run text through the content language's normalize(...
Definition: Xml.php:90
getPrefixedText()
Get the prefixed title with spaces.
Definition: Title.php:1696
closeStream()
Closes the output stream with the closing root element.
static getRevisionText( $row, $prefix='old_', $wiki=false)
Get revision text associated with an old or archive row.
Definition: Revision.php:1046
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not it can be in the form of< username >< more info > e g for bot passwords intended to be added to log contexts Fields it might only if the login was with a bot password it is not rendered in wiki pages or galleries in category pages allow injecting custom HTML after the section Any uses of the hook need to handle escaping see BaseTemplate::getToolbox and BaseTemplate::makeListItem for details on the format of individual items inside of this array or by returning and letting standard HTTP rendering take place modifiable or by returning false and taking over the output $out
Definition: hooks.txt:780
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
const DELETED_COMMENT
Definition: LogPage.php:35
openPage( $row)
Opens a "<page>" section on the output stream, with data from the given database row.
writeUpload( $file, $dumpContents=false)
writeTimestamp( $timestamp, $indent=" ")
isExternal()
Is this Title interwiki?
Definition: Title.php:901
static getForModelID( $modelId)
Returns the ContentHandler singleton for the given model ID.
$wgCapitalLinks
Set this to false to avoid forcing the first letter of links to capitals.
writeRevision( $row)
Dumps a "<revision>" section on the output stream, with data filled in from the given database row...
static isValid( $ip)
Validate an IP address.
Definition: IP.php:111
writeUploads( $row, $dumpContents=false)
Warning! This data is potentially inconsistent.
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not null
Definition: hooks.txt:780
namespace and then decline to actually register it file or subcat img or subcat $title
Definition: hooks.txt:925
openStream()
Opens the XML output stream's root "<mediawiki>" element.
getNamespace()
Get the namespace index, i.e.
Definition: Title.php:1026
const NS_FILE
Definition: Defines.php:66
presenting them properly to the user as errors is done by the caller return true use this to change the list i e etc $rev
Definition: hooks.txt:1766
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
const DELETED_USER
Definition: LogPage.php:36
const DELETED_TEXT
Definition: Revision.php:46
static makeTitle( $ns, $title, $fragment='', $interwiki='')
Create a new Title from a namespace index and a DB key.
Definition: Title.php:589
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
writeLogItem( $row)
Dumps a "<logitem>" section on the output stream, with data filled in from the given database row...
const DELETED_USER
Definition: Revision.php:48
static getStore()
Title null $currentTitle
Title of the currently processed page.
static schemaVersion()
Returns the default export schema version, as defined by $wgXmlDumpSchemaVersion. ...
static element( $element, $attribs=null, $contents='', $allowShortTag=true)
Format an XML element with given attributes and, optionally, text content.
Definition: Xml.php:41
static string [] $supportedSchemas
the schema versions supported for output
const DELETED_COMMENT
Definition: Revision.php:47
The wiki should then use memcached to cache various data To use multiple just add more items to the array To increase the weight of a make its entry a controlled by the following MediaWiki still creates a BagOStuff but calls it to it are no ops If the cache daemon can t be it should also disable itself fairly $wgDBname
Definition: memcached.txt:93
const DELETED_ACTION
Definition: LogPage.php:34
writeContributor( $id, $text, $indent=" ")
$content
Definition: pageupdater.txt:72
static canonicalTitle(Title $title)
Return prefixed text form of title, but using the content language's canonical namespace.
static run( $event, array $args=[], $deprecatedVersion=null)
Call hook functions defined in Hooks::register and $wgHooks.
Definition: Hooks.php:200