MediaWiki  master
XmlDumpWriter.php
Go to the documentation of this file.
1 <?php
28 
37  public static $supportedSchemas = [
39  ];
40 
46  private $currentTitle = null;
47 
58  function openStream() {
60  return Xml::element( 'mediawiki', [
61  'xmlns' => "http://www.mediawiki.org/xml/export-$ver/",
62  'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance",
63  /*
64  * When a new version of the schema is created, it needs staging on mediawiki.org.
65  * This requires a change in the operations/mediawiki-config git repo.
66  *
67  * Create a changeset like https://gerrit.wikimedia.org/r/#/c/149643/ in which
68  * you copy in the new xsd file.
69  *
70  * After it is reviewed, merged and deployed (sync-docroot), the index.html needs purging.
71  * echo "https://www.mediawiki.org/xml/index.html" | mwscript purgeList.php --wiki=aawiki
72  */
73  'xsi:schemaLocation' => "http://www.mediawiki.org/xml/export-$ver/ " .
74  "http://www.mediawiki.org/xml/export-$ver.xsd",
75  'version' => $ver,
76  'xml:lang' => MediaWikiServices::getInstance()->getContentLanguage()->getHtmlCode() ],
77  null ) .
78  "\n" .
79  $this->siteInfo();
80  }
81 
/**
 * Assembles the "<siteinfo>" section from the individual site metadata
 * elements (site name, database name, base URL, generator, case setting
 * and namespace list).
 *
 * @return string
 */
function siteInfo() {
	$parts = [];
	$parts[] = $this->sitename();
	$parts[] = $this->dbname();
	$parts[] = $this->homelink();
	$parts[] = $this->generator();
	$parts[] = $this->caseSetting();
	$parts[] = $this->namespaces();

	$body = implode( "\n ", $parts );

	return " <siteinfo>\n " . $body . "\n </siteinfo>\n";
}
97 
/**
 * Wraps the configured site name ($wgSitename) in a "<sitename>" element.
 *
 * @return string
 */
function sitename() {
	global $wgSitename;

	$element = Xml::element( 'sitename', [], $wgSitename );

	return $element;
}
105 
/**
 * Wraps the configured database name ($wgDBname) in a "<dbname>" element.
 *
 * @return string
 */
function dbname() {
	global $wgDBname;

	$element = Xml::element( 'dbname', [], $wgDBname );

	return $element;
}
113 
/**
 * Produces the "<generator>" element naming the software and its version
 * ($wgVersion) that created the dump.
 *
 * @return string
 */
function generator() {
	global $wgVersion;

	$label = 'MediaWiki ' . $wgVersion;

	return Xml::element( 'generator', [], $label );
}
121 
/**
 * Produces the "<base>" element holding the canonical URL of the main page.
 *
 * @return string
 */
function homelink() {
	$mainPage = Title::newMainPage();
	$url = $mainPage->getCanonicalURL();

	return Xml::element( 'base', [], $url );
}
128 
/**
 * Produces the "<case>" element reflecting $wgCapitalLinks.
 *
 * @return string
 */
function caseSetting() {
	global $wgCapitalLinks;

	// "case-insensitive" option is reserved for future
	if ( $wgCapitalLinks ) {
		$mode = 'first-letter';
	} else {
		$mode = 'case-sensitive';
	}

	return Xml::element( 'case', [], $mode );
}
138 
/**
 * Produces the "<namespaces>" section: one "<namespace>" element per
 * formatted namespace of the content language, carrying the namespace key
 * and its capitalization mode.
 *
 * @return string
 */
function namespaces() {
	$contLang = MediaWikiServices::getInstance()->getContentLanguage();

	$pieces = [ "<namespaces>\n" ];
	foreach ( $contLang->getFormattedNamespaces() as $index => $name ) {
		$caseMode = MWNamespace::isCapitalized( $index ) ? 'first-letter' : 'case-sensitive';
		$attribs = [
			'key' => $index,
			'case' => $caseMode,
		];
		$pieces[] = ' ' . Xml::element( 'namespace', $attribs, $name ) . "\n";
	}
	$pieces[] = " </namespaces>";

	return implode( '', $pieces );
}
158 
/**
 * Returns the closing tag of the root "<mediawiki>" element, followed by
 * a newline.
 *
 * @return string
 */
function closeStream() {
	$closingTag = '</mediawiki>';

	return $closingTag . "\n";
}
168 
/**
 * Opens a "<page>" section: emits title, namespace and page id, plus the
 * redirect target and restrictions when present. The title built from the
 * row is remembered in $this->currentTitle.
 *
 * The 'XmlDumpWriterOpenPage' hook runs last and receives the output
 * string by reference.
 *
 * @param object $row Row with page_namespace, page_title, page_id,
 *   page_is_redirect and page_restrictions fields
 * @return string
 */
public function openPage( $row ) {
	$this->currentTitle = Title::makeTitle( $row->page_namespace, $row->page_title );

	$titleText = self::canonicalTitle( $this->currentTitle );
	$nsText = strval( $row->page_namespace );
	$idText = strval( $row->page_id );

	$out = " <page>\n"
		. ' ' . Xml::elementClean( 'title', [], $titleText ) . "\n"
		. ' ' . Xml::element( 'ns', [], $nsText ) . "\n"
		. ' ' . Xml::element( 'id', [], $idText ) . "\n";

	if ( $row->page_is_redirect ) {
		$target = WikiPage::factory( $this->currentTitle )->getRedirectTarget();
		if ( $target instanceof Title && $target->isValidRedirectTarget() ) {
			$redirectAttribs = [ 'title' => self::canonicalTitle( $target ) ];
			$out .= ' ' . Xml::element( 'redirect', $redirectAttribs ) . "\n";
		}
	}

	if ( $row->page_restrictions != '' ) {
		$restrictions = strval( $row->page_restrictions );
		$out .= ' ' . Xml::element( 'restrictions', [], $restrictions ) . "\n";
	}

	Hooks::run( 'XmlDumpWriterOpenPage', [ $this, &$out, $row, $this->currentTitle ] );

	return $out;
}
202 
/**
 * Closes a "<page>" section. If a current title is set, its entry is
 * cleared from the link cache first.
 *
 * @return string
 */
function closePage() {
	$title = $this->currentTitle;

	if ( $title !== null ) {
		// In rare cases, link cache has the same key for some pages which
		// might be read as part of the same batch. T220424 and T220316
		MediaWikiServices::getInstance()->getLinkCache()->clearLink( $title );
	}

	return " </page>\n";
}
218 
/**
 * Fetches the revision store from the global service container.
 *
 * @return mixed Whatever MediaWikiServices::getRevisionStore() returns
 */
private function getRevisionStore() {
	$services = MediaWikiServices::getInstance();

	return $services->getRevisionStore();
}
225 
/**
 * Fetches the blob store from the global service container.
 *
 * @return mixed Whatever MediaWikiServices::getBlobStore() returns
 */
private function getBlobStore() {
	$services = MediaWikiServices::getInstance();

	return $services->getBlobStore();
}
232 
/**
 * Dumps a "<revision>" section with data filled in from the given
 * database row.
 *
 * Emits, in order: id, optional parentid, timestamp, contributor, optional
 * minor flag, comment, content model and format, text (full, stub, or a
 * "deleted" placeholder) and sha1. The 'XmlDumpWriterWriteRevision' hook
 * runs before the closing tag and receives the output string by reference
 * together with the row and the text that was loaded (empty if none).
 *
 * @param object $row Revision row; the fields read here include rev_id,
 *   rev_timestamp, rev_user, rev_user_text and, optionally, rev_parent_id,
 *   rev_deleted, rev_minor_edit, rev_content_model, rev_content_format,
 *   old_text, _load_content, rev_text_id, rev_len and rev_sha1
 * @return string
 * @throws Exception Anything other than MWException/RuntimeException raised
 *   while transforming or loading content is re-thrown
 */
function writeRevision( $row ) {
	// Read the deletion bitfield once. Rows that carry no rev_deleted field
	// are treated as having nothing suppressed; this also keeps the sha1
	// check below from touching an undefined property.
	$deleted = isset( $row->rev_deleted ) ? $row->rev_deleted : 0;

	$out = " <revision>\n";
	$out .= " " . Xml::element( 'id', null, strval( $row->rev_id ) ) . "\n";
	if ( isset( $row->rev_parent_id ) && $row->rev_parent_id ) {
		$out .= " " . Xml::element( 'parentid', null, strval( $row->rev_parent_id ) ) . "\n";
	}

	$out .= $this->writeTimestamp( $row->rev_timestamp );

	if ( $deleted & Revision::DELETED_USER ) {
		$out .= " " . Xml::element( 'contributor', [ 'deleted' => 'deleted' ] ) . "\n";
	} else {
		$out .= $this->writeContributor( $row->rev_user, $row->rev_user_text );
	}

	if ( isset( $row->rev_minor_edit ) && $row->rev_minor_edit ) {
		$out .= " <minor/>\n";
	}
	if ( $deleted & Revision::DELETED_COMMENT ) {
		$out .= " " . Xml::element( 'comment', [ 'deleted' => 'deleted' ] ) . "\n";
	} else {
		$comment = CommentStore::getStore()->getComment( 'rev_comment', $row )->text;
		if ( $comment != '' ) {
			$out .= " " . Xml::elementClean( 'comment', [], strval( $comment ) ) . "\n";
		}
	}

	// TODO: rev_content_model no longer exists with MCR, see T174031
	// (isset() is already false for null, so no separate null check is needed)
	if ( isset( $row->rev_content_model ) ) {
		$content_model = strval( $row->rev_content_model );
	} else {
		// probably using $wgContentHandlerUseDB = false;
		$content_model = ContentHandler::getDefaultModelFor( $this->currentTitle );
	}

	$content_handler = ContentHandler::getForModelID( $content_model );

	// TODO: rev_content_format no longer exists with MCR, see T174031
	if ( isset( $row->rev_content_format ) ) {
		$content_format = strval( $row->rev_content_format );
	} else {
		// probably using $wgContentHandlerUseDB = false;
		$content_format = $content_handler->getDefaultFormat();
	}

	$out .= " " . Xml::element( 'model', null, strval( $content_model ) ) . "\n";
	$out .= " " . Xml::element( 'format', null, strval( $content_format ) ) . "\n";

	$text = '';
	if ( $deleted & Revision::DELETED_TEXT ) {
		$out .= " " . Xml::element( 'text', [ 'deleted' => 'deleted' ] ) . "\n";
	} elseif ( isset( $row->old_text ) ) {
		// Raw text from the database may have invalid chars
		$text = strval( Revision::getRevisionText( $row ) );
		try {
			$text = $content_handler->exportTransform( $text, $content_format );
		} catch ( Exception $ex ) {
			if ( $ex instanceof MWException || $ex instanceof RuntimeException ) {
				// leave text as is; that's the way it goes
				wfLogWarning( 'exportTransform failed on text for revid ' . $row->rev_id . "\n" );
			} else {
				throw $ex;
			}
		}
		$out .= " " . Xml::elementClean( 'text',
			[ 'xml:space' => 'preserve', 'bytes' => intval( $row->rev_len ) ],
			strval( $text ) ) . "\n";
	} elseif ( isset( $row->_load_content ) ) {
		// TODO: make this fully MCR aware, see T174031
		$rev = $this->getRevisionStore()->newRevisionFromRow( $row, 0, $this->currentTitle );
		$slot = $rev->getSlot( 'main' );
		try {
			$content = $slot->getContent();

			if ( $content instanceof TextContent ) {
				// HACK: For text based models, bypass the serialization step.
				// This allows extensions (like Flow) that use incompatible combinations
				// of serialization format and content model.
				$text = $content->getNativeData();
			} else {
				$text = $content->serialize( $content_format );
			}
			$text = $content_handler->exportTransform( $text, $content_format );
			$out .= " " . Xml::elementClean( 'text',
				[ 'xml:space' => 'preserve', 'bytes' => intval( $slot->getSize() ) ],
				strval( $text ) ) . "\n";
		} catch ( Exception $ex ) {
			if ( $ex instanceof MWException || $ex instanceof RuntimeException ) {
				// there's no provision in the schema for an attribute that will let
				// the user know this element was unavailable due to error; an empty
				// tag is the best we can do
				$out .= " " . Xml::element( 'text' ) . "\n";
				wfLogWarning( 'failed to load content for revid ' . $row->rev_id . "\n" );
			} else {
				throw $ex;
			}
		}
	} elseif ( isset( $row->rev_text_id ) ) {
		// Stub output for pre-MCR schema
		// TODO: MCR: rev_text_id only exists in the pre-MCR schema. Remove this when
		// we drop support for the old schema.
		$out .= " " . Xml::element( 'text',
			[ 'id' => $row->rev_text_id, 'bytes' => intval( $row->rev_len ) ],
			"" ) . "\n";
	} else {
		// Backwards-compatible stub output for MCR aware schema
		// TODO: MCR: emit content addresses instead of text ids, see T174031, T199121
		$rev = $this->getRevisionStore()->newRevisionFromRow( $row, 0, $this->currentTitle );
		$slot = $rev->getSlot( 'main' );

		// Note that this is currently the ONLY reason we have a BlobStore here at all.
		// When removing this line, check whether the BlobStore has become unused.
		$textId = $this->getBlobStore()->getTextIdFromAddress( $slot->getAddress() );
		$out .= " " . Xml::element( 'text',
			[ 'id' => $textId, 'bytes' => intval( $slot->getSize() ) ],
			"" ) . "\n";
	}

	if ( isset( $row->rev_sha1 )
		&& $row->rev_sha1
		&& !( $deleted & Revision::DELETED_TEXT )
	) {
		$out .= " " . Xml::element( 'sha1', null, strval( $row->rev_sha1 ) ) . "\n";
	} else {
		$out .= " <sha1/>\n";
	}

	// Avoid PHP 7.1 warning from passing $this by reference
	$writer = $this;
	Hooks::run( 'XmlDumpWriterWriteRevision', [ &$writer, &$out, $row, $text ] );

	$out .= " </revision>\n";

	return $out;
}
378 
/**
 * Dumps a "<logitem>" section with data filled in from the given
 * database row.
 *
 * Contributor, comment and action details are each replaced by a
 * "deleted" placeholder element when the matching LogPage::DELETED_* bit
 * is set in log_deleted.
 *
 * @param object $row Log row; reads log_id, log_timestamp, log_deleted,
 *   log_user, user_name, log_type, log_action, log_namespace, log_title
 *   and log_params
 * @return string
 */
function writeLogItem( $row ) {
	$deletedBits = $row->log_deleted;
	$placeholder = [ 'deleted' => 'deleted' ];

	$pieces = [];
	$pieces[] = " <logitem>\n";
	$pieces[] = " " . Xml::element( 'id', null, strval( $row->log_id ) ) . "\n";
	$pieces[] = $this->writeTimestamp( $row->log_timestamp, " " );

	if ( $deletedBits & LogPage::DELETED_USER ) {
		$pieces[] = " " . Xml::element( 'contributor', $placeholder ) . "\n";
	} else {
		$pieces[] = $this->writeContributor( $row->log_user, $row->user_name, " " );
	}

	if ( $deletedBits & LogPage::DELETED_COMMENT ) {
		$pieces[] = " " . Xml::element( 'comment', $placeholder ) . "\n";
	} else {
		$commentText = CommentStore::getStore()->getComment( 'log_comment', $row )->text;
		if ( $commentText != '' ) {
			$pieces[] = " " . Xml::elementClean( 'comment', null, strval( $commentText ) ) . "\n";
		}
	}

	$pieces[] = " " . Xml::element( 'type', null, strval( $row->log_type ) ) . "\n";
	$pieces[] = " " . Xml::element( 'action', null, strval( $row->log_action ) ) . "\n";

	if ( $deletedBits & LogPage::DELETED_ACTION ) {
		$pieces[] = " " . Xml::element( 'text', $placeholder ) . "\n";
	} else {
		$target = Title::makeTitle( $row->log_namespace, $row->log_title );
		$pieces[] = " " . Xml::elementClean( 'logtitle', null, self::canonicalTitle( $target ) ) . "\n";
		$pieces[] = " " . Xml::elementClean(
			'params',
			[ 'xml:space' => 'preserve' ],
			strval( $row->log_params )
		) . "\n";
	}

	$pieces[] = " </logitem>\n";

	return implode( '', $pieces );
}
425 
/**
 * Formats a timestamp as an indented "<timestamp>" element in ISO 8601
 * form, terminated by a newline.
 *
 * @param mixed $timestamp Value passed on to wfTimestamp()
 * @param string $indent Prefix placed before the element
 * @return string
 */
function writeTimestamp( $timestamp, $indent = " " ) {
	$iso = wfTimestamp( TS_ISO_8601, $timestamp );
	$element = Xml::element( 'timestamp', null, $iso );

	return $indent . $element . "\n";
}
435 
/**
 * Builds a "<contributor>" section. A contributor with no id whose name
 * is a valid IP address is written as "<ip>"; everyone else is written as
 * "<username>" plus "<id>".
 *
 * @param int|string $id User id (falsy for none)
 * @param string $text User name or IP address
 * @param string $indent Prefix placed before each emitted line
 * @return string
 */
function writeContributor( $id, $text, $indent = " " ) {
	$inner = $indent . " ";
	$isAnonymousIp = !$id && IP::isValid( $text );

	if ( $isAnonymousIp ) {
		$body = $inner . Xml::elementClean( 'ip', null, strval( $text ) ) . "\n";
	} else {
		$body = $inner . Xml::elementClean( 'username', null, strval( $text ) ) . "\n"
			. $inner . Xml::element( 'id', null, strval( $id ) ) . "\n";
	}

	return $indent . "<contributor>\n" . $body . $indent . "</contributor>\n";
}
453 
/**
 * Dumps "<upload>" sections for every version of the file behind a
 * file-namespace page: the history in reverse of getHistory() order
 * first, then the current version. Returns an empty string for pages
 * outside NS_FILE or without an existing local file.
 *
 * @param object $row Row with page_namespace and page_title
 * @param bool $dumpContents Passed through to writeUpload()
 * @return string
 */
function writeUploads( $row, $dumpContents = false ) {
	if ( $row->page_namespace != NS_FILE ) {
		return '';
	}

	$current = wfLocalFile( $row->page_title );
	if ( !$current || !$current->exists() ) {
		return '';
	}

	$sections = [];
	foreach ( array_reverse( $current->getHistory() ) as $oldVersion ) {
		$sections[] = $this->writeUpload( $oldVersion, $dumpContents );
	}
	$sections[] = $this->writeUpload( $current, $dumpContents );

	return implode( '', $sections );
}
474 
/**
 * Builds a single "<upload>" section for one file version.
 *
 * @param object $file File object; the methods called on it here are
 *   isOld, getArchiveName, getRepo, getPath, isDeleted, getDescription,
 *   getTimestamp, getUser, getName, getCanonicalUrl, getSize, getSha1
 *   and getRel
 * @param bool $dumpContents Whether to embed the file contents
 *   (base64-encoded) in a "<contents>" element
 * @return string
 */
function writeUpload( $file, $dumpContents = false ) {
	$archiveName = '';
	if ( $file->isOld() ) {
		$archiveName = " " . Xml::element( 'archivename', null, $file->getArchiveName() ) . "\n";
	}

	$contents = '';
	if ( $dumpContents ) {
		$backend = $file->getRepo()->getBackend();
		# Dump file as base64
		# Uses only XML-safe characters, so does not need escaping
		# @todo Too bad this loads the contents into memory (script might swap)
		$raw = $backend->getFileContents( [ 'src' => $file->getPath() ] );
		$encoded = chunk_split( base64_encode( $raw ) );
		$contents = ' <contents encoding="base64">' . $encoded . " </contents>\n";
	}

	if ( $file->isDeleted( File::DELETED_COMMENT ) ) {
		$comment = Xml::element( 'comment', [ 'deleted' => 'deleted' ] );
	} else {
		$comment = Xml::elementClean( 'comment', null, strval( $file->getDescription() ) );
	}

	$pieces = [
		" <upload>\n",
		$this->writeTimestamp( $file->getTimestamp() ),
		$this->writeContributor( $file->getUser( 'id' ), $file->getUser( 'text' ) ),
		" " . $comment . "\n",
		" " . Xml::element( 'filename', null, $file->getName() ) . "\n",
		$archiveName,
		" " . Xml::element( 'src', null, $file->getCanonicalUrl() ) . "\n",
		" " . Xml::element( 'size', null, $file->getSize() ) . "\n",
		" " . Xml::element( 'sha1base36', null, $file->getSha1() ) . "\n",
		" " . Xml::element( 'rel', null, $file->getRel() ) . "\n",
		$contents,
		" </upload>\n",
	];

	return implode( '', $pieces );
}
517 
/**
 * Returns the prefixed text form of a title, using the content language's
 * formatted namespace text as the prefix. External (interwiki) titles are
 * returned via getPrefixedText() unchanged.
 *
 * @param Title $title
 * @return string
 */
public static function canonicalTitle( Title $title ) {
	if ( $title->isExternal() ) {
		return $title->getPrefixedText();
	}

	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	$nsText = $contLang->getFormattedNsText( $title->getNamespace() );

	// @todo Emit some kind of warning to the user if $title->getNamespace() !==
	// NS_MAIN and $nsText === '' (viz. pages in an unregistered namespace)

	// Only a non-empty namespace text gets the ':' separator.
	$prefix = ( $nsText === '' ) ? $nsText : $nsText . ':';

	return $prefix . $title->getText();
}
545 }
static factory(Title $title)
Create a WikiPage object of the appropriate class for the given title.
Definition: WikiPage.php:138
const DELETED_COMMENT
Definition: File.php:55
$wgVersion
MediaWiki version number.
$wgSitename
Name of the site.
getText()
Get the text form (spaces not underscores) of the main part.
Definition: Title.php:983
static newMainPage(MessageLocalizer $localizer=null)
Create a new Title for the Main Page.
Definition: Title.php:653
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for use
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Definition: router.php:42
const XML_DUMP_SCHEMA_VERSION_10
Definition: Defines.php:328
static getDefaultModelFor(Title $title)
Returns the name of the default content model to be used for the page with the given title...
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency MediaWikiServices
Definition: injection.txt:23
wfLogWarning( $msg, $callerOffset=1, $level=E_USER_WARNING)
Send a warning as a PHP error and the debug log.
closePage()
Closes a "<page>" section on the output stream.
static elementClean( $element, $attribs=[], $contents='')
Format an XML element as with self::element(), but run text through the content language's normalize(...
Definition: Xml.php:90
getPrefixedText()
Get the prefixed title with spaces.
Definition: Title.php:1691
closeStream()
Closes the output stream with the closing root element.
wfLocalFile( $title)
Get an object referring to a locally registered file.
static getRevisionText( $row, $prefix='old_', $wiki=false)
Get revision text associated with an old or archive row.
Definition: Revision.php:1048
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not it can be in the form of< username >< more info > e g for bot passwords intended to be added to log contexts Fields it might only if the login was with a bot password it is not rendered in wiki pages or galleries in category pages allow injecting custom HTML after the section Any uses of the hook need to handle escaping see BaseTemplate::getToolbox and BaseTemplate::makeListItem for details on the format of individual items inside of this array or by returning and letting standard HTTP rendering take place modifiable or by returning false and taking over the output $out
Definition: hooks.txt:780
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
const DELETED_COMMENT
Definition: LogPage.php:35
openPage( $row)
Opens a "<page>" section on the output stream, with data from the given database row.
writeUpload( $file, $dumpContents=false)
writeTimestamp( $timestamp, $indent=" ")
isExternal()
Is this Title interwiki?
Definition: Title.php:900
static getForModelID( $modelId)
Returns the ContentHandler singleton for the given model ID.
$wgCapitalLinks
Set this to false to avoid forcing the first letter of links to capitals.
writeRevision( $row)
Dumps a "<revision>" section on the output stream, with data filled in from the given database row...
static isValid( $ip)
Validate an IP address.
Definition: IP.php:111
writeUploads( $row, $dumpContents=false)
Warning! This data is potentially inconsistent.
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not null
Definition: hooks.txt:780
namespace and then decline to actually register it file or subcat img or subcat $title
Definition: hooks.txt:925
openStream()
Opens the XML output stream's root "<mediawiki>" element.
getNamespace()
Get the namespace index, i.e.
Definition: Title.php:1025
const NS_FILE
Definition: Defines.php:70
presenting them properly to the user as errors is done by the caller return true use this to change the list i e etc $rev
Definition: hooks.txt:1766
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
const DELETED_USER
Definition: LogPage.php:36
const DELETED_TEXT
Definition: Revision.php:46
static makeTitle( $ns, $title, $fragment='', $interwiki='')
Create a new Title from a namespace index and a DB key.
Definition: Title.php:589
static isCapitalized( $index)
Is the namespace first-letter capitalized?
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
writeLogItem( $row)
Dumps a "<logitem>" section on the output stream, with data filled in from the given database row...
const DELETED_USER
Definition: Revision.php:48
static getStore()
Title null $currentTitle
Title of the currently processed page.
static schemaVersion()
Returns the default export schema version, as defined by $wgXmlDumpSchemaVersion. ...
static element( $element, $attribs=null, $contents='', $allowShortTag=true)
Format an XML element with given attributes and, optionally, text content.
Definition: Xml.php:41
controlled by the following MediaWiki still creates a BagOStuff but calls it to it are no ops If the cache daemon can t be it should also disable itself fairly $wgDBname
Definition: memcached.txt:93
static string [] $supportedSchemas
the schema versions supported for output
const DELETED_COMMENT
Definition: Revision.php:47
const DELETED_ACTION
Definition: LogPage.php:34
writeContributor( $id, $text, $indent=" ")
$content
Definition: pageupdater.txt:72
static canonicalTitle(Title $title)
Return prefixed text form of title, but using the content language's canonical namespace.
static run( $event, array $args=[], $deprecatedVersion=null)
Call hook functions defined in Hooks::register and $wgHooks.
Definition: Hooks.php:200