MediaWiki  master
WikiImporter.php
Go to the documentation of this file.
1 <?php
30 
37 class WikiImporter {
39  private $reader;
40  private $foreignNamespaces = null;
45  private $mNoUpdates = false;
46  private $pageOffset = 0;
48  private $config;
52  private $hookRunner;
54  private $countableCache = [];
56  private $disableStatisticsUpdate = false;
59 
67  $this->reader = new XMLReader();
68  $this->config = $config;
69  $this->hookRunner = Hooks::runner();
70 
71  if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
72  stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
73  }
75 
76  // Enable the entity loader, as it is needed for loading external URLs via
77  // XMLReader::open (T86036)
78  $oldDisable = libxml_disable_entity_loader( false );
79  if ( defined( 'LIBXML_PARSEHUGE' ) ) {
80  $status = $this->reader->open( "uploadsource://$id", null, LIBXML_PARSEHUGE );
81  } else {
82  $status = $this->reader->open( "uploadsource://$id" );
83  }
84  if ( !$status ) {
85  $error = libxml_get_last_error();
86  libxml_disable_entity_loader( $oldDisable );
87  throw new MWException( 'Encountered an internal error while initializing WikiImporter object: ' .
88  $error->message );
89  }
90  libxml_disable_entity_loader( $oldDisable );
91 
92  // Default callbacks
93  $this->setPageCallback( [ $this, 'beforeImportPage' ] );
94  $this->setRevisionCallback( [ $this, "importRevision" ] );
95  $this->setUploadCallback( [ $this, 'importUpload' ] );
96  $this->setLogItemCallback( [ $this, 'importLogItem' ] );
97  $this->setPageOutCallback( [ $this, 'finishImportPage' ] );
98 
99  $this->importTitleFactory = new NaiveImportTitleFactory();
100  $this->externalUserNames = new ExternalUserNames( 'imported', false );
101  }
102 
/**
 * Accessor for the XML reader this importer pulls its input from.
 *
 * @return XMLReader
 */
public function getReader() {
	return $this->reader;
}
109 
/**
 * Record an XML error in the debug log.
 *
 * Despite the name, nothing is thrown here: the message is passed to
 * debug() and to wfDebug(), and the method then returns normally.
 *
 * @param string $err Description of the error
 */
public function throwXmlError( $err ) {
	$this->debug( "FAILURE: $err" );
	wfDebug( "WikiImporter XML error: $err" );
}
114 
/**
 * Write a message to the debug log, but only while debug mode is enabled
 * (see setDebug()).
 *
 * @param string $data Message text; logged with an "IMPORT: " prefix
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}

	wfDebug( "IMPORT: $data" );
}
120 
/**
 * Write a warning to the debug log. Unlike debug(), this is not gated on
 * the debug flag.
 *
 * @param string $data Message text; logged with an "IMPORT: " prefix
 */
public function warn( $data ) {
	wfDebug( "IMPORT: $data" );
}
124 
/**
 * Report a notice, through the registered notice callback when one is
 * callable and through the debug log otherwise.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters; forwarded as one array
 */
public function notice( $msg, ...$params ) {
	if ( !is_callable( $this->mNoticeCallback ) ) {
		# No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		wfDebug( wfMessage( $msg, $params )->text() );
		return;
	}

	call_user_func( $this->mNoticeCallback, $msg, $params );
}
134 
/**
 * Switch debug mode on or off. While it is on, debug() writes its
 * messages to the debug log.
 *
 * @param bool $debug
 */
public function setDebug( $debug ) {
	$this->mDebug = $debug;
}
142 
/**
 * Store the 'no updates' flag. The stored value is passed to
 * setNoUpdates() on every WikiRevision this importer builds.
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
150 
/**
 * Set the page offset used by doImport(): top-level nodes are skipped
 * until this many <page> tags have been counted.
 *
 * @param int $nthPage Ordinal of the first page to process; a falsy value
 *   disables skipping
 */
public function setPageOffset( $nthPage ) {
	$this->pageOffset = $nthPage;
}
160 
/**
 * Register the callback that notice() uses to display messages.
 *
 * The callback receives the message key and an array of message parameters.
 *
 * @param callable $callback
 * @return mixed Result of wfSetVar(); presumably the previously stored
 *   callback - confirm against wfSetVar()
 */
public function setNoticeCallback( $callback ) {
	return wfSetVar( $this->mNoticeCallback, $callback );
}
170 
/**
 * Replace the callback invoked by pageCallback() when a page is reached.
 *
 * @param callable $callback Receives the single argument given to pageCallback()
 * @return callable|null The callback that was registered before this call
 */
public function setPageCallback( $callback ) {
	$replaced = $this->mPageCallback;
	$this->mPageCallback = $callback;

	return $replaced;
}
181 
/**
 * Replace the callback invoked by pageOutCallback() once a page has been
 * fully read.
 *
 * @param callable $callback Receives the same arguments as pageOutCallback():
 *   title, foreign title, revision count, successful revision count, page info
 * @return callable|null The callback that was registered before this call
 */
public function setPageOutCallback( $callback ) {
	$replaced = $this->mPageOutCallback;
	$this->mPageOutCallback = $callback;

	return $replaced;
}
196 
/**
 * Replace the callback invoked by revisionCallback() for each revision.
 *
 * @param callable $callback Receives the revision object and this importer
 * @return callable|null The callback that was registered before this call
 */
public function setRevisionCallback( $callback ) {
	$replaced = $this->mRevisionCallback;
	$this->mRevisionCallback = $callback;

	return $replaced;
}
207 
/**
 * Replace the callback invoked by processUpload() for each upload.
 *
 * @param callable $callback Receives the revision object built for the upload
 * @return callable|null The callback that was registered before this call
 */
public function setUploadCallback( $callback ) {
	$replaced = $this->mUploadCallback;
	$this->mUploadCallback = $callback;

	return $replaced;
}
218 
/**
 * Replace the callback invoked by logItemCallback() for each log item.
 *
 * @param callable $callback Receives the revision object and this importer
 * @return callable|null The callback that was registered before this call
 */
public function setLogItemCallback( $callback ) {
	$replaced = $this->mLogItemCallback;
	$this->mLogItemCallback = $callback;

	return $replaced;
}
229 
/**
 * Replace the callback invoked by siteInfoCallback() after <siteinfo> has
 * been read.
 *
 * @param callable $callback Receives the site info array and this importer
 * @return callable|null The callback that was registered before this call
 */
public function setSiteInfoCallback( $callback ) {
	$replaced = $this->mSiteInfoCallback;
	$this->mSiteInfoCallback = $callback;

	return $replaced;
}
240 
/**
 * Set the factory that processTitle() uses to turn a foreign title into a
 * local one, via its createTitleFromForeignTitle() method.
 *
 * @param object $factory Title factory to use from now on
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
249 
255  public function setTargetNamespace( $namespace ) {
256  if ( $namespace === null ) {
257  // Don't override namespaces
259  return true;
260  } elseif (
261  $namespace >= 0 &&
262  MediaWikiServices::getInstance()->getNamespaceInfo()->exists( intval( $namespace ) )
263  ) {
264  $namespace = intval( $namespace );
265  $this->setImportTitleFactory( new NamespaceImportTitleFactory( $namespace ) );
266  return true;
267  } else {
268  return false;
269  }
270  }
271 
277  public function setTargetRootPage( $rootpage ) {
278  $status = Status::newGood();
279  if ( $rootpage === null ) {
280  // No rootpage
282  } elseif ( $rootpage !== '' ) {
283  $rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
284  $title = Title::newFromText( $rootpage );
285 
286  if ( !$title || $title->isExternal() ) {
287  $status->fatal( 'import-rootpage-invalid' );
288  } elseif (
289  !MediaWikiServices::getInstance()->getNamespaceInfo()->
290  hasSubpages( $title->getNamespace() )
291  ) {
292  $displayNSText = $title->getNamespace() === NS_MAIN
293  ? wfMessage( 'blanknamespace' )->text()
294  : MediaWikiServices::getInstance()->getContentLanguage()->
295  getNsText( $title->getNamespace() );
296  $status->fatal( 'import-rootpage-nosubpage', $displayNSText );
297  } else {
298  // set namespace to 'all', so the namespace check in processTitle() can pass
299  $this->setTargetNamespace( null );
301  }
302  }
303  return $status;
304  }
305 
/**
 * Set the base directory that handleUpload() prepends to an upload's
 * <rel> path when looking for the file on disk.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
312 
/**
 * Choose whether <upload> elements are processed. handleUpload() only
 * hands an upload on to processUpload() when this flag is truthy.
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
319 
/**
 * Replace the ExternalUserNames helper that is used to prefix the user
 * names found in the dump.
 *
 * @param string $usernamePrefix First constructor argument of ExternalUserNames
 * @param bool $assignKnownUsers Second constructor argument of ExternalUserNames
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$this->externalUserNames = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
}
328 
/**
 * Turn off the article-count adjustment that finishImportPage() would
 * otherwise perform. There is no way to turn it back on from here.
 */
public function disableStatisticsUpdate() {
	$this->disableStatisticsUpdate = true;
}
336 
/**
 * Default page callback: remember whether the target page is countable
 * before anything is imported into it, so finishImportPage() can compare.
 *
 * @param array $titleAndForeignTitle Two-element array; only element 0
 *   (the local title) is used here
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$title = $titleAndForeignTitle[0];
	$wikiPage = MediaWikiServices::getInstance()
		->getWikiPageFactory()
		->newFromTitle( $title );

	$cacheKey = 'title_' . $title->getPrefixedText();
	$this->countableCache[$cacheKey] = $wikiPage->isCountable();

	return true;
}
349 
/**
 * Default revision callback: import one revision after checking that its
 * content handler can be used on the target title.
 *
 * @param WikiRevision $revision
 * @return bool Result of importOldRevision(); false when the content model
 *   is not allowed on the title or the content cannot be unserialized
 */
public function importRevision( $revision ) {
	$allowedHere = $revision->getContentHandler()->canBeUsedOn( $revision->getTitle() );
	if ( !$allowedHere ) {
		$this->notice( 'import-error-bad-location',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat() );

		return false;
	}

	try {
		$imported = $revision->importOldRevision();
	} catch ( MWContentSerializationException $ex ) {
		// Report the failure and treat the revision as not imported
		$this->notice( 'import-error-unserialize',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat() );
		$imported = false;
	}

	return $imported;
}
378 
/**
 * Default log item callback: delegate to the revision object's own
 * importLogItem() method.
 *
 * @param WikiRevision $revision
 * @return mixed Whatever $revision->importLogItem() returns
 */
public function importLogItem( $revision ) {
	return $revision->importLogItem();
}
387 
/**
 * Default upload callback: run the upload through the upload importer
 * service and report whether the resulting status is good.
 *
 * @param WikiRevision $revision
 * @return bool
 */
public function importUpload( $revision ) {
	return MediaWikiServices::getInstance()
		->getWikiRevisionUploadImporter()
		->import( $revision )
		->isGood();
}
398 
408  public function finishImportPage( $title, $foreignTitle, $revCount,
409  $sRevCount, $pageInfo
410  ) {
411  // Update article count statistics (T42009)
412  // The normal counting logic in WikiPage->doEditUpdates() is designed for
413  // one-revision-at-a-time editing, not bulk imports. In this situation it
414  // suffers from issues of replica DB lag. We let WikiPage handle the total page
415  // and revision count, and we implement our own custom logic for the
416  // article (content page) count.
417  if ( !$this->disableStatisticsUpdate ) {
418  $page = MediaWikiServices::getInstance()->getWikiPageFactory()->newFromTitle( $title );
419  $page->loadPageData( 'fromdbmaster' );
420  $content = $page->getContent();
421  if ( $content === null ) {
422  wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $title .
423  ' because WikiPage::getContent() returned null' );
424  } else {
425  $editInfo = $page->prepareContentForEdit( $content );
426  $countKey = 'title_' . $title->getPrefixedText();
427  $countable = $page->isCountable( $editInfo );
428  if ( array_key_exists( $countKey, $this->countableCache ) &&
429  $countable != $this->countableCache[$countKey] ) {
431  'articles' => ( (int)$countable - (int)$this->countableCache[$countKey] )
432  ] ) );
433  }
434  }
435  }
436 
437  return $this->hookRunner->onAfterImportPage( $title, $foreignTitle,
438  $revCount, $sRevCount, $pageInfo );
439  }
440 
/**
 * Dump the main fields of a revision to the debug log (title, user,
 * timestamp, comment and text). Output only appears in debug mode.
 *
 * @param object &$revision Object exposing title, user_text, timestamp,
 *   comment and text properties
 */
public function debugRevisionHandler( &$revision ) {
	$this->debug( "Got revision:" );

	// A non-object title is reported as invalid instead of being dereferenced
	$titleLine = is_object( $revision->title )
		? "-- Title: " . $revision->title->getPrefixedText()
		: "-- Title: <invalid>";
	$this->debug( $titleLine );

	$this->debug( "-- User: " . $revision->user_text );
	$this->debug( "-- Timestamp: " . $revision->timestamp );
	$this->debug( "-- Comment: " . $revision->comment );
	$this->debug( "-- Text: " . $revision->text );
}
457 
/**
 * Pass the collected site info to the registered site info callback.
 *
 * @param array $siteInfo
 * @return mixed The callback's return value, or false when none is registered
 */
private function siteInfoCallback( $siteInfo ) {
	if ( !isset( $this->mSiteInfoCallback ) ) {
		return false;
	}

	return call_user_func_array( $this->mSiteInfoCallback,
		[ $siteInfo, $this ] );
}
471 
/**
 * Notify the registered page callback, if any, that a page was reached.
 * The callback's return value is discarded.
 *
 * @param array $title Passed through unchanged; handlePage() supplies the
 *   array returned by processTitle()
 */
public function pageCallback( $title ) {
	if ( !isset( $this->mPageCallback ) ) {
		return;
	}

	call_user_func( $this->mPageCallback, $title );
}
481 
/**
 * Notify the registered page-out callback, if any, that a page has been
 * read completely. All arguments are forwarded as given; the callback's
 * return value is discarded.
 *
 * @param Title $title Local title of the page
 * @param ForeignTitle $foreignTitle Title of the page on the source wiki
 * @param int $revCount Number of revisions seen
 * @param int $sucCount Number of revisions imported successfully
 * @param array $pageInfo Data collected while reading the page
 */
private function pageOutCallback( $title, $foreignTitle, $revCount,
	$sucCount, $pageInfo
) {
	if ( !isset( $this->mPageOutCallback ) ) {
		return;
	}

	call_user_func_array( $this->mPageOutCallback, func_get_args() );
}
496 
/**
 * Pass a revision to the registered revision callback.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's return value, or false when none is registered
 */
private function revisionCallback( $revision ) {
	if ( !isset( $this->mRevisionCallback ) ) {
		return false;
	}

	return call_user_func_array( $this->mRevisionCallback,
		[ $revision, $this ] );
}
510 
/**
 * Pass a log item to the registered log item callback.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's return value, or false when none is registered
 */
private function logItemCallback( $revision ) {
	if ( !isset( $this->mLogItemCallback ) ) {
		return false;
	}

	return call_user_func_array( $this->mLogItemCallback,
		[ $revision, $this ] );
}
524 
/**
 * Fetch an attribute of the node the reader is currently positioned on.
 *
 * @param string $attr Attribute name
 * @return string|null Whatever XMLReader::getAttribute() returns
 */
public function nodeAttribute( $attr ) {
	return $this->reader->getAttribute( $attr );
}
534 
/**
 * Collect the text of the current element, reading forward until the
 * first END_ELEMENT node is reached.
 *
 * Text, CDATA and significant-whitespace nodes are concatenated; every
 * other node type is ignored. An empty element yields "" without moving
 * the reader. If the input ends before an END_ELEMENT is seen, the reader
 * is closed and "" is returned.
 *
 * @return string
 */
public function nodeContents() {
	if ( $this->reader->isEmptyElement ) {
		return "";
	}

	$text = "";
	while ( $this->reader->read() ) {
		$nodeType = $this->reader->nodeType;

		if ( $nodeType == XMLReader::END_ELEMENT ) {
			return $text;
		}

		if ( $nodeType == XMLReader::TEXT
			|| $nodeType == XMLReader::CDATA
			|| $nodeType == XMLReader::SIGNIFICANT_WHITESPACE
		) {
			$text .= $this->reader->value;
		}
	}

	// Ran out of input before the element was closed
	$this->reader->close();
	return '';
}
562 
/**
 * Primary entry point: check that the document element is <mediawiki>,
 * then walk its top-level children and dispatch each one to the matching
 * handler (siteinfo, page, logitem).
 *
 * When a page offset is set, top-level nodes are skipped until that many
 * <page> tags have been counted. The reader is closed and the previous
 * entity loader setting restored when this method exits, whether it
 * returns or throws.
 *
 * @throws MWException If the first node read is not <mediawiki>
 * @return bool Always true when no exception is thrown
 */
public function doImport() {
	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (T48932).
	$oldDisable = libxml_disable_entity_loader( true );
	try {
		$this->reader->read();

		if ( $this->reader->localName != 'mediawiki' ) {
			// No explicit restore needed here: the finally block below
			// resets the entity loader and closes the reader.
			throw new MWException( "Expected <mediawiki> tag, got " .
				$this->reader->localName );
		}
		$this->debug( "<mediawiki> tag is correct." );

		$this->debug( "Starting primary dump processing loop." );

		$keepReading = $this->reader->read();
		$skip = false;
		$pageCount = 0;
		while ( $keepReading ) {
			$tag = $this->reader->localName;
			if ( $this->pageOffset ) {
				if ( $tag === 'page' ) {
					$pageCount++;
				}
				if ( $pageCount < $this->pageOffset ) {
					// Not at the requested page yet: jump over this node
					$keepReading = $this->reader->next();
					continue;
				}
			}
			$type = $this->reader->nodeType;

			if ( !$this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
				// Do nothing
			} elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
				break;
			} elseif ( $tag == 'siteinfo' ) {
				$this->handleSiteInfo();
			} elseif ( $tag == 'page' ) {
				$this->handlePage();
			} elseif ( $tag == 'logitem' ) {
				$this->handleLogItem();
			} elseif ( $tag != '#text' ) {
				$this->warn( "Unhandled top-level XML tag $tag" );

				$skip = true;
			}

			if ( $skip ) {
				// Step over the unhandled node rather than reading into it
				$keepReading = $this->reader->next();
				$skip = false;
				$this->debug( "Skip" );
			} else {
				$keepReading = $this->reader->read();
			}
		}
	} finally {
		libxml_disable_entity_loader( $oldDisable );
		$this->reader->close();
	}

	return true;
}
634 
/**
 * Read a <siteinfo> element up to its closing tag.
 *
 * Each <namespace> child is recorded in $this->foreignNamespaces, keyed by
 * its "key" attribute. The text of the sitename, base, generator and case
 * children is gathered into an array that is then passed, together with
 * the namespace map under '_namespaces', to siteInfoCallback().
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );

	// Tags whose text content is copied into the site info array as-is
	$plainTags = [ 'sitename', 'base', 'generator', 'case' ];
	$info = [];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'siteinfo';
		if ( $atClosingTag ) {
			break;
		}

		$name = $this->reader->localName;

		if ( $name == 'namespace' ) {
			// Fetch the attribute first: nodeContents() moves the reader on
			$key = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$key] = $this->nodeContents();
		} elseif ( in_array( $name, $plainTags ) ) {
			$info[$name] = $this->nodeContents();
		}
	}

	$info['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $info );
}
661 
/**
 * Read a <logitem> element up to its closing tag, collect its fields and
 * hand them to processLogItem().
 *
 * The ImportHandleLogItemXMLTag hook runs for every node first; when it
 * returns false the node is left to the hook.
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );

	// Tags whose text content is copied into the log info array as-is
	$plainTags = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];
	$info = [];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'logitem';
		if ( $atClosingTag ) {
			break;
		}

		$name = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $info ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $name, $plainTags ) ) {
			$info[$name] = $this->nodeContents();
		} elseif ( $name == 'contributor' ) {
			$info['contributor'] = $this->handleContributor();
		} elseif ( $name != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $name" );
		}
	}

	$this->processLogItem( $info );
}
691 
/**
 * Build a WikiRevision from the fields collected for a <logitem> and pass
 * it to the log item callback.
 *
 * A missing 'type' or 'action' is passed on as null instead of raising an
 * undefined-index notice. The user name falls back to a prefixed
 * 'Unknown user' when the contributor has no username.
 *
 * @param array $logInfo Fields gathered by handleLogItem()
 * @return mixed Return value of logItemCallback()
 */
private function processLogItem( $logInfo ) {
	$revision = new WikiRevision( $this->config );

	if ( isset( $logInfo['id'] ) ) {
		$revision->setID( $logInfo['id'] );
	}
	// An incomplete dump may lack these; null is the value the setters
	// received before, minus the notice
	$revision->setType( $logInfo['type'] ?? null );
	$revision->setAction( $logInfo['action'] ?? null );
	if ( isset( $logInfo['timestamp'] ) ) {
		$revision->setTimestamp( $logInfo['timestamp'] );
	}
	if ( isset( $logInfo['params'] ) ) {
		$revision->setParams( $logInfo['params'] );
	}
	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$revision->setTitle( Title::newFromText( $logInfo['logtitle'] ) );
	}

	$revision->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$revision->setComment( $logInfo['comment'] );
	}

	if ( isset( $logInfo['contributor']['ip'] ) ) {
		$revision->setUserIP( $logInfo['contributor']['ip'] );
	}

	// NOTE(review): unlike processRevision(), the user name is set even when
	// an IP was found above - confirm this difference is intended
	if ( !isset( $logInfo['contributor']['username'] ) ) {
		$revision->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	} else {
		$revision->setUsername(
			$this->externalUserNames->applyPrefix( $logInfo['contributor']['username'] )
		);
	}

	return $this->logItemCallback( $revision );
}
736 
/**
 * Read a <page> element up to its closing tag.
 *
 * Simple fields are stored in the page info array. The title is resolved
 * when the first <revision> or <upload> is met; from then on each of those
 * elements goes to handleRevision() or handleUpload(). If the title cannot
 * be resolved, the rest of the page is skipped. When a title was resolved,
 * pageOutCallback() is invoked once the page has been read.
 */
private function handlePage() {
	// Handle page data.
	$this->debug( "Enter page handler." );
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Tags whose value is stored in $pageInfo directly
	$plainTags = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	$skipSubtree = false;
	$titleRejected = false;

	while ( $skipSubtree ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'page';
		if ( $atClosingTag ) {
			break;
		}

		// Skipping only ever applies to the node that requested it
		$skipSubtree = false;

		$name = $this->reader->localName;

		if ( $titleRejected ) {
			// The title is invalid, bail out of this page
			$skipSubtree = true;
		} elseif ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $name, $plainTags ) ) {
			// An XML snippet:
			// <page>
			//     <id>123</id>
			//     <title>Page</title>
			//     <redirect title="NewTitle"/>
			//     ...
			// <redirect> carries its value in an attribute rather than in
			// its text content, so it is read differently.
			$pageInfo[$name] = $name == 'redirect'
				? $this->nodeAttribute( 'title' )
				: $this->nodeContents();
		} elseif ( $name == 'revision' || $name == 'upload' ) {
			if ( !isset( $titlePair ) ) {
				$titlePair = $this->processTitle( $pageInfo['title'],
					$pageInfo['ns'] ?? null );

				// processTitle() gives an array of two titles, or false
				if ( is_array( $titlePair ) ) {
					$this->pageCallback( $titlePair );
					list( $pageInfo['_title'], $foreignTitle ) = $titlePair;
				} else {
					$titleRejected = true;
					$skipSubtree = true;
				}
			}

			if ( $titlePair ) {
				if ( $name == 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
		} elseif ( $name != '#text' ) {
			$this->warn( "Unhandled page XML tag $name" );
			$skipSubtree = true;
		}
	}

	// '_title' and $foreignTitle are assigned together, and only when
	// processTitle() succeeded; so the presence of '_title' also
	// guarantees that $foreignTitle is defined here.
	if ( array_key_exists( '_title', $pageInfo ) ) {
		$this->pageOutCallback( $pageInfo['_title'], $foreignTitle,
			$pageInfo['revisionCount'],
			$pageInfo['successfulRevisionCount'],
			$pageInfo );
	}
}
816 
/**
 * Read a <revision> element up to its closing tag, then process it.
 *
 * Simple fields are stored as text, each <content> child is appended to a
 * list, and <contributor> is parsed by handleContributor(). Unhandled tags
 * are logged and skipped. Afterwards the page's revision count is
 * incremented, and so is its successful-revision count if
 * processRevision() returns a truthy value.
 *
 * @param array &$pageInfo Page data; its two revision counters are updated
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );
	$revisionInfo = [];

	$normalFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
		'model', 'format', 'text', 'sha1' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'revision' ) {
			break;
		}

		// The skip request only covers the unhandled node that raised it;
		// without this reset every later iteration would use next() too
		// (handlePage() resets its flag the same way).
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleRevisionXMLTag(
			$this, $pageInfo, $revisionInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$revisionInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'content' ) {
			// We can have multiple content tags, so make this an array.
			$revisionInfo[$tag][] = $this->handleContent();
		} elseif ( $tag == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			$skip = true;
		}
	}

	$pageInfo['revisionCount']++;
	if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
859 
/**
 * Read a <content> element up to its closing tag and return its fields.
 *
 * The text of the role, origin, model, format and text children is
 * collected; unhandled tags are logged and skipped.
 *
 * @return array Map of tag name to text content
 */
private function handleContent() {
	$this->debug( "Enter content handler" );
	$contentInfo = [];

	$normalFields = [ 'role', 'origin', 'model', 'format', 'text' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'content' ) {
			break;
		}

		// The skip request only covers the unhandled node that raised it;
		// without this reset every later iteration would use next() too
		// (handlePage() resets its flag the same way).
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleContentXMLTag(
			$this, $contentInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$contentInfo[$tag] = $this->nodeContents();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled content XML tag $tag" );
			$skip = true;
		}
	}

	return $contentInfo;
}
890 
/**
 * Turn the collected fields of a revision or content slot into a Content
 * object.
 *
 * @param Title $title Page the content belongs to; used to pick a default
 *   content model when none is given
 * @param int $revisionId Revision ID, only used in the error message
 * @param array $contentInfo Must contain 'text'; may contain 'role' and 'model'
 * @throws MWException If 'text' is missing, or if the text is larger than
 *   $wgMaxArticleSize kilobytes for one of the size-checked models
 * @return Content Result of the handler's unserializeContent()
 */
private function makeContent( Title $title, $revisionId, $contentInfo ) {
	global $wgMaxArticleSize;

	if ( !isset( $contentInfo['text'] ) ) {
		throw new MWException( 'Missing text field in import.' );
	}

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Only the models listed here are
	// checked, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	$sizeCheckedModels = [
		'wikitext',
		'css',
		'json',
		'javascript',
		'text',
		''
	];
	$isSizeChecked = !isset( $contentInfo['model'] ) ||
		in_array( $contentInfo['model'], $sizeCheckedModels );

	if ( $isSizeChecked && strlen( $contentInfo['text'] ) > $wgMaxArticleSize * 1024 ) {
		$subject = $revisionId ?
			"the revision with ID $revisionId" :
			'a revision';

		throw new MWException( 'The text of ' . $subject .
			" exceeds the maximum allowable size ($wgMaxArticleSize KB)" );
	}

	$role = $contentInfo['role'] ?? SlotRecord::MAIN;
	$model = $contentInfo['model'] ?? $this->getDefaultContentModel( $title, $role );
	$handler = $this->getContentHandler( $model );

	$transformed = $handler->importTransform( $contentInfo['text'] );

	return $handler->unserializeContent( $transformed );
}
938 
/**
 * Build a WikiRevision from the fields collected for a <revision> and pass
 * it to the revision callback.
 *
 * The main slot is created from the revision's own fields; every entry in
 * 'content' adds one more slot and must name its role. The timestamp
 * defaults to the current time. The author is the contributor's IP if
 * present, otherwise the prefixed username, otherwise a prefixed
 * 'Unknown user'.
 *
 * @param array $pageInfo Page data; '_title' must hold the local title
 * @param array $revisionInfo Fields gathered by handleRevision()
 * @throws MWException If an extra slot has no role, or makeContent() fails
 * @return mixed Return value of revisionCallback()
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	$revision = new WikiRevision( $this->config );

	$revId = $revisionInfo['id'] ?? 0;
	if ( $revId ) {
		$revision->setID( $revId );
	}

	$title = $pageInfo['_title'];
	$revision->setTitle( $title );

	// Main slot first, then any additional slots
	$mainContent = $this->makeContent( $title, $revId, $revisionInfo );
	$revision->setContent( SlotRecord::MAIN, $mainContent );

	$extraSlots = $revisionInfo['content'] ?? [];
	foreach ( $extraSlots as $slotInfo ) {
		if ( !isset( $slotInfo['role'] ) ) {
			throw new MWException( "Missing role for imported slot." );
		}

		$slotContent = $this->makeContent( $title, $revId, $slotInfo );
		$revision->setContent( $slotInfo['role'], $slotContent );
	}

	$revision->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$revision->setComment( $revisionInfo['comment'] );
	}

	// The mere presence of the <minor> tag marks the edit as minor
	if ( isset( $revisionInfo['minor'] ) ) {
		$revision->setMinor( true );
	}

	if ( isset( $revisionInfo['contributor']['ip'] ) ) {
		$revision->setUserIP( $revisionInfo['contributor']['ip'] );
	} elseif ( isset( $revisionInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix(
			$revisionInfo['contributor']['username']
		);
		$revision->setUsername( $prefixedName );
	} else {
		$revision->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	if ( isset( $revisionInfo['sha1'] ) ) {
		$revision->setSha1Base36( $revisionInfo['sha1'] );
	}

	$revision->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $revision );
}
992 
/**
 * Read an <upload> element up to its closing tag and, if upload import is
 * enabled, process it.
 *
 * Simple fields are stored as text and <contributor> is parsed by
 * handleContributor(). A base64-encoded <contents> child is decoded into a
 * temporary file. When an image base path is configured and the file named
 * by <rel> exists below it, that file is used as the source and replaces
 * any temporary file.
 *
 * @param array &$pageInfo Page data, passed to the hook and processUpload()
 * @return mixed Return value of processUpload(), or null when upload
 *   import is disabled
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );
	$uploadInfo = [];

	$normalFields = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'upload' ) {
			break;
		}

		// The skip request only covers the unhandled node that raised it;
		// without this reset every later iteration would use next() too
		// (handlePage() resets its flag the same way).
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$uploadInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag == 'contents' ) {
			// Read the attribute while the reader is still on the start
			// tag: nodeContents() reads forward to the element's end.
			$encoding = $this->reader->getAttribute( 'encoding' );
			$contents = $this->nodeContents();
			if ( $encoding === 'base64' ) {
				$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
				$uploadInfo['isTempSrc'] = true;
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$skip = true;
		}
	}

	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		// NOTE(review): 'rel' comes straight from the dump and is not
		// checked for '..' segments - confirm that callers only enable the
		// image base path for trusted dumps
		$path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
		if ( file_exists( $path ) ) {
			$uploadInfo['fileSrc'] = $path;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	if ( $this->mImportUploads ) {
		return $this->processUpload( $pageInfo, $uploadInfo );
	}
}
1045 
/**
 * Write data to a newly created temporary file below wfTempDir().
 *
 * @param string $contents Data to write
 * @return string Name of the temporary file, as returned by tempnam()
 */
private function dumpTemp( $contents ) {
	$tempFile = tempnam( wfTempDir(), 'importupload' );
	file_put_contents( $tempFile, $contents );

	return $tempFile;
}
1055 
/**
 * Build a WikiRevision from the fields collected for an <upload> and pass
 * it to the upload callback.
 *
 * Fields that are absent from the dump are passed to the setters as null
 * (size as 0) rather than raising undefined-index notices. As with the
 * other callback dispatchers, false is returned when no upload callback
 * is registered.
 *
 * @param array $pageInfo Page data; '_title' must hold the local title
 * @param array $uploadInfo Fields gathered by handleUpload()
 * @throws MWException If makeContent() rejects the upload's text
 * @return mixed The callback's return value, or false when none is registered
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$revision = new WikiRevision( $this->config );
	$revId = $pageInfo['id'] ?? null;
	$title = $pageInfo['_title'];
	$content = $this->makeContent( $title, $revId, $uploadInfo );

	$revision->setTitle( $title );
	$revision->setID( $revId );
	$revision->setTimestamp( $uploadInfo['timestamp'] ?? null );
	$revision->setContent( SlotRecord::MAIN, $content );
	$revision->setFilename( $uploadInfo['filename'] ?? null );
	if ( isset( $uploadInfo['archivename'] ) ) {
		$revision->setArchiveName( $uploadInfo['archivename'] );
	}
	$revision->setSrc( $uploadInfo['src'] ?? null );
	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$revision->setFileSrc( $uploadInfo['fileSrc'],
			!empty( $uploadInfo['isTempSrc'] ) );
	}
	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$revision->setSha1Base36( $uploadInfo['sha1base36'] );
	}
	$revision->setSize( intval( $uploadInfo['size'] ?? null ) );
	$revision->setComment( $uploadInfo['comment'] ?? null );

	if ( isset( $uploadInfo['contributor']['ip'] ) ) {
		$revision->setUserIP( $uploadInfo['contributor']['ip'] );
	}
	if ( isset( $uploadInfo['contributor']['username'] ) ) {
		$revision->setUsername(
			$this->externalUserNames->applyPrefix( $uploadInfo['contributor']['username'] )
		);
	}
	$revision->setNoUpdates( $this->mNoUpdates );

	// Same guard as revisionCallback() and logItemCallback()
	if ( !isset( $this->mUploadCallback ) ) {
		return false;
	}

	return call_user_func( $this->mUploadCallback, $revision );
}
1098 
/**
 * Read a <contributor> element up to its closing tag and return the text
 * of its id, ip and username children. Other children are ignored.
 *
 * @return array Map of tag name to text; empty for an empty element
 */
private function handleContributor() {
	$this->debug( "Enter contributor handler." );

	$wantedTags = [ 'id', 'ip', 'username' ];
	$contributor = [];

	if ( $this->reader->isEmptyElement ) {
		// Nothing to read inside a self-closing tag
		return $contributor;
	}

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'contributor';
		if ( $atClosingTag ) {
			break;
		}

		$name = $this->reader->localName;
		if ( in_array( $name, $wantedTags ) ) {
			$contributor[$name] = $this->nodeContents();
		}
	}

	return $contributor;
}
1125 
/**
 * Resolve a title from the dump into a local title and check that it may
 * be imported.
 *
 * The page is rejected, with a notice, when the title is invalid, is an
 * interwiki title, or cannot exist as a page. Outside command line mode
 * the current user must also be allowed to edit the page, and to create
 * it if it does not exist yet.
 *
 * @param string $text Title text taken from the dump
 * @param string|null $ns Namespace ID taken from the dump, if any
 * @return array|bool [ local title, foreign title ], or false if rejected
 */
private function processTitle( $text, $ns = null ) {
	// Without <siteinfo> namespace data only the naive factory can be used
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory()
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text,
		intval( $ns ) );

	$title = $this->importTitleFactory->createTitleFromForeignTitle(
		$foreignTitle );

	$commandLineMode = $this->config->get( 'CommandLineMode' );

	if ( $title === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}

	if ( $title->isExternal() ) {
		$this->notice( 'import-error-interwiki', $title->getPrefixedText() );
		return false;
	}

	if ( !$title->canExist() ) {
		$this->notice( 'import-error-special', $title->getPrefixedText() );
		return false;
	}

	if ( !$commandLineMode ) {
		$permissionManager = MediaWikiServices::getInstance()->getPermissionManager();
		$user = RequestContext::getMain()->getUser();

		if ( !$permissionManager->userCan( 'edit', $user, $title ) ) {
			# Do not import if the importing wiki user cannot edit this page
			$this->notice( 'import-error-edit', $title->getPrefixedText() );

			return false;
		}

		if ( !$title->exists() && !$permissionManager->userCan( 'create', $user, $title ) ) {
			# Do not import if the importing wiki user cannot create this page
			$this->notice( 'import-error-create', $title->getPrefixedText() );

			return false;
		}
	}

	return [ $title, $foreignTitle ];
}
1177 
/**
 * Look up the content handler for a content model through the content
 * handler factory service.
 *
 * @param string $model Content model ID
 * @return ContentHandler
 */
private function getContentHandler( $model ) {
	$factory = MediaWikiServices::getInstance()->getContentHandlerFactory();

	return $factory->getContentHandler( $model );
}
1187 
/**
 * Ask the slot role's handler which content model it uses by default on
 * the given title.
 *
 * @param Title $title
 * @param string $role Slot role name
 * @return string Content model ID
 */
private function getDefaultContentModel( $title, $role ) {
	$roleHandler = MediaWikiServices::getInstance()
		->getSlotRoleRegistry()
		->getRoleHandler( $role );

	return $roleHandler->getDefaultModel( $title );
}
1200 }
NaiveImportTitleFactory
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Definition: NaiveImportTitleFactory.php:34
WikiImporter\processRevision
processRevision( $pageInfo, $revisionInfo)
Definition: WikiImporter.php:945
WikiImporter\$mUploadCallback
$mUploadCallback
Definition: WikiImporter.php:41
WikiImporter
XML file reader for the page data importer.
Definition: WikiImporter.php:37
WikiImporter\makeContent
makeContent(Title $title, $revisionId, $contentInfo)
Definition: WikiImporter.php:899
Title\newFromText
static newFromText( $text, $defaultNamespace=NS_MAIN)
Create a new Title from text, such as what one would find in a link.
Definition: Title.php:361
$wgMaxArticleSize
$wgMaxArticleSize
Maximum article size in kilobytes.
Definition: DefaultSettings.php:2398
WikiImporter\setImageBasePath
setImageBasePath( $dir)
Definition: WikiImporter.php:309
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:166
wfSetVar
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest If source is NULL, it just returns the val...
Definition: GlobalFunctions.php:1560
UploadSourceAdapter\registerSource
static registerSource(ImportSource $source)
Definition: UploadSourceAdapter.php:48
WikiImporter\$mImportUploads
$mImportUploads
Definition: WikiImporter.php:44
NamespaceAwareForeignTitleFactory
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
Definition: NamespaceAwareForeignTitleFactory.php:25
WikiImporter\$mRevisionCallback
$mRevisionCallback
Definition: WikiImporter.php:41
WikiImporter\revisionCallback
revisionCallback( $revision)
Notify the callback function of a revision.
Definition: WikiImporter.php:502
WikiImporter\setNoticeCallback
setNoticeCallback( $callback)
Set a callback that displays notice messages.
Definition: WikiImporter.php:167
DeferredUpdates\addUpdate
static addUpdate(DeferrableUpdate $update, $stage=self::POSTSEND)
Add an update to the pending update queue for execution at the appropriate time.
Definition: DeferredUpdates.php:119
NaiveForeignTitleFactory
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
Definition: NaiveForeignTitleFactory.php:27
WikiImporter\$mPageOutCallback
$mPageOutCallback
Definition: WikiImporter.php:42
WikiImporter\setNoUpdates
setNoUpdates( $noupdates)
Set 'no updates' mode.
Definition: WikiImporter.php:147
WikiImporter\getReader
getReader()
Definition: WikiImporter.php:106
ExternalUserNames
Class to parse and build external user names.
Definition: ExternalUserNames.php:29
WikiImporter\processLogItem
processLogItem( $logInfo)
Definition: WikiImporter.php:696
WikiImporter\handleRevision
handleRevision(&$pageInfo)
Definition: WikiImporter.php:820
WikiImporter\setRevisionCallback
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
Definition: WikiImporter.php:202
wfMessage
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Definition: GlobalFunctions.php:1230
WikiImporter\handleContributor
handleContributor()
Definition: WikiImporter.php:1102
WikiImporter\setUsernamePrefix
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
Definition: WikiImporter.php:325
ImportReporter
Reporting callback.
Definition: ImportReporter.php:28
WikiImporter\$externalUserNames
ExternalUserNames $externalUserNames
Definition: WikiImporter.php:58
NamespaceImportTitleFactory
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Definition: NamespaceImportTitleFactory.php:28
WikiImporter\nodeContents
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
Definition: WikiImporter.php:542
WikiImporter\siteInfoCallback
siteInfoCallback( $siteInfo)
Notify the callback function of site info.
Definition: WikiImporter.php:463
NS_MAIN
const NS_MAIN
Definition: Defines.php:63
$debug
$debug
Definition: mcc.php:31
WikiImporter\getContentHandler
getContentHandler( $model)
Definition: WikiImporter.php:1182
Config
Interface for configuration instances.
Definition: Config.php:30
MWException
MediaWiki exception.
Definition: MWException.php:29
ImportTitleFactory
Represents an object that can convert page titles on a foreign wiki (ForeignTitle objects) into page ...
Definition: ImportTitleFactory.php:25
WikiImporter\$hookRunner
HookRunner $hookRunner
Definition: WikiImporter.php:52
WikiImporter\dumpTemp
dumpTemp( $contents)
Definition: WikiImporter.php:1050
WikiImporter\$countableCache
array $countableCache
Definition: WikiImporter.php:54
WikiImporter\pageOutCallback
pageOutCallback( $title, $foreignTitle, $revCount, $sucCount, $pageInfo)
Notify the callback function when a "</page>" is closed.
Definition: WikiImporter.php:490
WikiImporter\__construct
__construct(ImportSource $source, Config $config)
Creates an ImportXMLReader drawing from the source provided.
Definition: WikiImporter.php:66
MWContentSerializationException
Exception representing a failure to serialize or unserialize a content object.
Definition: MWContentSerializationException.php:8
WikiImporter\throwXmlError
throwXmlError( $err)
Definition: WikiImporter.php:110
SubpageImportTitleFactory
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Definition: SubpageImportTitleFactory.php:28
$title
$title
Definition: testCompression.php:38
SiteStatsUpdate\factory
static factory(array $deltas)
Definition: SiteStatsUpdate.php:71
WikiImporter\finishImportPage
finishImportPage( $title, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
Definition: WikiImporter.php:408
wfTimestampNow
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
Definition: GlobalFunctions.php:1860
WikiImporter\$importTitleFactory
ImportTitleFactory $importTitleFactory
Definition: WikiImporter.php:50
WikiImporter\processUpload
processUpload( $pageInfo, $uploadInfo)
Definition: WikiImporter.php:1061
WikiImporter\disableStatisticsUpdate
disableStatisticsUpdate()
Statistics updates can take a lot of time.
Definition: WikiImporter.php:333
WikiImporter\setImportUploads
setImportUploads( $import)
Definition: WikiImporter.php:316
WikiImporter\$mNoUpdates
$mNoUpdates
Definition: WikiImporter.php:45
wfDebug
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
Definition: GlobalFunctions.php:914
WikiImporter\$mPageCallback
$mPageCallback
Definition: WikiImporter.php:41
WikiImporter\$mSiteInfoCallback
$mSiteInfoCallback
Definition: WikiImporter.php:42
$content
$content
Definition: router.php:76
WikiImporter\beforeImportPage
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
Definition: WikiImporter.php:343
StatusValue\newGood
static newGood( $value=null)
Factory function for good results.
Definition: StatusValue.php:82
WikiImporter\importRevision
importRevision( $revision)
Default per-revision callback, performs the import.
Definition: WikiImporter.php:355
Hooks\runner
static runner()
Get a HookRunner instance for calling hooks using the new interfaces.
Definition: Hooks.php:172
WikiImporter\doImport
doImport()
Primary entry point.
Definition: WikiImporter.php:569
WikiImporter\processTitle
processTitle( $text, $ns=null)
Definition: WikiImporter.php:1131
WikiImporter\$mImageBasePath
$mImageBasePath
Definition: WikiImporter.php:44
WikiImporter\$foreignNamespaces
$foreignNamespaces
Definition: WikiImporter.php:40
WikiImporter\setDebug
setDebug( $debug)
Set debug mode...
Definition: WikiImporter.php:139
WikiImporter\notice
notice( $msg,... $params)
Definition: WikiImporter.php:125
RequestContext\getMain
static getMain()
Get the RequestContext object associated with the main request.
Definition: RequestContext.php:454
WikiImporter\handleUpload
handleUpload(&$pageInfo)
Definition: WikiImporter.php:997
WikiImporter\setUploadCallback
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
Definition: WikiImporter.php:213
WikiImporter\warn
warn( $data)
Definition: WikiImporter.php:121
WikiImporter\handleSiteInfo
handleSiteInfo()
Definition: WikiImporter.php:635
WikiImporter\setTargetRootPage
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
Definition: WikiImporter.php:277
Title
Represents a title within MediaWiki.
Definition: Title.php:46
WikiImporter\$mNoticeCallback
$mNoticeCallback
Definition: WikiImporter.php:43
wfTempDir
wfTempDir()
Tries to get the system directory for temporary files.
Definition: GlobalFunctions.php:1894
WikiRevision
Represents a revision, log entry or upload during the import process.
Definition: WikiRevision.php:39
WikiImporter\debug
debug( $data)
Definition: WikiImporter.php:115
WikiImporter\importLogItem
importLogItem( $revision)
Default per-log-item callback, performs the import.
Definition: WikiImporter.php:384
WikiImporter\handleContent
handleContent()
Definition: WikiImporter.php:860
WikiImporter\importUpload
importUpload( $revision)
Dummy for now...
Definition: WikiImporter.php:393
WikiImporter\setPageOffset
setPageOffset( $nthPage)
Sets 'pageOffset' value.
Definition: WikiImporter.php:157
WikiImporter\handleLogItem
handleLogItem()
Definition: WikiImporter.php:662
ImportSource
Source interface for XML import.
Definition: ImportSource.php:32
$path
$path
Definition: NoLocalSettings.php:25
WikiImporter\$pageOffset
$pageOffset
Definition: WikiImporter.php:46
WikiImporter\$config
Config $config
Definition: WikiImporter.php:48
WikiImporter\setSiteInfoCallback
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
Definition: WikiImporter.php:235
WikiImporter\getDefaultContentModel
getDefaultContentModel( $title, $role)
Definition: WikiImporter.php:1194
WikiImporter\setPageOutCallback
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
Definition: WikiImporter.php:191
WikiImporter\debugRevisionHandler
debugRevisionHandler(&$revision)
Alternate per-revision callback, for debugging.
Definition: WikiImporter.php:445
WikiImporter\$disableStatisticsUpdate
bool $disableStatisticsUpdate
Definition: WikiImporter.php:56
$source
$source
Definition: mwdoc-filter.php:34
WikiImporter\$reader
XMLReader $reader
Definition: WikiImporter.php:39
WikiImporter\handlePage
handlePage()
Definition: WikiImporter.php:737
WikiImporter\nodeAttribute
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
Definition: WikiImporter.php:531
MediaWiki\HookContainer\HookRunner
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
Definition: HookRunner.php:571
WikiImporter\setPageCallback
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
Definition: WikiImporter.php:176
WikiImporter\setImportTitleFactory
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
Definition: WikiImporter.php:246
WikiImporter\setTargetNamespace
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
Definition: WikiImporter.php:255
WikiImporter\logItemCallback
logItemCallback( $revision)
Notify the callback function of a new log item.
Definition: WikiImporter.php:516
WikiImporter\pageCallback
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
Definition: WikiImporter.php:476
WikiImporter\$mLogItemCallback
$mLogItemCallback
Definition: WikiImporter.php:41
WikiImporter\$mDebug
$mDebug
Definition: WikiImporter.php:43
Revision\SlotRecord
Value object representing a content slot associated with a page revision.
Definition: SlotRecord.php:40
WikiImporter\setLogItemCallback
setLogItemCallback( $callback)
Sets the action to perform as each log item is reached.
Definition: WikiImporter.php:224
$type
$type
Definition: testCompression.php:52