MediaWiki  master
WikiImporter.php
Go to the documentation of this file.
1 <?php
40 use Wikimedia\AtEase\AtEase;
41 use Wikimedia\NormalizedException\NormalizedException;
42 
49 class WikiImporter {
51  private $reader;
52 
54  private $sourceAdapterId;
55 
57  private $foreignNamespaces = null;
58 
60  private $mLogItemCallback;
61 
63  private $mUploadCallback;
64 
66  private $mRevisionCallback;
67 
69  private $mPageCallback;
70 
72  private $mSiteInfoCallback;
73 
75  private $mPageOutCallback;
76 
78  private $mNoticeCallback;
79 
81  private $mDebug;
82 
84  private $mImportUploads;
85 
87  private $mImageBasePath;
88 
90  private $mNoUpdates = false;
91 
93  private $pageOffset = 0;
94 
96  private $config;
97 
99  private $importTitleFactory;
100 
102  private $hookRunner;
103 
105  private $countableCache = [];
106 
108  private $disableStatisticsUpdate = false;
109 
111  private $externalUserNames;
112 
114  private $contentLanguage;
115 
117  private $namespaceInfo;
118 
120  private $titleFactory;
121 
123  private $wikiPageFactory;
124 
126  private $uploadRevisionImporter;
127 
129  private $permissionManager;
130 
132  private $contentHandlerFactory;
133 
135  private $slotRoleRegistry;
136 
/**
 * Creates an importer that reads XML from the given import source.
 *
 * Registers the 'uploadsource' stream wrapper if it is not registered yet,
 * registers $source with UploadSourceAdapter, opens the XML reader on it,
 * installs the default import callbacks and a NaiveImportTitleFactory, and
 * sets the default external user name prefix to 'imported'.
 *
 * @param ImportSource $source Source the dump is read from
 * @param Config $config
 * @param HookContainer $hookContainer
 * @param Language $contentLanguage
 * @param NamespaceInfo $namespaceInfo
 * @param TitleFactory $titleFactory
 * @param WikiPageFactory $wikiPageFactory
 * @param UploadRevisionImporter $uploadRevisionImporter
 * @param PermissionManager $permissionManager
 * @param IContentHandlerFactory $contentHandlerFactory
 * @param SlotRoleRegistry $slotRoleRegistry
 * @throws MWException If the XML reader cannot be opened (see openReader())
 */
public function __construct(
	ImportSource $source,
	Config $config,
	HookContainer $hookContainer,
	Language $contentLanguage,
	NamespaceInfo $namespaceInfo,
	TitleFactory $titleFactory,
	WikiPageFactory $wikiPageFactory,
	UploadRevisionImporter $uploadRevisionImporter,
	PermissionManager $permissionManager,
	IContentHandlerFactory $contentHandlerFactory,
	SlotRoleRegistry $slotRoleRegistry
) {
	$this->reader = new XMLReader();
	$this->config = $config;
	$this->hookRunner = new HookRunner( $hookContainer );
	$this->contentLanguage = $contentLanguage;
	$this->namespaceInfo = $namespaceInfo;
	$this->titleFactory = $titleFactory;
	$this->wikiPageFactory = $wikiPageFactory;
	$this->uploadRevisionImporter = $uploadRevisionImporter;
	$this->permissionManager = $permissionManager;
	$this->contentHandlerFactory = $contentHandlerFactory;
	$this->slotRoleRegistry = $slotRoleRegistry;

	// The reader is opened on an "uploadsource://<id>" URL, so the wrapper
	// must exist before the source is registered and opened.
	if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
		stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
	}
	$this->sourceAdapterId = UploadSourceAdapter::registerSource( $source );

	$this->openReader();

	// Default callbacks
	$this->setPageCallback( [ $this, 'beforeImportPage' ] );
	$this->setRevisionCallback( [ $this, "importRevision" ] );
	$this->setUploadCallback( [ $this, 'importUpload' ] );
	$this->setLogItemCallback( [ $this, 'importLogItem' ] );
	$this->setPageOutCallback( [ $this, 'finishImportPage' ] );

	$this->importTitleFactory = new NaiveImportTitleFactory(
		$this->contentLanguage,
		$this->namespaceInfo,
		$this->titleFactory
	);
	$this->externalUserNames = new ExternalUserNames( 'imported', false );
}
198 
/**
 * Gives access to the XML reader this importer is reading from.
 *
 * @return XMLReader The reader created in the constructor
 */
public function getReader() {
	return $this->reader;
}
205 
/**
 * Records an XML error in the debug log.
 *
 * Despite its name this method does not throw; it only logs, once through
 * debug() (import debugging must be enabled) and once unconditionally.
 *
 * @param string $err Error text to log
 */
public function throwXmlError( $err ) {
	$this->debug( 'FAILURE: ' . $err );
	wfDebug( 'WikiImporter XML error: ' . $err );
}
213 
/**
 * Writes an import debug line, but only while import debugging is enabled
 * (see setDebug()).
 *
 * @param string $data Text to log
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}

	wfDebug( "IMPORT: $data" );
}
222 
/**
 * Writes an import warning to the debug log, regardless of the debug flag.
 *
 * @param string $data Text to log
 */
public function warn( $data ) {
	wfDebug( "IMPORT: $data" );
}
229 
/**
 * Reports a notice about the import.
 *
 * When a callable notice callback is set it receives the message key and
 * the parameter list; otherwise the rendered message goes to the debug log.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters
 */
public function notice( $msg, ...$params ) {
	if ( !is_callable( $this->mNoticeCallback ) ) {
		// No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		wfDebug( wfMessage( $msg, $params )->text() );
		return;
	}

	call_user_func( $this->mNoticeCallback, $msg, $params );
}
243 
/**
 * Switches import debug logging on or off (consulted by debug()).
 *
 * @param bool $debug
 */
public function setDebug( $debug ) {
	$this->mDebug = $debug;
}
251 
/**
 * Stores the "no updates" flag; it is handed to every WikiRevision this
 * importer builds via WikiRevision::setNoUpdates().
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
259 
/**
 * Sets the page offset used by doImport(): top-level nodes are skipped
 * until the count of <page> tags seen reaches this value.
 *
 * @param int $nthPage
 */
public function setPageOffset( $nthPage ) {
	$this->pageOffset = $nthPage;
}
269 
/**
 * Installs the callback used by notice() and returns the previous one.
 *
 * Delegates to wfSetVar(), so the stored callback is replaced by $callback
 * and the former value is handed back to the caller.
 *
 * @param callable $callback
 * @return callable|null Previously installed callback
 */
public function setNoticeCallback( $callback ) {
	$previous = wfSetVar( $this->mNoticeCallback, $callback );

	return $previous;
}
279 
/**
 * Replaces the callback invoked by pageCallback() and returns the old one.
 *
 * @param callable $callback
 * @return callable|null Previously installed callback
 */
public function setPageCallback( $callback ) {
	[ $previous, $this->mPageCallback ] = [ $this->mPageCallback, $callback ];

	return $previous;
}
290 
/**
 * Replaces the callback invoked by pageOutCallback() and returns the old one.
 *
 * @param callable $callback
 * @return callable|null Previously installed callback
 */
public function setPageOutCallback( $callback ) {
	[ $previous, $this->mPageOutCallback ] = [ $this->mPageOutCallback, $callback ];

	return $previous;
}
305 
/**
 * Replaces the callback invoked by revisionCallback() and returns the old one.
 *
 * @param callable $callback
 * @return callable|null Previously installed callback
 */
public function setRevisionCallback( $callback ) {
	[ $previous, $this->mRevisionCallback ] = [ $this->mRevisionCallback, $callback ];

	return $previous;
}
316 
/**
 * Replaces the callback invoked by processUpload() and returns the old one.
 *
 * @param callable $callback
 * @return callable|null Previously installed callback
 */
public function setUploadCallback( $callback ) {
	[ $previous, $this->mUploadCallback ] = [ $this->mUploadCallback, $callback ];

	return $previous;
}
327 
/**
 * Replaces the callback invoked by logItemCallback() and returns the old one.
 *
 * @param callable $callback
 * @return callable|null Previously installed callback
 */
public function setLogItemCallback( $callback ) {
	[ $previous, $this->mLogItemCallback ] = [ $this->mLogItemCallback, $callback ];

	return $previous;
}
338 
/**
 * Replaces the callback invoked by siteInfoCallback() and returns the old one.
 *
 * @param callable $callback
 * @return callable|null Previously installed callback
 */
public function setSiteInfoCallback( $callback ) {
	[ $previous, $this->mSiteInfoCallback ] = [ $this->mSiteInfoCallback, $callback ];

	return $previous;
}
349 
/**
 * Sets the factory that processTitle() uses to turn a foreign title into a
 * local title.
 *
 * @param ImportTitleFactory $factory
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
358 
/**
 * Chooses the namespace that imported pages are placed in.
 *
 * Passing null installs a NaiveImportTitleFactory, so namespaces are not
 * overridden. Passing a non-negative value that NamespaceInfo reports as
 * existing installs a NamespaceImportTitleFactory for that namespace.
 *
 * @param null|int|string $namespace Namespace ID, or null for no override
 * @return bool True if a title factory was installed, false if $namespace
 *  was rejected
 */
public function setTargetNamespace( $namespace ) {
	if ( $namespace === null ) {
		// Don't override namespaces
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$this->namespaceInfo,
				$this->titleFactory
			)
		);
		return true;
	} elseif (
		$namespace >= 0 &&
		$this->namespaceInfo->exists( intval( $namespace ) )
	) {
		$namespace = intval( $namespace );
		$this->setImportTitleFactory(
			new NamespaceImportTitleFactory(
				$this->namespaceInfo,
				$this->titleFactory,
				$namespace
			)
		);
		return true;
	} else {
		return false;
	}
}
392 
/**
 * Chooses a root page under which imported pages are created as subpages.
 *
 * Null installs a NaiveImportTitleFactory (no root page). An empty string
 * leaves the current title factory untouched. Any other value is parsed as
 * a title; it is rejected when it is invalid, external, or in a namespace
 * without subpages.
 *
 * @param null|string $rootpage Root page title, or null for none
 * @return Status Good on success; fatal with 'import-rootpage-invalid' or
 *  'import-rootpage-nosubpage' when the root page is rejected
 */
public function setTargetRootPage( $rootpage ) {
	$status = Status::newGood();
	$nsInfo = $this->namespaceInfo;
	if ( $rootpage === null ) {
		// No rootpage
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$nsInfo,
				$this->titleFactory
			)
		);
	} elseif ( $rootpage !== '' ) {
		$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
		$title = Title::newFromText( $rootpage );

		if ( !$title || $title->isExternal() ) {
			$status->fatal( 'import-rootpage-invalid' );
		} elseif ( !$nsInfo->hasSubpages( $title->getNamespace() ) ) {
			$displayNSText = $title->getNamespace() === NS_MAIN
				? wfMessage( 'blanknamespace' )->text()
				: $this->contentLanguage->getNsText( $title->getNamespace() );
			$status->fatal( 'import-rootpage-nosubpage', $displayNSText );
		} else {
			// set namespace to 'all', so the namespace check in processTitle() can pass
			$this->setTargetNamespace( null );
			$this->setImportTitleFactory(
				new SubpageImportTitleFactory(
					$nsInfo,
					$this->titleFactory,
					$title
				)
			);
		}
	}
	return $status;
}
435 
/**
 * Sets the directory that handleUpload() prefixes to an upload's 'rel'
 * value when looking for the file on disk.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
442 
/**
 * Controls whether handleUpload() passes parsed uploads on to
 * processUpload().
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
449 
/**
 * Replaces the ExternalUserNames helper used to prefix imported user names.
 *
 * @param string $usernamePrefix Passed through to ExternalUserNames
 * @param bool $assignKnownUsers Passed through to ExternalUserNames
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$this->externalUserNames = new ExternalUserNames(
		$usernamePrefix,
		$assignKnownUsers
	);
}
458 
/**
 * Turns off the article-count adjustment that finishImportPage() otherwise
 * performs for each imported page.
 */
public function disableStatisticsUpdate() {
	$this->disableStatisticsUpdate = true;
}
466 
/**
 * Default per-page callback: remembers whether the target page is countable
 * before any of its revisions are imported.
 *
 * @param array $titleAndForeignTitle Two-element array whose first element
 *  is the local Title (only that element is used here)
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$title = $titleAndForeignTitle[0];
	$page = $this->wikiPageFactory->newFromTitle( $title );
	// finishImportPage() looks this value up under
	// 'title_' . CacheKeyHelper::getKeyForPage(), so it must be stored under
	// that same key. Keying by prefixed text never matched, which meant the
	// article count was never adjusted.
	$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $title );
	$this->countableCache[$countKey] = $page->isCountable();
	return true;
}
479 
/**
 * Default per-revision callback: imports one revision.
 *
 * Emits an 'import-error-bad-location' notice when the revision's content
 * handler cannot be used on its title, and an 'import-error-unserialize'
 * notice when importing raises MWContentSerializationException.
 *
 * @param WikiRevision $revision
 * @return bool Result of WikiRevision::importOldRevision(), or false when
 *  one of the notices above was emitted
 */
public function importRevision( $revision ) {
	$usableHere = $revision->getContentHandler()->canBeUsedOn( $revision->getTitle() );

	if ( $usableHere ) {
		try {
			return $revision->importOldRevision();
		} catch ( MWContentSerializationException $ex ) {
			$errorKey = 'import-error-unserialize';
		}
	} else {
		$errorKey = 'import-error-bad-location';
	}

	// Both failure paths report the same details about the revision.
	$this->notice(
		$errorKey,
		$revision->getTitle()->getPrefixedText(),
		$revision->getID(),
		$revision->getModel(),
		$revision->getFormat()
	);

	return false;
}
510 
/**
 * Default log-item callback: delegates to WikiRevision::importLogItem().
 *
 * @param WikiRevision $revision
 * @return mixed Whatever WikiRevision::importLogItem() returns
 */
public function importLogItem( $revision ) {
	$result = $revision->importLogItem();

	return $result;
}
519 
/**
 * Default upload callback: hands the revision to the upload revision
 * importer.
 *
 * @param WikiRevision $revision
 * @return bool Whether the status returned by the importer is good
 */
public function importUpload( $revision ) {
	return $this->uploadRevisionImporter->import( $revision )->isGood();
}
529 
/**
 * Default page-out callback: adjusts the article count after a page has
 * been imported, then runs the AfterImportPage hook.
 *
 * @param PageIdentity $pageIdentity Page that was imported
 * @param ForeignTitle $foreignTitle Title of the page on the source wiki
 * @param int $revCount Number of revisions seen for the page
 * @param int $sRevCount Number of revisions imported successfully
 * @param array $pageInfo Data collected from the <page> element
 * @return bool Result of the AfterImportPage hook
 */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$page = $this->wikiPageFactory->newFromTitle( $pageIdentity );

		$page->loadPageData( WikiPage::READ_LATEST );
		$rev = $page->getRevisionRecord();
		if ( $rev === null ) {

			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
				' because WikiPage::getRevisionRecord() returned null' );
		} else {
			$user = RequestContext::getMain()->getUser();
			$update = $page->newPageUpdater( $user )->prepareUpdate();
			$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $pageIdentity );
			$countable = $update->isCountable();
			// Only queue an update when the countable state actually changed;
			// the delta is +1 or -1 depending on the direction of the change.
			if ( array_key_exists( $countKey, $this->countableCache ) &&
				$countable != $this->countableCache[$countKey] ) {
				DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
					'articles' => ( (int)$countable - (int)$this->countableCache[$countKey] )
				] ) );
			}
		}
	}

	$title = Title::castFromPageIdentity( $pageIdentity );
	// @phan-suppress-next-line PhanTypeMismatchArgumentNullable castFrom does not return null here
	return $this->hookRunner->onAfterImportPage( $title, $foreignTitle,
		$revCount, $sRevCount, $pageInfo );
}
576 
/**
 * Passes the collected site info to the site-info callback, if one is set.
 *
 * @param array $siteInfo
 * @return mixed Callback result, or false when no callback is set
 */
private function siteInfoCallback( $siteInfo ) {
	if ( !isset( $this->mSiteInfoCallback ) ) {
		return false;
	}

	$arguments = [ $siteInfo, $this ];

	return call_user_func_array( $this->mSiteInfoCallback, $arguments );
}
592 
/**
 * Notifies the page callback, if one is set, that a page is about to be
 * imported. The callback's return value is ignored.
 *
 * @param array $title Value passed straight to the callback; handlePage()
 *  passes the array returned by processTitle()
 */
public function pageCallback( $title ) {
	if ( !isset( $this->mPageCallback ) ) {
		return;
	}

	call_user_func( $this->mPageCallback, $title );
}
602 
/**
 * Notifies the page-out callback, if one is set, that a page has been
 * fully processed. All arguments received are forwarded unchanged and the
 * callback's return value is ignored.
 *
 * @param PageIdentity $pageIdentity
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sucCount
 * @param array $pageInfo
 */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sucCount, $pageInfo ) {
	if ( !isset( $this->mPageOutCallback ) ) {
		return;
	}

	$forwarded = func_get_args();
	call_user_func_array( $this->mPageOutCallback, $forwarded );
}
617 
/**
 * Passes a parsed revision to the revision callback, if one is set.
 *
 * @param WikiRevision $revision
 * @return mixed Callback result, or false when no callback is set
 */
private function revisionCallback( $revision ) {
	if ( !isset( $this->mRevisionCallback ) ) {
		return false;
	}

	$arguments = [ $revision, $this ];

	return call_user_func_array( $this->mRevisionCallback, $arguments );
}
633 
/**
 * Passes a parsed log item to the log-item callback, if one is set.
 *
 * @param WikiRevision $revision
 * @return mixed Callback result, or false when no callback is set
 */
private function logItemCallback( $revision ) {
	if ( !isset( $this->mLogItemCallback ) ) {
		return false;
	}

	$arguments = [ $revision, $this ];

	return call_user_func_array( $this->mLogItemCallback, $arguments );
}
649 
/**
 * Reads an attribute of the node the reader is currently positioned on.
 *
 * @param string $attr Attribute name
 * @return string Attribute value, or '' when the reader returns null for it
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );

	return $value ?? '';
}
659 
/**
 * Collects the text of the element the reader is positioned on.
 *
 * Advances the reader, concatenating text, CDATA and significant-whitespace
 * nodes, until the first end-element node is reached. If the input ends
 * before that, the reader is closed and '' is returned.
 *
 * @return string
 */
public function nodeContents() {
	$reader = $this->reader;
	if ( $reader->isEmptyElement ) {
		return "";
	}

	$textNodeTypes = [
		XMLReader::TEXT,
		XMLReader::CDATA,
		XMLReader::SIGNIFICANT_WHITESPACE,
	];

	$collected = "";
	while ( $reader->read() ) {
		$nodeType = $reader->nodeType;
		if ( $nodeType === XMLReader::END_ELEMENT ) {
			return $collected;
		}
		if ( in_array( $nodeType, $textNodeTypes, true ) ) {
			$collected .= $reader->value;
		}
	}

	// Ran out of input without seeing an end element.
	$reader->close();
	return '';
}
687 
/**
 * Runs the import: checks the XML syntax, verifies the <mediawiki> root
 * element and dispatches each top-level node to its handler.
 *
 * The reader is closed and the previous entity-loader state restored when
 * the method leaves, whether normally or through an exception.
 *
 * @return bool Always true when no exception is thrown
 * @throws NormalizedException If the root is not <mediawiki> and libxml
 *  reports an error
 * @throws MWException If the root is not <mediawiki> and libxml reports no
 *  error (also propagated from syntaxCheckXML() and the handlers)
 */
public function doImport() {
	$this->syntaxCheckXML();

	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (T48932).
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$previousLoaderState = @libxml_disable_entity_loader( true );
	try {
		$this->reader->read();

		if ( $this->reader->localName != 'mediawiki' ) {
			// phpcs:ignore Generic.PHP.NoSilencedErrors
			@libxml_disable_entity_loader( $previousLoaderState );
			$xmlError = libxml_get_last_error();
			if ( !$xmlError ) {
				throw new MWException( "Expected <mediawiki> tag, got " .
					$this->reader->localName );
			}
			throw new NormalizedException( "XML error at line {line}: {message}", [
				'line' => $xmlError->line,
				'message' => $xmlError->message,
			] );
		}
		$this->debug( "<mediawiki> tag is correct." );

		$this->debug( "Starting primary dump processing loop." );

		$pagesSeen = 0;
		$more = $this->reader->read();
		while ( $more ) {
			$tag = $this->reader->localName;

			// Honour the page offset: skip whole nodes until enough <page>
			// tags have been counted.
			if ( $this->pageOffset ) {
				if ( $tag === 'page' ) {
					$pagesSeen++;
				}
				if ( $pagesSeen < $this->pageOffset ) {
					$more = $this->reader->next();
					continue;
				}
			}
			$nodeType = $this->reader->nodeType;

			$skipSubtree = false;
			if ( !$this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
				// Do nothing
			} elseif ( $tag == 'mediawiki' && $nodeType == XMLReader::END_ELEMENT ) {
				break;
			} elseif ( $tag == 'siteinfo' ) {
				$this->handleSiteInfo();
			} elseif ( $tag == 'page' ) {
				$this->handlePage();
			} elseif ( $tag == 'logitem' ) {
				$this->handleLogItem();
			} elseif ( $tag != '#text' ) {
				$this->warn( "Unhandled top-level XML tag $tag" );

				$skipSubtree = true;
			}

			if ( $skipSubtree ) {
				// next() advances without descending into the unhandled node.
				$more = $this->reader->next();
				$this->debug( "Skip" );
			} else {
				$more = $this->reader->read();
			}
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $previousLoaderState );
		$this->reader->close();
	}

	return true;
}
771 
/**
 * Parses a <siteinfo> element and hands the result to the site-info
 * callback.
 *
 * Namespace names are stored in $this->foreignNamespaces keyed by the
 * element's 'key' attribute; they are also passed on as '_namespaces'.
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );

	// Fields that can just be stuffed in the siteInfo object
	$plainFields = [ 'sitename', 'base', 'generator', 'case' ];
	$siteInfo = [];

	while ( $this->reader->read() ) {
		$atSiteInfoEnd = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'siteinfo';
		if ( $atSiteInfoEnd ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( $tag == 'namespace' ) {
			// Read the attribute first: nodeContents() moves the reader on.
			$nsKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$nsKey] = $this->nodeContents();
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$siteInfo[$tag] = $this->nodeContents();
		}
	}

	$siteInfo['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $siteInfo );
}
798 
/**
 * Parses a <logitem> element and passes the collected fields to
 * processLogItem().
 *
 * The ImportHandleLogItemXMLTag hook runs for every node; when it returns
 * false the node is not processed here.
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );

	// Fields that can just be stuffed in the pageInfo object
	$plainFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];
	$logInfo = [];

	while ( $this->reader->read() ) {
		$atLogItemEnd = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'logitem';
		if ( $atLogItemEnd ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$logInfo[$tag] = $this->nodeContents();
			continue;
		}

		if ( $tag == 'contributor' ) {
			$logInfo['contributor'] = $this->handleContributor();
			continue;
		}

		if ( $tag != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $logInfo );
}
828 
/**
 * Builds a WikiRevision from parsed log-item data and passes it to the
 * log-item callback.
 *
 * 'type' and 'action' are read unconditionally; every other field is only
 * applied when present. The contributor falls back from user name to IP to
 * a prefixed 'Unknown user'.
 *
 * @param array $logInfo Fields collected by handleLogItem()
 * @return mixed Result of logItemCallback()
 */
private function processLogItem( $logInfo ) {
	$logEntry = new WikiRevision();

	if ( isset( $logInfo['id'] ) ) {
		$logEntry->setID( $logInfo['id'] );
	}

	$logEntry->setType( $logInfo['type'] );
	$logEntry->setAction( $logInfo['action'] );

	if ( isset( $logInfo['timestamp'] ) ) {
		$logEntry->setTimestamp( $logInfo['timestamp'] );
	}

	if ( isset( $logInfo['params'] ) ) {
		$logEntry->setParams( $logInfo['params'] );
	}

	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$logTitle = Title::newFromText( $logInfo['logtitle'] );
		$logEntry->setTitle( $logTitle );
	}

	$logEntry->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$logEntry->setComment( $logInfo['comment'] );
	}

	$names = $this->externalUserNames;
	if ( isset( $logInfo['contributor']['username'] ) ) {
		$logEntry->setUsername( $names->applyPrefix( $logInfo['contributor']['username'] ) );
	} elseif ( isset( $logInfo['contributor']['ip'] ) ) {
		$logEntry->setUserIP( $logInfo['contributor']['ip'] );
	} else {
		$logEntry->setUsername( $names->addPrefix( 'Unknown user' ) );
	}

	return $this->logItemCallback( $logEntry );
}
871 
/**
 * Parses a <page> element: collects the page-level fields, resolves the
 * title when the first <revision> or <upload> is met, dispatches each
 * revision/upload to its handler and finally fires the page-out callback.
 *
 * If the title cannot be resolved, every remaining child of the page is
 * skipped and no page-out callback is fired.
 */
private function handlePage() {
	// Handle page data.
	$this->debug( "Enter page handler." );
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Fields that can just be stuffed in the pageInfo object
	$plainFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	$badTitle = false;
	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		$atPageEnd = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'page';
		if ( $atPageEnd ) {
			break;
		}

		$skip = false;

		$tag = $this->reader->localName;

		if ( $badTitle ) {
			// The title is invalid, bail out of this page
			$skip = true;
			continue;
		}

		if ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			// An XML snippet:
			// <page>
			// <id>123</id>
			// <title>Page</title>
			// <redirect title="NewTitle"/>
			// ...
			// Because the redirect tag is built differently, we need special handling for that case.
			$pageInfo[$tag] = $tag == 'redirect'
				? $this->nodeAttribute( 'title' )
				: $this->nodeContents();
			continue;
		}

		if ( $tag == 'revision' || $tag == 'upload' ) {
			if ( !isset( $titlePair ) ) {
				$titlePair = $this->processTitle( $pageInfo['title'],
					$pageInfo['ns'] ?? null );

				// $titlePair is either an array of two titles or false.
				if ( is_array( $titlePair ) ) {
					$this->pageCallback( $titlePair );
					[ $pageInfo['_title'], $foreignTitle ] = $titlePair;
				} else {
					$badTitle = true;
					$skip = true;
				}
			}

			if ( $titlePair ) {
				if ( $tag == 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
			continue;
		}

		if ( $tag != '#text' ) {
			$this->warn( "Unhandled page XML tag $tag" );
			$skip = true;
		}
	}

	// @note $pageInfo is only set if a valid title is processed above with
	// no error. If we have a valid title, then pageCallback is called
	// above, $pageInfo['title'] is set and we do pageOutCallback here.
	// If $pageInfo['_title'] is not set, then $foreignTitle is also not
	// set since they both come from the processTitle() result above.
	if ( array_key_exists( '_title', $pageInfo ) ) {
		$localTitle = $pageInfo['_title'];
		$this->pageOutCallback(
			$localTitle,
			// @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
			$foreignTitle,
			$pageInfo['revisionCount'],
			$pageInfo['successfulRevisionCount'],
			$pageInfo
		);
	}
}
957 
/**
 * Parses a <revision> element, then processes it and updates the revision
 * counters in $pageInfo.
 *
 * @param array &$pageInfo Page data; 'revisionCount' is always incremented
 *  and 'successfulRevisionCount' is incremented when processRevision()
 *  returns a truthy value
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );

	$plainFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
		'model', 'format', 'text', 'sha1' ];
	$revisionInfo = [];

	// NOTE(review): unlike handlePage(), $skip is never reset to false inside
	// the loop, so after one unhandled tag all further advancing uses next().
	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		$atRevisionEnd = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'revision';
		if ( $atRevisionEnd ) {
			break;
		}

		$tag = $this->reader->localName;

		$proceed = $this->hookRunner->onImportHandleRevisionXMLTag(
			$this, $pageInfo, $revisionInfo );
		if ( !$proceed ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$revisionInfo[$tag] = $this->nodeContents();
			continue;
		}

		if ( $tag == 'content' ) {
			// We can have multiple content tags, so make this an array.
			$revisionInfo[$tag][] = $this->handleContent();
			continue;
		}

		if ( $tag == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
			continue;
		}

		if ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			$skip = true;
		}
	}

	$pageInfo['revisionCount']++;
	$imported = $this->processRevision( $pageInfo, $revisionInfo );
	if ( $imported ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
1000 
/**
 * Parses a <content> element (one slot of a revision).
 *
 * @return array Collected fields, a subset of 'role', 'origin', 'model',
 *  'format' and 'text' (plus anything a hook handler added)
 */
private function handleContent() {
	$this->debug( "Enter content handler" );

	$plainFields = [ 'role', 'origin', 'model', 'format', 'text' ];
	$contentInfo = [];

	// NOTE(review): $skip is never reset to false inside the loop.
	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		$atContentEnd = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'content';
		if ( $atContentEnd ) {
			break;
		}

		$tag = $this->reader->localName;

		$proceed = $this->hookRunner->onImportHandleContentXMLTag(
			$this, $contentInfo );
		if ( !$proceed ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$contentInfo[$tag] = $this->nodeContents();
			continue;
		}

		if ( $tag != '#text' ) {
			$this->warn( "Unhandled content XML tag $tag" );
			$skip = true;
		}
	}

	return $contentInfo;
}
1031 
/**
 * Turns parsed slot data into a content object.
 *
 * @param Title $title Page the content belongs to; used to pick the default
 *  content model when none is given
 * @param int $revisionId Revision ID, only used in the error message
 * @param array $contentInfo Must contain 'text'; may contain 'role' and
 *  'model'
 * @return Content Result of the handler's unserializeContent()
 * @throws MWException If 'text' is missing, or if the text of one of the
 *  size-checked models exceeds the configured maximum article size
 */
private function makeContent( Title $title, $revisionId, $contentInfo ) {
	$maxArticleSize = MediaWikiServices::getInstance()->getMainConfig()->get(
		MainConfigNames::MaxArticleSize );

	if ( !isset( $contentInfo['text'] ) ) {
		throw new MWException( 'Missing text field in import.' );
	}

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Testing for revisions with only listed
	// content models, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	$sizeCheckedModels = [
		'wikitext',
		'css',
		'json',
		'javascript',
		'text',
		''
	];
	$isSizeChecked = !isset( $contentInfo['model'] )
		|| in_array( $contentInfo['model'], $sizeCheckedModels );

	if ( $isSizeChecked && strlen( $contentInfo['text'] ) > $maxArticleSize * 1024 ) {
		$subject = $revisionId ? "the revision with ID $revisionId" : 'a revision';
		throw new MWException( 'The text of ' . $subject .
			" exceeds the maximum allowable size ({$maxArticleSize} KiB)" );
	}

	$role = $contentInfo['role'] ?? SlotRecord::MAIN;
	$model = $contentInfo['model'] ?? $this->getDefaultContentModel( $title, $role );
	$handler = $this->getContentHandler( $model );

	$transformed = $handler->importTransform( $contentInfo['text'] );

	return $handler->unserializeContent( $transformed );
}
1078 
/**
 * Builds a WikiRevision from parsed revision data and passes it to the
 * revision callback.
 *
 * The main slot is built from the revision-level fields; each entry under
 * 'content' adds a further slot and must name its role. A missing timestamp
 * defaults to the current time, and the contributor falls back from user
 * name to IP to a prefixed 'Unknown user'.
 *
 * @param array $pageInfo Page data; '_title' must be set
 * @param array $revisionInfo Fields collected by handleRevision()
 * @return mixed Result of revisionCallback()
 * @throws MWException If a slot has no role (also propagated from
 *  makeContent())
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	$imported = new WikiRevision();

	$revId = $revisionInfo['id'] ?? 0;
	if ( $revId ) {
		$imported->setID( $revisionInfo['id'] );
	}

	$title = $pageInfo['_title'];
	$imported->setTitle( $title );

	$mainContent = $this->makeContent( $title, $revId, $revisionInfo );
	$imported->setContent( SlotRecord::MAIN, $mainContent );

	$extraSlots = $revisionInfo['content'] ?? [];
	foreach ( $extraSlots as $slotInfo ) {
		if ( !isset( $slotInfo['role'] ) ) {
			throw new MWException( "Missing role for imported slot." );
		}

		$slotContent = $this->makeContent( $title, $revId, $slotInfo );
		$imported->setContent( $slotInfo['role'], $slotContent );
	}

	$imported->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$imported->setComment( $revisionInfo['comment'] );
	}

	// The mere presence of the field marks the edit as minor.
	if ( isset( $revisionInfo['minor'] ) ) {
		$imported->setMinor( true );
	}

	$names = $this->externalUserNames;
	if ( isset( $revisionInfo['contributor']['username'] ) ) {
		$imported->setUsername(
			$names->applyPrefix( $revisionInfo['contributor']['username'] )
		);
	} elseif ( isset( $revisionInfo['contributor']['ip'] ) ) {
		$imported->setUserIP( $revisionInfo['contributor']['ip'] );
	} else {
		$imported->setUsername( $names->addPrefix( 'Unknown user' ) );
	}

	if ( isset( $revisionInfo['sha1'] ) ) {
		$imported->setSha1Base36( $revisionInfo['sha1'] );
	}

	$imported->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $imported );
}
1132 
/**
 * Parses an <upload> element and, when upload importing is enabled, passes
 * the result to processUpload().
 *
 * Base64 <contents> are written to a temporary file. A file found under the
 * image base path at the upload's 'rel' path overrides that source.
 *
 * @param array &$pageInfo Page data, passed to the hook and to
 *  processUpload()
 * @return mixed Result of processUpload(), or null when upload importing is
 *  disabled
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );

	$plainFields = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];
	$uploadInfo = [];

	// NOTE(review): $skip is never reset to false inside the loop.
	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		$atUploadEnd = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'upload';
		if ( $atUploadEnd ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$uploadInfo[$tag] = $this->nodeContents();
			continue;
		}

		if ( $tag == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
			continue;
		}

		if ( $tag == 'contents' ) {
			$contents = $this->nodeContents();
			$encoding = $this->reader->getAttribute( 'encoding' );
			if ( $encoding === 'base64' ) {
				$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
				$uploadInfo['isTempSrc'] = true;
			}
			continue;
		}

		if ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$skip = true;
		}
	}

	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		$localPath = $this->mImageBasePath . '/' . $uploadInfo['rel'];
		if ( file_exists( $localPath ) ) {
			$uploadInfo['fileSrc'] = $localPath;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	if ( !$this->mImportUploads ) {
		return;
	}

	return $this->processUpload( $pageInfo, $uploadInfo );
}
1185 
/**
 * Writes data to a new temporary file in the directory given by wfTempDir().
 *
 * @param string $contents Data to write
 * @return string Name of the temporary file
 */
private function dumpTemp( $contents ) {
	$tempFile = tempnam( wfTempDir(), 'importupload' );
	file_put_contents( $tempFile, $contents );

	return $tempFile;
}
1195 
/**
 * Builds a WikiRevision from parsed upload data and passes it to the upload
 * callback.
 *
 * 'timestamp', 'filename', 'src', 'size' and 'comment' are read
 * unconditionally; the remaining fields are only applied when present. No
 * user is set when the contributor has neither a user name nor an IP.
 *
 * @param array $pageInfo Page data; 'id' and '_title' are read
 * @param array $uploadInfo Fields collected by handleUpload()
 * @return mixed Result of the upload callback
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$upload = new WikiRevision();
	$revId = $pageInfo['id'];
	$title = $pageInfo['_title'];

	// T292348: text key may be absent, force addition if null
	$uploadInfo['text'] = $uploadInfo['text'] ?? '';
	$content = $this->makeContent( $title, $revId, $uploadInfo );

	$upload->setTitle( $title );
	$upload->setID( $revId );
	$upload->setTimestamp( $uploadInfo['timestamp'] );
	$upload->setContent( SlotRecord::MAIN, $content );
	$upload->setFilename( $uploadInfo['filename'] );

	if ( isset( $uploadInfo['archivename'] ) ) {
		$upload->setArchiveName( $uploadInfo['archivename'] );
	}

	$upload->setSrc( $uploadInfo['src'] );

	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$isTemporary = !empty( $uploadInfo['isTempSrc'] );
		$upload->setFileSrc( $uploadInfo['fileSrc'], $isTemporary );
	}

	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$upload->setSha1Base36( $uploadInfo['sha1base36'] );
	}

	$upload->setSize( intval( $uploadInfo['size'] ) );
	$upload->setComment( $uploadInfo['comment'] );

	if ( isset( $uploadInfo['contributor']['username'] ) ) {
		$upload->setUsername(
			$this->externalUserNames->applyPrefix( $uploadInfo['contributor']['username'] )
		);
	} elseif ( isset( $uploadInfo['contributor']['ip'] ) ) {
		$upload->setUserIP( $uploadInfo['contributor']['ip'] );
	}

	$upload->setNoUpdates( $this->mNoUpdates );

	return call_user_func( $this->mUploadCallback, $upload );
}
1240 
/**
 * Parses a <contributor> element.
 *
 * @return array Subset of the keys 'id', 'ip' and 'username'; empty for an
 *  empty element
 */
private function handleContributor() {
	$this->debug( "Enter contributor handler." );

	$info = [];
	if ( $this->reader->isEmptyElement ) {
		return $info;
	}

	$knownFields = [ 'id', 'ip', 'username' ];

	while ( $this->reader->read() ) {
		$atContributorEnd = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'contributor';
		if ( $atContributorEnd ) {
			break;
		}

		$tag = $this->reader->localName;
		if ( !in_array( $tag, $knownFields ) ) {
			continue;
		}

		$info[$tag] = $this->nodeContents();
	}

	return $info;
}
1269 
/**
 * Resolves a title from the dump into a local title and checks that it may
 * be imported.
 *
 * A notice is emitted and false returned when the title is invalid, is an
 * interwiki title, cannot exist, or, outside command-line mode, when the
 * current user may not edit it.
 *
 * @param string $text Title text from the dump
 * @param string|null $ns Namespace ID from the dump, if any
 * @return array|false [ local Title, ForeignTitle ], or false on rejection
 */
private function processTitle( $text, $ns = null ) {
	// Use the namespace data from <siteinfo> when it has been seen.
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory( $this->contentLanguage )
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
	$title = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	$commandLineMode = $this->config->get( 'CommandLineMode' );

	if ( $title === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}

	if ( $title->isExternal() ) {
		$this->notice( 'import-error-interwiki', $title->getPrefixedText() );
		return false;
	}

	if ( !$title->canExist() ) {
		$this->notice( 'import-error-special', $title->getPrefixedText() );
		return false;
	}

	if ( !$commandLineMode ) {
		$user = RequestContext::getMain()->getUser();

		if ( !$this->permissionManager->userCan( 'edit', $user, $title ) ) {
			# Do not import if the importing wiki user cannot edit this page
			$this->notice( 'import-error-edit', $title->getPrefixedText() );

			return false;
		}
	}

	return [ $title, $foreignTitle ];
}
1315 
/**
 * Looks up the content handler for a content model.
 *
 * @param string $model Content model ID
 * @return ContentHandler Handler supplied by the content handler factory
 */
private function getContentHandler( $model ) {
	$handler = $this->contentHandlerFactory->getContentHandler( $model );

	return $handler;
}
1323 
/**
 * Determines the default content model for a slot role on a page.
 *
 * @param Title $title
 * @param string $role Slot role name
 * @return string Model reported by the role's handler
 */
private function getDefaultContentModel( $title, $role ) {
	$roleHandler = $this->slotRoleRegistry->getRoleHandler( $role );

	return $roleHandler->getDefaultModel( $title );
}
1335 
/**
 * Opens the XML reader on the registered upload source.
 *
 * The entity loader is enabled only for the duration of the open() call and
 * its previous state is restored afterwards, including when open() fails or
 * throws.
 *
 * @throws MWException If the reader cannot be opened
 */
private function openReader() {
	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( false );
	try {
		$uri = 'uploadsource://' . $this->sourceAdapterId;
		if ( defined( 'LIBXML_PARSEHUGE' ) ) {
			$status = $this->reader->open( $uri, null, LIBXML_PARSEHUGE );
		} else {
			$status = $this->reader->open( $uri );
		}
		if ( !$status ) {
			// libxml_get_last_error() returns false when no error was recorded,
			// so the message must not be read unconditionally.
			$error = libxml_get_last_error();
			throw new MWException(
				'Encountered an internal error while initializing WikiImporter object: ' .
					( $error ? $error->message : '' )
			);
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
	}
}
1360 
/**
 * Reads the whole document once to detect XML errors before importing.
 *
 * Does nothing for sources that are not seekable. Otherwise the reader is
 * closed after the pass and, if no error was found, the source is rewound
 * and the reader reopened for the real import.
 *
 * @throws MWException If libxml reports an error after the pass
 */
private function syntaxCheckXML() {
	$seekable = UploadSourceAdapter::isSeekableSource( $this->sourceAdapterId );
	if ( !$seekable ) {
		return;
	}

	AtEase::suppressWarnings();
	$previousLoaderState = libxml_disable_entity_loader( false );
	try {
		// Walk every node; only the recorded libxml error matters here.
		do {
			$hasMore = $this->reader->read();
		} while ( $hasMore );

		$xmlError = libxml_get_last_error();
		if ( $xmlError ) {
			$errorMessage = 'XML error at line ' . $xmlError->line . ': ' . $xmlError->message;
			wfDebug( __METHOD__ . ': Invalid xml found - ' . $errorMessage );
			throw new MWException( $errorMessage );
		}
	} finally {
		libxml_disable_entity_loader( $previousLoaderState );
		AtEase::restoreWarnings();
		$this->reader->close();
	}

	// Reopen for the real import
	UploadSourceAdapter::seekSource( $this->sourceAdapterId, 0 );
	$this->openReader();
}
1388 }
const NS_MAIN
Definition: Defines.php:64
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTempDir()
Tries to get the system directory for temporary files.
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest. If source is NULL, it just returns the val...
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
static addUpdate(DeferrableUpdate $update, $stage=self::POSTSEND)
Add an update to the pending update queue for execution at the appropriate time.
Class to parse and build external user names.
Reporting callback.
Base class for language-specific code.
Definition: Language.php:56
Exception representing a failure to serialize or unserialize a content object.
MediaWiki exception.
Definition: MWException.php:32
Helper class for mapping value objects representing basic entities to cache keys.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
Definition: HookRunner.php:568
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Service for creating WikiPage objects.
A service class for checking permissions To obtain an instance, use MediaWikiServices::getInstance()-...
Value object representing a content slot associated with a page revision.
Definition: SlotRecord.php:40
A registry service for SlotRoleHandlers, used to define which slot roles are available on which page.
Creates Title objects.
Represents a title within MediaWiki.
Definition: Title.php:82
getPrefixedText()
Get the prefixed title with spaces.
Definition: Title.php:1927
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them ba...
static getMain()
Get the RequestContext object associated with the main request.
static factory(array $deltas)
static newGood( $value=null)
Factory function for good results.
Definition: StatusValue.php:85
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
static seekSource(string $id, int $offset)
static isSeekableSource(string $id)
static registerSource(ImportSource $source)
XML file reader for the page data importer.
setImportUploads( $import)
doImport()
Primary entry point.
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
setNoUpdates( $noupdates)
Set 'no updates' mode.
setLogItemCallback( $callback)
Sets the action to perform as each log item reached.
importUpload( $revision)
Dummy for now...
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
setPageOffset( $nthPage)
Sets 'pageOffset' value.
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
importLogItem( $revision)
Default per-revision callback, performs the import.
setImageBasePath( $dir)
throwXmlError( $err)
setDebug( $debug)
Set debug mode...
__construct(ImportSource $source, Config $config, HookContainer $hookContainer, Language $contentLanguage, NamespaceInfo $namespaceInfo, TitleFactory $titleFactory, WikiPageFactory $wikiPageFactory, UploadRevisionImporter $uploadRevisionImporter, PermissionManager $permissionManager, IContentHandlerFactory $contentHandlerFactory, SlotRoleRegistry $slotRoleRegistry)
Creates an ImportXMLReader drawing from the source provided.
notice( $msg,... $params)
importRevision( $revision)
Default per-revision callback, performs the import.
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
setNoticeCallback( $callback)
Set a callback that displays notice messages.
finishImportPage(PageIdentity $pageIdentity, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
disableStatisticsUpdate()
Statistics updates can take a lot of time.
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
Represents a revision, log entry or upload during the import process.
Interface for configuration instances.
Definition: Config.php:30
Source interface for XML import.
Interface for objects (potentially) representing an editable wiki page.
$source
$content
Definition: router.php:76