MediaWiki  master
WikiImporter.php
Go to the documentation of this file.
1 <?php
51 use Wikimedia\AtEase\AtEase;
52 use Wikimedia\NormalizedException\NormalizedException;
53 
60 class WikiImporter {
/** @var XMLReader|null XML reader over the import source; assigned by openReader() */
62  private $reader;
63 
/** @var mixed Identifier returned by UploadSourceAdapter::registerSource(); appended to the uploadsource:// URL */
65  private $sourceAdapterId;
66 
/** @var array|null Namespace key => name pairs read from <siteinfo>; null until such data is seen */
68  private $foreignNamespaces = null;
69 
/** @var callable|null Invoked for each log item; see setLogItemCallback() */
71  private $mLogItemCallback;
72 
/** @var callable|null Invoked for each upload; see setUploadCallback() */
74  private $mUploadCallback;
75 
/** @var callable|null Invoked for each revision; see setRevisionCallback() */
77  private $mRevisionCallback;
78 
/** @var callable|null Invoked when a page's title has been resolved; see setPageCallback() */
80  private $mPageCallback;
81 
/** @var callable|null Invoked with the parsed site info; see setSiteInfoCallback() */
83  private $mSiteInfoCallback;
84 
/** @var callable|null Invoked after a page has been processed; see setPageOutCallback() */
86  private $mPageOutCallback;
87 
/** @var callable|null Receives notices; when not callable, notice() writes to the debug log instead */
89  private $mNoticeCallback;
90 
/** @var mixed Truthy to make debug() emit its messages; see setDebug() */
92  private $mDebug;
93 
/** @var mixed Truthy to make handleUpload() pass uploads on to processUpload() */
95  private $mImportUploads;
96 
/** @var mixed Base directory that an upload's "rel" path is resolved against in handleUpload() */
98  private $mImageBasePath;
99 
/** @var mixed Forwarded to WikiRevision::setNoUpdates() for every imported item */
101  private $mNoUpdates = false;
102 
/** @var int Number of the first <page> to process; doImport() skips top-level nodes before it (0 = no skipping) */
104  private $pageOffset = 0;
105 
/** @var Config Configuration injected through the constructor */
107  private $config;
108 
/** @var mixed Factory whose createTitleFromForeignTitle() maps foreign titles to local ones */
110  private $importTitleFactory;
111 
/** @var HookRunner Built from the HookContainer passed to the constructor */
113  private $hookRunner;
114 
/** @var array Countable state of pages recorded by beforeImportPage(), read by finishImportPage() */
116  private $countableCache = [];
117 
/** @var bool When true, finishImportPage() skips the article count adjustment */
119  private $disableStatisticsUpdate = false;
120 
/** @var ExternalUserNames Applies the username prefix to imported contributor names */
122  private $externalUserNames;
123 
/** @var Language */
125  private $contentLanguage;
126 
/** @var NamespaceInfo */
128  private $namespaceInfo;
129 
/** @var TitleFactory */
131  private $titleFactory;
132 
/** @var WikiPageFactory */
134  private $wikiPageFactory;
135 
/** @var UploadRevisionImporter */
137  private $uploadRevisionImporter;
138 
/** @var PermissionManager */
140  private $permissionManager;
141 
/** @var IContentHandlerFactory */
143  private $contentHandlerFactory;
144 
/** @var SlotRoleRegistry */
146  private $slotRoleRegistry;
147 
/**
 * Creates an importer that reads an XML dump from the given source.
 *
 * Stores the injected services, registers the `uploadsource` stream wrapper
 * if it is not registered yet, opens the XML reader on the source and
 * installs the default import callbacks.
 *
 * @param ImportSource $source Dump source handed to UploadSourceAdapter::registerSource()
 * @param Config $config
 * @param HookContainer $hookContainer
 * @param Language $contentLanguage
 * @param NamespaceInfo $namespaceInfo
 * @param TitleFactory $titleFactory
 * @param WikiPageFactory $wikiPageFactory
 * @param UploadRevisionImporter $uploadRevisionImporter
 * @param PermissionManager $permissionManager
 * @param IContentHandlerFactory $contentHandlerFactory
 * @param SlotRoleRegistry $slotRoleRegistry
 * @throws MWException If the XML reader cannot be opened (see openReader())
 */
public function __construct(
    ImportSource $source,
    Config $config,
    HookContainer $hookContainer,
    Language $contentLanguage,
    NamespaceInfo $namespaceInfo,
    TitleFactory $titleFactory,
    WikiPageFactory $wikiPageFactory,
    UploadRevisionImporter $uploadRevisionImporter,
    PermissionManager $permissionManager,
    IContentHandlerFactory $contentHandlerFactory,
    SlotRoleRegistry $slotRoleRegistry
) {
    $this->config = $config;
    $this->hookRunner = new HookRunner( $hookContainer );
    $this->contentLanguage = $contentLanguage;
    $this->namespaceInfo = $namespaceInfo;
    $this->titleFactory = $titleFactory;
    $this->wikiPageFactory = $wikiPageFactory;
    $this->uploadRevisionImporter = $uploadRevisionImporter;
    $this->permissionManager = $permissionManager;
    $this->contentHandlerFactory = $contentHandlerFactory;
    $this->slotRoleRegistry = $slotRoleRegistry;

    // The reader is opened through the uploadsource:// wrapper, so make
    // sure the wrapper exists before registering the source with it.
    if ( !in_array( 'uploadsource', stream_get_wrappers(), true ) ) {
        stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
    }
    $this->sourceAdapterId = UploadSourceAdapter::registerSource( $source );

    $this->openReader();

    // Default callbacks
    $this->setPageCallback( [ $this, 'beforeImportPage' ] );
    $this->setRevisionCallback( [ $this, "importRevision" ] );
    $this->setUploadCallback( [ $this, 'importUpload' ] );
    $this->setLogItemCallback( [ $this, 'importLogItem' ] );
    $this->setPageOutCallback( [ $this, 'finishImportPage' ] );

    $this->importTitleFactory = new NaiveImportTitleFactory(
        $this->contentLanguage,
        $this->namespaceInfo,
        $this->titleFactory
    );
    $this->externalUserNames = new ExternalUserNames( 'imported', false );
}
208 
/**
 * Gives access to the XML reader the importer parses the dump with.
 *
 * @return XMLReader|null The reader last stored by openReader()
 */
public function getReader() {
    $xmlReader = $this->reader;
    return $xmlReader;
}
215 
/**
 * Reports an XML error to the debug channels.
 *
 * Despite its name this method does not throw; it only logs the message
 * through debug() and wfDebug().
 *
 * @param string $err Error description
 */
public function throwXmlError( $err ) {
    $this->debug( 'FAILURE: ' . $err );
    wfDebug( 'WikiImporter XML error: ' . $err );
}
223 
/**
 * Writes a message to the debug log, but only when debugging was enabled
 * through setDebug().
 *
 * @param string $data Message text
 */
public function debug( $data ) {
    if ( !$this->mDebug ) {
        return;
    }

    wfDebug( 'IMPORT: ' . $data );
}
232 
/**
 * Writes a warning to the debug log regardless of the debug setting.
 *
 * @param string $data Message text
 */
public function warn( $data ) {
    wfDebug( 'IMPORT: ' . $data );
}
239 
/**
 * Emits an import notice.
 *
 * When a callable notice callback is set it receives the message key and
 * the parameters as a single array. Otherwise the rendered message text is
 * written to the debug log.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters
 */
public function notice( $msg, ...$params ) {
    if ( !is_callable( $this->mNoticeCallback ) ) {
        // No ImportReporter -> CLI
        // T177997: the command line importers should call setNoticeCallback()
        // for their own custom callback to echo the notice
        wfDebug( wfMessage( $msg, $params )->text() );
        return;
    }

    call_user_func( $this->mNoticeCallback, $msg, $params );
}
253 
/**
 * Turns the output of debug() on or off.
 *
 * @param bool $debug Truthy to let debug() write its messages
 */
public function setDebug( $debug ) {
    $this->mDebug = $debug;
}
261 
/**
 * Sets the value that is forwarded to WikiRevision::setNoUpdates() for
 * every revision, upload and log item created by this importer.
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
    $this->mNoUpdates = $noupdates;
}
269 
/**
 * Sets the number of the first page to import.
 *
 * doImport() skips top-level nodes until it has counted this many <page>
 * tags; a falsy value disables skipping.
 *
 * @param int $nthPage
 */
public function setPageOffset( $nthPage ) {
    $this->pageOffset = $nthPage;
}
279 
/**
 * Sets the callback that notice() hands its messages to.
 *
 * The assignment is delegated to wfSetVar().
 *
 * @param callable $callback
 * @return mixed Whatever wfSetVar() returns
 */
public function setNoticeCallback( $callback ) {
    $previous = wfSetVar( $this->mNoticeCallback, $callback );
    return $previous;
}
289 
/**
 * Replaces the callback invoked by pageCallback().
 *
 * @param callable|null $callback
 * @return callable|null The callback that was set before
 */
public function setPageCallback( $callback ) {
    [ $old, $this->mPageCallback ] = [ $this->mPageCallback, $callback ];
    return $old;
}
300 
/**
 * Replaces the callback invoked by pageOutCallback() once a page has been
 * processed.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was set before
 */
public function setPageOutCallback( $callback ) {
    [ $old, $this->mPageOutCallback ] = [ $this->mPageOutCallback, $callback ];
    return $old;
}
315 
/**
 * Replaces the callback invoked by revisionCallback() for each revision.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was set before
 */
public function setRevisionCallback( $callback ) {
    [ $old, $this->mRevisionCallback ] = [ $this->mRevisionCallback, $callback ];
    return $old;
}
326 
/**
 * Replaces the callback that processUpload() invokes for each upload.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was set before
 */
public function setUploadCallback( $callback ) {
    [ $old, $this->mUploadCallback ] = [ $this->mUploadCallback, $callback ];
    return $old;
}
337 
/**
 * Replaces the callback invoked by logItemCallback() for each log item.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was set before
 */
public function setLogItemCallback( $callback ) {
    [ $old, $this->mLogItemCallback ] = [ $this->mLogItemCallback, $callback ];
    return $old;
}
348 
/**
 * Replaces the callback invoked by siteInfoCallback() with the parsed
 * <siteinfo> data.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was set before
 */
public function setSiteInfoCallback( $callback ) {
    [ $old, $this->mSiteInfoCallback ] = [ $this->mSiteInfoCallback, $callback ];
    return $old;
}
359 
/**
 * Sets the factory that processTitle() uses to turn foreign titles into
 * local titles (via createTitleFromForeignTitle()).
 *
 * @param mixed $factory Import title factory
 */
public function setImportTitleFactory( $factory ) {
    $this->importTitleFactory = $factory;
}
368 
/**
 * Selects the namespace that imported pages are placed in.
 *
 * Passing null installs a NaiveImportTitleFactory, i.e. namespaces are not
 * overridden. A non-negative value naming an existing namespace installs a
 * NamespaceImportTitleFactory for it. Any other value leaves the current
 * factory untouched.
 *
 * @param int|string|null $namespace Namespace index, or null for "do not override"
 * @return bool True if a factory was installed, false if $namespace was rejected
 */
public function setTargetNamespace( $namespace ) {
    if ( $namespace === null ) {
        // Don't override namespaces
        $this->setImportTitleFactory(
            new NaiveImportTitleFactory(
                $this->contentLanguage,
                $this->namespaceInfo,
                $this->titleFactory
            )
        );
        return true;
    }

    if (
        $namespace >= 0 &&
        $this->namespaceInfo->exists( intval( $namespace ) )
    ) {
        $namespace = intval( $namespace );
        $this->setImportTitleFactory(
            new NamespaceImportTitleFactory(
                $this->namespaceInfo,
                $this->titleFactory,
                $namespace
            )
        );
        return true;
    }

    return false;
}
402 
/**
 * Selects a root page under which all imported pages become subpages.
 *
 * - null installs a NaiveImportTitleFactory (no root page).
 * - An empty string changes nothing.
 * - Any other value is stripped of trailing slashes and parsed as a title.
 *   It is rejected with a fatal status if it is invalid or external, or if
 *   its namespace has no subpages; otherwise a SubpageImportTitleFactory
 *   for it is installed.
 *
 * @param string|null $rootpage Page title of the root page, or null for none
 * @return Status Good status, or one carrying 'import-rootpage-invalid' /
 *   'import-rootpage-nosubpage'
 */
public function setTargetRootPage( $rootpage ) {
    $status = Status::newGood();
    $nsInfo = $this->namespaceInfo;
    if ( $rootpage === null ) {
        // No rootpage
        $this->setImportTitleFactory(
            new NaiveImportTitleFactory(
                $this->contentLanguage,
                $nsInfo,
                $this->titleFactory
            )
        );
    } elseif ( $rootpage !== '' ) {
        $rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
        $title = Title::newFromText( $rootpage );

        if ( !$title || $title->isExternal() ) {
            $status->fatal( 'import-rootpage-invalid' );
        } elseif ( !$nsInfo->hasSubpages( $title->getNamespace() ) ) {
            $displayNSText = $title->getNamespace() === NS_MAIN
                ? wfMessage( 'blanknamespace' )->text()
                : $this->contentLanguage->getNsText( $title->getNamespace() );
            $status->fatal( 'import-rootpage-nosubpage', $displayNSText );
        } else {
            // set namespace to 'all', so the namespace check in processTitle() can pass
            $this->setTargetNamespace( null );
            $this->setImportTitleFactory(
                new SubpageImportTitleFactory(
                    $nsInfo,
                    $this->titleFactory,
                    $title
                )
            );
        }
    }
    return $status;
}
445 
/**
 * Sets the directory that the "rel" path of an upload is looked up in by
 * handleUpload().
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
    $this->mImageBasePath = $dir;
}
452 
/**
 * Controls whether handleUpload() passes parsed uploads on to
 * processUpload().
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
    $this->mImportUploads = $import;
}
459 
/**
 * Replaces the ExternalUserNames helper used for contributor names.
 *
 * @param string $usernamePrefix First argument for ExternalUserNames
 * @param bool $assignKnownUsers Second argument for ExternalUserNames
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
    $names = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
    $this->externalUserNames = $names;
}
468 
/**
 * Makes finishImportPage() skip its article count adjustment.
 */
public function disableStatisticsUpdate() {
    $this->disableStatisticsUpdate = true;
}
476 
/**
 * Default page callback: remembers whether the page is countable before
 * any of its revisions are imported.
 *
 * The value is stored under the same key that finishImportPage() looks up
 * ('title_' followed by CacheKeyHelper::getKeyForPage()), so that the
 * before/after comparison there can find it.
 *
 * @param array $titleAndForeignTitle Two-element array; element 0 is the local title
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
    $title = $titleAndForeignTitle[0];
    $page = $this->wikiPageFactory->newFromTitle( $title );
    $countKey = 'title_' . CacheKeyHelper::getKeyForPage( $title );
    $this->countableCache[$countKey] = $page->isCountable();
    return true;
}
489 
/**
 * Default revision callback: imports a single revision.
 *
 * A notice is emitted and false returned when the revision's content
 * handler cannot be used on the target title, or when importing raises a
 * MWContentSerializationException.
 *
 * @param WikiRevision $revision
 * @return mixed Result of WikiRevision::importOldRevision(), or false on the errors above
 */
public function importRevision( $revision ) {
    if ( $revision->getContentHandler()->canBeUsedOn( $revision->getTitle() ) ) {
        try {
            return $revision->importOldRevision();
        } catch ( MWContentSerializationException $ex ) {
            $errorKey = 'import-error-unserialize';
        }
    } else {
        $errorKey = 'import-error-bad-location';
    }

    // Both failure modes report the same details, only the message differs.
    $this->notice(
        $errorKey,
        $revision->getTitle()->getPrefixedText(),
        $revision->getID(),
        $revision->getModel(),
        $revision->getFormat()
    );

    return false;
}
520 
/**
 * Default log item callback: delegates to WikiRevision::importLogItem().
 *
 * @param WikiRevision $revision
 * @return mixed Result of the delegated call
 */
public function importLogItem( $revision ) {
    $result = $revision->importLogItem();
    return $result;
}
529 
/**
 * Default upload callback: imports the upload through the injected
 * UploadRevisionImporter.
 *
 * @param WikiRevision $revision
 * @return bool Whether the status returned by the importer is good
 */
public function importUpload( $revision ) {
    return $this->uploadRevisionImporter->import( $revision )->isGood();
}
539 
/**
 * Default page-out callback: adjusts the article count statistics and runs
 * the AfterImportPage hook.
 *
 * @param PageIdentity $pageIdentity The page that was imported
 * @param mixed $foreignTitle Title of the page on the source wiki
 * @param int $revCount Number of revisions seen for the page
 * @param int $sRevCount Number of revisions imported successfully
 * @param array $pageInfo Collected page data
 * @return mixed Result of HookRunner::onAfterImportPage()
 */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
    $sRevCount, $pageInfo
) {
    // Update article count statistics (T42009)
    // The normal counting logic in WikiPage->doEditUpdates() is designed for
    // one-revision-at-a-time editing, not bulk imports. In this situation it
    // suffers from issues of replica DB lag. We let WikiPage handle the total page
    // and revision count, and we implement our own custom logic for the
    // article (content page) count.
    if ( !$this->disableStatisticsUpdate ) {
        $page = $this->wikiPageFactory->newFromTitle( $pageIdentity );

        $page->loadPageData( WikiPage::READ_LATEST );
        $rev = $page->getRevisionRecord();
        if ( $rev === null ) {

            wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
                ' because WikiPage::getRevisionRecord() returned null' );
        } else {
            $user = RequestContext::getMain()->getUser();
            $update = $page->newPageUpdater( $user )->prepareUpdate();
            $countKey = 'title_' . CacheKeyHelper::getKeyForPage( $pageIdentity );
            $countable = $update->isCountable();
            // Only queue a stats update when the countable state differs
            // from the one recorded before the import started.
            if ( array_key_exists( $countKey, $this->countableCache ) &&
                $countable != $this->countableCache[$countKey] ) {
                DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
                    'articles' => ( (int)$countable - (int)$this->countableCache[$countKey] )
                ] ) );
            }
        }
    }

    $title = Title::newFromPageIdentity( $pageIdentity );
    return $this->hookRunner->onAfterImportPage( $title, $foreignTitle,
        $revCount, $sRevCount, $pageInfo );
}
585 
/**
 * Passes the parsed site info to the site info callback, if one is set.
 *
 * @param array $siteInfo
 * @return mixed The callback's return value, or false when no callback is set
 */
private function siteInfoCallback( $siteInfo ) {
    if ( !isset( $this->mSiteInfoCallback ) ) {
        return false;
    }

    $arguments = [ $siteInfo, $this ];
    return call_user_func_array( $this->mSiteInfoCallback, $arguments );
}
601 
/**
 * Passes the given title data to the page callback, if one is set.
 *
 * @param array $title Value handed through unchanged; handlePage() passes
 *   the [ local title, foreign title ] pair returned by processTitle()
 */
public function pageCallback( $title ) {
    if ( !isset( $this->mPageCallback ) ) {
        return;
    }

    call_user_func( $this->mPageCallback, $title );
}
611 
/**
 * Passes the result of a processed page to the page-out callback, if one
 * is set. All five arguments are forwarded in order.
 *
 * @param PageIdentity $pageIdentity
 * @param mixed $foreignTitle
 * @param int $revCount
 * @param int $sucCount
 * @param array $pageInfo
 */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
    $sucCount, $pageInfo ) {
    if ( !isset( $this->mPageOutCallback ) ) {
        return;
    }

    call_user_func_array(
        $this->mPageOutCallback,
        [ $pageIdentity, $foreignTitle, $revCount, $sucCount, $pageInfo ]
    );
}
626 
/**
 * Passes a revision to the revision callback, if one is set.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's return value, or false when no callback is set
 */
private function revisionCallback( $revision ) {
    if ( !isset( $this->mRevisionCallback ) ) {
        return false;
    }

    $arguments = [ $revision, $this ];
    return call_user_func_array( $this->mRevisionCallback, $arguments );
}
642 
/**
 * Passes a log item to the log item callback, if one is set.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's return value, or false when no callback is set
 */
private function logItemCallback( $revision ) {
    if ( !isset( $this->mLogItemCallback ) ) {
        return false;
    }

    $arguments = [ $revision, $this ];
    return call_user_func_array( $this->mLogItemCallback, $arguments );
}
658 
/**
 * Reads an attribute of the node the reader is currently positioned on.
 *
 * @param string $attr Attribute name
 * @return string The attribute value, or '' when getAttribute() yields null
 */
public function nodeAttribute( $attr ) {
    $value = $this->reader->getAttribute( $attr );
    return $value ?? '';
}
668 
/**
 * Collects the text of the element the reader is currently positioned on.
 *
 * Text, CDATA and significant-whitespace nodes are concatenated until the
 * first end-element node is reached. An empty element yields '' without
 * advancing the reader. If the input ends before an end element is seen,
 * the reader is closed and '' is returned.
 *
 * @return string
 */
public function nodeContents() {
    if ( $this->reader->isEmptyElement ) {
        return "";
    }

    $collected = "";
    while ( $this->reader->read() ) {
        $nodeType = $this->reader->nodeType;

        if ( $nodeType == XMLReader::END_ELEMENT ) {
            return $collected;
        }

        $isTextual = $nodeType == XMLReader::TEXT
            || $nodeType == XMLReader::CDATA
            || $nodeType == XMLReader::SIGNIFICANT_WHITESPACE;
        if ( $isTextual ) {
            $collected .= $this->reader->value;
        }
    }

    // Ran out of input before the element was closed.
    $this->reader->close();
    return '';
}
696 
/**
 * Runs the import: validates the XML, checks the root tag and dispatches
 * every top-level element to its handler.
 *
 * The reader is closed and the previous entity loader state restored when
 * the method finishes, whether it returns or throws.
 *
 * @return bool Always true when no exception is thrown
 * @throws NormalizedException If the root tag is wrong and libxml reported an error
 * @throws MWException If the root tag is wrong without a libxml error, or
 *   if syntaxCheckXML() rejects the input
 */
public function doImport() {
    $this->syntaxCheckXML();

    // Calls to reader->read need to be wrapped in calls to
    // libxml_disable_entity_loader() to avoid local file
    // inclusion attacks (T48932).
    // phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
    $previousLoaderState = @libxml_disable_entity_loader( true );
    try {
        $this->reader->read();

        if ( $this->reader->localName != 'mediawiki' ) {
            // phpcs:ignore Generic.PHP.NoSilencedErrors
            @libxml_disable_entity_loader( $previousLoaderState );
            $xmlError = libxml_get_last_error();
            if ( !$xmlError ) {
                throw new MWException(
                    "Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
                );
            }
            throw new NormalizedException( "XML error at line {line}: {message}", [
                'line' => $xmlError->line,
                'message' => $xmlError->message,
            ] );
        }
        $this->debug( "<mediawiki> tag is correct." );

        $this->debug( "Starting primary dump processing loop." );

        $hasNode = $this->reader->read();
        $pagesSeen = 0;
        while ( $hasNode ) {
            $tag = $this->reader->localName;

            // Fast-forward over everything that precedes the requested page.
            if ( $this->pageOffset ) {
                if ( $tag === 'page' ) {
                    $pagesSeen++;
                }
                if ( $pagesSeen < $this->pageOffset ) {
                    $hasNode = $this->reader->next();
                    continue;
                }
            }

            // Captured before the hook runs, as the hook receives the importer.
            $type = $this->reader->nodeType;
            $skipSubtree = false;

            if ( !$this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
                // Do nothing
            } elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
                break;
            } elseif ( $tag == 'siteinfo' ) {
                $this->handleSiteInfo();
            } elseif ( $tag == 'page' ) {
                $this->handlePage();
            } elseif ( $tag == 'logitem' ) {
                $this->handleLogItem();
            } elseif ( $tag != '#text' ) {
                $this->warn( "Unhandled top-level XML tag $tag" );

                $skipSubtree = true;
            }

            if ( !$skipSubtree ) {
                $hasNode = $this->reader->read();
                continue;
            }

            // Unknown element: jump over it including its children.
            $hasNode = $this->reader->next();
            $this->debug( "Skip" );
        }
    } finally {
        // phpcs:ignore Generic.PHP.NoSilencedErrors
        @libxml_disable_entity_loader( $previousLoaderState );
        $this->reader->close();
    }

    return true;
}
780 
/**
 * Parses a <siteinfo> element.
 *
 * <namespace> children are recorded in $this->foreignNamespaces keyed by
 * their "key" attribute; the simple fields are copied into the site info
 * array, which is then passed to siteInfoCallback().
 */
private function handleSiteInfo() {
    $this->debug( "Enter site info handler." );

    // Fields that can just be stuffed in the siteInfo object
    $plainFields = [ 'sitename', 'base', 'generator', 'case' ];
    $info = [];

    while ( $this->reader->read() ) {
        $atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
            && $this->reader->localName == 'siteinfo';
        if ( $atClosingTag ) {
            break;
        }

        $tag = $this->reader->localName;

        if ( $tag == 'namespace' ) {
            // The attribute has to be read before the contents, because
            // reading the contents advances the reader.
            $nsKey = $this->nodeAttribute( 'key' );
            $this->foreignNamespaces[$nsKey] = $this->nodeContents();
            continue;
        }

        if ( in_array( $tag, $plainFields ) ) {
            $info[$tag] = $this->nodeContents();
        }
    }

    $info['_namespaces'] = $this->foreignNamespaces;
    $this->siteInfoCallback( $info );
}
807 
/**
 * Parses a <logitem> element and hands the collected data to
 * processLogItem().
 *
 * Every child node is first offered to the ImportHandleLogItemXMLTag hook;
 * only if the hook returns a truthy value is the node handled here.
 */
private function handleLogItem() {
    $this->debug( "Enter log item handler." );

    // Fields that can just be stuffed in the pageInfo object
    $plainFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
        'logtitle', 'params' ];
    $logInfo = [];

    while ( $this->reader->read() ) {
        $atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
            && $this->reader->localName == 'logitem';
        if ( $atClosingTag ) {
            break;
        }

        $tag = $this->reader->localName;

        if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
            // Do nothing
            continue;
        }

        if ( in_array( $tag, $plainFields ) ) {
            $logInfo[$tag] = $this->nodeContents();
        } elseif ( $tag == 'contributor' ) {
            $logInfo['contributor'] = $this->handleContributor();
        } elseif ( $tag != '#text' ) {
            $this->warn( "Unhandled log-item XML tag $tag" );
        }
    }

    $this->processLogItem( $logInfo );
}
837 
/**
 * Builds a WikiRevision from parsed log item data and passes it to the
 * log item callback.
 *
 * 'type' and 'action' are read unconditionally; all other fields are only
 * applied when present. Without a contributor name or IP, the prefixed
 * name 'Unknown user' is used.
 *
 * @param array $logInfo Data collected by handleLogItem()
 * @return mixed Result of logItemCallback()
 */
private function processLogItem( $logInfo ) {
    $entry = new WikiRevision();

    if ( isset( $logInfo['id'] ) ) {
        $entry->setID( $logInfo['id'] );
    }
    $entry->setType( $logInfo['type'] );
    $entry->setAction( $logInfo['action'] );
    if ( isset( $logInfo['timestamp'] ) ) {
        $entry->setTimestamp( $logInfo['timestamp'] );
    }
    if ( isset( $logInfo['params'] ) ) {
        $entry->setParams( $logInfo['params'] );
    }
    if ( isset( $logInfo['logtitle'] ) ) {
        // @todo Using Title for non-local titles is a recipe for disaster.
        // We should use ForeignTitle here instead.
        $entry->setTitle( Title::newFromText( $logInfo['logtitle'] ) );
    }

    $entry->setNoUpdates( $this->mNoUpdates );

    if ( isset( $logInfo['comment'] ) ) {
        $entry->setComment( $logInfo['comment'] );
    }

    // Attribute the entry: named user first, then IP, then a placeholder.
    $names = $this->externalUserNames;
    if ( isset( $logInfo['contributor']['username'] ) ) {
        $entry->setUsername( $names->applyPrefix( $logInfo['contributor']['username'] ) );
    } elseif ( isset( $logInfo['contributor']['ip'] ) ) {
        $entry->setUserIP( $logInfo['contributor']['ip'] );
    } else {
        $entry->setUsername( $names->addPrefix( 'Unknown user' ) );
    }

    return $this->logItemCallback( $entry );
}
880 
/**
 * Parses a <page> element.
 *
 * Simple fields are collected into the page info array. The title is
 * resolved through processTitle() when the first <revision> or <upload>
 * child is met; if that fails, the rest of the page is skipped. Once the
 * page has been read, pageOutCallback() is invoked provided a title was
 * resolved.
 */
private function handlePage() {
    // Handle page data.
    $this->debug( "Enter page handler." );
    $pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

    // Fields that can just be stuffed in the pageInfo object
    $plainFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

    $skip = false;
    $badTitle = false;
    // null = not resolved yet; afterwards the array or false from processTitle()
    $title = null;

    while ( $skip ? $this->reader->next() : $this->reader->read() ) {
        $atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
            && $this->reader->localName == 'page';
        if ( $atClosingTag ) {
            break;
        }

        $skip = false;

        $tag = $this->reader->localName;

        if ( $badTitle ) {
            // The title is invalid, bail out of this page
            $skip = true;
        } elseif ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
            // Do nothing
        } elseif ( in_array( $tag, $plainFields ) ) {
            // An XML snippet:
            // <page>
            //     <id>123</id>
            //     <title>Page</title>
            //     <redirect title="NewTitle"/>
            //     ...
            // Because the redirect tag is built differently, we need special handling for that case.
            $pageInfo[$tag] = $tag == 'redirect'
                ? $this->nodeAttribute( 'title' )
                : $this->nodeContents();
        } elseif ( $tag == 'revision' || $tag == 'upload' ) {
            if ( $title === null ) {
                $title = $this->processTitle( $pageInfo['title'],
                    $pageInfo['ns'] ?? null );

                // $title is either an array of two titles or false.
                if ( is_array( $title ) ) {
                    $this->pageCallback( $title );
                    [ $pageInfo['_title'], $foreignTitle ] = $title;
                } else {
                    $badTitle = true;
                    $skip = true;
                }
            }

            if ( $title ) {
                if ( $tag == 'revision' ) {
                    $this->handleRevision( $pageInfo );
                } else {
                    $this->handleUpload( $pageInfo );
                }
            }
        } elseif ( $tag != '#text' ) {
            $this->warn( "Unhandled page XML tag $tag" );
            $skip = true;
        }
    }

    // @note $pageInfo is only set if a valid $title is processed above with
    //       no error. If we have a valid $title, then pageCallback is called
    //       above, $pageInfo['title'] is set and we do pageOutCallback here.
    //       If $pageInfo['_title'] is not set, then $foreignTitle is also not
    //       set since they both come from $title above.
    if ( !array_key_exists( '_title', $pageInfo ) ) {
        return;
    }

    $title = $pageInfo['_title'];
    $this->pageOutCallback(
        $title,
        // @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
        $foreignTitle,
        $pageInfo['revisionCount'],
        $pageInfo['successfulRevisionCount'],
        $pageInfo
    );
}
966 
/**
 * Parses a <revision> element, then processes the revision and updates the
 * revision counters in $pageInfo.
 *
 * Every child node is first offered to the ImportHandleRevisionXMLTag
 * hook. Note that once an unhandled tag has switched the loop to next(),
 * it stays in that mode for the remainder of the element.
 *
 * @param array &$pageInfo Page data; 'revisionCount' and
 *   'successfulRevisionCount' are incremented here
 */
private function handleRevision( &$pageInfo ) {
    $this->debug( "Enter revision handler" );

    $plainFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
        'model', 'format', 'text', 'sha1' ];
    $revisionInfo = [];
    $useNext = false;

    while ( $useNext ? $this->reader->next() : $this->reader->read() ) {
        $atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
            && $this->reader->localName == 'revision';
        if ( $atClosingTag ) {
            break;
        }

        $tag = $this->reader->localName;

        $continueHere = $this->hookRunner->onImportHandleRevisionXMLTag(
            $this, $pageInfo, $revisionInfo );
        if ( !$continueHere ) {
            // Do nothing
            continue;
        }

        if ( in_array( $tag, $plainFields ) ) {
            $revisionInfo[$tag] = $this->nodeContents();
        } elseif ( $tag == 'content' ) {
            // We can have multiple content tags, so make this an array.
            $revisionInfo[$tag][] = $this->handleContent();
        } elseif ( $tag == 'contributor' ) {
            $revisionInfo['contributor'] = $this->handleContributor();
        } elseif ( $tag != '#text' ) {
            $this->warn( "Unhandled revision XML tag $tag" );
            $useNext = true;
        }
    }

    $pageInfo['revisionCount']++;
    $imported = $this->processRevision( $pageInfo, $revisionInfo );
    if ( $imported ) {
        $pageInfo['successfulRevisionCount']++;
    }
}
1009 
/**
 * Parses a <content> element into an array of its simple fields.
 *
 * Every child node is first offered to the ImportHandleContentXMLTag hook.
 *
 * @return array Collected fields, keyed by tag name
 */
private function handleContent() {
    $this->debug( "Enter content handler" );

    $plainFields = [ 'role', 'origin', 'model', 'format', 'text' ];
    $contentInfo = [];
    $useNext = false;

    while ( $useNext ? $this->reader->next() : $this->reader->read() ) {
        $atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
            && $this->reader->localName == 'content';
        if ( $atClosingTag ) {
            break;
        }

        $tag = $this->reader->localName;

        $continueHere = $this->hookRunner->onImportHandleContentXMLTag(
            $this, $contentInfo );
        if ( !$continueHere ) {
            // Do nothing
            continue;
        }

        if ( in_array( $tag, $plainFields ) ) {
            $contentInfo[$tag] = $this->nodeContents();
        } elseif ( $tag != '#text' ) {
            $this->warn( "Unhandled content XML tag $tag" );
            $useNext = true;
        }
    }

    return $contentInfo;
}
1040 
/**
 * Turns parsed slot data into a content object.
 *
 * @param Title $title Page the content belongs to; used to pick the default model
 * @param int $revisionId Revision ID for the error message; falsy if unknown
 * @param array $contentInfo Slot data; must contain 'text', may contain
 *   'role' and 'model'
 * @return mixed Result of the content handler's unserializeContent()
 * @throws MWException If 'text' is missing or the text is larger than allowed
 */
private function makeContent( Title $title, $revisionId, $contentInfo ) {
    $maxArticleSize = MediaWikiServices::getInstance()->getMainConfig()->get(
        MainConfigNames::MaxArticleSize );

    if ( !isset( $contentInfo['text'] ) ) {
        throw new MWException( 'Missing text field in import.' );
    }

    // Make sure revisions won't violate $wgMaxArticleSize, which could lead to
    // database errors and instability. Testing for revisions with only listed
    // content models, as other content models might use serialization formats
    // which aren't checked against $wgMaxArticleSize.
    $sizeLimitedModels = [
        'wikitext',
        'css',
        'json',
        'javascript',
        'text',
        ''
    ];
    $sizeLimitApplies = !isset( $contentInfo['model'] )
        || in_array( $contentInfo['model'], $sizeLimitedModels );

    if ( $sizeLimitApplies && strlen( $contentInfo['text'] ) > $maxArticleSize * 1024 ) {
        $subject = $revisionId ? "the revision with ID $revisionId" : 'a revision';
        throw new MWException(
            'The text of ' . $subject .
            " exceeds the maximum allowable size ({$maxArticleSize} KiB)"
        );
    }

    $role = $contentInfo['role'] ?? SlotRecord::MAIN;
    $model = $contentInfo['model'] ?? $this->getDefaultContentModel( $title, $role );
    $handler = $this->getContentHandler( $model );

    return $handler->unserializeContent(
        $handler->importTransform( $contentInfo['text'] )
    );
}
1087 
/**
 * Builds a WikiRevision from parsed revision data and passes it to the
 * revision callback.
 *
 * The main slot is built from the revision data itself; each entry under
 * 'content' adds a further slot and must name its role.
 *
 * @param array $pageInfo Page data; '_title' is used as the revision's title
 * @param array $revisionInfo Data collected by handleRevision()
 * @return mixed Result of revisionCallback()
 * @throws MWException If a slot has no role, or if makeContent() fails
 */
private function processRevision( $pageInfo, $revisionInfo ) {
    $rev = new WikiRevision();

    $revId = $revisionInfo['id'] ?? 0;
    if ( $revId ) {
        $rev->setID( $revId );
    }

    $title = $pageInfo['_title'];
    $rev->setTitle( $title );

    $mainContent = $this->makeContent( $title, $revId, $revisionInfo );
    $rev->setContent( SlotRecord::MAIN, $mainContent );

    $extraSlots = $revisionInfo['content'] ?? [];
    foreach ( $extraSlots as $slotInfo ) {
        if ( !isset( $slotInfo['role'] ) ) {
            throw new MWException( "Missing role for imported slot." );
        }

        $slotContent = $this->makeContent( $title, $revId, $slotInfo );
        $rev->setContent( $slotInfo['role'], $slotContent );
    }
    $rev->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

    if ( isset( $revisionInfo['comment'] ) ) {
        $rev->setComment( $revisionInfo['comment'] );
    }

    if ( isset( $revisionInfo['minor'] ) ) {
        $rev->setMinor( true );
    }

    // Attribute the revision: named user first, then IP, then a placeholder.
    $names = $this->externalUserNames;
    if ( isset( $revisionInfo['contributor']['username'] ) ) {
        $rev->setUsername( $names->applyPrefix( $revisionInfo['contributor']['username'] ) );
    } elseif ( isset( $revisionInfo['contributor']['ip'] ) ) {
        $rev->setUserIP( $revisionInfo['contributor']['ip'] );
    } else {
        $rev->setUsername( $names->addPrefix( 'Unknown user' ) );
    }

    if ( isset( $revisionInfo['sha1'] ) ) {
        $rev->setSha1Base36( $revisionInfo['sha1'] );
    }
    $rev->setNoUpdates( $this->mNoUpdates );

    return $this->revisionCallback( $rev );
}
1141 
/**
 * Parses an <upload> element and, if upload importing is enabled, passes
 * the collected data to processUpload().
 *
 * Base64-encoded <contents> are written to a temporary file. If an image
 * base path is set and the file named by 'rel' exists below it, that file
 * takes precedence as the source.
 *
 * @param array &$pageInfo Page data, passed to the ImportHandleUploadXMLTag hook
 * @return mixed Result of processUpload(), or null when uploads are not imported
 */
private function handleUpload( &$pageInfo ) {
    $this->debug( "Enter upload handler" );
    $uploadInfo = [];

    $normalFields = [ 'timestamp', 'comment', 'filename', 'text',
        'src', 'size', 'sha1base36', 'archivename', 'rel' ];

    $skip = false;

    while ( $skip ? $this->reader->next() : $this->reader->read() ) {
        if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
            $this->reader->localName == 'upload' ) {
            break;
        }

        $tag = $this->reader->localName;

        if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
            // Do nothing
        } elseif ( in_array( $tag, $normalFields ) ) {
            $uploadInfo[$tag] = $this->nodeContents();
        } elseif ( $tag == 'contributor' ) {
            $uploadInfo['contributor'] = $this->handleContributor();
        } elseif ( $tag == 'contents' ) {
            // Read the attribute while the reader is still on the opening
            // tag; nodeContents() moves it on to the closing tag.
            $encoding = $this->reader->getAttribute( 'encoding' );
            $contents = $this->nodeContents();
            if ( $encoding === 'base64' ) {
                $uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
                $uploadInfo['isTempSrc'] = true;
            }
        } elseif ( $tag != '#text' ) {
            $this->warn( "Unhandled upload XML tag $tag" );
            $skip = true;
        }
    }

    if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
        $path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
        if ( file_exists( $path ) ) {
            $uploadInfo['fileSrc'] = $path;
            $uploadInfo['isTempSrc'] = false;
        }
    }

    if ( $this->mImportUploads ) {
        return $this->processUpload( $pageInfo, $uploadInfo );
    }
}
1194 
/**
 * Writes the given data to a new temporary file.
 *
 * @param string $contents Data to store
 * @return string|false File name as returned by tempnam()
 */
private function dumpTemp( $contents ) {
    $tempFile = tempnam( wfTempDir(), 'importupload' );
    file_put_contents( $tempFile, $contents );

    return $tempFile;
}
1204 
/**
 * Builds a WikiRevision from parsed upload data and passes it to the
 * upload callback.
 *
 * Unlike revisions and log items, no placeholder user is set when the
 * upload has neither a contributor name nor an IP.
 *
 * @param array $pageInfo Page data; 'id' and '_title' are used
 * @param array $uploadInfo Data collected by handleUpload()
 * @return mixed Return value of the upload callback
 */
private function processUpload( $pageInfo, $uploadInfo ) {
    $upload = new WikiRevision();
    $revId = $pageInfo['id'];
    $title = $pageInfo['_title'];

    // T292348: text key may be absent, force addition if null
    if ( !isset( $uploadInfo['text'] ) ) {
        $uploadInfo['text'] = '';
    }
    $content = $this->makeContent( $title, $revId, $uploadInfo );

    $upload->setTitle( $title );
    $upload->setID( $revId );
    $upload->setTimestamp( $uploadInfo['timestamp'] );
    $upload->setContent( SlotRecord::MAIN, $content );
    $upload->setFilename( $uploadInfo['filename'] );
    if ( isset( $uploadInfo['archivename'] ) ) {
        $upload->setArchiveName( $uploadInfo['archivename'] );
    }
    $upload->setSrc( $uploadInfo['src'] );
    if ( isset( $uploadInfo['fileSrc'] ) ) {
        $isTemporary = !empty( $uploadInfo['isTempSrc'] );
        $upload->setFileSrc( $uploadInfo['fileSrc'], $isTemporary );
    }
    if ( isset( $uploadInfo['sha1base36'] ) ) {
        $upload->setSha1Base36( $uploadInfo['sha1base36'] );
    }
    $upload->setSize( intval( $uploadInfo['size'] ) );
    $upload->setComment( $uploadInfo['comment'] );

    if ( isset( $uploadInfo['contributor']['username'] ) ) {
        $prefixedName = $this->externalUserNames->applyPrefix(
            $uploadInfo['contributor']['username']
        );
        $upload->setUsername( $prefixedName );
    } elseif ( isset( $uploadInfo['contributor']['ip'] ) ) {
        $upload->setUserIP( $uploadInfo['contributor']['ip'] );
    }
    $upload->setNoUpdates( $this->mNoUpdates );

    return call_user_func( $this->mUploadCallback, $upload );
}
1249 
/**
 * Parses a <contributor> element.
 *
 * @return array Any of the keys 'id', 'ip' and 'username' that were
 *   present; empty for an empty element
 */
private function handleContributor() {
    $this->debug( "Enter contributor handler." );

    if ( $this->reader->isEmptyElement ) {
        return [];
    }

    $knownFields = [ 'id', 'ip', 'username' ];
    $contributor = [];

    while ( $this->reader->read() ) {
        $atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
            && $this->reader->localName == 'contributor';
        if ( $atClosingTag ) {
            break;
        }

        $tag = $this->reader->localName;
        if ( !in_array( $tag, $knownFields ) ) {
            continue;
        }

        $contributor[$tag] = $this->nodeContents();
    }

    return $contributor;
}
1278 
/**
 * Resolves the title of an imported page.
 *
 * The foreign title is built with a namespace-aware factory when the dump
 * provided namespace data, and with a naive one otherwise. The local title
 * is rejected with a notice if it is invalid, external or cannot exist,
 * or — outside command line mode — if the current user may not edit it.
 *
 * @param string $text Title text from the dump
 * @param string|null $ns Namespace ID from the dump, if present
 * @return array|false [ local title, foreign title ], or false if rejected
 */
private function processTitle( $text, $ns = null ) {
    $foreignTitleFactory = $this->foreignNamespaces === null
        ? new NaiveForeignTitleFactory( $this->contentLanguage )
        : new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

    $foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
    $title = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

    $commandLineMode = $this->config->get( 'CommandLineMode' );

    if ( $title === null ) {
        // Invalid page title? Ignore the page
        $this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
        return false;
    }

    if ( $title->isExternal() ) {
        $this->notice( 'import-error-interwiki', $title->getPrefixedText() );
        return false;
    }

    if ( !$title->canExist() ) {
        $this->notice( 'import-error-special', $title->getPrefixedText() );
        return false;
    }

    if ( !$commandLineMode ) {
        $user = RequestContext::getMain()->getUser();

        if ( !$this->permissionManager->userCan( 'edit', $user, $title ) ) {
            // Do not import if the importing wiki user cannot edit this page
            $this->notice( 'import-error-edit', $title->getPrefixedText() );

            return false;
        }
    }

    return [ $title, $foreignTitle ];
}
1324 
/**
 * Looks up the content handler for a content model through the injected
 * content handler factory.
 *
 * @param string $model Content model ID
 * @return mixed The handler returned by the factory
 */
private function getContentHandler( $model ) {
    $handler = $this->contentHandlerFactory->getContentHandler( $model );
    return $handler;
}
1332 
/**
 * Determines the default content model of a slot role on a page by asking
 * the role's handler from the slot role registry.
 *
 * @param Title $title
 * @param string $role Slot role name
 * @return mixed Whatever the role handler's getDefaultModel() returns
 */
private function getDefaultContentModel( $title, $role ) {
    $roleHandler = $this->slotRoleRegistry->getRoleHandler( $role );
    return $roleHandler->getDefaultModel( $title );
}
1344 
/**
 * Opens an XML reader on the registered import source and stores it in
 * $this->reader.
 *
 * The entity loader is enabled while opening and restored to its previous
 * state afterwards, on success as well as on failure.
 *
 * @throws MWException If the reader cannot be opened
 */
private function openReader() {
    // Enable the entity loader, as it is needed for loading external URLs via
    // XMLReader::open (T86036)
    // phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
    $oldDisable = @libxml_disable_entity_loader( false );

    if ( PHP_VERSION_ID >= 80000 ) {
        // A static call is now preferred, and avoids https://github.com/php/php-src/issues/11548
        $reader = XMLReader::open(
            'uploadsource://' . $this->sourceAdapterId, null, LIBXML_PARSEHUGE );
        if ( $reader instanceof XMLReader ) {
            $this->reader = $reader;
            $status = true;
        } else {
            $status = false;
        }
    } else {
        // A static call generated a deprecation warning prior to PHP 8.0
        $this->reader = new XMLReader;
        $status = $this->reader->open(
            'uploadsource://' . $this->sourceAdapterId, null, LIBXML_PARSEHUGE );
    }
    if ( !$status ) {
        $error = libxml_get_last_error();
        // phpcs:ignore Generic.PHP.NoSilencedErrors
        @libxml_disable_entity_loader( $oldDisable );
        // libxml_get_last_error() yields false when libxml recorded no
        // error, so do not dereference it blindly.
        $detail = $error ? $error->message : 'unknown error';
        throw new MWException(
            'Encountered an internal error while initializing WikiImporter object: ' . $detail
        );
    }
    // phpcs:ignore Generic.PHP.NoSilencedErrors
    @libxml_disable_entity_loader( $oldDisable );
}
1382 
/**
 * Reads through the whole input once to detect XML errors before the real
 * import starts.
 *
 * Nothing is done for sources that are not seekable. Otherwise the reader
 * is consumed to the end, closed, the source rewound and the reader
 * reopened.
 *
 * NOTE(review): the entity loader is switched on for this pass, whereas
 * doImport() switches it off around its reads — confirm this is intended.
 *
 * @throws MWException If libxml reports an error after reading the input
 */
private function syntaxCheckXML() {
    $sourceId = $this->sourceAdapterId;
    if ( !UploadSourceAdapter::isSeekableSource( $sourceId ) ) {
        return;
    }

    AtEase::suppressWarnings();
    $previousLoaderState = libxml_disable_entity_loader( false );
    try {
        while ( $this->reader->read() ) {
            // Nothing to do; the nodes are only read to trigger parsing.
        }

        $xmlError = libxml_get_last_error();
        if ( $xmlError ) {
            $errorMessage = 'XML error at line ' . $xmlError->line . ': ' . $xmlError->message;
            wfDebug( __METHOD__ . ': Invalid xml found - ' . $errorMessage );
            throw new MWException( $errorMessage );
        }
    } finally {
        libxml_disable_entity_loader( $previousLoaderState );
        AtEase::restoreWarnings();
        $this->reader->close();
    }

    // Reopen for the real import
    UploadSourceAdapter::seekSource( $sourceId, 0 );
    $this->openReader();
}
1410 }
const NS_MAIN
Definition: Defines.php:64
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTempDir()
Tries to get the system directory for temporary files.
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest. If source is NULL, it just returns the val...
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
static addUpdate(DeferrableUpdate $update, $stage=self::POSTSEND)
Add an update to the pending update queue for execution at the appropriate time.
Reporting callback.
Base class for language-specific code.
Definition: Language.php:61
Exception representing a failure to serialize or unserialize a content object.
MediaWiki exception.
Definition: MWException.php:33
Helper class for mapping value objects representing basic entities to cache keys.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
Definition: HookRunner.php:568
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Service for creating WikiPage objects.
A service class for checking permissions To obtain an instance, use MediaWikiServices::getInstance()-...
Value object representing a content slot associated with a page revision.
Definition: SlotRecord.php:40
A registry service for SlotRoleHandlers, used to define which slot roles are available on which page.
Generic operation result class Has warning/error list, boolean status and arbitrary value.
Definition: Status.php:58
A simple, immutable structure to hold the title of a page on a foreign MediaWiki installation.
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them ba...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Creates Title objects.
Represents a title within MediaWiki.
Definition: Title.php:76
canExist()
Can this title represent a page in the wiki's database?
Definition: Title.php:1226
isExternal()
Is this Title interwiki?
Definition: Title.php:948
getNamespace()
Get the namespace index, i.e.
Definition: Title.php:1058
getPrefixedText()
Get the prefixed title with spaces.
Definition: Title.php:1885
Class to parse and build external user names.
static getMain()
Get the RequestContext object associated with the main request.
static factory(array $deltas)
static seekSource(string $id, int $offset)
static isSeekableSource(string $id)
static registerSource(ImportSource $source)
XML file reader for the page data importer.
setImportUploads( $import)
doImport()
Primary entry point.
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
setNoUpdates( $noupdates)
Set 'no updates' mode.
setLogItemCallback( $callback)
Sets the action to perform as each log item reached.
importUpload( $revision)
Dummy for now...
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
setPageOffset( $nthPage)
Sets 'pageOffset' value.
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
importLogItem( $revision)
Default per-revision callback, performs the import.
setImageBasePath( $dir)
throwXmlError( $err)
setDebug( $debug)
Set debug mode...
__construct(ImportSource $source, Config $config, HookContainer $hookContainer, Language $contentLanguage, NamespaceInfo $namespaceInfo, TitleFactory $titleFactory, WikiPageFactory $wikiPageFactory, UploadRevisionImporter $uploadRevisionImporter, PermissionManager $permissionManager, IContentHandlerFactory $contentHandlerFactory, SlotRoleRegistry $slotRoleRegistry)
Creates an ImportXMLReader drawing from the source provided.
notice( $msg,... $params)
importRevision( $revision)
Default per-revision callback, performs the import.
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
setNoticeCallback( $callback)
Set a callback that displays notice messages.
finishImportPage(PageIdentity $pageIdentity, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
disableStatisticsUpdate()
Statistics updates can take a lot of time.
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
Represents a revision, log entry or upload during the import process.
Source interface for XML import.
Interface for configuration instances.
Definition: Config.php:32
Interface for objects (potentially) representing an editable wiki page.
Represents an object that can convert page titles on a foreign wiki (ForeignTitle objects) into page ...
$source
$content
Definition: router.php:76