MediaWiki  master
WikiImporter.php
Go to the documentation of this file.
1 <?php
38 use Wikimedia\NormalizedException\NormalizedException;
39 
46 class WikiImporter {
48  private $reader;
49 
51  private $foreignNamespaces = null;
52 
54  private $mLogItemCallback;
55 
57  private $mUploadCallback;
58 
60  private $mRevisionCallback;
61 
63  private $mPageCallback;
64 
66  private $mSiteInfoCallback;
67 
69  private $mPageOutCallback;
70 
72  private $mNoticeCallback;
73 
75  private $mDebug;
76 
78  private $mImportUploads;
79 
81  private $mImageBasePath;
82 
84  private $mNoUpdates = false;
85 
87  private $pageOffset = 0;
88 
90  private $config;
91 
93  private $importTitleFactory;
94 
96  private $hookRunner;
97 
99  private $countableCache = [];
100 
102  private $disableStatisticsUpdate = false;
103 
105  private $externalUserNames;
106 
108  private $contentLanguage;
109 
111  private $namespaceInfo;
112 
114  private $titleFactory;
115 
117  private $wikiPageFactory;
118 
120  private $uploadRevisionImporter;
121 
123  private $permissionManager;
124 
126  private $contentHandlerFactory;
127 
129  private $slotRoleRegistry;
130 
/**
 * Creates a WikiImporter that parses the XML dump provided by $source.
 *
 * NOTE(review): the `$source` parameter and the registerSource() call were
 * absent from this listing, which left `$id` undefined. They are restored on
 * the assumption that UploadSourceAdapter::registerSource() returns the
 * identifier used in the "uploadsource://" URL -- confirm against the
 * canonical file.
 *
 * @param ImportSource $source Source the XML dump is read from
 * @param Config $config
 * @param HookContainer $hookContainer
 * @param Language $contentLanguage
 * @param NamespaceInfo $namespaceInfo
 * @param TitleFactory $titleFactory
 * @param WikiPageFactory $wikiPageFactory
 * @param UploadRevisionImporter $uploadRevisionImporter
 * @param PermissionManager $permissionManager
 * @param IContentHandlerFactory $contentHandlerFactory
 * @param SlotRoleRegistry $slotRoleRegistry
 * @throws MWException If the XML reader cannot be opened
 */
public function __construct(
	ImportSource $source,
	Config $config,
	HookContainer $hookContainer,
	Language $contentLanguage,
	NamespaceInfo $namespaceInfo,
	TitleFactory $titleFactory,
	WikiPageFactory $wikiPageFactory,
	UploadRevisionImporter $uploadRevisionImporter,
	PermissionManager $permissionManager,
	IContentHandlerFactory $contentHandlerFactory,
	SlotRoleRegistry $slotRoleRegistry
) {
	$this->reader = new XMLReader();
	$this->config = $config;
	$this->hookRunner = new HookRunner( $hookContainer );
	$this->contentLanguage = $contentLanguage;
	$this->namespaceInfo = $namespaceInfo;
	$this->titleFactory = $titleFactory;
	$this->wikiPageFactory = $wikiPageFactory;
	$this->uploadRevisionImporter = $uploadRevisionImporter;
	$this->permissionManager = $permissionManager;
	$this->contentHandlerFactory = $contentHandlerFactory;
	$this->slotRoleRegistry = $slotRoleRegistry;

	// Register the stream wrapper only once per process.
	if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
		stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
	}
	$id = UploadSourceAdapter::registerSource( $source );

	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( false );
	if ( defined( 'LIBXML_PARSEHUGE' ) ) {
		$status = $this->reader->open( "uploadsource://$id", null, LIBXML_PARSEHUGE );
	} else {
		$status = $this->reader->open( "uploadsource://$id" );
	}
	if ( !$status ) {
		$error = libxml_get_last_error();
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
		// libxml_get_last_error() returns false when no error was recorded,
		// so do not dereference it blindly.
		throw new MWException( 'Encountered an internal error while initializing WikiImporter object: ' .
			( $error ? $error->message : 'unknown libxml error' ) );
	}
	// phpcs:ignore Generic.PHP.NoSilencedErrors
	@libxml_disable_entity_loader( $oldDisable );

	// Default callbacks
	$this->setPageCallback( [ $this, 'beforeImportPage' ] );
	$this->setRevisionCallback( [ $this, "importRevision" ] );
	$this->setUploadCallback( [ $this, 'importUpload' ] );
	$this->setLogItemCallback( [ $this, 'importLogItem' ] );
	$this->setPageOutCallback( [ $this, 'finishImportPage' ] );

	$this->importTitleFactory = new NaiveImportTitleFactory(
		$this->contentLanguage,
		$this->namespaceInfo,
		$this->titleFactory
	);
	$this->externalUserNames = new ExternalUserNames( 'imported', false );
}
209 
/**
 * Exposes the XMLReader this importer reads the dump from.
 *
 * @return XMLReader
 */
public function getReader() {
	$xmlReader = $this->reader;
	return $xmlReader;
}
216 
/**
 * Reports an XML error to the debug log.
 *
 * Despite its name, this method does not throw; it only logs.
 *
 * @param string $err Error description
 */
public function throwXmlError( $err ) {
	$failureText = "FAILURE: $err";
	$this->debug( $failureText );

	$logLine = "WikiImporter XML error: $err";
	wfDebug( $logLine );
}
224 
/**
 * Writes a message to the debug log, but only when debugging was
 * enabled through setDebug().
 *
 * @param string $data Message to log
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}

	wfDebug( "IMPORT: $data" );
}
233 
/**
 * Writes a warning to the debug log, regardless of the debug flag.
 *
 * @param string $data Message to log
 */
public function warn( $data ) {
	$logLine = "IMPORT: $data";
	wfDebug( $logLine );
}
240 
/**
 * Emits a notice, either through the registered notice callback or,
 * when none is callable, through the debug log.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters
 */
public function notice( $msg, ...$params ) {
	if ( !is_callable( $this->mNoticeCallback ) ) {
		# No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		$rendered = wfMessage( $msg, $params )->text();
		wfDebug( $rendered );
		return;
	}

	call_user_func( $this->mNoticeCallback, $msg, $params );
}
254 
/**
 * Turns debug logging on or off; debug() only logs while this is truthy.
 *
 * @param bool $debug
 */
public function setDebug( $debug ) {
	$this->mDebug = $debug;
}
262 
/**
 * Stores the "no updates" flag, which is forwarded to every
 * WikiRevision this importer builds.
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
270 
/**
 * Sets the page offset: doImport() skips top-level nodes until this many
 * <page> tags have been encountered.
 *
 * @param int $nthPage
 */
public function setPageOffset( $nthPage ) {
	$this->pageOffset = $nthPage;
}
280 
/**
 * Registers the callback used by notice() and hands back the one it replaces.
 *
 * Assignment is delegated to wfSetVar(), which leaves the current value
 * in place when $callback is null.
 *
 * @param callable|null $callback
 * @return callable|null The previously registered callback
 */
public function setNoticeCallback( $callback ) {
	$previous = wfSetVar( $this->mNoticeCallback, $callback );
	return $previous;
}
290 
/**
 * Replaces the callback invoked by pageCallback() and returns the old one.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was registered before
 */
public function setPageCallback( $callback ) {
	$replaced = $this->mPageCallback;
	$this->mPageCallback = $callback;

	return $replaced;
}
301 
/**
 * Replaces the callback invoked by pageOutCallback() once a page has been
 * processed, and returns the old one.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was registered before
 */
public function setPageOutCallback( $callback ) {
	$replaced = $this->mPageOutCallback;
	$this->mPageOutCallback = $callback;

	return $replaced;
}
316 
/**
 * Replaces the callback invoked for each imported revision and returns
 * the old one.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was registered before
 */
public function setRevisionCallback( $callback ) {
	$replaced = $this->mRevisionCallback;
	$this->mRevisionCallback = $callback;

	return $replaced;
}
327 
/**
 * Replaces the callback invoked for each imported upload and returns
 * the old one.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was registered before
 */
public function setUploadCallback( $callback ) {
	$replaced = $this->mUploadCallback;
	$this->mUploadCallback = $callback;

	return $replaced;
}
338 
/**
 * Replaces the callback invoked for each imported log item and returns
 * the old one.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was registered before
 */
public function setLogItemCallback( $callback ) {
	$replaced = $this->mLogItemCallback;
	$this->mLogItemCallback = $callback;

	return $replaced;
}
349 
/**
 * Replaces the callback invoked with the parsed <siteinfo> data and
 * returns the old one.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was registered before
 */
public function setSiteInfoCallback( $callback ) {
	$replaced = $this->mSiteInfoCallback;
	$this->mSiteInfoCallback = $callback;

	return $replaced;
}
360 
/**
 * Sets the factory that processTitle() uses to turn foreign titles
 * into local titles.
 *
 * @param ImportTitleFactory $factory
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
369 
/**
 * Selects the namespace imported pages are placed in.
 *
 * Passing null keeps the namespaces of the source pages. Otherwise the
 * value must be a non-negative ID of an existing namespace.
 *
 * NOTE(review): the `new …ImportTitleFactory(` lines were missing from this
 * listing and are restored here; the class names follow the constructor's
 * NaiveImportTitleFactory usage and the argument lists -- confirm against
 * the canonical file.
 *
 * @param int|string|null $namespace
 * @return bool False when the namespace is invalid, true otherwise
 */
public function setTargetNamespace( $namespace ) {
	if ( $namespace === null ) {
		// Don't override namespaces
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$this->namespaceInfo,
				$this->titleFactory
			)
		);
		return true;
	} elseif (
		$namespace >= 0 &&
		$this->namespaceInfo->exists( intval( $namespace ) )
	) {
		$namespace = intval( $namespace );
		$this->setImportTitleFactory(
			new NamespaceImportTitleFactory(
				$this->namespaceInfo,
				$this->titleFactory,
				$namespace
			)
		);
		return true;
	} else {
		return false;
	}
}
403 
/**
 * Selects a root page under which imported pages become subpages.
 *
 * Null restores the default title factory; an empty string leaves the
 * current factory untouched. Any other value must parse to a local title
 * in a namespace that has subpages enabled.
 *
 * NOTE(review): the `new …ImportTitleFactory(` lines were missing from this
 * listing and are restored here -- confirm against the canonical file.
 *
 * @param string|null $rootpage
 * @return Status Good on success; fatal with 'import-rootpage-invalid' or
 *  'import-rootpage-nosubpage' otherwise
 */
public function setTargetRootPage( $rootpage ) {
	$status = Status::newGood();
	$nsInfo = $this->namespaceInfo;
	if ( $rootpage === null ) {
		// No rootpage
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$nsInfo,
				$this->titleFactory
			)
		);
	} elseif ( $rootpage !== '' ) {
		$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
		$title = Title::newFromText( $rootpage );

		if ( !$title || $title->isExternal() ) {
			$status->fatal( 'import-rootpage-invalid' );
		} elseif ( !$nsInfo->hasSubpages( $title->getNamespace() ) ) {
			$displayNSText = $title->getNamespace() === NS_MAIN
				? wfMessage( 'blanknamespace' )->text()
				: $this->contentLanguage->getNsText( $title->getNamespace() );
			$status->fatal( 'import-rootpage-nosubpage', $displayNSText );
		} else {
			// set namespace to 'all', so the namespace check in processTitle() can pass
			$this->setTargetNamespace( null );
			$this->setImportTitleFactory(
				new SubpageImportTitleFactory(
					$nsInfo,
					$this->titleFactory,
					$title
				)
			);
		}
	}
	return $status;
}
446 
/**
 * Sets the directory that handleUpload() resolves an upload's `rel`
 * path against.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
453 
/**
 * Controls whether <upload> elements are actually imported; handleUpload()
 * only calls processUpload() while this is truthy.
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
460 
/**
 * Replaces the ExternalUserNames helper used to prefix imported user names.
 *
 * @param string $usernamePrefix
 * @param bool $assignKnownUsers
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$userNames = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
	$this->externalUserNames = $userNames;
}
469 
/**
 * Turns off the article-count adjustment performed by finishImportPage().
 */
public function disableStatisticsUpdate() {
	$this->disableStatisticsUpdate = true;
}
477 
/**
 * Default per-page callback: remembers whether the page was countable
 * before the import touches it.
 *
 * The cache key is built with CacheKeyHelper::getKeyForPage(), which is the
 * format finishImportPage() uses for its lookup. The previous key was based
 * on getPrefixedText(), which never matched that lookup, so the article
 * count was never adjusted.
 *
 * @param array $titleAndForeignTitle Two-element array; element 0 is the
 *  local Title, element 1 the foreign title
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$title = $titleAndForeignTitle[0];
	$page = $this->wikiPageFactory->newFromTitle( $title );
	$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $title );
	$this->countableCache[$countKey] = $page->isCountable();
	return true;
}
490 
/**
 * Default revision callback: imports a single revision.
 *
 * A notice is emitted and false returned when the revision's content
 * handler cannot be used on the target title, or when the content fails
 * to unserialize.
 *
 * @param WikiRevision $revision
 * @return bool Result of importOldRevision(), or false on the errors above
 */
public function importRevision( $revision ) {
	$usableHere = $revision->getContentHandler()->canBeUsedOn( $revision->getTitle() );
	if ( !$usableHere ) {
		$this->notice( 'import-error-bad-location',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);

		return false;
	}

	try {
		return $revision->importOldRevision();
	} catch ( MWContentSerializationException $ex ) {
		$this->notice( 'import-error-unserialize',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);

		return false;
	}
}
521 
/**
 * Default log item callback: delegates to the revision's own importLogItem().
 *
 * @param WikiRevision $revision
 * @return mixed Whatever WikiRevision::importLogItem() returns
 */
public function importLogItem( $revision ) {
	$outcome = $revision->importLogItem();
	return $outcome;
}
530 
/**
 * Default upload callback: hands the revision to the upload revision importer.
 *
 * @param WikiRevision $revision
 * @return bool Whether the importer's status is good
 */
public function importUpload( $revision ) {
	return $this->uploadRevisionImporter->import( $revision )->isGood();
}
540 
/**
 * Default page-out callback: adjusts the article count if the page's
 * countable state changed, then runs the AfterImportPage hook.
 *
 * NOTE(review): the line opening the deferred site-stats update was missing
 * from this listing and is restored as
 * DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [ ... ] ) ) --
 * confirm against the canonical file.
 * NOTE(review): the lookup only succeeds if beforeImportPage() stores its
 * entry in countableCache under the same key format used here.
 *
 * @param PageIdentity $pageIdentity Page that was imported
 * @param ForeignTitle $foreignTitle Title of the page on the source wiki
 * @param int $revCount Number of revisions encountered
 * @param int $sRevCount Number of revisions imported successfully
 * @param array $pageInfo Page information collected while parsing
 * @return mixed Result of the AfterImportPage hook runner call
 */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$page = $this->wikiPageFactory->newFromTitle( $pageIdentity );

		$page->loadPageData( WikiPage::READ_LATEST );
		$rev = $page->getRevisionRecord();
		if ( $rev === null ) {

			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
				' because WikiPage::getRevisionRecord() returned null' );
		} else {
			$user = RequestContext::getMain()->getUser();
			$update = $page->newPageUpdater( $user )->prepareUpdate();
			$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $pageIdentity );
			$countable = $update->isCountable();
			if ( array_key_exists( $countKey, $this->countableCache ) &&
				$countable != $this->countableCache[$countKey] ) {
				// +1 when the page became countable, -1 when it stopped being so.
				DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
					'articles' => ( (int)$countable - (int)$this->countableCache[$countKey] )
				] ) );
			}
		}
	}

	$title = Title::castFromPageIdentity( $pageIdentity );
	// @phan-suppress-next-line PhanTypeMismatchArgumentNullable castFrom does not return null here
	return $this->hookRunner->onAfterImportPage( $title, $foreignTitle,
		$revCount, $sRevCount, $pageInfo );
}
587 
/**
 * Forwards parsed site information to the registered site-info callback.
 *
 * @param array $siteInfo
 * @return mixed The callback's result, or false when none is registered
 */
private function siteInfoCallback( $siteInfo ) {
	if ( !isset( $this->mSiteInfoCallback ) ) {
		return false;
	}

	return call_user_func_array(
		$this->mSiteInfoCallback,
		[ $siteInfo, $this ]
	);
}
603 
/**
 * Invokes the registered page callback, if any, at the start of a page.
 *
 * @param array $title Two-element array of local title and foreign title
 */
public function pageCallback( $title ) {
	if ( !isset( $this->mPageCallback ) ) {
		return;
	}

	call_user_func( $this->mPageCallback, $title );
}
613 
/**
 * Invokes the registered page-out callback, if any, passing through every
 * argument this method received.
 *
 * @param PageIdentity $pageIdentity
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sucCount
 * @param array $pageInfo
 */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sucCount, $pageInfo ) {
	if ( !isset( $this->mPageOutCallback ) ) {
		return;
	}

	$forwarded = func_get_args();
	call_user_func_array( $this->mPageOutCallback, $forwarded );
}
628 
/**
 * Forwards a revision to the registered revision callback.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's result, or false when none is registered
 */
private function revisionCallback( $revision ) {
	if ( !isset( $this->mRevisionCallback ) ) {
		return false;
	}

	return call_user_func_array(
		$this->mRevisionCallback,
		[ $revision, $this ]
	);
}
644 
/**
 * Forwards a log item to the registered log-item callback.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's result, or false when none is registered
 */
private function logItemCallback( $revision ) {
	if ( !isset( $this->mLogItemCallback ) ) {
		return false;
	}

	return call_user_func_array(
		$this->mLogItemCallback,
		[ $revision, $this ]
	);
}
660 
/**
 * Reads an attribute of the node the reader is positioned on.
 *
 * @param string $attr Attribute name
 * @return string The attribute value, or '' when getAttribute() yields null
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );
	return $value ?? '';
}
670 
/**
 * Collects the text of the current element, advancing the reader until the
 * next end-element node.
 *
 * Text, CDATA and significant-whitespace nodes are concatenated. An empty
 * element yields '' without moving the reader. If the input ends before an
 * end element is found, the reader is closed and '' is returned.
 *
 * @return string
 */
public function nodeContents() {
	if ( $this->reader->isEmptyElement ) {
		return "";
	}

	$collected = "";
	while ( $this->reader->read() ) {
		$nodeType = $this->reader->nodeType;

		if ( $nodeType === XMLReader::END_ELEMENT ) {
			return $collected;
		}

		if (
			$nodeType === XMLReader::TEXT ||
			$nodeType === XMLReader::CDATA ||
			$nodeType === XMLReader::SIGNIFICANT_WHITESPACE
		) {
			$collected .= $this->reader->value;
		}
	}

	// Ran out of input before the element was closed.
	$this->reader->close();
	return '';
}
698 
/**
 * Runs the import: verifies the <mediawiki> root element and then walks the
 * top-level nodes, dispatching <siteinfo>, <page> and <logitem> to their
 * handlers.
 *
 * The reader is closed and the previous entity-loader state restored when
 * this method finishes, whether normally or by exception.
 *
 * @return bool Always true when no exception is thrown
 * @throws NormalizedException If the root element is wrong and libxml
 *  recorded an error
 * @throws MWException If the root element is wrong and no libxml error
 *  is available
 */
public function doImport() {
	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (T48932).
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$previousLoaderState = @libxml_disable_entity_loader( true );
	try {
		$this->reader->read();

		if ( $this->reader->localName != 'mediawiki' ) {
			// phpcs:ignore Generic.PHP.NoSilencedErrors
			@libxml_disable_entity_loader( $previousLoaderState );
			$xmlError = libxml_get_last_error();
			if ( !$xmlError ) {
				throw new MWException( "Expected <mediawiki> tag, got " .
					$this->reader->localName );
			}
			throw new NormalizedException( "XML error at line {line}: {message}", [
				'line' => $xmlError->line,
				'message' => $xmlError->message,
			] );
		}
		$this->debug( "<mediawiki> tag is correct." );

		$this->debug( "Starting primary dump processing loop." );

		$pagesSeen = 0;
		$hasNode = $this->reader->read();
		while ( $hasNode ) {
			$tag = $this->reader->localName;
			if ( $this->pageOffset ) {
				if ( $tag === 'page' ) {
					$pagesSeen++;
				}
				// Skip whole subtrees until the requested page is reached.
				if ( $pagesSeen < $this->pageOffset ) {
					$hasNode = $this->reader->next();
					continue;
				}
			}
			$type = $this->reader->nodeType;
			$skipSubtree = false;

			// A hook returning false means the tag was dealt with elsewhere.
			if ( $this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
				if ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
					break;
				}

				if ( $tag == 'siteinfo' ) {
					$this->handleSiteInfo();
				} elseif ( $tag == 'page' ) {
					$this->handlePage();
				} elseif ( $tag == 'logitem' ) {
					$this->handleLogItem();
				} elseif ( $tag != '#text' ) {
					$this->warn( "Unhandled top-level XML tag $tag" );

					$skipSubtree = true;
				}
			}

			if ( $skipSubtree ) {
				$hasNode = $this->reader->next();
				$this->debug( "Skip" );
			} else {
				$hasNode = $this->reader->read();
			}
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $previousLoaderState );
		$this->reader->close();
	}

	return true;
}
780 
/**
 * Parses a <siteinfo> element: records the foreign namespaces, collects
 * the simple fields, and passes the result to the site-info callback.
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );
	$collected = [];

	// Fields that can just be stuffed in the siteInfo object
	$simpleTags = [ 'sitename', 'base', 'generator', 'case' ];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'siteinfo';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( $tag == 'namespace' ) {
			// Read the key first: nodeContents() moves the reader forward.
			$nsKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$nsKey] = $this->nodeContents();
			continue;
		}

		if ( in_array( $tag, $simpleTags ) ) {
			$collected[$tag] = $this->nodeContents();
		}
	}

	$collected['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $collected );
}
807 
/**
 * Parses a <logitem> element into an array and passes it to processLogItem().
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );
	$collected = [];

	// Fields that can just be stuffed in the pageInfo object
	$simpleTags = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'logitem';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		// A hook returning false means the tag was dealt with elsewhere.
		if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $collected ) ) {
			continue;
		}

		if ( in_array( $tag, $simpleTags ) ) {
			$collected[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$collected['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $collected );
}
837 
/**
 * Builds a WikiRevision from parsed log item data and hands it to the
 * log-item callback.
 *
 * @param array $logInfo Data collected by handleLogItem()
 * @return mixed Result of the log-item callback
 */
private function processLogItem( $logInfo ) {
	$logEntry = new WikiRevision( $this->config );

	if ( isset( $logInfo['id'] ) ) {
		$logEntry->setID( $logInfo['id'] );
	}

	$logEntry->setType( $logInfo['type'] );
	$logEntry->setAction( $logInfo['action'] );

	if ( isset( $logInfo['timestamp'] ) ) {
		$logEntry->setTimestamp( $logInfo['timestamp'] );
	}

	if ( isset( $logInfo['params'] ) ) {
		$logEntry->setParams( $logInfo['params'] );
	}

	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$logTitle = Title::newFromText( $logInfo['logtitle'] );
		$logEntry->setTitle( $logTitle );
	}

	$logEntry->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$logEntry->setComment( $logInfo['comment'] );
	}

	// Attribute the entry to a named user, an IP, or a placeholder user.
	if ( isset( $logInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix( $logInfo['contributor']['username'] );
		$logEntry->setUsername( $prefixedName );
	} elseif ( isset( $logInfo['contributor']['ip'] ) ) {
		$logEntry->setUserIP( $logInfo['contributor']['ip'] );
	} else {
		$logEntry->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	return $this->logItemCallback( $logEntry );
}
880 
/**
 * Parses a <page> element.
 *
 * Simple fields are stored in the page info array. The first <revision> or
 * <upload> triggers title resolution; when that fails, the rest of the page
 * is skipped. After the page closes, the page-out callback is invoked if a
 * valid title was resolved.
 */
private function handlePage() {
	// Handle page data.
	$this->debug( "Enter page handler." );
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Fields that can just be stuffed in the pageInfo object
	$simpleTags = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	$skipSubtree = false;
	$titleRejected = false;

	while ( $skipSubtree ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'page';
		if ( $atClosingTag ) {
			break;
		}

		$skipSubtree = false;

		$tag = $this->reader->localName;

		if ( $titleRejected ) {
			// The title is invalid, bail out of this page
			$skipSubtree = true;
			continue;
		}

		// A hook returning false means the tag was dealt with elsewhere.
		if ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
			continue;
		}

		if ( in_array( $tag, $simpleTags ) ) {
			// An XML snippet:
			// <page>
			// <id>123</id>
			// <title>Page</title>
			// <redirect title="NewTitle"/>
			// ...
			// Because the redirect tag is built differently, we need special handling for that case.
			$pageInfo[$tag] = $tag == 'redirect'
				? $this->nodeAttribute( 'title' )
				: $this->nodeContents();
			continue;
		}

		if ( $tag == 'revision' || $tag == 'upload' ) {
			if ( !isset( $titlePair ) ) {
				$titlePair = $this->processTitle( $pageInfo['title'],
					$pageInfo['ns'] ?? null );

				// $titlePair is either an array of two titles or false.
				if ( is_array( $titlePair ) ) {
					$this->pageCallback( $titlePair );
					list( $pageInfo['_title'], $foreignTitle ) = $titlePair;
				} else {
					$titleRejected = true;
					$skipSubtree = true;
				}
			}

			if ( $titlePair ) {
				if ( $tag == 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
			continue;
		}

		if ( $tag != '#text' ) {
			$this->warn( "Unhandled page XML tag $tag" );
			$skipSubtree = true;
		}
	}

	// @note $pageInfo['_title'] is only set if a valid title was processed
	// above with no error. In that case pageCallback was called above and
	// $foreignTitle was assigned together with it, so pageOutCallback can
	// be called here.
	if ( !array_key_exists( '_title', $pageInfo ) ) {
		return;
	}

	/** @var Title $title */
	$title = $pageInfo['_title'];
	$this->pageOutCallback(
		$title,
		// @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
		$foreignTitle,
		$pageInfo['revisionCount'],
		$pageInfo['successfulRevisionCount'],
		$pageInfo
	);
}
966 
/**
 * Parses a <revision> element, then processes it and updates the page's
 * revision counters.
 *
 * @param array &$pageInfo Page information; 'revisionCount' is always
 *  incremented, 'successfulRevisionCount' only when processRevision()
 *  returns a truthy value
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );
	$revisionInfo = [];

	$normalFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
		'model', 'format', 'text', 'sha1' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'revision' ) {
			break;
		}

		// Skipping applies to one unhandled tag only. Without this reset a
		// single unknown tag would make every later iteration use next(),
		// unlike handlePage() and doImport(), which reset their flag.
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleRevisionXMLTag(
			$this, $pageInfo, $revisionInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$revisionInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'content' ) {
			// We can have multiple content tags, so make this an array.
			$revisionInfo[$tag][] = $this->handleContent();
		} elseif ( $tag == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			$skip = true;
		}
	}

	$pageInfo['revisionCount']++;
	if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
1009 
/**
 * Parses a <content> element (one slot of a revision) into an array.
 *
 * @return array Map of the recognised child tags ('role', 'origin',
 *  'model', 'format', 'text') to their text contents, plus whatever
 *  hook handlers added
 */
private function handleContent() {
	$this->debug( "Enter content handler" );
	$contentInfo = [];

	$normalFields = [ 'role', 'origin', 'model', 'format', 'text' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'content' ) {
			break;
		}

		// Skipping applies to one unhandled tag only. Without this reset a
		// single unknown tag would make every later iteration use next(),
		// unlike handlePage() and doImport(), which reset their flag.
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleContentXMLTag(
			$this, $contentInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$contentInfo[$tag] = $this->nodeContents();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled content XML tag $tag" );
			$skip = true;
		}
	}

	return $contentInfo;
}
1040 
/**
 * Builds a Content object from parsed slot or revision data.
 *
 * @param Title $title Page the content belongs to; used to determine the
 *  default content model
 * @param int $revisionId Revision ID, only used in error messages
 * @param array $contentInfo Parsed data; must contain 'text', may contain
 *  'model' and 'role'
 * @return Content
 * @throws MWException If 'text' is missing, or the text of a size-checked
 *  model exceeds the MaxArticleSize setting
 */
private function makeContent( Title $title, $revisionId, $contentInfo ) {
	$maxArticleSize = MediaWikiServices::getInstance()->getMainConfig()->get(
		MainConfigNames::MaxArticleSize );

	if ( !isset( $contentInfo['text'] ) ) {
		throw new MWException( 'Missing text field in import.' );
	}

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Testing for revisions with only listed
	// content models, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	$sizeCheckedModels = [
		'wikitext',
		'css',
		'json',
		'javascript',
		'text',
		''
	];
	$isSizeChecked = !isset( $contentInfo['model'] ) ||
		in_array( $contentInfo['model'], $sizeCheckedModels );

	if ( $isSizeChecked && strlen( $contentInfo['text'] ) > $maxArticleSize * 1024 ) {
		$subject = $revisionId ?
			"the revision with ID $revisionId" :
			'a revision';
		throw new MWException( 'The text of ' . $subject .
			" exceeds the maximum allowable size ({$maxArticleSize} KiB)" );
	}

	$slotRole = $contentInfo['role'] ?? SlotRecord::MAIN;
	$modelId = $contentInfo['model'] ?? $this->getDefaultContentModel( $title, $slotRole );
	$contentHandler = $this->getContentHandler( $modelId );

	$transformed = $contentHandler->importTransform( $contentInfo['text'] );

	return $contentHandler->unserializeContent( $transformed );
}
1087 
/**
 * Builds a WikiRevision from parsed revision data and hands it to the
 * revision callback.
 *
 * @param array $pageInfo Page information; '_title' must be set
 * @param array $revisionInfo Data collected by handleRevision()
 * @return mixed Result of the revision callback
 * @throws MWException If a <content> slot has no role, or content
 *  creation fails in makeContent()
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	$imported = new WikiRevision( $this->config );

	$revId = $revisionInfo['id'] ?? 0;
	if ( $revId ) {
		$imported->setID( $revisionInfo['id'] );
	}

	$title = $pageInfo['_title'];
	$imported->setTitle( $title );

	// Main slot comes from the revision-level fields.
	$mainContent = $this->makeContent( $title, $revId, $revisionInfo );
	$imported->setContent( SlotRecord::MAIN, $mainContent );

	// Additional slots come from <content> elements.
	$extraSlots = $revisionInfo['content'] ?? [];
	foreach ( $extraSlots as $slotInfo ) {
		if ( !isset( $slotInfo['role'] ) ) {
			throw new MWException( "Missing role for imported slot." );
		}

		$slotContent = $this->makeContent( $title, $revId, $slotInfo );
		$imported->setContent( $slotInfo['role'], $slotContent );
	}
	$imported->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$imported->setComment( $revisionInfo['comment'] );
	}

	if ( isset( $revisionInfo['minor'] ) ) {
		$imported->setMinor( true );
	}

	// Attribute the revision to a named user, an IP, or a placeholder user.
	if ( isset( $revisionInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix(
			$revisionInfo['contributor']['username']
		);
		$imported->setUsername( $prefixedName );
	} elseif ( isset( $revisionInfo['contributor']['ip'] ) ) {
		$imported->setUserIP( $revisionInfo['contributor']['ip'] );
	} else {
		$imported->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	if ( isset( $revisionInfo['sha1'] ) ) {
		$imported->setSha1Base36( $revisionInfo['sha1'] );
	}
	$imported->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $imported );
}
1141 
/**
 * Parses an <upload> element and, when upload importing is enabled,
 * processes it.
 *
 * Inline base64 <contents> are written to a temporary file. If an image
 * base path is configured and the file named by `rel` exists below it,
 * that file takes precedence as the source.
 *
 * @param array &$pageInfo Page information, passed on to the hook and to
 *  processUpload()
 * @return mixed Result of processUpload(), or null when upload importing
 *  is disabled
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );
	$uploadInfo = [];

	$normalFields = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'upload' ) {
			break;
		}

		// Skipping applies to one unhandled tag only. Without this reset a
		// single unknown tag would make every later iteration use next(),
		// unlike handlePage() and doImport(), which reset their flag.
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$uploadInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag == 'contents' ) {
			// Read the attribute while the reader is still on the start tag;
			// nodeContents() advances the reader to the closing tag.
			$encoding = $this->reader->getAttribute( 'encoding' );
			$contents = $this->nodeContents();
			if ( $encoding === 'base64' ) {
				$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
				$uploadInfo['isTempSrc'] = true;
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$skip = true;
		}
	}

	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		$path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
		if ( file_exists( $path ) ) {
			$uploadInfo['fileSrc'] = $path;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	if ( $this->mImportUploads ) {
		return $this->processUpload( $pageInfo, $uploadInfo );
	}
}
1194 
/**
 * Writes data to a newly created temporary file.
 *
 * @param string $contents Data to write
 * @return string Path of the temporary file
 */
private function dumpTemp( $contents ) {
	$tempDir = wfTempDir();
	$tempPath = tempnam( $tempDir, 'importupload' );
	file_put_contents( $tempPath, $contents );

	return $tempPath;
}
1204 
/**
 * Builds a WikiRevision from parsed upload data and hands it to the
 * upload callback.
 *
 * @param array $pageInfo Page information; 'id' and '_title' are read
 * @param array $uploadInfo Data collected by handleUpload()
 * @return mixed Result of the upload callback
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$upload = new WikiRevision( $this->config );
	$revId = $pageInfo['id'];
	$title = $pageInfo['_title'];
	$content = $this->makeContent( $title, $revId, $uploadInfo );

	$upload->setTitle( $title );
	$upload->setID( $revId );
	$upload->setTimestamp( $uploadInfo['timestamp'] );
	$upload->setContent( SlotRecord::MAIN, $content );
	$upload->setFilename( $uploadInfo['filename'] );

	if ( isset( $uploadInfo['archivename'] ) ) {
		$upload->setArchiveName( $uploadInfo['archivename'] );
	}

	$upload->setSrc( $uploadInfo['src'] );

	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$isTemporary = !empty( $uploadInfo['isTempSrc'] );
		$upload->setFileSrc( $uploadInfo['fileSrc'], $isTemporary );
	}

	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$upload->setSha1Base36( $uploadInfo['sha1base36'] );
	}

	$upload->setSize( intval( $uploadInfo['size'] ) );
	$upload->setComment( $uploadInfo['comment'] );

	// Attribute the upload to a named user or an IP, when either is known.
	if ( isset( $uploadInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix(
			$uploadInfo['contributor']['username']
		);
		$upload->setUsername( $prefixedName );
	} elseif ( isset( $uploadInfo['contributor']['ip'] ) ) {
		$upload->setUserIP( $uploadInfo['contributor']['ip'] );
	}

	$upload->setNoUpdates( $this->mNoUpdates );

	return call_user_func( $this->mUploadCallback, $upload );
}
1247 
/**
 * Parses a <contributor> element.
 *
 * @return array Map with any of the keys 'id', 'ip' and 'username' that
 *  were present; empty for an empty element
 */
private function handleContributor() {
	$this->debug( "Enter contributor handler." );

	if ( $this->reader->isEmptyElement ) {
		return [];
	}

	$knownTags = [ 'id', 'ip', 'username' ];
	$contributor = [];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'contributor';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;
		if ( !in_array( $tag, $knownTags ) ) {
			continue;
		}

		$contributor[$tag] = $this->nodeContents();
	}

	return $contributor;
}
1276 
/**
 * Resolves a title from the dump to a local title and checks that it is a
 * valid import target.
 *
 * A notice is emitted and false returned when the title is invalid, is an
 * interwiki title, cannot exist as a page, or (outside command line mode)
 * the current user may not edit it.
 *
 * @param string $text Title text from the dump
 * @param string|null $ns Namespace ID from the dump, if given
 * @return array|false Two-element array of local title and foreign title,
 *  or false when the page must not be imported
 */
private function processTitle( $text, $ns = null ) {
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory(
			$this->contentLanguage
		)
		: new NamespaceAwareForeignTitleFactory(
			$this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text,
		intval( $ns ) );

	$localTitle = $this->importTitleFactory->createTitleFromForeignTitle(
		$foreignTitle );

	$commandLineMode = $this->config->get( 'CommandLineMode' );

	if ( $localTitle === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}

	if ( $localTitle->isExternal() ) {
		$this->notice( 'import-error-interwiki', $localTitle->getPrefixedText() );
		return false;
	}

	if ( !$localTitle->canExist() ) {
		$this->notice( 'import-error-special', $localTitle->getPrefixedText() );
		return false;
	}

	if ( !$commandLineMode ) {
		$user = RequestContext::getMain()->getUser();

		if ( !$this->permissionManager->userCan( 'edit', $user, $localTitle ) ) {
			# Do not import if the importing wiki user cannot edit this page
			$this->notice( 'import-error-edit', $localTitle->getPrefixedText() );

			return false;
		}
	}

	return [ $localTitle, $foreignTitle ];
}
1322 
/**
 * Looks up the content handler for a content model.
 *
 * @param string $model Content model ID
 * @return ContentHandler
 */
private function getContentHandler( $model ) {
	$handler = $this->contentHandlerFactory->getContentHandler( $model );
	return $handler;
}
1330 
/**
 * Determines the default content model of a slot role on a given page.
 *
 * @param Title $title
 * @param string $role Slot role name
 * @return string Content model ID
 */
private function getDefaultContentModel( $title, $role ) {
	$roleHandler = $this->slotRoleRegistry->getRoleHandler( $role );
	return $roleHandler->getDefaultModel( $title );
}
1342 }
const NS_MAIN
Definition: Defines.php:64
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTempDir()
Tries to get the system directory for temporary files.
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest. If source is NULL, it just returns the val...
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
static addUpdate(DeferrableUpdate $update, $stage=self::POSTSEND)
Add an update to the pending update queue for execution at the appropriate time.
Class to parse and build external user names.
Reporting callback.
Base class for language-specific code.
Definition: Language.php:53
Exception representing a failure to serialize or unserialize a content object.
MediaWiki exception.
Definition: MWException.php:29
Helper class for mapping value objects representing basic entities to cache keys.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
Definition: HookRunner.php:561
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Service for creating WikiPage objects.
A service class for checking permissions To obtain an instance, use MediaWikiServices::getInstance()-...
Value object representing a content slot associated with a page revision.
Definition: SlotRecord.php:40
A registry service for SlotRoleHandlers, used to define which slot roles are available on which page.
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them ba...
static getMain()
Get the RequestContext object associated with the main request.
static factory(array $deltas)
static newGood( $value=null)
Factory function for good results.
Definition: StatusValue.php:82
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Creates Title objects.
Represents a title within MediaWiki.
Definition: Title.php:49
static newFromText( $text, $defaultNamespace=NS_MAIN)
Create a new Title from text, such as what one would find in a link.
Definition: Title.php:370
static castFromPageIdentity(?PageIdentity $pageIdentity)
Return a Title for a given PageIdentity.
Definition: Title.php:319
getPrefixedText()
Get the prefixed title with spaces.
Definition: Title.php:1888
static registerSource(ImportSource $source)
XML file reader for the page data importer.
setImportUploads( $import)
doImport()
Primary entry point.
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
setNoUpdates( $noupdates)
Set 'no updates' mode.
setLogItemCallback( $callback)
Sets the action to perform as each log item reached.
importUpload( $revision)
Dummy for now...
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
setPageOffset( $nthPage)
Sets 'pageOffset' value.
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
importLogItem( $revision)
Default per-revision callback, performs the import.
setImageBasePath( $dir)
throwXmlError( $err)
setDebug( $debug)
Set debug mode...
__construct(ImportSource $source, Config $config, HookContainer $hookContainer, Language $contentLanguage, NamespaceInfo $namespaceInfo, TitleFactory $titleFactory, WikiPageFactory $wikiPageFactory, UploadRevisionImporter $uploadRevisionImporter, PermissionManager $permissionManager, IContentHandlerFactory $contentHandlerFactory, SlotRoleRegistry $slotRoleRegistry)
Creates an ImportXMLReader drawing from the source provided.
notice( $msg,... $params)
importRevision( $revision)
Default per-revision callback, performs the import.
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
setNoticeCallback( $callback)
Set a callback that displays notice messages.
finishImportPage(PageIdentity $pageIdentity, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
disableStatisticsUpdate()
Updating the statistics can take a lot of time.
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
Represents a revision, log entry or upload during the import process.
Interface for configuration instances.
Definition: Config.php:30
Source interface for XML import.
Interface for objects (potentially) representing an editable wiki page.
$debug
Definition: mcc.php:31
$source
$content
Definition: router.php:76