MediaWiki  master
WikiImporter.php
Go to the documentation of this file.
1 <?php
39 use Wikimedia\NormalizedException\NormalizedException;
40 
47 class WikiImporter {
/** @var XMLReader Reader the dump is streamed through; created in the constructor */
private $reader;

/**
 * @var array|null Namespace names read from <siteinfo>, keyed by each
 *  <namespace> element's "key" attribute; null until a <siteinfo> is handled
 */
private $foreignNamespaces = null;

/** @var callable|null Invoked for every parsed <logitem> */
private $mLogItemCallback;

/** @var callable|null Invoked for every parsed <upload> when upload import is enabled */
private $mUploadCallback;

/** @var callable|null Invoked for every parsed <revision> */
private $mRevisionCallback;

/** @var callable|null Invoked once per page before its first revision/upload is handled */
private $mPageCallback;

/** @var callable|null Invoked after <siteinfo> has been parsed */
private $mSiteInfoCallback;

/** @var callable|null Invoked when a page with a valid title has been fully read */
private $mPageOutCallback;

/** @var callable|null Receives notices; when not callable, notices go to wfDebug() */
private $mNoticeCallback;

/** @var bool|null When truthy, debug() forwards its messages to wfDebug() */
private $mDebug;

/** @var bool|null When truthy, parsed uploads are passed on to processUpload() */
private $mImportUploads;

/** @var string|null Directory prefixed to an upload's "rel" path to find the file locally */
private $mImageBasePath;

/** @var bool Value handed to WikiRevision::setNoUpdates() for everything imported */
private $mNoUpdates = false;

/** @var int When non-zero, top-level nodes before the Nth <page> are skipped */
private $pageOffset = 0;

/** @var Config */
private $config;

/** @var ImportTitleFactory Maps foreign titles onto local titles */
private $importTitleFactory;

/** @var HookRunner */
private $hookRunner;

/** @var array Pre-import WikiPage::isCountable() results, keyed by 'title_' . <page key> */
private $countableCache = [];

/** @var bool When true, finishImportPage() skips the article-count adjustment */
private $disableStatisticsUpdate = false;

/** @var ExternalUserNames Applies the configured prefix to imported user names */
private $externalUserNames;

/** @var Language */
private $contentLanguage;

/** @var NamespaceInfo */
private $namespaceInfo;

/** @var TitleFactory */
private $titleFactory;

/** @var WikiPageFactory */
private $wikiPageFactory;

/** @var UploadRevisionImporter */
private $uploadRevisionImporter;

/** @var PermissionManager */
private $permissionManager;

/** @var IContentHandlerFactory */
private $contentHandlerFactory;

/** @var SlotRoleRegistry */
private $slotRoleRegistry;
131 
/**
 * Creates an importer that reads an XML dump from the given source.
 *
 * The source is registered with the "uploadsource" stream wrapper and then
 * opened through XMLReader. Default callbacks, a naive import title factory
 * and an "imported" user name prefix are installed; all can be replaced
 * through the corresponding setters.
 *
 * @param ImportSource $source Source the XML dump is read from
 * @param Config $config
 * @param HookContainer $hookContainer
 * @param Language $contentLanguage
 * @param NamespaceInfo $namespaceInfo
 * @param TitleFactory $titleFactory
 * @param WikiPageFactory $wikiPageFactory
 * @param UploadRevisionImporter $uploadRevisionImporter
 * @param PermissionManager $permissionManager
 * @param IContentHandlerFactory $contentHandlerFactory
 * @param SlotRoleRegistry $slotRoleRegistry
 * @throws MWException If XMLReader cannot open the registered source
 */
public function __construct(
    ImportSource $source,
    Config $config,
    HookContainer $hookContainer,
    Language $contentLanguage,
    NamespaceInfo $namespaceInfo,
    TitleFactory $titleFactory,
    WikiPageFactory $wikiPageFactory,
    UploadRevisionImporter $uploadRevisionImporter,
    PermissionManager $permissionManager,
    IContentHandlerFactory $contentHandlerFactory,
    SlotRoleRegistry $slotRoleRegistry
) {
    $this->reader = new XMLReader();
    $this->config = $config;
    $this->hookRunner = new HookRunner( $hookContainer );
    $this->contentLanguage = $contentLanguage;
    $this->namespaceInfo = $namespaceInfo;
    $this->titleFactory = $titleFactory;
    $this->wikiPageFactory = $wikiPageFactory;
    $this->uploadRevisionImporter = $uploadRevisionImporter;
    $this->permissionManager = $permissionManager;
    $this->contentHandlerFactory = $contentHandlerFactory;
    $this->slotRoleRegistry = $slotRoleRegistry;

    if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
        stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
    }
    // The returned ID is what makes the source addressable as uploadsource://<id>
    $id = UploadSourceAdapter::registerSource( $source );

    // Enable the entity loader, as it is needed for loading external URLs via
    // XMLReader::open (T86036)
    // phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
    $oldDisable = @libxml_disable_entity_loader( false );
    try {
        if ( defined( 'LIBXML_PARSEHUGE' ) ) {
            $status = $this->reader->open( "uploadsource://$id", null, LIBXML_PARSEHUGE );
        } else {
            $status = $this->reader->open( "uploadsource://$id" );
        }
        if ( !$status ) {
            // libxml_get_last_error() returns false when no error was recorded
            $error = libxml_get_last_error();
            throw new MWException( 'Encountered an internal error while initializing WikiImporter object: ' .
                ( $error ? $error->message : '' ) );
        }
    } finally {
        // Always put the entity loader back the way it was, success or failure
        // phpcs:ignore Generic.PHP.NoSilencedErrors
        @libxml_disable_entity_loader( $oldDisable );
    }

    // Default callbacks
    $this->setPageCallback( [ $this, 'beforeImportPage' ] );
    $this->setRevisionCallback( [ $this, "importRevision" ] );
    $this->setUploadCallback( [ $this, 'importUpload' ] );
    $this->setLogItemCallback( [ $this, 'importLogItem' ] );
    $this->setPageOutCallback( [ $this, 'finishImportPage' ] );

    $this->importTitleFactory = new NaiveImportTitleFactory(
        $this->contentLanguage,
        $this->namespaceInfo,
        $this->titleFactory
    );
    $this->externalUserNames = new ExternalUserNames( 'imported', false );
}
210 
/**
 * Gives access to the XMLReader the importer is reading the dump through.
 *
 * @return XMLReader
 */
public function getReader() {
    return $this->reader;
}
217 
/**
 * Records an XML error in the debug log. Despite the name, nothing is thrown.
 *
 * @param string $err Error description
 */
public function throwXmlError( $err ) {
    // Only emitted when import debugging is switched on
    $this->debug( 'FAILURE: ' . $err );
    // Always emitted
    wfDebug( 'WikiImporter XML error: ' . $err );
}
225 
/**
 * Writes a message to the debug log, but only while import debugging is enabled.
 *
 * @param string $data Message text
 */
public function debug( $data ) {
    if ( !$this->mDebug ) {
        return;
    }
    wfDebug( 'IMPORT: ' . $data );
}
234 
/**
 * Writes a warning to the debug log regardless of the import debugging flag.
 *
 * @param string $data Message text
 */
public function warn( $data ) {
    wfDebug( 'IMPORT: ' . $data );
}
241 
/**
 * Reports a notice through the registered notice callback, falling back to
 * the debug log when no callable callback has been set.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters; the callback receives them as one array
 */
public function notice( $msg, ...$params ) {
    if ( !is_callable( $this->mNoticeCallback ) ) {
        // No ImportReporter -> CLI
        // T177997: the command line importers should call setNoticeCallback()
        // for their own custom callback to echo the notice
        wfDebug( wfMessage( $msg, $params )->text() );
        return;
    }

    call_user_func( $this->mNoticeCallback, $msg, $params );
}
255 
/**
 * Switches the verbose output produced by debug() on or off.
 *
 * @param bool $debug
 */
public function setDebug( $debug ) {
    $this->mDebug = $debug;
}
263 
/**
 * Sets the flag that is forwarded to WikiRevision::setNoUpdates() for every
 * revision, upload and log item created by this importer.
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
    $this->mNoUpdates = $noupdates;
}
271 
/**
 * Makes doImport() skip every top-level node that precedes the Nth <page>.
 * A value of 0 disables skipping.
 *
 * @param int $nthPage
 */
public function setPageOffset( $nthPage ) {
    $this->pageOffset = $nthPage;
}
281 
/**
 * Installs the callback used by notice(), delegating the swap to wfSetVar().
 *
 * @param callable $callback Called with the message key and an array of parameters
 * @return mixed Whatever wfSetVar() returns (the previously stored callback)
 */
public function setNoticeCallback( $callback ) {
    $previous = wfSetVar( $this->mNoticeCallback, $callback );
    return $previous;
}
291 
/**
 * Replaces the callback that runs when a page's first revision or upload is reached.
 *
 * @param callable $callback
 * @return callable|null The callback that was installed before
 */
public function setPageCallback( $callback ) {
    $old = $this->mPageCallback;
    $this->mPageCallback = $callback;

    return $old;
}
302 
/**
 * Replaces the callback that runs once a page has been read completely.
 *
 * The callback receives the page identity, the foreign title, the number of
 * revisions seen, the number imported successfully, and the page info array.
 *
 * @param callable $callback
 * @return callable|null The callback that was installed before
 */
public function setPageOutCallback( $callback ) {
    $old = $this->mPageOutCallback;
    $this->mPageOutCallback = $callback;

    return $old;
}
317 
/**
 * Replaces the callback that imports a single parsed revision.
 *
 * @param callable $callback Called with the WikiRevision and this importer
 * @return callable|null The callback that was installed before
 */
public function setRevisionCallback( $callback ) {
    $old = $this->mRevisionCallback;
    $this->mRevisionCallback = $callback;

    return $old;
}
328 
/**
 * Replaces the callback that imports a single parsed upload.
 *
 * @param callable $callback Called with the WikiRevision describing the upload
 * @return callable|null The callback that was installed before
 */
public function setUploadCallback( $callback ) {
    $old = $this->mUploadCallback;
    $this->mUploadCallback = $callback;

    return $old;
}
339 
/**
 * Replaces the callback that imports a single parsed log item.
 *
 * @param callable $callback Called with the WikiRevision and this importer
 * @return callable|null The callback that was installed before
 */
public function setLogItemCallback( $callback ) {
    $old = $this->mLogItemCallback;
    $this->mLogItemCallback = $callback;

    return $old;
}
350 
/**
 * Replaces the callback that receives the parsed <siteinfo> data.
 *
 * @param callable $callback Called with the site info array and this importer
 * @return callable|null The callback that was installed before
 */
public function setSiteInfoCallback( $callback ) {
    $old = $this->mSiteInfoCallback;
    $this->mSiteInfoCallback = $callback;

    return $old;
}
361 
/**
 * Sets the factory that turns foreign titles into local titles during import.
 *
 * @param ImportTitleFactory $factory
 */
public function setImportTitleFactory( $factory ) {
    $this->importTitleFactory = $factory;
}
370 
/**
 * Chooses the namespace that all imported pages are placed in.
 *
 * @param null|int|string $namespace Null keeps each page's own namespace;
 *  otherwise the ID of an existing, non-negative namespace
 * @return bool True if the title factory was changed, false if $namespace
 *  is negative or does not exist
 */
public function setTargetNamespace( $namespace ) {
    if ( $namespace === null ) {
        // Don't override namespaces
        $this->setImportTitleFactory(
            new NaiveImportTitleFactory(
                $this->contentLanguage,
                $this->namespaceInfo,
                $this->titleFactory
            )
        );
        return true;
    }

    if (
        $namespace >= 0 &&
        $this->namespaceInfo->exists( intval( $namespace ) )
    ) {
        // Force every imported page into the requested namespace
        $namespace = intval( $namespace );
        $this->setImportTitleFactory(
            new NamespaceImportTitleFactory(
                $this->namespaceInfo,
                $this->titleFactory,
                $namespace
            )
        );
        return true;
    }

    return false;
}
404 
/**
 * Chooses a root page under which all imported pages become subpages.
 *
 * @param null|string $rootpage Null removes any root page; an empty string
 *  leaves the current title factory untouched; anything else is parsed as
 *  a page title (trailing slashes are stripped first)
 * @return Status Good on success; fatal with 'import-rootpage-invalid' when
 *  the title cannot be parsed or is external, or with
 *  'import-rootpage-nosubpage' when its namespace has no subpages
 */
public function setTargetRootPage( $rootpage ) {
    $status = Status::newGood();
    $nsInfo = $this->namespaceInfo;

    if ( $rootpage === null ) {
        // No rootpage
        $this->setImportTitleFactory(
            new NaiveImportTitleFactory(
                $this->contentLanguage,
                $nsInfo,
                $this->titleFactory
            )
        );
        return $status;
    }

    if ( $rootpage === '' ) {
        return $status;
    }

    $rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
    $title = Title::newFromText( $rootpage );

    if ( !$title || $title->isExternal() ) {
        $status->fatal( 'import-rootpage-invalid' );
    } elseif ( !$nsInfo->hasSubpages( $title->getNamespace() ) ) {
        $displayNSText = $title->getNamespace() === NS_MAIN
            ? wfMessage( 'blanknamespace' )->text()
            : $this->contentLanguage->getNsText( $title->getNamespace() );
        $status->fatal( 'import-rootpage-nosubpage', $displayNSText );
    } else {
        // set namespace to 'all', so the namespace check in processTitle() can pass
        $this->setTargetNamespace( null );
        $this->setImportTitleFactory(
            new SubpageImportTitleFactory(
                $nsInfo,
                $this->titleFactory,
                $title
            )
        );
    }

    return $status;
}
447 
/**
 * Sets the directory that an upload's "rel" path is resolved against when
 * looking for the file on the local filesystem.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
    $this->mImageBasePath = $dir;
}
454 
/**
 * Controls whether parsed <upload> elements are actually imported.
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
    $this->mImportUploads = $import;
}
461 
/**
 * Replaces the ExternalUserNames helper used for imported contributor names.
 *
 * @param string $usernamePrefix Passed as the first ExternalUserNames constructor argument
 * @param bool $assignKnownUsers Passed as the second ExternalUserNames constructor argument
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
    $names = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
    $this->externalUserNames = $names;
}
470 
/**
 * Turns off the article-count adjustment done in finishImportPage().
 * There is no way to turn it back on for this instance.
 */
public function disableStatisticsUpdate() {
    $this->disableStatisticsUpdate = true;
}
478 
/**
 * Default page callback: remembers whether the page counted as an article
 * before the import touches it, so finishImportPage() can adjust the
 * article count afterwards.
 *
 * @param array $titleAndForeignTitle Two-element array; element 0 is the local Title
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
    $title = $titleAndForeignTitle[0];
    $page = $this->wikiPageFactory->newFromTitle( $title );
    // Build the key exactly as finishImportPage() does when it looks the value
    // up; a key based on the prefixed text would never match, and the
    // article-count adjustment would silently never run.
    $countKey = 'title_' . CacheKeyHelper::getKeyForPage( $title );
    $this->countableCache[$countKey] = $page->isCountable();
    return true;
}
491 
/**
 * Default revision callback: imports one revision, reporting a notice
 * instead of failing hard when the content cannot live at the target title
 * or cannot be unserialized.
 *
 * @param WikiRevision $revision
 * @return bool Result of WikiRevision::importOldRevision(), or false when a notice was issued
 */
public function importRevision( $revision ) {
    $handler = $revision->getContentHandler();

    if ( $handler->canBeUsedOn( $revision->getTitle() ) ) {
        try {
            return $revision->importOldRevision();
        } catch ( MWContentSerializationException $ex ) {
            $failureKey = 'import-error-unserialize';
        }
    } else {
        $failureKey = 'import-error-bad-location';
    }

    // Both failure modes report the same set of details
    $this->notice( $failureKey,
        $revision->getTitle()->getPrefixedText(),
        $revision->getID(),
        $revision->getModel(),
        $revision->getFormat()
    );

    return false;
}
522 
/**
 * Default log item callback: lets the WikiRevision import itself as a log entry.
 *
 * @param WikiRevision $revision
 * @return mixed Whatever WikiRevision::importLogItem() returns
 */
public function importLogItem( $revision ) {
    $result = $revision->importLogItem();
    return $result;
}
531 
/**
 * Default upload callback: hands the upload to the UploadRevisionImporter.
 *
 * @param WikiRevision $revision
 * @return bool Whether the returned status is good
 */
public function importUpload( $revision ) {
    return $this->uploadRevisionImporter->import( $revision )->isGood();
}
541 
/**
 * Default page-out callback: adjusts the site's article count if the page's
 * countable state changed during the import, then runs the AfterImportPage hook.
 *
 * @param PageIdentity $pageIdentity Local page that was imported
 * @param ForeignTitle $foreignTitle Title of the page on the source wiki
 * @param int $revCount Number of revisions seen for the page
 * @param int $sRevCount Number of revisions imported successfully
 * @param array $pageInfo Page data collected by handlePage()
 * @return mixed Whatever the AfterImportPage hook runner returns
 */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
    $sRevCount, $pageInfo
) {
    // Update article count statistics (T42009)
    // The normal counting logic in WikiPage->doEditUpdates() is designed for
    // one-revision-at-a-time editing, not bulk imports. In this situation it
    // suffers from issues of replica DB lag. We let WikiPage handle the total page
    // and revision count, and we implement our own custom logic for the
    // article (content page) count.
    if ( !$this->disableStatisticsUpdate ) {
        $page = $this->wikiPageFactory->newFromTitle( $pageIdentity );

        $page->loadPageData( WikiPage::READ_LATEST );
        $rev = $page->getRevisionRecord();
        if ( $rev === null ) {
            wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
                ' because WikiPage::getRevisionRecord() returned null' );
        } else {
            $user = RequestContext::getMain()->getUser();
            $update = $page->newPageUpdater( $user )->prepareUpdate();
            $countKey = 'title_' . CacheKeyHelper::getKeyForPage( $pageIdentity );
            $countable = $update->isCountable();
            if ( array_key_exists( $countKey, $this->countableCache ) &&
                $countable != $this->countableCache[$countKey]
            ) {
                // +1 when the page became countable, -1 when it stopped being countable
                DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
                    'articles' => ( (int)$countable - (int)$this->countableCache[$countKey] )
                ] ) );
            }
        }
    }

    $title = Title::castFromPageIdentity( $pageIdentity );
    // @phan-suppress-next-line PhanTypeMismatchArgumentNullable castFrom does not return null here
    return $this->hookRunner->onAfterImportPage( $title, $foreignTitle,
        $revCount, $sRevCount, $pageInfo );
}
588 
/**
 * Forwards parsed site info to the registered site info callback, if any.
 *
 * @param array $siteInfo
 * @return mixed The callback's return value, or false when no callback is set
 */
private function siteInfoCallback( $siteInfo ) {
    if ( !isset( $this->mSiteInfoCallback ) ) {
        return false;
    }

    return call_user_func_array(
        $this->mSiteInfoCallback,
        [ $siteInfo, $this ]
    );
}
604 
/**
 * Notifies the registered page callback, if any, that a page is about to be imported.
 *
 * @param array $title Pair of local title and foreign title, as produced by processTitle()
 */
public function pageCallback( $title ) {
    if ( !isset( $this->mPageCallback ) ) {
        return;
    }

    call_user_func( $this->mPageCallback, $title );
}
614 
/**
 * Notifies the registered page-out callback, if any, that a page has been
 * read completely. All arguments are forwarded unchanged.
 *
 * @param PageIdentity $pageIdentity Local page that was imported
 * @param ForeignTitle $foreignTitle Title of the page on the source wiki
 * @param int $revCount Number of revisions seen
 * @param int $sucCount Number of revisions imported successfully
 * @param array $pageInfo Page data collected by handlePage()
 */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
    $sucCount, $pageInfo
) {
    if ( !isset( $this->mPageOutCallback ) ) {
        return;
    }

    call_user_func_array( $this->mPageOutCallback, func_get_args() );
}
629 
/**
 * Forwards a parsed revision to the registered revision callback, if any.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's return value, or false when no callback is set
 */
private function revisionCallback( $revision ) {
    if ( !isset( $this->mRevisionCallback ) ) {
        return false;
    }

    return call_user_func_array(
        $this->mRevisionCallback,
        [ $revision, $this ]
    );
}
645 
/**
 * Forwards a parsed log item to the registered log item callback, if any.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's return value, or false when no callback is set
 */
private function logItemCallback( $revision ) {
    if ( !isset( $this->mLogItemCallback ) ) {
        return false;
    }

    return call_user_func_array(
        $this->mLogItemCallback,
        [ $revision, $this ]
    );
}
661 
/**
 * Reads an attribute of the node the reader is currently positioned on.
 *
 * @param string $attr Attribute name
 * @return string The attribute value, or an empty string when getAttribute() yields null
 */
public function nodeAttribute( $attr ) {
    $value = $this->reader->getAttribute( $attr );

    return $value ?? '';
}
671 
/**
 * Collects the text of the current element, advancing the reader until the
 * first end-element node is reached.
 *
 * Text, CDATA and significant-whitespace nodes are concatenated; every other
 * node type is ignored. If the input ends before an end element is seen,
 * the reader is closed and an empty string is returned.
 *
 * @return string
 */
public function nodeContents() {
    $reader = $this->reader;

    if ( $reader->isEmptyElement ) {
        return "";
    }

    $textNodeTypes = [
        XMLReader::TEXT,
        XMLReader::CDATA,
        XMLReader::SIGNIFICANT_WHITESPACE,
    ];

    $collected = "";
    while ( $reader->read() ) {
        $nodeType = $reader->nodeType;

        if ( $nodeType === XMLReader::END_ELEMENT ) {
            return $collected;
        }
        if ( in_array( $nodeType, $textNodeTypes, true ) ) {
            $collected .= $reader->value;
        }
    }

    // Ran out of input without ever seeing an end element
    $reader->close();
    return '';
}
699 
/**
 * Runs the import: checks that the document starts with <mediawiki>, then
 * walks the top-level nodes and dispatches <siteinfo>, <page> and <logitem>
 * to their handlers. The reader is closed when this method finishes,
 * whether it returns or throws.
 *
 * @return bool Always true when no exception is thrown
 * @throws NormalizedException If the root tag is wrong and libxml recorded an error
 * @throws MWException If the root tag is wrong and libxml recorded no error
 */
public function doImport() {
    // Calls to reader->read need to be wrapped in calls to
    // libxml_disable_entity_loader() to avoid local file
    // inclusion attacks (T48932).
    // phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
    $oldDisable = @libxml_disable_entity_loader( true );
    try {
        $this->reader->read();

        if ( $this->reader->localName != 'mediawiki' ) {
            // phpcs:ignore Generic.PHP.NoSilencedErrors
            @libxml_disable_entity_loader( $oldDisable );
            $error = libxml_get_last_error();
            if ( !$error ) {
                throw new MWException( "Expected <mediawiki> tag, got " .
                    $this->reader->localName );
            }
            throw new NormalizedException( "XML error at line {line}: {message}", [
                'line' => $error->line,
                'message' => $error->message,
            ] );
        }
        $this->debug( "<mediawiki> tag is correct." );

        $this->debug( "Starting primary dump processing loop." );

        $hasNode = $this->reader->read();
        $pagesSeen = 0;
        while ( $hasNode ) {
            $tag = $this->reader->localName;

            // While below the requested page offset, hop over whole subtrees
            if ( $this->pageOffset ) {
                if ( $tag === 'page' ) {
                    $pagesSeen++;
                }
                if ( $pagesSeen < $this->pageOffset ) {
                    $hasNode = $this->reader->next();
                    continue;
                }
            }
            $type = $this->reader->nodeType;

            $skipSubtree = false;
            // A hook returning false has dealt with the node itself
            if ( $this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
                if ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
                    break;
                }

                switch ( $tag ) {
                    case 'siteinfo':
                        $this->handleSiteInfo();
                        break;
                    case 'page':
                        $this->handlePage();
                        break;
                    case 'logitem':
                        $this->handleLogItem();
                        break;
                    case '#text':
                        // Text between top-level elements is ignored silently
                        break;
                    default:
                        $this->warn( "Unhandled top-level XML tag $tag" );
                        $skipSubtree = true;
                }
            }

            if ( $skipSubtree ) {
                $hasNode = $this->reader->next();
                $this->debug( "Skip" );
            } else {
                $hasNode = $this->reader->read();
            }
        }
    } finally {
        // phpcs:ignore Generic.PHP.NoSilencedErrors
        @libxml_disable_entity_loader( $oldDisable );
        $this->reader->close();
    }

    return true;
}
781 
/**
 * Parses a <siteinfo> element: stores namespace names in
 * $this->foreignNamespaces, gathers the simple fields, and passes the
 * result (with the namespaces under '_namespaces') to the site info callback.
 */
private function handleSiteInfo() {
    $this->debug( "Enter site info handler." );

    // Fields that can just be stuffed in the siteInfo object
    $simpleFields = [ 'sitename', 'base', 'generator', 'case' ];
    $info = [];

    while ( $this->reader->read() ) {
        $atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
            $this->reader->localName == 'siteinfo';
        if ( $atClosingTag ) {
            break;
        }

        $tag = $this->reader->localName;

        if ( $tag == 'namespace' ) {
            // The key attribute is read first, then the element's text
            $this->foreignNamespaces[$this->nodeAttribute( 'key' )] =
                $this->nodeContents();
            continue;
        }

        if ( in_array( $tag, $simpleFields ) ) {
            $info[$tag] = $this->nodeContents();
        }
    }

    $info['_namespaces'] = $this->foreignNamespaces;
    $this->siteInfoCallback( $info );
}
808 
/**
 * Parses a <logitem> element into an array and passes it to processLogItem().
 * The ImportHandleLogItemXMLTag hook is offered every child node first.
 */
private function handleLogItem() {
    $this->debug( "Enter log item handler." );

    // Fields that can just be stuffed in the pageInfo object
    $simpleFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
        'logtitle', 'params' ];
    $logInfo = [];

    while ( $this->reader->read() ) {
        $atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
            $this->reader->localName == 'logitem';
        if ( $atClosingTag ) {
            break;
        }

        $tag = $this->reader->localName;

        if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
            // A hook took care of this node
            continue;
        }

        if ( in_array( $tag, $simpleFields ) ) {
            $logInfo[$tag] = $this->nodeContents();
        } elseif ( $tag == 'contributor' ) {
            $logInfo['contributor'] = $this->handleContributor();
        } elseif ( $tag != '#text' ) {
            $this->warn( "Unhandled log-item XML tag $tag" );
        }
    }

    $this->processLogItem( $logInfo );
}
838 
/**
 * Builds a WikiRevision from parsed log item data and passes it to the log
 * item callback.
 *
 * 'type' and 'action' are read unconditionally; every other field is optional.
 *
 * @param array $logInfo Data collected by handleLogItem()
 * @return mixed Return value of the log item callback, or false if none is set
 */
private function processLogItem( $logInfo ) {
    $logEntry = new WikiRevision();

    if ( isset( $logInfo['id'] ) ) {
        $logEntry->setID( $logInfo['id'] );
    }
    $logEntry->setType( $logInfo['type'] );
    $logEntry->setAction( $logInfo['action'] );
    if ( isset( $logInfo['timestamp'] ) ) {
        $logEntry->setTimestamp( $logInfo['timestamp'] );
    }
    if ( isset( $logInfo['params'] ) ) {
        $logEntry->setParams( $logInfo['params'] );
    }
    if ( isset( $logInfo['logtitle'] ) ) {
        // @todo Using Title for non-local titles is a recipe for disaster.
        // We should use ForeignTitle here instead.
        $logEntry->setTitle( Title::newFromText( $logInfo['logtitle'] ) );
    }

    $logEntry->setNoUpdates( $this->mNoUpdates );

    if ( isset( $logInfo['comment'] ) ) {
        $logEntry->setComment( $logInfo['comment'] );
    }

    // Attribute the entry: named user first, then IP, then a placeholder
    $contributor = $logInfo['contributor'] ?? [];
    if ( isset( $contributor['username'] ) ) {
        $logEntry->setUsername(
            $this->externalUserNames->applyPrefix( $contributor['username'] )
        );
    } elseif ( isset( $contributor['ip'] ) ) {
        $logEntry->setUserIP( $contributor['ip'] );
    } else {
        $logEntry->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
    }

    return $this->logItemCallback( $logEntry );
}
881 
/**
 * Parses a <page> element.
 *
 * Simple fields are stored in the page info array. The local title is
 * resolved when the first <revision> or <upload> is reached; once it has
 * been found invalid, the rest of the page is skipped. The page-out
 * callback runs at the end only if a valid title was resolved.
 */
private function handlePage() {
    // Handle page data.
    $this->debug( "Enter page handler." );
    $pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

    // Fields that can just be stuffed in the pageInfo object
    $simpleFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

    $skip = false;
    $badTitle = false;

    // next() jumps over the current subtree, read() steps into it
    while ( $skip ? $this->reader->next() : $this->reader->read() ) {
        if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
            $this->reader->localName == 'page'
        ) {
            break;
        }

        $skip = false;
        $tag = $this->reader->localName;

        if ( $badTitle ) {
            // The title is invalid, bail out of this page
            $skip = true;
            continue;
        }

        if ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
            // A hook took care of this node
            continue;
        }

        if ( in_array( $tag, $simpleFields ) ) {
            // An XML snippet:
            // <page>
            //     <id>123</id>
            //     <title>Page</title>
            //     <redirect title="NewTitle"/>
            //     ...
            // Because the redirect tag is built differently, we need special handling for that case.
            $pageInfo[$tag] = $tag == 'redirect'
                ? $this->nodeAttribute( 'title' )
                : $this->nodeContents();
            continue;
        }

        if ( $tag == 'revision' || $tag == 'upload' ) {
            if ( !isset( $title ) ) {
                $title = $this->processTitle( $pageInfo['title'],
                    $pageInfo['ns'] ?? null );

                // $title is either an array of two titles or false.
                if ( is_array( $title ) ) {
                    $this->pageCallback( $title );
                    [ $pageInfo['_title'], $foreignTitle ] = $title;
                } else {
                    $badTitle = true;
                    $skip = true;
                }
            }

            if ( $title ) {
                if ( $tag == 'revision' ) {
                    $this->handleRevision( $pageInfo );
                } else {
                    $this->handleUpload( $pageInfo );
                }
            }
            continue;
        }

        if ( $tag != '#text' ) {
            $this->warn( "Unhandled page XML tag $tag" );
            $skip = true;
        }
    }

    // @note $pageInfo is only set if a valid $title is processed above with
    //       no error. If we have a valid $title, then pageCallback is called
    //       above, $pageInfo['title'] is set and we do pageOutCallback here.
    //       If $pageInfo['_title'] is not set, then $foreignTitle is also not
    //       set since they both come from $title above.
    if ( !array_key_exists( '_title', $pageInfo ) ) {
        return;
    }

    $title = $pageInfo['_title'];
    $this->pageOutCallback(
        $title,
        // @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
        $foreignTitle,
        $pageInfo['revisionCount'],
        $pageInfo['successfulRevisionCount'],
        $pageInfo
    );
}
967 
/**
 * Parses a <revision> element, then bumps the page's revision counter and,
 * if processRevision() reports success, its successful-revision counter.
 *
 * @param array &$pageInfo Page data; 'revisionCount' and
 *  'successfulRevisionCount' are updated in place
 */
private function handleRevision( &$pageInfo ) {
    $this->debug( "Enter revision handler" );

    $simpleFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
        'model', 'format', 'text', 'sha1' ];
    $revisionInfo = [];

    // NOTE(review): unlike handlePage(), $skip is never reset inside the loop,
    // so after the first unhandled tag every further step uses next().
    // Confirm whether that is intended.
    $skip = false;

    while ( $skip ? $this->reader->next() : $this->reader->read() ) {
        if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
            $this->reader->localName == 'revision'
        ) {
            break;
        }

        $tag = $this->reader->localName;

        $handledByCore = $this->hookRunner->onImportHandleRevisionXMLTag(
            $this, $pageInfo, $revisionInfo );
        if ( !$handledByCore ) {
            // A hook took care of this node
            continue;
        }

        if ( in_array( $tag, $simpleFields ) ) {
            $revisionInfo[$tag] = $this->nodeContents();
        } elseif ( $tag == 'content' ) {
            // We can have multiple content tags, so make this an array.
            $revisionInfo[$tag][] = $this->handleContent();
        } elseif ( $tag == 'contributor' ) {
            $revisionInfo['contributor'] = $this->handleContributor();
        } elseif ( $tag != '#text' ) {
            $this->warn( "Unhandled revision XML tag $tag" );
            $skip = true;
        }
    }

    $pageInfo['revisionCount']++;
    $imported = $this->processRevision( $pageInfo, $revisionInfo );
    if ( $imported ) {
        $pageInfo['successfulRevisionCount']++;
    }
}
1010 
/**
 * Parses a <content> element (one slot of a revision) into an array.
 *
 * @return array Collected values for 'role', 'origin', 'model', 'format'
 *  and 'text', where present
 */
private function handleContent() {
    $this->debug( "Enter content handler" );

    $simpleFields = [ 'role', 'origin', 'model', 'format', 'text' ];
    $slotInfo = [];

    // NOTE(review): $skip is never reset inside the loop, so after the first
    // unhandled tag every further step uses next(). Confirm this is intended.
    $skip = false;

    while ( $skip ? $this->reader->next() : $this->reader->read() ) {
        if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
            $this->reader->localName == 'content'
        ) {
            break;
        }

        $tag = $this->reader->localName;

        $handledByCore = $this->hookRunner->onImportHandleContentXMLTag(
            $this, $slotInfo );
        if ( !$handledByCore ) {
            // A hook took care of this node
            continue;
        }

        if ( in_array( $tag, $simpleFields ) ) {
            $slotInfo[$tag] = $this->nodeContents();
        } elseif ( $tag != '#text' ) {
            $this->warn( "Unhandled content XML tag $tag" );
            $skip = true;
        }
    }

    return $slotInfo;
}
1041 
/**
 * Turns parsed slot data into a Content object.
 *
 * @param Title $title Page the content belongs to; used to pick a default model
 * @param int $revisionId Revision ID, used only in the size error message
 * @param array $contentInfo Needs 'text'; may carry 'model' and 'role'
 * @return Content
 * @throws MWException If 'text' is missing, or if the text of a size-checked
 *  model is larger than MaxArticleSize KiB
 */
private function makeContent( Title $title, $revisionId, $contentInfo ) {
    $maxArticleSize = MediaWikiServices::getInstance()->getMainConfig()->get(
        MainConfigNames::MaxArticleSize );

    if ( !isset( $contentInfo['text'] ) ) {
        throw new MWException( 'Missing text field in import.' );
    }

    // Make sure revisions won't violate $wgMaxArticleSize, which could lead to
    // database errors and instability. Testing for revisions with only listed
    // content models, as other content models might use serialization formats
    // which aren't checked against $wgMaxArticleSize.
    $sizeCheckedModels = [
        'wikitext',
        'css',
        'json',
        'javascript',
        'text',
        ''
    ];
    $isSizeChecked = !isset( $contentInfo['model'] ) ||
        in_array( $contentInfo['model'], $sizeCheckedModels );

    if ( $isSizeChecked && strlen( $contentInfo['text'] ) > $maxArticleSize * 1024 ) {
        $subject = $revisionId ?
            "the revision with ID $revisionId" :
            'a revision';
        throw new MWException( 'The text of ' . $subject .
            " exceeds the maximum allowable size ({$maxArticleSize} KiB)" );
    }

    $role = $contentInfo['role'] ?? SlotRecord::MAIN;
    $model = $contentInfo['model'] ?? $this->getDefaultContentModel( $title, $role );
    $handler = $this->getContentHandler( $model );

    // Give the handler a chance to transform the text before unserializing it
    return $handler->unserializeContent(
        $handler->importTransform( $contentInfo['text'] )
    );
}
1088 
/**
 * Builds a WikiRevision from parsed revision data and passes it to the
 * revision callback.
 *
 * @param array $pageInfo Page data; '_title' must hold the local title
 * @param array $revisionInfo Data collected by handleRevision()
 * @return mixed Return value of the revision callback, or false if none is set
 * @throws MWException If content cannot be built or an extra slot has no role
 */
private function processRevision( $pageInfo, $revisionInfo ) {
    $rev = new WikiRevision();

    $revId = $revisionInfo['id'] ?? 0;
    if ( $revId ) {
        $rev->setID( $revId );
    }

    $title = $pageInfo['_title'];
    $rev->setTitle( $title );

    // The main slot comes from the revision's own fields
    $rev->setContent( SlotRecord::MAIN, $this->makeContent( $title, $revId, $revisionInfo ) );

    // Further slots come from <content> children
    foreach ( $revisionInfo['content'] ?? [] as $slotInfo ) {
        if ( !isset( $slotInfo['role'] ) ) {
            throw new MWException( "Missing role for imported slot." );
        }

        $slotContent = $this->makeContent( $title, $revId, $slotInfo );
        $rev->setContent( $slotInfo['role'], $slotContent );
    }
    $rev->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

    if ( isset( $revisionInfo['comment'] ) ) {
        $rev->setComment( $revisionInfo['comment'] );
    }

    // Presence of the field is what marks the edit as minor
    if ( isset( $revisionInfo['minor'] ) ) {
        $rev->setMinor( true );
    }

    // Attribute the revision: named user first, then IP, then a placeholder
    $contributor = $revisionInfo['contributor'] ?? [];
    if ( isset( $contributor['username'] ) ) {
        $rev->setUsername(
            $this->externalUserNames->applyPrefix( $contributor['username'] )
        );
    } elseif ( isset( $contributor['ip'] ) ) {
        $rev->setUserIP( $contributor['ip'] );
    } else {
        $rev->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
    }

    if ( isset( $revisionInfo['sha1'] ) ) {
        $rev->setSha1Base36( $revisionInfo['sha1'] );
    }
    $rev->setNoUpdates( $this->mNoUpdates );

    return $this->revisionCallback( $rev );
}
1142 
/**
 * Parses an <upload> element and, if upload import is enabled, passes the
 * result to processUpload().
 *
 * Base64 <contents> are written to a temporary file. If an image base path
 * is set and the "rel" path exists below it, that file is used as the
 * source instead.
 *
 * @param array &$pageInfo Page data, passed by reference to the upload tag hook
 * @return mixed Result of processUpload() when upload import is enabled, otherwise null
 */
private function handleUpload( &$pageInfo ) {
    $this->debug( "Enter upload handler" );

    $simpleFields = [ 'timestamp', 'comment', 'filename', 'text',
        'src', 'size', 'sha1base36', 'archivename', 'rel' ];
    $uploadInfo = [];

    // NOTE(review): $skip is never reset inside the loop, so after the first
    // unhandled tag every further step uses next(). Confirm this is intended.
    $skip = false;

    while ( $skip ? $this->reader->next() : $this->reader->read() ) {
        if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
            $this->reader->localName == 'upload'
        ) {
            break;
        }

        $tag = $this->reader->localName;

        if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
            // A hook took care of this node
            continue;
        }

        if ( in_array( $tag, $simpleFields ) ) {
            $uploadInfo[$tag] = $this->nodeContents();
        } elseif ( $tag == 'contributor' ) {
            $uploadInfo['contributor'] = $this->handleContributor();
        } elseif ( $tag == 'contents' ) {
            $encoded = $this->nodeContents();
            $encoding = $this->reader->getAttribute( 'encoding' );
            // Only base64 payloads are understood; anything else is dropped
            if ( $encoding === 'base64' ) {
                $uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $encoded ) );
                $uploadInfo['isTempSrc'] = true;
            }
        } elseif ( $tag != '#text' ) {
            $this->warn( "Unhandled upload XML tag $tag" );
            $skip = true;
        }
    }

    // A file found under the image base path wins over embedded contents
    if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
        $localPath = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
        if ( file_exists( $localPath ) ) {
            $uploadInfo['fileSrc'] = $localPath;
            $uploadInfo['isTempSrc'] = false;
        }
    }

    if ( $this->mImportUploads ) {
        return $this->processUpload( $pageInfo, $uploadInfo );
    }
}
1195 
/**
 * Writes data to a newly created temporary file under wfTempDir().
 *
 * @param string $contents Raw bytes to store
 * @return string Path of the temporary file, as returned by tempnam()
 */
private function dumpTemp( $contents ) {
    $tempPath = tempnam( wfTempDir(), 'importupload' );
    file_put_contents( $tempPath, $contents );

    return $tempPath;
}
1205 
/**
 * Builds a WikiRevision from parsed upload data and passes it to the upload callback.
 *
 * 'timestamp', 'filename', 'src', 'size' and 'comment' in $uploadInfo and
 * 'id' in $pageInfo are read unconditionally.
 *
 * @param array $pageInfo Page data; '_title' must hold the local title
 * @param array $uploadInfo Data collected by handleUpload()
 * @return mixed Return value of the upload callback
 * @throws MWException If content cannot be built from $uploadInfo
 */
private function processUpload( $pageInfo, $uploadInfo ) {
    $upload = new WikiRevision();
    $revId = $pageInfo['id'];
    $title = $pageInfo['_title'];
    $content = $this->makeContent( $title, $revId, $uploadInfo );

    $upload->setTitle( $title );
    $upload->setID( $revId );
    $upload->setTimestamp( $uploadInfo['timestamp'] );
    $upload->setContent( SlotRecord::MAIN, $content );
    $upload->setFilename( $uploadInfo['filename'] );
    if ( isset( $uploadInfo['archivename'] ) ) {
        $upload->setArchiveName( $uploadInfo['archivename'] );
    }
    $upload->setSrc( $uploadInfo['src'] );
    if ( isset( $uploadInfo['fileSrc'] ) ) {
        $isTemporary = !empty( $uploadInfo['isTempSrc'] );
        $upload->setFileSrc( $uploadInfo['fileSrc'], $isTemporary );
    }
    if ( isset( $uploadInfo['sha1base36'] ) ) {
        $upload->setSha1Base36( $uploadInfo['sha1base36'] );
    }
    $upload->setSize( intval( $uploadInfo['size'] ) );
    $upload->setComment( $uploadInfo['comment'] );

    // NOTE(review): unlike revisions and log items there is no
    // "Unknown user" fallback here; confirm that is intended.
    $contributor = $uploadInfo['contributor'] ?? [];
    if ( isset( $contributor['username'] ) ) {
        $upload->setUsername(
            $this->externalUserNames->applyPrefix( $contributor['username'] )
        );
    } elseif ( isset( $contributor['ip'] ) ) {
        $upload->setUserIP( $contributor['ip'] );
    }
    $upload->setNoUpdates( $this->mNoUpdates );

    return call_user_func( $this->mUploadCallback, $upload );
}
1248 
/**
 * Parses a <contributor> element.
 *
 * @return array Values for 'id', 'ip' and 'username' where present; empty
 *  for a self-closing element
 */
private function handleContributor() {
    $this->debug( "Enter contributor handler." );

    if ( $this->reader->isEmptyElement ) {
        return [];
    }

    $wantedFields = [ 'id', 'ip', 'username' ];
    $contributor = [];

    while ( $this->reader->read() ) {
        $atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
            $this->reader->localName == 'contributor';
        if ( $atClosingTag ) {
            break;
        }

        $tag = $this->reader->localName;
        if ( !in_array( $tag, $wantedFields ) ) {
            // Anything else inside <contributor> is ignored silently
            continue;
        }

        $contributor[$tag] = $this->nodeContents();
    }

    return $contributor;
}
1277 
/**
 * Resolves a title from the dump to a local title and checks that it may be
 * imported to.
 *
 * A notice is issued and false is returned when the title is invalid,
 * external, cannot exist, or (outside command line mode) the current
 * request user may not edit it.
 *
 * @param string $text Title text from the dump
 * @param string|null $ns Namespace ID from the dump, if any
 * @return array|false Pair of local title and foreign title, or false on rejection
 */
private function processTitle( $text, $ns = null ) {
    // Use the dump's own namespace map if a <siteinfo> has provided one
    $foreignTitleFactory = $this->foreignNamespaces === null
        ? new NaiveForeignTitleFactory( $this->contentLanguage )
        : new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

    $foreignTitle = $foreignTitleFactory->createForeignTitle( $text,
        intval( $ns ) );

    $title = $this->importTitleFactory->createTitleFromForeignTitle(
        $foreignTitle );

    $commandLineMode = $this->config->get( 'CommandLineMode' );

    if ( $title === null ) {
        # Invalid page title? Ignore the page
        $this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
        return false;
    }

    if ( $title->isExternal() ) {
        $this->notice( 'import-error-interwiki', $title->getPrefixedText() );
        return false;
    }

    if ( !$title->canExist() ) {
        $this->notice( 'import-error-special', $title->getPrefixedText() );
        return false;
    }

    if ( !$commandLineMode ) {
        $user = RequestContext::getMain()->getUser();

        if ( !$this->permissionManager->userCan( 'edit', $user, $title ) ) {
            # Do not import if the importing wiki user cannot edit this page
            $this->notice( 'import-error-edit', $title->getPrefixedText() );

            return false;
        }
    }

    return [ $title, $foreignTitle ];
}
1323 
/**
 * Looks up the content handler for a content model.
 *
 * @param string $model Content model ID
 * @return ContentHandler
 */
private function getContentHandler( $model ) {
    $handler = $this->contentHandlerFactory->getContentHandler( $model );
    return $handler;
}
1331 
/**
 * Asks the slot role's handler which content model a slot on the given
 * title uses by default.
 *
 * @param Title $title
 * @param string $role Slot role name
 * @return string Content model ID
 */
private function getDefaultContentModel( $title, $role ) {
    $roleHandler = $this->slotRoleRegistry->getRoleHandler( $role );

    return $roleHandler->getDefaultModel( $title );
}
1343 }
const NS_MAIN
Definition: Defines.php:64
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTempDir()
Tries to get the system directory for temporary files.
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest. If source is NULL, it just returns the value and does not set the variable.
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
static addUpdate(DeferrableUpdate $update, $stage=self::POSTSEND)
Add an update to the pending update queue for execution at the appropriate time.
Class to parse and build external user names.
Reporting callback.
Base class for language-specific code.
Definition: Language.php:54
Exception representing a failure to serialize or unserialize a content object.
MediaWiki exception.
Definition: MWException.php:29
Helper class for mapping value objects representing basic entities to cache keys.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
Definition: HookRunner.php:569
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Service for creating WikiPage objects.
A service class for checking permissions To obtain an instance, use MediaWikiServices::getInstance()-...
Value object representing a content slot associated with a page revision.
Definition: SlotRecord.php:40
A registry service for SlotRoleHandlers, used to define which slot roles are available on which page.
Creates Title objects.
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them ba...
static getMain()
Get the RequestContext object associated with the main request.
static factory(array $deltas)
static newGood( $value=null)
Factory function for good results.
Definition: StatusValue.php:85
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Represents a title within MediaWiki.
Definition: Title.php:52
static newFromText( $text, $defaultNamespace=NS_MAIN)
Create a new Title from text, such as what one would find in a link.
Definition: Title.php:373
static castFromPageIdentity(?PageIdentity $pageIdentity)
Return a Title for a given PageIdentity.
Definition: Title.php:322
getPrefixedText()
Get the prefixed title with spaces.
Definition: Title.php:1891
static registerSource(ImportSource $source)
XML file reader for the page data importer.
setImportUploads( $import)
doImport()
Primary entry point.
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
setNoUpdates( $noupdates)
Set 'no updates' mode.
setLogItemCallback( $callback)
Sets the action to perform as each log item reached.
importUpload( $revision)
Dummy for now...
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
setPageOffset( $nthPage)
Sets 'pageOffset' value.
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
importLogItem( $revision)
Default per-revision callback, performs the import.
setImageBasePath( $dir)
throwXmlError( $err)
setDebug( $debug)
Set debug mode...
__construct(ImportSource $source, Config $config, HookContainer $hookContainer, Language $contentLanguage, NamespaceInfo $namespaceInfo, TitleFactory $titleFactory, WikiPageFactory $wikiPageFactory, UploadRevisionImporter $uploadRevisionImporter, PermissionManager $permissionManager, IContentHandlerFactory $contentHandlerFactory, SlotRoleRegistry $slotRoleRegistry)
Creates an ImportXMLReader drawing from the source provided.
notice( $msg,... $params)
importRevision( $revision)
Default per-revision callback, performs the import.
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
setNoticeCallback( $callback)
Set a callback that displays notice messages.
finishImportPage(PageIdentity $pageIdentity, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
disableStatisticsUpdate()
Statistics updates can take a lot of time.
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
Represents a revision, log entry or upload during the import process.
Interface for configuration instances.
Definition: Config.php:30
Source interface for XML import.
Interface for objects (potentially) representing an editable wiki page.
$debug
Definition: mcc.php:31
$source
$content
Definition: router.php:76