MediaWiki REL1_41
WikiImporter.php
Go to the documentation of this file.
1<?php
51use Wikimedia\AtEase\AtEase;
52use Wikimedia\NormalizedException\NormalizedException;
53
/** @var XMLReader Reader over the XML dump being imported; (re)created by openReader() */
private $reader;

/** @var mixed Identifier returned by UploadSourceAdapter::registerSource() for the import source */
private $sourceAdapterId;

/** @var array|null Namespace key => name pairs collected from <siteinfo>; null until one is seen */
private $foreignNamespaces = null;

/** @var callable|null Invoked for every parsed <logitem> */
private $mLogItemCallback;

/** @var callable|null Invoked for every parsed <upload> when upload import is enabled */
private $mUploadCallback;

/** @var callable|null Invoked for every parsed <revision> */
private $mRevisionCallback;

/** @var callable|null Invoked once per page, before its first revision or upload is handled */
private $mPageCallback;

/** @var callable|null Invoked after <siteinfo> has been parsed */
private $mSiteInfoCallback;

/** @var callable|null Invoked after a page with a valid title has been fully parsed */
private $mPageOutCallback;

/** @var callable|null Receives user-facing notices; when not callable, notices go to the debug log */
private $mNoticeCallback;

/** @var bool|null When truthy, debug() forwards its messages to wfDebug() */
private $mDebug;

/** @var bool|null When truthy, parsed <upload> elements are handed to processUpload() */
private $mImportUploads;

/** @var string|null Directory against which an upload's "rel" path is resolved */
private $mImageBasePath;

/** @var bool Flag forwarded to every WikiRevision via setNoUpdates() */
private $mNoUpdates = false;

/** @var int Top-level nodes are skipped until this many <page> tags have been seen; 0 disables skipping */
private $pageOffset = 0;

/** @var Config */
private $config;

/** @var ImportTitleFactory Maps foreign titles to local titles (createTitleFromForeignTitle()) */
private $importTitleFactory;

/** @var HookRunner */
private $hookRunner;

/** @var array Countable state of each page before import, keyed by a 'title_'-prefixed string */
private $countableCache = [];

/** @var bool When true, finishImportPage() leaves the article count untouched */
private $disableStatisticsUpdate = false;

/** @var ExternalUserNames Applies the import prefix to contributor user names */
private $externalUserNames;

/** @var Language */
private $contentLanguage;

/** @var NamespaceInfo */
private $namespaceInfo;

/** @var TitleFactory */
private $titleFactory;

/** @var WikiPageFactory */
private $wikiPageFactory;

/** @var UploadRevisionImporter */
private $uploadRevisionImporter;

/** @var PermissionManager */
private $permissionManager;

/** @var IContentHandlerFactory */
private $contentHandlerFactory;

/** @var SlotRoleRegistry */
private $slotRoleRegistry;

147
/**
 * Creates an importer that reads a MediaWiki XML dump from $source.
 *
 * The source is registered with UploadSourceAdapter so that XMLReader can
 * open it through the "uploadsource://" stream wrapper, the reader is opened
 * immediately, and the importer's own handler methods are installed as the
 * default callbacks.
 *
 * @param ImportSource $source Stream-like source of the XML dump
 * @param Config $config
 * @param HookContainer $hookContainer
 * @param Language $contentLanguage
 * @param NamespaceInfo $namespaceInfo
 * @param TitleFactory $titleFactory
 * @param WikiPageFactory $wikiPageFactory
 * @param UploadRevisionImporter $uploadRevisionImporter
 * @param PermissionManager $permissionManager
 * @param IContentHandlerFactory $contentHandlerFactory
 * @param SlotRoleRegistry $slotRoleRegistry
 * @throws MWException If the reader cannot be opened (see openReader())
 */
public function __construct(
	ImportSource $source,
	Config $config,
	HookContainer $hookContainer,
	Language $contentLanguage,
	NamespaceInfo $namespaceInfo,
	TitleFactory $titleFactory,
	WikiPageFactory $wikiPageFactory,
	UploadRevisionImporter $uploadRevisionImporter,
	PermissionManager $permissionManager,
	IContentHandlerFactory $contentHandlerFactory,
	SlotRoleRegistry $slotRoleRegistry
) {
	$this->config = $config;
	$this->hookRunner = new HookRunner( $hookContainer );
	$this->contentLanguage = $contentLanguage;
	$this->namespaceInfo = $namespaceInfo;
	$this->titleFactory = $titleFactory;
	$this->wikiPageFactory = $wikiPageFactory;
	$this->uploadRevisionImporter = $uploadRevisionImporter;
	$this->permissionManager = $permissionManager;
	$this->contentHandlerFactory = $contentHandlerFactory;
	$this->slotRoleRegistry = $slotRoleRegistry;

	// The wrapper is process-global, so register it only once.
	if ( !in_array( 'uploadsource', stream_get_wrappers(), true ) ) {
		stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
	}
	$this->sourceAdapterId = UploadSourceAdapter::registerSource( $source );

	$this->openReader();

	// Default callbacks
	$this->setPageCallback( [ $this, 'beforeImportPage' ] );
	$this->setRevisionCallback( [ $this, "importRevision" ] );
	$this->setUploadCallback( [ $this, 'importUpload' ] );
	$this->setLogItemCallback( [ $this, 'importLogItem' ] );
	$this->setPageOutCallback( [ $this, 'finishImportPage' ] );

	$this->importTitleFactory = new NaiveImportTitleFactory(
		$this->contentLanguage,
		$this->namespaceInfo,
		$this->titleFactory
	);
	$this->externalUserNames = new ExternalUserNames( 'imported', false );
}
208
/**
 * Returns the XML reader this importer is currently pulling nodes from.
 *
 * @return XMLReader
 */
public function getReader() {
	return $this->reader;
}
215
/**
 * Records an XML error in the debug log. Despite its name, this method
 * does not throw; it only logs.
 *
 * @param string $err Error description
 */
public function throwXmlError( $err ) {
	$this->debug( 'FAILURE: ' . $err );
	wfDebug( 'WikiImporter XML error: ' . $err );
}
223
/**
 * Writes a message to the debug log, but only while debugging is switched
 * on through setDebug().
 *
 * @param string $data Message to log
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}
	wfDebug( 'IMPORT: ' . $data );
}
232
/**
 * Writes a warning to the debug log regardless of the debug flag.
 *
 * @param string $data Message to log
 */
public function warn( $data ) {
	wfDebug( 'IMPORT: ' . $data );
}
239
/**
 * Reports a notice about the import, either through the registered notice
 * callback or, when none is callable, through the debug log.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters; handed to the callback as one array
 */
public function notice( $msg, ...$params ) {
	if ( !is_callable( $this->mNoticeCallback ) ) {
		// No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		wfDebug( wfMessage( $msg, $params )->text() );
		return;
	}

	call_user_func( $this->mNoticeCallback, $msg, $params );
}
253
/**
 * Switches debug logging (see debug()) on or off.
 *
 * @param bool $debug
 */
public function setDebug( $debug ) {
	$this->mDebug = $debug;
}
261
/**
 * Stores the "no updates" flag that is later passed to every WikiRevision
 * created by this importer via WikiRevision::setNoUpdates().
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
269
/**
 * Sets the page offset: doImport() skips top-level nodes until it has
 * counted this many <page> tags, so processing starts at the nth page.
 *
 * @param int $nthPage Page number to start from; 0 processes everything
 */
public function setPageOffset( $nthPage ) {
	$this->pageOffset = $nthPage;
}
279
/**
 * Installs the callback that receives notices (see notice()).
 *
 * Passing null leaves the current callback in place and only returns it.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was registered before this call
 */
public function setNoticeCallback( $callback ) {
	$replaced = $this->mNoticeCallback;
	if ( $callback !== null ) {
		$this->mNoticeCallback = $callback;
	}
	return $replaced;
}
289
/**
 * Installs the callback run when a page is about to be imported.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was registered before this call
 */
public function setPageCallback( $callback ) {
	$replaced = $this->mPageCallback;
	$this->mPageCallback = $callback;

	return $replaced;
}
300
/**
 * Installs the callback run after a page has been completely parsed.
 * It is called with the page identity, the foreign title, the revision
 * count, the successful revision count and the page info array.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was registered before this call
 */
public function setPageOutCallback( $callback ) {
	$replaced = $this->mPageOutCallback;
	$this->mPageOutCallback = $callback;

	return $replaced;
}
315
/**
 * Installs the callback that handles each parsed revision.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was registered before this call
 */
public function setRevisionCallback( $callback ) {
	$replaced = $this->mRevisionCallback;
	$this->mRevisionCallback = $callback;

	return $replaced;
}
326
/**
 * Installs the callback that handles each parsed upload.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was registered before this call
 */
public function setUploadCallback( $callback ) {
	$replaced = $this->mUploadCallback;
	$this->mUploadCallback = $callback;

	return $replaced;
}
337
/**
 * Installs the callback that handles each parsed log item.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was registered before this call
 */
public function setLogItemCallback( $callback ) {
	$replaced = $this->mLogItemCallback;
	$this->mLogItemCallback = $callback;

	return $replaced;
}
348
/**
 * Installs the callback that receives the parsed <siteinfo> data.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was registered before this call
 */
public function setSiteInfoCallback( $callback ) {
	$replaced = $this->mSiteInfoCallback;
	$this->mSiteInfoCallback = $callback;

	return $replaced;
}
359
/**
 * Replaces the factory that processTitle() uses to turn foreign titles
 * into local titles.
 *
 * @param ImportTitleFactory $factory
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
368
/**
 * Chooses the namespace that imported pages are placed in.
 *
 * @param int|string|null $namespace Namespace index, or null to keep each
 *   page's own namespace
 * @return bool True if the import title factory was changed; false if
 *   $namespace is negative or does not exist
 */
public function setTargetNamespace( $namespace ) {
	if ( $namespace === null ) {
		// Don't override namespaces
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$this->namespaceInfo,
				$this->titleFactory
			)
		);
		return true;
	}

	if (
		$namespace >= 0 &&
		$this->namespaceInfo->exists( intval( $namespace ) )
	) {
		$namespace = intval( $namespace );
		$this->setImportTitleFactory(
			new NamespaceImportTitleFactory(
				$this->namespaceInfo,
				$this->titleFactory,
				$namespace
			)
		);
		return true;
	}

	return false;
}
402
/**
 * Chooses a root page under which all imported pages become subpages.
 *
 * @param string|null $rootpage Root page title; null restores the default
 *   title mapping, and an empty string leaves the current mapping unchanged
 * @return Status Good on success; fatal 'import-rootpage-invalid' if the
 *   title is invalid or external, fatal 'import-rootpage-nosubpage' if its
 *   namespace does not support subpages
 */
public function setTargetRootPage( $rootpage ) {
	$status = Status::newGood();
	$nsInfo = $this->namespaceInfo;
	if ( $rootpage === null ) {
		// No rootpage
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$nsInfo,
				$this->titleFactory
			)
		);
	} elseif ( $rootpage !== '' ) {
		$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
		$title = Title::newFromText( $rootpage );

		if ( !$title || $title->isExternal() ) {
			$status->fatal( 'import-rootpage-invalid' );
		} elseif ( !$nsInfo->hasSubpages( $title->getNamespace() ) ) {
			$displayNSText = $title->getNamespace() === NS_MAIN
				? wfMessage( 'blanknamespace' )->text()
				: $this->contentLanguage->getNsText( $title->getNamespace() );
			$status->fatal( 'import-rootpage-nosubpage', $displayNSText );
		} else {
			// set namespace to 'all', so the namespace check in processTitle() can pass
			$this->setTargetNamespace( null );
			$this->setImportTitleFactory(
				new SubpageImportTitleFactory(
					$nsInfo,
					$this->titleFactory,
					$title
				)
			);
		}
	}
	return $status;
}
445
/**
 * Sets the directory against which handleUpload() resolves the "rel" path
 * of an upload.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
452
/**
 * Enables or disables processing of <upload> elements; while disabled,
 * uploads are parsed but not handed to processUpload().
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
459
/**
 * Replaces the ExternalUserNames helper used to prefix contributor names.
 *
 * @param string $usernamePrefix Passed to ExternalUserNames as its first argument
 * @param bool $assignKnownUsers Passed to ExternalUserNames as its second argument
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$this->externalUserNames = new ExternalUserNames(
		$usernamePrefix,
		$assignKnownUsers
	);
}
468
/**
 * Turns off the article-count adjustment that finishImportPage() would
 * otherwise perform after each page.
 */
public function disableStatisticsUpdate() {
	$this->disableStatisticsUpdate = true;
}
476
/**
 * Default per-page callback: remembers whether the page is countable
 * before any revision is imported, so finishImportPage() can adjust the
 * article count afterwards.
 *
 * @param array $titleAndForeignTitle Two-element array: the local title
 *   followed by the foreign title
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$title = $titleAndForeignTitle[0];
	$page = $this->wikiPageFactory->newFromTitle( $title );
	// The key must be built exactly as in finishImportPage(), otherwise the
	// lookup there never finds this entry and the article count is never adjusted.
	$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $title );
	$this->countableCache[$countKey] = $page->isCountable();
	return true;
}
489
/**
 * Default revision callback: imports one revision into the wiki.
 *
 * @param WikiRevision $revision
 * @return bool Result of WikiRevision::importOldRevision(), or false if
 *   the content model cannot be used on the target title or the content
 *   could not be unserialized (a notice is emitted in both cases)
 */
public function importRevision( $revision ) {
	if ( !$revision->getContentHandler()->canBeUsedOn( $revision->getTitle() ) ) {
		$this->notice(
			'import-error-bad-location',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);

		return false;
	}

	try {
		return $revision->importOldRevision();
	} catch ( MWContentSerializationException $ex ) {
		$this->notice(
			'import-error-unserialize',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);

		return false;
	}
}
520
/**
 * Default log item callback: delegates to WikiRevision::importLogItem().
 *
 * @param WikiRevision $revision
 * @return mixed Whatever WikiRevision::importLogItem() returns
 */
public function importLogItem( $revision ) {
	$result = $revision->importLogItem();

	return $result;
}
529
/**
 * Default upload callback: passes the revision to the upload revision
 * importer.
 *
 * @param WikiRevision $revision
 * @return bool Whether the resulting status is good
 */
public function importUpload( $revision ) {
	return $this->uploadRevisionImporter->import( $revision )->isGood();
}
539
/**
 * Default page-out callback: adjusts the article count if the page's
 * countable state changed during the import, then runs the
 * AfterImportPage hook.
 *
 * @param PageIdentity $pageIdentity Imported page
 * @param mixed $foreignTitle Title of the page on the source wiki
 * @param int $revCount Number of revisions seen for the page
 * @param int $sRevCount Number of revisions imported successfully
 * @param array $pageInfo Data collected while parsing the page
 * @return mixed Result of the AfterImportPage hook
 */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$wikiPage = $this->wikiPageFactory->newFromTitle( $pageIdentity );
		$wikiPage->loadPageData( WikiPage::READ_LATEST );

		if ( $wikiPage->getRevisionRecord() !== null ) {
			$performer = RequestContext::getMain()->getUser();
			$preparedUpdate = $wikiPage->newPageUpdater( $performer )->prepareUpdate();
			$cacheKey = 'title_' . CacheKeyHelper::getKeyForPage( $pageIdentity );
			$isCountableNow = $preparedUpdate->isCountable();

			$stateChanged = array_key_exists( $cacheKey, $this->countableCache )
				&& $isCountableNow != $this->countableCache[$cacheKey];
			if ( $stateChanged ) {
				// +1 when the page became countable, -1 when it stopped being so
				$delta = (int)$isCountableNow - (int)$this->countableCache[$cacheKey];
				DeferredUpdates::addUpdate(
					SiteStatsUpdate::factory( [ 'articles' => $delta ] )
				);
			}
		} else {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
				' because WikiPage::getRevisionRecord() returned null' );
		}
	}

	return $this->hookRunner->onAfterImportPage(
		Title::newFromPageIdentity( $pageIdentity ),
		$foreignTitle,
		$revCount,
		$sRevCount,
		$pageInfo
	);
}
585
/**
 * Hands the parsed site info to the registered callback, if there is one.
 *
 * @param array $siteInfo
 * @return mixed The callback's return value, or false when no callback is set
 */
private function siteInfoCallback( $siteInfo ) {
	if ( !isset( $this->mSiteInfoCallback ) ) {
		return false;
	}

	return call_user_func( $this->mSiteInfoCallback, $siteInfo, $this );
}
601
/**
 * Notifies the registered page callback, if any, that a page is about to
 * be imported.
 *
 * @param array $title Two-element array: local title and foreign title
 */
public function pageCallback( $title ) {
	if ( !isset( $this->mPageCallback ) ) {
		return;
	}

	call_user_func( $this->mPageCallback, $title );
}
611
/**
 * Notifies the registered page-out callback, if any, that a page has been
 * completely parsed. All five arguments are forwarded unchanged.
 *
 * @param PageIdentity $pageIdentity Imported page
 * @param mixed $foreignTitle Title of the page on the source wiki
 * @param int $revCount Number of revisions seen for the page
 * @param int $sucCount Number of revisions imported successfully
 * @param array $pageInfo Data collected while parsing the page
 */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sucCount, $pageInfo ) {
	if ( !isset( $this->mPageOutCallback ) ) {
		return;
	}

	call_user_func(
		$this->mPageOutCallback,
		$pageIdentity,
		$foreignTitle,
		$revCount,
		$sucCount,
		$pageInfo
	);
}
626
/**
 * Hands a parsed revision to the registered revision callback, if any.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's return value, or false when no callback is set
 */
private function revisionCallback( $revision ) {
	if ( !isset( $this->mRevisionCallback ) ) {
		return false;
	}

	return call_user_func( $this->mRevisionCallback, $revision, $this );
}
642
/**
 * Hands a parsed log item to the registered log item callback, if any.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's return value, or false when no callback is set
 */
private function logItemCallback( $revision ) {
	if ( !isset( $this->mLogItemCallback ) ) {
		return false;
	}

	return call_user_func( $this->mLogItemCallback, $revision, $this );
}
658
/**
 * Reads an attribute of the node the reader is currently positioned on.
 *
 * @param string $attr Attribute name
 * @return string Attribute value, or an empty string if getAttribute() yields null
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );

	return $value ?? '';
}
668
/**
 * Collects the text of the current element by reading forward until the
 * next END_ELEMENT node, concatenating every text, CDATA and
 * significant-whitespace node on the way.
 *
 * If the input runs out before an END_ELEMENT is found, the reader is
 * closed and an empty string is returned.
 *
 * @return string
 */
public function nodeContents() {
	if ( $this->reader->isEmptyElement ) {
		return "";
	}

	$textNodeTypes = [
		XMLReader::TEXT,
		XMLReader::CDATA,
		XMLReader::SIGNIFICANT_WHITESPACE,
	];

	$collected = "";
	while ( $this->reader->read() ) {
		$nodeType = $this->reader->nodeType;
		if ( $nodeType == XMLReader::END_ELEMENT ) {
			return $collected;
		}
		if ( in_array( $nodeType, $textNodeTypes ) ) {
			$collected .= $this->reader->value;
		}
	}

	$this->reader->close();
	return '';
}
696
/**
 * Runs the import: validates the XML where possible, checks for the
 * <mediawiki> root element and then dispatches every top-level element
 * (<siteinfo>, <page>, <logitem>) to its handler.
 *
 * @return bool Always true; failures are reported through exceptions
 * @throws NormalizedException If the root element is wrong and libxml reported an error
 * @throws MWException If the root element is wrong and libxml reported no error
 */
public function doImport() {
	$this->syntaxCheckXML();

	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (T48932).
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$previousLoaderState = @libxml_disable_entity_loader( true );
	try {
		$this->reader->read();

		if ( $this->reader->localName != 'mediawiki' ) {
			// phpcs:ignore Generic.PHP.NoSilencedErrors
			@libxml_disable_entity_loader( $previousLoaderState );
			$xmlError = libxml_get_last_error();
			if ( !$xmlError ) {
				throw new MWException(
					"Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
				);
			}
			throw new NormalizedException( "XML error at line {line}: {message}", [
				'line' => $xmlError->line,
				'message' => $xmlError->message,
			] );
		}
		$this->debug( "<mediawiki> tag is correct." );

		$this->debug( "Starting primary dump processing loop." );

		$pagesSeen = 0;
		$hasNode = $this->reader->read();
		while ( $hasNode ) {
			$tag = $this->reader->localName;

			// Honour the page offset: skip whole subtrees until enough
			// <page> tags have been counted.
			if ( $this->pageOffset ) {
				if ( $tag === 'page' ) {
					$pagesSeen++;
				}
				if ( $pagesSeen < $this->pageOffset ) {
					$hasNode = $this->reader->next();
					continue;
				}
			}

			// Captured before the hook runs, since a hook handler receives this importer.
			$type = $this->reader->nodeType;
			$skipSubtree = false;

			if ( !$this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
				// Do nothing
			} elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
				break;
			} elseif ( $tag == 'siteinfo' ) {
				$this->handleSiteInfo();
			} elseif ( $tag == 'page' ) {
				$this->handlePage();
			} elseif ( $tag == 'logitem' ) {
				$this->handleLogItem();
			} elseif ( $tag != '#text' ) {
				$this->warn( "Unhandled top-level XML tag $tag" );

				$skipSubtree = true;
			}

			if ( !$skipSubtree ) {
				$hasNode = $this->reader->read();
			} else {
				$hasNode = $this->reader->next();
				$this->debug( "Skip" );
			}
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $previousLoaderState );
		$this->reader->close();
	}

	return true;
}
780
/**
 * Parses a <siteinfo> element: records every <namespace> in the foreign
 * namespace map, collects the simple fields, and passes the result to the
 * site info callback.
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );

	// Fields that can just be stuffed in the siteInfo object
	$plainFields = [ 'sitename', 'base', 'generator', 'case' ];
	$info = [];

	while ( $this->reader->read() ) {
		$atEnd = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'siteinfo';
		if ( $atEnd ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( $tag == 'namespace' ) {
			// The attribute must be read before nodeContents() advances the reader.
			$nsKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$nsKey] = $this->nodeContents();
		} elseif ( in_array( $tag, $plainFields ) ) {
			$info[$tag] = $this->nodeContents();
		}
	}

	$info['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $info );
}
807
/**
 * Parses a <logitem> element into an array of fields and passes it to
 * processLogItem().
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );

	// Fields that can just be stuffed in the logInfo array
	$plainFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];
	$logInfo = [];

	while ( $this->reader->read() ) {
		$atEnd = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'logitem';
		if ( $atEnd ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$logInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$logInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $logInfo );
}
837
/**
 * Builds a WikiRevision from parsed log item data and hands it to the log
 * item callback.
 *
 * @param array $logInfo Fields collected by handleLogItem(); 'type' and
 *   'action' are read unconditionally
 * @return mixed Result of the log item callback, or false if none is set
 */
private function processLogItem( $logInfo ) {
	$logEntry = new WikiRevision();

	if ( isset( $logInfo['id'] ) ) {
		$logEntry->setID( $logInfo['id'] );
	}
	$logEntry->setType( $logInfo['type'] );
	$logEntry->setAction( $logInfo['action'] );
	if ( isset( $logInfo['timestamp'] ) ) {
		$logEntry->setTimestamp( $logInfo['timestamp'] );
	}
	if ( isset( $logInfo['params'] ) ) {
		$logEntry->setParams( $logInfo['params'] );
	}
	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$logEntry->setTitle( Title::newFromText( $logInfo['logtitle'] ) );
	}

	$logEntry->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$logEntry->setComment( $logInfo['comment'] );
	}

	// Attribute the entry: named user first, then IP, then a placeholder.
	$contributor = $logInfo['contributor'] ?? [];
	if ( isset( $contributor['username'] ) ) {
		$logEntry->setUsername(
			$this->externalUserNames->applyPrefix( $contributor['username'] )
		);
	} elseif ( isset( $contributor['ip'] ) ) {
		$logEntry->setUserIP( $contributor['ip'] );
	} else {
		$logEntry->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	return $this->logItemCallback( $logEntry );
}
880
/**
 * Parses a <page> element. Simple fields are stored in the page info
 * array; the first <revision> or <upload> triggers title resolution and
 * the page callback, and each one is then passed to its handler. Once the
 * page has been read, the page-out callback runs if the title was valid.
 */
private function handlePage() {
	// Handle page data.
	$this->debug( "Enter page handler." );
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Fields that can just be stuffed in the pageInfo object
	$plainFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	$skip = false;
	$badTitle = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		$atEnd = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'page';
		if ( $atEnd ) {
			break;
		}

		$skip = false;

		$tag = $this->reader->localName;

		if ( $badTitle ) {
			// The title is invalid, bail out of this page
			$skip = true;
		} elseif ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( $tag == 'redirect' ) {
			// An XML snippet:
			// <page>
			//     <id>123</id>
			//     <title>Page</title>
			//     <redirect title="NewTitle"/>
			//     ...
			// The redirect target lives in an attribute rather than in the
			// element's text, so it needs special handling.
			$pageInfo[$tag] = $this->nodeAttribute( 'title' );
		} elseif ( in_array( $tag, $plainFields ) ) {
			$pageInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'revision' || $tag == 'upload' ) {
			if ( !isset( $titles ) ) {
				$titles = $this->processTitle( $pageInfo['title'],
					$pageInfo['ns'] ?? null );

				// $titles is either an array of two titles or false.
				if ( is_array( $titles ) ) {
					$this->pageCallback( $titles );
					[ $pageInfo['_title'], $foreignTitle ] = $titles;
				} else {
					$badTitle = true;
					$skip = true;
				}
			}

			if ( $titles ) {
				if ( $tag == 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled page XML tag $tag" );
			$skip = true;
		}
	}

	// @note $pageInfo['_title'] is only set if a valid title was processed
	// above with no error. In that case pageCallback was called above and
	// we do pageOutCallback here. If $pageInfo['_title'] is not set, then
	// $foreignTitle is also not set since both come from processTitle().
	if ( array_key_exists( '_title', $pageInfo ) ) {
		$this->pageOutCallback(
			$pageInfo['_title'],
			// @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
			$foreignTitle,
			$pageInfo['revisionCount'],
			$pageInfo['successfulRevisionCount'],
			$pageInfo
		);
	}
}
966
/**
 * Parses a <revision> element, then processes the revision and updates
 * the page's revision counters.
 *
 * @param array &$pageInfo Page data; 'revisionCount' is always incremented,
 *   'successfulRevisionCount' only if processRevision() returns a truthy value
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );

	$plainFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
		'model', 'format', 'text', 'sha1' ];
	$revisionInfo = [];

	// Once set, this flag is not cleared again, so every later step in
	// this element advances with next() instead of read().
	$useNext = false;

	while ( $useNext ? $this->reader->next() : $this->reader->read() ) {
		$atEnd = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'revision';
		if ( $atEnd ) {
			break;
		}

		$tag = $this->reader->localName;

		$hookAllows = $this->hookRunner->onImportHandleRevisionXMLTag(
			$this, $pageInfo, $revisionInfo );
		if ( !$hookAllows ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$revisionInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'content' ) {
			// We can have multiple content tags, so make this an array.
			$revisionInfo[$tag][] = $this->handleContent();
		} elseif ( $tag == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			$useNext = true;
		}
	}

	$pageInfo['revisionCount']++;
	$imported = $this->processRevision( $pageInfo, $revisionInfo );
	if ( $imported ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
1009
/**
 * Parses a <content> element (one slot of a revision) into an array.
 *
 * @return array Collected fields; possible keys are 'role', 'origin',
 *   'model', 'format' and 'text', plus anything added by hook handlers
 */
private function handleContent() {
	$this->debug( "Enter content handler" );

	$plainFields = [ 'role', 'origin', 'model', 'format', 'text' ];
	$contentInfo = [];

	// Once set, this flag is not cleared again, so every later step in
	// this element advances with next() instead of read().
	$useNext = false;

	while ( $useNext ? $this->reader->next() : $this->reader->read() ) {
		$atEnd = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'content';
		if ( $atEnd ) {
			break;
		}

		$tag = $this->reader->localName;

		$hookAllows = $this->hookRunner->onImportHandleContentXMLTag(
			$this, $contentInfo );
		if ( !$hookAllows ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$contentInfo[$tag] = $this->nodeContents();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled content XML tag $tag" );
			$useNext = true;
		}
	}

	return $contentInfo;
}
1040
/**
 * Turns the parsed data of one slot into a content object.
 *
 * @param Title $title Page the content belongs to
 * @param int $revisionId Revision ID, used only in the error message; may be 0
 * @param array $contentInfo Slot data; 'text' is required, 'role' and
 *   'model' are optional
 * @return mixed Result of the content handler's unserializeContent()
 * @throws MWException If 'text' is missing, or if the text of a size-checked
 *   content model exceeds $wgMaxArticleSize
 */
private function makeContent( Title $title, $revisionId, $contentInfo ) {
	$maxArticleSize = MediaWikiServices::getInstance()->getMainConfig()->get(
		MainConfigNames::MaxArticleSize );

	if ( !isset( $contentInfo['text'] ) ) {
		throw new MWException( 'Missing text field in import.' );
	}

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Testing for revisions with only listed
	// content models, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	$sizeCheckedModels = [ 'wikitext', 'css', 'json', 'javascript', 'text', '' ];
	$isSizeChecked = !isset( $contentInfo['model'] )
		|| in_array( $contentInfo['model'], $sizeCheckedModels );

	// $wgMaxArticleSize is expressed in KiB, strlen() in bytes.
	if ( $isSizeChecked && strlen( $contentInfo['text'] ) > $maxArticleSize * 1024 ) {
		$subject = $revisionId
			? "the revision with ID $revisionId"
			: 'a revision';
		throw new MWException( 'The text of ' . $subject .
			" exceeds the maximum allowable size ({$maxArticleSize} KiB)" );
	}

	$role = $contentInfo['role'] ?? SlotRecord::MAIN;
	$model = $contentInfo['model'] ?? $this->getDefaultContentModel( $title, $role );
	$handler = $this->getContentHandler( $model );

	return $handler->unserializeContent(
		$handler->importTransform( $contentInfo['text'] )
	);
}
1087
/**
 * Builds a WikiRevision from parsed revision data and hands it to the
 * revision callback.
 *
 * @param array $pageInfo Page data; '_title' must hold the local title
 * @param array $revisionInfo Fields collected by handleRevision()
 * @return mixed Result of the revision callback, or false if none is set
 * @throws MWException If an extra slot has no role, or from makeContent()
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	$rev = new WikiRevision();

	$revId = $revisionInfo['id'] ?? 0;
	if ( $revId ) {
		$rev->setID( $revId );
	}

	$title = $pageInfo['_title'];
	$rev->setTitle( $title );

	// The main slot is built from the revision's own fields.
	$rev->setContent(
		SlotRecord::MAIN,
		$this->makeContent( $title, $revId, $revisionInfo )
	);

	// Every <content> element contributes one further slot.
	$extraSlots = $revisionInfo['content'] ?? [];
	foreach ( $extraSlots as $slotInfo ) {
		if ( !isset( $slotInfo['role'] ) ) {
			throw new MWException( "Missing role for imported slot." );
		}

		$slotContent = $this->makeContent( $title, $revId, $slotInfo );
		$rev->setContent( $slotInfo['role'], $slotContent );
	}
	$rev->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$rev->setComment( $revisionInfo['comment'] );
	}

	// The mere presence of the minor field marks the edit as minor.
	if ( isset( $revisionInfo['minor'] ) ) {
		$rev->setMinor( true );
	}

	// Attribute the revision: named user first, then IP, then a placeholder.
	$contributor = $revisionInfo['contributor'] ?? [];
	if ( isset( $contributor['username'] ) ) {
		$rev->setUsername(
			$this->externalUserNames->applyPrefix( $contributor['username'] )
		);
	} elseif ( isset( $contributor['ip'] ) ) {
		$rev->setUserIP( $contributor['ip'] );
	} else {
		$rev->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	if ( isset( $revisionInfo['sha1'] ) ) {
		$rev->setSha1Base36( $revisionInfo['sha1'] );
	}
	$rev->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $rev );
}
1141
/**
 * Parses an <upload> element and, if upload import is enabled, processes
 * it. The file can come from base64 data embedded in a <contents>
 * element or from a file under the image base path.
 *
 * @param array &$pageInfo Page data, passed on to hook handlers and processUpload()
 * @return mixed Result of processUpload(), or null when upload import is disabled
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );

	$plainFields = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];
	$uploadInfo = [];

	// Once set, this flag is not cleared again, so every later step in
	// this element advances with next() instead of read().
	$useNext = false;

	while ( $useNext ? $this->reader->next() : $this->reader->read() ) {
		$atEnd = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'upload';
		if ( $atEnd ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$uploadInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag == 'contents' ) {
			$rawContents = $this->nodeContents();
			// Only base64-encoded payloads are understood; anything else is dropped.
			if ( $this->reader->getAttribute( 'encoding' ) === 'base64' ) {
				$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $rawContents ) );
				$uploadInfo['isTempSrc'] = true;
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$useNext = true;
		}
	}

	// An existing file under the image base path takes precedence over
	// embedded contents.
	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		$localPath = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
		if ( file_exists( $localPath ) ) {
			$uploadInfo['fileSrc'] = $localPath;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	if ( !$this->mImportUploads ) {
		return null;
	}

	return $this->processUpload( $pageInfo, $uploadInfo );
}
1194
/**
 * Writes data to a newly created temporary file.
 *
 * @param string $contents Data to write
 * @return string|false Name returned by tempnam(); the write result is not checked
 */
private function dumpTemp( $contents ) {
	$tempPath = tempnam( wfTempDir(), 'importupload' );
	file_put_contents( $tempPath, $contents );

	return $tempPath;
}
1204
/**
 * Builds a WikiRevision from parsed upload data and hands it to the
 * upload callback.
 *
 * @param array $pageInfo Page data; 'id' and '_title' are read unconditionally
 * @param array $uploadInfo Fields collected by handleUpload(); 'timestamp',
 *   'filename', 'src', 'size' and 'comment' are read unconditionally
 * @return mixed Result of the upload callback
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$upload = new WikiRevision();
	$revId = $pageInfo['id'];
	$title = $pageInfo['_title'];

	// T292348: text key may be absent, force addition if null
	$uploadInfo['text'] ??= '';
	$content = $this->makeContent( $title, $revId, $uploadInfo );

	$upload->setTitle( $title );
	$upload->setID( $revId );
	$upload->setTimestamp( $uploadInfo['timestamp'] );
	$upload->setContent( SlotRecord::MAIN, $content );
	$upload->setFilename( $uploadInfo['filename'] );
	if ( isset( $uploadInfo['archivename'] ) ) {
		$upload->setArchiveName( $uploadInfo['archivename'] );
	}
	$upload->setSrc( $uploadInfo['src'] );
	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$isTemp = !empty( $uploadInfo['isTempSrc'] );
		$upload->setFileSrc( $uploadInfo['fileSrc'], $isTemp );
	}
	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$upload->setSha1Base36( $uploadInfo['sha1base36'] );
	}
	$upload->setSize( intval( $uploadInfo['size'] ) );
	$upload->setComment( $uploadInfo['comment'] );

	// Unlike revisions and log items, no placeholder user is set when the
	// contributor is unknown.
	$contributor = $uploadInfo['contributor'] ?? [];
	if ( isset( $contributor['username'] ) ) {
		$upload->setUsername(
			$this->externalUserNames->applyPrefix( $contributor['username'] )
		);
	} elseif ( isset( $contributor['ip'] ) ) {
		$upload->setUserIP( $contributor['ip'] );
	}
	$upload->setNoUpdates( $this->mNoUpdates );

	return call_user_func( $this->mUploadCallback, $upload );
}
1249
/**
 * Parses a <contributor> element.
 *
 * @return array Any of the keys 'id', 'ip' and 'username' found in the
 *   element; empty if the element itself is empty
 */
private function handleContributor() {
	$this->debug( "Enter contributor handler." );

	if ( $this->reader->isEmptyElement ) {
		return [];
	}

	$knownFields = [ 'id', 'ip', 'username' ];
	$contributor = [];

	while ( $this->reader->read() ) {
		$atEnd = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'contributor';
		if ( $atEnd ) {
			break;
		}

		$tag = $this->reader->localName;
		if ( !in_array( $tag, $knownFields ) ) {
			continue;
		}

		$contributor[$tag] = $this->nodeContents();
	}

	return $contributor;
}
1278
/**
 * Resolves a page title from the dump into a local title and checks that
 * the page may be imported.
 *
 * @param string $text Title text as found in the dump
 * @param string|null $ns Namespace ID as found in the dump, if any
 * @return array|false [ local title, foreign title ], or false (after a
 *   notice) if the title is invalid, external, cannot exist, or - outside
 *   command line mode - may not be edited by the current user
 */
private function processTitle( $text, $ns = null ) {
	// Without namespace data from <siteinfo>, fall back to the naive parser.
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory( $this->contentLanguage )
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
	$title = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	$commandLineMode = $this->config->get( 'CommandLineMode' );

	if ( $title === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}

	if ( $title->isExternal() ) {
		$this->notice( 'import-error-interwiki', $title->getPrefixedText() );
		return false;
	}

	if ( !$title->canExist() ) {
		$this->notice( 'import-error-special', $title->getPrefixedText() );
		return false;
	}

	if ( !$commandLineMode ) {
		$user = RequestContext::getMain()->getUser();

		if ( !$this->permissionManager->userCan( 'edit', $user, $title ) ) {
			# Do not import if the importing wiki user cannot edit this page
			$this->notice( 'import-error-edit', $title->getPrefixedText() );

			return false;
		}
	}

	return [ $title, $foreignTitle ];
}
1324
/**
 * Looks up the content handler for a content model.
 *
 * @param string $model Content model ID
 * @return mixed Handler returned by the content handler factory
 */
private function getContentHandler( $model ) {
	$handler = $this->contentHandlerFactory->getContentHandler( $model );

	return $handler;
}
1332
/**
 * Determines the default content model of a slot role on a given page.
 *
 * @param Title $title
 * @param string $role Slot role
 * @return string Default model reported by the role's handler
 */
private function getDefaultContentModel( $title, $role ) {
	$roleHandler = $this->slotRoleRegistry->getRoleHandler( $role );

	return $roleHandler->getDefaultModel( $title );
}
1344
/**
 * Opens an XMLReader on the registered import source and stores it in
 * $this->reader.
 *
 * The previous entity-loader state is restored on every exit path,
 * including when opening the reader throws.
 *
 * @throws MWException If the reader could not be opened
 */
private function openReader() {
	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( false );

	try {
		$uri = 'uploadsource://' . $this->sourceAdapterId;

		if ( PHP_VERSION_ID >= 80000 ) {
			// A static call is now preferred, and avoids https://github.com/php/php-src/issues/11548
			$reader = XMLReader::open( $uri, null, LIBXML_PARSEHUGE );
			$status = $reader instanceof XMLReader;
			if ( $status ) {
				$this->reader = $reader;
			}
		} else {
			// A static call generated a deprecation warning prior to PHP 8.0
			$this->reader = new XMLReader;
			$status = $this->reader->open( $uri, null, LIBXML_PARSEHUGE );
		}

		if ( !$status ) {
			// libxml_get_last_error() returns false when libxml recorded no
			// error, so the message must not be read unconditionally.
			$error = libxml_get_last_error();
			$detail = $error ? $error->message : 'unknown error';
			throw new MWException(
				'Encountered an internal error while initializing WikiImporter object: ' . $detail
			);
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
	}
}
1382
/**
 * Reads through the whole document once to detect XML errors before the
 * real import starts, then rewinds the source and reopens the reader.
 *
 * Does nothing for sources that are not seekable, since those cannot be
 * rewound afterwards.
 *
 * @throws MWException If libxml reports an error after the read-through
 */
private function syntaxCheckXML() {
	if ( !UploadSourceAdapter::isSeekableSource( $this->sourceAdapterId ) ) {
		return;
	}

	AtEase::suppressWarnings();
	$previousLoaderState = libxml_disable_entity_loader( false );
	try {
		// Drain the reader; only the resulting error state matters.
		while ( $this->reader->read() ) {
			continue;
		}

		$xmlError = libxml_get_last_error();
		if ( $xmlError ) {
			$errorMessage = 'XML error at line ' . $xmlError->line . ': ' . $xmlError->message;
			wfDebug( __METHOD__ . ': Invalid xml found - ' . $errorMessage );
			throw new MWException( $errorMessage );
		}
	} finally {
		libxml_disable_entity_loader( $previousLoaderState );
		AtEase::restoreWarnings();
		$this->reader->close();
	}

	// Reopen for the real import
	UploadSourceAdapter::seekSource( $this->sourceAdapterId, 0 );
	$this->openReader();
}
1410}
const NS_MAIN
Definition Defines.php:64
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTempDir()
Tries to get the system directory for temporary files.
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest. If source is NULL, it just returns the val...
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Reporting callback.
Base class for language-specific code.
Definition Language.php:63
Exception representing a failure to serialize or unserialize a content object.
MediaWiki exception.
Helper class for mapping value objects representing basic entities to cache keys.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Service for creating WikiPage objects.
A service class for checking permissions. To obtain an instance, use MediaWikiServices::getInstance()-...
Value object representing a content slot associated with a page revision.
A registry service for SlotRoleHandlers, used to define which slot roles are available on which page.
Generic operation result class Has warning/error list, boolean status and arbitrary value.
Definition Status.php:58
A simple, immutable structure to hold the title of a page on a foreign MediaWiki installation.
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them ba...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Creates Title objects.
Represents a title within MediaWiki.
Definition Title.php:76
canExist()
Can this title represent a page in the wiki's database?
Definition Title.php:1226
isExternal()
Is this Title interwiki?
Definition Title.php:948
getNamespace()
Get the namespace index, i.e.
Definition Title.php:1058
getPrefixedText()
Get the prefixed title with spaces.
Definition Title.php:1885
Class to parse and build external user names.
static getMain()
Get the RequestContext object associated with the main request.
static seekSource(string $id, int $offset)
static isSeekableSource(string $id)
static registerSource(ImportSource $source)
XML file reader for the page data importer.
setImportUploads( $import)
doImport()
Primary entry point.
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
setNoUpdates( $noupdates)
Set 'no updates' mode.
setLogItemCallback( $callback)
Sets the action to perform as each log item reached.
importUpload( $revision)
Dummy for now...
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
setPageOffset( $nthPage)
Sets 'pageOffset' value.
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
importLogItem( $revision)
Default per-revision callback, performs the import.
setImageBasePath( $dir)
throwXmlError( $err)
setDebug( $debug)
Set debug mode...
__construct(ImportSource $source, Config $config, HookContainer $hookContainer, Language $contentLanguage, NamespaceInfo $namespaceInfo, TitleFactory $titleFactory, WikiPageFactory $wikiPageFactory, UploadRevisionImporter $uploadRevisionImporter, PermissionManager $permissionManager, IContentHandlerFactory $contentHandlerFactory, SlotRoleRegistry $slotRoleRegistry)
Creates an ImportXMLReader drawing from the source provided.
notice( $msg,... $params)
importRevision( $revision)
Default per-revision callback, performs the import.
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
setNoticeCallback( $callback)
Set a callback that displays notice messages.
finishImportPage(PageIdentity $pageIdentity, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
disableStatisticsUpdate()
Updating statistics can take a lot of time.
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
Represents a revision, log entry or upload during the import process.
Source interface for XML import.
Interface for configuration instances.
Definition Config.php:32
Interface for objects (potentially) representing an editable wiki page.
Represents an object that can convert page titles on a foreign wiki (ForeignTitle objects) into page ...
$source
$content
Definition router.php:76