MediaWiki REL1_40
WikiImporter.php
Go to the documentation of this file.
1<?php
40use Wikimedia\AtEase\AtEase;
41use Wikimedia\NormalizedException\NormalizedException;
42
/** @var XMLReader Pull parser for the dump; (re)created by openReader() */
private $reader;

/** @var mixed Value returned by UploadSourceAdapter::registerSource(); appended to "uploadsource://" */
private $sourceAdapterId;

/** @var array|null Foreign namespace names keyed by their "key" attribute; null until <siteinfo> is read */
private $foreignNamespaces = null;

/** @var callable|null Invoked by logItemCallback() */
private $mLogItemCallback;

/** @var callable|null Invoked by processUpload() */
private $mUploadCallback;

/** @var callable|null Invoked by revisionCallback() */
private $mRevisionCallback;

/** @var callable|null Invoked by pageCallback() */
private $mPageCallback;

/** @var callable|null Invoked by siteInfoCallback() */
private $mSiteInfoCallback;

/** @var callable|null Invoked by pageOutCallback() */
private $mPageOutCallback;

/** @var callable|null Invoked by notice() when callable */
private $mNoticeCallback;

/** @var mixed Truthy when debug() should forward messages to wfDebug() */
private $mDebug;

/** @var mixed Truthy when handleUpload() should hand uploads to processUpload() */
private $mImportUploads;

/** @var mixed Directory prefix joined with an upload's "rel" value in handleUpload() */
private $mImageBasePath;

/** @var bool Forwarded to WikiRevision::setNoUpdates() */
private $mNoUpdates = false;

/** @var int Top-level nodes are skipped in doImport() until this many <page> tags were seen */
private $pageOffset = 0;

/** @var Config */
private $config;

/** @var object Exposes createTitleFromForeignTitle(); a NaiveImportTitleFactory by default */
private $importTitleFactory;

/** @var HookRunner */
private $hookRunner;

/** @var array Countable state of pages before import, keyed by a "title_"-prefixed string */
private $countableCache = [];

/** @var bool When true, finishImportPage() leaves the article count alone */
private $disableStatisticsUpdate = false;

/** @var ExternalUserNames */
private $externalUserNames;

/** @var Language */
private $contentLanguage;

/** @var NamespaceInfo */
private $namespaceInfo;

/** @var TitleFactory */
private $titleFactory;

/** @var WikiPageFactory */
private $wikiPageFactory;

/** @var UploadRevisionImporter */
private $uploadRevisionImporter;

/** @var PermissionManager */
private $permissionManager;

/** @var IContentHandlerFactory */
private $contentHandlerFactory;

/** @var SlotRoleRegistry */
private $slotRoleRegistry;
136
/**
 * Creates an importer that reads a dump from $source.
 *
 * Registers the "uploadsource" stream wrapper if needed, opens the XML
 * reader on the source, installs the default import callbacks and sets up
 * the default title factory and external user name handling.
 *
 * @param ImportSource $source Registered with UploadSourceAdapter. This
 *   parameter was missing from the scraped listing although the body uses
 *   it; the type is restored from the upstream signature (TODO confirm).
 * @param Config $config
 * @param HookContainer $hookContainer
 * @param Language $contentLanguage
 * @param NamespaceInfo $namespaceInfo
 * @param TitleFactory $titleFactory
 * @param WikiPageFactory $wikiPageFactory
 * @param UploadRevisionImporter $uploadRevisionImporter
 * @param PermissionManager $permissionManager
 * @param IContentHandlerFactory $contentHandlerFactory
 * @param SlotRoleRegistry $slotRoleRegistry
 * @throws MWException If the XML reader cannot be opened (see openReader())
 */
public function __construct(
	ImportSource $source,
	Config $config,
	HookContainer $hookContainer,
	Language $contentLanguage,
	NamespaceInfo $namespaceInfo,
	TitleFactory $titleFactory,
	WikiPageFactory $wikiPageFactory,
	UploadRevisionImporter $uploadRevisionImporter,
	PermissionManager $permissionManager,
	IContentHandlerFactory $contentHandlerFactory,
	SlotRoleRegistry $slotRoleRegistry
) {
	$this->config = $config;
	$this->hookRunner = new HookRunner( $hookContainer );
	$this->contentLanguage = $contentLanguage;
	$this->namespaceInfo = $namespaceInfo;
	$this->titleFactory = $titleFactory;
	$this->wikiPageFactory = $wikiPageFactory;
	$this->uploadRevisionImporter = $uploadRevisionImporter;
	$this->permissionManager = $permissionManager;
	$this->contentHandlerFactory = $contentHandlerFactory;
	$this->slotRoleRegistry = $slotRoleRegistry;

	// The wrapper may already have been registered by an earlier importer
	if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
		stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
	}
	$this->sourceAdapterId = UploadSourceAdapter::registerSource( $source );

	$this->openReader();

	// Default callbacks
	$this->setPageCallback( [ $this, 'beforeImportPage' ] );
	$this->setRevisionCallback( [ $this, "importRevision" ] );
	$this->setUploadCallback( [ $this, 'importUpload' ] );
	$this->setLogItemCallback( [ $this, 'importLogItem' ] );
	$this->setPageOutCallback( [ $this, 'finishImportPage' ] );

	$this->importTitleFactory = new NaiveImportTitleFactory(
		$this->contentLanguage,
		$this->namespaceInfo,
		$this->titleFactory
	);
	$this->externalUserNames = new ExternalUserNames( 'imported', false );
}
197
/**
 * Gives access to the XML reader currently used by this importer.
 *
 * @return XMLReader|null The reader set by openReader()
 */
public function getReader() {
	$currentReader = $this->reader;
	return $currentReader;
}
204
/**
 * Reports an XML error to the debug channels.
 *
 * Despite the name, nothing is thrown: the message goes to debug() and
 * then directly to wfDebug().
 *
 * @param string $err Error description
 */
public function throwXmlError( $err ) {
	$failureLine = "FAILURE: $err";
	$this->debug( $failureLine );

	$logLine = "WikiImporter XML error: $err";
	wfDebug( $logLine );
}
212
/**
 * Logs an import debug message, but only while debugging is switched on
 * via setDebug().
 *
 * @param string $data Message text
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}
	wfDebug( "IMPORT: $data" );
}
221
/**
 * Logs an import warning. Unlike debug(), this is not gated by the debug flag.
 *
 * @param string $data Message text
 */
public function warn( $data ) {
	$line = "IMPORT: $data";
	wfDebug( $line );
}
228
/**
 * Emits a notice identified by a message key.
 *
 * A callable notice callback receives the key and the parameters as one
 * array. Without one, the rendered message text goes to the debug log.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters
 */
public function notice( $msg, ...$params ) {
	if ( !is_callable( $this->mNoticeCallback ) ) {
		# No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		$text = wfMessage( $msg, $params )->text();
		wfDebug( $text );
		return;
	}

	call_user_func( $this->mNoticeCallback, $msg, $params );
}
242
/**
 * Switches the output of debug() on or off.
 *
 * @param bool $debug Stored as-is and tested for truthiness by debug()
 */
public function setDebug( $debug ) {
	$this->mDebug = $debug;
}
250
/**
 * Sets the flag that is later forwarded to WikiRevision::setNoUpdates()
 * for every revision, upload and log item built by this importer.
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
258
/**
 * Sets how far into the dump processing starts: doImport() skips
 * top-level nodes until it has counted this many <page> tags.
 *
 * @param int $nthPage
 */
public function setPageOffset( $nthPage ) {
	$this->pageOffset = $nthPage;
}
268
/**
 * Installs the callback used by notice().
 *
 * The assignment is delegated to wfSetVar(), whose return value (the
 * previous callback) is passed back to the caller.
 *
 * @param callable|null $callback
 * @return callable|null Previous callback
 */
public function setNoticeCallback( $callback ) {
	$previousCallback = wfSetVar( $this->mNoticeCallback, $callback );
	return $previousCallback;
}
278
/**
 * Replaces the callback run by pageCallback() and returns the one it replaced.
 *
 * @param callable|null $callback
 * @return callable|null Previous callback
 */
public function setPageCallback( $callback ) {
	// The right-hand array is built first, so the old value is captured before the overwrite
	[ $replaced, $this->mPageCallback ] = [ $this->mPageCallback, $callback ];
	return $replaced;
}
289
/**
 * Replaces the callback run by pageOutCallback() and returns the one it replaced.
 *
 * @param callable|null $callback
 * @return callable|null Previous callback
 */
public function setPageOutCallback( $callback ) {
	// The right-hand array is built first, so the old value is captured before the overwrite
	[ $replaced, $this->mPageOutCallback ] = [ $this->mPageOutCallback, $callback ];
	return $replaced;
}
304
/**
 * Replaces the callback run by revisionCallback() and returns the one it replaced.
 *
 * @param callable|null $callback
 * @return callable|null Previous callback
 */
public function setRevisionCallback( $callback ) {
	// The right-hand array is built first, so the old value is captured before the overwrite
	[ $replaced, $this->mRevisionCallback ] = [ $this->mRevisionCallback, $callback ];
	return $replaced;
}
315
/**
 * Replaces the callback run by processUpload() and returns the one it replaced.
 *
 * @param callable|null $callback
 * @return callable|null Previous callback
 */
public function setUploadCallback( $callback ) {
	// The right-hand array is built first, so the old value is captured before the overwrite
	[ $replaced, $this->mUploadCallback ] = [ $this->mUploadCallback, $callback ];
	return $replaced;
}
326
/**
 * Replaces the callback run by logItemCallback() and returns the one it replaced.
 *
 * @param callable|null $callback
 * @return callable|null Previous callback
 */
public function setLogItemCallback( $callback ) {
	// The right-hand array is built first, so the old value is captured before the overwrite
	[ $replaced, $this->mLogItemCallback ] = [ $this->mLogItemCallback, $callback ];
	return $replaced;
}
337
/**
 * Replaces the callback run by siteInfoCallback() and returns the one it replaced.
 *
 * @param callable|null $callback
 * @return callable|null Previous callback
 */
public function setSiteInfoCallback( $callback ) {
	// The right-hand array is built first, so the old value is captured before the overwrite
	[ $replaced, $this->mSiteInfoCallback ] = [ $this->mSiteInfoCallback, $callback ];
	return $replaced;
}
348
/**
 * Sets the factory that processTitle() uses to turn foreign titles into
 * local titles (via createTitleFromForeignTitle()).
 *
 * @param object $factory
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
357
/**
 * Selects the namespace that imported pages are placed in.
 *
 * Passing null keeps the namespaces from the dump (naive title factory).
 * A non-negative namespace known to NamespaceInfo forces every page into
 * that namespace. Anything else is rejected and the current factory is kept.
 *
 * The scraped listing had lost the "setImportTitleFactory( new ...(" opener
 * lines in both branches; they are restored here.
 *
 * @param int|string|null $namespace
 * @return bool True if a title factory was installed, false if $namespace was rejected
 */
public function setTargetNamespace( $namespace ) {
	if ( $namespace === null ) {
		// Don't override namespaces
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$this->namespaceInfo,
				$this->titleFactory
			)
		);
		return true;
	} elseif (
		$namespace >= 0 &&
		$this->namespaceInfo->exists( intval( $namespace ) )
	) {
		$namespace = intval( $namespace );
		$this->setImportTitleFactory(
			new NamespaceImportTitleFactory(
				$this->namespaceInfo,
				$this->titleFactory,
				$namespace
			)
		);
		return true;
	} else {
		return false;
	}
}
391
/**
 * Selects a root page under which all imported pages become subpages.
 *
 * - null installs the naive title factory (no root page).
 * - An empty string changes nothing and yields a good status.
 * - Otherwise the text, minus trailing slashes, must parse to a local
 *   title in a namespace that has subpages; failures are recorded as
 *   fatals on the returned status.
 *
 * The scraped listing had lost the "setImportTitleFactory( new ...(" opener
 * lines in both factory branches; they are restored here. The class name
 * for the root-page branch is taken from upstream MediaWiki (TODO confirm).
 *
 * @param string|null $rootpage
 * @return Status
 */
public function setTargetRootPage( $rootpage ) {
	$status = Status::newGood();
	$nsInfo = $this->namespaceInfo;
	if ( $rootpage === null ) {
		// No rootpage
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$nsInfo,
				$this->titleFactory
			)
		);
	} elseif ( $rootpage !== '' ) {
		$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
		$title = Title::newFromText( $rootpage );

		if ( !$title || $title->isExternal() ) {
			$status->fatal( 'import-rootpage-invalid' );
		} elseif ( !$nsInfo->hasSubpages( $title->getNamespace() ) ) {
			// The main namespace has no name of its own, so use the "blanknamespace" message
			$displayNSText = $title->getNamespace() === NS_MAIN
				? wfMessage( 'blanknamespace' )->text()
				: $this->contentLanguage->getNsText( $title->getNamespace() );
			$status->fatal( 'import-rootpage-nosubpage', $displayNSText );
		} else {
			// set namespace to 'all', so the namespace check in processTitle() can pass
			$this->setTargetNamespace( null );
			$this->setImportTitleFactory(
				new SubpageImportTitleFactory(
					$nsInfo,
					$this->titleFactory,
					$title
				)
			);
		}
	}
	return $status;
}
434
/**
 * Sets the directory that handleUpload() joins with an upload's "rel"
 * value to locate the file on disk.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
441
/**
 * Controls whether <upload> elements are processed; handleUpload() only
 * calls processUpload() when this is truthy.
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
448
/**
 * Rebuilds the ExternalUserNames helper applied to imported contributor names.
 *
 * @param string $usernamePrefix First ExternalUserNames constructor argument
 * @param bool $assignKnownUsers Second ExternalUserNames constructor argument
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$names = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
	$this->externalUserNames = $names;
}
457
/**
 * Turns off the article-count adjustment done by finishImportPage().
 * There is no way to turn it back on through this class.
 */
public function disableStatisticsUpdate() {
	$this->disableStatisticsUpdate = true;
}
465
/**
 * Default per-page callback: remembers whether the page was countable
 * before any of its revisions are imported.
 *
 * The cache key must be built the same way finishImportPage() builds it
 * ('title_' . CacheKeyHelper::getKeyForPage()). It was previously derived
 * from the prefixed title text, so the later lookup never matched and the
 * article count was never adjusted.
 *
 * @param array $titleAndForeignTitle Two-element array; element 0 is the local title
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$title = $titleAndForeignTitle[0];
	$page = $this->wikiPageFactory->newFromTitle( $title );
	$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $title );
	$this->countableCache[$countKey] = $page->isCountable();
	return true;
}
478
/**
 * Default revision callback: imports one revision.
 *
 * A revision whose content handler cannot be used on the target title is
 * rejected with an 'import-error-bad-location' notice. A content
 * serialization failure during the import is turned into an
 * 'import-error-unserialize' notice.
 *
 * @param WikiRevision $revision
 * @return bool Result of importOldRevision(), or false when the revision was rejected
 */
public function importRevision( $revision ) {
	$usableHere = $revision->getContentHandler()->canBeUsedOn( $revision->getTitle() );
	if ( !$usableHere ) {
		$this->notice(
			'import-error-bad-location',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);
		return false;
	}

	try {
		return $revision->importOldRevision();
	} catch ( MWContentSerializationException $ex ) {
		$this->notice(
			'import-error-unserialize',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);
		return false;
	}
}
509
/**
 * Default log item callback: lets the revision object import itself as a log entry.
 *
 * @param WikiRevision $revision
 * @return mixed Whatever WikiRevision::importLogItem() returns
 */
public function importLogItem( $revision ) {
	$result = $revision->importLogItem();
	return $result;
}
518
/**
 * Default upload callback: hands the revision to the upload revision importer.
 *
 * @param WikiRevision $revision
 * @return bool True if the importer's status is good
 */
public function importUpload( $revision ) {
	return $this->uploadRevisionImporter->import( $revision )->isGood();
}
528
/**
 * Default page-out callback: adjusts the article count if the page's
 * countable state changed, then runs the AfterImportPage hook.
 *
 * @param PageIdentity $pageIdentity Imported page
 * @param mixed $foreignTitle Title on the source wiki, passed on to the hook
 * @param int $revCount Passed on to the hook
 * @param int $sRevCount Passed on to the hook
 * @param array $pageInfo Passed on to the hook
 * @return mixed Result of the AfterImportPage hook
 */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$wikiPage = $this->wikiPageFactory->newFromTitle( $pageIdentity );
		$wikiPage->loadPageData( WikiPage::READ_LATEST );

		if ( $wikiPage->getRevisionRecord() !== null ) {
			$performer = RequestContext::getMain()->getUser();
			$preparedUpdate = $wikiPage->newPageUpdater( $performer )->prepareUpdate();
			$cacheKey = 'title_' . CacheKeyHelper::getKeyForPage( $pageIdentity );
			$countableNow = $preparedUpdate->isCountable();

			$stateChanged = array_key_exists( $cacheKey, $this->countableCache ) &&
				$countableNow != $this->countableCache[$cacheKey];
			if ( $stateChanged ) {
				// +1 when the page became countable, -1 when it stopped being countable
				$delta = (int)$countableNow - (int)$this->countableCache[$cacheKey];
				DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
					'articles' => ( $delta )
				] ) );
			}
		} else {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
				' because WikiPage::getRevisionRecord() returned null' );
		}
	}

	$localTitle = Title::castFromPageIdentity( $pageIdentity );
	// @phan-suppress-next-line PhanTypeMismatchArgumentNullable castFrom does not return null here
	return $this->hookRunner->onAfterImportPage( $localTitle, $foreignTitle,
		$revCount, $sRevCount, $pageInfo );
}
575
/**
 * Passes the collected site info to the site info callback, if one is set.
 *
 * @param array $siteInfo
 * @return mixed Callback result, or false when no callback is set
 */
private function siteInfoCallback( $siteInfo ) {
	if ( $this->mSiteInfoCallback === null ) {
		return false;
	}
	// The callback receives the site info followed by this importer
	return call_user_func( $this->mSiteInfoCallback, $siteInfo, $this );
}
591
/**
 * Notifies the page callback, if any, that a page is about to be imported.
 * The callback's return value is ignored.
 *
 * @param array $title Value produced by processTitle(): local title and foreign title
 */
public function pageCallback( $title ) {
	if ( $this->mPageCallback === null ) {
		return;
	}
	call_user_func( $this->mPageCallback, $title );
}
601
/**
 * Notifies the page-out callback, if any, that a page has been fully read.
 * All arguments given to this method are forwarded unchanged; the
 * callback's return value is ignored.
 *
 * @param PageIdentity $pageIdentity
 * @param mixed $foreignTitle
 * @param int $revCount
 * @param int $sucCount
 * @param array $pageInfo
 */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sucCount, $pageInfo ) {
	if ( $this->mPageOutCallback === null ) {
		return;
	}
	$forwardedArgs = func_get_args();
	call_user_func_array( $this->mPageOutCallback, $forwardedArgs );
}
616
/**
 * Passes a revision to the revision callback, if one is set.
 *
 * @param WikiRevision $revision
 * @return mixed Callback result, or false when no callback is set
 */
private function revisionCallback( $revision ) {
	if ( $this->mRevisionCallback === null ) {
		return false;
	}
	// The callback receives the revision followed by this importer
	return call_user_func( $this->mRevisionCallback, $revision, $this );
}
632
/**
 * Passes a log item to the log item callback, if one is set.
 *
 * @param WikiRevision $revision
 * @return mixed Callback result, or false when no callback is set
 */
private function logItemCallback( $revision ) {
	if ( $this->mLogItemCallback === null ) {
		return false;
	}
	// The callback receives the log item followed by this importer
	return call_user_func( $this->mLogItemCallback, $revision, $this );
}
648
/**
 * Reads an attribute of the node the reader is positioned on.
 *
 * @param string $attr Attribute name
 * @return string Attribute value, or an empty string when getAttribute() yields null
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );
	return $value ?? '';
}
658
/**
 * Collects the character data of the current element.
 *
 * Reads forward, concatenating text, CDATA and significant-whitespace
 * nodes, and stops at the first end-element node. If the input ends
 * first, the reader is closed and an empty string is returned.
 *
 * @return string
 */
public function nodeContents() {
	if ( $this->reader->isEmptyElement ) {
		return "";
	}

	$textNodeTypes = [
		XMLReader::TEXT,
		XMLReader::CDATA,
		XMLReader::SIGNIFICANT_WHITESPACE,
	];
	$collected = "";

	while ( $this->reader->read() ) {
		$nodeType = $this->reader->nodeType;
		if ( $nodeType == XMLReader::END_ELEMENT ) {
			return $collected;
		}
		if ( in_array( $nodeType, $textNodeTypes ) ) {
			$collected .= $this->reader->value;
		}
	}

	// Ran out of input before the element was closed
	$this->reader->close();
	return '';
}
686
/**
 * Runs the import: validates the XML where possible, checks the root
 * element and then dispatches each top-level element to its handler.
 *
 * The reader is closed and the libxml entity loader state restored when
 * this method returns or throws.
 *
 * @return bool Always true when no exception is thrown
 * @throws NormalizedException If the root tag is wrong and libxml reported an error
 * @throws MWException If the root tag is wrong without a libxml error
 */
public function doImport() {
	$this->syntaxCheckXML();

	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (T48932).
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$previousLoaderState = @libxml_disable_entity_loader( true );
	try {
		$this->reader->read();

		if ( $this->reader->localName != 'mediawiki' ) {
			// phpcs:ignore Generic.PHP.NoSilencedErrors
			@libxml_disable_entity_loader( $previousLoaderState );
			$xmlError = libxml_get_last_error();
			if ( !$xmlError ) {
				throw new MWException(
					"Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
				);
			}
			throw new NormalizedException( "XML error at line {line}: {message}", [
				'line' => $xmlError->line,
				'message' => $xmlError->message,
			] );
		}
		$this->debug( "<mediawiki> tag is correct." );

		$this->debug( "Starting primary dump processing loop." );

		$hasNode = $this->reader->read();
		$pagesSeen = 0;
		while ( $hasNode ) {
			$tag = $this->reader->localName;
			if ( $this->pageOffset ) {
				if ( $tag === 'page' ) {
					$pagesSeen++;
				}
				if ( $pagesSeen < $this->pageOffset ) {
					// Still before the requested offset: jump over this node and its subtree
					$hasNode = $this->reader->next();
					continue;
				}
			}
			$nodeType = $this->reader->nodeType;
			$skipSubtree = false;

			if ( !$this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
				// Do nothing
			} elseif ( $tag == 'mediawiki' && $nodeType == XMLReader::END_ELEMENT ) {
				break;
			} elseif ( $tag == 'siteinfo' ) {
				$this->handleSiteInfo();
			} elseif ( $tag == 'page' ) {
				$this->handlePage();
			} elseif ( $tag == 'logitem' ) {
				$this->handleLogItem();
			} elseif ( $tag != '#text' ) {
				$this->warn( "Unhandled top-level XML tag $tag" );

				$skipSubtree = true;
			}

			if ( $skipSubtree ) {
				$hasNode = $this->reader->next();
				$this->debug( "Skip" );
			} else {
				$hasNode = $this->reader->read();
			}
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $previousLoaderState );
		$this->reader->close();
	}

	return true;
}
771
/**
 * Reads a <siteinfo> element: records foreign namespaces, collects the
 * plain text fields and passes the result to the site info callback.
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );
	$collected = [];

	// Fields that can just be stuffed in the siteInfo object
	$plainFields = [ 'sitename', 'base', 'generator', 'case' ];

	while ( $this->reader->read() ) {
		$tag = $this->reader->localName;

		if ( $this->reader->nodeType == XMLReader::END_ELEMENT && $tag == 'siteinfo' ) {
			break;
		}

		if ( $tag == 'namespace' ) {
			// The key attribute has to be read before nodeContents() moves the reader on
			$namespaceKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$namespaceKey] = $this->nodeContents();
		} elseif ( in_array( $tag, $plainFields ) ) {
			$collected[$tag] = $this->nodeContents();
		}
	}

	$collected['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $collected );
}
798
/**
 * Reads a <logitem> element into an array and passes it to processLogItem().
 * The ImportHandleLogItemXMLTag hook sees every child node first and can
 * claim it by returning false.
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );
	$entry = [];

	// Fields that can just be stuffed in the pageInfo object
	$plainFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'logitem';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $entry ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$entry[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$entry['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $entry );
}
828
/**
 * Builds a WikiRevision from parsed log item data and hands it to the
 * log item callback.
 *
 * 'type' and 'action' are read unconditionally; the other fields are
 * only applied when present.
 *
 * @param array $logInfo Data collected by handleLogItem()
 * @return mixed Result of logItemCallback()
 */
private function processLogItem( $logInfo ) {
	$item = new WikiRevision();

	if ( isset( $logInfo['id'] ) ) {
		$item->setID( $logInfo['id'] );
	}
	$item->setType( $logInfo['type'] );
	$item->setAction( $logInfo['action'] );
	if ( isset( $logInfo['timestamp'] ) ) {
		$item->setTimestamp( $logInfo['timestamp'] );
	}
	if ( isset( $logInfo['params'] ) ) {
		$item->setParams( $logInfo['params'] );
	}
	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$logTitle = Title::newFromText( $logInfo['logtitle'] );
		$item->setTitle( $logTitle );
	}

	$item->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$item->setComment( $logInfo['comment'] );
	}

	// Attribute the entry: named user first, then IP, then a placeholder name
	if ( isset( $logInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix( $logInfo['contributor']['username'] );
		$item->setUsername( $prefixedName );
	} elseif ( isset( $logInfo['contributor']['ip'] ) ) {
		$item->setUserIP( $logInfo['contributor']['ip'] );
	} else {
		$item->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	return $this->logItemCallback( $item );
}
871
/**
 * Reads a <page> element.
 *
 * Plain fields are stored in $pageInfo. The local title is resolved when
 * the first <revision> or <upload> is met; if that fails, the rest of the
 * page is skipped. After the element is read, the page-out callback runs
 * if a title was resolved.
 */
private function handlePage() {
	// Handle page data.
	$this->debug( "Enter page handler." );
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Fields that can just be stuffed in the pageInfo object
	$plainFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	$skipSubtree = false;
	$titleIsBad = false;

	while ( true ) {
		// next() jumps over the current subtree, read() descends into it
		$advanced = $skipSubtree ? $this->reader->next() : $this->reader->read();
		if ( !$advanced ) {
			break;
		}
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'page'
		) {
			break;
		}

		$skipSubtree = false;

		$tag = $this->reader->localName;

		if ( $titleIsBad ) {
			// The title is invalid, bail out of this page
			$skipSubtree = true;
		} elseif ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $plainFields ) ) {
			// An XML snippet:
			// <page>
			//     <id>123</id>
			//     <title>Page</title>
			//     <redirect title="NewTitle"/>
			//     ...
			// Because the redirect tag is built differently, we need special handling for that case.
			$pageInfo[$tag] = $tag == 'redirect'
				? $this->nodeAttribute( 'title' )
				: $this->nodeContents();
		} elseif ( $tag == 'revision' || $tag == 'upload' ) {
			if ( !isset( $titlePair ) ) {
				$titlePair = $this->processTitle( $pageInfo['title'],
					$pageInfo['ns'] ?? null );

				// $titlePair is either an array of two titles or false.
				if ( is_array( $titlePair ) ) {
					$this->pageCallback( $titlePair );
					[ $pageInfo['_title'], $foreignTitle ] = $titlePair;
				} else {
					$titleIsBad = true;
					$skipSubtree = true;
				}
			}

			if ( $titlePair ) {
				if ( $tag == 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled page XML tag $tag" );
			$skipSubtree = true;
		}
	}

	// @note $pageInfo['_title'] is only set if a valid title was processed
	// above with no error. In that case pageCallback was called above and
	// we do pageOutCallback here. If $pageInfo['_title'] is not set, then
	// $foreignTitle is also not set since they are assigned together.
	if ( array_key_exists( '_title', $pageInfo ) ) {
		$this->pageOutCallback(
			$pageInfo['_title'],
			// @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
			$foreignTitle,
			$pageInfo['revisionCount'],
			$pageInfo['successfulRevisionCount'],
			$pageInfo
		);
	}
}
957
/**
 * Reads a <revision> element, then processes it and updates the
 * revision counters in $pageInfo.
 *
 * @param array &$pageInfo Page data; 'revisionCount' is always incremented,
 *   'successfulRevisionCount' only when processRevision() returns a truthy value
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );
	$revisionInfo = [];

	$plainFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
		'model', 'format', 'text', 'sha1' ];

	$skipSubtree = false;

	while ( $skipSubtree ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'revision';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		$hookAllows = $this->hookRunner->onImportHandleRevisionXMLTag(
			$this, $pageInfo, $revisionInfo );
		if ( !$hookAllows ) {
			// Do nothing
		} elseif ( in_array( $tag, $plainFields ) ) {
			$revisionInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'content' ) {
			// We can have multiple content tags, so make this an array.
			$revisionInfo[$tag][] = $this->handleContent();
		} elseif ( $tag == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			// Note: once set, this flag is not reset inside this loop
			$skipSubtree = true;
		}
	}

	$pageInfo['revisionCount']++;
	$imported = $this->processRevision( $pageInfo, $revisionInfo );
	if ( $imported ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
1000
/**
 * Reads a <content> element (one slot of a revision) into an array.
 *
 * @return array Collected fields, keyed by tag name
 */
private function handleContent() {
	$this->debug( "Enter content handler" );
	$slotInfo = [];

	$plainFields = [ 'role', 'origin', 'model', 'format', 'text' ];

	$skipSubtree = false;

	while ( $skipSubtree ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'content';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		$hookAllows = $this->hookRunner->onImportHandleContentXMLTag( $this, $slotInfo );
		if ( !$hookAllows ) {
			// Do nothing
		} elseif ( in_array( $tag, $plainFields ) ) {
			$slotInfo[$tag] = $this->nodeContents();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled content XML tag $tag" );
			// Note: once set, this flag is not reset inside this loop
			$skipSubtree = true;
		}
	}

	return $slotInfo;
}
1031
/**
 * Turns the text of an imported slot into a Content object.
 *
 * The size limit is read from the Config injected into this importer
 * instead of the global service locator, matching how processTitle()
 * reads its settings.
 *
 * @param Title $title Page the content belongs to; used to pick the default model
 * @param int|string $revisionId Only used in the "too large" error message
 * @param array $contentInfo Needs 'text'; 'role' and 'model' are optional
 * @return mixed Result of the content handler's unserializeContent()
 * @throws MWException If 'text' is missing or exceeds the MaxArticleSize setting
 */
private function makeContent( Title $title, $revisionId, $contentInfo ) {
	$maxArticleSize = $this->config->get( MainConfigNames::MaxArticleSize );

	if ( !isset( $contentInfo['text'] ) ) {
		throw new MWException( 'Missing text field in import.' );
	}

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Testing for revisions with only listed
	// content models, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	$sizeCheckedModels = [
		'wikitext',
		'css',
		'json',
		'javascript',
		'text',
		''
	];
	$isSizeChecked = !isset( $contentInfo['model'] ) ||
		in_array( $contentInfo['model'], $sizeCheckedModels );

	// The setting is multiplied by 1024 and compared with the byte length of the text
	if ( $isSizeChecked && strlen( $contentInfo['text'] ) > $maxArticleSize * 1024 ) {
		throw new MWException( 'The text of ' .
			( $revisionId ?
				"the revision with ID $revisionId" :
				'a revision'
			) . " exceeds the maximum allowable size ({$maxArticleSize} KiB)" );
	}

	$role = $contentInfo['role'] ?? SlotRecord::MAIN;
	$model = $contentInfo['model'] ?? $this->getDefaultContentModel( $title, $role );
	$handler = $this->getContentHandler( $model );

	$text = $handler->importTransform( $contentInfo['text'] );

	return $handler->unserializeContent( $text );
}
1078
/**
 * Builds a WikiRevision from parsed revision data and hands it to the
 * revision callback.
 *
 * @param array $pageInfo Page data; '_title' must hold the local title
 * @param array $revisionInfo Data collected by handleRevision()
 * @return mixed Result of revisionCallback()
 * @throws MWException If an extra slot has no role, or makeContent() rejects the text
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	$imported = new WikiRevision();

	$revId = $revisionInfo['id'] ?? 0;
	if ( $revId ) {
		$imported->setID( $revisionInfo['id'] );
	}

	$title = $pageInfo['_title'];
	$imported->setTitle( $title );

	// The revision-level fields describe the main slot
	$mainContent = $this->makeContent( $title, $revId, $revisionInfo );
	$imported->setContent( SlotRecord::MAIN, $mainContent );

	// Each <content> child describes a further slot
	$extraSlots = $revisionInfo['content'] ?? [];
	foreach ( $extraSlots as $slotInfo ) {
		if ( !isset( $slotInfo['role'] ) ) {
			throw new MWException( "Missing role for imported slot." );
		}

		$slotContent = $this->makeContent( $title, $revId, $slotInfo );
		$imported->setContent( $slotInfo['role'], $slotContent );
	}
	$imported->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$imported->setComment( $revisionInfo['comment'] );
	}

	// Only the presence of the <minor> tag matters, not its contents
	if ( isset( $revisionInfo['minor'] ) ) {
		$imported->setMinor( true );
	}

	// Attribute the revision: named user first, then IP, then a placeholder name
	if ( isset( $revisionInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix(
			$revisionInfo['contributor']['username'] );
		$imported->setUsername( $prefixedName );
	} elseif ( isset( $revisionInfo['contributor']['ip'] ) ) {
		$imported->setUserIP( $revisionInfo['contributor']['ip'] );
	} else {
		$imported->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	if ( isset( $revisionInfo['sha1'] ) ) {
		$imported->setSha1Base36( $revisionInfo['sha1'] );
	}
	$imported->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $imported );
}
1132
/**
 * Reads an <upload> element and, if upload importing is enabled, passes
 * the collected data to processUpload().
 *
 * Base64-encoded <contents> are written to a temporary file. If an image
 * base path is set and the file named by "rel" exists under it, that
 * file takes precedence as the source.
 *
 * @param array &$pageInfo Page data, passed to the hook and to processUpload()
 * @return mixed Result of processUpload(), or null when uploads are not imported
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );
	$uploadInfo = [];

	$normalFields = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'upload' ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$uploadInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag == 'contents' ) {
			// Read the attribute while the reader is still on the start tag.
			// nodeContents() advances to the matching end tag, and attribute
			// access from there should not be relied upon.
			$encoding = $this->reader->getAttribute( 'encoding' );
			$contents = $this->nodeContents();
			if ( $encoding === 'base64' ) {
				$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
				$uploadInfo['isTempSrc'] = true;
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$skip = true;
		}
	}

	// NOTE(review): 'rel' comes from the dump and is joined to the base path
	// without normalisation — confirm that dumps used with an image base
	// path are trusted.
	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		$path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
		if ( file_exists( $path ) ) {
			$uploadInfo['fileSrc'] = $path;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	if ( $this->mImportUploads ) {
		return $this->processUpload( $pageInfo, $uploadInfo );
	}
}
1185
/**
 * Writes data to a newly created temporary file.
 * Failures of tempnam() or file_put_contents() are not checked.
 *
 * @param string $contents Data to write
 * @return string|false Name returned by tempnam()
 */
private function dumpTemp( $contents ) {
	$tempPath = tempnam( wfTempDir(), 'importupload' );
	file_put_contents( $tempPath, $contents );

	return $tempPath;
}
1195
/**
 * Builds a WikiRevision from parsed upload data and hands it to the
 * upload callback.
 *
 * 'timestamp', 'filename', 'src', 'size' and 'comment' are read from
 * $uploadInfo unconditionally, as is 'id' from $pageInfo.
 *
 * @param array $pageInfo Page data; needs 'id' and '_title'
 * @param array $uploadInfo Data collected by handleUpload()
 * @return mixed Result of the upload callback
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$upload = new WikiRevision();
	$revId = $pageInfo['id'];
	$title = $pageInfo['_title'];
	// T292348: text key may be absent, force addition if null
	$uploadInfo['text'] ??= '';
	$content = $this->makeContent( $title, $revId, $uploadInfo );

	$upload->setTitle( $title );
	$upload->setID( $revId );
	$upload->setTimestamp( $uploadInfo['timestamp'] );
	$upload->setContent( SlotRecord::MAIN, $content );
	$upload->setFilename( $uploadInfo['filename'] );
	if ( isset( $uploadInfo['archivename'] ) ) {
		$upload->setArchiveName( $uploadInfo['archivename'] );
	}
	$upload->setSrc( $uploadInfo['src'] );
	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$isTemporary = !empty( $uploadInfo['isTempSrc'] );
		$upload->setFileSrc( $uploadInfo['fileSrc'], $isTemporary );
	}
	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$upload->setSha1Base36( $uploadInfo['sha1base36'] );
	}
	$upload->setSize( intval( $uploadInfo['size'] ) );
	$upload->setComment( $uploadInfo['comment'] );

	// Unlike revisions and log items, no placeholder user is set when both are missing
	if ( isset( $uploadInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix(
			$uploadInfo['contributor']['username'] );
		$upload->setUsername( $prefixedName );
	} elseif ( isset( $uploadInfo['contributor']['ip'] ) ) {
		$upload->setUserIP( $uploadInfo['contributor']['ip'] );
	}
	$upload->setNoUpdates( $this->mNoUpdates );

	return call_user_func( $this->mUploadCallback, $upload );
}
1240
/**
 * Reads a <contributor> element.
 *
 * @return array Any of the keys 'id', 'ip' and 'username' that were
 *   present; empty for a self-closing element
 */
private function handleContributor() {
	$this->debug( "Enter contributor handler." );

	if ( $this->reader->isEmptyElement ) {
		return [];
	}

	$knownFields = [ 'id', 'ip', 'username' ];
	$contributor = [];

	while ( $this->reader->read() ) {
		$tag = $this->reader->localName;

		if ( $this->reader->nodeType == XMLReader::END_ELEMENT && $tag == 'contributor' ) {
			break;
		}

		if ( !in_array( $tag, $knownFields ) ) {
			continue;
		}
		$contributor[$tag] = $this->nodeContents();
	}

	return $contributor;
}
1269
/**
 * Maps a title from the dump to a local title and checks that it may be
 * imported.
 *
 * A notice is emitted and false returned when the title is invalid,
 * interwiki, cannot exist as a page, or — outside command line mode —
 * may not be edited by the user of the main request context.
 *
 * @param string $text Title text from the dump
 * @param string|int|null $ns Namespace ID from the dump, passed through intval()
 * @return array|false [ local title, foreign title ], or false on rejection
 */
private function processTitle( $text, $ns = null ) {
	// Without <siteinfo> namespaces, fall back to the factory that only knows the content language
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory( $this->contentLanguage )
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
	$localTitle = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	$commandLineMode = $this->config->get( 'CommandLineMode' );

	if ( $localTitle === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}
	if ( $localTitle->isExternal() ) {
		$this->notice( 'import-error-interwiki', $localTitle->getPrefixedText() );
		return false;
	}
	if ( !$localTitle->canExist() ) {
		$this->notice( 'import-error-special', $localTitle->getPrefixedText() );
		return false;
	}
	if ( !$commandLineMode ) {
		$user = RequestContext::getMain()->getUser();

		if ( !$this->permissionManager->userCan( 'edit', $user, $localTitle ) ) {
			# Do not import if the importing wiki user cannot edit this page
			$this->notice( 'import-error-edit', $localTitle->getPrefixedText() );

			return false;
		}
	}

	return [ $localTitle, $foreignTitle ];
}
1315
/**
 * Looks up the content handler for a content model.
 *
 * @param string $model Content model ID
 * @return mixed Handler supplied by the content handler factory
 */
private function getContentHandler( $model ) {
	$handler = $this->contentHandlerFactory->getContentHandler( $model );
	return $handler;
}
1323
/**
 * Asks the slot role's handler for the default content model of a page.
 *
 * @param Title $title
 * @param string $role Slot role name
 * @return mixed Value of the role handler's getDefaultModel()
 */
private function getDefaultContentModel( $title, $role ) {
	$roleHandler = $this->slotRoleRegistry->getRoleHandler( $role );
	return $roleHandler->getDefaultModel( $title );
}
1335
/**
 * Opens an XMLReader on the registered import source and stores it in
 * $this->reader.
 *
 * The entity loader is enabled only for the duration of the open call and
 * is restored in every case, including when open itself throws.
 *
 * @throws MWException If the reader cannot be opened
 */
private function openReader() {
	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( false );

	try {
		$uri = 'uploadsource://' . $this->sourceAdapterId;

		if ( PHP_VERSION_ID >= 80000 ) {
			// A static call is now preferred, and avoids https://github.com/php/php-src/issues/11548
			$reader = XMLReader::open( $uri, null, LIBXML_PARSEHUGE );
			$status = $reader instanceof XMLReader;
			if ( $status ) {
				$this->reader = $reader;
			}
		} else {
			// A static call generated a deprecation warning prior to PHP 8.0
			$this->reader = new XMLReader;
			$status = $this->reader->open( $uri, null, LIBXML_PARSEHUGE );
		}

		if ( !$status ) {
			// libxml_get_last_error() returns false when libxml recorded nothing
			$error = libxml_get_last_error();
			$detail = $error ? $error->message : 'unknown error';
			throw new MWException(
				'Encountered an internal error while initializing WikiImporter object: ' . $detail
			);
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
	}
}
1373
/**
 * Reads the whole input once to detect XML errors before anything is
 * imported, then rewinds the source and reopens the reader.
 *
 * Does nothing for sources that cannot be rewound.
 *
 * @throws MWException If libxml reported an error while reading
 */
private function syntaxCheckXML() {
	if ( !UploadSourceAdapter::isSeekableSource( $this->sourceAdapterId ) ) {
		return;
	}
	AtEase::suppressWarnings();
	// As in doImport(), reads of untrusted XML must run with the entity
	// loader disabled to avoid local file inclusion attacks (T48932).
	// The previous code passed false here, which enabled it.
	$oldDisable = libxml_disable_entity_loader( true );
	try {
		// Drain the document; only the error state afterwards matters
		while ( $this->reader->read() ) {
		}
		$error = libxml_get_last_error();
		if ( $error ) {
			$errorMessage = 'XML error at line ' . $error->line . ': ' . $error->message;
			wfDebug( __METHOD__ . ': Invalid xml found - ' . $errorMessage );
			throw new MWException( $errorMessage );
		}
	} finally {
		libxml_disable_entity_loader( $oldDisable );
		AtEase::restoreWarnings();
		$this->reader->close();
	}

	// Reopen for the real import
	UploadSourceAdapter::seekSource( $this->sourceAdapterId, 0 );
	$this->openReader();
}
1401}
const NS_MAIN
Definition Defines.php:64
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTempDir()
Tries to get the system directory for temporary files.
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest. If source is NULL, it just returns the val...
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Class to parse and build external user names.
Reporting callback.
Base class for language-specific code.
Definition Language.php:56
Exception representing a failure to serialize or unserialize a content object.
MediaWiki exception.
Helper class for mapping value objects representing basic entities to cache keys.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Service for creating WikiPage objects.
A service class for checking permissions. To obtain an instance, use MediaWikiServices::getInstance()-...
Value object representing a content slot associated with a page revision.
A registry service for SlotRoleHandlers, used to define which slot roles are available on which page.
Creates Title objects.
Represents a title within MediaWiki.
Definition Title.php:82
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them ba...
static getMain()
Get the RequestContext object associated with the main request.
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
static seekSource(string $id, int $offset)
static isSeekableSource(string $id)
static registerSource(ImportSource $source)
XML file reader for the page data importer.
setImportUploads( $import)
doImport()
Primary entry point.
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
setNoUpdates( $noupdates)
Set 'no updates' mode.
setLogItemCallback( $callback)
Sets the action to perform as each log item reached.
importUpload( $revision)
Dummy for now...
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
setPageOffset( $nthPage)
Sets 'pageOffset' value.
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
importLogItem( $revision)
Default per-revision callback, performs the import.
setImageBasePath( $dir)
throwXmlError( $err)
setDebug( $debug)
Set debug mode...
__construct(ImportSource $source, Config $config, HookContainer $hookContainer, Language $contentLanguage, NamespaceInfo $namespaceInfo, TitleFactory $titleFactory, WikiPageFactory $wikiPageFactory, UploadRevisionImporter $uploadRevisionImporter, PermissionManager $permissionManager, IContentHandlerFactory $contentHandlerFactory, SlotRoleRegistry $slotRoleRegistry)
Creates an ImportXMLReader drawing from the source provided.
notice( $msg,... $params)
importRevision( $revision)
Default per-revision callback, performs the import.
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
setNoticeCallback( $callback)
Set a callback that displays notice messages.
finishImportPage(PageIdentity $pageIdentity, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
disableStatisticsUpdate()
Statistics updates can take a lot of time.
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
Represents a revision, log entry or upload during the import process.
Interface for configuration instances.
Definition Config.php:30
Source interface for XML import.
Represents an object that can convert page titles on a foreign wiki (ForeignTitle objects) into page ...
Interface for objects (potentially) representing an editable wiki page.
$source
$content
Definition router.php:76