MediaWiki master
WikiImporter.php
Go to the documentation of this file.
1<?php
55use Wikimedia\AtEase\AtEase;
58use Wikimedia\NormalizedException\NormalizedException;
60
/** @var XMLReader Reader over the registered source; (re)assigned by openReader() */
private $reader;

/** Identifier returned by UploadSourceAdapter::registerSource() for the import source */
private $sourceAdapterId;

/** @var array|null Foreign namespace names collected by handleSiteInfo(); null until one is read */
private $foreignNamespaces = null;

/** Callback invoked by logItemCallback(); see setLogItemCallback() */
private $mLogItemCallback;

/** Callback invoked by processUpload(); see setUploadCallback() */
private $mUploadCallback;

/** Callback invoked by revisionCallback(); see setRevisionCallback() */
private $mRevisionCallback;

/** Callback invoked by pageCallback(); see setPageCallback() */
private $mPageCallback;

/** Callback invoked by siteInfoCallback(); see setSiteInfoCallback() */
private $mSiteInfoCallback;

/** Callback invoked by pageOutCallback(); see setPageOutCallback() */
private $mPageOutCallback;

/** Callback invoked by notice() when callable; see setNoticeCallback() */
private $mNoticeCallback;

/** Debug flag: debug() only logs when this is truthy */
private $mDebug;

/** When truthy, handleUpload() hands parsed uploads to processUpload() */
private $mImportUploads;

/** Base directory that handleUpload() joins with an upload's "rel" value */
private $mImageBasePath;

/** Value forwarded to WikiRevision::setNoUpdates() for every imported item */
private $mNoUpdates = false;

/** Page offset used by doImport() to skip leading pages; 0 disables skipping */
private $pageOffset = 0;

private ImportTitleFactory $importTitleFactory;
private ExternalUserNames $externalUserNames;

/** @var array isCountable() results recorded by beforeImportPage(), keyed by a 'title_'-prefixed string */
private $countableCache = [];

/** When true, finishImportPage() skips its article count adjustment */
private $disableStatisticsUpdate = false;

/** Authority used for the edit check in processTitle() and for newPageUpdater() */
private Authority $performer;

private Config $config;
private HookRunner $hookRunner;
private Language $contentLanguage;
private NamespaceInfo $namespaceInfo;
private TitleFactory $titleFactory;
private WikiPageFactory $wikiPageFactory;
private UploadRevisionImporter $uploadRevisionImporter;
private IContentHandlerFactory $contentHandlerFactory;
private SlotRoleRegistry $slotRoleRegistry;
139
/**
 * Builds an importer that reads from the given source.
 *
 * Stores the injected services, makes sure the "uploadsource" stream wrapper
 * is registered, registers $source with UploadSourceAdapter, opens the XML
 * reader and installs the default callbacks, title factory and username
 * prefix handling.
 *
 * NOTE(review): the line declaring $source was missing from this listing even
 * though the body uses it; it is restored here as the first parameter with the
 * upstream ImportSource type - confirm against callers.
 *
 * @param ImportSource $source Source handed to UploadSourceAdapter::registerSource()
 * @param Authority $performer
 * @param Config $config
 * @param HookContainer $hookContainer Wrapped in a HookRunner
 * @param Language $contentLanguage
 * @param NamespaceInfo $namespaceInfo
 * @param TitleFactory $titleFactory
 * @param WikiPageFactory $wikiPageFactory
 * @param UploadRevisionImporter $uploadRevisionImporter
 * @param IContentHandlerFactory $contentHandlerFactory
 * @param SlotRoleRegistry $slotRoleRegistry
 */
public function __construct(
	ImportSource $source,
	Authority $performer,
	Config $config,
	HookContainer $hookContainer,
	Language $contentLanguage,
	NamespaceInfo $namespaceInfo,
	TitleFactory $titleFactory,
	WikiPageFactory $wikiPageFactory,
	UploadRevisionImporter $uploadRevisionImporter,
	IContentHandlerFactory $contentHandlerFactory,
	SlotRoleRegistry $slotRoleRegistry
) {
	$this->performer = $performer;
	$this->config = $config;
	$this->hookRunner = new HookRunner( $hookContainer );
	$this->contentLanguage = $contentLanguage;
	$this->namespaceInfo = $namespaceInfo;
	$this->titleFactory = $titleFactory;
	$this->wikiPageFactory = $wikiPageFactory;
	$this->uploadRevisionImporter = $uploadRevisionImporter;
	$this->contentHandlerFactory = $contentHandlerFactory;
	$this->slotRoleRegistry = $slotRoleRegistry;

	// The reader opens the source through the uploadsource:// stream wrapper,
	// so register it once per process.
	if ( !in_array( 'uploadsource', stream_get_wrappers(), true ) ) {
		stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
	}
	$this->sourceAdapterId = UploadSourceAdapter::registerSource( $source );

	$this->openReader();

	// Default callbacks
	$this->setPageCallback( [ $this, 'beforeImportPage' ] );
	$this->setRevisionCallback( [ $this, "importRevision" ] );
	$this->setUploadCallback( [ $this, 'importUpload' ] );
	$this->setLogItemCallback( [ $this, 'importLogItem' ] );
	$this->setPageOutCallback( [ $this, 'finishImportPage' ] );

	$this->importTitleFactory = new NaiveImportTitleFactory(
		$this->contentLanguage,
		$this->namespaceInfo,
		$this->titleFactory
	);
	$this->externalUserNames = new ExternalUserNames( 'imported', false );
}
188
/**
 * Returns the reader object currently held by this importer.
 *
 * @return XMLReader
 */
public function getReader() {
	$current = $this->reader;
	return $current;
}
195
/**
 * Reports an XML error to the debug log.
 *
 * Despite its name, this method does not throw: it logs through debug()
 * (only when the debug flag is set) and then unconditionally through wfDebug().
 *
 * @param string $err Error text
 */
public function throwXmlError( $err ) {
	$this->debug( "FAILURE: $err" );

	wfDebug( "WikiImporter XML error: $err" );
}
203
/**
 * Writes a line to the debug log, but only while the debug flag is set.
 *
 * @param string $data Text to log
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}

	wfDebug( "IMPORT: $data" );
}
212
/**
 * Writes a line to the debug log regardless of the debug flag.
 *
 * @param string $data Text to log
 */
public function warn( $data ) {
	$line = "IMPORT: $data";
	wfDebug( $line );
}
219
/**
 * Emits a notice, either through the registered notice callback or, when
 * none is callable, through the debug log.
 *
 * The callback receives the message key and the parameters as one array.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters
 */
public function notice( $msg, ...$params ) {
	$callback = $this->mNoticeCallback;
	if ( is_callable( $callback ) ) {
		$callback( $msg, $params );
		return;
	}

	# No ImportReporter -> CLI
	// T177997: the command line importers should call setNoticeCallback()
	// for their own custom callback to echo the notice
	wfDebug( wfMessage( $msg, $params )->text() );
}
235
/**
 * Stores the debug flag; debug() only logs while it is truthy.
 *
 * @param bool $debug
 */
public function setDebug( $debug ) {
	$this->mDebug = $debug;
}
243
/**
 * Stores the value later passed to WikiRevision::setNoUpdates() for each
 * revision, upload and log item built by this importer.
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
251
/**
 * Stores the page offset used by doImport(): top-level nodes are skipped
 * until the running count of <page> elements reaches this value.
 *
 * @param int $nthPage
 */
public function setPageOffset( $nthPage ) {
	$this->pageOffset = $nthPage;
}
261
/**
 * Sets the callback that notice() invokes, delegating to wfSetVar().
 *
 * NOTE(review): wfSetVar() appears to leave the stored value untouched when
 * $callback is null - confirm before relying on null to clear the callback.
 *
 * @param callable $callback
 * @return mixed Whatever wfSetVar() returns (the previous value)
 */
public function setNoticeCallback( $callback ) {
	$previous = wfSetVar( $this->mNoticeCallback, $callback );
	return $previous;
}
271
/**
 * Replaces the callback that pageCallback() invokes.
 *
 * @param callable $callback
 * @return callable|null The callback that was set before this call
 */
public function setPageCallback( $callback ) {
	$replaced = $this->mPageCallback;
	$this->mPageCallback = $callback;

	return $replaced;
}
282
/**
 * Replaces the callback that pageOutCallback() invokes once a page has been
 * fully read.
 *
 * @param callable $callback
 * @return callable|null The callback that was set before this call
 */
public function setPageOutCallback( $callback ) {
	$replaced = $this->mPageOutCallback;
	$this->mPageOutCallback = $callback;

	return $replaced;
}
297
/**
 * Replaces the callback that revisionCallback() invokes for each revision.
 *
 * @param callable $callback
 * @return callable|null The callback that was set before this call
 */
public function setRevisionCallback( $callback ) {
	$replaced = $this->mRevisionCallback;
	$this->mRevisionCallback = $callback;

	return $replaced;
}
308
/**
 * Replaces the callback that processUpload() invokes for each upload.
 *
 * @param callable $callback
 * @return callable|null The callback that was set before this call
 */
public function setUploadCallback( $callback ) {
	$replaced = $this->mUploadCallback;
	$this->mUploadCallback = $callback;

	return $replaced;
}
319
/**
 * Replaces the callback that logItemCallback() invokes for each log item.
 *
 * @param callable $callback
 * @return callable|null The callback that was set before this call
 */
public function setLogItemCallback( $callback ) {
	$replaced = $this->mLogItemCallback;
	$this->mLogItemCallback = $callback;

	return $replaced;
}
330
/**
 * Replaces the callback that siteInfoCallback() invokes with the parsed
 * <siteinfo> data.
 *
 * @param callable $callback
 * @return callable|null The callback that was set before this call
 */
public function setSiteInfoCallback( $callback ) {
	$replaced = $this->mSiteInfoCallback;
	$this->mSiteInfoCallback = $callback;

	return $replaced;
}
341
/**
 * Sets the factory that processTitle() uses to turn foreign titles into
 * local ones.
 *
 * @param ImportTitleFactory $factory
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
350
/**
 * Chooses the import title factory according to a target namespace.
 *
 * - null: pages keep their own namespaces (NaiveImportTitleFactory).
 * - a non-negative value for which NamespaceInfo::exists() is true: every
 *   page is mapped through a NamespaceImportTitleFactory for that namespace.
 * - anything else: nothing is changed.
 *
 * NOTE(review): the two "$this->setImportTitleFactory( new ...(" openers were
 * missing from this listing and are restored from the surviving argument
 * lists; the NamespaceImportTitleFactory class name follows upstream - confirm.
 *
 * @param null|int $namespace
 * @return bool False if the namespace is negative or does not exist
 */
public function setTargetNamespace( $namespace ) {
	if ( $namespace === null ) {
		// Don't override namespaces
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$this->namespaceInfo,
				$this->titleFactory
			)
		);
		return true;
	}

	if (
		$namespace >= 0 &&
		$this->namespaceInfo->exists( intval( $namespace ) )
	) {
		$namespace = intval( $namespace );
		$this->setImportTitleFactory(
			new NamespaceImportTitleFactory(
				$this->namespaceInfo,
				$this->titleFactory,
				$namespace
			)
		);
		return true;
	}

	return false;
}
384
/**
 * Chooses the import title factory according to a root page under which
 * all imported pages should be placed.
 *
 * - null: no root page (NaiveImportTitleFactory).
 * - '': nothing is changed and a good status is returned.
 * - otherwise the trailing slashes are trimmed and the text is parsed as a
 *   title; an unparsable or external title, or a namespace without
 *   subpages, yields a fatal status.
 *
 * NOTE(review): the two "$this->setImportTitleFactory( new ...(" openers were
 * missing from this listing and are restored from the surviving argument
 * lists; the SubpageImportTitleFactory class name follows upstream - confirm.
 *
 * @param null|string $rootpage
 * @return Status Good status, or fatal with an import-rootpage-* message
 */
public function setTargetRootPage( $rootpage ) {
	$status = Status::newGood();
	$nsInfo = $this->namespaceInfo;

	if ( $rootpage === null ) {
		// No rootpage
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$nsInfo,
				$this->titleFactory
			)
		);
		return $status;
	}

	if ( $rootpage === '' ) {
		return $status;
	}

	$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
	$title = Title::newFromText( $rootpage );

	if ( !$title || $title->isExternal() ) {
		$status->fatal( 'import-rootpage-invalid' );
	} elseif ( !$nsInfo->hasSubpages( $title->getNamespace() ) ) {
		$displayNSText = $title->getNamespace() === NS_MAIN
			? wfMessage( 'blanknamespace' )->text()
			: $this->contentLanguage->getNsText( $title->getNamespace() );
		$status->fatal( 'import-rootpage-nosubpage', $displayNSText );
	} else {
		// set namespace to 'all', so the namespace check in processTitle() can pass
		$this->setTargetNamespace( null );
		$this->setImportTitleFactory(
			new SubpageImportTitleFactory(
				$nsInfo,
				$this->titleFactory,
				$title
			)
		);
	}

	return $status;
}
427
/**
 * Sets the base directory that handleUpload() combines with an upload's
 * "rel" value to locate the file on disk.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
434
/**
 * Sets whether handleUpload() passes parsed uploads on to processUpload().
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
441
/**
 * Replaces the ExternalUserNames helper used to prefix imported usernames.
 *
 * @param string $usernamePrefix First argument of the ExternalUserNames constructor
 * @param bool $assignKnownUsers Second argument of the ExternalUserNames constructor
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$helper = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
	$this->externalUserNames = $helper;
}
450
/**
 * Turns off the article count adjustment that finishImportPage() would
 * otherwise perform. There is no way to turn it back on.
 */
public function disableStatisticsUpdate() {
	$this->disableStatisticsUpdate = true;
}
458
/**
 * Default per-page callback: records whether the page is countable before
 * any of its revisions are imported.
 *
 * @param array $titleAndForeignTitle Two-element array; element 0 is the local Title
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$title = $titleAndForeignTitle[0];
	$page = $this->wikiPageFactory->newFromTitle( $title );

	// The key must have the same format that finishImportPage() uses for its
	// lookup ('title_' . CacheKeyHelper::getKeyForPage()); a key built from the
	// prefixed text would not be found again and the article count would never
	// be adjusted.
	$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $title );
	$this->countableCache[$countKey] = $page->isCountable();

	return true;
}
471
/**
 * Default revision callback: imports one revision.
 *
 * A notice is emitted and false returned when the revision's content
 * handler cannot be used on the target title, or when importing raises
 * MWContentSerializationException.
 *
 * @param WikiRevision $revision
 * @return bool|mixed False on failure, otherwise the result of importOldRevision()
 */
public function importRevision( $revision ) {
	// Both failure paths report the same four details about the revision.
	$report = function ( $messageKey ) use ( $revision ) {
		$this->notice( $messageKey,
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);
	};

	if ( !$revision->getContentHandler()->canBeUsedOn( $revision->getTitle() ) ) {
		$report( 'import-error-bad-location' );
		return false;
	}

	try {
		return $revision->importOldRevision();
	} catch ( MWContentSerializationException $ex ) {
		$report( 'import-error-unserialize' );
		return false;
	}
}
502
/**
 * Default log item callback: delegates to the revision object.
 *
 * @param WikiRevision $revision
 * @return mixed Result of WikiRevision::importLogItem()
 */
public function importLogItem( $revision ) {
	$result = $revision->importLogItem();
	return $result;
}
511
/**
 * Default upload callback: imports the upload through the injected
 * UploadRevisionImporter.
 *
 * @param WikiRevision $revision
 * @return bool Whether the returned status is good
 */
public function importUpload( $revision ) {
	return $this->uploadRevisionImporter->import( $revision )->isGood();
}
521
/**
 * Default page-out callback: adjusts the article count if needed and runs
 * the AfterImportPage hook.
 *
 * @param PageIdentity $pageIdentity Local page that was imported
 * @param mixed $foreignTitle Passed through to the hook
 * @param int $revCount Passed through to the hook
 * @param int $sRevCount Passed through to the hook
 * @param array $pageInfo Passed through to the hook
 * @return mixed Result of the AfterImportPage hook runner
 */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$wikiPage = $this->wikiPageFactory->newFromTitle( $pageIdentity );
		$wikiPage->loadPageData( IDBAccessObject::READ_LATEST );

		if ( $wikiPage->getRevisionRecord() === null ) {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
				' because WikiPage::getRevisionRecord() returned null' );
		} else {
			$prepared = $wikiPage->newPageUpdater( $this->performer )->prepareUpdate();
			$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $pageIdentity );
			$isCountableNow = $prepared->isCountable();

			// Only touch site stats when countability changed since the page callback ran.
			$wasRecorded = array_key_exists( $countKey, $this->countableCache );
			if ( $wasRecorded && $isCountableNow != $this->countableCache[$countKey] ) {
				$delta = (int)$isCountableNow - (int)$this->countableCache[$countKey];
				DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
					'articles' => $delta
				] ) );
			}
		}
	}

	return $this->hookRunner->onAfterImportPage(
		Title::newFromPageIdentity( $pageIdentity ),
		$foreignTitle,
		$revCount,
		$sRevCount,
		$pageInfo
	);
}
566
/**
 * Passes the parsed site info to the registered callback, if any.
 *
 * @param array $siteInfo
 * @return mixed|false Callback result, or false when no callback is set
 */
private function siteInfoCallback( $siteInfo ) {
	if ( !$this->mSiteInfoCallback ) {
		return false;
	}

	return ( $this->mSiteInfoCallback )( $siteInfo, $this );
}
579
/**
 * Notifies the registered page callback, if any, that a page is starting.
 * The callback's return value is ignored.
 *
 * @param array $title Value handed to the callback (handlePage() passes the
 *   two-element array returned by processTitle())
 */
public function pageCallback( $title ) {
	if ( !$this->mPageCallback ) {
		return;
	}

	( $this->mPageCallback )( $title );
}
589
/**
 * Notifies the registered page-out callback, if any, that a page has been
 * completely read. The callback's return value is ignored.
 *
 * @param PageIdentity $pageIdentity
 * @param mixed $foreignTitle
 * @param int $revCount
 * @param int $sucCount
 * @param array $pageInfo
 */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sucCount, $pageInfo
) {
	if ( !$this->mPageOutCallback ) {
		return;
	}

	( $this->mPageOutCallback )( $pageIdentity, $foreignTitle, $revCount, $sucCount, $pageInfo );
}
604
/**
 * Passes a revision to the registered revision callback, if any.
 *
 * @param WikiRevision $revision
 * @return mixed|false Callback result, or false when no callback is set
 */
private function revisionCallback( $revision ) {
	if ( !$this->mRevisionCallback ) {
		return false;
	}

	return ( $this->mRevisionCallback )( $revision, $this );
}
617
/**
 * Passes a log item to the registered log item callback, if any.
 *
 * @param WikiRevision $revision
 * @return mixed|false Callback result, or false when no callback is set
 */
private function logItemCallback( $revision ) {
	if ( !$this->mLogItemCallback ) {
		return false;
	}

	return ( $this->mLogItemCallback )( $revision, $this );
}
630
/**
 * Reads an attribute of the node the reader is currently on.
 *
 * @param string $attr Attribute name
 * @return string Attribute value, or '' when getAttribute() yields null
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );

	return $value ?? '';
}
640
/**
 * Collects the text content of the current element.
 *
 * Text, CDATA and significant-whitespace nodes are concatenated until the
 * first end-element node is reached. If the reader runs out of input first,
 * it is closed and '' is returned.
 *
 * @return string
 */
public function nodeContents() {
	if ( $this->reader->isEmptyElement ) {
		return "";
	}

	$textTypes = [
		XMLReader::TEXT,
		XMLReader::CDATA,
		XMLReader::SIGNIFICANT_WHITESPACE,
	];

	$collected = "";
	while ( $this->reader->read() ) {
		$nodeType = $this->reader->nodeType;

		if ( $nodeType == XMLReader::END_ELEMENT ) {
			return $collected;
		}
		if ( in_array( $nodeType, $textTypes ) ) {
			$collected .= $this->reader->value;
		}
	}

	// Input ended before the closing tag was seen.
	$this->reader->close();
	return '';
}
668
/**
 * Runs the import: validates the XML where possible, checks for the
 * <mediawiki> root element and dispatches each top-level node to the
 * matching handler.
 *
 * The reader is closed when this method finishes, whether it returns or throws.
 *
 * @return bool Always true when no exception is thrown
 * @throws NormalizedException When the root element is wrong and libxml reported an error
 * @throws UnexpectedValueException When the root element is wrong without a libxml error
 */
public function doImport() {
	$this->syntaxCheckXML();

	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (T48932).
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( true );
	try {
		$this->reader->read();

		if ( $this->reader->localName !== 'mediawiki' ) {
			// phpcs:ignore Generic.PHP.NoSilencedErrors
			@libxml_disable_entity_loader( $oldDisable );
			$error = libxml_get_last_error();
			if ( !$error ) {
				throw new UnexpectedValueException(
					"Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
				);
			}
			throw new NormalizedException( "XML error at line {line}: {message}", [
				'line' => $error->line,
				'message' => $error->message,
			] );
		}
		$this->debug( "<mediawiki> tag is correct." );

		$this->debug( "Starting primary dump processing loop." );

		$pagesSeen = 0;
		$more = $this->reader->read();
		while ( $more ) {
			$tag = $this->reader->localName;

			// Honour the page offset: skip whole subtrees until enough pages were seen.
			if ( $this->pageOffset ) {
				if ( $tag === 'page' ) {
					$pagesSeen++;
				}
				if ( $pagesSeen < $this->pageOffset ) {
					$more = $this->reader->next();
					continue;
				}
			}
			$type = $this->reader->nodeType;

			$skipSubtree = false;
			if ( !$this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
				// Do nothing
			} elseif ( $tag === 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
				break;
			} elseif ( $tag === 'siteinfo' ) {
				$this->handleSiteInfo();
			} elseif ( $tag === 'page' ) {
				$this->handlePage();
			} elseif ( $tag === 'logitem' ) {
				$this->handleLogItem();
			} elseif ( $tag !== '#text' ) {
				$this->warn( "Unhandled top-level XML tag $tag" );

				$skipSubtree = true;
			}

			if ( $skipSubtree ) {
				$more = $this->reader->next();
				$this->debug( "Skip" );
			} else {
				$more = $this->reader->read();
			}
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
		$this->reader->close();
	}

	return true;
}
752
/**
 * Reads a <siteinfo> element and hands the collected data to
 * siteInfoCallback().
 *
 * Namespace names are stored in $this->foreignNamespaces keyed by their
 * "key" attribute and are also passed along under the '_namespaces' key.
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );
	$collected = [];

	// Fields that can just be stuffed in the siteInfo object
	$simpleFields = [ 'sitename', 'base', 'generator', 'case' ];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'siteinfo';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( $tag == 'namespace' ) {
			// Read the attribute before consuming the element's text.
			$nsKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$nsKey] = $this->nodeContents();
		} elseif ( in_array( $tag, $simpleFields ) ) {
			$collected[$tag] = $this->nodeContents();
		}
	}

	$collected['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $collected );
}
779
/**
 * Reads a <logitem> element into an array and passes it to processLogItem().
 *
 * Each child node is first offered to the ImportHandleLogItemXMLTag hook;
 * when the hook returns false the node is not processed further here.
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );
	$logInfo = [];

	// Fields that can just be stuffed in the pageInfo object
	$simpleFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'logitem';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $simpleFields ) ) {
			$logInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$logInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $logInfo );
}
809
/**
 * Builds a WikiRevision from parsed log item data and passes it to
 * logItemCallback().
 *
 * The 'type' and 'action' entries are read unconditionally; the other
 * entries are applied only when set. Without a contributor username or IP,
 * a prefixed 'Unknown user' name is used.
 *
 * @param array $logInfo
 * @return mixed|false Result of logItemCallback()
 */
private function processLogItem( $logInfo ) {
	$item = new WikiRevision();

	if ( isset( $logInfo['id'] ) ) {
		$item->setID( $logInfo['id'] );
	}
	$item->setType( $logInfo['type'] );
	$item->setAction( $logInfo['action'] );
	if ( isset( $logInfo['timestamp'] ) ) {
		$item->setTimestamp( $logInfo['timestamp'] );
	}
	if ( isset( $logInfo['params'] ) ) {
		$item->setParams( $logInfo['params'] );
	}
	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$logTitle = Title::newFromText( $logInfo['logtitle'] );
		$item->setTitle( $logTitle );
	}

	$item->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$item->setComment( $logInfo['comment'] );
	}

	if ( isset( $logInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix( $logInfo['contributor']['username'] );
		$item->setUsername( $prefixedName );
	} elseif ( isset( $logInfo['contributor']['ip'] ) ) {
		$item->setUserIP( $logInfo['contributor']['ip'] );
	} else {
		$item->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	return $this->logItemCallback( $item );
}
852
/**
 * Reads a <page> element: collects the simple page fields, resolves the
 * title when the first <revision> or <upload> is met, dispatches each
 * revision/upload to its handler and finally calls pageOutCallback() if a
 * valid title was resolved.
 *
 * Once the title has been rejected, every remaining child of the page is
 * skipped.
 */
private function handlePage() {
	// Handle page data.
	$this->debug( "Enter page handler." );
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Fields that can just be stuffed in the pageInfo object
	$simpleFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	$skipSubtree = false;
	$titleRejected = false;

	while ( $skipSubtree ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'page'
		) {
			break;
		}

		$skipSubtree = false;
		$tag = $this->reader->localName;

		if ( $titleRejected ) {
			// The title is invalid, bail out of this page
			$skipSubtree = true;
			continue;
		}

		if ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $simpleFields ) ) {
			// An XML snippet:
			// <page>
			//     <id>123</id>
			//     <title>Page</title>
			//     <redirect title="NewTitle"/>
			//     ...
			// Because the redirect tag is built differently, we need special handling for that case.
			$pageInfo[$tag] = $tag == 'redirect'
				? $this->nodeAttribute( 'title' )
				: $this->nodeContents();
		} elseif ( $tag == 'revision' || $tag == 'upload' ) {
			if ( !isset( $titles ) ) {
				$titles = $this->processTitle( $pageInfo['title'],
					$pageInfo['ns'] ?? null );

				// $titles is either an array of two titles or false.
				if ( is_array( $titles ) ) {
					$this->pageCallback( $titles );
					$pageInfo['_title'] = $titles[0];
					$foreignTitle = $titles[1];
				} else {
					$titleRejected = true;
					$skipSubtree = true;
				}
			}

			if ( $titles ) {
				if ( $tag == 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled page XML tag $tag" );
			$skipSubtree = true;
		}
	}

	// @note $pageInfo is only set if a valid title is processed above with
	// no error. If we have a valid title, then pageCallback is called
	// above, $pageInfo['title'] is set and we do pageOutCallback here.
	// If $pageInfo['_title'] is not set, then $foreignTitle is also not
	// set since they both come from the processTitle() result above.
	if ( !array_key_exists( '_title', $pageInfo ) ) {
		return;
	}

	$this->pageOutCallback(
		$pageInfo['_title'],
		// @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
		$foreignTitle,
		$pageInfo['revisionCount'],
		$pageInfo['successfulRevisionCount'],
		$pageInfo
	);
}
938
/**
 * Reads a <revision> element, then updates the page's revision counters
 * and passes the collected data to processRevision().
 *
 * Each child node is first offered to the ImportHandleRevisionXMLTag hook;
 * when the hook returns false the node is not processed further here.
 *
 * @param array &$pageInfo Page data; 'revisionCount' is always incremented,
 *   'successfulRevisionCount' only when processRevision() returns a truthy value
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );
	$revisionInfo = [];

	$normalFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
		'model', 'format', 'text', 'sha1' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'revision'
		) {
			break;
		}

		// Skipping applies to one unknown element only. Without this reset the
		// flag stayed set for the rest of the revision, so every later step used
		// next() instead of read() (handlePage() already resets it this way).
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleRevisionXMLTag(
			$this, $pageInfo, $revisionInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$revisionInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'content' ) {
			// We can have multiple content tags, so make this an array.
			$revisionInfo[$tag][] = $this->handleContent();
		} elseif ( $tag == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			$skip = true;
		}
	}

	$pageInfo['revisionCount']++;
	if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
981
/**
 * Reads a <content> element (one slot of a revision) into an array.
 *
 * Each child node is first offered to the ImportHandleContentXMLTag hook;
 * when the hook returns false the node is not processed further here.
 *
 * @return array Collected fields; possible keys are 'role', 'origin',
 *   'model', 'format' and 'text', plus anything a hook handler added
 */
private function handleContent(): array {
	$this->debug( "Enter content handler" );
	$contentInfo = [];

	$normalFields = [ 'role', 'origin', 'model', 'format', 'text' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'content'
		) {
			break;
		}

		// Skipping applies to one unknown element only. Without this reset the
		// flag stayed set for the rest of the element, so every later step used
		// next() instead of read() (handlePage() already resets it this way).
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleContentXMLTag(
			$this, $contentInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$contentInfo[$tag] = $this->nodeContents();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled content XML tag $tag" );
			$skip = true;
		}
	}

	return $contentInfo;
}
1012
/**
 * Turns parsed slot data into a content object.
 *
 * @param PageIdentity $page Page used to pick the default model when none is given
 * @param int $revisionId Only used in the size error message; falsy means unknown
 * @param array $contentInfo Needs 'text'; may have 'model' and 'role'
 * @return mixed Result of the content handler's unserializeContent()
 * @throws InvalidArgumentException When 'text' is not set
 * @throws RuntimeException When a size-checked model exceeds MaxArticleSize
 */
private function makeContent( PageIdentity $page, $revisionId, $contentInfo ) {
	$maxArticleSize = $this->config->get( MainConfigNames::MaxArticleSize );

	if ( !isset( $contentInfo['text'] ) ) {
		throw new InvalidArgumentException( 'Missing text field in import.' );
	}

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Testing for revisions with only listed
	// content models, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	$sizeCheckedModels = [
		'wikitext',
		'css',
		'json',
		'javascript',
		'text',
		''
	];
	$subjectToLimit = !isset( $contentInfo['model'] ) ||
		in_array( $contentInfo['model'], $sizeCheckedModels );

	if ( $subjectToLimit && strlen( $contentInfo['text'] ) > $maxArticleSize * 1024 ) {
		$which = $revisionId ?
			"the revision with ID $revisionId" :
			'a revision';
		throw new RuntimeException( 'The text of ' . $which .
			" exceeds the maximum allowable size ({$maxArticleSize} KiB)" );
	}

	$role = $contentInfo['role'] ?? SlotRecord::MAIN;
	if ( isset( $contentInfo['model'] ) ) {
		$model = $contentInfo['model'];
	} else {
		// No explicit model: ask the slot's role handler for the page default.
		$model = $this->slotRoleRegistry
			->getRoleHandler( $role )
			->getDefaultModel( $page );
	}
	$handler = $this->contentHandlerFactory->getContentHandler( $model );

	return $handler->unserializeContent(
		$handler->importTransform( $contentInfo['text'] )
	);
}
1059
/**
 * Builds a WikiRevision from parsed revision data and passes it to
 * revisionCallback().
 *
 * The main slot is built from the revision's own fields; every entry of
 * 'content' adds one more slot and must name its role. A missing timestamp
 * falls back to the current time, a missing contributor to a prefixed
 * 'Unknown user'.
 *
 * @param array $pageInfo Must contain '_title'
 * @param array $revisionInfo
 * @return mixed|false Result of revisionCallback()
 * @throws RuntimeException When a 'content' entry has no role
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	$rev = new WikiRevision();

	$revId = $revisionInfo['id'] ?? 0;
	if ( $revId ) {
		$rev->setID( $revId );
	}

	$title = $pageInfo['_title'];
	$rev->setTitle( $title );

	$mainContent = $this->makeContent( $title, $revId, $revisionInfo );
	$rev->setContent( SlotRecord::MAIN, $mainContent );

	$extraSlots = $revisionInfo['content'] ?? [];
	foreach ( $extraSlots as $slotInfo ) {
		if ( !isset( $slotInfo['role'] ) ) {
			throw new RuntimeException( "Missing role for imported slot." );
		}

		$rev->setContent(
			$slotInfo['role'],
			$this->makeContent( $title, $revId, $slotInfo )
		);
	}
	$rev->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$rev->setComment( $revisionInfo['comment'] );
	}

	// Only the presence of 'minor' matters; its value is not inspected.
	if ( isset( $revisionInfo['minor'] ) ) {
		$rev->setMinor( true );
	}

	if ( isset( $revisionInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix(
			$revisionInfo['contributor']['username']
		);
		$rev->setUsername( $prefixedName );
	} elseif ( isset( $revisionInfo['contributor']['ip'] ) ) {
		$rev->setUserIP( $revisionInfo['contributor']['ip'] );
	} else {
		$rev->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	if ( isset( $revisionInfo['sha1'] ) ) {
		$rev->setSha1Base36( $revisionInfo['sha1'] );
	}
	$rev->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $rev );
}
1112
/**
 * Reads an <upload> element and, when upload importing is enabled, passes
 * the collected data to processUpload().
 *
 * Inline <contents> with encoding="base64" is decoded into a temporary
 * file. When an image base path is set and the upload has a 'rel' value
 * pointing at an existing file, that file takes precedence as the source.
 *
 * @param array &$pageInfo Page data, passed to the hook and to processUpload()
 * @return mixed|null Result of processUpload(), or null when uploads are not imported
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );
	$uploadInfo = [];

	$normalFields = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'upload'
		) {
			break;
		}

		// Skipping applies to one unknown element only. Without this reset the
		// flag stayed set for the rest of the upload, so every later step used
		// next() instead of read() (handlePage() already resets it this way).
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$uploadInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contents' ) {
			// Read the attribute while the reader is still on the opening tag;
			// nodeContents() leaves it on the closing tag.
			$encoding = $this->reader->getAttribute( 'encoding' );
			$contents = $this->nodeContents();
			if ( $encoding === 'base64' ) {
				$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
				$uploadInfo['isTempSrc'] = true;
			}
		} elseif ( $tag == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$skip = true;
		}
	}

	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		// NOTE(review): 'rel' comes straight from the dump and is not checked
		// for "../" segments before being joined to the base path - confirm the
		// dump is trusted wherever an image base path is configured.
		$path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
		if ( file_exists( $path ) ) {
			$uploadInfo['fileSrc'] = $path;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	if ( $this->mImportUploads ) {
		return $this->processUpload( $pageInfo, $uploadInfo );
	}
}
1165
/**
 * Writes the given data to a new temporary file.
 *
 * @param string $contents Data to write
 * @return string|false Whatever tempnam() returned for the new file
 */
private function dumpTemp( $contents ) {
	$tempPath = tempnam( wfTempDir(), 'importupload' );
	file_put_contents( $tempPath, $contents );

	return $tempPath;
}
1175
/**
 * Builds a WikiRevision from parsed upload data and passes it to the
 * upload callback.
 *
 * 'timestamp', 'filename', 'src', 'size' and 'comment' are read
 * unconditionally from $uploadInfo, as is 'id' from $pageInfo.
 *
 * @param array $pageInfo Must contain '_title' and 'id'
 * @param array $uploadInfo
 * @return mixed Result of the upload callback
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$upload = new WikiRevision();
	$revId = $pageInfo['id'];
	$title = $pageInfo['_title'];

	// T292348: text key may be absent, force addition if null
	if ( !isset( $uploadInfo['text'] ) ) {
		$uploadInfo['text'] = '';
	}
	$content = $this->makeContent( $title, $revId, $uploadInfo );

	$upload->setTitle( $title );
	$upload->setID( $revId );
	$upload->setTimestamp( $uploadInfo['timestamp'] );
	$upload->setContent( SlotRecord::MAIN, $content );
	$upload->setFilename( $uploadInfo['filename'] );
	if ( isset( $uploadInfo['archivename'] ) ) {
		$upload->setArchiveName( $uploadInfo['archivename'] );
	}
	$upload->setSrc( $uploadInfo['src'] );
	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$isTemporary = !empty( $uploadInfo['isTempSrc'] );
		$upload->setFileSrc( $uploadInfo['fileSrc'], $isTemporary );
	}
	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$upload->setSha1Base36( $uploadInfo['sha1base36'] );
	}
	$upload->setSize( intval( $uploadInfo['size'] ) );
	$upload->setComment( $uploadInfo['comment'] );

	// Unlike revisions and log items, no fallback user is set here.
	if ( isset( $uploadInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix(
			$uploadInfo['contributor']['username']
		);
		$upload->setUsername( $prefixedName );
	} elseif ( isset( $uploadInfo['contributor']['ip'] ) ) {
		$upload->setUserIP( $uploadInfo['contributor']['ip'] );
	}
	$upload->setNoUpdates( $this->mNoUpdates );

	return ( $this->mUploadCallback )( $upload );
}
1220
/**
 * Reads a <contributor> element into an array.
 *
 * @return array Possible keys are 'id', 'ip' and 'username'; empty for an
 *   empty element
 */
private function handleContributor() {
	$this->debug( "Enter contributor handler." );

	if ( $this->reader->isEmptyElement ) {
		return [];
	}

	$wanted = [ 'id', 'ip', 'username' ];
	$contributor = [];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'contributor';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;
		if ( !in_array( $tag, $wanted ) ) {
			continue;
		}

		$contributor[$tag] = $this->nodeContents();
	}

	return $contributor;
}
1249
/**
 * Converts a title from the dump into a local title and checks that it
 * can be imported.
 *
 * A notice is emitted and false returned when the title is invalid,
 * external, cannot exist, or cannot definitely be edited by the performer.
 *
 * @param string $text Title text from the dump
 * @param string|null $ns Namespace id from the dump, converted with intval()
 * @return array|false [ local title, foreign title ], or false on rejection
 */
private function processTitle( $text, $ns = null ) {
	// Without site info from the dump, fall back to the content language.
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory( $this->contentLanguage )
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
	$title = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	if ( $title === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}

	if ( $title->isExternal() ) {
		$this->notice( 'import-error-interwiki', $title->getPrefixedText() );
		return false;
	}

	if ( !$title->canExist() ) {
		$this->notice( 'import-error-special', $title->getPrefixedText() );
		return false;
	}

	if ( !$this->performer->definitelyCan( 'edit', $title ) ) {
		# Do not import if the importing wiki user cannot edit this page
		$this->notice( 'import-error-edit', $title->getPrefixedText() );
		return false;
	}

	return [ $title, $foreignTitle ];
}
1289
/**
 * Opens an XMLReader on the registered source and stores it in $this->reader.
 *
 * The libxml entity loader setting is changed for the duration of the call
 * and restored afterwards, including when opening throws.
 *
 * @throws RuntimeException When the reader cannot be opened
 */
private function openReader() {
	$uri = 'uploadsource://' . $this->sourceAdapterId;

	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( false );
	try {
		if ( PHP_VERSION_ID >= 80000 ) {
			// A static call is now preferred, and avoids https://github.com/php/php-src/issues/11548
			$reader = XMLReader::open( $uri, null, LIBXML_PARSEHUGE );
			$status = $reader instanceof XMLReader;
			if ( $status ) {
				$this->reader = $reader;
			}
		} else {
			// A static call generated a deprecation warning prior to PHP 8.0
			$this->reader = new XMLReader;
			$status = $this->reader->open( $uri, null, LIBXML_PARSEHUGE );
		}

		if ( !$status ) {
			// libxml_get_last_error() returns false when libxml recorded no
			// error, so do not dereference it blindly.
			$error = libxml_get_last_error();
			throw new RuntimeException(
				'Encountered an internal error while initializing WikiImporter object: ' .
					( $error ? $error->message : 'unknown error' )
			);
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
	}
}
1327
/**
 * Reads the whole source once to detect XML errors before importing, then
 * rewinds the source and reopens the reader.
 *
 * Does nothing when the source is not seekable. The reader is closed after
 * the pass whether or not an error was found.
 *
 * @throws RuntimeException When libxml reported an error during the pass
 */
private function syntaxCheckXML() {
	$seekable = UploadSourceAdapter::isSeekableSource( $this->sourceAdapterId );
	if ( !$seekable ) {
		return;
	}

	AtEase::suppressWarnings();
	$oldDisable = libxml_disable_entity_loader( false );
	try {
		// Drain the reader; only the resulting libxml error state matters.
		while ( $this->reader->read() ) {
			continue;
		}

		$error = libxml_get_last_error();
		if ( $error ) {
			$errorMessage = 'XML error at line ' . $error->line . ': ' . $error->message;
			wfDebug( __METHOD__ . ': Invalid xml found - ' . $errorMessage );
			throw new RuntimeException( $errorMessage );
		}
	} finally {
		libxml_disable_entity_loader( $oldDisable );
		AtEase::restoreWarnings();
		$this->reader->close();
	}

	// Reopen for the real import
	UploadSourceAdapter::seekSource( $this->sourceAdapterId, 0 );
	$this->openReader();
}
1355}
const NS_MAIN
Definition Defines.php:65
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTempDir()
Tries to get the system directory for temporary files.
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest. If source is NULL, it just returns the val...
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Reporting callback.
Helper class for mapping value objects representing basic entities to cache keys.
Defer callable updates to run later in the PHP process.
Class for handling updates to the site_stats table.
Exception representing a failure to serialize or unserialize a content object.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
Base class for language-specific code.
Definition Language.php:81
A class containing constants representing the names of configuration variables.
Service for creating WikiPage objects.
Value object representing a content slot associated with a page revision.
A registry service for SlotRoleHandlers, used to define which slot roles are available on which page.
Generic operation result class Has warning/error list, boolean status and arbitrary value.
Definition Status.php:54
A simple, immutable structure to hold the title of a page on a foreign MediaWiki installation.
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them ba...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Creates Title objects.
Represents a title within MediaWiki.
Definition Title.php:78
Class to parse and build external user names.
static seekSource(string $id, int $offset)
static isSeekableSource(string $id)
static registerSource(ImportSource $source)
XML file reader for the page data importer.
setImportUploads( $import)
doImport()
Primary entry point.
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
setNoUpdates( $noupdates)
Set 'no updates' mode.
setLogItemCallback( $callback)
Sets the action to perform as each log item reached.
importUpload( $revision)
Dummy for now...
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
setPageOffset( $nthPage)
Sets 'pageOffset' value.
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
importLogItem( $revision)
Default per-revision callback, performs the import.
setImageBasePath( $dir)
throwXmlError( $err)
setDebug( $debug)
Set debug mode...
notice( $msg,... $params)
importRevision( $revision)
Default per-revision callback, performs the import.
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
setNoticeCallback( $callback)
Set a callback that displays notice messages.
finishImportPage(PageIdentity $pageIdentity, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
disableStatisticsUpdate()
Statistics updates can take a lot of time.
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
__construct(ImportSource $source, Authority $performer, Config $config, HookContainer $hookContainer, Language $contentLanguage, NamespaceInfo $namespaceInfo, TitleFactory $titleFactory, WikiPageFactory $wikiPageFactory, UploadRevisionImporter $uploadRevisionImporter, IContentHandlerFactory $contentHandlerFactory, SlotRoleRegistry $slotRoleRegistry)
Creates an ImportXMLReader drawing from the source provided.
Represents a revision, log entry or upload during the import process.
Value object representing a message parameter that consists of a list of values.
Source interface for XML import.
Interface for configuration instances.
Definition Config.php:32
Content objects represent page content, e.g.
Definition Content.php:42
Interface for objects (potentially) representing an editable wiki page.
This interface represents the authority associated with the current execution context,...
Definition Authority.php:37
Represents an object that can convert page titles on a foreign wiki (ForeignTitle objects) into page ...
Interface for database access objects.
$source