MediaWiki master
WikiImporter.php
Go to the documentation of this file.
1<?php
52use Wikimedia\AtEase\AtEase;
53use Wikimedia\NormalizedException\NormalizedException;
54
/** @var XMLReader Reader over the import source; (re)created by openReader() */
private $reader;

/** @var mixed Value returned by UploadSourceAdapter::registerSource(); appended to the 'uploadsource://' URL */
private $sourceAdapterId;

/** @var array|null Foreign namespace names keyed by the 'key' attribute; null until a siteinfo block is read */
private $foreignNamespaces = null;

/** @var callable|null Invoked for each log item */
private $mLogItemCallback;

/** @var callable|null Invoked for each upload */
private $mUploadCallback;

/** @var callable|null Invoked for each revision */
private $mRevisionCallback;

/** @var callable|null Invoked when a page's title has been resolved */
private $mPageCallback;

/** @var callable|null Invoked with the parsed siteinfo block */
private $mSiteInfoCallback;

/** @var callable|null Invoked after a page has been fully read */
private $mPageOutCallback;

/** @var callable|null Receives notices; when not callable, notices go to wfDebug() */
private $mNoticeCallback;

/** @var mixed When truthy, debug() forwards its messages to wfDebug() */
private $mDebug;

/** @var mixed When truthy, handleUpload() passes uploads on to processUpload() */
private $mImportUploads;

/** @var mixed Base directory that an upload's 'rel' path is resolved against */
private $mImageBasePath;

/** @var bool Forwarded to WikiRevision::setNoUpdates() for every imported item */
private $mNoUpdates = false;

/** @var int doImport() skips top-level nodes until this many pages have been seen */
private $pageOffset = 0;

private ImportTitleFactory $importTitleFactory;
private ExternalUserNames $externalUserNames;

/** @var array Countable state recorded by beforeImportPage(), keyed by 'title_' . prefixed title text */
private $countableCache = [];

/** @var bool When true, finishImportPage() leaves the article count alone */
private $disableStatisticsUpdate = false;

/** Authority used for the edit permission check and for page updaters */
private Authority $performer;

private Config $config;
private HookRunner $hookRunner;
private Language $contentLanguage;
private NamespaceInfo $namespaceInfo;
private TitleFactory $titleFactory;
private WikiPageFactory $wikiPageFactory;
private UploadRevisionImporter $uploadRevisionImporter;
private IContentHandlerFactory $contentHandlerFactory;
private SlotRoleRegistry $slotRoleRegistry;
133
/**
 * Creates an importer that reads an XML dump from the given source.
 *
 * Registers the 'uploadsource' stream wrapper if it is not registered yet,
 * registers the source with UploadSourceAdapter, opens the XML reader and
 * installs the default page/revision/upload/log-item callbacks.
 *
 * @param ImportSource $source Source the dump is read from
 * @param Authority $performer
 * @param Config $config
 * @param HookContainer $hookContainer
 * @param Language $contentLanguage
 * @param NamespaceInfo $namespaceInfo
 * @param TitleFactory $titleFactory
 * @param WikiPageFactory $wikiPageFactory
 * @param UploadRevisionImporter $uploadRevisionImporter
 * @param IContentHandlerFactory $contentHandlerFactory
 * @param SlotRoleRegistry $slotRoleRegistry
 * @throws RuntimeException If the XML reader cannot be opened
 */
public function __construct(
	ImportSource $source,
	Authority $performer,
	Config $config,
	HookContainer $hookContainer,
	Language $contentLanguage,
	NamespaceInfo $namespaceInfo,
	TitleFactory $titleFactory,
	WikiPageFactory $wikiPageFactory,
	UploadRevisionImporter $uploadRevisionImporter,
	IContentHandlerFactory $contentHandlerFactory,
	SlotRoleRegistry $slotRoleRegistry
) {
	$this->performer = $performer;
	$this->config = $config;
	$this->hookRunner = new HookRunner( $hookContainer );
	$this->contentLanguage = $contentLanguage;
	$this->namespaceInfo = $namespaceInfo;
	$this->titleFactory = $titleFactory;
	$this->wikiPageFactory = $wikiPageFactory;
	$this->uploadRevisionImporter = $uploadRevisionImporter;
	$this->contentHandlerFactory = $contentHandlerFactory;
	$this->slotRoleRegistry = $slotRoleRegistry;

	// The reader opens 'uploadsource://<id>' URLs, so the wrapper must exist first.
	if ( !in_array( 'uploadsource', stream_get_wrappers(), true ) ) {
		stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
	}
	$this->sourceAdapterId = UploadSourceAdapter::registerSource( $source );

	$this->openReader();

	// Default callbacks
	$this->setPageCallback( [ $this, 'beforeImportPage' ] );
	$this->setRevisionCallback( [ $this, "importRevision" ] );
	$this->setUploadCallback( [ $this, 'importUpload' ] );
	$this->setLogItemCallback( [ $this, 'importLogItem' ] );
	$this->setPageOutCallback( [ $this, 'finishImportPage' ] );

	$this->importTitleFactory = new NaiveImportTitleFactory(
		$this->contentLanguage,
		$this->namespaceInfo,
		$this->titleFactory
	);
	$this->externalUserNames = new ExternalUserNames( 'imported', false );
}
182
/**
 * Returns the XML reader this importer is currently reading from.
 *
 * @return XMLReader
 */
public function getReader() {
	$currentReader = $this->reader;
	return $currentReader;
}
189
/**
 * Logs an XML error. Despite the name, nothing is thrown here: the message
 * goes to debug() and to wfDebug().
 *
 * @param string $err
 */
public function throwXmlError( $err ) {
	$this->debug( 'FAILURE: ' . $err );
	wfDebug( 'WikiImporter XML error: ' . $err );
}
197
/**
 * Sends a message to the debug log, but only when debugging was enabled
 * through setDebug().
 *
 * @param string $data
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}
	wfDebug( 'IMPORT: ' . $data );
}
206
/**
 * Sends a warning to the debug log, regardless of the debug flag.
 *
 * @param string $data
 */
public function warn( $data ) {
	wfDebug( 'IMPORT: ' . $data );
}
213
/**
 * Reports a notice. The notice callback receives the message key and the
 * parameters as one array; without a callable callback the rendered message
 * is written to the debug log.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters
 */
public function notice( $msg, ...$params ) {
	if ( !is_callable( $this->mNoticeCallback ) ) {
		# No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		wfDebug( wfMessage( $msg, $params )->text() );
		return;
	}

	call_user_func( $this->mNoticeCallback, $msg, $params );
}
227
/**
 * Turns forwarding of debug() messages on or off.
 *
 * @param mixed $debug Truthy to enable
 */
public function setDebug( $debug ) {
	$this->mDebug = $debug;
}
235
/**
 * Sets the flag that is forwarded to WikiRevision::setNoUpdates() for every
 * log item, revision and upload created by this importer.
 *
 * @param mixed $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
243
/**
 * Sets the page offset: doImport() skips top-level nodes until it has seen
 * this many <page> elements.
 *
 * @param int $nthPage
 */
public function setPageOffset( $nthPage ) {
	$this->pageOffset = $nthPage;
}
253
/**
 * Sets the notice callback through wfSetVar() and returns the value that
 * helper hands back.
 *
 * @param callable $callback
 * @return mixed Return value of wfSetVar()
 */
public function setNoticeCallback( $callback ) {
	$formerCallback = wfSetVar( $this->mNoticeCallback, $callback );
	return $formerCallback;
}
263
/**
 * Replaces the page callback.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was installed before
 */
public function setPageCallback( $callback ) {
	$replaced = $this->mPageCallback;
	$this->mPageCallback = $callback;

	return $replaced;
}
274
/**
 * Replaces the callback that runs after a page has been fully read.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was installed before
 */
public function setPageOutCallback( $callback ) {
	$replaced = $this->mPageOutCallback;
	$this->mPageOutCallback = $callback;

	return $replaced;
}
289
/**
 * Replaces the revision callback.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was installed before
 */
public function setRevisionCallback( $callback ) {
	$replaced = $this->mRevisionCallback;
	$this->mRevisionCallback = $callback;

	return $replaced;
}
300
/**
 * Replaces the upload callback.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was installed before
 */
public function setUploadCallback( $callback ) {
	$replaced = $this->mUploadCallback;
	$this->mUploadCallback = $callback;

	return $replaced;
}
311
/**
 * Replaces the log item callback.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was installed before
 */
public function setLogItemCallback( $callback ) {
	$replaced = $this->mLogItemCallback;
	$this->mLogItemCallback = $callback;

	return $replaced;
}
322
/**
 * Replaces the siteinfo callback.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was installed before
 */
public function setSiteInfoCallback( $callback ) {
	$replaced = $this->mSiteInfoCallback;
	$this->mSiteInfoCallback = $callback;

	return $replaced;
}
333
/**
 * Sets the factory that turns foreign titles into local titles.
 *
 * @param ImportTitleFactory $factory
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
342
/**
 * Chooses the namespace that imported pages are placed in.
 *
 * @param int|string|null $namespace Null keeps the namespaces from the dump;
 *  otherwise a non-negative ID of an existing namespace
 * @return bool False if the namespace is negative or does not exist
 */
public function setTargetNamespace( $namespace ) {
	if ( $namespace === null ) {
		// Don't override namespaces
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$this->namespaceInfo,
				$this->titleFactory
			)
		);
		return true;
	} elseif (
		$namespace >= 0 &&
		$this->namespaceInfo->exists( intval( $namespace ) )
	) {
		$namespace = intval( $namespace );
		$this->setImportTitleFactory(
			new NamespaceImportTitleFactory(
				$this->namespaceInfo,
				$this->titleFactory,
				$namespace
			)
		);
		return true;
	} else {
		return false;
	}
}
376
/**
 * Chooses a root page that imported pages become subpages of.
 *
 * An empty string leaves the current title factory untouched.
 *
 * @param string|null $rootpage Null for no root page
 * @return Status Fatal if the root page title is invalid or external, or if
 *  its namespace has no subpages
 */
public function setTargetRootPage( $rootpage ) {
	$status = Status::newGood();
	$nsInfo = $this->namespaceInfo;
	if ( $rootpage === null ) {
		// No rootpage
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$nsInfo,
				$this->titleFactory
			)
		);
	} elseif ( $rootpage !== '' ) {
		$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
		$title = Title::newFromText( $rootpage );

		if ( !$title || $title->isExternal() ) {
			$status->fatal( 'import-rootpage-invalid' );
		} elseif ( !$nsInfo->hasSubpages( $title->getNamespace() ) ) {
			$displayNSText = $title->getNamespace() === NS_MAIN
				? wfMessage( 'blanknamespace' )->text()
				: $this->contentLanguage->getNsText( $title->getNamespace() );
			$status->fatal( 'import-rootpage-nosubpage', $displayNSText );
		} else {
			// set namespace to 'all', so the namespace check in processTitle() can pass
			$this->setTargetNamespace( null );
			$this->setImportTitleFactory(
				new SubpageImportTitleFactory(
					$nsInfo,
					$this->titleFactory,
					$title
				)
			);
		}
	}
	return $status;
}
419
/**
 * Sets the base directory that an upload's 'rel' path is resolved against.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
426
/**
 * Controls whether <upload> elements are actually imported.
 *
 * @param mixed $import Truthy to import uploads
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
433
/**
 * Replaces the ExternalUserNames helper used for contributor names.
 *
 * @param string $usernamePrefix
 * @param bool $assignKnownUsers
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$userNames = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
	$this->externalUserNames = $userNames;
}
442
/**
 * Makes finishImportPage() skip the article count adjustment.
 */
public function disableStatisticsUpdate() {
	$this->disableStatisticsUpdate = true;
}
450
/**
 * Default page callback: records whether the page is countable before any
 * of its revisions are imported.
 *
 * @param array $titleAndForeignTitle Pair whose first element is the local title
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$localTitle = $titleAndForeignTitle[0];
	$cacheKey = 'title_' . $localTitle->getPrefixedText();
	$wikiPage = $this->wikiPageFactory->newFromTitle( $localTitle );

	$this->countableCache[$cacheKey] = $wikiPage->isCountable();

	return true;
}
463
/**
 * Default revision callback: imports one revision.
 *
 * @param WikiRevision $revision
 * @return bool False if the content model cannot be used on the title or
 *  the content cannot be unserialized; otherwise the result of
 *  importOldRevision()
 */
public function importRevision( $revision ) {
	$modelFitsTitle = $revision->getContentHandler()->canBeUsedOn( $revision->getTitle() );
	if ( !$modelFitsTitle ) {
		$this->notice( 'import-error-bad-location',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);

		return false;
	}

	try {
		return $revision->importOldRevision();
	} catch ( MWContentSerializationException $ex ) {
		$this->notice( 'import-error-unserialize',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);

		return false;
	}
}
494
/**
 * Default log item callback: delegates to the revision's own importLogItem().
 *
 * @param WikiRevision $revision
 * @return mixed Whatever importLogItem() returns
 */
public function importLogItem( $revision ) {
	$outcome = $revision->importLogItem();
	return $outcome;
}
503
/**
 * Default upload callback: hands the revision to the upload revision importer.
 *
 * @param WikiRevision $revision
 * @return bool Whether the resulting status is good
 */
public function importUpload( $revision ) {
	return $this->uploadRevisionImporter->import( $revision )->isGood();
}
513
/**
 * Default page-out callback: adjusts the article count if the page's
 * countable state changed during the import, then runs the AfterImportPage hook.
 *
 * @param PageIdentity $pageIdentity
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sRevCount
 * @param array $pageInfo
 * @return mixed Return value of the AfterImportPage hook runner
 */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	$title = Title::newFromPageIdentity( $pageIdentity );

	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$page = $this->wikiPageFactory->newFromTitle( $pageIdentity );

		$page->loadPageData( IDBAccessObject::READ_LATEST );
		$rev = $page->getRevisionRecord();
		if ( $rev === null ) {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
				' because WikiPage::getRevisionRecord() returned null' );
		} else {
			$update = $page->newPageUpdater( $this->performer )->prepareUpdate();
			// The key must be built exactly as beforeImportPage() builds it
			// ('title_' + prefixed text); otherwise the lookup never matches
			// and the article count is never adjusted.
			$countKey = 'title_' . $title->getPrefixedText();
			$countable = $update->isCountable();
			if ( array_key_exists( $countKey, $this->countableCache ) &&
				$countable != $this->countableCache[$countKey] ) {
				DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
					'articles' => ( (int)$countable - (int)$this->countableCache[$countKey] )
				] ) );
			}
		}
	}

	return $this->hookRunner->onAfterImportPage( $title, $foreignTitle,
		$revCount, $sRevCount, $pageInfo );
}
558
/**
 * Passes the parsed siteinfo block to the siteinfo callback, if one is set.
 *
 * @param array $siteInfo
 * @return mixed Callback result, or false when no callback is set
 */
private function siteInfoCallback( $siteInfo ) {
	if ( !isset( $this->mSiteInfoCallback ) ) {
		return false;
	}

	return call_user_func( $this->mSiteInfoCallback, $siteInfo, $this );
}
574
/**
 * Invokes the page callback, if one is set. Its result is discarded.
 *
 * @param array $title Pair of local title and foreign title
 */
public function pageCallback( $title ) {
	if ( !isset( $this->mPageCallback ) ) {
		return;
	}

	call_user_func( $this->mPageCallback, $title );
}
584
/**
 * Invokes the page-out callback, if one is set, forwarding every argument
 * this method received. Its result is discarded.
 *
 * @param PageIdentity $pageIdentity
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sucCount
 * @param array $pageInfo
 */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sucCount, $pageInfo ) {
	if ( !isset( $this->mPageOutCallback ) ) {
		return;
	}

	$forwarded = func_get_args();
	call_user_func_array( $this->mPageOutCallback, $forwarded );
}
599
/**
 * Passes a revision to the revision callback, if one is set.
 *
 * @param WikiRevision $revision
 * @return mixed Callback result, or false when no callback is set
 */
private function revisionCallback( $revision ) {
	if ( !isset( $this->mRevisionCallback ) ) {
		return false;
	}

	return call_user_func( $this->mRevisionCallback, $revision, $this );
}
615
/**
 * Passes a log item to the log item callback, if one is set.
 *
 * @param WikiRevision $revision
 * @return mixed Callback result, or false when no callback is set
 */
private function logItemCallback( $revision ) {
	if ( !isset( $this->mLogItemCallback ) ) {
		return false;
	}

	return call_user_func( $this->mLogItemCallback, $revision, $this );
}
631
/**
 * Reads an attribute of the node the reader is currently on.
 *
 * @param string $attr Attribute name
 * @return string Attribute value, or '' when getAttribute() yields null
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );
	return $value ?? '';
}
641
/**
 * Collects the text of the current element, advancing the reader up to the
 * first end-element node it meets.
 *
 * @return string Concatenated text, CDATA and significant whitespace; '' for
 *  an empty element. If the input ends first, the reader is closed and ''
 *  is returned.
 */
public function nodeContents() {
	if ( $this->reader->isEmptyElement ) {
		return "";
	}

	$textNodeTypes = [
		XMLReader::TEXT,
		XMLReader::CDATA,
		XMLReader::SIGNIFICANT_WHITESPACE,
	];

	$collected = "";
	while ( $this->reader->read() ) {
		$nodeType = $this->reader->nodeType;
		if ( $nodeType == XMLReader::END_ELEMENT ) {
			return $collected;
		}
		if ( in_array( $nodeType, $textNodeTypes ) ) {
			$collected .= $this->reader->value;
		}
	}

	// Ran out of input before the element was closed.
	$this->reader->close();
	return '';
}
669
/**
 * Runs the import: syntax-checks the source, verifies the <mediawiki> root
 * and dispatches each top-level element to its handler. The reader is closed
 * when the method finishes, whether normally or by exception.
 *
 * @return bool Always true when no exception is thrown
 * @throws NormalizedException If the root element is wrong and libxml reported an error
 * @throws UnexpectedValueException If the root element is wrong without a libxml error
 */
public function doImport() {
	$this->syntaxCheckXML();

	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (T48932).
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$previousLoaderState = @libxml_disable_entity_loader( true );
	try {
		$this->reader->read();

		if ( $this->reader->localName != 'mediawiki' ) {
			// phpcs:ignore Generic.PHP.NoSilencedErrors
			@libxml_disable_entity_loader( $previousLoaderState );
			$xmlError = libxml_get_last_error();
			if ( !$xmlError ) {
				throw new UnexpectedValueException(
					"Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
				);
			}
			throw new NormalizedException( "XML error at line {line}: {message}", [
				'line' => $xmlError->line,
				'message' => $xmlError->message,
			] );
		}
		$this->debug( "<mediawiki> tag is correct." );

		$this->debug( "Starting primary dump processing loop." );

		$pagesSeen = 0;
		$hasNode = $this->reader->read();
		while ( $hasNode ) {
			$tag = $this->reader->localName;

			// Skip everything until the requested page offset is reached.
			if ( $this->pageOffset ) {
				if ( $tag === 'page' ) {
					$pagesSeen++;
				}
				if ( $pagesSeen < $this->pageOffset ) {
					$hasNode = $this->reader->next();
					continue;
				}
			}

			$type = $this->reader->nodeType;
			$skipSubtree = false;

			if ( !$this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
				// Do nothing
			} elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
				break;
			} elseif ( $tag == 'siteinfo' ) {
				$this->handleSiteInfo();
			} elseif ( $tag == 'page' ) {
				$this->handlePage();
			} elseif ( $tag == 'logitem' ) {
				$this->handleLogItem();
			} elseif ( $tag != '#text' ) {
				$this->warn( "Unhandled top-level XML tag $tag" );

				$skipSubtree = true;
			}

			if ( $skipSubtree ) {
				$hasNode = $this->reader->next();
				$this->debug( "Skip" );
			} else {
				$hasNode = $this->reader->read();
			}
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $previousLoaderState );
		$this->reader->close();
	}

	return true;
}
753
/**
 * Reads a <siteinfo> element, records the foreign namespaces it declares
 * and passes the collected data to the siteinfo callback.
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );

	// Fields that can just be stuffed in the siteInfo object
	$simpleTags = [ 'sitename', 'base', 'generator', 'case' ];
	$collected = [];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'siteinfo';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( $tag == 'namespace' ) {
			// Read the key first: nodeContents() moves the reader forward.
			$nsKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$nsKey] = $this->nodeContents();
		} elseif ( in_array( $tag, $simpleTags ) ) {
			$collected[$tag] = $this->nodeContents();
		}
	}

	$collected['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $collected );
}
780
/**
 * Reads a <logitem> element and passes the collected fields to processLogItem().
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );

	// Fields that can just be stuffed in the pageInfo object
	$simpleTags = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];
	$logInfo = [];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'logitem';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $simpleTags ) ) {
			$logInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$logInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $logInfo );
}
810
/**
 * Builds a WikiRevision from parsed log item fields and passes it to the
 * log item callback.
 *
 * @param array $logInfo Fields collected by handleLogItem(); 'type' and
 *  'action' are read unconditionally
 * @return mixed Result of the log item callback, or false if none is set
 */
private function processLogItem( $logInfo ) {
	$logEntry = new WikiRevision();

	if ( isset( $logInfo['id'] ) ) {
		$logEntry->setID( $logInfo['id'] );
	}
	$logEntry->setType( $logInfo['type'] );
	$logEntry->setAction( $logInfo['action'] );
	if ( isset( $logInfo['timestamp'] ) ) {
		$logEntry->setTimestamp( $logInfo['timestamp'] );
	}
	if ( isset( $logInfo['params'] ) ) {
		$logEntry->setParams( $logInfo['params'] );
	}
	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$logTitle = Title::newFromText( $logInfo['logtitle'] );
		$logEntry->setTitle( $logTitle );
	}

	$logEntry->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$logEntry->setComment( $logInfo['comment'] );
	}

	// Attribute the entry: named user first, then IP, then a placeholder.
	if ( isset( $logInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix( $logInfo['contributor']['username'] );
		$logEntry->setUsername( $prefixedName );
	} elseif ( isset( $logInfo['contributor']['ip'] ) ) {
		$logEntry->setUserIP( $logInfo['contributor']['ip'] );
	} else {
		$logEntry->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	return $this->logItemCallback( $logEntry );
}
853
/**
 * Reads a <page> element: collects the simple page fields, resolves the
 * title when the first revision or upload is met, dispatches revisions and
 * uploads, and finally invokes the page-out callback if a title was resolved.
 */
private function handlePage() {
	// Handle page data.
	$this->debug( "Enter page handler." );
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Fields that can just be stuffed in the pageInfo object
	$simpleTags = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	$skipSubtree = false;
	$titleIsBad = false;

	while ( $skipSubtree ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'page';
		if ( $atClosingTag ) {
			break;
		}

		$skipSubtree = false;

		$tag = $this->reader->localName;

		if ( $titleIsBad ) {
			// The title is invalid, bail out of this page
			$skipSubtree = true;
		} elseif ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $simpleTags ) ) {
			// An XML snippet:
			// <page>
			//     <id>123</id>
			//     <title>Page</title>
			//     <redirect title="NewTitle"/>
			//     ...
			// Because the redirect tag is built differently, we need special handling for that case.
			$pageInfo[$tag] = $tag == 'redirect'
				? $this->nodeAttribute( 'title' )
				: $this->nodeContents();
		} elseif ( $tag == 'revision' || $tag == 'upload' ) {
			if ( !isset( $titlePair ) ) {
				$titlePair = $this->processTitle( $pageInfo['title'],
					$pageInfo['ns'] ?? null );

				// $titlePair is either an array of two titles or false.
				if ( is_array( $titlePair ) ) {
					$this->pageCallback( $titlePair );
					[ $pageInfo['_title'], $foreignTitle ] = $titlePair;
				} else {
					$titleIsBad = true;
					$skipSubtree = true;
				}
			}

			if ( $titlePair ) {
				if ( $tag == 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled page XML tag $tag" );
			$skipSubtree = true;
		}
	}

	// @note $pageInfo['_title'] is only set if a valid title was processed
	// above with no error. In that case pageCallback was called above and
	// $foreignTitle was assigned together with $pageInfo['_title'], so the
	// page-out callback can be invoked here.
	if ( array_key_exists( '_title', $pageInfo ) ) {
		$this->pageOutCallback(
			$pageInfo['_title'],
			// @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
			$foreignTitle,
			$pageInfo['revisionCount'],
			$pageInfo['successfulRevisionCount'],
			$pageInfo
		);
	}
}
939
/**
 * Reads a <revision> element, then processes the collected revision and
 * updates the page's revision counters.
 *
 * @param array &$pageInfo Page data; 'revisionCount' is always incremented,
 *  'successfulRevisionCount' only when processRevision() returns a truthy value
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );
	$revisionInfo = [];

	$normalFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
		'model', 'format', 'text', 'sha1' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'revision' ) {
			break;
		}

		// Skipping applies to a single unknown subtree only. Without this
		// reset, one unknown tag would switch the rest of the loop to next().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleRevisionXMLTag(
			$this, $pageInfo, $revisionInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$revisionInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'content' ) {
			// We can have multiple content tags, so make this an array.
			$revisionInfo[$tag][] = $this->handleContent();
		} elseif ( $tag == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			$skip = true;
		}
	}

	$pageInfo['revisionCount']++;
	if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
982
/**
 * Reads a <content> element into an array of slot fields.
 *
 * @return array Collected fields, keyed by tag name
 */
private function handleContent() {
	$this->debug( "Enter content handler" );
	$contentInfo = [];

	$normalFields = [ 'role', 'origin', 'model', 'format', 'text' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'content' ) {
			break;
		}

		// Skipping applies to a single unknown subtree only. Without this
		// reset, one unknown tag would switch the rest of the loop to next().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleContentXMLTag(
			$this, $contentInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$contentInfo[$tag] = $this->nodeContents();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled content XML tag $tag" );
			$skip = true;
		}
	}

	return $contentInfo;
}
1013
/**
 * Builds a content object from the text of an imported revision or slot.
 *
 * @param Title $title Used to pick the default model when none is given
 * @param int|string $revisionId Only used in the size error message
 * @param array $contentInfo Needs 'text'; 'model' and 'role' are optional
 * @return Content
 * @throws InvalidArgumentException If 'text' is missing
 * @throws RuntimeException If the text is larger than MaxArticleSize allows
 */
private function makeContent( Title $title, $revisionId, $contentInfo ) {
	$maxArticleSize = $this->config->get( MainConfigNames::MaxArticleSize );

	if ( !isset( $contentInfo['text'] ) ) {
		throw new InvalidArgumentException( 'Missing text field in import.' );
	}

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Testing for revisions with only listed
	// content models, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	$sizeCheckedModels = [
		'wikitext',
		'css',
		'json',
		'javascript',
		'text',
		''
	];
	$modelIsSizeChecked = !isset( $contentInfo['model'] ) ||
		in_array( $contentInfo['model'], $sizeCheckedModels );

	if ( $modelIsSizeChecked && strlen( $contentInfo['text'] ) > $maxArticleSize * 1024 ) {
		$subject = $revisionId ? "the revision with ID $revisionId" : 'a revision';
		throw new RuntimeException( 'The text of ' . $subject .
			" exceeds the maximum allowable size ({$maxArticleSize} KiB)" );
	}

	$role = $contentInfo['role'] ?? SlotRecord::MAIN;
	if ( isset( $contentInfo['model'] ) ) {
		$model = $contentInfo['model'];
	} else {
		// No explicit model: use the role's default for this title.
		$model = $this->slotRoleRegistry
			->getRoleHandler( $role )
			->getDefaultModel( $title );
	}
	$handler = $this->contentHandlerFactory->getContentHandler( $model );

	return $handler->unserializeContent(
		$handler->importTransform( $contentInfo['text'] )
	);
}
1060
/**
 * Builds a WikiRevision from parsed revision fields and passes it to the
 * revision callback.
 *
 * @param array $pageInfo Needs '_title'
 * @param array $revisionInfo Fields collected by handleRevision()
 * @return mixed Result of the revision callback, or false if none is set
 * @throws RuntimeException If an extra slot has no role (also propagates
 *  exceptions from makeContent())
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	$imported = new WikiRevision();

	$revId = $revisionInfo['id'] ?? 0;
	if ( $revId ) {
		$imported->setID( $revisionInfo['id'] );
	}

	$title = $pageInfo['_title'];
	$imported->setTitle( $title );

	// The revision's own text is the main slot.
	$mainContent = $this->makeContent( $title, $revId, $revisionInfo );
	$imported->setContent( SlotRecord::MAIN, $mainContent );

	$extraSlots = $revisionInfo['content'] ?? [];
	foreach ( $extraSlots as $slotInfo ) {
		if ( !isset( $slotInfo['role'] ) ) {
			throw new RuntimeException( "Missing role for imported slot." );
		}

		$slotContent = $this->makeContent( $title, $revId, $slotInfo );
		$imported->setContent( $slotInfo['role'], $slotContent );
	}
	$imported->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$imported->setComment( $revisionInfo['comment'] );
	}

	if ( isset( $revisionInfo['minor'] ) ) {
		$imported->setMinor( true );
	}

	// Attribute the revision: named user first, then IP, then a placeholder.
	if ( isset( $revisionInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix( $revisionInfo['contributor']['username'] );
		$imported->setUsername( $prefixedName );
	} elseif ( isset( $revisionInfo['contributor']['ip'] ) ) {
		$imported->setUserIP( $revisionInfo['contributor']['ip'] );
	} else {
		$imported->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	if ( isset( $revisionInfo['sha1'] ) ) {
		$imported->setSha1Base36( $revisionInfo['sha1'] );
	}
	$imported->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $imported );
}
1113
/**
 * Reads an <upload> element and, if upload importing is enabled, processes it.
 *
 * @param array &$pageInfo Page data, passed to the hook and to processUpload()
 * @return mixed Result of processUpload() when uploads are imported, else null
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );
	$uploadInfo = [];

	$normalFields = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'upload' ) {
			break;
		}

		// Skipping applies to a single unknown subtree only. Without this
		// reset, one unknown tag would switch the rest of the loop to next().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$uploadInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag == 'contents' ) {
			$contents = $this->nodeContents();
			$encoding = $this->reader->getAttribute( 'encoding' );
			if ( $encoding === 'base64' ) {
				// Inline file data: decode it into a temporary file.
				$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
				$uploadInfo['isTempSrc'] = true;
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$skip = true;
		}
	}

	// An existing file under the image base path replaces any inline source.
	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		$path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
		if ( file_exists( $path ) ) {
			$uploadInfo['fileSrc'] = $path;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	if ( $this->mImportUploads ) {
		return $this->processUpload( $pageInfo, $uploadInfo );
	}
}
1166
/**
 * Writes data to a new temporary file in the wiki's temp directory.
 *
 * @param string $contents
 * @return string|false Value returned by tempnam()
 */
private function dumpTemp( $contents ) {
	$tempPath = tempnam( wfTempDir(), 'importupload' );
	file_put_contents( $tempPath, $contents );

	return $tempPath;
}
1176
/**
 * Builds a WikiRevision from parsed upload fields and passes it to the
 * upload callback.
 *
 * @param array $pageInfo Needs 'id' and '_title'
 * @param array $uploadInfo Fields collected by handleUpload(); 'timestamp',
 *  'filename', 'src', 'size' and 'comment' are read unconditionally
 * @return mixed Result of the upload callback
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$upload = new WikiRevision();
	$revId = $pageInfo['id'];
	$title = $pageInfo['_title'];
	// T292348: text key may be absent, force addition if null
	$uploadInfo['text'] ??= '';
	$content = $this->makeContent( $title, $revId, $uploadInfo );

	$upload->setTitle( $title );
	$upload->setID( $revId );
	$upload->setTimestamp( $uploadInfo['timestamp'] );
	$upload->setContent( SlotRecord::MAIN, $content );
	$upload->setFilename( $uploadInfo['filename'] );
	if ( isset( $uploadInfo['archivename'] ) ) {
		$upload->setArchiveName( $uploadInfo['archivename'] );
	}
	$upload->setSrc( $uploadInfo['src'] );
	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$isTemporary = !empty( $uploadInfo['isTempSrc'] );
		$upload->setFileSrc( $uploadInfo['fileSrc'], $isTemporary );
	}
	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$upload->setSha1Base36( $uploadInfo['sha1base36'] );
	}
	$upload->setSize( intval( $uploadInfo['size'] ) );
	$upload->setComment( $uploadInfo['comment'] );

	// No placeholder user here: without contributor data no user is set.
	if ( isset( $uploadInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix( $uploadInfo['contributor']['username'] );
		$upload->setUsername( $prefixedName );
	} elseif ( isset( $uploadInfo['contributor']['ip'] ) ) {
		$upload->setUserIP( $uploadInfo['contributor']['ip'] );
	}
	$upload->setNoUpdates( $this->mNoUpdates );

	return call_user_func( $this->mUploadCallback, $upload );
}
1221
/**
 * Reads a <contributor> element.
 *
 * @return array Any of 'id', 'ip' and 'username' that were present; empty
 *  for an empty element
 */
private function handleContributor() {
	$this->debug( "Enter contributor handler." );

	if ( $this->reader->isEmptyElement ) {
		return [];
	}

	$knownTags = [ 'id', 'ip', 'username' ];
	$contributor = [];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'contributor';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;
		if ( !in_array( $tag, $knownTags ) ) {
			continue;
		}

		$contributor[$tag] = $this->nodeContents();
	}

	return $contributor;
}
1250
/**
 * Turns a title from the dump into a local title and checks that it may be
 * imported. A notice is emitted for every rejected title.
 *
 * @param string $text Title text from the dump
 * @param string|null $ns Namespace ID from the dump, if given
 * @return array|false [ local title, foreign title ], or false if rejected
 */
private function processTitle( $text, $ns = null ) {
	// Without siteinfo namespaces, fall back to the naive foreign title parser.
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory( $this->contentLanguage )
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
	$title = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	if ( $title === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}

	if ( $title->isExternal() ) {
		$this->notice( 'import-error-interwiki', $title->getPrefixedText() );
		return false;
	}

	if ( !$title->canExist() ) {
		$this->notice( 'import-error-special', $title->getPrefixedText() );
		return false;
	}

	if ( !$this->performer->definitelyCan( 'edit', $title ) ) {
		# Do not import if the importing wiki user cannot edit this page
		$this->notice( 'import-error-edit', $title->getPrefixedText() );
		return false;
	}

	return [ $title, $foreignTitle ];
}
1290
/**
 * Opens an XMLReader on the registered source and stores it in $this->reader.
 *
 * The entity loader is enabled while opening and restored afterwards, also
 * when opening fails.
 *
 * @throws RuntimeException If the reader cannot be opened
 */
private function openReader() {
	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( false );

	try {
		if ( PHP_VERSION_ID >= 80000 ) {
			// A static call is now preferred, and avoids https://github.com/php/php-src/issues/11548
			$reader = XMLReader::open(
				'uploadsource://' . $this->sourceAdapterId, null, LIBXML_PARSEHUGE );
			if ( $reader instanceof XMLReader ) {
				$this->reader = $reader;
				$status = true;
			} else {
				$status = false;
			}
		} else {
			// A static call generated a deprecation warning prior to PHP 8.0
			$this->reader = new XMLReader;
			$status = $this->reader->open(
				'uploadsource://' . $this->sourceAdapterId, null, LIBXML_PARSEHUGE );
		}

		if ( !$status ) {
			// libxml_get_last_error() returns false when nothing was recorded,
			// so the message must not be read unconditionally.
			$error = libxml_get_last_error();
			throw new RuntimeException(
				'Encountered an internal error while initializing WikiImporter object: ' .
				( $error ? $error->message : 'unknown libxml error' )
			);
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
	}
}
1328
/**
 * Reads a seekable source through to the end to find XML errors, then
 * rewinds it and reopens the reader for the real import. Does nothing for
 * sources that are not seekable.
 *
 * @throws RuntimeException If libxml reports an error after the read
 */
private function syntaxCheckXML() {
	$seekable = UploadSourceAdapter::isSeekableSource( $this->sourceAdapterId );
	if ( !$seekable ) {
		return;
	}

	AtEase::suppressWarnings();
	$previousLoaderState = libxml_disable_entity_loader( false );
	try {
		while ( $this->reader->read() ) {
			// Only the parse itself matters; the nodes are discarded.
		}

		$xmlError = libxml_get_last_error();
		if ( $xmlError ) {
			$errorMessage = 'XML error at line ' . $xmlError->line . ': ' . $xmlError->message;
			wfDebug( __METHOD__ . ': Invalid xml found - ' . $errorMessage );
			throw new RuntimeException( $errorMessage );
		}
	} finally {
		libxml_disable_entity_loader( $previousLoaderState );
		AtEase::restoreWarnings();
		$this->reader->close();
	}

	// Reopen for the real import
	UploadSourceAdapter::seekSource( $this->sourceAdapterId, 0 );
	$this->openReader();
}
1356}
const NS_MAIN
Definition Defines.php:64
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTempDir()
Tries to get the system directory for temporary files.
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest. If source is NULL, it just returns the val...
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Reporting callback.
Base class for language-specific code.
Definition Language.php:63
Exception representing a failure to serialize or unserialize a content object.
Helper class for mapping value objects representing basic entities to cache keys.
Defer callable updates to run later in the PHP process.
Class for handling updates to the site_stats table.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
A class containing constants representing the names of configuration variables.
Service for creating WikiPage objects.
Value object representing a content slot associated with a page revision.
A registry service for SlotRoleHandlers, used to define which slot roles are available on which page.
Generic operation result class Has warning/error list, boolean status and arbitrary value.
Definition Status.php:54
A simple, immutable structure to hold the title of a page on a foreign MediaWiki installation.
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them ba...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Creates Title objects.
Represents a title within MediaWiki.
Definition Title.php:78
canExist()
Can this title represent a page in the wiki's database?
Definition Title.php:1212
getNamespace()
Get the namespace index, i.e.
Definition Title.php:1044
getPrefixedText()
Get the prefixed title with spaces.
Definition Title.php:1861
Class to parse and build external user names.
static seekSource(string $id, int $offset)
static isSeekableSource(string $id)
static registerSource(ImportSource $source)
XML file reader for the page data importer.
setImportUploads( $import)
doImport()
Primary entry point.
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
setNoUpdates( $noupdates)
Set 'no updates' mode.
setLogItemCallback( $callback)
Sets the action to perform as each log item reached.
importUpload( $revision)
Dummy for now...
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
setPageOffset( $nthPage)
Sets 'pageOffset' value.
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
importLogItem( $revision)
Default per-revision callback, performs the import.
setImageBasePath( $dir)
throwXmlError( $err)
setDebug( $debug)
Set debug mode...
notice( $msg,... $params)
importRevision( $revision)
Default per-revision callback, performs the import.
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
setNoticeCallback( $callback)
Set a callback that displays notice messages.
finishImportPage(PageIdentity $pageIdentity, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
disableStatisticsUpdate()
Statistics updates can take a lot of time.
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
__construct(ImportSource $source, Authority $performer, Config $config, HookContainer $hookContainer, Language $contentLanguage, NamespaceInfo $namespaceInfo, TitleFactory $titleFactory, WikiPageFactory $wikiPageFactory, UploadRevisionImporter $uploadRevisionImporter, IContentHandlerFactory $contentHandlerFactory, SlotRoleRegistry $slotRoleRegistry)
Creates an ImportXMLReader drawing from the source provided.
Represents a revision, log entry or upload during the import process.
Source interface for XML import.
Interface for configuration instances.
Definition Config.php:32
isExternal()
Whether this LinkTarget has an interwiki component.
Interface for objects (potentially) representing an editable wiki page.
This interface represents the authority associated with the current execution context,...
Definition Authority.php:37
Represents an object that can convert page titles on a foreign wiki (ForeignTitle objects) into page ...
$source