MediaWiki master
WikiImporter.php
Go to the documentation of this file.
1<?php
13namespace MediaWiki\Import;
14
15use InvalidArgumentException;
44use RuntimeException;
45use UnexpectedValueException;
46use Wikimedia\AtEase\AtEase;
49use Wikimedia\NormalizedException\NormalizedException;
51use XMLReader;
52
// XMLReader positioned on the import source; assigned in openReader().
61 private $reader;
62
// Identifier returned by UploadSourceAdapter::registerSource() for the import source.
64 private $sourceAdapterId;
65
// Foreign namespace names keyed by the "key" attribute of <namespace> tags;
// stays null until a <siteinfo> block supplies at least one.
67 private $foreignNamespaces = null;
68
// Callback invoked for each log item (see setLogItemCallback()).
70 private $mLogItemCallback;
71
// Callback invoked for each upload revision (see setUploadCallback()).
73 private $mUploadCallback;
74
// Callback invoked for each page revision (see setRevisionCallback()).
76 private $mRevisionCallback;
77
// Callback invoked when a page's title has been resolved (see setPageCallback()).
79 private $mPageCallback;
80
// Callback invoked with the parsed <siteinfo> data (see setSiteInfoCallback()).
82 private $mSiteInfoCallback;
83
// Callback invoked when a <page> has been fully processed (see setPageOutCallback()).
85 private $mPageOutCallback;
86
// Callback used by notice() to report messages (see setNoticeCallback()).
88 private $mNoticeCallback;
89
// When truthy, debug() forwards its messages to wfDebug().
91 private $mDebug;
92
// When truthy, handleUpload() passes parsed uploads on to processUpload().
94 private $mImportUploads;
95
// Directory that the <rel> path of an upload is resolved against in handleUpload().
97 private $mImageBasePath;
98
// Forwarded to WikiRevision::setNoUpdates() for every revision, upload and log item.
100 private $mNoUpdates = false;
101
// Number of the first <page> to process in doImport(); 0 disables skipping.
103 private $pageOffset = 0;
104
// Converts foreign titles into local titles (see processTitle()).
105 private ImportTitleFactory $importTitleFactory;
// Applies the configured prefix to imported user names.
106 private ExternalUserNames $externalUserNames;
107
// Countable state of each page recorded by beforeImportPage(), keyed by 'title_' . <title>.
109 private $countableCache = [];
110
// When true, finishImportPage() skips the article-count adjustment.
112 private $disableStatisticsUpdate = false;
113
// Authority used for the 'edit' permission check and for page updates.
120 private Authority $performer;
121
// Services injected through the constructor.
122 private Config $config;
123 private HookRunner $hookRunner;
124 private Language $contentLanguage;
125 private NamespaceInfo $namespaceInfo;
126 private TitleFactory $titleFactory;
127 private WikiPageFactory $wikiPageFactory;
128 private UploadRevisionImporter $uploadRevisionImporter;
129 private IContentHandlerFactory $contentHandlerFactory;
130 private SlotRoleRegistry $slotRoleRegistry;
131
/**
 * Creates an importer that reads its XML from the given source.
 *
 * Registers the source with the "uploadsource" stream wrapper, opens the
 * XML reader on it, installs the default page/revision/upload/log-item
 * callbacks, and sets up a naive title factory and an "imported" user-name
 * prefix.
 *
 * @param ImportSource $source Source of the XML dump; the body needs it for
 *   UploadSourceAdapter::registerSource(), so it must be the first parameter.
 * @param Authority $performer
 * @param Config $config
 * @param HookContainer $hookContainer
 * @param Language $contentLanguage
 * @param NamespaceInfo $namespaceInfo
 * @param TitleFactory $titleFactory
 * @param WikiPageFactory $wikiPageFactory
 * @param UploadRevisionImporter $uploadRevisionImporter
 * @param IContentHandlerFactory $contentHandlerFactory
 * @param SlotRoleRegistry $slotRoleRegistry
 * @throws RuntimeException If the XML reader cannot be opened on the source.
 */
public function __construct(
	ImportSource $source,
	Authority $performer,
	Config $config,
	HookContainer $hookContainer,
	Language $contentLanguage,
	NamespaceInfo $namespaceInfo,
	TitleFactory $titleFactory,
	WikiPageFactory $wikiPageFactory,
	UploadRevisionImporter $uploadRevisionImporter,
	IContentHandlerFactory $contentHandlerFactory,
	SlotRoleRegistry $slotRoleRegistry
) {
	$this->performer = $performer;
	$this->config = $config;
	$this->hookRunner = new HookRunner( $hookContainer );
	$this->contentLanguage = $contentLanguage;
	$this->namespaceInfo = $namespaceInfo;
	$this->titleFactory = $titleFactory;
	$this->wikiPageFactory = $wikiPageFactory;
	$this->uploadRevisionImporter = $uploadRevisionImporter;
	$this->contentHandlerFactory = $contentHandlerFactory;
	$this->slotRoleRegistry = $slotRoleRegistry;

	// The stream wrapper is process-wide, so register it only once.
	if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
		stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
	}
	$this->sourceAdapterId = UploadSourceAdapter::registerSource( $source );

	$this->openReader();

	// Default callbacks
	$this->setPageCallback( $this->beforeImportPage( ... ) );
	$this->setRevisionCallback( $this->importRevision( ... ) );
	$this->setUploadCallback( $this->importUpload( ... ) );
	$this->setLogItemCallback( $this->importLogItem( ... ) );
	$this->setPageOutCallback( $this->finishImportPage( ... ) );

	$this->importTitleFactory = new NaiveImportTitleFactory(
		$this->contentLanguage,
		$this->namespaceInfo,
		$this->titleFactory
	);
	$this->externalUserNames = new ExternalUserNames( 'imported', false );
}
180
/**
 * Returns the XML reader this importer is consuming.
 *
 * @return XMLReader
 */
public function getReader() {
	$xmlReader = $this->reader;
	return $xmlReader;
}
187
/**
 * Records an XML error in the debug log.
 *
 * Despite the name, nothing is thrown: the message goes to debug() and is
 * then always written with wfDebug().
 *
 * @param string $err
 */
public function throwXmlError( $err ) {
	// Only emitted when debug mode is on.
	$this->debug( "FAILURE: $err" );
	// Always emitted.
	wfDebug( "WikiImporter XML error: $err" );
}
195
/**
 * Writes an import message to the debug log, but only in debug mode.
 *
 * @param string $data
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}

	wfDebug( "IMPORT: $data" );
}
204
/**
 * Writes an import warning to the debug log, regardless of debug mode.
 *
 * @param string $data
 */
public function warn( $data ) {
	$line = "IMPORT: $data";
	wfDebug( $line );
}
211
/**
 * Reports a notice identified by a message key.
 *
 * If a callable notice callback is set, it receives the key and the
 * parameter list; otherwise the rendered message goes to the debug log.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters
 */
public function notice( $msg, ...$params ) {
	$callback = $this->mNoticeCallback;

	if ( !is_callable( $callback ) ) {
		# No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		wfDebug( wfMessage( $msg, $params )->text() );
		return;
	}

	$callback( $msg, $params );
}
227
/**
 * Turns debug mode on or off; debug() only logs while it is on.
 *
 * @param bool $debug
 */
public function setDebug( $debug ) {
	$this->mDebug = $debug;
}
235
/**
 * Stores the "no updates" flag that is passed to every WikiRevision this
 * importer creates.
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
243
/**
 * Sets the number of the first page to import; doImport() skips the
 * top-level nodes that come before that page.
 *
 * @param int $nthPage
 */
public function setPageOffset( $nthPage ) {
	$this->pageOffset = $nthPage;
}
253
/**
 * Sets the callback that notice() reports to.
 *
 * The assignment is delegated to wfSetVar(), which returns the value that
 * was stored before the call.
 *
 * @param callable|null $callback
 * @return callable|null The previously stored callback
 */
public function setNoticeCallback( $callback ) {
	$previous = wfSetVar( $this->mNoticeCallback, $callback );
	return $previous;
}
263
/**
 * Sets the action to perform when a page's title has been resolved.
 *
 * @param callable|null $callback
 * @return callable|null The previously stored callback
 */
public function setPageCallback( $callback ) {
	// Swap in one step: the right-hand side is built before anything is assigned.
	[ $previous, $this->mPageCallback ] = [ $this->mPageCallback, $callback ];
	return $previous;
}
274
/**
 * Sets the action to perform as each page in the stream is completed.
 *
 * The callback receives the page identity, the foreign title, the revision
 * count, the successful revision count and the page info array.
 *
 * @param callable|null $callback
 * @return callable|null The previously stored callback
 */
public function setPageOutCallback( $callback ) {
	// Swap in one step: the right-hand side is built before anything is assigned.
	[ $previous, $this->mPageOutCallback ] = [ $this->mPageOutCallback, $callback ];
	return $previous;
}
289
/**
 * Sets the action to perform as each page revision is reached.
 *
 * @param callable|null $callback
 * @return callable|null The previously stored callback
 */
public function setRevisionCallback( $callback ) {
	// Swap in one step: the right-hand side is built before anything is assigned.
	[ $previous, $this->mRevisionCallback ] = [ $this->mRevisionCallback, $callback ];
	return $previous;
}
300
/**
 * Sets the action to perform as each file upload version is reached.
 *
 * @param callable|null $callback
 * @return callable|null The previously stored callback
 */
public function setUploadCallback( $callback ) {
	// Swap in one step: the right-hand side is built before anything is assigned.
	[ $previous, $this->mUploadCallback ] = [ $this->mUploadCallback, $callback ];
	return $previous;
}
311
/**
 * Sets the action to perform as each log item is reached.
 *
 * @param callable|null $callback
 * @return callable|null The previously stored callback
 */
public function setLogItemCallback( $callback ) {
	// Swap in one step: the right-hand side is built before anything is assigned.
	[ $previous, $this->mLogItemCallback ] = [ $this->mLogItemCallback, $callback ];
	return $previous;
}
322
/**
 * Sets the action to perform when site info is encountered.
 *
 * @param callable|null $callback
 * @return callable|null The previously stored callback
 */
public function setSiteInfoCallback( $callback ) {
	// Swap in one step: the right-hand side is built before anything is assigned.
	[ $previous, $this->mSiteInfoCallback ] = [ $this->mSiteInfoCallback, $callback ];
	return $previous;
}
333
/**
 * Sets the factory object used to convert ForeignTitle objects into local
 * Title objects.
 *
 * @param ImportTitleFactory $factory
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
342
/**
 * Selects the namespace that pages are imported into.
 *
 * Returns true when $namespace is null (namespaces are not overridden) or
 * when it is non-negative and NamespaceInfo reports it as existing; returns
 * false for any other value.
 *
 * NOTE(review): this listing is missing the statement that opens the factory
 * construction in each branch (the lines just before the constructor
 * arguments). Restore them from the original file before changing this method.
 *
 * @param int|string|null $namespace
 * @return bool
 */
348 public function setTargetNamespace( $namespace ) {
349 if ( $namespace === null ) {
350 // Don't override namespaces
353 $this->contentLanguage,
354 $this->namespaceInfo,
355 $this->titleFactory
356 )
357 );
358 return true;
359 } elseif (
360 $namespace >= 0 &&
361 $this->namespaceInfo->exists( intval( $namespace ) )
362 ) {
// Normalise to an integer before handing the namespace to the factory.
363 $namespace = intval( $namespace );
366 $this->namespaceInfo,
367 $this->titleFactory,
368 $namespace
369 )
370 );
371 return true;
372 } else {
373 return false;
374 }
375 }
376
/**
 * Selects a root page under which pages are imported.
 *
 * A null $rootpage means "no root page"; an empty string changes nothing.
 * For any other value, trailing slashes are trimmed and the text is parsed
 * as a title. The returned status is made fatal when the title is invalid or
 * external ('import-rootpage-invalid') or when its namespace has no subpages
 * ('import-rootpage-nosubpage').
 *
 * NOTE(review): this listing is missing the statements that open the factory
 * construction in the null branch and in the final else branch. Restore them
 * from the original file before changing this method.
 *
 * @param string|null $rootpage
 * @return Status
 */
382 public function setTargetRootPage( $rootpage ) {
383 $status = Status::newGood();
384 $nsInfo = $this->namespaceInfo;
385 if ( $rootpage === null ) {
386 // No rootpage
389 $this->contentLanguage,
390 $nsInfo,
391 $this->titleFactory
392 )
393 );
394 } elseif ( $rootpage !== '' ) {
395 $rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
396 $title = Title::newFromText( $rootpage );
397
398 if ( !$title || $title->isExternal() ) {
399 $status->fatal( 'import-rootpage-invalid' );
400 } elseif ( !$nsInfo->hasSubpages( $title->getNamespace() ) ) {
// The main namespace has no name of its own, so use the 'blanknamespace' message.
401 $displayNSText = $title->getNamespace() === NS_MAIN
402 ? wfMessage( 'blanknamespace' )->text()
403 : $this->contentLanguage->getNsText( $title->getNamespace() );
404 $status->fatal( 'import-rootpage-nosubpage', $displayNSText );
405 } else {
406 // set namespace to 'all', so the namespace check in processTitle() can pass
407 $this->setTargetNamespace( null );
410 $nsInfo,
411 $this->titleFactory,
412 $title
413 )
414 );
415 }
416 }
417 return $status;
418 }
419
/**
 * Sets the directory that the <rel> path of an upload is resolved against.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
426
/**
 * Controls whether parsed <upload> elements are actually imported.
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
433
/**
 * Replaces the user-name handling used for imported contributors.
 *
 * @param string $usernamePrefix Passed to ExternalUserNames as the prefix
 * @param bool $assignKnownUsers Passed through to ExternalUserNames
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$userNames = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
	$this->externalUserNames = $userNames;
}
442
/**
 * Turns off the article-count adjustment that finishImportPage() would
 * otherwise make after each page.
 */
public function disableStatisticsUpdate() {
	$this->disableStatisticsUpdate = true;
}
450
/**
 * Default per-page callback: records whether the target page is countable
 * before any of its revisions are imported.
 *
 * @param array $titleAndForeignTitle Local title at index 0 (the foreign
 *   title that follows it is not used here)
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$localTitle = $titleAndForeignTitle[0];
	$wikiPage = $this->wikiPageFactory->newFromTitle( $localTitle );

	$cacheKey = 'title_' . $localTitle->getPrefixedText();
	$this->countableCache[$cacheKey] = $wikiPage->isCountable();

	return true;
}
463
/**
 * Default per-revision callback: imports the revision.
 *
 * Reports 'import-error-bad-location' and returns false when the revision's
 * content handler cannot be used on its title. Otherwise returns the result
 * of WikiRevision::importOldRevision(); if that fails, reports
 * 'import-error-unserialize' and returns false.
 *
 * NOTE(review): the line that closes the try block and opens the catch
 * clause is missing from this listing (between importOldRevision() and the
 * second notice() call). Restore it from the original file before editing.
 *
 * @param WikiRevision $revision
 * @return bool
 */
469 public function importRevision( $revision ) {
470 if ( !$revision->getContentHandler()->canBeUsedOn( $revision->getTitle() ) ) {
471 $this->notice( 'import-error-bad-location',
472 $revision->getTitle()->getPrefixedText(),
473 $revision->getID(),
474 $revision->getModel(),
475 $revision->getFormat()
476 );
477
478 return false;
479 }
480
481 try {
482 return $revision->importOldRevision();
484 $this->notice( 'import-error-unserialize',
485 $revision->getTitle()->getPrefixedText(),
486 $revision->getID(),
487 $revision->getModel(),
488 $revision->getFormat()
489 );
490 }
491
492 return false;
493 }
494
/**
 * Default log-item callback: asks the revision object to import itself as a
 * log entry.
 *
 * @param WikiRevision $revision
 * @return mixed Whatever WikiRevision::importLogItem() returns
 */
public function importLogItem( $revision ) {
	$result = $revision->importLogItem();
	return $result;
}
503
/**
 * Default upload callback: hands the revision to the upload revision
 * importer.
 *
 * @param WikiRevision $revision
 * @return bool Whether the resulting status is good
 */
public function importUpload( $revision ) {
	return $this->uploadRevisionImporter->import( $revision )->isGood();
}
513
/**
 * Default page-out callback: adjusts the article count if the page's
 * countable state changed during the import, then runs the AfterImportPage
 * hook.
 *
 * @param PageIdentity $pageIdentity Page that was imported
 * @param ForeignTitle $foreignTitle Title of the page on the source wiki
 * @param int $revCount Number of revisions seen for the page
 * @param int $sRevCount Number of revisions imported successfully
 * @param array $pageInfo Page information collected from the XML
 * @return mixed Result of HookRunner::onAfterImportPage()
 */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	$title = Title::newFromPageIdentity( $pageIdentity );

	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$page = $this->wikiPageFactory->newFromTitle( $pageIdentity );

		$page->loadPageData( IDBAccessObject::READ_LATEST );
		$rev = $page->getRevisionRecord();
		if ( $rev === null ) {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
				' because WikiPage::getRevisionRecord() returned null' );
		} else {
			$update = $page->newPageUpdater( $this->performer )->prepareUpdate();
			// Build the key exactly as beforeImportPage() does. A key in any
			// other format never matches the cached entry, and the article
			// count would then silently never be adjusted.
			$countKey = 'title_' . $title->getPrefixedText();
			$countable = $update->isCountable();
			if ( array_key_exists( $countKey, $this->countableCache ) &&
				$countable != $this->countableCache[$countKey]
			) {
				// +1 when the page became countable, -1 when it stopped being countable.
				DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
					'articles' => ( (int)$countable - (int)$this->countableCache[$countKey] )
				] ) );
			}
		}
	}

	return $this->hookRunner->onAfterImportPage( $title, $foreignTitle,
		$revCount, $sRevCount, $pageInfo );
}
558
/**
 * Passes the parsed site info to the registered callback, if any.
 *
 * @param array $siteInfo
 * @return mixed The callback's result, or false when no callback is set
 */
private function siteInfoCallback( $siteInfo ) {
	$callback = $this->mSiteInfoCallback;
	if ( !$callback ) {
		return false;
	}

	return $callback( $siteInfo, $this );
}
571
/**
 * Notifies the page callback, if any, that a new "<page>" has been reached.
 *
 * @param array $title Local title and foreign title, as returned by processTitle()
 */
public function pageCallback( $title ) {
	$callback = $this->mPageCallback;
	if ( !$callback ) {
		return;
	}

	$callback( $title );
}
581
/**
 * Notifies the page-out callback, if any, that a "</page>" has been reached.
 *
 * @param PageIdentity $pageIdentity
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sucCount Number of revisions for which the callback returned true
 * @param array $pageInfo
 */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sucCount, $pageInfo ) {
	$callback = $this->mPageOutCallback;
	if ( !$callback ) {
		return;
	}

	$callback( $pageIdentity, $foreignTitle, $revCount, $sucCount, $pageInfo );
}
596
/**
 * Passes a revision to the registered revision callback, if any.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's result, or false when no callback is set
 */
private function revisionCallback( $revision ) {
	$callback = $this->mRevisionCallback;
	if ( !$callback ) {
		return false;
	}

	return $callback( $revision, $this );
}
609
/**
 * Passes a log item to the registered log-item callback, if any.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's result, or false when no callback is set
 */
private function logItemCallback( $revision ) {
	$callback = $this->mLogItemCallback;
	if ( !$callback ) {
		return false;
	}

	return $callback( $revision, $this );
}
622
/**
 * Reads an attribute of the node the reader is currently on.
 *
 * @param string $attr Attribute name
 * @return string The attribute value, or '' when the reader returns null
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );
	return $value ?? '';
}
632
/**
 * Collects the text of the current element.
 *
 * Advances the reader, concatenating text, CDATA and significant-whitespace
 * nodes, until the first end-element node. An empty element yields "" without
 * moving the reader. If the input ends before an end element is seen, the
 * reader is closed and '' is returned.
 *
 * @return string
 */
public function nodeContents() {
	$reader = $this->reader;
	if ( $reader->isEmptyElement ) {
		return "";
	}

	$textNodeTypes = [
		XMLReader::TEXT,
		XMLReader::CDATA,
		XMLReader::SIGNIFICANT_WHITESPACE,
	];

	$buffer = "";
	while ( $reader->read() ) {
		$nodeType = $reader->nodeType;
		if ( $nodeType === XMLReader::END_ELEMENT ) {
			return $buffer;
		}
		if ( in_array( $nodeType, $textNodeTypes, true ) ) {
			$buffer .= $reader->value;
		}
	}

	// Ran out of input before the element was closed.
	$reader->close();
	return '';
}
660
/**
 * Runs the import over the whole XML stream.
 *
 * First calls syntaxCheckXML(), then checks that the first node is
 * <mediawiki>, then walks the nodes that follow and dispatches <siteinfo>,
 * <page> and <logitem> to their handlers. The reader is closed and the
 * previous entity-loader setting restored when the method exits, whether
 * normally or by exception.
 *
 * @return bool Always true when no exception is thrown
 * @throws NormalizedException If the first node is not <mediawiki> and libxml
 *   reported an error
 * @throws UnexpectedValueException If the first node is not <mediawiki> and
 *   libxml reported no error
 */
666 public function doImport() {
667 $this->syntaxCheckXML();
668
669 // Calls to reader->read need to be wrapped in calls to
670 // libxml_disable_entity_loader() to avoid local file
671 // inclusion attacks (T48932).
672 // phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
673 $oldDisable = @libxml_disable_entity_loader( true );
674 try {
675 $this->reader->read();
676
677 if ( $this->reader->localName != 'mediawiki' ) {
678 // phpcs:ignore Generic.PHP.NoSilencedErrors
679 @libxml_disable_entity_loader( $oldDisable );
680 $error = libxml_get_last_error();
681 if ( $error ) {
682 throw new NormalizedException( "XML error at line {line}: {message}", [
683 'line' => $error->line,
684 'message' => $error->message,
685 ] );
686 } else {
687 throw new UnexpectedValueException(
688 "Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
689 );
690 }
691 }
692 $this->debug( "<mediawiki> tag is correct." );
693
694 $this->debug( "Starting primary dump processing loop." );
695
696 $keepReading = $this->reader->read();
697 $skip = false;
698 $pageCount = 0;
699 while ( $keepReading ) {
700 $tag = $this->reader->localName;
// With a page offset set, step over whole subtrees with next() until
// the pageOffset-th <page> node has been reached.
701 if ( $this->pageOffset ) {
702 if ( $tag === 'page' ) {
703 $pageCount++;
704 }
705 if ( $pageCount < $this->pageOffset ) {
706 $keepReading = $this->reader->next();
707 continue;
708 }
709 }
710 $type = $this->reader->nodeType;
711
// When the hook returns false, none of the built-in handling below runs
// for this node.
712 if ( !$this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
713 // Do nothing
714 } elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
715 break;
716 } elseif ( $tag == 'siteinfo' ) {
717 $this->handleSiteInfo();
718 } elseif ( $tag == 'page' ) {
719 $this->handlePage();
720 } elseif ( $tag == 'logitem' ) {
721 $this->handleLogItem();
722 } elseif ( $tag != '#text' ) {
723 $this->warn( "Unhandled top-level XML tag $tag" );
724
725 $skip = true;
726 }
727
// Unknown tags are stepped over as a whole with next(); everything else
// advances node by node with read().
728 if ( $skip ) {
729 $keepReading = $this->reader->next();
730 $skip = false;
731 $this->debug( "Skip" );
732 } else {
733 $keepReading = $this->reader->read();
734 }
735 }
736 } finally {
737 // phpcs:ignore Generic.PHP.NoSilencedErrors
738 @libxml_disable_entity_loader( $oldDisable );
739 $this->reader->close();
740 }
741
742 return true;
743 }
744
/**
 * Parses a <siteinfo> block and passes the result to the site-info callback.
 *
 * Namespace names are collected in $this->foreignNamespaces, keyed by the
 * "key" attribute of each <namespace> tag. The simple fields are copied into
 * the array given to the callback, which also carries the namespaces under
 * '_namespaces'.
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );

	// Fields that can just be stuffed in the siteInfo object
	$plainFields = [ 'sitename', 'base', 'generator', 'case' ];
	$siteInfo = [];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'siteinfo';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( $tag == 'namespace' ) {
			// Read the attribute before nodeContents() moves the reader on.
			$namespaceKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$namespaceKey] = $this->nodeContents();
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$siteInfo[$tag] = $this->nodeContents();
		}
	}

	$siteInfo['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $siteInfo );
}
771
/**
 * Parses a <logitem> block and passes the collected data to
 * processLogItem().
 *
 * The ImportHandleLogItemXMLTag hook runs for every node; when it returns
 * false the built-in handling of that node is skipped.
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );

	// Fields that can just be stuffed in the pageInfo object
	$plainFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];
	$logInfo = [];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'logitem';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		$useBuiltinHandling = $this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo );
		if ( !$useBuiltinHandling ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$logInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$logInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $logInfo );
}
801
/**
 * Builds a WikiRevision from parsed log-item data and passes it to the
 * log-item callback.
 *
 * 'type' and 'action' are read unconditionally; every other field is applied
 * only when present. A contributor with neither a user name nor an IP is
 * recorded as the prefixed 'Unknown user'.
 *
 * @param array $logInfo
 * @return mixed Result of logItemCallback()
 */
private function processLogItem( $logInfo ) {
	$logEntry = new WikiRevision();

	if ( isset( $logInfo['id'] ) ) {
		$logEntry->setID( $logInfo['id'] );
	}

	$logEntry->setType( $logInfo['type'] );
	$logEntry->setAction( $logInfo['action'] );

	if ( isset( $logInfo['timestamp'] ) ) {
		$logEntry->setTimestamp( $logInfo['timestamp'] );
	}
	if ( isset( $logInfo['params'] ) ) {
		$logEntry->setParams( $logInfo['params'] );
	}
	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$logTitle = Title::newFromText( $logInfo['logtitle'] );
		$logEntry->setTitle( $logTitle );
	}

	$logEntry->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$logEntry->setComment( $logInfo['comment'] );
	}

	if ( isset( $logInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix( $logInfo['contributor']['username'] );
		$logEntry->setUsername( $prefixedName );
	} elseif ( isset( $logInfo['contributor']['ip'] ) ) {
		$logEntry->setUserIP( $logInfo['contributor']['ip'] );
	} else {
		$logEntry->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	return $this->logItemCallback( $logEntry );
}
844
/**
 * Parses a <page> block.
 *
 * Simple fields are collected into $pageInfo. The title is resolved with
 * processTitle() when the first <revision> or <upload> is reached; if that
 * fails, the rest of the page is skipped. Each revision or upload is handed
 * to handleRevision() or handleUpload(). When a valid title was found, the
 * page-out callback runs after the closing tag.
 *
 * NOTE(review): resolving the title reads $pageInfo['title'] without checking
 * that it is set, so it relies on <title> coming before the first <revision>
 * or <upload> — confirm the dump format guarantees that.
 */
845 private function handlePage() {
846 // Handle page data.
847 $this->debug( "Enter page handler." );
848 $pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];
849
850 // Fields that can just be stuffed in the pageInfo object
851 $normalFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];
852
853 $skip = false;
854 $badTitle = false;
855
// next() steps over the current node's subtree; read() descends into it.
856 while ( $skip ? $this->reader->next() : $this->reader->read() ) {
857 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
858 $this->reader->localName == 'page' ) {
859 break;
860 }
861
// Skipping applies to a single node only, so reset it on every iteration.
862 $skip = false;
863
864 $tag = $this->reader->localName;
865
866 if ( $badTitle ) {
867 // The title is invalid, bail out of this page
868 $skip = true;
869 } elseif ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
870 // Do nothing
871 } elseif ( in_array( $tag, $normalFields ) ) {
872 // An XML snippet:
873 // <page>
874 // <id>123</id>
875 // <title>Page</title>
876 // <redirect title="NewTitle"/>
877 // ...
878 // Because the redirect tag is built differently, we need special handling for that case.
879 if ( $tag == 'redirect' ) {
880 $pageInfo[$tag] = $this->nodeAttribute( 'title' );
881 } else {
882 $pageInfo[$tag] = $this->nodeContents();
883 }
884 } elseif ( $tag == 'revision' || $tag == 'upload' ) {
// Resolve the title only once per page, at the first revision or upload.
885 if ( !isset( $title ) ) {
886 $title = $this->processTitle( $pageInfo['title'],
887 $pageInfo['ns'] ?? null );
888
889 // $title is either an array of two titles or false.
890 if ( is_array( $title ) ) {
891 $this->pageCallback( $title );
892 [ $pageInfo['_title'], $foreignTitle ] = $title;
893 } else {
894 $badTitle = true;
895 $skip = true;
896 }
897 }
898
899 if ( $title ) {
900 if ( $tag == 'revision' ) {
901 $this->handleRevision( $pageInfo );
902 } else {
903 $this->handleUpload( $pageInfo );
904 }
905 }
906 } elseif ( $tag != '#text' ) {
907 $this->warn( "Unhandled page XML tag $tag" );
908 $skip = true;
909 }
910 }
911
912 // @note $pageInfo is only set if a valid $title is processed above with
913 // no error. If we have a valid $title, then pageCallback is called
914 // above, $pageInfo['title'] is set and we do pageOutCallback here.
915 // If $pageInfo['_title'] is not set, then $foreignTitle is also not
916 // set since they both come from $title above.
917 if ( array_key_exists( '_title', $pageInfo ) ) {
919 $title = $pageInfo['_title'];
920 $this->pageOutCallback(
921 $title,
922 // @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
923 $foreignTitle,
924 $pageInfo['revisionCount'],
925 $pageInfo['successfulRevisionCount'],
926 $pageInfo
927 );
928 }
929 }
930
/**
 * Parses a <revision> block, then processes it and updates the page's
 * revision counters.
 *
 * NOTE(review): unlike handlePage(), the skip flag is never reset once an
 * unhandled tag has set it, so every later node in this revision is reached
 * with next() instead of read(). That behaviour is kept as-is here.
 *
 * @param array &$pageInfo Page data; 'revisionCount' is always incremented,
 *   'successfulRevisionCount' only when processRevision() returns a truthy value
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );

	$plainFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
		'model', 'format', 'text', 'sha1' ];
	$revisionInfo = [];
	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'revision';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		$useBuiltinHandling = $this->hookRunner->onImportHandleRevisionXMLTag(
			$this, $pageInfo, $revisionInfo );
		if ( !$useBuiltinHandling ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$revisionInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'content' ) {
			// We can have multiple content tags, so make this an array.
			$revisionInfo[$tag][] = $this->handleContent();
		} elseif ( $tag == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			$skip = true;
		}
	}

	$pageInfo['revisionCount']++;
	$imported = $this->processRevision( $pageInfo, $revisionInfo );
	if ( $imported ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
973
/**
 * Parses a <content> block (one slot of a revision).
 *
 * NOTE(review): as in handleRevision(), the skip flag stays set once an
 * unhandled tag has been seen. That behaviour is kept as-is here.
 *
 * @return array Collected fields; possible keys are role, origin, model,
 *   format and text
 */
private function handleContent(): array {
	$this->debug( "Enter content handler" );

	$plainFields = [ 'role', 'origin', 'model', 'format', 'text' ];
	$contentInfo = [];
	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'content';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		$useBuiltinHandling = $this->hookRunner->onImportHandleContentXMLTag(
			$this, $contentInfo );
		if ( !$useBuiltinHandling ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$contentInfo[$tag] = $this->nodeContents();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled content XML tag $tag" );
			$skip = true;
		}
	}

	return $contentInfo;
}
1004
/**
 * Turns the serialized text of one slot into a content object.
 *
 * @param PageIdentity $page Page the content belongs to; used to pick the
 *   default content model when none is given
 * @param int|string $revisionId Used only in the size-limit error message
 * @param array $contentInfo Must contain 'text'; may contain 'model' and 'role'
 * @return Content
 * @throws InvalidArgumentException If 'text' is missing
 * @throws RuntimeException If the text exceeds MaxArticleSize for one of the
 *   size-checked content models
 */
private function makeContent( PageIdentity $page, $revisionId, $contentInfo ) {
	$maxArticleSize = $this->config->get( MainConfigNames::MaxArticleSize );

	if ( !isset( $contentInfo['text'] ) ) {
		throw new InvalidArgumentException( 'Missing text field in import.' );
	}

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Testing for revisions with only listed
	// content models, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	$sizeCheckedModels = [
		'wikitext',
		'css',
		'json',
		'javascript',
		'text',
		''
	];
	$modelIsSizeChecked = !isset( $contentInfo['model'] )
		|| in_array( $contentInfo['model'], $sizeCheckedModels );

	if ( $modelIsSizeChecked && strlen( $contentInfo['text'] ) > $maxArticleSize * 1024 ) {
		$subject = $revisionId ?
			"the revision with ID $revisionId" :
			'a revision';
		throw new RuntimeException( 'The text of ' . $subject .
			" exceeds the maximum allowable size ({$maxArticleSize} KiB)" );
	}

	$role = $contentInfo['role'] ?? SlotRecord::MAIN;
	if ( isset( $contentInfo['model'] ) ) {
		$model = $contentInfo['model'];
	} else {
		// No explicit model: use the default model of the slot role for this page.
		$roleHandler = $this->slotRoleRegistry->getRoleHandler( $role );
		$model = $roleHandler->getDefaultModel( $page );
	}

	$handler = $this->contentHandlerFactory->getContentHandler( $model );
	$transformedText = $handler->importTransform( $contentInfo['text'] );

	return $handler->unserializeContent( $transformedText );
}
1051
/**
 * Builds a WikiRevision from parsed revision data and passes it to the
 * revision callback.
 *
 * The main slot is built from the revision-level fields; each entry under
 * 'content' adds a further slot and must name its role.
 *
 * @param array $pageInfo Must contain '_title'
 * @param array $revisionInfo
 * @return mixed Result of revisionCallback()
 * @throws RuntimeException If an extra slot has no role
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	$revision = new WikiRevision();

	$revId = $revisionInfo['id'] ?? 0;
	if ( $revId ) {
		$revision->setID( $revisionInfo['id'] );
	}

	$title = $pageInfo['_title'];
	$revision->setTitle( $title );

	$mainContent = $this->makeContent( $title, $revId, $revisionInfo );
	$revision->setContent( SlotRecord::MAIN, $mainContent );

	$extraSlots = $revisionInfo['content'] ?? [];
	foreach ( $extraSlots as $slotInfo ) {
		if ( !isset( $slotInfo['role'] ) ) {
			throw new RuntimeException( "Missing role for imported slot." );
		}

		$slotContent = $this->makeContent( $title, $revId, $slotInfo );
		$revision->setContent( $slotInfo['role'], $slotContent );
	}

	$revision->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$revision->setComment( $revisionInfo['comment'] );
	}

	// The presence of the tag is what marks a minor edit; its value is not used.
	if ( isset( $revisionInfo['minor'] ) ) {
		$revision->setMinor( true );
	}

	if ( isset( $revisionInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix(
			$revisionInfo['contributor']['username'] );
		$revision->setUsername( $prefixedName );
	} elseif ( isset( $revisionInfo['contributor']['ip'] ) ) {
		$revision->setUserIP( $revisionInfo['contributor']['ip'] );
	} else {
		$revision->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	if ( isset( $revisionInfo['sha1'] ) ) {
		$revision->setSha1Base36( $revisionInfo['sha1'] );
	}

	$revision->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $revision );
}
1104
/**
 * Parses an <upload> block and, if uploads are being imported, processes it.
 *
 * Inline base64 <contents> are written to a temporary file only when uploads
 * are being imported. Without that check the file would be created, never
 * used and never removed.
 *
 * NOTE(review): as in handleRevision(), the skip flag stays set once an
 * unhandled tag has been seen. That behaviour is kept as-is here.
 *
 * @param array &$pageInfo
 * @return mixed Result of processUpload(), or null when uploads are not
 *   being imported
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );
	$uploadInfo = [];

	$normalFields = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'upload' ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$uploadInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contents' ) {
			// Always consume the element so the reader ends up in the same
			// place whether or not the data is kept.
			$contents = $this->nodeContents();
			$encoding = $this->reader->getAttribute( 'encoding' );
			if ( $this->mImportUploads && $encoding === 'base64' ) {
				$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
				$uploadInfo['isTempSrc'] = true;
			}
		} elseif ( $tag == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$skip = true;
		}
	}

	// A file found under the image base path takes precedence over inline contents.
	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		$path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
		if ( file_exists( $path ) ) {
			$uploadInfo['fileSrc'] = $path;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	if ( $this->mImportUploads ) {
		return $this->processUpload( $pageInfo, $uploadInfo );
	}
}
1157
/**
 * Writes the given data to a new temporary file.
 *
 * @param string $contents Data to write
 * @return string Path of the temporary file
 * @throws RuntimeException If the file cannot be created or written
 */
private function dumpTemp( $contents ) {
	$filename = tempnam( wfTempDir(), 'importupload' );
	if ( $filename === false ) {
		throw new RuntimeException( 'Could not create a temporary file for an imported upload.' );
	}

	if ( file_put_contents( $filename, $contents ) === false ) {
		// Do not leave an empty or partial file behind.
		if ( is_file( $filename ) ) {
			unlink( $filename );
		}
		throw new RuntimeException( "Could not write imported upload data to $filename." );
	}

	return $filename;
}
1167
/**
 * Builds a WikiRevision from parsed upload data and passes it to the upload
 * callback.
 *
 * @param array $pageInfo Must contain 'id' and '_title'
 * @param array $uploadInfo 'timestamp', 'filename', 'src', 'size' and
 *   'comment' are read unconditionally; the other keys are optional
 * @return mixed Result of the upload callback
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$upload = new WikiRevision();
	$revId = $pageInfo['id'];
	$title = $pageInfo['_title'];

	// T292348: text key may be absent, force addition if null
	$uploadInfo['text'] ??= '';
	$content = $this->makeContent( $title, $revId, $uploadInfo );

	$upload->setTitle( $title );
	$upload->setID( $revId );
	$upload->setTimestamp( $uploadInfo['timestamp'] );
	$upload->setContent( SlotRecord::MAIN, $content );
	$upload->setFilename( $uploadInfo['filename'] );

	if ( isset( $uploadInfo['archivename'] ) ) {
		$upload->setArchiveName( $uploadInfo['archivename'] );
	}

	$upload->setSrc( $uploadInfo['src'] );

	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$isTemporary = !empty( $uploadInfo['isTempSrc'] );
		$upload->setFileSrc( $uploadInfo['fileSrc'], $isTemporary );
	}

	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$upload->setSha1Base36( $uploadInfo['sha1base36'] );
	}

	$upload->setSize( intval( $uploadInfo['size'] ) );
	$upload->setComment( $uploadInfo['comment'] );

	// Unlike revisions and log items, no user is set when the contributor
	// has neither a user name nor an IP.
	if ( isset( $uploadInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix(
			$uploadInfo['contributor']['username'] );
		$upload->setUsername( $prefixedName );
	} elseif ( isset( $uploadInfo['contributor']['ip'] ) ) {
		$upload->setUserIP( $uploadInfo['contributor']['ip'] );
	}

	$upload->setNoUpdates( $this->mNoUpdates );

	return ( $this->mUploadCallback )( $upload );
}
1212
/**
 * Parses a <contributor> block.
 *
 * @return array Any of the keys 'id', 'ip' and 'username' found in the
 *   block; empty for an empty element
 */
private function handleContributor() {
	$this->debug( "Enter contributor handler." );

	$contributor = [];
	if ( $this->reader->isEmptyElement ) {
		return $contributor;
	}

	$knownFields = [ 'id', 'ip', 'username' ];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'contributor';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;
		if ( !in_array( $tag, $knownFields ) ) {
			continue;
		}

		$contributor[$tag] = $this->nodeContents();
	}

	return $contributor;
}
1241
/**
 * Converts a title from the dump into a local title and checks that it can
 * be imported.
 *
 * A notice is issued and false returned when the title is invalid, is an
 * interwiki title, cannot exist as a page, or cannot be edited by the
 * performer.
 *
 * @param string $text Title text from the dump
 * @param string|null $ns Namespace ID from the dump, if any
 * @return array|false Local title and foreign title, or false on failure
 */
private function processTitle( $text, $ns = null ) {
	// Use the namespace names from <siteinfo> when the dump provided them.
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory( $this->contentLanguage )
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
	$title = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	if ( $title === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}

	if ( $title->isExternal() ) {
		$this->notice( 'import-error-interwiki', $title->getPrefixedText() );
		return false;
	}

	if ( !$title->canExist() ) {
		$this->notice( 'import-error-special', $title->getPrefixedText() );
		return false;
	}

	if ( !$this->performer->definitelyCan( 'edit', $title ) ) {
		# Do not import if the importing wiki user cannot edit this page
		$this->notice( 'import-error-edit', $title->getPrefixedText() );
		return false;
	}

	return [ $title, $foreignTitle ];
}
1281
/**
 * Opens the XML reader on the registered import source and stores it in
 * $this->reader.
 *
 * The previous entity-loader setting is restored on every exit path.
 *
 * @throws RuntimeException If the reader cannot be opened
 */
private function openReader() {
	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( false );

	try {
		// A static call, to avoid https://github.com/php/php-src/issues/11548
		$reader = XMLReader::open(
			'uploadsource://' . $this->sourceAdapterId, null, LIBXML_PARSEHUGE );

		if ( !( $reader instanceof XMLReader ) ) {
			// libxml_get_last_error() returns false when nothing was
			// recorded, so it must not be dereferenced unconditionally.
			$error = libxml_get_last_error();
			$detail = $error ? $error->message : 'no libxml error information available';
			throw new RuntimeException(
				'Encountered an internal error while initializing WikiImporter object: ' . $detail
			);
		}

		$this->reader = $reader;
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
	}
}
1311
/**
 * Reads the whole document once to detect XML errors before anything is
 * imported, then rewinds the source and reopens the reader.
 *
 * Does nothing for sources that cannot be rewound.
 *
 * @throws RuntimeException If libxml reported an error while reading
 */
private function syntaxCheckXML() {
	$sourceIsSeekable = UploadSourceAdapter::isSeekableSource( $this->sourceAdapterId );
	if ( !$sourceIsSeekable ) {
		return;
	}

	AtEase::suppressWarnings();
	$oldDisable = libxml_disable_entity_loader( false );
	try {
		// Read through every node; only the error state afterwards matters.
		while ( $this->reader->read() ) {
			continue;
		}

		$error = libxml_get_last_error();
		if ( $error ) {
			$errorMessage = 'XML error at line ' . $error->line . ': ' . $error->message;
			wfDebug( __METHOD__ . ': Invalid xml found - ' . $errorMessage );
			throw new RuntimeException( $errorMessage );
		}
	} finally {
		libxml_disable_entity_loader( $oldDisable );
		AtEase::restoreWarnings();
		$this->reader->close();
	}

	// Reopen for the real import
	UploadSourceAdapter::seekSource( $this->sourceAdapterId, 0 );
	$this->openReader();
}
1339}
1340
// Also registers the class under the global name 'WikiImporter'
// (presumably for code that predates the MediaWiki\Import namespace — confirm).
1342class_alias( WikiImporter::class, 'WikiImporter' );
const NS_MAIN
Definition Defines.php:51
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTempDir()
Tries to get the system directory for temporary files.
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest. If source is NULL, it just returns the value and leaves dest unchanged.
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Reporting callback.
Defer callable updates to run later in the PHP process.
Class for handling updates to the site_stats table.
Exception representing a failure to serialize or unserialize a content object.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
static registerSource(ImportSource $source)
static seekSource(string $id, int $offset)
XML file reader for the page data importer.
disableStatisticsUpdate()
Statistics updates can take a lot of time.
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
setDebug( $debug)
Set debug mode...
__construct(ImportSource $source, Authority $performer, Config $config, HookContainer $hookContainer, Language $contentLanguage, NamespaceInfo $namespaceInfo, TitleFactory $titleFactory, WikiPageFactory $wikiPageFactory, UploadRevisionImporter $uploadRevisionImporter, IContentHandlerFactory $contentHandlerFactory, SlotRoleRegistry $slotRoleRegistry)
Creates an ImportXMLReader drawing from the source provided.
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
importRevision( $revision)
Default per-revision callback, performs the import.
finishImportPage(PageIdentity $pageIdentity, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
setNoUpdates( $noupdates)
Set 'no updates' mode.
doImport()
Primary entry point.
setPageOffset( $nthPage)
Sets 'pageOffset' value.
importLogItem( $revision)
Default per-revision callback, performs the import.
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
setLogItemCallback( $callback)
Sets the action to perform as each log item reached.
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
setNoticeCallback( $callback)
Set a callback that displays notice messages.
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
importUpload( $revision)
Dummy for now...
Base class for language-specific code.
Definition Language.php:69
A class containing constants representing the names of configuration variables.
const MaxArticleSize
Name constant for the MaxArticleSize setting, for use with Config::get()
Helper class for mapping page value objects to a string key.
Service for creating WikiPage objects.
Value object representing a content slot associated with a page revision.
A registry service for SlotRoleHandlers, used to define which slot roles are available on which page.
Generic operation result class Has warning/error list, boolean status and arbitrary value.
Definition Status.php:44
A simple, immutable structure to hold the title of a page on a foreign MediaWiki installation.
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them ba...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Creates Title objects.
Represents a title within MediaWiki.
Definition Title.php:69
Class to parse and build external user names.
Value object representing a message parameter with one of the types from ParamType.
Interface for configuration instances.
Definition Config.php:18
Content objects represent page content, e.g.
Definition Content.php:28
Source interface for XML import.
Interface for objects (potentially) representing an editable wiki page.
This interface represents the authority associated with the current execution context,...
Definition Authority.php:23
Represents an object that can convert page titles on a foreign wiki (ForeignTitle objects) into page ...
Interface for database access objects.
$source