MediaWiki master
WikiImporter.php
Go to the documentation of this file.
1<?php
54use Wikimedia\AtEase\AtEase;
57use Wikimedia\NormalizedException\NormalizedException;
59
68 private $reader;
69
71 private $sourceAdapterId;
72
74 private $foreignNamespaces = null;
75
77 private $mLogItemCallback;
78
80 private $mUploadCallback;
81
83 private $mRevisionCallback;
84
86 private $mPageCallback;
87
89 private $mSiteInfoCallback;
90
92 private $mPageOutCallback;
93
95 private $mNoticeCallback;
96
98 private $mDebug;
99
101 private $mImportUploads;
102
104 private $mImageBasePath;
105
107 private $mNoUpdates = false;
108
110 private $pageOffset = 0;
111
112 private ImportTitleFactory $importTitleFactory;
113 private ExternalUserNames $externalUserNames;
114
116 private $countableCache = [];
117
119 private $disableStatisticsUpdate = false;
120
127 private Authority $performer;
128
129 private Config $config;
130 private HookRunner $hookRunner;
131 private Language $contentLanguage;
132 private NamespaceInfo $namespaceInfo;
133 private TitleFactory $titleFactory;
134 private WikiPageFactory $wikiPageFactory;
135 private UploadRevisionImporter $uploadRevisionImporter;
136 private IContentHandlerFactory $contentHandlerFactory;
137 private SlotRoleRegistry $slotRoleRegistry;
138
/**
 * Creates an importer that reads from the given source.
 *
 * Registers the 'uploadsource' stream wrapper when it is not registered yet,
 * registers the source with UploadSourceAdapter, opens the XML reader, and
 * installs the default callbacks, import title factory and user name handling.
 *
 * @param mixed $source Import source handed to UploadSourceAdapter::registerSource()
 *   (presumably an ImportSource; TODO confirm and add the type declaration)
 * @param Authority $performer
 * @param Config $config
 * @param HookContainer $hookContainer
 * @param Language $contentLanguage
 * @param NamespaceInfo $namespaceInfo
 * @param TitleFactory $titleFactory
 * @param WikiPageFactory $wikiPageFactory
 * @param UploadRevisionImporter $uploadRevisionImporter
 * @param IContentHandlerFactory $contentHandlerFactory
 * @param SlotRoleRegistry $slotRoleRegistry
 */
public function __construct(
	$source,
	Authority $performer,
	Config $config,
	HookContainer $hookContainer,
	Language $contentLanguage,
	NamespaceInfo $namespaceInfo,
	TitleFactory $titleFactory,
	WikiPageFactory $wikiPageFactory,
	UploadRevisionImporter $uploadRevisionImporter,
	IContentHandlerFactory $contentHandlerFactory,
	SlotRoleRegistry $slotRoleRegistry
) {
	$this->performer = $performer;
	$this->config = $config;
	$this->hookRunner = new HookRunner( $hookContainer );
	$this->contentLanguage = $contentLanguage;
	$this->namespaceInfo = $namespaceInfo;
	$this->titleFactory = $titleFactory;
	$this->wikiPageFactory = $wikiPageFactory;
	$this->uploadRevisionImporter = $uploadRevisionImporter;
	$this->contentHandlerFactory = $contentHandlerFactory;
	$this->slotRoleRegistry = $slotRoleRegistry;

	// The wrapper may only be registered once per process.
	if ( !in_array( 'uploadsource', stream_get_wrappers(), true ) ) {
		stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
	}
	$this->sourceAdapterId = UploadSourceAdapter::registerSource( $source );

	$this->openReader();

	// Default callbacks
	$this->setPageCallback( [ $this, 'beforeImportPage' ] );
	$this->setRevisionCallback( [ $this, "importRevision" ] );
	$this->setUploadCallback( [ $this, 'importUpload' ] );
	$this->setLogItemCallback( [ $this, 'importLogItem' ] );
	$this->setPageOutCallback( [ $this, 'finishImportPage' ] );

	$this->importTitleFactory = new NaiveImportTitleFactory(
		$this->contentLanguage,
		$this->namespaceInfo,
		$this->titleFactory
	);
	$this->externalUserNames = new ExternalUserNames( 'imported', false );
}
187
/**
 * Returns the XML reader this importer is currently reading from.
 *
 * @return XMLReader
 */
public function getReader() {
	$xmlReader = $this->reader;

	return $xmlReader;
}
194
/**
 * Logs an XML error. Despite the name, nothing is thrown here: the message is
 * sent to the importer's debug channel and to the general debug log.
 *
 * @param string $err
 */
public function throwXmlError( $err ) {
	$failureLine = "FAILURE: $err";
	$this->debug( $failureLine );

	$logLine = "WikiImporter XML error: $err";
	wfDebug( $logLine );
}
202
/**
 * Writes a line to the debug log, but only when debugging was enabled via setDebug().
 *
 * @param string $data
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}

	wfDebug( "IMPORT: $data" );
}
211
/**
 * Writes a warning to the debug log, regardless of the debug flag.
 *
 * @param string $data
 */
public function warn( $data ) {
	$line = "IMPORT: $data";
	wfDebug( $line );
}
218
/**
 * Reports a notice through the notice callback, or to the debug log when no
 * callable notice callback has been set.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters; handed to the callback as one array
 */
public function notice( $msg, ...$params ) {
	if ( !is_callable( $this->mNoticeCallback ) ) {
		// No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		wfDebug( wfMessage( $msg, $params )->text() );
		return;
	}

	call_user_func( $this->mNoticeCallback, $msg, $params );
}
234
/**
 * Turns debug logging (see debug()) on or off.
 *
 * @param bool $debug Truthy to enable
 */
public function setDebug( $debug ) {
	$this->mDebug = $debug;
}
242
/**
 * Sets the "no updates" flag that is forwarded to every WikiRevision this
 * importer creates (via WikiRevision::setNoUpdates()).
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
250
/**
 * Sets the page offset. doImport() skips top-level nodes until it has counted
 * this many 'page' tags.
 *
 * @param int $nthPage
 */
public function setPageOffset( $nthPage ) {
	$this->pageOffset = $nthPage;
}
260
/**
 * Sets the callback that notice() reports to.
 *
 * Passing null leaves the current callback in place and only returns it.
 *
 * @param callable|null $callback
 * @return callable|null The previously set callback
 */
public function setNoticeCallback( $callback ) {
	$previous = $this->mNoticeCallback;

	if ( $callback !== null ) {
		$this->mNoticeCallback = $callback;
	}

	return $previous;
}
270
/**
 * Replaces the callback invoked by pageCallback().
 *
 * @param callable|null $callback
 * @return callable|null The callback that was set before
 */
public function setPageCallback( $callback ) {
	[ $replaced, $this->mPageCallback ] = [ $this->mPageCallback, $callback ];

	return $replaced;
}
281
/**
 * Replaces the callback invoked by pageOutCallback() once a page has been
 * fully read.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was set before
 */
public function setPageOutCallback( $callback ) {
	[ $replaced, $this->mPageOutCallback ] = [ $this->mPageOutCallback, $callback ];

	return $replaced;
}
296
/**
 * Replaces the callback invoked by revisionCallback() for each revision.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was set before
 */
public function setRevisionCallback( $callback ) {
	[ $replaced, $this->mRevisionCallback ] = [ $this->mRevisionCallback, $callback ];

	return $replaced;
}
307
/**
 * Replaces the callback that processUpload() invokes for each upload.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was set before
 */
public function setUploadCallback( $callback ) {
	[ $replaced, $this->mUploadCallback ] = [ $this->mUploadCallback, $callback ];

	return $replaced;
}
318
/**
 * Replaces the callback invoked by logItemCallback() for each log item.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was set before
 */
public function setLogItemCallback( $callback ) {
	[ $replaced, $this->mLogItemCallback ] = [ $this->mLogItemCallback, $callback ];

	return $replaced;
}
329
/**
 * Replaces the callback invoked by siteInfoCallback() with the parsed site info.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was set before
 */
public function setSiteInfoCallback( $callback ) {
	[ $replaced, $this->mSiteInfoCallback ] = [ $this->mSiteInfoCallback, $callback ];

	return $replaced;
}
340
/**
 * Sets the factory that processTitle() uses to turn foreign titles into
 * local titles.
 *
 * @param ImportTitleFactory $factory
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
349
/**
 * Selects the namespace that imported pages are placed in.
 *
 * @param int|string|null $namespace Null to keep the namespaces from the dump;
 *   otherwise the index of an existing, non-negative namespace
 * @return bool True if the import title factory was replaced, false if
 *   $namespace is negative or does not exist
 */
public function setTargetNamespace( $namespace ) {
	if ( $namespace === null ) {
		// Don't override namespaces
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$this->namespaceInfo,
				$this->titleFactory
			)
		);
		return true;
	} elseif (
		$namespace >= 0 &&
		$this->namespaceInfo->exists( intval( $namespace ) )
	) {
		$namespace = intval( $namespace );
		// NOTE(review): the factory class name was missing from the listing and is
		// restored from the upstream sources; confirm.
		$this->setImportTitleFactory(
			new NamespaceImportTitleFactory(
				$this->namespaceInfo,
				$this->titleFactory,
				$namespace
			)
		);
		return true;
	} else {
		return false;
	}
}
383
/**
 * Selects a root page under which imported pages are placed.
 *
 * @param string|null $rootpage Null for no root page. An empty string leaves
 *   the current title factory untouched.
 * @return Status Good on success; fatal with 'import-rootpage-invalid' when the
 *   title cannot be parsed or is external, or with 'import-rootpage-nosubpage'
 *   when the root page's namespace has no subpages
 */
public function setTargetRootPage( $rootpage ) {
	$status = Status::newGood();
	$nsInfo = $this->namespaceInfo;
	if ( $rootpage === null ) {
		// No rootpage
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$nsInfo,
				$this->titleFactory
			)
		);
	} elseif ( $rootpage !== '' ) {
		$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
		$title = Title::newFromText( $rootpage );

		if ( !$title || $title->isExternal() ) {
			$status->fatal( 'import-rootpage-invalid' );
		} elseif ( !$nsInfo->hasSubpages( $title->getNamespace() ) ) {
			$displayNSText = $title->getNamespace() === NS_MAIN
				? wfMessage( 'blanknamespace' )->text()
				: $this->contentLanguage->getNsText( $title->getNamespace() );
			$status->fatal( 'import-rootpage-nosubpage', $displayNSText );
		} else {
			// set namespace to 'all', so the namespace check in processTitle() can pass
			$this->setTargetNamespace( null );
			// NOTE(review): the factory class name was missing from the listing and is
			// restored from the upstream sources; confirm.
			$this->setImportTitleFactory(
				new SubpageImportTitleFactory(
					$nsInfo,
					$this->titleFactory,
					$title
				)
			);
		}
	}
	return $status;
}
426
/**
 * Sets the directory that handleUpload() prefixes to an upload's 'rel' path
 * when looking for the file locally.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
433
/**
 * Controls whether handleUpload() passes parsed uploads on to processUpload().
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
440
/**
 * Replaces the ExternalUserNames helper used to prefix imported user names.
 *
 * @param string $usernamePrefix First constructor argument of ExternalUserNames
 * @param bool $assignKnownUsers Second constructor argument of ExternalUserNames
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$userNames = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
	$this->externalUserNames = $userNames;
}
449
/**
 * Makes finishImportPage() skip its article count adjustment.
 */
public function disableStatisticsUpdate() {
	$this->disableStatisticsUpdate = true;
}
457
/**
 * Default per-page callback: remembers whether the page is countable before
 * any of its revisions are imported.
 *
 * The value is stored under the same key that finishImportPage() looks up
 * ('title_' followed by CacheKeyHelper::getKeyForPage()). A key built from the
 * prefixed text would never match that lookup, so the article count would
 * never be adjusted.
 *
 * @param array $titleAndForeignTitle Element 0 is the local title of the page
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$title = $titleAndForeignTitle[0];
	$page = $this->wikiPageFactory->newFromTitle( $title );
	$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $title );
	$this->countableCache[$countKey] = $page->isCountable();
	return true;
}
470
/**
 * Default revision callback: imports one revision.
 *
 * @param WikiRevision $revision
 * @return bool Result of WikiRevision::importOldRevision(); false when the
 *   content handler cannot be used on the target title or the content failed
 *   to unserialize (a notice is emitted in both cases)
 */
public function importRevision( $revision ) {
	// Both failure notices share the same parameter list.
	$reportFailure = function ( $messageKey ) use ( $revision ) {
		$this->notice( $messageKey,
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);
	};

	$usable = $revision->getContentHandler()->canBeUsedOn( $revision->getTitle() );
	if ( !$usable ) {
		$reportFailure( 'import-error-bad-location' );
		return false;
	}

	try {
		return $revision->importOldRevision();
	} catch ( MWContentSerializationException $ex ) {
		$reportFailure( 'import-error-unserialize' );
		return false;
	}
}
501
/**
 * Default log item callback: delegates to WikiRevision::importLogItem().
 *
 * @param WikiRevision $revision
 * @return mixed Whatever WikiRevision::importLogItem() returns
 */
public function importLogItem( $revision ) {
	$result = $revision->importLogItem();

	return $result;
}
510
/**
 * Default upload callback: hands the revision to the upload revision importer.
 *
 * @param WikiRevision $revision
 * @return bool Whether the returned status is good
 */
public function importUpload( $revision ) {
	return $this->uploadRevisionImporter->import( $revision )->isGood();
}
520
/**
 * Default page-out callback: adjusts the article count if the page's countable
 * state changed, then runs the AfterImportPage hook.
 *
 * @param PageIdentity $pageIdentity Local page that was imported
 * @param ForeignTitle $foreignTitle Passed through to the hook
 * @param int $revCount Passed through to the hook
 * @param int $sRevCount Passed through to the hook
 * @param array $pageInfo Passed through to the hook
 * @return mixed Result of the AfterImportPage hook
 */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$wikiPage = $this->wikiPageFactory->newFromTitle( $pageIdentity );
		$wikiPage->loadPageData( IDBAccessObject::READ_LATEST );

		if ( $wikiPage->getRevisionRecord() !== null ) {
			$preparedUpdate = $wikiPage->newPageUpdater( $this->performer )->prepareUpdate();
			$cacheKey = 'title_' . CacheKeyHelper::getKeyForPage( $pageIdentity );
			$countableNow = $preparedUpdate->isCountable();

			$wasRecorded = array_key_exists( $cacheKey, $this->countableCache );
			if ( $wasRecorded && $countableNow != $this->countableCache[$cacheKey] ) {
				// +1 when the page became countable, -1 when it stopped being so
				$delta = (int)$countableNow - (int)$this->countableCache[$cacheKey];
				DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
					'articles' => $delta
				] ) );
			}
		} else {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
				' because WikiPage::getRevisionRecord() returned null' );
		}
	}

	return $this->hookRunner->onAfterImportPage(
		Title::newFromPageIdentity( $pageIdentity ),
		$foreignTitle,
		$revCount,
		$sRevCount,
		$pageInfo
	);
}
565
/**
 * Passes the parsed site info to the site info callback, if one is set.
 *
 * @param array $siteInfo
 * @return mixed The callback's return value, or false when no callback is set
 */
private function siteInfoCallback( $siteInfo ) {
	if ( !$this->mSiteInfoCallback ) {
		return false;
	}

	return call_user_func( $this->mSiteInfoCallback, $siteInfo, $this );
}
581
/**
 * Notifies the page callback, if one is set, that a page is about to be imported.
 *
 * @param array $title As passed by handlePage(): the array returned by processTitle()
 */
public function pageCallback( $title ) {
	if ( !$this->mPageCallback ) {
		return;
	}

	call_user_func( $this->mPageCallback, $title );
}
591
/**
 * Notifies the page-out callback, if one is set, that a page has been fully
 * read. All arguments are forwarded unchanged.
 *
 * @param PageIdentity $pageIdentity
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sucCount
 * @param array $pageInfo
 */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sucCount, $pageInfo ) {
	if ( !$this->mPageOutCallback ) {
		return;
	}

	$forwarded = func_get_args();
	call_user_func_array( $this->mPageOutCallback, $forwarded );
}
606
/**
 * Passes a revision to the revision callback, if one is set.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's return value, or false when no callback is set
 */
private function revisionCallback( $revision ) {
	if ( !$this->mRevisionCallback ) {
		return false;
	}

	return call_user_func( $this->mRevisionCallback, $revision, $this );
}
622
/**
 * Passes a log item to the log item callback, if one is set.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's return value, or false when no callback is set
 */
private function logItemCallback( $revision ) {
	if ( !$this->mLogItemCallback ) {
		return false;
	}

	return call_user_func( $this->mLogItemCallback, $revision, $this );
}
638
/**
 * Returns the named attribute of the node the reader is positioned on.
 *
 * @param string $attr Attribute name
 * @return string Attribute value, or '' when getAttribute() returns null
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );

	return $value ?? '';
}
648
/**
 * Reads forward and collects the text of the current element.
 *
 * Text, CDATA and significant-whitespace nodes are concatenated until the
 * first end-element node is reached. If the reader runs out of nodes before
 * that, it is closed and an empty string is returned.
 *
 * @return string
 */
public function nodeContents() {
	if ( $this->reader->isEmptyElement ) {
		return "";
	}

	$textNodeTypes = [
		XMLReader::TEXT,
		XMLReader::CDATA,
		XMLReader::SIGNIFICANT_WHITESPACE,
	];

	$collected = "";
	while ( $this->reader->read() ) {
		$nodeType = $this->reader->nodeType;

		if ( $nodeType === XMLReader::END_ELEMENT ) {
			return $collected;
		}
		if ( in_array( $nodeType, $textNodeTypes, true ) ) {
			$collected .= $this->reader->value;
		}
	}

	$this->reader->close();
	return '';
}
676
/**
 * Runs the import: checks the XML syntax, verifies the root tag, then walks
 * the top-level tags and dispatches siteinfo, page and logitem elements to
 * their handlers. The reader is closed when the method ends.
 *
 * @return bool Always true when no exception is thrown
 * @throws NormalizedException If the root tag is not 'mediawiki' and libxml
 *   reports an error
 * @throws UnexpectedValueException If the root tag is not 'mediawiki' and
 *   libxml reports no error
 */
public function doImport() {
	$this->syntaxCheckXML();

	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (T48932).
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$previousLoaderState = @libxml_disable_entity_loader( true );
	try {
		$this->reader->read();

		if ( $this->reader->localName != 'mediawiki' ) {
			// phpcs:ignore Generic.PHP.NoSilencedErrors
			@libxml_disable_entity_loader( $previousLoaderState );
			$xmlError = libxml_get_last_error();
			if ( !$xmlError ) {
				throw new UnexpectedValueException(
					"Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
				);
			}
			throw new NormalizedException( "XML error at line {line}: {message}", [
				'line' => $xmlError->line,
				'message' => $xmlError->message,
			] );
		}
		$this->debug( "<mediawiki> tag is correct." );

		$this->debug( "Starting primary dump processing loop." );

		$pagesSeen = 0;
		$hasNode = $this->reader->read();
		while ( $hasNode ) {
			$tag = $this->reader->localName;

			// Fast-forward until the requested number of pages has been seen.
			if ( $this->pageOffset ) {
				if ( $tag === 'page' ) {
					$pagesSeen++;
				}
				if ( $pagesSeen < $this->pageOffset ) {
					$hasNode = $this->reader->next();
					continue;
				}
			}
			$type = $this->reader->nodeType;

			$skipSubtree = false;
			// A hook returning false means it took care of the tag itself.
			if ( $this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
				if ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
					break;
				}

				switch ( $tag ) {
					case 'siteinfo':
						$this->handleSiteInfo();
						break;
					case 'page':
						$this->handlePage();
						break;
					case 'logitem':
						$this->handleLogItem();
						break;
					case '#text':
						break;
					default:
						$this->warn( "Unhandled top-level XML tag $tag" );
						$skipSubtree = true;
				}
			}

			if ( $skipSubtree ) {
				$hasNode = $this->reader->next();
				$this->debug( "Skip" );
			} else {
				$hasNode = $this->reader->read();
			}
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $previousLoaderState );
		$this->reader->close();
	}

	return true;
}
760
/**
 * Parses a 'siteinfo' element: records the foreign namespaces, collects the
 * simple fields, and passes the result to the site info callback.
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );

	// Fields that can just be stuffed in the siteInfo object
	$simpleTags = [ 'sitename', 'base', 'generator', 'case' ];
	$collected = [];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'siteinfo';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( $tag == 'namespace' ) {
			// The attribute has to be read before the contents, because reading
			// the contents moves the reader forward.
			$nsKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$nsKey] = $this->nodeContents();
			continue;
		}

		if ( in_array( $tag, $simpleTags ) ) {
			$collected[$tag] = $this->nodeContents();
		}
	}

	$collected['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $collected );
}
787
/**
 * Parses a 'logitem' element into an array and passes it to processLogItem().
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );

	// Fields that can just be stuffed in the log info array
	$simpleTags = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];
	$logInfo = [];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'logitem';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		// A hook returning false means it took care of the tag itself.
		if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
			continue;
		}

		if ( in_array( $tag, $simpleTags ) ) {
			$logInfo[$tag] = $this->nodeContents();
			continue;
		}

		if ( $tag == 'contributor' ) {
			$logInfo['contributor'] = $this->handleContributor();
			continue;
		}

		if ( $tag != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $logInfo );
}
817
/**
 * Builds a WikiRevision from parsed log item data and passes it to the log
 * item callback.
 *
 * @param array $logInfo Data collected by handleLogItem(); 'type' and 'action'
 *   are read unconditionally
 * @return mixed Result of the log item callback, or false when none is set
 */
private function processLogItem( $logInfo ) {
	$logEntry = new WikiRevision();

	if ( isset( $logInfo['id'] ) ) {
		$logEntry->setID( $logInfo['id'] );
	}

	$logEntry->setType( $logInfo['type'] );
	$logEntry->setAction( $logInfo['action'] );

	if ( isset( $logInfo['timestamp'] ) ) {
		$logEntry->setTimestamp( $logInfo['timestamp'] );
	}
	if ( isset( $logInfo['params'] ) ) {
		$logEntry->setParams( $logInfo['params'] );
	}
	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$logTitle = Title::newFromText( $logInfo['logtitle'] );
		$logEntry->setTitle( $logTitle );
	}

	$logEntry->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$logEntry->setComment( $logInfo['comment'] );
	}

	// Attribute the entry: a user name wins over an IP, and a prefixed
	// placeholder name is used when neither is present.
	if ( isset( $logInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix(
			$logInfo['contributor']['username']
		);
		$logEntry->setUsername( $prefixedName );
	} elseif ( isset( $logInfo['contributor']['ip'] ) ) {
		$logEntry->setUserIP( $logInfo['contributor']['ip'] );
	} else {
		$logEntry->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	return $this->logItemCallback( $logEntry );
}
860
/**
 * Parses a 'page' element.
 *
 * Simple fields are collected into the page info array. The local title is
 * resolved when the first revision or upload is reached; if that fails, the
 * rest of the page is skipped. The page-out callback runs at the end only when
 * a title was resolved.
 */
private function handlePage() {
	// Handle page data.
	$this->debug( "Enter page handler." );
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Fields that can just be stuffed in the pageInfo object
	$simpleTags = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	$skipSubtree = false;
	$titleRejected = false;

	while ( $skipSubtree ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'page';
		if ( $atClosingTag ) {
			break;
		}

		$skipSubtree = false;
		$tag = $this->reader->localName;

		if ( $titleRejected ) {
			// The title is invalid, bail out of this page
			$skipSubtree = true;
			continue;
		}

		// A hook returning false means it took care of the tag itself.
		if ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
			continue;
		}

		if ( in_array( $tag, $simpleTags ) ) {
			// An XML snippet:
			// <page>
			//     <id>123</id>
			//     <title>Page</title>
			//     <redirect title="NewTitle"/>
			//     ...
			// The redirect target lives in an attribute rather than in the
			// element contents, so it is read differently.
			$pageInfo[$tag] = $tag == 'redirect'
				? $this->nodeAttribute( 'title' )
				: $this->nodeContents();
			continue;
		}

		if ( $tag == 'revision' || $tag == 'upload' ) {
			if ( !isset( $title ) ) {
				$title = $this->processTitle( $pageInfo['title'],
					$pageInfo['ns'] ?? null );

				// $title is either an array of two titles or false.
				if ( is_array( $title ) ) {
					$this->pageCallback( $title );
					[ $pageInfo['_title'], $foreignTitle ] = $title;
				} else {
					$titleRejected = true;
					$skipSubtree = true;
				}
			}

			if ( $title ) {
				if ( $tag == 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
			continue;
		}

		if ( $tag != '#text' ) {
			$this->warn( "Unhandled page XML tag $tag" );
			$skipSubtree = true;
		}
	}

	// @note $pageInfo['_title'] is only set if a valid title was processed
	// above with no error. In that case pageCallback was called above and
	// $foreignTitle was set together with it, so the page-out callback is run
	// here. Without '_title' there is no $foreignTitle either.
	if ( array_key_exists( '_title', $pageInfo ) ) {
		$title = $pageInfo['_title'];
		$this->pageOutCallback(
			$title,
			// @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
			$foreignTitle,
			$pageInfo['revisionCount'],
			$pageInfo['successfulRevisionCount'],
			$pageInfo
		);
	}
}
946
/**
 * Parses a 'revision' element and passes the collected data to
 * processRevision(), updating the page's revision counters.
 *
 * @param array &$pageInfo Page info; 'revisionCount' is always incremented and
 *   'successfulRevisionCount' is incremented when processRevision() succeeds
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );
	$revisionInfo = [];

	$normalFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
		'model', 'format', 'text', 'sha1' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'revision' ) {
			break;
		}

		// Skipping applies to a single unhandled element only. Reset the flag
		// on every iteration, as handlePage() does, so that one unknown tag
		// does not switch the rest of the loop over to next().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleRevisionXMLTag(
			$this, $pageInfo, $revisionInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$revisionInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'content' ) {
			// We can have multiple content tags, so make this an array.
			$revisionInfo[$tag][] = $this->handleContent();
		} elseif ( $tag == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			$skip = true;
		}
	}

	$pageInfo['revisionCount']++;
	if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
989
/**
 * Parses a 'content' element (one slot of a revision).
 *
 * @return array Collected fields, keyed by tag name ('role', 'origin',
 *   'model', 'format', 'text'), plus anything a hook handler added
 */
private function handleContent() {
	$this->debug( "Enter content handler" );
	$contentInfo = [];

	$normalFields = [ 'role', 'origin', 'model', 'format', 'text' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'content' ) {
			break;
		}

		// Skipping applies to a single unhandled element only. Reset the flag
		// on every iteration, as handlePage() does, so that one unknown tag
		// does not switch the rest of the loop over to next().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleContentXMLTag(
			$this, $contentInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$contentInfo[$tag] = $this->nodeContents();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled content XML tag $tag" );
			$skip = true;
		}
	}

	return $contentInfo;
}
1020
/**
 * Turns parsed content data into a Content object.
 *
 * @param PageIdentity $page Page used to determine the default content model
 * @param int $revisionId Used in the size error message only; falsy if unknown
 * @param array $contentInfo Must contain 'text'; may contain 'model' and 'role'
 * @return Content Result of the content handler's unserializeContent()
 * @throws InvalidArgumentException If 'text' is missing
 * @throws RuntimeException If the text of a size-checked model exceeds
 *   the MaxArticleSize setting
 */
private function makeContent( PageIdentity $page, $revisionId, $contentInfo ) {
	$maxArticleSize = $this->config->get( MainConfigNames::MaxArticleSize );

	if ( !isset( $contentInfo['text'] ) ) {
		throw new InvalidArgumentException( 'Missing text field in import.' );
	}

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Testing for revisions with only listed
	// content models, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	$sizeCheckedModels = [ 'wikitext', 'css', 'json', 'javascript', 'text', '' ];
	$isSizeChecked = !isset( $contentInfo['model'] )
		|| in_array( $contentInfo['model'], $sizeCheckedModels );

	if ( $isSizeChecked && strlen( $contentInfo['text'] ) > $maxArticleSize * 1024 ) {
		$subject = $revisionId ? "the revision with ID $revisionId" : 'a revision';
		throw new RuntimeException( 'The text of ' . $subject .
			" exceeds the maximum allowable size ({$maxArticleSize} KiB)" );
	}

	$slotRole = $contentInfo['role'] ?? SlotRecord::MAIN;
	// The role handler is consulted only when the dump does not name a model.
	$modelId = $contentInfo['model']
		?? $this->slotRoleRegistry->getRoleHandler( $slotRole )->getDefaultModel( $page );

	$contentHandler = $this->contentHandlerFactory->getContentHandler( $modelId );
	$transformed = $contentHandler->importTransform( $contentInfo['text'] );

	return $contentHandler->unserializeContent( $transformed );
}
1067
/**
 * Builds a WikiRevision from parsed revision data and passes it to the
 * revision callback.
 *
 * @param array $pageInfo Page info; '_title' holds the local title
 * @param array $revisionInfo Data collected by handleRevision()
 * @return mixed Result of the revision callback, or false when none is set
 * @throws RuntimeException If an additional slot has no role
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	$imported = new WikiRevision();

	$revId = $revisionInfo['id'] ?? 0;
	if ( $revId ) {
		$imported->setID( $revisionInfo['id'] );
	}

	$title = $pageInfo['_title'];
	$imported->setTitle( $title );

	// The main slot is built from the revision's own fields.
	$mainContent = $this->makeContent( $title, $revId, $revisionInfo );
	$imported->setContent( SlotRecord::MAIN, $mainContent );

	// Further slots come from the 'content' elements, if there are any.
	$extraSlots = $revisionInfo['content'] ?? [];
	foreach ( $extraSlots as $slotInfo ) {
		if ( !isset( $slotInfo['role'] ) ) {
			throw new RuntimeException( "Missing role for imported slot." );
		}

		$slotContent = $this->makeContent( $title, $revId, $slotInfo );
		$imported->setContent( $slotInfo['role'], $slotContent );
	}

	$imported->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$imported->setComment( $revisionInfo['comment'] );
	}

	if ( isset( $revisionInfo['minor'] ) ) {
		$imported->setMinor( true );
	}

	// Attribute the revision: a user name wins over an IP, and a prefixed
	// placeholder name is used when neither is present.
	if ( isset( $revisionInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix(
			$revisionInfo['contributor']['username']
		);
		$imported->setUsername( $prefixedName );
	} elseif ( isset( $revisionInfo['contributor']['ip'] ) ) {
		$imported->setUserIP( $revisionInfo['contributor']['ip'] );
	} else {
		$imported->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	if ( isset( $revisionInfo['sha1'] ) ) {
		$imported->setSha1Base36( $revisionInfo['sha1'] );
	}
	$imported->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $imported );
}
1120
/**
 * Parses an 'upload' element and, when importing uploads is enabled, passes
 * the collected data to processUpload().
 *
 * @param array &$pageInfo Page info; handed to the upload hook and to processUpload()
 * @return mixed Result of processUpload(), or null when uploads are not imported
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );
	$uploadInfo = [];

	$normalFields = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'upload' ) {
			break;
		}

		// Skipping applies to a single unhandled element only. Reset the flag
		// on every iteration, as handlePage() does, so that one unknown tag
		// does not switch the rest of the loop over to next().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$uploadInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contents' ) {
			// Read the attribute while the reader is still on the start tag;
			// nodeContents() moves it forward to the end tag.
			$encoding = $this->reader->getAttribute( 'encoding' );
			$contents = $this->nodeContents();
			if ( $encoding === 'base64' ) {
				$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
				$uploadInfo['isTempSrc'] = true;
			}
		} elseif ( $tag == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$skip = true;
		}
	}

	// A file found under the image base path takes precedence over embedded contents.
	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		$path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
		if ( file_exists( $path ) ) {
			$uploadInfo['fileSrc'] = $path;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	if ( $this->mImportUploads ) {
		return $this->processUpload( $pageInfo, $uploadInfo );
	}

	return null;
}
1173
/**
 * Writes the given data to a new temporary file.
 *
 * @param string $contents
 * @return string Path of the temporary file
 * @throws RuntimeException If the file cannot be created or written
 */
private function dumpTemp( $contents ) {
	$filename = tempnam( wfTempDir(), 'importupload' );
	if ( $filename === false ) {
		throw new RuntimeException( 'Could not create a temporary file for an imported upload.' );
	}

	// Without this check an upload whose data never reached the disk would be
	// passed on as if it were complete.
	if ( file_put_contents( $filename, $contents ) === false ) {
		throw new RuntimeException( "Could not write imported upload to temporary file $filename." );
	}

	return $filename;
}
1183
/**
 * Builds a WikiRevision from parsed upload data and passes it to the upload
 * callback.
 *
 * @param array $pageInfo Page info; 'id' and '_title' are read
 * @param array $uploadInfo Data collected by handleUpload(); 'timestamp',
 *   'filename', 'src', 'size' and 'comment' are read unconditionally
 * @return mixed Result of the upload callback, or false when none is set
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$revision = new WikiRevision();
	$revId = $pageInfo['id'];
	$title = $pageInfo['_title'];
	// T292348: text key may be absent, force addition if null
	$uploadInfo['text'] ??= '';
	$content = $this->makeContent( $title, $revId, $uploadInfo );

	$revision->setTitle( $title );
	$revision->setID( $revId );
	$revision->setTimestamp( $uploadInfo['timestamp'] );
	$revision->setContent( SlotRecord::MAIN, $content );
	$revision->setFilename( $uploadInfo['filename'] );
	if ( isset( $uploadInfo['archivename'] ) ) {
		$revision->setArchiveName( $uploadInfo['archivename'] );
	}
	$revision->setSrc( $uploadInfo['src'] );
	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$revision->setFileSrc( $uploadInfo['fileSrc'],
			!empty( $uploadInfo['isTempSrc'] )
		);
	}
	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$revision->setSha1Base36( $uploadInfo['sha1base36'] );
	}
	$revision->setSize( intval( $uploadInfo['size'] ) );
	$revision->setComment( $uploadInfo['comment'] );

	if ( isset( $uploadInfo['contributor']['username'] ) ) {
		$revision->setUsername(
			$this->externalUserNames->applyPrefix( $uploadInfo['contributor']['username'] )
		);
	} elseif ( isset( $uploadInfo['contributor']['ip'] ) ) {
		$revision->setUserIP( $uploadInfo['contributor']['ip'] );
	}
	$revision->setNoUpdates( $this->mNoUpdates );

	// setUploadCallback() accepts any value, including null. Guard the call the
	// same way revisionCallback() and logItemCallback() do instead of handing
	// an unset callback to call_user_func().
	if ( !$this->mUploadCallback ) {
		return false;
	}

	return call_user_func( $this->mUploadCallback, $revision );
}
1228
/**
 * Parses a 'contributor' element.
 *
 * @return array Any of the keys 'id', 'ip' and 'username' that were present;
 *   empty for an empty element
 */
private function handleContributor() {
	$this->debug( "Enter contributor handler." );

	if ( $this->reader->isEmptyElement ) {
		return [];
	}

	$knownTags = [ 'id', 'ip', 'username' ];
	$contributor = [];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'contributor';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;
		if ( !in_array( $tag, $knownTags ) ) {
			continue;
		}

		$contributor[$tag] = $this->nodeContents();
	}

	return $contributor;
}
1257
/**
 * Resolves a title from the dump to a local title and checks that it can be
 * imported.
 *
 * @param string $text Title text from the dump
 * @param string|null $ns Namespace ID from the dump, if given
 * @return array|false [ local title, foreign title ], or false (after emitting
 *   a notice) when the title is invalid, external, cannot exist, or cannot be
 *   edited by the performer
 */
private function processTitle( $text, $ns = null ) {
	// Without siteinfo namespaces, fall back to the content language.
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory( $this->contentLanguage )
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
	$title = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	if ( $title === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}

	if ( $title->isExternal() ) {
		$this->notice( 'import-error-interwiki', $title->getPrefixedText() );
		return false;
	}

	if ( !$title->canExist() ) {
		$this->notice( 'import-error-special', $title->getPrefixedText() );
		return false;
	}

	if ( !$this->performer->definitelyCan( 'edit', $title ) ) {
		# Do not import if the importing wiki user cannot edit this page
		$this->notice( 'import-error-edit', $title->getPrefixedText() );
		return false;
	}

	return [ $title, $foreignTitle ];
}
1297
/**
 * Opens an XMLReader on the registered upload source.
 *
 * The previous entity loader setting is restored on every exit path.
 *
 * @throws RuntimeException If the reader cannot be opened
 */
private function openReader() {
	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( false );

	try {
		$uri = 'uploadsource://' . $this->sourceAdapterId;

		if ( PHP_VERSION_ID >= 80000 ) {
			// A static call is now preferred, and avoids https://github.com/php/php-src/issues/11548
			$reader = XMLReader::open( $uri, null, LIBXML_PARSEHUGE );
			$status = $reader instanceof XMLReader;
			if ( $status ) {
				$this->reader = $reader;
			}
		} else {
			// A static call generated a deprecation warning prior to PHP 8.0
			$this->reader = new XMLReader;
			$status = $this->reader->open( $uri, null, LIBXML_PARSEHUGE );
		}

		if ( !$status ) {
			// libxml_get_last_error() returns false when libxml recorded no
			// error, so the message must not be dereferenced blindly.
			$error = libxml_get_last_error();
			$detail = $error ? $error->message : 'unknown error';
			throw new RuntimeException(
				'Encountered an internal error while initializing WikiImporter object: ' . $detail
			);
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
	}
}
1335
/**
 * Reads through the whole source once to detect XML errors, then rewinds the
 * source and reopens the reader. Does nothing for sources that are not seekable.
 *
 * @throws RuntimeException If libxml reports an error after reading the source
 */
private function syntaxCheckXML() {
	$seekable = UploadSourceAdapter::isSeekableSource( $this->sourceAdapterId );
	if ( !$seekable ) {
		return;
	}

	AtEase::suppressWarnings();
	$previousLoaderState = libxml_disable_entity_loader( false );
	try {
		// Drain the reader; only the error state afterwards is of interest.
		while ( $this->reader->read() ) {
			continue;
		}

		$xmlError = libxml_get_last_error();
		if ( $xmlError ) {
			$description = 'XML error at line ' . $xmlError->line . ': ' . $xmlError->message;
			wfDebug( __METHOD__ . ': Invalid xml found - ' . $description );
			throw new RuntimeException( $description );
		}
	} finally {
		libxml_disable_entity_loader( $previousLoaderState );
		AtEase::restoreWarnings();
		$this->reader->close();
	}

	// Reopen for the real import
	UploadSourceAdapter::seekSource( $this->sourceAdapterId, 0 );
	$this->openReader();
}
1363}
const NS_MAIN
Definition Defines.php:65
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTempDir()
Tries to get the system directory for temporary files.
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest. If source is NULL, it just returns the val...
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
array $params
The job parameters.
Reporting callback.
Exception representing a failure to serialize or unserialize a content object.
Helper class for mapping value objects representing basic entities to cache keys.
Defer callable updates to run later in the PHP process.
Class for handling updates to the site_stats table.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
Base class for language-specific code.
Definition Language.php:81
A class containing constants representing the names of configuration variables.
Service for creating WikiPage objects.
Value object representing a content slot associated with a page revision.
A registry service for SlotRoleHandlers, used to define which slot roles are available on which page.
Generic operation result class Has warning/error list, boolean status and arbitrary value.
Definition Status.php:54
A simple, immutable structure to hold the title of a page on a foreign MediaWiki installation.
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them ba...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Creates Title objects.
Represents a title within MediaWiki.
Definition Title.php:78
Class to parse and build external user names.
static seekSource(string $id, int $offset)
static isSeekableSource(string $id)
static registerSource(ImportSource $source)
XML file reader for the page data importer.
setImportUploads( $import)
doImport()
Primary entry point.
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
setNoUpdates( $noupdates)
Set 'no updates' mode.
setLogItemCallback( $callback)
Sets the action to perform as each log item reached.
importUpload( $revision)
Dummy for now...
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
setPageOffset( $nthPage)
Sets 'pageOffset' value.
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
importLogItem( $revision)
Default per-revision callback, performs the import.
setImageBasePath( $dir)
throwXmlError( $err)
setDebug( $debug)
Set debug mode...
notice( $msg,... $params)
importRevision( $revision)
Default per-revision callback, performs the import.
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
setNoticeCallback( $callback)
Set a callback that displays notice messages.
finishImportPage(PageIdentity $pageIdentity, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
disableStatisticsUpdate()
Statistics updates can take a lot of time.
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
__construct(ImportSource $source, Authority $performer, Config $config, HookContainer $hookContainer, Language $contentLanguage, NamespaceInfo $namespaceInfo, TitleFactory $titleFactory, WikiPageFactory $wikiPageFactory, UploadRevisionImporter $uploadRevisionImporter, IContentHandlerFactory $contentHandlerFactory, SlotRoleRegistry $slotRoleRegistry)
Creates an ImportXMLReader drawing from the source provided.
Represents a revision, log entry or upload during the import process.
Value object representing a message parameter that consists of a list of values.
Source interface for XML import.
Interface for configuration instances.
Definition Config.php:32
Base interface for representing page content.
Definition Content.php:39
Interface for objects (potentially) representing an editable wiki page.
This interface represents the authority associated with the current execution context,...
Definition Authority.php:37
Represents an object that can convert page titles on a foreign wiki (ForeignTitle objects) into page ...
Interface for database access objects.
$source