MediaWiki master
WikiImporter.php
Go to the documentation of this file.
1<?php
52use Wikimedia\AtEase\AtEase;
53use Wikimedia\NormalizedException\NormalizedException;
55
/** @var XMLReader|null Reader over the import source; assigned by openReader() */
64 private $reader;
65
/** @var mixed Identifier returned by UploadSourceAdapter::registerSource(); forms the uploadsource:// URL */
67 private $sourceAdapterId;
68
/** @var array|null Foreign namespace names keyed by the "key" attribute of <namespace>; null until <siteinfo> is read */
70 private $foreignNamespaces = null;
71
/** @var callable|null Invoked by logItemCallback() with ( $revision, $this ) */
73 private $mLogItemCallback;
74
/** @var callable|null Invoked by processUpload() with ( $revision ) */
76 private $mUploadCallback;
77
/** @var callable|null Invoked by revisionCallback() with ( $revision, $this ) */
79 private $mRevisionCallback;
80
/** @var callable|null Invoked by pageCallback() with the [ title, foreign title ] pair */
82 private $mPageCallback;
83
/** @var callable|null Invoked by siteInfoCallback() with ( $siteInfo, $this ) */
85 private $mSiteInfoCallback;
86
/** @var callable|null Invoked by pageOutCallback() once a <page> element has been processed */
88 private $mPageOutCallback;
89
/** @var callable|null Receives ( $msg, $params ) from notice(); when not callable, notice() writes to the debug log */
91 private $mNoticeCallback;
92
/** @var bool|null When truthy, debug() forwards its messages to wfDebug() */
94 private $mDebug;
95
/** @var bool|null When truthy, handleUpload() passes parsed uploads on to processUpload() */
97 private $mImportUploads;
98
/** @var string|null Directory joined with an upload's <rel> value to locate a local source file */
100 private $mImageBasePath;
101
/** @var bool Forwarded to WikiRevision::setNoUpdates() for every revision, upload and log item */
103 private $mNoUpdates = false;
104
/** @var int doImport() skips top-level nodes until this many <page> elements have been seen; 0 disables skipping */
106 private $pageOffset = 0;
107
// Turns a ForeignTitle into a local title in processTitle().
108 private ImportTitleFactory $importTitleFactory;
// Applies the configured prefix to imported contributor names.
109 private ExternalUserNames $externalUserNames;
110
/** @var array Countable state recorded by beforeImportPage(), keyed by 'title_' plus the prefixed title text */
112 private $countableCache = [];
113
/** @var bool When true, finishImportPage() skips the article-count adjustment */
115 private $disableStatisticsUpdate = false;
116
// Authority used for the edit permission check in processTitle() and for the page updater in finishImportPage().
123 private Authority $performer;
124
// Injected services; all are assigned once in the constructor.
125 private Config $config;
126 private HookRunner $hookRunner;
127 private Language $contentLanguage;
128 private NamespaceInfo $namespaceInfo;
129 private TitleFactory $titleFactory;
130 private WikiPageFactory $wikiPageFactory;
131 private UploadRevisionImporter $uploadRevisionImporter;
132 private IContentHandlerFactory $contentHandlerFactory;
133 private SlotRoleRegistry $slotRoleRegistry;
134
/**
 * Creates an importer reading from the given source and installs the default callbacks.
 *
 * The source is registered with UploadSourceAdapter and the XML reader is
 * opened immediately, so construction throws if the reader cannot be opened.
 *
 * @param mixed $source Import source handed to UploadSourceAdapter::registerSource().
 *   NOTE(review): presumably an ImportSource; left untyped because the type is not
 *   visible in this part of the file - confirm and add the type hint.
 * @param Authority $performer Used for permission checks and page updates
 * @param Config $config
 * @param HookContainer $hookContainer Wrapped in a HookRunner
 * @param Language $contentLanguage
 * @param NamespaceInfo $namespaceInfo
 * @param TitleFactory $titleFactory
 * @param WikiPageFactory $wikiPageFactory
 * @param UploadRevisionImporter $uploadRevisionImporter
 * @param IContentHandlerFactory $contentHandlerFactory
 * @param SlotRoleRegistry $slotRoleRegistry
 */
public function __construct(
	$source,
	Authority $performer,
	Config $config,
	HookContainer $hookContainer,
	Language $contentLanguage,
	NamespaceInfo $namespaceInfo,
	TitleFactory $titleFactory,
	WikiPageFactory $wikiPageFactory,
	UploadRevisionImporter $uploadRevisionImporter,
	IContentHandlerFactory $contentHandlerFactory,
	SlotRoleRegistry $slotRoleRegistry
) {
	$this->performer = $performer;
	$this->config = $config;
	$this->hookRunner = new HookRunner( $hookContainer );
	$this->contentLanguage = $contentLanguage;
	$this->namespaceInfo = $namespaceInfo;
	$this->titleFactory = $titleFactory;
	$this->wikiPageFactory = $wikiPageFactory;
	$this->uploadRevisionImporter = $uploadRevisionImporter;
	$this->contentHandlerFactory = $contentHandlerFactory;
	$this->slotRoleRegistry = $slotRoleRegistry;

	// The reader opens the source through the uploadsource:// stream wrapper,
	// which must only be registered once per process.
	if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
		stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
	}
	$this->sourceAdapterId = UploadSourceAdapter::registerSource( $source );

	$this->openReader();

	// Default callbacks
	$this->setPageCallback( [ $this, 'beforeImportPage' ] );
	$this->setRevisionCallback( [ $this, "importRevision" ] );
	$this->setUploadCallback( [ $this, 'importUpload' ] );
	$this->setLogItemCallback( [ $this, 'importLogItem' ] );
	$this->setPageOutCallback( [ $this, 'finishImportPage' ] );

	$this->importTitleFactory = new NaiveImportTitleFactory(
		$this->contentLanguage,
		$this->namespaceInfo,
		$this->titleFactory
	);
	$this->externalUserNames = new ExternalUserNames( 'imported', false );
}
183
/**
 * Gives access to the underlying XML reader.
 *
 * @return XMLReader|null The reader assigned by openReader()
 */
public function getReader() {
	$xmlReader = $this->reader;
	return $xmlReader;
}
190
/**
 * Records an XML error in the debug log.
 *
 * Despite the name, this method only logs; it does not throw.
 *
 * @param string $err Error description
 */
public function throwXmlError( $err ) {
	// Goes through debug() (only emitted in debug mode) ...
	$this->debug( "FAILURE: $err" );
	// ... and unconditionally to the debug log.
	wfDebug( "WikiImporter XML error: $err" );
}
198
/**
 * Writes a message to the debug log, but only when debug mode is on.
 *
 * @param string $data Message text
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}
	wfDebug( "IMPORT: $data" );
}
207
/**
 * Writes a warning to the debug log regardless of the debug setting.
 *
 * @param string $data Message text
 */
public function warn( $data ) {
	$line = "IMPORT: $data";
	wfDebug( $line );
}
214
/**
 * Reports a notice, either through the registered notice callback or,
 * when none is callable, through the debug log.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters; passed to the callback as a single array
 */
public function notice( $msg, ...$params ) {
	if ( !is_callable( $this->mNoticeCallback ) ) {
		# No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		wfDebug( wfMessage( $msg, $params )->text() );
		return;
	}

	call_user_func( $this->mNoticeCallback, $msg, $params );
}
228
/**
 * Turns debug logging via debug() on or off.
 *
 * @param bool $debug
 */
public function setDebug( $debug ) {
	$this->mDebug = $debug;
}
236
/**
 * Sets the flag that is forwarded to WikiRevision::setNoUpdates() for
 * every imported revision, upload and log item.
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
244
/**
 * Makes doImport() skip top-level nodes until the given number of
 * <page> elements has been reached.
 *
 * @param int $nthPage Ordinal of the first page to import; 0 disables skipping
 */
public function setPageOffset( $nthPage ) {
	$this->pageOffset = $nthPage;
}
254
/**
 * Registers the callback used by notice().
 *
 * Goes through wfSetVar(), which leaves the current callback in place
 * when $callback is null.
 *
 * @param callable|null $callback Receives ( $msg, $params )
 * @return callable|null The previously registered callback
 */
public function setNoticeCallback( $callback ) {
	$previous = wfSetVar( $this->mNoticeCallback, $callback );
	return $previous;
}
264
/**
 * Replaces the callback run when a page's title has been resolved.
 *
 * @param callable|null $callback Receives the [ title, foreign title ] pair
 * @return callable|null The callback that was registered before
 */
public function setPageCallback( $callback ) {
	$old = $this->mPageCallback;
	$this->mPageCallback = $callback;

	return $old;
}
275
/**
 * Replaces the callback run after a <page> element has been fully processed.
 *
 * @param callable|null $callback Receives the page identity, the foreign title,
 *   the revision count, the successful revision count and the page info array
 * @return callable|null The callback that was registered before
 */
public function setPageOutCallback( $callback ) {
	$old = $this->mPageOutCallback;
	$this->mPageOutCallback = $callback;

	return $old;
}
290
/**
 * Replaces the callback run for every parsed revision.
 *
 * @param callable|null $callback Receives ( $revision, $importer )
 * @return callable|null The callback that was registered before
 */
public function setRevisionCallback( $callback ) {
	$old = $this->mRevisionCallback;
	$this->mRevisionCallback = $callback;

	return $old;
}
301
/**
 * Replaces the callback run for every parsed upload.
 *
 * @param callable|null $callback Receives ( $revision )
 * @return callable|null The callback that was registered before
 */
public function setUploadCallback( $callback ) {
	$old = $this->mUploadCallback;
	$this->mUploadCallback = $callback;

	return $old;
}
312
/**
 * Replaces the callback run for every parsed log item.
 *
 * @param callable|null $callback Receives ( $revision, $importer )
 * @return callable|null The callback that was registered before
 */
public function setLogItemCallback( $callback ) {
	$old = $this->mLogItemCallback;
	$this->mLogItemCallback = $callback;

	return $old;
}
323
/**
 * Replaces the callback run once the <siteinfo> element has been parsed.
 *
 * @param callable|null $callback Receives ( $siteInfo, $importer )
 * @return callable|null The callback that was registered before
 */
public function setSiteInfoCallback( $callback ) {
	$old = $this->mSiteInfoCallback;
	$this->mSiteInfoCallback = $callback;

	return $old;
}
334
/**
 * Sets the factory that processTitle() uses to turn foreign titles into local ones.
 *
 * @param ImportTitleFactory $factory
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
343
/**
 * Chooses the namespace that imported pages are placed in.
 *
 * @param int|string|null $namespace Namespace index, or null to keep each
 *   page's own namespace
 * @return bool False when $namespace is negative or not an existing namespace;
 *   in that case the current title factory is left untouched
 */
public function setTargetNamespace( $namespace ) {
	if ( $namespace === null ) {
		// Don't override namespaces
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$this->namespaceInfo,
				$this->titleFactory
			)
		);
		return true;
	} elseif (
		$namespace >= 0 &&
		$this->namespaceInfo->exists( intval( $namespace ) )
	) {
		$namespace = intval( $namespace );
		// NOTE(review): the factory class name here was restored from a damaged
		// listing - confirm it is NamespaceImportTitleFactory.
		$this->setImportTitleFactory(
			new NamespaceImportTitleFactory(
				$this->namespaceInfo,
				$this->titleFactory,
				$namespace
			)
		);
		return true;
	} else {
		return false;
	}
}
377
/**
 * Chooses a root page under which all imported pages become subpages.
 *
 * @param string|null $rootpage Root page title; null restores the default
 *   title factory, and an empty string leaves the current factory unchanged
 * @return Status Good on success; fatal with 'import-rootpage-invalid' when the
 *   title cannot be parsed or is external, or with 'import-rootpage-nosubpage'
 *   when its namespace has no subpages
 */
public function setTargetRootPage( $rootpage ) {
	$status = Status::newGood();
	$nsInfo = $this->namespaceInfo;
	if ( $rootpage === null ) {
		// No rootpage
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$nsInfo,
				$this->titleFactory
			)
		);
	} elseif ( $rootpage !== '' ) {
		$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
		$title = Title::newFromText( $rootpage );

		if ( !$title || $title->isExternal() ) {
			$status->fatal( 'import-rootpage-invalid' );
		} elseif ( !$nsInfo->hasSubpages( $title->getNamespace() ) ) {
			// The main namespace has no name of its own, so use the 'blanknamespace' message
			$displayNSText = $title->getNamespace() === NS_MAIN
				? wfMessage( 'blanknamespace' )->text()
				: $this->contentLanguage->getNsText( $title->getNamespace() );
			$status->fatal( 'import-rootpage-nosubpage', $displayNSText );
		} else {
			// set namespace to 'all', so the namespace check in processTitle() can pass
			$this->setTargetNamespace( null );
			// NOTE(review): the factory class name here was restored from a damaged
			// listing - confirm it is SubpageImportTitleFactory.
			$this->setImportTitleFactory(
				new SubpageImportTitleFactory(
					$nsInfo,
					$this->titleFactory,
					$title
				)
			);
		}
	}
	return $status;
}
420
/**
 * Sets the directory in which handleUpload() looks for files named by an
 * upload's <rel> element.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
427
/**
 * Controls whether parsed <upload> elements are actually imported.
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
434
/**
 * Replaces the ExternalUserNames helper used for imported contributor names.
 *
 * @param string $usernamePrefix Prefix passed to ExternalUserNames
 * @param bool $assignKnownUsers Passed through to ExternalUserNames
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$userNames = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
	$this->externalUserNames = $userNames;
}
443
/**
 * Switches off the article-count adjustment done in finishImportPage().
 * There is no way to switch it back on for this importer instance.
 */
public function disableStatisticsUpdate() {
	$this->disableStatisticsUpdate = true;
}
451
/**
 * Default page callback: remembers whether the page is countable before
 * any of its revisions are imported.
 *
 * @param array $titleAndForeignTitle [ local title, foreign title ]
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	[ $localTitle ] = $titleAndForeignTitle;
	$wikiPage = $this->wikiPageFactory->newFromTitle( $localTitle );
	$cacheKey = 'title_' . $localTitle->getPrefixedText();
	$this->countableCache[$cacheKey] = $wikiPage->isCountable();

	return true;
}
464
/**
 * Default revision callback: imports one revision.
 *
 * @param WikiRevision $revision
 * @return bool False when the content model cannot be used on the target title
 *   or the content fails to unserialize (a notice is emitted in both cases);
 *   otherwise whatever importOldRevision() returns
 */
public function importRevision( $revision ) {
	$handler = $revision->getContentHandler();
	$target = $revision->getTitle();

	if ( !$handler->canBeUsedOn( $target ) ) {
		$this->notice(
			'import-error-bad-location',
			$target->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);
		return false;
	}

	try {
		return $revision->importOldRevision();
	} catch ( MWContentSerializationException $ex ) {
		$this->notice(
			'import-error-unserialize',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);
		return false;
	}
}
495
/**
 * Default log item callback: delegates to the revision object.
 *
 * @param WikiRevision $revision
 * @return mixed Result of WikiRevision::importLogItem()
 */
public function importLogItem( $revision ) {
	$result = $revision->importLogItem();
	return $result;
}
504
/**
 * Default upload callback: hands the upload to the UploadRevisionImporter.
 *
 * @param WikiRevision $revision
 * @return bool Whether the returned status is good
 */
public function importUpload( $revision ) {
	return $this->uploadRevisionImporter->import( $revision )->isGood();
}
514
/**
 * Default page-out callback: adjusts the article count if the page's
 * countable state changed during the import, then runs the AfterImportPage hook.
 *
 * @param PageIdentity $pageIdentity Imported page
 * @param ForeignTitle $foreignTitle Title of the page on the source wiki
 * @param int $revCount Number of revisions seen for the page
 * @param int $sRevCount Number of revisions imported successfully
 * @param array $pageInfo Data collected from the <page> element
 * @return mixed Result of the AfterImportPage hook
 */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	$title = Title::newFromPageIdentity( $pageIdentity );

	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$page = $this->wikiPageFactory->newFromTitle( $pageIdentity );

		$page->loadPageData( IDBAccessObject::READ_LATEST );
		$rev = $page->getRevisionRecord();
		if ( $rev === null ) {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
				' because WikiPage::getRevisionRecord() returned null' );
		} else {
			$update = $page->newPageUpdater( $this->performer )->prepareUpdate();
			// Must use the same key scheme as beforeImportPage(), which stores the
			// pre-import state under 'title_' plus the prefixed title text;
			// any other scheme never matches and the adjustment is silently skipped.
			$countKey = 'title_' . $title->getPrefixedText();
			$countable = $update->isCountable();
			if ( array_key_exists( $countKey, $this->countableCache ) &&
				$countable != $this->countableCache[$countKey] ) {
				// +1 when the page became countable, -1 when it stopped being countable
				DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
					'articles' => ( (int)$countable - (int)$this->countableCache[$countKey] )
				] ) );
			}
		}
	}

	return $this->hookRunner->onAfterImportPage( $title, $foreignTitle,
		$revCount, $sRevCount, $pageInfo );
}
559
/**
 * Passes parsed site info to the registered site info callback.
 *
 * @param array $siteInfo
 * @return mixed The callback's result, or false when no callback is registered
 */
private function siteInfoCallback( $siteInfo ) {
	if ( !isset( $this->mSiteInfoCallback ) ) {
		return false;
	}

	return call_user_func( $this->mSiteInfoCallback, $siteInfo, $this );
}
575
/**
 * Passes a resolved title pair to the registered page callback, if any.
 * The callback's return value is discarded.
 *
 * @param array $title [ local title, foreign title ]
 */
public function pageCallback( $title ) {
	if ( !isset( $this->mPageCallback ) ) {
		return;
	}
	call_user_func( $this->mPageCallback, $title );
}
585
/**
 * Passes the end-of-page data to the registered page-out callback, if any.
 * The callback's return value is discarded.
 *
 * @param PageIdentity $pageIdentity
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sucCount
 * @param array $pageInfo
 */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sucCount, $pageInfo ) {
	if ( !isset( $this->mPageOutCallback ) ) {
		return;
	}
	// Forward every argument this method was called with.
	$forwarded = func_get_args();
	call_user_func_array( $this->mPageOutCallback, $forwarded );
}
600
/**
 * Passes a revision to the registered revision callback.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's result, or false when no callback is registered
 */
private function revisionCallback( $revision ) {
	if ( !isset( $this->mRevisionCallback ) ) {
		return false;
	}

	return call_user_func( $this->mRevisionCallback, $revision, $this );
}
616
/**
 * Passes a log item to the registered log item callback.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's result, or false when no callback is registered
 */
private function logItemCallback( $revision ) {
	if ( !isset( $this->mLogItemCallback ) ) {
		return false;
	}

	return call_user_func( $this->mLogItemCallback, $revision, $this );
}
632
/**
 * Reads an attribute of the node the reader is currently positioned on.
 *
 * @param string $attr Attribute name
 * @return string The attribute value, or '' when the reader returns null
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );
	return $value ?? '';
}
642
/**
 * Collects the text of the current element and leaves the reader on the
 * first end tag it encounters.
 *
 * Text, CDATA and significant-whitespace nodes are concatenated; other
 * node types are ignored.
 *
 * @return string The collected text; '' for an empty element, and also ''
 *   (after closing the reader) when input ends before an end tag is found
 */
public function nodeContents() {
	if ( $this->reader->isEmptyElement ) {
		return "";
	}

	$textTypes = [ XMLReader::TEXT, XMLReader::CDATA, XMLReader::SIGNIFICANT_WHITESPACE ];
	$collected = "";
	while ( $this->reader->read() ) {
		$nodeType = $this->reader->nodeType;
		if ( $nodeType === XMLReader::END_ELEMENT ) {
			return $collected;
		}
		if ( in_array( $nodeType, $textTypes, true ) ) {
			$collected .= $this->reader->value;
		}
	}

	// Ran out of input without seeing an end tag.
	$this->reader->close();
	return '';
}
670
/**
 * Runs the import: checks the XML, verifies the <mediawiki> root element and
 * dispatches each top-level element to its handler.
 *
 * @return bool Always true; failures are reported by exception
 * @throws NormalizedException When the root element is wrong and libxml reported an error
 * @throws UnexpectedValueException When the root element is wrong without a libxml error
 */
public function doImport() {
	$this->syntaxCheckXML();

	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (T48932).
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( true );
	try {
		$this->reader->read();

		if ( $this->reader->localName != 'mediawiki' ) {
			// phpcs:ignore Generic.PHP.NoSilencedErrors
			@libxml_disable_entity_loader( $oldDisable );
			$error = libxml_get_last_error();
			if ( $error ) {
				throw new NormalizedException( "XML error at line {line}: {message}", [
					'line' => $error->line,
					'message' => $error->message,
				] );
			}
			throw new UnexpectedValueException(
				"Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
			);
		}
		$this->debug( "<mediawiki> tag is correct." );

		$this->debug( "Starting primary dump processing loop." );

		$pageCount = 0;
		$keepReading = $this->reader->read();
		while ( $keepReading ) {
			$tag = $this->reader->localName;
			if ( $this->pageOffset ) {
				if ( $tag === 'page' ) {
					$pageCount++;
				}
				// Jump over whole subtrees until the requested page is reached
				if ( $pageCount < $this->pageOffset ) {
					$keepReading = $this->reader->next();
					continue;
				}
			}
			$type = $this->reader->nodeType;

			// Set only for unrecognised elements, whose subtree is skipped below
			$skip = false;

			if ( !$this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
				// Do nothing
			} elseif ( $tag === 'mediawiki' && $type === XMLReader::END_ELEMENT ) {
				break;
			} elseif ( $tag === 'siteinfo' ) {
				$this->handleSiteInfo();
			} elseif ( $tag === 'page' ) {
				$this->handlePage();
			} elseif ( $tag === 'logitem' ) {
				$this->handleLogItem();
			} elseif ( $tag !== '#text' ) {
				$this->warn( "Unhandled top-level XML tag $tag" );
				$skip = true;
			}

			if ( $skip ) {
				$keepReading = $this->reader->next();
				$this->debug( "Skip" );
			} else {
				$keepReading = $this->reader->read();
			}
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
		$this->reader->close();
	}

	return true;
}
754
/**
 * Parses the <siteinfo> element: records the foreign namespaces and hands
 * the collected fields to the site info callback.
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );

	// Fields that can just be stuffed in the siteInfo object
	$simpleFields = [ 'sitename', 'base', 'generator', 'case' ];
	$siteInfo = [];

	while ( $this->reader->read() ) {
		$tag = $this->reader->localName;

		if ( $this->reader->nodeType === XMLReader::END_ELEMENT && $tag === 'siteinfo' ) {
			break;
		}

		if ( $tag === 'namespace' ) {
			// The attribute has to be read before nodeContents() moves the reader on
			$nsKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$nsKey] = $this->nodeContents();
		} elseif ( in_array( $tag, $simpleFields, true ) ) {
			$siteInfo[$tag] = $this->nodeContents();
		}
	}

	$siteInfo['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $siteInfo );
}
781
/**
 * Parses a <logitem> element and passes the collected data to processLogItem().
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );

	// Fields that can just be stuffed in the pageInfo object
	$simpleFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];
	$logInfo = [];

	while ( $this->reader->read() ) {
		$tag = $this->reader->localName;

		if ( $this->reader->nodeType === XMLReader::END_ELEMENT && $tag === 'logitem' ) {
			break;
		}

		if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
			// A hook handler took care of this node
			continue;
		}

		if ( in_array( $tag, $simpleFields, true ) ) {
			$logInfo[$tag] = $this->nodeContents();
		} elseif ( $tag === 'contributor' ) {
			$logInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag !== '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $logInfo );
}
811
/**
 * Builds a WikiRevision from parsed log item data and passes it to the
 * log item callback.
 *
 * 'type' and 'action' are read unconditionally; the other fields are optional.
 *
 * @param array $logInfo Data collected by handleLogItem()
 * @return mixed Result of the log item callback
 */
private function processLogItem( $logInfo ) {
	$logRev = new WikiRevision();

	if ( isset( $logInfo['id'] ) ) {
		$logRev->setID( $logInfo['id'] );
	}
	$logRev->setType( $logInfo['type'] );
	$logRev->setAction( $logInfo['action'] );
	if ( isset( $logInfo['timestamp'] ) ) {
		$logRev->setTimestamp( $logInfo['timestamp'] );
	}
	if ( isset( $logInfo['params'] ) ) {
		$logRev->setParams( $logInfo['params'] );
	}
	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$logRev->setTitle( Title::newFromText( $logInfo['logtitle'] ) );
	}

	$logRev->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$logRev->setComment( $logInfo['comment'] );
	}

	// Prefer a user name, then an IP address, then a placeholder name
	$contributor = $logInfo['contributor'] ?? [];
	if ( isset( $contributor['username'] ) ) {
		$logRev->setUsername(
			$this->externalUserNames->applyPrefix( $contributor['username'] )
		);
	} elseif ( isset( $contributor['ip'] ) ) {
		$logRev->setUserIP( $contributor['ip'] );
	} else {
		$logRev->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	return $this->logItemCallback( $logRev );
}
854
/**
 * Parses a <page> element: collects the page fields, resolves the title when
 * the first revision or upload is reached, dispatches revisions and uploads,
 * and finally runs the page-out callback if the title was valid.
 */
private function handlePage() {
	// Handle page data.
	$this->debug( "Enter page handler." );
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Fields that can just be stuffed in the pageInfo object
	$simpleFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	$skip = false;
	$badTitle = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		$tag = $this->reader->localName;

		if ( $this->reader->nodeType === XMLReader::END_ELEMENT && $tag === 'page' ) {
			break;
		}

		$skip = false;

		if ( $badTitle ) {
			// The title is invalid, bail out of this page
			$skip = true;
			continue;
		}

		if ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
			// A hook handler took care of this node
			continue;
		}

		if ( in_array( $tag, $simpleFields, true ) ) {
			// An XML snippet:
			// <page>
			//     <id>123</id>
			//     <title>Page</title>
			//     <redirect title="NewTitle"/>
			//     ...
			// The redirect tag keeps its value in an attribute rather than in its text.
			$pageInfo[$tag] = $tag === 'redirect'
				? $this->nodeAttribute( 'title' )
				: $this->nodeContents();
			continue;
		}

		if ( $tag === 'revision' || $tag === 'upload' ) {
			if ( !isset( $title ) ) {
				// First revision/upload of this page: resolve the title once
				$title = $this->processTitle( $pageInfo['title'],
					$pageInfo['ns'] ?? null );

				// $title is either an array of two titles or false.
				if ( is_array( $title ) ) {
					$this->pageCallback( $title );
					[ $pageInfo['_title'], $foreignTitle ] = $title;
				} else {
					$badTitle = true;
					$skip = true;
				}
			}

			if ( $title ) {
				if ( $tag === 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
			continue;
		}

		if ( $tag !== '#text' ) {
			$this->warn( "Unhandled page XML tag $tag" );
			$skip = true;
		}
	}

	// @note $pageInfo['_title'] is only set if a valid $title was processed above
	// with no error. In that case pageCallback was called above and
	// $foreignTitle was set together with it, so we do pageOutCallback here.
	if ( !array_key_exists( '_title', $pageInfo ) ) {
		return;
	}

	$this->pageOutCallback(
		$pageInfo['_title'],
		// @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
		$foreignTitle,
		$pageInfo['revisionCount'],
		$pageInfo['successfulRevisionCount'],
		$pageInfo
	);
}
940
/**
 * Parses a <revision> element, then processes it and updates the page's
 * revision counters.
 *
 * @param array &$pageInfo Page data; 'revisionCount' is always incremented and
 *   'successfulRevisionCount' is incremented when processRevision() returns a
 *   truthy value
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );
	$revisionInfo = [];

	$normalFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
		'model', 'format', 'text', 'sha1' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'revision' ) {
			break;
		}

		// Skipping applies to one unknown element only; without this reset every
		// later advance would use next() instead of read().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleRevisionXMLTag(
			$this, $pageInfo, $revisionInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$revisionInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'content' ) {
			// We can have multiple content tags, so make this an array.
			$revisionInfo[$tag][] = $this->handleContent();
		} elseif ( $tag == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			$skip = true;
		}
	}

	$pageInfo['revisionCount']++;
	if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
983
/**
 * Parses a <content> element (one slot of a revision).
 *
 * @return array The fields found, keyed by tag name ('role', 'origin',
 *   'model', 'format', 'text'), plus anything hook handlers added
 */
private function handleContent() {
	$this->debug( "Enter content handler" );
	$contentInfo = [];

	$normalFields = [ 'role', 'origin', 'model', 'format', 'text' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'content' ) {
			break;
		}

		// Skipping applies to one unknown element only; without this reset every
		// later advance would use next() instead of read().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleContentXMLTag(
			$this, $contentInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$contentInfo[$tag] = $this->nodeContents();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled content XML tag $tag" );
			$skip = true;
		}
	}

	return $contentInfo;
}
1014
/**
 * Turns parsed content data into a Content object.
 *
 * @param PageIdentity $page Page the content belongs to; used to pick the
 *   slot's default model when none is given
 * @param int|string $revisionId Revision ID, only used in the error message
 * @param array $contentInfo Needs 'text'; 'model' and 'role' are optional
 * @return Content
 * @throws InvalidArgumentException When 'text' is missing
 * @throws RuntimeException When the text of a size-checked model exceeds MaxArticleSize
 */
private function makeContent( PageIdentity $page, $revisionId, $contentInfo ) {
	$maxArticleSize = $this->config->get( MainConfigNames::MaxArticleSize );

	if ( !isset( $contentInfo['text'] ) ) {
		throw new InvalidArgumentException( 'Missing text field in import.' );
	}

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Testing for revisions with only listed
	// content models, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	$sizeCheckedModels = [ 'wikitext', 'css', 'json', 'javascript', 'text', '' ];
	$isSizeChecked = !isset( $contentInfo['model'] ) ||
		in_array( $contentInfo['model'], $sizeCheckedModels );

	// MaxArticleSize is in KiB; strlen() counts bytes
	if ( $isSizeChecked && strlen( $contentInfo['text'] ) > $maxArticleSize * 1024 ) {
		$subject = $revisionId ? "the revision with ID $revisionId" : 'a revision';
		throw new RuntimeException( 'The text of ' . $subject .
			" exceeds the maximum allowable size ({$maxArticleSize} KiB)" );
	}

	$role = $contentInfo['role'] ?? SlotRecord::MAIN;
	if ( isset( $contentInfo['model'] ) ) {
		$model = $contentInfo['model'];
	} else {
		$model = $this->slotRoleRegistry
			->getRoleHandler( $role )
			->getDefaultModel( $page );
	}
	$handler = $this->contentHandlerFactory->getContentHandler( $model );

	return $handler->unserializeContent(
		$handler->importTransform( $contentInfo['text'] )
	);
}
1061
/**
 * Builds a WikiRevision from parsed revision data and passes it to the
 * revision callback.
 *
 * @param array $pageInfo Page data; '_title' must be set
 * @param array $revisionInfo Data collected by handleRevision()
 * @return mixed Result of the revision callback
 * @throws RuntimeException When an extra slot has no role
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	$rev = new WikiRevision();

	$revId = $revisionInfo['id'] ?? 0;
	if ( $revId ) {
		$rev->setID( $revId );
	}

	$title = $pageInfo['_title'];
	$rev->setTitle( $title );

	// The revision's own fields describe the main slot
	$rev->setContent( SlotRecord::MAIN, $this->makeContent( $title, $revId, $revisionInfo ) );

	// Each <content> element describes one more slot
	foreach ( $revisionInfo['content'] ?? [] as $slotInfo ) {
		if ( !isset( $slotInfo['role'] ) ) {
			throw new RuntimeException( "Missing role for imported slot." );
		}

		$rev->setContent( $slotInfo['role'], $this->makeContent( $title, $revId, $slotInfo ) );
	}
	$rev->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$rev->setComment( $revisionInfo['comment'] );
	}

	// The presence of <minor> alone marks the edit as minor
	if ( isset( $revisionInfo['minor'] ) ) {
		$rev->setMinor( true );
	}

	// Prefer a user name, then an IP address, then a placeholder name
	$contributor = $revisionInfo['contributor'] ?? [];
	if ( isset( $contributor['username'] ) ) {
		$rev->setUsername(
			$this->externalUserNames->applyPrefix( $contributor['username'] )
		);
	} elseif ( isset( $contributor['ip'] ) ) {
		$rev->setUserIP( $contributor['ip'] );
	} else {
		$rev->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	if ( isset( $revisionInfo['sha1'] ) ) {
		$rev->setSha1Base36( $revisionInfo['sha1'] );
	}
	$rev->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $rev );
}
1114
/**
 * Parses an <upload> element and, when upload importing is enabled, passes
 * the collected data to processUpload().
 *
 * @param array &$pageInfo Page data; passed to hook handlers and processUpload()
 * @return mixed Result of processUpload(), or null when uploads are not imported
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );
	$uploadInfo = [];

	$normalFields = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'upload' ) {
			break;
		}

		// Skipping applies to one unknown element only; without this reset every
		// later advance would use next() instead of read().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$uploadInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag == 'contents' ) {
			// Read the attribute while the reader is still on the start tag;
			// nodeContents() leaves it on the end tag.
			$encoding = $this->reader->getAttribute( 'encoding' );
			$contents = $this->nodeContents();
			if ( $encoding === 'base64' ) {
				$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
				$uploadInfo['isTempSrc'] = true;
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$skip = true;
		}
	}

	// A local file under the image base path takes precedence over embedded contents.
	// NOTE(review): 'rel' comes from the dump and is joined to the base path
	// unchecked - confirm that callers only set an image base path for trusted dumps.
	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		$path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
		if ( file_exists( $path ) ) {
			$uploadInfo['fileSrc'] = $path;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	if ( $this->mImportUploads ) {
		return $this->processUpload( $pageInfo, $uploadInfo );
	}
}
1167
/**
 * Writes data to a new temporary file.
 *
 * @param string $contents Data to write
 * @return string Path of the temporary file
 * @throws RuntimeException When the file cannot be created or written
 */
private function dumpTemp( $contents ) {
	$filename = tempnam( wfTempDir(), 'importupload' );
	if ( $filename === false ) {
		throw new RuntimeException( 'Could not create a temporary file for an imported upload.' );
	}
	if ( file_put_contents( $filename, $contents ) === false ) {
		throw new RuntimeException( "Could not write imported upload to temporary file $filename." );
	}
	return $filename;
}
1177
/**
 * Builds a WikiRevision from parsed upload data and passes it to the
 * upload callback.
 *
 * Reads $pageInfo['id'] and the 'timestamp', 'filename', 'src', 'size' and
 * 'comment' upload fields unconditionally.
 *
 * @param array $pageInfo Page data; '_title' and 'id' are used
 * @param array $uploadInfo Data collected by handleUpload()
 * @return mixed Result of the upload callback
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$upload = new WikiRevision();
	$revId = $pageInfo['id'];
	$title = $pageInfo['_title'];
	// T292348: text key may be absent, force addition if null
	$uploadInfo['text'] ??= '';
	$content = $this->makeContent( $title, $revId, $uploadInfo );

	$upload->setTitle( $title );
	$upload->setID( $revId );
	$upload->setTimestamp( $uploadInfo['timestamp'] );
	$upload->setContent( SlotRecord::MAIN, $content );
	$upload->setFilename( $uploadInfo['filename'] );
	if ( isset( $uploadInfo['archivename'] ) ) {
		$upload->setArchiveName( $uploadInfo['archivename'] );
	}
	$upload->setSrc( $uploadInfo['src'] );
	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$isTemp = !empty( $uploadInfo['isTempSrc'] );
		$upload->setFileSrc( $uploadInfo['fileSrc'], $isTemp );
	}
	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$upload->setSha1Base36( $uploadInfo['sha1base36'] );
	}
	$upload->setSize( intval( $uploadInfo['size'] ) );
	$upload->setComment( $uploadInfo['comment'] );

	// No user is set at all when neither a name nor an IP address is known
	$contributor = $uploadInfo['contributor'] ?? [];
	if ( isset( $contributor['username'] ) ) {
		$upload->setUsername(
			$this->externalUserNames->applyPrefix( $contributor['username'] )
		);
	} elseif ( isset( $contributor['ip'] ) ) {
		$upload->setUserIP( $contributor['ip'] );
	}
	$upload->setNoUpdates( $this->mNoUpdates );

	return call_user_func( $this->mUploadCallback, $upload );
}
1222
/**
 * Parses a <contributor> element.
 *
 * @return array Whichever of 'id', 'ip' and 'username' were present;
 *   empty for an empty element
 */
private function handleContributor() {
	$this->debug( "Enter contributor handler." );

	$info = [];
	if ( $this->reader->isEmptyElement ) {
		return $info;
	}

	$knownFields = [ 'id', 'ip', 'username' ];

	while ( $this->reader->read() ) {
		$tag = $this->reader->localName;

		if ( $this->reader->nodeType === XMLReader::END_ELEMENT && $tag === 'contributor' ) {
			break;
		}

		if ( in_array( $tag, $knownFields, true ) ) {
			$info[$tag] = $this->nodeContents();
		}
	}

	return $info;
}
1251
/**
 * Resolves a title from the dump into a local title and checks that the
 * page may be imported.
 *
 * @param string $text Title text from the dump
 * @param string|null $ns Namespace ID from the dump, if present
 * @return array|false [ local title, foreign title ], or false (after emitting
 *   a notice) when the title is invalid, external, cannot exist, or the
 *   performer may not edit it
 */
private function processTitle( $text, $ns = null ) {
	// Without site info there is no namespace map to consult
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory( $this->contentLanguage )
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
	$title = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	if ( $title === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}
	if ( $title->isExternal() ) {
		$this->notice( 'import-error-interwiki', $title->getPrefixedText() );
		return false;
	}
	if ( !$title->canExist() ) {
		$this->notice( 'import-error-special', $title->getPrefixedText() );
		return false;
	}
	if ( !$this->performer->definitelyCan( 'edit', $title ) ) {
		# Do not import if the importing wiki user cannot edit this page
		$this->notice( 'import-error-edit', $title->getPrefixedText() );
		return false;
	}

	return [ $title, $foreignTitle ];
}
1291
/**
 * Opens the XML reader on the registered import source.
 *
 * @throws RuntimeException When the reader cannot be opened
 */
private function openReader() {
	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( false );

	try {
		if ( PHP_VERSION_ID >= 80000 ) {
			// A static call is now preferred, and avoids https://github.com/php/php-src/issues/11548
			$reader = XMLReader::open(
				'uploadsource://' . $this->sourceAdapterId, null, LIBXML_PARSEHUGE );
			if ( $reader instanceof XMLReader ) {
				$this->reader = $reader;
				$status = true;
			} else {
				$status = false;
			}
		} else {
			// A static call generated a deprecation warning prior to PHP 8.0
			$this->reader = new XMLReader;
			$status = $this->reader->open(
				'uploadsource://' . $this->sourceAdapterId, null, LIBXML_PARSEHUGE );
		}
		if ( !$status ) {
			// libxml_get_last_error() returns false when libxml recorded nothing
			$error = libxml_get_last_error();
			$detail = $error ? $error->message : 'unknown error';
			throw new RuntimeException(
				'Encountered an internal error while initializing WikiImporter object: ' . $detail
			);
		}
	} finally {
		// Put the entity loader back on success and on failure alike
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
	}
}
1329
/**
 * Reads the whole document once to detect XML errors before anything is
 * imported, then rewinds the source and reopens the reader.
 *
 * Does nothing when the source is not seekable, since it could not be
 * rewound afterwards.
 *
 * @throws RuntimeException When libxml reports an error while reading
 */
private function syntaxCheckXML() {
	if ( !UploadSourceAdapter::isSeekableSource( $this->sourceAdapterId ) ) {
		return;
	}
	AtEase::suppressWarnings();
	// Keep the entity loader disabled while reading, as doImport() does, to
	// avoid local file inclusion attacks (T48932).
	$oldDisable = libxml_disable_entity_loader( true );
	try {
		// Drop errors left over from earlier parsing so that only errors from
		// this pass are reported.
		libxml_clear_errors();
		while ( $this->reader->read() ) {
			// Nothing to do; reading is what triggers parse errors.
		}
		$error = libxml_get_last_error();
		if ( $error ) {
			$errorMessage = 'XML error at line ' . $error->line . ': ' . $error->message;
			wfDebug( __METHOD__ . ': Invalid xml found - ' . $errorMessage );
			throw new RuntimeException( $errorMessage );
		}
	} finally {
		libxml_disable_entity_loader( $oldDisable );
		AtEase::restoreWarnings();
		$this->reader->close();
	}

	// Reopen for the real import
	UploadSourceAdapter::seekSource( $this->sourceAdapterId, 0 );
	$this->openReader();
}
1357}
const NS_MAIN
Definition Defines.php:65
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTempDir()
Tries to get the system directory for temporary files.
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest. If source is NULL, it just returns the val...
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
array $params
The job parameters.
Reporting callback.
Exception representing a failure to serialize or unserialize a content object.
Helper class for mapping value objects representing basic entities to cache keys.
Defer callable updates to run later in the PHP process.
Class for handling updates to the site_stats table.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
A class containing constants representing the names of configuration variables.
Service for creating WikiPage objects.
Value object representing a content slot associated with a page revision.
A registry service for SlotRoleHandlers, used to define which slot roles are available on which page.
Generic operation result class. Has warning/error list, boolean status and arbitrary value.
Definition Status.php:54
A simple, immutable structure to hold the title of a page on a foreign MediaWiki installation.
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them ba...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Creates Title objects.
Represents a title within MediaWiki.
Definition Title.php:78
Class to parse and build external user names.
static seekSource(string $id, int $offset)
static isSeekableSource(string $id)
static registerSource(ImportSource $source)
XML file reader for the page data importer.
setImportUploads( $import)
doImport()
Primary entry point.
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
setNoUpdates( $noupdates)
Set 'no updates' mode.
setLogItemCallback( $callback)
Sets the action to perform as each log item reached.
importUpload( $revision)
Dummy for now...
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
setPageOffset( $nthPage)
Sets 'pageOffset' value.
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
importLogItem( $revision)
Default per-revision callback, performs the import.
setImageBasePath( $dir)
throwXmlError( $err)
setDebug( $debug)
Set debug mode...
notice( $msg,... $params)
importRevision( $revision)
Default per-revision callback, performs the import.
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
setNoticeCallback( $callback)
Set a callback that displays notice messages.
finishImportPage(PageIdentity $pageIdentity, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
disableStatisticsUpdate()
Statistics updates can take a lot of time.
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
__construct(ImportSource $source, Authority $performer, Config $config, HookContainer $hookContainer, Language $contentLanguage, NamespaceInfo $namespaceInfo, TitleFactory $titleFactory, WikiPageFactory $wikiPageFactory, UploadRevisionImporter $uploadRevisionImporter, IContentHandlerFactory $contentHandlerFactory, SlotRoleRegistry $slotRoleRegistry)
Creates an ImportXMLReader drawing from the source provided.
Represents a revision, log entry or upload during the import process.
Source interface for XML import.
Interface for configuration instances.
Definition Config.php:32
Interface for objects (potentially) representing an editable wiki page.
This interface represents the authority associated with the current execution context,...
Definition Authority.php:37
Represents an object that can convert page titles on a foreign wiki (ForeignTitle objects) into page ...
Interface for database access objects.
$source