MediaWiki master
WikiImporter.php
Go to the documentation of this file.
1<?php
13namespace MediaWiki\Import;
14
15use InvalidArgumentException;
44use RuntimeException;
45use UnexpectedValueException;
48use Wikimedia\NormalizedException\NormalizedException;
50use XMLReader;
51
60 private $reader;
61
63 private $sourceAdapterId;
64
66 private $foreignNamespaces = null;
67
69 private $mLogItemCallback;
70
72 private $mUploadCallback;
73
75 private $mRevisionCallback;
76
78 private $mPageCallback;
79
81 private $mSiteInfoCallback;
82
84 private $mPageOutCallback;
85
87 private $mNoticeCallback;
88
90 private $mDebug;
91
93 private $mImportUploads;
94
96 private $mImageBasePath;
97
99 private $mNoUpdates = false;
100
102 private $pageOffset = 0;
103
104 private ImportTitleFactory $importTitleFactory;
105 private ExternalUserNames $externalUserNames;
106
108 private $countableCache = [];
109
111 private $disableStatisticsUpdate = false;
112
119 private readonly Authority $performer;
120
121 private readonly HookRunner $hookRunner;
122
/**
 * Creates an ImportXMLReader drawing from the source provided.
 *
 * Registers the `uploadsource` stream wrapper when it is not registered yet, registers
 * the source with UploadSourceAdapter, opens the XML reader on it, and installs the
 * default callbacks, title factory and external user name handling.
 *
 * @param ImportSource $source Source the XML dump is read from
 * @param Authority $performer Used for the edit permission check on target titles and
 *  for preparing the page update in finishImportPage()
 * @param Config $config
 * @param HookContainer $hookContainer
 * @param Language $contentLanguage
 * @param NamespaceInfo $namespaceInfo
 * @param TitleFactory $titleFactory
 * @param WikiPageFactory $wikiPageFactory
 * @param UploadRevisionImporter $uploadRevisionImporter
 * @param IContentHandlerFactory $contentHandlerFactory
 * @param SlotRoleRegistry $slotRoleRegistry
 * @throws RuntimeException If the XML reader cannot be opened on the source
 */
public function __construct(
	ImportSource $source,
	Authority $performer,
	private readonly Config $config,
	HookContainer $hookContainer,
	private readonly Language $contentLanguage,
	private readonly NamespaceInfo $namespaceInfo,
	private readonly TitleFactory $titleFactory,
	private readonly WikiPageFactory $wikiPageFactory,
	private readonly UploadRevisionImporter $uploadRevisionImporter,
	private readonly IContentHandlerFactory $contentHandlerFactory,
	private readonly SlotRoleRegistry $slotRoleRegistry,
) {
	$this->performer = $performer;
	$this->hookRunner = new HookRunner( $hookContainer );

	// The reader consumes the source through a custom stream wrapper; register it once.
	if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
		stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
	}
	$this->sourceAdapterId = UploadSourceAdapter::registerSource( $source );

	$this->openReader();

	// Default callbacks
	$this->setPageCallback( $this->beforeImportPage( ... ) );
	$this->setRevisionCallback( $this->importRevision( ... ) );
	$this->setUploadCallback( $this->importUpload( ... ) );
	$this->setLogItemCallback( $this->importLogItem( ... ) );
	$this->setPageOutCallback( $this->finishImportPage( ... ) );

	$this->importTitleFactory = new NaiveImportTitleFactory(
		$this->contentLanguage,
		$this->namespaceInfo,
		$this->titleFactory
	);
	$this->externalUserNames = new ExternalUserNames( 'imported', false );
}
163
/**
 * Returns the XML reader currently used by this importer.
 *
 * @return XMLReader|null Whatever openReader() last stored in the reader property
 */
public function getReader() {
	return $this->reader;
}
170
/**
 * Reports an XML error to the debug log.
 *
 * Despite its name, this method does not throw: it only logs, once through debug()
 * (which is a no-op unless debug mode is on) and once unconditionally.
 *
 * @param string $err Error description
 */
public function throwXmlError( $err ) {
	$this->debug( "FAILURE: $err" );
	wfDebug( "WikiImporter XML error: $err" );
}
178
/**
 * Writes a line to the debug log, but only while debug mode is enabled (see setDebug()).
 *
 * @param string $data Text to log
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}
	wfDebug( "IMPORT: $data" );
}
187
/**
 * Writes a warning to the debug log, regardless of whether debug mode is enabled.
 *
 * @param string $data Text to log
 */
public function warn( $data ) {
	wfDebug( "IMPORT: $data" );
}
194
/**
 * Emits a user-facing notice about the import.
 *
 * When a callable notice callback is installed it receives the message key and the
 * parameter list; otherwise the rendered message text goes to the debug log.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters
 */
public function notice( $msg, ...$params ) {
	if ( !is_callable( $this->mNoticeCallback ) ) {
		// No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		wfDebug( wfMessage( $msg, $params )->text() );
		return;
	}

	( $this->mNoticeCallback )( $msg, $params );
}
210
/**
 * Set debug mode. While enabled, debug() forwards its messages to the debug log.
 *
 * @param bool $debug
 */
public function setDebug( $debug ) {
	$this->mDebug = $debug;
}
218
/**
 * Set 'no updates' mode. The flag is handed to every WikiRevision built by this
 * importer via WikiRevision::setNoUpdates().
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
226
/**
 * Sets the page offset: doImport() skips top-level nodes until it has counted this
 * many <page> elements, so processing starts at the $nthPage-th page.
 *
 * @param int $nthPage
 */
public function setPageOffset( $nthPage ) {
	$this->pageOffset = $nthPage;
}
236
/**
 * Sets the callback used by notice() and returns the previous one.
 *
 * The assignment is delegated to wfSetVar(); NOTE(review): per its summary a null
 * $callback presumably leaves the current callback in place - confirm against wfSetVar().
 *
 * @param callable|null $callback Called with the message key and the parameter list
 * @return callable|null The previously installed callback
 */
public function setNoticeCallback( $callback ) {
	return wfSetVar( $this->mNoticeCallback, $callback );
}
246
/**
 * Sets the action to perform as each new page in the stream is reached.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was installed before
 */
public function setPageCallback( $callback ) {
	[ $old, $this->mPageCallback ] = [ $this->mPageCallback, $callback ];
	return $old;
}
257
/**
 * Sets the action to perform as each page in the stream is completed.
 *
 * The callback is invoked with the page identity, the foreign title, the revision
 * count, the successful revision count and the page info array.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was installed before
 */
public function setPageOutCallback( $callback ) {
	[ $old, $this->mPageOutCallback ] = [ $this->mPageOutCallback, $callback ];
	return $old;
}
272
/**
 * Sets the action to perform as each page revision is reached.
 *
 * @param callable|null $callback Invoked with the WikiRevision and this importer
 * @return callable|null The callback that was installed before
 */
public function setRevisionCallback( $callback ) {
	[ $old, $this->mRevisionCallback ] = [ $this->mRevisionCallback, $callback ];
	return $old;
}
283
/**
 * Sets the action to perform as each file upload version is reached.
 *
 * @param callable|null $callback Invoked with the WikiRevision describing the upload
 * @return callable|null The callback that was installed before
 */
public function setUploadCallback( $callback ) {
	[ $old, $this->mUploadCallback ] = [ $this->mUploadCallback, $callback ];
	return $old;
}
294
/**
 * Sets the action to perform as each log item is reached.
 *
 * @param callable|null $callback Invoked with the WikiRevision and this importer
 * @return callable|null The callback that was installed before
 */
public function setLogItemCallback( $callback ) {
	[ $old, $this->mLogItemCallback ] = [ $this->mLogItemCallback, $callback ];
	return $old;
}
305
/**
 * Sets the action to perform when site info is encountered.
 *
 * @param callable|null $callback Invoked with the site info array and this importer
 * @return callable|null The callback that was installed before
 */
public function setSiteInfoCallback( $callback ) {
	[ $old, $this->mSiteInfoCallback ] = [ $this->mSiteInfoCallback, $callback ];
	return $old;
}
316
/**
 * Sets the factory object to use to convert ForeignTitle objects into local Title
 * objects.
 *
 * @param ImportTitleFactory $factory
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
325
/**
 * Set a target namespace to override the defaults.
 *
 * @param int|string|null $namespace Null keeps the namespaces from the dump; otherwise
 *  a non-negative ID of a namespace that exists on this wiki
 * @return bool True if the title factory was replaced, false if $namespace was rejected
 */
public function setTargetNamespace( $namespace ) {
	if ( $namespace === null ) {
		// Don't override namespaces
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$this->namespaceInfo,
				$this->titleFactory
			)
		);
		return true;
	} elseif (
		$namespace >= 0 &&
		$this->namespaceInfo->exists( intval( $namespace ) )
	) {
		$namespace = intval( $namespace );
		// NOTE(review): the class name on this line was lost in the extracted listing;
		// NamespaceImportTitleFactory is the upstream class taking these arguments - confirm.
		$this->setImportTitleFactory(
			new NamespaceImportTitleFactory(
				$this->namespaceInfo,
				$this->titleFactory,
				$namespace
			)
		);
		return true;
	} else {
		return false;
	}
}
359
/**
 * Set a target root page under which all pages are imported as subpages.
 *
 * Null restores the default title factory, the empty string changes nothing, and any
 * other value must be a local title in a namespace that has subpages enabled.
 *
 * @param string|null $rootpage
 * @return Status Good on success; fatal with 'import-rootpage-invalid' or
 *  'import-rootpage-nosubpage' when the root page cannot be used
 */
public function setTargetRootPage( $rootpage ) {
	$status = Status::newGood();
	$nsInfo = $this->namespaceInfo;
	if ( $rootpage === null ) {
		// No rootpage
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$nsInfo,
				$this->titleFactory
			)
		);
	} elseif ( $rootpage !== '' ) {
		$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
		$title = Title::newFromText( $rootpage );

		if ( !$title || $title->isExternal() ) {
			$status->fatal( 'import-rootpage-invalid' );
		} elseif ( !$nsInfo->hasSubpages( $title->getNamespace() ) ) {
			$displayNSText = $title->getNamespace() === NS_MAIN
				? wfMessage( 'blanknamespace' )->text()
				: $this->contentLanguage->getNsText( $title->getNamespace() );
			$status->fatal( 'import-rootpage-nosubpage', $displayNSText );
		} else {
			// set namespace to 'all', so the namespace check in processTitle() can pass
			$this->setTargetNamespace( null );
			// NOTE(review): the class name on this line was lost in the extracted listing;
			// SubpageImportTitleFactory is the upstream class taking these arguments - confirm.
			$this->setImportTitleFactory(
				new SubpageImportTitleFactory(
					$nsInfo,
					$this->titleFactory,
					$title
				)
			);
		}
	}
	return $status;
}
402
/**
 * Sets the base directory against which the <rel> path of an upload is resolved
 * when looking for the file on the local file system.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
409
/**
 * Enables or disables processing of <upload> elements. While disabled, uploads are
 * still parsed but never handed to the upload callback.
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
416
/**
 * Replaces the external user name handling with a new ExternalUserNames instance
 * built from the given settings.
 *
 * @param string $usernamePrefix Prefix applied to the names of imported users
 * @param bool $assignKnownUsers Passed through to ExternalUserNames
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$this->externalUserNames = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
}
425
/**
 * Statistics update can cause a lot of time; after this call finishImportPage() no
 * longer adjusts the article count.
 */
public function disableStatisticsUpdate() {
	$this->disableStatisticsUpdate = true;
}
433
/**
 * Default per-page callback. Remembers whether the target page is countable before
 * anything is imported, so finishImportPage() can compare afterwards.
 *
 * @param array $titleAndForeignTitle Two-element array whose first element is the
 *  local Title
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$localTitle = $titleAndForeignTitle[0];
	$cacheKey = 'title_' . $localTitle->getPrefixedText();
	$this->countableCache[$cacheKey] = $this->wikiPageFactory
		->newFromTitle( $localTitle )
		->isCountable();
	return true;
}
446
/**
 * Default per-revision callback, performs the import.
 *
 * @param WikiRevision $revision
 * @return bool The result of WikiRevision::importOldRevision(), or false when the
 *  content model cannot be used on the target title or the content fails to unserialize
 */
public function importRevision( $revision ) {
	if ( !$revision->getContentHandler()->canBeUsedOn( $revision->getTitle() ) ) {
		$this->notice( 'import-error-bad-location',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);

		return false;
	}

	try {
		return $revision->importOldRevision();
	// NOTE(review): the catch line was lost in the extracted listing; the exception class
	// is restored from the 'import-error-unserialize' handling below - confirm.
	} catch ( MWContentSerializationException $ex ) {
		$this->notice( 'import-error-unserialize',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);
	}

	return false;
}
477
/**
 * Default per-log-item callback, performs the import by delegating to the revision.
 *
 * @param WikiRevision $revision
 * @return mixed Whatever WikiRevision::importLogItem() returns
 */
public function importLogItem( $revision ) {
	$result = $revision->importLogItem();
	return $result;
}
486
/**
 * Default per-upload callback: hands the revision to the upload revision importer.
 *
 * @param WikiRevision $revision
 * @return bool Whether the importer's status is good
 */
public function importUpload( $revision ) {
	return $this->uploadRevisionImporter->import( $revision )->isGood();
}
496
/**
 * Default page-out callback. Mostly for hook use.
 *
 * Adjusts the article count if the page's countable state changed during the import
 * (unless statistics updates are disabled) and then runs the AfterImportPage hook.
 *
 * @param PageIdentity $pageIdentity Local page that was imported
 * @param ForeignTitle $foreignTitle Title of the page on the source wiki
 * @param int $revCount Number of revisions seen for the page
 * @param int $sRevCount Number of revisions imported successfully
 * @param array $pageInfo Page info collected while parsing
 * @return mixed Result of the AfterImportPage hook
 */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	$title = Title::newFromPageIdentity( $pageIdentity );

	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$page = $this->wikiPageFactory->newFromTitle( $pageIdentity );

		$page->loadPageData( IDBAccessObject::READ_LATEST );
		$rev = $page->getRevisionRecord();
		if ( $rev === null ) {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
				' because WikiPage::getRevisionRecord() returned null' );
		} else {
			$update = $page->newPageUpdater( $this->performer )->prepareUpdate();
			// Must be built exactly like the key written by beforeImportPage(),
			// otherwise the lookup below never matches and the count is never adjusted.
			$countKey = 'title_' . $title->getPrefixedText();
			$countable = $update->isCountable();
			if ( array_key_exists( $countKey, $this->countableCache ) &&
				$countable != $this->countableCache[$countKey] ) {
				DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
					'articles' => ( (int)$countable - (int)$this->countableCache[$countKey] )
				] ) );
			}
		}
	}

	return $this->hookRunner->onAfterImportPage( $title, $foreignTitle,
		$revCount, $sRevCount, $pageInfo );
}
541
/**
 * Invokes the site info callback, if one is installed.
 *
 * @param array $siteInfo
 * @return mixed The callback's result, or false when no callback is set
 */
private function siteInfoCallback( $siteInfo ) {
	if ( !$this->mSiteInfoCallback ) {
		return false;
	}

	return ( $this->mSiteInfoCallback )( $siteInfo, $this );
}
554
/**
 * Notify the callback function when a new "<page>" is reached.
 *
 * @param array $title Two-element array of local title and foreign title
 */
public function pageCallback( $title ) {
	if ( !$this->mPageCallback ) {
		return;
	}

	( $this->mPageCallback )( $title );
}
564
/**
 * Notify the callback function when a "</page>" is closed.
 *
 * @param PageIdentity $pageIdentity
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sucCount Number of revisions for which the callback returned true
 * @param array $pageInfo Associative array of page information
 */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sucCount, $pageInfo ) {
	if ( !$this->mPageOutCallback ) {
		return;
	}

	( $this->mPageOutCallback )( $pageIdentity, $foreignTitle, $revCount, $sucCount, $pageInfo );
}
579
/**
 * Notify the callback function of a revision.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's result, or false when no callback is set
 */
private function revisionCallback( $revision ) {
	if ( !$this->mRevisionCallback ) {
		return false;
	}

	return ( $this->mRevisionCallback )( $revision, $this );
}
592
/**
 * Notify the callback function of a new log item.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's result, or false when no callback is set
 */
private function logItemCallback( $revision ) {
	if ( !$this->mLogItemCallback ) {
		return false;
	}

	return ( $this->mLogItemCallback )( $revision, $this );
}
605
/**
 * Retrieves an attribute of the node the reader is positioned on.
 *
 * @param string $attr Attribute name
 * @return string The attribute value, or '' when the reader reports null for it
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );
	return $value ?? '';
}
615
/**
 * Fetches text contents of the current element, advancing the reader until the next
 * end-element node is reached.
 *
 * Text, CDATA and significant-whitespace nodes are concatenated; every other node type
 * is ignored. If the stream ends before an end element is seen, the reader is closed
 * and an empty string is returned.
 *
 * @return string
 */
public function nodeContents() {
	if ( $this->reader->isEmptyElement ) {
		return "";
	}

	$textNodeTypes = [ XMLReader::TEXT, XMLReader::CDATA, XMLReader::SIGNIFICANT_WHITESPACE ];
	$collected = "";
	while ( $this->reader->read() ) {
		$nodeType = $this->reader->nodeType;
		if ( $nodeType === XMLReader::END_ELEMENT ) {
			return $collected;
		}
		if ( in_array( $nodeType, $textNodeTypes, true ) ) {
			$collected .= $this->reader->value;
		}
	}

	// Ran out of input without finding the closing tag.
	$this->reader->close();
	return '';
}
643
/**
 * Primary entry point.
 *
 * Syntax-checks the dump (seekable sources only), verifies that the root element is
 * <mediawiki>, and then dispatches every top-level node to the matching handler until
 * the closing </mediawiki> or the end of the stream. The reader is always closed on exit.
 *
 * @return bool Always true when no exception is thrown
 * @throws NormalizedException If the root element is wrong and libxml reported an error
 * @throws UnexpectedValueException If the root element is wrong without a libxml error
 * @throws RuntimeException If the syntax check fails
 */
public function doImport() {
	$this->syntaxCheckXML();

	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (T48932).
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( true );
	try {
		$this->reader->read();

		if ( $this->reader->localName != 'mediawiki' ) {
			// phpcs:ignore Generic.PHP.NoSilencedErrors
			@libxml_disable_entity_loader( $oldDisable );
			$error = libxml_get_last_error();
			if ( !$error ) {
				throw new UnexpectedValueException(
					"Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
				);
			}
			throw new NormalizedException( "XML error at line {line}: {message}", [
				'line' => $error->line,
				'message' => $error->message,
			] );
		}
		$this->debug( "<mediawiki> tag is correct." );

		$this->debug( "Starting primary dump processing loop." );

		$more = $this->reader->read();
		$pagesSeen = 0;
		while ( $more ) {
			$tag = $this->reader->localName;

			// Fast-forward over whole subtrees until the requested page offset is reached.
			if ( $this->pageOffset ) {
				if ( $tag === 'page' ) {
					$pagesSeen++;
				}
				if ( $pagesSeen < $this->pageOffset ) {
					$more = $this->reader->next();
					continue;
				}
			}

			$nodeType = $this->reader->nodeType;
			$skipSubtree = false;

			// A hook returning false has taken over this node; nothing else to do for it.
			if ( $this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
				if ( $tag == 'mediawiki' && $nodeType == XMLReader::END_ELEMENT ) {
					break;
				}

				switch ( $tag ) {
					case 'siteinfo':
						$this->handleSiteInfo();
						break;
					case 'page':
						$this->handlePage();
						break;
					case 'logitem':
						$this->handleLogItem();
						break;
					case '#text':
						break;
					default:
						$this->warn( "Unhandled top-level XML tag $tag" );
						$skipSubtree = true;
				}
			}

			if ( $skipSubtree ) {
				$more = $this->reader->next();
				$this->debug( "Skip" );
			} else {
				$more = $this->reader->read();
			}
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
		$this->reader->close();
	}

	return true;
}
727
/**
 * Parses a <siteinfo> element and passes the collected data to the site info callback.
 *
 * <namespace> children are recorded in the foreign namespace map (keyed by their `key`
 * attribute); sitename, base, generator and case are copied into the site info array.
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );

	// Fields that can just be stuffed in the siteInfo object
	$plainFields = [ 'sitename', 'base', 'generator', 'case' ];
	$info = [];

	while ( $this->reader->read() ) {
		$tag = $this->reader->localName;
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT && $tag == 'siteinfo' ) {
			break;
		}

		if ( $tag == 'namespace' ) {
			$this->foreignNamespaces[$this->nodeAttribute( 'key' )] =
				$this->nodeContents();
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$info[$tag] = $this->nodeContents();
		}
	}

	$info['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $info );
}
754
/**
 * Parses a <logitem> element and hands the collected fields to processLogItem().
 *
 * Each child node is first offered to the ImportHandleLogItemXMLTag hook; a false
 * result means the hook dealt with it.
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );

	// Fields that can just be stuffed in the pageInfo object
	$plainFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];
	$logInfo = [];

	while ( $this->reader->read() ) {
		$tag = $this->reader->localName;
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT && $tag == 'logitem' ) {
			break;
		}

		if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
			// Handled by the hook
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$logInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$logInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $logInfo );
}
784
/**
 * Builds a WikiRevision from parsed log item data and passes it to the log item
 * callback.
 *
 * The 'type' and 'action' fields are read unconditionally; all other fields are
 * optional. Without a contributor name or IP the entry is attributed to a prefixed
 * 'Unknown user'.
 *
 * @param array $logInfo Fields collected by handleLogItem()
 * @return mixed Result of the log item callback, or false when none is set
 */
private function processLogItem( $logInfo ) {
	$item = new WikiRevision();

	if ( isset( $logInfo['id'] ) ) {
		$item->setID( $logInfo['id'] );
	}
	$item->setType( $logInfo['type'] );
	$item->setAction( $logInfo['action'] );
	if ( isset( $logInfo['timestamp'] ) ) {
		$item->setTimestamp( $logInfo['timestamp'] );
	}
	if ( isset( $logInfo['params'] ) ) {
		$item->setParams( $logInfo['params'] );
	}
	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$item->setTitle( Title::newFromText( $logInfo['logtitle'] ) );
	}

	$item->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$item->setComment( $logInfo['comment'] );
	}

	// Attribute the entry: named user first, then IP, then a placeholder.
	$contributor = $logInfo['contributor'] ?? [];
	if ( isset( $contributor['username'] ) ) {
		$item->setUsername( $this->externalUserNames->applyPrefix( $contributor['username'] ) );
	} elseif ( isset( $contributor['ip'] ) ) {
		$item->setUserIP( $contributor['ip'] );
	} else {
		$item->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	return $this->logItemCallback( $item );
}
827
/**
 * Parses a <page> element, importing each contained revision and upload.
 *
 * The target title is resolved lazily when the first <revision> or <upload> is reached.
 * If it cannot be resolved - or no <title> has been seen by then - the remainder of the
 * page is skipped. The page-out callback only runs for pages whose title was resolved.
 */
private function handlePage() {
	// Handle page data.
	$this->debug( "Enter page handler." );
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Fields that can just be stuffed in the pageInfo object
	$normalFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	$skip = false;
	$badTitle = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'page' ) {
			break;
		}

		$skip = false;

		$tag = $this->reader->localName;

		if ( $badTitle ) {
			// The title is invalid, bail out of this page
			$skip = true;
		} elseif ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			// An XML snippet:
			// <page>
			//     <id>123</id>
			//     <title>Page</title>
			//     <redirect title="NewTitle"/>
			//     ...
			// Because the redirect tag is built differently, we need special handling for that case.
			if ( $tag == 'redirect' ) {
				$pageInfo[$tag] = $this->nodeAttribute( 'title' );
			} else {
				$pageInfo[$tag] = $this->nodeContents();
			}
		} elseif ( $tag == 'revision' || $tag == 'upload' ) {
			if ( !isset( $title ) ) {
				if ( isset( $pageInfo['title'] ) ) {
					$title = $this->processTitle( $pageInfo['title'],
						$pageInfo['ns'] ?? null );
				} else {
					// Malformed dump: content appears before any <title>. Treat it like
					// an invalid title instead of reading an undefined array key.
					$this->warn( "Page has a <$tag> before any <title>, skipping the page" );
					$title = false;
				}

				// $title is either an array of two titles or false.
				if ( is_array( $title ) ) {
					$this->pageCallback( $title );
					[ $pageInfo['_title'], $foreignTitle ] = $title;
				} else {
					$badTitle = true;
					$skip = true;
				}
			}

			if ( $title ) {
				if ( $tag == 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled page XML tag $tag" );
			$skip = true;
		}
	}

	// @note $pageInfo['_title'] is only set if a valid $title was processed above
	// with no error. In that case pageCallback has been called and $foreignTitle
	// is set as well, since both come from the same $title array, so the
	// pageOutCallback can be run here.
	if ( array_key_exists( '_title', $pageInfo ) ) {
		$title = $pageInfo['_title'];
		$this->pageOutCallback(
			$title,
			// @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
			$foreignTitle,
			$pageInfo['revisionCount'],
			$pageInfo['successfulRevisionCount'],
			$pageInfo
		);
	}
}
913
/**
 * Parses a <revision> element and imports it through processRevision().
 *
 * Increments $pageInfo['revisionCount'] for every revision parsed and
 * $pageInfo['successfulRevisionCount'] when processRevision() reports success.
 *
 * @param array &$pageInfo Page info collected so far; the counters are updated in place
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );
	$revisionInfo = [];

	$normalFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
		'model', 'format', 'text', 'sha1' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'revision' ) {
			break;
		}

		// Skipping applies to one unhandled subtree only; reset it every iteration
		// (as handlePage() does) so later nodes are read normally again.
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleRevisionXMLTag(
			$this, $pageInfo, $revisionInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$revisionInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'content' ) {
			// We can have multiple content tags, so make this an array.
			$revisionInfo[$tag][] = $this->handleContent();
		} elseif ( $tag == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			$skip = true;
		}
	}

	$pageInfo['revisionCount']++;
	if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
956
/**
 * Parses a <content> element (one slot of a revision).
 *
 * @return array Values of the role, origin, model, format and text children that were
 *  present, keyed by tag name
 */
private function handleContent(): array {
	$this->debug( "Enter content handler" );
	$contentInfo = [];

	$normalFields = [ 'role', 'origin', 'model', 'format', 'text' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'content' ) {
			break;
		}

		// Skipping applies to one unhandled subtree only; reset it every iteration
		// (as handlePage() does) so later nodes are read normally again.
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleContentXMLTag(
			$this, $contentInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$contentInfo[$tag] = $this->nodeContents();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled content XML tag $tag" );
			$skip = true;
		}
	}

	return $contentInfo;
}
987
/**
 * Builds a Content object from the serialized text found in the dump.
 *
 * @param PageIdentity $page Page the content belongs to; used to pick the default model
 * @param int $revisionId Revision ID for error messages; falsy when unknown
 * @param array $contentInfo Must contain 'text'; may contain 'model' and 'role'
 * @return Content
 * @throws InvalidArgumentException If $contentInfo has no 'text' entry
 * @throws RuntimeException If the text of a size-checked model exceeds MaxArticleSize
 */
private function makeContent( PageIdentity $page, $revisionId, $contentInfo ) {
	$maxArticleSize = $this->config->get( MainConfigNames::MaxArticleSize );

	if ( !isset( $contentInfo['text'] ) ) {
		throw new InvalidArgumentException( 'Missing text field in import.' );
	}

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Testing for revisions with only listed
	// content models, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	$sizeCheckedModels = [ 'wikitext', 'css', 'json', 'javascript', 'text', '' ];
	$isSizeChecked = !isset( $contentInfo['model'] )
		|| in_array( $contentInfo['model'], $sizeCheckedModels );
	if ( $isSizeChecked && strlen( $contentInfo['text'] ) > $maxArticleSize * 1024 ) {
		$subject = $revisionId ? "the revision with ID $revisionId" : 'a revision';
		throw new RuntimeException( 'The text of ' . $subject .
			" exceeds the maximum allowable size ({$maxArticleSize} KiB)" );
	}

	$role = $contentInfo['role'] ?? SlotRecord::MAIN;
	if ( isset( $contentInfo['model'] ) ) {
		$model = $contentInfo['model'];
	} else {
		// No explicit model in the dump: fall back to the slot role's default for this page.
		$model = $this->slotRoleRegistry->getRoleHandler( $role )->getDefaultModel( $page );
	}
	$handler = $this->contentHandlerFactory->getContentHandler( $model );

	return $handler->unserializeContent(
		$handler->importTransform( $contentInfo['text'] )
	);
}
1034
/**
 * Builds a WikiRevision from parsed revision data and passes it to the revision
 * callback.
 *
 * The main slot is built from the revision's own text/model fields; every entry of
 * $revisionInfo['content'] adds a further slot and must name its role.
 *
 * @param array $pageInfo Page info; '_title' must hold the resolved local title
 * @param array $revisionInfo Fields collected by handleRevision()
 * @return mixed Result of the revision callback, or false when none is set
 * @throws RuntimeException If a slot has no role (or via makeContent())
 * @throws InvalidArgumentException Via makeContent() when text is missing
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	$rev = new WikiRevision();

	$revId = $revisionInfo['id'] ?? 0;
	if ( $revId ) {
		$rev->setID( $revId );
	}

	$title = $pageInfo['_title'];
	$rev->setTitle( $title );

	$rev->setContent( SlotRecord::MAIN, $this->makeContent( $title, $revId, $revisionInfo ) );

	$extraSlots = $revisionInfo['content'] ?? [];
	foreach ( $extraSlots as $slotInfo ) {
		if ( !isset( $slotInfo['role'] ) ) {
			throw new RuntimeException( "Missing role for imported slot." );
		}

		$slotContent = $this->makeContent( $title, $revId, $slotInfo );
		$rev->setContent( $slotInfo['role'], $slotContent );
	}
	$rev->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$rev->setComment( $revisionInfo['comment'] );
	}

	// The mere presence of the <minor> tag marks the edit as minor.
	if ( isset( $revisionInfo['minor'] ) ) {
		$rev->setMinor( true );
	}

	// Attribute the revision: named user first, then IP, then a placeholder.
	$contributor = $revisionInfo['contributor'] ?? [];
	if ( isset( $contributor['username'] ) ) {
		$rev->setUsername( $this->externalUserNames->applyPrefix( $contributor['username'] ) );
	} elseif ( isset( $contributor['ip'] ) ) {
		$rev->setUserIP( $contributor['ip'] );
	} else {
		$rev->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	if ( isset( $revisionInfo['sha1'] ) ) {
		$rev->setSha1Base36( $revisionInfo['sha1'] );
	}
	$rev->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $rev );
}
1087
/**
 * Parses an <upload> element and, if upload import is enabled, processes it.
 *
 * File data comes either from an inline base64 <contents> element (written to a
 * temporary file) or from an existing file below the image base path named by <rel>;
 * the latter takes precedence when both are available.
 *
 * @param array &$pageInfo Page info collected so far
 * @return mixed Result of processUpload() when upload import is enabled, null otherwise
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );
	$uploadInfo = [];

	$normalFields = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'upload' ) {
			break;
		}

		// Skipping applies to one unhandled subtree only; reset it every iteration
		// (as handlePage() does) so later nodes are read normally again.
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$uploadInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contents' ) {
			// Read the attribute while the reader is still on the start tag;
			// nodeContents() advances it to the end tag.
			$encoding = $this->reader->getAttribute( 'encoding' );
			$contents = $this->nodeContents();
			if ( $encoding === 'base64' ) {
				$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
				$uploadInfo['isTempSrc'] = true;
			}
		} elseif ( $tag == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$skip = true;
		}
	}

	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		$path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
		if ( file_exists( $path ) ) {
			$uploadInfo['fileSrc'] = $path;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	if ( $this->mImportUploads ) {
		return $this->processUpload( $pageInfo, $uploadInfo );
	}
}
1140
/**
 * Writes the given data to a new temporary file in the wiki's temp directory.
 *
 * @param string $contents Raw file data
 * @return string Name of the temporary file as returned by tempnam()
 */
private function dumpTemp( $contents ) {
	$tmpPath = tempnam( wfTempDir(), 'importupload' );
	file_put_contents( $tmpPath, $contents );

	return $tmpPath;
}
1150
/**
 * Builds a WikiRevision from parsed upload data and passes it to the upload callback.
 *
 * @param array $pageInfo Page info; 'id' and '_title' are read
 * @param array $uploadInfo Fields collected by handleUpload(); timestamp, filename,
 *  src, size and comment are read unconditionally
 * @return mixed Result of the upload callback, or false when no callback is set
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$revision = new WikiRevision();
	$revId = $pageInfo['id'];
	$title = $pageInfo['_title'];
	// T292348: text key may be absent, force addition if null
	$uploadInfo['text'] ??= '';
	$content = $this->makeContent( $title, $revId, $uploadInfo );

	$revision->setTitle( $title );
	$revision->setID( $revId );
	$revision->setTimestamp( $uploadInfo['timestamp'] );
	$revision->setContent( SlotRecord::MAIN, $content );
	$revision->setFilename( $uploadInfo['filename'] );
	if ( isset( $uploadInfo['archivename'] ) ) {
		$revision->setArchiveName( $uploadInfo['archivename'] );
	}
	$revision->setSrc( $uploadInfo['src'] );
	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$revision->setFileSrc( $uploadInfo['fileSrc'],
			!empty( $uploadInfo['isTempSrc'] )
		);
	}
	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$revision->setSha1Base36( $uploadInfo['sha1base36'] );
	}
	$revision->setSize( intval( $uploadInfo['size'] ) );
	$revision->setComment( $uploadInfo['comment'] );

	if ( isset( $uploadInfo['contributor']['username'] ) ) {
		$revision->setUsername(
			$this->externalUserNames->applyPrefix( $uploadInfo['contributor']['username'] )
		);
	} elseif ( isset( $uploadInfo['contributor']['ip'] ) ) {
		$revision->setUserIP( $uploadInfo['contributor']['ip'] );
	}
	$revision->setNoUpdates( $this->mNoUpdates );

	// setUploadCallback() accepts any value, including null. Mirror revisionCallback()
	// and logItemCallback(): report failure instead of calling a non-callable.
	if ( !$this->mUploadCallback ) {
		return false;
	}

	return ( $this->mUploadCallback )( $revision );
}
1195
/**
 * Parses a <contributor> element.
 *
 * @return array Values of the id, ip and username children that were present, keyed by
 *  tag name; empty for a self-closing element
 */
private function handleContributor() {
	$this->debug( "Enter contributor handler." );

	$contributor = [];
	if ( $this->reader->isEmptyElement ) {
		return $contributor;
	}

	$knownFields = [ 'id', 'ip', 'username' ];

	while ( $this->reader->read() ) {
		$tag = $this->reader->localName;
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT && $tag == 'contributor' ) {
			break;
		}

		if ( in_array( $tag, $knownFields ) ) {
			$contributor[$tag] = $this->nodeContents();
		}
	}

	return $contributor;
}
1224
/**
 * Resolves a title from the dump to a local title and checks that it can be imported.
 *
 * A notice is emitted and false returned when the title is invalid, is an interwiki
 * title, cannot exist as a page, or may not be edited by the performer.
 *
 * @param string $text Title text from the dump
 * @param string|null $ns Namespace ID from the dump, if it provided one
 * @return array|false [ local Title, ForeignTitle ] on success, false otherwise
 */
private function processTitle( $text, $ns = null ) {
	// Without <siteinfo> namespaces we can only guess the namespace from the title text.
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory( $this->contentLanguage )
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
	$title = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	if ( $title === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}

	if ( $title->isExternal() ) {
		$errorKey = 'import-error-interwiki';
	} elseif ( !$title->canExist() ) {
		$errorKey = 'import-error-special';
	} elseif ( !$this->performer->definitelyCan( 'edit', $title ) ) {
		# Do not import if the importing wiki user cannot edit this page
		$errorKey = 'import-error-edit';
	} else {
		return [ $title, $foreignTitle ];
	}

	$this->notice( $errorKey, $title->getPrefixedText() );
	return false;
}
1264
/**
 * Opens the XML reader on the registered upload source and stores it in $this->reader.
 *
 * The entity loader setting is changed only for the duration of the open call and is
 * restored afterwards, whether or not opening succeeds.
 *
 * @throws RuntimeException If the reader cannot be opened
 */
private function openReader() {
	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( false );

	try {
		// A static call, to avoid https://github.com/php/php-src/issues/11548
		$reader = XMLReader::open(
			'uploadsource://' . $this->sourceAdapterId, null, LIBXML_PARSEHUGE );
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
	}

	if ( !( $reader instanceof XMLReader ) ) {
		// libxml_get_last_error() returns false when libxml recorded nothing, so the
		// message must not be dereferenced blindly.
		$error = libxml_get_last_error();
		$detail = $error ? $error->message : 'the import source could not be opened';
		throw new RuntimeException(
			'Encountered an internal error while initializing WikiImporter object: ' . $detail
		);
	}

	$this->reader = $reader;
}
1294
/**
 * Checks the whole dump for XML syntax errors before anything is imported.
 *
 * Only done for seekable sources: the document is read to the end once, then the
 * source is rewound and the reader reopened for the real import.
 *
 * @throws RuntimeException If libxml reports an error while reading the document, or
 *  if the reader cannot be reopened
 */
private function syntaxCheckXML() {
	if ( !UploadSourceAdapter::isSeekableSource( $this->sourceAdapterId ) ) {
		return;
	}
	// NOTE(review): this pass reads with the entity loader enabled (false), while
	// doImport() reads with it disabled per T48932 - confirm this is intentional.
	// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
	$oldDisable = @libxml_disable_entity_loader( false );
	try {
		// Drop errors left over from earlier XML processing in this request, so that
		// only problems of this document are reported below.
		libxml_clear_errors();
		// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
		while ( @$this->reader->read() );
		$error = libxml_get_last_error();
		if ( $error ) {
			$errorMessage = 'XML error at line ' . $error->line . ': ' . $error->message;
			wfDebug( __METHOD__ . ': Invalid xml found - ' . $errorMessage );
			throw new RuntimeException( $errorMessage );
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
		@libxml_disable_entity_loader( $oldDisable );
		$this->reader->close();
	}

	// Reopen for the real import
	UploadSourceAdapter::seekSource( $this->sourceAdapterId, 0 );
	$this->openReader();
}
1323}
1324
1326class_alias( WikiImporter::class, 'WikiImporter' );
const NS_MAIN
Definition Defines.php:51
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTempDir()
Tries to get the system directory for temporary files.
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest. If source is NULL, it just returns the value and does not set the variable.
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Exception representing a failure to serialize or unserialize a content object.
Defer callable updates to run later in the PHP process.
Class for handling updates to the site_stats table.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
static registerSource(ImportSource $source)
static seekSource(string $id, int $offset)
XML file reader for the page data importer.
disableStatisticsUpdate()
Statistics updates can take a lot of time.
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
setDebug( $debug)
Set debug mode...
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
importRevision( $revision)
Default per-revision callback, performs the import.
finishImportPage(PageIdentity $pageIdentity, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
__construct(ImportSource $source, Authority $performer, private readonly Config $config, HookContainer $hookContainer, private readonly Language $contentLanguage, private readonly NamespaceInfo $namespaceInfo, private readonly TitleFactory $titleFactory, private readonly WikiPageFactory $wikiPageFactory, private readonly UploadRevisionImporter $uploadRevisionImporter, private readonly IContentHandlerFactory $contentHandlerFactory, private readonly SlotRoleRegistry $slotRoleRegistry,)
Creates an ImportXMLReader drawing from the source provided.
setNoUpdates( $noupdates)
Set 'no updates' mode.
doImport()
Primary entry point.
setPageOffset( $nthPage)
Sets 'pageOffset' value.
importLogItem( $revision)
Default per-revision callback, performs the import.
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
setLogItemCallback( $callback)
Sets the action to perform as each log item is reached.
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
setNoticeCallback( $callback)
Set a callback that displays notice messages.
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
importUpload( $revision)
Dummy for now...
Base class for language-specific code.
Definition Language.php:65
A class containing constants representing the names of configuration variables.
const MaxArticleSize
Name constant for the MaxArticleSize setting, for use with Config::get()
Helper class for mapping page value objects to a string key.
Service for creating WikiPage objects.
Value object representing a content slot associated with a page revision.
A registry service for SlotRoleHandlers, used to define which slot roles are available on which page.
Generic operation result class. Has warning/error list, boolean status and arbitrary value.
Definition Status.php:44
A simple, immutable structure to hold the title of a page on a foreign MediaWiki installation.
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them ba...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Creates Title objects.
Represents a title within MediaWiki.
Definition Title.php:69
Class to parse and build external user names.
Value object representing a message parameter with one of the types from {.
Interface for configuration instances.
Definition Config.php:18
Content objects represent page content, e.g.
Definition Content.php:28
Source interface for XML import.
Interface for objects (potentially) representing an editable wiki page.
This interface represents the authority associated with the current execution context,...
Definition Authority.php:23
Represents an object that can convert page titles on a foreign wiki (ForeignTitle objects) into page ...
Interface for database access objects.
$source