// Identifier appended to the 'uploadsource://' URL when the XML reader is opened.
63 private $sourceAdapterId;
// Foreign namespace data handed to the site-info callback as '_namespaces'.
// While this is null, processTitle() builds titles with NaiveForeignTitleFactory
// instead of NamespaceAwareForeignTitleFactory.
66 private $foreignNamespaces =
null;
// Callback invoked by logItemCallback() with ( $revision, $this ).
69 private $mLogItemCallback;
// Callback invoked by processUpload() with ( $revision ).
72 private $mUploadCallback;
// Callback invoked by revisionCallback() with ( $revision, $this ).
75 private $mRevisionCallback;
// Callback invoked with ( $title ) when set.
78 private $mPageCallback;
// Callback invoked by siteInfoCallback() with ( $siteInfo, $this ).
81 private $mSiteInfoCallback;
// Callback invoked by pageOutCallback() with the page identity, foreign title,
// revision count, successful revision count and page info.
84 private $mPageOutCallback;
// Callback invoked by notice() with ( $msg, $params ) when it is callable.
87 private $mNoticeCallback;
// When truthy, handleUpload() forwards the collected upload data to processUpload().
93 private $mImportUploads;
// Base directory joined with an upload's 'rel' value to locate the file on disk.
96 private $mImageBasePath;
// Flag copied onto every WikiRevision via setNoUpdates().
99 private $mNoUpdates =
false;
// When non-zero, top-level <page> elements are skipped while the running page
// count is still below this value.
102 private $pageOffset = 0;
// Countable state recorded per page under a 'title_…' key, compared again after
// the page is imported to adjust the 'articles' site statistic.
108 private $countableCache = [];
// NOTE(review): not read in the visible code; presumably gates the article-count
// adjustment performed after a page is imported — confirm.
111 private $disableStatisticsUpdate =
false;
129 private readonly
Config $config,
131 private readonly
Language $contentLanguage,
139 $this->performer = $performer;
140 $this->hookRunner =
new HookRunner( $hookContainer );
142 if ( !in_array(
'uploadsource', stream_get_wrappers() ) ) {
143 stream_wrapper_register(
'uploadsource', UploadSourceAdapter::class );
157 $this->contentLanguage,
158 $this->namespaceInfo,
168 return $this->reader;
175 $this->
debug(
"FAILURE: $err" );
176 wfDebug(
"WikiImporter XML error: $err" );
183 if ( $this->mDebug ) {
191 public function warn( $data ) {
201 public function notice( $msg, ...$params ) {
202 if ( is_callable( $this->mNoticeCallback ) ) {
203 ( $this->mNoticeCallback )( $msg, $params );
204 }
else { # No ImportReporter -> CLI
216 $this->mDebug = $debug;
224 $this->mNoUpdates = $noupdates;
234 $this->pageOffset = $nthPage;
244 return wfSetVar( $this->mNoticeCallback, $callback );
253 $previous = $this->mPageCallback;
254 $this->mPageCallback = $callback;
268 $previous = $this->mPageOutCallback;
269 $this->mPageOutCallback = $callback;
279 $previous = $this->mRevisionCallback;
280 $this->mRevisionCallback = $callback;
290 $previous = $this->mUploadCallback;
291 $this->mUploadCallback = $callback;
301 $previous = $this->mLogItemCallback;
302 $this->mLogItemCallback = $callback;
312 $previous = $this->mSiteInfoCallback;
313 $this->mSiteInfoCallback = $callback;
323 $this->importTitleFactory = $factory;
332 if ( $namespace ===
null ) {
336 $this->contentLanguage,
337 $this->namespaceInfo,
344 $this->namespaceInfo->exists( intval( $namespace ) )
346 $namespace = intval( $namespace );
349 $this->namespaceInfo,
366 $status = Status::newGood();
367 $nsInfo = $this->namespaceInfo;
368 if ( $rootpage ===
null ) {
372 $this->contentLanguage,
377 } elseif ( $rootpage !==
'' ) {
378 $rootpage = rtrim( $rootpage,
'/' );
379 $title = Title::newFromText( $rootpage );
381 if ( !$title || $title->isExternal() ) {
382 $status->fatal(
'import-rootpage-invalid' );
383 } elseif ( !$nsInfo->hasSubpages( $title->getNamespace() ) ) {
384 $displayNSText = $title->getNamespace() ===
NS_MAIN
386 : $this->contentLanguage->getNsText( $title->getNamespace() );
387 $status->fatal(
'import-rootpage-nosubpage', $displayNSText );
407 $this->mImageBasePath = $dir;
414 $this->mImportUploads = $import;
423 $this->externalUserNames =
new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
$title = $titleAndForeignTitle[0];
$page = $this->wikiPageFactory->newFromTitle( $title );
// Remember whether the page was countable before the import touches it.
// The key has to be built exactly like the lookup key used once the page is
// finished ( 'title_' . CacheKeyHelper::getKeyForPage( ... ) ). Keying by
// getPrefixedText() produced a different string, so the stored entry was never
// found and the 'articles' statistic was never adjusted.
// NOTE(review): assumes $title is a page object accepted by
// CacheKeyHelper::getKeyForPage(), as $pageIdentity is on the lookup side — confirm.
$this->countableCache['title_' . CacheKeyHelper::getKeyForPage( $title )] = $page->isCountable();
453 if ( !$revision->getContentHandler()->canBeUsedOn( $revision->getTitle() ) ) {
454 $this->
notice(
'import-error-bad-location',
455 $revision->getTitle()->getPrefixedText(),
457 $revision->getModel(),
458 $revision->getFormat()
465 return $revision->importOldRevision();
467 $this->
notice(
'import-error-unserialize',
468 $revision->getTitle()->getPrefixedText(),
470 $revision->getModel(),
471 $revision->getFormat()
484 return $revision->importLogItem();
493 $status = $this->uploadRevisionImporter->import( $revision );
494 return $status->isGood();
507 $sRevCount, $pageInfo
516 $page = $this->wikiPageFactory->newFromTitle( $pageIdentity );
518 $page->loadPageData( IDBAccessObject::READ_LATEST );
519 $rev = $page->getRevisionRecord();
520 if ( $rev ===
null ) {
522 wfDebug( __METHOD__ .
': Skipping article count adjustment for ' . $pageIdentity .
523 ' because WikiPage::getRevisionRecord() returned null' );
525 $update = $page->newPageUpdater( $this->performer )->prepareUpdate();
526 $countKey =
'title_' . CacheKeyHelper::getKeyForPage( $pageIdentity );
527 $countable = $update->isCountable();
528 if ( array_key_exists( $countKey, $this->countableCache ) &&
529 $countable != $this->countableCache[$countKey] ) {
530 DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
531 'articles' => ( (
int)$countable - (
int)$this->countableCache[$countKey] )
537 $title = Title::newFromPageIdentity( $pageIdentity );
538 return $this->hookRunner->onAfterImportPage( $title, $foreignTitle,
539 $revCount, $sRevCount, $pageInfo );
547 private function siteInfoCallback( $siteInfo ) {
548 if ( $this->mSiteInfoCallback ) {
549 return ( $this->mSiteInfoCallback )( $siteInfo, $this );
560 if ( $this->mPageCallback ) {
561 ( $this->mPageCallback )( $title );
/**
 * Hand a finished page over to the registered page-out callback.
 *
 * Does nothing when no callback has been registered. All arguments are
 * passed through to the callback unchanged and its result is discarded.
 *
 * @param PageIdentity $pageIdentity
 * @param mixed $foreignTitle
 * @param mixed $revCount
 * @param mixed $sucCount
 * @param mixed $pageInfo
 */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sucCount, $pageInfo ) {
	$onPageOut = $this->mPageOutCallback;
	if ( !$onPageOut ) {
		return;
	}

	$onPageOut( $pageIdentity, $foreignTitle, $revCount, $sucCount, $pageInfo );
}
585 private function revisionCallback( $revision ) {
586 if ( $this->mRevisionCallback ) {
587 return ( $this->mRevisionCallback )( $revision, $this );
598 private function logItemCallback( $revision ) {
599 if ( $this->mLogItemCallback ) {
600 return ( $this->mLogItemCallback )( $revision, $this );
613 return $this->reader->getAttribute( $attr ) ??
'';
624 if ( $this->reader->isEmptyElement ) {
628 while ( $this->reader->read() ) {
629 switch ( $this->reader->nodeType ) {
630 case XMLReader::TEXT:
631 case XMLReader::CDATA:
632 case XMLReader::SIGNIFICANT_WHITESPACE:
633 $buffer .= $this->reader->value;
635 case XMLReader::END_ELEMENT:
640 $this->reader->close();
650 $this->syntaxCheckXML();
656 $oldDisable = @libxml_disable_entity_loader(
true );
658 $this->reader->read();
660 if ( $this->reader->localName !=
'mediawiki' ) {
662 @libxml_disable_entity_loader( $oldDisable );
663 $error = libxml_get_last_error();
665 throw new NormalizedException(
"XML error at line {line}: {message}", [
666 'line' => $error->line,
667 'message' => $error->message,
670 throw new UnexpectedValueException(
671 "Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
675 $this->
debug(
"<mediawiki> tag is correct." );
677 $this->
debug(
"Starting primary dump processing loop." );
679 $keepReading = $this->reader->read();
682 while ( $keepReading ) {
683 $tag = $this->reader->localName;
684 if ( $this->pageOffset ) {
685 if ( $tag ===
'page' ) {
688 if ( $pageCount < $this->pageOffset ) {
689 $keepReading = $this->reader->next();
693 $type = $this->reader->nodeType;
695 if ( !$this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
697 } elseif ( $tag ==
'mediawiki' && $type == XMLReader::END_ELEMENT ) {
699 } elseif ( $tag ==
'siteinfo' ) {
700 $this->handleSiteInfo();
701 } elseif ( $tag ==
'page' ) {
703 } elseif ( $tag ==
'logitem' ) {
704 $this->handleLogItem();
705 } elseif ( $tag !=
'#text' ) {
706 $this->
warn(
"Unhandled top-level XML tag $tag" );
712 $keepReading = $this->reader->next();
714 $this->
debug(
"Skip" );
716 $keepReading = $this->reader->read();
721 @libxml_disable_entity_loader( $oldDisable );
722 $this->reader->close();
728 private function handleSiteInfo() {
729 $this->
debug(
"Enter site info handler." );
733 $normalFields = [
'sitename',
'base',
'generator',
'case' ];
735 while ( $this->reader->read() ) {
736 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
737 $this->reader->localName ==
'siteinfo' ) {
741 $tag = $this->reader->localName;
743 if ( $tag ==
'namespace' ) {
746 } elseif ( in_array( $tag, $normalFields ) ) {
751 $siteInfo[
'_namespaces'] = $this->foreignNamespaces;
752 $this->siteInfoCallback( $siteInfo );
755 private function handleLogItem() {
756 $this->
debug(
"Enter log item handler." );
760 $normalFields = [
'id',
'comment',
'type',
'action',
'timestamp',
761 'logtitle',
'params' ];
763 while ( $this->reader->read() ) {
764 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
765 $this->reader->localName ==
'logitem' ) {
769 $tag = $this->reader->localName;
771 if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
773 } elseif ( in_array( $tag, $normalFields ) ) {
775 } elseif ( $tag ==
'contributor' ) {
776 $logInfo[
'contributor'] = $this->handleContributor();
777 } elseif ( $tag !=
'#text' ) {
778 $this->
warn(
"Unhandled log-item XML tag $tag" );
782 $this->processLogItem( $logInfo );
789 private function processLogItem( $logInfo ) {
790 $revision =
new WikiRevision();
792 if ( isset( $logInfo[
'id'] ) ) {
793 $revision->setID( $logInfo[
'id'] );
795 $revision->setType( $logInfo[
'type'] );
796 $revision->setAction( $logInfo[
'action'] );
797 if ( isset( $logInfo[
'timestamp'] ) ) {
798 $revision->setTimestamp( $logInfo[
'timestamp'] );
800 if ( isset( $logInfo[
'params'] ) ) {
801 $revision->setParams( $logInfo[
'params'] );
803 if ( isset( $logInfo[
'logtitle'] ) ) {
806 $revision->setTitle( Title::newFromText( $logInfo[
'logtitle'] ) );
809 $revision->setNoUpdates( $this->mNoUpdates );
811 if ( isset( $logInfo[
'comment'] ) ) {
812 $revision->setComment( $logInfo[
'comment'] );
815 if ( isset( $logInfo[
'contributor'][
'username'] ) ) {
816 $revision->setUsername(
817 $this->externalUserNames->applyPrefix( $logInfo[
'contributor'][
'username'] )
819 } elseif ( isset( $logInfo[
'contributor'][
'ip'] ) ) {
820 $revision->setUserIP( $logInfo[
'contributor'][
'ip'] );
822 $revision->setUsername( $this->externalUserNames->addPrefix(
'Unknown user' ) );
825 return $this->logItemCallback( $revision );
828 private function handlePage() {
830 $this->
debug(
"Enter page handler." );
831 $pageInfo = [
'revisionCount' => 0,
'successfulRevisionCount' => 0 ];
834 $normalFields = [
'title',
'ns',
'id',
'redirect',
'restrictions' ];
839 while ( $skip ? $this->reader->next() : $this->reader->read() ) {
840 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
841 $this->reader->localName ==
'page' ) {
847 $tag = $this->reader->localName;
852 } elseif ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
854 } elseif ( in_array( $tag, $normalFields ) ) {
862 if ( $tag ==
'redirect' ) {
867 } elseif ( $tag ==
'revision' || $tag ==
'upload' ) {
868 if ( !isset( $title ) ) {
869 $title = $this->processTitle( $pageInfo[
'title'],
870 $pageInfo[
'ns'] ??
null );
873 if ( is_array( $title ) ) {
875 [ $pageInfo[
'_title'], $foreignTitle ] = $title;
883 if ( $tag ==
'revision' ) {
884 $this->handleRevision( $pageInfo );
886 $this->handleUpload( $pageInfo );
889 } elseif ( $tag !=
'#text' ) {
890 $this->
warn(
"Unhandled page XML tag $tag" );
900 if ( array_key_exists(
'_title', $pageInfo ) ) {
902 $title = $pageInfo[
'_title'];
903 $this->pageOutCallback(
907 $pageInfo[
'revisionCount'],
908 $pageInfo[
'successfulRevisionCount'],
917 private function handleRevision( &$pageInfo ) {
918 $this->
debug(
"Enter revision handler" );
921 $normalFields = [
'id',
'parentid',
'timestamp',
'comment',
'minor',
'origin',
922 'model',
'format',
'text',
'sha1' ];
926 while ( $skip ? $this->reader->next() : $this->reader->read() ) {
927 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
928 $this->reader->localName ==
'revision' ) {
932 $tag = $this->reader->localName;
934 if ( !$this->hookRunner->onImportHandleRevisionXMLTag(
935 $this, $pageInfo, $revisionInfo )
938 } elseif ( in_array( $tag, $normalFields ) ) {
940 } elseif ( $tag ==
'content' ) {
942 $revisionInfo[$tag][] = $this->handleContent();
943 } elseif ( $tag ==
'contributor' ) {
944 $revisionInfo[
'contributor'] = $this->handleContributor();
945 } elseif ( $tag !=
'#text' ) {
946 $this->
warn(
"Unhandled revision XML tag $tag" );
951 $pageInfo[
'revisionCount']++;
952 if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
953 $pageInfo[
'successfulRevisionCount']++;
957 private function handleContent(): array {
958 $this->
debug(
"Enter content handler" );
961 $normalFields = [
'role',
'origin',
'model',
'format',
'text' ];
965 while ( $skip ? $this->reader->next() : $this->reader->read() ) {
966 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
967 $this->reader->localName ==
'content' ) {
971 $tag = $this->reader->localName;
973 if ( !$this->hookRunner->onImportHandleContentXMLTag(
974 $this, $contentInfo )
977 } elseif ( in_array( $tag, $normalFields ) ) {
979 } elseif ( $tag !=
'#text' ) {
980 $this->
warn(
"Unhandled content XML tag $tag" );
995 private function makeContent( PageIdentity $page, $revisionId, $contentInfo ) {
998 if ( !isset( $contentInfo[
'text'] ) ) {
999 throw new InvalidArgumentException(
'Missing text field in import.' );
1006 if ( ( !isset( $contentInfo[
'model'] ) ||
1007 in_array( $contentInfo[
'model'], [
1015 strlen( $contentInfo[
'text'] ) > $maxArticleSize * 1024
1017 throw new RuntimeException(
'The text of ' .
1019 "the revision with ID $revisionId" :
1021 ) .
" exceeds the maximum allowable size ({$maxArticleSize} KiB)" );
1024 $role = $contentInfo[
'role'] ?? SlotRecord::MAIN;
1025 $model = $contentInfo[
'model'] ?? $this->slotRoleRegistry
1026 ->getRoleHandler( $role )
1027 ->getDefaultModel( $page );
1028 $handler = $this->contentHandlerFactory->getContentHandler( $model );
1030 $text = $handler->importTransform( $contentInfo[
'text'] );
1032 return $handler->unserializeContent( $text );
1040 private function processRevision( $pageInfo, $revisionInfo ) {
1041 $revision =
new WikiRevision();
1043 $revId = $revisionInfo[
'id'] ?? 0;
1045 $revision->setID( $revisionInfo[
'id'] );
1048 $title = $pageInfo[
'_title'];
1049 $revision->setTitle( $title );
1051 $content = $this->makeContent( $title, $revId, $revisionInfo );
1052 $revision->setContent( SlotRecord::MAIN, $content );
1054 foreach ( $revisionInfo[
'content'] ?? [] as $slotInfo ) {
1055 if ( !isset( $slotInfo[
'role'] ) ) {
1056 throw new RuntimeException(
"Missing role for imported slot." );
1059 $content = $this->makeContent( $title, $revId, $slotInfo );
1060 $revision->setContent( $slotInfo[
'role'], $content );
1062 $revision->setTimestamp( $revisionInfo[
'timestamp'] ??
wfTimestampNow() );
1064 if ( isset( $revisionInfo[
'comment'] ) ) {
1065 $revision->setComment( $revisionInfo[
'comment'] );
1068 if ( isset( $revisionInfo[
'minor'] ) ) {
1069 $revision->setMinor(
true );
1071 if ( isset( $revisionInfo[
'contributor'][
'username'] ) ) {
1072 $revision->setUsername(
1073 $this->externalUserNames->applyPrefix( $revisionInfo[
'contributor'][
'username'] )
1075 } elseif ( isset( $revisionInfo[
'contributor'][
'ip'] ) ) {
1076 $revision->setUserIP( $revisionInfo[
'contributor'][
'ip'] );
1078 $revision->setUsername( $this->externalUserNames->addPrefix(
'Unknown user' ) );
1080 if ( isset( $revisionInfo[
'sha1'] ) ) {
1081 $revision->setSha1Base36( $revisionInfo[
'sha1'] );
1083 $revision->setNoUpdates( $this->mNoUpdates );
1085 return $this->revisionCallback( $revision );
1092 private function handleUpload( &$pageInfo ) {
1093 $this->debug(
"Enter upload handler" );
1096 $normalFields = [
'timestamp',
'comment',
'filename',
'text',
1097 'src',
'size',
'sha1base36',
'archivename',
'rel' ];
1101 while ( $skip ? $this->reader->next() : $this->reader->read() ) {
1102 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
1103 $this->reader->localName ==
'upload' ) {
1107 $tag = $this->reader->localName;
1109 if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
1111 } elseif ( in_array( $tag, $normalFields ) ) {
1112 $uploadInfo[$tag] = $this->nodeContents();
1113 } elseif ( $tag ==
'contributor' ) {
1114 $uploadInfo[
'contributor'] = $this->handleContributor();
1115 } elseif ( $tag ==
'contents' ) {
1116 $contents = $this->nodeContents();
1117 $encoding = $this->reader->getAttribute(
'encoding' );
1118 if ( $encoding ===
'base64' ) {
1119 $uploadInfo[
'fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
1120 $uploadInfo[
'isTempSrc'] =
true;
1122 } elseif ( $tag !=
'#text' ) {
1123 $this->warn(
"Unhandled upload XML tag $tag" );
1128 if ( $this->mImageBasePath && isset( $uploadInfo[
'rel'] ) ) {
1129 $path =
"{$this->mImageBasePath}/{$uploadInfo['rel']}";
1130 if ( file_exists(
$path ) ) {
1131 $uploadInfo[
'fileSrc'] =
$path;
1132 $uploadInfo[
'isTempSrc'] =
false;
1136 if ( $this->mImportUploads ) {
1137 return $this->processUpload( $pageInfo, $uploadInfo );
1145 private function dumpTemp( $contents ) {
1146 $filename = tempnam(
wfTempDir(),
'importupload' );
1147 file_put_contents( $filename, $contents );
/**
 * Build a WikiRevision from the data collected for one <upload> element and
 * pass it to the registered upload callback.
 *
 * Fields the dump may have omitted are read with explicit fallbacks so that an
 * incomplete <upload> element no longer triggers "undefined array key"
 * warnings. The fallbacks for the id, timestamp and comment follow what
 * processRevision() does for the same fields.
 *
 * @param array $pageInfo Page data; 'id' and '_title' are read
 * @param array $uploadInfo Upload fields gathered by handleUpload()
 * @return mixed Result of the upload callback, or false if none is registered
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$revision = new WikiRevision();
	// Same fallback processRevision() uses when no id is present.
	$revId = $pageInfo['id'] ?? 0;
	$title = $pageInfo['_title'];

	// makeContent() throws when 'text' is missing, so default it to an empty string.
	$uploadInfo['text'] ??= '';
	$content = $this->makeContent( $title, $revId, $uploadInfo );

	$revision->setTitle( $title );
	$revision->setID( $revId );
	// Same fallback processRevision() uses when no timestamp is present.
	$revision->setTimestamp( $uploadInfo['timestamp'] ?? wfTimestampNow() );
	$revision->setContent( SlotRecord::MAIN, $content );
	// A missing filename or source is still handed on as null, exactly as
	// before, only without raising a warning here.
	$revision->setFilename( $uploadInfo['filename'] ?? null );
	if ( isset( $uploadInfo['archivename'] ) ) {
		$revision->setArchiveName( $uploadInfo['archivename'] );
	}
	$revision->setSrc( $uploadInfo['src'] ?? null );
	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$revision->setFileSrc(
			$uploadInfo['fileSrc'],
			!empty( $uploadInfo['isTempSrc'] )
		);
	}
	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$revision->setSha1Base36( $uploadInfo['sha1base36'] );
	}
	// intval( null ) is 0, so the fallback keeps the previous result.
	$revision->setSize( intval( $uploadInfo['size'] ?? 0 ) );
	// Only set a comment when the dump supplied one, as processRevision() and
	// processLogItem() do.
	if ( isset( $uploadInfo['comment'] ) ) {
		$revision->setComment( $uploadInfo['comment'] );
	}

	if ( isset( $uploadInfo['contributor']['username'] ) ) {
		$revision->setUsername(
			$this->externalUserNames->applyPrefix( $uploadInfo['contributor']['username'] )
		);
	} elseif ( isset( $uploadInfo['contributor']['ip'] ) ) {
		$revision->setUserIP( $uploadInfo['contributor']['ip'] );
	}
	$revision->setNoUpdates( $this->mNoUpdates );

	// The other callback dispatchers check that a callback is set before
	// invoking it; do the same instead of failing on a non-callable value.
	if ( !$this->mUploadCallback ) {
		return false;
	}

	return ( $this->mUploadCallback )( $revision );
}
1199 private function handleContributor() {
1200 $this->debug(
"Enter contributor handler." );
1202 if ( $this->reader->isEmptyElement ) {
1206 $fields = [
'id',
'ip',
'username' ];
1209 while ( $this->reader->read() ) {
1210 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
1211 $this->reader->localName ==
'contributor' ) {
1215 $tag = $this->reader->localName;
1217 if ( in_array( $tag, $fields ) ) {
1218 $info[$tag] = $this->nodeContents();
1230 private function processTitle( $text, $ns =
null ) {
1231 if ( $this->foreignNamespaces ===
null ) {
1232 $foreignTitleFactory =
new NaiveForeignTitleFactory(
1233 $this->contentLanguage
1236 $foreignTitleFactory =
new NamespaceAwareForeignTitleFactory(
1237 $this->foreignNamespaces );
1240 $foreignTitle = $foreignTitleFactory->createForeignTitle( $text,
1243 $title = $this->importTitleFactory->createTitleFromForeignTitle(
1246 if ( $title ===
null ) {
1247 # Invalid page title? Ignore the page
1248 $this->notice(
'import-error-invalid', $foreignTitle->getFullText() );
1250 } elseif ( $title->isExternal() ) {
1251 $this->notice(
'import-error-interwiki', $title->getPrefixedText() );
1253 } elseif ( !$title->canExist() ) {
1254 $this->notice(
'import-error-special', $title->getPrefixedText() );
1256 } elseif ( !$this->performer->definitelyCan(
'edit', $title ) ) {
1257 # Do not import if the importing wiki user cannot edit this page
1258 $this->notice(
'import-error-edit', $title->getPrefixedText() );
1262 return [ $title, $foreignTitle ];
1268 private function openReader() {
1272 $oldDisable = @libxml_disable_entity_loader(
false );
1275 $reader = XMLReader::open(
1276 'uploadsource://' . $this->sourceAdapterId,
null, LIBXML_PARSEHUGE );
1277 if ( $reader instanceof XMLReader ) {
1278 $this->reader = $reader;
1284 $error = libxml_get_last_error();
1286 @libxml_disable_entity_loader( $oldDisable );
1287 throw new RuntimeException(
1288 'Encountered an internal error while initializing WikiImporter object: ' . $error->message
1292 @libxml_disable_entity_loader( $oldDisable );
1298 private function syntaxCheckXML() {
1303 $oldDisable = @libxml_disable_entity_loader(
false );
1306 while ( @$this->reader->read() );
1307 $error = libxml_get_last_error();
1309 $errorMessage =
'XML error at line ' . $error->line .
': ' . $error->message;
1310 wfDebug( __METHOD__ .
': Invalid xml found - ' . $errorMessage );
1311 throw new RuntimeException( $errorMessage );
1315 @libxml_disable_entity_loader( $oldDisable );
1316 $this->reader->close();
1321 $this->openReader();