66 private $sourceAdapterId;
69 private $foreignNamespaces =
null;
72 private $mLogItemCallback;
75 private $mUploadCallback;
78 private $mRevisionCallback;
81 private $mPageCallback;
84 private $mSiteInfoCallback;
87 private $mPageOutCallback;
90 private $mNoticeCallback;
96 private $mImportUploads;
99 private $mImageBasePath;
102 private $mNoUpdates =
false;
105 private $pageOffset = 0;
111 private $countableCache = [];
114 private $disableStatisticsUpdate =
false;
150 $this->performer = $performer;
151 $this->config = $config;
152 $this->hookRunner =
new HookRunner( $hookContainer );
153 $this->contentLanguage = $contentLanguage;
154 $this->namespaceInfo = $namespaceInfo;
155 $this->titleFactory = $titleFactory;
156 $this->wikiPageFactory = $wikiPageFactory;
157 $this->uploadRevisionImporter = $uploadRevisionImporter;
158 $this->contentHandlerFactory = $contentHandlerFactory;
159 $this->slotRoleRegistry = $slotRoleRegistry;
161 if ( !in_array(
'uploadsource', stream_get_wrappers() ) ) {
162 stream_wrapper_register(
'uploadsource', UploadSourceAdapter::class );
176 $this->contentLanguage,
177 $this->namespaceInfo,
187 return $this->reader;
194 $this->
debug(
"FAILURE: $err" );
195 wfDebug(
"WikiImporter XML error: $err" );
202 if ( $this->mDebug ) {
210 public function warn( $data ) {
219 if ( is_callable( $this->mNoticeCallback ) ) {
220 call_user_func( $this->mNoticeCallback, $msg,
$params );
233 $this->mDebug = $debug;
241 $this->mNoUpdates = $noupdates;
251 $this->pageOffset = $nthPage;
261 return wfSetVar( $this->mNoticeCallback, $callback );
270 $previous = $this->mPageCallback;
271 $this->mPageCallback = $callback;
285 $previous = $this->mPageOutCallback;
286 $this->mPageOutCallback = $callback;
296 $previous = $this->mRevisionCallback;
297 $this->mRevisionCallback = $callback;
307 $previous = $this->mUploadCallback;
308 $this->mUploadCallback = $callback;
318 $previous = $this->mLogItemCallback;
319 $this->mLogItemCallback = $callback;
329 $previous = $this->mSiteInfoCallback;
330 $this->mSiteInfoCallback = $callback;
340 $this->importTitleFactory = $factory;
349 if ( $namespace ===
null ) {
353 $this->contentLanguage,
354 $this->namespaceInfo,
361 $this->namespaceInfo->exists( intval( $namespace ) )
363 $namespace = intval( $namespace );
366 $this->namespaceInfo,
383 $status = Status::newGood();
384 $nsInfo = $this->namespaceInfo;
385 if ( $rootpage ===
null ) {
389 $this->contentLanguage,
394 } elseif ( $rootpage !==
'' ) {
395 $rootpage = rtrim( $rootpage,
'/' );
396 $title = Title::newFromText( $rootpage );
399 $status->fatal(
'import-rootpage-invalid' );
400 } elseif ( !$nsInfo->hasSubpages( $title->
getNamespace() ) ) {
403 : $this->contentLanguage->getNsText( $title->
getNamespace() );
404 $status->fatal(
'import-rootpage-nosubpage', $displayNSText );
424 $this->mImageBasePath = $dir;
431 $this->mImportUploads = $import;
440 $this->externalUserNames =
new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
458 $title = $titleAndForeignTitle[0];
459 $page = $this->wikiPageFactory->newFromTitle( $title );
460 $this->countableCache[
'title_' . $title->
getPrefixedText()] = $page->isCountable();
470 if ( !$revision->getContentHandler()->canBeUsedOn( $revision->getTitle() ) ) {
471 $this->
notice(
'import-error-bad-location',
472 $revision->getTitle()->getPrefixedText(),
474 $revision->getModel(),
475 $revision->getFormat()
482 return $revision->importOldRevision();
484 $this->
notice(
'import-error-unserialize',
485 $revision->getTitle()->getPrefixedText(),
487 $revision->getModel(),
488 $revision->getFormat()
501 return $revision->importLogItem();
510 $status = $this->uploadRevisionImporter->import( $revision );
511 return $status->isGood();
524 $sRevCount, $pageInfo
533 $page = $this->wikiPageFactory->newFromTitle( $pageIdentity );
535 $page->loadPageData( IDBAccessObject::READ_LATEST );
536 $rev = $page->getRevisionRecord();
537 if ( $rev ===
null ) {
539 wfDebug( __METHOD__ .
': Skipping article count adjustment for ' . $pageIdentity .
540 ' because WikiPage::getRevisionRecord() returned null' );
542 $update = $page->newPageUpdater( $this->performer )->prepareUpdate();
543 $countKey =
'title_' . CacheKeyHelper::getKeyForPage( $pageIdentity );
544 $countable = $update->isCountable();
545 if ( array_key_exists( $countKey, $this->countableCache ) &&
546 $countable != $this->countableCache[$countKey] ) {
547 DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
548 'articles' => ( (
int)$countable - (
int)$this->countableCache[$countKey] )
554 $title = Title::newFromPageIdentity( $pageIdentity );
555 return $this->hookRunner->onAfterImportPage( $title, $foreignTitle,
556 $revCount, $sRevCount, $pageInfo );
564 private function siteInfoCallback( $siteInfo ) {
565 if ( isset( $this->mSiteInfoCallback ) ) {
566 return call_user_func_array(
567 $this->mSiteInfoCallback,
580 if ( isset( $this->mPageCallback ) ) {
581 call_user_func( $this->mPageCallback, $title );
593 private function pageOutCallback(
PageIdentity $pageIdentity, $foreignTitle, $revCount,
594 $sucCount, $pageInfo ) {
595 if ( isset( $this->mPageOutCallback ) ) {
596 call_user_func_array( $this->mPageOutCallback, func_get_args() );
605 private function revisionCallback( $revision ) {
606 if ( isset( $this->mRevisionCallback ) ) {
607 return call_user_func_array(
608 $this->mRevisionCallback,
621 private function logItemCallback( $revision ) {
622 if ( isset( $this->mLogItemCallback ) ) {
623 return call_user_func_array(
624 $this->mLogItemCallback,
639 return $this->reader->getAttribute( $attr ) ??
'';
650 if ( $this->reader->isEmptyElement ) {
654 while ( $this->reader->read() ) {
655 switch ( $this->reader->nodeType ) {
656 case XMLReader::TEXT:
657 case XMLReader::CDATA:
658 case XMLReader::SIGNIFICANT_WHITESPACE:
659 $buffer .= $this->reader->value;
661 case XMLReader::END_ELEMENT:
666 $this->reader->close();
676 $this->syntaxCheckXML();
682 $oldDisable = @libxml_disable_entity_loader(
true );
684 $this->reader->read();
686 if ( $this->reader->localName !=
'mediawiki' ) {
688 @libxml_disable_entity_loader( $oldDisable );
689 $error = libxml_get_last_error();
691 throw new NormalizedException(
"XML error at line {line}: {message}", [
692 'line' => $error->line,
693 'message' => $error->message,
696 throw new UnexpectedValueException(
697 "Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
701 $this->
debug(
"<mediawiki> tag is correct." );
703 $this->
debug(
"Starting primary dump processing loop." );
705 $keepReading = $this->reader->read();
708 while ( $keepReading ) {
709 $tag = $this->reader->localName;
710 if ( $this->pageOffset ) {
711 if ( $tag ===
'page' ) {
714 if ( $pageCount < $this->pageOffset ) {
715 $keepReading = $this->reader->next();
719 $type = $this->reader->nodeType;
721 if ( !$this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
723 } elseif ( $tag ==
'mediawiki' && $type == XMLReader::END_ELEMENT ) {
725 } elseif ( $tag ==
'siteinfo' ) {
726 $this->handleSiteInfo();
727 } elseif ( $tag ==
'page' ) {
729 } elseif ( $tag ==
'logitem' ) {
730 $this->handleLogItem();
731 } elseif ( $tag !=
'#text' ) {
732 $this->
warn(
"Unhandled top-level XML tag $tag" );
738 $keepReading = $this->reader->next();
740 $this->
debug(
"Skip" );
742 $keepReading = $this->reader->read();
747 @libxml_disable_entity_loader( $oldDisable );
748 $this->reader->close();
754 private function handleSiteInfo() {
755 $this->debug(
"Enter site info handler." );
759 $normalFields = [
'sitename',
'base',
'generator',
'case' ];
761 while ( $this->reader->read() ) {
762 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
763 $this->reader->localName ==
'siteinfo' ) {
767 $tag = $this->reader->localName;
769 if ( $tag ==
'namespace' ) {
772 } elseif ( in_array( $tag, $normalFields ) ) {
777 $siteInfo[
'_namespaces'] = $this->foreignNamespaces;
778 $this->siteInfoCallback( $siteInfo );
781 private function handleLogItem() {
782 $this->
debug(
"Enter log item handler." );
786 $normalFields = [
'id',
'comment',
'type',
'action',
'timestamp',
787 'logtitle',
'params' ];
789 while ( $this->reader->read() ) {
790 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
791 $this->reader->localName ==
'logitem' ) {
795 $tag = $this->reader->localName;
797 if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
799 } elseif ( in_array( $tag, $normalFields ) ) {
801 } elseif ( $tag ==
'contributor' ) {
802 $logInfo[
'contributor'] = $this->handleContributor();
803 } elseif ( $tag !=
'#text' ) {
804 $this->
warn(
"Unhandled log-item XML tag $tag" );
808 $this->processLogItem( $logInfo );
815 private function processLogItem( $logInfo ) {
818 if ( isset( $logInfo[
'id'] ) ) {
819 $revision->setID( $logInfo[
'id'] );
821 $revision->setType( $logInfo[
'type'] );
822 $revision->setAction( $logInfo[
'action'] );
823 if ( isset( $logInfo[
'timestamp'] ) ) {
824 $revision->setTimestamp( $logInfo[
'timestamp'] );
826 if ( isset( $logInfo[
'params'] ) ) {
827 $revision->setParams( $logInfo[
'params'] );
829 if ( isset( $logInfo[
'logtitle'] ) ) {
832 $revision->setTitle( Title::newFromText( $logInfo[
'logtitle'] ) );
835 $revision->setNoUpdates( $this->mNoUpdates );
837 if ( isset( $logInfo[
'comment'] ) ) {
838 $revision->setComment( $logInfo[
'comment'] );
841 if ( isset( $logInfo[
'contributor'][
'username'] ) ) {
842 $revision->setUsername(
843 $this->externalUserNames->applyPrefix( $logInfo[
'contributor'][
'username'] )
845 } elseif ( isset( $logInfo[
'contributor'][
'ip'] ) ) {
846 $revision->setUserIP( $logInfo[
'contributor'][
'ip'] );
848 $revision->setUsername( $this->externalUserNames->addPrefix(
'Unknown user' ) );
851 return $this->logItemCallback( $revision );
854 private function handlePage() {
856 $this->
debug(
"Enter page handler." );
857 $pageInfo = [
'revisionCount' => 0,
'successfulRevisionCount' => 0 ];
860 $normalFields = [
'title',
'ns',
'id',
'redirect',
'restrictions' ];
865 while ( $skip ? $this->reader->next() : $this->reader->read() ) {
866 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
867 $this->reader->localName ==
'page' ) {
873 $tag = $this->reader->localName;
878 } elseif ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
880 } elseif ( in_array( $tag, $normalFields ) ) {
888 if ( $tag ==
'redirect' ) {
893 } elseif ( $tag ==
'revision' || $tag ==
'upload' ) {
894 if ( !isset( $title ) ) {
895 $title = $this->processTitle( $pageInfo[
'title'],
896 $pageInfo[
'ns'] ??
null );
899 if ( is_array( $title ) ) {
901 [ $pageInfo[
'_title'], $foreignTitle ] = $title;
909 if ( $tag ==
'revision' ) {
910 $this->handleRevision( $pageInfo );
912 $this->handleUpload( $pageInfo );
915 } elseif ( $tag !=
'#text' ) {
916 $this->
warn(
"Unhandled page XML tag $tag" );
926 if ( array_key_exists(
'_title', $pageInfo ) ) {
928 $title = $pageInfo[
'_title'];
929 $this->pageOutCallback(
933 $pageInfo[
'revisionCount'],
934 $pageInfo[
'successfulRevisionCount'],
943 private function handleRevision( &$pageInfo ) {
944 $this->
debug(
"Enter revision handler" );
947 $normalFields = [
'id',
'parentid',
'timestamp',
'comment',
'minor',
'origin',
948 'model',
'format',
'text',
'sha1' ];
952 while ( $skip ? $this->reader->next() : $this->reader->read() ) {
953 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
954 $this->reader->localName ==
'revision' ) {
958 $tag = $this->reader->localName;
960 if ( !$this->hookRunner->onImportHandleRevisionXMLTag(
961 $this, $pageInfo, $revisionInfo )
964 } elseif ( in_array( $tag, $normalFields ) ) {
966 } elseif ( $tag ==
'content' ) {
968 $revisionInfo[$tag][] = $this->handleContent();
969 } elseif ( $tag ==
'contributor' ) {
970 $revisionInfo[
'contributor'] = $this->handleContributor();
971 } elseif ( $tag !=
'#text' ) {
972 $this->
warn(
"Unhandled revision XML tag $tag" );
977 $pageInfo[
'revisionCount']++;
978 if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
979 $pageInfo[
'successfulRevisionCount']++;
983 private function handleContent() {
984 $this->
debug(
"Enter content handler" );
987 $normalFields = [
'role',
'origin',
'model',
'format',
'text' ];
991 while ( $skip ? $this->reader->next() : $this->reader->read() ) {
992 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
993 $this->reader->localName ==
'content' ) {
997 $tag = $this->reader->localName;
999 if ( !$this->hookRunner->onImportHandleContentXMLTag(
1000 $this, $contentInfo )
1003 } elseif ( in_array( $tag, $normalFields ) ) {
1005 } elseif ( $tag !=
'#text' ) {
1006 $this->
warn(
"Unhandled content XML tag $tag" );
1011 return $contentInfo;
1021 private function makeContent(
Title $title, $revisionId, $contentInfo ) {
1022 $maxArticleSize = $this->config->get( MainConfigNames::MaxArticleSize );
1024 if ( !isset( $contentInfo[
'text'] ) ) {
1025 throw new InvalidArgumentException(
'Missing text field in import.' );
1032 if ( ( !isset( $contentInfo[
'model'] ) ||
1033 in_array( $contentInfo[
'model'], [
1041 strlen( $contentInfo[
'text'] ) > $maxArticleSize * 1024
1043 throw new RuntimeException(
'The text of ' .
1045 "the revision with ID $revisionId" :
1047 ) .
" exceeds the maximum allowable size ({$maxArticleSize} KiB)" );
1050 $role = $contentInfo[
'role'] ?? SlotRecord::MAIN;
1051 $model = $contentInfo[
'model'] ?? $this->slotRoleRegistry
1052 ->getRoleHandler( $role )
1053 ->getDefaultModel( $title );
1054 $handler = $this->contentHandlerFactory->getContentHandler( $model );
1056 $text = $handler->importTransform( $contentInfo[
'text'] );
1058 return $handler->unserializeContent( $text );
1066 private function processRevision( $pageInfo, $revisionInfo ) {
1069 $revId = $revisionInfo[
'id'] ?? 0;
1071 $revision->setID( $revisionInfo[
'id'] );
1074 $title = $pageInfo[
'_title'];
1075 $revision->setTitle( $title );
1077 $content = $this->makeContent( $title, $revId, $revisionInfo );
1078 $revision->setContent( SlotRecord::MAIN, $content );
1080 foreach ( $revisionInfo[
'content'] ?? [] as $slotInfo ) {
1081 if ( !isset( $slotInfo[
'role'] ) ) {
1082 throw new RuntimeException(
"Missing role for imported slot." );
1085 $content = $this->makeContent( $title, $revId, $slotInfo );
1086 $revision->setContent( $slotInfo[
'role'], $content );
1088 $revision->setTimestamp( $revisionInfo[
'timestamp'] ??
wfTimestampNow() );
1090 if ( isset( $revisionInfo[
'comment'] ) ) {
1091 $revision->setComment( $revisionInfo[
'comment'] );
1094 if ( isset( $revisionInfo[
'minor'] ) ) {
1095 $revision->setMinor(
true );
1097 if ( isset( $revisionInfo[
'contributor'][
'username'] ) ) {
1098 $revision->setUsername(
1099 $this->externalUserNames->applyPrefix( $revisionInfo[
'contributor'][
'username'] )
1101 } elseif ( isset( $revisionInfo[
'contributor'][
'ip'] ) ) {
1102 $revision->setUserIP( $revisionInfo[
'contributor'][
'ip'] );
1104 $revision->setUsername( $this->externalUserNames->addPrefix(
'Unknown user' ) );
1106 if ( isset( $revisionInfo[
'sha1'] ) ) {
1107 $revision->setSha1Base36( $revisionInfo[
'sha1'] );
1109 $revision->setNoUpdates( $this->mNoUpdates );
1111 return $this->revisionCallback( $revision );
1118 private function handleUpload( &$pageInfo ) {
1119 $this->
debug(
"Enter upload handler" );
1122 $normalFields = [
'timestamp',
'comment',
'filename',
'text',
1123 'src',
'size',
'sha1base36',
'archivename',
'rel' ];
1127 while ( $skip ? $this->reader->next() : $this->reader->read() ) {
1128 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
1129 $this->reader->localName ==
'upload' ) {
1133 $tag = $this->reader->localName;
1135 if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
1137 } elseif ( in_array( $tag, $normalFields ) ) {
1139 } elseif ( $tag ==
'contributor' ) {
1140 $uploadInfo[
'contributor'] = $this->handleContributor();
1141 } elseif ( $tag ==
'contents' ) {
1143 $encoding = $this->reader->getAttribute(
'encoding' );
1144 if ( $encoding ===
'base64' ) {
1145 $uploadInfo[
'fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
1146 $uploadInfo[
'isTempSrc'] =
true;
1148 } elseif ( $tag !=
'#text' ) {
1149 $this->
warn(
"Unhandled upload XML tag $tag" );
1154 if ( $this->mImageBasePath && isset( $uploadInfo[
'rel'] ) ) {
1155 $path =
"{$this->mImageBasePath}/{$uploadInfo['rel']}";
1156 if ( file_exists(
$path ) ) {
1157 $uploadInfo[
'fileSrc'] =
$path;
1158 $uploadInfo[
'isTempSrc'] =
false;
1162 if ( $this->mImportUploads ) {
1163 return $this->processUpload( $pageInfo, $uploadInfo );
1171 private function dumpTemp( $contents ) {
1172 $filename = tempnam(
wfTempDir(),
'importupload' );
1173 file_put_contents( $filename, $contents );
1182 private function processUpload( $pageInfo, $uploadInfo ) {
1184 $revId = $pageInfo[
'id'];
1185 $title = $pageInfo[
'_title'];
1187 $uploadInfo[
'text'] ??=
'';
1188 $content = $this->makeContent( $title, $revId, $uploadInfo );
1190 $revision->setTitle( $title );
1191 $revision->setID( $revId );
1192 $revision->setTimestamp( $uploadInfo[
'timestamp'] );
1193 $revision->setContent( SlotRecord::MAIN, $content );
1194 $revision->setFilename( $uploadInfo[
'filename'] );
1195 if ( isset( $uploadInfo[
'archivename'] ) ) {
1196 $revision->setArchiveName( $uploadInfo[
'archivename'] );
1198 $revision->setSrc( $uploadInfo[
'src'] );
1199 if ( isset( $uploadInfo[
'fileSrc'] ) ) {
1200 $revision->setFileSrc( $uploadInfo[
'fileSrc'],
1201 !empty( $uploadInfo[
'isTempSrc'] )
1204 if ( isset( $uploadInfo[
'sha1base36'] ) ) {
1205 $revision->setSha1Base36( $uploadInfo[
'sha1base36'] );
1207 $revision->setSize( intval( $uploadInfo[
'size'] ) );
1208 $revision->setComment( $uploadInfo[
'comment'] );
1210 if ( isset( $uploadInfo[
'contributor'][
'username'] ) ) {
1211 $revision->setUsername(
1212 $this->externalUserNames->applyPrefix( $uploadInfo[
'contributor'][
'username'] )
1214 } elseif ( isset( $uploadInfo[
'contributor'][
'ip'] ) ) {
1215 $revision->setUserIP( $uploadInfo[
'contributor'][
'ip'] );
1217 $revision->setNoUpdates( $this->mNoUpdates );
1219 return call_user_func( $this->mUploadCallback, $revision );
1225 private function handleContributor() {
1226 $this->
debug(
"Enter contributor handler." );
1228 if ( $this->reader->isEmptyElement ) {
1232 $fields = [
'id',
'ip',
'username' ];
1235 while ( $this->reader->read() ) {
1236 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
1237 $this->reader->localName ==
'contributor' ) {
1241 $tag = $this->reader->localName;
1243 if ( in_array( $tag, $fields ) ) {
1256 private function processTitle( $text, $ns =
null ) {
1257 if ( $this->foreignNamespaces ===
null ) {
1259 $this->contentLanguage
1263 $this->foreignNamespaces );
1266 $foreignTitle = $foreignTitleFactory->createForeignTitle( $text,
1269 $title = $this->importTitleFactory->createTitleFromForeignTitle(
1272 if ( $title ===
null ) {
1273 # Invalid page title? Ignore the page
1274 $this->
notice(
'import-error-invalid', $foreignTitle->getFullText() );
1282 } elseif ( !$this->performer->definitelyCan(
'edit', $title ) ) {
1283 # Do not import if the importing wiki user cannot edit this page
1288 return [ $title, $foreignTitle ];
1295 private function openReader() {
1299 $oldDisable = @libxml_disable_entity_loader(
false );
1301 if ( PHP_VERSION_ID >= 80000 ) {
1303 $reader = XMLReader::open(
1304 'uploadsource://' . $this->sourceAdapterId,
null, LIBXML_PARSEHUGE );
1305 if ( $reader instanceof XMLReader ) {
1306 $this->reader = $reader;
1313 $this->reader =
new XMLReader;
1314 $status = $this->reader->open(
1315 'uploadsource://' . $this->sourceAdapterId,
null, LIBXML_PARSEHUGE );
1318 $error = libxml_get_last_error();
1320 @libxml_disable_entity_loader( $oldDisable );
1321 throw new RuntimeException(
1322 'Encountered an internal error while initializing WikiImporter object: ' . $error->message
1326 @libxml_disable_entity_loader( $oldDisable );
1332 private function syntaxCheckXML() {
1336 AtEase::suppressWarnings();
1337 $oldDisable = libxml_disable_entity_loader(
false );
1339 while ( $this->reader->read() );
1340 $error = libxml_get_last_error();
1342 $errorMessage =
'XML error at line ' . $error->line .
': ' . $error->message;
1343 wfDebug( __METHOD__ .
': Invalid xml found - ' . $errorMessage );
1344 throw new RuntimeException( $errorMessage );
1347 libxml_disable_entity_loader( $oldDisable );
1348 AtEase::restoreWarnings();
1349 $this->reader->close();
1354 $this->openReader();