MediaWiki 1.39.10
WikiImporter.php
Go to the documentation of this file.
1<?php
38use Wikimedia\NormalizedException\NormalizedException;
39
/** @var XMLReader Pull parser over the dump; opened in the constructor */
48 private $reader;
49
/** @var array|null Foreign namespace names keyed by the <namespace> "key" attribute; stays null until handleSiteInfo() reads one */
51 private $foreignNamespaces = null;
52
/** @var callable|null Called by logItemCallback() for every parsed <logitem> */
54 private $mLogItemCallback;
55
/** @var callable|null Called by processUpload() with the assembled upload revision */
57 private $mUploadCallback;
58
/** @var callable|null Called by revisionCallback() for every parsed <revision> */
60 private $mRevisionCallback;
61
/** @var callable|null Called by pageCallback() once a page title has been resolved */
63 private $mPageCallback;
64
/** @var callable|null Called by siteInfoCallback(); the constructor installs no default for it */
66 private $mSiteInfoCallback;
67
/** @var callable|null Called by pageOutCallback() after a <page> has been fully read */
69 private $mPageOutCallback;
70
/** @var callable|null Receiver for notice(); when not callable, notices go to wfDebug() */
72 private $mNoticeCallback;
73
/** @var mixed When truthy, debug() forwards its messages to wfDebug() */
75 private $mDebug;
76
/** @var mixed When truthy, handleUpload() hands parsed uploads to processUpload() */
78 private $mImportUploads;
79
/** @var string|null Directory against which the "rel" path of an <upload> is resolved */
81 private $mImageBasePath;
82
/** @var bool Forwarded to WikiRevision::setNoUpdates() for every revision, upload and log item */
84 private $mNoUpdates = false;
85
/** @var int doImport() skips top-level nodes until this many <page> tags were seen; 0 disables skipping */
87 private $pageOffset = 0;
88
/** @var Config Injected configuration; read by processTitle() and passed to WikiRevision */
90 private $config;
91
/** Factory whose createTitleFromForeignTitle() maps foreign titles to local ones in processTitle() */
93 private $importTitleFactory;
94
/** @var HookRunner Built from the injected HookContainer */
96 private $hookRunner;
97
/** @var array isCountable() results recorded by beforeImportPage(), keyed by a 'title_'-prefixed string */
99 private $countableCache = [];
100
/** @var bool When true, finishImportPage() skips the article-count adjustment */
102 private $disableStatisticsUpdate = false;
103
/** @var ExternalUserNames Applies the username prefix to imported contributor names */
105 private $externalUserNames;
106
/** @var Language */
108 private $contentLanguage;
109
/** @var NamespaceInfo */
111 private $namespaceInfo;
112
/** @var TitleFactory */
114 private $titleFactory;
115
/** @var WikiPageFactory */
117 private $wikiPageFactory;
118
/** @var UploadRevisionImporter Used by importUpload() */
120 private $uploadRevisionImporter;
121
/** @var PermissionManager Used by processTitle() for the 'edit' permission check */
123 private $permissionManager;
124
/** @var IContentHandlerFactory */
126 private $contentHandlerFactory;
127
/** @var SlotRoleRegistry Used to find the default content model of a slot role */
129 private $slotRoleRegistry;
130
130
/**
 * Creates an importer: stores the injected services, makes sure the
 * "uploadsource" stream wrapper is registered, opens an XMLReader on
 * "uploadsource://$id", then installs the default callbacks, the default
 * import title factory and the default external user name handling.
 *
 * NOTE(review): this listing jumps from 147 to 149 and from 173 to 175, and
 * $id is never assigned in the visible lines. Presumably the dropped lines
 * declare a source parameter and register it to obtain $id; confirm against
 * the upstream file.
 *
 * @param Config $config
 * @param HookContainer $hookContainer
 * @param Language $contentLanguage
 * @param NamespaceInfo $namespaceInfo
 * @param TitleFactory $titleFactory
 * @param WikiPageFactory $wikiPageFactory
 * @param UploadRevisionImporter $uploadRevisionImporter
 * @param PermissionManager $permissionManager
 * @param IContentHandlerFactory $contentHandlerFactory
 * @param SlotRoleRegistry $slotRoleRegistry
 * @throws MWException If the XMLReader could not be opened
 */
147 public function __construct(
149 Config $config,
150 HookContainer $hookContainer,
151 Language $contentLanguage,
152 NamespaceInfo $namespaceInfo,
153 TitleFactory $titleFactory,
154 WikiPageFactory $wikiPageFactory,
155 UploadRevisionImporter $uploadRevisionImporter,
156 PermissionManager $permissionManager,
157 IContentHandlerFactory $contentHandlerFactory,
158 SlotRoleRegistry $slotRoleRegistry
159 ) {
160 $this->config = $config;
161 $this->hookRunner = new HookRunner( $hookContainer );
162 $this->contentLanguage = $contentLanguage;
163 $this->namespaceInfo = $namespaceInfo;
164 $this->titleFactory = $titleFactory;
165 $this->wikiPageFactory = $wikiPageFactory;
166 $this->uploadRevisionImporter = $uploadRevisionImporter;
167 $this->permissionManager = $permissionManager;
168 $this->contentHandlerFactory = $contentHandlerFactory;
169 $this->slotRoleRegistry = $slotRoleRegistry;
170
// Register the custom stream wrapper only once per process.
171 if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
172 stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
173 }
175
176 // Enable the entity loader, as it is needed for loading external URLs via
177 // XMLReader::open (T86036)
178 // phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
179 $oldDisable = @libxml_disable_entity_loader( false );
180 if ( PHP_VERSION_ID >= 80000 ) {
181 // A static call is now preferred, and avoids https://github.com/php/php-src/issues/11548
182 $reader = XMLReader::open(
183 "uploadsource://$id", null, LIBXML_PARSEHUGE );
184 if ( $reader instanceof XMLReader ) {
185 $this->reader = $reader;
186 $status = true;
187 } else {
188 $status = false;
189 }
190 } else {
191 // A static call generated a deprecation warning prior to PHP 8.0
192 $this->reader = new XMLReader;
193 $status = $this->reader->open(
194 "uploadsource://$id", null, LIBXML_PARSEHUGE );
195 }
196 if ( !$status ) {
197 $error = libxml_get_last_error();
// Restore the previous entity loader state before bailing out.
198 // phpcs:ignore Generic.PHP.NoSilencedErrors
199 @libxml_disable_entity_loader( $oldDisable );
200 throw new MWException( 'Encountered an internal error while initializing WikiImporter object: ' .
201 $error->message );
202 }
// Success path: restore the previous entity loader state as well.
203 // phpcs:ignore Generic.PHP.NoSilencedErrors
204 @libxml_disable_entity_loader( $oldDisable );
205
206 // Default callbacks
207 $this->setPageCallback( [ $this, 'beforeImportPage' ] );
208 $this->setRevisionCallback( [ $this, "importRevision" ] );
209 $this->setUploadCallback( [ $this, 'importUpload' ] );
210 $this->setLogItemCallback( [ $this, 'importLogItem' ] );
211 $this->setPageOutCallback( [ $this, 'finishImportPage' ] );
212
// Default title mapping and an 'imported' username prefix without assigning known users.
213 $this->importTitleFactory = new NaiveImportTitleFactory(
214 $this->contentLanguage,
215 $this->namespaceInfo,
216 $this->titleFactory
217 );
218 $this->externalUserNames = new ExternalUserNames( 'imported', false );
219 }
220
/**
 * Gives callers (e.g. hook handlers) access to the underlying pull parser.
 *
 * @return XMLReader The reader opened by the constructor
 */
public function getReader() {
	return $this->reader;
}
227
/**
 * Reports an XML error. Despite the name nothing is thrown: the message goes
 * to the importer's own debug channel and to the general debug log.
 *
 * @param string $err Error description
 */
public function throwXmlError( $err ) {
	$this->debug( 'FAILURE: ' . $err );
	wfDebug( 'WikiImporter XML error: ' . $err );
}
235
/**
 * Logs a debug message, but only while debugging is switched on via setDebug().
 *
 * @param string $data Message text
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}

	wfDebug( 'IMPORT: ' . $data );
}
244
/**
 * Logs a warning to the debug log regardless of the debug flag.
 *
 * @param string $data Message text
 */
public function warn( $data ) {
	wfDebug( 'IMPORT: ' . $data );
}
251
/**
 * Emits a notice identified by a message key.
 *
 * If a callable notice callback is set it receives the key and the list of
 * parameters; otherwise the rendered message text is written to the debug log.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters
 */
public function notice( $msg, ...$params ) {
	if ( !is_callable( $this->mNoticeCallback ) ) {
		// No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		wfDebug( wfMessage( $msg, $params )->text() );
		return;
	}

	call_user_func( $this->mNoticeCallback, $msg, $params );
}
265
/**
 * Switches the importer's debug() output on or off.
 *
 * @param bool $debug Truthy to let debug() write to the debug log
 */
public function setDebug( $debug ) {
	$this->mDebug = $debug;
}
273
/**
 * Stores the "no updates" flag that is later handed to every WikiRevision
 * built by this importer via WikiRevision::setNoUpdates().
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
281
/**
 * Sets how far into the dump doImport() starts working: top-level nodes are
 * skipped until the given number of <page> tags has been counted.
 *
 * @param int $nthPage Number of the first page to process; 0 disables skipping
 */
public function setPageOffset( $nthPage ) {
	$this->pageOffset = $nthPage;
}
291
/**
 * Installs the callback that notice() reports to.
 *
 * Delegates to wfSetVar(), which returns the previous value and, when given
 * null as the new value, leaves the stored callback unchanged.
 *
 * @param callable|null $callback Receives the message key and the parameter list
 * @return callable|null The previously stored callback
 */
public function setNoticeCallback( $callback ) {
	$previous = wfSetVar( $this->mNoticeCallback, $callback );

	return $previous;
}
301
/**
 * Replaces the callback run when a page's title has been resolved.
 *
 * @param callable|null $callback Receives the [ title, foreign title ] pair
 * @return callable|null The callback that was installed before
 */
public function setPageCallback( $callback ) {
	$old = $this->mPageCallback;
	$this->mPageCallback = $callback;

	return $old;
}
312
/**
 * Replaces the callback run after a <page> element has been read completely.
 *
 * The callback receives the page identity, the foreign title, the revision
 * count, the successful revision count and the page info array.
 *
 * @param callable|null $callback
 * @return callable|null The callback that was installed before
 */
public function setPageOutCallback( $callback ) {
	$old = $this->mPageOutCallback;
	$this->mPageOutCallback = $callback;

	return $old;
}
327
/**
 * Replaces the callback run for every parsed revision.
 *
 * @param callable|null $callback Receives the WikiRevision and this importer
 * @return callable|null The callback that was installed before
 */
public function setRevisionCallback( $callback ) {
	$old = $this->mRevisionCallback;
	$this->mRevisionCallback = $callback;

	return $old;
}
338
/**
 * Replaces the callback run for every parsed upload.
 *
 * @param callable|null $callback Receives the WikiRevision describing the upload
 * @return callable|null The callback that was installed before
 */
public function setUploadCallback( $callback ) {
	$old = $this->mUploadCallback;
	$this->mUploadCallback = $callback;

	return $old;
}
349
/**
 * Replaces the callback run for every parsed log item.
 *
 * @param callable|null $callback Receives the WikiRevision and this importer
 * @return callable|null The callback that was installed before
 */
public function setLogItemCallback( $callback ) {
	$old = $this->mLogItemCallback;
	$this->mLogItemCallback = $callback;

	return $old;
}
360
/**
 * Replaces the callback run once the <siteinfo> element has been parsed.
 *
 * @param callable|null $callback Receives the site info array and this importer
 * @return callable|null The callback that was installed before
 */
public function setSiteInfoCallback( $callback ) {
	$old = $this->mSiteInfoCallback;
	$this->mSiteInfoCallback = $callback;

	return $old;
}
371
/**
 * Sets the factory that processTitle() uses to turn foreign titles into
 * local titles (through its createTitleFromForeignTitle() method).
 *
 * @param object $factory
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
380
/**
 * Chooses how imported titles are mapped onto local namespaces.
 *
 * NOTE(review): this listing skips lines 389-390 and 402-403, which must open
 * the calls closed on lines 394-395 and 407-408. Presumably each is a
 * setImportTitleFactory( new ...ImportTitleFactory( ... ) ) call; confirm
 * against the upstream file.
 *
 * @param int|string|null $namespace null to leave namespaces alone, otherwise
 *  the ID of an existing namespace that is >= 0
 * @return bool false when $namespace is neither null nor an existing
 *  non-negative namespace; true otherwise
 */
386 public function setTargetNamespace( $namespace ) {
387 if ( $namespace === null ) {
388 // Don't override namespaces
391 $this->contentLanguage,
392 $this->namespaceInfo,
393 $this->titleFactory
394 )
395 );
396 return true;
397 } elseif (
398 $namespace >= 0 &&
399 $this->namespaceInfo->exists( intval( $namespace ) )
400 ) {
// Normalise to int before handing the namespace to the factory.
401 $namespace = intval( $namespace );
404 $this->namespaceInfo,
405 $this->titleFactory,
406 $namespace
407 )
408 );
409 return true;
410 } else {
411 return false;
412 }
413 }
414
/**
 * Configures a root page for the import.
 *
 * null selects the branch built from the content language, namespace info and
 * title factory; an empty string changes nothing; any other value is
 * validated as a local title in a namespace that supports subpages.
 *
 * NOTE(review): this listing skips lines 425-426 and 446-447, which must open
 * the calls closed on lines 430-431 and 451-452. Presumably each is a
 * setImportTitleFactory( new ...ImportTitleFactory( ... ) ) call; confirm
 * against the upstream file.
 *
 * @param string|null $rootpage
 * @return Status Good unless the title is invalid or external
 *  ('import-rootpage-invalid') or its namespace has no subpages
 *  ('import-rootpage-nosubpage')
 */
420 public function setTargetRootPage( $rootpage ) {
421 $status = Status::newGood();
422 $nsInfo = $this->namespaceInfo;
423 if ( $rootpage === null ) {
424 // No rootpage
427 $this->contentLanguage,
428 $nsInfo,
429 $this->titleFactory
430 )
431 );
432 } elseif ( $rootpage !== '' ) {
433 $rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
434 $title = Title::newFromText( $rootpage );
435
436 if ( !$title || $title->isExternal() ) {
437 $status->fatal( 'import-rootpage-invalid' );
438 } elseif ( !$nsInfo->hasSubpages( $title->getNamespace() ) ) {
// The main namespace has no name of its own, so show the 'blanknamespace' message instead.
439 $displayNSText = $title->getNamespace() === NS_MAIN
440 ? wfMessage( 'blanknamespace' )->text()
441 : $this->contentLanguage->getNsText( $title->getNamespace() );
442 $status->fatal( 'import-rootpage-nosubpage', $displayNSText );
443 } else {
444 // set namespace to 'all', so the namespace check in processTitle() can pass
445 $this->setTargetNamespace( null );
448 $nsInfo,
449 $this->titleFactory,
450 $title
451 )
452 );
453 }
454 }
455 return $status;
456 }
457
/**
 * Sets the directory against which the "rel" path of an <upload> element is
 * resolved when looking for the file on disk.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
464
/**
 * Controls whether parsed <upload> elements are actually processed.
 *
 * @param bool $import Truthy to pass uploads on to processUpload()
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
471
/**
 * Replaces the external user name handler with one built from the given
 * prefix and "assign known users" setting.
 *
 * @param string $usernamePrefix
 * @param bool $assignKnownUsers
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$handler = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
	$this->externalUserNames = $handler;
}
480
/**
 * Turns off the article-count adjustment that finishImportPage() would
 * otherwise perform. There is no way to turn it back on.
 */
public function disableStatisticsUpdate() {
	$this->disableStatisticsUpdate = true;
}
488
/**
 * Default page callback: remembers whether the page counted as an article
 * before any of its revisions are imported.
 *
 * The value is stored under the same key scheme that finishImportPage() uses
 * for its lookup ('title_' followed by CacheKeyHelper::getKeyForPage()).
 * Storing it under the prefixed title text instead means the lookup never
 * finds the entry and the article count is never adjusted.
 *
 * @param array $titleAndForeignTitle Two-element list: the local Title first,
 *  then the foreign title
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$title = $titleAndForeignTitle[0];
	$page = $this->wikiPageFactory->newFromTitle( $title );
	$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $title );
	$this->countableCache[$countKey] = $page->isCountable();
	return true;
}
501
/**
 * Default revision callback: imports a single old revision.
 *
 * A notice is emitted and false returned when the revision's content handler
 * cannot be used on the target title, or when importing raises a
 * MWContentSerializationException.
 *
 * @param WikiRevision $revision
 * @return bool|mixed Result of WikiRevision::importOldRevision(), or false on
 *  either failure described above
 */
public function importRevision( $revision ) {
	if ( $revision->getContentHandler()->canBeUsedOn( $revision->getTitle() ) ) {
		try {
			return $revision->importOldRevision();
		} catch ( MWContentSerializationException $ex ) {
			$errorKey = 'import-error-unserialize';
		}
	} else {
		$errorKey = 'import-error-bad-location';
	}

	// Both failure modes report the same four details about the revision.
	$this->notice(
		$errorKey,
		$revision->getTitle()->getPrefixedText(),
		$revision->getID(),
		$revision->getModel(),
		$revision->getFormat()
	);

	return false;
}
532
/**
 * Default log item callback: lets the revision object import itself as a
 * log entry.
 *
 * @param WikiRevision $revision
 * @return mixed Whatever WikiRevision::importLogItem() returns
 */
public function importLogItem( $revision ) {
	$result = $revision->importLogItem();

	return $result;
}
541
/**
 * Default upload callback: hands the upload to the injected
 * UploadRevisionImporter.
 *
 * @param WikiRevision $revision
 * @return bool Whether the resulting status is good
 */
public function importUpload( $revision ) {
	return $this->uploadRevisionImporter->import( $revision )->isGood();
}
551
/**
 * Default page-out callback: adjusts the article count if the page's
 * countable state changed during the import, then runs the AfterImportPage
 * hook.
 *
 * The earlier countable state is looked up in the countable cache under
 * 'title_' followed by CacheKeyHelper::getKeyForPage(); the adjustment only
 * happens when an entry exists under exactly that key.
 *
 * @param PageIdentity $pageIdentity Page that was imported
 * @param mixed $foreignTitle Title of the page on the source wiki
 * @param int $revCount Number of revisions seen
 * @param int $sRevCount Number of revisions imported successfully
 * @param array $pageInfo Fields collected for the page
 * @return mixed Result of the AfterImportPage hook
 */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$wikiPage = $this->wikiPageFactory->newFromTitle( $pageIdentity );
		$wikiPage->loadPageData( WikiPage::READ_LATEST );

		if ( $wikiPage->getRevisionRecord() !== null ) {
			$performer = RequestContext::getMain()->getUser();
			$preparedUpdate = $wikiPage->newPageUpdater( $performer )->prepareUpdate();
			$cacheKey = 'title_' . CacheKeyHelper::getKeyForPage( $pageIdentity );
			$countableNow = $preparedUpdate->isCountable();
			if ( array_key_exists( $cacheKey, $this->countableCache ) &&
				$countableNow != $this->countableCache[$cacheKey]
			) {
				// +1 when the page became countable, -1 when it stopped being so
				$delta = (int)$countableNow - (int)$this->countableCache[$cacheKey];
				DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
					'articles' => $delta
				] ) );
			}
		} else {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
				' because WikiPage::getRevisionRecord() returned null' );
		}
	}

	$title = Title::castFromPageIdentity( $pageIdentity );
	// @phan-suppress-next-line PhanTypeMismatchArgumentNullable castFrom does not return null here
	return $this->hookRunner->onAfterImportPage( $title, $foreignTitle,
		$revCount, $sRevCount, $pageInfo );
}
598
/**
 * Forwards parsed site info to the registered site info callback, if any.
 *
 * @param array $siteInfo
 * @return mixed The callback's return value, or false when none is set
 */
private function siteInfoCallback( $siteInfo ) {
	if ( !isset( $this->mSiteInfoCallback ) ) {
		return false;
	}

	return call_user_func( $this->mSiteInfoCallback, $siteInfo, $this );
}
614
/**
 * Forwards a resolved title pair to the registered page callback, if any.
 * The callback's return value is discarded.
 *
 * @param array $title The [ title, foreign title ] pair from processTitle()
 */
public function pageCallback( $title ) {
	if ( !isset( $this->mPageCallback ) ) {
		return;
	}

	call_user_func( $this->mPageCallback, $title );
}
624
/**
 * Forwards the end-of-page data to the registered page-out callback, if any.
 * All arguments passed to this method are handed on unchanged; the callback's
 * return value is discarded.
 *
 * @param PageIdentity $pageIdentity
 * @param mixed $foreignTitle
 * @param int $revCount
 * @param int $sucCount
 * @param array $pageInfo
 */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sucCount, $pageInfo ) {
	if ( !isset( $this->mPageOutCallback ) ) {
		return;
	}

	call_user_func_array( $this->mPageOutCallback, func_get_args() );
}
639
/**
 * Forwards a parsed revision to the registered revision callback, if any.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's return value, or false when none is set
 */
private function revisionCallback( $revision ) {
	if ( !isset( $this->mRevisionCallback ) ) {
		return false;
	}

	return call_user_func( $this->mRevisionCallback, $revision, $this );
}
655
/**
 * Forwards a parsed log item to the registered log item callback, if any.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's return value, or false when none is set
 */
private function logItemCallback( $revision ) {
	if ( !isset( $this->mLogItemCallback ) ) {
		return false;
	}

	return call_user_func( $this->mLogItemCallback, $revision, $this );
}
671
/**
 * Reads an attribute of the node the reader is currently on.
 *
 * @param string $attr Attribute name
 * @return string The attribute value, or '' when the reader reports null
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );

	return $value ?? '';
}
681
/**
 * Collects the text of the element the reader is currently on.
 *
 * Text, CDATA and significant-whitespace nodes are concatenated until the
 * first END_ELEMENT node is reached; nesting is not tracked. If the input
 * ends before any end tag, the reader is closed and '' is returned.
 *
 * @return string
 */
public function nodeContents() {
	if ( $this->reader->isEmptyElement ) {
		return "";
	}

	$textTypes = [ XMLReader::TEXT, XMLReader::CDATA, XMLReader::SIGNIFICANT_WHITESPACE ];
	$collected = "";
	while ( $this->reader->read() ) {
		$type = $this->reader->nodeType;
		if ( $type == XMLReader::END_ELEMENT ) {
			return $collected;
		}
		if ( in_array( $type, $textTypes ) ) {
			$collected .= $this->reader->value;
		}
	}

	// Ran out of input without seeing an end tag.
	$this->reader->close();
	return '';
}
709
/**
 * Runs the import: checks that the first node read is <mediawiki>, then walks
 * the following nodes and dispatches <siteinfo>, <page> and <logitem> to
 * their handlers. The reader is closed and the previous entity loader state
 * restored when this method exits, whether normally or by exception.
 *
 * @throws NormalizedException If the root tag is wrong and libxml reported an error
 * @throws MWException If the root tag is wrong and libxml reported no error
 * @return bool Always true when no exception is thrown
 */
716 public function doImport() {
717 // Calls to reader->read need to be wrapped in calls to
718 // libxml_disable_entity_loader() to avoid local file
719 // inclusion attacks (T48932).
720 // phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
721 $oldDisable = @libxml_disable_entity_loader( true );
722 try {
723 $this->reader->read();
724
725 if ( $this->reader->localName != 'mediawiki' ) {
726 // phpcs:ignore Generic.PHP.NoSilencedErrors
727 @libxml_disable_entity_loader( $oldDisable );
// Prefer the parser's own error report over the generic "wrong tag" message.
728 $error = libxml_get_last_error();
729 if ( $error ) {
730 throw new NormalizedException( "XML error at line {line}: {message}", [
731 'line' => $error->line,
732 'message' => $error->message,
733 ] );
734 } else {
735 throw new MWException(
736 "Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
737 );
738 }
739 }
740 $this->debug( "<mediawiki> tag is correct." );
741
742 $this->debug( "Starting primary dump processing loop." );
743
744 $keepReading = $this->reader->read();
745 $skip = false;
746 $pageCount = 0;
747 while ( $keepReading ) {
748 $tag = $this->reader->localName;
// With a page offset set, jump over whole nodes with next() until enough <page> tags were counted.
749 if ( $this->pageOffset ) {
750 if ( $tag === 'page' ) {
751 $pageCount++;
752 }
753 if ( $pageCount < $this->pageOffset ) {
754 $keepReading = $this->reader->next();
755 continue;
756 }
757 }
758 $type = $this->reader->nodeType;
759
// A false return from the hook suppresses the built-in handling of this node.
760 if ( !$this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
761 // Do nothing
762 } elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
763 break;
764 } elseif ( $tag == 'siteinfo' ) {
765 $this->handleSiteInfo();
766 } elseif ( $tag == 'page' ) {
767 $this->handlePage();
768 } elseif ( $tag == 'logitem' ) {
769 $this->handleLogItem();
770 } elseif ( $tag != '#text' ) {
771 $this->warn( "Unhandled top-level XML tag $tag" );
772
773 $skip = true;
774 }
775
// next() moves past the current node without descending into it; read() advances node by node.
776 if ( $skip ) {
777 $keepReading = $this->reader->next();
778 $skip = false;
779 $this->debug( "Skip" );
780 } else {
781 $keepReading = $this->reader->read();
782 }
783 }
784 } finally {
785 // phpcs:ignore Generic.PHP.NoSilencedErrors
786 @libxml_disable_entity_loader( $oldDisable );
787 $this->reader->close();
788 }
789
790 return true;
791 }
792
/**
 * Parses the <siteinfo> element.
 *
 * <namespace> children are recorded in the foreign namespace map under their
 * "key" attribute; sitename, base, generator and case are copied into the
 * site info array. The result, plus the namespace map under '_namespaces',
 * is passed to the site info callback.
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );

	// Tags whose text content is copied straight into the site info array
	$plainTags = [ 'sitename', 'base', 'generator', 'case' ];
	$info = [];

	while ( $this->reader->read() ) {
		$name = $this->reader->localName;
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT && $name == 'siteinfo' ) {
			break;
		}

		if ( $name == 'namespace' ) {
			// Read the attribute first: nodeContents() moves the reader forward.
			$key = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$key] = $this->nodeContents();
		} elseif ( in_array( $name, $plainTags ) ) {
			$info[$name] = $this->nodeContents();
		}
	}

	$info['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $info );
}
819
/**
 * Parses one <logitem> element into an array of fields and passes it to
 * processLogItem().
 *
 * For each child node the ImportHandleLogItemXMLTag hook runs first; a false
 * return from it suppresses the built-in handling of that node.
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );

	// Tags whose text content is copied straight into the log info array
	$simpleTags = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];
	$logInfo = [];

	while ( $this->reader->read() ) {
		$tag = $this->reader->localName;
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT && $tag == 'logitem' ) {
			break;
		}

		if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $simpleTags ) ) {
			$logInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$logInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $logInfo );
}
849
/**
 * Builds a WikiRevision from parsed log item fields and passes it to the log
 * item callback.
 *
 * 'type' and 'action' are read unconditionally; every other field is applied
 * only when present. Without a contributor username or IP, the prefixed name
 * 'Unknown user' is used.
 *
 * @param array $logInfo Fields collected by handleLogItem()
 * @return mixed Result of the log item callback, or false when none is set
 */
private function processLogItem( $logInfo ) {
	$entry = new WikiRevision( $this->config );

	if ( isset( $logInfo['id'] ) ) {
		$entry->setID( $logInfo['id'] );
	}
	$entry->setType( $logInfo['type'] );
	$entry->setAction( $logInfo['action'] );
	if ( isset( $logInfo['timestamp'] ) ) {
		$entry->setTimestamp( $logInfo['timestamp'] );
	}
	if ( isset( $logInfo['params'] ) ) {
		$entry->setParams( $logInfo['params'] );
	}
	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$entry->setTitle( Title::newFromText( $logInfo['logtitle'] ) );
	}

	$entry->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$entry->setComment( $logInfo['comment'] );
	}

	$contributor = $logInfo['contributor'] ?? null;
	if ( isset( $contributor['username'] ) ) {
		$entry->setUsername(
			$this->externalUserNames->applyPrefix( $contributor['username'] )
		);
	} elseif ( isset( $contributor['ip'] ) ) {
		$entry->setUserIP( $contributor['ip'] );
	} else {
		$entry->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	return $this->logItemCallback( $entry );
}
892
/**
 * Parses one <page> element.
 *
 * Simple fields are collected into $pageInfo. When the first <revision> or
 * <upload> is met, the title is resolved through processTitle() and the page
 * callback runs. Each revision/upload is delegated to its handler. Once the
 * closing tag is reached the page-out callback runs, provided a valid title
 * was resolved.
 */
893 private function handlePage() {
894 // Handle page data.
895 $this->debug( "Enter page handler." );
896 $pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];
897
898 // Fields that can just be stuffed in the pageInfo object
899 $normalFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];
900
901 $skip = false;
902 $badTitle = false;
903
// $skip selects next() (jump over the current node) instead of read() for the coming advance.
904 while ( $skip ? $this->reader->next() : $this->reader->read() ) {
905 if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
906 $this->reader->localName == 'page' ) {
907 break;
908 }
909
// The flag only applies to a single advance; clear it for this iteration.
910 $skip = false;
911
912 $tag = $this->reader->localName;
913
914 if ( $badTitle ) {
915 // The title is invalid, bail out of this page
916 $skip = true;
917 } elseif ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
918 // Do nothing
919 } elseif ( in_array( $tag, $normalFields ) ) {
920 // An XML snippet:
921 // <page>
922 // <id>123</id>
923 // <title>Page</title>
924 // <redirect title="NewTitle"/>
925 // ...
926 // Because the redirect tag is built differently, we need special handling for that case.
927 if ( $tag == 'redirect' ) {
928 $pageInfo[$tag] = $this->nodeAttribute( 'title' );
929 } else {
930 $pageInfo[$tag] = $this->nodeContents();
931 }
932 } elseif ( $tag == 'revision' || $tag == 'upload' ) {
// $title stays unset until the first revision/upload, so the title is resolved at most once per page.
933 if ( !isset( $title ) ) {
934 $title = $this->processTitle( $pageInfo['title'],
935 $pageInfo['ns'] ?? null );
936
937 // $title is either an array of two titles or false.
938 if ( is_array( $title ) ) {
939 $this->pageCallback( $title );
940 list( $pageInfo['_title'], $foreignTitle ) = $title;
941 } else {
942 $badTitle = true;
943 $skip = true;
944 }
945 }
946
947 if ( $title ) {
948 if ( $tag == 'revision' ) {
949 $this->handleRevision( $pageInfo );
950 } else {
951 $this->handleUpload( $pageInfo );
952 }
953 }
954 } elseif ( $tag != '#text' ) {
955 $this->warn( "Unhandled page XML tag $tag" );
956 $skip = true;
957 }
958 }
959
960 // @note $pageInfo is only set if a valid $title is processed above with
961 // no error. If we have a valid $title, then pageCallback is called
962 // above, $pageInfo['title'] is set and we do pageOutCallback here.
963 // If $pageInfo['_title'] is not set, then $foreignTitle is also not
964 // set since they both come from $title above.
965 if ( array_key_exists( '_title', $pageInfo ) ) {
967 $title = $pageInfo['_title'];
968 $this->pageOutCallback(
969 $title,
970 // @phan-suppress-next-line PhanPossiblyUndeclaredVariable Set together with _title key
971 $foreignTitle,
972 $pageInfo['revisionCount'],
973 $pageInfo['successfulRevisionCount'],
974 $pageInfo
975 );
976 }
977 }
978
/**
 * Parses one <revision> element, then counts it and passes the collected
 * fields to processRevision().
 *
 * For each child node the ImportHandleRevisionXMLTag hook runs first; a false
 * return from it suppresses the built-in handling of that node. Unhandled
 * tags are logged and skipped as a whole.
 *
 * @param array &$pageInfo Page fields; 'revisionCount' is incremented for
 *  every revision and 'successfulRevisionCount' for every one that
 *  processRevision() reports as successful
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );
	$revisionInfo = [];

	$normalFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
		'model', 'format', 'text', 'sha1' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'revision' ) {
			break;
		}

		// The skip request applies to a single advance only. Without this
		// reset one unhandled tag would switch every later advance to next().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleRevisionXMLTag(
			$this, $pageInfo, $revisionInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$revisionInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'content' ) {
			// We can have multiple content tags, so make this an array.
			$revisionInfo[$tag][] = $this->handleContent();
		} elseif ( $tag == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			$skip = true;
		}
	}

	$pageInfo['revisionCount']++;
	if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
1021
/**
 * Parses one <content> element (a slot of a revision) into an array.
 *
 * For each child node the ImportHandleContentXMLTag hook runs first; a false
 * return from it suppresses the built-in handling of that node. Unhandled
 * tags are logged and skipped as a whole.
 *
 * @return array Collected fields; possible keys are role, origin, model,
 *  format and text
 */
private function handleContent() {
	$this->debug( "Enter content handler" );
	$contentInfo = [];

	$normalFields = [ 'role', 'origin', 'model', 'format', 'text' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'content' ) {
			break;
		}

		// The skip request applies to a single advance only. Without this
		// reset one unhandled tag would switch every later advance to next().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleContentXMLTag(
			$this, $contentInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$contentInfo[$tag] = $this->nodeContents();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled content XML tag $tag" );
			$skip = true;
		}
	}

	return $contentInfo;
}
1052
/**
 * Turns the text of a parsed revision or slot into a content object.
 *
 * The size limit is read from the injected configuration, the same object
 * processTitle() consults, rather than from the global service locator.
 *
 * @param Title $title Page the content belongs to; used to pick a default model
 * @param int|string $revisionId Revision ID for error messages; falsy if unknown
 * @param array $contentInfo Parsed fields; 'text' is required, 'role' and
 *  'model' are optional
 * @return mixed The content object produced by the handler's unserializeContent()
 * @throws MWException If 'text' is missing, or if the text of a listed
 *  text-based model exceeds the MaxArticleSize setting
 */
private function makeContent( Title $title, $revisionId, $contentInfo ) {
	if ( !isset( $contentInfo['text'] ) ) {
		throw new MWException( 'Missing text field in import.' );
	}

	$maxArticleSize = $this->config->get( MainConfigNames::MaxArticleSize );

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Testing for revisions with only listed
	// content models, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	if ( ( !isset( $contentInfo['model'] ) ||
		in_array( $contentInfo['model'], [
			'wikitext',
			'css',
			'json',
			'javascript',
			'text',
			''
		] ) ) &&
		strlen( $contentInfo['text'] ) > $maxArticleSize * 1024
	) {
		throw new MWException( 'The text of ' .
			( $revisionId ?
				"the revision with ID $revisionId" :
				'a revision'
			) . " exceeds the maximum allowable size ({$maxArticleSize} KiB)" );
	}

	// Fall back to the main slot and to that slot's default model for the title.
	$role = $contentInfo['role'] ?? SlotRecord::MAIN;
	$model = $contentInfo['model'] ?? $this->getDefaultContentModel( $title, $role );
	$handler = $this->getContentHandler( $model );

	$text = $handler->importTransform( $contentInfo['text'] );

	return $handler->unserializeContent( $text );
}
1099
/**
 * Builds a WikiRevision from parsed revision fields and passes it to the
 * revision callback.
 *
 * The main slot is built from the revision's own text fields; every entry
 * under 'content' adds a further slot and must name its role. A missing
 * timestamp defaults to the current time, and a missing contributor to the
 * prefixed name 'Unknown user'.
 *
 * @param array $pageInfo Page fields; '_title' must hold the resolved title
 * @param array $revisionInfo Fields collected by handleRevision()
 * @return mixed Result of the revision callback, or false when none is set
 * @throws MWException If a slot has no role, or if makeContent() rejects the text
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	$rev = new WikiRevision( $this->config );

	$revId = $revisionInfo['id'] ?? 0;
	if ( $revId ) {
		$rev->setID( $revId );
	}

	$title = $pageInfo['_title'];
	$rev->setTitle( $title );

	$mainContent = $this->makeContent( $title, $revId, $revisionInfo );
	$rev->setContent( SlotRecord::MAIN, $mainContent );

	$slots = $revisionInfo['content'] ?? [];
	foreach ( $slots as $slot ) {
		if ( !isset( $slot['role'] ) ) {
			throw new MWException( "Missing role for imported slot." );
		}

		$rev->setContent( $slot['role'], $this->makeContent( $title, $revId, $slot ) );
	}
	$rev->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$rev->setComment( $revisionInfo['comment'] );
	}

	// The mere presence of the field marks the edit as minor.
	if ( isset( $revisionInfo['minor'] ) ) {
		$rev->setMinor( true );
	}

	$contributor = $revisionInfo['contributor'] ?? null;
	if ( isset( $contributor['username'] ) ) {
		$rev->setUsername(
			$this->externalUserNames->applyPrefix( $contributor['username'] )
		);
	} elseif ( isset( $contributor['ip'] ) ) {
		$rev->setUserIP( $contributor['ip'] );
	} else {
		$rev->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	if ( isset( $revisionInfo['sha1'] ) ) {
		$rev->setSha1Base36( $revisionInfo['sha1'] );
	}
	$rev->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $rev );
}
1153
/**
 * Parses one <upload> element and, if upload importing is enabled, passes the
 * collected fields to processUpload().
 *
 * For each child node the ImportHandleUploadXMLTag hook runs first; a false
 * return from it suppresses the built-in handling of that node. Base64
 * <contents> are written to a temporary file. A file found under the image
 * base path via the "rel" field replaces that temporary file as the source.
 *
 * @param array &$pageInfo Page fields, passed on to the hook and to processUpload()
 * @return mixed Result of processUpload(), or null when uploads are not imported
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );
	$uploadInfo = [];

	$normalFields = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'upload' ) {
			break;
		}

		// The skip request applies to a single advance only. Without this
		// reset one unhandled tag would switch every later advance to next().
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$uploadInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag == 'contents' ) {
			$contents = $this->nodeContents();
			$encoding = $this->reader->getAttribute( 'encoding' );
			if ( $encoding === 'base64' ) {
				$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
				$uploadInfo['isTempSrc'] = true;
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$skip = true;
		}
	}

	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		$path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
		if ( file_exists( $path ) ) {
			$uploadInfo['fileSrc'] = $path;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	if ( $this->mImportUploads ) {
		return $this->processUpload( $pageInfo, $uploadInfo );
	}
}
1206
/**
 * Writes data to a newly created temporary file.
 *
 * Failures are reported explicitly, so that a bogus path is never handed on
 * as the source file of an upload.
 *
 * @param string $contents Data to write
 * @return string Path of the temporary file
 * @throws MWException If the file cannot be created or written
 */
private function dumpTemp( $contents ) {
	$filename = tempnam( wfTempDir(), 'importupload' );
	if ( $filename === false ) {
		throw new MWException( 'Could not create a temporary file for an imported upload.' );
	}

	if ( file_put_contents( $filename, $contents ) === false ) {
		throw new MWException( "Could not write imported upload data to $filename." );
	}

	return $filename;
}
1216
/**
 * Builds a WikiRevision from parsed upload fields and passes it to the
 * upload callback.
 *
 * The page ID doubles as the revision ID. timestamp, filename, src, size and
 * comment are read unconditionally; the remaining fields only when present.
 * No user is set when the contributor has neither a username nor an IP.
 *
 * @param array $pageInfo Page fields; 'id' and '_title' are read
 * @param array $uploadInfo Fields collected by handleUpload()
 * @return mixed Result of the upload callback
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$upload = new WikiRevision( $this->config );
	$revId = $pageInfo['id'];
	$title = $pageInfo['_title'];
	// T292348: text key may be absent, force addition if null
	$uploadInfo['text'] ??= '';
	$content = $this->makeContent( $title, $revId, $uploadInfo );

	$upload->setTitle( $title );
	$upload->setID( $revId );
	$upload->setTimestamp( $uploadInfo['timestamp'] );
	$upload->setContent( SlotRecord::MAIN, $content );
	$upload->setFilename( $uploadInfo['filename'] );
	if ( isset( $uploadInfo['archivename'] ) ) {
		$upload->setArchiveName( $uploadInfo['archivename'] );
	}
	$upload->setSrc( $uploadInfo['src'] );
	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$isTemporary = !empty( $uploadInfo['isTempSrc'] );
		$upload->setFileSrc( $uploadInfo['fileSrc'], $isTemporary );
	}
	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$upload->setSha1Base36( $uploadInfo['sha1base36'] );
	}
	$upload->setSize( intval( $uploadInfo['size'] ) );
	$upload->setComment( $uploadInfo['comment'] );

	$contributor = $uploadInfo['contributor'] ?? null;
	if ( isset( $contributor['username'] ) ) {
		$upload->setUsername(
			$this->externalUserNames->applyPrefix( $contributor['username'] )
		);
	} elseif ( isset( $contributor['ip'] ) ) {
		$upload->setUserIP( $contributor['ip'] );
	}
	$upload->setNoUpdates( $this->mNoUpdates );

	return call_user_func( $this->mUploadCallback, $upload );
}
1261
/**
 * Parses a <contributor> element.
 *
 * @return array Text of the id, ip and username children that were present;
 *  empty for a self-closing element
 */
private function handleContributor() {
	$this->debug( "Enter contributor handler." );

	$contributor = [];
	if ( $this->reader->isEmptyElement ) {
		return $contributor;
	}

	$wanted = [ 'id', 'ip', 'username' ];

	while ( $this->reader->read() ) {
		$name = $this->reader->localName;
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT && $name == 'contributor' ) {
			break;
		}

		if ( in_array( $name, $wanted ) ) {
			$contributor[$name] = $this->nodeContents();
		}
	}

	return $contributor;
}
1290
/**
 * Resolves the title text found in the dump to a local title.
 *
 * The foreign title is parsed with knowledge of the dump's namespaces if a
 * <siteinfo> supplied them, naively otherwise. A notice is emitted and false
 * returned when the local title cannot be created, is external, cannot
 * exist, or (outside command line mode) may not be edited by the current
 * user.
 *
 * @param string $text Title text from the dump
 * @param string|int|null $ns Namespace ID from the dump, if any
 * @return array|false [ local title, foreign title ], or false on failure
 */
private function processTitle( $text, $ns = null ) {
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory( $this->contentLanguage )
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
	$title = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	$commandLineMode = $this->config->get( 'CommandLineMode' );

	if ( $title === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}

	if ( $title->isExternal() ) {
		$this->notice( 'import-error-interwiki', $title->getPrefixedText() );
		return false;
	}

	if ( !$title->canExist() ) {
		$this->notice( 'import-error-special', $title->getPrefixedText() );
		return false;
	}

	if ( !$commandLineMode ) {
		$user = RequestContext::getMain()->getUser();

		if ( !$this->permissionManager->userCan( 'edit', $user, $title ) ) {
			# Do not import if the importing wiki user cannot edit this page
			$this->notice( 'import-error-edit', $title->getPrefixedText() );

			return false;
		}
	}

	return [ $title, $foreignTitle ];
}
1336
/**
 * Looks up the content handler for a content model through the injected
 * content handler factory.
 *
 * @param string $model Content model ID
 * @return mixed The handler returned by the factory
 */
private function getContentHandler( $model ) {
	$handler = $this->contentHandlerFactory->getContentHandler( $model );

	return $handler;
}
1344
/**
 * Asks the slot role registry which content model a slot role uses by
 * default on the given page.
 *
 * @param Title $title
 * @param string $role Slot role name
 * @return mixed The default model reported by the role handler
 */
private function getDefaultContentModel( $title, $role ) {
	$roleHandler = $this->slotRoleRegistry->getRoleHandler( $role );

	return $roleHandler->getDefaultModel( $title );
}
1356}
const NS_MAIN
Definition Defines.php:64
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTempDir()
Tries to get the system directory for temporary files.
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest. If source is NULL, it just returns the val...
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Class to parse and build external user names.
Reporting callback.
Base class for language-specific code.
Definition Language.php:53
Exception representing a failure to serialize or unserialize a content object.
MediaWiki exception.
Helper class for mapping value objects representing basic entities to cache keys.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
Service for creating WikiPage objects.
A service class for checking permissions. To obtain an instance, use MediaWikiServices::getInstance()-...
Value object representing a content slot associated with a page revision.
A registry service for SlotRoleHandlers, used to define which slot roles are available on which page.
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them ba...
static getMain()
Get the RequestContext object associated with the main request.
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Creates Title objects.
Represents a title within MediaWiki.
Definition Title.php:49
static registerSource(ImportSource $source)
XML file reader for the page data importer.
setImportUploads( $import)
doImport()
Primary entry point.
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
setNoUpdates( $noupdates)
Set 'no updates' mode.
setLogItemCallback( $callback)
Sets the action to perform as each log item reached.
importUpload( $revision)
Dummy for now...
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
setPageOffset( $nthPage)
Sets 'pageOffset' value.
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
importLogItem( $revision)
Default per-revision callback, performs the import.
setImageBasePath( $dir)
throwXmlError( $err)
setDebug( $debug)
Set debug mode...
__construct(ImportSource $source, Config $config, HookContainer $hookContainer, Language $contentLanguage, NamespaceInfo $namespaceInfo, TitleFactory $titleFactory, WikiPageFactory $wikiPageFactory, UploadRevisionImporter $uploadRevisionImporter, PermissionManager $permissionManager, IContentHandlerFactory $contentHandlerFactory, SlotRoleRegistry $slotRoleRegistry)
Creates an ImportXMLReader drawing from the source provided.
notice( $msg,... $params)
importRevision( $revision)
Default per-revision callback, performs the import.
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
setNoticeCallback( $callback)
Set a callback that displays notice messages.
finishImportPage(PageIdentity $pageIdentity, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
disableStatisticsUpdate()
Statistics updates can take a lot of time.
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
Represents a revision, log entry or upload during the import process.
Interface for configuration instances.
Definition Config.php:30
Source interface for XML import.
Represents an object that can convert page titles on a foreign wiki (ForeignTitle objects) into page ...
Interface for objects (potentially) representing an editable wiki page.
$debug
Definition mcc.php:31
$source
$content
Definition router.php:76