MediaWiki REL1_37
WikiImporter.php
Go to the documentation of this file.
1<?php
36
45 private $reader;
46
48 private $foreignNamespaces = null;
49
52
55
58
61
64
67
70
72 private $mDebug;
73
76
79
81 private $mNoUpdates = false;
82
84 private $pageOffset = 0;
85
87 private $config;
88
91
93 private $hookRunner;
94
96 private $countableCache = [];
97
99 private $disableStatisticsUpdate = false;
100
103
106
109
112
115
118
121
124
127
/**
 * Create an importer that reads its XML from the given import source.
 *
 * The source is registered with UploadSourceAdapter and opened through the
 * "uploadsource://" stream wrapper, which is registered here if it is not
 * already known. The default page, revision, upload, log item and page-out
 * callbacks are installed, together with a NaiveImportTitleFactory and an
 * ExternalUserNames instance using the 'imported' prefix.
 *
 * @param ImportSource $source
 * @param Config $config
 * @param HookContainer $hookContainer
 * @param Language $contentLanguage
 * @param NamespaceInfo $namespaceInfo
 * @param TitleFactory $titleFactory
 * @param WikiPageFactory $wikiPageFactory
 * @param UploadRevisionImporter $uploadRevisionImporter
 * @param PermissionManager $permissionManager
 * @param IContentHandlerFactory $contentHandlerFactory
 * @param SlotRoleRegistry $slotRoleRegistry
 * @throws MWException If the XML reader cannot be opened on the source
 */
public function __construct(
	ImportSource $source,
	Config $config,
	HookContainer $hookContainer,
	Language $contentLanguage,
	NamespaceInfo $namespaceInfo,
	TitleFactory $titleFactory,
	WikiPageFactory $wikiPageFactory,
	UploadRevisionImporter $uploadRevisionImporter,
	PermissionManager $permissionManager,
	IContentHandlerFactory $contentHandlerFactory,
	SlotRoleRegistry $slotRoleRegistry
) {
	$this->reader = new XMLReader();
	$this->config = $config;
	$this->hookRunner = new HookRunner( $hookContainer );
	$this->contentLanguage = $contentLanguage;
	$this->namespaceInfo = $namespaceInfo;
	$this->titleFactory = $titleFactory;
	$this->wikiPageFactory = $wikiPageFactory;
	$this->uploadRevisionImporter = $uploadRevisionImporter;
	$this->permissionManager = $permissionManager;
	$this->contentHandlerFactory = $contentHandlerFactory;
	$this->slotRoleRegistry = $slotRoleRegistry;

	if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
		stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
	}
	// The returned id is what makes the source addressable as uploadsource://<id>
	$id = UploadSourceAdapter::registerSource( $source );

	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( false );
	if ( defined( 'LIBXML_PARSEHUGE' ) ) {
		$status = $this->reader->open( "uploadsource://$id", null, LIBXML_PARSEHUGE );
	} else {
		$status = $this->reader->open( "uploadsource://$id" );
	}
	if ( !$status ) {
		// libxml_get_last_error() returns false when no error was recorded
		$error = libxml_get_last_error();
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
		throw new MWException( 'Encountered an internal error while initializing WikiImporter object: ' .
			( $error ? $error->message : 'unknown error' ) );
	}
	// phpcs:ignore Generic.PHP.NoSilencedErrors
	@libxml_disable_entity_loader( $oldDisable );

	// Default callbacks
	$this->setPageCallback( [ $this, 'beforeImportPage' ] );
	$this->setRevisionCallback( [ $this, "importRevision" ] );
	$this->setUploadCallback( [ $this, 'importUpload' ] );
	$this->setLogItemCallback( [ $this, 'importLogItem' ] );
	$this->setPageOutCallback( [ $this, 'finishImportPage' ] );

	$this->importTitleFactory = new NaiveImportTitleFactory(
		$this->contentLanguage,
		$this->namespaceInfo,
		$this->titleFactory
	);
	$this->externalUserNames = new ExternalUserNames( 'imported', false );
}
206
/**
 * Expose the XMLReader this importer pulls its input from.
 *
 * @return XMLReader
 */
public function getReader() {
	$xmlReader = $this->reader;

	return $xmlReader;
}
213
/**
 * Report an XML error to the import debug channel and to the debug log.
 *
 * Despite the name, nothing is thrown here; the error is only logged.
 *
 * @param string $err
 */
public function throwXmlError( $err ) {
	$failure = "FAILURE: $err";
	$this->debug( $failure );

	$logLine = "WikiImporter XML error: $err";
	wfDebug( $logLine );
}
221
/**
 * Write a line to the debug log, but only while debug mode is switched on.
 *
 * @param string $data
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}

	wfDebug( "IMPORT: $data" );
}
230
/**
 * Write a warning line to the debug log, regardless of the debug mode flag.
 *
 * @param string $data
 */
public function warn( $data ) {
	$logLine = "IMPORT: $data";
	wfDebug( $logLine );
}
237
/**
 * Emit a notice, either through the registered notice callback or, when none
 * is callable, through the debug log.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters, handed to the callback as one array
 */
public function notice( $msg, ...$params ) {
	if ( !is_callable( $this->mNoticeCallback ) ) {
		// No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		$text = wfMessage( $msg, $params )->text();
		wfDebug( $text );
		return;
	}

	call_user_func( $this->mNoticeCallback, $msg, $params );
}
251
/**
 * Switch debug mode on or off; debug() only logs while it is on.
 *
 * @param bool $debug
 */
public function setDebug( $debug ) {
	$flag = $debug;
	$this->mDebug = $flag;
}
259
/**
 * Set 'no updates' mode. The flag is forwarded to every WikiRevision
 * this importer builds.
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$flag = $noupdates;
	$this->mNoUpdates = $flag;
}
267
/**
 * Sets 'pageOffset' value. While it is non-zero, doImport() skips top-level
 * nodes until that many <page> elements have been counted.
 *
 * @param int $nthPage
 */
public function setPageOffset( $nthPage ) {
	$offset = $nthPage;
	$this->pageOffset = $offset;
}
277
/**
 * Set the callback that notice() reports through.
 *
 * Assignment is delegated to wfSetVar(), so the previous value is returned.
 *
 * @param callable $callback
 * @return callable|null The previously registered callback
 */
public function setNoticeCallback( $callback ) {
	$previous = wfSetVar( $this->mNoticeCallback, $callback );

	return $previous;
}
287
/**
 * Sets the action to perform as each new page in the stream is reached.
 *
 * @param callable $callback
 * @return callable|null The callback that was registered before
 */
public function setPageCallback( $callback ) {
	$oldCallback = $this->mPageCallback;

	$this->mPageCallback = $callback;

	return $oldCallback;
}
298
/**
 * Sets the action to perform when a "</page>" is closed.
 *
 * @param callable $callback
 * @return callable|null The callback that was registered before
 */
public function setPageOutCallback( $callback ) {
	$oldCallback = $this->mPageOutCallback;

	$this->mPageOutCallback = $callback;

	return $oldCallback;
}
313
/**
 * Sets the action to perform as each page revision is reached.
 *
 * @param callable $callback
 * @return callable|null The callback that was registered before
 */
public function setRevisionCallback( $callback ) {
	$oldCallback = $this->mRevisionCallback;

	$this->mRevisionCallback = $callback;

	return $oldCallback;
}
324
/**
 * Sets the action to perform as each file upload version is reached.
 *
 * @param callable $callback
 * @return callable|null The callback that was registered before
 */
public function setUploadCallback( $callback ) {
	$oldCallback = $this->mUploadCallback;

	$this->mUploadCallback = $callback;

	return $oldCallback;
}
335
/**
 * Sets the action to perform as each log item is reached.
 *
 * @param callable $callback
 * @return callable|null The callback that was registered before
 */
public function setLogItemCallback( $callback ) {
	$oldCallback = $this->mLogItemCallback;

	$this->mLogItemCallback = $callback;

	return $oldCallback;
}
346
/**
 * Sets the action to perform when site info is encountered.
 *
 * @param callable $callback
 * @return callable|null The callback that was registered before
 */
public function setSiteInfoCallback( $callback ) {
	$oldCallback = $this->mSiteInfoCallback;

	$this->mSiteInfoCallback = $callback;

	return $oldCallback;
}
357
/**
 * Sets the factory object to use to convert ForeignTitle objects into local
 * Title objects.
 *
 * @param ImportTitleFactory $factory
 */
public function setImportTitleFactory( $factory ) {
	$titleConverter = $factory;
	$this->importTitleFactory = $titleConverter;
}
366
/**
 * Set a target namespace to override the defaults.
 *
 * Passing null installs a NaiveImportTitleFactory, which leaves namespaces
 * alone. A non-negative value that NamespaceInfo knows about installs a
 * NamespaceImportTitleFactory for that namespace. Anything else is rejected
 * and the current factory is kept.
 *
 * @param int|string|null $namespace Namespace index (converted with intval()), or null
 * @return bool True if a title factory was installed, false if $namespace was rejected
 */
public function setTargetNamespace( $namespace ) {
	if ( $namespace === null ) {
		// Don't override namespaces
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$this->namespaceInfo,
				$this->titleFactory
			)
		);
		return true;
	} elseif (
		$namespace >= 0 &&
		$this->namespaceInfo->exists( intval( $namespace ) )
	) {
		$namespace = intval( $namespace );
		$this->setImportTitleFactory(
			new NamespaceImportTitleFactory(
				$this->namespaceInfo,
				$this->titleFactory,
				$namespace
			)
		);
		return true;
	} else {
		return false;
	}
}
400
/**
 * Set a target root page under which all pages are imported.
 *
 * Null installs a NaiveImportTitleFactory (no root page). A non-empty string
 * is parsed as a title; if it is valid, local and in a namespace with subpages,
 * a SubpageImportTitleFactory for it is installed. An empty string changes
 * nothing and yields a good status.
 *
 * @param string|null $rootpage
 * @return Status Fatal with 'import-rootpage-invalid' or
 *   'import-rootpage-nosubpage' when the root page cannot be used
 */
public function setTargetRootPage( $rootpage ) {
	$status = Status::newGood();
	$nsInfo = $this->namespaceInfo;
	if ( $rootpage === null ) {
		// No rootpage
		$this->setImportTitleFactory(
			new NaiveImportTitleFactory(
				$this->contentLanguage,
				$nsInfo,
				$this->titleFactory
			)
		);
	} elseif ( $rootpage !== '' ) {
		$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
		$title = Title::newFromText( $rootpage );

		if ( !$title || $title->isExternal() ) {
			$status->fatal( 'import-rootpage-invalid' );
		} elseif ( !$nsInfo->hasSubpages( $title->getNamespace() ) ) {
			$displayNSText = $title->getNamespace() === NS_MAIN
				? wfMessage( 'blanknamespace' )->text()
				: $this->contentLanguage->getNsText( $title->getNamespace() );
			$status->fatal( 'import-rootpage-nosubpage', $displayNSText );
		} else {
			// set namespace to 'all', so the namespace check in processTitle() can pass
			$this->setTargetNamespace( null );
			$this->setImportTitleFactory(
				new SubpageImportTitleFactory(
					$nsInfo,
					$this->titleFactory,
					$title
				)
			);
		}
	}
	return $status;
}
443
/**
 * Set the base directory that handleUpload() resolves an upload's <rel>
 * path against.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$basePath = $dir;
	$this->mImageBasePath = $basePath;
}
450
/**
 * Choose whether <upload> elements are actually processed; handleUpload()
 * only calls processUpload() while this is truthy.
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
	$enabled = $import;
	$this->mImportUploads = $enabled;
}
457
/**
 * Replace the ExternalUserNames helper used to prefix imported user names.
 *
 * @param string $usernamePrefix Passed as the first ExternalUserNames argument
 * @param bool $assignKnownUsers Passed as the second ExternalUserNames argument
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$userNames = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
	$this->externalUserNames = $userNames;
}
466
/**
 * Turn off the article count adjustment that finishImportPage() would
 * otherwise perform. There is no way to turn it back on through this method.
 */
public function disableStatisticsUpdate() {
	$disabled = true;
	$this->disableStatisticsUpdate = $disabled;
}
474
/**
 * Default per-page callback. Records whether the page is countable before
 * any revision is imported, so that finishImportPage() can later compare
 * the before and after state.
 *
 * @param array $titleAndForeignTitle Two-element array whose first element is
 *   the local Title; only that element is used here
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$title = $titleAndForeignTitle[0];
	$page = $this->wikiPageFactory->newFromTitle( $title );
	// The key must be built exactly like in finishImportPage(), which uses
	// CacheKeyHelper::getKeyForPage(); a prefixed-text key would never be found
	// there and the article count would silently never be adjusted.
	$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $title );
	$this->countableCache[$countKey] = $page->isCountable();
	return true;
}
487
/**
 * Default per-revision callback, performs the import.
 *
 * A revision whose content handler cannot be used on its title is rejected
 * with an 'import-error-bad-location' notice. A serialization failure during
 * the import is reported with 'import-error-unserialize'.
 *
 * @param WikiRevision $revision
 * @return bool Result of importOldRevision(), or false on either failure
 */
public function importRevision( $revision ) {
	$usable = $revision->getContentHandler()->canBeUsedOn( $revision->getTitle() );
	if ( !$usable ) {
		$this->notice(
			'import-error-bad-location',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);

		return false;
	}

	try {
		return $revision->importOldRevision();
	} catch ( MWContentSerializationException $ex ) {
		$this->notice(
			'import-error-unserialize',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);

		return false;
	}
}
518
/**
 * Default per-log-item callback; delegates to the revision's own importLogItem().
 *
 * @param WikiRevision $revision
 * @return mixed Whatever WikiRevision::importLogItem() returns
 */
public function importLogItem( $revision ) {
	$outcome = $revision->importLogItem();

	return $outcome;
}
527
/**
 * Default upload callback; hands the revision to the upload revision importer.
 *
 * @param WikiRevision $revision
 * @return bool Whether the importer's status is good
 */
public function importUpload( $revision ) {
	return $this->uploadRevisionImporter
		->import( $revision )
		->isGood();
}
537
/**
 * Default page-out callback. Adjusts the article count if the page's
 * countable state changed during the import, then runs the AfterImportPage hook.
 *
 * @param PageIdentity $pageIdentity
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sRevCount
 * @param array $pageInfo
 * @return mixed Result of the AfterImportPage hook runner
 */
public function finishImportPage( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$wikiPage = $this->wikiPageFactory->newFromTitle( $pageIdentity );
		$wikiPage->loadPageData( 'fromdbmaster' );

		$currentContent = $wikiPage->getContent();
		if ( $currentContent !== null ) {
			// No user is available
			$user = RequestContext::getMain()->getUser();
			$editInfo = $wikiPage->prepareContentForEdit( $currentContent, null, $user );
			$countKey = 'title_' . CacheKeyHelper::getKeyForPage( $pageIdentity );
			$countable = $wikiPage->isCountable( $editInfo );

			$wasRecorded = array_key_exists( $countKey, $this->countableCache );
			if ( $wasRecorded && $countable != $this->countableCache[$countKey] ) {
				// +1 when the page became countable, -1 when it stopped being so
				$delta = (int)$countable - (int)$this->countableCache[$countKey];
				DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
					'articles' => $delta
				] ) );
			}
		} else {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $pageIdentity .
				' because WikiPage::getContent() returned null' );
		}
	}

	$localTitle = Title::castFromPageIdentity( $pageIdentity );

	return $this->hookRunner->onAfterImportPage(
		$localTitle,
		$foreignTitle,
		$revCount,
		$sRevCount,
		$pageInfo
	);
}
583
/**
 * Notify the site info callback, if one is registered.
 *
 * @param array $siteInfo
 * @return mixed The callback's return value, or false when none is set
 */
private function siteInfoCallback( $siteInfo ) {
	if ( !isset( $this->mSiteInfoCallback ) ) {
		return false;
	}

	// The importer itself is passed along as the second argument
	return call_user_func( $this->mSiteInfoCallback, $siteInfo, $this );
}
599
/**
 * Notify the callback function when a new "<page>" is reached.
 *
 * @param array $title Value handed through to the callback unchanged
 */
public function pageCallback( $title ) {
	if ( !isset( $this->mPageCallback ) ) {
		return;
	}

	call_user_func( $this->mPageCallback, $title );
}
609
/**
 * Notify the callback function when a "</page>" is closed.
 *
 * All arguments are forwarded to the callback exactly as received.
 *
 * @param PageIdentity $pageIdentity
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sucCount
 * @param array $pageInfo
 */
private function pageOutCallback( PageIdentity $pageIdentity, $foreignTitle, $revCount,
	$sucCount, $pageInfo
) {
	if ( !isset( $this->mPageOutCallback ) ) {
		return;
	}

	call_user_func( $this->mPageOutCallback, ...func_get_args() );
}
624
/**
 * Notify the revision callback, if one is registered.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's return value, or false when none is set
 */
private function revisionCallback( $revision ) {
	if ( !isset( $this->mRevisionCallback ) ) {
		return false;
	}

	// The importer itself is passed along as the second argument
	return call_user_func( $this->mRevisionCallback, $revision, $this );
}
640
/**
 * Notify the log item callback, if one is registered.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's return value, or false when none is set
 */
private function logItemCallback( $revision ) {
	if ( !isset( $this->mLogItemCallback ) ) {
		return false;
	}

	// The importer itself is passed along as the second argument
	return call_user_func( $this->mLogItemCallback, $revision, $this );
}
656
/**
 * Retrieves the contents of the named attribute of the current element.
 *
 * @param string $attr Attribute name
 * @return string|null Whatever XMLReader::getAttribute() yields
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );

	return $value;
}
666
/**
 * Fetches the text contents of the current element.
 *
 * Text, CDATA and significant-whitespace nodes are concatenated until the
 * first end-element node is reached. If the input runs out before that,
 * the reader is closed and an empty string is returned.
 *
 * @return string
 */
public function nodeContents() {
	if ( $this->reader->isEmptyElement ) {
		return "";
	}

	$collected = "";
	while ( $this->reader->read() ) {
		$nodeType = $this->reader->nodeType;

		if ( $nodeType == XMLReader::END_ELEMENT ) {
			return $collected;
		}

		$isTextual = $nodeType == XMLReader::TEXT
			|| $nodeType == XMLReader::CDATA
			|| $nodeType == XMLReader::SIGNIFICANT_WHITESPACE;
		if ( $isTextual ) {
			$collected .= $this->reader->value;
		}
	}

	// Ran out of input without seeing a closing tag
	$this->reader->close();
	return '';
}
694
/**
 * Primary entry point.
 *
 * Verifies that the document starts with <mediawiki>, then walks the
 * top-level nodes, dispatching <siteinfo>, <page> and <logitem> to their
 * handlers. The reader is closed when processing ends, whether normally or
 * through an exception.
 *
 * @throws MWException If the first node is not <mediawiki>
 * @return bool Always true when no exception was thrown
 */
public function doImport() {
	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (T48932).
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( true );
	try {
		$this->reader->read();

		$rootTag = $this->reader->localName;
		if ( $rootTag != 'mediawiki' ) {
			// phpcs:ignore Generic.PHP.NoSilencedErrors
			@libxml_disable_entity_loader( $oldDisable );
			throw new MWException( "Expected <mediawiki> tag, got " . $rootTag );
		}
		$this->debug( "<mediawiki> tag is correct." );

		$this->debug( "Starting primary dump processing loop." );

		$pagesSeen = 0;
		$more = $this->reader->read();
		while ( $more ) {
			$tag = $this->reader->localName;

			// Honour the page offset: skip nodes until enough pages went by
			if ( $this->pageOffset ) {
				if ( $tag === 'page' ) {
					$pagesSeen++;
				}
				if ( $pagesSeen < $this->pageOffset ) {
					$more = $this->reader->next();
					continue;
				}
			}

			$nodeType = $this->reader->nodeType;
			$skipSubtree = false;

			// A hook returning false means there is nothing further to do here
			if ( $this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
				if ( $tag == 'mediawiki' && $nodeType == XMLReader::END_ELEMENT ) {
					break;
				}

				switch ( $tag ) {
					case 'siteinfo':
						$this->handleSiteInfo();
						break;
					case 'page':
						$this->handlePage();
						break;
					case 'logitem':
						$this->handleLogItem();
						break;
					case '#text':
						break;
					default:
						$this->warn( "Unhandled top-level XML tag $tag" );
						$skipSubtree = true;
				}
			}

			if ( $skipSubtree ) {
				$more = $this->reader->next();
				$this->debug( "Skip" );
			} else {
				$more = $this->reader->read();
			}
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
		$this->reader->close();
	}

	return true;
}
768
/**
 * Read a <siteinfo> element: collect the plain fields and the foreign
 * namespace names, then pass the result to the site info callback.
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );

	// Fields that can just be stuffed in the siteInfo object
	$plainFields = [ 'sitename', 'base', 'generator', 'case' ];
	$collected = [];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'siteinfo';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( $tag == 'namespace' ) {
			// The key attribute has to be read before the contents are consumed
			$nsKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$nsKey] = $this->nodeContents();
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$collected[$tag] = $this->nodeContents();
		}
	}

	$collected['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $collected );
}
795
/**
 * Read a <logitem> element into an array and pass it to processLogItem().
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );

	// Fields that can just be stuffed in the pageInfo object
	$plainFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];
	$logInfo = [];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'logitem';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			$logInfo[$tag] = $this->nodeContents();
			continue;
		}

		if ( $tag == 'contributor' ) {
			$logInfo['contributor'] = $this->handleContributor();
			continue;
		}

		if ( $tag != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $logInfo );
}
825
/**
 * Build a WikiRevision from parsed log item data and hand it to the
 * log item callback.
 *
 * 'type' and 'action' are read unconditionally; all other keys are optional.
 *
 * @param array $logInfo
 * @return mixed Result of logItemCallback()
 */
private function processLogItem( $logInfo ) {
	$logRevision = new WikiRevision( $this->config );

	if ( isset( $logInfo['id'] ) ) {
		$logRevision->setID( $logInfo['id'] );
	}

	$logRevision->setType( $logInfo['type'] );
	$logRevision->setAction( $logInfo['action'] );

	if ( isset( $logInfo['timestamp'] ) ) {
		$logRevision->setTimestamp( $logInfo['timestamp'] );
	}

	if ( isset( $logInfo['params'] ) ) {
		$logRevision->setParams( $logInfo['params'] );
	}

	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$logTitle = Title::newFromText( $logInfo['logtitle'] );
		$logRevision->setTitle( $logTitle );
	}

	$logRevision->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$logRevision->setComment( $logInfo['comment'] );
	}

	if ( isset( $logInfo['contributor']['ip'] ) ) {
		$logRevision->setUserIP( $logInfo['contributor']['ip'] );
	}

	// Fall back to a prefixed placeholder when the dump names no user
	$username = isset( $logInfo['contributor']['username'] )
		? $this->externalUserNames->applyPrefix( $logInfo['contributor']['username'] )
		: $this->externalUserNames->addPrefix( 'Unknown user' );
	$logRevision->setUsername( $username );

	return $this->logItemCallback( $logRevision );
}
870
/**
 * Read a <page> element: collect the page-level fields, resolve the title
 * when the first revision or upload is reached, dispatch each revision and
 * upload, and finally fire the page-out callback if the title was usable.
 */
private function handlePage() {
	// Handle page data.
	$this->debug( "Enter page handler." );
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Fields that can just be stuffed in the pageInfo object
	$plainFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	$skipSubtree = false;
	$titleIsBad = false;

	while ( $skipSubtree ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'page';
		if ( $atClosingTag ) {
			break;
		}

		$skipSubtree = false;
		$tag = $this->reader->localName;

		if ( $titleIsBad ) {
			// The title is invalid, bail out of this page
			$skipSubtree = true;
			continue;
		}

		if ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $plainFields ) ) {
			// An XML snippet:
			// <page>
			//     <id>123</id>
			//     <title>Page</title>
			//     <redirect title="NewTitle"/>
			//     ...
			// Because the redirect tag is built differently, we need special handling for that case.
			$pageInfo[$tag] = $tag == 'redirect'
				? $this->nodeAttribute( 'title' )
				: $this->nodeContents();
			continue;
		}

		if ( $tag == 'revision' || $tag == 'upload' ) {
			if ( !isset( $title ) ) {
				$title = $this->processTitle(
					$pageInfo['title'],
					$pageInfo['ns'] ?? null
				);

				// $title is either an array of two titles or false.
				if ( is_array( $title ) ) {
					$this->pageCallback( $title );
					[ $pageInfo['_title'], $foreignTitle ] = $title;
				} else {
					$titleIsBad = true;
					$skipSubtree = true;
				}
			}

			if ( $title ) {
				if ( $tag == 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
			continue;
		}

		if ( $tag != '#text' ) {
			$this->warn( "Unhandled page XML tag $tag" );
			$skipSubtree = true;
		}
	}

	// @note $pageInfo is only set if a valid $title is processed above with
	//       no error. If we have a valid $title, then pageCallback is called
	//       above, $pageInfo['title'] is set and we do pageOutCallback here.
	//       If $pageInfo['_title'] is not set, then $foreignTitle is also not
	//       set since they both come from $title above.
	if ( array_key_exists( '_title', $pageInfo ) ) {
		$this->pageOutCallback(
			$pageInfo['_title'],
			$foreignTitle,
			$pageInfo['revisionCount'],
			$pageInfo['successfulRevisionCount'],
			$pageInfo
		);
	}
}
955
/**
 * Read a <revision> element, then process it and update the page's
 * revision counters.
 *
 * @param array &$pageInfo 'revisionCount' is always incremented;
 *   'successfulRevisionCount' only when processRevision() returns a truthy value
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );
	$revisionInfo = [];

	$normalFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
		'model', 'format', 'text', 'sha1' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'revision' ) {
			break;
		}

		// Skipping applies to one unknown subtree only. Without this reset a
		// single unhandled tag would switch the rest of the loop to next(),
		// unlike handlePage() and doImport(), which clear the flag each round.
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleRevisionXMLTag(
			$this, $pageInfo, $revisionInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$revisionInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'content' ) {
			// We can have multiple content tags, so make this an array.
			$revisionInfo[$tag][] = $this->handleContent();
		} elseif ( $tag == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			$skip = true;
		}
	}

	$pageInfo['revisionCount']++;
	if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
998
/**
 * Read a <content> element (one slot of a revision) into an array.
 *
 * @return array Values of the 'role', 'origin', 'model', 'format' and 'text'
 *   child elements that were present, plus whatever hooks added
 */
private function handleContent() {
	$this->debug( "Enter content handler" );
	$contentInfo = [];

	$normalFields = [ 'role', 'origin', 'model', 'format', 'text' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'content' ) {
			break;
		}

		// Skipping applies to one unknown subtree only. Without this reset a
		// single unhandled tag would switch the rest of the loop to next(),
		// unlike handlePage() and doImport(), which clear the flag each round.
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleContentXMLTag(
			$this, $contentInfo )
		) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$contentInfo[$tag] = $this->nodeContents();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled content XML tag $tag" );
			$skip = true;
		}
	}

	return $contentInfo;
}
1029
/**
 * Turn parsed revision or slot data into a Content object.
 *
 * @param Title $title Used to pick the default content model when none is given
 * @param int $revisionId Only used in the size error message; falsy if unknown
 * @param array $contentInfo Must contain 'text'; 'model' and 'role' are optional
 * @throws MWException If 'text' is missing or the text exceeds the configured
 *   maximum article size
 * @return Content
 */
private function makeContent( Title $title, $revisionId, $contentInfo ) {
	// Read the limit from the injected configuration, as processTitle() does
	// for its setting, rather than from the $wgMaxArticleSize global.
	$maxArticleSize = $this->config->get( 'MaxArticleSize' );

	if ( !isset( $contentInfo['text'] ) ) {
		throw new MWException( 'Missing text field in import.' );
	}

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Testing for revisions with only listed
	// content models, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	if ( ( !isset( $contentInfo['model'] ) ||
		in_array( $contentInfo['model'], [
			'wikitext',
			'css',
			'json',
			'javascript',
			'text',
			''
		] ) ) &&
		strlen( $contentInfo['text'] ) > $maxArticleSize * 1024
	) {
		throw new MWException( 'The text of ' .
			( $revisionId ?
				"the revision with ID $revisionId" :
				'a revision'
			) . " exceeds the maximum allowable size ($maxArticleSize KiB)" );
	}

	$role = $contentInfo['role'] ?? SlotRecord::MAIN;
	$model = $contentInfo['model'] ?? $this->getDefaultContentModel( $title, $role );
	$handler = $this->getContentHandler( $model );

	$text = $handler->importTransform( $contentInfo['text'] );

	return $handler->unserializeContent( $text );
}
1075
/**
 * Build a WikiRevision from parsed revision data and hand it to the
 * revision callback.
 *
 * @param array $pageInfo Must contain '_title'
 * @param array $revisionInfo
 * @throws MWException If an extra content slot has no role, or from makeContent()
 * @return mixed Result of revisionCallback()
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	$wikiRevision = new WikiRevision( $this->config );

	$revId = $revisionInfo['id'] ?? 0;
	if ( $revId ) {
		$wikiRevision->setID( $revisionInfo['id'] );
	}

	$title = $pageInfo['_title'];
	$wikiRevision->setTitle( $title );

	// The main slot comes from the revision-level fields
	$mainContent = $this->makeContent( $title, $revId, $revisionInfo );
	$wikiRevision->setContent( SlotRecord::MAIN, $mainContent );

	// Any <content> children become additional slots
	$extraSlots = $revisionInfo['content'] ?? [];
	foreach ( $extraSlots as $slotInfo ) {
		if ( !isset( $slotInfo['role'] ) ) {
			throw new MWException( "Missing role for imported slot." );
		}

		$slotContent = $this->makeContent( $title, $revId, $slotInfo );
		$wikiRevision->setContent( $slotInfo['role'], $slotContent );
	}

	$wikiRevision->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$wikiRevision->setComment( $revisionInfo['comment'] );
	}

	if ( isset( $revisionInfo['minor'] ) ) {
		$wikiRevision->setMinor( true );
	}

	// An IP takes precedence over a user name; a placeholder is the last resort
	if ( isset( $revisionInfo['contributor']['ip'] ) ) {
		$wikiRevision->setUserIP( $revisionInfo['contributor']['ip'] );
	} else {
		$username = isset( $revisionInfo['contributor']['username'] )
			? $this->externalUserNames->applyPrefix( $revisionInfo['contributor']['username'] )
			: $this->externalUserNames->addPrefix( 'Unknown user' );
		$wikiRevision->setUsername( $username );
	}

	if ( isset( $revisionInfo['sha1'] ) ) {
		$wikiRevision->setSha1Base36( $revisionInfo['sha1'] );
	}

	$wikiRevision->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $wikiRevision );
}
1129
/**
 * Read an <upload> element and, if upload importing is enabled, process it.
 *
 * Inline base64 <contents> are written to a temporary file. A file found at
 * the image base path plus the upload's <rel> value takes precedence over that.
 *
 * @param array &$pageInfo
 * @return mixed Result of processUpload(), or null when uploads are not imported
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );
	$uploadInfo = [];

	$normalFields = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'upload' ) {
			break;
		}

		// Skipping applies to one unknown subtree only. Without this reset a
		// single unhandled tag would switch the rest of the loop to next(),
		// unlike handlePage() and doImport(), which clear the flag each round.
		$skip = false;

		$tag = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$uploadInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag == 'contents' ) {
			// Read the attribute while the cursor is still on the opening tag;
			// nodeContents() moves the reader on to the closing tag.
			$encoding = $this->reader->getAttribute( 'encoding' );
			$contents = $this->nodeContents();
			if ( $encoding === 'base64' ) {
				$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
				$uploadInfo['isTempSrc'] = true;
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$skip = true;
		}
	}

	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		$path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
		if ( file_exists( $path ) ) {
			$uploadInfo['fileSrc'] = $path;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	if ( $this->mImportUploads ) {
		return $this->processUpload( $pageInfo, $uploadInfo );
	}
}
1182
/**
 * Write the given data to a new temporary file and report where it went.
 *
 * @param string $contents
 * @return string Name of the temporary file, as returned by tempnam()
 */
private function dumpTemp( $contents ) {
	$tempDir = wfTempDir();
	$tempFile = tempnam( $tempDir, 'importupload' );

	file_put_contents( $tempFile, $contents );

	return $tempFile;
}
1192
/**
 * Build a WikiRevision from parsed upload data and hand it to the
 * upload callback.
 *
 * @param array $pageInfo 'id' and '_title' are read
 * @param array $uploadInfo 'timestamp', 'filename', 'src', 'size' and 'comment'
 *   are read unconditionally; the remaining keys are optional
 * @return mixed Return value of the upload callback
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$uploadRevision = new WikiRevision( $this->config );
	$revId = $pageInfo['id'];
	$title = $pageInfo['_title'];
	$content = $this->makeContent( $title, $revId, $uploadInfo );

	$uploadRevision->setTitle( $title );
	$uploadRevision->setID( $revId );
	$uploadRevision->setTimestamp( $uploadInfo['timestamp'] );
	$uploadRevision->setContent( SlotRecord::MAIN, $content );
	$uploadRevision->setFilename( $uploadInfo['filename'] );

	if ( isset( $uploadInfo['archivename'] ) ) {
		$uploadRevision->setArchiveName( $uploadInfo['archivename'] );
	}

	$uploadRevision->setSrc( $uploadInfo['src'] );

	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$isTemporary = !empty( $uploadInfo['isTempSrc'] );
		$uploadRevision->setFileSrc( $uploadInfo['fileSrc'], $isTemporary );
	}

	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$uploadRevision->setSha1Base36( $uploadInfo['sha1base36'] );
	}

	$uploadRevision->setSize( intval( $uploadInfo['size'] ) );
	$uploadRevision->setComment( $uploadInfo['comment'] );

	// Unlike for revisions, both the IP and the user name are applied if present
	if ( isset( $uploadInfo['contributor']['ip'] ) ) {
		$uploadRevision->setUserIP( $uploadInfo['contributor']['ip'] );
	}
	if ( isset( $uploadInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix(
			$uploadInfo['contributor']['username']
		);
		$uploadRevision->setUsername( $prefixedName );
	}

	$uploadRevision->setNoUpdates( $this->mNoUpdates );

	return call_user_func( $this->mUploadCallback, $uploadRevision );
}
1236
/**
 * Read a <contributor> element.
 *
 * @return array Values of the 'id', 'ip' and 'username' child elements that
 *   were present; empty for an empty element
 */
private function handleContributor() {
	$this->debug( "Enter contributor handler." );

	if ( $this->reader->isEmptyElement ) {
		return [];
	}

	$knownFields = [ 'id', 'ip', 'username' ];
	$contributor = [];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'contributor';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;
		if ( !in_array( $tag, $knownFields ) ) {
			continue;
		}

		$contributor[$tag] = $this->nodeContents();
	}

	return $contributor;
}
1265
/**
 * Convert a title from the dump into a local title and check that it may
 * be imported.
 *
 * A notice is emitted and false returned when the title is invalid, is an
 * interwiki title, cannot exist, or (outside command line mode) may not be
 * edited by the current request user.
 *
 * @param string $text Title text from the dump
 * @param string|null $ns Namespace from the dump; converted with intval()
 * @return array|bool [ local title, ForeignTitle ] on success, false otherwise
 */
private function processTitle( $text, $ns = null ) {
	// Without siteinfo namespaces we can only guess from the text
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory( $this->contentLanguage )
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
	$title = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	$commandLineMode = $this->config->get( 'CommandLineMode' );

	if ( $title === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}

	if ( $title->isExternal() ) {
		$this->notice( 'import-error-interwiki', $title->getPrefixedText() );
		return false;
	}

	if ( !$title->canExist() ) {
		$this->notice( 'import-error-special', $title->getPrefixedText() );
		return false;
	}

	if ( !$commandLineMode ) {
		$user = RequestContext::getMain()->getUser();
		$mayEdit = $this->permissionManager->userCan( 'edit', $user, $title );

		if ( !$mayEdit ) {
			# Do not import if the importing wiki user cannot edit this page
			$this->notice( 'import-error-edit', $title->getPrefixedText() );

			return false;
		}
	}

	return [ $title, $foreignTitle ];
}
1311
/**
 * Look up the content handler for a content model via the injected factory.
 *
 * @param string $model
 * @return ContentHandler
 */
private function getContentHandler( $model ) {
	$handler = $this->contentHandlerFactory->getContentHandler( $model );

	return $handler;
}
1319
/**
 * Ask the slot role's handler which content model it uses by default
 * on the given title.
 *
 * @param Title $title
 * @param string $role
 * @return string
 */
private function getDefaultContentModel( $title, $role ) {
	$roleHandler = $this->slotRoleRegistry->getRoleHandler( $role );

	return $roleHandler->getDefaultModel( $title );
}
1331}
$wgMaxArticleSize
Maximum article size in kibibytes.
const NS_MAIN
Definition Defines.php:64
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTempDir()
Tries to get the system directory for temporary files.
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest If source is NULL, it just returns the val...
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Class to parse and build external user names.
Reporting callback.
Internationalisation code See https://www.mediawiki.org/wiki/Special:MyLanguage/Localisation for more...
Definition Language.php:42
Exception representing a failure to serialize or unserialize a content object.
MediaWiki exception.
Helper class for mapping value objects representing basic entities to cache keys.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
A service class for checking permissions To obtain an instance, use MediaWikiServices::getInstance()-...
Value object representing a content slot associated with a page revision.
A registry service for SlotRoleHandlers, used to define which slot roles are available on which page.
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
This is a utility class for dealing with namespaces that encodes all the "magic" behaviors of them ba...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Creates Title objects.
Represents a title within MediaWiki.
Definition Title.php:48
static registerSource(ImportSource $source)
XML file reader for the page data importer.
callable $mLogItemCallback
setImportUploads( $import)
doImport()
Primary entry point.
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
ExternalUserNames $externalUserNames
setNoUpdates( $noupdates)
Set 'no updates' mode.
setLogItemCallback( $callback)
Sets the action to perform as each log item is reached.
pageOutCallback(PageIdentity $pageIdentity, $foreignTitle, $revCount, $sucCount, $pageInfo)
Notify the callback function when a "</page>" is closed.
callable $mUploadCallback
PermissionManager $permissionManager
HookRunner $hookRunner
array|null $foreignNamespaces
importUpload( $revision)
Dummy for now...
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
bool|null $mDebug
callable $mPageOutCallback
dumpTemp( $contents)
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
getContentHandler( $model)
callable|null $mNoticeCallback
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
processLogItem( $logInfo)
TitleFactory $titleFactory
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
UploadRevisionImporter $uploadRevisionImporter
setPageOffset( $nthPage)
Sets 'pageOffset' value.
string|null $mImageBasePath
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
WikiPageFactory $wikiPageFactory
array $countableCache
importLogItem( $revision)
Default per-revision callback, performs the import.
NamespaceInfo $namespaceInfo
setImageBasePath( $dir)
handleUpload(&$pageInfo)
getDefaultContentModel( $title, $role)
handleRevision(&$pageInfo)
revisionCallback( $revision)
Notify the callback function of a revision.
callable $mPageCallback
logItemCallback( $revision)
Notify the callback function of a new log item.
throwXmlError( $err)
Language $contentLanguage
setDebug( $debug)
Set debug mode...
bool|null $mImportUploads
__construct(ImportSource $source, Config $config, HookContainer $hookContainer, Language $contentLanguage, NamespaceInfo $namespaceInfo, TitleFactory $titleFactory, WikiPageFactory $wikiPageFactory, UploadRevisionImporter $uploadRevisionImporter, PermissionManager $permissionManager, IContentHandlerFactory $contentHandlerFactory, SlotRoleRegistry $slotRoleRegistry)
Creates an ImportXMLReader drawing from the source provided.
IContentHandlerFactory $contentHandlerFactory
processRevision( $pageInfo, $revisionInfo)
processUpload( $pageInfo, $uploadInfo)
callable|null $mSiteInfoCallback
bool $disableStatisticsUpdate
processTitle( $text, $ns=null)
makeContent(Title $title, $revisionId, $contentInfo)
XMLReader $reader
notice( $msg,... $params)
importRevision( $revision)
Default per-revision callback, performs the import.
ImportTitleFactory $importTitleFactory
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
setNoticeCallback( $callback)
Set a callback that displays notice messages.
finishImportPage(PageIdentity $pageIdentity, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
callable $mRevisionCallback
disableStatisticsUpdate()
Statistics updates can take a lot of time.
siteInfoCallback( $siteInfo)
Notify the callback function of site info.
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
SlotRoleRegistry $slotRoleRegistry
Represents a revision, log entry or upload during the import process.
Interface for configuration instances.
Definition Config.php:30
Source interface for XML import.
Represents an object that can convert page titles on a foreign wiki (ForeignTitle objects) into page ...
Interface for objects (potentially) representing an editable wiki page.
$debug
Definition mcc.php:31
$source
$content
Definition router.php:76