MediaWiki REL1_35
WikiImporter.php
Go to the documentation of this file.
1<?php
/**
 * XML file reader for the page data importer.
 */
class WikiImporter {
	/** @var XMLReader|null Reader opened on the import source by the constructor */
	private $reader;

	/** @var array|null Namespace key => name map collected from <siteinfo>; null until one is read */
	private $foreignNamespaces = null;

	/** @var callable|null Invoked for every <logitem> */
	private $mLogItemCallback;

	/** @var callable|null Invoked for every <upload> when uploads are imported */
	private $mUploadCallback;

	/** @var callable|null Invoked for every <revision> */
	private $mRevisionCallback;

	/** @var callable|null Invoked when the first revision/upload of a <page> is reached */
	private $mPageCallback;

	/** @var callable|null Invoked once <siteinfo> has been read */
	private $mSiteInfoCallback;

	/** @var callable|null Invoked when a successfully titled <page> has been fully read */
	private $mPageOutCallback;

	/** @var callable|null Receives notice messages; wfDebug() is used when unset */
	private $mNoticeCallback;

	/** @var bool|null When truthy, debug() writes to the debug log */
	private $mDebug;

	/** @var bool|null When truthy, <upload> elements are handed to the upload callback */
	private $mImportUploads;

	/** @var string|null Directory against which an upload's <rel> path is resolved */
	private $mImageBasePath;

	/** @var bool Forwarded to WikiRevision::setNoUpdates() for every imported item */
	private $mNoUpdates = false;

	/** @var int Top-level nodes are skipped until this many <page> tags have been counted */
	private $pageOffset = 0;

	/** @var Config */
	private $config;

	/** @var ImportTitleFactory Converts ForeignTitle objects into local titles */
	private $importTitleFactory;

	/** @var HookRunner */
	private $hookRunner;

	/** @var array Countable state per page before import, keyed by 'title_' . prefixed title */
	private $countableCache = [];

	/** @var bool When true, finishImportPage() skips the article count adjustment */
	private $disableStatisticsUpdate = false;

	/** @var ExternalUserNames Applies the import prefix to contributor names */
	private $externalUserNames;
59
/**
 * Creates an ImportXMLReader drawing from the source provided.
 *
 * @param ImportSource $source Source the XML dump is read from
 * @param Config $config Configuration; also handed to every WikiRevision created here
 * @throws Exception If the XMLReader class is not available
 * @throws MWException If the XML reader cannot be opened on the source
 */
public function __construct( ImportSource $source, Config $config ) {
	if ( !class_exists( 'XMLReader' ) ) {
		throw new Exception( 'Import requires PHP to have been compiled with libxml support' );
	}

	$this->config = $config;
	$this->hookRunner = Hooks::runner();

	if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
		stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
	}
	// Register the source with the stream wrapper; the returned ID is what the
	// uploadsource:// URL below refers to.
	$id = UploadSourceAdapter::registerSource( $source );

	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( false );
	if ( PHP_VERSION_ID >= 80000 ) {
		// A static call is now preferred, and avoids https://github.com/php/php-src/issues/11548
		$reader = XMLReader::open(
			"uploadsource://$id", null, LIBXML_PARSEHUGE );
		if ( $reader instanceof XMLReader ) {
			$this->reader = $reader;
			$status = true;
		} else {
			$status = false;
		}
	} else {
		// A static call generated a deprecation warning prior to PHP 8.0
		$this->reader = new XMLReader;
		$status = $this->reader->open(
			"uploadsource://$id", null, LIBXML_PARSEHUGE );
	}
	if ( !$status ) {
		// libxml_get_last_error() returns false when libxml recorded nothing
		$error = libxml_get_last_error();
		$detail = $error ? $error->message : 'unknown libxml error';
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
		throw new MWException( 'Encountered an internal error while initializing WikiImporter object: ' .
			$detail );
	}
	// phpcs:ignore Generic.PHP.NoSilencedErrors
	@libxml_disable_entity_loader( $oldDisable );

	// Default callbacks
	$this->setPageCallback( [ $this, 'beforeImportPage' ] );
	$this->setRevisionCallback( [ $this, "importRevision" ] );
	$this->setUploadCallback( [ $this, 'importUpload' ] );
	$this->setLogItemCallback( [ $this, 'importLogItem' ] );
	$this->setPageOutCallback( [ $this, 'finishImportPage' ] );

	$this->importTitleFactory = new NaiveImportTitleFactory();
	$this->externalUserNames = new ExternalUserNames( 'imported', false );
}
120
/**
 * @return XMLReader|null The reader this importer pulls XML nodes from
 */
public function getReader() {
	$xmlReader = $this->reader;

	return $xmlReader;
}
127
/**
 * Report an XML error. Despite the name nothing is thrown: the message goes
 * through debug() and is also written to the debug log unconditionally.
 *
 * @param string $err
 */
public function throwXmlError( $err ) {
	$this->debug( "FAILURE: {$err}" );
	wfDebug( "WikiImporter XML error: {$err}" );
}
132
/**
 * Write a message to the debug log, but only while debug mode is on.
 *
 * @param string $data
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}

	wfDebug( "IMPORT: {$data}" );
}
138
/**
 * Write a warning to the debug log regardless of the debug mode setting.
 *
 * @param string $data
 */
public function warn( $data ) {
	wfDebug( "IMPORT: {$data}" );
}
142
/**
 * Emit a notice, either through the registered notice callback or, when no
 * callable is registered, through the debug log.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters; handed to the callback as one array
 */
public function notice( $msg, ...$params ) {
	if ( !is_callable( $this->mNoticeCallback ) ) {
		// No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		wfDebug( wfMessage( $msg, $params )->text() );
		return;
	}

	call_user_func( $this->mNoticeCallback, $msg, $params );
}
152
/**
 * Set debug mode; while enabled, debug() messages reach the debug log.
 *
 * @param bool $debug
 */
public function setDebug( $debug ) {
	$this->mDebug = $debug;
}
160
/**
 * Set 'no updates' mode. The flag is passed on to every WikiRevision that
 * this importer creates.
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
168
/**
 * Sets 'pageOffset' value. doImport() skips top-level nodes until it has
 * counted this many <page> tags.
 *
 * @param int $nthPage
 */
public function setPageOffset( $nthPage ) {
	$this->pageOffset = $nthPage;
}
178
/**
 * Set a callback that displays notice messages.
 *
 * @param callable $callback
 * @return callable|null Whatever wfSetVar() reports as the previous value
 */
public function setNoticeCallback( $callback ) {
	$formerCallback = wfSetVar( $this->mNoticeCallback, $callback );

	return $formerCallback;
}
188
/**
 * Sets the action to perform as each new page in the stream is reached.
 *
 * @param callable $callback
 * @return callable|null The callback that was registered before
 */
public function setPageCallback( $callback ) {
	$former = $this->mPageCallback;
	$this->mPageCallback = $callback;

	return $former;
}
199
/**
 * Sets the action to perform as each page in the stream is completed.
 * The callback receives the arguments given to pageOutCallback().
 *
 * @param callable $callback
 * @return callable|null The callback that was registered before
 */
public function setPageOutCallback( $callback ) {
	$former = $this->mPageOutCallback;
	$this->mPageOutCallback = $callback;

	return $former;
}
214
/**
 * Sets the action to perform as each page revision is reached.
 *
 * @param callable $callback
 * @return callable|null The callback that was registered before
 */
public function setRevisionCallback( $callback ) {
	$former = $this->mRevisionCallback;
	$this->mRevisionCallback = $callback;

	return $former;
}
225
/**
 * Sets the action to perform as each file upload version is reached.
 *
 * @param callable $callback
 * @return callable|null The callback that was registered before
 */
public function setUploadCallback( $callback ) {
	$former = $this->mUploadCallback;
	$this->mUploadCallback = $callback;

	return $former;
}
236
/**
 * Sets the action to perform as each log item reached.
 *
 * @param callable $callback
 * @return callable|null The callback that was registered before
 */
public function setLogItemCallback( $callback ) {
	$former = $this->mLogItemCallback;
	$this->mLogItemCallback = $callback;

	return $former;
}
247
/**
 * Sets the action to perform when site info is encountered.
 *
 * @param callable $callback
 * @return callable|null The callback that was registered before
 */
public function setSiteInfoCallback( $callback ) {
	$former = $this->mSiteInfoCallback;
	$this->mSiteInfoCallback = $callback;

	return $former;
}
258
/**
 * Sets the factory object to use to convert ForeignTitle objects into local
 * Title objects.
 *
 * @param ImportTitleFactory $factory
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
267
/**
 * Set a target namespace to override the defaults.
 *
 * @param null|int $namespace Namespace ID to import everything into, or null
 *  to stop overriding namespaces
 * @return bool True on success, false when the namespace is negative or unknown
 */
public function setTargetNamespace( $namespace ) {
	if ( $namespace === null ) {
		// Don't override namespaces: go back to the plain factory so that any
		// previously installed override no longer applies.
		$this->setImportTitleFactory( new NaiveImportTitleFactory() );
		return true;
	} elseif (
		$namespace >= 0 &&
		MediaWikiServices::getInstance()->getNamespaceInfo()->exists( intval( $namespace ) )
	) {
		$namespace = intval( $namespace );
		$this->setImportTitleFactory( new NamespaceImportTitleFactory( $namespace ) );
		return true;
	} else {
		return false;
	}
}
289
/**
 * Set a target root page under which all pages are imported.
 *
 * @param null|string $rootpage Root page title; null removes any override,
 *  an empty string leaves the current setting untouched
 * @return Status Good on success; fatal when the title is invalid, external,
 *  or in a namespace without subpages
 */
public function setTargetRootPage( $rootpage ) {
	$status = Status::newGood();
	if ( $rootpage === null ) {
		// No rootpage
		$this->setImportTitleFactory( new NaiveImportTitleFactory() );
	} elseif ( $rootpage !== '' ) {
		$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
		$title = Title::newFromText( $rootpage );

		if ( !$title || $title->isExternal() ) {
			$status->fatal( 'import-rootpage-invalid' );
		} elseif (
			!MediaWikiServices::getInstance()->getNamespaceInfo()->
				hasSubpages( $title->getNamespace() )
		) {
			$displayNSText = $title->getNamespace() == NS_MAIN
				? wfMessage( 'blanknamespace' )->text()
				: MediaWikiServices::getInstance()->getContentLanguage()->
					getNsText( $title->getNamespace() );
			$status->fatal( 'import-rootpage-nosubpage', $displayNSText );
		} else {
			// set namespace to 'all', so the namespace check in processTitle() can pass
			$this->setTargetNamespace( null );
			// Must come after the namespace reset so that the subpage factory
			// is the one left installed.
			$this->setImportTitleFactory( new SubpageImportTitleFactory( $title ) );
		}
	}
	return $status;
}
323
/**
 * Set the directory against which an upload's <rel> path is resolved when
 * looking for the file on disk.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
330
/**
 * Choose whether <upload> elements are passed on to the upload callback.
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
337
/**
 * Replace the ExternalUserNames helper used to prefix contributor names.
 *
 * @param string $usernamePrefix Passed as the first ExternalUserNames argument
 * @param bool $assignKnownUsers Passed as the second ExternalUserNames argument
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$names = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
	$this->externalUserNames = $names;
}
346
/**
 * Turn off the article count adjustment that finishImportPage() otherwise
 * performs for every imported page.
 */
public function disableStatisticsUpdate() {
	$this->disableStatisticsUpdate = true;
}
354
/**
 * Default per-page callback. Records whether the page counts as an article
 * before anything is imported, so finishImportPage() can compare afterwards.
 *
 * @param array $titleAndForeignTitle Element 0 is the local Title
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$localTitle = $titleAndForeignTitle[0];
	$wikiPage = WikiPage::factory( $localTitle );
	$cacheKey = 'title_' . $localTitle->getPrefixedText();

	$this->countableCache[$cacheKey] = $wikiPage->isCountable();

	return true;
}
367
/**
 * Default per-revision callback, performs the import.
 *
 * @param WikiRevision $revision
 * @return bool Result of WikiRevision::importOldRevision(); false when the
 *  content model cannot be used on the title or the content fails to unserialize
 */
public function importRevision( $revision ) {
	$modelFitsTitle = $revision->getContentHandler()->canBeUsedOn( $revision->getTitle() );
	if ( !$modelFitsTitle ) {
		$this->notice(
			'import-error-bad-location',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);

		return false;
	}

	try {
		return $revision->importOldRevision();
	} catch ( MWContentSerializationException $ex ) {
		$this->notice(
			'import-error-unserialize',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat()
		);

		return false;
	}
}
396
/**
 * Default per-log-item callback, performs the import.
 *
 * @param WikiRevision $revision
 * @return mixed Whatever WikiRevision::importLogItem() returns
 */
public function importLogItem( $revision ) {
	$outcome = $revision->importLogItem();

	return $outcome;
}
405
/**
 * Default per-upload callback; delegates to the revision object.
 *
 * @param WikiRevision $revision
 * @return mixed Whatever WikiRevision::importUpload() returns
 */
public function importUpload( $revision ) {
	$outcome = $revision->importUpload();

	return $outcome;
}
414
/**
 * Default page-out callback: adjusts the article count if needed and then
 * runs the AfterImportPage hook. Mostly for hook use.
 *
 * @param Title $title
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sRevCount
 * @param array $pageInfo
 * @return bool Result of the AfterImportPage hook
 */
public function finishImportPage( $title, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$wikiPage = WikiPage::factory( $title );
		$wikiPage->loadPageData( 'fromdbmaster' );
		$currentContent = $wikiPage->getContent();

		if ( $currentContent !== null ) {
			$preparedEdit = $wikiPage->prepareContentForEdit( $currentContent );
			$cacheKey = 'title_' . $title->getPrefixedText();
			$countableNow = $wikiPage->isCountable( $preparedEdit );

			// Only pages seen by beforeImportPage() have a recorded prior state
			$hasPriorState = array_key_exists( $cacheKey, $this->countableCache );
			if ( $hasPriorState && $countableNow != $this->countableCache[$cacheKey] ) {
				$articleDelta = (int)$countableNow - (int)$this->countableCache[$cacheKey];
				DeferredUpdates::addUpdate(
					SiteStatsUpdate::factory( [ 'articles' => ( $articleDelta ) ] )
				);
			}
		} else {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $title .
				' because WikiPage::getContent() returned null' );
		}
	}

	return $this->hookRunner->onAfterImportPage(
		$title, $foreignTitle, $revCount, $sRevCount, $pageInfo
	);
}
456
/**
 * Alternate per-revision callback, for debugging. Dumps the revision's
 * fields through debug() instead of importing anything.
 *
 * @param WikiRevision &$revision
 */
public function debugRevisionHandler( &$revision ) {
	$this->debug( "Got revision:" );

	$titleLine = is_object( $revision->title )
		? "-- Title: " . $revision->title->getPrefixedText()
		: "-- Title: <invalid>";
	$this->debug( $titleLine );

	$this->debug( "-- User: " . $revision->user_text );
	$this->debug( "-- Timestamp: " . $revision->timestamp );
	$this->debug( "-- Comment: " . $revision->comment );
	$this->debug( "-- Text: " . $revision->text );
}
473
/**
 * Notify the callback function of site info.
 *
 * @param array $siteInfo
 * @return mixed The callback's return value, or false when none is registered
 */
private function siteInfoCallback( $siteInfo ) {
	if ( !isset( $this->mSiteInfoCallback ) ) {
		return false;
	}

	$arguments = [ $siteInfo, $this ];

	return call_user_func_array( $this->mSiteInfoCallback, $arguments );
}
487
/**
 * Notify the callback function when a new "<page>" is reached.
 *
 * @param array $title The [ Title, ForeignTitle ] pair produced by processTitle()
 */
public function pageCallback( $title ) {
	if ( !isset( $this->mPageCallback ) ) {
		return;
	}

	call_user_func( $this->mPageCallback, $title );
}
497
/**
 * Notify the callback function when a "</page>" is closed.
 *
 * @param Title $title
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sucCount Number of revisions for which callback returned true
 * @param array $pageInfo Associative array of page information
 */
private function pageOutCallback( $title, $foreignTitle, $revCount,
	$sucCount, $pageInfo
) {
	if ( !isset( $this->mPageOutCallback ) ) {
		return;
	}

	// Forward every argument exactly as received
	$forwarded = func_get_args();
	call_user_func_array( $this->mPageOutCallback, $forwarded );
}
512
/**
 * Notify the callback function of a revision.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's return value, or false when none is registered
 */
private function revisionCallback( $revision ) {
	if ( !isset( $this->mRevisionCallback ) ) {
		return false;
	}

	$arguments = [ $revision, $this ];

	return call_user_func_array( $this->mRevisionCallback, $arguments );
}
526
/**
 * Notify the callback function of a new log item.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's return value, or false when none is registered
 */
private function logItemCallback( $revision ) {
	if ( !isset( $this->mLogItemCallback ) ) {
		return false;
	}

	$arguments = [ $revision, $this ];

	return call_user_func_array( $this->mLogItemCallback, $arguments );
}
540
/**
 * Retrieves the contents of the named attribute of the current element.
 *
 * @param string $attr The name of the attribute
 * @return string|null Whatever XMLReader::getAttribute() returns for it
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );

	return $value;
}
550
/**
 * Fetches text contents of the current element, concatenating every text,
 * CDATA and significant-whitespace node read before the next end tag.
 *
 * @return string The collected text; '' for an empty element or when the
 *  input ends before an end tag is found (the reader is closed in that case)
 */
public function nodeContents() {
	if ( $this->reader->isEmptyElement ) {
		return "";
	}

	$textualTypes = [
		XMLReader::TEXT,
		XMLReader::CDATA,
		XMLReader::SIGNIFICANT_WHITESPACE,
	];

	$collected = "";
	while ( $this->reader->read() ) {
		$type = $this->reader->nodeType;
		if ( $type === XMLReader::END_ELEMENT ) {
			return $collected;
		}
		if ( in_array( $type, $textualTypes, true ) ) {
			$collected .= $this->reader->value;
		}
	}

	// Ran out of input without seeing an end tag
	$this->reader->close();
	return '';
}
578
/**
 * Primary entry point. Reads the dump from the reader and dispatches every
 * top-level element to its handler.
 *
 * @throws MWException If the document does not start with a <mediawiki> tag
 * @return bool Always true when no exception is thrown
 */
public function doImport() {
	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (T48932).
	// phpcs:ignore Generic.PHP.NoSilencedErrors -- suppress deprecation per T268847
	$oldDisable = @libxml_disable_entity_loader( true );
	try {
		$this->reader->read();

		if ( $this->reader->localName != 'mediawiki' ) {
			// The finally block restores the entity loader and closes the reader
			throw new MWException(
				"Expected '<mediawiki>' tag, got '<{$this->reader->localName}>' tag."
			);
		}
		$this->debug( "<mediawiki> tag is correct." );

		$this->debug( "Starting primary dump processing loop." );

		$keepReading = $this->reader->read();
		$skip = false;
		$pageCount = 0;
		while ( $keepReading ) {
			$tag = $this->reader->localName;
			if ( $this->pageOffset ) {
				if ( $tag === 'page' ) {
					$pageCount++;
				}
				// Jump over whole subtrees until the requested page is reached
				if ( $pageCount < $this->pageOffset ) {
					$keepReading = $this->reader->next();
					continue;
				}
			}
			$type = $this->reader->nodeType;

			if ( !$this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
				// Do nothing
			} elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
				break;
			} elseif ( $tag == 'siteinfo' ) {
				$this->handleSiteInfo();
			} elseif ( $tag == 'page' ) {
				$this->handlePage();
			} elseif ( $tag == 'logitem' ) {
				$this->handleLogItem();
			} elseif ( $tag != '#text' ) {
				$this->warn( "Unhandled top-level XML tag $tag" );

				$skip = true;
			}

			if ( $skip ) {
				// next() steps over the unhandled element's subtree
				$keepReading = $this->reader->next();
				$skip = false;
				$this->debug( "Skip" );
			} else {
				$keepReading = $this->reader->read();
			}
		}
	} finally {
		// phpcs:ignore Generic.PHP.NoSilencedErrors
		@libxml_disable_entity_loader( $oldDisable );
		$this->reader->close();
	}

	return true;
}
654
/**
 * Read a <siteinfo> element, remember its namespaces and pass the collected
 * data to the site info callback.
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );
	$collected = [];

	// Tags whose text content is copied into the result under the tag name
	$plainTags = [ 'sitename', 'base', 'generator', 'case' ];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'siteinfo';
		if ( $atClosingTag ) {
			break;
		}

		$name = $this->reader->localName;

		if ( $name == 'namespace' ) {
			// The attribute has to be read before the element's text is consumed
			$namespaceKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$namespaceKey] = $this->nodeContents();
		} elseif ( in_array( $name, $plainTags ) ) {
			$collected[$name] = $this->nodeContents();
		}
	}

	$collected['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $collected );
}
681
/**
 * Read a <logitem> element into an array and hand it to processLogItem().
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );
	$entry = [];

	// Tags whose text content is copied into the result under the tag name
	$plainTags = [
		'id', 'comment', 'type', 'action', 'timestamp', 'logtitle', 'params',
	];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'logitem';
		if ( $atClosingTag ) {
			break;
		}

		$name = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $entry ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $name, $plainTags ) ) {
			$entry[$name] = $this->nodeContents();
		} elseif ( $name == 'contributor' ) {
			$entry['contributor'] = $this->handleContributor();
		} elseif ( $name != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $name" );
		}
	}

	$this->processLogItem( $entry );
}
711
/**
 * Build a WikiRevision from parsed log item data and pass it to the log
 * item callback.
 *
 * @param array $logInfo Data collected by handleLogItem(); 'type' and
 *  'action' are read unconditionally, everything else is optional
 * @return mixed Result of logItemCallback()
 */
private function processLogItem( $logInfo ) {
	$logEntry = new WikiRevision( $this->config );

	if ( isset( $logInfo['id'] ) ) {
		$logEntry->setID( $logInfo['id'] );
	}
	$logEntry->setType( $logInfo['type'] );
	$logEntry->setAction( $logInfo['action'] );
	if ( isset( $logInfo['timestamp'] ) ) {
		$logEntry->setTimestamp( $logInfo['timestamp'] );
	}
	if ( isset( $logInfo['params'] ) ) {
		$logEntry->setParams( $logInfo['params'] );
	}
	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$logEntry->setTitle( Title::newFromText( $logInfo['logtitle'] ) );
	}

	$logEntry->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$logEntry->setComment( $logInfo['comment'] );
	}

	if ( isset( $logInfo['contributor']['ip'] ) ) {
		$logEntry->setUserIP( $logInfo['contributor']['ip'] );
	}

	// A user name is always set, independently of whether an IP was given
	if ( isset( $logInfo['contributor']['username'] ) ) {
		$userName = $this->externalUserNames->applyPrefix( $logInfo['contributor']['username'] );
	} else {
		$userName = $this->externalUserNames->addPrefix( 'Unknown user' );
	}
	$logEntry->setUsername( $userName );

	return $this->logItemCallback( $logEntry );
}
756
/**
 * Read a <page> element: collect its plain fields, resolve the title when
 * the first revision or upload is reached, dispatch every revision/upload,
 * and finally fire the page-out callback if the title was accepted.
 */
private function handlePage() {
	// Handle page data.
	$this->debug( "Enter page handler." );
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Fields that can just be stuffed in the pageInfo object
	$plainTags = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	$skipSubtree = false;
	$titleRejected = false;
	// null until the first <revision>/<upload> is seen; afterwards either the
	// [ Title, ForeignTitle ] pair from processTitle() or false
	$titlePair = null;

	while ( $skipSubtree ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'page';
		if ( $atClosingTag ) {
			break;
		}

		$skipSubtree = false;

		$name = $this->reader->localName;

		if ( $titleRejected ) {
			// The title is invalid, bail out of this page
			$skipSubtree = true;
		} elseif ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
			// Do nothing
		} elseif ( in_array( $name, $plainTags ) ) {
			// An XML snippet:
			// <page>
			//     <id>123</id>
			//     <title>Page</title>
			//     <redirect title="NewTitle"/>
			//     ...
			// Because the redirect tag is built differently, we need special handling for that case.
			$pageInfo[$name] = $name == 'redirect'
				? $this->nodeAttribute( 'title' )
				: $this->nodeContents();
		} elseif ( $name == 'revision' || $name == 'upload' ) {
			if ( $titlePair === null ) {
				$titlePair = $this->processTitle(
					$pageInfo['title'],
					$pageInfo['ns'] ?? null
				);

				// $titlePair is either an array of two titles or false.
				if ( is_array( $titlePair ) ) {
					$this->pageCallback( $titlePair );
					[ $pageInfo['_title'], $foreignTitle ] = $titlePair;
				} else {
					$titleRejected = true;
					$skipSubtree = true;
				}
			}

			if ( $titlePair ) {
				if ( $name == 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
		} elseif ( $name != '#text' ) {
			$this->warn( "Unhandled page XML tag $name" );
			$skipSubtree = true;
		}
	}

	// @note $pageInfo['_title'] is only set if a valid title was processed
	// above, in which case pageCallback() has been called and $foreignTitle
	// is set as well, since both come from the same processTitle() result.
	if ( array_key_exists( '_title', $pageInfo ) ) {
		$this->pageOutCallback(
			$pageInfo['_title'],
			$foreignTitle,
			$pageInfo['revisionCount'],
			$pageInfo['successfulRevisionCount'],
			$pageInfo
		);
	}
}
836
/**
 * Read a <revision> element, process it and update the page's revision
 * counters.
 *
 * @param array &$pageInfo Page data; 'revisionCount' is always incremented,
 *  'successfulRevisionCount' only when processRevision() returns a truthy value
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );
	$revisionInfo = [];

	// Tags whose text content is copied into the result under the tag name
	$plainTags = [
		'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
		'model', 'format', 'text', 'sha1',
	];

	// Once set by an unhandled tag this stays set for the rest of the element
	$skipSubtree = false;

	while ( $skipSubtree ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'revision';
		if ( $atClosingTag ) {
			break;
		}

		$name = $this->reader->localName;

		$proceed = $this->hookRunner->onImportHandleRevisionXMLTag(
			$this, $pageInfo, $revisionInfo
		);
		if ( !$proceed ) {
			// Do nothing
			continue;
		}

		if ( in_array( $name, $plainTags ) ) {
			$revisionInfo[$name] = $this->nodeContents();
		} elseif ( $name == 'content' ) {
			// We can have multiple content tags, so make this an array.
			$revisionInfo[$name][] = $this->handleContent();
		} elseif ( $name == 'contributor' ) {
			$revisionInfo['contributor'] = $this->handleContributor();
		} elseif ( $name != '#text' ) {
			$this->warn( "Unhandled revision XML tag $name" );
			$skipSubtree = true;
		}
	}

	$pageInfo['revisionCount']++;
	if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
879
/**
 * Read a <content> element (one slot of a revision) into an array.
 *
 * @return array Slot data keyed by tag name
 */
private function handleContent() {
	$this->debug( "Enter content handler" );
	$slotInfo = [];

	// Tags whose text content is copied into the result under the tag name
	$plainTags = [ 'role', 'origin', 'model', 'format', 'text' ];

	// Once set by an unhandled tag this stays set for the rest of the element
	$skipSubtree = false;

	while ( $skipSubtree ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'content';
		if ( $atClosingTag ) {
			break;
		}

		$name = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleContentXMLTag( $this, $slotInfo ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $name, $plainTags ) ) {
			$slotInfo[$name] = $this->nodeContents();
		} elseif ( $name != '#text' ) {
			$this->warn( "Unhandled content XML tag $name" );
			$skipSubtree = true;
		}
	}

	return $slotInfo;
}
910
/**
 * Turn the serialized text of a revision or slot into a Content object.
 *
 * @param Title $title Used to pick a default content model when none is given
 * @param int $revisionId Only used in the size error message; may be falsy
 * @param array $contentInfo Needs 'text'; 'model' and 'role' are optional
 * @throws MWException If 'text' is missing or exceeds $wgMaxArticleSize
 * @return Content
 */
private function makeContent( Title $title, $revisionId, $contentInfo ) {
	global $wgMaxArticleSize;

	if ( !isset( $contentInfo['text'] ) ) {
		throw new MWException( 'Missing text field in import.' );
	}

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Testing for revisions with only listed
	// content models, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	$sizeCheckedModels = [
		'wikitext',
		'css',
		'json',
		'javascript',
		'text',
		''
	];
	$subjectToLimit = !isset( $contentInfo['model'] )
		|| in_array( $contentInfo['model'], $sizeCheckedModels );

	if ( $subjectToLimit && strlen( $contentInfo['text'] ) > $wgMaxArticleSize * 1024 ) {
		$what = $revisionId ? "the revision with ID $revisionId" : 'a revision';
		throw new MWException( 'The text of ' . $what .
			" exceeds the maximum allowable size ($wgMaxArticleSize KB)" );
	}

	$role = $contentInfo['role'] ?? SlotRecord::MAIN;
	$model = $contentInfo['model'] ?? $this->getDefaultContentModel( $title, $role );
	$handler = $this->getContentHandler( $model );

	$transformedText = $handler->importTransform( $contentInfo['text'] );

	return $handler->unserializeContent( $transformedText );
}
958
/**
 * Build a WikiRevision from parsed revision data and pass it to the
 * revision callback.
 *
 * @param array $pageInfo Page data; '_title' must hold the local Title
 * @param array $revisionInfo Data collected by handleRevision()
 * @throws MWException If an extra slot has no role, or content cannot be made
 * @return mixed Result of revisionCallback()
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	$revision = new WikiRevision( $this->config );

	$revId = $revisionInfo['id'] ?? 0;
	if ( $revId ) {
		$revision->setID( $revId );
	}

	$title = $pageInfo['_title'];
	$revision->setTitle( $title );

	// The revision's own text/model fields describe the main slot
	$mainContent = $this->makeContent( $title, $revId, $revisionInfo );
	$revision->setContent( SlotRecord::MAIN, $mainContent );

	$extraSlots = $revisionInfo['content'] ?? [];
	foreach ( $extraSlots as $slotInfo ) {
		if ( !isset( $slotInfo['role'] ) ) {
			throw new MWException( "Missing role for imported slot." );
		}

		$slotContent = $this->makeContent( $title, $revId, $slotInfo );
		$revision->setContent( $slotInfo['role'], $slotContent );
	}
	$revision->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$revision->setComment( $revisionInfo['comment'] );
	}

	// The mere presence of the tag marks the edit as minor
	if ( isset( $revisionInfo['minor'] ) ) {
		$revision->setMinor( true );
	}

	if ( isset( $revisionInfo['contributor']['ip'] ) ) {
		$revision->setUserIP( $revisionInfo['contributor']['ip'] );
	} elseif ( isset( $revisionInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix(
			$revisionInfo['contributor']['username']
		);
		$revision->setUsername( $prefixedName );
	} else {
		$revision->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}

	if ( isset( $revisionInfo['sha1'] ) ) {
		$revision->setSha1Base36( $revisionInfo['sha1'] );
	}
	$revision->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $revision );
}
1012
/**
 * Read an <upload> element and, when upload importing is enabled, process it.
 *
 * @param array &$pageInfo Page data, also handed to the upload tag hook
 * @return mixed Result of processUpload(), or null when uploads are not imported
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );
	$uploadInfo = [];

	// Tags whose text content is copied into the result under the tag name
	$plainTags = [
		'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel',
	];

	// Once set by an unhandled tag this stays set for the rest of the element
	$skipSubtree = false;

	while ( $skipSubtree ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'upload';
		if ( $atClosingTag ) {
			break;
		}

		$name = $this->reader->localName;

		if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $name, $plainTags ) ) {
			$uploadInfo[$name] = $this->nodeContents();
		} elseif ( $name == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
		} elseif ( $name == 'contents' ) {
			$rawContents = $this->nodeContents();
			$encoding = $this->reader->getAttribute( 'encoding' );
			// Embedded file data is only used when it is base64-encoded
			if ( $encoding === 'base64' ) {
				$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $rawContents ) );
				$uploadInfo['isTempSrc'] = true;
			}
		} elseif ( $name != '#text' ) {
			$this->warn( "Unhandled upload XML tag $name" );
			$skipSubtree = true;
		}
	}

	// A file found under the image base path replaces any embedded contents
	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		$localPath = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
		if ( file_exists( $localPath ) ) {
			$uploadInfo['fileSrc'] = $localPath;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	if ( !$this->mImportUploads ) {
		return null;
	}

	return $this->processUpload( $pageInfo, $uploadInfo );
}
1065
/**
 * Write data to a newly created temporary file.
 *
 * @param string $contents
 * @return string Name of the temporary file as returned by tempnam()
 */
private function dumpTemp( $contents ) {
	$tempPath = tempnam( wfTempDir(), 'importupload' );
	file_put_contents( $tempPath, $contents );

	return $tempPath;
}
1075
/**
 * Build a WikiRevision from parsed upload data and pass it to the upload
 * callback.
 *
 * @param array $pageInfo Page data; 'id' and '_title' are read
 * @param array $uploadInfo Data collected by handleUpload()
 * @return mixed Return value of the upload callback
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$upload = new WikiRevision( $this->config );
	$pageId = $pageInfo['id'];
	$title = $pageInfo['_title'];

	// T292348: text key may be absent, force addition if null
	$uploadInfo['text'] = $uploadInfo['text'] ?? '';
	$description = $this->makeContent( $title, $pageId, $uploadInfo );

	$upload->setTitle( $title );
	$upload->setID( $pageId );
	$upload->setTimestamp( $uploadInfo['timestamp'] );
	$upload->setContent( SlotRecord::MAIN, $description );
	$upload->setFilename( $uploadInfo['filename'] );
	if ( isset( $uploadInfo['archivename'] ) ) {
		$upload->setArchiveName( $uploadInfo['archivename'] );
	}
	$upload->setSrc( $uploadInfo['src'] );
	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$isTemporary = !empty( $uploadInfo['isTempSrc'] );
		$upload->setFileSrc( $uploadInfo['fileSrc'], $isTemporary );
	}
	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$upload->setSha1Base36( $uploadInfo['sha1base36'] );
	}
	$upload->setSize( intval( $uploadInfo['size'] ) );
	$upload->setComment( $uploadInfo['comment'] );

	// IP and user name are independent here: both are set when both are present
	if ( isset( $uploadInfo['contributor']['ip'] ) ) {
		$upload->setUserIP( $uploadInfo['contributor']['ip'] );
	}
	if ( isset( $uploadInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix(
			$uploadInfo['contributor']['username']
		);
		$upload->setUsername( $prefixedName );
	}
	$upload->setNoUpdates( $this->mNoUpdates );

	return call_user_func( $this->mUploadCallback, $upload );
}
1120
/**
 * Read a <contributor> element into an array.
 *
 * @return array Any of 'id', 'ip' and 'username' that were present; empty
 *  for an empty element
 */
private function handleContributor() {
	$this->debug( "Enter contributor handler." );
	$knownTags = [ 'id', 'ip', 'username' ];
	$contributor = [];

	// An empty element has no end tag to read up to
	if ( $this->reader->isEmptyElement ) {
		return $contributor;
	}

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT
			&& $this->reader->localName == 'contributor';
		if ( $atClosingTag ) {
			break;
		}

		$name = $this->reader->localName;
		if ( !in_array( $name, $knownTags ) ) {
			continue;
		}

		$contributor[$name] = $this->nodeContents();
	}

	return $contributor;
}
1147
/**
 * Convert a title from the dump into a local title and check that it may be
 * imported.
 *
 * @param string $text Title text from the dump
 * @param string|null $ns Namespace ID from the dump, if any
 * @return array|bool [ local title, ForeignTitle ] on success; false after
 *  emitting a notice when the title is rejected
 */
private function processTitle( $text, $ns = null ) {
	// Without site info from the dump there is no namespace map to work with
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory()
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
	$title = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	$commandLineMode = $this->config->get( 'CommandLineMode' );

	if ( $title === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}

	if ( $title->isExternal() ) {
		$this->notice( 'import-error-interwiki', $title->getPrefixedText() );
		return false;
	}

	if ( !$title->canExist() ) {
		$this->notice( 'import-error-special', $title->getPrefixedText() );
		return false;
	}

	// Permission checks are skipped in command line mode
	if ( !$commandLineMode ) {
		$permissionManager = MediaWikiServices::getInstance()->getPermissionManager();
		$user = RequestContext::getMain()->getUser();

		if ( !$permissionManager->userCan( 'edit', $user, $title ) ) {
			# Do not import if the importing wiki user cannot edit this page
			$this->notice( 'import-error-edit', $title->getPrefixedText() );

			return false;
		}

		if ( !$title->exists() && !$permissionManager->userCan( 'create', $user, $title ) ) {
			# Do not import if the importing wiki user cannot create this page
			$this->notice( 'import-error-create', $title->getPrefixedText() );

			return false;
		}
	}

	return [ $title, $foreignTitle ];
}
1199
/**
 * Look up the content handler for a content model.
 *
 * @param string $model Content model ID
 * @return ContentHandler
 */
private function getContentHandler( $model ) {
	$handlerFactory = MediaWikiServices::getInstance()->getContentHandlerFactory();

	return $handlerFactory->getContentHandler( $model );
}
1209
/**
 * Ask the slot role's handler for the default content model on a title.
 *
 * @param Title $title
 * @param string $role Slot role name
 * @return string Content model ID
 */
private function getDefaultContentModel( $title, $role ) {
	$roleRegistry = MediaWikiServices::getInstance()->getSlotRoleRegistry();
	$roleHandler = $roleRegistry->getRoleHandler( $role );

	return $roleHandler->getDefaultModel( $title );
}
1222}
$wgMaxArticleSize
Maximum article size in kilobytes.
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTempDir()
Tries to get the system directory for temporary files.
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest If source is NULL, it just returns the val...
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Class to parse and build external user names.
Reporting callback.
Exception representing a failure to serialize or unserialize a content object.
MediaWiki exception.
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
MediaWikiServices is the service locator for the application scope of MediaWiki.
Value object representing a content slot associated with a page revision.
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Represents a title within MediaWiki.
Definition Title.php:42
static registerSource(ImportSource $source)
XML file reader for the page data importer.
finishImportPage( $title, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
setImportUploads( $import)
doImport()
Primary entry point.
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
ExternalUserNames $externalUserNames
setNoUpdates( $noupdates)
Set 'no updates' mode.
pageOutCallback( $title, $foreignTitle, $revCount, $sucCount, $pageInfo)
Notify the callback function when a "</page>" is closed.
setLogItemCallback( $callback)
Sets the action to perform as each log item reached.
HookRunner $hookRunner
importUpload( $revision)
Dummy for now...
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
dumpTemp( $contents)
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
getContentHandler( $model)
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
processLogItem( $logInfo)
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
setPageOffset( $nthPage)
Sets 'pageOffset' value.
debugRevisionHandler(&$revision)
Alternate per-revision callback, for debugging.
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
array $countableCache
importLogItem( $revision)
Default per-revision callback, performs the import.
XMLReader null $reader
setImageBasePath( $dir)
handleUpload(&$pageInfo)
getDefaultContentModel( $title, $role)
handleRevision(&$pageInfo)
revisionCallback( $revision)
Notify the callback function of a revision.
logItemCallback( $revision)
Notify the callback function of a new log item.
throwXmlError( $err)
setDebug( $debug)
Set debug mode...
processRevision( $pageInfo, $revisionInfo)
processUpload( $pageInfo, $uploadInfo)
bool $disableStatisticsUpdate
processTitle( $text, $ns=null)
makeContent(Title $title, $revisionId, $contentInfo)
notice( $msg,... $params)
importRevision( $revision)
Default per-revision callback, performs the import.
ImportTitleFactory $importTitleFactory
__construct(ImportSource $source, Config $config)
Creates an ImportXMLReader drawing from the source provided.
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
setNoticeCallback( $callback)
Set a callback that displays notice messages.
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
disableStatisticsUpdate()
Statistics updates can take a lot of time.
siteInfoCallback( $siteInfo)
Notify the callback function of site info.
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
Represents a revision, log entry or upload during the import process.
const NS_MAIN
Definition Defines.php:70
Interface for configuration instances.
Definition Config.php:30
Source interface for XML import.
Represents an object that can convert page titles on a foreign wiki (ForeignTitle objects) into page ...
$debug
Definition mcc.php:31
$source
$content
Definition router.php:76