MediaWiki  master
WikiImporter.php
Go to the documentation of this file.
1 <?php
30 
37 class WikiImporter {
39  private $reader;
40  private $foreignNamespaces = null;
45  private $mNoUpdates = false;
46  private $pageOffset = 0;
48  private $config;
52  private $hookRunner;
54  private $countableCache = [];
56  private $disableStatisticsUpdate = false;
59 
67  if ( !class_exists( 'XMLReader' ) ) {
68  throw new Exception( 'Import requires PHP to have been compiled with libxml support' );
69  }
70 
71  $this->reader = new XMLReader();
72  $this->config = $config;
73  $this->hookRunner = Hooks::runner();
74 
75  if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
76  stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
77  }
79 
80  // Enable the entity loader, as it is needed for loading external URLs via
81  // XMLReader::open (T86036)
82  $oldDisable = libxml_disable_entity_loader( false );
83  if ( defined( 'LIBXML_PARSEHUGE' ) ) {
84  $status = $this->reader->open( "uploadsource://$id", null, LIBXML_PARSEHUGE );
85  } else {
86  $status = $this->reader->open( "uploadsource://$id" );
87  }
88  if ( !$status ) {
89  $error = libxml_get_last_error();
90  libxml_disable_entity_loader( $oldDisable );
91  throw new MWException( 'Encountered an internal error while initializing WikiImporter object: ' .
92  $error->message );
93  }
94  libxml_disable_entity_loader( $oldDisable );
95 
96  // Default callbacks
97  $this->setPageCallback( [ $this, 'beforeImportPage' ] );
98  $this->setRevisionCallback( [ $this, "importRevision" ] );
99  $this->setUploadCallback( [ $this, 'importUpload' ] );
100  $this->setLogItemCallback( [ $this, 'importLogItem' ] );
101  $this->setPageOutCallback( [ $this, 'finishImportPage' ] );
102 
103  $this->importTitleFactory = new NaiveImportTitleFactory();
104  $this->externalUserNames = new ExternalUserNames( 'imported', false );
105  }
106 
110  public function getReader() {
111  return $this->reader;
112  }
113 
114  public function throwXmlError( $err ) {
115  $this->debug( "FAILURE: $err" );
116  wfDebug( "WikiImporter XML error: $err" );
117  }
118 
119  public function debug( $data ) {
120  if ( $this->mDebug ) {
121  wfDebug( "IMPORT: $data" );
122  }
123  }
124 
125  public function warn( $data ) {
126  wfDebug( "IMPORT: $data" );
127  }
128 
129  public function notice( $msg, ...$params ) {
130  if ( is_callable( $this->mNoticeCallback ) ) {
131  call_user_func( $this->mNoticeCallback, $msg, $params );
132  } else { # No ImportReporter -> CLI
133  // T177997: the command line importers should call setNoticeCallback()
134  // for their own custom callback to echo the notice
135  wfDebug( wfMessage( $msg, $params )->text() );
136  }
137  }
138 
143  public function setDebug( $debug ) {
144  $this->mDebug = $debug;
145  }
146 
151  public function setNoUpdates( $noupdates ) {
152  $this->mNoUpdates = $noupdates;
153  }
154 
161  public function setPageOffset( $nthPage ) {
162  $this->pageOffset = $nthPage;
163  }
164 
171  public function setNoticeCallback( $callback ) {
172  return wfSetVar( $this->mNoticeCallback, $callback );
173  }
174 
180  public function setPageCallback( $callback ) {
181  $previous = $this->mPageCallback;
182  $this->mPageCallback = $callback;
183  return $previous;
184  }
185 
195  public function setPageOutCallback( $callback ) {
196  $previous = $this->mPageOutCallback;
197  $this->mPageOutCallback = $callback;
198  return $previous;
199  }
200 
206  public function setRevisionCallback( $callback ) {
207  $previous = $this->mRevisionCallback;
208  $this->mRevisionCallback = $callback;
209  return $previous;
210  }
211 
217  public function setUploadCallback( $callback ) {
218  $previous = $this->mUploadCallback;
219  $this->mUploadCallback = $callback;
220  return $previous;
221  }
222 
228  public function setLogItemCallback( $callback ) {
229  $previous = $this->mLogItemCallback;
230  $this->mLogItemCallback = $callback;
231  return $previous;
232  }
233 
239  public function setSiteInfoCallback( $callback ) {
240  $previous = $this->mSiteInfoCallback;
241  $this->mSiteInfoCallback = $callback;
242  return $previous;
243  }
244 
250  public function setImportTitleFactory( $factory ) {
251  $this->importTitleFactory = $factory;
252  }
253 
259  public function setTargetNamespace( $namespace ) {
260  if ( $namespace === null ) {
261  // Don't override namespaces
263  return true;
264  } elseif (
265  $namespace >= 0 &&
266  MediaWikiServices::getInstance()->getNamespaceInfo()->exists( intval( $namespace ) )
267  ) {
268  $namespace = intval( $namespace );
269  $this->setImportTitleFactory( new NamespaceImportTitleFactory( $namespace ) );
270  return true;
271  } else {
272  return false;
273  }
274  }
275 
281  public function setTargetRootPage( $rootpage ) {
282  $status = Status::newGood();
283  if ( $rootpage === null ) {
284  // No rootpage
286  } elseif ( $rootpage !== '' ) {
287  $rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
288  $title = Title::newFromText( $rootpage );
289 
290  if ( !$title || $title->isExternal() ) {
291  $status->fatal( 'import-rootpage-invalid' );
292  } elseif (
293  !MediaWikiServices::getInstance()->getNamespaceInfo()->
294  hasSubpages( $title->getNamespace() )
295  ) {
296  $displayNSText = $title->getNamespace() === NS_MAIN
297  ? wfMessage( 'blanknamespace' )->text()
298  : MediaWikiServices::getInstance()->getContentLanguage()->
299  getNsText( $title->getNamespace() );
300  $status->fatal( 'import-rootpage-nosubpage', $displayNSText );
301  } else {
302  // set namespace to 'all', so the namespace check in processTitle() can pass
303  $this->setTargetNamespace( null );
305  }
306  }
307  return $status;
308  }
309 
313  public function setImageBasePath( $dir ) {
314  $this->mImageBasePath = $dir;
315  }
316 
320  public function setImportUploads( $import ) {
321  $this->mImportUploads = $import;
322  }
323 
329  public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
330  $this->externalUserNames = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
331  }
332 
337  public function disableStatisticsUpdate() {
338  $this->disableStatisticsUpdate = true;
339  }
340 
347  public function beforeImportPage( $titleAndForeignTitle ) {
348  $title = $titleAndForeignTitle[0];
349  $page = WikiPage::factory( $title );
350  $this->countableCache['title_' . $title->getPrefixedText()] = $page->isCountable();
351  return true;
352  }
353 
359  public function importRevision( $revision ) {
360  if ( !$revision->getContentHandler()->canBeUsedOn( $revision->getTitle() ) ) {
361  $this->notice( 'import-error-bad-location',
362  $revision->getTitle()->getPrefixedText(),
363  $revision->getID(),
364  $revision->getModel(),
365  $revision->getFormat() );
366 
367  return false;
368  }
369 
370  try {
371  return $revision->importOldRevision();
372  } catch ( MWContentSerializationException $ex ) {
373  $this->notice( 'import-error-unserialize',
374  $revision->getTitle()->getPrefixedText(),
375  $revision->getID(),
376  $revision->getModel(),
377  $revision->getFormat() );
378  }
379 
380  return false;
381  }
382 
388  public function importLogItem( $revision ) {
389  return $revision->importLogItem();
390  }
391 
397  public function importUpload( $revision ) {
398  return $revision->importUpload();
399  }
400 
410  public function finishImportPage( $title, $foreignTitle, $revCount,
411  $sRevCount, $pageInfo
412  ) {
413  // Update article count statistics (T42009)
414  // The normal counting logic in WikiPage->doEditUpdates() is designed for
415  // one-revision-at-a-time editing, not bulk imports. In this situation it
416  // suffers from issues of replica DB lag. We let WikiPage handle the total page
417  // and revision count, and we implement our own custom logic for the
418  // article (content page) count.
419  if ( !$this->disableStatisticsUpdate ) {
420  $page = WikiPage::factory( $title );
421  $page->loadPageData( 'fromdbmaster' );
422  $content = $page->getContent();
423  if ( $content === null ) {
424  wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $title .
425  ' because WikiPage::getContent() returned null' );
426  } else {
427  $editInfo = $page->prepareContentForEdit( $content );
428  $countKey = 'title_' . $title->getPrefixedText();
429  $countable = $page->isCountable( $editInfo );
430  if ( array_key_exists( $countKey, $this->countableCache ) &&
431  $countable != $this->countableCache[$countKey] ) {
433  'articles' => ( (int)$countable - (int)$this->countableCache[$countKey] )
434  ] ) );
435  }
436  }
437  }
438 
439  return $this->hookRunner->onAfterImportPage( $title, $foreignTitle,
440  $revCount, $sRevCount, $pageInfo );
441  }
442 
447  public function debugRevisionHandler( &$revision ) {
448  $this->debug( "Got revision:" );
449  if ( is_object( $revision->title ) ) {
450  $this->debug( "-- Title: " . $revision->title->getPrefixedText() );
451  } else {
452  $this->debug( "-- Title: <invalid>" );
453  }
454  $this->debug( "-- User: " . $revision->user_text );
455  $this->debug( "-- Timestamp: " . $revision->timestamp );
456  $this->debug( "-- Comment: " . $revision->comment );
457  $this->debug( "-- Text: " . $revision->text );
458  }
459 
465  private function siteInfoCallback( $siteInfo ) {
466  if ( isset( $this->mSiteInfoCallback ) ) {
467  return call_user_func_array( $this->mSiteInfoCallback,
468  [ $siteInfo, $this ] );
469  } else {
470  return false;
471  }
472  }
473 
478  public function pageCallback( $title ) {
479  if ( isset( $this->mPageCallback ) ) {
480  call_user_func( $this->mPageCallback, $title );
481  }
482  }
483 
492  private function pageOutCallback( $title, $foreignTitle, $revCount,
493  $sucCount, $pageInfo ) {
494  if ( isset( $this->mPageOutCallback ) ) {
495  call_user_func_array( $this->mPageOutCallback, func_get_args() );
496  }
497  }
498 
504  private function revisionCallback( $revision ) {
505  if ( isset( $this->mRevisionCallback ) ) {
506  return call_user_func_array( $this->mRevisionCallback,
507  [ $revision, $this ] );
508  } else {
509  return false;
510  }
511  }
512 
518  private function logItemCallback( $revision ) {
519  if ( isset( $this->mLogItemCallback ) ) {
520  return call_user_func_array( $this->mLogItemCallback,
521  [ $revision, $this ] );
522  } else {
523  return false;
524  }
525  }
526 
533  public function nodeAttribute( $attr ) {
534  return $this->reader->getAttribute( $attr );
535  }
536 
544  public function nodeContents() {
545  if ( $this->reader->isEmptyElement ) {
546  return "";
547  }
548  $buffer = "";
549  while ( $this->reader->read() ) {
550  switch ( $this->reader->nodeType ) {
551  case XMLReader::TEXT:
552  case XMLReader::CDATA:
553  case XMLReader::SIGNIFICANT_WHITESPACE:
554  $buffer .= $this->reader->value;
555  break;
556  case XMLReader::END_ELEMENT:
557  return $buffer;
558  }
559  }
560 
561  $this->reader->close();
562  return '';
563  }
564 
571  public function doImport() {
572  // Calls to reader->read need to be wrapped in calls to
573  // libxml_disable_entity_loader() to avoid local file
574  // inclusion attacks (T48932).
575  $oldDisable = libxml_disable_entity_loader( true );
576  $rethrow = null;
577  try {
578  $this->reader->read();
579 
580  if ( $this->reader->localName != 'mediawiki' ) {
581  libxml_disable_entity_loader( $oldDisable );
582  throw new MWException( "Expected <mediawiki> tag, got " .
583  $this->reader->localName );
584  }
585  $this->debug( "<mediawiki> tag is correct." );
586 
587  $this->debug( "Starting primary dump processing loop." );
588 
589  $keepReading = $this->reader->read();
590  $skip = false;
591  $pageCount = 0;
592  while ( $keepReading ) {
593  $tag = $this->reader->localName;
594  if ( $this->pageOffset ) {
595  if ( $tag === 'page' ) {
596  $pageCount++;
597  }
598  if ( $pageCount < $this->pageOffset ) {
599  $keepReading = $this->reader->next();
600  continue;
601  }
602  }
603  $type = $this->reader->nodeType;
604 
605  if ( !$this->hookRunner->onImportHandleToplevelXMLTag( $this ) ) {
606  // Do nothing
607  } elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
608  break;
609  } elseif ( $tag == 'siteinfo' ) {
610  $this->handleSiteInfo();
611  } elseif ( $tag == 'page' ) {
612  $this->handlePage();
613  } elseif ( $tag == 'logitem' ) {
614  $this->handleLogItem();
615  } elseif ( $tag != '#text' ) {
616  $this->warn( "Unhandled top-level XML tag $tag" );
617 
618  $skip = true;
619  }
620 
621  if ( $skip ) {
622  $keepReading = $this->reader->next();
623  $skip = false;
624  $this->debug( "Skip" );
625  } else {
626  $keepReading = $this->reader->read();
627  }
628  }
629  } catch ( Exception $ex ) {
630  $rethrow = $ex;
631  }
632 
633  // finally
634  libxml_disable_entity_loader( $oldDisable );
635  $this->reader->close();
636 
637  if ( $rethrow ) {
638  throw $rethrow;
639  }
640 
641  return true;
642  }
643 
644  private function handleSiteInfo() {
645  $this->debug( "Enter site info handler." );
646  $siteInfo = [];
647 
648  // Fields that can just be stuffed in the siteInfo object
649  $normalFields = [ 'sitename', 'base', 'generator', 'case' ];
650 
651  while ( $this->reader->read() ) {
652  if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
653  $this->reader->localName == 'siteinfo' ) {
654  break;
655  }
656 
657  $tag = $this->reader->localName;
658 
659  if ( $tag == 'namespace' ) {
660  $this->foreignNamespaces[$this->nodeAttribute( 'key' )] =
661  $this->nodeContents();
662  } elseif ( in_array( $tag, $normalFields ) ) {
663  $siteInfo[$tag] = $this->nodeContents();
664  }
665  }
666 
667  $siteInfo['_namespaces'] = $this->foreignNamespaces;
668  $this->siteInfoCallback( $siteInfo );
669  }
670 
671  private function handleLogItem() {
672  $this->debug( "Enter log item handler." );
673  $logInfo = [];
674 
675  // Fields that can just be stuffed in the pageInfo object
676  $normalFields = [ 'id', 'comment', 'type', 'action', 'timestamp',
677  'logtitle', 'params' ];
678 
679  while ( $this->reader->read() ) {
680  if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
681  $this->reader->localName == 'logitem' ) {
682  break;
683  }
684 
685  $tag = $this->reader->localName;
686 
687  if ( !$this->hookRunner->onImportHandleLogItemXMLTag( $this, $logInfo ) ) {
688  // Do nothing
689  } elseif ( in_array( $tag, $normalFields ) ) {
690  $logInfo[$tag] = $this->nodeContents();
691  } elseif ( $tag == 'contributor' ) {
692  $logInfo['contributor'] = $this->handleContributor();
693  } elseif ( $tag != '#text' ) {
694  $this->warn( "Unhandled log-item XML tag $tag" );
695  }
696  }
697 
698  $this->processLogItem( $logInfo );
699  }
700 
705  private function processLogItem( $logInfo ) {
706  $revision = new WikiRevision( $this->config );
707 
708  if ( isset( $logInfo['id'] ) ) {
709  $revision->setID( $logInfo['id'] );
710  }
711  $revision->setType( $logInfo['type'] );
712  $revision->setAction( $logInfo['action'] );
713  if ( isset( $logInfo['timestamp'] ) ) {
714  $revision->setTimestamp( $logInfo['timestamp'] );
715  }
716  if ( isset( $logInfo['params'] ) ) {
717  $revision->setParams( $logInfo['params'] );
718  }
719  if ( isset( $logInfo['logtitle'] ) ) {
720  // @todo Using Title for non-local titles is a recipe for disaster.
721  // We should use ForeignTitle here instead.
722  $revision->setTitle( Title::newFromText( $logInfo['logtitle'] ) );
723  }
724 
725  $revision->setNoUpdates( $this->mNoUpdates );
726 
727  if ( isset( $logInfo['comment'] ) ) {
728  $revision->setComment( $logInfo['comment'] );
729  }
730 
731  if ( isset( $logInfo['contributor']['ip'] ) ) {
732  $revision->setUserIP( $logInfo['contributor']['ip'] );
733  }
734 
735  if ( !isset( $logInfo['contributor']['username'] ) ) {
736  $revision->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
737  } else {
738  $revision->setUsername(
739  $this->externalUserNames->applyPrefix( $logInfo['contributor']['username'] )
740  );
741  }
742 
743  return $this->logItemCallback( $revision );
744  }
745 
746  private function handlePage() {
747  // Handle page data.
748  $this->debug( "Enter page handler." );
749  $pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];
750 
751  // Fields that can just be stuffed in the pageInfo object
752  $normalFields = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];
753 
754  $skip = false;
755  $badTitle = false;
756 
757  while ( $skip ? $this->reader->next() : $this->reader->read() ) {
758  if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
759  $this->reader->localName == 'page' ) {
760  break;
761  }
762 
763  $skip = false;
764 
765  $tag = $this->reader->localName;
766 
767  if ( $badTitle ) {
768  // The title is invalid, bail out of this page
769  $skip = true;
770  } elseif ( !$this->hookRunner->onImportHandlePageXMLTag( $this, $pageInfo ) ) {
771  // Do nothing
772  } elseif ( in_array( $tag, $normalFields ) ) {
773  // An XML snippet:
774  // <page>
775  // <id>123</id>
776  // <title>Page</title>
777  // <redirect title="NewTitle"/>
778  // ...
779  // Because the redirect tag is built differently, we need special handling for that case.
780  if ( $tag == 'redirect' ) {
781  $pageInfo[$tag] = $this->nodeAttribute( 'title' );
782  } else {
783  $pageInfo[$tag] = $this->nodeContents();
784  }
785  } elseif ( $tag == 'revision' || $tag == 'upload' ) {
786  if ( !isset( $title ) ) {
787  $title = $this->processTitle( $pageInfo['title'],
788  $pageInfo['ns'] ?? null );
789 
790  // $title is either an array of two titles or false.
791  if ( is_array( $title ) ) {
792  $this->pageCallback( $title );
793  list( $pageInfo['_title'], $foreignTitle ) = $title;
794  } else {
795  $badTitle = true;
796  $skip = true;
797  }
798  }
799 
800  if ( $title ) {
801  if ( $tag == 'revision' ) {
802  $this->handleRevision( $pageInfo );
803  } else {
804  $this->handleUpload( $pageInfo );
805  }
806  }
807  } elseif ( $tag != '#text' ) {
808  $this->warn( "Unhandled page XML tag $tag" );
809  $skip = true;
810  }
811  }
812 
813  // @note $pageInfo is only set if a valid $title is processed above with
814  // no error. If we have a valid $title, then pageCallback is called
815  // above, $pageInfo['title'] is set and we do pageOutCallback here.
816  // If $pageInfo['_title'] is not set, then $foreignTitle is also not
817  // set since they both come from $title above.
818  if ( array_key_exists( '_title', $pageInfo ) ) {
819  $this->pageOutCallback( $pageInfo['_title'], $foreignTitle,
820  $pageInfo['revisionCount'],
821  $pageInfo['successfulRevisionCount'],
822  $pageInfo );
823  }
824  }
825 
829  private function handleRevision( &$pageInfo ) {
830  $this->debug( "Enter revision handler" );
831  $revisionInfo = [];
832 
833  $normalFields = [ 'id', 'parentid', 'timestamp', 'comment', 'minor', 'origin',
834  'model', 'format', 'text', 'sha1' ];
835 
836  $skip = false;
837 
838  while ( $skip ? $this->reader->next() : $this->reader->read() ) {
839  if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
840  $this->reader->localName == 'revision' ) {
841  break;
842  }
843 
844  $tag = $this->reader->localName;
845 
846  if ( !$this->hookRunner->onImportHandleRevisionXMLTag(
847  $this, $pageInfo, $revisionInfo )
848  ) {
849  // Do nothing
850  } elseif ( in_array( $tag, $normalFields ) ) {
851  $revisionInfo[$tag] = $this->nodeContents();
852  } elseif ( $tag == 'content' ) {
853  // We can have multiple content tags, so make this an array.
854  $revisionInfo[$tag][] = $this->handleContent();
855  } elseif ( $tag == 'contributor' ) {
856  $revisionInfo['contributor'] = $this->handleContributor();
857  } elseif ( $tag != '#text' ) {
858  $this->warn( "Unhandled revision XML tag $tag" );
859  $skip = true;
860  }
861  }
862 
863  $pageInfo['revisionCount']++;
864  if ( $this->processRevision( $pageInfo, $revisionInfo ) ) {
865  $pageInfo['successfulRevisionCount']++;
866  }
867  }
868 
869  private function handleContent() {
870  $this->debug( "Enter content handler" );
871  $contentInfo = [];
872 
873  $normalFields = [ 'role', 'origin', 'model', 'format', 'text' ];
874 
875  $skip = false;
876 
877  while ( $skip ? $this->reader->next() : $this->reader->read() ) {
878  if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
879  $this->reader->localName == 'content' ) {
880  break;
881  }
882 
883  $tag = $this->reader->localName;
884 
885  if ( !$this->hookRunner->onImportHandleContentXMLTag(
886  $this, $contentInfo )
887  ) {
888  // Do nothing
889  } elseif ( in_array( $tag, $normalFields ) ) {
890  $contentInfo[$tag] = $this->nodeContents();
891  } elseif ( $tag != '#text' ) {
892  $this->warn( "Unhandled content XML tag $tag" );
893  $skip = true;
894  }
895  }
896 
897  return $contentInfo;
898  }
899 
908  private function makeContent( Title $title, $revisionId, $contentInfo ) {
909  global $wgMaxArticleSize;
910 
911  if ( !isset( $contentInfo['text'] ) ) {
912  throw new MWException( 'Missing text field in import.' );
913  }
914 
915  // Make sure revisions won't violate $wgMaxArticleSize, which could lead to
916  // database errors and instability. Testing for revisions with only listed
917  // content models, as other content models might use serialization formats
918  // which aren't checked against $wgMaxArticleSize.
919  if ( ( !isset( $contentInfo['model'] ) ||
920  in_array( $contentInfo['model'], [
921  'wikitext',
922  'css',
923  'json',
924  'javascript',
925  'text',
926  ''
927  ] ) ) &&
928  strlen( $contentInfo['text'] ) > $wgMaxArticleSize * 1024
929  ) {
930  throw new MWException( 'The text of ' .
931  ( $revisionId ?
932  "the revision with ID $revisionId" :
933  'a revision'
934  ) . " exceeds the maximum allowable size ($wgMaxArticleSize KB)" );
935  }
936 
937  $role = $contentInfo['role'] ?? SlotRecord::MAIN;
938  $model = $contentInfo['model'] ?? $this->getDefaultContentModel( $title, $role );
939  $handler = $this->getContentHandler( $model );
940 
941  $text = $handler->importTransform( $contentInfo['text'] );
942 
943  $content = $handler->unserializeContent( $text );
944 
945  return $content;
946  }
947 
954  private function processRevision( $pageInfo, $revisionInfo ) {
955  $revision = new WikiRevision( $this->config );
956 
957  $revId = $revisionInfo['id'] ?? 0;
958  if ( $revId ) {
959  $revision->setID( $revisionInfo['id'] );
960  }
961 
962  $title = $pageInfo['_title'];
963  $revision->setTitle( $title );
964 
965  $content = $this->makeContent( $title, $revId, $revisionInfo );
966  $revision->setContent( SlotRecord::MAIN, $content );
967 
968  foreach ( $revisionInfo['content'] ?? [] as $slotInfo ) {
969  if ( !isset( $slotInfo['role'] ) ) {
970  throw new MWException( "Missing role for imported slot." );
971  }
972 
973  $content = $this->makeContent( $title, $revId, $slotInfo );
974  $revision->setContent( $slotInfo['role'], $content );
975  }
976  $revision->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );
977 
978  if ( isset( $revisionInfo['comment'] ) ) {
979  $revision->setComment( $revisionInfo['comment'] );
980  }
981 
982  if ( isset( $revisionInfo['minor'] ) ) {
983  $revision->setMinor( true );
984  }
985  if ( isset( $revisionInfo['contributor']['ip'] ) ) {
986  $revision->setUserIP( $revisionInfo['contributor']['ip'] );
987  } elseif ( isset( $revisionInfo['contributor']['username'] ) ) {
988  $revision->setUsername(
989  $this->externalUserNames->applyPrefix( $revisionInfo['contributor']['username'] )
990  );
991  } else {
992  $revision->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
993  }
994  if ( isset( $revisionInfo['sha1'] ) ) {
995  $revision->setSha1Base36( $revisionInfo['sha1'] );
996  }
997  $revision->setNoUpdates( $this->mNoUpdates );
998 
999  return $this->revisionCallback( $revision );
1000  }
1001 
1006  private function handleUpload( &$pageInfo ) {
1007  $this->debug( "Enter upload handler" );
1008  $uploadInfo = [];
1009 
1010  $normalFields = [ 'timestamp', 'comment', 'filename', 'text',
1011  'src', 'size', 'sha1base36', 'archivename', 'rel' ];
1012 
1013  $skip = false;
1014 
1015  while ( $skip ? $this->reader->next() : $this->reader->read() ) {
1016  if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
1017  $this->reader->localName == 'upload' ) {
1018  break;
1019  }
1020 
1021  $tag = $this->reader->localName;
1022 
1023  if ( !$this->hookRunner->onImportHandleUploadXMLTag( $this, $pageInfo ) ) {
1024  // Do nothing
1025  } elseif ( in_array( $tag, $normalFields ) ) {
1026  $uploadInfo[$tag] = $this->nodeContents();
1027  } elseif ( $tag == 'contributor' ) {
1028  $uploadInfo['contributor'] = $this->handleContributor();
1029  } elseif ( $tag == 'contents' ) {
1030  $contents = $this->nodeContents();
1031  $encoding = $this->reader->getAttribute( 'encoding' );
1032  if ( $encoding === 'base64' ) {
1033  $uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
1034  $uploadInfo['isTempSrc'] = true;
1035  }
1036  } elseif ( $tag != '#text' ) {
1037  $this->warn( "Unhandled upload XML tag $tag" );
1038  $skip = true;
1039  }
1040  }
1041 
1042  if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
1043  $path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
1044  if ( file_exists( $path ) ) {
1045  $uploadInfo['fileSrc'] = $path;
1046  $uploadInfo['isTempSrc'] = false;
1047  }
1048  }
1049 
1050  if ( $this->mImportUploads ) {
1051  return $this->processUpload( $pageInfo, $uploadInfo );
1052  }
1053  }
1054 
1059  private function dumpTemp( $contents ) {
1060  $filename = tempnam( wfTempDir(), 'importupload' );
1061  file_put_contents( $filename, $contents );
1062  return $filename;
1063  }
1064 
1070  private function processUpload( $pageInfo, $uploadInfo ) {
1071  $revision = new WikiRevision( $this->config );
1072  $text = $uploadInfo['text'] ?? '';
1073 
1074  $revision->setTitle( $pageInfo['_title'] );
1075  $revision->setID( $pageInfo['id'] );
1076  $revision->setTimestamp( $uploadInfo['timestamp'] );
1077  $revision->setText( $text );
1078  $revision->setFilename( $uploadInfo['filename'] );
1079  if ( isset( $uploadInfo['archivename'] ) ) {
1080  $revision->setArchiveName( $uploadInfo['archivename'] );
1081  }
1082  $revision->setSrc( $uploadInfo['src'] );
1083  if ( isset( $uploadInfo['fileSrc'] ) ) {
1084  $revision->setFileSrc( $uploadInfo['fileSrc'],
1085  !empty( $uploadInfo['isTempSrc'] ) );
1086  }
1087  if ( isset( $uploadInfo['sha1base36'] ) ) {
1088  $revision->setSha1Base36( $uploadInfo['sha1base36'] );
1089  }
1090  $revision->setSize( intval( $uploadInfo['size'] ) );
1091  $revision->setComment( $uploadInfo['comment'] );
1092 
1093  if ( isset( $uploadInfo['contributor']['ip'] ) ) {
1094  $revision->setUserIP( $uploadInfo['contributor']['ip'] );
1095  }
1096  if ( isset( $uploadInfo['contributor']['username'] ) ) {
1097  $revision->setUsername(
1098  $this->externalUserNames->applyPrefix( $uploadInfo['contributor']['username'] )
1099  );
1100  }
1101  $revision->setNoUpdates( $this->mNoUpdates );
1102 
1103  return call_user_func( $this->mUploadCallback, $revision );
1104  }
1105 
1109  private function handleContributor() {
1110  $this->debug( "Enter contributor handler." );
1111  $fields = [ 'id', 'ip', 'username' ];
1112  $info = [];
1113 
1114  if ( $this->reader->isEmptyElement ) {
1115  return $info;
1116  }
1117  while ( $this->reader->read() ) {
1118  if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
1119  $this->reader->localName == 'contributor' ) {
1120  break;
1121  }
1122 
1123  $tag = $this->reader->localName;
1124 
1125  if ( in_array( $tag, $fields ) ) {
1126  $info[$tag] = $this->nodeContents();
1127  }
1128  }
1129 
1130  return $info;
1131  }
1132 
1138  private function processTitle( $text, $ns = null ) {
1139  if ( $this->foreignNamespaces === null ) {
1140  $foreignTitleFactory = new NaiveForeignTitleFactory();
1141  } else {
1142  $foreignTitleFactory = new NamespaceAwareForeignTitleFactory(
1143  $this->foreignNamespaces );
1144  }
1145 
1146  $foreignTitle = $foreignTitleFactory->createForeignTitle( $text,
1147  intval( $ns ) );
1148 
1149  $title = $this->importTitleFactory->createTitleFromForeignTitle(
1150  $foreignTitle );
1151 
1152  $commandLineMode = $this->config->get( 'CommandLineMode' );
1153  if ( $title === null ) {
1154  # Invalid page title? Ignore the page
1155  $this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
1156  return false;
1157  } elseif ( $title->isExternal() ) {
1158  $this->notice( 'import-error-interwiki', $title->getPrefixedText() );
1159  return false;
1160  } elseif ( !$title->canExist() ) {
1161  $this->notice( 'import-error-special', $title->getPrefixedText() );
1162  return false;
1163  } elseif ( !$commandLineMode ) {
1164  $permissionManager = MediaWikiServices::getInstance()->getPermissionManager();
1165  $user = RequestContext::getMain()->getUser();
1166 
1167  if ( !$permissionManager->userCan( 'edit', $user, $title ) ) {
1168  # Do not import if the importing wiki user cannot edit this page
1169  $this->notice( 'import-error-edit', $title->getPrefixedText() );
1170 
1171  return false;
1172  }
1173 
1174  if ( !$title->exists() && !$permissionManager->userCan( 'create', $user, $title ) ) {
1175  # Do not import if the importing wiki user cannot create this page
1176  $this->notice( 'import-error-create', $title->getPrefixedText() );
1177 
1178  return false;
1179  }
1180  }
1181 
1182  return [ $title, $foreignTitle ];
1183  }
1184 
1189  private function getContentHandler( $model ) {
1190  return MediaWikiServices::getInstance()
1191  ->getContentHandlerFactory()
1192  ->getContentHandler( $model );
1193  }
1194 
1201  private function getDefaultContentModel( $title, $role ) {
1202  return MediaWikiServices::getInstance()
1203  ->getSlotRoleRegistry()
1204  ->getRoleHandler( $role )
1205  ->getDefaultModel( $title );
1206  }
1207 }
NaiveImportTitleFactory
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Definition: NaiveImportTitleFactory.php:34
WikiImporter\processRevision
processRevision( $pageInfo, $revisionInfo)
Definition: WikiImporter.php:954
WikiImporter\$mUploadCallback
$mUploadCallback
Definition: WikiImporter.php:41
WikiImporter
XML file reader for the page data importer.
Definition: WikiImporter.php:37
WikiImporter\makeContent
makeContent(Title $title, $revisionId, $contentInfo)
Definition: WikiImporter.php:908
Title\newFromText
static newFromText( $text, $defaultNamespace=NS_MAIN)
Create a new Title from text, such as what one would find in a link.
Definition: Title.php:329
$wgMaxArticleSize
$wgMaxArticleSize
Maximum article size in kilobytes.
Definition: DefaultSettings.php:2407
WikiImporter\setImageBasePath
setImageBasePath( $dir)
Definition: WikiImporter.php:313
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:154
wfSetVar
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest. If source is NULL, it just returns the val...
Definition: GlobalFunctions.php:1543
UploadSourceAdapter\registerSource
static registerSource(ImportSource $source)
Definition: UploadSourceAdapter.php:48
WikiImporter\$mImportUploads
$mImportUploads
Definition: WikiImporter.php:44
NamespaceAwareForeignTitleFactory
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
Definition: NamespaceAwareForeignTitleFactory.php:25
WikiImporter\$mRevisionCallback
$mRevisionCallback
Definition: WikiImporter.php:41
WikiImporter\revisionCallback
revisionCallback( $revision)
Notify the callback function of a revision.
Definition: WikiImporter.php:504
WikiImporter\setNoticeCallback
setNoticeCallback( $callback)
Set a callback that displays notice messages.
Definition: WikiImporter.php:171
DeferredUpdates\addUpdate
static addUpdate(DeferrableUpdate $update, $stage=self::POSTSEND)
Add an update to the deferred update queue for execution at the appropriate time.
Definition: DeferredUpdates.php:106
NaiveForeignTitleFactory
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
Definition: NaiveForeignTitleFactory.php:27
WikiImporter\$mPageOutCallback
$mPageOutCallback
Definition: WikiImporter.php:42
WikiImporter\setNoUpdates
setNoUpdates( $noupdates)
Set 'no updates' mode.
Definition: WikiImporter.php:151
WikiImporter\getReader
getReader()
Definition: WikiImporter.php:110
ExternalUserNames
Class to parse and build external user names.
Definition: ExternalUserNames.php:29
WikiImporter\processLogItem
processLogItem( $logInfo)
Definition: WikiImporter.php:705
WikiImporter\handleRevision
handleRevision(&$pageInfo)
Definition: WikiImporter.php:829
WikiImporter\setRevisionCallback
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
Definition: WikiImporter.php:206
wfMessage
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Definition: GlobalFunctions.php:1219
WikiImporter\handleContributor
handleContributor()
Definition: WikiImporter.php:1109
WikiImporter\setUsernamePrefix
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
Definition: WikiImporter.php:329
ImportReporter
Reporting callback.
Definition: ImportReporter.php:28
WikiImporter\$externalUserNames
ExternalUserNames $externalUserNames
Definition: WikiImporter.php:58
NamespaceImportTitleFactory
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Definition: NamespaceImportTitleFactory.php:28
WikiImporter\nodeContents
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
Definition: WikiImporter.php:544
WikiImporter\siteInfoCallback
siteInfoCallback( $siteInfo)
Notify the callback function of site info.
Definition: WikiImporter.php:465
$debug
$debug
Definition: mcc.php:31
WikiImporter\getContentHandler
getContentHandler( $model)
Definition: WikiImporter.php:1189
NS_MAIN
const NS_MAIN
Definition: Defines.php:69
Config
Interface for configuration instances.
Definition: Config.php:30
MWException
MediaWiki exception.
Definition: MWException.php:29
ImportTitleFactory
Represents an object that can convert page titles on a foreign wiki (ForeignTitle objects) into page ...
Definition: ImportTitleFactory.php:25
WikiPage\factory
static factory(Title $title)
Create a WikiPage object of the appropriate class for the given title.
Definition: WikiPage.php:156
WikiImporter\$hookRunner
HookRunner $hookRunner
Definition: WikiImporter.php:52
WikiImporter\dumpTemp
dumpTemp( $contents)
Definition: WikiImporter.php:1059
WikiImporter\$countableCache
array $countableCache
Definition: WikiImporter.php:54
WikiImporter\pageOutCallback
pageOutCallback( $title, $foreignTitle, $revCount, $sucCount, $pageInfo)
Notify the callback function when a "</page>" is closed.
Definition: WikiImporter.php:492
WikiImporter\__construct
__construct(ImportSource $source, Config $config)
Creates an ImportXMLReader drawing from the source provided.
Definition: WikiImporter.php:66
MWContentSerializationException
Exception representing a failure to serialize or unserialize a content object.
Definition: MWContentSerializationException.php:8
WikiImporter\throwXmlError
throwXmlError( $err)
Definition: WikiImporter.php:114
SubpageImportTitleFactory
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Definition: SubpageImportTitleFactory.php:28
$title
$title
Definition: testCompression.php:38
SiteStatsUpdate\factory
static factory(array $deltas)
Definition: SiteStatsUpdate.php:71
WikiImporter\finishImportPage
finishImportPage( $title, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
Definition: WikiImporter.php:410
wfTimestampNow
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
Definition: GlobalFunctions.php:1837
WikiImporter\$importTitleFactory
ImportTitleFactory $importTitleFactory
Definition: WikiImporter.php:50
WikiImporter\processUpload
processUpload( $pageInfo, $uploadInfo)
Definition: WikiImporter.php:1070
WikiImporter\disableStatisticsUpdate
disableStatisticsUpdate()
Statistics updates can take a lot of time.
Definition: WikiImporter.php:337
WikiImporter\setImportUploads
setImportUploads( $import)
Definition: WikiImporter.php:320
WikiImporter\$mNoUpdates
$mNoUpdates
Definition: WikiImporter.php:45
wfDebug
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
Definition: GlobalFunctions.php:909
WikiImporter\$mPageCallback
$mPageCallback
Definition: WikiImporter.php:41
WikiImporter\$mSiteInfoCallback
$mSiteInfoCallback
Definition: WikiImporter.php:42
$content
$content
Definition: router.php:76
WikiImporter\beforeImportPage
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
Definition: WikiImporter.php:347
StatusValue\newGood
static newGood( $value=null)
Factory function for good results.
Definition: StatusValue.php:82
WikiImporter\importRevision
importRevision( $revision)
Default per-revision callback, performs the import.
Definition: WikiImporter.php:359
Hooks\runner
static runner()
Get a HookRunner instance for calling hooks using the new interfaces.
Definition: Hooks.php:172
WikiImporter\doImport
doImport()
Primary entry point.
Definition: WikiImporter.php:571
WikiImporter\processTitle
processTitle( $text, $ns=null)
Definition: WikiImporter.php:1138
WikiImporter\$mImageBasePath
$mImageBasePath
Definition: WikiImporter.php:44
WikiImporter\$foreignNamespaces
$foreignNamespaces
Definition: WikiImporter.php:40
WikiImporter\setDebug
setDebug( $debug)
Set debug mode...
Definition: WikiImporter.php:143
WikiImporter\notice
notice( $msg,... $params)
Definition: WikiImporter.php:129
RequestContext\getMain
static getMain()
Get the RequestContext object associated with the main request.
Definition: RequestContext.php:453
WikiImporter\handleUpload
handleUpload(&$pageInfo)
Definition: WikiImporter.php:1006
WikiImporter\setUploadCallback
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
Definition: WikiImporter.php:217
WikiImporter\warn
warn( $data)
Definition: WikiImporter.php:125
WikiImporter\handleSiteInfo
handleSiteInfo()
Definition: WikiImporter.php:644
WikiImporter\setTargetRootPage
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
Definition: WikiImporter.php:281
Title
Represents a title within MediaWiki.
Definition: Title.php:42
WikiImporter\$mNoticeCallback
$mNoticeCallback
Definition: WikiImporter.php:43
wfTempDir
wfTempDir()
Tries to get the system directory for temporary files.
Definition: GlobalFunctions.php:1871
WikiRevision
Represents a revision, log entry or upload during the import process.
Definition: WikiRevision.php:39
WikiImporter\debug
debug( $data)
Definition: WikiImporter.php:119
WikiImporter\importLogItem
importLogItem( $revision)
Default per-log-item callback, performs the import.
Definition: WikiImporter.php:388
WikiImporter\handleContent
handleContent()
Definition: WikiImporter.php:869
WikiImporter\importUpload
importUpload( $revision)
Dummy for now...
Definition: WikiImporter.php:397
WikiImporter\setPageOffset
setPageOffset( $nthPage)
Sets 'pageOffset' value.
Definition: WikiImporter.php:161
WikiImporter\handleLogItem
handleLogItem()
Definition: WikiImporter.php:671
ImportSource
Source interface for XML import.
Definition: ImportSource.php:32
$path
$path
Definition: NoLocalSettings.php:25
WikiImporter\$pageOffset
$pageOffset
Definition: WikiImporter.php:46
WikiImporter\$config
Config $config
Definition: WikiImporter.php:48
WikiImporter\setSiteInfoCallback
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
Definition: WikiImporter.php:239
WikiImporter\getDefaultContentModel
getDefaultContentModel( $title, $role)
Definition: WikiImporter.php:1201
WikiImporter\setPageOutCallback
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
Definition: WikiImporter.php:195
WikiImporter\debugRevisionHandler
debugRevisionHandler(&$revision)
Alternate per-revision callback, for debugging.
Definition: WikiImporter.php:447
WikiImporter\$disableStatisticsUpdate
bool $disableStatisticsUpdate
Definition: WikiImporter.php:56
$source
$source
Definition: mwdoc-filter.php:34
WikiImporter\$reader
XMLReader $reader
Definition: WikiImporter.php:39
WikiImporter\handlePage
handlePage()
Definition: WikiImporter.php:746
WikiImporter\nodeAttribute
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
Definition: WikiImporter.php:533
MediaWiki\HookContainer\HookRunner
This class provides an implementation of the core hook interfaces, forwarding hook calls to HookConta...
Definition: HookRunner.php:570
WikiImporter\setPageCallback
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
Definition: WikiImporter.php:180
WikiImporter\setImportTitleFactory
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
Definition: WikiImporter.php:250
WikiImporter\setTargetNamespace
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
Definition: WikiImporter.php:259
WikiImporter\logItemCallback
logItemCallback( $revision)
Notify the callback function of a new log item.
Definition: WikiImporter.php:518
WikiImporter\pageCallback
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
Definition: WikiImporter.php:478
WikiImporter\$mLogItemCallback
$mLogItemCallback
Definition: WikiImporter.php:41
WikiImporter\$mDebug
$mDebug
Definition: WikiImporter.php:43
Revision\SlotRecord
Value object representing a content slot associated with a page revision.
Definition: SlotRecord.php:39
WikiImporter\setLogItemCallback
setLogItemCallback( $callback)
Sets the action to perform as each log item is reached.
Definition: WikiImporter.php:228
$type
$type
Definition: testCompression.php:52