MediaWiki  master
WikiImporter.php
Go to the documentation of this file.
1 <?php
28 
35 class WikiImporter {
/** @var XMLReader Streaming reader the dump is parsed with */
private $reader;
/** @var array|null Namespace names collected from <siteinfo>, keyed by the "key" attribute; null until seen */
private $foreignNamespaces = null;
/** @var callable|null Handlers installed through the set*Callback() methods */
private $mLogItemCallback, $mUploadCallback, $mRevisionCallback, $mPageCallback;
/** @var callable|null */
private $mSiteInfoCallback, $mPageOutCallback;
/** @var callable|null Receives notice messages; see notice() */
private $mNoticeCallback;
/** @var bool|null When truthy, debug() forwards its messages to wfDebug() */
private $mDebug;
/** @var bool|null When truthy, handleUpload() passes uploads on to processUpload() */
private $mImportUploads;
/** @var string|null Directory prepended to an upload's <rel> path in handleUpload() */
private $mImageBasePath;
/** @var bool Passed to WikiRevision::setNoUpdates() for every imported item */
private $mNoUpdates = false;
/** @var int Page number below which doImport() skips top-level nodes; 0 disables skipping */
private $pageOffset = 0;
/** @var Config */
private $config;
/** @var ImportTitleFactory Maps foreign titles to local titles in processTitle() */
private $importTitleFactory;
/** @var array Countable state per page, recorded in beforeImportPage() under 'title_<prefixed text>' */
private $countableCache = [];
/** @var bool When true, finishImportPage() skips the article-count adjustment */
private $disableStatisticsUpdate = false;
/** @var ExternalUserNames Applies the import prefix to contributor names */
private $externalUserNames;
55 
/**
 * Creates an importer that reads its XML from the given source.
 *
 * The source is registered with UploadSourceAdapter and opened through the
 * "uploadsource://" stream wrapper; the default page/revision/upload/log-item
 * callbacks are then installed.
 *
 * @param ImportSource $source
 * @param Config $config
 * @throws Exception If the XMLReader class is not available
 * @throws MWException If XMLReader cannot open the source
 */
public function __construct( ImportSource $source, Config $config ) {
	if ( !class_exists( 'XMLReader' ) ) {
		throw new Exception( 'Import requires PHP to have been compiled with libxml support' );
	}

	$this->reader = new XMLReader();
	$this->config = $config;

	if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
		stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
	}
	$id = UploadSourceAdapter::registerSource( $source );

	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	$oldDisable = libxml_disable_entity_loader( false );
	if ( defined( 'LIBXML_PARSEHUGE' ) ) {
		$status = $this->reader->open( "uploadsource://$id", null, LIBXML_PARSEHUGE );
	} else {
		$status = $this->reader->open( "uploadsource://$id" );
	}
	if ( !$status ) {
		// libxml_get_last_error() returns false when no error was recorded,
		// so do not dereference it blindly.
		$error = libxml_get_last_error();
		libxml_disable_entity_loader( $oldDisable );
		throw new MWException( 'Encountered an internal error while initializing WikiImporter object: ' .
			( $error ? $error->message : 'unknown libxml error' ) );
	}
	libxml_disable_entity_loader( $oldDisable );

	// Default callbacks
	$this->setPageCallback( [ $this, 'beforeImportPage' ] );
	$this->setRevisionCallback( [ $this, "importRevision" ] );
	$this->setUploadCallback( [ $this, 'importUpload' ] );
	$this->setLogItemCallback( [ $this, 'importLogItem' ] );
	$this->setPageOutCallback( [ $this, 'finishImportPage' ] );

	$this->importTitleFactory = new NaiveImportTitleFactory();
	$this->externalUserNames = new ExternalUserNames( 'imported', false );
}
101 
/**
 * Gives access to the XML reader this importer parses with.
 *
 * @return XMLReader
 */
public function getReader() {
	$xmlReader = $this->reader;
	return $xmlReader;
}
108 
/**
 * Logs an XML error. Despite the name, nothing is thrown: the message goes
 * to debug() and to wfDebug().
 *
 * @param string $err
 */
public function throwXmlError( $err ) {
	$this->debug( 'FAILURE: ' . $err );
	wfDebug( 'WikiImporter XML error: ' . $err . "\n" );
}
113 
/**
 * Writes a debug line via wfDebug(), but only while debug mode is on
 * (see setDebug()).
 *
 * @param string $data
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}
	wfDebug( 'IMPORT: ' . $data . "\n" );
}
119 
/**
 * Writes a warning line via wfDebug(), regardless of the debug flag.
 *
 * @param string $data
 */
public function warn( $data ) {
	$line = 'IMPORT: ' . $data . "\n";
	wfDebug( $line );
}
123 
/**
 * Reports a notice. If a callable notice callback is installed it receives
 * the message key and the parameter list (as one array); otherwise the
 * rendered message is written to the debug log.
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters
 */
public function notice( $msg, ...$params ) {
	$handler = $this->mNoticeCallback;
	if ( !is_callable( $handler ) ) {
		# No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		wfDebug( wfMessage( $msg, $params )->text() . "\n" );
		return;
	}
	call_user_func( $handler, $msg, $params );
}
133 
/**
 * Switches debug() output on or off.
 *
 * @param bool $debug
 */
public function setDebug( $debug ) {
	$this->mDebug = $debug;
}
141 
/**
 * Stores the "no updates" flag, which is handed to every WikiRevision the
 * importer builds.
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
149 
/**
 * Sets the page number the import should start at; doImport() skips
 * top-level nodes until that many <page> elements have been seen.
 *
 * @param int $nthPage
 */
public function setPageOffset( $nthPage ) {
	$this->pageOffset = $nthPage;
}
159 
/**
 * Installs the callback used by notice() and hands back whatever
 * wfSetVar() reports as the previous value.
 *
 * @param callable $callback
 * @return callable|null
 */
public function setNoticeCallback( $callback ) {
	$previous = wfSetVar( $this->mNoticeCallback, $callback );
	return $previous;
}
169 
/**
 * Replaces the callback invoked by pageCallback().
 *
 * @param callable $callback
 * @return callable|null The callback that was installed before
 */
public function setPageCallback( $callback ) {
	$old = $this->mPageCallback;
	$this->mPageCallback = $callback;

	return $old;
}
180 
/**
 * Replaces the callback invoked by pageOutCallback() once a page has been
 * fully read.
 *
 * @param callable $callback
 * @return callable|null The callback that was installed before
 */
public function setPageOutCallback( $callback ) {
	$old = $this->mPageOutCallback;
	$this->mPageOutCallback = $callback;

	return $old;
}
195 
/**
 * Replaces the callback invoked by revisionCallback() for each revision.
 *
 * @param callable $callback
 * @return callable|null The callback that was installed before
 */
public function setRevisionCallback( $callback ) {
	$old = $this->mRevisionCallback;
	$this->mRevisionCallback = $callback;

	return $old;
}
206 
/**
 * Replaces the callback invoked by processUpload() for each upload.
 *
 * @param callable $callback
 * @return callable|null The callback that was installed before
 */
public function setUploadCallback( $callback ) {
	$old = $this->mUploadCallback;
	$this->mUploadCallback = $callback;

	return $old;
}
217 
/**
 * Replaces the callback invoked by logItemCallback() for each log item.
 *
 * @param callable $callback
 * @return callable|null The callback that was installed before
 */
public function setLogItemCallback( $callback ) {
	$old = $this->mLogItemCallback;
	$this->mLogItemCallback = $callback;

	return $old;
}
228 
/**
 * Replaces the callback invoked by siteInfoCallback() once <siteinfo> has
 * been read.
 *
 * @param callable $callback
 * @return callable|null The callback that was installed before
 */
public function setSiteInfoCallback( $callback ) {
	$old = $this->mSiteInfoCallback;
	$this->mSiteInfoCallback = $callback;

	return $old;
}
239 
/**
 * Sets the factory processTitle() uses to turn foreign titles into local
 * ones.
 *
 * @param ImportTitleFactory $factory
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
248 
/**
 * Chooses the namespace imported pages are placed in.
 *
 * @param null|int|string $namespace Null to keep each page's own namespace,
 *   otherwise the ID of an existing, non-negative namespace
 * @return bool False if the namespace is negative or does not exist
 */
public function setTargetNamespace( $namespace ) {
	if ( $namespace === null ) {
		// Don't override namespaces
		// NOTE(review): this statement was missing from the listing (a line is
		// skipped between the comment and the return); restored as a reset to
		// the same default factory the constructor installs — confirm.
		$this->setImportTitleFactory( new NaiveImportTitleFactory() );
		return true;
	} elseif (
		$namespace >= 0 &&
		MediaWikiServices::getInstance()->getNamespaceInfo()->exists( intval( $namespace ) )
	) {
		$namespace = intval( $namespace );
		$this->setImportTitleFactory( new NamespaceImportTitleFactory( $namespace ) );
		return true;
	} else {
		return false;
	}
}
270 
/**
 * Chooses a root page under which imported pages are created as subpages.
 *
 * @param null|string $rootpage Null for no root page. An empty string leaves
 *   the current configuration untouched.
 * @return Status Fatal with 'import-rootpage-invalid' if the title cannot be
 *   parsed or is external, or with 'import-rootpage-nosubpage' if its
 *   namespace has no subpages; good otherwise
 */
public function setTargetRootPage( $rootpage ) {
	$status = Status::newGood();
	if ( $rootpage === null ) {
		// No rootpage
		// NOTE(review): statement missing from the listing; restored as a
		// reset to the default title factory — confirm.
		$this->setImportTitleFactory( new NaiveImportTitleFactory() );
	} elseif ( $rootpage !== '' ) {
		$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
		$title = Title::newFromText( $rootpage );

		if ( !$title || $title->isExternal() ) {
			$status->fatal( 'import-rootpage-invalid' );
		} elseif (
			!MediaWikiServices::getInstance()->getNamespaceInfo()->
				hasSubpages( $title->getNamespace() )
		) {
			$displayNSText = $title->getNamespace() == NS_MAIN
				? wfMessage( 'blanknamespace' )->text()
				: MediaWikiServices::getInstance()->getContentLanguage()->
					getNsText( $title->getNamespace() );
			$status->fatal( 'import-rootpage-nosubpage', $displayNSText );
		} else {
			// set namespace to 'all', so the namespace check in processTitle() can pass
			$this->setTargetNamespace( null );
			// NOTE(review): statement missing from the listing; without it a
			// valid root page has no effect. Restored using the subpage title
			// factory — confirm the constructor argument.
			$this->setImportTitleFactory( new SubpageImportTitleFactory( $title ) );
		}
	}
	return $status;
}
304 
/**
 * Sets the directory handleUpload() prepends to an upload's <rel> path when
 * looking for the file locally.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
311 
/**
 * Controls whether <upload> elements are passed on to processUpload().
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
318 
/**
 * Replaces the ExternalUserNames helper used for contributor names.
 *
 * @param string $usernamePrefix
 * @param bool $assignKnownUsers
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$names = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
	$this->externalUserNames = $names;
}
327 
/**
 * Turns off the article-count adjustment that finishImportPage() would
 * otherwise perform.
 */
public function disableStatisticsUpdate() {
	$this->disableStatisticsUpdate = true;
}
335 
/**
 * Default page callback: remembers whether the target page is countable
 * before any revision is imported, so finishImportPage() can compare.
 *
 * @param array $titleAndForeignTitle Element 0 is the local Title
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$localTitle = $titleAndForeignTitle[0];
	$wikiPage = WikiPage::factory( $localTitle );
	$cacheKey = 'title_' . $localTitle->getPrefixedText();
	$this->countableCache[$cacheKey] = $wikiPage->isCountable();

	return true;
}
348 
/**
 * Default revision callback. Imports the revision unless its content
 * handler cannot be used on the title or the content fails to unserialize;
 * in both cases a notice is emitted and false is returned.
 *
 * @param WikiRevision $revision
 * @return bool Result of importOldRevision(), or false on either failure
 */
public function importRevision( $revision ) {
	$allowedHere = $revision->getContentHandler()->canBeUsedOn( $revision->getTitle() );

	if ( $allowedHere ) {
		try {
			return $revision->importOldRevision();
		} catch ( MWContentSerializationException $ex ) {
			$errorKey = 'import-error-unserialize';
		}
	} else {
		$errorKey = 'import-error-bad-location';
	}

	// Both failure paths report the same four details.
	$this->notice(
		$errorKey,
		$revision->getTitle()->getPrefixedText(),
		$revision->getID(),
		$revision->getModel(),
		$revision->getFormat()
	);

	return false;
}
377 
/**
 * Default log item callback: delegates to the revision object.
 *
 * @param WikiRevision $revision
 * @return mixed Whatever WikiRevision::importLogItem() returns
 */
public function importLogItem( $revision ) {
	$result = $revision->importLogItem();
	return $result;
}
386 
/**
 * Default upload callback: delegates to the revision object.
 *
 * @param WikiRevision $revision
 * @return mixed Whatever WikiRevision::importUpload() returns
 */
public function importUpload( $revision ) {
	$result = $revision->importUpload();
	return $result;
}
395 
/**
 * Default page-out callback. Adjusts the article count if the page's
 * countable state changed during the import, then runs the
 * 'AfterImportPage' hook.
 *
 * @param Title $title
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sRevCount
 * @param array $pageInfo
 * @return bool Result of the 'AfterImportPage' hook
 */
public function finishImportPage( $title, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$page = WikiPage::factory( $title );
		$page->loadPageData( 'fromdbmaster' );
		$content = $page->getContent();
		if ( $content === null ) {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $title .
				' because WikiPage::getContent() returned null' );
		} else {
			$editInfo = $page->prepareContentForEdit( $content );
			$countKey = 'title_' . $title->getPrefixedText();
			$countable = $page->isCountable( $editInfo );
			if ( array_key_exists( $countKey, $this->countableCache ) &&
				$countable != $this->countableCache[$countKey] ) {
				// The delta is +1 or -1 depending on the direction of the change.
				// NOTE(review): the opening of this call was missing from the
				// listing and has been reconstructed — confirm.
				DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
					'articles' => ( (int)$countable - (int)$this->countableCache[$countKey] )
				] ) );
			}
		}
	}

	return Hooks::run( 'AfterImportPage', [ $title, $foreignTitle, $revCount,
		$sRevCount, $pageInfo ] );
}
437 
/**
 * Alternate revision handler that only dumps the revision's fields through
 * debug().
 *
 * @param object &$revision Must expose title, user_text, timestamp, comment
 *   and text properties
 */
public function debugRevisionHandler( &$revision ) {
	$this->debug( "Got revision:" );

	$titleLine = is_object( $revision->title )
		? "-- Title: " . $revision->title->getPrefixedText()
		: "-- Title: <invalid>";
	$this->debug( $titleLine );

	$this->debug( "-- User: " . $revision->user_text );
	$this->debug( "-- Timestamp: " . $revision->timestamp );
	$this->debug( "-- Comment: " . $revision->comment );
	$this->debug( "-- Text: " . $revision->text );
}
454 
/**
 * Passes the collected site info to the site info callback, if one is set.
 *
 * @param array $siteInfo
 * @return mixed The callback's result, or false when no callback is set
 */
private function siteInfoCallback( $siteInfo ) {
	if ( !isset( $this->mSiteInfoCallback ) ) {
		return false;
	}

	$arguments = [ $siteInfo, $this ];
	return call_user_func_array( $this->mSiteInfoCallback, $arguments );
}
468 
/**
 * Invokes the page callback, if one is set. Its return value is discarded.
 *
 * @param array $title Local title / foreign title pair, as passed by handlePage()
 */
public function pageCallback( $title ) {
	if ( !isset( $this->mPageCallback ) ) {
		return;
	}
	call_user_func( $this->mPageCallback, $title );
}
478 
/**
 * Invokes the page-out callback, if one is set, forwarding every argument
 * this method received. The callback's return value is discarded.
 *
 * @param Title $title
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sucCount
 * @param array $pageInfo
 */
private function pageOutCallback( $title, $foreignTitle, $revCount,
	$sucCount, $pageInfo
) {
	if ( !isset( $this->mPageOutCallback ) ) {
		return;
	}
	$forwarded = func_get_args();
	call_user_func_array( $this->mPageOutCallback, $forwarded );
}
493 
/**
 * Passes a revision to the revision callback, if one is set.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's result, or false when no callback is set
 */
private function revisionCallback( $revision ) {
	if ( !isset( $this->mRevisionCallback ) ) {
		return false;
	}

	$arguments = [ $revision, $this ];
	return call_user_func_array( $this->mRevisionCallback, $arguments );
}
507 
/**
 * Passes a log item to the log item callback, if one is set.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's result, or false when no callback is set
 */
private function logItemCallback( $revision ) {
	if ( !isset( $this->mLogItemCallback ) ) {
		return false;
	}

	$arguments = [ $revision, $this ];
	return call_user_func_array( $this->mLogItemCallback, $arguments );
}
521 
/**
 * Reads an attribute of the node the reader is currently positioned on.
 *
 * @param string $attr Attribute name
 * @return string|null Whatever XMLReader::getAttribute() returns
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );
	return $value;
}
531 
/**
 * Collects the text of the current element, advancing the reader up to the
 * first end-element node it meets.
 *
 * Text, CDATA and significant-whitespace nodes are concatenated; other node
 * types are ignored. If the input ends before an end element is seen, the
 * reader is closed and an empty string is returned.
 *
 * @return string
 */
public function nodeContents() {
	$xml = $this->reader;
	if ( $xml->isEmptyElement ) {
		return "";
	}

	$textTypes = [ XMLReader::TEXT, XMLReader::CDATA, XMLReader::SIGNIFICANT_WHITESPACE ];
	$collected = "";

	while ( $xml->read() ) {
		$nodeType = $xml->nodeType;
		if ( $nodeType === XMLReader::END_ELEMENT ) {
			return $collected;
		}
		if ( in_array( $nodeType, $textTypes, true ) ) {
			$collected .= $xml->value;
		}
	}

	// Ran out of input without a closing tag.
	$xml->close();
	return '';
}
559 
/**
 * Runs the import: checks the <mediawiki> root element, then dispatches
 * each top-level node to the siteinfo, page or log-item handler.
 *
 * When a page offset is set, top-level nodes are skipped until that many
 * <page> elements have been counted. The previous entity-loader setting is
 * restored and the reader is closed however the main loop ends.
 *
 * @throws MWException If the root element is not <mediawiki>
 * @return bool Always true when no exception is thrown
 */
public function doImport() {
	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (T48932).
	$oldDisable = libxml_disable_entity_loader( true );
	$this->reader->read();

	if ( $this->reader->localName != 'mediawiki' ) {
		libxml_disable_entity_loader( $oldDisable );
		throw new MWException( "Expected <mediawiki> tag, got " .
			$this->reader->localName );
	}
	$this->debug( "<mediawiki> tag is correct." );

	$this->debug( "Starting primary dump processing loop." );

	$keepReading = $this->reader->read();
	$skip = false;
	$pageCount = 0;
	try {
		while ( $keepReading ) {
			$tag = $this->reader->localName;
			if ( $this->pageOffset ) {
				if ( $tag === 'page' ) {
					$pageCount++;
				}
				if ( $pageCount < $this->pageOffset ) {
					// Still before the requested page: jump over this subtree.
					$keepReading = $this->reader->next();
					continue;
				}
			}
			$type = $this->reader->nodeType;

			if ( !Hooks::run( 'ImportHandleToplevelXMLTag', [ $this ] ) ) {
				// Do nothing
			} elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
				break;
			} elseif ( $tag == 'siteinfo' ) {
				$this->handleSiteInfo();
			} elseif ( $tag == 'page' ) {
				$this->handlePage();
			} elseif ( $tag == 'logitem' ) {
				$this->handleLogItem();
			} elseif ( $tag != '#text' ) {
				$this->warn( "Unhandled top-level XML tag $tag" );

				$skip = true;
			}

			if ( $skip ) {
				$keepReading = $this->reader->next();
				$skip = false;
				$this->debug( "Skip" );
			} else {
				$keepReading = $this->reader->read();
			}
		}
	} finally {
		// Runs for Exception and Error alike, so libxml state is never left
		// modified and the reader is never left open.
		libxml_disable_entity_loader( $oldDisable );
		$this->reader->close();
	}

	return true;
}
638 
/**
 * Reads a <siteinfo> element. Namespace names are collected into
 * $this->foreignNamespaces, a few simple fields into the site info array,
 * and the result is handed to siteInfoCallback().
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );
	$info = [];

	// Fields that can just be stuffed in the siteInfo object
	$simpleTags = [ 'sitename', 'base', 'generator', 'case' ];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'siteinfo';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( $tag == 'namespace' ) {
			// Read the attribute before nodeContents() advances the reader.
			$nsKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$nsKey] = $this->nodeContents();
			continue;
		}
		if ( in_array( $tag, $simpleTags ) ) {
			$info[$tag] = $this->nodeContents();
		}
	}

	$info['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $info );
}
665 
/**
 * Reads a <logitem> element into an array and passes it to
 * processLogItem(). The 'ImportHandleLogItemXMLTag' hook runs for every
 * node and can suppress the default handling by returning false.
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );
	$item = [];

	// Fields that can just be stuffed in the pageInfo object
	$simpleTags = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'logitem';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( !Hooks::run( 'ImportHandleLogItemXMLTag', [ $this, $item ] ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $simpleTags ) ) {
			$item[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$item['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $item );
}
697 
/**
 * Builds a WikiRevision from parsed log item data and passes it to
 * logItemCallback().
 *
 * 'type' and 'action' are read unconditionally; all other keys are optional.
 * When no contributor username is present, a prefixed 'Unknown user' is used.
 *
 * @param array $logInfo
 * @return mixed Result of logItemCallback()
 */
private function processLogItem( $logInfo ) {
	$logRevision = new WikiRevision( $this->config );

	if ( isset( $logInfo['id'] ) ) {
		$logRevision->setID( $logInfo['id'] );
	}
	$logRevision->setType( $logInfo['type'] );
	$logRevision->setAction( $logInfo['action'] );
	if ( isset( $logInfo['timestamp'] ) ) {
		$logRevision->setTimestamp( $logInfo['timestamp'] );
	}
	if ( isset( $logInfo['params'] ) ) {
		$logRevision->setParams( $logInfo['params'] );
	}
	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$logTitle = Title::newFromText( $logInfo['logtitle'] );
		$logRevision->setTitle( $logTitle );
	}

	$logRevision->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$logRevision->setComment( $logInfo['comment'] );
	}

	if ( isset( $logInfo['contributor']['ip'] ) ) {
		$logRevision->setUserIP( $logInfo['contributor']['ip'] );
	}

	if ( isset( $logInfo['contributor']['username'] ) ) {
		$userName = $this->externalUserNames->applyPrefix( $logInfo['contributor']['username'] );
	} else {
		$userName = $this->externalUserNames->addPrefix( 'Unknown user' );
	}
	$logRevision->setUsername( $userName );

	return $this->logItemCallback( $logRevision );
}
742 
/**
 * Reads a <page> element: collects simple fields, resolves the title when
 * the first <revision> or <upload> is met, dispatches revisions and uploads,
 * and finally fires the page-out callback if a valid title was found.
 */
private function handlePage() {
	// Handle page data.
	$this->debug( "Enter page handler." );
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Fields that can just be stuffed in the pageInfo object
	$simpleTags = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	$skipSubtree = false;
	$titleIsBad = false;

	while ( $skipSubtree ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'page'
		) {
			break;
		}

		$skipSubtree = false;
		$tag = $this->reader->localName;

		if ( $titleIsBad ) {
			// The title is invalid, bail out of this page
			$skipSubtree = true;
			continue;
		}

		if ( !Hooks::run( 'ImportHandlePageXMLTag', [ $this, &$pageInfo ] ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $simpleTags ) ) {
			// An XML snippet:
			// <page>
			//     <id>123</id>
			//     <title>Page</title>
			//     <redirect title="NewTitle"/>
			//     ...
			// Because the redirect tag is built differently, we need special handling for that case.
			$pageInfo[$tag] = ( $tag == 'redirect' )
				? $this->nodeAttribute( 'title' )
				: $this->nodeContents();
			continue;
		}

		if ( $tag == 'revision' || $tag == 'upload' ) {
			if ( !isset( $titlePair ) ) {
				$titlePair = $this->processTitle( $pageInfo['title'],
					$pageInfo['ns'] ?? null );

				// $titlePair is either an array of two titles or false.
				if ( is_array( $titlePair ) ) {
					$this->pageCallback( $titlePair );
					$pageInfo['_title'] = $titlePair[0];
					$foreignTitle = $titlePair[1];
				} else {
					$titleIsBad = true;
					$skipSubtree = true;
				}
			}

			if ( $titlePair ) {
				if ( $tag == 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
			continue;
		}

		if ( $tag != '#text' ) {
			$this->warn( "Unhandled page XML tag $tag" );
			$skipSubtree = true;
		}
	}

	// $pageInfo['_title'] and $foreignTitle are only set together, once a
	// valid title has been resolved above, so checking one covers both.
	if ( array_key_exists( '_title', $pageInfo ) ) {
		$this->pageOutCallback(
			$pageInfo['_title'],
			$foreignTitle,
			$pageInfo['revisionCount'],
			$pageInfo['successfulRevisionCount'],
			$pageInfo
		);
	}
}
826 
/**
 * Reads a <revision> element, then processes it and updates the page's
 * revision counters.
 *
 * @param array &$pageInfo 'revisionCount' is always incremented;
 *   'successfulRevisionCount' only when processRevision() returns a truthy value
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );
	$revInfo = [];

	$simpleTags = [ 'id', 'timestamp', 'comment', 'minor', 'model', 'format', 'text', 'sha1' ];

	// NOTE(review): once set, this flag is never cleared inside the loop, so
	// every later step uses next() instead of read(). Kept as found.
	$useNext = false;

	while ( $useNext ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'revision';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( !Hooks::run( 'ImportHandleRevisionXMLTag', [ $this, $pageInfo, $revInfo ] ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $simpleTags ) ) {
			$revInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$revInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			$useNext = true;
		}
	}

	$pageInfo['revisionCount']++;
	$imported = $this->processRevision( $pageInfo, $revInfo );
	if ( $imported ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
865 
/**
 * Builds a WikiRevision from parsed revision data and passes it to
 * revisionCallback().
 *
 * @param array $pageInfo Must contain '_title'
 * @param array $revisionInfo Parsed fields; all keys are optional
 * @throws MWException If the text of a listed content model exceeds
 *   $wgMaxArticleSize
 * @return mixed Result of revisionCallback()
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	global $wgMaxArticleSize;

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Testing for revisions with only listed
	// content models, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	$sizeCheckedModels = [ 'wikitext', 'css', 'json', 'javascript', 'text', '' ];
	$modelIsChecked = !isset( $revisionInfo['model'] ) ||
		in_array( $revisionInfo['model'], $sizeCheckedModels, true );

	// A revision may carry no <text> at all; only measure it when present.
	if ( $modelIsChecked &&
		isset( $revisionInfo['text'] ) &&
		strlen( $revisionInfo['text'] ) > $wgMaxArticleSize * 1024
	) {
		throw new MWException( 'The text of ' .
			( isset( $revisionInfo['id'] ) ?
				"the revision with ID $revisionInfo[id]" :
				'a revision'
			) . " exceeds the maximum allowable size ($wgMaxArticleSize KB)" );
	}

	// FIXME: process schema version 11!
	$revision = new WikiRevision( $this->config );

	if ( isset( $revisionInfo['id'] ) ) {
		$revision->setID( $revisionInfo['id'] );
	}
	if ( isset( $revisionInfo['model'] ) ) {
		$revision->setModel( $revisionInfo['model'] );
	}
	if ( isset( $revisionInfo['format'] ) ) {
		$revision->setFormat( $revisionInfo['format'] );
	}
	$revision->setTitle( $pageInfo['_title'] );

	if ( isset( $revisionInfo['text'] ) ) {
		$handler = $revision->getContentHandler();
		$text = $handler->importTransform(
			$revisionInfo['text'],
			$revision->getFormat() );

		$revision->setText( $text );
	}
	$revision->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$revision->setComment( $revisionInfo['comment'] );
	}

	// The mere presence of <minor> marks the revision as minor.
	if ( isset( $revisionInfo['minor'] ) ) {
		$revision->setMinor( true );
	}
	if ( isset( $revisionInfo['contributor']['ip'] ) ) {
		$revision->setUserIP( $revisionInfo['contributor']['ip'] );
	} elseif ( isset( $revisionInfo['contributor']['username'] ) ) {
		$revision->setUsername(
			$this->externalUserNames->applyPrefix( $revisionInfo['contributor']['username'] )
		);
	} else {
		$revision->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}
	if ( isset( $revisionInfo['sha1'] ) ) {
		$revision->setSha1Base36( $revisionInfo['sha1'] );
	}
	$revision->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $revision );
}
944 
/**
 * Reads an <upload> element. Base64-encoded <contents> are written to a
 * temporary file; a local file under the image base path, if it exists,
 * takes precedence as the file source.
 *
 * @param array &$pageInfo
 * @return mixed Result of processUpload() when importing uploads is
 *   enabled, otherwise null
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );
	$upload = [];

	$simpleTags = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];

	// NOTE(review): once set, this flag is never cleared inside the loop, so
	// every later step uses next() instead of read(). Kept as found.
	$useNext = false;

	while ( $useNext ? $this->reader->next() : $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'upload';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( !Hooks::run( 'ImportHandleUploadXMLTag', [ $this, $pageInfo ] ) ) {
			// Do nothing
			continue;
		}

		if ( in_array( $tag, $simpleTags ) ) {
			$upload[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$upload['contributor'] = $this->handleContributor();
		} elseif ( $tag == 'contents' ) {
			$rawContents = $this->nodeContents();
			$encoding = $this->reader->getAttribute( 'encoding' );
			if ( $encoding === 'base64' ) {
				$upload['fileSrc'] = $this->dumpTemp( base64_decode( $rawContents ) );
				$upload['isTempSrc'] = true;
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$useNext = true;
		}
	}

	if ( $this->mImageBasePath && isset( $upload['rel'] ) ) {
		$localPath = "{$this->mImageBasePath}/{$upload['rel']}";
		if ( file_exists( $localPath ) ) {
			$upload['fileSrc'] = $localPath;
			$upload['isTempSrc'] = false;
		}
	}

	if ( $this->mImportUploads ) {
		return $this->processUpload( $pageInfo, $upload );
	}
}
999 
/**
 * Writes the given data to a new temporary file under wfTempDir().
 *
 * @param string $contents
 * @return string Path returned by tempnam()
 */
private function dumpTemp( $contents ) {
	$tempPath = tempnam( wfTempDir(), 'importupload' );
	file_put_contents( $tempPath, $contents );

	return $tempPath;
}
1009 
/**
 * Builds a WikiRevision from parsed upload data and passes it to the
 * upload callback.
 *
 * @param array $pageInfo '_title' and 'id' are read unconditionally
 * @param array $uploadInfo 'timestamp', 'filename', 'src', 'size' and
 *   'comment' are read unconditionally; the remaining keys are optional
 * @return mixed Result of the upload callback
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$upload = new WikiRevision( $this->config );
	$description = $uploadInfo['text'] ?? '';

	$upload->setTitle( $pageInfo['_title'] );
	$upload->setID( $pageInfo['id'] );
	$upload->setTimestamp( $uploadInfo['timestamp'] );
	$upload->setText( $description );
	$upload->setFilename( $uploadInfo['filename'] );
	if ( isset( $uploadInfo['archivename'] ) ) {
		$upload->setArchiveName( $uploadInfo['archivename'] );
	}
	$upload->setSrc( $uploadInfo['src'] );
	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$isTemporary = !empty( $uploadInfo['isTempSrc'] );
		$upload->setFileSrc( $uploadInfo['fileSrc'], $isTemporary );
	}
	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$upload->setSha1Base36( $uploadInfo['sha1base36'] );
	}
	$upload->setSize( intval( $uploadInfo['size'] ) );
	$upload->setComment( $uploadInfo['comment'] );

	if ( isset( $uploadInfo['contributor']['ip'] ) ) {
		$upload->setUserIP( $uploadInfo['contributor']['ip'] );
	}
	if ( isset( $uploadInfo['contributor']['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix( $uploadInfo['contributor']['username'] );
		$upload->setUsername( $prefixedName );
	}
	$upload->setNoUpdates( $this->mNoUpdates );

	return call_user_func( $this->mUploadCallback, $upload );
}
1050 
/**
 * Reads a <contributor> element.
 *
 * @return array Any of the keys 'id', 'ip' and 'username' that were
 *   present; empty for an empty element
 */
private function handleContributor() {
	$wanted = [ 'id', 'ip', 'username' ];
	$contributor = [];

	if ( $this->reader->isEmptyElement ) {
		return $contributor;
	}

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'contributor';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;
		if ( !in_array( $tag, $wanted ) ) {
			continue;
		}
		$contributor[$tag] = $this->nodeContents();
	}

	return $contributor;
}
1076 
/**
 * Turns a title string from the dump into a local Title and checks that it
 * may be imported. Every rejection emits a notice.
 *
 * Outside command line mode, the current request user must be allowed to
 * edit the page, and to create it if it does not exist yet.
 *
 * @param string $text Title text from the dump
 * @param string|null $ns Namespace ID from the dump, if any
 * @return array|bool [ local Title, ForeignTitle ] on success, false otherwise
 */
private function processTitle( $text, $ns = null ) {
	$foreignTitleFactory = $this->foreignNamespaces === null
		? new NaiveForeignTitleFactory()
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
	$title = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	$commandLineMode = $this->config->get( 'CommandLineMode' );

	if ( $title === null ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}
	if ( $title->isExternal() ) {
		$this->notice( 'import-error-interwiki', $title->getPrefixedText() );
		return false;
	}
	if ( !$title->canExist() ) {
		$this->notice( 'import-error-special', $title->getPrefixedText() );
		return false;
	}

	if ( !$commandLineMode ) {
		$permissionManager = MediaWikiServices::getInstance()->getPermissionManager();
		$user = RequestContext::getMain()->getUser();

		if ( !$permissionManager->userCan( 'edit', $user, $title ) ) {
			# Do not import if the importing wiki user cannot edit this page
			$this->notice( 'import-error-edit', $title->getPrefixedText() );
			return false;
		}

		if ( !$title->exists() && !$permissionManager->userCan( 'create', $user, $title ) ) {
			# Do not import if the importing wiki user cannot create this page
			$this->notice( 'import-error-create', $title->getPrefixedText() );
			return false;
		}
	}

	return [ $title, $foreignTitle ];
}
1128 }
NaiveImportTitleFactory
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Definition: NaiveImportTitleFactory.php:34
WikiImporter\processRevision
processRevision( $pageInfo, $revisionInfo)
Definition: WikiImporter.php:872
WikiImporter\$mUploadCallback
$mUploadCallback
Definition: WikiImporter.php:39
WikiImporter
XML file reader for the page data importer.
Definition: WikiImporter.php:35
Title\newFromText
static newFromText( $text, $defaultNamespace=NS_MAIN)
Create a new Title from text, such as what one would find in a link.
Definition: Title.php:332
$wgMaxArticleSize
$wgMaxArticleSize
Maximum article size in kilobytes.
Definition: DefaultSettings.php:2377
WikiImporter\setImageBasePath
setImageBasePath( $dir)
Definition: WikiImporter.php:308
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:144
wfSetVar
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest If source is NULL, it just returns the val...
Definition: GlobalFunctions.php:1541
UploadSourceAdapter\registerSource
static registerSource(ImportSource $source)
Definition: UploadSourceAdapter.php:48
WikiImporter\$mImportUploads
$mImportUploads
Definition: WikiImporter.php:42
NamespaceAwareForeignTitleFactory
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
Definition: NamespaceAwareForeignTitleFactory.php:25
WikiImporter\$mRevisionCallback
$mRevisionCallback
Definition: WikiImporter.php:39
WikiImporter\revisionCallback
revisionCallback( $revision)
Notify the callback function of a revision.
Definition: WikiImporter.php:499
WikiImporter\setNoticeCallback
setNoticeCallback( $callback)
Set a callback that displays notice messages.
Definition: WikiImporter.php:166
DeferredUpdates\addUpdate
static addUpdate(DeferrableUpdate $update, $stage=self::POSTSEND)
Add an update to the deferred update queue for execution at the appropriate time.
Definition: DeferredUpdates.php:106
NaiveForeignTitleFactory
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
Definition: NaiveForeignTitleFactory.php:27
WikiImporter\$mPageOutCallback
$mPageOutCallback
Definition: WikiImporter.php:40
WikiImporter\setNoUpdates
setNoUpdates( $noupdates)
Set 'no updates' mode.
Definition: WikiImporter.php:146
WikiImporter\getReader
getReader()
Definition: WikiImporter.php:105
ExternalUserNames
Class to parse and build external user names.
Definition: ExternalUserNames.php:29
WikiImporter\processLogItem
processLogItem( $logInfo)
Definition: WikiImporter.php:702
WikiImporter\handleRevision
handleRevision(&$pageInfo)
Definition: WikiImporter.php:830
WikiImporter\setRevisionCallback
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
Definition: WikiImporter.php:201
wfMessage
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Definition: GlobalFunctions.php:1198
WikiImporter\handleContributor
handleContributor()
Definition: WikiImporter.php:1054
WikiImporter\setUsernamePrefix
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
Definition: WikiImporter.php:324
ImportReporter
Reporting callback.
Definition: ImportReporter.php:27
WikiImporter\$externalUserNames
ExternalUserNames $externalUserNames
Definition: WikiImporter.php:54
NamespaceImportTitleFactory
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Definition: NamespaceImportTitleFactory.php:28
WikiImporter\nodeContents
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element,...
Definition: WikiImporter.php:539
WikiImporter\siteInfoCallback
siteInfoCallback( $siteInfo)
Notify the callback function of site info.
Definition: WikiImporter.php:460
NS_MAIN
const NS_MAIN
Definition: Defines.php:69
Config
Interface for configuration instances.
Definition: Config.php:28
MWException
MediaWiki exception.
Definition: MWException.php:26
ImportTitleFactory
Represents an object that can convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local wiki (Title objects).
Definition: ImportTitleFactory.php:25
WikiPage\factory
static factory(Title $title)
Create a WikiPage object of the appropriate class for the given title.
Definition: WikiPage.php:143
WikiImporter\dumpTemp
dumpTemp( $contents)
Definition: WikiImporter.php:1004
WikiImporter\$countableCache
array $countableCache
Definition: WikiImporter.php:50
WikiImporter\pageOutCallback
pageOutCallback( $title, $foreignTitle, $revCount, $sucCount, $pageInfo)
Notify the callback function when a "</page>" is closed.
Definition: WikiImporter.php:487
WikiImporter\__construct
__construct(ImportSource $source, Config $config)
Creates an ImportXMLReader drawing from the source provided.
Definition: WikiImporter.php:62
MWContentSerializationException
Exception representing a failure to serialize or unserialize a content object.
Definition: MWContentSerializationException.php:7
WikiImporter\throwXmlError
throwXmlError( $err)
Definition: WikiImporter.php:109
SubpageImportTitleFactory
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local wiki (Title objects), placing all pages as subpages of a given root page.
Definition: SubpageImportTitleFactory.php:28
$title
$title
Definition: testCompression.php:38
SiteStatsUpdate\factory
static factory(array $deltas)
Definition: SiteStatsUpdate.php:71
WikiImporter\finishImportPage
finishImportPage( $title, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
Definition: WikiImporter.php:405
wfTimestampNow
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
Definition: GlobalFunctions.php:1835
WikiImporter\$importTitleFactory
ImportTitleFactory $importTitleFactory
Definition: WikiImporter.php:48
WikiImporter\processUpload
processUpload( $pageInfo, $uploadInfo)
Definition: WikiImporter.php:1015
WikiImporter\disableStatisticsUpdate
disableStatisticsUpdate()
Statistics updates can take a lot of time.
Definition: WikiImporter.php:332
WikiImporter\setImportUploads
setImportUploads( $import)
Definition: WikiImporter.php:315
WikiImporter\$mNoUpdates
$mNoUpdates
Definition: WikiImporter.php:43
wfDebug
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
Definition: GlobalFunctions.php:913
WikiImporter\$mPageCallback
$mPageCallback
Definition: WikiImporter.php:39
WikiImporter\$mSiteInfoCallback
$mSiteInfoCallback
Definition: WikiImporter.php:40
$content
$content
Definition: router.php:76
WikiImporter\beforeImportPage
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
Definition: WikiImporter.php:342
StatusValue\newGood
static newGood( $value=null)
Factory function for good results.
Definition: StatusValue.php:81
WikiImporter\importRevision
importRevision( $revision)
Default per-revision callback, performs the import.
Definition: WikiImporter.php:354
WikiImporter\doImport
doImport()
Primary entry point.
Definition: WikiImporter.php:566
WikiImporter\processTitle
processTitle( $text, $ns=null)
Definition: WikiImporter.php:1082
WikiImporter\$mImageBasePath
$mImageBasePath
Definition: WikiImporter.php:42
WikiImporter\$foreignNamespaces
$foreignNamespaces
Definition: WikiImporter.php:38
WikiImporter\setDebug
setDebug( $debug)
Set debug mode...
Definition: WikiImporter.php:138
WikiImporter\notice
notice( $msg,... $params)
Definition: WikiImporter.php:124
RequestContext\getMain
static getMain()
Get the RequestContext object associated with the main request.
Definition: RequestContext.php:451
WikiImporter\handleUpload
handleUpload(&$pageInfo)
Definition: WikiImporter.php:949
WikiImporter\setUploadCallback
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
Definition: WikiImporter.php:212
WikiImporter\warn
warn( $data)
Definition: WikiImporter.php:120
WikiImporter\handleSiteInfo
handleSiteInfo()
Definition: WikiImporter.php:639
WikiImporter\setTargetRootPage
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
Definition: WikiImporter.php:276
WikiImporter\$mNoticeCallback
$mNoticeCallback
Definition: WikiImporter.php:41
$debug
$debug
Definition: Setup.php:673
wfTempDir
wfTempDir()
Tries to get the system directory for temporary files.
Definition: GlobalFunctions.php:1873
WikiRevision
Represents a revision, log entry or upload during the import process.
Definition: WikiRevision.php:37
WikiImporter\debug
debug( $data)
Definition: WikiImporter.php:114
WikiImporter\importLogItem
importLogItem( $revision)
Default per-log-item callback, performs the import.
Definition: WikiImporter.php:383
WikiImporter\importUpload
importUpload( $revision)
Dummy for now...
Definition: WikiImporter.php:392
WikiImporter\setPageOffset
setPageOffset( $nthPage)
Sets 'pageOffset' value.
Definition: WikiImporter.php:156
WikiImporter\handleLogItem
handleLogItem()
Definition: WikiImporter.php:666
ImportSource
Source interface for XML import.
Definition: ImportSource.php:32
$path
$path
Definition: NoLocalSettings.php:25
WikiImporter\$pageOffset
$pageOffset
Definition: WikiImporter.php:44
WikiImporter\$config
Config $config
Definition: WikiImporter.php:46
WikiImporter\setSiteInfoCallback
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
Definition: WikiImporter.php:234
WikiImporter\setPageOutCallback
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
Definition: WikiImporter.php:190
WikiImporter\debugRevisionHandler
debugRevisionHandler(&$revision)
Alternate per-revision callback, for debugging.
Definition: WikiImporter.php:442
WikiImporter\$disableStatisticsUpdate
bool $disableStatisticsUpdate
Definition: WikiImporter.php:52
$source
$source
Definition: mwdoc-filter.php:34
WikiImporter\$reader
XMLReader $reader
Definition: WikiImporter.php:37
WikiImporter\handlePage
handlePage()
Suppresses PhanTypeInvalidDimOffset: Phan does not read the reference inside the hook.
Definition: WikiImporter.php:746
WikiImporter\nodeAttribute
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
Definition: WikiImporter.php:528
WikiImporter\setPageCallback
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
Definition: WikiImporter.php:175
WikiImporter\setImportTitleFactory
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
Definition: WikiImporter.php:245
WikiImporter\setTargetNamespace
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
Definition: WikiImporter.php:254
WikiImporter\logItemCallback
logItemCallback( $revision)
Notify the callback function of a new log item.
Definition: WikiImporter.php:513
Hooks\run
static run( $event, array $args=[], $deprecatedVersion=null)
Call hook functions defined in Hooks::register and $wgHooks.
Definition: Hooks.php:133
WikiImporter\pageCallback
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
Definition: WikiImporter.php:473
WikiImporter\$mLogItemCallback
$mLogItemCallback
Definition: WikiImporter.php:39
WikiImporter\$mDebug
$mDebug
Definition: WikiImporter.php:41
WikiImporter\setLogItemCallback
setLogItemCallback( $callback)
Sets the action to perform as each log item is reached.
Definition: WikiImporter.php:223
$type
$type
Definition: testCompression.php:52