MediaWiki  master
WikiImporter.php
Go to the documentation of this file.
1 <?php
28 
35 class WikiImporter {
36  private $reader = null;
42  private $mNoUpdates = false;
43  private $pageOffset = 0;
45  private $config;
49  private $countableCache = [];
51  private $disableStatisticsUpdate = false;
54 
/**
 * Creates an ImportXMLReader drawing from the source provided.
 *
 * Opens the import source through the 'uploadsource' stream wrapper and
 * installs the default page/revision/upload/log item callbacks.
 *
 * @param ImportSource $source
 * @param Config $config
 * @throws Exception If the XMLReader class is not available
 * @throws MWException If the XML reader cannot open the source
 */
public function __construct( ImportSource $source, Config $config ) {
	if ( !class_exists( 'XMLReader' ) ) {
		throw new Exception( 'Import requires PHP to have been compiled with libxml support' );
	}

	$this->reader = new XMLReader();
	$this->config = $config;

	if ( !in_array( 'uploadsource', stream_get_wrappers() ) ) {
		stream_wrapper_register( 'uploadsource', UploadSourceAdapter::class );
	}
	$id = UploadSourceAdapter::registerSource( $source );

	// Enable the entity loader, as it is needed for loading external URLs via
	// XMLReader::open (T86036)
	$oldDisable = libxml_disable_entity_loader( false );
	if ( defined( 'LIBXML_PARSEHUGE' ) ) {
		$status = $this->reader->open( "uploadsource://$id", null, LIBXML_PARSEHUGE );
	} else {
		$status = $this->reader->open( "uploadsource://$id" );
	}
	if ( !$status ) {
		$error = libxml_get_last_error();
		libxml_disable_entity_loader( $oldDisable );
		// libxml_get_last_error() returns false when no error was recorded,
		// so do not dereference it blindly.
		$detail = $error ? $error->message : 'unknown libxml error';
		throw new MWException( 'Encountered an internal error while initializing WikiImporter object: ' .
			$detail );
	}
	libxml_disable_entity_loader( $oldDisable );

	// Default callbacks
	$this->setPageCallback( [ $this, 'beforeImportPage' ] );
	$this->setRevisionCallback( [ $this, "importRevision" ] );
	$this->setUploadCallback( [ $this, 'importUpload' ] );
	$this->setLogItemCallback( [ $this, 'importLogItem' ] );
	$this->setPageOutCallback( [ $this, 'finishImportPage' ] );

	$this->importTitleFactory = new NaiveImportTitleFactory();
	$this->externalUserNames = new ExternalUserNames( 'imported', false );
}
100 
/**
 * Gives access to the XML reader this importer reads from.
 *
 * @return null|XMLReader
 */
public function getReader() {
	$xmlReader = $this->reader;

	return $xmlReader;
}
107 
/**
 * Reports an XML error to the import debug log and to wfDebug().
 *
 * Despite its name this method only logs; it does not throw.
 *
 * @param string $err
 */
public function throwXmlError( $err ) {
	$failure = "FAILURE: $err";
	$this->debug( $failure );

	$logLine = "WikiImporter XML error: $err\n";
	wfDebug( $logLine );
}
112 
/**
 * Writes a line to the debug log, but only while debug mode is enabled.
 *
 * @param string $data
 */
public function debug( $data ) {
	if ( !$this->mDebug ) {
		return;
	}

	wfDebug( "IMPORT: $data\n" );
}
118 
/**
 * Writes a warning line to the debug log, regardless of debug mode.
 *
 * @param string $data
 */
public function warn( $data ) {
	$line = "IMPORT: $data\n";
	wfDebug( $line );
}
122 
/**
 * Emits a notice, either through the registered notice callback or,
 * when none is callable, through wfDebug().
 *
 * @param string $msg Message key
 * @param mixed ...$params Message parameters
 */
public function notice( $msg, ...$params ) {
	if ( !is_callable( $this->mNoticeCallback ) ) {
		# No ImportReporter -> CLI
		// T177997: the command line importers should call setNoticeCallback()
		// for their own custom callback to echo the notice
		wfDebug( wfMessage( $msg, $params )->text() . "\n" );
		return;
	}

	call_user_func( $this->mNoticeCallback, $msg, $params );
}
132 
/**
 * Switches debug mode on or off; debug() only logs while it is on.
 *
 * @param bool $debug
 */
public function setDebug( $debug ) {
	$this->mDebug = $debug;
}
140 
/**
 * Set 'no updates' mode. The flag is forwarded to every WikiRevision
 * this importer builds.
 *
 * @param bool $noupdates
 */
public function setNoUpdates( $noupdates ) {
	$this->mNoUpdates = $noupdates;
}
148 
/**
 * Sets the page offset; doImport() skips top-level nodes until this many
 * <page> tags have been counted.
 *
 * @param int $nthPage
 */
public function setPageOffset( $nthPage ) {
	$this->pageOffset = $nthPage;
}
158 
/**
 * Registers the callback that notice() invokes.
 *
 * The assignment is delegated to wfSetVar(), whose result is returned
 * (presumably the previously registered callback -- confirm against
 * wfSetVar()).
 *
 * @param callable $callback
 * @return mixed
 */
public function setNoticeCallback( $callback ) {
	$result = wfSetVar( $this->mNoticeCallback, $callback );

	return $result;
}
168 
/**
 * Sets the action to perform as each new page in the stream is reached.
 *
 * @param callable $callback
 * @return callable The callback that was registered before this call
 */
public function setPageCallback( $callback ) {
	$replaced = $this->mPageCallback;
	$this->mPageCallback = $callback;

	return $replaced;
}
179 
/**
 * Sets the action to perform when a page has been completely read.
 *
 * The callback receives the arguments handlePage() passes to
 * pageOutCallback(): the title, the foreign title, the revision count,
 * the successful revision count and the page info array.
 *
 * @param callable $callback
 * @return callable The callback that was registered before this call
 */
public function setPageOutCallback( $callback ) {
	$replaced = $this->mPageOutCallback;
	$this->mPageOutCallback = $callback;

	return $replaced;
}
194 
/**
 * Sets the action to perform as each page revision is reached.
 *
 * @param callable $callback
 * @return callable The callback that was registered before this call
 */
public function setRevisionCallback( $callback ) {
	$replaced = $this->mRevisionCallback;
	$this->mRevisionCallback = $callback;

	return $replaced;
}
205 
/**
 * Sets the action to perform as each file upload version is reached.
 *
 * @param callable $callback
 * @return callable The callback that was registered before this call
 */
public function setUploadCallback( $callback ) {
	$replaced = $this->mUploadCallback;
	$this->mUploadCallback = $callback;

	return $replaced;
}
216 
/**
 * Sets the action to perform as each log item is reached.
 *
 * @param callable $callback
 * @return callable The callback that was registered before this call
 */
public function setLogItemCallback( $callback ) {
	$replaced = $this->mLogItemCallback;
	$this->mLogItemCallback = $callback;

	return $replaced;
}
227 
/**
 * Sets the action to perform when site info is encountered.
 *
 * @param callable $callback
 * @return callable The callback that was registered before this call
 */
public function setSiteInfoCallback( $callback ) {
	$replaced = $this->mSiteInfoCallback;
	$this->mSiteInfoCallback = $callback;

	return $replaced;
}
238 
/**
 * Sets the factory object to use to convert ForeignTitle objects into
 * local Title objects.
 *
 * @param ImportTitleFactory $factory
 */
public function setImportTitleFactory( $factory ) {
	$this->importTitleFactory = $factory;
}
247 
/**
 * Sets the namespace that imported pages are placed into.
 *
 * @param null|int $namespace Target namespace, or null to keep the
 *   namespaces found in the dump
 * @return bool False if $namespace is negative or does not exist
 */
public function setTargetNamespace( $namespace ) {
	if ( $namespace === null ) {
		// Don't override namespaces: go back to the factory the constructor
		// installs, so a previously set target namespace is really undone.
		$this->setImportTitleFactory( new NaiveImportTitleFactory() );
		return true;
	}

	if (
		$namespace >= 0 &&
		MediaWikiServices::getInstance()->getNamespaceInfo()->exists( intval( $namespace ) )
	) {
		$namespace = intval( $namespace );
		$this->setImportTitleFactory( new NamespaceImportTitleFactory( $namespace ) );
		return true;
	}

	return false;
}
269 
/**
 * Sets a page under which all imported pages are placed as subpages.
 *
 * @param null|string $rootpage Root page title, or null for no root page.
 *   An empty string leaves the current configuration untouched.
 * @return Status Good on success; fatal if the title is invalid or its
 *   namespace does not support subpages
 */
public function setTargetRootPage( $rootpage ) {
	// Must exist before any branch below reports into it or it is returned.
	$status = Status::newGood();

	if ( $rootpage === null ) {
		// No rootpage
		$this->setImportTitleFactory( new NaiveImportTitleFactory() );
	} elseif ( $rootpage !== '' ) {
		$rootpage = rtrim( $rootpage, '/' ); // avoid double slashes
		$title = Title::newFromText( $rootpage );

		if ( !$title || $title->isExternal() ) {
			$status->fatal( 'import-rootpage-invalid' );
		} elseif (
			!MediaWikiServices::getInstance()->getNamespaceInfo()->
				hasSubpages( $title->getNamespace() )
		) {
			$displayNSText = $title->getNamespace() == NS_MAIN
				? wfMessage( 'blanknamespace' )->text()
				: MediaWikiServices::getInstance()->getContentLanguage()->
					getNsText( $title->getNamespace() );
			$status->fatal( 'import-rootpage-nosubpage', $displayNSText );
		} else {
			// set namespace to 'all', so the namespace check in processTitle() can pass
			$this->setTargetNamespace( null );
			// Actually apply the validated root page.
			$this->setImportTitleFactory( new SubpageImportTitleFactory( $title ) );
		}
	}

	return $status;
}
303 
/**
 * Sets the directory that handleUpload() resolves an upload's 'rel'
 * path against.
 *
 * @param string $dir
 */
public function setImageBasePath( $dir ) {
	$this->mImageBasePath = $dir;
}
310 
/**
 * Controls whether handleUpload() passes parsed uploads on to
 * processUpload().
 *
 * @param bool $import
 */
public function setImportUploads( $import ) {
	$this->mImportUploads = $import;
}
317 
/**
 * Replaces the ExternalUserNames helper used to prefix imported user names.
 *
 * @param string $usernamePrefix
 * @param bool $assignKnownUsers
 */
public function setUsernamePrefix( $usernamePrefix, $assignKnownUsers ) {
	$names = new ExternalUserNames( $usernamePrefix, $assignKnownUsers );
	$this->externalUserNames = $names;
}
326 
/**
 * Turns off the article count adjustment that finishImportPage()
 * otherwise performs for every imported page.
 */
public function disableStatisticsUpdate() {
	$this->disableStatisticsUpdate = true;
}
334 
/**
 * Default per-page callback, performs no real work beyond bookkeeping.
 *
 * Records whether the page is countable before the import touches it,
 * so finishImportPage() can detect a change.
 *
 * @param array $titleAndForeignTitle Two-element array, local title first
 * @return bool Always true
 */
public function beforeImportPage( $titleAndForeignTitle ) {
	$localTitle = $titleAndForeignTitle[0];
	$wikiPage = WikiPage::factory( $localTitle );

	$cacheKey = 'title_' . $localTitle->getPrefixedText();
	$this->countableCache[$cacheKey] = $wikiPage->isCountable();

	return true;
}
347 
/**
 * Default per-revision callback, performs the import.
 *
 * @param WikiRevision $revision
 * @return bool Result of WikiRevision::importOldRevision(), or false when
 *   the content model cannot be used on the title or the content cannot
 *   be unserialized
 */
public function importRevision( $revision ) {
	$contentHandler = $revision->getContentHandler();
	if ( !$contentHandler->canBeUsedOn( $revision->getTitle() ) ) {
		$this->notice( 'import-error-bad-location',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat() );

		return false;
	}

	try {
		$imported = $revision->importOldRevision();
	} catch ( MWContentSerializationException $ex ) {
		$this->notice( 'import-error-unserialize',
			$revision->getTitle()->getPrefixedText(),
			$revision->getID(),
			$revision->getModel(),
			$revision->getFormat() );
		$imported = false;
	}

	return $imported;
}
376 
/**
 * Default per-log-item callback, performs the import.
 *
 * @param WikiRevision $revision
 * @return mixed Whatever WikiRevision::importLogItem() returns
 */
public function importLogItem( $revision ) {
	$result = $revision->importLogItem();

	return $result;
}
385 
/**
 * Default per-upload callback, performs the import.
 *
 * @param WikiRevision $revision
 * @return mixed Whatever WikiRevision::importUpload() returns
 */
public function importUpload( $revision ) {
	$result = $revision->importUpload();

	return $result;
}
394 
/**
 * Default page-out callback: adjusts the article count statistics and
 * runs the AfterImportPage hook.
 *
 * @param Title $title
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sRevCount
 * @param array $pageInfo
 * @return bool Result of the AfterImportPage hook
 */
public function finishImportPage( $title, $foreignTitle, $revCount,
	$sRevCount, $pageInfo
) {
	// Update article count statistics (T42009)
	// The normal counting logic in WikiPage->doEditUpdates() is designed for
	// one-revision-at-a-time editing, not bulk imports. In this situation it
	// suffers from issues of replica DB lag. We let WikiPage handle the total page
	// and revision count, and we implement our own custom logic for the
	// article (content page) count.
	if ( !$this->disableStatisticsUpdate ) {
		$page = WikiPage::factory( $title );
		$page->loadPageData( 'fromdbmaster' );
		$content = $page->getContent();
		if ( $content === null ) {
			wfDebug( __METHOD__ . ': Skipping article count adjustment for ' . $title .
				' because WikiPage::getContent() returned null' );
		} else {
			$editInfo = $page->prepareContentForEdit( $content );
			$countKey = 'title_' . $title->getPrefixedText();
			$countable = $page->isCountable( $editInfo );
			if ( array_key_exists( $countKey, $this->countableCache ) &&
				$countable != $this->countableCache[$countKey] ) {
				// +1 if the page became countable, -1 if it stopped being so.
				DeferredUpdates::addUpdate( SiteStatsUpdate::factory( [
					'articles' => ( (int)$countable - (int)$this->countableCache[$countKey] )
				] ) );
			}
		}
	}

	return Hooks::run( 'AfterImportPage', func_get_args() );
}
435 
/**
 * Alternate per-revision callback that only dumps the revision's fields
 * to the debug log.
 *
 * @param object &$revision Object exposing title, user_text, timestamp,
 *   comment and text properties
 */
public function debugRevisionHandler( &$revision ) {
	$this->debug( "Got revision:" );

	if ( !is_object( $revision->title ) ) {
		$this->debug( "-- Title: <invalid>" );
	} else {
		$this->debug( "-- Title: " . $revision->title->getPrefixedText() );
	}

	$userLine = "-- User: " . $revision->user_text;
	$this->debug( $userLine );
	$timestampLine = "-- Timestamp: " . $revision->timestamp;
	$this->debug( $timestampLine );
	$commentLine = "-- Comment: " . $revision->comment;
	$this->debug( $commentLine );
	$textLine = "-- Text: " . $revision->text;
	$this->debug( $textLine );
}
452 
/**
 * Notify the callback function of site info.
 *
 * @param array $siteInfo
 * @return mixed The callback's result, or false if no callback is set
 */
private function siteInfoCallback( $siteInfo ) {
	if ( !isset( $this->mSiteInfoCallback ) ) {
		return false;
	}

	$arguments = [ $siteInfo, $this ];

	return call_user_func_array( $this->mSiteInfoCallback, $arguments );
}
466 
/**
 * Notify the callback function when a new page is reached; does nothing
 * if no page callback is set.
 *
 * @param array $title Value handed over unchanged to the callback
 */
public function pageCallback( $title ) {
	if ( !isset( $this->mPageCallback ) ) {
		return;
	}

	call_user_func( $this->mPageCallback, $title );
}
476 
/**
 * Notify the callback function when a page has been fully read; does
 * nothing if no page-out callback is set.
 *
 * All arguments are forwarded to the callback as received.
 *
 * @param Title $title
 * @param ForeignTitle $foreignTitle
 * @param int $revCount
 * @param int $sucCount
 * @param array $pageInfo
 */
private function pageOutCallback( $title, $foreignTitle, $revCount,
	$sucCount, $pageInfo
) {
	if ( !isset( $this->mPageOutCallback ) ) {
		return;
	}

	$forwarded = func_get_args();
	call_user_func_array( $this->mPageOutCallback, $forwarded );
}
491 
/**
 * Notify the callback function of a revision.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's result, or false if no callback is set
 */
private function revisionCallback( $revision ) {
	if ( !isset( $this->mRevisionCallback ) ) {
		return false;
	}

	$arguments = [ $revision, $this ];

	return call_user_func_array( $this->mRevisionCallback, $arguments );
}
505 
/**
 * Notify the callback function of a new log item.
 *
 * @param WikiRevision $revision
 * @return mixed The callback's result, or false if no callback is set
 */
private function logItemCallback( $revision ) {
	if ( !isset( $this->mLogItemCallback ) ) {
		return false;
	}

	$arguments = [ $revision, $this ];

	return call_user_func_array( $this->mLogItemCallback, $arguments );
}
519 
/**
 * Retrieves the contents of the named attribute of the current element.
 *
 * @param string $attr The name of the attribute
 * @return mixed Whatever XMLReader::getAttribute() returns for that name
 */
public function nodeAttribute( $attr ) {
	$value = $this->reader->getAttribute( $attr );

	return $value;
}
529 
/**
 * Reads forward from the current element and returns its text.
 *
 * Text, CDATA and significant-whitespace nodes are concatenated until the
 * first end-element node is reached. If the input runs out before that,
 * the reader is closed and an empty string is returned.
 *
 * @return string
 */
public function nodeContents() {
	if ( $this->reader->isEmptyElement ) {
		return "";
	}

	$textNodeTypes = [
		XMLReader::TEXT,
		XMLReader::CDATA,
		XMLReader::SIGNIFICANT_WHITESPACE,
	];
	$collected = "";

	while ( $this->reader->read() ) {
		$nodeType = $this->reader->nodeType;

		if ( $nodeType == XMLReader::END_ELEMENT ) {
			return $collected;
		}
		if ( in_array( $nodeType, $textNodeTypes ) ) {
			$collected .= $this->reader->value;
		}
	}

	// Ran out of input without seeing an end tag.
	$this->reader->close();
	return '';
}
557 
/**
 * Primary entry point: reads the dump and dispatches each top-level
 * <siteinfo>, <page> and <logitem> element to its handler.
 *
 * @throws MWException If the document does not start with <mediawiki>
 * @return bool Always true; failures surface as exceptions
 */
public function doImport() {
	// Calls to reader->read need to be wrapped in calls to
	// libxml_disable_entity_loader() to avoid local file
	// inclusion attacks (T48932).
	$oldDisable = libxml_disable_entity_loader( true );
	$this->reader->read();

	if ( $this->reader->localName != 'mediawiki' ) {
		libxml_disable_entity_loader( $oldDisable );
		throw new MWException( "Expected <mediawiki> tag, got " .
			$this->reader->localName );
	}

	// try/finally rather than catch-and-rethrow of Exception, so the entity
	// loader state is restored and the reader closed for Error throwables too.
	try {
		$this->debug( "<mediawiki> tag is correct." );

		$this->debug( "Starting primary dump processing loop." );

		$keepReading = $this->reader->read();
		$skip = false;
		$pageCount = 0;

		while ( $keepReading ) {
			$tag = $this->reader->localName;
			if ( $this->pageOffset ) {
				if ( $tag === 'page' ) {
					$pageCount++;
				}
				if ( $pageCount < $this->pageOffset ) {
					// Still before the requested page: jump over this subtree.
					$keepReading = $this->reader->next();
					continue;
				}
			}
			$type = $this->reader->nodeType;

			if ( !Hooks::run( 'ImportHandleToplevelXMLTag', [ $this ] ) ) {
				// Do nothing
			} elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
				break;
			} elseif ( $tag == 'siteinfo' ) {
				$this->handleSiteInfo();
			} elseif ( $tag == 'page' ) {
				$this->handlePage();
			} elseif ( $tag == 'logitem' ) {
				$this->handleLogItem();
			} elseif ( $tag != '#text' ) {
				$this->warn( "Unhandled top-level XML tag $tag" );

				$skip = true;
			}

			if ( $skip ) {
				$keepReading = $this->reader->next();
				$skip = false;
				$this->debug( "Skip" );
			} else {
				$keepReading = $this->reader->read();
			}
		}
	} finally {
		libxml_disable_entity_loader( $oldDisable );
		$this->reader->close();
	}

	return true;
}
636 
/**
 * Reads a <siteinfo> element, records the foreign namespaces it lists and
 * passes the collected data to the site info callback.
 */
private function handleSiteInfo() {
	$this->debug( "Enter site info handler." );
	$collected = [];

	// Tags whose text is copied into the result under the tag's own name
	$plainTags = [ 'sitename', 'base', 'generator', 'case' ];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'siteinfo';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( $tag == 'namespace' ) {
			// Read the key before nodeContents() advances the reader.
			$namespaceKey = $this->nodeAttribute( 'key' );
			$this->foreignNamespaces[$namespaceKey] = $this->nodeContents();
		} elseif ( in_array( $tag, $plainTags ) ) {
			$collected[$tag] = $this->nodeContents();
		}
	}

	$collected['_namespaces'] = $this->foreignNamespaces;
	$this->siteInfoCallback( $collected );
}
663 
/**
 * Reads a <logitem> element into an array and hands it to
 * processLogItem().
 */
private function handleLogItem() {
	$this->debug( "Enter log item handler." );
	$item = [];

	// Tags whose text is copied into the result under the tag's own name
	$plainTags = [ 'id', 'comment', 'type', 'action', 'timestamp',
		'logtitle', 'params' ];

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'logitem';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;

		// A hook handler returning false claims the tag for itself.
		if ( !Hooks::run( 'ImportHandleLogItemXMLTag', [ $this, $item ] ) ) {
			continue;
		}

		if ( in_array( $tag, $plainTags ) ) {
			$item[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$item['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled log-item XML tag $tag" );
		}
	}

	$this->processLogItem( $item );
}
695 
/**
 * Builds a WikiRevision from parsed log item data and passes it to the
 * log item callback.
 *
 * @param array $logInfo Data collected by handleLogItem()
 * @return mixed Result of the log item callback, or false if none is set
 */
private function processLogItem( $logInfo ) {
	$revision = new WikiRevision( $this->config );

	if ( isset( $logInfo['id'] ) ) {
		$revision->setID( $logInfo['id'] );
	}
	// A log item without <type> or <action> still passes null, as before,
	// but no longer reads an undefined array key to get it.
	$revision->setType( $logInfo['type'] ?? null );
	$revision->setAction( $logInfo['action'] ?? null );
	if ( isset( $logInfo['timestamp'] ) ) {
		$revision->setTimestamp( $logInfo['timestamp'] );
	}
	if ( isset( $logInfo['params'] ) ) {
		$revision->setParams( $logInfo['params'] );
	}
	if ( isset( $logInfo['logtitle'] ) ) {
		// @todo Using Title for non-local titles is a recipe for disaster.
		// We should use ForeignTitle here instead.
		$revision->setTitle( Title::newFromText( $logInfo['logtitle'] ) );
	}

	$revision->setNoUpdates( $this->mNoUpdates );

	if ( isset( $logInfo['comment'] ) ) {
		$revision->setComment( $logInfo['comment'] );
	}

	if ( isset( $logInfo['contributor']['ip'] ) ) {
		$revision->setUserIP( $logInfo['contributor']['ip'] );
	}

	if ( !isset( $logInfo['contributor']['username'] ) ) {
		$revision->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	} else {
		$revision->setUsername(
			$this->externalUserNames->applyPrefix( $logInfo['contributor']['username'] )
		);
	}

	return $this->logItemCallback( $revision );
}
740 
/**
 * Reads a <page> element: collects the page-level fields, resolves the
 * title when the first revision or upload is reached, dispatches each
 * revision/upload to its handler and finally fires the page-out callback.
 */
private function handlePage() {
	// Handle page data.
	$this->debug( "Enter page handler." );
	$pageInfo = [ 'revisionCount' => 0, 'successfulRevisionCount' => 0 ];

	// Tags whose value is copied into $pageInfo under the tag's own name
	$plainTags = [ 'title', 'ns', 'id', 'redirect', 'restrictions' ];

	$skip = false;
	$badTitle = false;

	while ( true ) {
		// next() jumps over the current subtree, read() descends into it.
		$advanced = $skip ? $this->reader->next() : $this->reader->read();
		if ( !$advanced ) {
			break;
		}
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'page'
		) {
			break;
		}

		$skip = false;

		$tag = $this->reader->localName;

		if ( $badTitle ) {
			// The title is invalid, bail out of this page
			$skip = true;
			continue;
		}

		// A hook handler returning false claims the tag for itself.
		if ( !Hooks::run( 'ImportHandlePageXMLTag', [ $this, &$pageInfo ] ) ) {
			continue;
		}

		if ( in_array( $tag, $plainTags ) ) {
			// An XML snippet:
			// <page>
			//     <id>123</id>
			//     <title>Page</title>
			//     <redirect title="NewTitle"/>
			//     ...
			// Because the redirect tag is built differently, its value comes
			// from the 'title' attribute rather than from the element text.
			if ( $tag == 'redirect' ) {
				$pageInfo[$tag] = $this->nodeAttribute( 'title' );
			} else {
				$pageInfo[$tag] = $this->nodeContents();
			}
			continue;
		}

		if ( $tag == 'revision' || $tag == 'upload' ) {
			if ( !isset( $title ) ) {
				$title = $this->processTitle( $pageInfo['title'],
					$pageInfo['ns'] ?? null );

				// $title is either an array of two titles or false.
				if ( is_array( $title ) ) {
					$this->pageCallback( $title );
					list( $pageInfo['_title'], $foreignTitle ) = $title;
				} else {
					$badTitle = true;
					$skip = true;
				}
			}

			if ( $title ) {
				if ( $tag == 'revision' ) {
					$this->handleRevision( $pageInfo );
				} else {
					$this->handleUpload( $pageInfo );
				}
			}
			continue;
		}

		if ( $tag != '#text' ) {
			$this->warn( "Unhandled page XML tag $tag" );
			$skip = true;
		}
	}

	// @note $pageInfo['_title'] is only set if a valid $title was processed
	// above with no error. In that case pageCallback was called above and
	// $foreignTitle is set too, since both come from $title; otherwise
	// neither exists and the page-out callback is skipped.
	if ( array_key_exists( '_title', $pageInfo ) ) {
		$this->pageOutCallback( $pageInfo['_title'], $foreignTitle,
			$pageInfo['revisionCount'],
			$pageInfo['successfulRevisionCount'],
			$pageInfo );
	}
}
821 
/**
 * Reads a <revision> element, hands it to processRevision() and updates
 * the page's revision counters.
 *
 * @param array &$pageInfo Page data; 'revisionCount' is always incremented,
 *   'successfulRevisionCount' only if processRevision() returns a truthy value
 */
private function handleRevision( &$pageInfo ) {
	$this->debug( "Enter revision handler" );
	$parsed = [];

	// Tags whose text is copied into the result under the tag's own name
	$plainTags = [ 'id', 'timestamp', 'comment', 'minor', 'model', 'format', 'text', 'sha1' ];

	$skip = false;

	while ( true ) {
		// next() jumps over the current subtree, read() descends into it.
		$advanced = $skip ? $this->reader->next() : $this->reader->read();
		if ( !$advanced ) {
			break;
		}
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'revision'
		) {
			break;
		}

		$tag = $this->reader->localName;

		// A hook handler returning false claims the tag for itself.
		if ( !Hooks::run( 'ImportHandleRevisionXMLTag', [ $this, $pageInfo, $parsed ] ) ) {
			continue;
		}

		if ( in_array( $tag, $plainTags ) ) {
			$parsed[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$parsed['contributor'] = $this->handleContributor();
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled revision XML tag $tag" );
			$skip = true;
		}
	}

	$pageInfo['revisionCount']++;
	$succeeded = $this->processRevision( $pageInfo, $parsed );
	if ( $succeeded ) {
		$pageInfo['successfulRevisionCount']++;
	}
}
860 
/**
 * Builds a WikiRevision from parsed revision data and passes it to the
 * revision callback.
 *
 * @param array $pageInfo Page data; '_title' must be set
 * @param array $revisionInfo Data collected by handleRevision()
 * @throws MWException If the revision text exceeds $wgMaxArticleSize
 * @return mixed Result of the revision callback, or false if none is set
 */
private function processRevision( $pageInfo, $revisionInfo ) {
	global $wgMaxArticleSize;

	// Make sure revisions won't violate $wgMaxArticleSize, which could lead to
	// database errors and instability. Testing for revisions with only listed
	// content models, as other content models might use serialization formats
	// which aren't checked against $wgMaxArticleSize.
	// A revision without a <text> element counts as empty text here instead
	// of reading an undefined array key.
	if ( ( !isset( $revisionInfo['model'] ) ||
		in_array( $revisionInfo['model'], [
			'wikitext',
			'css',
			'json',
			'javascript',
			'text',
			''
		] ) ) &&
		strlen( $revisionInfo['text'] ?? '' ) > $wgMaxArticleSize * 1024
	) {
		throw new MWException( 'The text of ' .
			( isset( $revisionInfo['id'] ) ?
				"the revision with ID $revisionInfo[id]" :
				'a revision'
			) . " exceeds the maximum allowable size ($wgMaxArticleSize KB)" );
	}

	// FIXME: process schema version 11!
	$revision = new WikiRevision( $this->config );

	if ( isset( $revisionInfo['id'] ) ) {
		$revision->setID( $revisionInfo['id'] );
	}
	if ( isset( $revisionInfo['model'] ) ) {
		$revision->setModel( $revisionInfo['model'] );
	}
	if ( isset( $revisionInfo['format'] ) ) {
		$revision->setFormat( $revisionInfo['format'] );
	}
	$revision->setTitle( $pageInfo['_title'] );

	if ( isset( $revisionInfo['text'] ) ) {
		$handler = $revision->getContentHandler();
		$text = $handler->importTransform(
			$revisionInfo['text'],
			$revision->getFormat() );

		$revision->setText( $text );
	}
	$revision->setTimestamp( $revisionInfo['timestamp'] ?? wfTimestampNow() );

	if ( isset( $revisionInfo['comment'] ) ) {
		$revision->setComment( $revisionInfo['comment'] );
	}

	// The mere presence of <minor> marks the revision as minor.
	if ( isset( $revisionInfo['minor'] ) ) {
		$revision->setMinor( true );
	}
	if ( isset( $revisionInfo['contributor']['ip'] ) ) {
		$revision->setUserIP( $revisionInfo['contributor']['ip'] );
	} elseif ( isset( $revisionInfo['contributor']['username'] ) ) {
		$revision->setUsername(
			$this->externalUserNames->applyPrefix( $revisionInfo['contributor']['username'] )
		);
	} else {
		$revision->setUsername( $this->externalUserNames->addPrefix( 'Unknown user' ) );
	}
	if ( isset( $revisionInfo['sha1'] ) ) {
		$revision->setSha1Base36( $revisionInfo['sha1'] );
	}
	$revision->setNoUpdates( $this->mNoUpdates );

	return $this->revisionCallback( $revision );
}
939 
/**
 * Reads an <upload> element and, if upload importing is enabled, passes
 * the collected data to processUpload().
 *
 * @param array &$pageInfo Page data
 * @return mixed Result of processUpload(), or null when uploads are not
 *   being imported
 */
private function handleUpload( &$pageInfo ) {
	$this->debug( "Enter upload handler" );
	$uploadInfo = [];

	// Tags whose text is copied into the result under the tag's own name
	$normalFields = [ 'timestamp', 'comment', 'filename', 'text',
		'src', 'size', 'sha1base36', 'archivename', 'rel' ];

	$skip = false;

	while ( $skip ? $this->reader->next() : $this->reader->read() ) {
		if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'upload' ) {
			break;
		}

		$tag = $this->reader->localName;

		if ( !Hooks::run( 'ImportHandleUploadXMLTag', [
			$this, $pageInfo
		] ) ) {
			// Do nothing
		} elseif ( in_array( $tag, $normalFields ) ) {
			$uploadInfo[$tag] = $this->nodeContents();
		} elseif ( $tag == 'contributor' ) {
			$uploadInfo['contributor'] = $this->handleContributor();
		} elseif ( $tag == 'contents' ) {
			// Read the attribute while the reader is still on the <contents>
			// start tag; nodeContents() advances it to the end tag.
			$encoding = $this->reader->getAttribute( 'encoding' );
			$contents = $this->nodeContents();
			if ( $encoding === 'base64' ) {
				$uploadInfo['fileSrc'] = $this->dumpTemp( base64_decode( $contents ) );
				$uploadInfo['isTempSrc'] = true;
			}
		} elseif ( $tag != '#text' ) {
			$this->warn( "Unhandled upload XML tag $tag" );
			$skip = true;
		}
	}

	// A file found under the image base path takes precedence over any
	// inline contents written to a temporary file above.
	if ( $this->mImageBasePath && isset( $uploadInfo['rel'] ) ) {
		$path = "{$this->mImageBasePath}/{$uploadInfo['rel']}";
		if ( file_exists( $path ) ) {
			$uploadInfo['fileSrc'] = $path;
			$uploadInfo['isTempSrc'] = false;
		}
	}

	if ( $this->mImportUploads ) {
		return $this->processUpload( $pageInfo, $uploadInfo );
	}
}
994 
/**
 * Writes the given data to a newly created temporary file.
 *
 * @param string $contents
 * @return string Path returned by tempnam() for the file written
 */
private function dumpTemp( $contents ) {
	$tempPath = tempnam( wfTempDir(), 'importupload' );
	file_put_contents( $tempPath, $contents );

	return $tempPath;
}
1004 
/**
 * Builds a WikiRevision from parsed upload data and passes it to the
 * upload callback.
 *
 * @param array $pageInfo Page data; '_title' and 'id' are read
 * @param array $uploadInfo Data collected by handleUpload()
 * @return mixed Result of the upload callback
 */
private function processUpload( $pageInfo, $uploadInfo ) {
	$upload = new WikiRevision( $this->config );
	$description = $uploadInfo['text'] ?? '';

	$upload->setTitle( $pageInfo['_title'] );
	$upload->setID( $pageInfo['id'] );
	$upload->setTimestamp( $uploadInfo['timestamp'] );
	$upload->setText( $description );
	$upload->setFilename( $uploadInfo['filename'] );
	if ( isset( $uploadInfo['archivename'] ) ) {
		$upload->setArchiveName( $uploadInfo['archivename'] );
	}
	$upload->setSrc( $uploadInfo['src'] );
	if ( isset( $uploadInfo['fileSrc'] ) ) {
		$isTemporary = !empty( $uploadInfo['isTempSrc'] );
		$upload->setFileSrc( $uploadInfo['fileSrc'], $isTemporary );
	}
	if ( isset( $uploadInfo['sha1base36'] ) ) {
		$upload->setSha1Base36( $uploadInfo['sha1base36'] );
	}
	$upload->setSize( intval( $uploadInfo['size'] ) );
	$upload->setComment( $uploadInfo['comment'] );

	$contributor = 'contributor';
	if ( isset( $uploadInfo[$contributor]['ip'] ) ) {
		$upload->setUserIP( $uploadInfo[$contributor]['ip'] );
	}
	if ( isset( $uploadInfo[$contributor]['username'] ) ) {
		$prefixedName = $this->externalUserNames->applyPrefix(
			$uploadInfo[$contributor]['username']
		);
		$upload->setUsername( $prefixedName );
	}
	$upload->setNoUpdates( $this->mNoUpdates );

	return call_user_func( $this->mUploadCallback, $upload );
}
1045 
/**
 * Reads a <contributor> element.
 *
 * @return array Map of the 'id', 'ip' and 'username' child elements that
 *   were present; empty if the element itself is empty
 */
private function handleContributor() {
	$knownTags = [ 'id', 'ip', 'username' ];
	$contributor = [];

	if ( $this->reader->isEmptyElement ) {
		return $contributor;
	}

	while ( $this->reader->read() ) {
		$atClosingTag = $this->reader->nodeType == XMLReader::END_ELEMENT &&
			$this->reader->localName == 'contributor';
		if ( $atClosingTag ) {
			break;
		}

		$tag = $this->reader->localName;
		if ( !in_array( $tag, $knownTags ) ) {
			continue;
		}

		$contributor[$tag] = $this->nodeContents();
	}

	return $contributor;
}
1071 
/**
 * Converts a title from the dump into a local title and checks that the
 * page may be imported there.
 *
 * @param string $text Title text from the dump
 * @param string|null $ns Namespace ID from the dump, if any
 * @return array|bool [ local title, foreign title ], or false (after
 *   emitting a notice) if the page must not be imported
 */
private function processTitle( $text, $ns = null ) {
	$foreignTitleFactory = is_null( $this->foreignNamespaces )
		? new NaiveForeignTitleFactory()
		: new NamespaceAwareForeignTitleFactory( $this->foreignNamespaces );

	$foreignTitle = $foreignTitleFactory->createForeignTitle( $text, intval( $ns ) );
	$localTitle = $this->importTitleFactory->createTitleFromForeignTitle( $foreignTitle );

	$isCommandLine = $this->config->get( 'CommandLineMode' );

	if ( is_null( $localTitle ) ) {
		# Invalid page title? Ignore the page
		$this->notice( 'import-error-invalid', $foreignTitle->getFullText() );
		return false;
	}

	if ( $localTitle->isExternal() ) {
		$this->notice( 'import-error-interwiki', $localTitle->getPrefixedText() );
		return false;
	}

	if ( !$localTitle->canExist() ) {
		$this->notice( 'import-error-special', $localTitle->getPrefixedText() );
		return false;
	}

	if ( !$localTitle->userCan( 'edit' ) && !$isCommandLine ) {
		# Do not import if the importing wiki user cannot edit this page
		$this->notice( 'import-error-edit', $localTitle->getPrefixedText() );
		return false;
	}

	if ( !$localTitle->exists() && !$localTitle->userCan( 'create' ) && !$isCommandLine ) {
		# Do not import if the importing wiki user cannot create this page
		$this->notice( 'import-error-create', $localTitle->getPrefixedText() );
		return false;
	}

	return [ $localTitle, $foreignTitle ];
}
1114 }
static factory(Title $title)
Create a WikiPage object of the appropriate class for the given title.
Definition: WikiPage.php:138
nodeAttribute( $attr)
Retrieves the contents of the named attribute of the current element.
setRevisionCallback( $callback)
Sets the action to perform as each page revision is reached.
logItemCallback( $revision)
Notify the callback function of a new log item.
A parser that translates page titles on a foreign wiki into ForeignTitle objects, using information a...
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
$wgMaxArticleSize
Maximum article size in kilobytes.
setNoUpdates( $noupdates)
Set 'no updates' mode.
setPageCallback( $callback)
Sets the action to perform as each new page in the stream is reached.
const NS_MAIN
Definition: Defines.php:64
setImportTitleFactory( $factory)
Sets the factory object to use to convert ForeignTitle objects into local Title objects.
XML file reader for the page data importer.
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for use
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
setLogItemCallback( $callback)
Sets the action to perform as each log item is reached.
$source
siteInfoCallback( $siteInfo)
Notify the callback function of site info.
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency MediaWikiServices
Definition: injection.txt:23
processUpload( $pageInfo, $uploadInfo)
array $countableCache
processRevision( $pageInfo, $revisionInfo)
__construct(ImportSource $source, Config $config)
Creates an ImportXMLReader drawing from the source provided.
This list may contain false positives That usually means there is additional text with links below the first Each row contains links to the first and second as well as the first line of the second redirect text
processLogItem( $logInfo)
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not it can be in the form of< username >< more info > e g for bot passwords intended to be added to log contexts Fields it might only if the login was with a bot password it is not rendered in wiki pages or galleries in category pages allow injecting custom HTML after the section Any uses of the hook need to handle escaping see BaseTemplate::getToolbox and BaseTemplate::makeListItem for details on the format of individual items inside of this array or by returning and letting standard HTTP rendering take place modifiable or by returning false and taking over the output modifiable modifiable after all normalizations have been except for the $wgMaxImageArea check set to true or false to override the $wgMaxImageArea check result gives extension the possibility to transform it themselves $handler
Definition: hooks.txt:780
revisionCallback( $revision)
Notify the callback function of a revision.
Status::newGood()` to allow deletion, and then `return false` from the hook function. Ensure you consume the 'ChangeTagAfterDelete' hook to carry out custom deletion actions. $tag:name of the tag $user:user initiating the action & $status:Status object. See above. 'ChangeTagsListActive':Allows you to nominate which of the tags your extension uses are in active use. & $tags:list of all active tags. Append to this array. 'ChangeTagsAfterUpdateTags':Called after tags have been updated with the ChangeTags::updateTags function. Params:$addedTags:tags effectively added in the update $removedTags:tags effectively removed in the update $prevTags:tags that were present prior to the update $rc_id:recentchanges table id $rev_id:revision table id $log_id:logging table id $params:tag params $rc:RecentChange being tagged when the tagging accompanies the action, or null $user:User who performed the tagging when the tagging is subsequent to the action, or null 'ChangeTagsAllowedAdd':Called when checking if a user can add tags to a change. & $allowedTags:List of all the tags the user is allowed to add. Any tags the user wants to add( $addTags) that are not in this array will cause it to fail. You may add or remove tags to this array as required. $addTags:List of tags user intends to add. $user:User who is adding the tags. 'ChangeUserGroups':Called before user groups are changed. $performer:The User who will perform the change $user:The User whose groups will be changed & $add:The groups that will be added & $remove:The groups that will be removed 'Collation::factory':Called if $wgCategoryCollation is an unknown collation. $collationName:Name of the collation in question & $collationObject:Null. Replace with a subclass of the Collation class that implements the collation given in $collationName. 'ConfirmEmailComplete':Called after a user 's email has been confirmed successfully. 
$user:user(object) whose email is being confirmed 'ContentAlterParserOutput':Modify parser output for a given content object. Called by Content::getParserOutput after parsing has finished. Can be used for changes that depend on the result of the parsing but have to be done before LinksUpdate is called(such as adding tracking categories based on the rendered HTML). $content:The Content to render $title:Title of the page, as context $parserOutput:ParserOutput to manipulate 'ContentGetParserOutput':Customize parser output for a given content object, called by AbstractContent::getParserOutput. May be used to override the normal model-specific rendering of page content. $content:The Content to render $title:Title of the page, as context $revId:The revision ID, as context $options:ParserOptions for rendering. To avoid confusing the parser cache, the output can only depend on parameters provided to this hook function, not on global state. $generateHtml:boolean, indicating whether full HTML should be generated. If false, generation of HTML may be skipped, but other information should still be present in the ParserOutput object. & $output:ParserOutput, to manipulate or replace 'ContentHandlerDefaultModelFor':Called when the default content model is determined for a given title. May be used to assign a different model for that title. $title:the Title in question & $model:the model name. Use with CONTENT_MODEL_XXX constants. 'ContentHandlerForModelID':Called when a ContentHandler is requested for a given content model name, but no entry for that model exists in $wgContentHandlers. Note:if your extension implements additional models via this hook, please use GetContentModels hook to make them known to core. $modeName:the requested content model name & $handler:set this to a ContentHandler object, if desired. 'ContentModelCanBeUsedOn':Called to determine whether that content model can be used on a given page. 
This is especially useful to prevent some content models to be used in some special location. $contentModel:ID of the content model in question $title:the Title in question. & $ok:Output parameter, whether it is OK to use $contentModel on $title. Handler functions that modify $ok should generally return false to prevent further hooks from further modifying $ok. 'ContribsPager::getQueryInfo':Before the contributions query is about to run & $pager:Pager object for contributions & $queryInfo:The query for the contribs Pager 'ContribsPager::reallyDoQuery':Called before really executing the query for My Contributions & $data:an array of results of all contribs queries $pager:The ContribsPager object hooked into $offset:Index offset, inclusive $limit:Exact query limit $descending:Query direction, false for ascending, true for descending 'ContributionsLineEnding':Called before a contributions HTML line is finished $page:SpecialPage object for contributions & $ret:the HTML line $row:the DB row for this line & $classes:the classes to add to the surrounding< li > & $attribs:associative array of other HTML attributes for the< li > element. Currently only data attributes reserved to MediaWiki are allowed(see Sanitizer::isReservedDataAttribute). 'ContributionsToolLinks':Change tool links above Special:Contributions $id:User identifier $title:User page title & $tools:Array of tool links $specialPage:SpecialPage instance for context and services. Can be either SpecialContributions or DeletedContributionsPage. Extensions should type hint against a generic SpecialPage though. 'ConvertContent':Called by AbstractContent::convert when a conversion to another content model is requested. Handler functions that modify $result should generally return false to disable further attempts at conversion. $content:The Content object to be converted. $toModel:The ID of the content model to convert to. $lossy:boolean indicating whether lossy conversion is allowed. 
& $result:Output parameter, in case the handler function wants to provide a converted Content object. Note that $result->getContentModel() must return $toModel. 'ContentSecurityPolicyDefaultSource':Modify the allowed CSP load sources. This affects all directives except for the script directive. If you want to add a script source, see ContentSecurityPolicyScriptSource hook. & $defaultSrc:Array of Content-Security-Policy allowed sources $policyConfig:Current configuration for the Content-Security-Policy header $mode:ContentSecurityPolicy::REPORT_ONLY_MODE or ContentSecurityPolicy::FULL_MODE depending on type of header 'ContentSecurityPolicyDirectives':Modify the content security policy directives. Use this only if ContentSecurityPolicyDefaultSource and ContentSecurityPolicyScriptSource do not meet your needs. & $directives:Array of CSP directives $policyConfig:Current configuration for the CSP header $mode:ContentSecurityPolicy::REPORT_ONLY_MODE or ContentSecurityPolicy::FULL_MODE depending on type of header 'ContentSecurityPolicyScriptSource':Modify the allowed CSP script sources. Note that you also have to use ContentSecurityPolicyDefaultSource if you want non-script sources to be loaded from whatever you add. & $scriptSrc:Array of CSP directives $policyConfig:Current configuration for the CSP header $mode:ContentSecurityPolicy::REPORT_ONLY_MODE or ContentSecurityPolicy::FULL_MODE depending on type of header 'CustomEditor':When invoking the page editor Return true to allow the normal editor to be used, or false if implementing a custom editor, e.g. for a special namespace, etc. 
$article:Article being edited $user:User performing the edit 'DeletedContribsPager::reallyDoQuery':Called before really executing the query for Special:DeletedContributions Similar to ContribsPager::reallyDoQuery & $data:an array of results of all contribs queries $pager:The DeletedContribsPager object hooked into $offset:Index offset, inclusive $limit:Exact query limit $descending:Query direction, false for ascending, true for descending 'DeletedContributionsLineEnding':Called before a DeletedContributions HTML line is finished. Similar to ContributionsLineEnding $page:SpecialPage object for DeletedContributions & $ret:the HTML line $row:the DB row for this line & $classes:the classes to add to the surrounding< li > & $attribs:associative array of other HTML attributes for the< li > element. Currently only data attributes reserved to MediaWiki are allowed(see Sanitizer::isReservedDataAttribute). 'DeleteUnknownPreferences':Called by the cleanupPreferences.php maintenance script to build a WHERE clause with which to delete preferences that are not known about. This hook is used by extensions that have dynamically-named preferences that should not be deleted in the usual cleanup process. For example, the Gadgets extension creates preferences prefixed with 'gadget-', and so anything with that prefix is excluded from the deletion. &where:An array that will be passed as the $cond parameter to IDatabase::select() to determine what will be deleted from the user_properties table. $db:The IDatabase object, useful for accessing $db->buildLike() etc. 'DifferenceEngineAfterLoadNewText':called in DifferenceEngine::loadNewText() after the new revision 's content has been loaded into the class member variable $differenceEngine->mNewContent but before returning true from this function. 
$differenceEngine:DifferenceEngine object 'DifferenceEngineLoadTextAfterNewContentIsLoaded':called in DifferenceEngine::loadText() after the new revision 's content has been loaded into the class member variable $differenceEngine->mNewContent but before checking if the variable 's value is null. This hook can be used to inject content into said class member variable. $differenceEngine:DifferenceEngine object 'DifferenceEngineMarkPatrolledLink':Allows extensions to change the "mark as patrolled" link which is shown both on the diff header as well as on the bottom of a page, usually wrapped in a span element which has class="patrollink". $differenceEngine:DifferenceEngine object & $markAsPatrolledLink:The "mark as patrolled" link HTML(string) $rcid:Recent change ID(rc_id) for this change(int) 'DifferenceEngineMarkPatrolledRCID':Allows extensions to possibly change the rcid parameter. For example the rcid might be set to zero due to the user being the same as the performer of the change but an extension might still want to show it under certain conditions. & $rcid:rc_id(int) of the change or 0 $differenceEngine:DifferenceEngine object $change:RecentChange object $user:User object representing the current user 'DifferenceEngineNewHeader':Allows extensions to change the $newHeader variable, which contains information about the new revision, such as the revision 's author, whether the revision was marked as a minor edit or not, etc. $differenceEngine:DifferenceEngine object & $newHeader:The string containing the various #mw-diff-otitle[1-5] divs, which include things like revision author info, revision comment, RevisionDelete link and more $formattedRevisionTools:Array containing revision tools, some of which may have been injected with the DiffRevisionTools hook $nextlink:String containing the link to the next revision(if any) $status
Definition: hooks.txt:1263
disableStatisticsUpdate()
Statistics updates can take a lot of time.
handleRevision(&$pageInfo)
wfTempDir()
Tries to get the system directory for temporary files.
either a unescaped string or a HtmlArmor object after in associative array form externallinks including delete and has completed for all link tables whether this was an auto creation use $formDescriptor instead default is conds Array Extra conditions for the No matching items in log is displayed if loglist is empty msgKey Array If you want a nice box with a set this to the key of the message First element is the message additional optional elements are parameters for the key that are processed with wfMessage() -> params() ->parseAsBlock() - offset Set to overwrite offset parameter in $wgRequest set to '' to unset offset - wrap String Wrap the message in html(usually something like "&lt
nodeContents()
Shouldn't something like this be built-in to XMLReader? Fetches text contents of the current element...
Interface for configuration instances.
Definition: Config.php:28
importRevision( $revision)
Default per-revision callback, performs the import.
setDebug( $debug)
Set debug mode...
ExternalUserNames $externalUserNames
doImport()
Primary entry point.
static factory(array $deltas)
beforeImportPage( $titleAndForeignTitle)
Default per-page callback.
static newGood( $value=null)
Factory function for good results.
Definition: StatusValue.php:81
setUsernamePrefix( $usernamePrefix, $assignKnownUsers)
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfTimestampNow()
Convenience function; returns MediaWiki timestamp for the present time.
processTitle( $text, $ns=null)
$params
Represents a revision, log entry or upload during the import process.
this hook is for auditing only or null if authentication failed before getting that far or null if we can't even determine that When $user is not null
Definition: hooks.txt:780
$buffer
namespace and then decline to actually register it file or subcat img or subcat $title
Definition: hooks.txt:925
Config $config
setSiteInfoCallback( $callback)
Sets the action to perform when site info is encountered.
debugRevisionHandler(&$revision)
Alternate per-revision callback, for debugging.
pageOutCallback( $title, $foreignTitle, $revCount, $sucCount, $pageInfo)
Notify the callback function when a "</page>" is closed.
setPageOffset( $nthPage)
Sets the 'pageOffset' value.
dumpTemp( $contents)
throwXmlError( $err)
static registerSource(ImportSource $source)
finishImportPage( $title, $foreignTitle, $revCount, $sRevCount, $pageInfo)
Mostly for hook use.
importLogItem( $revision)
Default per-revision callback, performs the import.
setPageOutCallback( $callback)
Sets the action to perform as each page in the stream is completed.
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
A parser that translates page titles on a foreign wiki into ForeignTitle objects, with no knowledge o...
wfSetVar(&$dest, $source, $force=false)
Sets dest to source and returns the original value of dest. If source is NULL, it just returns the val...
Source interface for XML import.
static addUpdate(DeferrableUpdate $update, $stage=self::POSTSEND)
Add an update to the deferred list to be run later by execute()
you have access to all of the normal MediaWiki so you can get a DB use the etc For full docs on the Maintenance class
Definition: maintenance.txt:52
bool $disableStatisticsUpdate
setTargetNamespace( $namespace)
Set a target namespace to override the defaults.
ImportTitleFactory $importTitleFactory
pageCallback( $title)
Notify the callback function when a new "<page>" is reached.
setImportUploads( $import)
A class to convert page titles on a foreign wiki (ForeignTitle objects) into page titles on the local...
Reporting callback.
$debug
Definition: mcc.php:31
notice( $msg,... $params)
importUpload( $revision)
Dummy for now...
Class to parse and build external user names.
$content
Definition: pageupdater.txt:72
setUploadCallback( $callback)
Sets the action to perform as each file upload version is reached.
setNoticeCallback( $callback)
Set a callback that displays notice messages.
Exception representing a failure to serialize or unserialize a content object.
static run( $event, array $args=[], $deprecatedVersion=null)
Call hook functions defined in Hooks::register and $wgHooks.
Definition: Hooks.php:200
handleUpload(&$pageInfo)
static newFromText( $text, $defaultNamespace=NS_MAIN)
Create a new Title from text, such as what one would find in a link.
Definition: Title.php:319
setTargetRootPage( $rootpage)
Set a target root page under which all pages are imported.
setImageBasePath( $dir)