MediaWiki  master
TextPassDumper.php
Go to the documentation of this file.
1 <?php
28 require_once __DIR__ . '/BackupDumper.php';
29 require_once __DIR__ . '/SevenZipStream.php';
30 require_once __DIR__ . '/../../includes/export/WikiExporter.php';
31 
37 
// Prefetch text source; processOptions() sets it to a BaseDump when --prefetch is given.
43  public $prefetch = null;
// Character data of <id> elements seen while in "page" state (accumulated in characterData()).
45  private $thisPage;
// Character data of <id> elements seen while in "revision" state (accumulated in characterData()).
47  private $thisRev;
48 
49  // when we spend more than maxTimeAllowed seconds on this run, we continue
50  // processing until we write out the next complete page, then save output file(s),
51  // rename it/them and open new one(s)
52  public $maxTimeAllowed = 0; // 0 = no limit
53 
// Stream URL opened by dump(); replaced by the --stub option.
54  protected $input = "php://stdin";
// Number of getText() calls so far.
56  protected $fetchCount = 0;
// Number of getText() calls answered from the prefetch source.
57  protected $prefetchCount = 0;
// Values of the two counters above as of the previous showReport() call.
58  protected $prefetchCountLast = 0;
59  protected $fetchCountLast = 0;
60 
// Upper bound on attempts per getText() call.
61  protected $maxFailures = 5;
63  protected $failureTimeout = 5; // Seconds to sleep after db failure
64 
65  protected $bufferSize = 524288; // In bytes. Maximum size to read from the stub in one go.
66 
// NOTE(review): string default, but processOptions() replaces it with an array and
// openSpawn() indexes it as an array - confirm the default should not be [ "php" ].
67  protected $php = "php";
// True when --spawn was given; getText() then loads text through a subprocess.
68  protected $spawn = false;
69 
// Handle returned by proc_open() in openSpawn(), or false when no subprocess is running.
73  protected $spawnProc = false;
74 
// Pipe connected to the subprocess' stdin, or false.
78  protected $spawnWrite = false;
79 
// Pipe connected to the subprocess' stdout, or false.
83  protected $spawnRead = false;
84 
// Closed by closeSpawn() when set; no visible code assigns anything but false to it.
88  protected $spawnErr = false;
89 
// XmlDumpWriter created in dump(); used to produce stream open/close markup at checkpoints.
93  protected $xmlwriterobj = false;
94 
// Checkpoint bookkeeping, maintained by readDump(), endElement() and characterData().
95  protected $timeExceeded = false;
96  protected $firstPageWritten = false;
97  protected $lastPageWritten = false;
98  protected $checkpointJustWritten = false;
99  protected $checkpointFiles = [];
100 
// Database connection (re)established by rotateDb().
104  protected $db;
105 
/**
 * Registers the script description and all command line options.
 *
 * The option help texts are assembled from concatenated string fragments;
 * each fragment ends with a space so that words are not run together in the
 * generated help output.
 *
 * @param array|null $args Command line arguments. When non-empty they are
 *  loaded and processed immediately.
 */
function __construct( $args = null ) {
	parent::__construct();

	$this->addDescription( <<<TEXT
This script postprocesses XML dumps from dumpBackup.php to add
page text which was stubbed out (using --stub).

XML input is accepted on stdin.
XML output is sent to stdout; progress reports are sent to stderr.
TEXT
	);
	$this->stderr = fopen( "php://stderr", "wt" );

	$this->addOption( 'stub', 'To load a compressed stub dump instead of stdin. ' .
		'Specify as --stub=<type>:<file>.', false, true );
	$this->addOption( 'prefetch', 'Use a prior dump file as a text source, to save pressure on the ' .
		'database. (Requires the XMLReader extension). Specify as --prefetch=<type>:<file>',
		false, true );
	$this->addOption( 'maxtime', 'Write out checkpoint file after this many minutes (writing ' .
		'out complete page, closing xml file properly, and opening new one ' .
		'with header). This option requires the checkpointfile option.', false, true );
	$this->addOption( 'checkpointfile', 'Use this string for checkpoint filenames, substituting ' .
		'first pageid written for the first %s (required) and the last pageid written for the ' .
		'second %s if it exists.', false, true, false, true ); // This can be specified multiple times
	$this->addOption( 'quiet', 'Don\'t dump status reports to stderr.' );
	$this->addOption( 'full', 'Dump all revisions of every page' );
	$this->addOption( 'current', 'Base ETA on number of pages in database instead of all revisions' );
	$this->addOption( 'spawn', 'Spawn a subprocess for loading text records, optionally specify ' .
		'php[,mwscript] paths' );
	$this->addOption( 'buffersize', 'Buffer size in bytes to use for reading the stub. ' .
		'(Default: 512KB, Minimum: 4KB)', false, true );

	if ( $args ) {
		$this->loadWithArgv( $args );
		$this->processOptions();
	}
}
146 
/**
 * Looks up the blob store on the global service container.
 *
 * @return object The service returned by MediaWikiServices::getBlobStore()
 */
private function getBlobStore() {
	$services = MediaWikiServices::getInstance();

	return $services->getBlobStore();
}
153 
/**
 * Script entry point: applies the command line options, then runs the dump.
 */
function execute() {
	$this->processOptions();

	// dump() takes its history setting from $this->history; the argument is
	// passed only to satisfy the signature.
	$history = true;
	$this->dump( $history );
}
158 
/**
 * Copies the parsed command line options into the corresponding properties.
 *
 * Fix: the --current branch was empty, which made the option a silent no-op.
 * It now selects WikiExporter::CURRENT, mirroring how --full selects
 * WikiExporter::FULL.
 */
function processOptions() {
	parent::processOptions();

	if ( $this->hasOption( 'buffersize' ) ) {
		// Never go below the documented 4KB minimum
		$this->bufferSize = max( intval( $this->getOption( 'buffersize' ) ), 4 * 1024 );
	}

	if ( $this->hasOption( 'prefetch' ) ) {
		$url = $this->processFileOpt( $this->getOption( 'prefetch' ) );
		$this->prefetch = new BaseDump( $url );
	}

	if ( $this->hasOption( 'stub' ) ) {
		$this->input = $this->processFileOpt( $this->getOption( 'stub' ) );
	}

	if ( $this->hasOption( 'maxtime' ) ) {
		// Option is given in minutes, the property is kept in seconds
		$this->maxTimeAllowed = intval( $this->getOption( 'maxtime' ) ) * 60;
	}

	if ( $this->hasOption( 'checkpointfile' ) ) {
		$this->checkpointFiles = $this->getOption( 'checkpointfile' );
	}

	if ( $this->hasOption( 'current' ) ) {
		$this->history = WikiExporter::CURRENT;
	}

	if ( $this->hasOption( 'full' ) ) {
		$this->history = WikiExporter::FULL;
	}

	if ( $this->hasOption( 'spawn' ) ) {
		$this->spawn = true;
		$val = $this->getOption( 'spawn' );
		// A value of 1 means the flag was given without "php[,mwscript]" paths
		if ( $val !== 1 ) {
			$this->php = explode( ',', $val, 2 );
		}
	}
}
199 
/**
 * Drops the current load balancer and database connection and sets up new ones.
 *
 * When a forced database is configured it is used as-is and no new
 * connection is opened.
 *
 * Change: the MWExceptions raised on failure now carry the original exception
 * as their "previous" exception, so its type and stack trace are not lost.
 * The messages are unchanged.
 *
 * @throws MWException If the old connection is still open, or if no new load
 *  balancer or connection can be obtained
 */
function rotateDb() {
	// Cleaning up old connections
	if ( isset( $this->lb ) ) {
		$this->lb->closeAll();
		unset( $this->lb );
	}

	if ( $this->forcedDb !== null ) {
		$this->db = $this->forcedDb;

		return;
	}

	if ( isset( $this->db ) && $this->db->isOpen() ) {
		throw new MWException( 'DB is set and has not been closed by the Load Balancer' );
	}

	unset( $this->db );

	// Trying to set up new connection.
	// We do /not/ retry upon failure, but delegate to encapsulating logic, to avoid
	// individually retrying at different layers of code.

	try {
		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
		$this->lb = $lbFactory->newMainLB();
	} catch ( Exception $e ) {
		throw new MWException(
			__METHOD__ . " rotating DB failed to obtain new load balancer (" . $e->getMessage() . ")",
			0,
			$e
		);
	}

	try {
		$this->db = $this->lb->getMaintenanceConnectionRef( DB_REPLICA, 'dump' );
	} catch ( Exception $e ) {
		throw new MWException(
			__METHOD__ . " rotating DB failed to obtain new database (" . $e->getMessage() . ")",
			0,
			$e
		);
	}
}
249 
251  parent::initProgress();
252  $this->timeOfCheckpoint = $this->startTime;
253  }
254 
/**
 * Runs the text pass: reads the stub XML from $this->input and writes the
 * completed dump through the egress filter.
 *
 * Changes: a failure to open the input is now reported with an exception
 * instead of being passed on to readDump() as a false handle, and the input
 * handle is closed once reading ends (normally or with an exception).
 *
 * @param mixed $history Not used here; the history setting comes from $this->history
 * @param mixed $text Not used here
 * @throws MWException If the input cannot be opened, or from finalOptionCheck()
 *  and readDump()
 */
function dump( $history, $text = WikiExporter::TEXT ) {
	// Notice messages will foul up your XML output even if they're
	// relatively harmless.
	if ( ini_get( 'display_errors' ) ) {
		ini_set( 'display_errors', 'stderr' );
	}

	$this->initProgress( $this->history );

	// We are trying to get an initial database connection to avoid that the
	// first try of this request's first call to getText fails. However, if
	// obtaining a good DB connection fails it's not a serious issue, as
	// getText does retry upon failure and can start without having a working
	// DB connection.
	try {
		$this->rotateDb();
	} catch ( Exception $e ) {
		// We do not even count this as failure. Just let eventual
		// watchdogs know.
		$this->progress( "Getting initial DB connection failed (" .
			$e->getMessage() . ")" );
	}

	$this->egress = new ExportProgressFilter( $this->sink, $this );

	// it would be nice to do it in the constructor, oh well. need egress set
	$this->finalOptionCheck();

	// we only want this so we know how to close a stream :-P
	$this->xmlwriterobj = new XmlDumpWriter( XmlDumpWriter::WRITE_CONTENT, $this->schemaVersion );

	$input = fopen( $this->input, "rt" );
	if ( $input === false ) {
		throw new MWException( "Unable to open input " . $this->input );
	}
	try {
		$this->readDump( $input );
	} finally {
		fclose( $input );
	}

	if ( $this->spawnProc ) {
		$this->closeSpawn();
	}

	$this->report( true );
}
295 
/**
 * Turns a "<type>:<file>[;<file>...]" option value into a ';'-separated list
 * of stream URLs.
 *
 * The types "gzip", "bzip2" and "7zip" prepend the matching stream wrapper to
 * every file; "file" and any unrecognised type leave the file names as they
 * are. A value without ':' has no file part and therefore yields ''.
 *
 * @param string $opt
 * @return string
 */
function processFileOpt( $opt ) {
	$wrappers = [
		'gzip' => 'compress.zlib://',
		'bzip2' => 'compress.bzip2://',
		'7zip' => 'mediawiki.compress.7z://',
	];

	$parts = explode( ':', $opt, 2 );
	$type = $parts[0];
	$fileList = count( $parts ) === 2 ? $parts[1] : '';
	$prefix = isset( $wrappers[$type] ) ? $wrappers[$type] : '';

	$mapped = [];
	foreach ( explode( ';', $fileList ) as $file ) {
		$mapped[] = $prefix . $file;
	}

	return implode( ';', $mapped );
}
327 
/**
 * Overridden to include the prefetch ratio when a prefetch source is in use.
 *
 * Without a prefetch source the parent implementation is used unchanged.
 *
 * Fix: the ETA is only computed when both the revision count and the maximum
 * count are non-zero. Previously a zero in either one caused a division by
 * zero; now the ETA is reported as '-' in that case.
 */
function showReport() {
	if ( !$this->prefetch ) {
		parent::showReport();

		return;
	}

	if ( $this->reporting ) {
		$now = wfTimestamp( TS_DB );
		$nowts = microtime( true );
		$deltaAll = $nowts - $this->startTime;
		$deltaPart = $nowts - $this->lastTime;
		$this->pageCountPart = $this->pageCount - $this->pageCountLast;
		$this->revCountPart = $this->revCount - $this->revCountLast;

		if ( $deltaAll ) {
			if ( $this->revCount && $this->maxCount ) {
				$portion = $this->revCount / $this->maxCount;
				$eta = $this->startTime + $deltaAll / $portion;
				$etats = wfTimestamp( TS_DB, intval( $eta ) );
			} else {
				// No progress to extrapolate from yet
				$etats = '-';
			}
			if ( $this->fetchCount ) {
				$fetchRate = 100.0 * $this->prefetchCount / $this->fetchCount;
			} else {
				$fetchRate = '-';
			}
			$pageRate = $this->pageCount / $deltaAll;
			$revRate = $this->revCount / $deltaAll;
		} else {
			$pageRate = '-';
			$revRate = '-';
			$etats = '-';
			$fetchRate = '-';
		}
		if ( $deltaPart ) {
			if ( $this->fetchCountLast ) {
				$fetchRatePart = 100.0 * $this->prefetchCountLast / $this->fetchCountLast;
			} else {
				$fetchRatePart = '-';
			}
			$pageRatePart = $this->pageCountPart / $deltaPart;
			$revRatePart = $this->revCountPart / $deltaPart;
		} else {
			$fetchRatePart = '-';
			$pageRatePart = '-';
			$revRatePart = '-';
		}

		$dbDomain = WikiMap::getCurrentWikiDbDomain()->getId();
		$this->progress( sprintf(
			"%s: %s (ID %d) %d pages (%0.1f|%0.1f/sec all|curr), "
				. "%d revs (%0.1f|%0.1f/sec all|curr), %0.1f%%|%0.1f%% "
				. "prefetched (all|curr), ETA %s [max %d]",
			$now, $dbDomain, $this->ID, $this->pageCount, $pageRate,
			$pageRatePart, $this->revCount, $revRate, $revRatePart,
			$fetchRate, $fetchRatePart, $etats, $this->maxCount
		) );
		// Remember the current values as the baseline for the next report
		$this->lastTime = $nowts;
		$this->revCountLast = $this->revCount;
		$this->prefetchCountLast = $this->prefetchCount;
		$this->fetchCountLast = $this->fetchCount;
	}
}
392 
/**
 * Flags that the time limit was hit; endElement() acts on the flag when the
 * current page is closed.
 */
function setTimeExceeded() {
	$exceeded = true;
	$this->timeExceeded = $exceeded;
}
396 
/**
 * Tells whether more than maxTimeAllowed seconds lie between the last
 * checkpoint and the last recorded report time.
 *
 * @return bool Always false when no time limit is configured
 */
function checkIfTimeExceeded() {
	if ( !$this->maxTimeAllowed ) {
		return false;
	}

	$sinceCheckpoint = $this->lastTime - $this->timeOfCheckpoint;

	return $sinceCheckpoint > $this->maxTimeAllowed;
}
406 
/**
 * Validates the checkpoint related options once the egress filter exists.
 *
 * @throws MWException If only one of checkpointfile/maxtime is given, if a
 *  checkpoint file name does not contain exactly two '%s', or if the number
 *  of checkpoint file names differs from the number of output files
 */
function finalOptionCheck() {
	$haveCheckpointFiles = (bool)$this->checkpointFiles;
	$haveMaxTime = (bool)$this->maxTimeAllowed;

	// Either both options are present or neither is
	if ( $haveCheckpointFiles !== $haveMaxTime ) {
		throw new MWException( "Options checkpointfile and maxtime must be specified together.\n" );
	}

	foreach ( $this->checkpointFiles as $checkpointFile ) {
		$count = substr_count( $checkpointFile, "%s" );
		if ( $count == 2 ) {
			continue;
		}
		throw new MWException( "Option checkpointfile must contain two '%s' "
			. "for substitution of first and last pageids, count is $count instead, "
			. "file is $checkpointFile.\n" );
	}

	if ( !$haveCheckpointFiles ) {
		return;
	}

	$outputNames = (array)$this->egress->getFilenames();
	if ( count( $outputNames ) != count( $this->checkpointFiles ) ) {
		throw new MWException( "One checkpointfile must be specified "
			. "for each output option, if maxtime is used.\n" );
	}
}
430 
/**
 * Parses the stub XML from $input and drives the output through the
 * startElement/endElement/characterData handlers.
 *
 * The input is read in chunks of $this->bufferSize bytes. Before every chunk
 * the time limit is checked and, if exceeded, flagged for endElement(). Once
 * the input is exhausted and a time limit is configured, the output written
 * since the last checkpoint is closed and renamed to its checkpoint name(s).
 *
 * @param resource $input Open stream to read the stub dump from
 * @return bool Always true
 * @throws MWException On an XML parse error
 */
436  function readDump( $input ) {
// Reset all parser state kept on the object.
437  $this->buffer = "";
438  $this->openElement = false;
439  $this->atStart = true;
440  $this->state = "";
441  $this->lastName = "";
442  $this->thisPage = 0;
443  $this->thisRev = 0;
444  $this->thisRevModel = null;
445  $this->thisRevFormat = null;
446 
447  $parser = xml_parser_create( "UTF-8" );
// Keep element names in their original case; the handlers compare against lower-case names.
448  xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );
449 
450  xml_set_element_handler(
451  $parser,
452  [ $this, 'startElement' ],
453  [ $this, 'endElement' ]
454  );
455  xml_set_character_data_handler( $parser, [ $this, 'characterData' ] );
456 
457  $offset = 0; // for context extraction on error reporting
458  do {
459  if ( $this->checkIfTimeExceeded() ) {
460  $this->setTimeExceeded();
461  }
462  $chunk = fread( $input, $this->bufferSize );
// The third argument tells the parser whether this is the final chunk.
463  if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) {
464  wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" );
465 
// $byte is an index into the whole input, $offset the start of the current
// chunk, so $byte - $offset locates the error inside $chunk.
466  $byte = xml_get_current_byte_index( $parser );
467  $msg = wfMessage( 'xml-error-string',
468  'XML import parse failure',
469  xml_get_current_line_number( $parser ),
470  xml_get_current_column_number( $parser ),
471  $byte . ( is_null( $chunk ) ? null : ( '; "' . substr( $chunk, $byte - $offset, 16 ) . '"' ) ),
472  xml_error_string( xml_get_error_code( $parser ) ) )->escaped();
473 
474  xml_parser_free( $parser );
475 
476  throw new MWException( $msg );
477  }
478  $offset += strlen( $chunk );
479  } while ( $chunk !== false && !feof( $input ) );
480  if ( $this->maxTimeAllowed ) {
481  $filenameList = (array)$this->egress->getFilenames();
482  // we wrote some stuff after the last checkpoint that needs to be renamed
483  if ( file_exists( $filenameList[0] ) ) {
484  $newFilenames = [];
485  # we might have just written the header and footer and had no
486  # pages or revisions written... perhaps they were all deleted
487  # there's no pageID 0 so we use that. the caller is responsible
488  # for deciding what to do with a file containing only the
489  # siteinfo information and the mw tags.
490  if ( !$this->firstPageWritten ) {
491  $firstPageID = str_pad( 0, 9, "0", STR_PAD_LEFT );
492  $lastPageID = str_pad( 0, 9, "0", STR_PAD_LEFT );
493  } else {
494  $firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT );
495  $lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT );
496  }
497 
// Each output file gets its checkpoint name, placed in that file's own directory.
498  $filenameCount = count( $filenameList );
499  for ( $i = 0; $i < $filenameCount; $i++ ) {
500  $checkpointNameFilledIn = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID );
501  $fileinfo = pathinfo( $filenameList[$i] );
502  $newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointNameFilledIn;
503  }
504  $this->egress->closeAndRename( $newFilenames );
505  }
506  }
507  xml_parser_free( $parser );
508 
509  return true;
510  }
511 
/**
 * Applies applicable export transformations to $text.
 *
 * Fix: the content handler for $model is looked up before it is used;
 * previously $handler was never assigned. If the lookup or the transformation
 * raises an MWException, the problem is reported via progress() and the text
 * is returned untransformed.
 *
 * @param string $text
 * @param string $model Content model ID used to pick the content handler
 * @param string|null $format Passed through to the handler's exportTransform()
 * @return string
 */
private function exportTransform( $text, $model, $format = null ) {
	try {
		$handler = ContentHandler::getForModelID( $model );
		$text = $handler->exportTransform( $text, $format );
	} catch ( MWException $ex ) {
		$this->progress(
			"Unable to apply export transformation for content model '$model': " .
			$ex->getMessage()
		);
	}

	return $text;
}
535 
/**
 * Tries to load the revision text for $id.
 *
 * Up to $this->maxFailures attempts are made. The prefetch source, when set,
 * is tried once; its text is only accepted after the length check in step 2.
 * Otherwise the text is loaded from the database (directly, or through the
 * spawned subprocess when --spawn is active) and returned without further
 * checks. After a failed non-prefetch attempt the method sleeps for
 * $this->failureTimeout seconds, rotates the DB connection and restarts the
 * subprocess.
 *
 * @param int|string $id Text id or blob address
 * @param string|bool|null $model Content model; when null it may be looked up
 *  in the revision table
 * @param string|null $format Content format, passed to exportTransform()
 * @return string The text, or "" when every attempt failed
 * @throws MWException When more than $this->maxConsecutiveFailedTextRetrievals
 *  calls in a row ended without a text
 */
556  function getText( $id, $model = null, $format = null ) {
557  global $wgContentHandlerUseDB;
558 
559  $prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch.
560  $text = false; // The candidate for a good text. false if no proper value.
561  $failures = 0; // The number of times, this invocation of getText already failed.
562 
563  // The number of times getText failed without yielding a good text in between.
564  static $consecutiveFailedTextRetrievals = 0;
565 
566  $this->fetchCount++;
567 
568  // To allow to simply return on success and do not have to worry about book keeping,
569  // we assume, this fetch works (possible after some retries). Nevertheless, we keep
570  // the old value, so we can restore it, if problems occur (See after the while loop).
571  $oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals;
572  $consecutiveFailedTextRetrievals = 0;
573 
// NOTE(review): this query runs outside the retry loop and uses $this->db without the
// isset() check done in step 2 below - confirm a connection is guaranteed at this point.
574  if ( $model === null && $wgContentHandlerUseDB ) {
575  // TODO: MCR: use content table
576  $row = $this->db->selectRow(
577  'revision',
578  [ 'rev_content_model', 'rev_content_format' ],
579  [ 'rev_id' => $this->thisRev ],
580  __METHOD__
581  );
582 
583  if ( $row ) {
584  $model = $row->rev_content_model;
585  $format = $row->rev_content_format;
586  }
587  }
588 
// From here on false means "model unknown"; export transformations are skipped then.
589  if ( $model === null || $model === '' ) {
590  $model = false;
591  }
592 
593  while ( $failures < $this->maxFailures ) {
594  // As soon as we found a good text for the $id, we will return immediately.
595  // Hence, if we make it past the try catch block, we know that we did not
596  // find a good text.
597 
598  try {
599  // Step 1: Get some text (or reuse from previous iteration if checking
600  // for plausibility failed)
601 
602  // Trying to get prefetch, if it has not been tried before
603  if ( $text === false && isset( $this->prefetch ) && $prefetchNotTried ) {
604  $prefetchNotTried = false;
605  $tryIsPrefetch = true;
606  $text = $this->prefetch->prefetch( (int)$this->thisPage, (int)$this->thisRev );
607 
608  if ( $text === null ) {
609  $text = false;
610  }
611 
612  if ( is_string( $text ) && $model !== false ) {
613  // Apply export transformation to text coming from an old dump.
614  // The purpose of this transformation is to convert up from legacy
615  // formats, which may still be used in the older dump that is used
616  // for pre-fetching. Applying the transformation again should not
617  // interfere with content that is already in the correct form.
618  $text = $this->exportTransform( $text, $model, $format );
619  }
620  }
621 
622  if ( $text === false ) {
623  // Fallback to asking the database
624  $tryIsPrefetch = false;
625  if ( $this->spawn ) {
626  $text = $this->getTextSpawned( $id );
627  } else {
628  $text = $this->getTextDb( $id );
629  }
630 
631  if ( $text !== false && $model !== false ) {
632  // Apply export transformation to text coming from the database.
633  // Prefetched text should already have transformations applied.
634  $text = $this->exportTransform( $text, $model, $format );
635  }
636 
637  // No more checks for texts from DB for now.
638  // If we received something that is not false,
639  // We treat it as good text, regardless of whether it actually is or is not
640  if ( $text !== false ) {
641  return $text;
642  }
643  }
644 
645  if ( $text === false ) {
646  throw new MWException( "Generic error while obtaining text for id " . $id );
647  }
648 
649  // We received a good candidate for the text of $id via some method
650 
651  // Step 2: Checking for plausibility and return the text if it is
652  // plausible
653  $revID = intval( $this->thisRev );
654  if ( !isset( $this->db ) ) {
655  throw new MWException( "No database available" );
656  }
657 
// Only wikitext is compared against the stored rev_len; for every other model the
// expected length is taken from the text itself, so the comparison below always holds.
658  if ( $model !== CONTENT_MODEL_WIKITEXT ) {
659  $revLength = strlen( $text );
660  } else {
661  $revLength = $this->db->selectField( 'revision', 'rev_len', [ 'rev_id' => $revID ] );
662  }
663 
664  if ( strlen( $text ) == $revLength ) {
665  if ( $tryIsPrefetch ) {
666  $this->prefetchCount++;
667  }
668 
669  return $text;
670  }
671 
// Discard the candidate so that the next iteration falls back to the database.
672  $text = false;
673  throw new MWException( "Received text is unplausible for id " . $id );
674  } catch ( Exception $e ) {
675  $msg = "getting/checking text " . $id . " failed (" . $e->getMessage() . ")";
676  if ( $failures + 1 < $this->maxFailures ) {
677  $msg .= " (Will retry " . ( $this->maxFailures - $failures - 1 ) . " more times)";
678  }
679  $this->progress( $msg );
680  }
681 
682  // Something went wrong; we did not get a text that was plausible :(
683  $failures++;
684 
685  // A failure in a prefetch hit does not warrant resetting db connection etc.
686  if ( !$tryIsPrefetch ) {
687  // After backing off for some time, we try to reboot the whole process as
688  // much as possible to not carry over failures from one part to the other
689  // parts
690  sleep( $this->failureTimeout );
691  try {
692  $this->rotateDb();
693  if ( $this->spawn ) {
694  $this->closeSpawn();
695  $this->openSpawn();
696  }
697  } catch ( Exception $e ) {
698  $this->progress( "Rebooting getText infrastructure failed (" . $e->getMessage() . ")" .
699  " Trying to continue anyways" );
700  }
701  }
702  }
703 
704  // Retrieving a good text for $id failed (at least) maxFailures times.
705  // We abort for this $id.
706 
707  // Restoring the consecutive failures, and maybe aborting, if the dump
708  // is too broken.
709  $consecutiveFailedTextRetrievals = $oldConsecutiveFailedTextRetrievals + 1;
710  if ( $consecutiveFailedTextRetrievals > $this->maxConsecutiveFailedTextRetrievals ) {
711  throw new MWException( "Graceful storage failure" );
712  }
713 
714  return "";
715  }
716 
/**
 * Loads a text directly through the blob store.
 *
 * An integer, or a string without ':', is treated as a text id and converted
 * to a blob address first; any other value is used as the address as given.
 * Carriage returns are removed and the result is normalized by the content
 * language.
 *
 * @param int|string $id Text id or blob address
 * @return string|bool The normalized text, or false if the blob cannot be accessed
 */
private function getTextDb( $id ) {
	$store = $this->getBlobStore();

	if ( is_int( $id ) || strpos( $id, ':' ) === false ) {
		$address = SqlBlobStore::makeAddressFromTextId( (int)$id );
	} else {
		$address = $id;
	}

	try {
		$withoutCr = str_replace( "\r", "", $store->getBlob( $address ) );
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();

		return $contLang->normalize( $withoutCr );
	} catch ( BlobAccessException $ex ) {
		// XXX: log a warning?
		return false;
	}
}
742 
/**
 * Loads a text through the helper subprocess, starting it on first use.
 *
 * Changes: a failed openSpawn() now makes this method return false right
 * away instead of going on to talk to pipes that do not exist, and warning
 * suppression is lifted again even if an exception or error escapes.
 *
 * @param int|string $address Text id or blob address
 * @return string|bool The text, or false on failure
 */
private function getTextSpawned( $address ) {
	Wikimedia\suppressWarnings();
	try {
		// First time?
		if ( !$this->spawnProc && !$this->openSpawn() ) {
			return false;
		}

		return $this->getTextSpawnedOnce( $address );
	} finally {
		Wikimedia\restoreWarnings();
	}
}
758 
/**
 * Starts the fetchText.php helper subprocess and stores its stdin/stdout pipes.
 *
 * Fixes:
 * - $wiki is now defined before it is used in the command line.
 * - $this->php is normalized to an array first. With --spawn given without
 *   paths it is still the plain string default, on which count() is invalid
 *   and [0] yields only the first character.
 *
 * @return bool True if the subprocess was started, false otherwise
 */
function openSpawn() {
	global $IP;

	$php = (array)$this->php;
	$wiki = WikiMap::getWikiIdFromDbDomain( WikiMap::getCurrentWikiDbDomain() );

	if ( count( $php ) == 2 ) {
		$mwscriptpath = $php[1];
	} else {
		$mwscriptpath = "$IP/../multiversion/MWScript.php";
	}

	if ( file_exists( $mwscriptpath ) ) {
		// Go through the MWScript wrapper when it is available
		$args = [ $php[0], $mwscriptpath, "fetchText.php", '--wiki', $wiki ];
	} else {
		$args = [ $php[0], "$IP/maintenance/fetchText.php", '--wiki', $wiki ];
	}
	$cmd = implode( " ", array_map( [ Shell::class, 'escape' ], $args ) );

	$spec = [
		0 => [ "pipe", "r" ],
		1 => [ "pipe", "w" ],
		2 => [ "file", "/dev/null", "a" ] ];
	$pipes = [];

	$this->progress( "Spawning database subprocess: $cmd" );
	$this->spawnProc = proc_open( $cmd, $spec, $pipes );
	if ( !$this->spawnProc ) {
		$this->progress( "Subprocess spawn failed." );

		return false;
	}
	list(
		$this->spawnWrite, // -> stdin
		$this->spawnRead, // <- stdout
	) = $pipes;

	return true;
}
804 
/**
 * Closes the pipes to the helper subprocess and then the process itself.
 *
 * Fix: the process handle comes from proc_open() and therefore has to be
 * released with proc_close(); pclose() only accepts handles from popen().
 * The pipes are closed first so that the child sees EOF on its stdin before
 * proc_close() waits for it to terminate.
 */
private function closeSpawn() {
	Wikimedia\suppressWarnings();
	if ( $this->spawnRead ) {
		fclose( $this->spawnRead );
	}
	$this->spawnRead = false;
	if ( $this->spawnWrite ) {
		fclose( $this->spawnWrite );
	}
	$this->spawnWrite = false;
	if ( $this->spawnErr ) {
		fclose( $this->spawnErr );
	}
	$this->spawnErr = false;
	if ( $this->spawnProc ) {
		proc_close( $this->spawnProc );
	}
	$this->spawnProc = false;
	Wikimedia\restoreWarnings();
}
825 
/**
 * Performs one request/response exchange with the helper subprocess.
 *
 * The address is written to the subprocess, which is expected to answer with
 * a line holding the address, a line holding the byte length, and then that
 * many bytes of text.
 *
 * Fix: the read loop now also stops when fread() returns an empty string at
 * end of stream. Before, a subprocess that died in the middle of a response
 * left this loop spinning forever, because fread() reports EOF with "" and
 * not with false.
 *
 * @param int|string $address Text id or blob address
 * @return string|bool The normalized text, or false on any protocol or I/O failure
 */
private function getTextSpawnedOnce( $address ) {
	if ( is_int( $address ) || intval( $address ) ) {
		$address = SqlBlobStore::makeAddressFromTextId( (int)$address );
	}

	$ok = fwrite( $this->spawnWrite, "$address\n" );
	// $this->progress( ">> $id" );
	if ( !$ok ) {
		return false;
	}

	$ok = fflush( $this->spawnWrite );
	// $this->progress( ">> [flush]" );
	if ( !$ok ) {
		return false;
	}

	// check that the text address they are sending is the one we asked for
	// this avoids out of sync revision text errors we have encountered in the past
	$newAddress = fgets( $this->spawnRead );
	if ( $newAddress === false ) {
		return false;
	}
	$newAddress = trim( $newAddress );
	if ( strpos( $newAddress, ':' ) === false ) {
		$newAddress = SqlBlobStore::makeAddressFromTextId( intval( $newAddress ) );
	}

	if ( $newAddress !== $address ) {
		return false;
	}

	$len = fgets( $this->spawnRead );
	// $this->progress( "<< " . trim( $len ) );
	if ( $len === false ) {
		return false;
	}

	$nbytes = intval( $len );
	// actual error, not zero-length text
	if ( $nbytes < 0 ) {
		return false;
	}

	$text = "";

	// Subprocess may not send everything at once, we have to loop.
	while ( $nbytes > strlen( $text ) ) {
		$buffer = fread( $this->spawnRead, $nbytes - strlen( $text ) );
		if ( $buffer === false || ( $buffer === '' && feof( $this->spawnRead ) ) ) {
			// Read error, or the subprocess closed the pipe early
			break;
		}
		$text .= $buffer;
	}

	$gotbytes = strlen( $text );
	if ( $gotbytes != $nbytes ) {
		$this->progress( "Expected $nbytes bytes from database subprocess, got $gotbytes " );

		return false;
	}

	// Do normalization in the dump thread...
	$stripped = str_replace( "\r", "", $text );
	$normalized = MediaWikiServices::getInstance()->getContentLanguage()->
		normalize( $stripped );

	return $normalized;
}
899 
901  $this->checkpointJustWritten = false;
902 
903  $this->clearOpenElement( null );
904  $this->lastName = $name;
905 
906  if ( $name == 'revision' ) {
907  $this->state = $name;
908  $this->egress->writeOpenPage( null, $this->buffer );
909  $this->buffer = "";
910  } elseif ( $name == 'page' ) {
911  $this->state = $name;
912  if ( $this->atStart ) {
913  $this->egress->writeOpenStream( $this->buffer );
914  $this->buffer = "";
915  $this->atStart = false;
916  }
917  }
918 
919  if ( $name == "text" && isset( $attribs['id'] ) ) {
920  $id = $attribs['id'];
921  $model = trim( $this->thisRevModel );
922  $format = trim( $this->thisRevFormat );
923 
924  $model = $model === '' ? null : $model;
925  $format = $format === '' ? null : $format;
926 
927  $text = $this->getText( $id, $model, $format );
928  $this->openElement = [ $name, [ 'xml:space' => 'preserve' ] ];
929  if ( strlen( $text ) > 0 ) {
930  $this->characterData( $parser, $text );
931  }
932  } else {
933  $this->openElement = [ $name, $attribs ];
934  }
935  }
936 
/**
 * XML parser callback for a closing tag.
 *
 * Flushes the buffered markup to the egress filter when a revision, page or
 * the whole document ends. When a page ends while the time limit is flagged
 * as exceeded, the current output is finished, renamed to its checkpoint
 * name(s) and a new output stream is started.
 *
 * @param resource $parser Not used
 * @param string $name Element name
 */
937  function endElement( $parser, $name ) {
938  $this->checkpointJustWritten = false;
939 
// An element that is still pending had no content: emit it in the form selected by
// the "" style argument. Otherwise write a regular closing tag.
940  if ( $this->openElement ) {
941  $this->clearOpenElement( "" );
942  } else {
943  $this->buffer .= "</$name>";
944  }
945 
946  if ( $name == 'revision' ) {
947  $this->egress->writeRevision( null, $this->buffer );
948  $this->buffer = "";
949  $this->thisRev = "";
950  $this->thisRevModel = null;
951  $this->thisRevFormat = null;
952  } elseif ( $name == 'page' ) {
// Track the first and last page id written since the last checkpoint; they are
// substituted into the checkpoint file names.
953  if ( !$this->firstPageWritten ) {
954  $this->firstPageWritten = trim( $this->thisPage );
955  }
956  $this->lastPageWritten = trim( $this->thisPage );
957  if ( $this->timeExceeded ) {
958  $this->egress->writeClosePage( $this->buffer );
959  // nasty hack, we can't just write the chardata after the
960  // page tag, it will include leading blanks from the next line
961  $this->egress->sink->write( "\n" );
962 
963  $this->buffer = $this->xmlwriterobj->closeStream();
964  $this->egress->writeCloseStream( $this->buffer );
965 
966  $this->buffer = "";
967  $this->thisPage = "";
968  // this could be more than one file if we had more than one output arg
969 
970  $filenameList = (array)$this->egress->getFilenames();
971  $newFilenames = [];
972  $firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT );
973  $lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT );
974  $filenamesCount = count( $filenameList );
975  for ( $i = 0; $i < $filenamesCount; $i++ ) {
976  $checkpointNameFilledIn = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID );
977  $fileinfo = pathinfo( $filenameList[$i] );
978  $newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointNameFilledIn;
979  }
980  $this->egress->closeRenameAndReopen( $newFilenames );
// Put the stream header into the buffer so that it is written out at the start of
// the new file, then reset the checkpoint bookkeeping.
981  $this->buffer = $this->xmlwriterobj->openStream();
982  $this->timeExceeded = false;
983  $this->timeOfCheckpoint = $this->lastTime;
984  $this->firstPageWritten = false;
985  $this->checkpointJustWritten = true;
986  } else {
987  $this->egress->writeClosePage( $this->buffer );
988  $this->buffer = "";
989  $this->thisPage = "";
990  }
991  } elseif ( $name == 'mediawiki' ) {
992  $this->egress->writeCloseStream( $this->buffer );
993  $this->buffer = "";
994  }
995  }
996 
/**
 * XML parser callback for character data; also called by startElement() to
 * inject fetched revision text.
 *
 * Character data following an <id>, <model> or <format> start tag is
 * collected into the matching property. All data is escaped and appended to
 * the output buffer.
 *
 * Changes:
 * - htmlspecialchars() is called with an explicit ENT_COMPAT. The default
 *   flags changed in PHP 8.1 to also escape single quotes, which would alter
 *   the generated XML depending on the PHP version.
 * - The first-character check is skipped for an empty string.
 *
 * @param resource $parser Not used
 * @param string $data
 */
function characterData( $parser, $data ) {
	$this->clearOpenElement( null );
	if ( $this->lastName == "id" ) {
		if ( $this->state == "revision" ) {
			$this->thisRev .= $data;
		} elseif ( $this->state == "page" ) {
			$this->thisPage .= $data;
		}
	} elseif ( $this->lastName == "model" ) {
		$this->thisRevModel .= $data;
	} elseif ( $this->lastName == "format" ) {
		$this->thisRevFormat .= $data;
	}

	// have to skip the newline left over from closepagetag line of
	// end of checkpoint files. nasty hack!!
	if ( $this->checkpointJustWritten ) {
		if ( $data !== '' && $data[0] == "\n" ) {
			$data = substr( $data, 1 );
		}
		$this->checkpointJustWritten = false;
	}
	$this->buffer .= htmlspecialchars( $data, ENT_COMPAT );
}
1021 
/**
 * Writes the pending start tag, if there is one, to the output buffer.
 *
 * @param string|null $style Passed to Xml::element() as the element content
 */
function clearOpenElement( $style ) {
	if ( !$this->openElement ) {
		return;
	}

	$tag = $this->openElement[0];
	$attribs = $this->openElement[1];
	$this->buffer .= Xml::element( $tag, $attribs, $style );
	$this->openElement = false;
}
1028 }
bool resource $spawnWrite
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
IMaintainableDatabase $db
const CONTENT_MODEL_WIKITEXT
Definition: Defines.php:215
processing should stop and the error should be shown to the user * false
Definition: hooks.txt:187
$IP
Definition: WebStart.php:41
const WRITE_CONTENT
Output serialized revision content.
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for use
getOption( $name, $default=null)
Get an option, or return the default.
progress( $string)
div flags Integer display flags(NO_ACTION_LINK, NO_EXTRA_USER_LINKS) 'LogException' returning false will NOT prevent logging $e
Definition: hooks.txt:2147
characterData( $parser, $data)
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for and distribution as defined by Sections through of this document Licensor shall mean the copyright owner or entity authorized by the copyright owner that is granting the License Legal Entity shall mean the union of the acting entity and all other entities that control are controlled by or are under common control with that entity For the purposes of this definition control direct or to cause the direction or management of such whether by contract or including but not limited to software source documentation and configuration files Object form shall mean any form resulting from mechanical transformation or translation of a Source including but not limited to compiled object generated and conversions to other media types Work shall mean the work of whether in Source or Object made available under the as indicated by a copyright notice that is included in or attached to the whether in Source or Object that is based or other modifications as a an original work of authorship For the purposes of this Derivative Works shall not include works that remain separable from
string bool $thisRev
exportTransform( $text, $model, $format=null)
Applies applicable export transformations to $text.
static getWikiIdFromDbDomain( $domain)
Get the wiki ID of a database domain.
Definition: WikiMap.php:269
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency MediaWikiServices
Definition: injection.txt:23
__construct( $args=null)
getText( $id, $model=null, $format=null)
Tries to load revision text.
hasOption( $name)
Checks to see if a particular option exists.
showReport()
Overridden to include prefetch ratio if enabled.
target page
bool XmlDumpWriter $xmlwriterobj
loadWithArgv( $argv)
Load params and arguments from a given array of command-line arguments.
Exception representing a failure to access a data blob.
startElement( $parser, $name, $attribs)
see documentation in includes Linker php for Linker::makeImageLink or false for current used if you return false $parser
Definition: hooks.txt:1781
This list may contain false positives That usually means there is additional text with links below the first Each row contains links to the first and second as well as the first line of the second redirect text
clearOpenElement( $style)
This document provides an overview of the usage of PageUpdater and that is
Definition: pageupdater.txt:3
if( $line===false) $args
Definition: cdb.php:64
The ContentHandler facility adds support for arbitrary content types on wiki instead of relying on wikitext for everything It was introduced in MediaWiki Each kind of and so on Built in content types are
$wgContentHandlerUseDB
Set to false to disable use of the database fields introduced by the ContentHandler facility...
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not it can be in the form of< username >< more info > e g for bot passwords intended to be added to log contexts Fields it might only if the login was with a bot password it is not rendered in wiki pages or galleries in category pages allow injecting custom HTML after the section Any uses of the hook need to handle escaping see BaseTemplate::getToolbox and BaseTemplate::makeListItem for details on the format of individual items inside of this array or by returning and letting standard HTTP rendering take place modifiable or by returning false and taking over the output modifiable modifiable after all normalizations have been except for the $wgMaxImageArea check set to true or false to override the $wgMaxImageArea check result gives extension the possibility to transform it themselves $handler
Definition: hooks.txt:767
An extension or a local will often add custom code to the function with or without a global variable For someone wanting email notification when an article is shown may add
Definition: hooks.txt:51
getTextSpawned( $address)
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
IMaintainableDatabase null $forcedDb
The dependency-injected database to use.
dump( $history, $text=WikiExporter::TEXT)
either a unescaped string or a HtmlArmor object after in associative array form externallinks including delete and has completed for all link tables whether this was an auto creation use $formDescriptor instead default is conds Array Extra conditions for the No matching items in log is displayed if loglist is empty msgKey Array If you want a nice box with a set this to the key of the message First element is the message additional optional elements are parameters for the key that are processed with wfMessage() -> params() ->parseAsBlock() - offset Set to overwrite offset parameter in $wgRequest set to '' to unset offset - wrap String Wrap the message in html(usually something like "&lt
static getForModelID( $modelId)
Returns the ContentHandler singleton for the given model ID.
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
addDescription( $text)
Set the description text.
bool resource $spawnProc
getTextDb( $id)
Loads the serialized content from storage.
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses after processing & $attribs
Definition: hooks.txt:1972
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not null
Definition: hooks.txt:767
$buffer
output( $out, $channel=null)
Throw some output to the user.
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
string bool $thisPage
static getCurrentWikiDbDomain()
Definition: WikiMap.php:293
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
presenting them properly to the user as errors is done by the caller return true use this to change the list i e etc next in line in page history
Definition: hooks.txt:1748
you have access to all of the normal MediaWiki so you can get a DB use the etc For full docs on the Maintenance class
Definition: maintenance.txt:52
report( $final=false)
initProgress( $history=WikiExporter::FULL)
bool resource $spawnRead
static element( $element, $attribs=null, $contents='', $allowShortTag=true)
Format an XML element with given attributes and, optionally, text content.
Definition: Xml.php:41
Using a hook running we can avoid having all this option specific stuff in our mainline code Using the function We ve cleaned up the code here by removing clumps of infrequently used code and moving them off somewhere else It s much easier for someone working with this code to see what s _really_ going on
Definition: hooks.txt:77
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
Allows to change the fields on the form that will be generated $name
Definition: hooks.txt:271
const DB_REPLICA
Definition: defines.php:25
BaseDump $prefetch
bool resource $spawnErr
rotateDb()
Drop the database connection $this->db and try to get a new one.
getTextSpawnedOnce( $address)
endElement( $parser, $name)