MediaWiki  master
TextPassDumper.php
Go to the documentation of this file.
1 <?php
28 require_once __DIR__ . '/BackupDumper.php';
29 require_once __DIR__ . '/SevenZipStream.php';
30 require_once __DIR__ . '/../../includes/export/WikiExporter.php';
31 
37 
// Source of prefetched revision text; processOptions() replaces this with a
// BaseDump instance when --prefetch is given.
43  public $prefetch = null;
// Page ID of the page currently being read; accumulated from <id> character data.
45  private $thisPage;
// Revision ID of the revision currently being read; accumulated from <id> character data.
47  private $thisRev;
48 
49  // when we spend more than maxTimeAllowed seconds on this run, we continue
50  // processing until we write out the next complete page, then save output file(s),
51  // rename it/them and open new one(s)
52  public $maxTimeAllowed = 0; // 0 = no limit
53 
// Stream URI the stub dump is read from; overridden by --stub.
54  protected $input = "php://stdin";
// Counters used by showReport(): total text fetches, fetches satisfied from
// prefetch, and the values of both at the time of the previous report.
56  protected $fetchCount = 0;
57  protected $prefetchCount = 0;
58  protected $prefetchCountLast = 0;
59  protected $fetchCountLast = 0;
60 
// Maximum number of attempts getText() makes for a single text before giving up.
61  protected $maxFailures = 5;
63  protected $failureTimeout = 5; // Seconds to sleep after db failure
64 
65  protected $bufferSize = 524288; // In bytes. Maximum size to read from the stub in one go.
66 
// PHP binary used for the text-fetching subprocess. processOptions() replaces
// this with the array [ php, mwscript ] produced by explode() when --spawn has a value.
67  protected $php = "php";
// Whether text is fetched through a spawned subprocess instead of directly.
68  protected $spawn = false;
69 
// Process handle returned by proc_open() in openSpawn(), or false when no subprocess is open.
73  protected $spawnProc = false;
74 
// Pipe connected to the subprocess's stdin, or false.
78  protected $spawnWrite = false;
79 
// Pipe connected to the subprocess's stdout, or false.
83  protected $spawnRead = false;
84 
// Subprocess stderr handle, or false; only closeSpawn() touches it in the visible code.
88  protected $spawnErr = false;
89 
// XmlDumpWriter created in dump(); used to produce stream open/close markup at checkpoints.
93  protected $xmlwriterobj = false;
94 
// Checkpoint bookkeeping: set when maxTimeAllowed has elapsed, first/last page
// IDs written to the current output file, and whether a checkpoint was just written.
95  protected $timeExceeded = false;
96  protected $firstPageWritten = false;
97  protected $lastPageWritten = false;
98  protected $checkpointJustWritten = false;
// Checkpoint filename patterns from --checkpointfile.
99  protected $checkpointFiles = [];
100 
// Database connection obtained by rotateDb().
104  protected $db;
105 
/**
 * Registers the script description and all command-line options.
 *
 * The help strings are built by concatenation; each fragment now ends with
 * the separating space so words no longer run together in the help output.
 *
 * @param array|null $args Optional argument vector. When non-empty it is
 *  loaded with loadWithArgv() and the options are processed immediately.
 */
function __construct( $args = null ) {
	parent::__construct();

	$this->addDescription( <<<TEXT
This script postprocesses XML dumps from dumpBackup.php to add
page text which was stubbed out (using --stub).

XML input is accepted on stdin.
XML output is sent to stdout; progress reports are sent to stderr.
TEXT
	);
	$this->stderr = fopen( "php://stderr", "wt" );

	$this->addOption( 'stub', 'To load a compressed stub dump instead of stdin. ' .
		'Specify as --stub=<type>:<file>.', false, true );
	$this->addOption( 'prefetch', 'Use a prior dump file as a text source, to save pressure on the ' .
		'database. (Requires the XMLReader extension). Specify as --prefetch=<type>:<file>',
		false, true );
	$this->addOption( 'maxtime', 'Write out checkpoint file after this many minutes (writing ' .
		'out complete page, closing xml file properly, and opening new one ' .
		'with header). This option requires the checkpointfile option.', false, true );
	$this->addOption( 'checkpointfile', 'Use this string for checkpoint filenames, substituting ' .
		'first pageid written for the first %s (required) and the last pageid written for the ' .
		'second %s if it exists.', false, true, false, true ); // This can be specified multiple times
	$this->addOption( 'quiet', 'Don\'t dump status reports to stderr.' );
	$this->addOption( 'full', 'Dump all revisions of every page' );
	$this->addOption( 'current', 'Base ETA on number of pages in database instead of all revisions' );
	$this->addOption( 'spawn', 'Spawn a subprocess for loading text records, optionally specify ' .
		'php[,mwscript] paths' );
	$this->addOption( 'buffersize', 'Buffer size in bytes to use for reading the stub. ' .
		'(Default: 512KB, Minimum: 4KB)', false, true );

	if ( $args ) {
		$this->loadWithArgv( $args );
		$this->processOptions();
	}
}
146 
/**
 * Returns the blob store provided by the global service container.
 *
 * @return mixed Whatever MediaWikiServices::getBlobStore() returns.
 */
private function getBlobStore() {
	$services = MediaWikiServices::getInstance();

	return $services->getBlobStore();
}
153 
/**
 * Script entry point: applies the command-line options, then runs the dump.
 */
function execute() {
	$this->processOptions();

	// dump() is always invoked with a truthy first argument.
	$history = true;
	$this->dump( $history );
}
158 
/**
 * Copies the parsed command-line options into the dumper's properties.
 *
 * Fixes relative to the previous version:
 *  - --current had an empty body and therefore no effect; it now selects
 *    WikiExporter::CURRENT, mirroring how --full selects WikiExporter::FULL.
 *  - $this->php is always left as an array, because openSpawn() applies
 *    count() and [0] to it; the default value is the plain string "php".
 */
function processOptions() {
	parent::processOptions();

	if ( $this->hasOption( 'buffersize' ) ) {
		// Never read in chunks smaller than 4KB.
		$this->bufferSize = max( intval( $this->getOption( 'buffersize' ) ), 4 * 1024 );
	}

	if ( $this->hasOption( 'prefetch' ) ) {
		$url = $this->processFileOpt( $this->getOption( 'prefetch' ) );
		$this->prefetch = new BaseDump( $url );
	}

	if ( $this->hasOption( 'stub' ) ) {
		$this->input = $this->processFileOpt( $this->getOption( 'stub' ) );
	}

	if ( $this->hasOption( 'maxtime' ) ) {
		// The option is given in minutes; the property is kept in seconds.
		$this->maxTimeAllowed = intval( $this->getOption( 'maxtime' ) ) * 60;
	}

	if ( $this->hasOption( 'checkpointfile' ) ) {
		$this->checkpointFiles = $this->getOption( 'checkpointfile' );
	}

	if ( $this->hasOption( 'current' ) ) {
		$this->history = WikiExporter::CURRENT;
	}

	if ( $this->hasOption( 'full' ) ) {
		$this->history = WikiExporter::FULL;
	}

	if ( $this->hasOption( 'spawn' ) ) {
		$this->spawn = true;
		$val = $this->getOption( 'spawn' );
		if ( $val !== 1 ) {
			// "php[,mwscript]" was supplied explicitly.
			$this->php = explode( ',', $val, 2 );
		} else {
			// Bare --spawn: keep the default binary but normalise it to an array.
			$this->php = (array)$this->php;
		}
	}
}
199 
/**
 * Drops the current load balancer and database connection and obtains fresh ones.
 *
 * When a forced connection ($this->forcedDb) is configured it is reused and
 * no new connection is opened. Failures are not retried here; callers decide
 * whether to retry. The causing exception is attached as "previous" so its
 * class and stack trace are not lost.
 *
 * @throws MWException If the old connection is still open, or if a new load
 *  balancer or connection cannot be obtained.
 */
function rotateDb() {
	// Cleaning up old connections
	if ( isset( $this->lb ) ) {
		$this->lb->closeAll();
		unset( $this->lb );
	}

	if ( $this->forcedDb !== null ) {
		$this->db = $this->forcedDb;

		return;
	}

	if ( isset( $this->db ) && $this->db->isOpen() ) {
		throw new MWException( 'DB is set and has not been closed by the Load Balancer' );
	}

	unset( $this->db );

	// Trying to set up new connection.
	// We do /not/ retry upon failure, but delegate to encapsulating logic, to avoid
	// individually retrying at different layers of code.

	try {
		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
		$this->lb = $lbFactory->newMainLB();
	} catch ( Exception $e ) {
		throw new MWException(
			__METHOD__ . " rotating DB failed to obtain new load balancer (" . $e->getMessage() . ")",
			0,
			$e
		);
	}

	try {
		$this->db = $this->lb->getConnection( DB_REPLICA, 'dump' );
	} catch ( Exception $e ) {
		throw new MWException(
			__METHOD__ . " rotating DB failed to obtain new database (" . $e->getMessage() . ")",
			0,
			$e
		);
	}
}
249 
251  parent::initProgress();
252  $this->timeOfCheckpoint = $this->startTime;
253  }
254 
/**
 * Runs the text pass: reads the stub from $this->input and writes the
 * completed dump through the export filter.
 *
 * The stub stream is now checked after fopen() and closed when reading ends,
 * including when readDump() throws. Previously a failed open was passed
 * straight into the read loop.
 *
 * @param mixed $history Not used by this method; $this->history is used instead.
 * @param mixed $text Not used by this method.
 * @throws MWException If the option combination is invalid, the stub cannot
 *  be opened, or the stub is not well-formed XML.
 */
function dump( $history, $text = WikiExporter::TEXT ) {
	// Notice messages will foul up your XML output even if they're
	// relatively harmless.
	if ( ini_get( 'display_errors' ) ) {
		ini_set( 'display_errors', 'stderr' );
	}

	$this->initProgress( $this->history );

	// We are trying to get an initial database connection to avoid that the
	// first try of this request's first call to getText fails. However, if
	// obtaining a good DB connection fails it's not a serious issue, as
	// getText does retry upon failure and can start without having a working
	// DB connection.
	try {
		$this->rotateDb();
	} catch ( Exception $e ) {
		// We do not even count this as failure. Just let eventual
		// watchdogs know.
		$this->progress( "Getting initial DB connection failed (" .
			$e->getMessage() . ")" );
	}

	$this->egress = new ExportProgressFilter( $this->sink, $this );

	// it would be nice to do it in the constructor, oh well. need egress set
	$this->finalOptionCheck();

	// we only want this so we know how to close a stream :-P
	$this->xmlwriterobj = new XmlDumpWriter();

	$input = fopen( $this->input, "rt" );
	if ( $input === false ) {
		throw new MWException( "Unable to open stub input " . $this->input );
	}
	try {
		$this->readDump( $input );
	} finally {
		fclose( $input );
	}

	if ( $this->spawnProc ) {
		$this->closeSpawn();
	}

	$this->report( true );
}
295 
/**
 * Converts a "<type>:<file>[;<file>...]" option value into stream URIs.
 *
 * Known types (gzip, bzip2, 7zip) get the matching stream-wrapper prefix;
 * "file" and any unrecognised type leave the file names unchanged.
 *
 * @param string $opt Option value as given on the command line.
 * @return string The converted URIs joined with ';'.
 */
function processFileOpt( $opt ) {
	$parts = explode( ':', $opt, 2 );
	$type = $parts[0];
	$fileList = count( $parts ) === 2 ? $parts[1] : '';

	// Stream-wrapper prefix per compression type; anything else maps to no prefix.
	$wrappers = [
		'gzip' => 'compress.zlib://',
		'bzip2' => 'compress.bzip2://',
		'7zip' => 'mediawiki.compress.7z://',
	];
	$prefix = isset( $wrappers[$type] ) ? $wrappers[$type] : '';

	$converted = array_map(
		function ( $uri ) use ( $prefix ) {
			return $prefix . $uri;
		},
		explode( ';', $fileList )
	);

	return implode( ';', $converted );
}
327 
/**
 * Progress report that also shows the prefetch hit ratio.
 *
 * Delegates to the parent implementation when prefetching is not enabled.
 *
 * Fixes relative to the previous version:
 *  - the ETA is only computed when both maxCount and revCount are non-zero,
 *    avoiding a division by zero on an empty run;
 *  - pageCountLast is advanced together with lastTime, so the "curr" page
 *    rate covers the same interval as the "curr" revision rate.
 */
function showReport() {
	if ( !$this->prefetch ) {
		parent::showReport();

		return;
	}

	if ( $this->reporting ) {
		$now = wfTimestamp( TS_DB );
		$nowts = microtime( true );
		$deltaAll = $nowts - $this->startTime;
		$deltaPart = $nowts - $this->lastTime;
		$this->pageCountPart = $this->pageCount - $this->pageCountLast;
		$this->revCountPart = $this->revCount - $this->revCountLast;

		if ( $deltaAll ) {
			if ( $this->maxCount && $this->revCount ) {
				// Extrapolate the finish time from the share of revisions done so far.
				$portion = $this->revCount / $this->maxCount;
				$eta = $this->startTime + $deltaAll / $portion;
				$etats = wfTimestamp( TS_DB, intval( $eta ) );
			} else {
				$etats = '-';
			}
			if ( $this->fetchCount ) {
				$fetchRate = 100.0 * $this->prefetchCount / $this->fetchCount;
			} else {
				$fetchRate = '-';
			}
			$pageRate = $this->pageCount / $deltaAll;
			$revRate = $this->revCount / $deltaAll;
		} else {
			$pageRate = '-';
			$revRate = '-';
			$etats = '-';
			$fetchRate = '-';
		}
		if ( $deltaPart ) {
			if ( $this->fetchCountLast ) {
				$fetchRatePart = 100.0 * $this->prefetchCountLast / $this->fetchCountLast;
			} else {
				$fetchRatePart = '-';
			}
			$pageRatePart = $this->pageCountPart / $deltaPart;
			$revRatePart = $this->revCountPart / $deltaPart;
		} else {
			$fetchRatePart = '-';
			$pageRatePart = '-';
			$revRatePart = '-';
		}
		$this->progress( sprintf(
			"%s: %s (ID %d) %d pages (%0.1f|%0.1f/sec all|curr), "
				. "%d revs (%0.1f|%0.1f/sec all|curr), %0.1f%%|%0.1f%% "
				. "prefetched (all|curr), ETA %s [max %d]",
			$now, wfWikiID(), $this->ID, $this->pageCount, $pageRate,
			$pageRatePart, $this->revCount, $revRate, $revRatePart,
			$fetchRate, $fetchRatePart, $etats, $this->maxCount
		) );

		// Remember the state at this report as the baseline for the next one.
		$this->lastTime = $nowts;
		$this->pageCountLast = $this->pageCount;
		$this->revCountLast = $this->revCount;
		$this->prefetchCountLast = $this->prefetchCount;
		$this->fetchCountLast = $this->fetchCount;
	}
}
390 
/**
 * Flags that the time limit has been reached. endElement() reads this flag
 * when a page closes and then writes a checkpoint.
 */
function setTimeExceeded() {
	$this->timeExceeded = true;
}
394 
/**
 * Tells whether more than maxTimeAllowed seconds lie between the last
 * checkpoint and the last recorded report time.
 *
 * @return bool Always false when no time limit is configured.
 */
function checkIfTimeExceeded() {
	if ( !$this->maxTimeAllowed ) {
		return false;
	}

	$sinceCheckpoint = $this->lastTime - $this->timeOfCheckpoint;

	return $sinceCheckpoint > $this->maxTimeAllowed;
}
404 
/**
 * Validates the checkpoint-related options once the output sink is known.
 *
 * @throws MWException If only one of checkpointfile/maxtime is set, if a
 *  checkpoint pattern does not contain exactly two '%s', or if the number of
 *  patterns differs from the number of output files.
 */
function finalOptionCheck() {
	$haveCheckpointFiles = (bool)$this->checkpointFiles;
	$haveTimeLimit = (bool)$this->maxTimeAllowed;

	// Either both options are given or neither.
	if ( $haveCheckpointFiles !== $haveTimeLimit ) {
		throw new MWException( "Options checkpointfile and maxtime must be specified together.\n" );
	}

	foreach ( $this->checkpointFiles as $checkpointFile ) {
		$count = substr_count( $checkpointFile, "%s" );
		if ( $count == 2 ) {
			continue;
		}
		throw new MWException( "Option checkpointfile must contain two '%s' "
			. "for substitution of first and last pageids, count is $count instead, "
			. "file is $checkpointFile.\n" );
	}

	if ( !$haveCheckpointFiles ) {
		return;
	}

	$outputFiles = (array)$this->egress->getFilenames();
	if ( count( $outputFiles ) != count( $this->checkpointFiles ) ) {
		throw new MWException( "One checkpointfile must be specified "
			. "for each output option, if maxtime is used.\n" );
	}
}
428 
/**
 * Streams the stub XML from $input through an expat parser whose handlers
 * (startElement, endElement, characterData) write the completed dump.
 *
 * After the input is exhausted and a time limit is configured, the output
 * file(s) written since the last checkpoint are closed and renamed using the
 * checkpoint patterns.
 *
 * @param resource $input Open stream to read the stub from.
 * @return bool Always true; failures are reported by exception.
 * @throws MWException If the stub is not well-formed XML.
 */
434  function readDump( $input ) {
// Reset all per-run parser state.
435  $this->buffer = "";
436  $this->openElement = false;
437  $this->atStart = true;
438  $this->state = "";
439  $this->lastName = "";
440  $this->thisPage = 0;
441  $this->thisRev = 0;
442  $this->thisRevModel = null;
443  $this->thisRevFormat = null;
444 
445  $parser = xml_parser_create( "UTF-8" );
// Keep element names in their original case; the handlers compare against lower-case names.
446  xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );
447 
448  xml_set_element_handler(
449  $parser,
450  [ $this, 'startElement' ],
451  [ $this, 'endElement' ]
452  );
453  xml_set_character_data_handler( $parser, [ $this, 'characterData' ] );
454 
455  $offset = 0; // for context extraction on error reporting
456  do {
// Only raise the flag here; the checkpoint itself is written by endElement()
// when the current page closes.
457  if ( $this->checkIfTimeExceeded() ) {
458  $this->setTimeExceeded();
459  }
460  $chunk = fread( $input, $this->bufferSize );
461  if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) {
462  wfDebug( "TextDumpPass::readDump encountered XML parsing error\n" );
463 
464  $byte = xml_get_current_byte_index( $parser );
// The message includes up to 16 bytes of the chunk at the failing position as context.
// NOTE(review): fread() returns a string or false, so the is_null() branch
// looks unreachable - confirm.
465  $msg = wfMessage( 'xml-error-string',
466  'XML import parse failure',
467  xml_get_current_line_number( $parser ),
468  xml_get_current_column_number( $parser ),
469  $byte . ( is_null( $chunk ) ? null : ( '; "' . substr( $chunk, $byte - $offset, 16 ) . '"' ) ),
470  xml_error_string( xml_get_error_code( $parser ) ) )->escaped();
471 
472  xml_parser_free( $parser );
473 
474  throw new MWException( $msg );
475  }
476  $offset += strlen( $chunk );
477  } while ( $chunk !== false && !feof( $input ) );
478  if ( $this->maxTimeAllowed ) {
479  $filenameList = (array)$this->egress->getFilenames();
480  // we wrote some stuff after last checkpoint that needs renamed
481  if ( file_exists( $filenameList[0] ) ) {
482  $newFilenames = [];
483  # we might have just written the header and footer and had no
484  # pages or revisions written... perhaps they were all deleted
485  # there's no pageID 0 so we use that. the caller is responsible
486  # for deciding what to do with a file containing only the
487  # siteinfo information and the mw tags.
488  if ( !$this->firstPageWritten ) {
489  $firstPageID = str_pad( 0, 9, "0", STR_PAD_LEFT );
490  $lastPageID = str_pad( 0, 9, "0", STR_PAD_LEFT );
491  } else {
492  $firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT );
493  $lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT );
494  }
495 
// Build one target name per output file, kept in that file's directory.
496  $filenameCount = count( $filenameList );
497  for ( $i = 0; $i < $filenameCount; $i++ ) {
498  $checkpointNameFilledIn = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID );
499  $fileinfo = pathinfo( $filenameList[$i] );
500  $newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointNameFilledIn;
501  }
502  $this->egress->closeAndRename( $newFilenames );
503  }
504  }
505  xml_parser_free( $parser );
506 
507  return true;
508  }
509 
/**
 * Applies applicable export transformations to $text.
 *
 * The content handler for $model is looked up and asked to transform the
 * text. The lookup was missing from the previous version, which used an
 * undefined $handler. If the handler cannot be obtained or the
 * transformation fails with an MWException, the problem is reported through
 * progress() and the text is returned unchanged.
 *
 * @param string $text Serialized content to transform.
 * @param string $model Content model ID.
 * @param string|null $format Serialization format, or null to pass none.
 * @return string The transformed text, or the original text on failure.
 */
private function exportTransform( $text, $model, $format = null ) {
	try {
		$handler = ContentHandler::getForModelID( $model );
		$text = $handler->exportTransform( $text, $format );
	} catch ( MWException $ex ) {
		$this->progress(
			"Unable to apply export transformation for content model '$model': " .
			$ex->getMessage()
		);
	}

	return $text;
}
533 
/**
 * Tries to load the text for a revision, retrying up to maxFailures times.
 *
 * Prefetch is tried once if available; otherwise the text comes from the
 * database, directly or through the spawned subprocess. Prefetched text is
 * only accepted after a length plausibility check. After a non-prefetch
 * failure the method sleeps, rotates the DB connection and restarts the
 * subprocess before the next attempt.
 *
 * @param int|string $id Text ID or blob address to fetch.
 * @param string|null $model Content model; looked up from the revision table
 *  when null and $wgContentHandlerUseDB is set.
 * @param string|null $format Content format passed to the export transformation.
 * @return string The text, or "" when every attempt failed.
 * @throws MWException When more than maxConsecutiveFailedTextRetrievals
 *  calls in a row have failed.
 */
554  function getText( $id, $model = null, $format = null ) {
555  global $wgContentHandlerUseDB;
556 
557  $prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch.
558  $text = false; // The candidate for a good text. false if no proper value.
559  $failures = 0; // The number of times, this invocation of getText already failed.
560 
561  // The number of times getText failed without yielding a good text in between.
562  static $consecutiveFailedTextRetrievals = 0;
563 
564  $this->fetchCount++;
565 
566  // To allow to simply return on success and do not have to worry about book keeping,
567  // we assume, this fetch works (possible after some retries). Nevertheless, we keep
568  // the old value, so we can restore it, if problems occur (See after the while loop).
569  $oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals;
570  $consecutiveFailedTextRetrievals = 0;
571 
// NOTE(review): this query runs outside the retry loop and without an
// isset( $this->db ) check, although rotateDb() can leave $this->db unset
// when it throws - confirm this cannot be reached without a connection.
572  if ( $model === null && $wgContentHandlerUseDB ) {
573  // TODO: MCR: use content table
574  $row = $this->db->selectRow(
575  'revision',
576  [ 'rev_content_model', 'rev_content_format' ],
577  [ 'rev_id' => $this->thisRev ],
578  __METHOD__
579  );
580 
581  if ( $row ) {
582  $model = $row->rev_content_model;
583  $format = $row->rev_content_format;
584  }
585  }
586 
// From here on, false means "content model unknown"; no export transformation is applied then.
587  if ( $model === null || $model === '' ) {
588  $model = false;
589  }
590 
591  while ( $failures < $this->maxFailures ) {
592  // As soon as we found a good text for the $id, we will return immediately.
593  // Hence, if we make it past the try catch block, we know that we did not
594  // find a good text.
595 
596  try {
597  // Step 1: Get some text (or reuse from previous iteration if checking
598  // for plausibility failed)
599 
600  // Trying to get prefetch, if it has not been tried before
601  if ( $text === false && isset( $this->prefetch ) && $prefetchNotTried ) {
602  $prefetchNotTried = false;
603  $tryIsPrefetch = true;
604  $text = $this->prefetch->prefetch( (int)$this->thisPage, (int)$this->thisRev );
605 
606  if ( $text === null ) {
607  $text = false;
608  }
609 
610  if ( is_string( $text ) && $model !== false ) {
611  // Apply export transformation to text coming from an old dump.
612  // The purpose of this transformation is to convert up from legacy
613  // formats, which may still be used in the older dump that is used
614  // for pre-fetching. Applying the transformation again should not
615  // interfere with content that is already in the correct form.
616  $text = $this->exportTransform( $text, $model, $format );
617  }
618  }
619 
620  if ( $text === false ) {
621  // Fallback to asking the database
622  $tryIsPrefetch = false;
623  if ( $this->spawn ) {
624  $text = $this->getTextSpawned( $id );
625  } else {
626  $text = $this->getTextDb( $id );
627  }
628 
629  if ( $text !== false && $model !== false ) {
630  // Apply export transformation to text coming from the database.
631  // Prefetched text should already have transformations applied.
632  $text = $this->exportTransform( $text, $model, $format );
633  }
634 
635  // No more checks for texts from DB for now.
636  // If we received something that is not false,
637  // We treat it as good text, regardless of whether it actually is or is not
638  if ( $text !== false ) {
639  return $text;
640  }
641  }
642 
643  if ( $text === false ) {
644  throw new MWException( "Generic error while obtaining text for id " . $id );
645  }
646 
647  // We received a good candidate for the text of $id via some method
648 
649  // Step 2: Checking for plausibility and return the text if it is
650  // plausible
651  $revID = intval( $this->thisRev );
652  if ( !isset( $this->db ) ) {
653  throw new MWException( "No database available" );
654  }
655 
// The stored revision length is only consulted for wikitext; for any other
// model the expected length is the text's own length, so the check below passes.
656  if ( $model !== CONTENT_MODEL_WIKITEXT ) {
657  $revLength = strlen( $text );
658  } else {
659  $revLength = $this->db->selectField( 'revision', 'rev_len', [ 'rev_id' => $revID ] );
660  }
661 
662  if ( strlen( $text ) == $revLength ) {
663  if ( $tryIsPrefetch ) {
664  $this->prefetchCount++;
665  }
666 
667  return $text;
668  }
669 
// Discard the implausible candidate so the next iteration falls back to the database.
670  $text = false;
671  throw new MWException( "Received text is unplausible for id " . $id );
672  } catch ( Exception $e ) {
673  $msg = "getting/checking text " . $id . " failed (" . $e->getMessage() . ")";
674  if ( $failures + 1 < $this->maxFailures ) {
675  $msg .= " (Will retry " . ( $this->maxFailures - $failures - 1 ) . " more times)";
676  }
677  $this->progress( $msg );
678  }
679 
680  // Something went wrong; we did not get a text that was plausible :(
681  $failures++;
682 
683  // A failure in a prefetch hit does not warrant resetting db connection etc.
684  if ( !$tryIsPrefetch ) {
685  // After backing off for some time, we try to reboot the whole process as
686  // much as possible to not carry over failures from one part to the other
687  // parts
688  sleep( $this->failureTimeout );
689  try {
690  $this->rotateDb();
691  if ( $this->spawn ) {
692  $this->closeSpawn();
693  $this->openSpawn();
694  }
695  } catch ( Exception $e ) {
696  $this->progress( "Rebooting getText infrastructure failed (" . $e->getMessage() . ")" .
697  " Trying to continue anyways" );
698  }
699  }
700  }
701 
702  // Retrieving a good text for $id failed (at least) maxFailures times.
703  // We abort for this $id.
704 
705  // Restoring the consecutive failures, and maybe aborting, if the dump
706  // is too broken.
707  $consecutiveFailedTextRetrievals = $oldConsecutiveFailedTextRetrievals + 1;
708  if ( $consecutiveFailedTextRetrievals > $this->maxConsecutiveFailedTextRetrievals ) {
709  throw new MWException( "Graceful storage failure" );
710  }
711 
712  return "";
713  }
714 
/**
 * Loads a text blob directly through the blob store.
 *
 * Integer IDs and strings without a ':' are treated as text-table IDs and
 * converted to a blob address first. Carriage returns are removed and the
 * result is normalized by the content language.
 *
 * @param int|string $id Text ID or blob address.
 * @return string|bool The normalized text, or false if the blob cannot be accessed.
 */
private function getTextDb( $id ) {
	$blobStore = $this->getBlobStore();

	if ( is_int( $id ) || strpos( $id, ':' ) === false ) {
		$address = SqlBlobStore::makeAddressFromTextId( (int)$id );
	} else {
		$address = $id;
	}

	try {
		$raw = $blobStore->getBlob( $address );
		$withoutCr = str_replace( "\r", "", $raw );

		return MediaWikiServices::getInstance()
			->getContentLanguage()
			->normalize( $withoutCr );
	} catch ( BlobAccessException $ex ) {
		// XXX: log a warning?
		return false;
	}
}
740 
/**
 * Fetches a text through the subprocess, starting it on first use.
 *
 * Warnings are suppressed for the duration of the call. The suppression is
 * now lifted in a finally block so it cannot leak when something throws,
 * and a failed spawn returns false instead of writing to a missing pipe.
 *
 * @param int|string $address Text ID or blob address.
 * @return string|bool The text, or false on failure.
 */
private function getTextSpawned( $address ) {
	Wikimedia\suppressWarnings();
	try {
		// First time?
		if ( !$this->spawnProc && !$this->openSpawn() ) {
			return false;
		}

		return $this->getTextSpawnedOnce( $address );
	} finally {
		Wikimedia\restoreWarnings();
	}
}
756 
/**
 * Starts the fetchText.php subprocess and stores its process handle and pipes.
 *
 * The script is run through MWScript.php when that wrapper exists (either
 * the path given with --spawn or the default multiversion location) and
 * directly from the maintenance directory otherwise. The child's stderr is
 * sent to /dev/null.
 *
 * $this->php defaults to the plain string "php" and only becomes an array
 * when --spawn carries a value, so it is cast to an array before count()
 * and index access are applied to it.
 *
 * @return bool True if the subprocess was started, false otherwise.
 */
function openSpawn() {
	global $IP;

	$php = (array)$this->php;

	if ( count( $php ) == 2 ) {
		$mwscriptpath = $php[1];
	} else {
		$mwscriptpath = "$IP/../multiversion/MWScript.php";
	}

	if ( file_exists( $mwscriptpath ) ) {
		$args = [ $php[0], $mwscriptpath, "fetchText.php", '--wiki', wfWikiID() ];
	} else {
		$args = [ $php[0], "$IP/maintenance/fetchText.php", '--wiki', wfWikiID() ];
	}
	$cmd = implode( " ", array_map( [ Shell::class, 'escape' ], $args ) );

	$spec = [
		0 => [ "pipe", "r" ],
		1 => [ "pipe", "w" ],
		2 => [ "file", "/dev/null", "a" ] ];
	$pipes = [];

	$this->progress( "Spawning database subprocess: $cmd" );
	$this->spawnProc = proc_open( $cmd, $spec, $pipes );
	if ( !$this->spawnProc ) {
		$this->progress( "Subprocess spawn failed." );

		return false;
	}
	list(
		$this->spawnWrite, // -> stdin
		$this->spawnRead, // <- stdout
	) = $pipes;

	return true;
}
801 
/**
 * Closes the subprocess pipes and then the subprocess itself, resetting all
 * handles to false.
 *
 * The process handle comes from proc_open() and must be released with
 * proc_close(); pclose() only accepts popen() streams. The pipes are closed
 * first so the child sees end-of-input before proc_close() waits for it.
 */
private function closeSpawn() {
	Wikimedia\suppressWarnings();
	if ( $this->spawnRead ) {
		fclose( $this->spawnRead );
	}
	$this->spawnRead = false;
	if ( $this->spawnWrite ) {
		fclose( $this->spawnWrite );
	}
	$this->spawnWrite = false;
	if ( $this->spawnErr ) {
		fclose( $this->spawnErr );
	}
	$this->spawnErr = false;
	if ( $this->spawnProc ) {
		proc_close( $this->spawnProc );
	}
	$this->spawnProc = false;
	Wikimedia\restoreWarnings();
}
822 
/**
 * Performs one request/response exchange with the text-fetching subprocess.
 *
 * The address is written as one line; the reply is expected to be a line
 * echoing the address, a line with the byte length, then that many bytes of
 * text. Carriage returns are removed and the text is normalized by the
 * content language.
 *
 * The read loop now stops when the pipe reaches end-of-file. fread() returns
 * an empty string, not false, at EOF, so the loop previously never ended if
 * the child exited in the middle of a reply.
 *
 * @param int|string $address Text ID or blob address.
 * @return string|bool The normalized text, or false on any protocol or I/O failure.
 */
private function getTextSpawnedOnce( $address ) {
	if ( is_int( $address ) || intval( $address ) ) {
		$address = SqlBlobStore::makeAddressFromTextId( (int)$address );
	}

	$ok = fwrite( $this->spawnWrite, "$address\n" );
	if ( !$ok ) {
		return false;
	}

	$ok = fflush( $this->spawnWrite );
	if ( !$ok ) {
		return false;
	}

	// check that the text address they are sending is the one we asked for
	// this avoids out of sync revision text errors we have encountered in the past
	$newAddress = fgets( $this->spawnRead );
	if ( $newAddress === false ) {
		return false;
	}
	$newAddress = trim( $newAddress );
	if ( strpos( $newAddress, ':' ) === false ) {
		$newAddress = SqlBlobStore::makeAddressFromTextId( intval( $newAddress ) );
	}

	if ( $newAddress !== $address ) {
		return false;
	}

	$len = fgets( $this->spawnRead );
	if ( $len === false ) {
		return false;
	}

	$nbytes = intval( $len );
	// actual error, not zero-length text
	if ( $nbytes < 0 ) {
		return false;
	}

	$text = "";

	// Subprocess may not send everything at once, we have to loop.
	while ( $nbytes > strlen( $text ) ) {
		$buffer = fread( $this->spawnRead, $nbytes - strlen( $text ) );
		if ( $buffer === false || ( $buffer === '' && feof( $this->spawnRead ) ) ) {
			// Read error, or the child closed its end before sending everything.
			break;
		}
		$text .= $buffer;
	}

	$gotbytes = strlen( $text );
	if ( $gotbytes != $nbytes ) {
		$this->progress( "Expected $nbytes bytes from database subprocess, got $gotbytes " );

		return false;
	}

	// Do normalization in the dump thread...
	$stripped = str_replace( "\r", "", $text );
	$normalized = MediaWikiServices::getInstance()->getContentLanguage()->
		normalize( $stripped );

	return $normalized;
}
896 
898  $this->checkpointJustWritten = false;
899 
900  $this->clearOpenElement( null );
901  $this->lastName = $name;
902 
903  if ( $name == 'revision' ) {
904  $this->state = $name;
905  $this->egress->writeOpenPage( null, $this->buffer );
906  $this->buffer = "";
907  } elseif ( $name == 'page' ) {
908  $this->state = $name;
909  if ( $this->atStart ) {
910  $this->egress->writeOpenStream( $this->buffer );
911  $this->buffer = "";
912  $this->atStart = false;
913  }
914  }
915 
916  if ( $name == "text" && isset( $attribs['id'] ) ) {
917  $id = $attribs['id'];
918  $model = trim( $this->thisRevModel );
919  $format = trim( $this->thisRevFormat );
920 
921  $model = $model === '' ? null : $model;
922  $format = $format === '' ? null : $format;
923 
924  $text = $this->getText( $id, $model, $format );
925  $this->openElement = [ $name, [ 'xml:space' => 'preserve' ] ];
926  if ( strlen( $text ) > 0 ) {
927  $this->characterData( $parser, $text );
928  }
929  } else {
930  $this->openElement = [ $name, $attribs ];
931  }
932  }
933 
/**
 * XML parser callback for a closing tag.
 *
 * Flushes the buffered markup to the export filter when a revision, page or
 * the whole document ends. When the time limit was exceeded and a page has
 * just been completed, the current output file(s) are closed, renamed to the
 * checkpoint name(s) and reopened with a fresh stream header.
 *
 * @param resource $parser The XML parser (unused).
 * @param string $name Name of the element being closed.
 */
934  function endElement( $parser, $name ) {
935  $this->checkpointJustWritten = false;
936 
// An element that is still pending had no content, so emit it with the ""
// style; otherwise append a regular closing tag.
937  if ( $this->openElement ) {
938  $this->clearOpenElement( "" );
939  } else {
940  $this->buffer .= "</$name>";
941  }
942 
943  if ( $name == 'revision' ) {
944  $this->egress->writeRevision( null, $this->buffer );
945  $this->buffer = "";
// Reset per-revision state before the next revision starts.
946  $this->thisRev = "";
947  $this->thisRevModel = null;
948  $this->thisRevFormat = null;
949  } elseif ( $name == 'page' ) {
// Track the first and last page IDs written to the current output file.
950  if ( !$this->firstPageWritten ) {
951  $this->firstPageWritten = trim( $this->thisPage );
952  }
953  $this->lastPageWritten = trim( $this->thisPage );
954  if ( $this->timeExceeded ) {
955  $this->egress->writeClosePage( $this->buffer );
956  // nasty hack, we can't just write the chardata after the
957  // page tag, it will include leading blanks from the next line
958  $this->egress->sink->write( "\n" );
959 
// Terminate the XML stream so the checkpoint file is complete on its own.
960  $this->buffer = $this->xmlwriterobj->closeStream();
961  $this->egress->writeCloseStream( $this->buffer );
962 
963  $this->buffer = "";
964  $this->thisPage = "";
965  // this could be more than one file if we had more than one output arg
966 
967  $filenameList = (array)$this->egress->getFilenames();
968  $newFilenames = [];
969  $firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT );
970  $lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT );
971  $filenamesCount = count( $filenameList );
972  for ( $i = 0; $i < $filenamesCount; $i++ ) {
973  $checkpointNameFilledIn = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID );
974  $fileinfo = pathinfo( $filenameList[$i] );
975  $newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointNameFilledIn;
976  }
977  $this->egress->closeRenameAndReopen( $newFilenames );
// Start the next file with a fresh stream header and reset the checkpoint state.
978  $this->buffer = $this->xmlwriterobj->openStream();
979  $this->timeExceeded = false;
980  $this->timeOfCheckpoint = $this->lastTime;
981  $this->firstPageWritten = false;
982  $this->checkpointJustWritten = true;
983  } else {
984  $this->egress->writeClosePage( $this->buffer );
985  $this->buffer = "";
986  $this->thisPage = "";
987  }
988  } elseif ( $name == 'mediawiki' ) {
989  $this->egress->writeCloseStream( $this->buffer );
990  $this->buffer = "";
991  }
992  }
993 
/**
 * XML parser callback for character data.
 *
 * Collects the text of <id>, <model> and <format> elements into the matching
 * per-page/per-revision properties and appends the escaped data to the
 * output buffer.
 *
 * The first-character check after a checkpoint is now guarded against an
 * empty string, which previously triggered an "uninitialized string offset"
 * diagnostic.
 *
 * @param resource $parser The XML parser (unused).
 * @param string $data Character data.
 */
function characterData( $parser, $data ) {
	$this->clearOpenElement( null );
	if ( $this->lastName == "id" ) {
		if ( $this->state == "revision" ) {
			$this->thisRev .= $data;
		} elseif ( $this->state == "page" ) {
			$this->thisPage .= $data;
		}
	} elseif ( $this->lastName == "model" ) {
		$this->thisRevModel .= $data;
	} elseif ( $this->lastName == "format" ) {
		$this->thisRevFormat .= $data;
	}

	// have to skip the newline left over from closepagetag line of
	// end of checkpoint files. nasty hack!!
	if ( $this->checkpointJustWritten ) {
		if ( $data !== '' && $data[0] == "\n" ) {
			$data = substr( $data, 1 );
		}
		$this->checkpointJustWritten = false;
	}
	$this->buffer .= htmlspecialchars( $data );
}
1018 
/**
 * Writes the pending opening tag, if any, to the buffer and clears it.
 *
 * @param string|null $style Passed through to Xml::element() as its third argument.
 */
function clearOpenElement( $style ) {
	if ( !$this->openElement ) {
		return;
	}

	list( $tagName, $tagAttribs ) = $this->openElement;
	$this->buffer .= Xml::element( $tagName, $tagAttribs, $style );
	$this->openElement = false;
}
1025 }
bool resource $spawnWrite
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
IMaintainableDatabase $db
const CONTENT_MODEL_WIKITEXT
Definition: Defines.php:231
processing should stop and the error should be shown to the user * false
Definition: hooks.txt:187
$IP
Definition: WebStart.php:41
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for use
getOption( $name, $default=null)
Get an option, or return the default.
progress( $string)
div flags Integer display flags(NO_ACTION_LINK, NO_EXTRA_USER_LINKS) 'LogException' returning false will NOT prevent logging $e
Definition: hooks.txt:2159
characterData( $parser, $data)
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for and distribution as defined by Sections through of this document Licensor shall mean the copyright owner or entity authorized by the copyright owner that is granting the License Legal Entity shall mean the union of the acting entity and all other entities that control are controlled by or are under common control with that entity For the purposes of this definition control direct or to cause the direction or management of such whether by contract or including but not limited to software source documentation and configuration files Object form shall mean any form resulting from mechanical transformation or translation of a Source including but not limited to compiled object generated and conversions to other media types Work shall mean the work of whether in Source or Object made available under the as indicated by a copyright notice that is included in or attached to the whether in Source or Object that is based or other modifications as a an original work of authorship For the purposes of this Derivative Works shall not include works that remain separable from
string bool $thisRev
exportTransform( $text, $model, $format=null)
Applies applicable export transformations to $text.
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency MediaWikiServices
Definition: injection.txt:23
__construct( $args=null)
getText( $id, $model=null, $format=null)
Tries to load revision text.
hasOption( $name)
Checks to see if a particular option exists.
showReport()
Overridden to include prefetch ratio if enabled.
target page
bool XmlDumpWriter $xmlwriterobj
loadWithArgv( $argv)
Load params and arguments from a given array of command-line arguments.
Exception representing a failure to access a data blob.
startElement( $parser, $name, $attribs)
see documentation in includes Linker php for Linker::makeImageLink or false for current used if you return false $parser
Definition: hooks.txt:1799
This list may contain false positives That usually means there is additional text with links below the first Each row contains links to the first and second as well as the first line of the second redirect text
clearOpenElement( $style)
This document provides an overview of the usage of PageUpdater and that is
Definition: pageupdater.txt:3
if( $line===false) $args
Definition: cdb.php:64
The ContentHandler facility adds support for arbitrary content types on wiki instead of relying on wikitext for everything It was introduced in MediaWiki Each kind of and so on Built in content types are
$wgContentHandlerUseDB
Set to false to disable use of the database fields introduced by the ContentHandler facility...
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not it can be in the form of< username >< more info > e g for bot passwords intended to be added to log contexts Fields it might only if the login was with a bot password it is not rendered in wiki pages or galleries in category pages allow injecting custom HTML after the section Any uses of the hook need to handle escaping see BaseTemplate::getToolbox and BaseTemplate::makeListItem for details on the format of individual items inside of this array or by returning and letting standard HTTP rendering take place modifiable or by returning false and taking over the output modifiable modifiable after all normalizations have been except for the $wgMaxImageArea check set to true or false to override the $wgMaxImageArea check result gives extension the possibility to transform it themselves $handler
Definition: hooks.txt:780
An extension or a local will often add custom code to the function with or without a global variable For someone wanting email notification when an article is shown may add
Definition: hooks.txt:51
getTextSpawned( $address)
wfTimestamp( $outputtype=TS_UNIX, $ts=0)
Get a timestamp string in one of various formats.
dump( $history, $text=WikiExporter::TEXT)
either a unescaped string or a HtmlArmor object after in associative array form externallinks including delete and has completed for all link tables whether this was an auto creation use $formDescriptor instead default is conds Array Extra conditions for the No matching items in log is displayed if loglist is empty msgKey Array If you want a nice box with a set this to the key of the message First element is the message additional optional elements are parameters for the key that are processed with wfMessage() -> params() ->parseAsBlock() - offset Set to overwrite offset parameter in $wgRequest set to '' to unset offset - wrap String Wrap the message in html(usually something like "&lt
static getForModelID( $modelId)
Returns the ContentHandler singleton for the given model ID.
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
addDescription( $text)
Set the description text.
bool resource $spawnProc
getTextDb( $id)
Loads the serialized content from storage.
null means default in associative array with keys and values unescaped Should be merged with default with a value of false meaning to suppress the attribute in associative array with keys and values unescaped noclasses just before the function returns a value If you return an< a > element with HTML attributes $attribs and contents $html will be returned If you return $ret will be returned and may include noclasses after processing & $attribs
Definition: hooks.txt:1982
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not null
Definition: hooks.txt:780
$buffer
wfWikiID()
Get an ASCII string identifying this wiki This is used as a prefix in memcached keys.
output( $out, $channel=null)
Throw some output to the user.
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
string bool $thisPage
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
presenting them properly to the user as errors is done by the caller return true use this to change the list i e etc next in line in page history
Definition: hooks.txt:1766
you have access to all of the normal MediaWiki so you can get a DB use the etc For full docs on the Maintenance class
Definition: maintenance.txt:52
IDatabase null $forcedDb
The dependency-injected database to use.
report( $final=false)
initProgress( $history=WikiExporter::FULL)
bool resource $spawnRead
static element( $element, $attribs=null, $contents='', $allowShortTag=true)
Format an XML element with given attributes and, optionally, text content.
Definition: Xml.php:41
Using a hook running we can avoid having all this option specific stuff in our mainline code Using the function We ve cleaned up the code here by removing clumps of infrequently used code and moving them off somewhere else It s much easier for someone working with this code to see what s _really_ going on
Definition: hooks.txt:77
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
Allows to change the fields on the form that will be generated $name
Definition: hooks.txt:271
const DB_REPLICA
Definition: defines.php:25
BaseDump $prefetch
bool resource $spawnErr
rotateDb()
Drop the database connection $this->db and try to get a new one.
getTextSpawnedOnce( $address)
endElement( $parser, $name)