Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 506 |
|
0.00% |
0 / 26 |
CRAP | |
0.00% |
0 / 1 |
TextPassDumper | |
0.00% |
0 / 505 |
|
0.00% |
0 / 26 |
24492 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 26 |
|
0.00% |
0 / 1 |
6 | |||
finalSetup | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
getBlobStore | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getRevisionStore | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
processOptions | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
110 | |||
initProgress | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
dump | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
12 | |||
processFileOpt | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
72 | |||
showReport | |
0.00% |
0 / 45 |
|
0.00% |
0 / 1 |
56 | |||
setTimeExceeded | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
checkIfTimeExceeded | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
finalOptionCheck | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
90 | |||
readDump | |
0.00% |
0 / 52 |
|
0.00% |
0 / 1 |
90 | |||
exportTransform | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
6 | |||
getText | |
0.00% |
0 / 75 |
|
0.00% |
0 / 1 |
930 | |||
getTextDb | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 | |||
getTextSpawned | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
openSpawn | |
0.00% |
0 / 34 |
|
0.00% |
0 / 1 |
20 | |||
closeSpawn | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
30 | |||
getTextSpawnedOnce | |
0.00% |
0 / 36 |
|
0.00% |
0 / 1 |
182 | |||
startElement | |
0.00% |
0 / 36 |
|
0.00% |
0 / 1 |
272 | |||
endElement | |
0.00% |
0 / 43 |
|
0.00% |
0 / 1 |
72 | |||
characterData | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
110 | |||
clearOpenElement | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
isValidTextId | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | /** |
3 | * BackupDumper that postprocesses XML dumps from dumpBackup.php to add page text |
4 | * |
5 | * Copyright (C) 2005 Brooke Vibber <bvibber@wikimedia.org> |
6 | * https://www.mediawiki.org/ |
7 | * |
8 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or |
11 | * (at your option) any later version. |
12 | * |
13 | * This program is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | * GNU General Public License for more details. |
17 | * |
18 | * You should have received a copy of the GNU General Public License along |
19 | * with this program; if not, write to the Free Software Foundation, Inc., |
20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
21 | * http://www.gnu.org/copyleft/gpl.html |
22 | * |
23 | * @file |
24 | * @ingroup Dump |
25 | * @ingroup Maintenance |
26 | */ |
27 | |
28 | namespace MediaWiki\Maintenance; |
29 | |
30 | // @codeCoverageIgnoreStart |
31 | require_once __DIR__ . '/BackupDumper.php'; |
32 | require_once __DIR__ . '/../../includes/export/WikiExporter.php'; |
33 | // @codeCoverageIgnoreEnd |
34 | |
35 | use BaseDump; |
36 | use Exception; |
37 | use ExportProgressFilter; |
38 | use MediaWiki\Exception\MWException; |
39 | use MediaWiki\Exception\MWUnknownContentModelException; |
40 | use MediaWiki\Revision\RevisionStore; |
41 | use MediaWiki\Revision\SlotRecord; |
42 | use MediaWiki\Settings\SettingsBuilder; |
43 | use MediaWiki\Shell\Shell; |
44 | use MediaWiki\Storage\BlobAccessException; |
45 | use MediaWiki\Storage\BlobStore; |
46 | use MediaWiki\Storage\SqlBlobStore; |
47 | use MediaWiki\WikiMap\WikiMap; |
48 | use MediaWiki\Xml\Xml; |
49 | use RuntimeException; |
50 | use WikiExporter; |
51 | use Wikimedia\AtEase\AtEase; |
52 | use XmlDumpWriter; |
53 | |
54 | /** |
55 | * @ingroup Maintenance |
56 | */ |
57 | class TextPassDumper extends BackupDumper { |
58 | /** @var BaseDump|null */ |
59 | public $prefetch = null; |
60 | /** @var string */ |
61 | private $thisPage; |
62 | /** @var string */ |
63 | private $thisRev; |
64 | /** @var string|null */ |
65 | private $thisRole = null; |
66 | |
67 | /** |
68 | * @var int when we spend more than maxTimeAllowed seconds on this run, we continue |
69 | * processing until we write out the next complete page, then save output file(s), |
70 | * rename it/them and open new one(s); 0 = no limit |
71 | */ |
72 | public $maxTimeAllowed = 0; |
73 | |
74 | /** @var string */ |
75 | protected $input = "php://stdin"; |
76 | /** @var int */ |
77 | protected $history = WikiExporter::FULL; |
78 | /** @var int */ |
79 | protected $fetchCount = 0; |
80 | /** @var int */ |
81 | protected $prefetchCount = 0; |
82 | /** @var int */ |
83 | protected $prefetchCountLast = 0; |
84 | /** @var int */ |
85 | protected $fetchCountLast = 0; |
86 | |
87 | /** @var int */ |
88 | protected $maxFailures = 5; |
89 | /** @var int */ |
90 | protected $maxConsecutiveFailedTextRetrievals = 200; |
91 | /** @var int Seconds to sleep after db failure */ |
92 | protected $failureTimeout = 5; |
93 | |
94 | /** @var int In bytes. Maximum size to read from the stub in one go. */ |
95 | protected $bufferSize = 524_288; |
96 | |
97 | /** @var array */ |
98 | protected $php = [ PHP_BINARY ]; |
99 | /** @var bool */ |
100 | protected $spawn = false; |
101 | |
102 | /** |
103 | * @var resource|false |
104 | */ |
105 | protected $spawnProc = false; |
106 | |
107 | /** |
108 | * @var resource|null |
109 | */ |
110 | protected $spawnWrite; |
111 | |
112 | /** |
113 | * @var resource|null |
114 | */ |
115 | protected $spawnRead; |
116 | |
117 | /** |
118 | * @var resource|false |
119 | */ |
120 | protected $spawnErr = false; |
121 | |
122 | /** |
123 | * @var XmlDumpWriter|false |
124 | */ |
125 | protected $xmlwriterobj = false; |
126 | |
127 | /** @var bool */ |
128 | protected $timeExceeded = false; |
129 | /** @var string|false */ |
130 | protected $firstPageWritten = false; |
131 | /** @var string|false */ |
132 | protected $lastPageWritten = false; |
133 | /** @var bool */ |
134 | protected $checkpointJustWritten = false; |
135 | /** @var string[] */ |
136 | protected $checkpointFiles = []; |
137 | |
/**
 * Registers the script description and all command line options.
 *
 * @param array|null $args For backward compatibility: when non-empty, the
 *  arguments are loaded via loadWithArgv() and the options are processed
 *  immediately instead of waiting for execute().
 */
public function __construct( $args = null ) {
	parent::__construct();

	$this->addDescription( <<<TEXT
This script postprocesses XML dumps from dumpBackup.php to add
page text which was stubbed out (using --stub).

XML input is accepted on stdin.
XML output is sent to stdout; progress reports are sent to stderr.
TEXT
	);
	$this->stderr = fopen( "php://stderr", "wt" );

	// NOTE: each concatenated description fragment must end with (or the next
	// one start with) a space, otherwise the words run together in --help.
	$this->addOption( 'stub', 'To load a compressed stub dump instead of stdin. ' .
		'Specify as --stub=<type>:<file>.', false, true );
	$this->addOption( 'prefetch', 'Use a prior dump file as a text source, to save pressure on the ' .
		'database. (Requires the XMLReader extension). Specify as --prefetch=<type>:<file>',
		false, true );
	$this->addOption( 'maxtime', 'Write out checkpoint file after this many minutes (writing ' .
		'out complete page, closing xml file properly, and opening new one ' .
		'with header). This option requires the checkpointfile option.', false, true );
	// This can be specified multiple times
	$this->addOption( 'checkpointfile', 'Use this string for checkpoint filenames, substituting ' .
		'first pageid written for the first %s (required) and the last pageid written for the ' .
		'second %s if it exists.', false, true, false, true );
	$this->addOption( 'quiet', 'Don\'t dump status reports to stderr.' );
	$this->addOption( 'full', 'Dump all revisions of every page' );
	$this->addOption( 'current', 'Base ETA on number of pages in database instead of all revisions' );
	$this->addOption( 'spawn', 'Spawn a subprocess for loading text records, optionally specify ' .
		'php[,mwscript] paths' );
	$this->addOption( 'buffersize', 'Buffer size in bytes to use for reading the stub. ' .
		'(Default: 512 KiB, Minimum: 4 KiB)', false, true );

	if ( $args ) {
		$this->loadWithArgv( $args );
		$this->processOptions();
	}
}
178 | |
/**
 * Runs the parent's final setup, then registers SevenZipStream.
 *
 * NOTE(review): the registration presumably provides the
 * "mediawiki.compress.7z://" wrapper that processFileOpt() emits for the
 * "7zip" type - confirm against SevenZipStream.
 *
 * @param SettingsBuilder $settingsBuilder
 */
public function finalSetup( SettingsBuilder $settingsBuilder ) {
	parent::finalSetup( $settingsBuilder );

	// Must happen after the parent's setup has completed.
	SevenZipStream::register();
}
184 | |
/**
 * Fetches the blob store from the service container.
 *
 * @return BlobStore
 */
private function getBlobStore() {
	$services = $this->getServiceContainer();

	return $services->getBlobStore();
}
191 | |
/**
 * Fetches the revision store from the service container.
 *
 * @return RevisionStore
 */
private function getRevisionStore() {
	$services = $this->getServiceContainer();

	return $services->getRevisionStore();
}
198 | |
/**
 * Script entry point: evaluates the command line options and runs the dump.
 */
public function execute() {
	$this->processOptions();

	// dump() does not read its first argument; it uses $this->history,
	// which processOptions() has just set up.
	$this->dump( true );
}
203 | |
/**
 * Copies the command line options into the corresponding object properties.
 *
 * Options that were not given leave the property defaults untouched.
 */
protected function processOptions() {
	parent::processOptions();

	if ( $this->hasOption( 'buffersize' ) ) {
		// Never go below the documented minimum of 4 KiB.
		$requestedSize = intval( $this->getOption( 'buffersize' ) );
		$this->bufferSize = max( $requestedSize, 4 * 1024 );
	}

	if ( $this->hasOption( 'prefetch' ) ) {
		$prefetchUrl = $this->processFileOpt( $this->getOption( 'prefetch' ) );
		$this->prefetch = new BaseDump( $prefetchUrl );
	}

	if ( $this->hasOption( 'stub' ) ) {
		$this->input = $this->processFileOpt( $this->getOption( 'stub' ) );
	}

	if ( $this->hasOption( 'maxtime' ) ) {
		// The option is given in minutes, the property is kept in seconds.
		$minutes = intval( $this->getOption( 'maxtime' ) );
		$this->maxTimeAllowed = $minutes * 60;
	}

	if ( $this->hasOption( 'checkpointfile' ) ) {
		$this->checkpointFiles = $this->getOption( 'checkpointfile' );
	}

	// --full takes precedence when both --full and --current are given.
	if ( $this->hasOption( 'full' ) ) {
		$this->history = WikiExporter::FULL;
	} elseif ( $this->hasOption( 'current' ) ) {
		$this->history = WikiExporter::CURRENT;
	}

	if ( $this->hasOption( 'spawn' ) ) {
		$this->spawn = true;
		$spawnValue = $this->getOption( 'spawn' );
		// A value of 1 means the flag was given without paths; anything else
		// is split into "php[,mwscript]".
		if ( $spawnValue !== 1 ) {
			$this->php = explode( ',', $spawnValue, 2 );
		}
	}
}
244 | |
/**
 * Initialises progress tracking and starts the checkpoint clock.
 *
 * The $history argument is forwarded to the parent implementation. It used
 * to be dropped silently, so the value dump() passes in (and thereby the
 * --current option, documented as changing the ETA base) had no effect.
 *
 * @param int $history WikiExporter::FULL or WikiExporter::CURRENT
 */
public function initProgress( $history = WikiExporter::FULL ) {
	parent::initProgress( $history );

	// The first checkpoint interval is measured from the start of the run.
	$this->timeOfCheckpoint = $this->startTime;
}
249 | |
/**
 * Reads the stub dump from $this->input and writes the text-completed dump
 * through the egress filter.
 *
 * @param mixed $history Not read by this method; $this->history is used instead.
 * @param int $text Not read by this method.
 * @throws RuntimeException If the stub input cannot be opened, or if the
 *  checkpoint options are inconsistent (see finalOptionCheck()).
 */
public function dump( $history, $text = WikiExporter::TEXT ) {
	// Notice messages will foul up your XML output even if they're
	// relatively harmless.
	if ( ini_get( 'display_errors' ) ) {
		ini_set( 'display_errors', 'stderr' );
	}

	$this->initProgress( $this->history );

	$this->egress = new ExportProgressFilter( $this->sink, $this );

	// it would be nice to do it in the constructor, oh well. need egress set
	$this->finalOptionCheck();

	// we only want this so we know how to close a stream :-P
	$this->xmlwriterobj = new XmlDumpWriter( XmlDumpWriter::WRITE_CONTENT, $this->schemaVersion );

	$input = fopen( $this->input, "rt" );
	if ( $input === false ) {
		// Fail with a clear message instead of handing `false` to fread().
		throw new RuntimeException( "Unable to open stub input '{$this->input}' for reading." );
	}

	try {
		$this->readDump( $input );
	} finally {
		// Do not leak the input handle, whether or not parsing succeeded.
		fclose( $input );
	}

	if ( $this->spawnProc ) {
		$this->closeSpawn();
	}

	$this->report( true );
}
276 | |
/**
 * Translates a "<type>:<file>[;<file>...]" option value into stream URIs.
 *
 * Recognised types are "file", "gzip", "bzip2" and "7zip". "file" and any
 * unrecognised type leave the file names unchanged.
 *
 * @param string $opt Option value as given on the command line
 * @return string The resulting URIs, joined with ';'
 */
protected function processFileOpt( $opt ) {
	$parts = explode( ':', $opt, 2 );
	$type = $parts[0];
	$fileList = $parts[1] ?? '';

	// Stream wrapper prefix per compression type.
	$wrapperPrefixes = [
		'gzip' => 'compress.zlib://',
		'bzip2' => 'compress.bzip2://',
		'7zip' => 'mediawiki.compress.7z://',
	];
	$prefix = $wrapperPrefixes[$type] ?? '';

	$uris = [];
	foreach ( explode( ';', $fileList ) as $file ) {
		$uris[] = $prefix . $file;
	}

	return implode( ';', $uris );
}
309 | |
/**
 * Overridden to include prefetch ratio if enabled.
 *
 * Without a prefetch source the parent's report is used unchanged.
 *
 * Values that cannot be computed are reported as '-'. This includes the ETA
 * when no revision has been processed yet or when the maximum count is zero;
 * those cases used to divide by zero.
 */
public function showReport() {
	if ( !$this->prefetch ) {
		parent::showReport();

		return;
	}

	if ( !$this->reporting ) {
		return;
	}

	$now = wfTimestamp( TS_DB );
	$nowts = microtime( true );
	$deltaAll = $nowts - $this->startTime;
	$deltaPart = $nowts - $this->lastTime;
	$this->pageCountPart = $this->pageCount - $this->pageCountLast;
	$this->revCountPart = $this->revCount - $this->revCountLast;

	// Fetch statistics for the interval since the previous report.
	$fetchCountPart = $this->fetchCount - $this->fetchCountLast;
	$prefetchCountPart = $this->prefetchCount - $this->prefetchCountLast;

	if ( $deltaAll ) {
		if ( $this->revCount && $this->maxCount ) {
			$portion = $this->revCount / $this->maxCount;
			$eta = $this->startTime + $deltaAll / $portion;
			$etats = wfTimestamp( TS_DB, intval( $eta ) );
		} else {
			// Nothing processed yet (or nothing to process): no ETA available.
			$etats = '-';
		}
		if ( $this->fetchCount ) {
			$fetchRate = 100.0 * $this->prefetchCount / $this->fetchCount;
		} else {
			$fetchRate = '-';
		}
		$pageRate = $this->pageCount / $deltaAll;
		$revRate = $this->revCount / $deltaAll;
	} else {
		$pageRate = '-';
		$revRate = '-';
		$etats = '-';
		$fetchRate = '-';
	}

	if ( $deltaPart ) {
		// The "curr" ratio covers the current interval only, in line with
		// the page and revision rates computed below.
		if ( $fetchCountPart ) {
			$fetchRatePart = 100.0 * $prefetchCountPart / $fetchCountPart;
		} else {
			$fetchRatePart = '-';
		}
		$pageRatePart = $this->pageCountPart / $deltaPart;
		$revRatePart = $this->revCountPart / $deltaPart;
	} else {
		$fetchRatePart = '-';
		$pageRatePart = '-';
		$revRatePart = '-';
	}

	$dbDomain = WikiMap::getCurrentWikiDbDomain()->getId();
	$this->progress( sprintf(
		"%s: %s (ID %d) %d pages (%0.1f|%0.1f/sec all|curr), "
			. "%d revs (%0.1f|%0.1f/sec all|curr), %0.1f%%|%0.1f%% "
			. "prefetched (all|curr), ETA %s [max %d]",
		$now, $dbDomain, $this->ID, $this->pageCount, $pageRate,
		$pageRatePart, $this->revCount, $revRate, $revRatePart,
		$fetchRate, $fetchRatePart, $etats, $this->maxCount
	) );

	// Remember the current totals as the baseline for the next interval.
	$this->lastTime = $nowts;
	$this->revCountLast = $this->revCount;
	$this->prefetchCountLast = $this->prefetchCount;
	$this->fetchCountLast = $this->fetchCount;
}
374 | |
/**
 * Flags that the time budget for the current checkpoint file is used up.
 */
private function setTimeExceeded() {
	$this->timeExceeded = true;
}
378 | |
/**
 * Tells whether more than maxTimeAllowed seconds lie between the last
 * checkpoint and the last recorded progress time.
 *
 * @return bool Always false when maxTimeAllowed is 0 (no limit)
 */
private function checkIfTimeExceeded(): bool {
	if ( !$this->maxTimeAllowed ) {
		return false;
	}

	$elapsed = $this->lastTime - $this->timeOfCheckpoint;

	return $elapsed > $this->maxTimeAllowed;
}
388 | |
/**
 * Validates the combination of the maxtime and checkpointfile options.
 *
 * Requires $this->egress to be set, as the number of checkpoint file
 * patterns is compared against the number of output files.
 *
 * @throws RuntimeException If the options are inconsistent
 */
private function finalOptionCheck() {
	$hasCheckpointFiles = (bool)$this->checkpointFiles;
	$hasTimeLimit = (bool)$this->maxTimeAllowed;

	// Either both options are given, or neither.
	if ( $hasCheckpointFiles !== $hasTimeLimit ) {
		throw new RuntimeException( "Options checkpointfile and maxtime must be specified together.\n" );
	}

	foreach ( $this->checkpointFiles as $checkpointFile ) {
		$count = substr_count( $checkpointFile, "%s" );
		if ( $count === 2 ) {
			continue;
		}
		throw new RuntimeException( "Option checkpointfile must contain two '%s' "
			. "for substitution of first and last pageids, count is $count instead, "
			. "file is $checkpointFile.\n" );
	}

	if ( !$hasCheckpointFiles ) {
		return;
	}

	$outputFiles = (array)$this->egress->getFilenames();
	if ( count( $outputFiles ) !== count( $this->checkpointFiles ) ) {
		throw new RuntimeException( "One checkpointfile must be specified "
			. "for each output option, if maxtime is used.\n" );
	}
}
412 | |
/**
 * Parses the stub dump from $input chunk by chunk, dispatching to the
 * startElement / endElement / characterData handlers.
 *
 * When a time limit is configured, the output written after the last
 * checkpoint is closed and renamed according to the checkpoint file patterns
 * once the input is exhausted.
 *
 * @throws MWException Failure to parse XML input
 * @param resource $input
 * @return bool Always true
 */
protected function readDump( $input ) {
	// Reset the parser state kept on the object.
	$this->buffer = "";
	$this->openElement = false;
	$this->atStart = true;
	$this->state = "";
	$this->lastName = "";
	$this->thisPage = "";
	$this->thisRev = "";
	$this->thisRole = null;
	$this->thisRevModel = null;
	$this->thisRevFormat = null;

	$parser = xml_parser_create( "UTF-8" );
	xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, 0 );

	xml_set_element_handler(
		$parser,
		[ $this, 'startElement' ],
		[ $this, 'endElement' ]
	);
	xml_set_character_data_handler( $parser, [ $this, 'characterData' ] );

	$offset = 0; // for context extraction on error reporting
	do {
		if ( $this->checkIfTimeExceeded() ) {
			$this->setTimeExceeded();
		}
		$chunk = fread( $input, $this->bufferSize );
		if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) {
			// Use __METHOD__ so the log entry names the real class.
			wfDebug( __METHOD__ . " encountered XML parsing error" );

			$byte = xml_get_current_byte_index( $parser );
			$msg = wfMessage( 'xml-error-string',
				'XML import parse failure',
				xml_get_current_line_number( $parser ),
				xml_get_current_column_number( $parser ),
				$byte . ( $chunk === false ? '' : ( '; "' . substr( $chunk, $byte - $offset, 16 ) . '"' ) ),
				xml_error_string( xml_get_error_code( $parser ) ) )->escaped();

			xml_parser_free( $parser );

			throw new MWException( $msg );
		}
		$offset += strlen( $chunk );
	} while ( $chunk !== false && !feof( $input ) );

	if ( $this->maxTimeAllowed ) {
		$filenameList = (array)$this->egress->getFilenames();
		// we wrote some stuff after last checkpoint that needs renamed
		if ( file_exists( $filenameList[0] ) ) {
			$newFilenames = [];
			# we might have just written the header and footer and had no
			# pages or revisions written... perhaps they were all deleted
			# there's no pageID 0 so we use that. the caller is responsible
			# for deciding what to do with a file containing only the
			# siteinfo information and the mw tags.
			if ( !$this->firstPageWritten ) {
				$firstPageID = str_pad( '0', 9, "0", STR_PAD_LEFT );
				$lastPageID = str_pad( '0', 9, "0", STR_PAD_LEFT );
			} else {
				$firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT );
				$lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT );
			}

			$filenameCount = count( $filenameList );
			for ( $i = 0; $i < $filenameCount; $i++ ) {
				// The renamed file stays in the directory of the file it replaces.
				$checkpointNameFilledIn = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID );
				$fileinfo = pathinfo( $filenameList[$i] );
				$newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointNameFilledIn;
			}
			$this->egress->closeAndRename( $newFilenames );
		}
	}
	xml_parser_free( $parser );

	return true;
}
494 | |
/**
 * Runs $text through the export transformation of the content handler
 * responsible for $model.
 *
 * If the content model is unknown, a warning is raised, the problem is
 * reported via progress(), and the text is returned untouched.
 *
 * @param string $text
 * @param string $model
 * @param string|null $format
 *
 * @return string
 */
private function exportTransform( $text, $model, $format = null ) {
	$handlerFactory = $this->getServiceContainer()->getContentHandlerFactory();

	try {
		$contentHandler = $handlerFactory->getContentHandler( $model );
	} catch ( MWUnknownContentModelException $ex ) {
		$complaint = "Unable to apply export transformation for content model '$model': " .
			$ex->getMessage();

		wfWarn( $complaint );
		$this->progress( $complaint );

		return $text;
	}

	return $contentHandler->exportTransform( $text, $format );
}
522 | |
/**
 * Tries to load revision text.
 * Export transformations are applied if the content model is given or can be
 * determined from the database.
 *
 * Upon errors, retries (Up to $this->maxFailures tries each call).
 * If still no good revision could be found even after this retrying, "" is returned.
 * If no good revision text could be returned for
 * $this->maxConsecutiveFailedTextRetrievals consecutive calls to getText, MWException
 * is thrown.
 *
 * @param int|string $id Content address, or text row ID.
 * @param string|false|null $model The content model used to determine
 *  applicable export transformations. If $model is null, no transformation is applied.
 * @param string|null $format The content format used when applying export transformations.
 * @param int|null $expSize Expected length of the text, for checks
 *
 * @return string The revision text for $id, or ""
 * @throws MWException
 */
protected function getText( $id, $model = null, $format = null, $expSize = null ) {
	if ( !$this->isValidTextId( $id ) ) {
		$msg = "Skipping bad text id " . $id . " of revision " . $this->thisRev;
		$this->progress( $msg );
		return '';
	}

	$model = $model ?: null;
	// $this->thisRole may be null; passing null to trim() is deprecated as of
	// PHP 8.1, so normalise it once here and reuse the result below.
	$role = trim( $this->thisRole ?? '' );
	$prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch.
	$tryIsPrefetch = false; // Whether the most recent attempt went through prefetch.
	$text = false; // The candidate for a good text. false if no proper value.
	$failures = 0; // The number of times, this invocation of getText already failed.
	$contentAddress = $id; // Where the content should be found

	// The number of times getText failed without yielding a good text in between.
	static $consecutiveFailedTextRetrievals = 0;

	$this->fetchCount++;

	// To allow to simply return on success and do not have to worry about book keeping,
	// we assume, this fetch works (possible after some retries). Nevertheless, we keep
	// the old value, so we can restore it, if problems occur (See after the while loop).
	$oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals;
	$consecutiveFailedTextRetrievals = 0;

	while ( $failures < $this->maxFailures ) {
		// As soon as we found a good text for the $id, we will return immediately.
		// Hence, if we make it past the try catch block, we know that we did not
		// find a good text.

		try {
			// Step 1: Get some text (or reuse from previous iteration if checking
			// for plausibility failed)

			// Trying to get prefetch, if it has not been tried before
			// @phan-suppress-next-line PhanSuspiciousValueComparisonInLoop
			if ( $text === false && $this->prefetch && $prefetchNotTried ) {
				$prefetchNotTried = false;
				$tryIsPrefetch = true;
				$text = $this->prefetch->prefetch(
					(int)$this->thisPage,
					(int)$this->thisRev,
					$role
				) ?? false;

				if ( is_string( $text ) && $model !== null ) {
					// Apply export transformation to text coming from an old dump.
					// The purpose of this transformation is to convert up from legacy
					// formats, which may still be used in the older dump that is used
					// for pre-fetching. Applying the transformation again should not
					// interfere with content that is already in the correct form.
					$text = $this->exportTransform( $text, $model, $format );
				}
			}

			if ( $text === false ) {
				// Fallback to asking the database
				$tryIsPrefetch = false;
				if ( $this->spawn ) {
					$text = $this->getTextSpawned( $contentAddress );
				} else {
					$text = $this->getTextDb( $contentAddress );
				}

				if ( $text !== false && $model !== null ) {
					// Apply export transformation to text coming from the database.
					// Prefetched text should already have transformations applied.
					$text = $this->exportTransform( $text, $model, $format );
				}

				// No more checks for texts from DB for now.
				// If we received something that is not false,
				// We treat it as good text, regardless of whether it actually is or is not
				if ( $text !== false ) {
					return $text;
				}
			}

			if ( $text === false ) {
				throw new RuntimeException( "Generic error while obtaining text for id " . $contentAddress );
			}

			// We received a good candidate for the text of $id via some method

			// Step 2: Checking for plausibility and return the text if it is
			// plausible

			if ( $expSize === null || strlen( $text ) == $expSize ) {
				if ( $tryIsPrefetch ) {
					$this->prefetchCount++;
				}

				return $text;
			}

			$text = false;
			throw new RuntimeException( "Received text is unplausible for id " . $contentAddress );
		} catch ( Exception $e ) {
			$msg = "getting/checking text " . $contentAddress . " failed (" . $e->getMessage()
				. ") for revision " . $this->thisRev;
			if ( $failures + 1 < $this->maxFailures ) {
				$msg .= " (Will retry " . ( $this->maxFailures - $failures - 1 ) . " more times)";
			}
			$this->progress( $msg );
		}

		// Something went wrong; we did not get a text that was plausible :(
		$failures++;

		if ( $contentAddress === $id && $this->thisRev && $role ) {
			try {
				// MediaWiki doesn't guarantee that content addresses are valid
				// for any significant length of time. Try refreshing as the
				// previously retrieved address may no longer be valid.
				$revRecord = $this->getRevisionStore()->getRevisionById( (int)$this->thisRev );
				if ( $revRecord !== null ) {
					$refreshed = $revRecord->getSlot( $role )->getAddress();
					if ( $contentAddress !== $refreshed ) {
						$this->progress(
							"Updated content address for rev {$this->thisRev} from "
							. "{$contentAddress} to {$refreshed}"
						);
						$contentAddress = $refreshed;
						// Skip sleeping if we updated the address
						continue;
					}
				}
			} catch ( Exception $e ) {
				$this->progress(
					"refreshing content address for revision {$this->thisRev} failed ({$e->getMessage()})"
				);
			}
		}

		// A failure in a prefetch hit does not warrant resetting db connection etc.
		if ( !$tryIsPrefetch ) {
			// After backing off for some time, we try to reboot the whole process as
			// much as possible to not carry over failures from one part to the other
			// parts
			sleep( $this->failureTimeout );
			try {
				if ( $this->spawn ) {
					$this->closeSpawn();
					$this->openSpawn();
				}
			} catch ( Exception $e ) {
				$this->progress( "Rebooting getText infrastructure failed (" . $e->getMessage() . ")" .
					" Trying to continue anyways" );
			}
		}
	}

	// Retrieving a good text for $id failed (at least) maxFailures times.
	// We abort for this $id.

	// Restoring the consecutive failures, and maybe aborting, if the dump
	// is too broken.
	$consecutiveFailedTextRetrievals = $oldConsecutiveFailedTextRetrievals + 1;
	if ( $consecutiveFailedTextRetrievals > $this->maxConsecutiveFailedTextRetrievals ) {
		throw new MWException( "Graceful storage failure" );
	}

	return "";
}
707 | |
/**
 * Fetches the serialized content for $id from the blob store, strips
 * carriage returns and normalizes it with the content language.
 *
 * @param int|string $id Content address, or text row ID.
 * @return string|false The normalized text, or false if the blob store
 *  raised a BlobAccessException
 */
private function getTextDb( $id ) {
	$store = $this->getBlobStore();

	// Anything without a ':' is taken to be a bare text row ID and is turned
	// into a blob address first.
	if ( is_int( $id ) || strpos( $id, ':' ) === false ) {
		$address = SqlBlobStore::makeAddressFromTextId( (int)$id );
	} else {
		$address = $id;
	}

	try {
		$raw = $store->getBlob( $address );
		$withoutCr = str_replace( "\r", "", $raw );

		return $this->getServiceContainer()
			->getContentLanguage()
			->normalize( $withoutCr );
	} catch ( BlobAccessException $ex ) {
		// XXX: log a warning?
		return false;
	}
}
733 | |
/**
 * Fetches text through the spawned subprocess, starting it on first use.
 *
 * Warnings are suppressed for the duration of the call and restored
 * afterwards, including when an exception is thrown.
 *
 * @param int|string $address Content address, or text row ID.
 * @return string|false The text, or false if the subprocess could not be
 *  started or did not deliver the text
 */
private function getTextSpawned( $address ) {
	AtEase::suppressWarnings();
	try {
		// First time? Without a running subprocess there are no pipes to
		// talk to, so report failure and let the caller retry.
		if ( !$this->spawnProc && !$this->openSpawn() ) {
			return false;
		}

		return $this->getTextSpawnedOnce( $address );
	} finally {
		AtEase::restoreWarnings();
	}
}
749 | |
/**
 * Starts the fetchText.php subprocess and stores its pipes.
 *
 * If an MWScript.php wrapper exists (either the second entry of $this->php
 * or the default multiversion location), fetchText.php is started through
 * it; otherwise maintenance/fetchText.php is invoked directly.
 *
 * @return bool True if the subprocess was started, false otherwise
 */
protected function openSpawn() {
	global $IP;

	$wiki = WikiMap::getCurrentWikiId();

	$mwscriptpath = count( $this->php ) == 2
		? $this->php[1]
		: "$IP/../multiversion/MWScript.php";

	// Assemble the argument list first, then escape it in one go.
	$arguments = [ $this->php[0] ];
	if ( file_exists( $mwscriptpath ) ) {
		$arguments[] = $mwscriptpath;
		$arguments[] = "fetchText.php";
	} else {
		$arguments[] = "$IP/maintenance/fetchText.php";
	}
	$arguments[] = '--wiki';
	$arguments[] = $wiki;

	$cmd = implode( " ", array_map( [ Shell::class, 'escape' ], $arguments ) );

	$descriptors = [
		0 => [ "pipe", "r" ],
		1 => [ "pipe", "w" ],
		// The subprocess' stderr is discarded.
		2 => [ "file", "/dev/null", "a" ],
	];
	$pipes = [];

	$this->progress( "Spawning database subprocess: $cmd" );
	$this->spawnProc = proc_open( $cmd, $descriptors, $pipes );
	if ( !$this->spawnProc ) {
		$this->progress( "Subprocess spawn failed." );

		return false;
	}

	$this->spawnWrite = $pipes[0]; // -> stdin
	$this->spawnRead = $pipes[1]; // <- stdout

	return true;
}
795 | |
/**
 * Closes the pipes to the subprocess and the subprocess itself, and resets
 * the corresponding properties. Warnings are suppressed while doing so.
 */
private function closeSpawn() {
	AtEase::suppressWarnings();

	// Pipes first: stdout, stdin, stderr.
	if ( $this->spawnRead ) {
		fclose( $this->spawnRead );
	}
	if ( $this->spawnWrite ) {
		fclose( $this->spawnWrite );
	}
	if ( $this->spawnErr ) {
		fclose( $this->spawnErr );
	}

	// Then the process handle.
	if ( $this->spawnProc ) {
		proc_close( $this->spawnProc );
	}

	// Back to the "nothing spawned" state.
	$this->spawnRead = null;
	$this->spawnWrite = null;
	$this->spawnErr = false;
	$this->spawnProc = false;

	AtEase::restoreWarnings();
}
816 | |
/**
 * Requests one text from the subprocess and reads the reply.
 *
 * As implemented here, the address is written as a single line. The reply
 * is read as a line echoing the address, a line with the byte count
 * (negative on error), then the text itself.
 *
 * @param int|string $address Content address, or text row ID.
 * @return string|false The normalized text, or false on any I/O failure,
 *  address mismatch, reported error or short read
 */
private function getTextSpawnedOnce( $address ) {
	if ( is_int( $address ) || intval( $address ) ) {
		$address = SqlBlobStore::makeAddressFromTextId( (int)$address );
	}

	$ok = fwrite( $this->spawnWrite, "$address\n" );
	// $this->progress( ">> $id" );
	if ( !$ok ) {
		return false;
	}

	$ok = fflush( $this->spawnWrite );
	// $this->progress( ">> [flush]" );
	if ( !$ok ) {
		return false;
	}

	// check that the text address they are sending is the one we asked for
	// this avoids out of sync revision text errors we have encountered in the past
	$newAddress = fgets( $this->spawnRead );
	if ( $newAddress === false ) {
		return false;
	}
	$newAddress = trim( $newAddress );
	if ( strpos( $newAddress, ':' ) === false ) {
		$newAddress = SqlBlobStore::makeAddressFromTextId( intval( $newAddress ) );
	}

	if ( $newAddress !== $address ) {
		return false;
	}

	$len = fgets( $this->spawnRead );
	// $this->progress( "<< " . trim( $len ) );
	if ( $len === false ) {
		return false;
	}

	$nbytes = intval( $len );
	// actual error, not zero-length text
	if ( $nbytes < 0 ) {
		return false;
	}

	$text = "";

	// Subprocess may not send everything at once, we have to loop.
	while ( $nbytes > strlen( $text ) ) {
		$buffer = fread( $this->spawnRead, $nbytes - strlen( $text ) );
		if ( $buffer === false ) {
			break;
		}
		// At end of stream fread() returns "" rather than false. Without
		// this check a subprocess that died mid-reply would keep us
		// spinning here forever; the short read is reported below.
		if ( $buffer === '' && feof( $this->spawnRead ) ) {
			break;
		}
		$text .= $buffer;
	}

	$gotbytes = strlen( $text );
	if ( $gotbytes != $nbytes ) {
		$this->progress( "Expected $nbytes bytes from database subprocess, got $gotbytes " );

		return false;
	}

	// Do normalization in the dump thread...
	$stripped = str_replace( "\r", "", $text );
	$normalized = $this->getServiceContainer()->getContentLanguage()->
		normalize( $stripped );

	return $normalized;
}
890 | |
/**
 * Handle an opening tag read from the stub dump.
 *
 * Flushes any element still pending from the previous call, remembers the
 * tag name, and then:
 *  - for 'revision', hands the buffered markup to the sink via writeOpenPage();
 *  - for 'page', emits the stream header once, on the first page seen;
 *  - for 'mediawiki', verifies that a declared schema version matches the
 *    configured output schema version;
 *  - for 'text' carrying an 'id' or 'location' attribute, fetches the text,
 *    strips those attributes and feeds the text through characterData().
 * Every element is stored in $this->openElement so that it is written out
 * lazily by a later clearOpenElement() call.
 *
 * @param mixed $parser Parser handle; only forwarded to characterData().
 * @param string $name Element name.
 * @param array $attribs Attribute name => value map.
 * @throws RuntimeException If the stub declares a schema version different
 *   from $this->schemaVersion.
 */
protected function startElement( $parser, $name, $attribs ) {
	$this->checkpointJustWritten = false;

	$this->clearOpenElement( null );
	$this->lastName = $name;

	if ( $name == 'revision' ) {
		$this->state = $name;
		$this->egress->writeOpenPage( null, $this->buffer );
		$this->buffer = "";
	} elseif ( $name == 'page' ) {
		$this->state = $name;
		if ( $this->atStart ) {
			$this->egress->writeOpenStream( $this->buffer );
			$this->buffer = "";
			$this->atStart = false;
		}
	} elseif ( $name === 'mediawiki' ) {
		if ( isset( $attribs['version'] ) ) {
			if ( $attribs['version'] !== $this->schemaVersion ) {
				throw new RuntimeException( 'Mismatching schema version. '
					. 'Use the --schema-version option to set the output schema version to '
					. 'the version declared by the stub file, namely ' . $attribs['version'] );
			}
		}
	}

	if ( $name == "text" && ( isset( $attribs['id'] ) || isset( $attribs['location'] ) ) ) {
		// 'location' takes precedence over 'id' when both are present.
		$id = $attribs['location'] ?? $attribs['id'];

		// endElement() resets these two properties to null after each revision,
		// and passing null to trim() is deprecated as of PHP 8.1, so coalesce
		// to an empty string first. An empty result still maps to null below.
		$model = trim( $this->thisRevModel ?? '' );
		$format = trim( $this->thisRevFormat ?? '' );

		$model = $model === '' ? null : $model;
		$format = $format === '' ? null : $format;
		// An expected size is only passed on for wikitext with a non-empty 'bytes' attribute.
		$expSize = !empty( $attribs['bytes'] ) && $model === CONTENT_MODEL_WIKITEXT
			? (int)$attribs['bytes'] : null;

		$text = $this->getText( $id, $model, $format, $expSize );

		unset( $attribs['id'] );
		unset( $attribs['location'] );
		if ( $text !== '' ) {
			$attribs['xml:space'] = 'preserve';
		}

		$this->openElement = [ $name, $attribs ];
		if ( $text !== '' ) {
			$this->characterData( $parser, $text );
		}
	} else {
		$this->openElement = [ $name, $attribs ];
	}
}
944 | |
/**
 * Handle a closing tag read from the stub dump.
 *
 * An element that is still pending (nothing was written since its start
 * tag) is flushed through clearOpenElement( "" ); otherwise a literal
 * closing tag is appended to the buffer. The buffer is then handed to the
 * sink at the end of a revision, a page or the whole document. When the
 * time limit was flagged as exceeded, the end of a page also closes the
 * current output file(s), renames them to their checkpoint names and
 * reopens the stream.
 *
 * @param mixed $parser Parser handle; not used here.
 * @param string $name Element name.
 */
protected function endElement( $parser, $name ) {
	$this->checkpointJustWritten = false;

	if ( !$this->openElement ) {
		$this->buffer .= "</$name>";
	} else {
		$this->clearOpenElement( "" );
	}

	switch ( $name ) {
		case 'revision':
			$this->egress->writeRevision( null, $this->buffer );
			$this->buffer = "";
			// Forget everything collected for the revision that just ended.
			$this->thisRev = "";
			$this->thisRole = null;
			$this->thisRevModel = null;
			$this->thisRevFormat = null;
			break;

		case 'page':
			$pageId = trim( $this->thisPage );
			if ( !$this->firstPageWritten ) {
				$this->firstPageWritten = $pageId;
			}
			$this->lastPageWritten = $pageId;

			if ( !$this->timeExceeded ) {
				// Ordinary case: just close the page and carry on.
				$this->egress->writeClosePage( $this->buffer );
				$this->buffer = "";
				$this->thisPage = "";
				break;
			}

			// Checkpoint: finish the current file(s) before starting new ones.
			$this->egress->writeClosePage( $this->buffer );
			// nasty hack, we can't just write the chardata after the
			// page tag, it will include leading blanks from the next line
			$this->egress->sink->write( "\n" );

			$this->buffer = $this->xmlwriterobj->closeStream();
			$this->egress->writeCloseStream( $this->buffer );

			$this->buffer = "";
			$this->thisPage = "";

			// this could be more than one file if we had more than one output arg
			$currentNames = (array)$this->egress->getFilenames();
			$checkpointNames = [];
			$firstPadded = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT );
			$lastPadded = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT );
			for ( $idx = 0, $total = count( $currentNames ); $idx < $total; $idx++ ) {
				// Keep each file's directory, swap in the filled-in checkpoint name.
				$filledIn = sprintf( $this->checkpointFiles[$idx], $firstPadded, $lastPadded );
				$parts = pathinfo( $currentNames[$idx] );
				$checkpointNames[] = $parts['dirname'] . '/' . $filledIn;
			}
			$this->egress->closeRenameAndReopen( $checkpointNames );

			// Start the next file and reset the checkpoint bookkeeping.
			$this->buffer = $this->xmlwriterobj->openStream();
			$this->timeExceeded = false;
			$this->timeOfCheckpoint = $this->lastTime;
			$this->firstPageWritten = false;
			$this->checkpointJustWritten = true;
			break;

		case 'mediawiki':
			$this->egress->writeCloseStream( $this->buffer );
			$this->buffer = "";
			break;
	}
}
1005 | |
/**
 * Handle character data read from the stub dump (or text injected by
 * startElement()).
 *
 * Depending on the name of the most recently opened element, the data is
 * accumulated into the current revision ID, page ID, content model,
 * content format or slot role. The data is then HTML-escaped and appended
 * to the output buffer.
 *
 * @param mixed $parser Parser handle; not used here.
 * @param string $data Character data to process.
 */
protected function characterData( $parser, $data ) {
	$this->clearOpenElement( null );

	switch ( $this->lastName ) {
		case "id":
			// An <id> belongs to whichever of revision/page was opened last.
			if ( $this->state == "revision" ) {
				$this->thisRev .= $data;
				$this->thisRole = SlotRecord::MAIN;
			} elseif ( $this->state == "page" ) {
				$this->thisPage .= $data;
			}
			break;

		case "model":
			$this->thisRevModel .= $data;
			break;

		case "format":
			$this->thisRevFormat .= $data;
			break;

		case "content":
			// Start collecting role, model and format afresh.
			$this->thisRole = "";
			$this->thisRevModel = "";
			$this->thisRevFormat = "";
			break;

		case "role":
			$this->thisRole .= $data;
			break;
	}

	// have to skip the newline left over from closepagetag line of
	// end of checkpoint files. nasty hack!!
	if ( $this->checkpointJustWritten ) {
		$this->checkpointJustWritten = false;
		if ( $data[0] == "\n" ) {
			$data = substr( $data, 1 );
		}
	}

	$escaped = htmlspecialchars( $data, ENT_COMPAT );
	$this->buffer .= $escaped;
}
1037 | |
/**
 * Write out the element remembered in $this->openElement, if there is one.
 *
 * The stored name and attributes are rendered with Xml::element(), the
 * result is appended to the buffer, and the pending element is cleared.
 *
 * @param string|null $style Passed through as the third argument of
 *   Xml::element(); callers in this class pass null or "".
 */
protected function clearOpenElement( $style ) {
	if ( !$this->openElement ) {
		return;
	}

	[ $tagName, $tagAttribs ] = $this->openElement;
	$this->buffer .= Xml::element( $tagName, $tagAttribs, $style );
	$this->openElement = false;
}
1044 | |
/**
 * Decide whether a text identifier can be used for a lookup.
 *
 * An ID containing a colon is accepted unless it is exactly 'tt:0'.
 * Any other ID must match the digits-only pattern and have an integer
 * value greater than zero.
 *
 * @param string $id
 * @return bool
 */
private function isValidTextId( string $id ): bool {
	if ( preg_match( '/:/', $id ) ) {
		return $id !== 'tt:0';
	}

	return preg_match( '/^\d+$/', $id ) && intval( $id ) > 0;
}
1054 | |
1055 | } |
1056 | |
1057 | /** @deprecated since 1.43. Registers the bare name 'TextPassDumper' as an alias of this class; presumably kept for callers that have not moved to the class's current name. */ |
1058 | class_alias( TextPassDumper::class, 'TextPassDumper' ); |