Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 515 |
|
0.00% |
0 / 26 |
CRAP | |
0.00% |
0 / 1 |
TextPassDumper | |
0.00% |
0 / 513 |
|
0.00% |
0 / 26 |
25122 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 26 |
|
0.00% |
0 / 1 |
6 | |||
finalSetup | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
getBlobStore | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
processOptions | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
110 | |||
rotateDb | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
56 | |||
initProgress | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
dump | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
20 | |||
processFileOpt | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
72 | |||
showReport | |
0.00% |
0 / 45 |
|
0.00% |
0 / 1 |
56 | |||
setTimeExceeded | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
checkIfTimeExceeded | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
finalOptionCheck | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
90 | |||
readDump | |
0.00% |
0 / 52 |
|
0.00% |
0 / 1 |
90 | |||
exportTransform | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
6 | |||
getText | |
0.00% |
0 / 62 |
|
0.00% |
0 / 1 |
650 | |||
getTextDb | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 | |||
getTextSpawned | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
openSpawn | |
0.00% |
0 / 34 |
|
0.00% |
0 / 1 |
20 | |||
closeSpawn | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
30 | |||
getTextSpawnedOnce | |
0.00% |
0 / 36 |
|
0.00% |
0 / 1 |
182 | |||
startElement | |
0.00% |
0 / 36 |
|
0.00% |
0 / 1 |
272 | |||
endElement | |
0.00% |
0 / 43 |
|
0.00% |
0 / 1 |
72 | |||
characterData | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
110 | |||
clearOpenElement | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
isValidTextId | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | /** |
3 | * BackupDumper that postprocesses XML dumps from dumpBackup.php to add page text |
4 | * |
5 | * Copyright (C) 2005 Brooke Vibber <bvibber@wikimedia.org> |
6 | * https://www.mediawiki.org/ |
7 | * |
8 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or |
11 | * (at your option) any later version. |
12 | * |
13 | * This program is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | * GNU General Public License for more details. |
17 | * |
18 | * You should have received a copy of the GNU General Public License along |
19 | * with this program; if not, write to the Free Software Foundation, Inc., |
20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
21 | * http://www.gnu.org/copyleft/gpl.html |
22 | * |
23 | * @file |
24 | * @ingroup Dump |
25 | * @ingroup Maintenance |
26 | */ |
27 | |
28 | require_once __DIR__ . '/BackupDumper.php'; |
29 | require_once __DIR__ . '/../../includes/export/WikiExporter.php'; |
30 | |
31 | use MediaWiki\Revision\SlotRecord; |
32 | use MediaWiki\Settings\SettingsBuilder; |
33 | use MediaWiki\Shell\Shell; |
34 | use MediaWiki\Storage\BlobAccessException; |
35 | use MediaWiki\Storage\BlobStore; |
36 | use MediaWiki\Storage\SqlBlobStore; |
37 | use MediaWiki\WikiMap\WikiMap; |
38 | use Wikimedia\AtEase\AtEase; |
39 | use Wikimedia\Rdbms\IMaintainableDatabase; |
40 | |
41 | /** |
42 | * @ingroup Maintenance |
43 | */ |
44 | class TextPassDumper extends BackupDumper { |
45 | /** @var BaseDump|null */ |
46 | public $prefetch = null; |
47 | /** @var string */ |
48 | private $thisPage; |
49 | /** @var string */ |
50 | private $thisRev; |
51 | /** @var string|null */ |
52 | private $thisRole = null; |
53 | |
54 | /** |
55 | * @var int when we spend more than maxTimeAllowed seconds on this run, we continue |
56 | * processing until we write out the next complete page, then save output file(s), |
57 | * rename it/them and open new one(s); 0 = no limit |
58 | */ |
59 | public $maxTimeAllowed = 0; |
60 | |
61 | /** @var string */ |
62 | protected $input = "php://stdin"; |
63 | /** @var int */ |
64 | protected $history = WikiExporter::FULL; |
65 | /** @var int */ |
66 | protected $fetchCount = 0; |
67 | /** @var int */ |
68 | protected $prefetchCount = 0; |
69 | /** @var int */ |
70 | protected $prefetchCountLast = 0; |
71 | /** @var int */ |
72 | protected $fetchCountLast = 0; |
73 | |
74 | /** @var int */ |
75 | protected $maxFailures = 5; |
76 | /** @var int */ |
77 | protected $maxConsecutiveFailedTextRetrievals = 200; |
78 | /** @var int Seconds to sleep after db failure */ |
79 | protected $failureTimeout = 5; |
80 | |
81 | /** @var int In bytes. Maximum size to read from the stub in one go. */ |
82 | protected $bufferSize = 524_288; |
83 | |
84 | /** @var array */ |
85 | protected $php = [ PHP_BINARY ]; |
86 | /** @var bool */ |
87 | protected $spawn = false; |
88 | |
89 | /** |
90 | * @var resource|false |
91 | */ |
92 | protected $spawnProc = false; |
93 | |
94 | /** |
95 | * @var resource|null |
96 | */ |
97 | protected $spawnWrite; |
98 | |
99 | /** |
100 | * @var resource|null |
101 | */ |
102 | protected $spawnRead; |
103 | |
104 | /** |
105 | * @var resource|false |
106 | */ |
107 | protected $spawnErr = false; |
108 | |
109 | /** |
110 | * @var XmlDumpWriter|false |
111 | */ |
112 | protected $xmlwriterobj = false; |
113 | |
114 | /** @var bool */ |
115 | protected $timeExceeded = false; |
116 | /** @var string|false */ |
117 | protected $firstPageWritten = false; |
118 | /** @var string|false */ |
119 | protected $lastPageWritten = false; |
120 | /** @var bool */ |
121 | protected $checkpointJustWritten = false; |
122 | /** @var string[] */ |
123 | protected $checkpointFiles = []; |
124 | |
125 | /** |
126 | * @var IMaintainableDatabase |
127 | */ |
128 | protected $db; |
129 | |
/**
 * Registers the script description and all command line options.
 *
 * Also opens php://stderr for writing and stores the handle in $this->stderr.
 *
 * @param array|null $args For backward compatibility: when non-empty, the
 *  arguments are loaded with loadWithArgv() and processOptions() runs immediately.
 */
public function __construct( $args = null ) {
	parent::__construct();

	$this->addDescription( <<<TEXT
This script postprocesses XML dumps from dumpBackup.php to add
page text which was stubbed out (using --stub).

XML input is accepted on stdin.
XML output is sent to stdout; progress reports are sent to stderr.
TEXT
	);
	$this->stderr = fopen( "php://stderr", "wt" );

	// Every concatenated help fragment ends with a space so that words do not
	// run together in the generated --help output.
	$this->addOption( 'stub', 'To load a compressed stub dump instead of stdin. ' .
		'Specify as --stub=<type>:<file>.', false, true );
	$this->addOption( 'prefetch', 'Use a prior dump file as a text source, to save pressure on the ' .
		'database. (Requires the XMLReader extension). Specify as --prefetch=<type>:<file>',
		false, true );
	$this->addOption( 'maxtime', 'Write out checkpoint file after this many minutes (writing ' .
		'out complete page, closing xml file properly, and opening new one ' .
		'with header). This option requires the checkpointfile option.', false, true );
	$this->addOption( 'checkpointfile', 'Use this string for checkpoint filenames, substituting ' .
		'first pageid written for the first %s (required) and the last pageid written for the ' .
		'second %s if it exists.', false, true, false, true ); // This can be specified multiple times
	$this->addOption( 'quiet', 'Don\'t dump status reports to stderr.' );
	$this->addOption( 'full', 'Dump all revisions of every page' );
	$this->addOption( 'current', 'Base ETA on number of pages in database instead of all revisions' );
	$this->addOption( 'spawn', 'Spawn a subprocess for loading text records, optionally specify ' .
		'php[,mwscript] paths' );
	$this->addOption( 'buffersize', 'Buffer size in bytes to use for reading the stub. ' .
		'(Default: 512 KiB, Minimum: 4 KiB)', false, true );

	if ( $args ) {
		$this->loadWithArgv( $args );
		$this->processOptions();
	}
}
170 | |
/**
 * Runs the parent's final setup and then calls SevenZipStream::register().
 *
 * NOTE(review): the registration presumably provides the
 * "mediawiki.compress.7z://" scheme produced by processFileOpt() - confirm.
 *
 * @param SettingsBuilder $settingsBuilder Handed through to the parent implementation.
 */
public function finalSetup( SettingsBuilder $settingsBuilder ) {
	parent::finalSetup( $settingsBuilder );
	SevenZipStream::register();
}
176 | |
/**
 * Fetches the blob store from the service container.
 *
 * @return BlobStore
 */
private function getBlobStore() {
	$services = $this->getServiceContainer();

	return $services->getBlobStore();
}
183 | |
/**
 * Script entry point: evaluates the command line options, then runs the dump.
 */
public function execute() {
	$this->processOptions();

	// dump() does not read its first argument; the history mode it uses
	// comes from $this->history, which processOptions() has just set.
	$this->dump( true );
}
188 | |
/**
 * Copies the command line options into the corresponding properties,
 * after the parent has processed its own options.
 */
protected function processOptions() {
	parent::processOptions();

	if ( $this->hasOption( 'buffersize' ) ) {
		// Never go below the documented 4 KiB minimum.
		$requested = intval( $this->getOption( 'buffersize' ) );
		$this->bufferSize = max( $requested, 4 * 1024 );
	}

	if ( $this->hasOption( 'prefetch' ) ) {
		$prefetchUrl = $this->processFileOpt( $this->getOption( 'prefetch' ) );
		$this->prefetch = new BaseDump( $prefetchUrl );
	}

	if ( $this->hasOption( 'stub' ) ) {
		$this->input = $this->processFileOpt( $this->getOption( 'stub' ) );
	}

	if ( $this->hasOption( 'maxtime' ) ) {
		// The option is given in minutes, the property is kept in seconds.
		$minutes = intval( $this->getOption( 'maxtime' ) );
		$this->maxTimeAllowed = $minutes * 60;
	}

	if ( $this->hasOption( 'checkpointfile' ) ) {
		$this->checkpointFiles = $this->getOption( 'checkpointfile' );
	}

	// --full takes precedence over --current when both are given.
	if ( $this->hasOption( 'full' ) ) {
		$this->history = WikiExporter::FULL;
	} elseif ( $this->hasOption( 'current' ) ) {
		$this->history = WikiExporter::CURRENT;
	}

	if ( $this->hasOption( 'spawn' ) ) {
		$this->spawn = true;
		$spawnArg = $this->getOption( 'spawn' );
		// Anything other than the integer 1 is read as "php[,mwscript]" paths.
		// NOTE(review): 1 is presumably what a bare --spawn yields - confirm.
		if ( $spawnArg !== 1 ) {
			$this->php = explode( ',', $spawnArg, 2 );
		}
	}
}
229 | |
/**
 * Drop the database connection $this->db and try to get a new one.
 *
 * This function tries to get a /different/ connection if this is
 * possible. Hence, (if this is possible) it switches to a different
 * failover upon each call.
 *
 * This function resets $this->lb and closes all connections on it.
 * When $this->forcedDb is set, that connection is used and no new
 * load balancer is created.
 *
 * @throws RuntimeException If the old connection is still open, or if no new
 *  load balancer or connection could be obtained. The underlying exception is
 *  attached as the previous exception.
 * @suppress PhanTypeObjectUnsetDeclaredProperty
 */
protected function rotateDb() {
	// Cleaning up old connections
	if ( isset( $this->lb ) ) {
		$this->lb->closeAll( __METHOD__ );
		unset( $this->lb );
	}

	if ( $this->forcedDb !== null ) {
		$this->db = $this->forcedDb;

		return;
	}

	if ( isset( $this->db ) && $this->db->isOpen() ) {
		throw new RuntimeException( 'DB is set and has not been closed by the Load Balancer' );
	}

	unset( $this->db );

	// Trying to set up new connection.
	// We do /not/ retry upon failure, but delegate to encapsulating logic, to avoid
	// individually retrying at different layers of code.

	try {
		$lbFactory = $this->getServiceContainer()->getDBLoadBalancerFactory();
		$this->lb = $lbFactory->newMainLB();
	} catch ( Exception $e ) {
		// Chain the original exception so its stack trace is not lost.
		throw new RuntimeException(
			__METHOD__ . " rotating DB failed to obtain new load balancer (" . $e->getMessage() . ")",
			0,
			$e
		);
	}

	try {
		$this->db = $this->lb->getMaintenanceConnectionRef( DB_REPLICA, 'dump' );
	} catch ( Exception $e ) {
		throw new RuntimeException(
			__METHOD__ . " rotating DB failed to obtain new database (" . $e->getMessage() . ")",
			0,
			$e
		);
	}
}
279 | |
/**
 * Initialises progress reporting and starts the checkpoint timer.
 *
 * @param int $history History mode (WikiExporter::FULL or WikiExporter::CURRENT).
 *  It is forwarded to the parent so that the value dump() passes in (set by
 *  --current / --full) is no longer silently dropped.
 *  NOTE(review): assumes the parent accepts the same parameter - confirm
 *  against BackupDumper::initProgress().
 */
public function initProgress( $history = WikiExporter::FULL ) {
	parent::initProgress( $history );
	// The first checkpoint interval is measured from the start of the run.
	$this->timeOfCheckpoint = $this->startTime;
}
284 | |
/**
 * Reads the stub dump from $this->input and writes it to the output sink
 * with the page text filled in.
 *
 * @param mixed $history Not read by this implementation; $this->history is used instead.
 * @param int $text Not read by this implementation.
 * @throws RuntimeException If the options are inconsistent (see finalOptionCheck())
 *  or the stub input cannot be opened.
 */
public function dump( $history, $text = WikiExporter::TEXT ) {
	// Notice messages will foul up your XML output even if they're
	// relatively harmless.
	if ( ini_get( 'display_errors' ) ) {
		ini_set( 'display_errors', 'stderr' );
	}

	$this->initProgress( $this->history );

	// We are trying to get an initial database connection to avoid that the
	// first try of this request's first call to getText fails. However, if
	// obtaining a good DB connection fails it's not a serious issue, as
	// getText does retry upon failure and can start without having a working
	// DB connection.
	try {
		$this->rotateDb();
	} catch ( Exception $e ) {
		// We do not even count this as failure. Just let eventual
		// watchdogs know.
		$this->progress( "Getting initial DB connection failed (" .
			$e->getMessage() . ")" );
	}

	$this->egress = new ExportProgressFilter( $this->sink, $this );

	// it would be nice to do it in the constructor, oh well. need egress set
	$this->finalOptionCheck();

	// we only want this so we know how to close a stream :-P
	$this->xmlwriterobj = new XmlDumpWriter( XmlDumpWriter::WRITE_CONTENT, $this->schemaVersion );

	$input = fopen( $this->input, "rt" );
	if ( $input === false ) {
		// Fail with a clear message instead of handing `false` on to fread().
		throw new RuntimeException( "Unable to open stub input '{$this->input}' for reading." );
	}
	try {
		$this->readDump( $input );
	} finally {
		// Release the input handle even when parsing aborts with an exception.
		fclose( $input );
	}

	if ( $this->spawnProc ) {
		$this->closeSpawn();
	}

	$this->report( true );
}
325 | |
/**
 * Translates a "<type>:<file>[;<file>...]" option value into stream URIs.
 *
 * The types "gzip", "bzip2" and "7zip" put the matching stream wrapper prefix
 * in front of every file; "file" and any unknown type leave the names as they
 * are. A value without ":" yields an empty file part.
 *
 * @param string $opt Option value as given on the command line
 * @return string The resulting URIs, joined with ";"
 */
protected function processFileOpt( $opt ) {
	$wrapperPrefixes = [
		'gzip' => 'compress.zlib://',
		'bzip2' => 'compress.bzip2://',
		'7zip' => 'mediawiki.compress.7z://',
	];

	$parts = explode( ':', $opt, 2 );
	$type = $parts[0];
	$fileList = $parts[1] ?? '';

	// "file" and unknown types have no entry in the table, hence no prefix.
	$prefix = $wrapperPrefixes[$type] ?? '';

	$uris = array_map(
		static fn ( $file ) => $prefix . $file,
		explode( ';', $fileList )
	);

	return implode( ';', $uris );
}
358 | |
/**
 * Overridden to include prefetch ratio if enabled.
 *
 * Without a prefetch source the parent's report is used unchanged. Otherwise,
 * if reporting is enabled, one progress line is emitted with overall ("all")
 * and since-last-report ("curr") rates, and the counters used for the "curr"
 * figures are advanced.
 *
 * NOTE(review): $this->pageCountLast is not advanced here, so the "curr" page
 * rate is only meaningful if it is updated elsewhere - confirm.
 */
public function showReport() {
	if ( !$this->prefetch ) {
		parent::showReport();

		return;
	}

	if ( !$this->reporting ) {
		return;
	}

	$now = wfTimestamp( TS_DB );
	$nowts = microtime( true );
	$deltaAll = $nowts - $this->startTime;
	$deltaPart = $nowts - $this->lastTime;
	$this->pageCountPart = $this->pageCount - $this->pageCountLast;
	$this->revCountPart = $this->revCount - $this->revCountLast;

	if ( $deltaAll ) {
		// Guard both divisions: maxCount and revCount may be 0, and dividing
		// by zero raises DivisionByZeroError on PHP 8.
		$portion = $this->maxCount ? $this->revCount / $this->maxCount : 0;
		if ( $portion ) {
			$eta = $this->startTime + $deltaAll / $portion;
			$etats = wfTimestamp( TS_DB, intval( $eta ) );
		} else {
			$etats = '-';
		}
		if ( $this->fetchCount ) {
			$fetchRate = 100.0 * $this->prefetchCount / $this->fetchCount;
		} else {
			$fetchRate = '-';
		}
		$pageRate = $this->pageCount / $deltaAll;
		$revRate = $this->revCount / $deltaAll;
	} else {
		$pageRate = '-';
		$revRate = '-';
		$etats = '-';
		$fetchRate = '-';
	}

	if ( $deltaPart ) {
		// The "curr" prefetch rate covers only the fetches made since the
		// previous report, consistent with the page and revision "curr" rates.
		$fetchedPart = $this->fetchCount - $this->fetchCountLast;
		$prefetchedPart = $this->prefetchCount - $this->prefetchCountLast;
		if ( $fetchedPart ) {
			$fetchRatePart = 100.0 * $prefetchedPart / $fetchedPart;
		} else {
			$fetchRatePart = '-';
		}
		$pageRatePart = $this->pageCountPart / $deltaPart;
		$revRatePart = $this->revCountPart / $deltaPart;
	} else {
		$fetchRatePart = '-';
		$pageRatePart = '-';
		$revRatePart = '-';
	}

	$dbDomain = WikiMap::getCurrentWikiDbDomain()->getId();
	$this->progress( sprintf(
		"%s: %s (ID %d) %d pages (%0.1f|%0.1f/sec all|curr), "
			. "%d revs (%0.1f|%0.1f/sec all|curr), %0.1f%%|%0.1f%% "
			. "prefetched (all|curr), ETA %s [max %d]",
		$now, $dbDomain, $this->ID, $this->pageCount, $pageRate,
		$pageRatePart, $this->revCount, $revRate, $revRatePart,
		$fetchRate, $fetchRatePart, $etats, $this->maxCount
	) );
	$this->lastTime = $nowts;
	$this->revCountLast = $this->revCount;
	$this->prefetchCountLast = $this->prefetchCount;
	$this->fetchCountLast = $this->fetchCount;
}
423 | |
/**
 * Flags that the time budget for the current checkpoint file is used up.
 */
private function setTimeExceeded() {
	$this->timeExceeded = true;
}
427 | |
/**
 * Tells whether more than $this->maxTimeAllowed seconds lie between the last
 * checkpoint and the last recorded report time.
 *
 * @return bool Always false when no time limit is configured (maxTimeAllowed is 0)
 */
private function checkIfTimeExceeded() {
	if ( !$this->maxTimeAllowed ) {
		return false;
	}

	$sinceCheckpoint = $this->lastTime - $this->timeOfCheckpoint;

	return $sinceCheckpoint > $this->maxTimeAllowed;
}
437 | |
/**
 * Validates the checkpoint related options once $this->egress is available.
 *
 * @throws RuntimeException If only one of checkpointfile / maxtime was given,
 *  if a checkpoint file name does not contain exactly two "%s" placeholders,
 *  or if the number of checkpoint files differs from the number of output files.
 */
private function finalOptionCheck() {
	$hasCheckpointFiles = (bool)$this->checkpointFiles;
	$hasTimeLimit = (bool)$this->maxTimeAllowed;

	// Either both options are given or neither.
	if ( $hasCheckpointFiles !== $hasTimeLimit ) {
		throw new RuntimeException( "Options checkpointfile and maxtime must be specified together.\n" );
	}

	foreach ( $this->checkpointFiles as $checkpointFile ) {
		$count = substr_count( $checkpointFile, "%s" );
		if ( $count === 2 ) {
			continue;
		}
		throw new RuntimeException( "Option checkpointfile must contain two '%s' "
			. "for substitution of first and last pageids, count is $count instead, "
			. "file is $checkpointFile.\n" );
	}

	if ( !$hasCheckpointFiles ) {
		return;
	}

	// Every output file needs its own checkpoint file name pattern.
	$outputFiles = (array)$this->egress->getFilenames();
	if ( count( $outputFiles ) !== count( $this->checkpointFiles ) ) {
		throw new RuntimeException( "One checkpointfile must be specified "
			. "for each output option, if maxtime is used.\n" );
	}
}
461 | |
/**
 * Parses the stub dump from $input chunk by chunk, feeding it to the XML
 * handlers startElement(), endElement() and characterData().
 *
 * When a time limit is configured, the output written since the last
 * checkpoint is closed and renamed to the filled-in checkpoint file name(s)
 * at the end.
 *
 * @throws MWException Failure to parse XML input
 * @param resource $input
 * @return bool Always true
 */
protected function readDump( $input ) {
	// Reset the parser state kept on the object.
	$this->buffer = "";
	$this->openElement = false;
	$this->atStart = true;
	$this->state = "";
	$this->lastName = "";
	$this->thisPage = "";
	$this->thisRev = "";
	$this->thisRole = null;
	$this->thisRevModel = null;
	$this->thisRevFormat = null;

	$parser = xml_parser_create( "UTF-8" );
	xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, 0 );

	xml_set_element_handler(
		$parser,
		[ $this, 'startElement' ],
		[ $this, 'endElement' ]
	);
	xml_set_character_data_handler( $parser, [ $this, 'characterData' ] );

	$offset = 0; // for context extraction on error reporting
	do {
		if ( $this->checkIfTimeExceeded() ) {
			$this->setTimeExceeded();
		}
		$chunk = fread( $input, $this->bufferSize );
		if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) {
			// __METHOD__ gives the real class name; the former hard-coded
			// "TextDumpPass" prefix did not match any class.
			wfDebug( __METHOD__ . " encountered XML parsing error" );

			$byte = xml_get_current_byte_index( $parser );
			$msg = wfMessage( 'xml-error-string',
				'XML import parse failure',
				xml_get_current_line_number( $parser ),
				xml_get_current_column_number( $parser ),
				// $byte is counted from the start of the input, $offset is where
				// the current chunk begins.
				$byte . ( $chunk === false ? '' : ( '; "' . substr( $chunk, $byte - $offset, 16 ) . '"' ) ),
				xml_error_string( xml_get_error_code( $parser ) ) )->escaped();

			xml_parser_free( $parser );

			throw new MWException( $msg );
		}
		$offset += strlen( $chunk );
	} while ( $chunk !== false && !feof( $input ) );

	if ( $this->maxTimeAllowed ) {
		$filenameList = (array)$this->egress->getFilenames();
		// we wrote some stuff after last checkpoint that needs renamed
		if ( file_exists( $filenameList[0] ) ) {
			$newFilenames = [];
			# we might have just written the header and footer and had no
			# pages or revisions written... perhaps they were all deleted
			# there's no pageID 0 so we use that. the caller is responsible
			# for deciding what to do with a file containing only the
			# siteinfo information and the mw tags.
			if ( !$this->firstPageWritten ) {
				$firstPageID = str_pad( '0', 9, "0", STR_PAD_LEFT );
				$lastPageID = str_pad( '0', 9, "0", STR_PAD_LEFT );
			} else {
				$firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT );
				$lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT );
			}

			$filenameCount = count( $filenameList );
			for ( $i = 0; $i < $filenameCount; $i++ ) {
				$checkpointNameFilledIn = sprintf( $this->checkpointFiles[$i], $firstPageID, $lastPageID );
				// The checkpoint file goes into the directory of the output file.
				$fileinfo = pathinfo( $filenameList[$i] );
				$newFilenames[] = $fileinfo['dirname'] . '/' . $checkpointNameFilledIn;
			}
			$this->egress->closeAndRename( $newFilenames );
		}
	}
	xml_parser_free( $parser );

	return true;
}
543 | |
/**
 * Runs $text through the export transformation of the content handler
 * registered for $model.
 *
 * If the content model is unknown, a warning is logged and reported via
 * progress(), and the text is handed back untouched.
 *
 * @param string $text
 * @param string $model
 * @param string|null $format
 *
 * @return string
 */
private function exportTransform( $text, $model, $format = null ) {
	try {
		$handlerFactory = $this->getServiceContainer()->getContentHandlerFactory();
		$contentHandler = $handlerFactory->getContentHandler( $model );
	} catch ( MWUnknownContentModelException $ex ) {
		$warning = "Unable to apply export transformation for content model '$model': " .
			$ex->getMessage();

		wfWarn( $warning );
		$this->progress( $warning );

		return $text;
	}

	return $contentHandler->exportTransform( $text, $format );
}
571 | |
/**
 * Tries to load revision text.
 * Export transformations are applied if the content model is given or can be
 * determined from the database.
 *
 * Upon errors, retries (Up to $this->maxFailures tries each call).
 * If still no good revision could be found even after this retrying, "" is returned.
 * If no good revision text could be returned for
 * $this->maxConsecutiveFailedTextRetrievals consecutive calls to getText, MWException
 * is thrown.
 *
 * @param int|string $id Content address, or text row ID.
 * @param string|false|null $model The content model used to determine
 *  applicable export transformations. If $model is null, no transformation is applied.
 * @param string|null $format The content format used when applying export transformations.
 * @param int|null $expSize Expected length of the text, for checks
 *
 * @return string The revision text for $id, or ""
 * @throws MWException
 */
protected function getText( $id, $model = null, $format = null, $expSize = null ) {
	if ( !$this->isValidTextId( $id ) ) {
		$msg = "Skipping bad text id " . $id . " of revision " . $this->thisRev;
		$this->progress( $msg );
		return '';
	}

	$model = $model ?: null;
	$prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch.
	$text = false; // The candidate for a good text. false if no proper value.
	$failures = 0; // The number of times, this invocation of getText already failed.
	// Whether the current attempt used the prefetch source. It is assigned on
	// every path before it is read; the explicit default only makes that
	// visible to readers and static analysis.
	$tryIsPrefetch = false;

	// The number of times getText failed without yielding a good text in between.
	static $consecutiveFailedTextRetrievals = 0;

	$this->fetchCount++;

	// To allow to simply return on success and do not have to worry about book keeping,
	// we assume, this fetch works (possible after some retries). Nevertheless, we keep
	// the old value, so we can restore it, if problems occur (See after the while loop).
	$oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals;
	$consecutiveFailedTextRetrievals = 0;

	while ( $failures < $this->maxFailures ) {
		// As soon as we found a good text for the $id, we will return immediately.
		// Hence, if we make it past the try catch block, we know that we did not
		// find a good text.

		try {
			// Step 1: Get some text (or reuse from previous iteration if checking
			// for plausibility failed)

			// Trying to get prefetch, if it has not been tried before
			// @phan-suppress-next-line PhanSuspiciousValueComparisonInLoop
			if ( $text === false && isset( $this->prefetch ) && $prefetchNotTried ) {
				$prefetchNotTried = false;
				$tryIsPrefetch = true;
				$text = $this->prefetch->prefetch(
					(int)$this->thisPage,
					(int)$this->thisRev,
					// thisRole may be null; passing null to trim() is
					// deprecated since PHP 8.1, so fall back to ''.
					trim( $this->thisRole ?? '' )
				);

				if ( $text === null ) {
					$text = false;
				}

				if ( is_string( $text ) && $model !== null ) {
					// Apply export transformation to text coming from an old dump.
					// The purpose of this transformation is to convert up from legacy
					// formats, which may still be used in the older dump that is used
					// for pre-fetching. Applying the transformation again should not
					// interfere with content that is already in the correct form.
					$text = $this->exportTransform( $text, $model, $format );
				}
			}

			if ( $text === false ) {
				// Fallback to asking the database
				$tryIsPrefetch = false;
				if ( $this->spawn ) {
					$text = $this->getTextSpawned( $id );
				} else {
					$text = $this->getTextDb( $id );
				}

				if ( $text !== false && $model !== null ) {
					// Apply export transformation to text coming from the database.
					// Prefetched text should already have transformations applied.
					$text = $this->exportTransform( $text, $model, $format );
				}

				// No more checks for texts from DB for now.
				// If we received something that is not false,
				// We treat it as good text, regardless of whether it actually is or is not
				if ( $text !== false ) {
					return $text;
				}
			}

			if ( $text === false ) {
				throw new RuntimeException( "Generic error while obtaining text for id " . $id );
			}

			// We received a good candidate for the text of $id via some method

			// Step 2: Checking for plausibility and return the text if it is
			// plausible

			if ( $expSize === null || strlen( $text ) == $expSize ) {
				if ( $tryIsPrefetch ) {
					$this->prefetchCount++;
				}

				return $text;
			}

			$text = false;
			throw new RuntimeException( "Received text is unplausible for id " . $id );
		} catch ( Exception $e ) {
			$msg = "getting/checking text " . $id . " failed (" . $e->getMessage()
				. ") for revision " . $this->thisRev;
			if ( $failures + 1 < $this->maxFailures ) {
				$msg .= " (Will retry " . ( $this->maxFailures - $failures - 1 ) . " more times)";
			}
			$this->progress( $msg );
		}

		// Something went wrong; we did not get a text that was plausible :(
		$failures++;

		// A failure in a prefetch hit does not warrant resetting db connection etc.
		if ( !$tryIsPrefetch ) {
			// After backing off for some time, we try to reboot the whole process as
			// much as possible to not carry over failures from one part to the other
			// parts
			sleep( $this->failureTimeout );
			try {
				$this->rotateDb();
				if ( $this->spawn ) {
					$this->closeSpawn();
					$this->openSpawn();
				}
			} catch ( Exception $e ) {
				$this->progress( "Rebooting getText infrastructure failed (" . $e->getMessage() . ")" .
					" Trying to continue anyways" );
			}
		}
	}

	// Retrieving a good text for $id failed (at least) maxFailures times.
	// We abort for this $id.

	// Restoring the consecutive failures, and maybe aborting, if the dump
	// is too broken.
	$consecutiveFailedTextRetrievals = $oldConsecutiveFailedTextRetrievals + 1;
	if ( $consecutiveFailedTextRetrievals > $this->maxConsecutiveFailedTextRetrievals ) {
		throw new MWException( "Graceful storage failure" );
	}

	return "";
}
736 | |
/**
 * Reads the serialized content for $id from the blob store, strips carriage
 * returns and normalizes the result with the content language.
 *
 * @param int|string $id Content address, or text row ID. Integers and strings
 *  without ":" are treated as text row IDs and converted to an address.
 * @return string|false The normalized text, or false if the blob store
 *  reported a BlobAccessException
 */
private function getTextDb( $id ) {
	$store = $this->getBlobStore();

	if ( is_int( $id ) || strpos( $id, ':' ) === false ) {
		$address = SqlBlobStore::makeAddressFromTextId( (int)$id );
	} else {
		$address = $id;
	}

	try {
		$withoutCr = str_replace( "\r", "", $store->getBlob( $address ) );
		$contentLanguage = $this->getServiceContainer()->getContentLanguage();

		return $contentLanguage->normalize( $withoutCr );
	} catch ( BlobAccessException $ex ) {
		// XXX: log a warning?
		return false;
	}
}
762 | |
/**
 * Fetches text through the helper subprocess, starting it on first use.
 *
 * Warnings are suppressed for the duration of the call and restored
 * afterwards, also when the fetch throws.
 *
 * @param int|string $address Content address, or text row ID.
 * @return string|false The text, or false if the subprocess could not be
 *  started or did not deliver the text
 */
private function getTextSpawned( $address ) {
	AtEase::suppressWarnings();
	try {
		if ( !$this->spawnProc ) {
			// First time?
			$this->openSpawn();
			if ( !$this->spawnProc ) {
				// Spawning failed, so there are no pipes to talk to. Report a
				// plain failure and let getText() handle the retry.
				return false;
			}
		}

		return $this->getTextSpawnedOnce( $address );
	} finally {
		// Keep suppress/restore balanced on every exit path.
		AtEase::restoreWarnings();
	}
}
778 | |
/**
 * Starts the fetchText.php helper process and keeps its stdin/stdout pipes
 * in $this->spawnWrite and $this->spawnRead.
 *
 * The script is run through the MWScript wrapper if that file exists (the
 * second --spawn path, or "$IP/../multiversion/MWScript.php" by default),
 * otherwise "$IP/maintenance/fetchText.php" is run directly. The child's
 * stderr is appended to /dev/null.
 *
 * @return bool True if the process was started, false otherwise
 */
protected function openSpawn() {
	global $IP;

	$wiki = WikiMap::getCurrentWikiId();
	$mwscriptpath = count( $this->php ) == 2
		? $this->php[1]
		: "$IP/../multiversion/MWScript.php";

	$argv = file_exists( $mwscriptpath )
		? [ $this->php[0], $mwscriptpath, "fetchText.php", '--wiki', $wiki ]
		: [ $this->php[0], "$IP/maintenance/fetchText.php", '--wiki', $wiki ];
	$cmd = implode( " ", array_map( [ Shell::class, 'escape' ], $argv ) );

	$descriptors = [
		0 => [ "pipe", "r" ],
		1 => [ "pipe", "w" ],
		2 => [ "file", "/dev/null", "a" ],
	];
	$pipes = [];

	$this->progress( "Spawning database subprocess: $cmd" );
	$this->spawnProc = proc_open( $cmd, $descriptors, $pipes );
	if ( !$this->spawnProc ) {
		$this->progress( "Subprocess spawn failed." );

		return false;
	}

	$this->spawnWrite = $pipes[0]; // -> stdin
	$this->spawnRead = $pipes[1]; // <- stdout

	return true;
}
824 | |
/**
 * Closes the pipes to the helper subprocess and then the process itself,
 * resetting the related properties to their "not running" values.
 *
 * Warnings are suppressed while closing and restored afterwards.
 */
private function closeSpawn() {
	AtEase::suppressWarnings();
	try {
		if ( $this->spawnRead ) {
			fclose( $this->spawnRead );
		}
		$this->spawnRead = null;
		if ( $this->spawnWrite ) {
			fclose( $this->spawnWrite );
		}
		$this->spawnWrite = null;
		if ( $this->spawnErr ) {
			fclose( $this->spawnErr );
		}
		$this->spawnErr = false;
		if ( $this->spawnProc ) {
			// The handle comes from proc_open(), so it must be released with
			// proc_close(); pclose() only accepts popen() streams. The pipes
			// are closed first, as above.
			proc_close( $this->spawnProc );
		}
		$this->spawnProc = false;
	} finally {
		AtEase::restoreWarnings();
	}
}
845 | |
/**
 * Perform one request/response exchange with the helper subprocess.
 *
 * The address is written as a single line to the subprocess. The reply is
 * read as: a line echoing the address, a line holding the byte length, then
 * that many bytes of text. Any write/read failure, an address mismatch, a
 * negative length or a short payload yields false.
 *
 * @param int|string $address Content address, or text row ID.
 * @return string|false Normalized text, or false on failure.
 */
private function getTextSpawnedOnce( $address ) {
	// Bare text row IDs are converted to content addresses.
	if ( is_int( $address ) || intval( $address ) ) {
		$address = SqlBlobStore::makeAddressFromTextId( (int)$address );
	}

	$ok = fwrite( $this->spawnWrite, "$address\n" );
	if ( !$ok ) {
		return false;
	}

	$ok = fflush( $this->spawnWrite );
	if ( !$ok ) {
		return false;
	}

	// check that the text address they are sending is the one we asked for
	// this avoids out of sync revision text errors we have encountered in the past
	$newAddress = fgets( $this->spawnRead );
	if ( $newAddress === false ) {
		return false;
	}
	$newAddress = trim( $newAddress );
	if ( strpos( $newAddress, ':' ) === false ) {
		// The reply echoed a bare ID; convert it the same way as the request.
		$newAddress = SqlBlobStore::makeAddressFromTextId( intval( $newAddress ) );
	}

	if ( $newAddress !== $address ) {
		return false;
	}

	$len = fgets( $this->spawnRead );
	if ( $len === false ) {
		return false;
	}

	$nbytes = intval( $len );
	// actual error, not zero-length text
	if ( $nbytes < 0 ) {
		return false;
	}

	$text = "";

	// Subprocess may not send everything at once, we have to loop.
	while ( $nbytes > strlen( $text ) ) {
		$buffer = fread( $this->spawnRead, $nbytes - strlen( $text ) );
		if ( $buffer === false ) {
			break;
		}
		if ( $buffer === '' && feof( $this->spawnRead ) ) {
			// The subprocess closed its output before sending the whole
			// payload. Without this check the loop would never terminate,
			// because fread() keeps returning an empty string at EOF.
			break;
		}
		$text .= $buffer;
	}

	$gotbytes = strlen( $text );
	if ( $gotbytes != $nbytes ) {
		$this->progress( "Expected $nbytes bytes from database subprocess, got $gotbytes " );

		return false;
	}

	// Do normalization in the dump thread...
	$stripped = str_replace( "\r", "", $text );
	$normalized = $this->getServiceContainer()->getContentLanguage()->
		normalize( $stripped );

	return $normalized;
}
919 | |
/**
 * Handler for an opening tag.
 *
 * Tracks the current page/revision state, checks the schema version on the
 * root element and, for <text> elements carrying an id or location
 * attribute, fetches the referenced text and emits it as the element body.
 *
 * @param mixed $parser Passed through to characterData().
 * @param string $name Element name.
 * @param array $attribs Element attributes.
 */
protected function startElement( $parser, $name, $attribs ) {
	$this->checkpointJustWritten = false;

	// Emit any element still pending from the previous callback.
	$this->clearOpenElement( null );
	$this->lastName = $name;

	if ( $name == 'revision' ) {
		$this->state = $name;
		$this->egress->writeOpenPage( null, $this->buffer );
		$this->buffer = "";
	} elseif ( $name == 'page' ) {
		$this->state = $name;
		if ( $this->atStart ) {
			$this->egress->writeOpenStream( $this->buffer );
			$this->buffer = "";
			$this->atStart = false;
		}
	} elseif (
		$name === 'mediawiki'
		&& isset( $attribs['version'] )
		&& $attribs['version'] !== $this->schemaVersion
	) {
		throw new RuntimeException( 'Mismatching schema version. '
			. 'Use the --schema-version option to set the output schema version to '
			. 'the version declared by the stub file, namely ' . $attribs['version'] );
	}

	$refersToText = isset( $attribs['id'] ) || isset( $attribs['location'] );
	if ( $name != "text" || !$refersToText ) {
		// Nothing to fetch: just remember the element until its content arrives.
		$this->openElement = [ $name, $attribs ];

		return;
	}

	// A location attribute takes precedence over a plain id.
	$id = $attribs['location'] ?? $attribs['id'];

	$model = trim( $this->thisRevModel );
	if ( $model === '' ) {
		$model = null;
	}
	$format = trim( $this->thisRevFormat );
	if ( $format === '' ) {
		$format = null;
	}

	// An expected size is only passed along for wikitext content.
	if ( !empty( $attribs['bytes'] ) && $model === CONTENT_MODEL_WIKITEXT ) {
		$expSize = (int)$attribs['bytes'];
	} else {
		$expSize = null;
	}

	$text = $this->getText( $id, $model, $format, $expSize );
	$hasText = strlen( $text ) > 0;

	// The reference attributes are not written to the output element.
	unset( $attribs['id'], $attribs['location'] );
	if ( $hasText ) {
		$attribs['xml:space'] = 'preserve';
	}

	$this->openElement = [ $name, $attribs ];
	if ( $hasText ) {
		$this->characterData( $parser, $text );
	}
}
973 | |
/**
 * Handler for a closing tag.
 *
 * Closes the element in the buffer, flushes the buffer to the output at
 * revision, page and stream boundaries, and, when the time limit has been
 * flagged as exceeded, finishes the current checkpoint file at the end of
 * a page and starts a new one.
 *
 * @param mixed $parser Unused here.
 * @param string $name Element name.
 */
protected function endElement( $parser, $name ) {
	$this->checkpointJustWritten = false;

	if ( $this->openElement ) {
		// The element had no content; emit it via clearOpenElement( "" ).
		$this->clearOpenElement( "" );
	} else {
		$this->buffer .= "</$name>";
	}

	if ( $name == 'revision' ) {
		$this->egress->writeRevision( null, $this->buffer );
		$this->buffer = "";
		$this->thisRev = "";
		$this->thisRole = null;
		$this->thisRevModel = null;
		$this->thisRevFormat = null;

		return;
	}

	if ( $name == 'mediawiki' ) {
		$this->egress->writeCloseStream( $this->buffer );
		$this->buffer = "";

		return;
	}

	if ( $name != 'page' ) {
		return;
	}

	$pageId = trim( $this->thisPage );
	if ( !$this->firstPageWritten ) {
		$this->firstPageWritten = $pageId;
	}
	$this->lastPageWritten = $pageId;

	if ( !$this->timeExceeded ) {
		$this->egress->writeClosePage( $this->buffer );
		$this->buffer = "";
		$this->thisPage = "";

		return;
	}

	// Time limit hit: close this page, then finish the current output stream.
	$this->egress->writeClosePage( $this->buffer );
	// nasty hack, we can't just write the chardata after the
	// page tag, it will include leading blanks from the next line
	$this->egress->sink->write( "\n" );

	$this->buffer = $this->xmlwriterobj->closeStream();
	$this->egress->writeCloseStream( $this->buffer );

	$this->buffer = "";
	$this->thisPage = "";

	// this could be more than one file if we had more than one output arg
	$currentNames = (array)$this->egress->getFilenames();
	$firstPageID = str_pad( $this->firstPageWritten, 9, "0", STR_PAD_LEFT );
	$lastPageID = str_pad( $this->lastPageWritten, 9, "0", STR_PAD_LEFT );

	$renamed = [];
	$total = count( $currentNames );
	for ( $idx = 0; $idx < $total; $idx++ ) {
		// Fill the page-range placeholders of the matching checkpoint pattern
		// and keep the file in its current directory.
		$checkpointName = sprintf( $this->checkpointFiles[$idx], $firstPageID, $lastPageID );
		$parts = pathinfo( $currentNames[$idx] );
		$renamed[] = $parts['dirname'] . '/' . $checkpointName;
	}
	$this->egress->closeRenameAndReopen( $renamed );

	// Begin the next checkpoint file with a fresh stream header.
	$this->buffer = $this->xmlwriterobj->openStream();
	$this->timeExceeded = false;
	$this->timeOfCheckpoint = $this->lastTime;
	$this->firstPageWritten = false;
	$this->checkpointJustWritten = true;
}
1034 | |
/**
 * Handler for character data.
 *
 * Records the data against whichever element was opened last (ids, model,
 * format, role), then appends the escaped data to the output buffer.
 *
 * @param mixed $parser Unused here.
 * @param string $data Raw character data.
 */
protected function characterData( $parser, $data ) {
	$this->clearOpenElement( null );

	switch ( $this->lastName ) {
		case "id":
			if ( $this->state == "revision" ) {
				$this->thisRev .= $data;
				$this->thisRole = SlotRecord::MAIN;
			} elseif ( $this->state == "page" ) {
				$this->thisPage .= $data;
			}
			break;
		case "model":
			$this->thisRevModel .= $data;
			break;
		case "format":
			$this->thisRevFormat .= $data;
			break;
		case "content":
			// Reset the per-slot fields when data follows a <content> tag.
			$this->thisRole = "";
			$this->thisRevModel = "";
			$this->thisRevFormat = "";
			break;
		case "role":
			$this->thisRole .= $data;
			break;
	}

	// have to skip the newline left over from closepagetag line of
	// end of checkpoint files. nasty hack!!
	if ( $this->checkpointJustWritten ) {
		if ( $data[0] == "\n" ) {
			$data = substr( $data, 1 );
		}
		$this->checkpointJustWritten = false;
	}

	$escaped = htmlspecialchars( $data, ENT_COMPAT );
	$this->buffer .= $escaped;
}
1066 | |
/**
 * Append the pending element, if there is one, to the buffer and clear it.
 *
 * @param string|null $style Passed as the third argument to Xml::element().
 */
protected function clearOpenElement( $style ) {
	if ( !$this->openElement ) {
		return;
	}

	[ $tagName, $tagAttribs ] = $this->openElement;
	$this->buffer .= Xml::element( $tagName, $tagAttribs, $style );
	$this->openElement = false;
}
1073 | |
/**
 * Check whether a text reference is usable.
 *
 * Anything containing a colon is accepted unless it is exactly 'tt:0';
 * an all-digit value is accepted when it is greater than zero.
 *
 * @param string $id
 * @return bool
 */
private function isValidTextId( $id ) {
	if ( strpos( $id, ':' ) !== false ) {
		return $id !== 'tt:0';
	}

	if ( !preg_match( '/^\d+$/', $id ) ) {
		return false;
	}

	return intval( $id ) > 0;
}
1083 | |
1084 | } |