Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 63 |
|
0.00% |
0 / 3 |
CRAP | |
0.00% |
0 / 1 |
DumpRenderer | |
0.00% |
0 / 63 |
|
0.00% |
0 / 3 |
42 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
12 | |||
handleRevision | |
0.00% |
0 / 35 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | /** |
3 | * Take page text out of an XML dump file and render basic HTML out to files. |
4 | * This is *NOT* suitable for publishing or offline use; it's intended for |
5 | * running comparative tests of parsing behavior using real-world data. |
6 | * |
7 | * Templates etc are pulled from the local wiki database, not from the dump. |
8 | * |
9 | * Copyright (C) 2006 Brooke Vibber <bvibber@wikimedia.org> |
10 | * https://www.mediawiki.org/ |
11 | * |
12 | * This program is free software; you can redistribute it and/or modify |
13 | * it under the terms of the GNU General Public License as published by |
14 | * the Free Software Foundation; either version 2 of the License, or |
15 | * (at your option) any later version. |
16 | * |
17 | * This program is distributed in the hope that it will be useful, |
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
20 | * GNU General Public License for more details. |
21 | * |
22 | * You should have received a copy of the GNU General Public License along |
23 | * with this program; if not, write to the Free Software Foundation, Inc., |
24 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
25 | * http://www.gnu.org/copyleft/gpl.html |
26 | * |
27 | * @file |
28 | * @ingroup Maintenance |
29 | */ |
30 | |
31 | use MediaWiki\Maintenance\Maintenance; |
32 | use MediaWiki\Parser\ParserOptions; |
33 | use MediaWiki\Permissions\UltimateAuthority; |
34 | use MediaWiki\Revision\MutableRevisionRecord; |
35 | use MediaWiki\User\User; |
36 | |
37 | // @codeCoverageIgnoreStart |
38 | require_once __DIR__ . '/Maintenance.php'; |
39 | // @codeCoverageIgnoreEnd |
40 | |
/**
 * Maintenance script that takes page text out of an XML dump file
 * and renders basic HTML out to files.
 *
 * The dump is read from standard input; one HTML file is written per
 * revision into the directory given by --output-dir.
 *
 * @ingroup Maintenance
 */
class DumpRenderer extends Maintenance {

	/** @var int Number of revisions handled so far; also used to number the output files */
	private $count = 0;
	/** Directory the HTML files are written to (value of --output-dir) */
	private string $outputDirectory;
	/** Value of microtime( true ) taken when execute() starts */
	private float $startTime;
	/** @var string File name prefix (value of --prefix, "wiki" when not given) */
	private $prefix;

	/**
	 * Register the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription(
			'Take page text out of an XML dump file and render basic HTML out to files' );
		$this->addOption( 'output-dir', 'The directory to output the HTML files to', true, true );
		$this->addOption( 'prefix', 'Prefix for the rendered files (defaults to wiki)', false, true );
		$this->addOption( 'parser', 'Use an alternative parser class', false, true );
	}

	/**
	 * Read an XML dump from stdin, render every revision through
	 * handleRevision() and finally report how many pages were rendered
	 * and how long it took.
	 */
	public function execute() {
		$this->outputDirectory = $this->getOption( 'output-dir' );
		$this->prefix = $this->getOption( 'prefix', 'wiki' );
		$this->startTime = microtime( true );

		if ( $this->hasOption( 'parser' ) ) {
			$this->prefix .= '-' . $this->getOption( 'parser' );
			// T236809: We'll need to provide an alternate ParserFactory
			// service to make this work.
			$this->fatalError( 'Parser class configuration temporarily disabled.' );
		}

		$user = User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [ 'steal' => true ] );

		$source = new ImportStreamSource( $this->getStdin() );
		$importer = $this->getServiceContainer()
			->getWikiImporterFactory()
			->getWikiImporter( $source, new UltimateAuthority( $user ) );

		// Each revision found in the dump is handed to handleRevision();
		// importer notices are echoed as plain text.
		$importer->setRevisionCallback(
			[ $this, 'handleRevision' ] );
		$importer->setNoticeCallback( static function ( $msg, $params ) {
			echo wfMessage( $msg, $params )->text() . "\n";
		} );

		$importer->doImport();

		// Statistics are reported through error() rather than output(),
		// which is used for the per-file listing.
		$delta = microtime( true ) - $this->startTime;
		$this->error( "Rendered {$this->count} pages in " . round( $delta, 2 ) . " seconds " );
		if ( $delta > 0 ) {
			// Guard against division by zero on a zero-length run.
			$this->error( round( $this->count / $delta, 2 ) . " pages/sec" );
		}
		$this->error( "\n" );
	}

	/**
	 * Callback function for each revision, turn into HTML and save
	 *
	 * The file is named "<output-dir>/<prefix>-<7-digit counter>-<url-encoded title>.html";
	 * the file name and the page title are printed tab-separated via output().
	 * A failed write is reported via error() and does not abort the run.
	 *
	 * @param WikiRevision $rev
	 */
	public function handleRevision( WikiRevision $rev ) {
		$title = $rev->getTitle();
		if ( !$title ) {
			$this->error( "Got bogus revision with null title!" );

			return;
		}
		$display = $title->getPrefixedText();

		$this->count++;

		// rawurlencode() keeps path separators and other special
		// characters of the title out of the file name.
		$sanitized = rawurlencode( $display );
		$filename = sprintf( "%s/%s-%07d-%s.html",
			$this->outputDirectory,
			$this->prefix,
			$this->count,
			$sanitized );
		$this->output( sprintf( "%s\t%s\n", $filename, $display ) );

		$user = new User();
		$options = ParserOptions::newFromUser( $user );

		$content = $rev->getContent();
		$contentRenderer = $this->getServiceContainer()->getContentRenderer();
		// ContentRenderer expects a RevisionRecord, and all we have is a
		// WikiRevision from the dump. Make a fake MutableRevisionRecord to
		// satisfy it -- the only thing ::getParserOutput actually needs is
		// the revision ID and revision timestamp.
		$mutableRev = new MutableRevisionRecord( $title );
		$mutableRev->setId( $rev->getID() );
		$mutableRev->setTimestamp( $rev->getTimestamp() );
		$output = $contentRenderer->getParserOutput(
			$content, $title, $mutableRev, $options
		);

		$html = "<!DOCTYPE html>\n" .
			"<html lang=\"en\" dir=\"ltr\">\n" .
			"<head>\n" .
			"<meta charset=\"UTF-8\" />\n" .
			"<meta name=\"color-scheme\" content=\"light dark\">" .
			"<title>" . htmlspecialchars( $display, ENT_COMPAT ) . "</title>\n" .
			"</head>\n" .
			"<body>\n" .
			// TODO T371004 move runOutputPipeline out of $parserOutput
			$output->runOutputPipeline( $options, [] )->getContentHolderText() .
			"</body>\n" .
			"</html>";

		// file_put_contents() returns false on failure (missing or
		// unwritable directory, over-long file name, full disk). Report it
		// instead of silently listing a file that was never written, but
		// keep going so one bad page does not abort the whole run.
		if ( file_put_contents( $filename, $html ) === false ) {
			$this->error( "Failed to write $filename" );
		}
	}
}
154 | |
155 | // @codeCoverageIgnoreStart |
156 | $maintClass = DumpRenderer::class; |
157 | require_once RUN_MAINTENANCE_IF_MAIN; |
158 | // @codeCoverageIgnoreEnd |