Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 155 |
|
0.00% |
0 / 13 |
CRAP | |
0.00% |
0 / 1 |
CompareLanguageConverterOutput | |
0.00% |
0 / 155 |
|
0.00% |
0 / 13 |
462 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
20 | |||
newPageRestHelperFactory | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
2 | |||
getParserOptions | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
getParserOutput | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
6 | |||
getParsoidOutput | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
2 | |||
getWords | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
getBody | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
compareOutput | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
2 | |||
getConverterUsed | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
mb_sprintf | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
outputSimilarity | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
outputDiff | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | * @file |
19 | * @ingroup Maintenance |
20 | */ |
21 | |
22 | use MediaWiki\Config\ServiceOptions; |
23 | use MediaWiki\Content\TextContent; |
24 | use MediaWiki\Language\Language; |
25 | use MediaWiki\Maintenance\Maintenance; |
26 | use MediaWiki\Parser\ParserOptions; |
27 | use MediaWiki\Parser\ParserOutput; |
28 | use MediaWiki\Rest\Handler\Helper\PageRestHelperFactory; |
29 | use MediaWiki\Revision\SlotRecord; |
30 | use MediaWiki\Title\Title; |
31 | use MediaWiki\User\User; |
32 | use Wikimedia\Bcp47Code\Bcp47Code; |
33 | use Wikimedia\Diff\ArrayDiffFormatter; |
34 | use Wikimedia\Diff\ComplexityException; |
35 | use Wikimedia\Diff\Diff; |
36 | use Wikimedia\Stats\NullStatsdDataFactory; |
37 | use Wikimedia\Stats\StatsFactory; |
38 | |
39 | // @codeCoverageIgnoreStart |
40 | require_once __DIR__ . '/Maintenance.php'; |
41 | // @codeCoverageIgnoreEnd |
42 | |
/**
 * Maintenance script that compares variant conversion output between Parser and
 * HtmlOutputRendererHelper.
 *
 * Given a page title and a target variant code, the page is rendered once through
 * the Parser and once through HtmlOutputRendererHelper. The script then prints the
 * word counts of both renderings, a similar_text() similarity score and a
 * word-by-word diff table.
 *
 * @ingroup Maintenance
 */
class CompareLanguageConverterOutput extends Maintenance {
	/**
	 * Registers the script description and the 'page-title' and 'target-variant'
	 * positional arguments.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Compares variant conversion output between Parser and HtmlOutputRendererHelper' );
		$this->addArg(
			'page-title',
			'Name of the page to be parsed and compared',
			true
		);
		$this->addArg(
			'target-variant',
			'Target variant language code to transform the content to',
			true
		);
	}

	/**
	 * Validates the arguments, renders the page through both code paths and
	 * prints the comparison.
	 *
	 * Aborts via fatalError() when the title cannot be resolved to an existing
	 * page or when the variant code is rejected by isValidBuiltInCode().
	 *
	 * @return bool true once the comparison has been printed
	 */
	public function execute() {
		$mwInstance = $this->getServiceContainer();

		$pageName = $this->getArg( 'page-title' );
		$pageTitle = Title::newFromText( $pageName );

		if ( !$pageTitle || !$pageTitle->exists() ) {
			$this->fatalError( "Title with name $pageName not found" );
		}

		$targetVariantCode = $this->getArg( 'target-variant' );
		$languageNameUtils = $mwInstance->getLanguageNameUtils();
		if ( !$languageNameUtils->isValidBuiltInCode( $targetVariantCode ) ) {
			$this->fatalError( "$targetVariantCode is not a supported variant" );
		}
		$targetVariant = $mwInstance->getLanguageFactory()->getLanguage(
			$targetVariantCode
		);

		$user = User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [ 'steal' => true ] );
		$baseLanguage = $pageTitle->getPageLanguage();

		$parserOutput = $this->getParserOutput( $pageTitle, $baseLanguage, $targetVariant );
		$parsoidOutput = $this->getParsoidOutput( $pageTitle, $targetVariant, $user );
		$converterUsed = $this->getConverterUsed( $parsoidOutput );

		$this->compareOutput(
			$parserOutput->getContentHolderText(),
			$parsoidOutput->getContentHolderText(),
			$converterUsed
		);
		return true;
	}

	/**
	 * Builds a PageRestHelperFactory by hand from the service container.
	 *
	 * Both stats collaborators are null implementations (NullStatsdDataFactory and
	 * StatsFactory::newNull()).
	 *
	 * @return PageRestHelperFactory
	 */
	private function newPageRestHelperFactory(): PageRestHelperFactory {
		$services = $this->getServiceContainer();

		return new PageRestHelperFactory(
			new ServiceOptions( PageRestHelperFactory::CONSTRUCTOR_OPTIONS, $services->getMainConfig() ),
			$services->getRevisionLookup(),
			$services->getRevisionRenderer(),
			$services->getTitleFormatter(),
			$services->getPageStore(),
			$services->getParsoidOutputStash(),
			new NullStatsdDataFactory(),
			$services->getParserOutputAccess(),
			$services->getParsoidSiteConfig(),
			$services->getHtmlTransformFactory(),
			$services->getContentHandlerFactory(),
			$services->getLanguageFactory(),
			$services->getRedirectStore(),
			$services->getLanguageConverterFactory(),
			$services->getTitleFactory(),
			$services->getConnectionProvider(),
			$services->getChangeTagsStore(),
			StatsFactory::newNull()
		);
	}

	/**
	 * Creates anonymous-user parser options targeting the given language, with
	 * content and title conversion explicitly left enabled.
	 *
	 * @param Language $language Target language for the parse
	 * @return ParserOptions
	 */
	private function getParserOptions( Language $language ): ParserOptions {
		$parserOpts = ParserOptions::newFromAnon();
		$parserOpts->setTargetLanguage( $language );
		$parserOpts->disableContentConversion( false );
		$parserOpts->disableTitleConversion( false );

		return $parserOpts;
	}

	/**
	 * Parses the page's main-slot text with the Parser and runs the result
	 * through the default output pipeline.
	 *
	 * Side effect: overwrites the global $wgDefaultLanguageVariant with the target
	 * variant code and does not restore it.
	 *
	 * @param Title $pageTitle Page to parse
	 * @param Language $baseLanguage Page language; its parent language is used as
	 *  the parser's target language
	 * @param Language $targetVariant Variant the output should be converted to
	 * @return ParserOutput
	 */
	private function getParserOutput(
		Title $pageTitle,
		Language $baseLanguage,
		Language $targetVariant
	): ParserOutput {
		// We update the default language variant because we want Parser to
		// perform variant conversion to it.
		global $wgDefaultLanguageVariant;
		$wgDefaultLanguageVariant = $targetVariant->getCode();

		$mwInstance = $this->getServiceContainer();

		$languageFactory = $mwInstance->getLanguageFactory();
		$parser = $mwInstance->getParser();
		$parserOptions = $this->getParserOptions(
			$languageFactory->getParentLanguage( $baseLanguage )
		);

		$revision = $mwInstance->getRevisionLookup()->getRevisionByTitle( $pageTitle );
		if ( $revision === null ) {
			// Fail with a readable message instead of a method call on null.
			$this->fatalError( 'No revision found for title ' . $pageTitle->getPrefixedText() );
		}
		$content = $revision->getContent( SlotRecord::MAIN );
		// Anything that is not text content is compared as an empty string.
		$wikiContent = ( $content instanceof TextContent ) ? $content->getText() : '';

		$po = $parser->parse( $wikiContent, $pageTitle, $parserOptions );
		// TODO T371008 consider if using the Content framework makes sense instead of creating the pipeline
		$pipeline = $mwInstance->getDefaultOutputPipeline();
		$options = [ 'deduplicateStyles' => false ];
		return $pipeline->run( $po, $parserOptions, $options );
	}

	/**
	 * Renders the page through HtmlOutputRendererHelper with the target variant
	 * set, then runs the result through the default output pipeline.
	 *
	 * @param Title $pageTitle Page to render
	 * @param Bcp47Code $targetVariant Variant passed to setVariantConversionLanguage()
	 * @param User $user User passed to the helper factory
	 * @return ParserOutput
	 */
	private function getParsoidOutput(
		Title $pageTitle,
		Bcp47Code $targetVariant,
		User $user
	): ParserOutput {
		$parserOptions = ParserOptions::newFromAnon();
		$htmlOutputRendererHelper = $this->newPageRestHelperFactory()->newHtmlOutputRendererHelper( $pageTitle, [
			'stash' => false,
			'flavor' => 'view',
		], $user, null, false, $parserOptions );
		$htmlOutputRendererHelper->setVariantConversionLanguage( $targetVariant );

		$po = $htmlOutputRendererHelper->getHtml();
		$pipeline = $this->getServiceContainer()->getDefaultOutputPipeline();
		$options = [ 'deduplicateStyles' => false ];
		return $pipeline->run( $po, $parserOptions, $options );
	}

	/**
	 * Strips tags from the given markup and splits the remaining text on runs
	 * of whitespace.
	 *
	 * @param string $output HTML or plain text
	 * @return string[] Non-empty words in document order
	 */
	private function getWords( string $output ): array {
		$tagsRemoved = strip_tags( $output );
		return preg_split( '/\s+/', trim( $tagsRemoved ), -1, PREG_SPLIT_NO_EMPTY );
	}

	/**
	 * Returns the text content of the <body> element of the given HTML.
	 *
	 * The input is returned unchanged when it is empty, cannot be loaded, or
	 * yields no <body> element.
	 *
	 * @param string $output HTML document or fragment, expected to be UTF-8
	 * @return string
	 */
	private function getBody( string $output ): string {
		// DOMDocument::loadHTML() refuses an empty string (ValueError on PHP 8),
		// and there is nothing to extract from it anyway.
		if ( $output === '' ) {
			return $output;
		}

		$dom = new DOMDocument();
		// Collect libxml's complaints about HTML5 markup internally rather than
		// silencing them with "@"; the previous setting is restored afterwards.
		$previousSetting = libxml_use_internal_errors( true );
		try {
			// Without an encoding declaration libxml may decode the bytes as
			// ISO-8859-1 and garble non-ASCII text, so declare UTF-8 up front.
			$loaded = $dom->loadHTML( '<?xml encoding="UTF-8">' . $output );
		} finally {
			libxml_clear_errors();
			libxml_use_internal_errors( $previousSetting );
		}
		if ( !$loaded ) {
			return $output;
		}

		$body = $dom->getElementsByTagName( 'body' )->item( 0 );
		if ( $body === null ) {
			// Body element not present
			return $output;
		}

		return $body->textContent;
	}

	/**
	 * Splits both renderings into words and prints word counts, the similarity
	 * summary and the diff table.
	 *
	 * Only the Parsoid text is reduced to its <body> text first; the Parser text
	 * is tokenised as is.
	 *
	 * @param string $parserText Output produced via the Parser
	 * @param string $parsoidText Output produced via HtmlOutputRendererHelper
	 * @param string $converterUsed Label shown in the diff table header
	 */
	private function compareOutput(
		string $parserText,
		string $parsoidText,
		string $converterUsed
	): void {
		$parsoidWords = $this->getWords( $this->getBody( $parsoidText ) );
		$parserWords = $this->getWords( $parserText );

		$parserWordCount = count( $parserWords );
		$parsoidWordCount = count( $parsoidWords );
		$this->output( "Word count: Parsoid: $parsoidWordCount; Parser: $parserWordCount\n" );

		$this->outputSimilarity( $parsoidWords, $parserWords );
		$this->output( "\n" );
		$this->outputDiff( $parsoidWords, $parserWords, $converterUsed );
	}

	/**
	 * Reports which language converter produced the Parsoid-side output, based on
	 * whether its raw text contains the core LanguageConverter marker string.
	 *
	 * @param ParserOutput $parsoidOutput
	 * @return string 'Core LanguageConverter' or 'Parsoid LanguageConverter'
	 */
	private function getConverterUsed( ParserOutput $parsoidOutput ): string {
		$marker = 'Variant conversion performed using the core LanguageConverter';

		// strpos() returns 0 for a match at the very start of the text, so the
		// result must be compared against false rather than tested for truthiness.
		if ( strpos( $parsoidOutput->getRawText(), $marker ) !== false ) {
			return 'Core LanguageConverter';
		}

		return 'Parsoid LanguageConverter';
	}

	/**
	 * sprintf() variant whose "%Ns" / "%-Ns" widths count characters, not bytes.
	 *
	 * Each numeric width in the format is enlarged by the difference between the
	 * byte length and the character length of the corresponding argument before
	 * the format is handed to sprintf(). Arguments are consumed in order, one per
	 * width-carrying "%s" placeholder, so the format should contain only such
	 * placeholders.
	 *
	 * Inspired from: https://stackoverflow.com/a/55927237/903324
	 *
	 * @param string $format
	 * @param mixed ...$args
	 * @return string
	 */
	private function mb_sprintf( string $format, ...$args ): string {
		$params = $args;

		$adjustedFormat = preg_replace_callback(
			'/(?<=%|%-)\d+(?=s)/',
			static function ( array $matches ) use ( &$params ) {
				// Cast so that a missing argument (null) does not reach strlen().
				$value = (string)array_shift( $params );

				return (string)( strlen( $value ) - mb_strlen( $value ) + $matches[0] );
			},
			$format
		);

		return sprintf( $adjustedFormat, ...$args );
	}

	/**
	 * Prints the total size of both word lists (joined by single spaces) and
	 * their similar_text() similarity.
	 *
	 * The totals come from strlen() and are therefore byte counts. No trailing
	 * newline is printed after the similarity line.
	 *
	 * @param string[] $parsoidWords
	 * @param string[] $parserWords
	 */
	private function outputSimilarity( array $parsoidWords, array $parserWords ): void {
		$parsoidOutput = implode( ' ', $parsoidWords );
		$parserOutput = implode( ' ', $parserWords );
		$this->output(
			'Total characters: Parsoid: ' . strlen( $parsoidOutput ) .
			'; Parser: ' . strlen( $parserOutput ) . "\n"
		);

		$similarityPercent = 0;
		$similarCharacters = similar_text( $parsoidOutput, $parserOutput, $similarityPercent );
		$similarityPercent = round( $similarityPercent, 2 );

		$this->output(
			"Similarity via similar_text(): $similarityPercent%; Similar characters: $similarCharacters"
		);
	}

	/**
	 * Prints a table with one row per differing word between the two word lists.
	 *
	 * If the diff cannot be computed because of a ComplexityException, the error
	 * is reported and no table is printed.
	 *
	 * @param string[] $parsoidWords Words from the Parsoid-side output ("old" side)
	 * @param string[] $parserWords Words from the Parser-side output ("new" side)
	 * @param string $converterUsed Label shown under the Parsoid column header
	 */
	private function outputDiff( array $parsoidWords, array $parserWords, string $converterUsed ): void {
		$separator = str_repeat( '-', 96 ) . "\n";

		$out = $separator;
		$out .= sprintf( "| %5s | %-35s | %-35s | %-8s |\n", 'Line', 'Parsoid', 'Parser', 'Diff' );
		$out .= sprintf( "| %5s | %-35s | %-35s | %-8s |\n", '', "($converterUsed)", '', '' );
		$out .= $separator;

		try {
			$diff = new Diff( $parsoidWords, $parserWords );
		} catch ( ComplexityException $e ) {
			$this->output( $e->getMessage() . "\n" );
			$this->error( 'Encountered ComplexityException while computing diff' );
			// There is no diff to format; bail out rather than use an undefined variable.
			return;
		}

		// Print the difference between the words
		$wordDiffFormat = ( new ArrayDiffFormatter() )->format( $diff );
		foreach ( $wordDiffFormat as $index => $wordDiff ) {
			$action = $wordDiff['action'];
			$old = $wordDiff['old'] ?? null;
			$new = $wordDiff['new'] ?? null;

			$out .= $this->mb_sprintf(
				"| %5s | %-35s | %-35s | %-8s |\n",
				str_pad( (string)( $index + 1 ), 5, ' ', STR_PAD_LEFT ),
				mb_strimwidth( $old ?? '- N/A -', 0, 35, '…' ),
				mb_strimwidth( $new ?? '- N/A -', 0, 35, '…' ),
				$action
			);
		}

		// Print the footer.
		$out .= $separator;
		$this->output( "\n" . $out );
	}
}
297 | |
// @codeCoverageIgnoreStart
// Name the maintenance class for the runner file included below.
$maintClass = CompareLanguageConverterOutput::class;
require_once RUN_MAINTENANCE_IF_MAIN;
// @codeCoverageIgnoreEnd