MediaWiki master
compareLanguageConverterOutput.php
Go to the documentation of this file.
1<?php
32use Wikimedia\Bcp47Code\Bcp47Code;
37
38// @codeCoverageIgnoreStart
39require_once __DIR__ . '/Maintenance.php';
40// @codeCoverageIgnoreEnd
41
/**
 * Set up the script description and register its two required arguments.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( 'Compares variant conversion output between Parser and HtmlOutputRendererHelper' );

	// Argument name => help text; both are registered as required, in this order.
	$requiredArgs = [
		'page-title' => 'Name of the page to be parsed and compared',
		'target-variant' => 'Target variant language code to transform the content to',
	];
	foreach ( $requiredArgs as $argName => $argDescription ) {
		$this->addArg( $argName, $argDescription, true );
	}
}
63
/**
 * Validate the command-line arguments, render the page through both
 * pipelines and print the comparison.
 *
 * Terminates via fatalError() when the page cannot be found or the variant
 * code is rejected by LanguageNameUtils::isValidBuiltInCode().
 *
 * @return bool Always true once the comparison has been printed
 */
public function execute() {
	$services = $this->getServiceContainer();

	// Resolve the page to compare.
	$pageName = $this->getArg( 'page-title' );
	$pageTitle = Title::newFromText( $pageName );
	if ( !( $pageTitle && $pageTitle->exists() ) ) {
		$this->fatalError( "Title with name $pageName not found" );
	}

	// Resolve the variant to convert to.
	$targetVariantCode = $this->getArg( 'target-variant' );
	if ( !$services->getLanguageNameUtils()->isValidBuiltInCode( $targetVariantCode ) ) {
		$this->fatalError( "$targetVariantCode is not a supported variant" );
	}
	$targetVariant = $services->getLanguageFactory()->getLanguage( $targetVariantCode );

	$user = User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [ 'steal' => true ] );

	// Render once per pipeline.
	$parserOutput = $this->getParserOutput( $pageTitle, $pageTitle->getPageLanguage(), $targetVariant );
	$parsoidOutput = $this->getParsoidOutput( $pageTitle, $targetVariant, $user );

	$this->compareOutput(
		$parserOutput->getContentHolderText(),
		$parsoidOutput->getContentHolderText(),
		$this->getConverterUsed( $parsoidOutput )
	);

	return true;
}
95
/**
 * Build a PageRestHelperFactory wired with services from the main service
 * container and a null StatsFactory.
 *
 * @return PageRestHelperFactory
 */
private function newPageRestHelperFactory(): PageRestHelperFactory {
	$container = $this->getServiceContainer();

	$factoryOptions = new ServiceOptions(
		PageRestHelperFactory::CONSTRUCTOR_OPTIONS,
		$container->getMainConfig()
	);

	// The constructor is positional: keep the argument order exactly as is.
	$factory = new PageRestHelperFactory(
		$factoryOptions,
		$container->getRevisionLookup(),
		$container->getRevisionRenderer(),
		$container->getTitleFormatter(),
		$container->getPageStore(),
		$container->getParsoidOutputStash(),
		$container->getParserOutputAccess(),
		$container->getParsoidSiteConfig(),
		$container->getHtmlTransformFactory(),
		$container->getContentHandlerFactory(),
		$container->getLanguageFactory(),
		$container->getRedirectStore(),
		$container->getLanguageConverterFactory(),
		$container->getTitleFactory(),
		$container->getConnectionProvider(),
		$container->getChangeTagsStore(),
		StatsFactory::newNull()
	);

	return $factory;
}
119
/**
 * Create anonymous-user parser options that target the given language,
 * passing false to both the content- and title-conversion "disable" switches.
 *
 * @param Language $language Target language for the parse
 * @return ParserOptions
 */
private function getParserOptions( Language $language ): ParserOptions {
	$options = ParserOptions::newFromAnon();

	$options->setTargetLanguage( $language );
	$options->disableContentConversion( false );
	$options->disableTitleConversion( false );

	return $options;
}
128
/**
 * Parse the page's main-slot text with the core Parser and run the result
 * through the default output pipeline.
 *
 * @param Title $pageTitle Page to render
 * @param Language $baseLanguage Page language; its parent language is used as the parse target
 * @param Language $targetVariant Variant the Parser should convert to
 * @return ParserOutput
 */
private function getParserOutput(
	Title $pageTitle,
	Language $baseLanguage,
	Language $targetVariant
): ParserOutput {
	// We update the default language variant because we want Parser to
	// perform variant conversion to it. The variable has to be imported
	// from the global scope: without the declaration the assignment below
	// would only create an unused local variable.
	global $wgDefaultLanguageVariant;
	$wgDefaultLanguageVariant = $targetVariant->getCode();

	$mwInstance = $this->getServiceContainer();

	$languageFactory = $mwInstance->getLanguageFactory();
	$parser = $mwInstance->getParser();
	$parserOptions = $this->getParserOptions(
		$languageFactory->getParentLanguage( $baseLanguage )
	);

	$revision = $mwInstance->getRevisionLookup()->getRevisionByTitle( $pageTitle );
	if ( $revision === null ) {
		// Fail with a readable message instead of a method call on null.
		$this->fatalError( 'No revision found for the requested page' );
	}

	$content = $revision->getContent( SlotRecord::MAIN );
	// Content that is not text-based is parsed as an empty string.
	$wikiContent = ( $content instanceof TextContent ) ? $content->getText() : '';

	$po = $parser->parse( $wikiContent, $pageTitle, $parserOptions );
	// TODO T371008 consider if using the Content framework makes sense instead of creating the pipeline
	$pipeline = $mwInstance->getDefaultOutputPipeline();
	$options = [ 'deduplicateStyles' => false ];
	return $pipeline->run( $po, $parserOptions, $options );
}
158
/**
 * Render the page through HtmlOutputRendererHelper with variant conversion
 * to the target variant, then run the result through the default output
 * pipeline.
 *
 * @param Title $pageTitle Page to render
 * @param Bcp47Code $targetVariant Variant to convert the content to
 * @param User $user User passed to the renderer helper
 * @return ParserOutput
 */
private function getParsoidOutput(
	Title $pageTitle,
	Bcp47Code $targetVariant,
	User $user
): ParserOutput {
	$parserOptions = ParserOptions::newFromAnon();
	$helperParams = [
		'stash' => false,
		'flavor' => 'view',
	];

	$helper = $this->newPageRestHelperFactory()->newHtmlOutputRendererHelper(
		$pageTitle,
		$helperParams,
		$user,
		null,
		false,
		$parserOptions
	);
	$helper->setVariantConversionLanguage( $targetVariant );
	$renderedHtml = $helper->getHtml();

	return $this->getServiceContainer()
		->getDefaultOutputPipeline()
		->run( $renderedHtml, $parserOptions, [ 'deduplicateStyles' => false ] );
}
176
/**
 * Strip markup from the given output and split what remains into words.
 *
 * @param string $output HTML or text to tokenize
 * @return string[] Non-empty tokens separated by runs of whitespace
 */
private function getWords( string $output ): array {
	$plainText = trim( strip_tags( $output ) );

	return preg_split( '/\s+/', $plainText, -1, PREG_SPLIT_NO_EMPTY );
}
182
/**
 * Extract the text content of the <body> element from an HTML string.
 *
 * @param string $output HTML, expected to be UTF-8 encoded
 * @return string Text content of the body, or $output unchanged when it is
 *  empty or no body element is found
 */
private function getBody( string $output ): string {
	// DOMDocument::loadHTML() throws a ValueError for an empty source on
	// PHP 8, which the error-control operator does not suppress.
	if ( $output === '' ) {
		return $output;
	}

	// loadHTML() decodes input without a declared charset as ISO-8859-1,
	// which garbles multi-byte UTF-8 text. Turning every non-ASCII character
	// into a numeric character reference makes the input pure ASCII, so the
	// parser's charset guess no longer matters.
	// NOTE(review): references inside raw-text elements (script/style) are
	// presumably left undecoded - confirm this is acceptable for the diff.
	$asciiHtml = mb_encode_numericentity( $output, [ 0x80, 0x10FFFF, 0, 0x1FFFFF ], 'UTF-8' );

	$dom = new DOMDocument();
	// phpcs:disable Generic.PHP.NoSilencedErrors.Discouraged
	@$dom->loadHTML( $asciiHtml );
	$body = $dom->getElementsByTagName( 'body' )->item( 0 );
	if ( $body === null ) {
		// Body element not present
		return $output;
	}

	return $body->textContent;
}
195
/**
 * Print word counts, a similarity summary and a word-level diff table for
 * the two rendered outputs.
 *
 * Only the Parsoid text is passed through getBody() before tokenizing; the
 * Parser text is tokenized directly.
 *
 * @param string $parserText Output text produced via the core Parser
 * @param string $parsoidText Output text produced via HtmlOutputRendererHelper
 * @param string $converterUsed Converter label shown in the diff table header
 */
private function compareOutput(
	string $parserText,
	string $parsoidText,
	string $converterUsed
): void {
	$parserWords = $this->getWords( $parserText );
	$parsoidWords = $this->getWords( $this->getBody( $parsoidText ) );

	$this->output(
		'Word count: Parsoid: ' . count( $parsoidWords ) .
		'; Parser: ' . count( $parserWords ) . "\n"
	);

	$this->outputSimilarity( $parsoidWords, $parserWords );
	$this->output( "\n" );
	$this->outputDiff( $parsoidWords, $parserWords, $converterUsed );
}
212
/**
 * Report which language converter produced the Parsoid output, judged by
 * whether its raw text contains the core LanguageConverter marker string.
 *
 * @param ParserOutput $parsoidOutput Output whose raw text is inspected
 * @return string 'Core LanguageConverter' or 'Parsoid LanguageConverter'
 */
private function getConverterUsed( ParserOutput $parsoidOutput ): string {
	$markerPosition = strpos(
		$parsoidOutput->getRawText(),
		'Variant conversion performed using the core LanguageConverter'
	);

	// strpos() returns 0 for a match at the very start of the text, so the
	// result must be compared against false rather than tested for truthiness.
	if ( $markerPosition !== false ) {
		return 'Core LanguageConverter';
	}

	return 'Parsoid LanguageConverter';
}
225
226 // Inspired from: https://stackoverflow.com/a/55927237/903324
/**
 * Multibyte-aware sprintf() for "%Ns" / "%-Ns" placeholders.
 *
 * sprintf() pads by bytes, so each width is enlarged by the difference
 * between the argument's byte length and its display width. Display width
 * (mb_strwidth) is used rather than character count so that full-width
 * characters, which take two columns, still line up; this also matches
 * callers that truncate with mb_strimwidth().
 *
 * Arguments are matched to width specifiers in order, so every argument is
 * expected to correspond to one "%Ns" or "%-Ns" placeholder.
 *
 * @param string $format Format string using "%Ns" / "%-Ns" placeholders
 * @param mixed ...$args Values to insert
 * @return string
 */
private function mb_sprintf( string $format, ...$args ): string {
	$params = $args;

	$adjustedFormat = preg_replace_callback(
		'/(?<=%|%-)\d+(?=s)/',
		static function ( array $matches ) use ( &$params ) {
			// array_shift() yields null once the arguments are exhausted;
			// cast so the string functions always receive a string.
			$value = (string)array_shift( $params );
			$byteSurplus = strlen( $value ) - mb_strwidth( $value );

			return (string)( $byteSurplus + (int)$matches[0] );
		},
		$format
	);

	return sprintf( $adjustedFormat, ...$args );
}
243
/**
 * Print the total length of both word lists (joined by single spaces) and
 * their similarity as computed by similar_text().
 *
 * Note: both strlen() and similar_text() operate on bytes, so the reported
 * "characters" are byte counts for multi-byte text.
 *
 * @param string[] $parsoidWords Words from the Parsoid output
 * @param string[] $parserWords Words from the Parser output
 */
private function outputSimilarity( array $parsoidWords, array $parserWords ): void {
	$parsoidJoined = implode( ' ', $parsoidWords );
	$parserJoined = implode( ' ', $parserWords );

	$totals = 'Total characters: Parsoid: ' . strlen( $parsoidJoined )
		. '; Parser: ' . strlen( $parserJoined ) . "\n";
	$this->output( $totals );

	// similar_text() writes the percentage into its third, by-reference argument.
	$percent = 0;
	$similarCharacters = similar_text( $parsoidJoined, $parserJoined, $percent );
	$similarityPercent = round( $percent, 2 );

	$this->output(
		'Similarity via similar_text(): ' . $similarityPercent .
		'%; Similar characters: ' . $similarCharacters
	);
}
260
/**
 * Print a table with the word-level differences between both outputs.
 *
 * If the diff cannot be computed because of a ComplexityException, the
 * exception message and an error are reported and no table is printed.
 *
 * @param string[] $parsoidWords Words from the Parsoid output ("old" side of the diff)
 * @param string[] $parserWords Words from the Parser output ("new" side of the diff)
 * @param string $converterUsed Converter label shown under the Parsoid column header
 */
private function outputDiff( array $parsoidWords, array $parserWords, string $converterUsed ): void {
	$rowFormat = "| %5s | %-35s | %-35s | %-8s |\n";
	$separator = str_repeat( '-', 96 ) . "\n";

	try {
		$diff = new Diff( $parsoidWords, $parserWords );
	} catch ( ComplexityException $e ) {
		$this->output( $e->getMessage() );
		$this->error( 'Encountered ComplexityException while computing diff' );
		// Without a diff there is nothing to format; falling through would
		// use an undefined $diff.
		return;
	}

	// Print the header.
	$out = $separator;
	$out .= sprintf( $rowFormat, 'Line', 'Parsoid', 'Parser', 'Diff' );
	$out .= sprintf( $rowFormat, '', "($converterUsed)", '', '' );
	$out .= $separator;

	// Print the difference between the words
	$wordDiffFormat = ( new ArrayDiffFormatter() )->format( $diff );
	foreach ( $wordDiffFormat as $index => $wordDiff ) {
		$action = $wordDiff['action'];
		// Either side may be absent from an entry; show a placeholder then.
		$old = $wordDiff['old'] ?? null;
		$new = $wordDiff['new'] ?? null;

		$out .= $this->mb_sprintf(
			$rowFormat,
			str_pad( (string)( $index + 1 ), 5, ' ', STR_PAD_LEFT ),
			mb_strimwidth( $old ?? '- N/A -', 0, 35, '…' ),
			mb_strimwidth( $new ?? '- N/A -', 0, 35, '…' ),
			$action
		);
	}

	// Print the footer.
	$out .= $separator;
	$this->output( "\n" . $out );
}
294}
295
296// @codeCoverageIgnoreStart
297$maintClass = CompareLanguageConverterOutput::class;
298require_once RUN_MAINTENANCE_IF_MAIN;
299// @codeCoverageIgnoreEnd
Maintenance script that compares variant conversion output between Parser and HtmlOutputRendererHelper.
execute()
Do the actual work. All child classes will need to implement this. Returns bool|null|void: true for success,...
A class for passing options to services.
Content object implementation for representing flat text.
Base class for language-specific code.
Definition Language.php:81
getCode()
Get the internal language code for this language object.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
getArg( $argId=0, $default=null)
Get an argument.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
getServiceContainer()
Returns the main service container.
addDescription( $text)
Set the description text.
Set options of the Parser.
setTargetLanguage( $x)
Target language for the parse.
ParserOutput is a rendering of a Content object or a message.
getContentHolderText()
Returns the content holder text of the ParserOutput.
Value object representing a content slot associated with a page revision.
Represents a title within MediaWiki.
Definition Title.php:78
getPageLanguage()
Get the language in which the content of this page is written in wikitext.
Definition Title.php:3572
exists( $flags=0)
Check if page exists.
Definition Title.php:3143
User class for the MediaWiki software.
Definition User.php:123
A pseudo-formatter that just passes along the Diff::$edits array.
Class representing a 'diff' between two sequences of strings.
Definition Diff.php:34
This is the primary interface for validating metrics definitions, caching defined metrics,...
$wgDefaultLanguageVariant
Config variable stub for the DefaultLanguageVariant setting, for use by phpdoc and IDEs.
array $params
The job parameters.