MediaWiki master
compareLanguageConverterOutput.php
Go to the documentation of this file.
1<?php
32use Wikimedia\Bcp47Code\Bcp47Code;
38
39// @codeCoverageIgnoreStart
40require_once __DIR__ . '/Maintenance.php';
41// @codeCoverageIgnoreEnd
42
/**
 * Registers the script description and its two required positional arguments.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( 'Compares variant conversion output between Parser and HtmlOutputRendererHelper' );

	// Argument name => help text, in positional order; both are mandatory.
	$requiredArgs = [
		'page-title' => 'Name of the page to be parsed and compared',
		'target-variant' => 'Target variant language code to transform the content to',
	];
	foreach ( $requiredArgs as $argName => $helpText ) {
		$this->addArg( $argName, $helpText, true );
	}
}
64
/**
 * Script entry point: validates both arguments, renders the page through the
 * Parser and through HtmlOutputRendererHelper, and prints the comparison.
 *
 * @return bool Always true; invalid input ends the script via fatalError().
 */
public function execute() {
	$services = $this->getServiceContainer();

	// Resolve the page and make sure it actually exists.
	$pageName = $this->getArg( 'page-title' );
	$pageTitle = Title::newFromText( $pageName );
	$pageFound = $pageTitle && $pageTitle->exists();
	if ( !$pageFound ) {
		$this->fatalError( "Title with name $pageName not found" );
	}

	// Resolve the requested variant into a Language object.
	$targetVariantCode = $this->getArg( 'target-variant' );
	$codeIsValid = $services->getLanguageNameUtils()->isValidBuiltInCode( $targetVariantCode );
	if ( !$codeIsValid ) {
		$this->fatalError( "$targetVariantCode is not a supported variant" );
	}
	$targetVariant = $services->getLanguageFactory()->getLanguage( $targetVariantCode );

	$user = User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [ 'steal' => true ] );
	$baseLanguage = $pageTitle->getPageLanguage();

	// Render with both pipelines, then compare their text.
	$parserOutput = $this->getParserOutput( $pageTitle, $baseLanguage, $targetVariant );
	$parsoidOutput = $this->getParsoidOutput( $pageTitle, $targetVariant, $user );
	$converterUsed = $this->getConverterUsed( $parsoidOutput );

	$parserText = $parserOutput->getContentHolderText();
	$parsoidText = $parsoidOutput->getContentHolderText();
	$this->compareOutput( $parserText, $parsoidText, $converterUsed );

	return true;
}
95
/**
 * Builds a PageRestHelperFactory wired to the main service container,
 * with a null StatsFactory.
 *
 * @return PageRestHelperFactory
 */
private function newPageRestHelperFactory(): PageRestHelperFactory {
	$container = $this->getServiceContainer();

	$options = new ServiceOptions(
		PageRestHelperFactory::CONSTRUCTOR_OPTIONS,
		$container->getMainConfig()
	);

	return new PageRestHelperFactory(
		$options,
		$container->getRevisionLookup(),
		$container->getRevisionRenderer(),
		$container->getTitleFormatter(),
		$container->getPageStore(),
		$container->getParsoidOutputStash(),
		$container->getParserOutputAccess(),
		$container->getParsoidSiteConfig(),
		$container->getHtmlTransformFactory(),
		$container->getContentHandlerFactory(),
		$container->getLanguageFactory(),
		$container->getRedirectStore(),
		$container->getLanguageConverterFactory(),
		$container->getTitleFactory(),
		$container->getConnectionProvider(),
		$container->getChangeTagsStore(),
		StatsFactory::newNull()
	);
}
121
/**
 * Creates anonymous-user parser options that target the given language.
 *
 * @param Language $language Language set as the parse target.
 * @return ParserOptions
 */
private function getParserOptions( Language $language ): ParserOptions {
	$options = ParserOptions::newFromAnon();
	$options->setTargetLanguage( $language );

	// Both "disable" switches are explicitly set to false.
	$options->disableTitleConversion( false );
	$options->disableContentConversion( false );

	return $options;
}
130
/**
 * Parses the page's main-slot text with the core Parser and runs the result
 * through the default output pipeline.
 *
 * @param Title $pageTitle Page whose current revision is parsed.
 * @param Language $baseLanguage Page language; its parent language becomes the parse target.
 * @param Language $targetVariant Variant the Parser should convert to.
 * @return ParserOutput
 */
private function getParserOutput(
	Title $pageTitle,
	Language $baseLanguage,
	Language $targetVariant
): ParserOutput {
	// We update the default language variant because we want Parser to
	// perform variant conversion to it. The variable has to be imported from
	// the global scope; a plain assignment would only create an unused local.
	global $wgDefaultLanguageVariant;
	$wgDefaultLanguageVariant = $targetVariant->getCode();

	$mwInstance = $this->getServiceContainer();

	$languageFactory = $mwInstance->getLanguageFactory();
	$parser = $mwInstance->getParser();
	$parserOptions = $this->getParserOptions(
		$languageFactory->getParentLanguage( $baseLanguage )
	);

	$revision = $mwInstance->getRevisionLookup()->getRevisionByTitle( $pageTitle );
	if ( $revision === null ) {
		// Guard against dereferencing a missing revision.
		$this->fatalError( 'Could not load the current revision of the requested page' );
	}
	$content = $revision->getContent( SlotRecord::MAIN );
	// Non-text content is compared as an empty document.
	$wikiContent = ( $content instanceof TextContent ) ? $content->getText() : '';

	$po = $parser->parse( $wikiContent, $pageTitle, $parserOptions );
	// TODO T371008 consider if using the Content framework makes sense instead of creating the pipeline
	$pipeline = $mwInstance->getDefaultOutputPipeline();
	$options = [ 'deduplicateStyles' => false ];
	return $pipeline->run( $po, $parserOptions, $options );
}
160
/**
 * Renders the page through HtmlOutputRendererHelper with variant conversion
 * to the target variant, then runs the default output pipeline on the result.
 *
 * @param Title $pageTitle Page to render.
 * @param Bcp47Code $targetVariant Variant to convert the content to.
 * @param User $user User the helper is created for.
 * @return ParserOutput
 */
private function getParsoidOutput(
	Title $pageTitle,
	Bcp47Code $targetVariant,
	User $user
): ParserOutput {
	$parserOptions = ParserOptions::newFromAnon();

	$helperParams = [
		'stash' => false,
		'flavor' => 'view',
	];
	$helper = $this->newPageRestHelperFactory()->newHtmlOutputRendererHelper(
		$pageTitle,
		$helperParams,
		$user,
		null,
		false,
		$parserOptions
	);
	$helper->setVariantConversionLanguage( $targetVariant );

	$html = $helper->getHtml();

	return $this->getServiceContainer()
		->getDefaultOutputPipeline()
		->run( $html, $parserOptions, [ 'deduplicateStyles' => false ] );
}
178
/**
 * Strips markup from the given text and splits it into whitespace-separated words.
 *
 * @param string $output HTML or plain text.
 * @return string[] Non-empty words in document order; empty array when there are none.
 */
private function getWords( string $output ): array {
	$plainText = trim( strip_tags( $output ) );

	// The "u" modifier makes PCRE treat the subject as UTF-8, so Unicode
	// whitespace (e.g. U+3000 IDEOGRAPHIC SPACE) separates words and a
	// multibyte character can never be split in the middle.
	$words = preg_split( '/\s+/u', $plainText, -1, PREG_SPLIT_NO_EMPTY );
	if ( $words === false ) {
		// The UTF-8 pattern fails on malformed input; fall back to byte-wise splitting.
		$words = preg_split( '/\s+/', $plainText, -1, PREG_SPLIT_NO_EMPTY );
	}

	// preg_split() reports failure as false; always hand back an array.
	return $words ?: [];
}
184
/**
 * Extracts the text content of the <body> element from an HTML string.
 *
 * @param string $output HTML document or fragment, expected to be UTF-8.
 * @return string Text content of <body>, or $output unchanged when no body element is found.
 */
private function getBody( string $output ): string {
	if ( $output === '' ) {
		// DOMDocument::loadHTML() rejects empty input, and there is nothing to extract.
		return '';
	}

	// loadHTML() decodes markup without a charset declaration as ISO-8859-1,
	// which garbles UTF-8 text. Turning every non-ASCII character into a numeric
	// entity first makes the result independent of charset detection.
	// NOTE(review): raw-text elements (script/style) presumably keep the entities
	// undecoded - confirm this is acceptable for the comparison.
	$asciiHtml = mb_encode_numericentity( $output, [ 0x80, 0x10FFFF, 0, 0x1FFFFF ], 'UTF-8' );

	$dom = new DOMDocument();
	// Parser warnings about non-standard markup are expected and not interesting here.
	// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
	@$dom->loadHTML( $asciiHtml );
	$body = $dom->getElementsByTagName( 'body' )->item( 0 );
	if ( $body === null ) {
		// Body element not present
		return $output;
	}

	return $body->textContent;
}
197
/**
 * Tokenizes both renderings into words and prints the word counts, a
 * similarity summary and a word-level diff table.
 *
 * @param string $parserText Text produced by the Parser path.
 * @param string $parsoidText Text produced by the Parsoid path; reduced to its body text first.
 * @param string $converterUsed Label of the converter, shown in the diff table header.
 */
private function compareOutput(
	string $parserText,
	string $parsoidText,
	string $converterUsed
): void {
	// Only the Parsoid text goes through getBody() before being split into words.
	$parsoidBody = $this->getBody( $parsoidText );
	$parsoidWords = $this->getWords( $parsoidBody );
	$parserWords = $this->getWords( $parserText );

	$parsoidWordCount = count( $parsoidWords );
	$parserWordCount = count( $parserWords );
	$this->output( "Word count: Parsoid: $parsoidWordCount; Parser: $parserWordCount\n" );

	$this->outputSimilarity( $parsoidWords, $parserWords );
	$this->output( "\n" );
	$this->outputDiff( $parsoidWords, $parserWords, $converterUsed );
}
214
/**
 * Reports which language converter produced the Parsoid output, based on a
 * marker string in the raw output text.
 *
 * @param ParserOutput $parsoidOutput Output whose raw text is inspected.
 * @return string 'Core LanguageConverter' when the marker is present, otherwise 'Parsoid LanguageConverter'.
 */
private function getConverterUsed( ParserOutput $parsoidOutput ): string {
	$markerPosition = strpos(
		$parsoidOutput->getRawText(),
		'Variant conversion performed using the core LanguageConverter'
	);

	// strpos() returns 0 for a match at the very start, which is falsy;
	// only a strict false means the marker is absent.
	if ( $markerPosition !== false ) {
		return 'Core LanguageConverter';
	}

	return 'Parsoid LanguageConverter';
}
227
// Inspired from: https://stackoverflow.com/a/55927237/903324
/**
 * sprintf() wrapper that keeps padded "%Ns" / "%-Ns" columns aligned for multibyte text.
 *
 * sprintf() pads by byte count. Each numeric width in the format is therefore
 * enlarged by the difference between the argument's byte length and its display
 * width, the same width measure that mb_strimwidth() uses.
 *
 * Arguments are matched to width specifiers purely by order: the n-th width
 * found in the format is adjusted using the n-th argument. Formats that mix in
 * conversions without a width will be adjusted against the wrong argument.
 *
 * @param string $format sprintf() format string.
 * @param mixed ...$args Values to format.
 * @return string
 */
private function mb_sprintf( string $format, ...$args ): string {
	$params = $args;

	return sprintf(
		preg_replace_callback(
			'/(?<=%|%-)\d+(?=s)/',
			static function ( array $matches ) use ( &$params ) {
				// Cast so that non-string arguments (ints, null) can be measured safely.
				$value = (string)array_shift( $params );
				$invisibleBytes = strlen( $value ) - mb_strwidth( $value, 'UTF-8' );

				return (string)( $invisibleBytes + (int)$matches[0] );
			},
			$format
		),
		...$args
	);
}
245
/**
 * Prints the total length of both word lists (joined by single spaces) and
 * their similarity as computed by similar_text().
 *
 * @param string[] $parsoidWords Words from the Parsoid rendering.
 * @param string[] $parserWords Words from the Parser rendering.
 */
private function outputSimilarity( array $parsoidWords, array $parserWords ): void {
	$parsoidJoined = implode( ' ', $parsoidWords );
	$parserJoined = implode( ' ', $parserWords );

	// strlen() measures bytes, so multibyte text reports more than its character count.
	$parsoidLength = strlen( $parsoidJoined );
	$parserLength = strlen( $parserJoined );
	$this->output( 'Total characters: Parsoid: ' . $parsoidLength . '; Parser: ' . $parserLength . "\n" );

	// similar_text() writes the percentage into its third argument.
	$rawPercent = 0;
	$similarCharacters = similar_text( $parsoidJoined, $parserJoined, $rawPercent );
	$similarityPercent = round( $rawPercent, 2 );

	$this->output(
		"Similarity via similar_text(): $similarityPercent%; Similar characters: $similarCharacters"
	);
}
262
/**
 * Prints a table with the word-level differences between both renderings.
 *
 * @param string[] $parsoidWords Words from the Parsoid rendering (the "old" side of the diff).
 * @param string[] $parserWords Words from the Parser rendering (the "new" side of the diff).
 * @param string $converterUsed Converter label shown under the Parsoid column heading.
 */
private function outputDiff( array $parsoidWords, array $parserWords, string $converterUsed ): void {
	try {
		$diff = new Diff( $parsoidWords, $parserWords );
	} catch ( ComplexityException $e ) {
		$this->output( $e->getMessage() );
		$this->error( 'Encountered ComplexityException while computing diff' );
		// No diff object exists at this point, so there is nothing to tabulate.
		return;
	}

	$rowFormat = "| %5s | %-35s | %-35s | %-8s |\n";
	$separator = str_repeat( '-', 96 ) . "\n";

	// Table header.
	$out = $separator;
	$out .= sprintf( $rowFormat, 'Line', 'Parsoid', 'Parser', 'Diff' );
	$out .= sprintf( $rowFormat, '', "($converterUsed)", '', '' );
	$out .= $separator;

	// Print the difference between the words
	$wordDiffFormat = ( new ArrayDiffFormatter() )->format( $diff );
	foreach ( $wordDiffFormat as $index => $wordDiff ) {
		$action = $wordDiff['action'];
		$old = $wordDiff['old'] ?? null;
		$new = $wordDiff['new'] ?? null;

		// Cells are cut to 35 columns; a missing side is shown as "- N/A -".
		$out .= $this->mb_sprintf(
			$rowFormat,
			str_pad( (string)( $index + 1 ), 5, ' ', STR_PAD_LEFT ),
			mb_strimwidth( $old ?? '- N/A -', 0, 35, '…' ),
			mb_strimwidth( $new ?? '- N/A -', 0, 35, '…' ),
			$action
		);
	}

	// Print the footer.
	$out .= $separator;
	$this->output( "\n" . $out );
}
296}
297
298// @codeCoverageIgnoreStart
299$maintClass = CompareLanguageConverterOutput::class;
300require_once RUN_MAINTENANCE_IF_MAIN;
301// @codeCoverageIgnoreEnd
array $params
The job parameters.
Maintenance script that compares variant conversion output between Parser and HtmlOutputRendererHelper.
A class for passing options to services.
Content object implementation for representing flat text.
Base class for language-specific code.
Definition Language.php:81
getCode()
Get the internal language code for this language object.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
getArg( $argId=0, $default=null)
Get an argument.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
getServiceContainer()
Returns the main service container.
addDescription( $text)
Set the description text.
Set options of the Parser.
setTargetLanguage( $x)
Target language for the parse.
ParserOutput is a rendering of a Content object or a message.
getContentHolderText()
Returns the content holder text of the ParserOutput.
Value object representing a content slot associated with a page revision.
Represents a title within MediaWiki.
Definition Title.php:78
getPageLanguage()
Get the language in which the content of this page is written in wikitext.
Definition Title.php:3562
exists( $flags=0)
Check if page exists.
Definition Title.php:3138
User class for the MediaWiki software.
Definition User.php:119
A pseudo-formatter that just passes along the Diff::$edits array.
Class representing a 'diff' between two sequences of strings.
Definition Diff.php:34
This is the primary interface for validating metrics definitions, caching defined metrics,...
$wgDefaultLanguageVariant
Config variable stub for the DefaultLanguageVariant setting, for use by phpdoc and IDEs.