MediaWiki master
compareLanguageConverterOutput.php
Go to the documentation of this file.
1<?php
18use Wikimedia\Bcp47Code\Bcp47Code;
23
24// @codeCoverageIgnoreStart
25require_once __DIR__ . '/Maintenance.php';
26// @codeCoverageIgnoreEnd
27
/**
 * Sets the script description and registers its two required arguments.
 */
public function __construct() {
	parent::__construct();
	$this->addDescription( 'Compares variant conversion output between Parser and HtmlOutputRendererHelper' );

	// Argument name => help text. Registration order is preserved:
	// page-title first, then target-variant. Both are flagged as required.
	$requiredArgs = [
		'page-title' => 'Name of the page to be parsed and compared',
		'target-variant' => 'Target variant language code to transform the content to',
	];
	foreach ( $requiredArgs as $argName => $argDescription ) {
		$this->addArg( $argName, $argDescription, true );
	}
}
49
/**
 * Renders the requested page through both the Parser and the
 * HtmlOutputRendererHelper, then prints a comparison of the two outputs.
 *
 * Calls fatalError() when the title cannot be resolved to an existing page
 * or when the variant code is rejected by LanguageNameUtils::isValidBuiltInCode().
 *
 * @return bool True once the comparison has been printed
 */
public function execute() {
	$services = $this->getServiceContainer();

	// Resolve the page and make sure it actually exists.
	$pageName = $this->getArg( 'page-title' );
	$pageTitle = Title::newFromText( $pageName );
	$pageIsUsable = $pageTitle && $pageTitle->exists();
	if ( !$pageIsUsable ) {
		$this->fatalError( "Title with name $pageName not found" );
	}

	// Validate the variant code before building a Language object from it.
	$targetVariantCode = $this->getArg( 'target-variant' );
	$codeIsValid = $services->getLanguageNameUtils()->isValidBuiltInCode( $targetVariantCode );
	if ( !$codeIsValid ) {
		$this->fatalError( "$targetVariantCode is not a supported variant" );
	}
	$targetVariant = $services->getLanguageFactory()->getLanguage( $targetVariantCode );

	$user = User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [ 'steal' => true ] );
	$baseLanguage = $pageTitle->getPageLanguage();

	// Produce both renderings, then work out which converter the Parsoid path used.
	$parserOutput = $this->getParserOutput( $pageTitle, $baseLanguage, $targetVariant );
	$parsoidOutput = $this->getParsoidOutput( $pageTitle, $targetVariant, $user );
	$converterUsed = $this->getConverterUsed( $parsoidOutput );

	$this->compareOutput(
		$parserOutput->getContentHolderText(),
		$parsoidOutput->getContentHolderText(),
		$converterUsed
	);

	return true;
}
81
/**
 * Builds a PageRestHelperFactory by hand from the services in the container.
 *
 * The factory receives a null StatsFactory (StatsFactory::newNull()).
 *
 * @return PageRestHelperFactory
 */
private function newPageRestHelperFactory(): PageRestHelperFactory {
	$container = $this->getServiceContainer();

	$factoryOptions = new ServiceOptions(
		PageRestHelperFactory::CONSTRUCTOR_OPTIONS,
		$container->getMainConfig()
	);

	// Argument order must match the PageRestHelperFactory constructor exactly.
	return new PageRestHelperFactory(
		$factoryOptions,
		$container->getRevisionLookup(),
		$container->getRevisionRenderer(),
		$container->getTitleFormatter(),
		$container->getPageStore(),
		$container->getParsoidOutputStash(),
		$container->getParserOutputAccess(),
		$container->getParsoidSiteConfig(),
		$container->getHtmlTransformFactory(),
		$container->getContentHandlerFactory(),
		$container->getLanguageFactory(),
		$container->getRedirectStore(),
		$container->getLanguageConverterFactory(),
		$container->getTitleFactory(),
		$container->getConnectionProvider(),
		$container->getChangeTagsStore(),
		StatsFactory::newNull()
	);
}
105
/**
 * Creates anonymous-user ParserOptions targeting the given language.
 *
 * Both disableContentConversion() and disableTitleConversion() are called
 * explicitly with false.
 *
 * @param Language $language Language to set as the parse target
 * @return ParserOptions
 */
private function getParserOptions( Language $language ): ParserOptions {
	$options = ParserOptions::newFromAnon();

	$options->setTargetLanguage( $language );
	$options->disableContentConversion( false );
	$options->disableTitleConversion( false );

	return $options;
}
114
/**
 * Parses the page's main-slot text with the Parser and runs the result
 * through the default output pipeline.
 *
 * The revision is the one returned by RevisionLookup::getRevisionByTitle().
 * Content that is not a TextContent is parsed as an empty string.
 *
 * Side effect: overwrites the global $wgDefaultLanguageVariant with the
 * target variant's code; the previous value is not restored.
 *
 * @param Title $pageTitle Page to parse
 * @param Language $baseLanguage Page language; its parent language is used as the parse target
 * @param Language $targetVariant Variant whose code becomes the default language variant
 * @return ParserOutput Output of the default output pipeline
 */
private function getParserOutput(
	Title $pageTitle,
	Language $baseLanguage,
	Language $targetVariant
): ParserOutput {
	// We update the default language variant because we want Parser to
	// perform variant conversion to it. The `global` declaration is required:
	// without it the assignment only creates an unused local variable.
	global $wgDefaultLanguageVariant;
	$wgDefaultLanguageVariant = $targetVariant->getCode();

	$mwInstance = $this->getServiceContainer();

	$languageFactory = $mwInstance->getLanguageFactory();
	$parser = $mwInstance->getParser();
	$parserOptions = $this->getParserOptions(
		$languageFactory->getParentLanguage( $baseLanguage )
	);

	// Fail with a readable message instead of a null dereference when the
	// lookup yields no revision.
	$revision = $mwInstance->getRevisionLookup()->getRevisionByTitle( $pageTitle );
	if ( $revision === null ) {
		$this->fatalError( 'No revision found for the requested page' );
	}

	$content = $revision->getContent( SlotRecord::MAIN );
	$wikiContent = ( $content instanceof TextContent ) ? $content->getText() : '';

	$po = $parser->parse( $wikiContent, $pageTitle, $parserOptions );
	// TODO T371008 consider if using the Content framework makes sense instead of creating the pipeline
	$pipeline = $mwInstance->getDefaultOutputPipeline();
	$options = [ 'deduplicateStyles' => false ];
	return $pipeline->run( $po, $parserOptions, $options );
}
144
/**
 * Renders the page through HtmlOutputRendererHelper with variant conversion
 * to the target variant, then runs the result through the default output
 * pipeline.
 *
 * @param Title $pageTitle Page to render
 * @param Bcp47Code $targetVariant Variant passed to setVariantConversionLanguage()
 * @param User $user User handed to the helper
 * @return ParserOutput Output of the default output pipeline
 */
private function getParsoidOutput(
	Title $pageTitle,
	Bcp47Code $targetVariant,
	User $user
): ParserOutput {
	$anonOptions = ParserOptions::newFromAnon();

	$helperParams = [
		'stash' => false,
		'flavor' => 'view',
	];
	$helper = $this->newPageRestHelperFactory()->newHtmlOutputRendererHelper(
		$pageTitle,
		$helperParams,
		$user,
		null,
		false,
		$anonOptions
	);
	$helper->setVariantConversionLanguage( $targetVariant );

	$rendered = $helper->getHtml();

	// Same pipeline settings as the Parser path, so both outputs are
	// post-processed alike.
	return $this->getServiceContainer()
		->getDefaultOutputPipeline()
		->run( $rendered, $anonOptions, [ 'deduplicateStyles' => false ] );
}
162
/**
 * Splits markup into whitespace-separated words after stripping tags.
 *
 * @param string $output Markup or plain text
 * @return string[] Non-empty tokens in their original order
 */
private function getWords( string $output ): array {
	$plainText = trim( strip_tags( $output ) );

	// PREG_SPLIT_NO_EMPTY drops empty tokens, so empty input yields [].
	return preg_split( '/\s+/', $plainText, -1, PREG_SPLIT_NO_EMPTY );
}
168
/**
 * Extracts the text content of the first <body> element of an HTML string.
 *
 * @param string $output HTML markup
 * @return string Text content of the <body> element; the input itself when
 *  it is empty or when no <body> element is found after parsing
 */
private function getBody( string $output ): string {
	// DOMDocument::loadHTML() throws a ValueError for an empty source on
	// PHP 8, and the @ operator cannot silence exceptions, so bail out early.
	if ( $output === '' ) {
		return $output;
	}

	$dom = new DOMDocument();
	// NOTE(review): loadHTML() is given no explicit encoding here; this
	// presumably relies on the markup declaring its own charset - confirm.
	// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
	@$dom->loadHTML( $output );
	$body = $dom->getElementsByTagName( 'body' )->item( 0 );
	if ( $body === null ) {
		// Body element not present
		return $output;
	}

	return $body->textContent;
}
181
/**
 * Prints word counts, a similarity summary and a word-level diff for the
 * two renderings.
 *
 * Only the Parsoid text is passed through getBody() before being split
 * into words; the Parser text is split as is.
 *
 * @param string $parserText Output text from the Parser path
 * @param string $parsoidText Output text from the Parsoid path
 * @param string $converterUsed Converter label shown in the diff table header
 */
private function compareOutput(
	string $parserText,
	string $parsoidText,
	string $converterUsed
): void {
	$parsoidBody = $this->getBody( $parsoidText );
	$parsoidWords = $this->getWords( $parsoidBody );
	$parserWords = $this->getWords( $parserText );

	$this->output(
		'Word count: Parsoid: ' . count( $parsoidWords ) .
		'; Parser: ' . count( $parserWords ) . "\n"
	);

	$this->outputSimilarity( $parsoidWords, $parserWords );
	// outputSimilarity() ends without a newline, so terminate its line here.
	$this->output( "\n" );
	$this->outputDiff( $parsoidWords, $parserWords, $converterUsed );
}
198
/**
 * Reports which language converter produced the Parsoid output, based on
 * whether its raw text contains the core LanguageConverter marker string.
 *
 * @param ParserOutput $parsoidOutput Output whose raw text is inspected
 * @return string 'Core LanguageConverter' when the marker is present,
 *  'Parsoid LanguageConverter' otherwise
 */
private function getConverterUsed( ParserOutput $parsoidOutput ): string {
	$marker = 'Variant conversion performed using the core LanguageConverter';

	// strpos() returns 0 for a match at the very start of the text, which is
	// falsy; compare against false so that case is not misreported.
	$isCoreConverterUsed = strpos( $parsoidOutput->getRawText(), $marker ) !== false;

	return $isCoreConverterUsed
		? 'Core LanguageConverter'
		: 'Parsoid LanguageConverter';
}
211
/**
 * sprintf() wrapper that keeps `%Ns` / `%-Ns` columns aligned for multibyte
 * strings.
 *
 * sprintf() pads by bytes, so each width is enlarged by the difference
 * between the byte length and the character length of its argument.
 * Arguments are consumed one per width-carrying `s` specifier, in order.
 *
 * Inspired from: https://stackoverflow.com/a/55927237/903324
 *
 * @param string $format Format string
 * @param string ...$args Values to format
 * @return string
 */
private function mb_sprintf( string $format, string ...$args ): string {
	// Work on a copy: the callback consumes it while $args stays intact
	// for the final sprintf() call.
	$pending = $args;

	$widenWidth = static function ( array $matches ) use ( &$pending ) {
		$value = array_shift( $pending );
		$extraBytes = strlen( $value ) - mb_strlen( $value );

		return (string)( $extraBytes + $matches[0] );
	};

	$byteAwareFormat = preg_replace_callback( '/(?<=%|%-)\d+(?=s)/', $widenWidth, $format );

	return sprintf( $byteAwareFormat, ...$args );
}
229
/**
 * Prints the total length of each word list joined by single spaces and
 * their similar_text() similarity.
 *
 * Note: strlen() and similar_text() both work on bytes, so for multibyte
 * text the "characters" reported here are byte counts. The final line is
 * printed without a trailing newline.
 *
 * @param string[] $parsoidWords Words from the Parsoid output
 * @param string[] $parserWords Words from the Parser output
 */
private function outputSimilarity( array $parsoidWords, array $parserWords ): void {
	$parsoidText = implode( ' ', $parsoidWords );
	$parserText = implode( ' ', $parserWords );

	$parsoidLength = strlen( $parsoidText );
	$parserLength = strlen( $parserText );
	$this->output(
		'Total characters: Parsoid: ' . $parsoidLength .
		'; Parser: ' . $parserLength . "\n"
	);

	// similar_text() writes the percentage into its third argument by reference.
	$similarityPercent = 0;
	$similarCharacters = similar_text( $parsoidText, $parserText, $similarityPercent );
	$similarityPercent = round( $similarityPercent, 2 );

	$this->output(
		"Similarity via similar_text(): $similarityPercent%; Similar characters: $similarCharacters"
	);
}
246
/**
 * Prints a table with the word-level differences between the two outputs.
 *
 * Calls fatalError() when building the Diff raises a ComplexityException.
 *
 * @param string[] $parsoidWords Words from the Parsoid output (the "old" side of the diff)
 * @param string[] $parserWords Words from the Parser output (the "new" side of the diff)
 * @param string $converterUsed Converter label shown under the Parsoid column header
 */
private function outputDiff( array $parsoidWords, array $parserWords, string $converterUsed ): void {
	$rule = str_repeat( '-', 96 ) . "\n";
	$rowFormat = "| %5s | %-35s | %-35s | %-8s |\n";

	// Header: column titles, then the converter label under the Parsoid column.
	$out = $rule
		. sprintf( $rowFormat, 'Line', 'Parsoid', 'Parser', 'Diff' )
		. sprintf( $rowFormat, '', "($converterUsed)", '', '' )
		. $rule;

	try {
		$diff = new Diff( $parsoidWords, $parserWords );
	} catch ( ComplexityException $e ) {
		$this->output( $e->getMessage() );
		$this->fatalError( 'Encountered ComplexityException while computing diff' );
	}

	// One table row per entry reported by the formatter.
	$entries = ( new ArrayDiffFormatter() )->format( $diff );
	foreach ( $entries as $index => $entry ) {
		$action = $entry['action'];
		$lineNumber = str_pad( (string)( $index + 1 ), 5, ' ', STR_PAD_LEFT );
		// A missing side is shown as a placeholder; long words are truncated
		// to the column width.
		$parsoidCell = mb_strimwidth( $entry['old'] ?? '- N/A -', 0, 35, '…' );
		$parserCell = mb_strimwidth( $entry['new'] ?? '- N/A -', 0, 35, '…' );

		$out .= $this->mb_sprintf( $rowFormat, $lineNumber, $parsoidCell, $parserCell, $action );
	}

	// Print the footer.
	$out .= $rule;
	$this->output( "\n" . $out );
}
280}
281
282// @codeCoverageIgnoreStart
283$maintClass = CompareLanguageConverterOutput::class;
284require_once RUN_MAINTENANCE_IF_MAIN;
285// @codeCoverageIgnoreEnd
Maintenance script that compares variant conversion output between Parser and HtmlOutputRendererHelpe...
execute()
Do the actual work. All child classes will need to implement this. Returns bool|null|void: True for success,...
A class for passing options to services.
Content object implementation for representing flat text.
Base class for language-specific code.
Definition Language.php:68
getCode()
Get the internal language code for this language object.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
getArg( $argId=0, $default=null)
Get an argument.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
getServiceContainer()
Returns the main service container.
addDescription( $text)
Set the description text.
Set options of the Parser.
setTargetLanguage( $x)
Target language for the parse.
ParserOutput is a rendering of a Content object or a message.
getContentHolderText()
Returns the body fragment text of the ParserOutput.
Value object representing a content slot associated with a page revision.
Represents a title within MediaWiki.
Definition Title.php:69
getPageLanguage()
Get the language in which the content of this page is written in wikitext.
Definition Title.php:3556
exists( $flags=0)
Check if page exists.
Definition Title.php:3129
User class for the MediaWiki software.
Definition User.php:130
A pseudo-formatter that just passes along the Diff::$edits array.
Class representing a 'diff' between two sequences of strings.
Definition Diff.php:20
This is the primary interface for validating metrics definitions, caching defined metrics,...
$wgDefaultLanguageVariant
Config variable stub for the DefaultLanguageVariant setting, for use by phpdoc and IDEs.
array $params
The job parameters.