MediaWiki master
compareLanguageConverterOutput.php
Go to the documentation of this file.
1<?php
28use Wikimedia\Bcp47Code\Bcp47Code;
32
33require_once __DIR__ . '/Maintenance.php';
34
/**
 * Sets the script description and registers its two positional arguments,
 * both of which are required.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( 'Compares variant conversion output between Parser and HtmlOutputRendererHelper' );

	// Argument name => help text, in the order they are expected on the
	// command line. The third addArg() parameter marks each as required.
	$requiredArgs = [
		'page-title' => 'Name of the page to be parsed and compared',
		'target-variant' => 'Target variant language code to transform the content to',
	];
	foreach ( $requiredArgs as $argName => $argDescription ) {
		$this->addArg( $argName, $argDescription, true );
	}
}
56
/**
 * Script entry point.
 *
 * Resolves the page title and the target variant from the command-line
 * arguments, renders the page through both the core Parser and
 * HtmlOutputRendererHelper, and prints a comparison of the two results.
 * Aborts through fatalError() when the title cannot be resolved, does not
 * exist, or when the variant code fails isValidBuiltInCode().
 *
 * @return bool Always true when the comparison has been printed
 */
public function execute() {
	$services = $this->getServiceContainer();

	$pageName = $this->getArg( 'page-title' );
	$pageTitle = Title::newFromText( $pageName );
	$titleIsUsable = $pageTitle && $pageTitle->exists();
	if ( !$titleIsUsable ) {
		$this->fatalError( "Title with name $pageName not found" );
	}

	$targetVariantCode = $this->getArg( 'target-variant' );
	if ( !$services->getLanguageNameUtils()->isValidBuiltInCode( $targetVariantCode ) ) {
		$this->fatalError( "$targetVariantCode is not a supported variant" );
	}
	$targetVariant = $services->getLanguageFactory()->getLanguage( $targetVariantCode );

	$user = User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [ 'steal' => true ] );
	$baseLanguage = $pageTitle->getPageLanguage();

	// Render through both code paths, then compare the results.
	$parserOutput = $this->getParserOutput( $pageTitle, $baseLanguage, $targetVariant );
	$parsoidOutput = $this->getParsoidOutput( $pageTitle, $targetVariant, $user );

	$this->compareOutput(
		$parserOutput,
		$parsoidOutput,
		$this->getConverterUsed( $parsoidOutput )
	);

	return true;
}
87
/**
 * Builds a fresh HtmlOutputRendererHelper from services in the main
 * service container.
 *
 * NOTE(review): confirm that the argument list below matches the
 * HtmlOutputRendererHelper constructor signature; this listing may have
 * lost an argument between the output stash and the output access service.
 *
 * @return HtmlOutputRendererHelper
 */
private function newHtmlOutputRendererHelper(): HtmlOutputRendererHelper {
	$container = $this->getServiceContainer();

	return new HtmlOutputRendererHelper(
		$container->getParsoidOutputStash(),
		$container->getParsoidOutputAccess(),
		$container->getHtmlTransformFactory(),
		$container->getContentHandlerFactory(),
		$container->getLanguageFactory()
	);
}
101
/**
 * Creates anonymous-user parser options targeting the given language, with
 * false passed explicitly to both the content and the title conversion
 * "disable" switches.
 *
 * @param Language $language Target language for the parse
 * @return ParserOptions
 */
private function getParserOptions( Language $language ): ParserOptions {
	$options = ParserOptions::newFromAnon();

	$options->setTargetLanguage( $language );
	$options->disableContentConversion( false );
	$options->disableTitleConversion( false );

	return $options;
}
110
/**
 * Parses the main-slot text of the page with the core Parser after setting
 * the default language variant to the target variant.
 *
 * @param Title $pageTitle Page whose current revision is parsed
 * @param Language $baseLanguage Page language; its parent language is used
 *  as the parser's target language
 * @param Language $targetVariant Variant the Parser should convert to
 * @return ParserOutput
 */
private function getParserOutput(
	Title $pageTitle,
	Language $baseLanguage,
	Language $targetVariant
): ParserOutput {
	// We update the default language variant because we want Parser to
	// perform variant conversion to it. The variable has to be imported from
	// the global scope first: without the declaration the assignment below
	// only creates a function-local variable that nothing ever reads.
	global $wgDefaultLanguageVariant;
	$wgDefaultLanguageVariant = $targetVariant->getCode();

	$mwInstance = $this->getServiceContainer();

	$languageFactory = $mwInstance->getLanguageFactory();
	$parser = $mwInstance->getParser();
	$parserOptions = $this->getParserOptions(
		$languageFactory->getParentLanguage( $baseLanguage )
	);

	// Guard against a missing revision instead of calling a method on null.
	$revision = $mwInstance->getRevisionLookup()->getRevisionByTitle( $pageTitle );
	$content = $revision ? $revision->getContent( SlotRecord::MAIN ) : null;

	// Anything that is not text content (or is missing) is parsed as an
	// empty string.
	$wikiContent = ( $content instanceof TextContent ) ? $content->getText() : '';

	return $parser->parse( $wikiContent, $pageTitle, $parserOptions );
}
136
/**
 * Renders the page through HtmlOutputRendererHelper with variant conversion
 * to the given language.
 *
 * @param Title $pageTitle Page to render
 * @param Bcp47Code $targetVariant Variant to convert the output to
 * @param User $user User passed to the helper's init() call
 * @return ParserOutput
 */
private function getParsoidOutput(
	Title $pageTitle,
	Bcp47Code $targetVariant,
	User $user
): ParserOutput {
	// Request the 'view' flavor without stashing.
	$renderParams = [
		'stash' => false,
		'flavor' => 'view',
	];

	$helper = $this->newHtmlOutputRendererHelper();
	$helper->init( $pageTitle, $renderParams, $user );
	$helper->setVariantConversionLanguage( $targetVariant );

	return $helper->getHtml();
}
151
/**
 * Splits markup into its whitespace-separated words.
 *
 * @param string $output Text, possibly containing HTML tags
 * @return string[] Words in order of appearance, with tags removed and no
 *  empty entries
 */
private function getWords( string $output ): array {
	$plainText = trim( strip_tags( $output ) );

	return preg_split( '/\s+/', $plainText, -1, PREG_SPLIT_NO_EMPTY );
}
157
/**
 * Extracts the text content of the first <body> element of an HTML string.
 *
 * @param string $output HTML markup
 * @return string Text content of the <body> element; the input unchanged
 *  when it is empty or when no <body> element is found
 */
private function getBody( string $output ): string {
	// DOMDocument::loadHTML() refuses an empty source (a ValueError on
	// PHP 8, which "@" cannot suppress), and there is no body in it anyway.
	if ( $output === '' ) {
		return $output;
	}

	$dom = new DOMDocument();

	// Collect libxml's parse warnings internally rather than silencing them
	// with "@", and restore the previous setting afterwards so that other
	// callers are unaffected.
	$previousSetting = libxml_use_internal_errors( true );
	try {
		$dom->loadHTML( $output );
	} finally {
		libxml_clear_errors();
		libxml_use_internal_errors( $previousSetting );
	}

	$body = $dom->getElementsByTagName( 'body' )->item( 0 );
	if ( $body === null ) {
		// Body element not present
		return $output;
	}

	return $body->textContent;
}
170
/**
 * Prints the word counts, the similarity figures and the word-level diff
 * table for the two renderings.
 *
 * The Parsoid text is reduced to the text content of its <body> element
 * before being split into words; the Parser text is split directly.
 *
 * @param ParserOutput $parserOutput Output of the core Parser
 * @param ParserOutput $parsoidOutput Output of HtmlOutputRendererHelper
 * @param string $converterUsed Label shown in the diff table header
 */
private function compareOutput(
	ParserOutput $parserOutput,
	ParserOutput $parsoidOutput,
	string $converterUsed
): void {
	$textOptions = [ 'deduplicateStyles' => false ];
	$parsoidText = $parsoidOutput->getText( $textOptions );
	$parserText = $parserOutput->getText( $textOptions );

	$parsoidWords = $this->getWords( $this->getBody( $parsoidText ) );
	$parserWords = $this->getWords( $parserText );

	$parsoidWordCount = count( $parsoidWords );
	$parserWordCount = count( $parserWords );
	$this->output( "Word count: Parsoid: $parsoidWordCount; Parser: $parserWordCount\n" );

	$this->outputSimilarity( $parsoidWords, $parserWords );
	// outputSimilarity() does not terminate its last line.
	$this->output( "\n" );
	$this->outputDiff( $parsoidWords, $parserWords, $converterUsed );
}
190
/**
 * Reports which language converter produced the given output, based on a
 * marker string looked up in its raw text.
 *
 * @param ParserOutput $parsoidOutput Output to inspect
 * @return string 'Core LanguageConverter' when the marker is present,
 *  'Parsoid LanguageConverter' otherwise
 */
private function getConverterUsed( ParserOutput $parsoidOutput ): string {
	// strpos() returns the integer offset of the match, which is 0 (falsy)
	// when the marker sits at the very start of the text, so the result must
	// be compared against false explicitly.
	$markerFound = strpos(
		$parsoidOutput->getRawText(),
		'Variant conversion performed using the core LanguageConverter'
	) !== false;

	return $markerFound ? 'Core LanguageConverter' : 'Parsoid LanguageConverter';
}
203
// Inspired from: https://stackoverflow.com/a/55927237/903324
/**
 * sprintf() variant whose "%Ns" / "%-Ns" placeholders pad to N display
 * columns instead of N bytes.
 *
 * sprintf() pads by byte count, so each width is enlarged by the difference
 * between the argument's byte length and its display width. The display
 * width comes from mb_strwidth(), which counts East Asian wide characters
 * as two columns; this matches mb_strimwidth(), which callers use to trim
 * the same values.
 *
 * Arguments are consumed one per width-bearing string placeholder, in
 * order, so the adjustment lines up only when every argument corresponds to
 * such a placeholder.
 *
 * @param string $format sprintf()-style format string
 * @param mixed ...$args Values to format
 * @return string
 */
private function mb_sprintf( string $format, ...$args ): string {
	$pending = $args;

	$adjustedFormat = preg_replace_callback(
		'/(?<=%|%-)\d+(?=s)/',
		static function ( array $matches ) use ( &$pending ) {
			$value = (string)array_shift( $pending );
			// Extra bytes this value occupies beyond its visible width
			$byteSurplus = strlen( $value ) - mb_strwidth( $value );

			return (string)( (int)$matches[0] + $byteSurplus );
		},
		$format
	);

	// preg_replace_callback() yields null on a PCRE error; fall back to the
	// unadjusted format rather than passing null to sprintf().
	return sprintf( $adjustedFormat ?? $format, ...$args );
}
221
/**
 * Prints the total length of both word lists (joined with single spaces)
 * and their similarity as computed by similar_text().
 *
 * Both strlen() and similar_text() work on bytes, so for multi-byte text the
 * printed "characters" figures are byte counts. The last line printed is not
 * newline-terminated.
 *
 * @param string[] $parsoidWords Words from the Parsoid rendering
 * @param string[] $parserWords Words from the Parser rendering
 */
private function outputSimilarity( array $parsoidWords, array $parserWords ): void {
	$parsoidJoined = implode( ' ', $parsoidWords );
	$parserJoined = implode( ' ', $parserWords );

	$parsoidLength = strlen( $parsoidJoined );
	$parserLength = strlen( $parserJoined );
	$this->output(
		'Total characters: Parsoid: ' . $parsoidLength .
		'; Parser: ' . $parserLength . "\n"
	);

	// similar_text() writes the percentage into its third argument.
	$similarityPercent = 0;
	$similarCharacters = similar_text( $parsoidJoined, $parserJoined, $similarityPercent );
	$similarityPercent = round( $similarityPercent, 2 );

	$this->output(
		"Similarity via similar_text(): $similarityPercent%; Similar characters: $similarCharacters"
	);
}
238
/**
 * Prints a table with one row per entry of the word-level diff between the
 * two renderings.
 *
 * When the diff cannot be computed because of a ComplexityException, the
 * exception message and an error are reported and no table is printed.
 *
 * @param string[] $parsoidWords Words from the Parsoid rendering
 * @param string[] $parserWords Words from the Parser rendering
 * @param string $converterUsed Label shown under the "Parsoid" column header
 */
private function outputDiff( array $parsoidWords, array $parserWords, string $converterUsed ): void {
	try {
		$diff = new Diff( $parsoidWords, $parserWords );
	} catch ( ComplexityException $e ) {
		$this->output( $e->getMessage() );
		$this->error( 'Encountered ComplexityException while computing diff' );
		// No diff object exists on this path; carrying on would hand an
		// undefined variable to the formatter below.
		return;
	}

	$rule = str_repeat( '-', 96 ) . "\n";

	// Print the header.
	$out = $rule;
	$out .= sprintf( "| %5s | %-35s | %-35s | %-8s |\n", 'Line', 'Parsoid', 'Parser', 'Diff' );
	$out .= sprintf( "| %5s | %-35s | %-35s | %-8s |\n", '', "($converterUsed)", '', '' );
	$out .= $rule;

	// Print the difference between the words
	$wordDiffFormat = ( new ArrayDiffFormatter() )->format( $diff );
	foreach ( $wordDiffFormat as $index => $wordDiff ) {
		$action = $wordDiff['action'];
		// Either side may be absent from an entry; show a placeholder then.
		$old = $wordDiff['old'] ?? null;
		$new = $wordDiff['new'] ?? null;

		$out .= $this->mb_sprintf(
			"| %5s | %-35s | %-35s | %-8s |\n",
			str_pad( (string)( $index + 1 ), 5, ' ', STR_PAD_LEFT ),
			mb_strimwidth( $old ?? '- N/A -', 0, 35, '…' ),
			mb_strimwidth( $new ?? '- N/A -', 0, 35, '…' ),
			$action
		);
	}

	// Print the footer.
	$out .= $rule;
	$this->output( "\n" . $out );
}
272}
273
// Name the maintenance class defined in this file, then include the file
// referenced by RUN_MAINTENANCE_IF_MAIN (presumably the runner that executes
// $maintClass when this script is the entry point; verify against
// Maintenance.php).
$maintClass = CompareLanguageConverterOutput::class;
require_once RUN_MAINTENANCE_IF_MAIN;
array $params
The job parameters.
Maintenance script that compares variant conversion output between Parser and HtmlOutputRendererHelper.
Base class for language-specific code.
Definition Language.php:65
getCode()
Get the internal language code for this language object.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
getServiceContainer()
Returns the main service container.
getArg( $argId=0, $default=null)
Get an argument.
addDescription( $text)
Set the description text.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
Content object implementation for representing flat text.
ParserOutput is a rendering of a Content object or a message.
getText( $options=[])
Get the output HTML.
Helper for getting output of a given wikitext page rendered by parsoid.
Value object representing a content slot associated with a page revision.
Represents a title within MediaWiki.
Definition Title.php:78
getPageLanguage()
Get the language in which the content of this page is written in wikitext.
Definition Title.php:3579
exists( $flags=0)
Check if page exists.
Definition Title.php:3145
internal since 1.36
Definition User.php:93
Set options of the Parser.
setTargetLanguage( $x)
Target language for the parse.
A pseudo-formatter that just passes along the Diff::$edits array.
Class representing a 'diff' between two sequences of strings.
Definition Diff.php:34
$wgDefaultLanguageVariant
Config variable stub for the DefaultLanguageVariant setting, for use by phpdoc and IDEs.