MediaWiki 1.40.4
compareLanguageConverterOutput.php
Go to the documentation of this file.
1<?php
29use Wikimedia\Bcp47Code\Bcp47Code;
30
31require_once __DIR__ . '/Maintenance.php';
32
/**
 * Registers the script description and its two required positional
 * arguments: the page to render and the variant to convert it to.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription(
		'Compares variant conversion output between Parser and HtmlOutputRendererHelper'
	);

	// The third argument marks each positional argument as required.
	$this->addArg( 'page-title', 'Name of the page to be parsed and compared', true );
	$this->addArg( 'target-variant', 'Target variant language code to transform the content to', true );
}
54
/**
 * Script entry point: validates both arguments, renders the page through
 * the Parser and through HtmlOutputRendererHelper, and prints a comparison
 * of the two results.
 *
 * @return bool Always true; invalid arguments end the script via fatalError()
 */
public function execute() {
	$services = MediaWikiServices::getInstance();

	// Resolve the page and make sure it actually exists.
	$pageName = $this->getArg( 'page-title' );
	$pageTitle = Title::newFromText( $pageName );
	$titleIsUsable = $pageTitle && $pageTitle->exists();
	if ( !$titleIsUsable ) {
		$this->fatalError( "Title with name $pageName not found" );
	}

	// Resolve the requested variant into a Language object.
	$targetVariantCode = $this->getArg( 'target-variant' );
	if ( !$services->getLanguageNameUtils()->isValidBuiltInCode( $targetVariantCode ) ) {
		$this->fatalError( "$targetVariantCode is not a supported variant" );
	}
	$targetVariant = $services->getLanguageFactory()->getLanguage( $targetVariantCode );

	$user = User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [ 'steal' => true ] );
	$baseLanguage = $pageTitle->getPageLanguage();

	// Render with both code paths, then compare what they produced.
	$parserOutput = $this->getParserOutput( $pageTitle, $baseLanguage, $targetVariant );
	$parsoidOutput = $this->getParsoidOutput( $pageTitle, $targetVariant, $user );

	$this->compareOutput(
		$parserOutput,
		$parsoidOutput,
		$this->getConverterUsed( $parsoidOutput )
	);

	return true;
}
85
/**
 * Builds an HtmlOutputRendererHelper wired up with services taken from the
 * global MediaWikiServices instance.
 *
 * NOTE(review): confirm that this argument list still matches the signature
 * of HtmlOutputRendererHelper::__construct() - verify against the class.
 *
 * @return HtmlOutputRendererHelper
 */
private function newHtmlOutputRendererHelper(): HtmlOutputRendererHelper {
	$mwServices = MediaWikiServices::getInstance();

	return new HtmlOutputRendererHelper(
		$mwServices->getParsoidOutputStash(),
		$mwServices->getParsoidOutputAccess(),
		$mwServices->getHtmlTransformFactory(),
		$mwServices->getContentHandlerFactory(),
		$mwServices->getLanguageFactory()
	);
}
99
/**
 * Creates anonymous-user parser options targeting the given language.
 *
 * @param Language $language Language set as the target language of the parse
 * @return ParserOptions
 */
private function getParserOptions( Language $language ): ParserOptions {
	$options = ParserOptions::newFromAnon();

	$options->setTargetLanguage( $language );

	// Both "disable" switches are explicitly passed false
	// (presumably leaving conversion enabled - confirm against ParserOptions).
	$options->disableContentConversion( false );
	$options->disableTitleConversion( false );

	return $options;
}
108
/**
 * Parses the current main-slot text of a page with the Parser service.
 *
 * @param Title $pageTitle Page whose latest revision is parsed
 * @param Language $baseLanguage Page language, used to look up the parent language
 * @param Language $targetVariant Variant the Parser should convert to
 * @return ParserOutput
 */
private function getParserOutput(
	Title $pageTitle,
	Language $baseLanguage,
	Language $targetVariant
): ParserOutput {
	// The assignment below must reach the global configuration variable;
	// without this declaration it would only create an unused local.
	global $wgDefaultLanguageVariant;

	// We update the default language variant because we want Parser to
	// perform variant conversion to it.
	$wgDefaultLanguageVariant = $targetVariant->getCode();

	$mwInstance = MediaWikiServices::getInstance();

	$languageFactory = $mwInstance->getLanguageFactory();
	$parser = $mwInstance->getParser();

	// NOTE(review): getParentLanguage() is documented as nullable; fall back
	// to the page language so getParserOptions() never receives null.
	$parentLanguage = $languageFactory->getParentLanguage( $baseLanguage ) ?? $baseLanguage;
	$parserOptions = $this->getParserOptions( $parentLanguage );

	$content = $mwInstance->getRevisionLookup()
		->getRevisionByTitle( $pageTitle )
		->getContent( SlotRecord::MAIN );
	// Only text-based content can be fed to the Parser; anything else is
	// parsed as empty wikitext.
	$wikiContent = ( $content instanceof TextContent ) ? $content->getText() : '';

	return $parser->parse( $wikiContent, $pageTitle, $parserOptions );
}
134
/**
 * Renders the page through HtmlOutputRendererHelper with variant conversion
 * to the requested language.
 *
 * @param Title $pageTitle Page to render
 * @param Bcp47Code $targetVariant Variant to convert the output to
 * @param User $user User on whose behalf the helper is initialised
 * @return ParserOutput
 */
private function getParsoidOutput(
	Title $pageTitle,
	Bcp47Code $targetVariant,
	User $user
): ParserOutput {
	$helperParams = [
		'stash' => false,
		'flavor' => 'view',
	];

	$helper = $this->newHtmlOutputRendererHelper();
	$helper->init( $pageTitle, $helperParams, $user );
	$helper->setVariantConversionLanguage( $targetVariant );

	return $helper->getHtml();
}
149
/**
 * Strips all tags from a piece of markup and splits the remaining text
 * into whitespace-separated words.
 *
 * @param string $output Markup or plain text
 * @return string[] Words in order of appearance; empty pieces are dropped
 */
private function getWords( string $output ): array {
	$plainText = trim( strip_tags( $output ) );

	return preg_split( '/\s+/', $plainText, -1, PREG_SPLIT_NO_EMPTY );
}
155
/**
 * Extracts the text content of the <body> element of an HTML document.
 *
 * @param string $output HTML markup to parse
 * @return string Text content of the first <body> element, or $output
 *  unchanged when it is empty or no <body> element can be found
 */
private function getBody( string $output ): string {
	// DOMDocument::loadHTML() rejects an empty source (ValueError on PHP 8,
	// which the @ operator cannot silence), and there is nothing to extract.
	if ( $output === '' ) {
		return $output;
	}

	$dom = new DOMDocument();

	// Collect libxml parse warnings internally rather than suppressing them
	// with @; the previous libxml setting is restored afterwards.
	$previousSetting = libxml_use_internal_errors( true );
	try {
		$dom->loadHTML( $output );
	} finally {
		libxml_clear_errors();
		libxml_use_internal_errors( $previousSetting );
	}

	$body = $dom->getElementsByTagName( 'body' )->item( 0 );
	if ( $body === null ) {
		// Body element not present
		return $output;
	}

	return $body->textContent;
}
168
/**
 * Prints word counts, a similarity summary and a word-level diff for the
 * two rendered outputs.
 *
 * @param ParserOutput $parserOutput Output produced by the Parser
 * @param ParserOutput $parsoidOutput Output produced by HtmlOutputRendererHelper
 * @param string $converterUsed Label shown in the Parsoid column header
 */
private function compareOutput(
	ParserOutput $parserOutput,
	ParserOutput $parsoidOutput,
	string $converterUsed
): void {
	$textOptions = [ 'deduplicateStyles' => false ];
	$parsoidText = $parsoidOutput->getText( $textOptions );
	$parserText = $parserOutput->getText( $textOptions );

	// Only the Parsoid text is reduced to its <body> text first; the Parser
	// text is split into words directly.
	$parsoidBody = $this->getBody( $parsoidText );
	$parsoidWords = $this->getWords( $parsoidBody );
	$parserWords = $this->getWords( $parserText );

	$parserWordCount = count( $parserWords );
	$parsoidWordCount = count( $parsoidWords );
	$this->output( "Word count: Parsoid: $parsoidWordCount; Parser: $parserWordCount\n" );

	$this->outputSimilarity( $parsoidWords, $parserWords );
	$this->output( "\n" );
	$this->outputDiff( $parsoidWords, $parserWords, $converterUsed );
}
188
/**
 * Determines which language converter produced the Parsoid output by
 * looking for a marker string in its raw text.
 *
 * @param ParserOutput $parsoidOutput Output returned by HtmlOutputRendererHelper
 * @return string 'Core LanguageConverter' when the marker is present,
 *  'Parsoid LanguageConverter' otherwise
 */
private function getConverterUsed( ParserOutput $parsoidOutput ): string {
	$marker = 'Variant conversion performed using the core LanguageConverter';

	// strpos() returns offset 0 for a match at the very start of the text,
	// which is falsy, so the result must be compared against false.
	if ( strpos( $parsoidOutput->getRawText(), $marker ) !== false ) {
		return 'Core LanguageConverter';
	}

	return 'Parsoid LanguageConverter';
}
201
/**
 * Multibyte-aware sprintf() for width-qualified string conversions
 * ("%5s", "%-35s").
 *
 * sprintf() pads by bytes, so each width is enlarged by the difference
 * between the argument's byte length and its display width. Display width
 * (mb_strwidth) is used rather than character count so that double-width
 * characters line up with values truncated by mb_strimwidth().
 *
 * Arguments are matched to width-qualified %s conversions in order, so the
 * format should contain only such conversions.
 *
 * Inspired from: https://stackoverflow.com/a/55927237/903324
 *
 * @param string $format sprintf() format string
 * @param mixed ...$args Values to format
 * @return string
 */
private function mb_sprintf( string $format, ...$args ): string {
	$params = $args;

	$adjustedFormat = preg_replace_callback(
		'/(?<=%|%-)\d+(?=s)/',
		static function ( array $matches ) use ( &$params ) {
			// Cast so that an exhausted argument list (null) or a
			// non-string value does not reach the string functions.
			$value = (string)array_shift( $params );
			$extraBytes = strlen( $value ) - mb_strwidth( $value );

			return (string)( $extraBytes + (int)$matches[0] );
		},
		$format
	);

	return sprintf( $adjustedFormat, ...$args );
}
219
/**
 * Prints the total length of both word lists (joined by single spaces) and
 * their similarity as computed by similar_text().
 *
 * strlen() and similar_text() both operate on bytes, so the reported
 * figures are byte counts.
 *
 * @param string[] $parsoidWords Words extracted from the Parsoid output
 * @param string[] $parserWords Words extracted from the Parser output
 */
private function outputSimilarity( array $parsoidWords, array $parserWords ): void {
	$parsoidJoined = implode( ' ', $parsoidWords );
	$parserJoined = implode( ' ', $parserWords );

	$parsoidLength = strlen( $parsoidJoined );
	$parserLength = strlen( $parserJoined );
	$this->output( 'Total characters: Parsoid: ' . $parsoidLength . '; Parser: ' . $parserLength . "\n" );

	// similar_text() reports the percentage through its third, by-reference
	// parameter and returns the number of matching characters.
	$rawPercent = 0;
	$similarCharacters = similar_text( $parsoidJoined, $parserJoined, $rawPercent );
	$similarityPercent = round( $rawPercent, 2 );

	$this->output(
		"Similarity via similar_text(): $similarityPercent%; Similar characters: $similarCharacters"
	);
}
236
/**
 * Prints a table with one row per entry of the word-level diff between
 * the Parsoid and the Parser output.
 *
 * @param string[] $parsoidWords Words extracted from the Parsoid output
 * @param string[] $parserWords Words extracted from the Parser output
 * @param string $converterUsed Label shown below the "Parsoid" column header
 */
private function outputDiff( array $parsoidWords, array $parserWords, string $converterUsed ): void {
	$rowFormat = "| %5s | %-35s | %-35s | %-8s |\n";
	$rule = str_repeat( '-', 96 ) . "\n";

	try {
		$diff = new Diff( $parsoidWords, $parserWords );
	} catch ( ComplexityException $e ) {
		// Without a Diff there is nothing to format; stop here instead of
		// continuing with an undefined $diff.
		$this->output( $e->getMessage() );
		$this->fatalError( 'Encountered ComplexityException while computing diff' );
	}

	// Table header.
	$out = $rule;
	$out .= sprintf( $rowFormat, 'Line', 'Parsoid', 'Parser', 'Diff' );
	$out .= sprintf( $rowFormat, '', "($converterUsed)", '', '' );
	$out .= $rule;

	// Print the difference between the words
	$wordDiffFormat = ( new ArrayDiffFormatter() )->format( $diff );
	foreach ( $wordDiffFormat as $index => $wordDiff ) {
		$action = $wordDiff['action'];
		// An entry may lack either side; show a placeholder for it.
		$old = $wordDiff['old'] ?? null;
		$new = $wordDiff['new'] ?? null;

		$out .= $this->mb_sprintf(
			$rowFormat,
			str_pad( (string)( $index + 1 ), 5, ' ', STR_PAD_LEFT ),
			mb_strimwidth( $old ?? '- N/A -', 0, 35, '…' ),
			mb_strimwidth( $new ?? '- N/A -', 0, 35, '…' ),
			$action
		);
	}

	// Print the footer.
	$out .= $rule;
	$this->output( "\n" . $out );
}
270}
271
// Name the class implementing this script, then include the file referenced
// by RUN_MAINTENANCE_IF_MAIN (presumably the maintenance runner - confirm).
$maintClass = CompareLanguageConverterOutput::class;
require_once RUN_MAINTENANCE_IF_MAIN;
A pseudo-formatter that just passes along the Diff::$edits array.
Maintenance script that compares variant conversion output between Parser and HtmlOutputRendererHelper.
Class representing a 'diff' between two sequences of strings.
Definition Diff.php:32
Base class for language-specific code.
Definition Language.php:56
getCode()
Get the internal language code for this language object.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
getArg( $argId=0, $default=null)
Get an argument.
addDescription( $text)
Set the description text.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
Service locator for MediaWiki core services.
Helper for getting output of a given wikitext page rendered by parsoid.
Value object representing a content slot associated with a page revision.
Represents a title within MediaWiki.
Definition Title.php:82
getPageLanguage()
Get the language in which the content of this page is written in wikitext.
Definition Title.php:3984
exists( $flags=0)
Check if page exists.
Definition Title.php:3523
Set options of the Parser.
setTargetLanguage( $x)
Target language for the parse.
getText( $options=[])
Get the output HTML.
Content object implementation for representing flat text.
internal since 1.36
Definition User.php:71
static newSystemUser( $name, $options=[])
Static factory method for creation of a "system" user from username.
Definition User.php:793
const MAINTENANCE_SCRIPT_USER
Username used for various maintenance scripts.
Definition User.php:117
$wgDefaultLanguageVariant
Config variable stub for the DefaultLanguageVariant setting, for use by phpdoc and IDEs.
$content
Definition router.php:76