36 parent::__construct();
37 $this->
addDescription(
'Compares variant conversion output between Parser and HtmlOutputRendererHelper' );
40 'Name of the page to be parsed and compared',
45 'Target variant language code to transform the content to',
// Resolve the page to compare from the 'page-title' argument.
54 $pageName = $this->
getArg(
'page-title' );
55 $pageTitle = Title::newFromText( $pageName );
// Abort when no Title could be built from the name or exists() reports false.
57 if ( !$pageTitle || !$pageTitle->
exists() ) {
58 $this->
fatalError(
"Title with name $pageName not found" );
// Read and validate the requested variant code.
// $mwInstance is assigned outside the visible lines.
61 $targetVariantCode = $this->
getArg(
'target-variant' );
62 $languageNameUtils = $mwInstance->getLanguageNameUtils();
63 if ( !$languageNameUtils->isValidBuiltInCode( $targetVariantCode ) ) {
64 $this->
fatalError(
"$targetVariantCode is not a supported variant" );
// The arguments of this getLanguage() call are outside the visible lines.
66 $targetVariant = $mwInstance->getLanguageFactory()->getLanguage(
// System user for the maintenance script; it is handed to getParsoidOutput() below.
70 $user = User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [
'steal' =>
true ] );
// Produce both renderings of the page. $baseLanguage is assigned outside the visible lines.
73 $parserOutput = $this->getParserOutput( $pageTitle, $baseLanguage, $targetVariant );
74 $parsoidOutput = $this->getParsoidOutput( $pageTitle, $targetVariant, $user );
75 $converterUsed = $this->getConverterUsed( $parsoidOutput );
// Compare the content-holder text of both outputs; the remaining arguments of
// this call are outside the visible lines.
77 $this->compareOutput( $parserOutput->getContentHolderText(), $parsoidOutput->
getContentHolderText(),
// Positional dependency list. $services and the call receiving these arguments
// are outside the visible lines.
// NOTE(review): presumably the PageRestHelperFactory constructor, given the
// CONSTRUCTOR_OPTIONS reference - confirm the order against its signature.
86 new ServiceOptions( PageRestHelperFactory::CONSTRUCTOR_OPTIONS, $services->getMainConfig() ),
87 $services->getRevisionLookup(),
88 $services->getRevisionRenderer(),
89 $services->getTitleFormatter(),
90 $services->getPageStore(),
91 $services->getParsoidOutputStash(),
92 $services->getParserOutputAccess(),
93 $services->getParsoidSiteConfig(),
94 $services->getHtmlTransformFactory(),
95 $services->getContentHandlerFactory(),
96 $services->getLanguageFactory(),
97 $services->getRedirectStore(),
98 $services->getLanguageConverterFactory(),
99 $services->getTitleFactory(),
100 $services->getConnectionProvider(),
101 $services->getChangeTagsStore(),
// A null StatsFactory is passed as the last visible argument.
102 StatsFactory::newNull()
// Both conversion switches are explicitly passed false.
// NOTE(review): presumably this leaves content and title conversion enabled -
// confirm against ParserOptions. $parserOpts is created outside the visible lines.
109 $parserOpts->disableContentConversion(
false );
110 $parserOpts->disableTitleConversion(
false );
/**
 * Parse the page's main-slot text with the parser service and run the result
 * through the default output pipeline.
 *
 * The parameter list and a few statements are outside the visible lines; the
 * visible body uses $pageTitle and $baseLanguage.
 */
115 private function getParserOutput(
125 $mwInstance = $this->getServiceContainer();
127 $languageFactory = $mwInstance->getLanguageFactory();
128 $parser = $mwInstance->getParser();
// Parser options are built from the parent language of $baseLanguage; any
// further arguments of getParserOptions() are outside the visible lines.
129 $parserOptions = $this->getParserOptions(
130 $languageFactory->getParentLanguage( $baseLanguage )
// Main-slot content of the revision looked up by title.
// NOTE(review): the getRevisionByTitle() result is dereferenced without a null
// check - confirm it cannot be null at this point.
133 $content = $mwInstance->getRevisionLookup()
134 ->getRevisionByTitle( $pageTitle )
135 ->getContent( SlotRecord::MAIN );
// Content that is not TextContent is parsed as an empty string.
136 $wikiContent = ( $content instanceof
TextContent ) ? $content->getText() :
'';
138 $po = $parser->parse( $wikiContent, $pageTitle, $parserOptions );
// The pipeline is run with the 'deduplicateStyles' option set to false.
140 $pipeline = $mwInstance->getDefaultOutputPipeline();
141 $options = [
'deduplicateStyles' => false ];
142 return $pipeline->run( $po, $parserOptions, $options );
/**
 * Render the page through an HtmlOutputRendererHelper with variant conversion
 * set to $targetVariant, then run the result through the default output pipeline.
 *
 * Parts of the signature, the helper's options array and the assignment of
 * $parserOptions are outside the visible lines; the visible body also uses
 * $pageTitle and $user.
 */
145 private function getParsoidOutput(
147 Bcp47Code $targetVariant,
// The contents of the array passed after $pageTitle are outside the visible lines.
151 $htmlOutputRendererHelper = $this->newPageRestHelperFactory()->newHtmlOutputRendererHelper( $pageTitle, [
154 ], $user,
null,
false, $parserOptions );
// Ask the helper to convert the content to the target variant.
155 $htmlOutputRendererHelper->setVariantConversionLanguage( $targetVariant );
157 $po = $htmlOutputRendererHelper->getHtml();
// The pipeline is run with the 'deduplicateStyles' option set to false.
158 $pipeline = $this->getServiceContainer()->getDefaultOutputPipeline();
159 $options = [
'deduplicateStyles' => false ];
160 return $pipeline->run( $po, $parserOptions, $options );
/**
 * Split markup into whitespace-separated words.
 *
 * Tags are removed with strip_tags(), the remainder is trimmed and split on
 * runs of whitespace; empty pieces are dropped (PREG_SPLIT_NO_EMPTY).
 *
 * @param string $output Markup to tokenise
 * @return array NOTE(review): presumably $words - the return statement is
 *  outside the visible lines
 */
163 private function getWords(
string $output ): array {
164 $tagsRemoved = strip_tags( $output );
// The pattern carries no /u modifier, so matching is byte-wise.
// NOTE(review): confirm that is intended for UTF-8 input.
165 $words = preg_split(
'/\s+/', trim( $tagsRemoved ), -1, PREG_SPLIT_NO_EMPTY );
/**
 * Extract the text content of the first <body> element of an HTML string.
 *
 * @param string $output HTML to load into a DOMDocument
 * @return string Text content of the body element. What is returned when no
 *  body element is found is outside the visible lines.
 */
169 private function getBody(
string $output ): string {
170 $dom = new DOMDocument();
// Errors raised by loadHTML() are silenced with the @ operator.
// NOTE(review): loadHTML() may assume a non-UTF-8 encoding when the markup
// declares no charset - confirm $output always carries one.
172 @$dom->loadHTML( $output );
173 $body = $dom->getElementsByTagName(
'body' )->item( 0 );
// item( 0 ) yields null when the document has no body element.
174 if ( $body ===
null ) {
179 return $body->textContent;
/**
 * Print word counts, a similarity report and a word diff for the two outputs.
 *
 * Part of the signature is outside the visible lines; the body reads
 * $parsoidText and $parserText.
 *
 * @param string $converterUsed Label forwarded to outputDiff()
 */
182 private function compareOutput(
185 string $converterUsed
// Only the Parsoid text goes through getBody() before being split into words;
// the parser text is split directly.
187 $parsoidWords = $this->getWords( $this->getBody( $parsoidText ) );
188 $parserWords = $this->getWords( $parserText );
190 $parserWordCount = count( $parserWords );
191 $parsoidWordCount = count( $parsoidWords );
192 $this->output(
"Word count: Parsoid: $parsoidWordCount; Parser: $parserWordCount\n" );
194 $this->outputSimilarity( $parsoidWords, $parserWords );
195 $this->output(
"\n" );
196 $this->outputDiff( $parsoidWords, $parserWords, $converterUsed );
/**
 * Report which language converter produced the given output.
 *
 * The raw text of the output is searched for the marker string
 * 'Variant conversion performed using the core LanguageConverter'.
 *
 * @param ParserOutput $parsoidOutput Output whose raw text is inspected
 * @return string 'Core LanguageConverter' when the marker is present,
 *  'Parsoid LanguageConverter' otherwise
 */
private function getConverterUsed( ParserOutput $parsoidOutput ): string {
	// strpos() returns int|false and a match at offset 0 is falsy, so compare
	// against false explicitly instead of relying on truthiness.
	$isCoreConverterUsed = strpos(
		$parsoidOutput->getRawText(),
		'Variant conversion performed using the core LanguageConverter'
	) !== false;

	if ( $isCoreConverterUsed ) {
		return 'Core LanguageConverter';
	}

	return 'Parsoid LanguageConverter';
}
/**
 * Format helper that adjusts string field widths for multibyte arguments.
 *
 * The visible part rewrites every numeric width that sits between "%" or "%-"
 * and "s" in a format string. The statement that defines $params and the call
 * that consumes the rewritten format are outside the visible lines.
 *
 * @param string $format Format string with %Ns / %-Ns placeholders
 * @param string ...$args Values for the placeholders
 * @return string
 */
213 private function mb_sprintf(
string $format,
string ...$args ): string {
217 preg_replace_callback(
// Matches only the digits of a width, using lookarounds for the surrounding "%"/"%-" and "s".
218 '/(?<=%|%-)\d+(?=s)/',
// $params is captured by reference so each match consumes the next value.
219 static function ( array
$matches ) use ( &$params ) {
220 $value = array_shift( $params );
// New width = old width + ( byte length - character length ) of the value.
222 return (
string)( strlen( $value ) - mb_strlen( $value ) +
$matches[0] );
/**
 * Report the total length and the similar_text() similarity of the two word
 * lists, each joined with single spaces.
 *
 * The calls that receive the message strings built here are outside the
 * visible lines.
 *
 * @param array $parsoidWords Words from the Parsoid output
 * @param array $parserWords Words from the Parser output
 */
230 private function outputSimilarity( array $parsoidWords, array $parserWords ): void {
231 $parsoidOutput = implode(
' ', $parsoidWords );
232 $parserOutput = implode(
' ', $parserWords );
// strlen() returns the byte length, so for multibyte text the figures
// labelled "Total characters" are byte counts.
234 'Total characters: Parsoid: ' . strlen( $parsoidOutput ) .
235 '; Parser: ' . strlen( $parserOutput ) .
"\n"
// similar_text() writes the percentage into its third argument; it is then
// rounded to two decimals.
238 $similarityPercent = 0;
239 $similarCharacters = similar_text( $parsoidOutput, $parserOutput, $similarityPercent );
240 $similarityPercent = round( $similarityPercent, 2 );
243 "Similarity via similar_text(): $similarityPercent%; Similar characters: $similarCharacters"
/**
 * Print a fixed-width table of word differences between the two outputs.
 *
 * @param array $parsoidWords Words from the Parsoid output
 * @param array $parserWords Words from the Parser output
 * @param string $converterUsed Label shown in parentheses under the Parsoid column header
 */
247 private function outputDiff( array $parsoidWords, array $parserWords,
string $converterUsed ): void {
// Table header: a 96-dash rule, the column titles, a second row carrying the
// converter label, and a closing rule.
248 $out = str_repeat(
'-', 96 ) .
"\n";
249 $out .= sprintf(
"| %5s | %-35s | %-35s | %-8s |\n",
'Line',
'Parsoid',
'Parser',
'Diff' );
250 $out .= sprintf(
"| %5s | %-35s | %-35s | %-8s |\n",
'',
"($converterUsed)",
'',
'' );
251 $out .= str_repeat(
'-', 96 ) .
"\n";
// NOTE(review): presumably wrapped in a try/catch - $e below is not defined in
// the visible lines, and neither is the code deriving $wordDiffFormat from $diff.
254 $diff =
new Diff( $parsoidWords, $parserWords );
256 $this->output( $e->getMessage() );
257 $this->fatalError(
'Encountered ComplexityException while computing diff' );
// One table row per entry; a missing 'old' or 'new' word becomes null and is
// rendered as '- N/A -'.
262 foreach ( $wordDiffFormat as $index => $wordDiff ) {
263 $action = $wordDiff[
'action'];
264 $old = $wordDiff[
'old'] ??
null;
265 $new = $wordDiff[
'new'] ??
null;
// Row number is 1-based and left-padded to 5; words are trimmed to a width of
// 35 with '…' as the trim marker. The last column's argument is outside the
// visible lines.
267 $out .= $this->mb_sprintf(
268 "| %5s | %-35s | %-35s | %-8s |\n",
269 str_pad( (
string)( $index + 1 ), 5,
' ', STR_PAD_LEFT ),
270 mb_strimwidth( $old ??
'- N/A -', 0, 35,
'…' ),
271 mb_strimwidth( $new ??
'- N/A -', 0, 35,
'…' ),
// Closing rule, then the whole table is written in a single output() call.
277 $out .= str_repeat(
'-', 96 ) .
"\n";
278 $this->output(
"\n" . $out );