42 parent::__construct();
43 $this->
addDescription(
'Compares variant conversion output between Parser and HtmlOutputRendererHelper' );
46 'Name of the page to be parsed and compared',
51 'Target variant language code to transform the content to',
59 $pageName = $this->
getArg(
'page-title' );
60 $pageTitle = Title::newFromText( $pageName );
62 if ( !$pageTitle || !$pageTitle->
exists() ) {
63 $this->
fatalError(
"Title with name $pageName not found" );
66 $targetVariantCode = $this->
getArg(
'target-variant' );
67 $languageNameUtils = $mwInstance->getLanguageNameUtils();
68 if ( !$languageNameUtils->isValidBuiltInCode( $targetVariantCode ) ) {
69 $this->
fatalError(
"$targetVariantCode is not a supported variant" );
71 $targetVariant = $mwInstance->getLanguageFactory()->getLanguage(
75 $user = User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [
'steal' =>
true ] );
78 $parserOutput = $this->getParserOutput( $pageTitle, $baseLanguage, $targetVariant );
79 $parsoidOutput = $this->getParsoidOutput( $pageTitle, $targetVariant, $user );
80 $converterUsed = $this->getConverterUsed( $parsoidOutput );
82 $this->compareOutput( $parserOutput, $parsoidOutput, $converterUsed );
91 $services->getParsoidOutputStash(),
93 $services->getParsoidOutputAccess(),
94 $services->getHtmlTransformFactory(),
95 $services->getContentHandlerFactory(),
96 $services->getLanguageFactory()
104 $parserOpts->disableContentConversion(
false );
105 $parserOpts->disableTitleConversion(
false );
/**
 * Render the page through the legacy Parser.
 *
 * Reads the main-slot content of the revision found for $pageTitle and parses
 * its text using the options returned by getParserOptions().
 *
 * NOTE(review): the parameter list and a few statements are not visible in
 * this listing; the caller passes ( $pageTitle, $baseLanguage, $targetVariant )
 * - confirm parameter names and types against the full file.
 */
110 private function getParserOutput(
120 $mwInstance = $this->getServiceContainer();
122 $languageFactory = $mwInstance->getLanguageFactory();
123 $parser = $mwInstance->getParser();
// Parser options are built from the parent language of $baseLanguage.
124 $parserOptions = $this->getParserOptions(
125 $languageFactory->getParentLanguage( $baseLanguage )
// NOTE(review): the result of getRevisionByTitle() is dereferenced without a
// null check - presumably safe because the caller has already verified that
// the title exists; confirm.
128 $content = $mwInstance->getRevisionLookup()
129 ->getRevisionByTitle( $pageTitle )
130 ->getContent( SlotRecord::MAIN );
// Content that is not a TextContent is parsed as empty wikitext.
131 $wikiContent = ( $content instanceof
TextContent ) ? $content->getText() :
'';
133 return $parser->parse( $wikiContent, $pageTitle, $parserOptions );
/**
 * Render the page through HtmlOutputRendererHelper, with variant conversion
 * set to $targetVariant, and return what the helper's getHtml() produces.
 *
 * NOTE(review): part of the signature and the options passed to init() are
 * not visible in this listing; the caller passes
 * ( $pageTitle, $targetVariant, $user ) - confirm against the full file.
 */
136 private function getParsoidOutput(
138 Bcp47Code $targetVariant,
141 $htmlOutputRendererHelper = $this->newHtmlOutputRendererHelper();
142 $htmlOutputRendererHelper->init( $pageTitle, [
146 $htmlOutputRendererHelper->setVariantConversionLanguage( $targetVariant );
148 return $htmlOutputRendererHelper->getHtml();
/**
 * Split rendered output into a list of words, ignoring markup.
 *
 * Tags are removed with strip_tags(); the remaining text is trimmed and split
 * on runs of whitespace, dropping empty fragments.
 *
 * @param string $output Rendered HTML or plain text
 * @return string[] Words in document order; an empty array if there are none
 * @throws \RuntimeException If preg_split() reports a failure
 */
private function getWords( string $output ): array {
	$tagsRemoved = strip_tags( $output );
	$words = preg_split( '/\s+/', trim( $tagsRemoved ), -1, PREG_SPLIT_NO_EMPTY );

	if ( $words === false ) {
		// preg_split() returns false on a PCRE failure. Report that explicitly
		// instead of letting the array return type raise an opaque TypeError.
		throw new \RuntimeException(
			'Could not split output into words (PCRE error code ' . preg_last_error() . ')'
		);
	}

	return $words;
}
/**
 * Extract the text content of the <body> element from an HTML string.
 *
 * @param string $output HTML markup
 * @return string Text content of the first <body> element
 *
 * NOTE(review): the branch taken when no <body> element is found is not
 * visible in this listing - confirm what is returned in that case.
 */
157 private function getBody(
string $output ): string {
158 $dom = new DOMDocument();
// The @ operator suppresses any warnings raised by loadHTML().
// NOTE(review): loadHTML() presumably assumes a single-byte encoding unless
// the markup declares a charset - confirm the HTML passed in declares UTF-8,
// otherwise non-ASCII text may be mis-decoded.
160 @$dom->loadHTML( $output );
161 $body = $dom->getElementsByTagName(
'body' )->item( 0 );
162 if ( $body ===
null ) {
167 return $body->textContent;
/**
 * Print a comparison of the Parsoid and Parser renderings: the word counts,
 * a similarity summary and a word-by-word diff table.
 *
 * NOTE(review): the first two parameters are not visible in this listing; the
 * caller passes ( $parserOutput, $parsoidOutput, $converterUsed ) - confirm
 * names and types against the full file.
 */
170 private function compareOutput(
173 string $converterUsed
175 $parsoidText = $parsoidOutput->getText( [
'deduplicateStyles' => false ] );
176 $parserText = $parserOutput->
getText( [
'deduplicateStyles' =>
false ] );
// Only the Parsoid text is reduced to its <body> text via getBody() first;
// the Parser text is split into words as-is.
178 $parsoidWords = $this->getWords( $this->getBody( $parsoidText ) );
179 $parserWords = $this->getWords( $parserText );
181 $parserWordCount = count( $parserWords );
182 $parsoidWordCount = count( $parsoidWords );
183 $this->output(
"Word count: Parsoid: $parsoidWordCount; Parser: $parserWordCount\n" );
185 $this->outputSimilarity( $parsoidWords, $parserWords );
// Blank separator between the similarity summary and the diff table.
186 $this->output(
"\n" );
187 $this->outputDiff( $parsoidWords, $parserWords, $converterUsed );
/**
 * Report which language converter produced the Parsoid-side output.
 *
 * Searches the raw output text for the marker string
 * 'Variant conversion performed using the core LanguageConverter'.
 *
 * @param ParserOutput $parsoidOutput Output whose raw text is inspected
 * @return string 'Core LanguageConverter' if the marker is present,
 *  'Parsoid LanguageConverter' otherwise
 */
private function getConverterUsed( ParserOutput $parsoidOutput ): string {
	$coreConverterMarker = 'Variant conversion performed using the core LanguageConverter';

	// strpos() yields 0 for a match at the very start of the text, which is
	// falsy; compare against false so that such a match is still detected.
	$isCoreConverterUsed = strpos( $parsoidOutput->getRawText(), $coreConverterMarker ) !== false;

	return $isCoreConverterUsed ? 'Core LanguageConverter' : 'Parsoid LanguageConverter';
}
/**
 * Multibyte-aware formatting helper for padded string columns.
 *
 * @param string $format Format string whose %Ns / %-Ns widths are adjusted
 * @param mixed ...$args Values for the format string
 * @return string
 *
 * NOTE(review): several lines of this method (the closure header and the
 * outer call) are not visible in this listing; presumably the adjusted format
 * is handed to sprintf() together with $args - confirm against the full file.
 */
204 private function mb_sprintf(
string $format, ...$args ): string {
208 preg_replace_callback(
// Matches the numeric width of a %Ns or %-Ns specifier.
209 '/(?<=%|%-)\d+(?=s)/',
// NOTE(review): one value is shifted off $params per width match, so this
// appears to assume every argument belongs to a width-qualified %s
// specifier - confirm before using other specifiers in $format.
211 $value = array_shift(
$params );
// Widen the field by the number of extra bytes in $value (byte length minus
// character length), presumably because sprintf() pads by bytes.
213 return (
string)( strlen( $value ) - mb_strlen( $value ) +
$matches[0] );
/**
 * Report the total length of each rendering and a similar_text() similarity
 * score, computed on the word lists joined with single spaces.
 *
 * @param array $parsoidWords Words from the Parsoid rendering
 * @param array $parserWords Words from the Parser rendering
 *
 * NOTE(review): the calls that emit these strings are not visible in this
 * listing. Also, strlen() counts bytes, so for multibyte text the
 * "Total characters" figures are byte counts; similar_text() presumably
 * compares bytes as well - confirm that this is the intended measure.
 */
221 private function outputSimilarity( array $parsoidWords, array $parserWords ): void {
222 $parsoidOutput = implode(
' ', $parsoidWords );
223 $parserOutput = implode(
' ', $parserWords );
225 'Total characters: Parsoid: ' . strlen( $parsoidOutput ) .
226 '; Parser: ' . strlen( $parserOutput ) .
"\n"
// similar_text() returns the number of matching characters and stores the
// percentage in its third argument; the percentage is rounded to 2 decimals.
229 $similarityPercent = 0;
230 $similarCharacters = similar_text( $parsoidOutput, $parserOutput, $similarityPercent );
231 $similarityPercent = round( $similarityPercent, 2 );
234 "Similarity via similar_text(): $similarityPercent%; Similar characters: $similarCharacters"
/**
 * Print a table of the word-level differences between the two renderings.
 *
 * @param array $parsoidWords Words from the Parsoid rendering
 * @param array $parserWords Words from the Parser rendering
 * @param string $converterUsed Label printed under the "Parsoid" column heading
 *
 * NOTE(review): the code between the visible statements - including where $e
 * and $wordDiffFormat come from and the last column value of each row - is
 * not visible in this listing; confirm against the full file.
 */
238 private function outputDiff( array $parsoidWords, array $parserWords,
string $converterUsed ): void {
// Horizontal rule; 96 is the width of one formatted table row (without the
// trailing newline).
239 $out = str_repeat(
'-', 96 ) .
"\n";
// Header row with the column titles.
240 $out .= sprintf(
"| %5s | %-35s | %-35s | %-8s |\n",
'Line',
'Parsoid',
'Parser',
'Diff' );
// Second header row: names the converter in use under the Parsoid column.
241 $out .= sprintf(
"| %5s | %-35s | %-35s | %-8s |\n",
'',
"($converterUsed)",
'',
'' );
242 $out .= str_repeat(
'-', 96 ) .
"\n";
245 $diff =
new Diff( $parsoidWords, $parserWords );
247 $this->output( $e->getMessage() );
248 $this->error(
'Encountered ComplexityException while computing diff' );
// One table row per entry of $wordDiffFormat; the row number is 1-based.
253 foreach ( $wordDiffFormat as $index => $wordDiff ) {
254 $action = $wordDiff[
'action'];
255 $old = $wordDiff[
'old'] ??
null;
256 $new = $wordDiff[
'new'] ??
null;
// Rows go through mb_sprintf() rather than sprintf(), presumably so that
// multibyte words keep the columns aligned. A missing old/new value is shown
// as '- N/A -', and each word is cut to a width of 35 with an ellipsis marker.
258 $out .= $this->mb_sprintf(
259 "| %5s | %-35s | %-35s | %-8s |\n",
260 str_pad( (
string)( $index + 1 ), 5,
' ', STR_PAD_LEFT ),
261 mb_strimwidth( $old ??
'- N/A -', 0, 35,
'…' ),
262 mb_strimwidth( $new ??
'- N/A -', 0, 35,
'…' ),
// Closing rule, then emit the whole table preceded by a blank line.
268 $out .= str_repeat(
'-', 96 ) .
"\n";
269 $this->output(
"\n" . $out );
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
addArg( $arg, $description, $required=true, $multi=false)
Add some args that are needed.
getServiceContainer()
Returns the main service container.
getArg( $argId=0, $default=null)
Get an argument.
addDescription( $text)
Set the description text.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
Content object implementation for representing flat text.