Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 153 |
|
0.00% |
0 / 13 |
CRAP | |
0.00% |
0 / 1 |
| CompareLanguageConverterOutput | |
0.00% |
0 / 153 |
|
0.00% |
0 / 13 |
462 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
| execute | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
20 | |||
| newPageRestHelperFactory | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
2 | |||
| getParserOptions | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
| getParserOutput | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
6 | |||
| getParsoidOutput | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
2 | |||
| getWords | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| getBody | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
| compareOutput | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
2 | |||
| getConverterUsed | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
| mb_sprintf | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
| outputSimilarity | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
| outputDiff | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
12 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * @license GPL-2.0-or-later |
| 4 | * @file |
| 5 | * @ingroup Maintenance |
| 6 | */ |
| 7 | |
| 8 | use MediaWiki\Config\ServiceOptions; |
| 9 | use MediaWiki\Content\TextContent; |
| 10 | use MediaWiki\Language\Language; |
| 11 | use MediaWiki\Maintenance\Maintenance; |
| 12 | use MediaWiki\Parser\ParserOptions; |
| 13 | use MediaWiki\Parser\ParserOutput; |
| 14 | use MediaWiki\Rest\Handler\Helper\PageRestHelperFactory; |
| 15 | use MediaWiki\Revision\SlotRecord; |
| 16 | use MediaWiki\Title\Title; |
| 17 | use MediaWiki\User\User; |
| 18 | use Wikimedia\Bcp47Code\Bcp47Code; |
| 19 | use Wikimedia\Diff\ArrayDiffFormatter; |
| 20 | use Wikimedia\Diff\ComplexityException; |
| 21 | use Wikimedia\Diff\Diff; |
| 22 | use Wikimedia\Stats\StatsFactory; |
| 23 | |
| 24 | // @codeCoverageIgnoreStart |
| 25 | require_once __DIR__ . '/Maintenance.php'; |
| 26 | // @codeCoverageIgnoreEnd |
| 27 | |
/**
 * Maintenance script that compares variant conversion output between Parser and
 * HtmlOutputRendererHelper.
 *
 * The page named by the `page-title` argument is rendered twice for the variant
 * named by `target-variant`: once through the Parser service and once through
 * HtmlOutputRendererHelper. Both results are reduced to whitespace-separated
 * words, and the script prints word counts, a similar_text() score and a
 * word-level diff table.
 *
 * @ingroup Maintenance
 */
class CompareLanguageConverterOutput extends Maintenance {
	/**
	 * Registers the script description and its two required positional arguments.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Compares variant conversion output between Parser and HtmlOutputRendererHelper' );
		$this->addArg(
			'page-title',
			'Name of the page to be parsed and compared',
			true
		);
		$this->addArg(
			'target-variant',
			'Target variant language code to transform the content to',
			true
		);
	}

	/** @inheritDoc */
	public function execute() {
		$mwInstance = $this->getServiceContainer();

		$pageName = $this->getArg( 'page-title' );
		$pageTitle = Title::newFromText( $pageName );

		// Code below relies on fatalError() ending the script (presumably it exits).
		if ( !$pageTitle || !$pageTitle->exists() ) {
			$this->fatalError( "Title with name $pageName not found" );
		}

		$targetVariantCode = $this->getArg( 'target-variant' );
		$languageNameUtils = $mwInstance->getLanguageNameUtils();
		if ( !$languageNameUtils->isValidBuiltInCode( $targetVariantCode ) ) {
			$this->fatalError( "$targetVariantCode is not a supported variant" );
		}
		$targetVariant = $mwInstance->getLanguageFactory()->getLanguage(
			$targetVariantCode
		);

		$user = User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [ 'steal' => true ] );
		$baseLanguage = $pageTitle->getPageLanguage();

		$parserOutput = $this->getParserOutput( $pageTitle, $baseLanguage, $targetVariant );
		$parsoidOutput = $this->getParsoidOutput( $pageTitle, $targetVariant, $user );
		$converterUsed = $this->getConverterUsed( $parsoidOutput );

		$this->compareOutput(
			$parserOutput->getContentHolderText(),
			$parsoidOutput->getContentHolderText(),
			$converterUsed
		);
		return true;
	}

	/**
	 * Builds a PageRestHelperFactory by hand from the services in the service
	 * container, using a null StatsFactory.
	 *
	 * @return PageRestHelperFactory
	 */
	private function newPageRestHelperFactory(): PageRestHelperFactory {
		$services = $this->getServiceContainer();

		return new PageRestHelperFactory(
			new ServiceOptions( PageRestHelperFactory::CONSTRUCTOR_OPTIONS, $services->getMainConfig() ),
			$services->getRevisionLookup(),
			$services->getRevisionRenderer(),
			$services->getTitleFormatter(),
			$services->getPageStore(),
			$services->getParsoidOutputStash(),
			$services->getParserOutputAccess(),
			$services->getParsoidSiteConfig(),
			$services->getHtmlTransformFactory(),
			$services->getContentHandlerFactory(),
			$services->getLanguageFactory(),
			$services->getRedirectStore(),
			$services->getLanguageConverterFactory(),
			$services->getTitleFactory(),
			$services->getConnectionProvider(),
			$services->getChangeTagsStore(),
			StatsFactory::newNull()
		);
	}

	/**
	 * Creates anonymous-user parser options targeting the given language, with
	 * content and title conversion explicitly not disabled.
	 *
	 * @param Language $language Language to set as the parser's target language
	 * @return ParserOptions
	 */
	private function getParserOptions( Language $language ): ParserOptions {
		$parserOpts = ParserOptions::newFromAnon();
		$parserOpts->setTargetLanguage( $language );
		$parserOpts->disableContentConversion( false );
		$parserOpts->disableTitleConversion( false );

		return $parserOpts;
	}

	/**
	 * Renders the page's main-slot text with the Parser service and runs the
	 * result through the default output pipeline.
	 *
	 * Side effect: overwrites the global $wgDefaultLanguageVariant with the
	 * target variant code for the remainder of the script run.
	 *
	 * @param Title $pageTitle Page to render
	 * @param Language $baseLanguage Page language of $pageTitle
	 * @param Language $targetVariant Variant the output should be converted to
	 * @return ParserOutput
	 */
	private function getParserOutput(
		Title $pageTitle,
		Language $baseLanguage,
		Language $targetVariant
	): ParserOutput {
		// We update the default language variant because we want Parser to
		// perform variant conversion to it.
		global $wgDefaultLanguageVariant;
		$wgDefaultLanguageVariant = $targetVariant->getCode();

		$mwInstance = $this->getServiceContainer();

		$languageFactory = $mwInstance->getLanguageFactory();
		$parser = $mwInstance->getParser();
		// getParentLanguage() may yield null (no parent language for this code);
		// getParserOptions() requires a Language, so fall back to the page language
		// rather than dying with a TypeError.
		$parserOptions = $this->getParserOptions(
			$languageFactory->getParentLanguage( $baseLanguage ) ?? $baseLanguage
		);

		// The lookup can come back empty even though the title exists, so check
		// before dereferencing.
		$revision = $mwInstance->getRevisionLookup()->getRevisionByTitle( $pageTitle );
		if ( $revision === null ) {
			$this->fatalError( 'No revision found for page ' . $pageTitle->getPrefixedText() );
		}
		$content = $revision->getContent( SlotRecord::MAIN );
		// Non-text content is compared as an empty document.
		$wikiContent = ( $content instanceof TextContent ) ? $content->getText() : '';

		$po = $parser->parse( $wikiContent, $pageTitle, $parserOptions );
		// TODO T371008 consider if using the Content framework makes sense instead of creating the pipeline
		$pipeline = $mwInstance->getDefaultOutputPipeline();
		$options = [ 'deduplicateStyles' => false ];
		return $pipeline->run( $po, $parserOptions, $options );
	}

	/**
	 * Renders the page through HtmlOutputRendererHelper with variant conversion
	 * to the target variant, then runs the result through the default output
	 * pipeline.
	 *
	 * @param Title $pageTitle Page to render
	 * @param Bcp47Code $targetVariant Variant passed to setVariantConversionLanguage()
	 * @param User $user User the helper is created for
	 * @return ParserOutput
	 */
	private function getParsoidOutput(
		Title $pageTitle,
		Bcp47Code $targetVariant,
		User $user
	): ParserOutput {
		$parserOptions = ParserOptions::newFromAnon();
		// NOTE(review): the positional `null, false` arguments are passed through
		// unchanged; confirm their meaning against newHtmlOutputRendererHelper().
		$htmlOutputRendererHelper = $this->newPageRestHelperFactory()->newHtmlOutputRendererHelper( $pageTitle, [
			'stash' => false,
			'flavor' => 'view',
		], $user, null, false, $parserOptions );
		$htmlOutputRendererHelper->setVariantConversionLanguage( $targetVariant );

		$po = $htmlOutputRendererHelper->getHtml();
		$pipeline = $this->getServiceContainer()->getDefaultOutputPipeline();
		$options = [ 'deduplicateStyles' => false ];
		return $pipeline->run( $po, $parserOptions, $options );
	}

	/**
	 * Strips tags from the given markup and splits the remaining text on runs
	 * of whitespace.
	 *
	 * @param string $output HTML or plain text
	 * @return string[] Non-empty words, in document order
	 */
	private function getWords( string $output ): array {
		$tagsRemoved = strip_tags( $output );
		$words = preg_split( '/\s+/', trim( $tagsRemoved ), -1, PREG_SPLIT_NO_EMPTY );
		return $words;
	}

	/**
	 * Returns the text content of the <body> element of an HTML document.
	 *
	 * @param string $output HTML to inspect
	 * @return string Text content of the first <body>; $output unchanged when it
	 *  is empty or no <body> element is found
	 */
	private function getBody( string $output ): string {
		// DOMDocument::loadHTML() throws a ValueError for an empty string on
		// PHP 8, and the error-suppression operator does not catch exceptions.
		if ( $output === '' ) {
			return $output;
		}

		$dom = new DOMDocument();
		// Suppress libxml markup warnings for this one call only.
		// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
		@$dom->loadHTML( $output );
		$body = $dom->getElementsByTagName( 'body' )->item( 0 );
		if ( $body === null ) {
			// Body element not present
			return $output;
		}

		return $body->textContent;
	}

	/**
	 * Prints word counts, the similarity summary and the word diff for the two
	 * renderings.
	 *
	 * @param string $parserText Output produced via the Parser service
	 * @param string $parsoidText Output produced via HtmlOutputRendererHelper
	 * @param string $converterUsed Label shown under the Parsoid column header
	 */
	private function compareOutput(
		string $parserText,
		string $parsoidText,
		string $converterUsed
	): void {
		// Only the Parsoid side is reduced to its <body> text first.
		$parsoidWords = $this->getWords( $this->getBody( $parsoidText ) );
		$parserWords = $this->getWords( $parserText );

		$parserWordCount = count( $parserWords );
		$parsoidWordCount = count( $parsoidWords );
		$this->output( "Word count: Parsoid: $parsoidWordCount; Parser: $parserWordCount\n" );

		$this->outputSimilarity( $parsoidWords, $parserWords );
		$this->output( "\n" );
		$this->outputDiff( $parsoidWords, $parserWords, $converterUsed );
	}

	/**
	 * Reports which converter produced the Parsoid-side output, based on
	 * whether its raw text contains the core LanguageConverter marker string.
	 *
	 * @param ParserOutput $parsoidOutput Output returned by getParsoidOutput()
	 * @return string 'Core LanguageConverter' or 'Parsoid LanguageConverter'
	 */
	private function getConverterUsed( ParserOutput $parsoidOutput ): string {
		$markerPosition = strpos(
			$parsoidOutput->getRawText(),
			'Variant conversion performed using the core LanguageConverter'
		);

		// strpos() returns 0 for a match at the very start, so compare against
		// false explicitly instead of relying on truthiness.
		if ( $markerPosition !== false ) {
			return 'Core LanguageConverter';
		}

		return 'Parsoid LanguageConverter';
	}

	/**
	 * sprintf() wrapper that pads `%Ns` / `%-Ns` fields by display width.
	 *
	 * sprintf() pads by byte count, so each explicit width is rewritten to
	 * `width + bytes - display columns` of the matching argument. Display
	 * columns are measured with mb_strwidth(), the same measure the caller's
	 * mb_strimwidth() truncation uses.
	 *
	 * Limitation: arguments are paired with width-bearing `s` conversions in
	 * order, so every conversion in $format must be of that form.
	 *
	 * Inspired from: https://stackoverflow.com/a/55927237/903324
	 *
	 * @param string $format sprintf() format string
	 * @param string ...$args Values for the conversions, in order
	 * @return string
	 */
	private function mb_sprintf( string $format, string ...$args ): string {
		$params = $args;

		$adjustedFormat = preg_replace_callback(
			'/(?<=%|%-)\d+(?=s)/',
			static function ( array $matches ) use ( &$params ): string {
				// Cast so that running out of arguments yields '' rather than null.
				$value = (string)array_shift( $params );

				return (string)( strlen( $value ) - mb_strwidth( $value ) + (int)$matches[0] );
			},
			$format
		);

		// preg_replace_callback() returns null on a PCRE error; use the
		// unadjusted format in that case.
		return sprintf( $adjustedFormat ?? $format, ...$args );
	}

	/**
	 * Prints the size of both word lists and their similar_text() score.
	 *
	 * Note that strlen() and similar_text() both count bytes, so for multibyte
	 * text the "characters" figures printed here are byte counts.
	 *
	 * @param string[] $parsoidWords Words from the Parsoid-side output
	 * @param string[] $parserWords Words from the Parser-side output
	 */
	private function outputSimilarity( array $parsoidWords, array $parserWords ): void {
		$parsoidOutput = implode( ' ', $parsoidWords );
		$parserOutput = implode( ' ', $parserWords );
		$this->output(
			'Total characters: Parsoid: ' . strlen( $parsoidOutput ) .
			'; Parser: ' . strlen( $parserOutput ) . "\n"
		);

		$similarityPercent = 0;
		$similarCharacters = similar_text( $parsoidOutput, $parserOutput, $similarityPercent );
		$similarityPercent = round( $similarityPercent, 2 );

		$this->output(
			"Similarity via similar_text(): $similarityPercent%; Similar characters: $similarCharacters"
		);
	}

	/**
	 * Prints a 96-column table with one row per entry of the word-level diff
	 * between the two outputs.
	 *
	 * @param string[] $parsoidWords Words from the Parsoid-side output (diff "old" side)
	 * @param string[] $parserWords Words from the Parser-side output (diff "new" side)
	 * @param string $converterUsed Label shown under the Parsoid column header
	 */
	private function outputDiff( array $parsoidWords, array $parserWords, string $converterUsed ): void {
		$rule = str_repeat( '-', 96 ) . "\n";
		$rowFormat = "| %5s | %-35s | %-35s | %-8s |\n";

		$out = $rule;
		$out .= sprintf( $rowFormat, 'Line', 'Parsoid', 'Parser', 'Diff' );
		$out .= sprintf( $rowFormat, '', "($converterUsed)", '', '' );
		$out .= $rule;

		try {
			$diff = new Diff( $parsoidWords, $parserWords );
		} catch ( ComplexityException $e ) {
			$this->output( $e->getMessage() );
			// $diff is unset on this path; the code below relies on fatalError()
			// ending the script.
			$this->fatalError( 'Encountered ComplexityException while computing diff' );
		}

		// Print the difference between the words
		$wordDiffFormat = ( new ArrayDiffFormatter() )->format( $diff );
		foreach ( $wordDiffFormat as $index => $wordDiff ) {
			$action = $wordDiff['action'];
			// An entry may lack one side; show a placeholder for it.
			$old = $wordDiff['old'] ?? '- N/A -';
			$new = $wordDiff['new'] ?? '- N/A -';

			$out .= $this->mb_sprintf(
				$rowFormat,
				str_pad( (string)( $index + 1 ), 5, ' ', STR_PAD_LEFT ),
				mb_strimwidth( $old, 0, 35, '…' ),
				mb_strimwidth( $new, 0, 35, '…' ),
				$action
			);
		}

		// Print the footer.
		$out .= $rule;
		$this->output( "\n" . $out );
	}
}
| 281 | |
| 282 | // @codeCoverageIgnoreStart |
| 283 | $maintClass = CompareLanguageConverterOutput::class; |
| 284 | require_once RUN_MAINTENANCE_IF_MAIN; |
| 285 | // @codeCoverageIgnoreEnd |