Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
69.15% covered (warning)
69.15%
65 / 94
55.56% covered (warning)
55.56%
5 / 9
CRAP
0.00% covered (danger)
0.00%
0 / 1
LanguageVariantConverter
69.15% covered (warning)
69.15%
65 / 94
55.56% covered (warning)
55.56%
5 / 9
45.85
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
1
 setPageConfig
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 setPageLanguageOverride
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 convertPageBundleVariant
37.50% covered (danger)
37.50%
15 / 40
0.00% covered (danger)
0.00%
0 / 1
11.10
 convertParserOutputVariant
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 disableFallbackLanguageConverter
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getPageConfig
85.71% covered (warning)
85.71%
12 / 14
0.00% covered (danger)
0.00%
0 / 1
4.05
 getPageLanguage
90.00% covered (success)
90.00%
9 / 10
0.00% covered (danger)
0.00%
0 / 1
5.03
 getBaseAndSourceLanguage
100.00% covered (success)
100.00%
17 / 17
100.00% covered (success)
100.00%
1 / 1
7
1<?php
2
3namespace MediaWiki\Parser\Parsoid;
4
5use LanguageCode;
6use MediaWiki\Languages\LanguageConverterFactory;
7use MediaWiki\Languages\LanguageFactory;
8use MediaWiki\Page\PageIdentity;
9use MediaWiki\Parser\ParserOutput;
10use MediaWiki\Parser\Parsoid\Config\PageConfigFactory;
11use MediaWiki\Rest\HttpException;
12use MediaWiki\Rest\LocalizedHttpException;
13use MediaWiki\Revision\RevisionAccessException;
14use MediaWiki\Title\Title;
15use MediaWiki\Title\TitleFactory;
16use Wikimedia\Bcp47Code\Bcp47Code;
17use Wikimedia\Bcp47Code\Bcp47CodeValue;
18use Wikimedia\Message\MessageValue;
19use Wikimedia\Parsoid\Config\PageConfig;
20use Wikimedia\Parsoid\Config\SiteConfig;
21use Wikimedia\Parsoid\Core\PageBundle;
22use Wikimedia\Parsoid\DOM\Element;
23use Wikimedia\Parsoid\Parsoid;
24use Wikimedia\Parsoid\Utils\DOMCompat;
25use Wikimedia\Parsoid\Utils\DOMUtils;
26
/**
 * Converts Parsoid output (a PageBundle or a ParserOutput) between language
 * variants, using Parsoid's own variant conversion where it is implemented
 * and optionally falling back to the core LanguageConverter otherwise.
 *
 * @since 1.40
 * @unstable should be marked stable before 1.40 release
 */
class LanguageVariantConverter {
    private PageConfigFactory $pageConfigFactory;
    /**
     * Lazily created by getPageConfig() unless injected via setPageConfig().
     */
    private ?PageConfig $pageConfig = null;
    private PageIdentity $pageIdentity;
    private Title $pageTitle;
    private Parsoid $parsoid;
    private SiteConfig $siteConfig;
    private LanguageConverterFactory $languageConverterFactory;
    private LanguageFactory $languageFactory;
    /**
     * Page language override from the Content-Language header.
     */
    private ?Bcp47Code $pageLanguageOverride = null;
    /**
     * Whether the core LanguageConverter may be used when Parsoid does not
     * implement the requested conversion.
     */
    private bool $isFallbackLanguageConverterEnabled = true;

    /**
     * @param PageIdentity $pageIdentity The page whose content is being converted.
     * @param PageConfigFactory $pageConfigFactory
     * @param Parsoid $parsoid
     * @param SiteConfig $siteConfig
     * @param TitleFactory $titleFactory Used once, to derive a Title from $pageIdentity.
     * @param LanguageConverterFactory $languageConverterFactory
     * @param LanguageFactory $languageFactory
     */
    public function __construct(
        PageIdentity $pageIdentity,
        PageConfigFactory $pageConfigFactory,
        Parsoid $parsoid,
        SiteConfig $siteConfig,
        TitleFactory $titleFactory,
        LanguageConverterFactory $languageConverterFactory,
        LanguageFactory $languageFactory
    ) {
        $this->pageConfigFactory = $pageConfigFactory;
        $this->pageIdentity = $pageIdentity;
        $this->parsoid = $parsoid;
        $this->siteConfig = $siteConfig;
        $this->pageTitle = $titleFactory->newFromPageIdentity( $this->pageIdentity );
        $this->languageConverterFactory = $languageConverterFactory;
        $this->languageFactory = $languageFactory;
    }

    /**
     * Set the PageConfig object to be used during language variant conversion.
     * If not provided, the object will be created.
     *
     * @param PageConfig $pageConfig
     * @return void
     */
    public function setPageConfig( PageConfig $pageConfig ) {
        $this->pageConfig = $pageConfig;
    }

    /**
     * Set the page content language override.
     *
     * @param Bcp47Code $language
     * @return void
     */
    public function setPageLanguageOverride( Bcp47Code $language ) {
        $this->pageLanguageOverride = $language;
    }

    /**
     * Perform variant conversion on a PageBundle object.
     *
     * Parsoid's own conversion is used when Parsoid reports that it implements
     * the conversion to $targetVariant; otherwise the core LanguageConverter
     * is used, unless that fallback has been disabled.
     *
     * @param PageBundle $pageBundle
     * @param Bcp47Code $targetVariant
     * @param ?Bcp47Code $sourceVariant
     *
     * @return PageBundle The converted PageBundle, or the object passed in as
     *         $pageBundle if the conversion is not supported.
     * @throws HttpException
     */
    public function convertPageBundleVariant(
        PageBundle $pageBundle,
        Bcp47Code $targetVariant,
        ?Bcp47Code $sourceVariant = null
    ): PageBundle {
        [ $pageLanguage, $sourceVariant ] =
            $this->getBaseAndSourceLanguage( $pageBundle, $sourceVariant );

        if ( !$this->siteConfig->langConverterEnabledBcp47( $pageLanguage ) ) {
            // If the language doesn't support variants, just return the content unmodified.
            return $pageBundle;
        }

        $pageConfig = $this->getPageConfig( $pageLanguage, $sourceVariant );

        if ( $this->parsoid->implementsLanguageConversionBcp47( $pageConfig, $targetVariant ) ) {
            return $this->parsoid->pb2pb(
                $pageConfig, 'variant', $pageBundle,
                [
                    'variant' => [
                        'source' => $sourceVariant,
                        'target' => $targetVariant,
                    ]
                ]
            );
        }

        if ( !$this->isFallbackLanguageConverterEnabled ) {
            // Fallback variant conversion is not enabled, return the page bundle as is.
            return $pageBundle;
        }

        return $this->convertWithCoreLanguageConverter( $pageBundle, $targetVariant, $pageConfig );
    }

    /**
     * Fallback conversion using the core LanguageConverter.
     *
     * Builds a new PageBundle from the (possibly converted) HTML. The marker
     * comment and the http-equiv header markup are added whether or not the
     * converter knows $targetVariant.
     *
     * @param PageBundle $pageBundle The bundle whose HTML is to be converted.
     * @param Bcp47Code $targetVariant The requested variant.
     * @param PageConfig $pageConfig Supplies the page language reported when
     *   no conversion is possible.
     *
     * @return PageBundle A new bundle carrying the resulting HTML and headers.
     */
    private function convertWithCoreLanguageConverter(
        PageBundle $pageBundle,
        Bcp47Code $targetVariant,
        PageConfig $pageConfig
    ): PageBundle {
        // LanguageConverter::hasVariant and LanguageConverter::convertTo
        // could take a string|Bcp47Code in the future, which would
        // allow us to avoid the $targetVariantCode conversion here.
        $baseLanguage = $this->languageFactory->getParentLanguage( $targetVariant );
        $languageConverter = $this->languageConverterFactory->getLanguageConverter( $baseLanguage );
        $targetVariantCode = $this->languageFactory->getLanguage( $targetVariant )->getCode();
        if ( $languageConverter->hasVariant( $targetVariantCode ) ) {
            // NOTE: This is not a convert() because we have the exact desired variant
            // and don't need to compute a preferred variant based on a base language.
            // Also see T267067 for why convert() should be avoided.
            $convertedHtml = $languageConverter->convertTo( $pageBundle->html, $targetVariantCode );
            $pageVariant = $targetVariant;
        } else {
            // No conversion possible - pass through original HTML in original language
            $convertedHtml = $pageBundle->html;
            $pageVariant = $pageConfig->getPageLanguageBcp47();
        }

        // Add a note so that we can identify what was used to perform the variant conversion
        $msg = "<!-- Variant conversion performed using the core LanguageConverter -->";
        $convertedHtml = $msg . $convertedHtml;

        // NOTE: Keep this in sync with code in Parsoid.php in Parsoid repo
        // Add meta information that Parsoid normally adds
        $headers = [
            'content-language' => $pageVariant->toBcp47Code(),
            'vary' => [ 'Accept', 'Accept-Language' ]
        ];
        $doc = DOMUtils::parseHTML( '' );
        $doc->appendChild( $doc->createElement( 'head' ) );
        DOMUtils::addHttpEquivHeaders( $doc, $headers );
        $docElt = $doc->documentElement;
        '@phan-var Element $docElt';
        $docHtml = DOMCompat::getOuterHTML( $docElt );
        // A closing </body> is appended and then every "</body>" in the
        // resulting string is replaced by the serialized header document.
        $convertedHtml = preg_replace( "#</body>#", $docHtml, "$convertedHtml</body>" );
        return new PageBundle(
            $convertedHtml, [], [], $pageBundle->version, $headers
        );
    }

    /**
     * Perform variant conversion on a ParserOutput object.
     *
     * The ParserOutput is turned into a PageBundle, converted with
     * ::convertPageBundleVariant(), and turned back into a ParserOutput.
     *
     * @param ParserOutput $parserOutput
     * @param Bcp47Code $targetVariant
     * @param ?Bcp47Code $sourceVariant
     *
     * @return ParserOutput
     */
    public function convertParserOutputVariant(
        ParserOutput $parserOutput,
        Bcp47Code $targetVariant,
        ?Bcp47Code $sourceVariant = null
    ): ParserOutput {
        $pageBundle = PageBundleParserOutputConverter::pageBundleFromParserOutput( $parserOutput );
        $modifiedPageBundle = $this->convertPageBundleVariant( $pageBundle, $targetVariant, $sourceVariant );

        return PageBundleParserOutputConverter::parserOutputFromPageBundle( $modifiedPageBundle, $parserOutput );
    }

    /**
     * Disable fallback language variant converter
     * @return void
     */
    public function disableFallbackLanguageConverter(): void {
        $this->isFallbackLanguageConverterEnabled = false;
    }

    /**
     * Return the PageConfig to use for conversion, creating and caching it on
     * first use.
     *
     * If a PageConfig is already present (injected via ::setPageConfig() or
     * created by an earlier call), it is returned as is and both parameters
     * are ignored.
     *
     * @param Bcp47Code $pageLanguage Page language passed to the factory.
     * @param ?Bcp47Code $sourceVariant If given, set as the variant on a newly
     *   created PageConfig.
     *
     * @return PageConfig
     * @throws LocalizedHttpException with status 404 if creating the
     *   PageConfig raises a RevisionAccessException.
     */
    private function getPageConfig( Bcp47Code $pageLanguage, ?Bcp47Code $sourceVariant ): PageConfig {
        if ( $this->pageConfig ) {
            return $this->pageConfig;
        }

        try {
            $this->pageConfig = $this->pageConfigFactory->create(
                $this->pageIdentity,
                null,
                null,
                null,
                $pageLanguage
            );

            if ( $sourceVariant ) {
                $this->pageConfig->setVariantBcp47( $sourceVariant );
            }
        } catch ( RevisionAccessException $exception ) {
            // TODO: Throw a different exception, this class should not know
            //       about HTTP status codes.
            throw new LocalizedHttpException( new MessageValue( "rest-specified-revision-unavailable" ), 404 );
        }

        return $this->pageConfig;
    }

    /**
     * Try to determine the page's language code as follows:
     *
     * First consider any value set by calling ::setPageLanguageOverride();
     * this would have come from a Content-Language header.
     *
     * If ::setPageLanguageOverride() has not been called, check for a
     * content-language header in $pageBundle, which should be
     * equivalent.  These are used when the title/article doesn't
     * (yet) exist.
     *
     * If these are not given, use the $default if given; this is used
     * to allow additional parameters to the request to be used as
     * fallbacks.
     *
     * If we don't have $default, but we do have a PageConfig in
     * $this->pageConfig, return $this->pageConfig->getPageLanguageBcp47().
     *
     * Finally, fall back to $this->pageTitle->getPageLanguage().
     *
     * @param PageBundle $pageBundle
     * @param Bcp47Code|null $default A default language, used after
     *   Content-Language but before PageConfig/Title lookup.
     *
     * @return Bcp47Code the page language; may be a variant.
     */
    private function getPageLanguage( PageBundle $pageBundle, ?Bcp47Code $default = null ): Bcp47Code {
        // If a language was set by calling setPageLanguageOverride(), always use it!
        if ( $this->pageLanguageOverride ) {
            return $this->pageLanguageOverride;
        }

        // If the page bundle contains a language code, use that.
        $pageBundleLanguage = $pageBundle->headers[ 'content-language' ] ?? null;
        if ( $pageBundleLanguage ) {
            // The HTTP header will contain a BCP-47 language code, not a
            // mediawiki-internal one.
            return new Bcp47CodeValue( $pageBundleLanguage );
        }

        // NOTE: Use explicit default *before* we try PageConfig, because the PageConfig's page language
        //       falls back to Title::getPageLanguage(). If we did that first, $default would never be used.
        if ( $default ) {
            return $default;
        }

        // If we have a PageConfig, we can ask it for the page's language. Note that this will fall back to
        // Title::getPageLanguage(), so it has to be the last thing we try.
        if ( $this->pageConfig ) {
            return $this->pageConfig->getPageLanguageBcp47();
        }

        // Finally, just go by the code associated with the title. This may come from the database or
        // it may be determined based on the title itself.
        return $this->pageTitle->getPageLanguage();
    }

    /**
     * Determine the codes of the base language and the source variant.
     *
     * The base language will be used to find the appropriate LanguageConverter.
     * It should never be a variant.
     *
     * The source variant will be used to instruct the LanguageConverter.
     * It should always be a variant (or null to trigger auto-detection of
     * the source variant).
     *
     * @param PageBundle $pageBundle
     * @param ?Bcp47Code $sourceLanguage
     *
     * @return array{0:Bcp47Code,1:?Bcp47Code} [ Bcp47Code $pageLanguage, ?Bcp47Code $sourceLanguage ]
     */
    private function getBaseAndSourceLanguage( PageBundle $pageBundle, ?Bcp47Code $sourceLanguage ): array {
        // Try to determine the language code associated with the content of the page.
        // The result may be a variant code.
        $baseLanguage = $this->getPageLanguage( $pageBundle, $sourceLanguage );

        // To find out if $baseLanguage is actually a variant, get the parent language and compare.
        // This may be null when no parent language is known for $baseLanguage.
        $parentLang = $this->languageFactory->getParentLanguage( $baseLanguage );

        // If $parentLang is not the same language as $baseLanguage, this means that
        // $baseLanguage is a variant. In that case, set $sourceLanguage to that
        // variant (unless $sourceLanguage is already set), and set $baseLanguage
        // to the $parentLang
        if ( $parentLang && strcasecmp( $parentLang->toBcp47Code(), $baseLanguage->toBcp47Code() ) !== 0 ) {
            if ( !$sourceLanguage ) {
                $sourceLanguage = $baseLanguage;
            }
            $baseLanguage = $parentLang;
        }

        if ( $sourceLanguage !== null ) {
            if ( !$parentLang ) {
                // Without a parent language there is nothing $sourceLanguage could be
                // a variant of, so trigger auto-detection. This also avoids calling
                // a method on null below.
                $sourceLanguage = null;
            } else {
                $parentConverter = $this->languageConverterFactory->getLanguageConverter( $parentLang );
                // If the source variant isn't actually a variant, trigger auto-detection
                $sourceIsVariant = (
                    strcasecmp( $parentLang->toBcp47Code(), $sourceLanguage->toBcp47Code() ) !== 0 &&
                    $parentConverter->hasVariant(
                        LanguageCode::bcp47ToInternal( $sourceLanguage->toBcp47Code() )
                    )
                );
                if ( !$sourceIsVariant ) {
                    $sourceLanguage = null;
                }
            }
        }

        return [ $baseLanguage, $sourceLanguage ];
    }
}