Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
69.15% |
65 / 94 |
|
55.56% |
5 / 9 |
CRAP | |
0.00% |
0 / 1 |
LanguageVariantConverter | |
69.15% |
65 / 94 |
|
55.56% |
5 / 9 |
45.85 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
setPageConfig | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
setPageLanguageOverride | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
convertPageBundleVariant | |
37.50% |
15 / 40 |
|
0.00% |
0 / 1 |
11.10 | |||
convertParserOutputVariant | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
disableFallbackLanguageConverter | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getPageConfig | |
85.71% |
12 / 14 |
|
0.00% |
0 / 1 |
4.05 | |||
getPageLanguage | |
90.00% |
9 / 10 |
|
0.00% |
0 / 1 |
5.03 | |||
getBaseAndSourceLanguage | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
7 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Parser\Parsoid; |
4 | |
5 | use MediaWiki\Language\LanguageCode; |
6 | use MediaWiki\Languages\LanguageConverterFactory; |
7 | use MediaWiki\Languages\LanguageFactory; |
8 | use MediaWiki\Page\PageIdentity; |
9 | use MediaWiki\Parser\ParserOutput; |
10 | use MediaWiki\Parser\Parsoid\Config\PageConfigFactory; |
11 | use MediaWiki\Rest\HttpException; |
12 | use MediaWiki\Rest\LocalizedHttpException; |
13 | use MediaWiki\Revision\RevisionAccessException; |
14 | use MediaWiki\Title\Title; |
15 | use MediaWiki\Title\TitleFactory; |
16 | use Wikimedia\Bcp47Code\Bcp47Code; |
17 | use Wikimedia\Bcp47Code\Bcp47CodeValue; |
18 | use Wikimedia\Message\MessageValue; |
19 | use Wikimedia\Parsoid\Config\PageConfig; |
20 | use Wikimedia\Parsoid\Config\SiteConfig; |
21 | use Wikimedia\Parsoid\Core\PageBundle; |
22 | use Wikimedia\Parsoid\DOM\Element; |
23 | use Wikimedia\Parsoid\Parsoid; |
24 | use Wikimedia\Parsoid\Utils\DOMCompat; |
25 | use Wikimedia\Parsoid\Utils\DOMUtils; |
26 | |
/**
 * Performs language variant conversion on PageBundle and ParserOutput objects.
 *
 * Conversion is delegated to Parsoid when Parsoid implements conversion for the
 * requested target variant; otherwise the core LanguageConverter is used as a
 * fallback, unless that fallback has been disabled via
 * ::disableFallbackLanguageConverter().
 *
 * @since 1.40
 * @unstable should be marked stable before 1.40 release
 */
class LanguageVariantConverter {
	private PageConfigFactory $pageConfigFactory;
	/**
	 * PageConfig used for conversion; either injected via ::setPageConfig()
	 * or lazily created (and cached) by ::getPageConfig().
	 */
	private ?PageConfig $pageConfig = null;
	private PageIdentity $pageIdentity;
	private Title $pageTitle;
	private Parsoid $parsoid;
	private SiteConfig $siteConfig;
	private LanguageConverterFactory $languageConverterFactory;
	private LanguageFactory $languageFactory;
	/**
	 * Page language override from the Content-Language header.
	 */
	private ?Bcp47Code $pageLanguageOverride = null;
	/**
	 * Whether the core LanguageConverter may be used when Parsoid does not
	 * implement conversion for the target variant.
	 */
	private bool $isFallbackLanguageConverterEnabled = true;

	/**
	 * @param PageIdentity $pageIdentity The page whose content is being converted
	 * @param PageConfigFactory $pageConfigFactory
	 * @param Parsoid $parsoid
	 * @param SiteConfig $siteConfig
	 * @param TitleFactory $titleFactory Used to derive a Title from $pageIdentity
	 * @param LanguageConverterFactory $languageConverterFactory
	 * @param LanguageFactory $languageFactory
	 */
	public function __construct(
		PageIdentity $pageIdentity,
		PageConfigFactory $pageConfigFactory,
		Parsoid $parsoid,
		SiteConfig $siteConfig,
		TitleFactory $titleFactory,
		LanguageConverterFactory $languageConverterFactory,
		LanguageFactory $languageFactory
	) {
		$this->pageConfigFactory = $pageConfigFactory;
		$this->pageIdentity = $pageIdentity;
		$this->parsoid = $parsoid;
		$this->siteConfig = $siteConfig;
		$this->pageTitle = $titleFactory->newFromPageIdentity( $this->pageIdentity );
		$this->languageConverterFactory = $languageConverterFactory;
		$this->languageFactory = $languageFactory;
	}

	/**
	 * Set the PageConfig object to be used during language variant conversion.
	 * If not provided, the object will be created.
	 *
	 * @param PageConfig $pageConfig
	 * @return void
	 */
	public function setPageConfig( PageConfig $pageConfig ) {
		$this->pageConfig = $pageConfig;
	}

	/**
	 * Set the page content language override.
	 *
	 * @param Bcp47Code $language
	 * @return void
	 */
	public function setPageLanguageOverride( Bcp47Code $language ) {
		$this->pageLanguageOverride = $language;
	}

	/**
	 * Perform variant conversion on a PageBundle object.
	 *
	 * @param PageBundle $pageBundle
	 * @param Bcp47Code $targetVariant
	 * @param ?Bcp47Code $sourceVariant
	 *
	 * @return PageBundle The converted PageBundle, or the object passed in as
	 * $pageBundle if the conversion is not supported.
	 * @throws HttpException
	 */
	public function convertPageBundleVariant(
		PageBundle $pageBundle,
		Bcp47Code $targetVariant,
		?Bcp47Code $sourceVariant = null
	): PageBundle {
		[ $pageLanguage, $sourceVariant ] =
			$this->getBaseAndSourceLanguage( $pageBundle, $sourceVariant );

		if ( !$this->siteConfig->langConverterEnabledBcp47( $pageLanguage ) ) {
			// If the language doesn't support variants, just return the content unmodified.
			return $pageBundle;
		}

		$pageConfig = $this->getPageConfig( $pageLanguage, $sourceVariant );

		if ( $this->parsoid->implementsLanguageConversionBcp47( $pageConfig, $targetVariant ) ) {
			// Preferred path: let Parsoid do the conversion.
			return $this->parsoid->pb2pb(
				$pageConfig, 'variant', $pageBundle,
				[
					'variant' => [
						'source' => $sourceVariant,
						'target' => $targetVariant,
					]
				]
			);
		}

		if ( !$this->isFallbackLanguageConverterEnabled ) {
			// Fallback variant conversion is not enabled, return the page bundle as is.
			return $pageBundle;
		}

		// LanguageConverter::hasVariant and LanguageConverter::convertTo
		// could take a string|Bcp47Code in the future, which would
		// allow us to avoid the $targetVariantCode conversion here.
		// NOTE(review): getParentLanguage() is treated as nullable elsewhere in
		// this class; here its result is handed to getLanguageConverter()
		// unchecked -- confirm the factory's behaviour for a null argument.
		$baseLanguage = $this->languageFactory->getParentLanguage( $targetVariant );
		$languageConverter = $this->languageConverterFactory->getLanguageConverter( $baseLanguage );
		$targetVariantCode = $this->languageFactory->getLanguage( $targetVariant )->getCode();
		if ( $languageConverter->hasVariant( $targetVariantCode ) ) {
			// NOTE: This is not a convert() because we have the exact desired variant
			// and don't need to compute a preferred variant based on a base language.
			// Also see T267067 for why convert() should be avoided.
			$convertedHtml = $languageConverter->convertTo( $pageBundle->html, $targetVariantCode );
			$pageVariant = $targetVariant;
		} else {
			// No conversion possible - pass through original HTML in original language
			$convertedHtml = $pageBundle->html;
			$pageVariant = $pageConfig->getPageLanguageBcp47();
		}

		// Add a note so that we can identify what was used to perform the variant conversion
		$msg = "<!-- Variant conversion performed using the core LanguageConverter -->";
		$convertedHtml = $msg . $convertedHtml;

		// NOTE: Keep this in sync with code in Parsoid.php in Parsoid repo
		// Add meta information that Parsoid normally adds
		$headers = [
			'content-language' => $pageVariant->toBcp47Code(),
			'vary' => [ 'Accept', 'Accept-Language' ]
		];
		// Build a throw-away document holding only the http-equiv headers and
		// serialize it, so that it can be spliced into the converted HTML.
		$doc = DOMUtils::parseHTML( '' );
		$doc->appendChild( $doc->createElement( 'head' ) );
		DOMUtils::addHttpEquivHeaders( $doc, $headers );
		$docElt = $doc->documentElement;
		'@phan-var Element $docElt';
		$docHtml = DOMCompat::getOuterHTML( $docElt );
		// A "</body>" marker is appended and then every "</body>" occurrence is
		// replaced with the serialized header document.
		$convertedHtml = preg_replace( "#</body>#", $docHtml, "$convertedHtml</body>" );
		return new PageBundle(
			$convertedHtml, [], [], $pageBundle->version, $headers
		);
	}

	/**
	 * Perform variant conversion on a ParserOutput object.
	 *
	 * The ParserOutput is turned into a PageBundle, converted with
	 * ::convertPageBundleVariant(), and turned back into a ParserOutput.
	 *
	 * @param ParserOutput $parserOutput
	 * @param Bcp47Code $targetVariant
	 * @param ?Bcp47Code $sourceVariant
	 *
	 * @return ParserOutput
	 */
	public function convertParserOutputVariant(
		ParserOutput $parserOutput,
		Bcp47Code $targetVariant,
		?Bcp47Code $sourceVariant = null
	): ParserOutput {
		$pageBundle = PageBundleParserOutputConverter::pageBundleFromParserOutput( $parserOutput );
		$modifiedPageBundle = $this->convertPageBundleVariant( $pageBundle, $targetVariant, $sourceVariant );

		return PageBundleParserOutputConverter::parserOutputFromPageBundle( $modifiedPageBundle, $parserOutput );
	}

	/**
	 * Disable fallback language variant converter
	 * @return void
	 */
	public function disableFallbackLanguageConverter(): void {
		$this->isFallbackLanguageConverterEnabled = false;
	}

	/**
	 * Get the PageConfig to use for conversion, creating and caching it on
	 * first use.
	 *
	 * If a PageConfig is already present (injected via ::setPageConfig() or
	 * created by an earlier call), it is returned as is and both parameters
	 * are ignored.
	 *
	 * @param Bcp47Code $pageLanguage Page language passed to the PageConfigFactory
	 * @param ?Bcp47Code $sourceVariant If given, set as the variant on a newly
	 *  created PageConfig
	 *
	 * @return PageConfig
	 * @throws LocalizedHttpException with status 404 if the revision cannot be accessed
	 */
	private function getPageConfig( Bcp47Code $pageLanguage, ?Bcp47Code $sourceVariant ): PageConfig {
		if ( $this->pageConfig ) {
			return $this->pageConfig;
		}

		try {
			$this->pageConfig = $this->pageConfigFactory->create(
				$this->pageIdentity,
				null,
				null,
				null,
				$pageLanguage
			);

			if ( $sourceVariant ) {
				$this->pageConfig->setVariantBcp47( $sourceVariant );
			}
		} catch ( RevisionAccessException $exception ) {
			// TODO: Throw a different exception, this class should not know
			// about HTTP status codes.
			throw new LocalizedHttpException( new MessageValue( "rest-specified-revision-unavailable" ), 404 );
		}

		return $this->pageConfig;
	}

	/**
	 * Try to determine the page's language code as follows:
	 *
	 * First consider any value set by calling ::setPageLanguageOverride();
	 * this would have come from a Content-Language header.
	 *
	 * If ::setPageLanguageOverride() has not been called, check for a
	 * content-language header in $pageBundle, which should be
	 * equivalent. These are used when the title/article doesn't
	 * (yet) exist.
	 *
	 * If these are not given, use the $default if given; this is used
	 * to allow additional parameters to the request to be used as
	 * fallbacks.
	 *
	 * If we don't have $default, but we do have a PageConfig in
	 * $this->pageConfig, return $this->pageConfig->getPageLanguageBcp47().
	 *
	 * Finally, fall back to $this->pageTitle->getPageLanguage().
	 *
	 * @param PageBundle $pageBundle
	 * @param Bcp47Code|null $default A default language, used after
	 *   Content-Language but before PageConfig/Title lookup.
	 *
	 * @return Bcp47Code the page language; may be a variant.
	 */
	private function getPageLanguage( PageBundle $pageBundle, ?Bcp47Code $default = null ): Bcp47Code {
		// If a language was set by calling setPageLanguageOverride(), always use it!
		if ( $this->pageLanguageOverride ) {
			return $this->pageLanguageOverride;
		}

		// If the page bundle contains a language code, use that.
		$pageBundleLanguage = $pageBundle->headers[ 'content-language' ] ?? null;
		if ( $pageBundleLanguage ) {
			// The HTTP header will contain a BCP-47 language code, not a
			// mediawiki-internal one.
			return new Bcp47CodeValue( $pageBundleLanguage );
		}

		// NOTE: Use explicit default *before* we try PageConfig, because
		// PageConfig::getPageLanguage() falls back to Title::getPageLanguage().
		// If we did that first, $default would never be used.
		if ( $default ) {
			return $default;
		}

		// If we have a PageConfig, we can ask it for the page's language. Note that this will fall back to
		// Title::getPageLanguage(), so it has to be the last thing we try.
		if ( $this->pageConfig ) {
			return $this->pageConfig->getPageLanguageBcp47();
		}

		// Finally, just go by the code associated with the title. This may come from the database or
		// it may be determined based on the title itself.
		return $this->pageTitle->getPageLanguage();
	}

	/**
	 * Determine the codes of the base language and the source variant.
	 *
	 * The base language will be used to find the appropriate LanguageConverter.
	 * It should never be a variant.
	 *
	 * The source variant will be used to instruct the LanguageConverter.
	 * It should always be a variant (or null to trigger auto-detection of
	 * the source variant).
	 *
	 * If no parent language can be determined for the page language, the page
	 * language is returned unchanged as the base language and the source
	 * variant is null, since there is no converter against which a source
	 * variant could be validated.
	 *
	 * @param PageBundle $pageBundle
	 * @param ?Bcp47Code $sourceLanguage
	 *
	 * @return array{0:Bcp47Code,1:?Bcp47Code} [ Bcp47Code $pageLanguage, ?Bcp47Code $sourceLanguage ]
	 */
	private function getBaseAndSourceLanguage( PageBundle $pageBundle, ?Bcp47Code $sourceLanguage ): array {
		// Try to determine the language code associated with the content of the page.
		// The result may be a variant code.
		$baseLanguage = $this->getPageLanguage( $pageBundle, $sourceLanguage );

		// To find out if $baseLanguage is actually a variant, get the parent language and compare.
		$parentLang = $this->languageFactory->getParentLanguage( $baseLanguage );

		if ( !$parentLang ) {
			// Without a parent language there is nothing to compare against and
			// no parent converter to ask, so $sourceLanguage cannot be confirmed
			// as a variant. Trigger auto-detection instead of dereferencing null.
			return [ $baseLanguage, null ];
		}

		// If $parentLang is not the same language as $baseLanguage, this means that
		// $baseLanguage is a variant. In that case, set $sourceLanguage to that
		// variant (unless $sourceLanguage is already set), and set $baseLanguage
		// to the $parentLang
		if ( strcasecmp( $parentLang->toBcp47Code(), $baseLanguage->toBcp47Code() ) !== 0 ) {
			if ( !$sourceLanguage ) {
				$sourceLanguage = $baseLanguage;
			}
			$baseLanguage = $parentLang;
		}

		if ( $sourceLanguage !== null ) {
			$parentConverter = $this->languageConverterFactory->getLanguageConverter( $parentLang );
			// If the source variant isn't actually a variant, trigger auto-detection
			$sourceIsVariant = (
				strcasecmp( $parentLang->toBcp47Code(), $sourceLanguage->toBcp47Code() ) !== 0 &&
				$parentConverter->hasVariant(
					LanguageCode::bcp47ToInternal( $sourceLanguage->toBcp47Code() )
				)
			);
			if ( !$sourceIsVariant ) {
				$sourceLanguage = null;
			}
		}

		return [ $baseLanguage, $sourceLanguage ];
	}
}