MediaWiki master
LanguageCode.php
Go to the documentation of this file.
1<?php
21namespace MediaWiki\Language;
22
23use Wikimedia\Bcp47Code\Bcp47Code;
24use Wikimedia\Bcp47Code\Bcp47CodeValue;
25
/**
 * Deprecated language codes that were used in previous versions of
 * MediaWiki, keyed by the old code and mapped to the code that
 * replaces it.
 *
 * Read by replaceDeprecatedCodes() and exposed through
 * getDeprecatedCodeMapping().
 */
private const DEPRECATED_LANGUAGE_CODE_MAPPING = [
	// T25215: `als` is in fact a valid ISO 639 code (Tosk Albanian), but
	// MediaWiki used to use it for Alsatian, which belongs under `gsw`.
	'als' => 'gsw',
	// T27522
	'bat-smg' => 'sgs',
	// T11823
	'be-x-old' => 'be-tarask',
	// T31186
	'fiu-vro' => 'vro',
	// T17988
	'roa-rup' => 'rup',
	// T30443
	'zh-classical' => 'lzh',
	// T30442
	'zh-min-nan' => 'nan',
	// T30441
	'zh-yue' => 'yue',
];
56
/**
 * MediaWiki-internal language codes that are not standard BCP 47 tags,
 * keyed by the internal (all lowercase) code and mapped to the BCP 47
 * tag used in its place.
 *
 * bcp47() applies this table after deprecated codes have been replaced;
 * bcp47ToInternal() uses it in the opposite direction. Entry order is
 * significant for the inverted lookup (a later entry wins if two entries
 * ever share a BCP 47 value), so do not reorder casually.
 */
private const NON_STANDARD_LANGUAGE_CODE_MAPPING = [
	// Every code returned by LanguageNameUtils::getLanguageNames() was
	// validated against the IANA registry at
	// https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
	// with the help of the validator at
	// http://schneegans.de/lv/

	// T124657
	'cbk-zam' => 'cbk',
	'de-formal' => 'de-x-formal',
	// T36217
	'eml' => 'egl',
	'en-rtl' => 'en-x-rtl',
	'es-formal' => 'es-x-formal',
	'hu-formal' => 'hu-x-formal',
	// [[en:Banyumasan_dialect]] T125073
	'map-bms' => 'jv-x-bms',
	// T125073
	'mo' => 'ro-Cyrl-MD',
	// [[en:Norman_language]] T25216
	'nrm' => 'nrf',
	'nl-informal' => 'nl-x-informal',
	// [[en:Tarantino_dialect]]
	'roa-tara' => 'nap-x-tara',
	'simple' => 'en-simple',
	// T117845
	'sr-ec' => 'sr-Cyrl',
	// T117845
	'sr-el' => 'sr-Latn',

	// The remaining codes are not *wrong* as such, but spelling out both
	// the script and the country helps compatibility with other BCP 47
	// users. MediaWiki also uses `kk-Arab`/`kk-Cyrl`/`kk-Latn` and
	// `zh-Hans`/`zh-Hant` without a country code; those must be left
	// alone. `kk` carries the Suppress-Script: Cyrl field, so `kk-KZ` is
	// not mapped to `kk-Cyrl-KZ`.
	// (See getVariantsFallbacks() in KkConverter.php for Arab/Cyrl/Latn id.)
	// (See getVariantsFallbacks() in ZhConverter.php for Hans/Hant id.)
	'crh-ro' => 'crh-Latn-RO',
	'kk-cn' => 'kk-Arab-CN',
	'kk-tr' => 'kk-Latn-TR',
	'zh-cn' => 'zh-Hans-CN',
	'zh-sg' => 'zh-Hans-SG',
	'zh-my' => 'zh-Hans-MY',
	'zh-tw' => 'zh-Hant-TW',
	'zh-hk' => 'zh-Hant-HK',
	'zh-mo' => 'zh-Hant-MO',
];
123
/**
 * Returns the mapping of deprecated language codes that were used in
 * previous versions of MediaWiki to the codes that replace them.
 *
 * @return string[] Replacement code, keyed by deprecated code
 */
public static function getDeprecatedCodeMapping() {
	$deprecatedToCurrent = self::DEPRECATED_LANGUAGE_CODE_MAPPING;

	return $deprecatedToCurrent;
}
140
/**
 * Returns a mapping of the non-standard language codes used by (current
 * and previous versions of) MediaWiki to the tag bcp47() produces for
 * each of them.
 *
 * Deprecated codes come first, followed by the non-standard internal
 * codes. The table is computed on the first call and kept in a static
 * variable for subsequent calls.
 *
 * @return string[] Result of bcp47(), keyed by non-standard code
 */
public static function getNonstandardLanguageCodeMapping() {
	static $mapping = [];

	if ( !$mapping ) {
		// Only the keys matter here: the value for each one is
		// recomputed by running the key through bcp47().
		$legacyCodes = array_merge(
			array_keys( self::DEPRECATED_LANGUAGE_CODE_MAPPING ),
			array_keys( self::NON_STANDARD_LANGUAGE_CODE_MAPPING )
		);
		foreach ( $legacyCodes as $legacyCode ) {
			$mapping[$legacyCode] = self::bcp47( $legacyCode );
		}
	}

	return $mapping;
}
166
/**
 * Replace a deprecated language code that was used in previous versions
 * of MediaWiki with its up-to-date replacement.
 *
 * The lookup is an exact key match; the caller is responsible for any
 * case normalisation.
 *
 * @param string $code Language code to check
 * @return string The replacement if $code is deprecated, otherwise $code unchanged
 */
public static function replaceDeprecatedCodes( $code ) {
	if ( isset( self::DEPRECATED_LANGUAGE_CODE_MAPPING[$code] ) ) {
		return self::DEPRECATED_LANGUAGE_CODE_MAPPING[$code];
	}

	return $code;
}
180
/**
 * Get the normalised IANA language tag for a language code.
 *
 * The code is lowercased, deprecated codes are replaced, non-standard
 * internal codes are swapped for their BCP 47 equivalent, and then each
 * hyphen-separated subtag is re-cased:
 *  - the first subtag is lowercase;
 *  - a subtag directly following an `x` subtag is lowercase;
 *  - any other two-character subtag is uppercase (ISO 3166 country code);
 *  - any other four-character subtag is title case (ISO 15924 script code);
 *  - everything else is lowercase.
 *
 * @param string $code The language code
 * @return string The re-cased language tag
 */
public static function bcp47( $code ) {
	$normalized = self::replaceDeprecatedCodes( strtolower( $code ) );
	$normalized = self::NON_STANDARD_LANGUAGE_CODE_MAPPING[$normalized] ?? $normalized;

	$formatted = [];
	// Lowercased copy of the subtag handled in the previous iteration;
	// null while processing the first subtag.
	$previous = null;
	foreach ( explode( '-', $normalized ) as $subtag ) {
		$lowered = strtolower( $subtag );

		if ( $previous === null || $previous === 'x' ) {
			// Leading subtag, or private-use subtag right after `x`
			$formatted[] = $lowered;
		} else {
			switch ( strlen( $subtag ) ) {
				case 2:
					// ISO 3166 country code
					$formatted[] = strtoupper( $subtag );
					break;
				case 4:
					// ISO 15924 script code
					$formatted[] = ucfirst( $lowered );
					break;
				default:
					$formatted[] = $lowered;
			}
		}

		$previous = $lowered;
	}

	return implode( '-', $formatted );
}
215
/**
 * Convert a standardized BCP 47 code to the internal name used by
 * MediaWiki.
 *
 * A Language object short-circuits to its own getCode(). A Bcp47Code
 * object is first turned into a string with toBcp47Code(). The string is
 * then lowercased and looked up in the inverse of
 * NON_STANDARD_LANGUAGE_CODE_MAPPING; when there is no entry the
 * lowercased string itself is returned.
 *
 * @param string|Bcp47Code|Language $code The code to convert
 * @return string The internal (lowercase) language code
 */
public static function bcp47ToInternal( $code ): string {
	if ( $code instanceof Language ) {
		return $code->getCode();
	}

	// Lowercased BCP 47 tag => internal code, built once per request.
	static $internalByBcp47 = [];
	if ( !$internalByBcp47 ) {
		// No two entries of NON_STANDARD_LANGUAGE_CODE_MAPPING should
		// ever map *different* internal codes onto the same BCP 47 tag:
		// the BCP 47 form is meant to preserve all the information in
		// the internal code [*]. The converse does not hold; several
		// BCP 47 tags may alias to one internal code:
		//   BCP 47        internal
		//   zh-Hans-CN => zh-cn    (in NON_STANDARD_LANGUAGE_CODE_MAPPING)
		//   zh-Hans    => zh-hans  (not in it)
		//   zh-CN      => zh-cn    (not in it)
		//
		// [*] eml/egl is the "exception that proves the rule": `egl`
		// *is* (prematurely?) defined as an internal code, but only
		// eml.wikipedia.org exists and it declares its language as
		// `eml`. For internal purposes `egl` should therefore map back
		// to `eml` until `eml` is deprecated, i.e. until an `eml => egl`
		// entry is added to DEPRECATED_LANGUAGE_CODE_MAPPING: T36217.
		//
		// DEPRECATED_LANGUAGE_CODE_MAPPING is deliberately *not* used
		// here: deprecated codes are no longer valid MediaWiki internal
		// codes and must never be returned.
		//
		// array_combine() keeps the last value for a repeated key, the
		// same outcome as assigning the entries one by one in order.
		$internalByBcp47 = array_combine(
			array_map( 'strtolower', array_values( self::NON_STANDARD_LANGUAGE_CODE_MAPPING ) ),
			array_keys( self::NON_STANDARD_LANGUAGE_CODE_MAPPING )
		);
	}

	// Internal codes are all lowercase, so lowercasing both makes the
	// lookup case-insensitive and yields the right fallback value.
	$lookupKey = strtolower( $code instanceof Bcp47Code ? $code->toBcp47Code() : $code );

	return $internalByBcp47[$lookupKey] ?? $lookupKey;
}
272
/**
 * Wrap a language code string in a Bcp47Code object, first translating
 * it if it is one of the known non-standard codes.
 *
 * This exists for backward compatibility: clients may have been sending
 * non-standards-compliant "mediawiki internal language codes". The
 * lookup is case-insensitive. Despite the method name, no warning is
 * emitted yet; the intent is to eventually log one when a non-standard
 * code is seen.
 *
 * @param string $code The language code supplied by the client
 * @return Bcp47Code The translated code, or $code itself if it is not a
 *  known non-standard code
 */
public static function normalizeNonstandardCodeAndWarn( string $code ): Bcp47Code {
	$legacyToBcp47 = self::getNonstandardLanguageCodeMapping();
	$lookupKey = strtolower( $code );

	// Unknown codes are passed through with their original casing.
	return new Bcp47CodeValue( $legacyToBcp47[$lookupKey] ?? $code );
}
297
/**
 * Returns true if a language code string is a well-formed language tag
 * according to RFC 5646.
 *
 * This is a purely syntactic check against a regular expression built
 * below; subtags are not looked up in any registry. Matching is
 * case-insensitive and the whole string must match, including any
 * trailing newline (which makes the tag ill-formed).
 *
 * @param string $code Language tag to check
 * @param bool $lenient When true, `_` is accepted as a subtag separator
 *  in addition to `-`
 * @return bool Whether $code is well-formed
 */
public static function isWellFormedLanguageTag( string $code, bool $lenient = false ): bool {
	$alpha = '[a-z]';
	$digit = '[0-9]';
	$alphanum = '[a-z0-9]';
	$x = 'x'; # private use singleton
	$singleton = '[a-wy-z]'; # other singleton
	$s = $lenient ? '[-_]' : '-';

	$language = "$alpha{2,8}|$alpha{2,3}$s$alpha{3}";
	$script = "$alpha{4}"; # ISO 15924
	$region = "(?:$alpha{2}|$digit{3})"; # ISO 3166-1 alpha-2 or UN M.49
	$variant = "(?:$alphanum{5,8}|$digit$alphanum{3})";
	$extension = "$singleton(?:$s$alphanum{2,8})+";
	$privateUse = "$x(?:$s$alphanum{1,8})+";

	# Define certain legacy language tags (marked as “Type: grandfathered” in BCP 47),
	# since otherwise the regex is pretty useless.
	# Since these are limited, this is safe even with later changes to the registry --
	# the only oddity is that it might change the type of the tag.
	# https://www.iana.org/assignments/language-subtag-registry

	$legacy = "en{$s}gb{$s}oed"
		. "|i{$s}(?:ami|bnn|default|enochian|hak|klingon|lux|mingo|navajo|pwn|tao|tay|tsu)"
		. "|no{$s}(?:bok|nyn)"
		. "|sgn{$s}(?:be{$s}(?:fr|nl)|ch{$s}de)"
		. "|zh{$s}min{$s}nan";

	$variantList = "$variant(?:$s$variant)*";
	$extensionList = "$extension(?:$s$extension)*";

	# $language contains a top-level alternation, so it has to be grouped.
	# Nothing reads the match groups, hence non-capturing.
	$langtag = "(?:(?:$language)"
		. "(?:$s$script)?"
		. "(?:$s$region)?"
		. "(?:$s$variantList)?"
		. "(?:$s$extensionList)?"
		. "(?:$s$privateUse)?)";

	# Here is the final breakdown.
	# The variants, extensions, legacy, and private-use may have interior '-'

	$root = "^(?:$langtag|$privateUse|$legacy)$";

	# D (PCRE_DOLLAR_ENDONLY): without it `$` also matches just before a
	# trailing newline, so "en\n" would be reported as well-formed.
	# preg_match() returns int|false; compare explicitly so the declared
	# bool return type does not depend on implicit coercion.
	return preg_match( "/$root/iD", $code ) === 1;
}
358}
359
// Also register this class under the un-namespaced global name
// 'LanguageCode' (presumably so that code written before the class was
// namespaced keeps working -- confirm before removing).
\class_alias( LanguageCode::class, 'LanguageCode' );
if(!defined('MW_SETUP_CALLBACK'))
Definition WebStart.php:81
Methods for dealing with language codes.
static getDeprecatedCodeMapping()
Returns a mapping of deprecated language codes that were used in previous versions of MediaWiki to up...
static getNonstandardLanguageCodeMapping()
Returns a mapping of non-standard language codes used by (current and previous version of) MediaWiki,...
static bcp47( $code)
Get the normalised IANA language tag. See unit test for examples.
static replaceDeprecatedCodes( $code)
Replace deprecated language codes that were used in previous versions of MediaWiki to up-to-date,...
static bcp47ToInternal( $code)
Convert standardized BCP 47 codes to the internal names used by MediaWiki and returned by Language::g...
static normalizeNonstandardCodeAndWarn(string $code)
We want to eventually require valid BCP-47 codes on HTTP and HTML APIs (where the standards require i...
static isWellFormedLanguageTag(string $code, bool $lenient=false)
Returns true if a language code string is a well-formed language tag according to RFC 5646.
Base class for language-specific code.
Definition Language.php:79