21use Wikimedia\Bcp47Code\Bcp47Code;
22use Wikimedia\Bcp47Code\Bcp47CodeValue;
42 private const DEPRECATED_LANGUAGE_CODE_MAPPING = [
47 'be-x-old' =>
'be-tarask',
50 'zh-classical' =>
'lzh',
51 'zh-min-nan' =>
'nan',
81 private const NON_STANDARD_LANGUAGE_CODE_MAPPING = [
88 'de-formal' =>
'de-x-formal',
90 'en-rtl' =>
'en-x-rtl',
91 'es-formal' =>
'es-x-formal',
92 'hu-formal' =>
'hu-x-formal',
93 'map-bms' =>
'jv-x-bms',
96 'nl-informal' =>
'nl-x-informal',
97 'roa-tara' =>
'nap-x-tara',
98 'simple' =>
'en-simple',
100 'sr-el' =>
'sr-Latn',
111 'kk-cn' =>
'kk-Arab-CN',
112 'kk-tr' =>
'kk-Latn-TR',
113 'zh-cn' =>
'zh-Hans-CN',
114 'zh-sg' =>
'zh-Hans-SG',
115 'zh-my' =>
'zh-Hans-MY',
116 'zh-tw' =>
'zh-Hant-TW',
117 'zh-hk' =>
'zh-Hant-HK',
118 'zh-mo' =>
'zh-Hant-MO',
134 return self::DEPRECATED_LANGUAGE_CODE_MAPPING;
154 foreach ( self::DEPRECATED_LANGUAGE_CODE_MAPPING as $code => $ignore ) {
155 $result[$code] = self::bcp47( $code );
157 foreach ( self::NON_STANDARD_LANGUAGE_CODE_MAPPING as $code => $ignore ) {
158 $result[$code] = self::bcp47( $code );
174 return self::DEPRECATED_LANGUAGE_CODE_MAPPING[$code] ?? $code;
187 public static function bcp47( $code ) {
188 $code = self::replaceDeprecatedCodes( strtolower( $code ) );
189 if ( isset( self::NON_STANDARD_LANGUAGE_CODE_MAPPING[$code] ) ) {
190 $code = self::NON_STANDARD_LANGUAGE_CODE_MAPPING[$code];
192 $codeSegment = explode(
'-', $code );
194 foreach ( $codeSegment as $segNo => $seg ) {
196 if ( $segNo > 0 && strtolower( $codeSegment[( $segNo - 1 )] ) ==
'x' ) {
197 $codeBCP[$segNo] = strtolower( $seg );
199 } elseif ( ( strlen( $seg ) == 2 ) && ( $segNo > 0 ) ) {
200 $codeBCP[$segNo] = strtoupper( $seg );
202 } elseif ( ( strlen( $seg ) == 4 ) && ( $segNo > 0 ) ) {
203 $codeBCP[$segNo] = ucfirst( strtolower( $seg ) );
206 $codeBCP[$segNo] = strtolower( $seg );
209 $langCode = implode(
'-', $codeBCP );
232 static $invertedLookup = [];
233 if ( !$invertedLookup ) {
251 foreach ( self::NON_STANDARD_LANGUAGE_CODE_MAPPING as $internal => $bcp47 ) {
252 $invertedLookup[strtolower( $bcp47 )] = $internal;
260 $code = strtolower( $code );
261 return $invertedLookup[$code] ?? $code;
277 $compatMap = self::getNonstandardLanguageCodeMapping();
278 if ( isset( $compatMap[strtolower( $code )] ) ) {
283 $code = $compatMap[strtolower( $code )];
285 return new Bcp47CodeValue( $code );
306 $alphanum =
'[a-z0-9]';
307 $x =
'x'; # private use singleton
308 $singleton =
'[a-wy-z]'; # other singleton
309 $s = $lenient ?
'[-_]' :
'-';
311 $language =
"$alpha{2,8}|$alpha{2,3}$s$alpha{3}";
312 $script =
"$alpha{4}"; # ISO 15924
313 $region =
"(?:$alpha{2}|$digit{3})"; # ISO 3166-1 alpha-2 or UN M.49
314 $variant =
"(?:$alphanum{5,8}|$digit$alphanum{3})";
315 $extension =
"$singleton(?:$s$alphanum{2,8})+";
316 $privateUse =
"$x(?:$s$alphanum{1,8})+";
318 # Define certain legacy language tags (marked as “Type: grandfathered” in BCP 47),
319 # since otherwise the regex is pretty useless.
320 # Since these are limited, this is safe even with later changes to the registry --
321 # the only oddity is that it might change the type of the tag, and thus
322 # the results from the capturing groups.
325 $legacy =
"en{$s}GB{$s}oed"
326 .
"|i{$s}(?:ami|bnn|default|enochian|hak|klingon|lux|mingo|navajo|pwn|tao|tay|tsu)"
327 .
"|no{$s}(?:bok|nyn)"
328 .
"|sgn{$s}(?:BE{$s}(?:fr|nl)|CH{$s}de)"
329 .
"|zh{$s}min{$s}nan";
331 $variantList =
"$variant(?:$s$variant)*";
332 $extensionList =
"$extension(?:$s$extension)*";
334 $langtag =
"(?:($language)"
337 .
"(?:$s$variantList)?"
338 .
"(?:$s$extensionList)?"
339 .
"(?:$s$privateUse)?)";
341 # Here is the final breakdown, with capturing groups for each of these components
342 # The variants, extensions, legacy, and private-use may have interior '-'
344 $root =
"^(?:$langtag|$privateUse|$legacy)$";
346 return preg_match(
"/$root/", strtolower( $code ) );
Methods for dealing with language codes.
static getNonstandardLanguageCodeMapping()
Returns a mapping of non-standard language codes used by (current and previous version of) MediaWiki,...
static normalizeNonstandardCodeAndWarn(string $code)
We want to eventually require valid BCP-47 codes on HTTP and HTML APIs (where the standards require i...
static replaceDeprecatedCodes( $code)
Replace deprecated language codes that were used in previous versions of MediaWiki to up-to-date,...
static getDeprecatedCodeMapping()
Returns a mapping of deprecated language codes that were used in previous versions of MediaWiki to up...
static bcp47( $code)
Get the normalised IANA language tag. See unit test for examples.
static bcp47ToInternal(string $code)
Convert standardized BCP 47 codes to the internal names used by MediaWiki and returned by Language::g...
static isWellFormedLanguageTag(string $code, bool $lenient=false)
Returns true if a language code string is a well-formed language tag according to RFC 5646.