21use Wikimedia\Bcp47Code\Bcp47Code;
22use Wikimedia\Bcp47Code\Bcp47CodeValue;
42 private const DEPRECATED_LANGUAGE_CODE_MAPPING = [
47 'be-x-old' =>
'be-tarask',
50 'zh-classical' =>
'lzh',
51 'zh-min-nan' =>
'nan',
81 private const NON_STANDARD_LANGUAGE_CODE_MAPPING = [
88 'de-formal' =>
'de-x-formal',
90 'en-rtl' =>
'en-x-rtl',
91 'es-formal' =>
'es-x-formal',
92 'hu-formal' =>
'hu-x-formal',
93 'map-bms' =>
'jv-x-bms',
96 'nl-informal' =>
'nl-x-informal',
97 'roa-tara' =>
'nap-x-tara',
98 'simple' =>
'en-simple',
100 'sr-el' =>
'sr-Latn',
111 'crh-ro' =>
'crh-Latn-RO',
112 'kk-cn' =>
'kk-Arab-CN',
113 'kk-tr' =>
'kk-Latn-TR',
114 'zh-cn' =>
'zh-Hans-CN',
115 'zh-sg' =>
'zh-Hans-SG',
116 'zh-my' =>
'zh-Hans-MY',
117 'zh-tw' =>
'zh-Hant-TW',
118 'zh-hk' =>
'zh-Hant-HK',
119 'zh-mo' =>
'zh-Hant-MO',
136 return self::DEPRECATED_LANGUAGE_CODE_MAPPING;
156 foreach ( self::DEPRECATED_LANGUAGE_CODE_MAPPING as $code => $ignore ) {
157 $result[$code] = self::bcp47( $code );
159 foreach ( self::NON_STANDARD_LANGUAGE_CODE_MAPPING as $code => $ignore ) {
160 $result[$code] = self::bcp47( $code );
176 return self::DEPRECATED_LANGUAGE_CODE_MAPPING[$code] ?? $code;
189 public static function bcp47( $code ) {
190 $code = self::replaceDeprecatedCodes( strtolower( $code ) );
191 if ( isset( self::NON_STANDARD_LANGUAGE_CODE_MAPPING[$code] ) ) {
192 $code = self::NON_STANDARD_LANGUAGE_CODE_MAPPING[$code];
194 $codeSegment = explode(
'-', $code );
196 foreach ( $codeSegment as $segNo => $seg ) {
198 if ( $segNo > 0 && strtolower( $codeSegment[( $segNo - 1 )] ) ==
'x' ) {
199 $codeBCP[$segNo] = strtolower( $seg );
201 } elseif ( ( strlen( $seg ) == 2 ) && ( $segNo > 0 ) ) {
202 $codeBCP[$segNo] = strtoupper( $seg );
204 } elseif ( ( strlen( $seg ) == 4 ) && ( $segNo > 0 ) ) {
205 $codeBCP[$segNo] = ucfirst( strtolower( $seg ) );
208 $codeBCP[$segNo] = strtolower( $seg );
211 return implode(
'-', $codeBCP );
234 return $code->getCode();
236 if ( $code instanceof Bcp47Code ) {
237 $code = $code->toBcp47Code();
239 static $invertedLookup = [];
240 if ( !$invertedLookup ) {
258 foreach ( self::NON_STANDARD_LANGUAGE_CODE_MAPPING as $internal => $bcp47 ) {
259 $invertedLookup[strtolower( $bcp47 )] = $internal;
267 $code = strtolower( $code );
268 return $invertedLookup[$code] ?? $code;
285 $compatMap = self::getNonstandardLanguageCodeMapping();
286 if ( isset( $compatMap[strtolower( $code )] ) ) {
291 $code = $compatMap[strtolower( $code )];
293 return new Bcp47CodeValue( $code );
314 $alphanum =
'[a-z0-9]';
315 $x =
'x'; # private use singleton
316 $singleton =
'[a-wy-z]'; # other singleton
317 $s = $lenient ?
'[-_]' :
'-';
319 $language =
"$alpha{2,8}|$alpha{2,3}$s$alpha{3}";
320 $script =
"$alpha{4}"; # ISO 15924
321 $region =
"(?:$alpha{2}|$digit{3})"; # ISO 3166-1 alpha-2 or UN M.49
322 $variant =
"(?:$alphanum{5,8}|$digit$alphanum{3})";
323 $extension =
"$singleton(?:$s$alphanum{2,8})+";
324 $privateUse =
"$x(?:$s$alphanum{1,8})+";
326 # Define certain legacy language tags (marked as “Type: grandfathered” in BCP 47),
327 # since otherwise the regex is pretty useless.
328 # Since these are limited, this is safe even with later changes to the registry --
329 # the only oddity is that it might change the type of the tag, and thus
330 # the results from the capturing groups.
333 $legacy =
"en{$s}gb{$s}oed"
334 .
"|i{$s}(?:ami|bnn|default|enochian|hak|klingon|lux|mingo|navajo|pwn|tao|tay|tsu)"
335 .
"|no{$s}(?:bok|nyn)"
336 .
"|sgn{$s}(?:be{$s}(?:fr|nl)|ch{$s}de)"
337 .
"|zh{$s}min{$s}nan";
339 $variantList =
"$variant(?:$s$variant)*";
340 $extensionList =
"$extension(?:$s$extension)*";
342 $langtag =
"(?:($language)"
345 .
"(?:$s$variantList)?"
346 .
"(?:$s$extensionList)?"
347 .
"(?:$s$privateUse)?)";
349 # Here is the final breakdown, with capturing groups for each of these components
350 # The variants, extensions, legacy, and private-use may have interior '-'
352 $root =
"^(?:$langtag|$privateUse|$legacy)$";
354 return preg_match(
"/$root/i", $code );
if(!defined('MW_SETUP_CALLBACK'))
Methods for dealing with language codes.
static getNonstandardLanguageCodeMapping()
Returns a mapping of non-standard language codes used by (current and previous versions of) MediaWiki,...
static normalizeNonstandardCodeAndWarn(string $code)
We want to eventually require valid BCP-47 codes on HTTP and HTML APIs (where the standards require i...
static bcp47ToInternal( $code)
Convert standardized BCP 47 codes to the internal names used by MediaWiki and returned by Language::g...
static replaceDeprecatedCodes( $code)
Replace deprecated language codes that were used in previous versions of MediaWiki to up-to-date,...
static getDeprecatedCodeMapping()
Returns a mapping of deprecated language codes that were used in previous versions of MediaWiki to up...
static bcp47( $code)
Get the normalised IANA language tag. See unit test for examples.
static isWellFormedLanguageTag(string $code, bool $lenient=false)
Returns true if a language code string is a well-formed language tag according to RFC 5646.
Base class for language-specific code.