21use Wikimedia\Bcp47Code\Bcp47Code;
22use Wikimedia\Bcp47Code\Bcp47CodeValue;
/**
 * Deprecated language codes (array keys) mapped to the codes that replace
 * them (array values).
 *
 * Within this class the table is returned unchanged by the deprecated-code
 * mapping getter and is consulted by the
 * `DEPRECATED_LANGUAGE_CODE_MAPPING[$code] ?? $code` lookup, which leaves
 * any code that is not a key here untouched.
 */
42 private const DEPRECATED_LANGUAGE_CODE_MAPPING = [
47 'be-x-old' =>
'be-tarask',
50 'zh-classical' =>
'lzh',
51 'zh-min-nan' =>
'nan',
/**
 * Internal (non-standard) language codes, as array keys, mapped to the
 * BCP 47 form used for them, as array values.
 *
 * bcp47() substitutes the value for a matching (lower-cased) key before it
 * normalises the case of each segment; the reverse lookup in this class is
 * built from the same table by lower-casing the values and pointing them
 * back at the keys.
 */
81 private const NON_STANDARD_LANGUAGE_CODE_MAPPING = [
// Variants that are rewritten with a private-use "-x-" subtag.
88 'de-formal' =>
'de-x-formal',
90 'en-rtl' =>
'en-x-rtl',
91 'es-formal' =>
'es-x-formal',
92 'hu-formal' =>
'hu-x-formal',
93 'map-bms' =>
'jv-x-bms',
96 'nl-informal' =>
'nl-x-informal',
97 'roa-tara' =>
'nap-x-tara',
// Codes that map to a different primary language tag or gain a script subtag.
98 'simple' =>
'en-simple',
100 'sr-el' =>
'sr-Latn',
// language-region codes expanded with an explicit script subtag
// (Latn, Arab, Hans, Hant) between language and region.
111 'crh-ro' =>
'crh-Latn-RO',
112 'kk-cn' =>
'kk-Arab-CN',
113 'kk-tr' =>
'kk-Latn-TR',
114 'zh-cn' =>
'zh-Hans-CN',
115 'zh-sg' =>
'zh-Hans-SG',
116 'zh-my' =>
'zh-Hans-MY',
117 'zh-tw' =>
'zh-Hant-TW',
118 'zh-hk' =>
'zh-Hant-HK',
119 'zh-mo' =>
'zh-Hant-MO',
135 return self::DEPRECATED_LANGUAGE_CODE_MAPPING;
155 foreach ( self::DEPRECATED_LANGUAGE_CODE_MAPPING as $code => $ignore ) {
156 $result[$code] = self::bcp47( $code );
158 foreach ( self::NON_STANDARD_LANGUAGE_CODE_MAPPING as $code => $ignore ) {
159 $result[$code] = self::bcp47( $code );
175 return self::DEPRECATED_LANGUAGE_CODE_MAPPING[$code] ?? $code;
/**
 * Normalise a language code into its BCP 47 spelling.
 *
 * The code is lower-cased, deprecated codes are replaced, and a code listed
 * in NON_STANDARD_LANGUAGE_CODE_MAPPING is swapped for its mapped value. The
 * result is split on '-' and each segment is re-cased according to its
 * position and length, then the segments are joined with '-' again.
 *
 * @param string $code Language code to normalise (it is passed to
 *  strtolower(), so a string is expected)
 * @return string The '-'-joined, re-cased code
 */
188 public static function bcp47( $code ) {
// Lower-case first so that the lookups below, whose keys are lower-case, can match.
189 $code = self::replaceDeprecatedCodes( strtolower( $code ) );
190 if ( isset( self::NON_STANDARD_LANGUAGE_CODE_MAPPING[$code] ) ) {
191 $code = self::NON_STANDARD_LANGUAGE_CODE_MAPPING[$code];
193 $codeSegment = explode(
'-', $code );
195 foreach ( $codeSegment as $segNo => $seg ) {
// A segment that directly follows an "x" segment is kept lower-case
// (private-use part of the tag).
197 if ( $segNo > 0 && strtolower( $codeSegment[( $segNo - 1 )] ) ==
'x' ) {
198 $codeBCP[$segNo] = strtolower( $seg );
// Two-character segment after the first one: upper-cased
// (presumably a region code -- confirm).
200 } elseif ( ( strlen( $seg ) == 2 ) && ( $segNo > 0 ) ) {
201 $codeBCP[$segNo] = strtoupper( $seg );
// Four-character segment after the first one: first letter upper-case,
// rest lower-case (presumably a script code -- confirm).
203 } elseif ( ( strlen( $seg ) == 4 ) && ( $segNo > 0 ) ) {
204 $codeBCP[$segNo] = ucfirst( strtolower( $seg ) );
// NOTE(review): presumably the fallback arm for every remaining segment,
// including the first one -- confirm against the full source.
207 $codeBCP[$segNo] = strtolower( $seg );
210 return implode(
'-', $codeBCP );
233 return $code->getCode();
235 if ( $code instanceof Bcp47Code ) {
236 $code = $code->toBcp47Code();
238 static $invertedLookup = [];
239 if ( !$invertedLookup ) {
257 foreach ( self::NON_STANDARD_LANGUAGE_CODE_MAPPING as $internal => $bcp47 ) {
258 $invertedLookup[strtolower( $bcp47 )] = $internal;
266 $code = strtolower( $code );
267 return $invertedLookup[$code] ?? $code;
284 $compatMap = self::getNonstandardLanguageCodeMapping();
285 if ( isset( $compatMap[strtolower( $code )] ) ) {
290 $code = $compatMap[strtolower( $code )];
292 return new Bcp47CodeValue( $code );
313 $alphanum =
'[a-z0-9]';
314 $x =
'x'; #
# private use singleton
315 $singleton =
'[a-wy-z]'; # other singleton
316 $s = $lenient ?
'[-_]' :
'-';
318 $language =
"$alpha{2,8}|$alpha{2,3}$s$alpha{3}";
319 $script =
"$alpha{4}"; # ISO 15924
320 $region =
"(?:$alpha{2}|$digit{3})"; # ISO 3166-1 alpha-2 or UN M.49
321 $variant =
"(?:$alphanum{5,8}|$digit$alphanum{3})";
322 $extension =
"$singleton(?:$s$alphanum{2,8})+";
323 $privateUse =
"$x(?:$s$alphanum{1,8})+";
325 # Define certain legacy language tags (marked as “Type: grandfathered” in BCP 47),
326 # since otherwise the regex is pretty useless.
327 # Since these are limited, this is safe even with later changes to the registry --
328 # the only oddity is that it might change the type of the tag, and thus
329 # the results from the capturing groups.
332 $legacy =
"en{$s}gb{$s}oed"
333 .
"|i{$s}(?:ami|bnn|default|enochian|hak|klingon|lux|mingo|navajo|pwn|tao|tay|tsu)"
334 .
"|no{$s}(?:bok|nyn)"
335 .
"|sgn{$s}(?:be{$s}(?:fr|nl)|ch{$s}de)"
336 .
"|zh{$s}min{$s}nan";
338 $variantList =
"$variant(?:$s$variant)*";
339 $extensionList =
"$extension(?:$s$extension)*";
341 $langtag =
"(?:($language)"
344 .
"(?:$s$variantList)?"
345 .
"(?:$s$extensionList)?"
346 .
"(?:$s$privateUse)?)";
348 # Here is the final breakdown, with capturing groups for each of these components
349 # The variants, extensions, legacy, and private-use may have interior '-'
351 $root =
"^(?:$langtag|$privateUse|$legacy)$";
353 return preg_match(
"/$root/i", $code );
if(!defined('MW_SETUP_CALLBACK'))
Methods for dealing with language codes.
static getNonstandardLanguageCodeMapping()
Returns a mapping of non-standard language codes used by (current and previous version of) MediaWiki,...
static normalizeNonstandardCodeAndWarn(string $code)
We want to eventually require valid BCP-47 codes on HTTP and HTML APIs (where the standards require i...
static bcp47ToInternal( $code)
Convert standardized BCP 47 codes to the internal names used by MediaWiki and returned by Language::g...
static replaceDeprecatedCodes( $code)
Replace deprecated language codes that were used in previous versions of MediaWiki to up-to-date,...
static getDeprecatedCodeMapping()
Returns a mapping of deprecated language codes that were used in previous versions of MediaWiki to up...
static bcp47( $code)
Get the normalised IANA language tag. See unit test for examples.
static isWellFormedLanguageTag(string $code, bool $lenient=false)
Returns true if a language code string is a well-formed language tag according to RFC 5646.
Base class for language-specific code.