// Version component of the first-letter cache key: it is passed to makeKey()
// in getFirstLetterData(), so a different value yields a different key.
29 private const FIRST_LETTER_VERSION = 4;
// Collator created from the locale and set to Collator::PRIMARY strength;
// getPrimarySortKey() delegates to it.
32 private $primaryCollator;
// Collator created from the locale; its getSortKey() and compare() results
// are used for full sort keys and for resolving primary-key collisions.
35 private $mainCollator;
// Switched to true when the locale string ends with "-u-kn".
44 private $useNumericCollation =
false;
// Per-instance memo of the first-letter data; getFirstLetterData() fills it
// while it is still null.
47 private $firstLetterData;
// List of code point blocks scanned by isCjk(), which reads index 0 and
// index 1 of each entry as the inclusive lower and upper bound of a block.
58 private const CJK_BLOCKS = [
// Per-locale lists of additional first letters, keyed by locale string.
// fetchFirstLetterData() appends the list stored under the current locale to
// the root letter list, and removes the list stored under "-" followed by the
// locale when such a key exists.
97 private const TAILORING_FIRST_LETTERS = [
101 'as' => [
"\u{0982}",
"\u{0981}",
"\u{0983}",
"\u{09CE}",
"ক্ষ " ],
102 'ast' => [
"Ch",
"Ll",
"Ñ" ],
103 'az' => [
"Ç",
"Ə",
"Ğ",
"İ",
"Ö",
"Ş",
"Ü" ],
105 'be-tarask' => [
"Ё" ],
107 'bn' => [
'ং',
'ঃ',
'ঁ' ],
108 'bn@collation=traditional' => [
109 'ং',
'ঃ',
'ঁ',
'ক্',
'খ্',
'গ্',
'ঘ্',
'ঙ্',
'চ্',
'ছ্',
'জ্',
'ঝ্',
110 'ঞ্',
'ট্',
'ঠ্',
'ড্',
'ঢ্',
'ণ্',
'ৎ',
'থ্',
'দ্',
'ধ্',
'ন্',
'প্',
111 'ফ্',
'ব্',
'ভ্',
'ম্',
'য্',
'র্',
'ৰ্',
'ল্',
'ৱ্',
'শ্',
'ষ্',
'স্',
'হ্'
114 'br' => [
"Ch",
"C'h" ],
115 'bs' => [
"Č",
"Ć",
"Dž",
"Đ",
"Lj",
"Nj",
"Š",
"Ž" ],
120 'cs' => [
"Č",
"Ch",
"Ř",
"Š",
"Ž" ],
121 'cy' => [
"Ch",
"Dd",
"Ff",
"Ng",
"Ll",
"Ph",
"Rh",
"Th" ],
122 'da' => [
"Æ",
"Ø",
"Å" ],
124 'de-AT@collation=phonebook' => [
'ä',
'ö',
'ü',
'ß' ],
125 'dsb' => [
"Č",
"Ć",
"Dź",
"Ě",
"Ch",
"Ł",
"Ń",
"Ŕ",
"Š",
"Ś",
"Ž",
"Ź" ],
126 'ee' => [
"Dz",
"Ɖ",
"Ɛ",
"Ƒ",
"Gb",
"Ɣ",
"Kp",
"Ny",
"Ŋ",
"Ɔ",
"Ts",
"Ʋ" ],
129 'eo' => [
"Ĉ",
"Ĝ",
"Ĥ",
"Ĵ",
"Ŝ",
"Ŭ" ],
131 'et' => [
"Š",
"Ž",
"Õ",
"Ä",
"Ö",
"Ü" ],
141 'fi' => [
"Å",
"Ä",
"Ö" ],
142 'fil' => [
"Ñ",
"Ng" ],
143 'fo' => [
"Á",
"Ð",
"Í",
"Ó",
"Ú",
"Ý",
"Æ",
"Ø",
"Å" ],
146 'fur' => [
"À",
"Á",
"Â",
"È",
"Ì",
"Ò",
"Ù" ],
150 'gl' => [
"Ch",
"Ll",
"Ñ" ],
151 'gu' => [
"\u{0A82}",
"\u{0A83}",
"\u{0A81}",
"\u{0AB3}" ],
152 'ha' => [
'Ɓ',
'Ɗ',
'Ƙ',
'Sh',
'Ts',
'Ƴ' ],
155 'hi' => [
"\u{0902}",
"\u{0903}" ],
156 'hr' => [
"Č",
"Ć",
"Dž",
"Đ",
"Lj",
"Nj",
"Š",
"Ž" ],
157 'hsb' => [
"Č",
"Dź",
"Ě",
"Ch",
"Ł",
"Ń",
"Ř",
"Š",
"Ć",
"Ž" ],
158 'hu' => [
"Cs",
"Dz",
"Dzs",
"Gy",
"Ly",
"Ny",
"Ö",
"Sz",
"Ty",
"Ü",
"Zs" ],
161 'ig' => [
"Ch",
"Gb",
"Gh",
"Gw",
"Ị",
"Kp",
"Kw",
"Ṅ",
"Nw",
"Ny",
"Ọ",
"Sh",
"Ụ" ],
162 'is' => [
"Á",
"Ð",
"É",
"Í",
"Ó",
"Ú",
"Ý",
"Þ",
"Æ",
"Ö",
"Å" ],
165 'kk' => [
"Ё",
"Ү",
"І" ],
166 'kl' => [
"Æ",
"Ø",
"Å" ],
168 "រ",
"ឫ",
"ឬ",
"ល",
"ឭ",
"ឮ",
"\u{17BB}\u{17C6}",
169 "\u{17C6}",
"\u{17B6}\u{17C6}",
"\u{17C7}",
170 "\u{17B7}\u{17C7}",
"\u{17BB}\u{17C7}",
171 "\u{17C1}\u{17C7}",
"\u{17C4}\u{17C7}",
173 'kn' => [
"\u{0C81}",
"\u{0C83}",
"\u{0CF1}",
"\u{0CF2}" ],
174 'kok' => [
"\u{0902}",
"\u{0903}",
"ळ",
"क्ष" ],
175 'ku' => [
"Ç",
"Ê",
"Î",
"Ş",
"Û" ],
179 'lkt' => [
'Č',
'Ǧ',
'Ȟ',
'Š',
'Ž' ],
182 'lt' => [
"Č",
"Š",
"Ž" ],
183 'lv' => [
"Č",
"Ģ",
"Ķ",
"Ļ",
"Ņ",
"Š",
"Ž" ],
184 'mk' => [
"Ѓ",
"Ќ" ],
187 'mo' => [
"Ă",
"Â",
"Î",
"Ș",
"Ț" ],
188 'mr' => [
"\u{0902}",
"\u{0903}",
"ळ",
"क्ष",
"ज्ञ" ],
190 'mt' => [
"Ċ",
"Ġ",
"Għ",
"Ħ",
"Ż" ],
191 'nb' => [
"Æ",
"Ø",
"Å" ],
194 'nn' => [
"Æ",
"Ø",
"Å" ],
195 'no' => [
"Æ",
"Ø",
"Å" ],
197 'om' => [
'Ch',
'Dh',
'Kh',
'Ny',
'Ph',
'Sh' ],
198 'or' => [
"\u{0B01}",
"\u{0B02}",
"\u{0B03}",
"କ୍ଷ" ],
199 'pa' => [
"\u{0A4D}" ],
200 'pl' => [
"Ą",
"Ć",
"Ę",
"Ł",
"Ń",
"Ó",
"Ś",
"Ź",
"Ż" ],
203 'ro' => [
"Ă",
"Â",
"Î",
"Ș",
"Ț" ],
205 'rup' => [
"Ă",
"Â",
"Î",
"Ľ",
"Ń",
"Ș",
"Ț" ],
208 'Á',
'Č',
'Ʒ',
'Ǯ',
'Đ',
'Ǧ',
'Ǥ',
'Ǩ',
'Ŋ',
209 'Š',
'Ŧ',
'Ž',
'Ø',
'Æ',
'Ȧ',
'Ä',
'Ö'
211 'si' => [
"\u{0D82}",
"\u{0D83}",
"\u{0DA4}" ],
212 'sk' => [
"Ä",
"Č",
"Ch",
"Ô",
"Š",
"Ž" ],
213 'sl' => [
"Č",
"Š",
"Ž" ],
214 'smn' => [
"Á",
"Č",
"Đ",
"Ŋ",
"Š",
"Ŧ",
"Ž",
"Æ",
"Ø",
"Å",
"Ä",
"Ö" ],
215 'sq' => [
"Ç",
"Dh",
"Ë",
"Gj",
"Ll",
"Nj",
"Rr",
"Sh",
"Th",
"Xh",
"Zh" ],
217 'sr-Latn' => [
"Č",
"Ć",
"Dž",
"Đ",
"Lj",
"Nj",
"Š",
"Ž" ],
218 'sv' => [
"Å",
"Ä",
"Ö" ],
219 'sv@collation=standard' => [
"Å",
"Ä",
"Ö" ],
222 "\u{0B82}",
"ஃ",
"க்ஷ",
"க்",
"ங்",
"ச்",
"ஞ்",
"ட்",
"ண்",
"த்",
"ந்",
223 "ப்",
"ம்",
"ய்",
"ர்",
"ல்",
"வ்",
"ழ்",
"ள்",
"ற்",
"ன்",
"ஜ்",
"ஶ்",
"ஷ்",
226 'te' => [
"\u{0C01}",
"\u{0C02}",
"\u{0C03}" ],
227 'th' => [
"ฯ",
"\u{0E46}",
"\u{0E4D}",
"\u{0E3A}" ],
228 'tk' => [
"Ç",
"Ä",
"Ž",
"Ň",
"Ö",
"Ş",
"Ü",
"Ý" ],
229 'tl' => [
"Ñ",
"Ng" ],
230 'to' => [
"Ng",
"ʻ" ],
231 'tr' => [
"Ç",
"Ğ",
"İ",
"Ö",
"Ş",
"Ü" ],
233 'tt' => [
"Ә",
"Ө",
"Ү",
"Җ",
"Ң",
"Һ" ],
234 'uk' => [
"Ґ",
"Ь" ],
235 'uz' => [
"Ch",
"G'",
"Ng",
"O'",
"Sh" ],
236 'vi' => [
"Ă",
"Â",
"Đ",
"Ê",
"Ô",
"Ơ",
"Ư" ],
237 'vo' => [
"Ä",
"Ö",
"Ü" ],
239 "\u{05D1}\u{05BF}",
"\u{05DB}\u{05BC}",
"\u{05E4}\u{05BC}",
240 "\u{05E9}\u{05C2}",
"\u{05EA}\u{05BC}"
242 'yo' => [
"Ẹ",
"Gb",
"Ọ",
"Ṣ" ],
// Keep the locale string as given; it is shortened further down when it
// carries the "-u-kn" suffix.
254 $this->locale = $locale;
// The part before the first "@" selects the language used for digit
// transformation, except that the "root" locale uses "en".
256 $localeParts = explode(
'@', $locale );
257 $this->digitTransformLanguage = $languageFactory->
getLanguage( $locale ===
'root' ?
'en' : $localeParts[0] );
// A falsy result from Collator::create() is reported as an invalid locale.
259 $mainCollator = Collator::create( $locale );
260 if ( !$mainCollator ) {
261 throw new InvalidArgumentException(
"Invalid ICU locale specified for collation: $locale" );
263 $this->mainCollator = $mainCollator;
// Second collator for the same locale, restricted to primary strength.
266 $this->primaryCollator = Collator::create( $locale );
267 $this->primaryCollator->setStrength( Collator::PRIMARY );
// A trailing "-u-kn" enables numeric collation on both collators; the
// five-character suffix is dropped from the stored locale.
270 if ( str_ends_with( $locale,
'-u-kn' ) ) {
271 $this->useNumericCollation =
true;
273 $this->locale = substr( $this->locale, 0, -5 );
274 $this->mainCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
275 $this->primaryCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
// Sort key computed by the main collator (the one that is not restricted to
// primary strength).
280 return $this->mainCollator->getSortKey( $string );
// Work on a string regardless of the type that was passed in.
284 $string = strval( $string );
// The empty string is special-cased.
285 if ( $string ===
'' ) {
// First character of the input, counted in UTF-8 characters rather than bytes.
289 $firstChar = mb_substr( $string, 0, 1,
'UTF-8' );
// Only a character whose first byte is above 0x7f (non-ASCII) is tested
// against the CJK blocks.
292 if ( ord( $firstChar ) > 0x7f && self::isCjk( mb_ord( $firstChar ) ) ) {
// Primary-strength key of the input, plus the parallel lists of known
// first-letter keys and the letters they belong to.
296 $sortKey = $this->getPrimarySortKey( $string );
297 $data = $this->getFirstLetterData();
298 $keys = $data[
'keys'];
299 $letters = $data[
'chars'];
// Lower-bound search over the first-letter keys; the value callback returns
// the key stored at a given index.
302 $min = ArrayUtils::findLowerBound(
303 static function ( $index ) use ( $keys ) {
304 return $keys[$index];
// A result of false from the search is handled separately.
310 if ( $min ===
false ) {
// The letter at the found position becomes the sort letter.
315 $sortLetter = $letters[$min];
317 if ( $this->useNumericCollation ) {
// 48 and 57 are the byte values of ASCII "0" and "9": a sort letter whose
// first byte is an ASCII digit is replaced by the text of the
// category-header-numerals message with the numeric parameters 0 and 9.
322 if ( ord( $sortLetter ) >= 48 && ord( $sortLetter ) <= 57 ) {
323 $sortLetter =
wfMessage(
'category-header-numerals' )->numParams( 0, 9 )->text();
/**
 * Get the sort key of a string as computed by the primary-strength collator.
 *
 * @param string $string Text to compute the key for
 * @return string|false Whatever Collator::getSortKey() returns for the input
 */
329 private function getPrimarySortKey( $string ) {
330 return $this->primaryCollator->getSortKey( $string );
/**
 * Return the first-letter data, loading it on first use.
 *
 * The value is memoized in $this->firstLetterData, so the object cache is
 * consulted only while that property is still null.
 *
 * @return array Value obtained from the cache; callers read its "keys" and
 *  "chars" entries
 */
337 private function getFirstLetterData() {
338 if ( $this->firstLetterData ===
null ) {
339 $cache = MediaWikiServices::getInstance()->getObjectCacheFactory()
// The cache key includes the digit-transform language code and
// FIRST_LETTER_VERSION.
341 $cacheKey = $cache->makeKey(
345 $this->digitTransformLanguage->getCode(),
347 self::FIRST_LETTER_VERSION
// TTL_WEEK is passed as the expiry; the callback rebuilds the data through
// fetchFirstLetterData().
349 $this->firstLetterData = $cache->getWithSetCallback( $cacheKey, $cache::TTL_WEEK,
function () {
350 return $this->fetchFirstLetterData();
353 return $this->firstLetterData;
/**
 * Build the first-letter data for the current locale.
 *
 * Starts from the letter list in data/first-letters-root.php, applies the
 * tailoring registered for the locale (if any), indexes the letters by their
 * primary sort key and removes keys collected as duplicate prefixes.
 *
 * @return array Contains "chars" (letters) and "keys" (their sort keys) as
 *  parallel lists taken from the same map
 * @throws RuntimeException With an "unsupported ICU locale" message;
 *  NOTE(review): presumably raised when the locale has no tailoring entry
 *  and is not "root" - confirm against the guarding branch
 */
359 private function fetchFirstLetterData() {
361 if ( isset( self::TAILORING_FIRST_LETTERS[$this->locale] ) ) {
// Tailored locale: start from the root letter list and append the
// additional letters registered for the locale.
362 $letters = require __DIR__ .
"/data/first-letters-root.php";
364 $letters = array_merge( $letters, self::TAILORING_FIRST_LETTERS[$this->locale] );
// An entry keyed by "-" followed by the locale lists letters to take out again.
366 if ( isset( self::TAILORING_FIRST_LETTERS[
'-' . $this->locale] ) ) {
367 $letters = array_diff( $letters, self::TAILORING_FIRST_LETTERS[
'-' . $this->locale] );
// Replace the ASCII digits with the forms that the digit-transform language
// produces through formatNumNoSeparators().
370 $digits = [
'0',
'1',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9' ];
371 $letters = array_diff( $letters, $digits );
372 foreach ( $digits as $digit ) {
373 $letters[] = $this->digitTransformLanguage->formatNumNoSeparators( $digit );
375 } elseif ( $this->locale ===
'root' ) {
// The root locale uses the root letter list as loaded.
376 $letters = require __DIR__ .
"/data/first-letters-root.php";
378 throw new RuntimeException(
"MediaWiki does not support ICU locale " .
379 "\"{$this->locale}\"" );
// Index the letters by primary sort key. Two letters that share a key
// collide; the collision is compared with the main collator and logged.
392 foreach ( $letters as $letter ) {
393 $key = $this->getPrimarySortKey( $letter );
394 if ( isset( $letterMap[$key] ) ) {
397 $comp = $this->mainCollator->compare( $letter, $letterMap[$key] );
398 wfDebug(
"Primary collision '$letter' '{$letterMap[$key]}' (comparison: $comp)" );
// Code point comparison of the two letters. NOTE(review): presumably a
// tiebreaker for an equal collator comparison - confirm against the
// guarding condition.
401 $comp = mb_ord( $letter ) <=> mb_ord( $letterMap[$key] );
404 $letterMap[$key] = $letter;
407 $letterMap[$key] = $letter;
// Order the map by sort key, comparing the keys as strings.
410 ksort( $letterMap, SORT_STRING );
// Collect every key that starts with the non-empty value held in $prev.
447 $duplicatePrefixes = [];
448 foreach ( $letterMap as $key => $value ) {
455 if ( $prev !==
'' && str_starts_with( $key, $prev ) ) {
456 $duplicatePrefixes[] = $key;
// Drop the collected entries, logging each removed letter.
464 foreach ( $duplicatePrefixes as $badKey ) {
465 wfDebug(
"Removing '{$letterMap[$badKey]}' from first letters." );
466 unset( $letterMap[$badKey] );
// Letters and their keys come from the same map, so index i of one list
// corresponds to index i of the other.
470 'chars' => array_values( $letterMap ),
471 'keys' => array_keys( $letterMap ),
/**
 * Test a code point against the ranges listed in CJK_BLOCKS.
 *
 * Each block is read as a pair: index 0 is the lower bound and index 1 the
 * upper bound, both inclusive.
 *
 * @param int $codepoint Code point to test
 * @return bool NOTE(review): presumably true when some block contains the
 *  code point and false otherwise - confirm against the return statements
 */
481 public static function isCjk( $codepoint ) {
482 foreach ( self::CJK_BLOCKS as $block ) {
483 if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) {
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.