// Version number folded into the first-letter cache key built in getFirstLetterData();
// changing it therefore changes the cache key.
28 private const FIRST_LETTER_VERSION = 4;
// Collator set to Collator::PRIMARY strength; getPrimarySortKey() delegates to it.
31 private $primaryCollator;
// Collator created from the requested locale; used for regular sort keys and for the
// collision comparison logged in fetchFirstLetterData().
34 private $mainCollator;
// Becomes true when the locale string ends with '-u-kn'.
43 private $useNumericCollation =
false;
// Memoized first-letter table; stays null until getFirstLetterData() fills it.
46 private $firstLetterData;
// Code point ranges scanned by isCjk(); each entry is read as [ first, last ], both inclusive.
// NOTE(review): the range entries themselves are not present in this excerpt.
57 private const CJK_BLOCKS = [
// Per-locale first-letter tailoring. fetchFirstLetterData() appends the list stored under the
// locale name to the root letter list and removes any letters listed under '-' . locale.
// NOTE(review): this excerpt appears to drop some original lines of the literal (a few
// locale-key lines and closing brackets), so the array is not complete as shown.
96 private const TAILORING_FIRST_LETTERS = [
100 'as' => [
"\u{0982}",
"\u{0981}",
"\u{0983}",
"\u{09CE}",
"ক্ষ " ],
101 'ast' => [
"Ch",
"Ll",
"Ñ" ],
102 'az' => [
"Ç",
"Ə",
"Ğ",
"İ",
"Ö",
"Ş",
"Ü" ],
104 'be-tarask' => [
"Ё" ],
106 'bn' => [
'ং',
'ঃ',
'ঁ' ],
107 'bn@collation=traditional' => [
108 'ং',
'ঃ',
'ঁ',
'ক্',
'খ্',
'গ্',
'ঘ্',
'ঙ্',
'চ্',
'ছ্',
'জ্',
'ঝ্',
109 'ঞ্',
'ট্',
'ঠ্',
'ড্',
'ঢ্',
'ণ্',
'ৎ',
'থ্',
'দ্',
'ধ্',
'ন্',
'প্',
110 'ফ্',
'ব্',
'ভ্',
'ম্',
'য্',
'র্',
'ৰ্',
'ল্',
'ৱ্',
'শ্',
'ষ্',
'স্',
'হ্'
113 'br' => [
"Ch",
"C'h" ],
114 'bs' => [
"Č",
"Ć",
"Dž",
"Đ",
"Lj",
"Nj",
"Š",
"Ž" ],
119 'cs' => [
"Č",
"Ch",
"Ř",
"Š",
"Ž" ],
120 'cy' => [
"Ch",
"Dd",
"Ff",
"Ng",
"Ll",
"Ph",
"Rh",
"Th" ],
121 'da' => [
"Æ",
"Ø",
"Å" ],
123 'de-AT@collation=phonebook' => [
'ä',
'ö',
'ü',
'ß' ],
124 'dsb' => [
"Č",
"Ć",
"Dź",
"Ě",
"Ch",
"Ł",
"Ń",
"Ŕ",
"Š",
"Ś",
"Ž",
"Ź" ],
125 'ee' => [
"Dz",
"Ɖ",
"Ɛ",
"Ƒ",
"Gb",
"Ɣ",
"Kp",
"Ny",
"Ŋ",
"Ɔ",
"Ts",
"Ʋ" ],
128 'eo' => [
"Ĉ",
"Ĝ",
"Ĥ",
"Ĵ",
"Ŝ",
"Ŭ" ],
130 'et' => [
"Š",
"Ž",
"Õ",
"Ä",
"Ö",
"Ü" ],
140 'fi' => [
"Å",
"Ä",
"Ö" ],
141 'fil' => [
"Ñ",
"Ng" ],
142 'fo' => [
"Á",
"Ð",
"Í",
"Ó",
"Ú",
"Ý",
"Æ",
"Ø",
"Å" ],
145 'fur' => [
"À",
"Á",
"Â",
"È",
"Ì",
"Ò",
"Ù" ],
149 'gl' => [
"Ch",
"Ll",
"Ñ" ],
150 'gu' => [
"\u{0A82}",
"\u{0A83}",
"\u{0A81}",
"\u{0AB3}" ],
151 'ha' => [
'Ɓ',
'Ɗ',
'Ƙ',
'Sh',
'Ts',
'Ƴ' ],
154 'hi' => [
"\u{0902}",
"\u{0903}" ],
155 'hr' => [
"Č",
"Ć",
"Dž",
"Đ",
"Lj",
"Nj",
"Š",
"Ž" ],
156 'hsb' => [
"Č",
"Dź",
"Ě",
"Ch",
"Ł",
"Ń",
"Ř",
"Š",
"Ć",
"Ž" ],
157 'hu' => [
"Cs",
"Dz",
"Dzs",
"Gy",
"Ly",
"Ny",
"Ö",
"Sz",
"Ty",
"Ü",
"Zs" ],
160 'ig' => [
"Ch",
"Gb",
"Gh",
"Gw",
"Ị",
"Kp",
"Kw",
"Ṅ",
"Nw",
"Ny",
"Ọ",
"Sh",
"Ụ" ],
161 'is' => [
"Á",
"Ð",
"É",
"Í",
"Ó",
"Ú",
"Ý",
"Þ",
"Æ",
"Ö",
"Å" ],
164 'kk' => [
"Ү",
"І" ],
165 'kl' => [
"Æ",
"Ø",
"Å" ],
167 "រ",
"ឫ",
"ឬ",
"ល",
"ឭ",
"ឮ",
"\u{17BB}\u{17C6}",
168 "\u{17C6}",
"\u{17B6}\u{17C6}",
"\u{17C7}",
169 "\u{17B7}\u{17C7}",
"\u{17BB}\u{17C7}",
170 "\u{17C1}\u{17C7}",
"\u{17C4}\u{17C7}",
172 'kn' => [
"\u{0C81}",
"\u{0C83}",
"\u{0CF1}",
"\u{0CF2}" ],
173 'kok' => [
"\u{0902}",
"\u{0903}",
"ळ",
"क्ष" ],
174 'ku' => [
"Ç",
"Ê",
"Î",
"Ş",
"Û" ],
178 'lkt' => [
'Č',
'Ǧ',
'Ȟ',
'Š',
'Ž' ],
181 'lt' => [
"Č",
"Š",
"Ž" ],
182 'lv' => [
"Č",
"Ģ",
"Ķ",
"Ļ",
"Ņ",
"Š",
"Ž" ],
183 'mk' => [
"Ѓ",
"Ќ" ],
186 'mo' => [
"Ă",
"Â",
"Î",
"Ș",
"Ț" ],
187 'mr' => [
"\u{0902}",
"\u{0903}",
"ळ",
"क्ष",
"ज्ञ" ],
189 'mt' => [
"Ċ",
"Ġ",
"Għ",
"Ħ",
"Ż" ],
190 'nb' => [
"Æ",
"Ø",
"Å" ],
193 'nn' => [
"Æ",
"Ø",
"Å" ],
194 'no' => [
"Æ",
"Ø",
"Å" ],
196 'om' => [
'Ch',
'Dh',
'Kh',
'Ny',
'Ph',
'Sh' ],
197 'or' => [
"\u{0B01}",
"\u{0B02}",
"\u{0B03}",
"କ୍ଷ" ],
198 'pa' => [
"\u{0A4D}" ],
199 'pl' => [
"Ą",
"Ć",
"Ę",
"Ł",
"Ń",
"Ó",
"Ś",
"Ź",
"Ż" ],
202 'ro' => [
"Ă",
"Â",
"Î",
"Ș",
"Ț" ],
204 'rup' => [
"Ă",
"Â",
"Î",
"Ľ",
"Ń",
"Ș",
"Ț" ],
207 'Á',
'Č',
'Ʒ',
'Ǯ',
'Đ',
'Ǧ',
'Ǥ',
'Ǩ',
'Ŋ',
208 'Š',
'Ŧ',
'Ž',
'Ø',
'Æ',
'Ȧ',
'Ä',
'Ö'
210 'si' => [
"\u{0D82}",
"\u{0D83}",
"\u{0DA4}" ],
211 'sk' => [
"Ä",
"Č",
"Ch",
"Ô",
"Š",
"Ž" ],
212 'sl' => [
"Č",
"Š",
"Ž" ],
213 'smn' => [
"Á",
"Č",
"Đ",
"Ŋ",
"Š",
"Ŧ",
"Ž",
"Æ",
"Ø",
"Å",
"Ä",
"Ö" ],
214 'sq' => [
"Ç",
"Dh",
"Ë",
"Gj",
"Ll",
"Nj",
"Rr",
"Sh",
"Th",
"Xh",
"Zh" ],
216 'sr-Latn' => [
"Č",
"Ć",
"Dž",
"Đ",
"Lj",
"Nj",
"Š",
"Ž" ],
217 'sv' => [
"Å",
"Ä",
"Ö" ],
218 'sv@collation=standard' => [
"Å",
"Ä",
"Ö" ],
221 "\u{0B82}",
"ஃ",
"க்ஷ",
"க்",
"ங்",
"ச்",
"ஞ்",
"ட்",
"ண்",
"த்",
"ந்",
222 "ப்",
"ம்",
"ய்",
"ர்",
"ல்",
"வ்",
"ழ்",
"ள்",
"ற்",
"ன்",
"ஜ்",
"ஶ்",
"ஷ்",
225 'te' => [
"\u{0C01}",
"\u{0C02}",
"\u{0C03}" ],
226 'th' => [
"ฯ",
"\u{0E46}",
"\u{0E4D}",
"\u{0E3A}" ],
227 'tk' => [
"Ç",
"Ä",
"Ž",
"Ň",
"Ö",
"Ş",
"Ü",
"Ý" ],
228 'tl' => [
"Ñ",
"Ng" ],
229 'to' => [
"Ng",
"ʻ" ],
230 'tr' => [
"Ç",
"Ğ",
"İ",
"Ö",
"Ş",
"Ü" ],
232 'tt' => [
"Ә",
"Ө",
"Ү",
"Җ",
"Ң",
"Һ" ],
233 'uk' => [
"Ґ",
"Ь" ],
234 'uz' => [
"Ch",
"G'",
"Ng",
"O'",
"Sh" ],
235 'vi' => [
"Ă",
"Â",
"Đ",
"Ê",
"Ô",
"Ơ",
"Ư" ],
236 'vo' => [
"Ä",
"Ö",
"Ü" ],
238 "\u{05D1}\u{05BF}",
"\u{05DB}\u{05BC}",
"\u{05E4}\u{05BC}",
239 "\u{05E9}\u{05C2}",
"\u{05EA}\u{05BC}"
241 'yo' => [
"Ẹ",
"Gb",
"Ọ",
"Ṣ" ],
// NOTE(review): the enclosing function header is not part of this excerpt; this looks like
// the constructor body, receiving $languageFactory and $locale — confirm.
253 $this->locale = $locale;
// Split on '@' so that index 0 holds the locale without any '@...' suffix.
255 $localeParts = explode(
'@', $locale );
// Language used later to format digits: 'en' for the 'root' locale, otherwise the part before '@'.
256 $this->digitTransformLanguage = $languageFactory->
getLanguage( $locale ===
'root' ?
'en' : $localeParts[0] );
// Reject the locale when Collator::create() yields a falsy result.
258 $mainCollator = Collator::create( $locale );
259 if ( !$mainCollator ) {
260 throw new InvalidArgumentException(
"Invalid ICU locale specified for collation: $locale" );
262 $this->mainCollator = $mainCollator;
// Second collator for the same locale, limited to primary strength.
265 $this->primaryCollator = Collator::create( $locale );
266 $this->primaryCollator->setStrength( Collator::PRIMARY );
// A trailing '-u-kn' on the locale switches numeric collation on for both collators.
269 if ( str_ends_with( $locale,
'-u-kn' ) ) {
270 $this->useNumericCollation =
true;
// Remove the 5-character '-u-kn' suffix from the stored locale; fetchFirstLetterData()
// looks up TAILORING_FIRST_LETTERS by $this->locale.
272 $this->locale = substr( $this->locale, 0, -5 );
273 $this->mainCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
274 $this->primaryCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
// Sort key of $string as produced by the main collator.
// NOTE(review): the enclosing method header is not visible in this excerpt.
279 return $this->mainCollator->getSortKey( $string );
// NOTE(review): the enclosing method header and several branch bodies are not part of this
// excerpt, so only the visible steps are annotated.
283 $string = strval( $string );
284 if ( $string ===
'' ) {
// Take the first character by UTF-8 code point rather than by byte.
288 $firstChar = mb_substr( $string, 0, 1,
'UTF-8' );
// ord() inspects the first byte only: a value above 0x7f means the character is non-ASCII,
// so the CJK range scan is attempted only for such characters.
291 if ( ord( $firstChar ) > 0x7f && self::isCjk( mb_ord( $firstChar ) ) ) {
// Locate the string's primary sort key among the first-letter keys, which
// fetchFirstLetterData() returns in ksort() order alongside their letters.
295 $sortKey = $this->getPrimarySortKey( $string );
296 $data = $this->getFirstLetterData();
297 $keys = $data[
'keys'];
298 $letters = $data[
'chars'];
// NOTE(review): the remaining findLowerBound() arguments are not visible; presumably it
// yields the index of the last key sorting at or before $sortKey — confirm against ArrayUtils.
301 $min = ArrayUtils::findLowerBound(
302 static function ( $index ) use ( $keys ) {
303 return $keys[$index];
309 if ( $min ===
false ) {
314 $sortLetter = $letters[$min];
// With numeric collation on, a letter whose first byte is an ASCII digit (48-57, '0'-'9')
// is replaced by the 'category-header-numerals' message with numeric parameters 0 and 9.
316 if ( $this->useNumericCollation ) {
321 if ( ord( $sortLetter ) >= 48 && ord( $sortLetter ) <= 57 ) {
322 $sortLetter =
wfMessage(
'category-header-numerals' )->numParams( 0, 9 )->text();
/**
 * Sort key of $string under the primary-strength collator.
 *
 * @param string $string Text to produce a key for
 * @return string|false Whatever Collator::getSortKey() returns for the input
 */
328 private function getPrimarySortKey( $string ) {
329 return $this->primaryCollator->getSortKey( $string );
/**
 * Returns the first-letter table, loading it through a cache on first use and keeping
 * the result in $this->firstLetterData for later calls on the same object.
 *
 * NOTE(review): part of the cache lookup chain and some makeKey() arguments are not
 * visible in this excerpt.
 *
 * @return array Data as produced by fetchFirstLetterData()
 */
336 private function getFirstLetterData() {
337 if ( $this->firstLetterData ===
null ) {
// Cache obtained via the ObjectCacheFactory service (rest of the call chain not visible).
338 $cache = MediaWikiServices::getInstance()->getObjectCacheFactory()
// Visible key components: the digit-transform language code and FIRST_LETTER_VERSION.
340 $cacheKey = $cache->makeKey(
344 $this->digitTransformLanguage->getCode(),
346 self::FIRST_LETTER_VERSION
// The callback passed here computes the table via fetchFirstLetterData(); TTL is one week.
348 $this->firstLetterData = $cache->getWithSetCallback( $cacheKey, $cache::TTL_WEEK,
function () {
349 return $this->fetchFirstLetterData();
352 return $this->firstLetterData;
/**
 * Builds the first-letter table for $this->locale.
 *
 * Visible steps: load the root letter list from data/first-letters-root.php, apply the
 * locale's TAILORING_FIRST_LETTERS additions and removals, swap the ASCII digits for the
 * digit-transform language's digits, key every letter by its primary sort key, sort by
 * key, and drop entries whose key starts with $prev.
 *
 * NOTE(review): this excerpt omits some original lines (for example the $letterMap
 * initialisation, the branch keyword before the throw, and the $prev bookkeeping), so
 * the conditions connecting some statements cannot be confirmed from here.
 *
 * @return array Contains at least 'chars' (letters) and 'keys' (their primary sort keys),
 *  index-aligned and ordered by key
 * @throws RuntimeException Presumably when the locale is neither tailored nor 'root' — confirm
 */
358 private function fetchFirstLetterData() {
360 if ( isset( self::TAILORING_FIRST_LETTERS[$this->locale] ) ) {
361 $letters = require __DIR__ .
"/data/first-letters-root.php";
// Append the locale-specific letters to the root list.
363 $letters = array_merge( $letters, self::TAILORING_FIRST_LETTERS[$this->locale] );
// A key of '-' followed by the locale lists letters to take out again.
365 if ( isset( self::TAILORING_FIRST_LETTERS[
'-' . $this->locale] ) ) {
366 $letters = array_diff( $letters, self::TAILORING_FIRST_LETTERS[
'-' . $this->locale] );
// Remove the ASCII digits and add each digit as formatted by the digit-transform language.
369 $digits = [
'0',
'1',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9' ];
370 $letters = array_diff( $letters, $digits );
371 foreach ( $digits as $digit ) {
372 $letters[] = $this->digitTransformLanguage->formatNumNoSeparators( $digit );
374 } elseif ( $this->locale ===
'root' ) {
375 $letters = require __DIR__ .
"/data/first-letters-root.php";
377 throw new RuntimeException(
"MediaWiki does not support ICU locale " .
378 "\"{$this->locale}\"" );
// Index letters by primary sort key; two letters with the same key collide.
391 foreach ( $letters as $letter ) {
392 $key = $this->getPrimarySortKey( $letter );
393 if ( isset( $letterMap[$key] ) ) {
// On collision, compare the two letters with the main collator and log the outcome.
396 $comp = $this->mainCollator->compare( $letter, $letterMap[$key] );
397 wfDebug(
"Primary collision '$letter' '{$letterMap[$key]}' (comparison: $comp)" );
// Code point order of the two letters.
// NOTE(review): the conditions guarding this and the following assignments are not visible.
400 $comp = mb_ord( $letter ) <=> mb_ord( $letterMap[$key] );
403 $letterMap[$key] = $letter;
406 $letterMap[$key] = $letter;
// Order the map by sort key using string comparison.
409 ksort( $letterMap, SORT_STRING );
// Collect keys that begin with $prev (its assignment is not visible; presumably a
// previously visited key — confirm).
446 $duplicatePrefixes = [];
447 foreach ( $letterMap as $key => $value ) {
454 if ( $prev !==
'' && str_starts_with( $key, $prev ) ) {
455 $duplicatePrefixes[] = $key;
// Remove the collected entries, logging each removed letter.
463 foreach ( $duplicatePrefixes as $badKey ) {
464 wfDebug(
"Removing '{$letterMap[$badKey]}' from first letters." );
465 unset( $letterMap[$badKey] );
// Letters and their sort keys as two index-aligned lists.
469 'chars' => array_values( $letterMap ),
470 'keys' => array_keys( $letterMap ),
/**
 * Tests a code point against the ranges listed in self::CJK_BLOCKS.
 *
 * NOTE(review): the return statements are not part of this excerpt; presumably true on
 * the first matching range and false otherwise — confirm.
 *
 * @param int $codepoint Code point to test (the visible caller passes mb_ord() of a character)
 */
480 public static function isCjk( $codepoint ) {
481 foreach ( self::CJK_BLOCKS as $block ) {
// Both range bounds are inclusive.
482 if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) {
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.