// Version stamp that getFirstLetterData() folds into its cache key.
// NOTE(review): presumably bumped to invalidate previously cached first-letter data — confirm.
27 private const FIRST_LETTER_VERSION = 4;
// Collator for the locale with strength set to Collator::PRIMARY; backs getPrimarySortKey().
30 private $primaryCollator;
// Collator for the locale; used to produce full sort keys and to compare letters that collide.
33 private $mainCollator;
// Switched to true when the requested locale string ends in '-u-kn'.
42 private $useNumericCollation =
false;
// Memoised first-letter data; compared against null by getFirstLetterData() before loading.
45 private $firstLetterData;
// Table of code point ranges; isCjk() reads each entry as an inclusive [ first, last ] pair.
// NOTE(review): the entries themselves are not visible in this excerpt.
56 private const CJK_BLOCKS = [
// Per-locale letter lists, keyed by ICU locale name. fetchFirstLetterData() merges the list
// for the current locale into the root first letters; it also checks for a key made of '-'
// followed by the locale name and, if found, removes those letters from the merged list.
95 private const TAILORING_FIRST_LETTERS = [
99 'as' => [
"\u{0982}",
"\u{0981}",
"\u{0983}",
"\u{09CE}",
"ক্ষ " ],
100 'ast' => [
"Ch",
"Ll",
"Ñ" ],
101 'az' => [
"Ç",
"Ə",
"Ğ",
"İ",
"Ö",
"Ş",
"Ü" ],
103 'be-tarask' => [
"Ё" ],
105 'bn' => [
'ং',
'ঃ',
'ঁ' ],
106 'bn@collation=traditional' => [
107 'ং',
'ঃ',
'ঁ',
'ক্',
'খ্',
'গ্',
'ঘ্',
'ঙ্',
'চ্',
'ছ্',
'জ্',
'ঝ্',
108 'ঞ্',
'ট্',
'ঠ্',
'ড্',
'ঢ্',
'ণ্',
'ৎ',
'থ্',
'দ্',
'ধ্',
'ন্',
'প্',
109 'ফ্',
'ব্',
'ভ্',
'ম্',
'য্',
'র্',
'ৰ্',
'ল্',
'ৱ্',
'শ্',
'ষ্',
'স্',
'হ্'
112 'br' => [
"Ch",
"C'h" ],
113 'bs' => [
"Č",
"Ć",
"Dž",
"Đ",
"Lj",
"Nj",
"Š",
"Ž" ],
118 'cs' => [
"Č",
"Ch",
"Ř",
"Š",
"Ž" ],
119 'cy' => [
"Ch",
"Dd",
"Ff",
"Ng",
"Ll",
"Ph",
"Rh",
"Th" ],
120 'da' => [
"Æ",
"Ø",
"Å" ],
122 'de-AT@collation=phonebook' => [
'ä',
'ö',
'ü',
'ß' ],
123 'dsb' => [
"Č",
"Ć",
"Dź",
"Ě",
"Ch",
"Ł",
"Ń",
"Ŕ",
"Š",
"Ś",
"Ž",
"Ź" ],
124 'ee' => [
"Dz",
"Ɖ",
"Ɛ",
"Ƒ",
"Gb",
"Ɣ",
"Kp",
"Ny",
"Ŋ",
"Ɔ",
"Ts",
"Ʋ" ],
127 'eo' => [
"Ĉ",
"Ĝ",
"Ĥ",
"Ĵ",
"Ŝ",
"Ŭ" ],
129 'et' => [
"Š",
"Ž",
"Õ",
"Ä",
"Ö",
"Ü" ],
139 'fi' => [
"Å",
"Ä",
"Ö" ],
140 'fil' => [
"Ñ",
"Ng" ],
141 'fo' => [
"Á",
"Ð",
"Í",
"Ó",
"Ú",
"Ý",
"Æ",
"Ø",
"Å" ],
144 'fur' => [
"À",
"Á",
"Â",
"È",
"Ì",
"Ò",
"Ù" ],
148 'gl' => [
"Ch",
"Ll",
"Ñ" ],
149 'gu' => [
"\u{0A82}",
"\u{0A83}",
"\u{0A81}",
"\u{0AB3}" ],
150 'ha' => [
'Ɓ',
'Ɗ',
'Ƙ',
'Sh',
'Ts',
'Ƴ' ],
153 'hi' => [
"\u{0902}",
"\u{0903}" ],
154 'hr' => [
"Č",
"Ć",
"Dž",
"Đ",
"Lj",
"Nj",
"Š",
"Ž" ],
155 'hsb' => [
"Č",
"Dź",
"Ě",
"Ch",
"Ł",
"Ń",
"Ř",
"Š",
"Ć",
"Ž" ],
156 'hu' => [
"Cs",
"Dz",
"Dzs",
"Gy",
"Ly",
"Ny",
"Ö",
"Sz",
"Ty",
"Ü",
"Zs" ],
159 'ig' => [
"Ch",
"Gb",
"Gh",
"Gw",
"Ị",
"Kp",
"Kw",
"Ṅ",
"Nw",
"Ny",
"Ọ",
"Sh",
"Ụ" ],
160 'is' => [
"Á",
"Ð",
"É",
"Í",
"Ó",
"Ú",
"Ý",
"Þ",
"Æ",
"Ö",
"Å" ],
163 'kk' => [
"Ү",
"І" ],
164 'kl' => [
"Æ",
"Ø",
"Å" ],
166 "រ",
"ឫ",
"ឬ",
"ល",
"ឭ",
"ឮ",
"\u{17BB}\u{17C6}",
167 "\u{17C6}",
"\u{17B6}\u{17C6}",
"\u{17C7}",
168 "\u{17B7}\u{17C7}",
"\u{17BB}\u{17C7}",
169 "\u{17C1}\u{17C7}",
"\u{17C4}\u{17C7}",
171 'kn' => [
"\u{0C81}",
"\u{0C83}",
"\u{0CF1}",
"\u{0CF2}" ],
172 'kok' => [
"\u{0902}",
"\u{0903}",
"ळ",
"क्ष" ],
173 'ku' => [
"Ç",
"Ê",
"Î",
"Ş",
"Û" ],
177 'lkt' => [
'Č',
'Ǧ',
'Ȟ',
'Š',
'Ž' ],
180 'lt' => [
"Č",
"Š",
"Ž" ],
181 'lv' => [
"Č",
"Ģ",
"Ķ",
"Ļ",
"Ņ",
"Š",
"Ž" ],
182 'mk' => [
"Ѓ",
"Ќ" ],
185 'mo' => [
"Ă",
"Â",
"Î",
"Ș",
"Ț" ],
186 'mr' => [
"\u{0902}",
"\u{0903}",
"ळ",
"क्ष",
"ज्ञ" ],
188 'mt' => [
"Ċ",
"Ġ",
"Għ",
"Ħ",
"Ż" ],
189 'nb' => [
"Æ",
"Ø",
"Å" ],
192 'nn' => [
"Æ",
"Ø",
"Å" ],
193 'no' => [
"Æ",
"Ø",
"Å" ],
195 'om' => [
'Ch',
'Dh',
'Kh',
'Ny',
'Ph',
'Sh' ],
196 'or' => [
"\u{0B01}",
"\u{0B02}",
"\u{0B03}",
"କ୍ଷ" ],
197 'pa' => [
"\u{0A4D}" ],
198 'pl' => [
"Ą",
"Ć",
"Ę",
"Ł",
"Ń",
"Ó",
"Ś",
"Ź",
"Ż" ],
201 'ro' => [
"Ă",
"Â",
"Î",
"Ș",
"Ț" ],
203 'rup' => [
"Ă",
"Â",
"Î",
"Ľ",
"Ń",
"Ș",
"Ț" ],
206 'Á',
'Č',
'Ʒ',
'Ǯ',
'Đ',
'Ǧ',
'Ǥ',
'Ǩ',
'Ŋ',
207 'Š',
'Ŧ',
'Ž',
'Ø',
'Æ',
'Ȧ',
'Ä',
'Ö'
209 'si' => [
"\u{0D82}",
"\u{0D83}",
"\u{0DA4}" ],
210 'sk' => [
"Ä",
"Č",
"Ch",
"Ô",
"Š",
"Ž" ],
211 'sl' => [
"Č",
"Š",
"Ž" ],
212 'smn' => [
"Á",
"Č",
"Đ",
"Ŋ",
"Š",
"Ŧ",
"Ž",
"Æ",
"Ø",
"Å",
"Ä",
"Ö" ],
213 'sq' => [
"Ç",
"Dh",
"Ë",
"Gj",
"Ll",
"Nj",
"Rr",
"Sh",
"Th",
"Xh",
"Zh" ],
215 'sr-Latn' => [
"Č",
"Ć",
"Dž",
"Đ",
"Lj",
"Nj",
"Š",
"Ž" ],
216 'sv' => [
"Å",
"Ä",
"Ö" ],
217 'sv@collation=standard' => [
"Å",
"Ä",
"Ö" ],
220 "\u{0B82}",
"ஃ",
"க்ஷ",
"க்",
"ங்",
"ச்",
"ஞ்",
"ட்",
"ண்",
"த்",
"ந்",
221 "ப்",
"ம்",
"ய்",
"ர்",
"ல்",
"வ்",
"ழ்",
"ள்",
"ற்",
"ன்",
"ஜ்",
"ஶ்",
"ஷ்",
224 'te' => [
"\u{0C01}",
"\u{0C02}",
"\u{0C03}" ],
225 'th' => [
"ฯ",
"\u{0E46}",
"\u{0E4D}",
"\u{0E3A}" ],
226 'tk' => [
"Ç",
"Ä",
"Ž",
"Ň",
"Ö",
"Ş",
"Ü",
"Ý" ],
227 'tl' => [
"Ñ",
"Ng" ],
228 'to' => [
"Ng",
"ʻ" ],
229 'tr' => [
"Ç",
"Ğ",
"İ",
"Ö",
"Ş",
"Ü" ],
231 'tt' => [
"Ә",
"Ө",
"Ү",
"Җ",
"Ң",
"Һ" ],
232 'uk' => [
"Ґ",
"Ь" ],
233 'uz' => [
"Ch",
"G'",
"Ng",
"O'",
"Sh" ],
234 'vi' => [
"Ă",
"Â",
"Đ",
"Ê",
"Ô",
"Ơ",
"Ư" ],
235 'vo' => [
"Ä",
"Ö",
"Ü" ],
237 "\u{05D1}\u{05BF}",
"\u{05DB}\u{05BC}",
"\u{05E4}\u{05BC}",
238 "\u{05E9}\u{05C2}",
"\u{05EA}\u{05BC}"
240 'yo' => [
"Ẹ",
"Gb",
"Ọ",
"Ṣ" ],
// NOTE(review): the enclosing method header is not visible in this excerpt (presumably the constructor).
// Keep the locale as requested; the '-u-kn' branch below may shorten it.
252 $this->locale = $locale;
// Split on '@' so that element 0 is the locale name without any '@...' part.
254 $localeParts = explode(
'@', $locale );
// Language object later used to format digits; the special locale 'root' maps to 'en'.
255 $this->digitTransformLanguage = $languageFactory->
getLanguage( $locale ===
'root' ?
'en' : $localeParts[0] );
// A falsy result from Collator::create() is reported as an invalid locale.
257 $mainCollator = Collator::create( $locale );
258 if ( !$mainCollator ) {
259 throw new InvalidArgumentException(
"Invalid ICU locale specified for collation: $locale" );
261 $this->mainCollator = $mainCollator;
// Second collator for the same locale, restricted to primary strength.
// No falsy check here: the same create() call already succeeded just above.
264 $this->primaryCollator = Collator::create( $locale );
265 $this->primaryCollator->setStrength( Collator::PRIMARY );
// The '-u-kn' suffix turns on numeric collation for both collators.
268 if ( str_ends_with( $locale,
'-u-kn' ) ) {
269 $this->useNumericCollation =
true;
// Drop the 5-character suffix so TAILORING_FIRST_LETTERS lookups see the bare locale name.
271 $this->locale = substr( $this->locale, 0, -5 );
272 $this->mainCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
273 $this->primaryCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
// Sort key for $string from the main collator (not the primary-strength one).
// NOTE(review): the enclosing method header is not visible in this excerpt.
278 return $this->mainCollator->getSortKey( $string );
// NOTE(review): the enclosing method header and several statements (early returns, closing
// braces) are not visible in this excerpt; comments below cover only the visible lines.
// Coerce the argument to a string before inspecting it.
282 $string = strval( $string );
283 if ( $string ===
'' ) {
// First character (not first byte) of the input.
287 $firstChar = mb_substr( $string, 0, 1,
'UTF-8' );
// ord() looks at the first byte only: a value above 0x7f means non-ASCII, so the
// isCjk() range scan runs only for non-ASCII characters.
290 if ( ord( $firstChar ) > 0x7f && self::isCjk( mb_ord( $firstChar ) ) ) {
// Primary sort key of the whole string, plus the parallel 'keys' / 'chars' tables.
294 $sortKey = $this->getPrimarySortKey( $string );
295 $data = $this->getFirstLetterData();
296 $keys = $data[
'keys'];
297 $letters = $data[
'chars'];
// NOTE(review): presumably a search over $keys for $sortKey; the remaining arguments of
// findLowerBound() are not visible here. The callback maps an index to its key.
300 $min = ArrayUtils::findLowerBound(
301 static function ( $index ) use ( $keys ) {
302 return $keys[$index];
// false from findLowerBound() is handled separately (handling not visible here).
308 if ( $min ===
false ) {
// The letter at the found index is the bucket for this string.
313 $sortLetter = $letters[$min];
315 if ( $this->useNumericCollation ) {
// 48..57 are the byte values of ASCII '0'..'9': a digit bucket is replaced by the
// text of the 'category-header-numerals' message with parameters 0 and 9.
320 if ( ord( $sortLetter ) >= 48 && ord( $sortLetter ) <= 57 ) {
321 $sortLetter =
wfMessage(
'category-header-numerals' )->numParams( 0, 9 )->text();
/**
 * Compute the sort key of a string using the primary-strength collator.
 *
 * @param string $string Text to produce a key for
 * @return string|false Whatever Collator::getSortKey() returns for $string
 */
327 private function getPrimarySortKey( $string ) {
328 return $this->primaryCollator->getSortKey( $string );
/**
 * Return the first-letter lookup data, loading it only while $this->firstLetterData is null.
 *
 * The value is obtained through $cache under a key whose visible components include the
 * digit-transform language code and FIRST_LETTER_VERSION, then memoised on the instance.
 *
 * @return array Data with 'chars' and 'keys' entries as produced by fetchFirstLetterData()
 */
335 private function getFirstLetterData() {
336 if ( $this->firstLetterData ===
null ) {
// NOTE(review): $cache is assigned on a line not visible in this excerpt, and some
// makeKey() arguments are missing from view as well.
338 $cacheKey = $cache->makeKey(
342 $this->digitTransformLanguage->getCode(),
344 self::FIRST_LETTER_VERSION
// The callback supplies the value to store (presumably invoked only on a cache miss — confirm);
// the TTL passed is $cache::TTL_WEEK.
346 $this->firstLetterData = $cache->getWithSetCallback( $cacheKey, $cache::TTL_WEEK,
function () {
347 return $this->fetchFirstLetterData();
350 return $this->firstLetterData;
/**
 * Build the parallel 'chars' / 'keys' first-letter tables for the current locale.
 *
 * NOTE(review): several statements (initialisation of $letterMap and $prev, some branch
 * conditions, closing braces) are not visible in this excerpt.
 *
 * @return array Array with 'chars' (letters) and 'keys' (their primary sort keys)
 * @throws RuntimeException If the locale is neither 'root' nor a key of TAILORING_FIRST_LETTERS
 */
356 private function fetchFirstLetterData() {
// Tailored locale: start from the root letters and append the extras for this locale.
358 if ( isset( self::TAILORING_FIRST_LETTERS[$this->locale] ) ) {
359 $letters = require __DIR__ .
"/data/first-letters-root.php";
361 $letters = array_merge( $letters, self::TAILORING_FIRST_LETTERS[$this->locale] );
// An entry keyed by '-' followed by the locale name lists letters to remove again.
363 if ( isset( self::TAILORING_FIRST_LETTERS[
'-' . $this->locale] ) ) {
364 $letters = array_diff( $letters, self::TAILORING_FIRST_LETTERS[
'-' . $this->locale] );
// Replace the ASCII digits: remove '0'..'9', then append each digit as formatted by
// the digit-transform language.
367 $digits = [
'0',
'1',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9' ];
368 $letters = array_diff( $letters, $digits );
369 foreach ( $digits as $digit ) {
370 $letters[] = $this->digitTransformLanguage->formatNumNoSeparators( $digit );
// The 'root' locale uses the root letter list unchanged.
372 } elseif ( $this->locale ===
'root' ) {
373 $letters = require __DIR__ .
"/data/first-letters-root.php";
// Any other locale is rejected.
375 throw new RuntimeException(
"MediaWiki does not support ICU locale " .
376 "\"{$this->locale}\"" );
// Index the letters by primary sort key. When two letters share a key, they are compared
// and logged before one of them is stored.
389 foreach ( $letters as $letter ) {
390 $key = $this->getPrimarySortKey( $letter );
391 if ( isset( $letterMap[$key] ) ) {
394 $comp = $this->mainCollator->compare( $letter, $letterMap[$key] );
395 wfDebug(
"Primary collision '$letter' '{$letterMap[$key]}' (comparison: $comp)" );
// Alternative ordering by code point.
// NOTE(review): the conditions guarding this line and the next assignment are not visible here.
398 $comp = mb_ord( $letter ) <=> mb_ord( $letterMap[$key] );
401 $letterMap[$key] = $letter;
// No collision: record the letter under its key.
404 $letterMap[$key] = $letter;
// Order the map by key, comparing keys as strings.
407 ksort( $letterMap, SORT_STRING );
// Collect keys that start with the tracked previous key; they are removed below.
// NOTE(review): how $prev is initialised and updated is not visible in this excerpt.
444 $duplicatePrefixes = [];
445 foreach ( $letterMap as $key => $value ) {
452 if ( $prev !==
'' && str_starts_with( $key, $prev ) ) {
453 $duplicatePrefixes[] = $key;
// Removal happens in a second pass, after the scan over $letterMap has finished.
461 foreach ( $duplicatePrefixes as $badKey ) {
462 wfDebug(
"Removing '{$letterMap[$badKey]}' from first letters." );
463 unset( $letterMap[$badKey] );
// Parallel lists: the letter at 'chars'[i] has the primary sort key at 'keys'[i].
467 'chars' => array_values( $letterMap ),
468 'keys' => array_keys( $letterMap ),
/**
 * Test a code point against the ranges listed in CJK_BLOCKS.
 *
 * NOTE(review): the return statements are not visible in this excerpt; presumably true is
 * returned on the first matching range and false otherwise — confirm.
 *
 * @param int $codepoint Code point to test (callers pass the result of mb_ord())
 * @return bool
 */
478 public static function isCjk( $codepoint ) {
479 foreach ( self::CJK_BLOCKS as $block ) {
// Each block is read as an inclusive [ first, last ] pair.
480 if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) {
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.