// Version number that getFirstLetterData() passes into the cache key.
27 private const FIRST_LETTER_VERSION = 4;
// Collator whose strength the constructor sets to Collator::PRIMARY; used by getPrimarySortKey().
30 private $primaryCollator;
// Collator created from the requested locale; used by getSortKey() and to compare colliding letters.
33 private $mainCollator;
// Set to true by the constructor when the locale string ends with -u-kn.
42 private $useNumericCollation =
false;
// First-letter data kept on the object; getFirstLetterData() fills it in while it is still null.
45 private $firstLetterData;
// Code point ranges scanned by isCjk(); each entry is read as [ first, last ], both ends inclusive.
56 private const CJK_BLOCKS = [
/**
 * Per-locale additions to the first-letter list, keyed by locale string.
 *
 * fetchFirstLetterData() looks up the current locale here and merges the
 * matching entry into the root letter list. An entry whose key is the locale
 * prefixed with a minus sign is instead subtracted from that list.
 */
95 private const TAILORING_FIRST_LETTERS = [
99 'as' => [
"\u{0982}",
"\u{0981}",
"\u{0983}",
"\u{09CE}",
"ক্ষ " ],
100 'ast' => [
"Ch",
"Ll",
"Ñ" ],
101 'az' => [
"Ç",
"Ə",
"Ğ",
"İ",
"Ö",
"Ş",
"Ü" ],
103 'be-tarask' => [
"Ё" ],
105 'bn' => [
'ং',
'ঃ',
'ঁ' ],
106 'bn@collation=traditional' => [
107 'ং',
'ঃ',
'ঁ',
'ক্',
'খ্',
'গ্',
'ঘ্',
'ঙ্',
'চ্',
'ছ্',
'জ্',
'ঝ্',
108 'ঞ্',
'ট্',
'ঠ্',
'ড্',
'ঢ্',
'ণ্',
'ৎ',
'থ্',
'দ্',
'ধ্',
'ন্',
'প্',
109 'ফ্',
'ব্',
'ভ্',
'ম্',
'য্',
'র্',
'ৰ্',
'ল্',
'ৱ্',
'শ্',
'ষ্',
'স্',
'হ্'
112 'br' => [
"Ch",
"C'h" ],
113 'bs' => [
"Č",
"Ć",
"Dž",
"Đ",
"Lj",
"Nj",
"Š",
"Ž" ],
118 'cs' => [
"Č",
"Ch",
"Ř",
"Š",
"Ž" ],
119 'cy' => [
"Ch",
"Dd",
"Ff",
"Ng",
"Ll",
"Ph",
"Rh",
"Th" ],
120 'da' => [
"Æ",
"Ø",
"Å" ],
122 'de-AT@collation=phonebook' => [
'ä',
'ö',
'ü',
'ß' ],
123 'dsb' => [
"Č",
"Ć",
"Dź",
"Ě",
"Ch",
"Ł",
"Ń",
"Ŕ",
"Š",
"Ś",
"Ž",
"Ź" ],
124 'ee' => [
"Dz",
"Ɖ",
"Ɛ",
"Ƒ",
"Gb",
"Ɣ",
"Kp",
"Ny",
"Ŋ",
"Ɔ",
"Ts",
"Ʋ" ],
127 'eo' => [
"Ĉ",
"Ĝ",
"Ĥ",
"Ĵ",
"Ŝ",
"Ŭ" ],
129 'et' => [
"Š",
"Ž",
"Õ",
"Ä",
"Ö",
"Ü" ],
139 'fi' => [
"Å",
"Ä",
"Ö" ],
140 'fil' => [
"Ñ",
"Ng" ],
141 'fo' => [
"Á",
"Ð",
"Í",
"Ó",
"Ú",
"Ý",
"Æ",
"Ø",
"Å" ],
144 'fur' => [
"À",
"Á",
"Â",
"È",
"Ì",
"Ò",
"Ù" ],
148 'gl' => [
"Ch",
"Ll",
"Ñ" ],
149 'gu' => [
"\u{0A82}",
"\u{0A83}",
"\u{0A81}",
"\u{0AB3}" ],
150 'ha' => [
'Ɓ',
'Ɗ',
'Ƙ',
'Sh',
'Ts',
'Ƴ' ],
153 'hi' => [
"\u{0902}",
"\u{0903}" ],
154 'hr' => [
"Č",
"Ć",
"Dž",
"Đ",
"Lj",
"Nj",
"Š",
"Ž" ],
155 'hsb' => [
"Č",
"Dź",
"Ě",
"Ch",
"Ł",
"Ń",
"Ř",
"Š",
"Ć",
"Ž" ],
156 'hu' => [
"Cs",
"Dz",
"Dzs",
"Gy",
"Ly",
"Ny",
"Ö",
"Sz",
"Ty",
"Ü",
"Zs" ],
159 'ig' => [
"Ch",
"Gb",
"Gh",
"Gw",
"Ị",
"Kp",
"Kw",
"Ṅ",
"Nw",
"Ny",
"Ọ",
"Sh",
"Ụ" ],
160 'is' => [
"Á",
"Ð",
"É",
"Í",
"Ó",
"Ú",
"Ý",
"Þ",
"Æ",
"Ö",
"Å" ],
163 'kk' => [
"Ү",
"І" ],
164 'kl' => [
"Æ",
"Ø",
"Å" ],
166 "រ",
"ឫ",
"ឬ",
"ល",
"ឭ",
"ឮ",
"\u{17BB}\u{17C6}",
167 "\u{17C6}",
"\u{17B6}\u{17C6}",
"\u{17C7}",
168 "\u{17B7}\u{17C7}",
"\u{17BB}\u{17C7}",
169 "\u{17C1}\u{17C7}",
"\u{17C4}\u{17C7}",
171 'kn' => [
"\u{0C81}",
"\u{0C83}",
"\u{0CF1}",
"\u{0CF2}" ],
172 'kok' => [
"\u{0902}",
"\u{0903}",
"ळ",
"क्ष" ],
173 'ku' => [
"Ç",
"Ê",
"Î",
"Ş",
"Û" ],
177 'lkt' => [
'Č',
'Ǧ',
'Ȟ',
'Š',
'Ž' ],
180 'lt' => [
"Č",
"Š",
"Ž" ],
181 'lv' => [
"Č",
"Ģ",
"Ķ",
"Ļ",
"Ņ",
"Š",
"Ž" ],
182 'mk' => [
"Ѓ",
"Ќ" ],
185 'mo' => [
"Ă",
"Â",
"Î",
"Ș",
"Ț" ],
186 'mr' => [
"\u{0902}",
"\u{0903}",
"ळ",
"क्ष",
"ज्ञ" ],
188 'mt' => [
"Ċ",
"Ġ",
"Għ",
"Ħ",
"Ż" ],
189 'nb' => [
"Æ",
"Ø",
"Å" ],
192 'nn' => [
"Æ",
"Ø",
"Å" ],
193 'no' => [
"Æ",
"Ø",
"Å" ],
195 'om' => [
'Ch',
'Dh',
'Kh',
'Ny',
'Ph',
'Sh' ],
196 'or' => [
"\u{0B01}",
"\u{0B02}",
"\u{0B03}",
"କ୍ଷ" ],
197 'pa' => [
"\u{0A4D}" ],
198 'pl' => [
"Ą",
"Ć",
"Ę",
"Ł",
"Ń",
"Ó",
"Ś",
"Ź",
"Ż" ],
201 'ro' => [
"Ă",
"Â",
"Î",
"Ș",
"Ț" ],
203 'rup' => [
"Ă",
"Â",
"Î",
"Ľ",
"Ń",
"Ș",
"Ț" ],
206 'Á',
'Č',
'Ʒ',
'Ǯ',
'Đ',
'Ǧ',
'Ǥ',
'Ǩ',
'Ŋ',
207 'Š',
'Ŧ',
'Ž',
'Ø',
'Æ',
'Ȧ',
'Ä',
'Ö'
209 'si' => [
"\u{0D82}",
"\u{0D83}",
"\u{0DA4}" ],
210 'sk' => [
"Ä",
"Č",
"Ch",
"Ô",
"Š",
"Ž" ],
211 'sl' => [
"Č",
"Š",
"Ž" ],
212 'smn' => [
"Á",
"Č",
"Đ",
"Ŋ",
"Š",
"Ŧ",
"Ž",
"Æ",
"Ø",
"Å",
"Ä",
"Ö" ],
213 'sq' => [
"Ç",
"Dh",
"Ë",
"Gj",
"Ll",
"Nj",
"Rr",
"Sh",
"Th",
"Xh",
"Zh" ],
215 'sr-Latn' => [
"Č",
"Ć",
"Dž",
"Đ",
"Lj",
"Nj",
"Š",
"Ž" ],
216 'sv' => [
"Å",
"Ä",
"Ö" ],
217 'sv@collation=standard' => [
"Å",
"Ä",
"Ö" ],
220 "\u{0B82}",
"ஃ",
"க்ஷ",
"க்",
"ங்",
"ச்",
"ஞ்",
"ட்",
"ண்",
"த்",
"ந்",
221 "ப்",
"ம்",
"ய்",
"ர்",
"ல்",
"வ்",
"ழ்",
"ள்",
"ற்",
"ன்",
"ஜ்",
"ஶ்",
"ஷ்",
224 'te' => [
"\u{0C01}",
"\u{0C02}",
"\u{0C03}" ],
225 'th' => [
"ฯ",
"\u{0E46}",
"\u{0E4D}",
"\u{0E3A}" ],
226 'tk' => [
"Ç",
"Ä",
"Ž",
"Ň",
"Ö",
"Ş",
"Ü",
"Ý" ],
227 'tl' => [
"Ñ",
"Ng" ],
228 'to' => [
"Ng",
"ʻ" ],
229 'tr' => [
"Ç",
"Ğ",
"İ",
"Ö",
"Ş",
"Ü" ],
231 'tt' => [
"Ә",
"Ө",
"Ү",
"Җ",
"Ң",
"Һ" ],
232 'uk' => [
"Ґ",
"Ь" ],
233 'uz' => [
"Ch",
"G'",
"Ng",
"O'",
"Sh" ],
234 'vi' => [
"Ă",
"Â",
"Đ",
"Ê",
"Ô",
"Ơ",
"Ư" ],
235 'vo' => [
"Ä",
"Ö",
"Ü" ],
237 "\u{05D1}\u{05BF}",
"\u{05DB}\u{05BC}",
"\u{05E4}\u{05BC}",
238 "\u{05E9}\u{05C2}",
"\u{05EA}\u{05BC}"
240 'yo' => [
"Ẹ",
"Gb",
"Ọ",
"Ṣ" ],
// Store the locale as passed in; a trailing -u-kn suffix is stripped from it further down.
252 $this->locale = $locale;
// Split off anything that follows an @ sign; only the part before it is used as a language code.
254 $localeParts = explode(
'@', $locale );
// For the root locale, en is passed to getLanguage(); otherwise the part of the locale before any @.
255 $this->digitTransformLanguage = $languageFactory->
getLanguage( $locale ===
'root' ?
'en' : $localeParts[0] );
257 $mainCollator = Collator::create( $locale );
// A falsy result from Collator::create() is reported as an invalid locale.
258 if ( !$mainCollator ) {
259 throw new InvalidArgumentException(
"Invalid ICU locale specified for collation: $locale" );
261 $this->mainCollator = $mainCollator;
// A second collator for the same locale, restricted to primary strength.
264 $this->primaryCollator = Collator::create( $locale );
265 $this->primaryCollator->setStrength( Collator::PRIMARY );
// A -u-kn suffix on the locale switches on numeric collation for both collators.
268 if ( str_ends_with( $locale,
'-u-kn' ) ) {
269 $this->useNumericCollation =
true;
// Drop the 5-character suffix; $this->locale is later used as the TAILORING_FIRST_LETTERS key.
271 $this->locale = substr( $this->locale, 0, -5 );
272 $this->mainCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
273 $this->primaryCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
// Sort keys come from mainCollator; getPrimarySortKey() is the primaryCollator counterpart.
278 return $this->mainCollator->getSortKey( $string );
// Normalise the argument to a string before inspecting it.
282 $string = strval( $string );
283 if ( $string ===
'' ) {
// Take the first character (not the first byte) of the UTF-8 string.
287 $firstChar = mb_substr( $string, 0, 1,
'UTF-8' );
// ord() reads only the first byte: a value above 0x7f means non-ASCII, so ASCII input skips the CJK test.
290 if ( ord( $firstChar ) > 0x7f && self::isCjk( mb_ord( $firstChar ) ) ) {
// Primary-strength sort key of the whole string.
294 $sortKey = $this->getPrimarySortKey( $string );
295 $data = $this->getFirstLetterData();
// keys and chars are parallel lists built together in fetchFirstLetterData().
296 $keys = $data[
'keys'];
297 $letters = $data[
'chars'];
301 static function ( $index ) use ( $keys ) {
302 return $keys[$index];
// NOTE(review): $min is assigned on a line not shown here; false appears to mean no entry was found - confirm.
308 if ( $min ===
false ) {
313 $sortLetter = $letters[$min];
315 if ( $this->useNumericCollation ) {
// 48 to 57 are the byte values of the ASCII digits 0 to 9.
320 if ( ord( $sortLetter ) >= 48 && ord( $sortLetter ) <= 57 ) {
// A sort letter starting with an ASCII digit is replaced by the numerals heading message, given 0 and 9 as parameters.
321 $sortLetter =
wfMessage(
'category-header-numerals' )->numParams( 0, 9 )->text();
/**
 * Return the sort key of a string as computed by the primary-strength collator.
 *
 * @param string $string
 * @return string|false NOTE(review): return type taken from Collator::getSortKey() - confirm.
 */
327 private function getPrimarySortKey( $string ) {
328 return $this->primaryCollator->getSortKey( $string );
/**
 * Return the first-letter data, loading it at most once per object.
 *
 * @return array Array with chars and keys entries, as read by getFirstLetter()
 */
335 private function getFirstLetterData() {
// Only consult the cache while this object has not loaded the data yet.
336 if ( $this->firstLetterData ===
null ) {
// NOTE(review): $cache is assigned on a line not shown here. The key components visible
// here are the digit transform language code and FIRST_LETTER_VERSION.
338 $cacheKey = $cache->makeKey(
342 $this->digitTransformLanguage->getCode(),
344 self::FIRST_LETTER_VERSION
// The callback supplies the value via fetchFirstLetterData(); TTL_WEEK is passed as the expiry argument.
346 $this->firstLetterData = $cache->getWithSetCallback( $cacheKey, $cache::TTL_WEEK,
function () {
347 return $this->fetchFirstLetterData();
350 return $this->firstLetterData;
/**
 * Build the first-letter data for the current locale from scratch.
 *
 * @return array Array with chars (the letters) and keys (their primary sort keys), in matching order
 */
356 private function fetchFirstLetterData() {
// Locales with a tailoring entry start from the root letter list and extend it.
358 if ( isset( self::TAILORING_FIRST_LETTERS[$this->locale] ) ) {
359 $letters = require __DIR__ .
"/data/first-letters-root.php";
361 $letters = array_merge( $letters, self::TAILORING_FIRST_LETTERS[$this->locale] );
// An entry keyed by a minus sign followed by the locale lists letters to take out again.
363 if ( isset( self::TAILORING_FIRST_LETTERS[
'-' . $this->locale] ) ) {
364 $letters = array_diff( $letters, self::TAILORING_FIRST_LETTERS[
'-' . $this->locale] );
// Swap the ASCII digits for the digits as formatted by the digit transform language.
367 $digits = [
'0',
'1',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9' ];
368 $letters = array_diff( $letters, $digits );
369 foreach ( $digits as $digit ) {
370 $letters[] = $this->digitTransformLanguage->formatNumNoSeparators( $digit );
372 } elseif ( $this->locale ===
'root' ) {
// The root locale uses the root letter list as is.
373 $letters = require __DIR__ .
"/data/first-letters-root.php";
// NOTE(review): the branch header for this throw is on a line not shown; presumably the
// else case for locales that are neither tailored nor root - confirm.
375 throw new RuntimeException(
"MediaWiki does not support ICU locale " .
376 "\"{$this->locale}\"" );
// Index the letters by primary sort key; two letters with the same key collide.
389 foreach ( $letters as $letter ) {
390 $key = $this->getPrimarySortKey( $letter );
391 if ( isset( $letterMap[$key] ) ) {
// On a collision, compare the new letter with the stored one using the main collator and log it.
394 $comp = $this->mainCollator->compare( $letter, $letterMap[$key] );
395 wfDebug(
"Primary collision '$letter' '{$letterMap[$key]}' (comparison: $comp)" );
// Code point order serves as a further comparison. NOTE(review): the conditions guarding
// this line and the two assignments below are on lines not shown - confirm.
398 $comp = mb_ord( $letter ) <=> mb_ord( $letterMap[$key] );
401 $letterMap[$key] = $letter;
404 $letterMap[$key] = $letter;
// Order the map by sort key, comparing the keys as strings.
407 ksort( $letterMap, SORT_STRING );
// Collect keys that begin with $prev. NOTE(review): $prev is maintained on lines not shown;
// presumably it tracks an earlier key in sorted order - confirm.
444 $duplicatePrefixes = [];
445 foreach ( $letterMap as $key => $value ) {
452 if ( $prev !==
'' && str_starts_with( $key, $prev ) ) {
453 $duplicatePrefixes[] = $key;
// Log and drop every letter whose key was collected above.
461 foreach ( $duplicatePrefixes as $badKey ) {
462 wfDebug(
"Removing '{$letterMap[$badKey]}' from first letters." );
463 unset( $letterMap[$badKey] );
// Values and keys are taken from the same map, so the two lists line up index by index.
467 'chars' => array_values( $letterMap ),
468 'keys' => array_keys( $letterMap ),
/**
 * Test whether a code point lies inside one of the CJK_BLOCKS ranges.
 *
 * @param int $codepoint
 * @return bool NOTE(review): the return statements are on lines not shown - confirm.
 */
478 public static function isCjk( $codepoint ) {
479 foreach ( self::CJK_BLOCKS as $block ) {
// Both ends of a block are inclusive.
480 if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) {
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
static findLowerBound( $valueCallback, $valueCount, $comparisonCallback, $target)
Do a binary search, and return the index of the largest item that sorts less than or equal to the target value.
getFirstLetter( $string)
Given a string, return the logical "first letter" to be used for grouping on category pages and so on.
static isCjk( $codepoint)
Test if a code point is a CJK (Chinese, Japanese, Korean) character.
__construct(LanguageFactory $languageFactory, $locale)
Language $digitTransformLanguage
getSortKey( $string)
Given a string, convert it to a (hopefully short) key that can be used for efficient sorting.
static getLocalServerInstance( $fallback=CACHE_NONE)
Factory function for CACHE_ACCEL (referenced from configuration)