MediaWiki master
IcuCollation.php
Go to the documentation of this file.
1<?php
23
27class IcuCollation extends Collation {
28 private const FIRST_LETTER_VERSION = 4;
29
31 private $primaryCollator;
32
34 private $mainCollator;
35
37 private $locale;
38
41
43 private $useNumericCollation = false;
44
46 private $firstLetterData;
47
57 private const CJK_BLOCKS = [
58 [ 0x2E80, 0x2EFF ], // CJK Radicals Supplement
59 [ 0x2F00, 0x2FDF ], // Kangxi Radicals
60 [ 0x2FF0, 0x2FFF ], // Ideographic Description Characters
61 [ 0x3000, 0x303F ], // CJK Symbols and Punctuation
62 [ 0x31C0, 0x31EF ], // CJK Strokes
63 [ 0x3200, 0x32FF ], // Enclosed CJK Letters and Months
64 [ 0x3300, 0x33FF ], // CJK Compatibility
65 [ 0x3400, 0x4DBF ], // CJK Unified Ideographs Extension A
66 [ 0x4E00, 0x9FFF ], // CJK Unified Ideographs
67 [ 0xF900, 0xFAFF ], // CJK Compatibility Ideographs
68 [ 0xFE30, 0xFE4F ], // CJK Compatibility Forms
69 [ 0x20000, 0x2A6DF ], // CJK Unified Ideographs Extension B
70 [ 0x2A700, 0x2B73F ], // CJK Unified Ideographs Extension C
71 [ 0x2B740, 0x2B81F ], // CJK Unified Ideographs Extension D
72 [ 0x2F800, 0x2FA1F ], // CJK Compatibility Ideographs Supplement
73 ];
74
96 private const TAILORING_FIRST_LETTERS = [
97 'af' => [],
98 'am' => [],
99 'ar' => [],
100 'as' => [ "\u{0982}", "\u{0981}", "\u{0983}", "\u{09CE}", "ক্ষ " ],
101 'ast' => [ "Ch", "Ll", "Ñ" ], // not in libicu
102 'az' => [ "Ç", "Ə", "Ğ", "İ", "Ö", "Ş", "Ü" ],
103 'be' => [ "Ё" ],
104 'be-tarask' => [ "Ё" ],
105 'bg' => [],
106 'bn' => [ 'ং', 'ঃ', 'ঁ' ],
107 'bn@collation=traditional' => [
108 'ং', 'ঃ', 'ঁ', 'ক্', 'খ্', 'গ্', 'ঘ্', 'ঙ্', 'চ্', 'ছ্', 'জ্', 'ঝ্',
109 'ঞ্', 'ট্', 'ঠ্', 'ড্', 'ঢ্', 'ণ্', 'ৎ', 'থ্', 'দ্', 'ধ্', 'ন্', 'প্',
110 'ফ্', 'ব্', 'ভ্', 'ম্', 'য্', 'র্', 'ৰ্', 'ল্', 'ৱ্', 'শ্', 'ষ্', 'স্', 'হ্'
111 ],
112 'bo' => [],
113 'br' => [ "Ch", "C'h" ],
114 'bs' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
115 'bs-Cyrl' => [],
116 'ca' => [],
117 'chr' => [],
118 'co' => [], // not in libicu
119 'cs' => [ "Č", "Ch", "Ř", "Š", "Ž" ],
120 'cy' => [ "Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th" ],
121 'da' => [ "Æ", "Ø", "Å" ],
122 'de' => [],
123 'de-AT@collation=phonebook' => [ 'ä', 'ö', 'ü', 'ß' ],
124 'dsb' => [ "Č", "Ć", "Dź", "Ě", "Ch", "Ł", "Ń", "Ŕ", "Š", "Ś", "Ž", "Ź" ],
125 'ee' => [ "Dz", "Ɖ", "Ɛ", "Ƒ", "Gb", "Ɣ", "Kp", "Ny", "Ŋ", "Ɔ", "Ts", "Ʋ" ],
126 'el' => [],
127 'en' => [],
128 'eo' => [ "Ĉ", "Ĝ", "Ĥ", "Ĵ", "Ŝ", "Ŭ" ],
129 'es' => [ "Ñ" ],
130 'et' => [ "Š", "Ž", "Õ", "Ä", "Ö", "Ü" ],
131 'eu' => [ "Ñ" ], // not in libicu
132 'fa' => [
133 // RTL, let's put each letter on a new line
134 "آ",
135 "ء",
136 "ه",
137 "ا",
138 "و"
139 ],
140 'fi' => [ "Å", "Ä", "Ö" ],
141 'fil' => [ "Ñ", "Ng" ],
142 'fo' => [ "Á", "Ð", "Í", "Ó", "Ú", "Ý", "Æ", "Ø", "Å" ],
143 'fr' => [],
144 'fr-CA' => [], // fr-CA sorts accents slightly different from fr.
145 'fur' => [ "À", "Á", "Â", "È", "Ì", "Ò", "Ù" ], // not in libicu
146 'fy' => [], // not in libicu
147 'ga' => [],
148 'gd' => [], // not in libicu
149 'gl' => [ "Ch", "Ll", "Ñ" ],
150 'gu' => [ "\u{0A82}", "\u{0A83}", "\u{0A81}", "\u{0AB3}" ],
151 'ha' => [ 'Ɓ', 'Ɗ', 'Ƙ', 'Sh', 'Ts', 'Ƴ' ],
152 'haw' => [ 'ʻ' ],
153 'he' => [],
154 'hi' => [ "\u{0902}", "\u{0903}" ],
155 'hr' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
156 'hsb' => [ "Č", "Dź", "Ě", "Ch", "Ł", "Ń", "Ř", "Š", "Ć", "Ž" ],
157 'hu' => [ "Cs", "Dz", "Dzs", "Gy", "Ly", "Ny", "Ö", "Sz", "Ty", "Ü", "Zs" ],
158 'hy' => [ "և" ],
159 'id' => [],
160 'ig' => [ "Ch", "Gb", "Gh", "Gw", "Ị", "Kp", "Kw", "Ṅ", "Nw", "Ny", "Ọ", "Sh", "Ụ" ],
161 'is' => [ "Á", "Ð", "É", "Í", "Ó", "Ú", "Ý", "Þ", "Æ", "Ö", "Å" ],
162 'it' => [],
163 'ka' => [],
164 'kk' => [ "Ү", "І" ],
165 'kl' => [ "Æ", "Ø", "Å" ],
166 'km' => [
167 "រ", "ឫ", "ឬ", "ល", "ឭ", "ឮ", "\u{17BB}\u{17C6}",
168 "\u{17C6}", "\u{17B6}\u{17C6}", "\u{17C7}",
169 "\u{17B7}\u{17C7}", "\u{17BB}\u{17C7}",
170 "\u{17C1}\u{17C7}", "\u{17C4}\u{17C7}",
171 ],
172 'kn' => [ "\u{0C81}", "\u{0C83}", "\u{0CF1}", "\u{0CF2}" ],
173 'kok' => [ "\u{0902}", "\u{0903}", "ळ", "क्ष" ],
174 'ku' => [ "Ç", "Ê", "Î", "Ş", "Û" ], // not in libicu
175 'ky' => [ "Ё" ],
176 'la' => [], // not in libicu
177 'lb' => [],
178 'lkt' => [ 'Č', 'Ǧ', 'Ȟ', 'Š', 'Ž' ],
179 'ln' => [ 'Ɛ' ],
180 'lo' => [],
181 'lt' => [ "Č", "Š", "Ž" ],
182 'lv' => [ "Č", "Ģ", "Ķ", "Ļ", "Ņ", "Š", "Ž" ],
183 'mk' => [ "Ѓ", "Ќ" ],
184 'ml' => [],
185 'mn' => [],
186 'mo' => [ "Ă", "Â", "Î", "Ș", "Ț" ], // not in libicu
187 'mr' => [ "\u{0902}", "\u{0903}", "ळ", "क्ष", "ज्ञ" ],
188 'ms' => [],
189 'mt' => [ "Ċ", "Ġ", "Għ", "Ħ", "Ż" ],
190 'nb' => [ "Æ", "Ø", "Å" ],
191 'ne' => [],
192 'nl' => [],
193 'nn' => [ "Æ", "Ø", "Å" ],
194 'no' => [ "Æ", "Ø", "Å" ], // not in libicu. You should probably use nb or nn instead.
195 'oc' => [], // not in libicu
196 'om' => [ 'Ch', 'Dh', 'Kh', 'Ny', 'Ph', 'Sh' ],
197 'or' => [ "\u{0B01}", "\u{0B02}", "\u{0B03}", "କ୍ଷ" ],
198 'pa' => [ "\u{0A4D}" ],
199 'pl' => [ "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż" ],
200 'pt' => [],
201 'rm' => [], // not in libicu
202 'ro' => [ "Ă", "Â", "Î", "Ș", "Ț" ],
203 'ru' => [],
204 'rup' => [ "Ă", "Â", "Î", "Ľ", "Ń", "Ș", "Ț" ], // not in libicu
205 'sco' => [],
206 'se' => [
207 'Á', 'Č', 'Ʒ', 'Ǯ', 'Đ', 'Ǧ', 'Ǥ', 'Ǩ', 'Ŋ',
208 'Š', 'Ŧ', 'Ž', 'Ø', 'Æ', 'Ȧ', 'Ä', 'Ö'
209 ],
210 'si' => [ "\u{0D82}", "\u{0D83}", "\u{0DA4}" ],
211 'sk' => [ "Ä", "Č", "Ch", "Ô", "Š", "Ž" ],
212 'sl' => [ "Č", "Š", "Ž" ],
213 'smn' => [ "Á", "Č", "Đ", "Ŋ", "Š", "Ŧ", "Ž", "Æ", "Ø", "Å", "Ä", "Ö" ],
214 'sq' => [ "Ç", "Dh", "Ë", "Gj", "Ll", "Nj", "Rr", "Sh", "Th", "Xh", "Zh" ],
215 'sr' => [],
216 'sr-Latn' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
217 'sv' => [ "Å", "Ä", "Ö" ],
218 'sv@collation=standard' => [ "Å", "Ä", "Ö" ],
219 'sw' => [],
220 'ta' => [
221 "\u{0B82}", "ஃ", "க்ஷ", "க்", "ங்", "ச்", "ஞ்", "ட்", "ண்", "த்", "ந்",
222 "ப்", "ம்", "ய்", "ர்", "ல்", "வ்", "ழ்", "ள்", "ற்", "ன்", "ஜ்", "ஶ்", "ஷ்",
223 "ஸ்", "ஹ்", "க்ஷ்"
224 ],
225 'te' => [ "\u{0C01}", "\u{0C02}", "\u{0C03}" ],
226 'th' => [ "ฯ", "\u{0E46}", "\u{0E4D}", "\u{0E3A}" ],
227 'tk' => [ "Ç", "Ä", "Ž", "Ň", "Ö", "Ş", "Ü", "Ý" ],
228 'tl' => [ "Ñ", "Ng" ], // not in libicu
229 'to' => [ "Ng", "ʻ" ],
230 'tr' => [ "Ç", "Ğ", "İ", "Ö", "Ş", "Ü" ],
231 '-tr' => [ "ı" ],
232 'tt' => [ "Ә", "Ө", "Ү", "Җ", "Ң", "Һ" ], // not in libicu
233 'uk' => [ "Ґ", "Ь" ],
234 'uz' => [ "Ch", "G'", "Ng", "O'", "Sh" ], // not in libicu
235 'vi' => [ "Ă", "Â", "Đ", "Ê", "Ô", "Ơ", "Ư" ],
236 'vo' => [ "Ä", "Ö", "Ü" ],
237 'yi' => [
238 "\u{05D1}\u{05BF}", "\u{05DB}\u{05BC}", "\u{05E4}\u{05BC}",
239 "\u{05E9}\u{05C2}", "\u{05EA}\u{05BC}"
240 ],
241 'yo' => [ "Ẹ", "Gb", "Ọ", "Ṣ" ],
242 'zu' => [],
243 ];
244
249 public function __construct(
250 LanguageFactory $languageFactory,
251 $locale
252 ) {
253 $this->locale = $locale;
254 // Drop everything after the '@' in locale's name
255 $localeParts = explode( '@', $locale );
256 $this->digitTransformLanguage = $languageFactory->getLanguage( $locale === 'root' ? 'en' : $localeParts[0] );
257
258 $mainCollator = Collator::create( $locale );
259 if ( !$mainCollator ) {
260 throw new InvalidArgumentException( "Invalid ICU locale specified for collation: $locale" );
261 }
262 $this->mainCollator = $mainCollator;
263
264 // @phan-suppress-next-line PhanPossiblyNullTypeMismatchProperty successed before, no null check needed
265 $this->primaryCollator = Collator::create( $locale );
266 $this->primaryCollator->setStrength( Collator::PRIMARY );
267
268 // If the special suffix for numeric collation is present, turn on numeric collation.
269 if ( str_ends_with( $locale, '-u-kn' ) ) {
270 $this->useNumericCollation = true;
271 // Strip off the special suffix so it doesn't trip up fetchFirstLetterData().
272 $this->locale = substr( $this->locale, 0, -5 );
273 $this->mainCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
274 $this->primaryCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
275 }
276 }
277
278 public function getSortKey( $string ) {
279 return $this->mainCollator->getSortKey( $string );
280 }
281
282 public function getFirstLetter( $string ) {
283 $string = strval( $string );
284 if ( $string === '' ) {
285 return '';
286 }
287
288 $firstChar = mb_substr( $string, 0, 1, 'UTF-8' );
289
290 // If the first character is a CJK character, just return that character.
291 if ( ord( $firstChar ) > 0x7f && self::isCjk( mb_ord( $firstChar ) ) ) {
292 return $firstChar;
293 }
294
295 $sortKey = $this->getPrimarySortKey( $string );
296 $data = $this->getFirstLetterData();
297 $keys = $data['keys'];
298 $letters = $data['chars'];
299
300 // Do a binary search to find the correct letter to sort under
301 $min = ArrayUtils::findLowerBound(
302 static function ( $index ) use ( $keys ) {
303 return $keys[$index];
304 },
305 count( $keys ),
306 'strcmp',
307 $sortKey );
308
309 if ( $min === false ) {
310 // Before the first letter
311 return '';
312 }
313
314 $sortLetter = $letters[$min];
315
316 if ( $this->useNumericCollation ) {
317 // If the sort letter is a number, return '0–9' (or localized equivalent).
318 // ASCII value of 0 is 48. ASCII value of 9 is 57.
319 // Note that this also applies to non-Arabic numerals since they are
320 // mapped to Arabic numeral sort letters. For example, ২ sorts as 2.
321 if ( ord( $sortLetter ) >= 48 && ord( $sortLetter ) <= 57 ) {
322 $sortLetter = wfMessage( 'category-header-numerals' )->numParams( 0, 9 )->text();
323 }
324 }
325 return $sortLetter;
326 }
327
328 private function getPrimarySortKey( $string ) {
329 return $this->primaryCollator->getSortKey( $string );
330 }
331
336 private function getFirstLetterData() {
337 if ( $this->firstLetterData === null ) {
338 $cache = MediaWikiServices::getInstance()->getObjectCacheFactory()
339 ->getLocalServerInstance( CACHE_ANYTHING );
340 $cacheKey = $cache->makeKey(
341 'first-letters',
342 static::class,
343 $this->locale,
344 $this->digitTransformLanguage->getCode(),
345 INTL_ICU_VERSION,
346 self::FIRST_LETTER_VERSION
347 );
348 $this->firstLetterData = $cache->getWithSetCallback( $cacheKey, $cache::TTL_WEEK, function () {
349 return $this->fetchFirstLetterData();
350 } );
351 }
352 return $this->firstLetterData;
353 }
354
358 private function fetchFirstLetterData() {
359 // Generate data from serialized data file
360 if ( isset( self::TAILORING_FIRST_LETTERS[$this->locale] ) ) {
361 $letters = require __DIR__ . "/data/first-letters-root.php";
362 // Append additional characters
363 $letters = array_merge( $letters, self::TAILORING_FIRST_LETTERS[$this->locale] );
364 // Remove unnecessary ones, if any
365 if ( isset( self::TAILORING_FIRST_LETTERS['-' . $this->locale] ) ) {
366 $letters = array_diff( $letters, self::TAILORING_FIRST_LETTERS['-' . $this->locale] );
367 }
368 // Apply digit transforms
369 $digits = [ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' ];
370 $letters = array_diff( $letters, $digits );
371 foreach ( $digits as $digit ) {
372 $letters[] = $this->digitTransformLanguage->formatNumNoSeparators( $digit );
373 }
374 } elseif ( $this->locale === 'root' ) {
375 $letters = require __DIR__ . "/data/first-letters-root.php";
376 } else {
377 throw new RuntimeException( "MediaWiki does not support ICU locale " .
378 "\"{$this->locale}\"" );
379 }
380
381 /* Sort the letters.
382 *
383 * It's impossible to have the precompiled data file properly sorted,
384 * because the sort order changes depending on ICU version. If the
385 * array is not properly sorted, the binary search will return random
386 * results.
387 *
388 * We also take this opportunity to remove primary collisions.
389 */
390 $letterMap = [];
391 foreach ( $letters as $letter ) {
392 $key = $this->getPrimarySortKey( $letter );
393 if ( isset( $letterMap[$key] ) ) {
394 // Primary collision (two characters with the same sort position).
395 // Keep whichever one sorts first in the main collator.
396 $comp = $this->mainCollator->compare( $letter, $letterMap[$key] );
397 wfDebug( "Primary collision '$letter' '{$letterMap[$key]}' (comparison: $comp)" );
398 // If that also has a collision, use codepoint as a tiebreaker.
399 if ( $comp === 0 ) {
400 $comp = mb_ord( $letter ) <=> mb_ord( $letterMap[$key] );
401 }
402 if ( $comp < 0 ) {
403 $letterMap[$key] = $letter;
404 }
405 } else {
406 $letterMap[$key] = $letter;
407 }
408 }
409 ksort( $letterMap, SORT_STRING );
410
411 /* Remove duplicate prefixes. Basically, if something has a sortkey
412 * which is a prefix of some other sortkey, then it is an
413 * expansion and probably should not be considered a section
414 * header.
415 *
416 * For example, 'þ' is sometimes sorted as if it is the letters
417 * 'th'. Other times it is its own primary element. Another
418 * example is '₨'. Sometimes it's a currency symbol. Sometimes it
419 * is an 'R' followed by an 's'.
420 *
421 * Additionally, an expanded element should always sort directly
422 * after its first element due to the way sortkeys work.
423 *
424 * UCA sortkey elements are of variable length, but no collation
425 * element should be a prefix of some other element, so I think
426 * this is safe. See:
427 * - https://unicode-org.github.io/icu-docs/design/collation/ICU_collation_design.htm
428 * - https://icu.unicode.org/design/collation/uca-weight-allocation
429 *
430 * Additionally, there is something called primary compression to
431 * worry about. Basically, if you have two primary elements that
432 * are more than one byte and both start with the same byte, then
433 * the first byte is dropped on the second primary. Additionally,
434 * either \x03 or \xFF may be added to mean that the next primary
435 * does not start with the first byte of the first primary.
436 *
437 * This shouldn't matter much, as the first primary is not
438 * changed, and that is what we are comparing against.
439 *
440 * tl;dr: This makes some assumptions about how ICU implements
441 * collations. It seems incredibly unlikely these assumptions
442 * will change, but nonetheless they are assumptions.
443 */
444
445 $prev = '';
446 $duplicatePrefixes = [];
447 foreach ( $letterMap as $key => $value ) {
448 // Due to the fact the array is sorted, we only have
449 // to compare with the element directly previous
450 // to the current element (skipping expansions).
451 // An element "X" will always sort directly
452 // before "XZ" (Unless we have "XY", but we
453 // do not update $prev in that case).
454 if ( $prev !== '' && str_starts_with( $key, $prev ) ) {
455 $duplicatePrefixes[] = $key;
456 // If this is an expansion, we don't want to
457 // compare the next element to this element,
458 // but to what is currently $prev
459 continue;
460 }
461 $prev = $key;
462 }
463 foreach ( $duplicatePrefixes as $badKey ) {
464 wfDebug( "Removing '{$letterMap[$badKey]}' from first letters." );
465 unset( $letterMap[$badKey] );
466 // This code assumes that unsetting does not change sort order.
467 }
468 return [
469 'chars' => array_values( $letterMap ),
470 'keys' => array_keys( $letterMap ),
471 ];
472 }
473
480 public static function isCjk( $codepoint ) {
481 foreach ( self::CJK_BLOCKS as $block ) {
482 if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) {
483 return true;
484 }
485 }
486 return false;
487 }
488}
const CACHE_ANYTHING
Definition Defines.php:86
wfDebug( $text, $dest='all', array $context=[])
Sends a line to the debug log if enabled or, optionally, to a comment in output.
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
getFirstLetter( $string)
Given a string, return the logical "first letter" to be used for grouping on category pages and so on...
static isCjk( $codepoint)
Test if a code point is a CJK (Chinese, Japanese, Korean) character.
__construct(LanguageFactory $languageFactory, $locale)
Language $digitTransformLanguage
getSortKey( $string)
Given a string, convert it to a (hopefully short) key that can be used for efficient sorting.
Internationalisation code See https://www.mediawiki.org/wiki/Special:MyLanguage/Localisation for more...
getLanguage( $code)
Get a cached or new language object for a given language code with normalization of the language code...
Service locator for MediaWiki core services.