Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
91.30% |
21 / 23 |
|
75.00% |
3 / 4 |
CRAP | |
0.00% |
0 / 1 |
CustomUppercaseCollation | |
91.30% |
21 / 23 |
|
75.00% |
3 / 4 |
10.07 | |
0.00% |
0 / 1 |
__construct | |
87.50% |
14 / 16 |
|
0.00% |
0 / 1 |
6.07 | |||
convertToPua | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getSortKey | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getFirstLetter | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | * @since 1.30 |
19 | * |
20 | * @file |
21 | */ |
22 | |
23 | use MediaWiki\Language\Language; |
24 | use MediaWiki\Languages\LanguageFactory; |
25 | |
/**
 * Resort normal UTF-8 order by putting a bunch of stuff in PUA
 *
 * This takes a bunch of characters (the alphabet) that should
 * be together, and converts them all to private-use-area characters
 * so that they are all sorted in the right order relative to each
 * other.
 *
 * This renumbers characters starting at U+F3000 (chosen to avoid
 * conflicts with other people using the private use area).
 *
 * This does not support fancy things like secondary differences, etc.
 * (It supports digraphs, trigraphs etc. though.)
 *
 * It is expected most people will subclass this and just override the
 * constructor to hard-code an alphabet.
 */
class CustomUppercaseCollation extends NumericUppercaseCollation {

	/** @var array Uppercased letters and multigraphs, longest first; search list for convertToPua() */
	private $alphabet;

	/** @var array Four-byte private use area codes; entry N is the replacement for $alphabet[N] */
	private $puaSubset;

	/** @var array First-letter heading for entry N (always the first member of its equal-weight group) */
	private $firstLetters;

	/**
	 * @note This assumes $alphabet does not contain U+F3000-U+F3FFF
	 *
	 * @param LanguageFactory $languageFactory
	 * @param array $alphabet Sorted array of uppercase characters. Can have array elements for equal weight chars
	 * @param string|Language $digitTransformLang What language for number sorting.
	 * @throws UnexpectedValueException If $alphabet is empty or has 4096 or more entries
	 */
	public function __construct(
		LanguageFactory $languageFactory,
		array $alphabet,
		$digitTransformLang
	) {
		// The two trailing bytes of each PUA code carry 6 bits each, so only
		// 64 * 64 = 4096 distinct positions can be encoded.
		$size = count( $alphabet );
		if ( $size < 1 || $size >= 4096 ) {
			throw new UnexpectedValueException( "Alphabet must have between 1 and 4095 items" );
		}
		$digitTransformLang = $digitTransformLang instanceof Language
			? $digitTransformLang
			: $languageFactory->getLanguage( $digitTransformLang );

		$this->puaSubset = [];
		$this->alphabet = [];
		// Initialised explicitly so array_multisort() below always receives an array,
		// even when every alphabet entry turns out to be an empty group.
		$this->firstLetters = [];

		// array_values() makes the position (not the caller's key) decide the sort weight.
		foreach ( array_values( $alphabet ) as $i => $group ) {
			// We allow alphabet to contain array members if multiple characters should be sorted as equivalent.
			$members = array_values( (array)$group );
			// Every member of a group shares the same code, hence the same weight.
			$puaCode = "\xF3\xB3" . chr( intdiv( $i, 64 ) + 128 ) . chr( ( $i % 64 ) + 128 );
			foreach ( $members as $member ) {
				$this->puaSubset[] = $puaCode;
				// For digraphs, we uppercase it all during sorting but not when displaying first letter.
				$this->alphabet[] = $digitTransformLang->uc( $member );
				// Note: first letters is always first of group
				$this->firstLetters[] = $members[0];
			}
		}

		// Sort these arrays so that any trigraphs, digraphs etc. are first
		// (and they get replaced first in convertToPua()).
		$lengths = array_map( 'mb_strlen', $this->alphabet );
		array_multisort( $lengths, SORT_DESC, $this->firstLetters, $this->alphabet, $this->puaSubset );

		parent::__construct( $languageFactory, $digitTransformLang );
	}

	/**
	 * Replace every occurrence of an alphabet entry with its private use area code.
	 *
	 * @param string $string
	 * @return string
	 */
	private function convertToPua( string $string ): string {
		return str_replace( $this->alphabet, $this->puaSubset, $string );
	}

	/**
	 * Build the parent collation's sort key, then remap alphabet entries in it to PUA codes.
	 *
	 * @param string $string
	 * @return string
	 */
	public function getSortKey( $string ) {
		return $this->convertToPua( parent::getSortKey( $string ) );
	}

	/**
	 * Get the first-letter heading for a string.
	 *
	 * @param string $string
	 * @return string The first letter of the matching alphabet group as supplied to the
	 *  constructor, otherwise the parent collation's result
	 */
	public function getFirstLetter( $string ) {
		$sortkey = $this->getSortKey( $string );

		// In case a title begins with a character from our alphabet, return the corresponding
		// first-letter. (This also happens if the title has a corresponding PUA code in it, to avoid
		// inconsistent behaviour. This class mostly assumes that people will not use PUA codes.)
		// Every code built in the constructor is exactly four bytes long, so a byte-wise
		// four-byte prefix is the right thing to look up.
		$index = array_search( substr( $sortkey, 0, 4 ), $this->puaSubset, true );
		if ( $index !== false ) {
			return $this->firstLetters[ $index ];
		}

		// String begins with a character outside of our alphabet, fall back
		return parent::getFirstLetter( $string );
	}
}