Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
73.75% |
59 / 80 |
|
54.55% |
6 / 11 |
CRAP | |
0.00% |
0 / 1 |
AntiSpoof | |
73.75% |
59 / 80 |
|
54.55% |
6 / 11 |
64.12 | |
0.00% |
0 / 1 |
getEquivSet | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
getScriptCode | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
4.25 | |||
isSubsetOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
isAllowedScriptCombination | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
3.14 | |||
stringToList | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
3.03 | |||
listToString | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
hardjoin | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
normalizeString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
stripScript | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
badCharErr | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
checkUnicodeStringStatus | |
75.61% |
31 / 41 |
|
0.00% |
0 / 1 |
18.26 |
1 | <?php |
2 | /** |
3 | * AntiSpoof.php |
4 | * Username spoofing prevention for MediaWiki |
5 | * Version 0.04 |
6 | * |
7 | * Copyright (C) Neil Harris 2006 |
8 | * Python->PHP conversion by Brion Vibber <brion@pobox.com> |
9 | * |
10 | * 2006-06-30 Handles non-CJK scripts as per UTR #39 + my extensions |
11 | * 2006-07-01 Now handles Simplified <-> Traditional Chinese rules, as |
12 | * per JET Guidelines for Internationalized Domain Names, |
13 | * and the ICANN language registry values for .cn |
14 | * 2006-09-14 Now handles 'rn' etc better, and uses stdin for input |
15 | * 2006-09-18 Added exception handling for nasty cases, eg BiDi violations |
16 | * 2006-09-19 Converted to PHP for easier integration into a MW extension |
17 | * |
18 | * This program is free software; you can redistribute it and/or modify |
19 | * it under the terms of the GNU General Public License as published by |
20 | * the Free Software Foundation; either version 2 of the License, or |
21 | * (at your option) any later version. |
22 | * |
23 | * This program is distributed in the hope that it will be useful, but |
24 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
26 | * General Public License for more details. |
27 | * |
28 | * You should have received a copy of the GNU General Public License |
29 | * along with this program; if not, write to the Free Software |
30 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 |
31 | * USA |
32 | */ |
33 | |
34 | namespace MediaWiki\Extension\AntiSpoof; |
35 | |
36 | use MediaWiki\Config\ConfigException; |
37 | use MediaWiki\Status\Status; |
38 | use UtfNormal\Utils; |
39 | use UtfNormal\Validator; |
40 | use Wikimedia\Equivset\Equivset; |
41 | |
/**
 * Username spoofing prevention: reduces a name to a canonical "skeleton" so
 * that visually confusable names compare equal, and rejects names that use
 * prohibited, unassigned or deprecated characters or that mix scripts.
 *
 * All functionality is exposed through static methods.
 */
class AntiSpoof {

	// Pseudo-script tags for code points that must be rejected.
	private const SCRIPT_DEPRECATED = 'DEPRECATED';
	private const SCRIPT_UNASSIGNED = 'UNASSIGNED';

	// Script tags that ALL_SCRIPT_RANGES may assign to a code point. Some tags are
	// only referenced by range entries that are currently commented out.
	private const SCRIPT_ARABIC = 'ARABIC';
	private const SCRIPT_ARMENIAN = 'ARMENIAN';
	private const SCRIPT_ASCII_DIGITS = 'ASCII_DIGITS';
	private const SCRIPT_ASCII_PUNCTUATION = 'ASCII_PUNCTUATION';
	private const SCRIPT_BENGALI = 'BENGALI';
	private const SCRIPT_BOPOMOFO = 'BOPOMOFO';
	private const SCRIPT_BUGINESE = 'BUGINESE';
	private const SCRIPT_BUHID = 'BUHID';
	private const SCRIPT_CANADIAN_ABORIGINAL = 'CANADIAN_ABORIGINAL';
	private const SCRIPT_CHEROKEE = 'CHEROKEE';
	private const SCRIPT_COMBINING_MARKS = 'COMBINING_MARKS';
	private const SCRIPT_COPTIC = 'COPTIC';
	private const SCRIPT_COPTIC_EXTRAS = 'COPTIC_EXTRAS';
	private const SCRIPT_CYPRIOT = 'CYPRIOT';
	private const SCRIPT_CYRILLIC = 'CYRILLIC';
	private const SCRIPT_DESERET = 'DESERET';
	private const SCRIPT_DEVANAGARI = 'DEVANAGARI';
	private const SCRIPT_ETHIOPIC = 'ETHIOPIC';
	private const SCRIPT_GEORGIAN = 'GEORGIAN';
	private const SCRIPT_GLAGOLITIC = 'GLAGOLITIC';
	private const SCRIPT_GOTHIC = 'GOTHIC';
	private const SCRIPT_GREEK = 'GREEK';
	private const SCRIPT_GUJARATI = 'GUJARATI';
	private const SCRIPT_GURMUKHI = 'GURMUKHI';
	private const SCRIPT_HAN = 'HAN';
	private const SCRIPT_HANGUL = 'HANGUL';
	private const SCRIPT_HANUNOO = 'HANUNOO';
	private const SCRIPT_HEBREW = 'HEBREW';
	private const SCRIPT_HIRAGANA = 'HIRAGANA';
	private const SCRIPT_KANNADA = 'KANNADA';
	private const SCRIPT_KATAKANA = 'KATAKANA';
	private const SCRIPT_KHAROSHTHI = 'KHAROSHTHI';
	private const SCRIPT_KHMER = 'KHMER';
	private const SCRIPT_LAO = 'LAO';
	private const SCRIPT_LATIN = 'LATIN';
	private const SCRIPT_LIMBU = 'LIMBU';
	private const SCRIPT_LINEAR_B = 'LINEAR_B';
	private const SCRIPT_MALAYALAM = 'MALAYALAM';
	private const SCRIPT_MEETEI_MAYEK = 'MEETEI_MAYEK';
	private const SCRIPT_MEETEI_MAYEK_EXTENSIONS = 'MEETEI_MAYEK_EXTENSIONS';
	private const SCRIPT_MONGOLIAN = 'MONGOLIAN';
	private const SCRIPT_MYANMAR = 'MYANMAR';
	private const SCRIPT_NEW_TAI_LUE = 'NEW_TAI_LUE';
	private const SCRIPT_NKO = 'NKO';
	private const SCRIPT_OGHAM = 'OGHAM';
	private const SCRIPT_OL_CHIKI = 'OL_CHIKI';
	private const SCRIPT_OLD_ITALIC = 'OLD_ITALIC';
	private const SCRIPT_OLD_PERSIAN = 'OLD_PERSIAN';
	private const SCRIPT_ORIYA = 'ORIYA';
	private const SCRIPT_OSMANYA = 'OSMANYA';
	private const SCRIPT_RUNIC = 'RUNIC';
	private const SCRIPT_SHAVIAN = 'SHAVIAN';
	private const SCRIPT_SINHALA = 'SINHALA';
	private const SCRIPT_SYLOTI_NAGRI = 'SYLOTI_NAGRI';
	private const SCRIPT_SYRIAC = 'SYRIAC';
	private const SCRIPT_TAGALOG = 'TAGALOG';
	private const SCRIPT_TAGBANWA = 'TAGBANWA';
	private const SCRIPT_TAI_LE = 'TAI_LE';
	private const SCRIPT_TAMIL = 'TAMIL';
	private const SCRIPT_TELUGU = 'TELUGU';
	private const SCRIPT_THAANA = 'THAANA';
	private const SCRIPT_THAI = 'THAI';
	private const SCRIPT_TIBETAN = 'TIBETAN';
	private const SCRIPT_TIFINAGH = 'TIFINAGH';
	private const SCRIPT_UGARITIC = 'UGARITIC';
	private const SCRIPT_WARANG_CITI = 'WARANG_CITI';
	private const SCRIPT_YI = 'YI';

	// phpcs:disable MediaWiki.WhiteSpace.SpaceBeforeSingleLineComment.NewLineComment

	/**
	 * Define script tag codes for various Unicode codepoint ranges.
	 * Each entry is [ first code point, last code point (inclusive), script tag ].
	 * If it does not have a code here, it does not have a script assignment.
	 * NB: Braille is not in this list since it is a transliteration system, not a script;
	 *     this does not disadvantage blind people, who will use Braille input/output methods
	 *     and not raw Braille...
	 * NB: Middle dot is included in SCRIPT_LATIN for use in Catalan
	 * NB: All scripts described by the Unicode Consortium as "Other Scripts" or "Ancient Scripts"
	 *     are commented out: these are either not in modern use, or only used for specialized
	 *     religious purposes, or only of literary interest
	 */
	private const ALL_SCRIPT_RANGES = [
		[ 0x0020, 0x002F,
			self::SCRIPT_ASCII_PUNCTUATION ], // ASCII Punctuation 1, Hyphen, ASCII Punctuation 2
		[ 0x0030, 0x0039, self::SCRIPT_ASCII_DIGITS ], // ASCII Digits
		[ 0x003A, 0x0040, self::SCRIPT_ASCII_PUNCTUATION ], // Colon, ASCII Punctuation 3
		[ 0x0041, 0x005A, self::SCRIPT_LATIN ], // ASCII Uppercase
		[ 0x005B, 0x0060,
			self::SCRIPT_ASCII_PUNCTUATION ], // ASCII Punctuation 4, Underscore, ASCII Punctuation 5
		[ 0x0061, 0x007A, self::SCRIPT_LATIN ], // ASCII Lowercase
		[ 0x007B, 0x007E, self::SCRIPT_ASCII_PUNCTUATION ], // ASCII Punctuation 6
		[ 0x00B7, 0x00B7, self::SCRIPT_LATIN ], // Middle Dot
		[ 0x00C0, 0x00D6, self::SCRIPT_LATIN ], // Latin-1 Letters 1
		[ 0x00D8, 0x00F6, self::SCRIPT_LATIN ], // Latin-1 Letters 2
		[ 0x00F8, 0x02AF,
			self::SCRIPT_LATIN ], // Latin-1 Letters 3, Latin Extended-A, Latin Extended-B, IPA Extensions
		[ 0x0300, 0x036F, self::SCRIPT_COMBINING_MARKS ], // Combining Diacritical Marks
		[ 0x0370, 0x03E1, self::SCRIPT_GREEK ], // Greek and Coptic (Greek)
		[ 0x03E2, 0x03EF, self::SCRIPT_COPTIC_EXTRAS ], // Greek and Coptic (Coptic-unique)
		[ 0x03F0, 0x03FF, self::SCRIPT_GREEK ], // Greek and Coptic (Greek)
		[ 0x0400, 0x052F, self::SCRIPT_CYRILLIC ], // Cyrillic, Cyrillic Supplement
		[ 0x0530, 0x058F, self::SCRIPT_ARMENIAN ], // Armenian
		[ 0x0590, 0x05FF, self::SCRIPT_HEBREW ], // Hebrew
		[ 0x0600, 0x06FF, self::SCRIPT_ARABIC ], // Arabic
		[ 0x0700, 0x074F, self::SCRIPT_SYRIAC ], // Syriac
		[ 0x0750, 0x077F, self::SCRIPT_ARABIC ], // Arabic Supplement
		[ 0x0780, 0x07BF, self::SCRIPT_THAANA ], // Thaana
		[ 0x07C0, 0x07FF, self::SCRIPT_NKO ], // NKo (N'Ko)
		[ 0x0900, 0x097F, self::SCRIPT_DEVANAGARI ], // Devanagari
		[ 0x0980, 0x09FF, self::SCRIPT_BENGALI ], // Bengali
		[ 0x0A00, 0x0A7F, self::SCRIPT_GURMUKHI ], // Gurmukhi
		[ 0x0A80, 0x0AFF, self::SCRIPT_GUJARATI ], // Gujarati
		[ 0x0B00, 0x0B7F, self::SCRIPT_ORIYA ], // Oriya
		[ 0x0B80, 0x0BFF, self::SCRIPT_TAMIL ], // Tamil
		[ 0x0C00, 0x0C7F, self::SCRIPT_TELUGU ], // Telugu
		[ 0x0C80, 0x0CFF, self::SCRIPT_KANNADA ], // Kannada
		[ 0x0D00, 0x0D7F, self::SCRIPT_MALAYALAM ], // Malayalam
		[ 0x0D80, 0x0DFF, self::SCRIPT_SINHALA ], // Sinhala
		[ 0x0E00, 0x0E7F, self::SCRIPT_THAI ], // Thai
		[ 0x0E80, 0x0EFF, self::SCRIPT_LAO ], // Lao
		[ 0x0F00, 0x0FFF, self::SCRIPT_TIBETAN ], // Tibetan
		[ 0x1000, 0x109F, self::SCRIPT_MYANMAR ], // Myanmar
		[ 0x10A0, 0x10FF, self::SCRIPT_GEORGIAN ], // Georgian
		[ 0x1100, 0x11FF, self::SCRIPT_HANGUL ], // Hangul Jamo
		[ 0x1200, 0x139F, self::SCRIPT_ETHIOPIC ], // Ethiopic, Ethiopic Supplement
		[ 0x13A0, 0x13FF, self::SCRIPT_CHEROKEE ], // Cherokee
		[ 0x1400, 0x167F, self::SCRIPT_CANADIAN_ABORIGINAL ], // Unified Canadian Aboriginal Syllabics
		// [ 0x1680, 0x169F, self::SCRIPT_OGHAM ], // Ogham
		// [ 0x16A0, 0x16FF, self::SCRIPT_RUNIC ], // Runic
		[ 0x1700, 0x171F, self::SCRIPT_TAGALOG ], // Tagalog
		[ 0x1720, 0x173F, self::SCRIPT_HANUNOO ], // Hanunoo
		[ 0x1740, 0x175F, self::SCRIPT_BUHID ], // Buhid
		[ 0x1760, 0x177F, self::SCRIPT_TAGBANWA ], // Tagbanwa
		[ 0x1780, 0x17FF, self::SCRIPT_KHMER ], // Khmer
		[ 0x1800, 0x18AF, self::SCRIPT_MONGOLIAN ], // Mongolian
		[ 0x1900, 0x194F, self::SCRIPT_LIMBU ], // Limbu
		[ 0x1950, 0x197F, self::SCRIPT_TAI_LE ], // Tai Le
		[ 0x1980, 0x19DF, self::SCRIPT_NEW_TAI_LUE ], // New Tai Lue
		[ 0x1A00, 0x1A1F, self::SCRIPT_BUGINESE ], // Buginese
		[ 0x1C50, 0x1C7F, self::SCRIPT_OL_CHIKI ], // Ol Chiki
		[ 0x1C90, 0x1CBF, self::SCRIPT_GEORGIAN ], // Georgian Extended
		[ 0x1E00, 0x1EFF, self::SCRIPT_LATIN ], // Latin Extended Additional
		[ 0x1F00, 0x1FFF, self::SCRIPT_GREEK ], // Greek Extended
		// [ 0x2C00, 0x2C5F, self::SCRIPT_GLAGOLITIC ], // Glagolitic
		[ 0x2C80, 0x2CFF, self::SCRIPT_COPTIC ], // Coptic
		[ 0x2D00, 0x2D2F, self::SCRIPT_GEORGIAN ], // Georgian Supplement
		[ 0x2D30, 0x2D7F, self::SCRIPT_TIFINAGH ], // Tifinagh
		[ 0x2D80, 0x2DDF, self::SCRIPT_ETHIOPIC ], // Ethiopic Extended
		[ 0x2E80, 0x2FDF, self::SCRIPT_DEPRECATED ], // CJK Radicals Supplement, Kangxi Radicals
		[ 0x3040, 0x309F, self::SCRIPT_HIRAGANA ], // Hiragana
		[ 0x30A0, 0x30FF, self::SCRIPT_KATAKANA ], // Katakana
		[ 0x3100, 0x312F, self::SCRIPT_BOPOMOFO ], // Bopomofo
		[ 0x3130, 0x318F, self::SCRIPT_HANGUL ], // Hangul Compatibility Jamo
		[ 0x31A0, 0x31BF, self::SCRIPT_BOPOMOFO ], // Bopomofo Extended
		[ 0x31F0, 0x31FF, self::SCRIPT_KATAKANA ], // Katakana Phonetic Extensions
		[ 0x3400, 0x4DBF, self::SCRIPT_HAN ], // CJK Unified Ideographs Extension A
		[ 0x4E00, 0x9FFF, self::SCRIPT_HAN ], // CJK Unified Ideographs
		[ 0xA000, 0xA4CF, self::SCRIPT_YI ], // Yi Syllables, Yi Radicals
		[ 0xA800, 0xA82F, self::SCRIPT_SYLOTI_NAGRI ], // Syloti Nagri
		// [ 0xAAE0, 0xAAFF, self::SCRIPT_MEETEI_MAYEK_EXTENSIONS ] // Meetei Mayek Extensions
		[ 0xAB70, 0xABBF, self::SCRIPT_CHEROKEE ], // Cherokee Supplement
		[ 0xABC0, 0xABFF, self::SCRIPT_MEETEI_MAYEK ], // Meetei Mayek
		[ 0xAC00, 0xD7AF, self::SCRIPT_HANGUL ], // Hangul Syllables
		[ 0xF900, 0xFAFF, self::SCRIPT_DEPRECATED ], // CJK Compatibility Ideographs
		// [ 0x10000, 0x100FF, self::SCRIPT_LINEAR_B ], // Linear B Syllabary, Linear B Ideograms
		// [ 0x10140, 0x1018F, self::SCRIPT_GREEK ], // Ancient Greek Numbers
		// [ 0x10300, 0x1032F, self::SCRIPT_OLD_ITALIC ], // Old Italic
		[ 0x10330, 0x1034F, self::SCRIPT_GOTHIC ], // Gothic
		// [ 0x10380, 0x1039F, self::SCRIPT_UGARITIC ], // Ugaritic
		// [ 0x103A0, 0x103DF, self::SCRIPT_OLD_PERSIAN ], // Old Persian
		// [ 0x10400, 0x1044F, self::SCRIPT_DESERET ], // Deseret
		// [ 0x10450, 0x1047F, self::SCRIPT_SHAVIAN ], // Shavian
		// [ 0x10480, 0x104AF, self::SCRIPT_OSMANYA ], // Osmanya
		// [ 0x10800, 0x1083F, self::SCRIPT_CYPRIOT ], // Cypriot Syllabary
		[ 0x10A00, 0x10A5F, self::SCRIPT_KHAROSHTHI ], // Kharoshthi
		[ 0x118A0, 0x118FF, self::SCRIPT_WARANG_CITI ], // Warang Citi
		[ 0x20000, 0x2A6DF, self::SCRIPT_HAN ], // CJK Unified Ideographs Extension B
		[ 0x2F800, 0x2FA1F, self::SCRIPT_DEPRECATED ] // CJK Compatibility Ideographs Supplement
	];

	/**
	 * Script mixtures that are accepted within a single name. A name may use
	 * more than one script only if all of its scripts fall inside one entry.
	 */
	private const ALLOWED_SCRIPT_COMBINATIONS = [
		[ self::SCRIPT_COPTIC, self::SCRIPT_COPTIC_EXTRAS ], // Coptic, using old Greek chars
		[ self::SCRIPT_GREEK, self::SCRIPT_COPTIC_EXTRAS ], // Coptic, using new Coptic chars
		[ self::SCRIPT_HAN, self::SCRIPT_BOPOMOFO ], // Chinese
		[ self::SCRIPT_HAN, self::SCRIPT_HANGUL ], // Korean
		[ self::SCRIPT_HAN, self::SCRIPT_KATAKANA, self::SCRIPT_HIRAGANA ] // Japanese
	];

	// phpcs:enable MediaWiki.WhiteSpace.SpaceBeforeSingleLineComment.NewLineComment

	/**
	 * Lazily created, shared Equivset instance (null until first use).
	 *
	 * @var Equivset|null
	 */
	private static $equivset;

	/**
	 * Return the shared Equivset instance, creating it on first call.
	 *
	 * @return Equivset
	 */
	public static function getEquivSet() {
		if ( self::$equivset === null ) {
			self::$equivset = new Equivset();
		}

		return self::$equivset;
	}

	/**
	 * Look up the script tag of a code point in ALL_SCRIPT_RANGES.
	 *
	 * @param int $ch Unicode code point
	 * @return string One of the SCRIPT_* values; SCRIPT_UNASSIGNED when no range contains $ch
	 */
	private static function getScriptCode( $ch ) {
		// Linear scan; the first range containing the code point wins.
		foreach ( self::ALL_SCRIPT_RANGES as [ $first, $last, $script ] ) {
			if ( $ch >= $first && $ch <= $last ) {
				return $script;
			}
		}

		return self::SCRIPT_UNASSIGNED;
	}

	/**
	 * Tell whether every value of $aList also occurs in $bList.
	 *
	 * @param array $aList
	 * @param array $bList
	 * @return bool
	 */
	private static function isSubsetOf( $aList, $bList ) {
		// array_diff() keeps the values of $aList that are missing from $bList.
		return !array_diff( $aList, $bList );
	}

	/**
	 * Is this an allowed script mixture?
	 *
	 * @param array $scriptList Script tags found in a name
	 * @return bool True if all tags fit inside one entry of ALLOWED_SCRIPT_COMBINATIONS
	 */
	private static function isAllowedScriptCombination( $scriptList ) {
		foreach ( self::ALLOWED_SCRIPT_COMBINATIONS as $allowedCombo ) {
			if ( self::isSubsetOf( $scriptList, $allowedCombo ) ) {
				return true;
			}
		}

		return false;
	}

	/**
	 * Convert string into array of Unicode code points as integers.
	 *
	 * @param string $str
	 * @return int[] Empty when the pattern matches nothing or matching fails
	 */
	private static function stringToList( $str ) {
		$matches = [];
		// preg_match_all() yields 0 for "no characters" and false on failure
		// (for instance when the subject is not valid UTF-8); both mean "nothing to return".
		if ( !preg_match_all( '/./us', $str, $matches ) ) {
			return [];
		}

		$out = [];
		foreach ( $matches[0] as $char ) {
			$out[] = Utils::utf8ToCodepoint( $char );
		}
		return $out;
	}

	/**
	 * Convert an array of Unicode code points back into a string.
	 *
	 * @param array $list Code points
	 * @return string
	 */
	private static function listToString( $list ) {
		$out = '';
		foreach ( $list as $cp ) {
			$out .= Utils::codepointToUtf8( $cp );
		}
		return $out;
	}

	/**
	 * Concatenate the elements of a list without any separator.
	 *
	 * @param array $a_list
	 * @return string
	 */
	private static function hardjoin( $a_list ) {
		return implode( '', $a_list );
	}

	/**
	 * Pass a string through the shared Equivset's normalize() method.
	 *
	 * @param string $testName
	 * @return string
	 */
	public static function normalizeString( $testName ) {
		return self::getEquivSet()->normalize( $testName );
	}

	/**
	 * Remove every code point that belongs to the given script.
	 *
	 * @param int[] $text Code points
	 * @param string $script Script tag (a SCRIPT_* value) to remove
	 * @return int[] Remaining code points, in order, re-indexed from 0
	 */
	private static function stripScript( array $text, $script ) {
		$out = [];
		foreach ( $text as $char ) {
			if ( self::getScriptCode( $char ) !== $script ) {
				$out[] = $char;
			}
		}
		return $out;
	}

	/**
	 * Return the first code point that appears in the prohibited list.
	 *
	 * @param int[] $codePoints Code points to inspect, in order
	 * @param array $prohibited Prohibited code points
	 * @return int|null The offending code point, or null if none is prohibited
	 */
	private static function findProhibitedChar( array $codePoints, array $prohibited ) {
		foreach ( $codePoints as $cp ) {
			// Loose comparison is kept on purpose: configuration entries written as
			// numeric strings have always matched and must continue to do so.
			if ( in_array( $cp, $prohibited ) ) {
				return $cp;
			}
		}

		return null;
	}

	/**
	 * Helper function for checkUnicodeStringStatus: Return an error on a bad character.
	 * @todo I would like to show Unicode character name, but it is not clear how to get it.
	 * @param string $msgId message identifier.
	 * @param int $point codepoint of the bad character.
	 * @return Status Fatal status wrapping the message $msgId with the character description
	 */
	private static function badCharErr( $msgId, $point ) {
		$symbol = Utils::codepointToUtf8( $point );
		// Combining marks are combined with the previous character. If abusing character is a
		// combining mark, prepend it with space to show them correctly.
		if ( self::getScriptCode( $point ) === self::SCRIPT_COMBINING_MARKS ) {
			$symbol = ' ' . $symbol;
		}
		$code = sprintf( 'U+%04X', $point );

		// A lone character in Unicode category C ("Other") is described by its code only.
		$char = preg_match( '/\A\p{C}\z/u', $symbol )
			? wfMessage( 'antispoof-bad-char-non-printable', $code )
			: wfMessage( 'antispoof-bad-char', $symbol, $code );

		return Status::newFatal( wfMessage( $msgId, $char ) );
	}

	/**
	 * Validate a name and compute its canonical, spoof-resistant form.
	 *
	 * Steps: reject prohibited characters (before and after NFKD decomposition),
	 * reject a leading combining mark, drop combining marks, reject unassigned or
	 * deprecated characters, reject disallowed script mixtures, drop ASCII
	 * punctuation, map confusable characters through the Equivset, fold the
	 * sequences "VV" -> "W" and "RN" -> "M", remove spaces, and prefix "v2:".
	 *
	 * TODO: does too much in one routine, refactor...
	 *
	 * @param string $testName
	 * @return Status Good status whose value is the "v2:"-prefixed canonical name, or a
	 *  fatal status with one of the antispoof-badtype, antispoof-empty,
	 *  antispoof-prohibited, antispoof-combining, antispoof-unassigned,
	 *  antispoof-deprecated, antispoof-noletters, antispoof-mixedscripts or
	 *  antispoof-tooshort messages
	 * @throws ConfigException If $wgAntiSpoofProhibitedCharacters is not an array
	 * @since 1.32
	 */
	public static function checkUnicodeStringStatus( $testName ) {
		global $wgAntiSpoofProhibitedCharacters;

		// Start with some sanity checking
		if ( !is_array( $wgAntiSpoofProhibitedCharacters ) ) {
			throw new ConfigException( '$wgAntiSpoofProhibitedCharacters should be an array!' );
		}
		if ( !is_string( $testName ) ) {
			return Status::newFatal( 'antispoof-badtype' );
		}
		if ( $testName === '' ) {
			return Status::newFatal( 'antispoof-empty' );
		}

		$badChar = self::findProhibitedChar(
			self::stringToList( $testName ),
			$wgAntiSpoofProhibitedCharacters
		);
		if ( $badChar !== null ) {
			return self::badCharErr( 'antispoof-prohibited', $badChar );
		}

		// Perform Unicode _compatibility_ decomposition
		$testName = Validator::toNFKD( $testName );
		$testChars = self::stringToList( $testName );

		// Be paranoid: check again, just in case Unicode normalization code changes...
		$badChar = self::findProhibitedChar( $testChars, $wgAntiSpoofProhibitedCharacters );
		if ( $badChar !== null ) {
			return self::badCharErr( 'antispoof-prohibited', $badChar );
		}

		// Nothing could be decoded (e.g. the input was not valid UTF-8). With no
		// characters there are no letters either; answer directly instead of
		// reading a non-existent first element below.
		if ( !$testChars ) {
			return Status::newFatal( 'antispoof-noletters' );
		}

		// Check for this: should not happen in any valid Unicode string
		if ( self::getScriptCode( $testChars[0] ) === self::SCRIPT_COMBINING_MARKS ) {
			return self::badCharErr( 'antispoof-combining', $testChars[0] );
		}

		// Strip all combining characters in order to crudely strip accents
		// Note: NFKD normalization should have decomposed all accented chars earlier
		$testChars = self::stripScript( $testChars, self::SCRIPT_COMBINING_MARKS );

		// $testScripts is index-aligned with $testChars, so a hit from array_search()
		// identifies the offending character.
		$testScripts = array_map( [ __CLASS__, 'getScriptCode' ], $testChars );
		$unassigned = array_search( self::SCRIPT_UNASSIGNED, $testScripts, true );
		if ( $unassigned !== false ) {
			return self::badCharErr( 'antispoof-unassigned', $testChars[$unassigned] );
		}
		$deprecated = array_search( self::SCRIPT_DEPRECATED, $testScripts, true );
		if ( $deprecated !== false ) {
			return self::badCharErr( 'antispoof-deprecated', $testChars[$deprecated] );
		}

		// We don't mind ASCII punctuation or digits
		$testScripts = array_diff(
			array_unique( $testScripts ),
			[ self::SCRIPT_ASCII_PUNCTUATION, self::SCRIPT_ASCII_DIGITS ]
		);

		if ( !$testScripts ) {
			return Status::newFatal( 'antispoof-noletters' );
		}

		if ( count( $testScripts ) > 1 && !self::isAllowedScriptCombination( $testScripts ) ) {
			return Status::newFatal( 'antispoof-mixedscripts' );
		}

		// At this point, we should probably check for BiDi violations if they aren't
		// caught above...

		// Squeeze out all punctuation chars
		$testChars = self::stripScript( $testChars, self::SCRIPT_ASCII_PUNCTUATION );

		// Replace characters in confusables set with equivalence chars
		$testName = self::normalizeString( self::listToString( $testChars ) );

		// Do very simple sequence processing: "vv" -> "w", "rn" -> "m"...
		// Not exhaustive, but ups the ante...
		// Do this _after_ canonicalization: looks weird, but needed for consistency
		$testName = str_replace( 'VV', 'W', $testName );
		$testName = str_replace( 'RN', 'M', $testName );

		// Remove all remaining spaces, just in case any have snuck through...
		$testName = self::hardjoin( explode( " ", $testName ) );

		// Reduce repeated char sequences to single character
		// BUG: TODO: implement this

		if ( $testName === '' ) {
			return Status::newFatal( 'antispoof-tooshort' );
		}

		// Don't ASCIIfy: we assume we are UTF-8 capable on output

		// Prepend version string, for futureproofing if this algorithm changes
		// And return the canonical version of the name
		return Status::newGood( "v2:" . $testName );
	}
}