Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
73.75% |
59 / 80 |
|
54.55% |
6 / 11 |
CRAP | |
0.00% |
0 / 1 |
AntiSpoof | |
73.75% |
59 / 80 |
|
54.55% |
6 / 11 |
64.12 | |
0.00% |
0 / 1 |
getEquivSet | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
getScriptCode | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
4.25 | |||
isSubsetOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
isAllowedScriptCombination | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
3.14 | |||
stringToList | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
3.03 | |||
listToString | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
hardjoin | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
normalizeString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
stripScript | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
badCharErr | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
checkUnicodeStringStatus | |
75.61% |
31 / 41 |
|
0.00% |
0 / 1 |
18.26 |
1 | <?php |
2 | /** |
3 | * AntiSpoof.php |
4 | * Username spoofing prevention for MediaWiki |
5 | * Version 0.04 |
6 | * |
7 | * Copyright (C) Neil Harris 2006 |
8 | * Python->PHP conversion by Brion Vibber <brion@pobox.com> |
9 | * |
10 | * 2006-06-30 Handles non-CJK scripts as per UTR #39 + my extensions |
11 | * 2006-07-01 Now handles Simplified <-> Traditional Chinese rules, as |
12 | * per JET Guidelines for Internationalized Domain Names, |
13 | * and the ICANN language registry values for .cn |
14 | * 2006-09-14 Now handles 'rn' etc better, and uses stdin for input |
15 | * 2006-09-18 Added exception handling for nasty cases, eg BiDi violations |
16 | * 2006-09-19 Converted to PHP for easier integration into a MW extension |
17 | * |
18 | * This program is free software; you can redistribute it and/or modify |
19 | * it under the terms of the GNU General Public License as published by |
20 | * the Free Software Foundation; either version 2 of the License, or |
21 | * (at your option) any later version. |
22 | * |
23 | * This program is distributed in the hope that it will be useful, but |
24 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
26 | * General Public License for more details. |
27 | * |
28 | * You should have received a copy of the GNU General Public License |
29 | * along with this program; if not, write to the Free Software |
30 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 |
31 | * USA |
32 | */ |
33 | |
34 | namespace MediaWiki\Extension\AntiSpoof; |
35 | |
36 | use ConfigException; |
37 | use Status; |
38 | use UtfNormal\Utils; |
39 | use UtfNormal\Validator; |
40 | use Wikimedia\Equivset\Equivset; |
41 | |
42 | class AntiSpoof { |
43 | |
44 | private const SCRIPT_DEPRECATED = 'DEPRECATED'; |
45 | private const SCRIPT_UNASSIGNED = 'UNASSIGNED'; |
46 | |
47 | private const SCRIPT_ARABIC = 'ARABIC'; |
48 | private const SCRIPT_ARMENIAN = 'ARMENIAN'; |
49 | private const SCRIPT_ASCII_DIGITS = 'ASCII_DIGITS'; |
50 | private const SCRIPT_ASCII_PUNCTUATION = 'ASCII_PUNCTUATION'; |
51 | private const SCRIPT_BENGALI = 'BENGALI'; |
52 | private const SCRIPT_BOPOMOFO = 'BOPOMOFO'; |
53 | private const SCRIPT_BUGINESE = 'BUGINESE'; |
54 | private const SCRIPT_BUHID = 'BUHID'; |
55 | private const SCRIPT_CANADIAN_ABORIGINAL = 'CANADIAN_ABORIGINAL'; |
56 | private const SCRIPT_CHEROKEE = 'CHEROKEE'; |
57 | private const SCRIPT_COMBINING_MARKS = 'COMBINING_MARKS'; |
58 | private const SCRIPT_COPTIC = 'COPTIC'; |
59 | private const SCRIPT_COPTIC_EXTRAS = 'COPTIC_EXTRAS'; |
60 | private const SCRIPT_CYPRIOT = 'CYPRIOT'; |
61 | private const SCRIPT_CYRILLIC = 'CYRILLIC'; |
62 | private const SCRIPT_DESERET = 'DESERET'; |
63 | private const SCRIPT_DEVANAGARI = 'DEVANAGARI'; |
64 | private const SCRIPT_ETHIOPIC = 'ETHIOPIC'; |
65 | private const SCRIPT_GEORGIAN = 'GEORGIAN'; |
66 | private const SCRIPT_GLAGOLITIC = 'GLAGOLITIC'; |
67 | private const SCRIPT_GOTHIC = 'GOTHIC'; |
68 | private const SCRIPT_GREEK = 'GREEK'; |
69 | private const SCRIPT_GUJARATI = 'GUJARATI'; |
70 | private const SCRIPT_GURMUKHI = 'GURMUKHI'; |
71 | private const SCRIPT_HAN = 'HAN'; |
72 | private const SCRIPT_HANGUL = 'HANGUL'; |
73 | private const SCRIPT_HANUNOO = 'HANUNOO'; |
74 | private const SCRIPT_HEBREW = 'HEBREW'; |
75 | private const SCRIPT_HIRAGANA = 'HIRAGANA'; |
76 | private const SCRIPT_KANNADA = 'KANNADA'; |
77 | private const SCRIPT_KATAKANA = 'KATAKANA'; |
78 | private const SCRIPT_KHAROSHTHI = 'KHAROSHTHI'; |
79 | private const SCRIPT_KHMER = 'KHMER'; |
80 | private const SCRIPT_LAO = 'LAO'; |
81 | private const SCRIPT_LATIN = 'LATIN'; |
82 | private const SCRIPT_LIMBU = 'LIMBU'; |
83 | private const SCRIPT_LINEAR_B = 'LINEAR_B'; |
84 | private const SCRIPT_MALAYALAM = 'MALAYALAM'; |
85 | private const SCRIPT_MEETEI_MAYEK = 'MEETEI_MAYEK'; |
86 | private const SCRIPT_MEETEI_MAYEK_EXTENSIONS = 'MEETEI_MAYEK_EXTENSIONS'; |
87 | private const SCRIPT_MONGOLIAN = 'MONGOLIAN'; |
88 | private const SCRIPT_MYANMAR = 'MYANMAR'; |
89 | private const SCRIPT_NEW_TAI_LUE = 'NEW_TAI_LUE'; |
90 | private const SCRIPT_NKO = 'NKO'; |
91 | private const SCRIPT_OGHAM = 'OGHAM'; |
92 | private const SCRIPT_OL_CHIKI = 'OL_CHIKI'; |
93 | private const SCRIPT_OLD_ITALIC = 'OLD_ITALIC'; |
94 | private const SCRIPT_OLD_PERSIAN = 'OLD_PERSIAN'; |
95 | private const SCRIPT_ORIYA = 'ORIYA'; |
96 | private const SCRIPT_OSMANYA = 'OSMANYA'; |
97 | private const SCRIPT_RUNIC = 'RUNIC'; |
98 | private const SCRIPT_SHAVIAN = 'SHAVIAN'; |
99 | private const SCRIPT_SINHALA = 'SINHALA'; |
100 | private const SCRIPT_SYLOTI_NAGRI = 'SYLOTI_NAGRI'; |
101 | private const SCRIPT_SYRIAC = 'SYRIAC'; |
102 | private const SCRIPT_TAGALOG = 'TAGALOG'; |
103 | private const SCRIPT_TAGBANWA = 'TAGBANWA'; |
104 | private const SCRIPT_TAI_LE = 'TAI_LE'; |
105 | private const SCRIPT_TAMIL = 'TAMIL'; |
106 | private const SCRIPT_TELUGU = 'TELUGU'; |
107 | private const SCRIPT_THAANA = 'THAANA'; |
108 | private const SCRIPT_THAI = 'THAI'; |
109 | private const SCRIPT_TIBETAN = 'TIBETAN'; |
110 | private const SCRIPT_TIFINAGH = 'TIFINAGH'; |
111 | private const SCRIPT_UGARITIC = 'UGARITIC'; |
112 | private const SCRIPT_WARANG_CITI = 'WARANG_CITI'; |
113 | private const SCRIPT_YI = 'YI'; |
114 | |
115 | /** |
116 | * Define script tag codes for various Unicode codepoint ranges |
117 | * If it does not have a code here, it does not have a script assignment |
118 | * NB: Braille is not in this list since it is a transliteration system, not a script; |
119 | * this does not disadvantage blind people, who will use Braille input/output methods |
120 | * and not raw Braille... |
121 | * NB: Middle dot is included in SCRIPT_LATIN for use in Catalan |
122 | * NB: All scripts described by the Unicode Consortium as "Other Scripts" or "Ancient Scripts" |
123 | * are commented out: these are either not in modern use, or only used for specialized |
124 | * religious purposes, or only of literary interest |
125 | */ |
126 | private const ALL_SCRIPT_RANGES = [ |
127 | [ 0x0020, 0x002F, |
128 | self::SCRIPT_ASCII_PUNCTUATION ], // ASCII Punctuation 1, Hyphen, ASCII Punctuation 2 |
129 | [ 0x0030, 0x0039, self::SCRIPT_ASCII_DIGITS ], // ASCII Digits |
130 | [ 0x003A, 0x0040, self::SCRIPT_ASCII_PUNCTUATION ], // Colon, ASCII Punctuation 3 |
131 | [ 0x0041, 0x005A, self::SCRIPT_LATIN ], // ASCII Uppercase |
132 | [ 0x005B, 0x0060, |
133 | self::SCRIPT_ASCII_PUNCTUATION ], // ASCII Punctuation 4, Underscore, ASCII Punctuation 5 |
134 | [ 0x0061, 0x007A, self::SCRIPT_LATIN ], // ASCII Lowercase |
135 | [ 0x007B, 0x007E, self::SCRIPT_ASCII_PUNCTUATION ], // ASCII Punctuation 5 |
136 | [ 0x00B7, 0x00B7, self::SCRIPT_LATIN ], // Middle Dot |
137 | [ 0x00C0, 0x00D6, self::SCRIPT_LATIN ], // Latin-1 Letters 1 |
138 | [ 0x00D8, 0x00F6, self::SCRIPT_LATIN ], // Latin-1 Letters 2 |
139 | [ 0x00F8, 0x02AF, |
140 | self::SCRIPT_LATIN ], // Latin-1 Letters 3, Latin Extended-A, Latin Extended-B, IPA Extensions |
141 | [ 0x0300, 0x036F, self::SCRIPT_COMBINING_MARKS ], // Combining Diacritical Marks |
142 | [ 0x0370, 0x03E1, self::SCRIPT_GREEK ], // Greek and Coptic (Greek) |
143 | [ 0x03E2, 0x03EF, self::SCRIPT_COPTIC_EXTRAS ], // Greek and Coptic (Coptic-unique) |
144 | [ 0x03F0, 0x03FF, self::SCRIPT_GREEK ], // Greek and Coptic (Greek) |
145 | [ 0x0400, 0x052F, self::SCRIPT_CYRILLIC ], // Cyrillic, Cyrillic Supplement |
146 | [ 0x0530, 0x058F, self::SCRIPT_ARMENIAN ], // Armenian |
147 | [ 0x0590, 0x05FF, self::SCRIPT_HEBREW ], // Hebrew |
148 | [ 0x0600, 0x06FF, self::SCRIPT_ARABIC ], // Arabic |
149 | [ 0x0700, 0x074F, self::SCRIPT_SYRIAC ], // Syriac |
150 | [ 0x0750, 0x077F, self::SCRIPT_ARABIC ], // Arabic Supplement |
151 | [ 0x0780, 0x07BF, self::SCRIPT_THAANA ], // Thaana |
152 | [ 0x07C0, 0x07FF, self::SCRIPT_NKO ], // NKo (N'Ko) |
153 | [ 0x0900, 0x097F, self::SCRIPT_DEVANAGARI ], // Devanagari |
154 | [ 0x0980, 0x09FF, self::SCRIPT_BENGALI ], // Bengali |
155 | [ 0x0A00, 0x0A7F, self::SCRIPT_GURMUKHI ], // Gurmukhi |
156 | [ 0x0A80, 0x0AFF, self::SCRIPT_GUJARATI ], // Gujarati |
157 | [ 0x0B00, 0x0B7F, self::SCRIPT_ORIYA ], // Oriya |
158 | [ 0x0B80, 0x0BFF, self::SCRIPT_TAMIL ], // Tamil |
159 | [ 0x0C00, 0x0C7F, self::SCRIPT_TELUGU ], // Telugu |
160 | [ 0x0C80, 0x0CFF, self::SCRIPT_KANNADA ], // Kannada |
161 | [ 0x0D00, 0x0D7F, self::SCRIPT_MALAYALAM ], // Malayalam |
162 | [ 0x0D80, 0x0DFF, self::SCRIPT_SINHALA ], // Sinhala |
163 | [ 0x0E00, 0x0E7F, self::SCRIPT_THAI ], // Thai |
164 | [ 0x0E80, 0x0EFF, self::SCRIPT_LAO ], // Lao |
165 | [ 0x0F00, 0x0FFF, self::SCRIPT_TIBETAN ], // Tibetan |
166 | [ 0x1000, 0x109F, self::SCRIPT_MYANMAR ], // Myanmar |
167 | [ 0x10A0, 0x10FF, self::SCRIPT_GEORGIAN ], // Georgian |
168 | [ 0x1100, 0x11FF, self::SCRIPT_HANGUL ], // Hangul Jamo |
169 | [ 0x1200, 0x139F, self::SCRIPT_ETHIOPIC ], // Ethiopic, Ethiopic Supplement |
170 | [ 0x13A0, 0x13FF, self::SCRIPT_CHEROKEE ], // Cherokee |
171 | [ 0x1400, 0x167F, self::SCRIPT_CANADIAN_ABORIGINAL ], // Unified Canadian Aboriginal Syllabics |
172 | // [ 0x1680, 0x169F, self::SCRIPT_OGHAM ], // Ogham |
173 | // [ 0x16A0, 0x16FF, self::SCRIPT_RUNIC ], // Runic |
174 | [ 0x1700, 0x171F, self::SCRIPT_TAGALOG ], // Tagalog |
175 | [ 0x1720, 0x173F, self::SCRIPT_HANUNOO ], // Hanunoo |
176 | [ 0x1740, 0x175F, self::SCRIPT_BUHID ], // Buhid |
177 | [ 0x1760, 0x177F, self::SCRIPT_TAGBANWA ], // Tagbanwa |
178 | [ 0x1780, 0x17FF, self::SCRIPT_KHMER ], // Khmer |
179 | [ 0x1800, 0x18AF, self::SCRIPT_MONGOLIAN ], // Mongolian |
180 | [ 0x1900, 0x194F, self::SCRIPT_LIMBU ], // Limbu |
181 | [ 0x1950, 0x197F, self::SCRIPT_TAI_LE ], // Tai Le |
182 | [ 0x1980, 0x19DF, self::SCRIPT_NEW_TAI_LUE ], // New Tai Lue |
183 | [ 0x1A00, 0x1A1F, self::SCRIPT_BUGINESE ], // Buginese |
184 | [ 0x1C50, 0x1C7F, self::SCRIPT_OL_CHIKI ], // Ol Chiki |
185 | [ 0x1E00, 0x1EFF, self::SCRIPT_LATIN ], // Latin Extended Additional |
186 | [ 0x1F00, 0x1FFF, self::SCRIPT_GREEK ], // Greek Extended |
187 | // [ 0x2C00, 0x2C5F, self::SCRIPT_GLAGOLITIC ], // Glagolitic |
188 | [ 0x2C80, 0x2CFF, self::SCRIPT_COPTIC ], // Coptic |
189 | [ 0x2D00, 0x2D2F, self::SCRIPT_GEORGIAN ], // Georgian Supplement |
190 | [ 0x2D30, 0x2D7F, self::SCRIPT_TIFINAGH ], // Tifinagh |
191 | [ 0x2D80, 0x2DDF, self::SCRIPT_ETHIOPIC ], // Ethiopic Extended |
192 | [ 0x2E80, 0x2FDF, self::SCRIPT_DEPRECATED ], // CJK Radicals Supplement, Kangxi Radicals |
193 | [ 0x3040, 0x309F, self::SCRIPT_HIRAGANA ], // Hiragana |
194 | [ 0x30A0, 0x30FF, self::SCRIPT_KATAKANA ], // Katakana |
195 | [ 0x3100, 0x312F, self::SCRIPT_BOPOMOFO ], // Bopomofo |
196 | [ 0x3130, 0x318F, self::SCRIPT_HANGUL ], // Hangul Compatibility Jamo |
197 | [ 0x31A0, 0x31BF, self::SCRIPT_BOPOMOFO ], // Bopomofo Extended |
198 | [ 0x3400, 0x4DBF, self::SCRIPT_HAN ], // CJK Unified Ideographs Extension A |
199 | [ 0x4E00, 0x9FFF, self::SCRIPT_HAN ], // CJK Unified Ideographs |
200 | [ 0xA000, 0xA4CF, self::SCRIPT_YI ], // Yi Syllables, Yi Radicals |
201 | [ 0xA800, 0xA82F, self::SCRIPT_SYLOTI_NAGRI ], // Syloti Nagri |
202 | // [ 0xAAE0, 0xAAFF, self::SCRIPT_MEETEI_MAYEK_EXTENSIONS ] // Meetei Mayek Extensions |
203 | [ 0xABC0, 0xABFF, self::SCRIPT_MEETEI_MAYEK ], // Meetei Mayek |
204 | [ 0xAC00, 0xD7AF, self::SCRIPT_HANGUL ], // Hangul Syllables |
205 | [ 0xF900, 0xFAFF, self::SCRIPT_DEPRECATED ], // CJK Compatibility Ideographs |
206 | // [ 0x10000, 0x100FF, self::SCRIPT_LINEAR_B ], // Linear B Syllabary, Linear B Ideograms |
207 | // [ 0x10140, 0x1018F, self::SCRIPT_GREEK ], // Ancient Greek Numbers |
208 | // [ 0x10300, 0x1032F, self::SCRIPT_OLD_ITALIC ], // Old Italic |
209 | [ 0x10330, 0x1034F, self::SCRIPT_GOTHIC ], // Gothic |
210 | // [ 0x10380, 0x1039F, self::SCRIPT_UGARITIC ], // Ugaritic |
211 | // [ 0x103A0, 0x103DF, self::SCRIPT_OLD_PERSIAN ], // Old Persian |
212 | // [ 0x10400, 0x1044F, self::SCRIPT_DESERET ], // Deseret |
213 | // [ 0x10450, 0x1047F, self::SCRIPT_SHAVIAN ], // Shavian |
214 | // [ 0x10480, 0x104AF, self::SCRIPT_OSMANYA ], // Osmanya |
215 | // [ 0x10800, 0x1083F, self::SCRIPT_CYPRIOT ], // Cypriot Syllabary |
216 | [ 0x10A00, 0x10A5F, self::SCRIPT_KHAROSHTHI ], // Kharoshthi |
217 | [ 0x118A0, 0x118FF, self::SCRIPT_WARANG_CITI ], // Warang Citi |
218 | [ 0x20000, 0x2A6DF, self::SCRIPT_HAN ], // CJK Unified Ideographs Extension B |
219 | [ 0x2F800, 0x2FA1F, self::SCRIPT_DEPRECATED ] // CJK Compatibility Ideographs Supplement |
220 | ]; |
221 | |
222 | private const ALLOWED_SCRIPT_COMBINATIONS = [ |
223 | [ self::SCRIPT_COPTIC, self::SCRIPT_COPTIC_EXTRAS ], # Coptic, using new Coptic chars |
224 | [ self::SCRIPT_GREEK, self::SCRIPT_COPTIC_EXTRAS ], # Coptic, using old Greek chars |
225 | [ self::SCRIPT_HAN, self::SCRIPT_BOPOMOFO ], # Chinese |
226 | [ self::SCRIPT_HAN, self::SCRIPT_HANGUL ], # Korean |
227 | [ self::SCRIPT_HAN, self::SCRIPT_KATAKANA, self::SCRIPT_HIRAGANA ] # Japanese |
228 | ]; |
229 | |
230 | /** |
231 | * @var Equivset |
232 | */ |
233 | private static $equivset; |
234 | |
/**
 * Return the shared Equivset instance, creating it on first use.
 *
 * The instance is cached in self::$equivset, so later calls reuse it.
 *
 * @return Equivset
 */
public static function getEquivSet() {
	$cached = self::$equivset;
	if ( $cached ) {
		return $cached;
	}

	self::$equivset = new Equivset();
	return self::$equivset;
}
245 | |
/**
 * Look up the script tag for a single code point.
 *
 * Walks self::ALL_SCRIPT_RANGES in declaration order and returns the tag of
 * the first range whose inclusive bounds contain the code point.
 *
 * @param int $ch Unicode code point
 * @return string A SCRIPT_* tag; self::SCRIPT_UNASSIGNED when no range matches
 */
private static function getScriptCode( $ch ) {
	// Plain front-to-back scan; a binary search over the table would be faster.
	foreach ( self::ALL_SCRIPT_RANGES as [ $first, $last, $script ] ) {
		if ( $first <= $ch && $ch <= $last ) {
			return $script;
		}
	}

	// No range claimed this code point.
	return self::SCRIPT_UNASSIGNED;
}
260 | |
/**
 * Tell whether every value of the first list also occurs in the second.
 *
 * @param array $aList Candidate subset
 * @param array $bList Candidate superset
 * @return bool True when array_diff() finds nothing in $aList that $bList lacks
 */
private static function isSubsetOf( $aList, $bList ) {
	$missing = array_diff( $aList, $bList );
	return $missing === [];
}
269 | |
/**
 * Decide whether a set of scripts may legitimately appear together.
 *
 * The mixture is accepted when all of its scripts are contained in at least
 * one entry of self::ALLOWED_SCRIPT_COMBINATIONS.
 *
 * @param array $scriptList Script tags found in the name
 * @return bool
 */
private static function isAllowedScriptCombination( $scriptList ) {
	$allowed = false;
	foreach ( self::ALLOWED_SCRIPT_COMBINATIONS as $combination ) {
		if ( self::isSubsetOf( $scriptList, $combination ) ) {
			$allowed = true;
			break;
		}
	}
	return $allowed;
}
284 | |
/**
 * Split a UTF-8 string into its Unicode code points.
 *
 * @param string $str
 * @return int[] Code points in string order; an empty array when the regex
 *  match fails or matches nothing
 */
private static function stringToList( $str ) {
	$matches = [];
	// '/./us': one match per character, newlines included, in UTF-8 mode.
	$found = preg_match_all( '/./us', $str, $matches );
	if ( !$found ) {
		return [];
	}

	return array_map( [ Utils::class, 'utf8ToCodepoint' ], $matches[0] );
}
301 | |
/**
 * Build a UTF-8 string from a list of Unicode code points.
 *
 * @param array $list Code points, in output order
 * @return string Concatenation of each code point's UTF-8 encoding
 */
private static function listToString( $list ) {
	$encoded = array_map( [ Utils::class, 'codepointToUtf8' ], $list );
	return implode( '', $encoded );
}
313 | |
/**
 * Concatenate the pieces of a list with nothing between them.
 *
 * @param array $a_list Pieces to join
 * @return string
 */
private static function hardjoin( $a_list ) {
	$glue = '';
	$joined = implode( $glue, $a_list );
	return $joined;
}
321 | |
/**
 * Run a name through the shared Equivset normalizer.
 *
 * @param string $testName
 * @return string Result of Equivset::normalize() for the given name
 */
public static function normalizeString( $testName ) {
	$equivset = self::getEquivSet();
	return $equivset->normalize( $testName );
}
329 | |
/**
 * Drop every code point that belongs to the given script.
 *
 * @param int[] $text Code points to filter
 * @param string $script SCRIPT_* tag whose characters are removed
 * @return int[] Remaining code points, original order, re-indexed from 0
 */
private static function stripScript( array $text, $script ) {
	$kept = array_filter(
		$text,
		static function ( $codepoint ) use ( $script ) {
			return self::getScriptCode( $codepoint ) !== $script;
		}
	);

	// array_filter() preserves keys; callers expect a plain list.
	return array_values( $kept );
}
345 | |
/**
 * Helper for checkUnicodeStringStatus(): build the fatal Status reported for
 * an offending character.
 *
 * The character is described either by symbol and code point, or by code
 * point alone when the symbol is a single character of Unicode category C,
 * and that description is passed as the parameter of message $msgId.
 *
 * @todo I would like to show Unicode character name, but it is not clear how to get it.
 * @param string $msgId message identifier.
 * @param int $point codepoint of the bad character.
 * @return Status
 */
private static function badCharErr( $msgId, $point ) {
	$code = sprintf( 'U+%04X', $point );
	$isCombiningMark = self::getScriptCode( $point ) === self::SCRIPT_COMBINING_MARKS;

	$symbol = Utils::codepointToUtf8( $point );
	if ( $isCombiningMark ) {
		// A combining mark would attach to whatever precedes it in the
		// message, so give it a space to sit on.
		$symbol = ' ' . $symbol;
	}

	$char = preg_match( '/\A\p{C}\z/u', $symbol )
		? wfMessage( 'antispoof-bad-char-non-printable', $code )
		: wfMessage( 'antispoof-bad-char', $symbol, $code );

	return Status::newFatal( wfMessage( $msgId, $char ) );
}
368 | |
/**
 * Validate a name against the spoofing rules and compute its canonical form.
 *
 * Steps, in order: reject non-string and empty input; reject prohibited
 * characters (before and after NFKD decomposition); reject a leading
 * combining mark; strip combining marks; reject unassigned or deprecated
 * code points; reject names without letters or with a disallowed script
 * mixture; strip ASCII punctuation; map confusable characters through the
 * Equivset; collapse "VV" to "W" and "RN" to "M"; remove spaces.
 *
 * @todo Does too much in one routine, refactor...
 * @param string $testName
 * @return Status Fatal with an 'antispoof-*' message when the name is
 *  rejected; otherwise good, with the normalized name prefixed by "v2:" as
 *  its value
 * @throws ConfigException If $wgAntiSpoofProhibitedCharacters is not an array
 * @since 1.32
 */
public static function checkUnicodeStringStatus( $testName ) {
	global $wgAntiSpoofProhibitedCharacters;

	// Start with some sanity checking
	if ( !is_array( $wgAntiSpoofProhibitedCharacters ) ) {
		throw new ConfigException( '$wgAntiSpoofProhibitedCharacters should be an array!' );
	}
	if ( !is_string( $testName ) ) {
		return Status::newFatal( 'antispoof-badtype' );
	}

	if ( $testName === '' ) {
		return Status::newFatal( 'antispoof-empty' );
	}

	// NOTE(review): in_array() is left non-strict here and below so that the
	// set of configured values that match is unchanged.
	foreach ( self::stringToList( $testName ) as $char ) {
		if ( in_array( $char, $wgAntiSpoofProhibitedCharacters ) ) {
			return self::badCharErr( 'antispoof-prohibited', $char );
		}
	}

	// Perform Unicode _compatibility_ decomposition
	$testName = Validator::toNFKD( $testName );
	$testChars = self::stringToList( $testName );

	// Be paranoid: check again, just in case Unicode normalization code changes...
	foreach ( $testChars as $char ) {
		if ( in_array( $char, $wgAntiSpoofProhibitedCharacters ) ) {
			return self::badCharErr( 'antispoof-prohibited', $char );
		}
	}

	// stringToList() returns an empty list when its regex match fails or
	// matches nothing. Reading $testChars[0] below would then raise an
	// undefined-offset notice/warning before ending in the same
	// "no letters" rejection, so return that result directly.
	if ( !$testChars ) {
		return Status::newFatal( 'antispoof-noletters' );
	}

	// Check for this: should not happen in any valid Unicode string
	if ( self::getScriptCode( $testChars[0] ) === self::SCRIPT_COMBINING_MARKS ) {
		return self::badCharErr( 'antispoof-combining', $testChars[0] );
	}

	// Strip all combining characters in order to crudely strip accents
	// Note: NFKD normalization should have decomposed all accented chars earlier
	$testChars = self::stripScript( $testChars, self::SCRIPT_COMBINING_MARKS );

	// $testScripts is index-aligned with $testChars, so a hit from
	// array_search() identifies the offending character directly.
	$testScripts = array_map( [ __CLASS__, 'getScriptCode' ], $testChars );
	$unassigned = array_search( self::SCRIPT_UNASSIGNED, $testScripts, true );
	if ( $unassigned !== false ) {
		return self::badCharErr( 'antispoof-unassigned', $testChars[$unassigned] );
	}
	$deprecated = array_search( self::SCRIPT_DEPRECATED, $testScripts, true );
	if ( $deprecated !== false ) {
		return self::badCharErr( 'antispoof-deprecated', $testChars[$deprecated] );
	}
	$testScripts = array_unique( $testScripts );

	// We don't mind ASCII punctuation or digits
	$testScripts = array_diff( $testScripts,
		[ self::SCRIPT_ASCII_PUNCTUATION, self::SCRIPT_ASCII_DIGITS ] );

	if ( !$testScripts ) {
		return Status::newFatal( 'antispoof-noletters' );
	}

	if ( count( $testScripts ) > 1 && !self::isAllowedScriptCombination( $testScripts ) ) {
		return Status::newFatal( 'antispoof-mixedscripts' );
	}

	// At this point, we should probably check for BiDi violations if they aren't
	// caught above...

	// Squeeze out all punctuation chars
	// TODO: almost the same code occurs twice, refactor into own routine
	$testChars = self::stripScript( $testChars, self::SCRIPT_ASCII_PUNCTUATION );

	$testName = self::listToString( $testChars );

	// Replace characters in confusables set with equivalence chars
	$testName = self::normalizeString( $testName );

	// Do very simple sequence processing: "vv" -> "w", "rn" -> "m"...
	// Not exhaustive, but ups the ante...
	// Do this _after_ canonicalization: looks weird, but needed for consistency
	$testName = str_replace( 'VV', 'W', $testName );
	$testName = str_replace( 'RN', 'M', $testName );

	// Remove all remaining spaces, just in case any have snuck through...
	$testName = self::hardjoin( explode( " ", $testName ) );

	// Reduce repeated char sequences to single character
	// BUG: TODO: implement this

	if ( strlen( $testName ) < 1 ) {
		return Status::newFatal( 'antispoof-tooshort' );
	}

	// Don't ASCIIfy: we assume we are UTF-8 capable on output

	// Prepend version string, for futureproofing if this algorithm changes
	$testName = "v2:" . $testName;

	// And return the canonical version of the name
	return Status::newGood( $testName );
}
475 | } |