Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
74.36% |
58 / 78 |
|
54.55% |
6 / 11 |
CRAP | |
0.00% |
0 / 1 |
| AntiSpoof | |
74.36% |
58 / 78 |
|
54.55% |
6 / 11 |
60.08 | |
0.00% |
0 / 1 |
| getEquivSet | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
| getScriptCode | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
4.25 | |||
| isSubsetOf | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| isAllowedScriptCombination | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
3.14 | |||
| stringToList | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
3.03 | |||
| listToString | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
| hardjoin | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| normalizeString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| stripScript | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
| badCharErr | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
| checkUnicodeStringStatus | |
76.92% |
30 / 39 |
|
0.00% |
0 / 1 |
16.41 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * AntiSpoof.php |
| 4 | * Username spoofing prevention for MediaWiki |
| 5 | * Version 0.04 |
| 6 | * |
| 7 | * Copyright (C) Neil Harris 2006 |
| 8 | * Python->PHP conversion by Brion Vibber <brion@pobox.com> |
| 9 | * |
| 10 | * 2006-06-30 Handles non-CJK scripts as per UTR #39 + my extensions |
| 11 | * 2006-07-01 Now handles Simplified <-> Traditional Chinese rules, as |
| 12 | * per JET Guidelines for Internationalized Domain Names, |
| 13 | * and the ICANN language registry values for .cn |
| 14 | * 2006-09-14 Now handles 'rn' etc better, and uses stdin for input |
| 15 | * 2006-09-18 Added exception handling for nasty cases, eg BiDi violations |
| 16 | * 2006-09-19 Converted to PHP for easier integration into a MW extension |
| 17 | * |
| 18 | * This program is free software; you can redistribute it and/or modify |
| 19 | * it under the terms of the GNU General Public License as published by |
| 20 | * the Free Software Foundation; either version 2 of the License, or |
| 21 | * (at your option) any later version. |
| 22 | * |
| 23 | * This program is distributed in the hope that it will be useful, but |
| 24 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
| 25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 26 | * General Public License for more details. |
| 27 | * |
| 28 | * You should have received a copy of the GNU General Public License |
| 29 | * along with this program; if not, write to the Free Software |
| 30 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 |
| 31 | * USA |
| 32 | */ |
| 33 | |
| 34 | namespace MediaWiki\Extension\AntiSpoof; |
| 35 | |
| 36 | use MediaWiki\Config\ConfigException; |
| 37 | use MediaWiki\Status\Status; |
| 38 | use UtfNormal\Utils; |
| 39 | use UtfNormal\Validator; |
| 40 | use Wikimedia\Equivset\Equivset; |
| 41 | |
/**
 * Username canonicalisation used to detect visually confusable ("spoofed") names.
 *
 * The public entry point is checkUnicodeStringStatus(), which either rejects a
 * name (prohibited, unassigned or deprecated characters, illegal script mixtures,
 * no letters at all) or returns a canonical form of it prefixed with a version tag.
 * All members are static; the only state is a lazily created Equivset instance.
 */
class AntiSpoof {

	// Pseudo-scripts used to flag code points that are never acceptable.
	private const SCRIPT_DEPRECATED = 'DEPRECATED';
	private const SCRIPT_UNASSIGNED = 'UNASSIGNED';

	// Script tags. Some are only referenced by commented-out ranges below.
	private const SCRIPT_ARABIC = 'ARABIC';
	private const SCRIPT_ARMENIAN = 'ARMENIAN';
	private const SCRIPT_ASCII_DIGITS = 'ASCII_DIGITS';
	private const SCRIPT_ASCII_PUNCTUATION = 'ASCII_PUNCTUATION';
	private const SCRIPT_BENGALI = 'BENGALI';
	private const SCRIPT_BOPOMOFO = 'BOPOMOFO';
	private const SCRIPT_BUGINESE = 'BUGINESE';
	private const SCRIPT_BUHID = 'BUHID';
	private const SCRIPT_CANADIAN_ABORIGINAL = 'CANADIAN_ABORIGINAL';
	private const SCRIPT_CHEROKEE = 'CHEROKEE';
	private const SCRIPT_COMBINING_MARKS = 'COMBINING_MARKS';
	private const SCRIPT_COPTIC = 'COPTIC';
	private const SCRIPT_COPTIC_EXTRAS = 'COPTIC_EXTRAS';
	private const SCRIPT_CYPRIOT = 'CYPRIOT';
	private const SCRIPT_CYRILLIC = 'CYRILLIC';
	private const SCRIPT_DESERET = 'DESERET';
	private const SCRIPT_DEVANAGARI = 'DEVANAGARI';
	private const SCRIPT_ETHIOPIC = 'ETHIOPIC';
	private const SCRIPT_GEORGIAN = 'GEORGIAN';
	private const SCRIPT_GLAGOLITIC = 'GLAGOLITIC';
	private const SCRIPT_GOTHIC = 'GOTHIC';
	private const SCRIPT_GREEK = 'GREEK';
	private const SCRIPT_GUJARATI = 'GUJARATI';
	private const SCRIPT_GURMUKHI = 'GURMUKHI';
	private const SCRIPT_HAN = 'HAN';
	private const SCRIPT_HANGUL = 'HANGUL';
	private const SCRIPT_HANUNOO = 'HANUNOO';
	private const SCRIPT_HEBREW = 'HEBREW';
	private const SCRIPT_HIRAGANA = 'HIRAGANA';
	private const SCRIPT_KANNADA = 'KANNADA';
	private const SCRIPT_KATAKANA = 'KATAKANA';
	private const SCRIPT_KHAROSHTHI = 'KHAROSHTHI';
	private const SCRIPT_KHMER = 'KHMER';
	private const SCRIPT_LAO = 'LAO';
	private const SCRIPT_LATIN = 'LATIN';
	private const SCRIPT_LIMBU = 'LIMBU';
	private const SCRIPT_LINEAR_B = 'LINEAR_B';
	private const SCRIPT_MALAYALAM = 'MALAYALAM';
	private const SCRIPT_MEETEI_MAYEK = 'MEETEI_MAYEK';
	private const SCRIPT_MEETEI_MAYEK_EXTENSIONS = 'MEETEI_MAYEK_EXTENSIONS';
	private const SCRIPT_MONGOLIAN = 'MONGOLIAN';
	private const SCRIPT_MYANMAR = 'MYANMAR';
	private const SCRIPT_NEW_TAI_LUE = 'NEW_TAI_LUE';
	private const SCRIPT_NKO = 'NKO';
	private const SCRIPT_OGHAM = 'OGHAM';
	private const SCRIPT_OL_CHIKI = 'OL_CHIKI';
	private const SCRIPT_OLD_ITALIC = 'OLD_ITALIC';
	private const SCRIPT_OLD_PERSIAN = 'OLD_PERSIAN';
	private const SCRIPT_ORIYA = 'ORIYA';
	private const SCRIPT_OSMANYA = 'OSMANYA';
	private const SCRIPT_RUNIC = 'RUNIC';
	private const SCRIPT_SHAVIAN = 'SHAVIAN';
	private const SCRIPT_SINHALA = 'SINHALA';
	private const SCRIPT_SYLOTI_NAGRI = 'SYLOTI_NAGRI';
	private const SCRIPT_SYRIAC = 'SYRIAC';
	private const SCRIPT_TAGALOG = 'TAGALOG';
	private const SCRIPT_TAGBANWA = 'TAGBANWA';
	private const SCRIPT_TAI_LE = 'TAI_LE';
	private const SCRIPT_TAMIL = 'TAMIL';
	private const SCRIPT_TELUGU = 'TELUGU';
	private const SCRIPT_THAANA = 'THAANA';
	private const SCRIPT_THAI = 'THAI';
	private const SCRIPT_TIBETAN = 'TIBETAN';
	private const SCRIPT_TIFINAGH = 'TIFINAGH';
	private const SCRIPT_UGARITIC = 'UGARITIC';
	private const SCRIPT_WARANG_CITI = 'WARANG_CITI';
	private const SCRIPT_YI = 'YI';

	// phpcs:disable MediaWiki.WhiteSpace.SpaceBeforeSingleLineComment.NewLineComment

	/**
	 * Define script tag codes for various Unicode codepoint ranges
	 * If it does not have a code here, it does not have a script assignment
	 * NB: Braille is not in this list since it is a transliteration system, not a script;
	 *  this does not disadvantage blind people, who will use Braille input/output methods
	 *  and not raw Braille...
	 * NB: Middle dot is included in SCRIPT_LATIN for use in Catalan
	 * NB: All scripts described by the Unicode Consortium as "Other Scripts" or "Ancient Scripts"
	 *  are commented out: these are either not in modern use, or only used for specialized
	 *  religious purposes, or only of literary interest
	 *
	 * Each entry is [ first code point, last code point (inclusive), script tag ].
	 * Entries are listed in ascending code point order and do not overlap.
	 */
	private const ALL_SCRIPT_RANGES = [
		[ 0x0020, 0x002F,
			self::SCRIPT_ASCII_PUNCTUATION ], // ASCII Punctuation 1, Hyphen, ASCII Punctuation 2
		[ 0x0030, 0x0039, self::SCRIPT_ASCII_DIGITS ], // ASCII Digits
		[ 0x003A, 0x0040, self::SCRIPT_ASCII_PUNCTUATION ], // Colon, ASCII Punctuation 3
		[ 0x0041, 0x005A, self::SCRIPT_LATIN ], // ASCII Uppercase
		[ 0x005B, 0x0060,
			self::SCRIPT_ASCII_PUNCTUATION ], // ASCII Punctuation 4, Underscore, ASCII Punctuation 5
		[ 0x0061, 0x007A, self::SCRIPT_LATIN ], // ASCII Lowercase
		[ 0x007B, 0x007E, self::SCRIPT_ASCII_PUNCTUATION ], // ASCII Punctuation 5
		[ 0x00B7, 0x00B7, self::SCRIPT_LATIN ], // Middle Dot
		[ 0x00C0, 0x00D6, self::SCRIPT_LATIN ], // Latin-1 Letters 1
		[ 0x00D8, 0x00F6, self::SCRIPT_LATIN ], // Latin-1 Letters 2
		[ 0x00F8, 0x02AF,
			self::SCRIPT_LATIN ], // Latin-1 Letters 3, Latin Extended-A, Latin Extended-B, IPA Extensions
		[ 0x0300, 0x036F, self::SCRIPT_COMBINING_MARKS ], // Combining Diacritical Marks
		[ 0x0370, 0x03E1, self::SCRIPT_GREEK ], // Greek and Coptic (Greek)
		[ 0x03E2, 0x03EF, self::SCRIPT_COPTIC_EXTRAS ], // Greek and Coptic (Coptic-unique)
		[ 0x03F0, 0x03FF, self::SCRIPT_GREEK ], // Greek and Coptic (Greek)
		[ 0x0400, 0x052F, self::SCRIPT_CYRILLIC ], // Cyrillic, Cyrillic Supplement
		[ 0x0530, 0x058F, self::SCRIPT_ARMENIAN ], // Armenian
		[ 0x0590, 0x05FF, self::SCRIPT_HEBREW ], // Hebrew
		[ 0x0600, 0x06FF, self::SCRIPT_ARABIC ], // Arabic
		[ 0x0700, 0x074F, self::SCRIPT_SYRIAC ], // Syriac
		[ 0x0750, 0x077F, self::SCRIPT_ARABIC ], // Arabic Supplement
		[ 0x0780, 0x07BF, self::SCRIPT_THAANA ], // Thaana
		[ 0x07C0, 0x07FF, self::SCRIPT_NKO ], // NKo (N'Ko)
		[ 0x0900, 0x097F, self::SCRIPT_DEVANAGARI ], // Devanagari
		[ 0x0980, 0x09FF, self::SCRIPT_BENGALI ], // Bengali
		[ 0x0A00, 0x0A7F, self::SCRIPT_GURMUKHI ], // Gurmukhi
		[ 0x0A80, 0x0AFF, self::SCRIPT_GUJARATI ], // Gujarati
		[ 0x0B00, 0x0B7F, self::SCRIPT_ORIYA ], // Oriya
		[ 0x0B80, 0x0BFF, self::SCRIPT_TAMIL ], // Tamil
		[ 0x0C00, 0x0C7F, self::SCRIPT_TELUGU ], // Telugu
		[ 0x0C80, 0x0CFF, self::SCRIPT_KANNADA ], // Kannada
		[ 0x0D00, 0x0D7F, self::SCRIPT_MALAYALAM ], // Malayalam
		[ 0x0D80, 0x0DFF, self::SCRIPT_SINHALA ], // Sinhala
		[ 0x0E00, 0x0E7F, self::SCRIPT_THAI ], // Thai
		[ 0x0E80, 0x0EFF, self::SCRIPT_LAO ], // Lao
		[ 0x0F00, 0x0FFF, self::SCRIPT_TIBETAN ], // Tibetan
		[ 0x1000, 0x109F, self::SCRIPT_MYANMAR ], // Myanmar
		[ 0x10A0, 0x10FF, self::SCRIPT_GEORGIAN ], // Georgian
		[ 0x1100, 0x11FF, self::SCRIPT_HANGUL ], // Hangul Jamo
		[ 0x1200, 0x139F, self::SCRIPT_ETHIOPIC ], // Ethiopic, Ethiopic Supplement
		[ 0x13A0, 0x13FF, self::SCRIPT_CHEROKEE ], // Cherokee
		[ 0x1400, 0x167F, self::SCRIPT_CANADIAN_ABORIGINAL ], // Unified Canadian Aboriginal Syllabics
		// [ 0x1680, 0x169F, self::SCRIPT_OGHAM ], // Ogham
		// [ 0x16A0, 0x16FF, self::SCRIPT_RUNIC ], // Runic
		[ 0x1700, 0x171F, self::SCRIPT_TAGALOG ], // Tagalog
		[ 0x1720, 0x173F, self::SCRIPT_HANUNOO ], // Hanunoo
		[ 0x1740, 0x175F, self::SCRIPT_BUHID ], // Buhid
		[ 0x1760, 0x177F, self::SCRIPT_TAGBANWA ], // Tagbanwa
		[ 0x1780, 0x17FF, self::SCRIPT_KHMER ], // Khmer
		[ 0x1800, 0x18AF, self::SCRIPT_MONGOLIAN ], // Mongolian
		[ 0x1900, 0x194F, self::SCRIPT_LIMBU ], // Limbu
		[ 0x1950, 0x197F, self::SCRIPT_TAI_LE ], // Tai Le
		[ 0x1980, 0x19DF, self::SCRIPT_NEW_TAI_LUE ], // New Tai Lue
		[ 0x1A00, 0x1A1F, self::SCRIPT_BUGINESE ], // Buginese
		[ 0x1C50, 0x1C7F, self::SCRIPT_OL_CHIKI ], // Ol Chiki
		[ 0x1C90, 0x1CBF, self::SCRIPT_GEORGIAN ], // Georgian Extended
		[ 0x1E00, 0x1EFF, self::SCRIPT_LATIN ], // Latin Extended Additional
		[ 0x1F00, 0x1FFF, self::SCRIPT_GREEK ], // Greek Extended
		// [ 0x2C00, 0x2C5F, self::SCRIPT_GLAGOLITIC ], // Glagolitic
		[ 0x2C80, 0x2CFF, self::SCRIPT_COPTIC ], // Coptic
		[ 0x2D00, 0x2D2F, self::SCRIPT_GEORGIAN ], // Georgian Supplement
		[ 0x2D30, 0x2D7F, self::SCRIPT_TIFINAGH ], // Tifinagh
		[ 0x2D80, 0x2DDF, self::SCRIPT_ETHIOPIC ], // Ethiopic Extended
		[ 0x2E80, 0x2FDF, self::SCRIPT_DEPRECATED ], // CJK Radicals Supplement, Kangxi Radicals
		[ 0x3040, 0x309F, self::SCRIPT_HIRAGANA ], // Hiragana
		[ 0x30A0, 0x30FF, self::SCRIPT_KATAKANA ], // Katakana
		[ 0x3100, 0x312F, self::SCRIPT_BOPOMOFO ], // Bopomofo
		[ 0x3130, 0x318F, self::SCRIPT_HANGUL ], // Hangul Compatibility Jamo
		[ 0x31A0, 0x31BF, self::SCRIPT_BOPOMOFO ], // Bopomofo Extended
		[ 0x31F0, 0x31FF, self::SCRIPT_KATAKANA ], // Katakana Phonetic Extensions
		[ 0x3400, 0x4DBF, self::SCRIPT_HAN ], // CJK Unified Ideographs Extension A
		[ 0x4E00, 0x9FFF, self::SCRIPT_HAN ], // CJK Unified Ideographs
		[ 0xA000, 0xA4CF, self::SCRIPT_YI ], // Yi Syllables, Yi Radicals
		[ 0xA800, 0xA82F, self::SCRIPT_SYLOTI_NAGRI ], // Syloti Nagri
		// [ 0xAAE0, 0xAAFF, self::SCRIPT_MEETEI_MAYEK_EXTENSIONS ] // Meetei Mayek Extensions
		[ 0xAB70, 0xABBF, self::SCRIPT_CHEROKEE ], // Cherokee Supplement
		[ 0xABC0, 0xABFF, self::SCRIPT_MEETEI_MAYEK ], // Meetei Mayek
		[ 0xAC00, 0xD7AF, self::SCRIPT_HANGUL ], // Hangul Syllables
		[ 0xF900, 0xFAFF, self::SCRIPT_DEPRECATED ], // CJK Compatibility Ideographs
		// [ 0x10000, 0x100FF, self::SCRIPT_LINEAR_B ], // Linear B Syllabary, Linear B Ideograms
		// [ 0x10140, 0x1018F, self::SCRIPT_GREEK ], // Ancient Greek Numbers
		// [ 0x10300, 0x1032F, self::SCRIPT_OLD_ITALIC ], // Old Italic
		[ 0x10330, 0x1034F, self::SCRIPT_GOTHIC ], // Gothic
		// [ 0x10380, 0x1039F, self::SCRIPT_UGARITIC ], // Ugaritic
		// [ 0x103A0, 0x103DF, self::SCRIPT_OLD_PERSIAN ], // Old Persian
		// [ 0x10400, 0x1044F, self::SCRIPT_DESERET ], // Deseret
		// [ 0x10450, 0x1047F, self::SCRIPT_SHAVIAN ], // Shavian
		// [ 0x10480, 0x104AF, self::SCRIPT_OSMANYA ], // Osmanya
		// [ 0x10800, 0x1083F, self::SCRIPT_CYPRIOT ], // Cypriot Syllabary
		[ 0x10A00, 0x10A5F, self::SCRIPT_KHAROSHTHI ], // Kharoshthi
		[ 0x118A0, 0x118FF, self::SCRIPT_WARANG_CITI ], // Warang Citi
		[ 0x20000, 0x2A6DF, self::SCRIPT_HAN ], // CJK Unified Ideographs Extension B
		[ 0x2F800, 0x2FA1F, self::SCRIPT_DEPRECATED ] // CJK Compatibility Ideographs Supplement
	];

	/**
	 * Script mixtures that are accepted even though they span more than one script tag.
	 * A name passes when the set of scripts it uses is a subset of one of these rows.
	 */
	private const ALLOWED_SCRIPT_COMBINATIONS = [
		[ self::SCRIPT_COPTIC, self::SCRIPT_COPTIC_EXTRAS ], # Coptic, using the dedicated Coptic block
		[ self::SCRIPT_GREEK, self::SCRIPT_COPTIC_EXTRAS ], # Coptic, using letters shared with the Greek block
		[ self::SCRIPT_HAN, self::SCRIPT_BOPOMOFO ], # Chinese
		[ self::SCRIPT_HAN, self::SCRIPT_HANGUL ], # Korean
		[ self::SCRIPT_HAN, self::SCRIPT_KATAKANA, self::SCRIPT_HIRAGANA ] # Japanese
	];

	// phpcs:enable MediaWiki.WhiteSpace.SpaceBeforeSingleLineComment.NewLineComment

	/** Lazily created by getEquivSet(); null until first use. */
	private static ?Equivset $equivset = null;

	/**
	 * Return the shared Equivset instance, creating it on first call.
	 */
	public static function getEquivSet(): Equivset {
		self::$equivset ??= new Equivset();

		return self::$equivset;
	}

	/**
	 * Look up the script tag for a code point in ALL_SCRIPT_RANGES.
	 *
	 * @param int $ch Unicode code point
	 * @return string One of the SCRIPT_* values; SCRIPT_UNASSIGNED when no range contains $ch
	 */
	private static function getScriptCode( int $ch ): string {
		# Linear search: binary chop would be faster...
		foreach ( self::ALL_SCRIPT_RANGES as [ $first, $last, $script ] ) {
			if ( $ch >= $first && $ch <= $last ) {
				return $script;
			}
		}

		return self::SCRIPT_UNASSIGNED;
	}

	/**
	 * Whether every value of $aList also occurs in $bList.
	 */
	private static function isSubsetOf( array $aList, array $bList ): bool {
		return count( array_diff( $aList, $bList ) ) === 0;
	}

	/**
	 * Is this an allowed script mixture?
	 *
	 * @param string[] $scriptList Script tags used by a name
	 * @return bool True when $scriptList is a subset of a row of ALLOWED_SCRIPT_COMBINATIONS
	 */
	private static function isAllowedScriptCombination( array $scriptList ): bool {
		foreach ( self::ALLOWED_SCRIPT_COMBINATIONS as $allowedCombo ) {
			if ( self::isSubsetOf( $scriptList, $allowedCombo ) ) {
				return true;
			}
		}

		return false;
	}

	/**
	 * Convert string into an array of Unicode code points as integers
	 *
	 * An empty array is returned when the pattern matches nothing or matching fails,
	 * which is what preg_match_all() reports for subjects that are not valid UTF-8.
	 *
	 * @return int[]
	 */
	private static function stringToList( string $str ): array {
		$matches = [];
		if ( !preg_match_all( '/./us', $str, $matches ) ) {
			return [];
		}

		$out = [];
		foreach ( $matches[0] as $char ) {
			$out[] = Utils::utf8ToCodepoint( $char );
		}

		return $out;
	}

	/**
	 * Inverse of stringToList(): encode a list of code points as one string.
	 *
	 * @param int[] $list
	 */
	private static function listToString( array $list ): string {
		$out = '';
		foreach ( $list as $cp ) {
			$out .= Utils::codepointToUtf8( $cp );
		}

		return $out;
	}

	/**
	 * Concatenate the elements of $a_list without any separator.
	 */
	private static function hardjoin( array $a_list ): string {
		return implode( '', $a_list );
	}

	/**
	 * Run $testName through Equivset::normalize() of the shared Equivset instance.
	 */
	public static function normalizeString( string $testName ): string {
		return self::getEquivSet()->normalize( $testName );
	}

	/**
	 * Remove every code point that belongs to the given script.
	 *
	 * @param int[] $text
	 * @param string $script
	 * @return int[] Remaining code points, re-indexed from 0
	 */
	private static function stripScript( array $text, string $script ): array {
		$out = [];
		foreach ( $text as $char ) {
			if ( self::getScriptCode( $char ) !== $script ) {
				$out[] = $char;
			}
		}

		return $out;
	}

	/**
	 * Return the first code point of $chars that is listed in $prohibited, or null.
	 *
	 * The comparison is deliberately loose (as it always has been) so that
	 * numeric-string entries in the configuration keep matching.
	 *
	 * @param int[] $chars
	 * @param array $prohibited
	 * @return int|null
	 */
	private static function findProhibitedChar( array $chars, array $prohibited ): ?int {
		foreach ( $chars as $char ) {
			if ( in_array( $char, $prohibited ) ) {
				return $char;
			}
		}

		return null;
	}

	/**
	 * Helper function for checkUnicodeStringStatus: Return an error on a bad character.
	 * @todo I would like to show Unicode character name, but it is not clear how to get it.
	 * @param string $msgId message identifier.
	 * @param int $point codepoint of the bad character.
	 * @return Status
	 */
	private static function badCharErr( string $msgId, int $point ): Status {
		$symbol = Utils::codepointToUtf8( $point );
		// Combining marks are combined with the previous character. If abusing character is a
		// combining mark, prepend it with space to show them correctly.
		if ( self::getScriptCode( $point ) === self::SCRIPT_COMBINING_MARKS ) {
			$symbol = ' ' . $symbol;
		}
		$code = sprintf( 'U+%04X', $point );
		// A lone character of general category "Other" is described by its code only.
		if ( preg_match( '/\A\p{C}\z/u', $symbol ) ) {
			$char = wfMessage( 'antispoof-bad-char-non-printable', $code );
		} else {
			$char = wfMessage( 'antispoof-bad-char', $symbol, $code );
		}

		return Status::newFatal( wfMessage( $msgId, $char ) );
	}

	/**
	 * Validate a name and compute its canonical, spoof-resistant form.
	 *
	 * TODO: does too much in one routine, refactor...
	 *
	 * @param string $testName Name to check
	 * @return Status<string> Good status carrying the canonical name (prefixed "v2:"),
	 *  or a fatal status describing why the name was rejected
	 * @throws ConfigException When $wgAntiSpoofProhibitedCharacters is not an array
	 * @since 1.32
	 */
	public static function checkUnicodeStringStatus( string $testName ): Status {
		global $wgAntiSpoofProhibitedCharacters;

		// Start with some sanity checking
		if ( !is_array( $wgAntiSpoofProhibitedCharacters ) ) {
			throw new ConfigException( '$wgAntiSpoofProhibitedCharacters should be an array!' );
		}

		if ( $testName === '' ) {
			return Status::newFatal( 'antispoof-empty' );
		}

		$prohibited = self::findProhibitedChar(
			self::stringToList( $testName ),
			$wgAntiSpoofProhibitedCharacters
		);
		if ( $prohibited !== null ) {
			return self::badCharErr( 'antispoof-prohibited', $prohibited );
		}

		// Perform Unicode _compatibility_ decomposition
		$testName = Validator::toNFKD( $testName );
		$testChars = self::stringToList( $testName );

		// stringToList() yields an empty list when nothing could be decoded (for
		// instance when the input is not valid UTF-8). Bail out here instead of
		// reading $testChars[0] below, which would not exist.
		if ( !$testChars ) {
			return Status::newFatal( 'antispoof-empty' );
		}

		// Be paranoid: check again, just in case Unicode normalization code changes...
		$prohibited = self::findProhibitedChar( $testChars, $wgAntiSpoofProhibitedCharacters );
		if ( $prohibited !== null ) {
			return self::badCharErr( 'antispoof-prohibited', $prohibited );
		}

		// Check for this: should not happen in any valid Unicode string
		if ( self::getScriptCode( $testChars[0] ) === self::SCRIPT_COMBINING_MARKS ) {
			return self::badCharErr( 'antispoof-combining', $testChars[0] );
		}

		// Strip all combining characters in order to crudely strip accents
		// Note: NFKD normalization should have decomposed all accented chars earlier
		$testChars = self::stripScript( $testChars, self::SCRIPT_COMBINING_MARKS );

		// $testScripts shares its keys with $testChars, so a key found in one
		// identifies the offending code point in the other.
		$testScripts = array_map( [ __CLASS__, 'getScriptCode' ], $testChars );
		$unassigned = array_search( self::SCRIPT_UNASSIGNED, $testScripts, true );
		if ( $unassigned !== false ) {
			return self::badCharErr( 'antispoof-unassigned', $testChars[$unassigned] );
		}
		$deprecated = array_search( self::SCRIPT_DEPRECATED, $testScripts, true );
		if ( $deprecated !== false ) {
			return self::badCharErr( 'antispoof-deprecated', $testChars[$deprecated] );
		}
		$testScripts = array_unique( $testScripts );

		// We don't mind ASCII punctuation or digits
		$testScripts = array_diff( $testScripts,
			[ self::SCRIPT_ASCII_PUNCTUATION, self::SCRIPT_ASCII_DIGITS ] );

		if ( !$testScripts ) {
			return Status::newFatal( 'antispoof-noletters' );
		}

		if ( count( $testScripts ) > 1 && !self::isAllowedScriptCombination( $testScripts ) ) {
			return Status::newFatal( 'antispoof-mixedscripts' );
		}

		// At this point, we should probably check for BiDi violations if they aren't
		// caught above...

		// Squeeze out all punctuation chars
		$testChars = self::stripScript( $testChars, self::SCRIPT_ASCII_PUNCTUATION );

		$testName = self::listToString( $testChars );

		// Replace characters in confusables set with equivalence chars
		$testName = self::normalizeString( $testName );

		// Do very simple sequence processing: "vv" -> "w", "rn" -> "m"...
		// Not exhaustive, but ups the ante...
		// Do this _after_ canonicalization: looks weird, but needed for consistency
		// (the pairs are applied in order, one after the other)
		$testName = str_replace( [ 'VV', 'RN' ], [ 'W', 'M' ], $testName );

		// Remove all remaining spaces, just in case any have snuck through...
		$testName = self::hardjoin( explode( " ", $testName ) );

		// Reduce repeated char sequences to single character
		// BUG: TODO: implement this

		if ( $testName === '' ) {
			return Status::newFatal( 'antispoof-tooshort' );
		}

		// Don't ASCIIfy: we assume we are UTF-8 capable on output

		// Prepend version string, for futureproofing if this algorithm changes
		$testName = "v2:" . $testName;

		// And return the canonical version of the name
		return Status::newGood( $testName );
	}
}