Code Coverage for /workspace/src/extensions/AntiSpoof/includes/AntiSpoof.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	73.75% covered (warning)	73.75%	59 / 80	54.55% covered (warning)	54.55%	6 / 11	CRAP	0.00% covered (danger)	0.00%	0 / 1
AntiSpoof	73.75% covered (warning)	73.75%	59 / 80	54.55% covered (warning)	54.55%	6 / 11	64.12	0.00% covered (danger)	0.00%	0 / 1
getEquivSet	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	2
getScriptCode	75.00% covered (warning)	75.00%	3 / 4	0.00% covered (danger)	0.00%	0 / 1	4.25
isSubsetOf	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
isAllowedScriptCombination	75.00% covered (warning)	75.00%	3 / 4	0.00% covered (danger)	0.00%	0 / 1	3.14
stringToList	85.71% covered (warning)	85.71%	6 / 7	0.00% covered (danger)	0.00%	0 / 1	3.03
listToString	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	2
hardjoin	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
normalizeString	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
stripScript	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	3
badCharErr	0.00% covered (danger)	0.00%	0 / 8	0.00% covered (danger)	0.00%	0 / 1	12
checkUnicodeStringStatus	75.61% covered (warning)	75.61%	31 / 41	0.00% covered (danger)	0.00%	0 / 1	18.26

1	<?php
2	/**
3	* AntiSpoof.php
4	* Username spoofing prevention for MediaWiki
5	* Version 0.04
6	*
7	* Copyright (C) Neil Harris 2006
8	* Python->PHP conversion by Brion Vibber <brion@pobox.com>
9	*
10	* 2006-06-30 Handles non-CJK scripts as per UTR #39 + my extensions
11	* 2006-07-01 Now handles Simplified <-> Traditional Chinese rules, as
12	* per JET Guidelines for Internationalized Domain Names,
13	* and the ICANN language registry values for .cn
14	* 2006-09-14 Now handles 'rn' etc better, and uses stdin for input
15	* 2006-09-18 Added exception handling for nasty cases, eg BiDi violations
16	* 2006-09-19 Converted to PHP for easier integration into a MW extension
17	*
18	* This program is free software; you can redistribute it and/or modify
19	* it under the terms of the GNU General Public License as published by
20	* the Free Software Foundation; either version 2 of the License, or
21	* (at your option) any later version.
22	*
23	* This program is distributed in the hope that it will be useful, but
24	* WITHOUT ANY WARRANTY; without even the implied warranty of
25	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26	* General Public License for more details.
27	*
28	* You should have received a copy of the GNU General Public License
29	* along with this program; if not, write to the Free Software
30	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
31	* USA
32	*/
33
34	namespace MediaWiki\Extension\AntiSpoof;
35
36	use MediaWiki\Config\ConfigException;
37	use MediaWiki\Status\Status;
38	use UtfNormal\Utils;
39	use UtfNormal\Validator;
40	use Wikimedia\Equivset\Equivset;
41
/**
 * Username spoofing prevention: reduces a name to a canonical "skeleton" so
 * that visually confusable names compare equal, and rejects names that use
 * prohibited, unassigned or deprecated characters or mix unrelated scripts.
 */
class AntiSpoof {

	private const SCRIPT_DEPRECATED = 'DEPRECATED';
	private const SCRIPT_UNASSIGNED = 'UNASSIGNED';

	private const SCRIPT_ARABIC = 'ARABIC';
	private const SCRIPT_ARMENIAN = 'ARMENIAN';
	private const SCRIPT_ASCII_DIGITS = 'ASCII_DIGITS';
	private const SCRIPT_ASCII_PUNCTUATION = 'ASCII_PUNCTUATION';
	private const SCRIPT_BENGALI = 'BENGALI';
	private const SCRIPT_BOPOMOFO = 'BOPOMOFO';
	private const SCRIPT_BUGINESE = 'BUGINESE';
	private const SCRIPT_BUHID = 'BUHID';
	private const SCRIPT_CANADIAN_ABORIGINAL = 'CANADIAN_ABORIGINAL';
	private const SCRIPT_CHEROKEE = 'CHEROKEE';
	private const SCRIPT_COMBINING_MARKS = 'COMBINING_MARKS';
	private const SCRIPT_COPTIC = 'COPTIC';
	private const SCRIPT_COPTIC_EXTRAS = 'COPTIC_EXTRAS';
	private const SCRIPT_CYPRIOT = 'CYPRIOT';
	private const SCRIPT_CYRILLIC = 'CYRILLIC';
	private const SCRIPT_DESERET = 'DESERET';
	private const SCRIPT_DEVANAGARI = 'DEVANAGARI';
	private const SCRIPT_ETHIOPIC = 'ETHIOPIC';
	private const SCRIPT_GEORGIAN = 'GEORGIAN';
	private const SCRIPT_GLAGOLITIC = 'GLAGOLITIC';
	private const SCRIPT_GOTHIC = 'GOTHIC';
	private const SCRIPT_GREEK = 'GREEK';
	private const SCRIPT_GUJARATI = 'GUJARATI';
	private const SCRIPT_GURMUKHI = 'GURMUKHI';
	private const SCRIPT_HAN = 'HAN';
	private const SCRIPT_HANGUL = 'HANGUL';
	private const SCRIPT_HANUNOO = 'HANUNOO';
	private const SCRIPT_HEBREW = 'HEBREW';
	private const SCRIPT_HIRAGANA = 'HIRAGANA';
	private const SCRIPT_KANNADA = 'KANNADA';
	private const SCRIPT_KATAKANA = 'KATAKANA';
	private const SCRIPT_KHAROSHTHI = 'KHAROSHTHI';
	private const SCRIPT_KHMER = 'KHMER';
	private const SCRIPT_LAO = 'LAO';
	private const SCRIPT_LATIN = 'LATIN';
	private const SCRIPT_LIMBU = 'LIMBU';
	private const SCRIPT_LINEAR_B = 'LINEAR_B';
	private const SCRIPT_MALAYALAM = 'MALAYALAM';
	private const SCRIPT_MEETEI_MAYEK = 'MEETEI_MAYEK';
	private const SCRIPT_MEETEI_MAYEK_EXTENSIONS = 'MEETEI_MAYEK_EXTENSIONS';
	private const SCRIPT_MONGOLIAN = 'MONGOLIAN';
	private const SCRIPT_MYANMAR = 'MYANMAR';
	private const SCRIPT_NEW_TAI_LUE = 'NEW_TAI_LUE';
	private const SCRIPT_NKO = 'NKO';
	private const SCRIPT_OGHAM = 'OGHAM';
	private const SCRIPT_OL_CHIKI = 'OL_CHIKI';
	private const SCRIPT_OLD_ITALIC = 'OLD_ITALIC';
	private const SCRIPT_OLD_PERSIAN = 'OLD_PERSIAN';
	private const SCRIPT_ORIYA = 'ORIYA';
	private const SCRIPT_OSMANYA = 'OSMANYA';
	private const SCRIPT_RUNIC = 'RUNIC';
	private const SCRIPT_SHAVIAN = 'SHAVIAN';
	private const SCRIPT_SINHALA = 'SINHALA';
	private const SCRIPT_SYLOTI_NAGRI = 'SYLOTI_NAGRI';
	private const SCRIPT_SYRIAC = 'SYRIAC';
	private const SCRIPT_TAGALOG = 'TAGALOG';
	private const SCRIPT_TAGBANWA = 'TAGBANWA';
	private const SCRIPT_TAI_LE = 'TAI_LE';
	private const SCRIPT_TAMIL = 'TAMIL';
	private const SCRIPT_TELUGU = 'TELUGU';
	private const SCRIPT_THAANA = 'THAANA';
	private const SCRIPT_THAI = 'THAI';
	private const SCRIPT_TIBETAN = 'TIBETAN';
	private const SCRIPT_TIFINAGH = 'TIFINAGH';
	private const SCRIPT_UGARITIC = 'UGARITIC';
	private const SCRIPT_WARANG_CITI = 'WARANG_CITI';
	private const SCRIPT_YI = 'YI';

	/**
	 * Define script tag codes for various Unicode codepoint ranges
	 * If it does not have a code here, it does not have a script assignment
	 *
	 * Each entry is [ first codepoint, last codepoint (inclusive), script tag ].
	 * Entries are listed in ascending codepoint order and do not overlap.
	 *
	 * NB: Braille is not in this list since it is a transliteration system, not a script;
	 * this does not disadvantage blind people, who will use Braille input/output methods
	 * and not raw Braille...
	 * NB: Middle dot is included in SCRIPT_LATIN for use in Catalan
	 * NB: All scripts described by the Unicode Consortium as "Other Scripts" or "Ancient Scripts"
	 * are commented out: these are either not in modern use, or only used for specialized
	 * religious purposes, or only of literary interest
	 */
	private const ALL_SCRIPT_RANGES = [
		[ 0x0020, 0x002F,
			self::SCRIPT_ASCII_PUNCTUATION ], // ASCII Punctuation 1, Hyphen, ASCII Punctuation 2
		[ 0x0030, 0x0039, self::SCRIPT_ASCII_DIGITS ], // ASCII Digits
		[ 0x003A, 0x0040, self::SCRIPT_ASCII_PUNCTUATION ], // Colon, ASCII Punctuation 3
		[ 0x0041, 0x005A, self::SCRIPT_LATIN ], // ASCII Uppercase
		[ 0x005B, 0x0060,
			self::SCRIPT_ASCII_PUNCTUATION ], // ASCII Punctuation 4, Underscore, ASCII Punctuation 5
		[ 0x0061, 0x007A, self::SCRIPT_LATIN ], // ASCII Lowercase
		[ 0x007B, 0x007E, self::SCRIPT_ASCII_PUNCTUATION ], // ASCII Punctuation 6
		[ 0x00B7, 0x00B7, self::SCRIPT_LATIN ], // Middle Dot
		[ 0x00C0, 0x00D6, self::SCRIPT_LATIN ], // Latin-1 Letters 1
		[ 0x00D8, 0x00F6, self::SCRIPT_LATIN ], // Latin-1 Letters 2
		[ 0x00F8, 0x02AF,
			self::SCRIPT_LATIN ], // Latin-1 Letters 3, Latin Extended-A, Latin Extended-B, IPA Extensions
		[ 0x0300, 0x036F, self::SCRIPT_COMBINING_MARKS ], // Combining Diacritical Marks
		[ 0x0370, 0x03E1, self::SCRIPT_GREEK ], // Greek and Coptic (Greek)
		[ 0x03E2, 0x03EF, self::SCRIPT_COPTIC_EXTRAS ], // Greek and Coptic (Coptic-unique)
		[ 0x03F0, 0x03FF, self::SCRIPT_GREEK ], // Greek and Coptic (Greek)
		[ 0x0400, 0x052F, self::SCRIPT_CYRILLIC ], // Cyrillic, Cyrillic Supplement
		[ 0x0530, 0x058F, self::SCRIPT_ARMENIAN ], // Armenian
		[ 0x0590, 0x05FF, self::SCRIPT_HEBREW ], // Hebrew
		[ 0x0600, 0x06FF, self::SCRIPT_ARABIC ], // Arabic
		[ 0x0700, 0x074F, self::SCRIPT_SYRIAC ], // Syriac
		[ 0x0750, 0x077F, self::SCRIPT_ARABIC ], // Arabic Supplement
		[ 0x0780, 0x07BF, self::SCRIPT_THAANA ], // Thaana
		[ 0x07C0, 0x07FF, self::SCRIPT_NKO ], // NKo (N'Ko)
		[ 0x0900, 0x097F, self::SCRIPT_DEVANAGARI ], // Devanagari
		[ 0x0980, 0x09FF, self::SCRIPT_BENGALI ], // Bengali
		[ 0x0A00, 0x0A7F, self::SCRIPT_GURMUKHI ], // Gurmukhi
		[ 0x0A80, 0x0AFF, self::SCRIPT_GUJARATI ], // Gujarati
		[ 0x0B00, 0x0B7F, self::SCRIPT_ORIYA ], // Oriya
		[ 0x0B80, 0x0BFF, self::SCRIPT_TAMIL ], // Tamil
		[ 0x0C00, 0x0C7F, self::SCRIPT_TELUGU ], // Telugu
		[ 0x0C80, 0x0CFF, self::SCRIPT_KANNADA ], // Kannada
		[ 0x0D00, 0x0D7F, self::SCRIPT_MALAYALAM ], // Malayalam
		[ 0x0D80, 0x0DFF, self::SCRIPT_SINHALA ], // Sinhala
		[ 0x0E00, 0x0E7F, self::SCRIPT_THAI ], // Thai
		[ 0x0E80, 0x0EFF, self::SCRIPT_LAO ], // Lao
		[ 0x0F00, 0x0FFF, self::SCRIPT_TIBETAN ], // Tibetan
		[ 0x1000, 0x109F, self::SCRIPT_MYANMAR ], // Myanmar
		[ 0x10A0, 0x10FF, self::SCRIPT_GEORGIAN ], // Georgian
		[ 0x1100, 0x11FF, self::SCRIPT_HANGUL ], // Hangul Jamo
		[ 0x1200, 0x139F, self::SCRIPT_ETHIOPIC ], // Ethiopic, Ethiopic Supplement
		[ 0x13A0, 0x13FF, self::SCRIPT_CHEROKEE ], // Cherokee
		[ 0x1400, 0x167F, self::SCRIPT_CANADIAN_ABORIGINAL ], // Unified Canadian Aboriginal Syllabics
		// [ 0x1680, 0x169F, self::SCRIPT_OGHAM ], // Ogham
		// [ 0x16A0, 0x16FF, self::SCRIPT_RUNIC ], // Runic
		[ 0x1700, 0x171F, self::SCRIPT_TAGALOG ], // Tagalog
		[ 0x1720, 0x173F, self::SCRIPT_HANUNOO ], // Hanunoo
		[ 0x1740, 0x175F, self::SCRIPT_BUHID ], // Buhid
		[ 0x1760, 0x177F, self::SCRIPT_TAGBANWA ], // Tagbanwa
		[ 0x1780, 0x17FF, self::SCRIPT_KHMER ], // Khmer
		[ 0x1800, 0x18AF, self::SCRIPT_MONGOLIAN ], // Mongolian
		[ 0x1900, 0x194F, self::SCRIPT_LIMBU ], // Limbu
		[ 0x1950, 0x197F, self::SCRIPT_TAI_LE ], // Tai Le
		[ 0x1980, 0x19DF, self::SCRIPT_NEW_TAI_LUE ], // New Tai Lue
		[ 0x1A00, 0x1A1F, self::SCRIPT_BUGINESE ], // Buginese
		[ 0x1C50, 0x1C7F, self::SCRIPT_OL_CHIKI ], // Ol Chiki
		[ 0x1C90, 0x1CBF, self::SCRIPT_GEORGIAN ], // Georgian Extended
		[ 0x1E00, 0x1EFF, self::SCRIPT_LATIN ], // Latin Extended Additional
		[ 0x1F00, 0x1FFF, self::SCRIPT_GREEK ], // Greek Extended
		// [ 0x2C00, 0x2C5F, self::SCRIPT_GLAGOLITIC ], // Glagolitic
		[ 0x2C80, 0x2CFF, self::SCRIPT_COPTIC ], // Coptic
		[ 0x2D00, 0x2D2F, self::SCRIPT_GEORGIAN ], // Georgian Supplement
		[ 0x2D30, 0x2D7F, self::SCRIPT_TIFINAGH ], // Tifinagh
		[ 0x2D80, 0x2DDF, self::SCRIPT_ETHIOPIC ], // Ethiopic Extended
		[ 0x2E80, 0x2FDF, self::SCRIPT_DEPRECATED ], // CJK Radicals Supplement, Kangxi Radicals
		[ 0x3040, 0x309F, self::SCRIPT_HIRAGANA ], // Hiragana
		[ 0x30A0, 0x30FF, self::SCRIPT_KATAKANA ], // Katakana
		[ 0x3100, 0x312F, self::SCRIPT_BOPOMOFO ], // Bopomofo
		[ 0x3130, 0x318F, self::SCRIPT_HANGUL ], // Hangul Compatibility Jamo
		[ 0x31A0, 0x31BF, self::SCRIPT_BOPOMOFO ], // Bopomofo Extended
		[ 0x31F0, 0x31FF, self::SCRIPT_KATAKANA ], // Katakana Phonetic Extensions
		[ 0x3400, 0x4DBF, self::SCRIPT_HAN ], // CJK Unified Ideographs Extension A
		[ 0x4E00, 0x9FFF, self::SCRIPT_HAN ], // CJK Unified Ideographs
		[ 0xA000, 0xA4CF, self::SCRIPT_YI ], // Yi Syllables, Yi Radicals
		[ 0xA800, 0xA82F, self::SCRIPT_SYLOTI_NAGRI ], // Syloti Nagri
		// [ 0xAAE0, 0xAAFF, self::SCRIPT_MEETEI_MAYEK_EXTENSIONS ] // Meetei Mayek Extensions
		[ 0xAB70, 0xABBF, self::SCRIPT_CHEROKEE ], // Cherokee Supplement
		[ 0xABC0, 0xABFF, self::SCRIPT_MEETEI_MAYEK ], // Meetei Mayek
		[ 0xAC00, 0xD7AF, self::SCRIPT_HANGUL ], // Hangul Syllables
		[ 0xF900, 0xFAFF, self::SCRIPT_DEPRECATED ], // CJK Compatibility Ideographs
		// [ 0x10000, 0x100FF, self::SCRIPT_LINEAR_B ], // Linear B Syllabary, Linear B Ideograms
		// [ 0x10140, 0x1018F, self::SCRIPT_GREEK ], // Ancient Greek Numbers
		// [ 0x10300, 0x1032F, self::SCRIPT_OLD_ITALIC ], // Old Italic
		[ 0x10330, 0x1034F, self::SCRIPT_GOTHIC ], // Gothic
		// [ 0x10380, 0x1039F, self::SCRIPT_UGARITIC ], // Ugaritic
		// [ 0x103A0, 0x103DF, self::SCRIPT_OLD_PERSIAN ], // Old Persian
		// [ 0x10400, 0x1044F, self::SCRIPT_DESERET ], // Deseret
		// [ 0x10450, 0x1047F, self::SCRIPT_SHAVIAN ], // Shavian
		// [ 0x10480, 0x104AF, self::SCRIPT_OSMANYA ], // Osmanya
		// [ 0x10800, 0x1083F, self::SCRIPT_CYPRIOT ], // Cypriot Syllabary
		[ 0x10A00, 0x10A5F, self::SCRIPT_KHAROSHTHI ], // Kharoshthi
		[ 0x118A0, 0x118FF, self::SCRIPT_WARANG_CITI ], // Warang Citi
		[ 0x20000, 0x2A6DF, self::SCRIPT_HAN ], // CJK Unified Ideographs Extension B
		[ 0x2F800, 0x2FA1F, self::SCRIPT_DEPRECATED ] // CJK Compatibility Ideographs Supplement
	];

	/**
	 * Sets of script tags that may legitimately appear together in one name.
	 * A name is accepted when every script it uses belongs to a single set.
	 */
	private const ALLOWED_SCRIPT_COMBINATIONS = [
		[ self::SCRIPT_COPTIC, self::SCRIPT_COPTIC_EXTRAS ], # Coptic, using old Greek chars
		[ self::SCRIPT_GREEK, self::SCRIPT_COPTIC_EXTRAS ], # Coptic, using new Coptic chars
		[ self::SCRIPT_HAN, self::SCRIPT_BOPOMOFO ], # Chinese
		[ self::SCRIPT_HAN, self::SCRIPT_HANGUL ], # Korean
		[ self::SCRIPT_HAN, self::SCRIPT_KATAKANA, self::SCRIPT_HIRAGANA ] # Japanese
	];

	/**
	 * Lazily created instance shared by all callers; see getEquivSet().
	 *
	 * @var Equivset|null
	 */
	private static $equivset;

	/**
	 * Return the shared Equivset instance, creating it on first use.
	 *
	 * @return Equivset
	 */
	public static function getEquivSet() {
		self::$equivset ??= new Equivset();

		return self::$equivset;
	}

	/**
	 * Look up the script tag for a codepoint in ALL_SCRIPT_RANGES.
	 *
	 * @param int $ch Unicode codepoint
	 * @return string One of the SCRIPT_* tags; SCRIPT_UNASSIGNED when no range matches
	 */
	private static function getScriptCode( $ch ): string {
		# Linear search: binary chop would be faster...
		foreach ( self::ALL_SCRIPT_RANGES as [ $first, $last, $script ] ) {
			if ( $ch >= $first && $ch <= $last ) {
				return $script;
			}
		}
		# Otherwise...
		return self::SCRIPT_UNASSIGNED;
	}

	/**
	 * Tell whether every value of $aList also occurs in $bList.
	 *
	 * @param array $aList
	 * @param array $bList
	 * @return bool
	 */
	private static function isSubsetOf( $aList, $bList ): bool {
		return array_diff( $aList, $bList ) === [];
	}

	/**
	 * Is this an allowed script mixture?
	 *
	 * @param array $scriptList Script tags used by the name
	 * @return bool True when all tags fit inside one entry of ALLOWED_SCRIPT_COMBINATIONS
	 */
	private static function isAllowedScriptCombination( $scriptList ): bool {
		foreach ( self::ALLOWED_SCRIPT_COMBINATIONS as $allowedCombo ) {
			if ( self::isSubsetOf( $scriptList, $allowedCombo ) ) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Convert string into array of Unicode code points as integers
	 *
	 * @param string $str
	 * @return int[] Empty when the pattern matches nothing or preg_match_all() fails
	 */
	private static function stringToList( $str ): array {
		$matches = [];
		// preg_match_all() yields 0 for "no match" and false on error; both mean
		// there is nothing to convert.
		if ( !preg_match_all( '/./us', $str, $matches ) ) {
			return [];
		}
		$out = [];
		foreach ( $matches[0] as $char ) {
			$out[] = Utils::utf8ToCodepoint( $char );
		}
		return $out;
	}

	/**
	 * Convert an array of Unicode code points back into a UTF-8 string.
	 *
	 * @param array $list
	 * @return string
	 */
	private static function listToString( $list ): string {
		$out = '';
		foreach ( $list as $cp ) {
			$out .= Utils::codepointToUtf8( $cp );
		}
		return $out;
	}

	/**
	 * Concatenate the pieces of $a_list with no separator.
	 *
	 * @param array $a_list
	 * @return string
	 */
	private static function hardjoin( $a_list ): string {
		return implode( '', $a_list );
	}

	/**
	 * Run a string through the shared Equivset's normalize().
	 *
	 * @param string $testName
	 * @return string
	 */
	public static function normalizeString( $testName ) {
		return self::getEquivSet()->normalize( $testName );
	}

	/**
	 * Drop every code point that belongs to the given script.
	 *
	 * @param int[] $text
	 * @param string $script One of the SCRIPT_* tags
	 * @return int[] Remaining code points, in their original order, re-indexed from 0
	 */
	private static function stripScript( array $text, $script ): array {
		$out = [];
		foreach ( $text as $char ) {
			if ( self::getScriptCode( $char ) !== $script ) {
				$out[] = $char;
			}
		}
		return $out;
	}

	/**
	 * Find the first code point of $chars that is listed in $prohibited.
	 *
	 * @param int[] $chars
	 * @param array $prohibited Value of $wgAntiSpoofProhibitedCharacters
	 * @return int|null The offending code point, or null when none is prohibited
	 */
	private static function findProhibitedCharacter( array $chars, array $prohibited ) {
		foreach ( $chars as $char ) {
			// Loose comparison is kept deliberately: the list comes from site
			// configuration and is not guaranteed to hold ints only.
			if ( in_array( $char, $prohibited ) ) {
				return $char;
			}
		}
		return null;
	}

	/**
	 * Helper function for checkUnicodeStringStatus: Return an error on a bad character.
	 * @todo I would like to show Unicode character name, but it is not clear how to get it.
	 * @param string $msgId message identifier.
	 * @param int $point codepoint of the bad character.
	 * @return Status
	 */
	private static function badCharErr( $msgId, $point ): Status {
		$symbol = Utils::codepointToUtf8( $point );
		// Combining marks are combined with the previous character. If abusing character is a
		// combining mark, prepend it with space to show them correctly.
		if ( self::getScriptCode( $point ) === self::SCRIPT_COMBINING_MARKS ) {
			$symbol = ' ' . $symbol;
		}
		$code = sprintf( 'U+%04X', $point );
		// \p{C} covers the Unicode "Other" categories; such a character gets the
		// message variant that shows only its code.
		if ( preg_match( '/\A\p{C}\z/u', $symbol ) ) {
			$char = wfMessage( 'antispoof-bad-char-non-printable', $code );
		} else {
			$char = wfMessage( 'antispoof-bad-char', $symbol, $code );
		}
		return Status::newFatal( wfMessage( $msgId, $char ) );
	}

	/**
	 * Validate a name and reduce it to its canonical anti-spoofing form.
	 *
	 * Steps: reject prohibited characters, apply NFKD, strip combining marks,
	 * reject unassigned/deprecated characters and disallowed script mixtures,
	 * strip ASCII punctuation, map confusables through the Equivset, collapse
	 * "VV" -> "W" and "RN" -> "M", then prefix the result with "v2:".
	 *
	 * TODO: does too much in one routine, refactor...
	 * @param string $testName
	 * @return Status Good status whose value is the canonical name, or a fatal
	 *  status carrying one of the antispoof-* messages
	 * @throws ConfigException When $wgAntiSpoofProhibitedCharacters is not an array
	 * @since 1.32
	 */
	public static function checkUnicodeStringStatus( $testName ) {
		global $wgAntiSpoofProhibitedCharacters;

		// Start with some sanity checking
		if ( !is_array( $wgAntiSpoofProhibitedCharacters ) ) {
			throw new ConfigException( '$wgAntiSpoofProhibitedCharacters should be an array!' );
		}
		if ( !is_string( $testName ) ) {
			return Status::newFatal( 'antispoof-badtype' );
		}

		if ( $testName === '' ) {
			return Status::newFatal( 'antispoof-empty' );
		}

		$prohibited = self::findProhibitedCharacter(
			self::stringToList( $testName ),
			$wgAntiSpoofProhibitedCharacters
		);
		if ( $prohibited !== null ) {
			return self::badCharErr( 'antispoof-prohibited', $prohibited );
		}

		// Perform Unicode _compatibility_ decomposition
		$normalized = Validator::toNFKD( $testName );
		// Treat a non-string result as "no characters" instead of handing it to
		// the regex engine.
		$testChars = is_string( $normalized ) ? self::stringToList( $normalized ) : [];

		// Nothing could be decoded (e.g. malformed UTF-8): there are no letters to
		// work with. Bail out before $testChars[0] is read below; this is the same
		// status the remaining checks would produce for an empty list.
		if ( !$testChars ) {
			return Status::newFatal( 'antispoof-noletters' );
		}

		// Be paranoid: check again, just in case Unicode normalization code changes...
		$prohibited = self::findProhibitedCharacter( $testChars, $wgAntiSpoofProhibitedCharacters );
		if ( $prohibited !== null ) {
			return self::badCharErr( 'antispoof-prohibited', $prohibited );
		}

		// Check for this: should not happen in any valid Unicode string
		if ( self::getScriptCode( $testChars[0] ) === self::SCRIPT_COMBINING_MARKS ) {
			return self::badCharErr( 'antispoof-combining', $testChars[0] );
		}

		// Strip all combining characters in order to crudely strip accents
		// Note: NFKD normalization should have decomposed all accented chars earlier
		$testChars = self::stripScript( $testChars, self::SCRIPT_COMBINING_MARKS );

		// $testScripts shares its keys with $testChars, so a key found by
		// array_search() identifies the offending code point.
		$testScripts = array_map( [ self::class, 'getScriptCode' ], $testChars );
		$unassigned = array_search( self::SCRIPT_UNASSIGNED, $testScripts, true );
		if ( $unassigned !== false ) {
			return self::badCharErr( 'antispoof-unassigned', $testChars[$unassigned] );
		}
		$deprecated = array_search( self::SCRIPT_DEPRECATED, $testScripts, true );
		if ( $deprecated !== false ) {
			return self::badCharErr( 'antispoof-deprecated', $testChars[$deprecated] );
		}
		$testScripts = array_unique( $testScripts );

		// We don't mind ASCII punctuation or digits
		$testScripts = array_diff( $testScripts,
			[ self::SCRIPT_ASCII_PUNCTUATION, self::SCRIPT_ASCII_DIGITS ] );

		if ( !$testScripts ) {
			return Status::newFatal( 'antispoof-noletters' );
		}

		if ( count( $testScripts ) > 1 && !self::isAllowedScriptCombination( $testScripts ) ) {
			return Status::newFatal( 'antispoof-mixedscripts' );
		}

		// At this point, we should probably check for BiDi violations if they aren't
		// caught above...

		// Squeeze out all punctuation chars
		$testChars = self::stripScript( $testChars, self::SCRIPT_ASCII_PUNCTUATION );

		$testName = self::listToString( $testChars );

		// Replace characters in confusables set with equivalence chars
		$testName = self::normalizeString( $testName );

		// Do very simple sequence processing: "vv" -> "w", "rn" -> "m"...
		// Not exhaustive, but ups the ante...
		// Do this _after_ canonicalization: looks weird, but needed for consistency
		// (array form of str_replace() applies the pairs one after another, in order)
		$testName = str_replace( [ 'VV', 'RN' ], [ 'W', 'M' ], $testName );

		// Remove all remaining spaces, just in case any have snuck through...
		$testName = self::hardjoin( explode( " ", $testName ) );

		// Reduce repeated char sequences to single character
		// BUG: TODO: implement this

		if ( $testName === '' ) {
			return Status::newFatal( 'antispoof-tooshort' );
		}

		// Don't ASCIIfy: we assume we are UTF-8 capable on output

		// Prepend version string, for futureproofing if this algorithm changes
		$testName = "v2:" . $testName;

		// And return the canonical version of the name
		return Status::newGood( $testName );
	}
}