Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
80.00% |
276 / 345 |
|
23.53% |
4 / 17 |
CRAP | |
0.00% |
0 / 1 |
Validator | |
80.23% |
276 / 344 |
|
23.53% |
4 / 17 |
248.63 | |
0.00% |
0 / 1 |
cleanUp | |
72.73% |
8 / 11 |
|
0.00% |
0 / 1 |
6.73 | |||
toNFC | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
toNFD | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
toNFKC | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
toNFKD | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
loadData | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
quickIsNFC | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
90 | |||
quickIsNFCVerify | |
99.05% |
104 / 105 |
|
0.00% |
0 / 1 |
40 | |||
NFC | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
NFD | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
NFKC | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
NFKD | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
fastDecompose | |
92.50% |
37 / 40 |
|
0.00% |
0 / 1 |
11.05 | |||
fastCombiningSort | |
94.12% |
32 / 34 |
|
0.00% |
0 / 1 |
10.02 | |||
fastCompose | |
93.18% |
82 / 88 |
|
0.00% |
0 / 1 |
28.25 | |||
placebo | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
replaceForNativeNormalize | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | /** |
3 | * Unicode normalization routines |
4 | * |
5 | * Copyright © 2004 Brion Vibber <brion@pobox.com> |
6 | * https://www.mediawiki.org/ |
7 | * |
8 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or |
11 | * (at your option) any later version. |
12 | * |
13 | * This program is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | * GNU General Public License for more details. |
17 | * |
18 | * You should have received a copy of the GNU General Public License along |
19 | * with this program; if not, write to the Free Software Foundation, Inc., |
20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
21 | * http://www.gnu.org/copyleft/gpl.html |
22 | * |
23 | * @file |
24 | * @ingroup UtfNormal\UtfNormal |
25 | */ |
26 | namespace UtfNormal; |
27 | |
28 | use Normalizer; |
29 | |
30 | /** |
31 | * @defgroup UtfNormal UtfNormal |
32 | */ |
33 | |
# Global flag: true when the intl extension's normalizer_normalize() exists,
# in which case the native normalizer is preferred over the PHP fallback.
\define( 'NORMALIZE_INTL', \function_exists( 'normalizer_normalize' ) );
35 | |
36 | /** |
37 | * Unicode normalization routines for working with UTF-8 strings. |
38 | * Currently, it assumes that input strings are valid UTF-8! |
39 | * |
40 | * Not as fast as I'd like, but should be usable for most purposes. |
41 | * UtfNormal\Validator::toNFC() will bail early if given ASCII text or text |
42 | * it can quickly determine is already normalized. |
43 | * |
44 | * All methods are static; no instance is needed to call them. |
45 | * |
46 | * See description of forms at http://www.unicode.org/reports/tr15/ |
47 | * |
48 | * @ingroup UtfNormal |
49 | */ |
50 | class Validator { |
51 | |
52 | /** |
53 | * @var array |
54 | */ |
55 | public static $utfCombiningClass; |
56 | |
57 | /** |
58 | * @var array |
59 | */ |
60 | public static $utfCanonicalComp; |
61 | |
62 | /** |
63 | * @var array |
64 | */ |
65 | public static $utfCanonicalDecomp; |
66 | |
67 | /** |
68 | * Load compatibility decompositions on demand if they are needed. |
69 | * |
70 | * @var array |
71 | */ |
72 | public static $utfCompatibilityDecomp; |
73 | |
74 | /** |
75 | * @var array|null |
76 | */ |
77 | public static $utfCheckNFC; |
78 | |
/**
 * One-stop sanitizer: repairs invalid UTF-8 sequences and returns the
 * text in normal form C (canonical composition).
 *
 * Uses the native normalizer when NORMALIZE_INTL is true and only falls
 * back to quickIsNFCVerify() when the native call reports failure.
 * Without the native normalizer, quickIsNFCVerify() and NFC() do the work.
 *
 * @param string $string a UTF-8 string, possibly containing invalid sequences
 * @return string the cleaned-up string in normal form C
 */
public static function cleanUp( $string ) {
	if ( !NORMALIZE_INTL ) {
		# Pure-PHP path. quickIsNFCVerify() takes $string by reference and
		# repairs UTF-8 errors in it before the value is returned or composed.
		return self::quickIsNFCVerify( $string ) ? $string : self::NFC( $string );
	}

	$string = self::replaceForNativeNormalize( $string );
	$normalized = normalizer_normalize( $string, Normalizer::FORM_C );

	// T303790 - The null comparison can be dropped once only PHP >= 8.1 is supported.
	if ( $normalized !== null && $normalized !== false ) {
		return $normalized;
	}

	# normalizer_normalize signals an invalid UTF-8 string with either
	# false or null, depending on the PHP version:
	#
	# < PHP 8.1
	# https://github.com/php/php-src/blob/PHP-8.0.17/ext/intl/normalizer/normalizer_normalize.c
	# >= PHP 8.1
	# https://www.php.net/manual/en/normalizer.normalize.php says return false
	# https://github.com/php/php-src/blob/PHP-8.1.0/ext/intl/normalizer/normalizer_normalize.c
	# Changed in https://github.com/php/php-src/commit/5dc995df375571489d9149fdccf258c0bd123317
	#
	# quickIsNFCVerify() strips the invalid sequences out of $string.
	if ( self::quickIsNFCVerify( $string ) ) {
		# The repaired string is already in normal form.
		return $string;
	}

	# Valid now, but not normalized yet: hand it back to the native normalizer.
	return normalizer_normalize( $string, Normalizer::FORM_C );
}
123 | |
/**
 * Return the given UTF-8 string in normal form C (canonical composition).
 *
 * With the native normalizer available the call is delegated to it.
 * Otherwise strings that quickIsNFC() accepts are returned untouched and
 * everything else goes through NFC().
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form C
 */
public static function toNFC( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_C );
	}

	return self::quickIsNFC( $string ) ? $string : self::NFC( $string );
}
141 | |
/**
 * Return the given UTF-8 string in normal form D (canonical decomposition).
 *
 * Without the native normalizer, a string containing no byte in the
 * 0x80-0xff range is returned as is.
 *
 * @param string $string A valid UTF-8 string. Input is not validated.
 * @return string A UTF-8 string in normal form D
 */
public static function toNFD( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_D );
	}

	if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
		# Pure ASCII: nothing to decompose.
		return $string;
	}

	return self::NFD( $string );
}
158 | |
/**
 * Return the given UTF-8 string in normal form KC (compatibility composition).
 * Compatibility forms may lose information irreversibly; use with care.
 *
 * Without the native normalizer, a string containing no byte in the
 * 0x80-0xff range is returned as is.
 *
 * @param string $string A valid UTF-8 string. Input is not validated.
 * @return string A UTF-8 string in normal form KC
 */
public static function toNFKC( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_KC );
	}

	if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
		# Pure ASCII: already in every normal form.
		return $string;
	}

	return self::NFKC( $string );
}
176 | |
/**
 * Return the given UTF-8 string in normal form KD (compatibility decomposition).
 * Compatibility forms may lose information irreversibly; use with care.
 *
 * Without the native normalizer, a string containing no byte in the
 * 0x80-0xff range is returned as is.
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form KD
 */
public static function toNFKD( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_KD );
	}

	if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
		# Pure ASCII: already in every normal form.
		return $string;
	}

	return self::NFKD( $string );
}
194 | |
/**
 * Pull in UtfNormalData.inc unless the combining-class table is already set.
 *
 * NOTE(review): the include file presumably fills the static lookup
 * tables of this class -- confirm against UtfNormalData.inc.
 */
public static function loadData() {
	if ( isset( self::$utfCombiningClass ) ) {
		return;
	}

	require_once __DIR__ . '/UtfNormalData.inc';
}
203 | |
/**
 * Cheap NFC pre-check.
 *
 * Returns true only when the string is certain to be in NFC already;
 * false means "not NFC" or "cannot tell without the full algorithm".
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return bool
 */
public static function quickIsNFC( $string ) {
	# Pure ASCII is always in NFC, and needs no lookup tables.
	if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
		return true;
	}

	self::loadData();

	$total = strlen( $string );
	$pos = 0;
	while ( $pos < $total ) {
		$char = $string[$pos];
		$lead = ord( $char );
		if ( $lead < 0x80 ) {
			# ASCII byte, nothing to look up.
			$pos++;
			continue;
		}

		# The lead byte determines how many bytes make up the character.
		if ( $lead >= 0xf0 ) {
			$width = 4;
		} elseif ( $lead >= 0xe0 ) {
			$width = 3;
		} elseif ( $lead >= 0xc0 ) {
			$width = 2;
		} else {
			$width = 1;
		}
		if ( $width > 1 ) {
			$char = substr( $string, $pos, $width );
		}
		$pos += $width;

		# Listed in the NFC check table (NO or MAYBE), or a combining
		# character that may need reordering: leave it to the slow path.
		if ( isset( self::$utfCheckNFC[$char] ) || isset( self::$utfCombiningClass[$char] ) ) {
			return false;
		}
	}

	return true;
}
247 | |
/**
 * Validating variant of the quick NFC check.
 *
 * Returns true only when the string is certain to be in NFC already;
 * false means "not NFC" or "cannot tell". As a side effect the argument is
 * rewritten: some control characters and every invalid UTF-8 sequence
 * are replaced or dropped.
 *
 * @param string &$string A UTF-8 string, altered on output to be valid UTF-8 safe for XML.
 * @return bool
 */
public static function quickIsNFCVerify( &$string ) {
	# Swap out control characters that e.g. XML would reject.
	$string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', Constants::UTF8_REPLACEMENT, $string );

	# Pure ASCII is always valid NFC. Leaving here also avoids building
	# the lookup tables when only plain ASCII is ever seen.
	if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
		return true;
	}

	static $deepCheck = null, $tailCount = null, $suspect = null;
	if ( !isset( $deepCheck ) ) {
		# First call with non-ASCII input: build the lookup tables once.
		self::loadData();

		# Sequences that force the slow path: NFC check entries plus
		# every combining character.
		$suspect = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );

		# Lead bytes whose sequences need the extra validity checks below.
		$deepCheck = array_flip( array_map( 'chr',
			[ 0xc0, 0xc1, 0xe0, 0xed, 0xef,
				0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
				0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ] ) );

		# Number of tail bytes expected after each possible byte value.
		$tailCount = [];
		for ( $value = 0; $value < 256; $value++ ) {
			if ( $value < 0xc0 || $value >= 0xfe ) {
				$expected = 0;
			} elseif ( $value < 0xe0 ) {
				$expected = 1;
			} elseif ( $value < 0xf0 ) {
				$expected = 2;
			} elseif ( $value < 0xf8 ) {
				$expected = 3;
			} elseif ( $value < 0xfc ) {
				$expected = 4;
			} else {
				$expected = 5;
			}
			$tailCount[chr( $value )] = $expected;
		}
	}

	# Split the text into pure-ASCII runs and runs that start with a
	# non-ASCII byte. The second kind may absorb ASCII punctuation and
	# digits so that the text is not chopped up more than necessary.
	$pieces = [];
	preg_match_all(
		'/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
		$string, $pieces );

	$normal = true;
	# Byte offset of the current piece within $string.
	$offset = 0;
	# Pending repairs, each one [ replacement text, start offset, length ].
	$fixes = [];
	foreach ( $pieces[1] as $piece ) {
		$pieceLen = strlen( $piece );

		if ( $piece[0] < "\x80" ) {
			# ASCII run: valid UTF-8 and in NFC by definition.
			$offset += $pieceLen;
			continue;
		}

		# Non-ASCII run: walk it byte by byte, validating every sequence
		# and noting whether any of them might be unnormalized.

		# Lead byte of the most recent multi-byte sequence; '' when the
		# previous character was complete and legal.
		$lead = '';
		# Countdown of unread bytes; starts one high because the loop
		# condition decrements before testing.
		$left = $pieceLen + 1;

		for ( $pos = -1; --$left; ) {
			$byte = $piece[++$pos];
			$need = $tailCount[$byte];
			if ( $need ) {
				# A lead byte: collect the tail bytes it announces.
				$seq = $lead = $byte;
				do {
					if ( --$left && ( $byte = $piece[++$pos] ) >= "\x80" && $byte < "\xc0" ) {
						# Proper tail byte.
						$seq .= $byte;
					} elseif ( $left === 0 ) {
						# Ran out of input in the middle of the sequence:
						# the fragment becomes one replacement character.
						$fixes[] = [ Constants::UTF8_REPLACEMENT,
							$offset + $pos + 1 - strlen( $seq ),
							strlen( $seq ) ];
						break 2;
					} else {
						# Not a tail byte: give up on the sequence read so far.
						$fixes[] = [ Constants::UTF8_REPLACEMENT,
							$offset + $pos - strlen( $seq ),
							strlen( $seq ) ];
						# Step back so the offending byte is examined again;
						# it may be ASCII or the lead of another sequence.
						--$pos;
						++$left;
						continue 2;
					}
				} while ( --$need );

				if ( isset( $deepCheck[$lead] ) ) {
					# Extra checks for forbidden characters and illegal encodings.
					if ( $lead == "\xed" ) {
						# 0xed gets its own branch: it is common in Korean
						# text, which sits right next to the surrogate range.
						if ( $seq >= Constants::UTF8_SURROGATE_FIRST ) {
							# Surrogates belong to UTF-16 only and are
							# never legal in UTF-8.
							$fixes[] = [ Constants::UTF8_REPLACEMENT,
								$offset + $pos + 1 - strlen( $seq ),
								strlen( $seq ) ];
							$lead = '';
							continue;
						}
					} else {
						# The rarer, more expensive checks.
						$leadOrd = ord( $lead );
						if (
							# Overlong forms: well-formed, but encoded with
							# more bytes than needed. They can slip past
							# naive comparisons against excluded characters,
							# which makes them a security concern.
							( $leadOrd < 0xc2 && $seq <= Constants::UTF8_OVERLONG_A )
							|| ( $leadOrd == 0xe0 && $seq <= Constants::UTF8_OVERLONG_B )
							|| ( $leadOrd == 0xf0 && $seq <= Constants::UTF8_OVERLONG_C )

							# U+FFFE and U+FFFF are forbidden outright.
							|| ( $leadOrd == 0xef &&
								( $seq == Constants::UTF8_FFFE
									|| $seq == Constants::UTF8_FFFF ) )

							# Unicode ends at 21 bits; anything beyond is illegal.
							|| ( $leadOrd >= 0xf0 && $seq > Constants::UTF8_MAX )
						) {
							$fixes[] = [ Constants::UTF8_REPLACEMENT,
								$offset + $pos + 1 - strlen( $seq ),
								strlen( $seq ) ];
							$lead = '';
							continue;
						}
					}
				}

				if ( isset( $suspect[$seq] ) ) {
					# NO or MAYBE: the caller has to run the full, slow
					# decompose-and-recompose cycle.
					$normal = false;
				}

				# A legal sequence ends here.
				$lead = '';
			} elseif ( $byte < "\x80" ) {
				# Plain ASCII byte.
				$lead = '';
			} elseif ( $byte < "\xc0" ) {
				# Tail byte with no sequence to belong to. On its own it
				# becomes a replacement character; right after a broken
				# sequence it is dropped, because that sequence already
				# produced one.
				$fixes[] = [ $lead == '' ? Constants::UTF8_REPLACEMENT : '', $offset + $pos, 1 ];
			} else {
				# Bytes that can never appear in UTF-8 (0xfe, 0xff).
				$fixes[] = [ Constants::UTF8_REPLACEMENT, $offset + $pos, 1 ];
				$lead = '';
			}
		}
		$offset += $pieceLen;
	}

	if ( $fixes !== [] ) {
		# Rebuild the string, splicing each repair into the untouched text.
		$rebuilt = '';
		$cursor = 0;
		foreach ( $fixes as [ $substitute, $start, $length ] ) {
			if ( $cursor < $start ) {
				$rebuilt .= substr( $string, $cursor, $start - $cursor );
			}
			$rebuilt .= $substitute;
			$cursor = $start + $length;
		}
		if ( $cursor < strlen( $string ) ) {
			$rebuilt .= substr( $string, $cursor );
		}
		$string = $rebuilt;
	}

	return $normal;
}
466 | |
467 | # These take a string and run the normalization on them, without |
468 | # checking for validity or any optimization etc. Input must be |
469 | # VALID UTF-8! |
470 | |
/**
 * Canonical composition: decompose with NFD(), then recompose with
 * fastCompose(). No validation and no fast paths.
 *
 * @param string $string
 * @return string
 */
public static function NFC( $string ) {
	$decomposed = self::NFD( $string );

	return self::fastCompose( $decomposed );
}
478 | |
/**
 * Canonical decomposition: expand through the canonical decomposition
 * map, then sort combining characters. No validation and no fast paths.
 *
 * @param string $string
 * @return string
 */
public static function NFD( $string ) {
	# The canonical map must be loaded before it is passed along.
	self::loadData();

	$decomposed = self::fastDecompose( $string, self::$utfCanonicalDecomp );

	return self::fastCombiningSort( $decomposed );
}
490 | |
/**
 * Compatibility composition: decompose with NFKD(), then recompose with
 * fastCompose(). No validation and no fast paths.
 *
 * @param string $string
 * @return string
 */
public static function NFKC( $string ) {
	$decomposed = self::NFKD( $string );

	return self::fastCompose( $decomposed );
}
498 | |
/**
 * Compatibility decomposition: expand through the compatibility
 * decomposition map, then sort combining characters. No validation and
 * no fast paths.
 *
 * @param string $string
 * @return string
 */
public static function NFKD( $string ) {
	# The compatibility data is only pulled in on first use.
	if ( !isset( self::$utfCompatibilityDecomp ) ) {
		require_once __DIR__ . '/UtfNormalDataK.inc';
	}

	$decomposed = self::fastDecompose( $string, self::$utfCompatibilityDecomp );

	return self::fastCombiningSort( $decomposed );
}
511 | |
/**
 * Decompose a UTF-8 string using the supplied map; the map decides
 * whether the result is headed for form D or form KD.
 * Hangul syllables are decomposed arithmetically instead of via the map.
 * The input must be *valid* UTF-8; anything else yields garbage.
 *
 * @param string $string valid UTF-8 string
 * @param array $map hash of expanded decomposition map
 * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
 */
public static function fastDecompose( $string, $map ) {
	self::loadData();

	$total = strlen( $string );
	$result = '';
	$pos = 0;
	while ( $pos < $total ) {
		$char = $string[$pos];
		$lead = ord( $char );
		if ( $lead < 0x80 ) {
			# ASCII never decomposes; copy it straight through.
			$result .= $char;
			$pos++;
			continue;
		}

		# The lead byte determines how many bytes make up the character.
		if ( $lead >= 0xf0 ) {
			$width = 4;
		} elseif ( $lead >= 0xe0 ) {
			$width = 3;
		} elseif ( $lead >= 0xc0 ) {
			$width = 2;
		} else {
			$width = 1;
		}
		if ( $width > 1 ) {
			$char = substr( $string, $pos, $width );
		}
		$pos += $width;

		if ( isset( $map[$char] ) ) {
			$result .= $map[$char];
			continue;
		}

		if ( $char < Constants::UTF8_HANGUL_FIRST || $char > Constants::UTF8_HANGUL_LAST ) {
			# Neither mapped nor a Hangul syllable: keep as is.
			$result .= $char;
			continue;
		}

		# Hangul syllable: split it into jamo by arithmetic. This is
		# hardcoded for three-byte UTF-8 sequences. A lookup table would
		# be slightly faster but costs a lot of memory and disk space.
		$index = ( ( ( ord( $char[0] ) & 0x0f ) << 12 )
			| ( ( ord( $char[1] ) & 0x3f ) << 6 )
			| ( ord( $char[2] ) & 0x3f ) )
			- Constants::UNICODE_HANGUL_FIRST;
		$leadIndex = (int)( $index / Constants::UNICODE_HANGUL_NCOUNT );
		$vowelIndex = (int)(
			( $index % Constants::UNICODE_HANGUL_NCOUNT )
			/ Constants::UNICODE_HANGUL_TCOUNT
		);
		$tailIndex = $index % Constants::UNICODE_HANGUL_TCOUNT;

		$result .= "\xe1\x84" . chr( 0x80 + $leadIndex ) . "\xe1\x85" . chr( 0xa1 + $vowelIndex );
		if ( $tailIndex >= 25 ) {
			# Trailing jamo from index 25 on are emitted under the
			# \xe1\x87 prefix rather than \xe1\x86.
			$result .= "\xe1\x87" . chr( 0x80 + $tailIndex - 25 );
		} elseif ( $tailIndex ) {
			$result .= "\xe1\x86" . chr( 0xa7 + $tailIndex );
		}
	}

	return $result;
}
576 | |
/**
 * Put runs of combining characters into canonical order -- the last
 * step in producing the decomposed normal forms D and KD.
 *
 * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
 * @return string a UTF-8 string with combining characters sorted in canonical order
 */
public static function fastCombiningSort( $string ) {
	self::loadData();

	$total = strlen( $string );
	$result = '';
	# Combining characters seen since the last non-combining character,
	# concatenated per combining class.
	$pending = [];
	$prevClass = -1;
	$pos = 0;
	while ( $pos < $total ) {
		$char = $string[$pos];
		$lead = ord( $char );
		$width = 1;
		if ( $lead >= 0x80 ) {
			# The lead byte determines how many bytes make up the character.
			if ( $lead >= 0xf0 ) {
				$width = 4;
			} elseif ( $lead >= 0xe0 ) {
				$width = 3;
			} elseif ( $lead >= 0xc0 ) {
				$width = 2;
			}
			if ( $width > 1 ) {
				$char = substr( $string, $pos, $width );
			}

			if ( isset( self::$utfCombiningClass[$char] ) ) {
				# Combining character: park it under its class for now.
				$prevClass = self::$utfCombiningClass[$char];
				$pending[$prevClass] = ( $pending[$prevClass] ?? '' ) . $char;
				$pos += $width;
				continue;
			}
		}
		$pos += $width;

		# Non-combining character: flush the parked combiners, ordered
		# by class, before emitting it.
		if ( $prevClass ) {
			ksort( $pending );
			$result .= implode( '', $pending );
			$pending = [];
		}
		$result .= $char;
		$prevClass = 0;
	}

	# Flush combiners left over at the end of the string.
	if ( $prevClass ) {
		ksort( $pending );
		$result .= implode( '', $pending );
	}

	return $result;
}
629 | |
/**
 * Compose a decomposed string canonically, yielding normal form C or KC
 * depending on the form of the input.
 *
 * @param string $string a valid UTF-8 string in sorted normal form D or KD.
 *   Input is not validated.
 * @return string a UTF-8 string with canonical precomposed characters used
 *   where possible.
 */
public static function fastCompose( $string ) {
	self::loadData();

	$total = strlen( $string );
	$result = '';
	# Combining class of the previous character; -1 before the first one.
	$prevClass = -1;
	# True right after a trailing jamo was merged into a Hangul syllable.
	$tailMerged = false;
	# Current start character and the combiners that could not be merged into it.
	$starter = '';
	$marks = '';

	// Optim: ord() only looks at the first byte of its argument
	$jamoLow = ord( Constants::UTF8_HANGUL_VBASE );
	$jamoHigh = ord( Constants::UTF8_HANGUL_TEND );

	$pos = 0;
	while ( $pos < $total ) {
		$char = $string[$pos];
		$lead = ord( $char );
		if ( $lead < 0x80 ) {
			# ASCII starts a new character and never combines.
			$result .= $starter . $marks;
			$starter = $char;
			$marks = '';
			$prevClass = 0;
			$pos++;
			continue;
		}

		# The lead byte determines how many bytes make up the character.
		if ( $lead >= 0xf0 ) {
			$width = 4;
		} elseif ( $lead >= 0xe0 ) {
			$width = 3;
		} elseif ( $lead >= 0xc0 ) {
			$width = 2;
		} else {
			$width = 1;
		}
		if ( $width > 1 ) {
			$char = substr( $string, $pos, $width );
		}
		$pos += $width;

		$candidate = $starter . $char;
		if ( $lead > 0x80 && isset( self::$utfCombiningClass[$char] ) ) {
			# Combining character: merge it into the start character if a
			# canonical composition exists and nothing blocks it.
			$ccc = self::$utfCombiningClass[$char];
			// TODO: Is refusing falsey $starter (e.g. '0') intentional here?
			if ( $starter &&
				$prevClass < $ccc &&
				$ccc > 0 &&
				isset( self::$utfCanonicalComp[$candidate] )
			) {
				$starter = self::$utfCanonicalComp[$candidate];
				$ccc = 0;
			} else {
				$marks .= $char;
			}
			$prevClass = $ccc;
			$tailMerged = false;
			continue;
		}

		# Not a combining character, so a potential new start character.
		if ( $prevClass === 0 ) {
			if ( isset( self::$utfCanonicalComp[$candidate] ) ) {
				$starter = self::$utfCanonicalComp[$candidate];
				$tailMerged = false;
				continue;
			}
			if ( $lead >= $jamoLow && $lead <= $jamoHigh ) {
				# WARNING: the Hangul handling is slow and inlined on
				# purpose; calling out to helper functions performs even
				# worse, and lookup tables are only marginally faster
				# while taking a lot of space.
				if ( $char >= Constants::UTF8_HANGUL_VBASE &&
					$char <= Constants::UTF8_HANGUL_VEND &&
					$starter >= Constants::UTF8_HANGUL_LBASE &&
					$starter <= Constants::UTF8_HANGUL_LEND
				) {
					# Leading jamo followed by a vowel jamo: build the
					# syllable. The indexes are read off the third byte
					# instead of decoding the code points.
					$lIndex = ord( $starter[2] ) - 0x80;
					$vIndex = ord( $char[2] ) - 0xa1;

					$hangulPoint = Constants::UNICODE_HANGUL_FIRST +
						Constants::UNICODE_HANGUL_TCOUNT *
						( Constants::UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex );

					# Hardcoded three-byte UTF-8 encoding of the code point.
					$starter = chr( ( ( $hangulPoint >> 12 ) & 0x0f ) | 0xe0 ) .
						chr( ( ( $hangulPoint >> 6 ) & 0x3f ) | 0x80 ) .
						chr( ( $hangulPoint & 0x3f ) | 0x80 );
					$tailMerged = false;
					continue;
				} elseif ( $char >= Constants::UTF8_HANGUL_TBASE &&
					$char <= Constants::UTF8_HANGUL_TEND &&
					$starter >= Constants::UTF8_HANGUL_FIRST &&
					$starter <= Constants::UTF8_HANGUL_LAST &&
					!$tailMerged
				) {
					# Syllable followed by a trailing jamo: add the tail
					# index to the syllable.
					$tIndex = ord( $char[2] ) - 0xa7;
					if ( $tIndex < 0 ) {
						$tIndex = ord( $char[2] ) - 0x80 + ( 0x11c0 - 0x11a7 );
					}

					# Bump the encoded code point by $tIndex directly on
					# the bytes, carrying into the higher bytes as needed.
					$tail = ord( $starter[2] ) + $tIndex;
					if ( $tail > 0xbf ) {
						$tail -= 0x40;
						$mid = ord( $starter[1] ) + 1;
						if ( $mid > 0xbf ) {
							$starter[0] = chr( ord( $starter[0] ) + 1 );
							$mid -= 0x40;
						}
						$starter[1] = chr( $mid );
					}
					$starter[2] = chr( $tail );

					# A further jamo after this one must *not* be merged.
					$tailMerged = true;
					continue;
				}
			}
		}

		# Nothing composed: emit what was pending and start over with
		# this character.
		$result .= $starter . $marks;
		$starter = $char;
		$marks = '';
		$prevClass = 0;
		$tailMerged = false;
	}

	return $result . $starter . $marks;
}
767 | |
/**
 * Benchmark baseline: copies the string one byte at a time, to measure
 * what iterating over a string costs when no real work is done.
 *
 * @param string $string
 * @return string
 */
public static function placebo( $string ) {
	$copy = '';
	$total = strlen( $string );
	$pos = 0;
	while ( $pos < $total ) {
		$copy .= $string[$pos];
		$pos++;
	}

	return $copy;
}
783 | |
/**
 * Pre-filter for the native normalizer: swaps out characters that are
 * unwanted here but that most native normalize functions would keep,
 * namely some control characters plus U+FFFE and U+FFFF.
 *
 * @param string $string The string
 * @return string String with the character codes replaced.
 */
private static function replaceForNativeNormalize( $string ) {
	$withoutControls = preg_replace(
		'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
		Constants::UTF8_REPLACEMENT,
		$string
	);
	$forbidden = [ Constants::UTF8_FFFE, Constants::UTF8_FFFF ];

	return str_replace( $forbidden, Constants::UTF8_REPLACEMENT, $withoutControls );
}
799 | } |