Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
80.00% |
280 / 350 |
|
27.78% |
5 / 18 |
CRAP | |
0.00% |
0 / 1 |
| Validator | |
80.23% |
280 / 349 |
|
27.78% |
5 / 18 |
251.65 | |
0.00% |
0 / 1 |
| cleanUp | |
78.57% |
11 / 14 |
|
0.00% |
0 / 1 |
6.35 | |||
| prependIsolatedCombining | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| toNFC | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
| toNFD | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
| toNFKC | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
| toNFKD | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
| loadData | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
| quickIsNFC | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
90 | |||
| quickIsNFCVerify | |
98.10% |
103 / 105 |
|
0.00% |
0 / 1 |
40 | |||
| NFC | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| NFD | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
| NFKC | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| NFKD | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| fastDecompose | |
92.50% |
37 / 40 |
|
0.00% |
0 / 1 |
11.05 | |||
| fastCombiningSort | |
94.12% |
32 / 34 |
|
0.00% |
0 / 1 |
10.02 | |||
| fastCompose | |
93.18% |
82 / 88 |
|
0.00% |
0 / 1 |
28.25 | |||
| placebo | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
| replaceForNativeNormalize | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | /** |
| 5 | * Unicode normalization routines |
| 6 | * |
| 7 | * Copyright © 2004 Brooke Vibber <bvibber@pobox.com> |
| 8 | * https://www.mediawiki.org/ |
| 9 | * |
| 10 | * This program is free software; you can redistribute it and/or modify |
| 11 | * it under the terms of the GNU General Public License as published by |
| 12 | * the Free Software Foundation; either version 2 of the License, or |
| 13 | * (at your option) any later version. |
| 14 | * |
| 15 | * This program is distributed in the hope that it will be useful, |
| 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 18 | * GNU General Public License for more details. |
| 19 | * |
| 20 | * You should have received a copy of the GNU General Public License along |
| 21 | * with this program; if not, write to the Free Software Foundation, Inc., |
| 22 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
| 23 | * http://www.gnu.org/copyleft/gpl.html |
| 24 | * |
| 25 | * @file |
| 26 | */ |
| 27 | namespace UtfNormal; |
| 28 | |
| 29 | use Normalizer; |
| 30 | |
| 31 | /** |
| 32 | * @defgroup UtfNormal UtfNormal |
| 33 | */ |
| 34 | |
// True when normalizer_normalize() is available. The Validator methods below
// check this constant to choose between the native normalizer and the
// pure-PHP implementation in this file.
define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
| 36 | |
| 37 | /** |
| 38 | * Unicode normalization routines for working with UTF-8 strings. |
| 39 | * Currently, it assumes that input strings are valid UTF-8! |
| 40 | * |
| 41 | * Not as fast as I'd like, but should be usable for most purposes. |
| 42 | * UtfNormal\Validator::toNFC() will bail early if given ASCII text or text |
| 43 | * it can quickly determine is already normalized. |
| 44 | * |
| 45 | * All functions can be called statically. |
| 46 | * |
| 47 | * See description of forms at http://www.unicode.org/reports/tr15/ |
| 48 | * |
| 49 | * @ingroup UtfNormal |
| 50 | */ |
| 51 | class Validator { |
| 52 | |
| 53 | /** |
| 54 | * @var array |
| 55 | */ |
| 56 | public static $utfCombiningClass; |
| 57 | |
| 58 | /** |
| 59 | * @var array |
| 60 | */ |
| 61 | public static $utfCanonicalComp; |
| 62 | |
| 63 | /** |
| 64 | * @var array |
| 65 | */ |
| 66 | public static $utfCanonicalDecomp; |
| 67 | |
| 68 | /** |
| 69 | * Load compatibility decompositions on demand if they are needed. |
| 70 | * |
| 71 | * @var array |
| 72 | */ |
| 73 | public static $utfCompatibilityDecomp; |
| 74 | |
| 75 | /** |
| 76 | * @var array|null |
| 77 | */ |
| 78 | public static $utfCheckNFC; |
| 79 | |
| 80 | /** |
| 81 | * @var string|null |
| 82 | */ |
| 83 | public static $utfIsolatedCombiningRegex; |
| 84 | |
/**
 * One-stop cleanup: repair invalid UTF-8 sequences, convert to normal
 * form C (canonical composition), then run the result through
 * prependIsolatedCombining().
 *
 * Strings that need no work are returned early; this is still slower
 * than toNFC() because the input has to be validated.
 *
 * @param string $string a UTF-8 string
 * @return string a clean, normalized UTF-8 string
 */
public static function cleanUp( $string ) {
	if ( !NORMALIZE_INTL ) {
		# Pure-PHP path. quickIsNFCVerify() takes $string by reference and
		# repairs UTF-8 errors in it as a side effect.
		if ( self::quickIsNFCVerify( $string ) ) {
			return $string;
		}

		return self::prependIsolatedCombining( self::NFC( $string ) );
	}

	# Native path: nothing to do unless the string contains a disallowed
	# control character or a non-ASCII byte.
	if ( !preg_match( '/[\x00-\x08\x0b\x0c\x0e-\x1f\x80-\xff]/', $string ) ) {
		return $string;
	}

	$string = self::replaceForNativeNormalize( $string );
	$normalized = normalizer_normalize( $string, Normalizer::FORM_C );
	if ( $normalized === false ) {
		# normalizer_normalize() returns false for an invalid UTF-8 string;
		# quickIsNFCVerify() cleans up the invalid sequences in place.
		if ( self::quickIsNFCVerify( $string ) ) {
			# The repaired string is already normal (and has no combining
			# characters, so the isolated-combining pass can be skipped).
			return $string;
		}

		# Now valid but not normalized: try the native normalizer again.
		$normalized = normalizer_normalize( $string, Normalizer::FORM_C );
	}

	return self::prependIsolatedCombining( $normalized );
}
| 125 | |
/**
 * Replace every match of self::$utfIsolatedCombiningRegex in the string
 * with U+25CC.
 *
 * NOTE(review): the pattern is defined outside this file; judging by the
 * method name it presumably matches the empty position in front of an
 * isolated combining character, so the effect is an insertion -- confirm
 * against the data file.
 *
 * @param string $string
 * @return string
 */
public static function prependIsolatedCombining( string $string ): string {
	self::loadData();

	$pattern = self::$utfIsolatedCombiningRegex;

	return preg_replace( $pattern, "\u{25CC}", $string );
}
| 130 | |
/**
 * Convert a UTF-8 string to normal form C (canonical composition).
 *
 * Uses normalizer_normalize() when NORMALIZE_INTL is true. Otherwise the
 * string is returned unchanged if quickIsNFC() accepts it, and is run
 * through the pure-PHP NFC() implementation if not.
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form C
 */
public static function toNFC( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_C );
	}

	return self::quickIsNFC( $string ) ? $string : self::NFC( $string );
}
| 148 | |
/**
 * Convert a UTF-8 string to normal form D (canonical decomposition).
 *
 * Uses normalizer_normalize() when NORMALIZE_INTL is true. Otherwise pure
 * ASCII strings are returned unchanged and everything else goes through
 * the pure-PHP NFD() implementation.
 *
 * @param string $string A valid UTF-8 string. Input is not validated.
 * @return string A UTF-8 string in normal form D
 */
public static function toNFD( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_D );
	}

	$hasNonAscii = preg_match( '/[\x80-\xff]/', $string );

	return $hasNonAscii ? self::NFD( $string ) : $string;
}
| 165 | |
/**
 * Convert a UTF-8 string to normal form KC (compatibility composition).
 * This may cause irreversible information loss, use judiciously.
 *
 * Uses normalizer_normalize() when NORMALIZE_INTL is true. Otherwise pure
 * ASCII strings are returned unchanged and everything else goes through
 * the pure-PHP NFKC() implementation.
 *
 * @param string $string A valid UTF-8 string. Input is not validated.
 * @return string A UTF-8 string in normal form KC
 */
public static function toNFKC( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_KC );
	}

	$hasNonAscii = preg_match( '/[\x80-\xff]/', $string );

	return $hasNonAscii ? self::NFKC( $string ) : $string;
}
| 183 | |
/**
 * Convert a UTF-8 string to normal form KD (compatibility decomposition).
 * This may cause irreversible information loss, use judiciously.
 *
 * Uses normalizer_normalize() when NORMALIZE_INTL is true. Otherwise pure
 * ASCII strings are returned unchanged and everything else goes through
 * the pure-PHP NFKD() implementation.
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form KD
 */
public static function toNFKD( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_KD );
	}

	$hasNonAscii = preg_match( '/[\x80-\xff]/', $string );

	return $hasNonAscii ? self::NFKD( $string ) : $string;
}
| 201 | |
/**
 * Pull in UtfNormalData.inc unless self::$utfCombiningClass is already set.
 */
public static function loadData() {
	// @phan-suppress-next-line MediaWikiNoIssetIfDefined
	if ( isset( self::$utfCombiningClass ) ) {
		return;
	}

	require_once __DIR__ . '/UtfNormalData.inc';
}
| 211 | |
/**
 * Cheap check for normal form C.
 *
 * Returns true only if the string is _definitely_ in NFC; false means
 * "not NFC" or "cannot tell without the slow path".
 *
 * @param string $string a valid UTF-8 string. Input is not validated.
 * @return bool
 */
public static function quickIsNFC( $string ) {
	# Pure ASCII is always valid NFC, and needs no lookup tables.
	if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
		return true;
	}

	self::loadData();

	$total = strlen( $string );
	$pos = 0;
	while ( $pos < $total ) {
		$lead = ord( $string[$pos] );
		if ( $lead < 0x80 ) {
			# ASCII byte, nothing to look up.
			$pos++;
			continue;
		}

		# The lead byte decides how many bytes make up this character.
		if ( $lead >= 0xf0 ) {
			$width = 4;
		} elseif ( $lead >= 0xe0 ) {
			$width = 3;
		} elseif ( $lead >= 0xc0 ) {
			$width = 2;
		} else {
			$width = 1;
		}
		$char = substr( $string, $pos, $width );
		$pos += $width;

		# Listed in the NFC check table (NO or MAYBE), or a combining
		# character that might need sorting: leave it to the slow check.
		if ( isset( self::$utfCheckNFC[$char] ) || isset( self::$utfCombiningClass[$char] ) ) {
			return false;
		}
	}

	return true;
}
| 255 | |
/**
 * Validate a string as UTF-8, repairing it in place, and report whether
 * it is _definitely_ in NFC.
 *
 * Returns true if the string is _definitely_ in NFC.
 * Returns false if not or uncertain.
 *
 * The string is modified through the reference: control characters that
 * match [\x00-\x08\x0b\x0c\x0e-\x1f] and every invalid UTF-8 sequence found
 * by the scan below are replaced with Constants::UTF8_REPLACEMENT.
 *
 * @param string &$string A UTF-8 string, altered on output to be valid UTF-8 safe for XML.
 * @return bool
 */
public static function quickIsNFCVerify( &$string ) {
	# Screen out some characters that eg won't be allowed in XML
	$string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', Constants::UTF8_REPLACEMENT, $string );

	# ASCII is always valid NFC!
	# If we're only ever given plain ASCII, we can avoid the overhead
	# of initializing the decomposition tables by skipping out early.
	if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
		return true;
	}

	# The lookup tables are function-static, so they are built on the
	# first non-ASCII call only and reused afterwards.
	static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
	if ( $checkit === null ) {
		# Load/build some scary lookup tables...
		self::loadData();

		# One combined table, so the inner loop needs a single isset()
		# to spot anything that defeats the quick check.
		$utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );

		# Head bytes for sequences which we should do further validity checks
		$checkit = array_flip( array_map( 'chr',
			[ 0xc0, 0xc1, 0xe0, 0xed, 0xef,
				0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
				0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ] ) );

		# Each UTF-8 head byte is followed by a certain
		# number of tail bytes.
		# Bytes below 0xc0 (ASCII and tail bytes) and 0xfe/0xff get 0, so
		# the main loop never treats them as the start of a sequence.
		$tailBytes = [];
		for ( $n = 0; $n < 256; $n++ ) {
			if ( $n < 0xc0 ) {
				$remaining = 0;
			} elseif ( $n < 0xe0 ) {
				$remaining = 1;
			} elseif ( $n < 0xf0 ) {
				$remaining = 2;
			} elseif ( $n < 0xf8 ) {
				$remaining = 3;
			} elseif ( $n < 0xfc ) {
				$remaining = 4;
			} elseif ( $n < 0xfe ) {
				$remaining = 5;
			} else {
				$remaining = 0;
			}
			$tailBytes[chr( $n )] = $remaining;
		}
	}

	# Chop the text into pure-ASCII and non-ASCII areas;
	# large ASCII parts can be handled much more quickly.
	# Don't chop up Unicode areas for punctuation, though,
	# that wastes energy.
	$matches = [];
	preg_match_all(
		'/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
		$string, $matches );

	$looksNormal = true;
	# Byte offset of the current chunk within $string.
	$base = 0;
	# Pending fixes as [ replacement text, start offset, length ] triples,
	# recorded in ascending offset order and applied after the scan.
	$replace = [];
	foreach ( $matches[1] as $str ) {
		$chunk = strlen( $str );

		if ( $str[0] < "\x80" ) {
			# ASCII chunk: guaranteed to be valid UTF-8
			# and in normal form C, so skip over it.
			$base += $chunk;
			continue;
		}

		# We'll have to examine the chunk byte by byte to ensure
		# that it consists of valid UTF-8 sequences, and to see
		# if any of them might not be normalized.

		# Since PHP is not the fastest language on earth, some of
		# this code is a little ugly with inner loop optimizations.

		$head = '';
		# Counting down is faster. I'm *so* sorry.
		# $len starts one above the chunk length because the loop
		# condition pre-decrements it before each byte is read.
		$len = $chunk + 1;

		for ( $i = -1; --$len; ) {
			$remaining = $tailBytes[$c = $str[++$i]];
			if ( $remaining ) {
				# UTF-8 head byte!
				$sequence = $head = $c;
				do {
					# Look for the defined number of tail bytes...
					if ( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
						# Legal tail bytes are nice.
						$sequence .= $c;
					} elseif ( $len === 0 ) {
						# Premature end of string!
						# Drop a replacement character into output to
						# represent the invalid UTF-8 sequence.
						$replace[] = [ Constants::UTF8_REPLACEMENT,
							$base + $i + 1 - strlen( $sequence ),
							strlen( $sequence ) ];
						# Leave both the do-while and the byte loop;
						# the chunk is exhausted.
						break 2;
					} else {
						# Illegal tail byte; abandon the sequence.
						$replace[] = [ Constants::UTF8_REPLACEMENT,
							$base + $i - strlen( $sequence ),
							strlen( $sequence ) ];
						# Back up and reprocess this byte; it may itself
						# be a legal ASCII or UTF-8 sequence head.
						--$i;
						++$len;
						continue 2;
					}
				} while ( --$remaining );

				if ( isset( $checkit[$head] ) ) {
					# Do some more detailed validity checks, for
					# invalid characters and illegal sequences.
					if ( $head == "\xed" ) {
						# 0xed is relatively frequent in Korean, which
						# abuts the surrogate area, so we're doing
						# this check separately to speed things up.

						if ( $sequence >= Constants::UTF8_SURROGATE_FIRST ) {
							# Surrogates are legal only in UTF-16 code.
							# They are totally forbidden here in UTF-8
							# utopia.
							$replace[] = [ Constants::UTF8_REPLACEMENT,
								$base + $i + 1 - strlen( $sequence ),
								strlen( $sequence ) ];
							$head = '';
							continue;
						}
					} else {
						# Slower, but rarer checks...
						$n = ord( $head );
						if (
							# "Overlong sequences" are those that are syntactically
							# correct but use more UTF-8 bytes than are necessary to
							# encode a character. Naïve string comparisons can be
							# tricked into failing to see a match for an ASCII
							# character, for instance, which can be a security hole
							# if lists of excluded characters are being used.
							( $n < 0xc2 && $sequence <= Constants::UTF8_OVERLONG_A )
							|| ( $n == 0xe0 && $sequence <= Constants::UTF8_OVERLONG_B )
							|| ( $n == 0xf0 && $sequence <= Constants::UTF8_OVERLONG_C )

							# U+FFFE and U+FFFF are explicitly forbidden in Unicode.
							|| ( $n == 0xef &&
								( $sequence == Constants::UTF8_FFFE
									|| $sequence == Constants::UTF8_FFFF ) )

							# Unicode has been limited to 21 bits; longer
							# sequences are not allowed.
							|| ( $n >= 0xf0 && $sequence > Constants::UTF8_MAX )
						) {
							$replace[] = [ Constants::UTF8_REPLACEMENT,
								$base + $i + 1 - strlen( $sequence ),
								strlen( $sequence ) ];
							$head = '';
							continue;
						}
					}
				}

				if ( isset( $utfCheckOrCombining[$sequence] ) ) {
					# If it's NO or MAYBE, we'll have to rip
					# the string apart and put it back together.
					# That's going to be mighty slow.
					# Keep scanning anyway: the rest of the string still
					# has to be validated and repaired.
					$looksNormal = false;
				}

				# The sequence is legal!
				$head = '';
			} elseif ( $c < "\x80" ) {
				# ASCII byte.
				$head = '';
			} elseif ( $c < "\xc0" ) {
				# Illegal tail bytes
				if ( $head == '' ) {
					# Out of the blue!
					$replace[] = [ Constants::UTF8_REPLACEMENT, $base + $i, 1 ];
				} else {
					# Don't add if we're continuing a broken sequence;
					# we already put a replacement character when we looked
					# at the broken sequence.
					$replace[] = [ '', $base + $i, 1 ];
				}
			} else {
				# Miscellaneous freaks.
				# (Bytes at or above 0xc0 whose $tailBytes entry is 0,
				# i.e. 0xfe and 0xff.)
				$replace[] = [ Constants::UTF8_REPLACEMENT, $base + $i, 1 ];
				$head = '';
			}
		}
		$base += $chunk;
	}
	if ( count( $replace ) ) {
		# There were illegal UTF-8 sequences we need to fix up.
		# Rebuild the string: copy the untouched text between recorded
		# spans and substitute each span's replacement.
		$out = '';
		$last = 0;
		foreach ( $replace as $rep ) {
			[ $replacement, $start, $length ] = $rep;
			if ( $last < $start ) {
				$out .= substr( $string, $last, $start - $last );
			}
			$out .= $replacement;
			$last = $start + $length;
		}
		if ( $last < strlen( $string ) ) {
			$out .= substr( $string, $last );
		}
		$string = $out;
	}

	return $looksNormal;
}
| 474 | |
| 475 | # These take a string and run the normalization on it, without |
| 476 | # checking for validity or applying any optimization. Input must be |
| 477 | # VALID UTF-8! |
| 478 | |
/**
 * Pure-PHP normal form C: decompose with NFD(), then recompose with
 * fastCompose(). No validation and no fast paths.
 *
 * @param string $string
 * @return string
 */
public static function NFC( $string ) {
	$decomposed = self::NFD( $string );

	return self::fastCompose( $decomposed );
}
| 486 | |
/**
 * Pure-PHP normal form D: decompose using the canonical decomposition
 * map, then sort combining characters with fastCombiningSort().
 *
 * @param string $string
 * @return string
 */
public static function NFD( $string ) {
	self::loadData();

	$decomposed = self::fastDecompose( $string, self::$utfCanonicalDecomp );

	return self::fastCombiningSort( $decomposed );
}
| 498 | |
/**
 * Pure-PHP normal form KC: decompose with NFKD(), then recompose with
 * fastCompose(). No validation and no fast paths.
 *
 * @param string $string
 * @return string
 */
public static function NFKC( $string ) {
	$decomposed = self::NFKD( $string );

	return self::fastCompose( $decomposed );
}
| 506 | |
/**
 * Pure-PHP normal form KD: decompose using the compatibility
 * decomposition map, then sort combining characters with
 * fastCombiningSort(). UtfNormalDataK.inc is pulled in first if
 * self::$utfCompatibilityDecomp is not set yet.
 *
 * @param string $string
 * @return string
 */
public static function NFKD( $string ) {
	// @phan-suppress-next-line MediaWikiNoIssetIfDefined
	if ( !isset( self::$utfCompatibilityDecomp ) ) {
		require_once __DIR__ . '/UtfNormalDataK.inc';
	}

	$decomposed = self::fastDecompose( $string, self::$utfCompatibilityDecomp );

	return self::fastCombiningSort( $decomposed );
}
| 520 | |
/**
 * Decompose a UTF-8 string into D or KD form, depending on which
 * decomposition map the caller passes in.
 *
 * Characters found in the map are replaced by their mapped value; Hangul
 * syllables are split into jamo arithmetically; everything else is copied
 * through. Input is assumed to be *valid* UTF-8. Invalid code will break.
 *
 * @param string $string valid UTF-8 string
 * @param array $map hash of expanded decomposition map
 * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
 */
public static function fastDecompose( $string, $map ) {
	self::loadData();

	$result = '';
	$total = strlen( $string );
	$pos = 0;
	while ( $pos < $total ) {
		$char = $string[$pos];
		$lead = ord( $char );

		if ( $lead < 0x80 ) {
			# ASCII never decomposes; copy the byte straight through.
			$result .= $char;
			$pos++;
			continue;
		}

		# The lead byte decides how many bytes make up this character.
		if ( $lead >= 0xf0 ) {
			$width = 4;
		} elseif ( $lead >= 0xe0 ) {
			$width = 3;
		} elseif ( $lead >= 0xc0 ) {
			$width = 2;
		} else {
			$width = 1;
		}
		if ( $width > 1 ) {
			$char = substr( $string, $pos, $width );
		}
		$pos += $width;

		if ( isset( $map[$char] ) ) {
			$result .= $map[$char];
			continue;
		}

		if ( $char < Constants::UTF8_HANGUL_FIRST || $char > Constants::UTF8_HANGUL_LAST ) {
			# Not in the map and not a Hangul syllable: keep as is.
			$result .= $char;
			continue;
		}

		# Decompose a Hangul syllable into jamo. The decoding is hardcoded
		# for a three-byte UTF-8 sequence; a lookup table would be slightly
		# faster, but adds a lot of memory & disk needs.
		$codepoint = ( ord( $char[0] ) & 0x0f ) << 12
			| ( ord( $char[1] ) & 0x3f ) << 6
			| ( ord( $char[2] ) & 0x3f );
		$syllable = $codepoint - Constants::UNICODE_HANGUL_FIRST;

		$leadIndex = (int)( $syllable / Constants::UNICODE_HANGUL_NCOUNT );
		$vowelIndex = (int)(
			( $syllable % Constants::UNICODE_HANGUL_NCOUNT )
			/ Constants::UNICODE_HANGUL_TCOUNT
		);
		$trailIndex = $syllable % Constants::UNICODE_HANGUL_TCOUNT;

		$result .= "\xe1\x84" . chr( 0x80 + $leadIndex ) . "\xe1\x85" . chr( 0xa1 + $vowelIndex );
		if ( $trailIndex >= 25 ) {
			# From index 25 on, the trailing jamo use the next second
			# byte (0x87), so the last byte restarts at 0x80.
			$result .= "\xe1\x87" . chr( 0x80 + $trailIndex - 25 );
		} elseif ( $trailIndex ) {
			$result .= "\xe1\x86" . chr( 0xa7 + $trailIndex );
		}
	}

	return $result;
}
| 585 | |
/**
 * Put runs of combining characters into canonical order. This is the
 * last step when building the decomposed normal forms D and KD.
 *
 * Combining characters are bucketed by combining class as they are read;
 * when the next non-combining character (or the end of the string) is
 * reached, the buckets are emitted in ascending class order.
 *
 * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
 * @return string a UTF-8 string with combining characters sorted in canonical order
 */
public static function fastCombiningSort( $string ) {
	self::loadData();

	$result = '';
	# Combining class => concatenated characters of that class, in input order.
	$buckets = [];
	# Class of the last character seen: -1 before any input, 0 after a
	# non-combining character.
	$pendingClass = -1;

	$total = strlen( $string );
	$pos = 0;
	while ( $pos < $total ) {
		$char = $string[$pos];
		$lead = ord( $char );
		$pos++;

		if ( $lead >= 0x80 ) {
			# Multi-byte character: pick up the remaining bytes.
			if ( $lead >= 0xf0 ) {
				$char = substr( $string, $pos - 1, 4 );
				$pos += 3;
			} elseif ( $lead >= 0xe0 ) {
				$char = substr( $string, $pos - 1, 3 );
				$pos += 2;
			} elseif ( $lead >= 0xc0 ) {
				$char = substr( $string, $pos - 1, 2 );
				$pos++;
			}

			if ( isset( self::$utfCombiningClass[$char] ) ) {
				$pendingClass = self::$utfCombiningClass[$char];
				$buckets[$pendingClass] = ( $buckets[$pendingClass] ?? '' ) . $char;
				continue;
			}
		}

		# Non-combining character: flush whatever was collected before it.
		if ( $pendingClass ) {
			ksort( $buckets );
			$result .= implode( '', $buckets );
			$buckets = [];
		}
		$result .= $char;
		$pendingClass = 0;
	}

	# Flush combining characters left over at the end of the string.
	if ( $pendingClass ) {
		ksort( $buckets );
		$result .= implode( '', $buckets );
	}

	return $result;
}
| 638 | |
/**
 * Produces canonically composed sequences, i.e. normal form C or KC.
 *
 * The string is scanned once, left to right. $startChar holds the current
 * starting character that has not been written out yet, and $combining
 * holds the combining characters after it that could not be merged into
 * it. Both are flushed to the output whenever a new starting character
 * takes over, and once more at the end.
 *
 * @param string $string a valid UTF-8 string in sorted normal form D or KD.
 * Input is not validated.
 * @return string a UTF-8 string with canonical precomposed characters used
 * where possible.
 */
public static function fastCompose( $string ) {
	self::loadData();

	$len = strlen( $string );
	$out = '';
	# Combining class of the previous character: -1 before any input,
	# 0 after a starter or after a successful composition.
	$lastClass = -1;
	# Set to 1 right after a trailing jamo has been merged into a Hangul
	# syllable, so that a second trailing jamo is not merged as well.
	$lastHangul = 0;
	$startChar = '';
	$combining = '';

	# First bytes of the Hangul vowel-base and trailing-end constants;
	# used as a cheap lead-byte filter before the full string comparisons.
	$x1 = ord( Constants::UTF8_HANGUL_VBASE[0] );
	$x2 = ord( Constants::UTF8_HANGUL_TEND[0] );
	for ( $i = 0; $i < $len; $i++ ) {
		$c = $string[$i];
		$n = ord( $c );
		if ( $n < 0x80 ) {
			# No combining characters here...
			$out .= $startChar;
			$out .= $combining;
			$startChar = $c;
			$combining = '';
			$lastClass = 0;
			continue;
		} elseif ( $n >= 0xf0 ) {
			$c = substr( $string, $i, 4 );
			$i += 3;
		} elseif ( $n >= 0xe0 ) {
			$c = substr( $string, $i, 3 );
			$i += 2;
		} elseif ( $n >= 0xc0 ) {
			$c = substr( $string, $i, 2 );
			$i++;
		}
		# Key for the canonical composition table: starter followed by
		# the current character.
		$pair = $startChar . $c;
		if ( $n > 0x80 && isset( self::$utfCombiningClass[$c] ) ) {
			# A combining char; see what we can do with it
			$class = self::$utfCombiningClass[$c];
			# Compose only if there is a starter, this character's class
			# is higher than the previous character's class, and the
			# pair has an entry in the composition table.
			if ( $startChar !== '' &&
				$lastClass < $class &&
				$class > 0 &&
				isset( self::$utfCanonicalComp[$pair] )
			) {
				$startChar = self::$utfCanonicalComp[$pair];
				$class = 0;
			} else {
				$combining .= $c;
			}
			$lastClass = $class;
			$lastHangul = 0;
			continue;
		}
		# New start char
		if ( $lastClass === 0 ) {
			# Nothing uncomposed sits between the previous starter and
			# this one, so the two starters may still combine.
			if ( isset( self::$utfCanonicalComp[$pair] ) ) {
				$startChar = self::$utfCanonicalComp[$pair];
				$lastHangul = 0;
				continue;
			}
			if ( $n >= $x1 && $n <= $x2 ) {
				# WARNING: Hangul code is painfully slow.
				# I apologize for this ugly, ugly code; however
				# performance is even more teh suck if we call
				# out to nice clean functions. Lookup tables are
				# marginally faster, but require a lot of space.
				if ( $c >= Constants::UTF8_HANGUL_VBASE &&
					$c <= Constants::UTF8_HANGUL_VEND &&
					$startChar >= Constants::UTF8_HANGUL_LBASE &&
					$startChar <= Constants::UTF8_HANGUL_LEND
				) {
					# Leading jamo followed by a vowel jamo: merge the
					# two into a syllable.
					# $lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
					# $vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
					$lIndex = ord( $startChar[2] ) - 0x80;
					$vIndex = ord( $c[2] ) - 0xa1;

					$hangulPoint = Constants::UNICODE_HANGUL_FIRST +
						Constants::UNICODE_HANGUL_TCOUNT *
						( Constants::UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex );

					# Hardcode the limited-range UTF-8 conversion:
					$startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
						chr( $hangulPoint >> 6 & 0x3f | 0x80 ) .
						chr( $hangulPoint & 0x3f | 0x80 );
					$lastHangul = 0;
					continue;
				} elseif ( $c >= Constants::UTF8_HANGUL_TBASE &&
					$c <= Constants::UTF8_HANGUL_TEND &&
					$startChar >= Constants::UTF8_HANGUL_FIRST &&
					$startChar <= Constants::UTF8_HANGUL_LAST &&
					!$lastHangul
				) {
					# Hangul syllable followed by a trailing jamo: add
					# the trailing index to the syllable.
					# $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
					$tIndex = ord( $c[2] ) - 0xa7;
					if ( $tIndex < 0 ) {
						# A last byte below 0xa7: recompute the index
						# relative to U+11C0 instead.
						$tIndex = ord( $c[2] ) - 0x80 + ( 0x11c0 - 0x11a7 );
					}

					# Increment the code point by $tIndex, without
					# the function overhead of decoding and recoding UTF-8
					$tail = ord( $startChar[2] ) + $tIndex;
					if ( $tail > 0xbf ) {
						# The last byte overflowed the tail-byte range:
						# carry into the middle byte and, if that
						# overflows too, into the first byte.
						$tail -= 0x40;
						$mid = ord( $startChar[1] ) + 1;
						if ( $mid > 0xbf ) {
							$startChar[0] = chr( ord( $startChar[0] ) + 1 );
							$mid -= 0x40;
						}
						$startChar[1] = chr( $mid );
					}
					$startChar[2] = chr( $tail );

					# If there's another jamo char after this, *don't* try to merge it.
					$lastHangul = 1;
					continue;
				}
			}
		}
		# No composition applied: write out the previous starter with its
		# leftover combining characters and start over with this one.
		$out .= $startChar;
		$out .= $combining;
		$startChar = $c;
		$combining = '';
		$lastClass = 0;
		$lastHangul = 0;
	}
	# Flush the final starter and any combining characters after it.
	$out .= $startChar . $combining;

	return $out;
}
| 774 | |
/**
 * Benchmark baseline: copy the string one byte at a time without doing
 * anything else, to show how long it takes merely to iterate through a
 * string.
 *
 * @param string $string
 * @return string
 */
public static function placebo( $string ) {
	$copy = '';
	$total = strlen( $string );
	$pos = 0;
	while ( $pos < $total ) {
		$copy .= $string[$pos];
		$pos++;
	}

	return $copy;
}
| 790 | |
/**
 * Swap out characters that we don't want but that most of the native
 * normalize functions keep: the control characters matched by
 * [\x00-\x08\x0b\x0c\x0e-\x1f], and the sequences Constants::UTF8_FFFE and
 * Constants::UTF8_FFFF. Each becomes Constants::UTF8_REPLACEMENT.
 *
 * @param string $string The string
 * @return string String with the character codes replaced.
 */
private static function replaceForNativeNormalize( $string ) {
	$withoutControls = preg_replace(
		'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
		Constants::UTF8_REPLACEMENT,
		$string
	);
	$forbidden = [ Constants::UTF8_FFFE, Constants::UTF8_FFFF ];

	return str_replace( $forbidden, Constants::UTF8_REPLACEMENT, $withoutControls );
}
| 806 | } |