Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
80.00% covered (warning)
80.00%
280 / 350
27.78% covered (danger)
27.78%
5 / 18
CRAP
0.00% covered (danger)
0.00%
0 / 1
Validator
80.23% covered (warning)
80.23%
280 / 349
27.78% covered (danger)
27.78%
5 / 18
251.65
0.00% covered (danger)
0.00%
0 / 1
 cleanUp
78.57% covered (warning)
78.57%
11 / 14
0.00% covered (danger)
0.00%
0 / 1
6.35
 prependIsolatedCombining
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 toNFC
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
12
 toNFD
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
12
 toNFKC
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
12
 toNFKD
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
12
 loadData
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
2
 quickIsNFC
0.00% covered (danger)
0.00%
0 / 23
0.00% covered (danger)
0.00%
0 / 1
90
 quickIsNFCVerify
98.10% covered (success)
98.10%
103 / 105
0.00% covered (danger)
0.00%
0 / 1
40
 NFC
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 NFD
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 NFKC
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 NFKD
0.00% covered (danger)
0.00%
0 / 4
0.00% covered (danger)
0.00%
0 / 1
6
 fastDecompose
92.50% covered (success)
92.50%
37 / 40
0.00% covered (danger)
0.00%
0 / 1
11.05
 fastCombiningSort
94.12% covered (success)
94.12%
32 / 34
0.00% covered (danger)
0.00%
0 / 1
10.02
 fastCompose
93.18% covered (success)
93.18%
82 / 88
0.00% covered (danger)
0.00%
0 / 1
28.25
 placebo
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
6
 replaceForNativeNormalize
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2declare( strict_types = 1 );
3
4/**
5 * Unicode normalization routines
6 *
7 * Copyright © 2004 Brooke Vibber <bvibber@pobox.com>
8 * https://www.mediawiki.org/
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License along
21 * with this program; if not, write to the Free Software Foundation, Inc.,
22 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 * http://www.gnu.org/copyleft/gpl.html
24 *
25 * @file
26 */
27namespace UtfNormal;
28
29use Normalizer;
30
31/**
32 * @defgroup UtfNormal UtfNormal
33 */
34
# True when the intl extension's normalizer_normalize() function exists.
# Validator::cleanUp() and the toNF*() methods check this flag and, when it is
# set, delegate to the native normalizer instead of the pure-PHP routines.
35define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
36
37/**
38 * Unicode normalization routines for working with UTF-8 strings.
39 * Currently, it assumes that input strings are valid UTF-8!
40 *
41 * Not as fast as I'd like, but should be usable for most purposes.
42 * UtfNormal\Validator::toNFC() will bail early if given ASCII text or text
43 * it can quickly determine is already normalized.
44 *
45 * All functions can be called static.
46 *
47 * See description of forms at http://www.unicode.org/reports/tr15/
48 *
49 * @ingroup UtfNormal
50 */
51class Validator {
52
53    /**
       * Lookup table keyed by a UTF-8 encoded character. The value is that
       * character's combining class: fastCombiningSort() uses it as the sort
       * key, fastCompose() compares it with the class of the previous mark.
       * loadData() treats this property being unset as "data not loaded yet".
       *
54     * @var array
55     */
56    public static $utfCombiningClass;
57
58    /**
       * Lookup table keyed by the concatenation of a pending starter and the
       * character that follows it; fastCompose() replaces the starter with
       * the mapped value when the key is present.
       *
59     * @var array
60     */
61    public static $utfCanonicalComp;
62
63    /**
       * Decomposition map (UTF-8 encoded character => replacement string)
       * that NFD() passes to fastDecompose().
       *
64     * @var array
65     */
66    public static $utfCanonicalDecomp;
67
68    /**
69     * Load compatibility decompositions on demand if they are needed.
70     *
       * Decomposition map that NFKD() passes to fastDecompose(). NFKD()
       * requires UtfNormalDataK.inc the first time it finds this unset.
       * NOTE(review): presumably that file assigns this property - confirm.
       *
71     * @var array
72     */
73    public static $utfCompatibilityDecomp;
74
75    /**
       * Lookup table keyed by a UTF-8 encoded character. Only the presence of
       * a key is tested here: quickIsNFC() and quickIsNFCVerify() treat a
       * listed character as "NO or MAYBE" and give up on the quick answer.
       *
76     * @var array|null
77     */
78    public static $utfCheckNFC;
79
80    /**
       * PCRE pattern used by prependIsolatedCombining() as the first argument
       * of preg_replace().
       * NOTE(review): the pattern is not defined in this file; presumably it
       * comes from UtfNormalData.inc - confirm.
       *
81     * @var string|null
82     */
83    public static $utfIsolatedCombiningRegex;
84
85    /**
86     * The ultimate convenience function! Clean up invalid UTF-8 sequences,
87     * and convert to normal form C, canonical composition, then clean up
88     * isolated combining characters.
89     *
90     * Fast return for pure ASCII strings; some lesser optimizations for
91     * strings containing only known-good characters. Not as fast as toNFC().
92     *
       * When NORMALIZE_INTL is true the work is done by normalizer_normalize();
       * quickIsNFCVerify() is then only used to repair input that the native
       * normalizer rejected. Otherwise quickIsNFCVerify() and NFC() do
       * everything in PHP.
       *
93     * @param string $string a UTF-8 string
94     * @return string a clean, shiny, normalized UTF-8 string
95     */
96    public static function cleanUp( $string ) {
97        if ( NORMALIZE_INTL ) {
              # Nothing to do unless the string holds a control character other
              # than tab, LF or CR, or at least one non-ASCII byte.
98            if ( !preg_match( '/[\x00-\x08\x0b\x0c\x0e-\x1f\x80-\xff]/', $string ) ) {
99                return $string;
100            }
               # Swap out the control characters and U+FFFE / U+FFFF up front.
101            $string = self::replaceForNativeNormalize( $string );
102            $norm = normalizer_normalize( $string, Normalizer::FORM_C );
103            if ( $norm === false ) {
104                # normalizer_normalize will return false if invalid utf8 string.
105                # quickIsNFCVerify cleans up invalid sequences.
                   # It takes $string by reference, so $string is repaired in place.
106                if ( self::quickIsNFCVerify( $string ) ) {
107                    # if that's true, the string is actually already normal.
108                    # (and doesn't have any combining characters, so we can
109                    # skip looking for isolated combining characters)
110                    return $string;
111                } else {
112                    # Now we are valid but non-normal
113                    $norm = normalizer_normalize( $string, Normalizer::FORM_C );
114                }
115            }
               # Final pass for isolated combining characters.
116            $norm = self::prependIsolatedCombining( $norm );
117            return $norm;
118        } elseif ( self::quickIsNFCVerify( $string ) ) {
119            # Side effect -- $string has had UTF-8 errors cleaned up.
120            return $string;
121        } else {
               # $string was still repaired by the call above; it only needs normalizing.
122            return self::prependIsolatedCombining( self::NFC( $string ) );
123        }
124    }
125
126    public static function prependIsolatedCombining( string $string ): string {
           # Replaces every match of self::$utfIsolatedCombiningRegex in $string
           # with U+25CC (dotted circle) and returns the result.
           # NOTE(review): the pattern is not defined in this file, so what counts
           # as "isolated" - and whether the match is zero-width, as the method
           # name suggests - has to be confirmed against the data file.
127        self::loadData();
128        return preg_replace( self::$utfIsolatedCombiningRegex, "\u{25CC}", $string );
129    }
130
131    /**
132     * Convert a UTF-8 string to normal form C, canonical composition.
133     * Fast return for pure ASCII strings; some lesser optimizations for
134     * strings containing only known-good characters.
135     *
        * The fast returns only apply to the pure-PHP fallback. When
        * NORMALIZE_INTL is true the call goes straight to
        * normalizer_normalize() and its result is returned unchanged, which
        * is false if the native normalizer rejects the input.
        *
136     * @param string $string a valid UTF-8 string. Input is not validated.
137     * @return string|false a UTF-8 string in normal form C; false only when normalizer_normalize() fails
138     */
139    public static function toNFC( $string ) {
140        if ( NORMALIZE_INTL ) {
141            return normalizer_normalize( $string, Normalizer::FORM_C );
142        } elseif ( self::quickIsNFC( $string ) ) {
               # Definitely normalized already; skip the decompose/compose round trip.
143            return $string;
144        } else {
145            return self::NFC( $string );
146        }
147    }
148
149    /**
150     * Convert a UTF-8 string to normal form D, canonical decomposition.
151     * Fast return for pure ASCII strings.
152     *
        * The ASCII fast return only applies to the pure-PHP fallback. When
        * NORMALIZE_INTL is true the call goes straight to
        * normalizer_normalize() and its result is returned unchanged, which
        * is false if the native normalizer rejects the input.
        *
153     * @param string $string A valid UTF-8 string. Input is not validated.
154     * @return string|false A UTF-8 string in normal form D; false only when normalizer_normalize() fails
155     */
156    public static function toNFD( $string ) {
157        if ( NORMALIZE_INTL ) {
158            return normalizer_normalize( $string, Normalizer::FORM_D );
159        } elseif ( preg_match( '/[\x80-\xff]/', $string ) ) {
               # At least one non-ASCII byte: run the full decomposition.
160            return self::NFD( $string );
161        } else {
162            return $string;
163        }
164    }
165
166    /**
167     * Convert a UTF-8 string to normal form KC, compatibility composition.
168     * This may cause irreversible information loss, use judiciously.
169     * Fast return for pure ASCII strings.
170     *
        * The ASCII fast return only applies to the pure-PHP fallback. When
        * NORMALIZE_INTL is true the call goes straight to
        * normalizer_normalize() and its result is returned unchanged, which
        * is false if the native normalizer rejects the input.
        *
171     * @param string $string A valid UTF-8 string. Input is not validated.
172     * @return string|false A UTF-8 string in normal form KC; false only when normalizer_normalize() fails
173     */
174    public static function toNFKC( $string ) {
175        if ( NORMALIZE_INTL ) {
176            return normalizer_normalize( $string, Normalizer::FORM_KC );
177        } elseif ( preg_match( '/[\x80-\xff]/', $string ) ) {
               # At least one non-ASCII byte: decompose, then recompose.
178            return self::NFKC( $string );
179        } else {
180            return $string;
181        }
182    }
183
184    /**
185     * Convert a UTF-8 string to normal form KD, compatibility decomposition.
186     * This may cause irreversible information loss, use judiciously.
187     * Fast return for pure ASCII strings.
188     *
        * The ASCII fast return only applies to the pure-PHP fallback. When
        * NORMALIZE_INTL is true the call goes straight to
        * normalizer_normalize() and its result is returned unchanged, which
        * is false if the native normalizer rejects the input.
        *
189     * @param string $string a valid UTF-8 string. Input is not validated.
190     * @return string|false a UTF-8 string in normal form KD; false only when normalizer_normalize() fails
191     */
192    public static function toNFKD( $string ) {
193        if ( NORMALIZE_INTL ) {
194            return normalizer_normalize( $string, Normalizer::FORM_KD );
195        } elseif ( preg_match( '/[\x80-\xff]/', $string ) ) {
               # At least one non-ASCII byte: run the full decomposition.
196            return self::NFKD( $string );
197        } else {
198            return $string;
199        }
200    }
201
202    /**
203     * Load the basic composition data if necessary
        *
        * "Necessary" means self::$utfCombiningClass is still unset; in that
        * case UtfNormalData.inc (next to this file) is required once.
        * NOTE(review): presumably that file fills the static lookup tables of
        * this class - confirm against the data file.
204     */
205    public static function loadData() {
206        // @phan-suppress-next-line MediaWikiNoIssetIfDefined
207        if ( !isset( self::$utfCombiningClass ) ) {
208            require_once __DIR__ . '/UtfNormalData.inc';
209        }
210    }
211
212    /**
213     * Returns true if the string is _definitely_ in NFC.
214     * Returns false if not or uncertain.
        *
        * The answer is false as soon as any character is listed in
        * self::$utfCheckNFC or self::$utfCombiningClass; no normalization is
        * attempted and the string is not modified.
        *
215     * @param string $string a valid UTF-8 string. Input is not validated.
216     * @return bool
217     */
218    public static function quickIsNFC( $string ) {
219        # ASCII is always valid NFC!
220        # If it's pure ASCII, let it through.
221        if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
222            return true;
223        }
224
225        self::loadData();
226
227        $len = strlen( $string );
228        for ( $i = 0; $i < $len; $i++ ) {
229            $c = $string[$i];
230            $n = ord( $c );
               # Collect the whole multi-byte sequence into $c. Its length is
               # taken from the head byte alone, since the input is trusted to
               # be valid UTF-8; $i is advanced past the tail bytes.
231            if ( $n < 0x80 ) {
232                continue;
233            } elseif ( $n >= 0xf0 ) {
234                $c = substr( $string, $i, 4 );
235                $i += 3;
236            } elseif ( $n >= 0xe0 ) {
237                $c = substr( $string, $i, 3 );
238                $i += 2;
239            } elseif ( $n >= 0xc0 ) {
240                $c = substr( $string, $i, 2 );
241                $i++;
242            }
243            if ( isset( self::$utfCheckNFC[$c] ) ) {
244                # If it's NO or MAYBE, bail and do the slow check.
245                return false;
246            }
247            if ( isset( self::$utfCombiningClass[$c] ) ) {
248                # Combining character? We might have to do sorting, at least.
249                return false;
250            }
251        }
252
253        return true;
254    }
255
256    /**
257     * Returns true if the string is _definitely_ in NFC.
258     * Returns false if not or uncertain.
        *
        * Unlike quickIsNFC() this also validates the input and repairs it in
        * place: the control characters matched by the first regex, and every
        * byte sequence judged invalid below, are replaced with
        * Constants::UTF8_REPLACEMENT. The repair happens whichever value is
        * returned.
        *
259     * @param string &$string A UTF-8 string, altered on output to be valid UTF-8 safe for XML.
260     * @return bool
261     */
262    public static function quickIsNFCVerify( &$string ) {
263        # Screen out some characters that eg won't be allowed in XML
264        $string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', Constants::UTF8_REPLACEMENT, $string );
265
266        # ASCII is always valid NFC!
267        # If we're only ever given plain ASCII, we can avoid the overhead
268        # of initializing the decomposition tables by skipping out early.
269        if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
270            return true;
271        }
272
           # The lookup tables live in static locals, so they are only built
           # on the first call that gets this far.
273        static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
274        if ( $checkit === null ) {
275            # Load/build some scary lookup tables...
276            self::loadData();
277
               # One table for "listed in utfCheckNFC or is a combining character".
278            $utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );
279
280            # Head bytes for sequences which we should do further validity checks
281            $checkit = array_flip( array_map( 'chr',
282                [ 0xc0, 0xc1, 0xe0, 0xed, 0xef,
283                    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
284                    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ] ) );
285
286            # Each UTF-8 head byte is followed by a certain
287            # number of tail bytes.
               # Bytes below 0xc0 and the bytes 0xfe / 0xff get 0, i.e. they
               # are not treated as head bytes by the scan below.
288            $tailBytes = [];
289            for ( $n = 0; $n < 256; $n++ ) {
290                if ( $n < 0xc0 ) {
291                    $remaining = 0;
292                } elseif ( $n < 0xe0 ) {
293                    $remaining = 1;
294                } elseif ( $n < 0xf0 ) {
295                    $remaining = 2;
296                } elseif ( $n < 0xf8 ) {
297                    $remaining = 3;
298                } elseif ( $n < 0xfc ) {
299                    $remaining = 4;
300                } elseif ( $n < 0xfe ) {
301                    $remaining = 5;
302                } else {
303                    $remaining = 0;
304                }
305                $tailBytes[chr( $n )] = $remaining;
306            }
307        }
308
309        # Chop the text into pure-ASCII and non-ASCII areas;
310        # large ASCII parts can be handled much more quickly.
311        # Don't chop up Unicode areas for punctuation, though,
312        # that wastes energy.
313        $matches = [];
314        preg_match_all(
315            '/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
316            $string, $matches );
317
318        $looksNormal = true;
           # $base is the byte offset of the current chunk within $string.
319        $base = 0;
           # Pending repairs, each [ replacement, byte offset, byte length ].
           # They are collected in ascending offset order and applied in one
           # pass after the scan.
320        $replace = [];
321        foreach ( $matches[1] as $str ) {
322            $chunk = strlen( $str );
323
324            if ( $str[0] < "\x80" ) {
325                # ASCII chunk: guaranteed to be valid UTF-8
326                # and in normal form C, so skip over it.
327                $base += $chunk;
328                continue;
329            }
330
331            # We'll have to examine the chunk byte by byte to ensure
332            # that it consists of valid UTF-8 sequences, and to see
333            # if any of them might not be normalized.
334
335            # Since PHP is not the fastest language on earth, some of
336            # this code is a little ugly with inner loop optimizations.
337
338            $head = '';
339            # Counting down is faster. I'm *so* sorry.
               # $len starts one above the number of bytes left, so the
               # pre-decrement in the loop condition reaches zero exactly when
               # the chunk is used up. $i is the index of the current byte.
340            $len = $chunk + 1;
341
342            for ( $i = -1; --$len; ) {
343                $remaining = $tailBytes[$c = $str[++$i]];
344                if ( $remaining ) {
345                    # UTF-8 head byte!
346                    $sequence = $head = $c;
347                    do {
348                        # Look for the defined number of tail bytes...
349                        if ( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
350                            # Legal tail bytes are nice.
351                            $sequence .= $c;
352                        } elseif ( $len === 0 ) {
353                            # Premature end of string!
354                            # Drop a replacement character into output to
355                            # represent the invalid UTF-8 sequence.
                               # ($i was not advanced here, so it still points
                               # at the last byte of the partial sequence.)
356                            $replace[] = [ Constants::UTF8_REPLACEMENT,
357                                $base + $i + 1 - strlen( $sequence ),
358                                strlen( $sequence ) ];
359                            break 2;
360                        } else {
361                            # Illegal tail byte; abandon the sequence.
                               # ($i already points at the offending byte.)
362                            $replace[] = [ Constants::UTF8_REPLACEMENT,
363                                $base + $i - strlen( $sequence ),
364                                strlen( $sequence ) ];
365                            # Back up and reprocess this byte; it may itself
366                            # be a legal ASCII or UTF-8 sequence head.
367                            --$i;
368                            ++$len;
369                            continue 2;
370                        }
371                    } while ( --$remaining );
372
373                    if ( isset( $checkit[$head] ) ) {
374                        # Do some more detailed validity checks, for
375                        # invalid characters and illegal sequences.
376                        if ( $head == "\xed" ) {
377                            # 0xed is relatively frequent in Korean, which
378                            # abuts the surrogate area, so we're doing
379                            # this check separately to speed things up.
380
381                            if ( $sequence >= Constants::UTF8_SURROGATE_FIRST ) {
382                                # Surrogates are legal only in UTF-16 code.
383                                # They are totally forbidden here in UTF-8
384                                # utopia.
385                                $replace[] = [ Constants::UTF8_REPLACEMENT,
386                                    $base + $i + 1 - strlen( $sequence ),
387                                    strlen( $sequence ) ];
388                                $head = '';
389                                continue;
390                            }
391                        } else {
392                            # Slower, but rarer checks...
393                            $n = ord( $head );
394                            if (
395                                # "Overlong sequences" are those that are syntactically
396                                # correct but use more UTF-8 bytes than are necessary to
397                                # encode a character. Naïve string comparisons can be
398                                # tricked into failing to see a match for an ASCII
399                                # character, for instance, which can be a security hole
400                                # if lists of excluded characters are being used.
401                                ( $n < 0xc2 && $sequence <= Constants::UTF8_OVERLONG_A )
402                                || ( $n == 0xe0 && $sequence <= Constants::UTF8_OVERLONG_B )
403                                || ( $n == 0xf0 && $sequence <= Constants::UTF8_OVERLONG_C )
404
405                                # U+FFFE and U+FFFF are explicitly forbidden in Unicode.
406                                || ( $n == 0xef &&
407                                    ( $sequence == Constants::UTF8_FFFE
408                                    || $sequence == Constants::UTF8_FFFF ) )
409
410                                # Unicode has been limited to 21 bits; longer
411                                # sequences are not allowed.
412                                || ( $n >= 0xf0 && $sequence > Constants::UTF8_MAX )
413                            ) {
414                                $replace[] = [ Constants::UTF8_REPLACEMENT,
415                                    $base + $i + 1 - strlen( $sequence ),
416                                    strlen( $sequence ) ];
417                                $head = '';
418                                continue;
419                            }
420                        }
421                    }
422
423                    if ( isset( $utfCheckOrCombining[$sequence] ) ) {
424                        # If it's NO or MAYBE, we'll have to rip
425                        # the string apart and put it back together.
426                        # That's going to be mighty slow.
                           # Keep scanning anyway: the rest of the string
                           # still has to be validated.
427                        $looksNormal = false;
428                    }
429
430                    # The sequence is legal!
431                    $head = '';
432                } elseif ( $c < "\x80" ) {
433                    # ASCII byte.
434                    $head = '';
435                } elseif ( $c < "\xc0" ) {
436                    # Illegal tail bytes
437                    if ( $head == '' ) {
438                        # Out of the blue!
439                        $replace[] = [ Constants::UTF8_REPLACEMENT, $base + $i, 1 ];
440                    } else {
441                        # Don't add if we're continuing a broken sequence;
442                        # we already put a replacement character when we looked
443                        # at the broken sequence.
444                        $replace[] = [ '', $base + $i, 1 ];
445                    }
446                } else {
447                    # Miscellaneous freaks.
448                    $replace[] = [ Constants::UTF8_REPLACEMENT, $base + $i, 1 ];
449                    $head = '';
450                }
451            }
452            $base += $chunk;
453        }
454        if ( count( $replace ) ) {
455            # There were illegal UTF-8 sequences we need to fix up.
               # Rebuild the string: copy each untouched span between repairs,
               # then the replacement text, then skip the bytes it replaces.
456            $out = '';
457            $last = 0;
458            foreach ( $replace as $rep ) {
459                [ $replacement, $start, $length ] = $rep;
460                if ( $last < $start ) {
461                    $out .= substr( $string, $last, $start - $last );
462                }
463                $out .= $replacement;
464                $last = $start + $length;
465            }
466            if ( $last < strlen( $string ) ) {
467                $out .= substr( $string, $last );
468            }
469            $string = $out;
470        }
471
472        return $looksNormal;
473    }
474
475    # These take a string and run the normalization on them, without
476    # checking for validity or any optimization etc. Input must be
477    # VALID UTF-8!
478
479    /**
        * Unconditional conversion to normal form C: canonical decomposition
        * via NFD(), then recomposition via fastCompose(). No validity check
        * and no fast path.
        *
480     * @param string $string
481     * @return string
482     */
483    public static function NFC( $string ) {
484        return self::fastCompose( self::NFD( $string ) );
485    }
486
487    /**
        * Unconditional conversion to normal form D: fastDecompose() with the
        * canonical decomposition map, followed by fastCombiningSort(). No
        * validity check and no fast path.
        *
488     * @param string $string
489     * @return string
490     */
491    public static function NFD( $string ) {
           # Make sure self::$utfCanonicalDecomp is available before it is read.
492        self::loadData();
493
494        return self::fastCombiningSort(
495            self::fastDecompose( $string, self::$utfCanonicalDecomp )
496        );
497    }
498
499    /**
        * Unconditional conversion to normal form KC: compatibility
        * decomposition via NFKD(), then recomposition via fastCompose(). No
        * validity check and no fast path.
        *
500     * @param string $string
501     * @return string
502     */
503    public static function NFKC( $string ) {
504        return self::fastCompose( self::NFKD( $string ) );
505    }
506
507    /**
        * Unconditional conversion to normal form KD: fastDecompose() with the
        * compatibility decomposition map, followed by fastCombiningSort(). No
        * validity check and no fast path.
        *
        * The compatibility data file is required on first use, when
        * self::$utfCompatibilityDecomp is still unset.
        *
508     * @param string $string
509     * @return string
510     */
511    public static function NFKD( $string ) {
512        // @phan-suppress-next-line MediaWikiNoIssetIfDefined
513        if ( !isset( self::$utfCompatibilityDecomp ) ) {
514            require_once __DIR__ . '/UtfNormalDataK.inc';
515        }
516
517        return self::fastCombiningSort(
518            self::fastDecompose( $string, self::$utfCompatibilityDecomp ) );
519    }
520
521    /**
522     * Perform decomposition of a UTF-8 string into either D or KD form
523     * (depending on which decomposition map is passed to us).
524     * Input is assumed to be *valid* UTF-8. Invalid code will break.
        *
        * Characters found in $map are replaced by the mapped string, Hangul
        * syllables are decomposed arithmetically, and everything else is
        * copied through unchanged.
        *
525     * @param string $string valid UTF-8 string
526     * @param array $map hash of expanded decomposition map
527     * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
528     */
529    public static function fastDecompose( $string, $map ) {
530        self::loadData();
531
532        $len = strlen( $string );
533        $out = '';
534        for ( $i = 0; $i < $len; $i++ ) {
535            $c = $string[$i];
536            $n = ord( $c );
               # Collect the whole multi-byte sequence into $c; the length is
               # taken from the head byte and $i is advanced past the tail.
537            if ( $n < 0x80 ) {
538                # ASCII chars never decompose
539                # THEY ARE IMMORTAL
540                $out .= $c;
541                continue;
542            } elseif ( $n >= 0xf0 ) {
543                $c = substr( $string, $i, 4 );
544                $i += 3;
545            } elseif ( $n >= 0xe0 ) {
546                $c = substr( $string, $i, 3 );
547                $i += 2;
548            } elseif ( $n >= 0xc0 ) {
549                $c = substr( $string, $i, 2 );
550                $i++;
551            }
552            if ( isset( $map[$c] ) ) {
553                $out .= $map[$c];
554                continue;
555            } else {
556                if ( $c >= Constants::UTF8_HANGUL_FIRST && $c <= Constants::UTF8_HANGUL_LAST ) {
557                    # Decompose a hangul syllable into jamo;
558                    # hardcoded for three-byte UTF-8 sequence.
559                    # A lookup table would be slightly faster,
560                    # but adds a lot of memory & disk needs.
                       # $index is the code point decoded from the three bytes,
                       # relative to the first Hangul syllable.
561                    $index = ( ( ord( $c[0] ) & 0x0f ) << 12
562                            | ( ord( $c[1] ) & 0x3f ) << 6
563                            | ( ord( $c[2] ) & 0x3f ) )
564                        - Constants::UNICODE_HANGUL_FIRST;
                       # Split the index into leading ($l), vowel ($v) and
                       # trailing ($t) jamo indexes.
565                    $l = intval( $index / Constants::UNICODE_HANGUL_NCOUNT );
566                    $v = intval(
567                        ( $index % Constants::UNICODE_HANGUL_NCOUNT )
568                        / Constants::UNICODE_HANGUL_TCOUNT
569                    );
570                    $t = $index % Constants::UNICODE_HANGUL_TCOUNT;
571                    $out .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
                       # A trailing jamo is only emitted when $t is non-zero.
                       # From index 25 on, 0xa7 + $t would run past 0xbf, so
                       # the encoding moves on to the next middle byte (0x87).
572                    if ( $t >= 25 ) {
573                        $out .= "\xe1\x87" . chr( 0x80 + $t - 25 );
574                    } elseif ( $t ) {
575                        $out .= "\xe1\x86" . chr( 0xa7 + $t );
576                    }
577                    continue;
578                }
579            }
580            $out .= $c;
581        }
582
583        return $out;
584    }
585
586    /**
587     * Sorts combining characters into canonical order. This is the
588     * final step in creating decomposed normal forms D and KD.
        *
        * Each run of consecutive combining characters is bucketed by
        * combining class, and the buckets are written out in ascending class
        * order. Characters that share a class keep their original relative
        * order.
        *
589     * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
590     * @return string a UTF-8 string with combining characters sorted in canonical order
591     */
592    public static function fastCombiningSort( $string ) {
593        self::loadData();
594
595        $len = strlen( $string );
596        $out = '';
           # Buckets for the current run: combining class => concatenated characters.
597        $combiners = [];
           # -1 means nothing has been seen yet. It is truthy, so the first
           # flush below simply runs on an empty bucket list.
598        $lastClass = -1;
599        for ( $i = 0; $i < $len; $i++ ) {
600            $c = $string[$i];
601            $n = ord( $c );
602            if ( $n >= 0x80 ) {
                   # Collect the whole multi-byte sequence into $c.
603                if ( $n >= 0xf0 ) {
604                    $c = substr( $string, $i, 4 );
605                    $i += 3;
606                } elseif ( $n >= 0xe0 ) {
607                    $c = substr( $string, $i, 3 );
608                    $i += 2;
609                } elseif ( $n >= 0xc0 ) {
610                    $c = substr( $string, $i, 2 );
611                    $i++;
612                }
613                if ( isset( self::$utfCombiningClass[$c] ) ) {
                       # Combining character: hold it back in its class bucket.
614                    $lastClass = self::$utfCombiningClass[$c];
615                    if ( isset( $combiners[$lastClass] ) ) {
616                        $combiners[$lastClass] .= $c;
617                    } else {
618                        $combiners[$lastClass] = $c;
619                    }
620                    continue;
621                }
622            }
               # Not a combining character: flush the held-back run first,
               # then write this character.
623            if ( $lastClass ) {
624                ksort( $combiners );
625                $out .= implode( '', $combiners );
626                $combiners = [];
627            }
628            $out .= $c;
629            $lastClass = 0;
630        }
           # Flush a run that reaches the end of the string.
631        if ( $lastClass ) {
632            ksort( $combiners );
633            $out .= implode( '', $combiners );
634        }
635
636        return $out;
637    }
638
639    /**
640     * Produces canonically composed sequences, i.e. normal form C or KC.
641     *
        * The scan keeps one pending starter ($startChar) plus the combining
        * characters that could not be merged into it ($combining). Both are
        * written to the output when the next starter arrives, and once more
        * at the end of the string.
        *
642     * @param string $string a valid UTF-8 string in sorted normal form D or KD.
643     *   Input is not validated.
644     * @return string a UTF-8 string with canonical precomposed characters used
645     *   where possible.
646     */
647    public static function fastCompose( $string ) {
648        self::loadData();
649
650        $len = strlen( $string );
651        $out = '';
652        $lastClass = -1;
           # Set to 1 right after a trailing jamo has been merged, so that a
           # second trailing jamo is not merged into the same syllable.
653        $lastHangul = 0;
654        $startChar = '';
655        $combining = '';
656
           # Head bytes of the first vowel jamo and the last trailing jamo; a
           # cheap one-byte range test before the full string comparisons.
657        $x1 = ord( Constants::UTF8_HANGUL_VBASE[0] );
658        $x2 = ord( Constants::UTF8_HANGUL_TEND[0] );
659        for ( $i = 0; $i < $len; $i++ ) {
660            $c = $string[$i];
661            $n = ord( $c );
662            if ( $n < 0x80 ) {
663                # No combining characters here...
                   # Flush what is pending; this ASCII byte becomes the new starter.
664                $out .= $startChar;
665                $out .= $combining;
666                $startChar = $c;
667                $combining = '';
668                $lastClass = 0;
669                continue;
670            } elseif ( $n >= 0xf0 ) {
671                $c = substr( $string, $i, 4 );
672                $i += 3;
673            } elseif ( $n >= 0xe0 ) {
674                $c = substr( $string, $i, 3 );
675                $i += 2;
676            } elseif ( $n >= 0xc0 ) {
677                $c = substr( $string, $i, 2 );
678                $i++;
679            }
               # Key for the composition table: pending starter + this character.
680            $pair = $startChar . $c;
681            if ( $n > 0x80 && isset( self::$utfCombiningClass[$c] ) ) {
682                # A combining char; see what we can do with it
683                $class = self::$utfCombiningClass[$c];
                   # Merge only if there is a starter, this class is higher
                   # than the previous one and non-zero, and the pair is in
                   # the composition table.
684                if ( $startChar !== '' &&
685                    $lastClass < $class &&
686                    $class > 0 &&
687                    isset( self::$utfCanonicalComp[$pair] )
688                ) {
689                    $startChar = self::$utfCanonicalComp[$pair];
690                    $class = 0;
691                } else {
692                    $combining .= $c;
693                }
694                $lastClass = $class;
695                $lastHangul = 0;
696                continue;
697            }
698            # New start char
               # It can still merge into the pending starter, but only when
               # nothing unmerged sits between them ($lastClass === 0).
699            if ( $lastClass === 0 ) {
700                if ( isset( self::$utfCanonicalComp[$pair] ) ) {
701                    $startChar = self::$utfCanonicalComp[$pair];
702                    $lastHangul = 0;
703                    continue;
704                }
705                if ( $n >= $x1 && $n <= $x2 ) {
706                    # WARNING: Hangul code is painfully slow.
707                    # I apologize for this ugly, ugly code; however
708                    # performance is even more teh suck if we call
709                    # out to nice clean functions. Lookup tables are
710                    # marginally faster, but require a lot of space.
                       # First case: leading jamo followed by vowel jamo.
711                    if ( $c >= Constants::UTF8_HANGUL_VBASE &&
712                        $c <= Constants::UTF8_HANGUL_VEND &&
713                        $startChar >= Constants::UTF8_HANGUL_LBASE &&
714                        $startChar <= Constants::UTF8_HANGUL_LEND
715                    ) {
716                        # $lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
717                        # $vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
718                        $lIndex = ord( $startChar[2] ) - 0x80;
719                        $vIndex = ord( $c[2] ) - 0xa1;
720
721                        $hangulPoint = Constants::UNICODE_HANGUL_FIRST +
722                            Constants::UNICODE_HANGUL_TCOUNT *
723                            ( Constants::UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex );
724
725                        # Hardcode the limited-range UTF-8 conversion:
726                        $startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
727                            chr( $hangulPoint >> 6 & 0x3f | 0x80 ) .
728                            chr( $hangulPoint & 0x3f | 0x80 );
729                        $lastHangul = 0;
730                        continue;
                       # Second case: syllable followed by a trailing jamo,
                       # unless a trailing jamo was merged just before.
731                    } elseif ( $c >= Constants::UTF8_HANGUL_TBASE &&
732                        $c <= Constants::UTF8_HANGUL_TEND &&
733                        $startChar >= Constants::UTF8_HANGUL_FIRST &&
734                        $startChar <= Constants::UTF8_HANGUL_LAST &&
735                        !$lastHangul
736                    ) {
737                        # $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
738                        $tIndex = ord( $c[2] ) - 0xa7;
                           # A negative result means the last byte is below
                           # 0xa7, so the index is recomputed from 0x80 with
                           # the 0x11c0 - 0x11a7 offset.
739                        if ( $tIndex < 0 ) {
740                            $tIndex = ord( $c[2] ) - 0x80 + ( 0x11c0 - 0x11a7 );
741                        }
742
743                        # Increment the code point by $tIndex, without
744                        # the function overhead of decoding and recoding UTF-8
                           # Carry by hand when a tail byte runs past 0xbf.
745                        $tail = ord( $startChar[2] ) + $tIndex;
746                        if ( $tail > 0xbf ) {
747                            $tail -= 0x40;
748                            $mid = ord( $startChar[1] ) + 1;
749                            if ( $mid > 0xbf ) {
750                                $startChar[0] = chr( ord( $startChar[0] ) + 1 );
751                                $mid -= 0x40;
752                            }
753                            $startChar[1] = chr( $mid );
754                        }
755                        $startChar[2] = chr( $tail );
756
757                        # If there's another jamo char after this, *don't* try to merge it.
758                        $lastHangul = 1;
759                        continue;
760                    }
761                }
762            }
               # Nothing merged: flush what is pending and make this
               # character the new starter.
763            $out .= $startChar;
764            $out .= $combining;
765            $startChar = $c;
766            $combining = '';
767            $lastClass = 0;
768            $lastHangul = 0;
769        }
           # Write out whatever is still pending at the end of the string.
770        $out .= $startChar . $combining;
771
772        return $out;
773    }
774
775    /**
776     * This is just used for the benchmark, comparing how long it takes to
777     * iterate through a string without really doing anything of substance.
        *
        * The string is copied byte by byte, so the result equals the input.
        *
778     * @param string $string
779     * @return string
780     */
781    public static function placebo( $string ) {
782        $len = strlen( $string );
783        $out = '';
784        for ( $i = 0; $i < $len; $i++ ) {
785            $out .= $string[$i];
786        }
787
788        return $out;
789    }
790
791    /**
792     * Function to replace some characters that we don't want
793     * but most of the native normalize functions keep.
794     *
        * Two groups are replaced with Constants::UTF8_REPLACEMENT: the bytes
        * 0x00-0x1f other than tab (0x09), LF (0x0a) and CR (0x0d), and the
        * sequences Constants::UTF8_FFFE and Constants::UTF8_FFFF.
        *
795     * @param string $string The string
796     * @return string String with the character codes replaced.
797     */
798    private static function replaceForNativeNormalize( $string ) {
           # Same control-character class that quickIsNFCVerify() screens out.
799        $string = preg_replace(
800            '/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
801            Constants::UTF8_REPLACEMENT,
802            $string
803        );
804        return str_replace( [ Constants::UTF8_FFFE, Constants::UTF8_FFFF ], Constants::UTF8_REPLACEMENT, $string );
805    }
806}