4 use UtfNormal\Validator;
// Command-line guard: stop immediately unless PHP is running under the
// 'cli' or 'phpdbg' SAPI.
6 if ( PHP_SAPI !==
'cli' && PHP_SAPI !==
'phpdbg' ) {
7 die(
"This script may only be executed from the command line.\n" );
// Locate the UtfNormal sources. When at least one command-line argument is
// present (count( $argv ) > 1), the user-supplied directory is validated;
// the script aborts if that directory does not exist.
// NOTE(review): $utfnormalDir is presumably derived from $argv[1] - confirm.
11 if ( count( $argv ) > 1 ) {
14 die(
"The specified UtfNormal directory '$utfnormalDir' does not exist\n" );
// Two layouts are recognised: Validator.php directly inside the directory,
// or inside its "src" subdirectory.
16 if ( file_exists(
"$utfnormalDir/Validator.php" ) ) {
18 } elseif ( file_exists(
"$utfnormalDir/src/Validator.php" ) ) {
24 "Warning: Supplied path \"%s\" does not seem to contain UtfNormal. Trying it anyway.\n",
// Candidate locations relative to this script's directory: a "core" checkout
// seven levels up, or a "vendor" directory seven levels up.
31 __DIR__ .
'/../../../../../../../core/vendor/wikimedia/utfnormal/src',
33 __DIR__ .
'/../../../../../../../vendor/wikimedia/utfnormal/src',
// If the MW_INSTALL_PATH environment variable is set, its vendor directory is
// put at the front of $trydirs so it is tried first.
35 if ( getenv(
'MW_INSTALL_PATH' ) ) {
36 array_unshift( $trydirs, getenv(
'MW_INSTALL_PATH' ) .
'/vendor/wikimedia/utfnormal/src' );
// Probe each candidate: realpath() must succeed, the result must be a
// directory, and it must contain UtfNormalData.inc.
38 foreach ( $trydirs as $trydir ) {
39 $trydir = realpath( $trydir );
40 if ( $trydir !==
false && is_dir( $trydir ) && file_exists(
"$trydir/UtfNormalData.inc" ) ) {
// Abort with a hint to pass the path explicitly when the search fails.
46 die(
"Cannot find UtfNormal. Please specify the path explicitly.\n" );
// Load the validator class and both data files from the chosen directory.
// require_once aborts the script if any of the three files is missing.
60 echo
"Loading UtfNormal from $utfnormalDir...\n";
61 require_once
"$utfnormalDir/Validator.php";
62 require_once
"$utfnormalDir/UtfNormalData.inc";
63 require_once
"$utfnormalDir/UtfNormalDataK.inc";
// Sanity check: each of the five static tables read by this script must be
// non-empty, otherwise the script stops.
69 if ( !Validator::$utfCheckNFC ||
70 !Validator::$utfCombiningClass ||
71 !Validator::$utfCanonicalDecomp ||
72 !Validator::$utfCanonicalComp ||
73 !Validator::$utfCompatibilityDecomp
75 die(
"UtfNormal data files did not contain needed data.\n" );
/**
 * Convert a UTF-8 string into its Unicode code point(s).
 *
 * The string is re-encoded as UTF-32BE, where every code point occupies one
 * big-endian 32-bit word, and those words are read back with unpack( 'N*' ).
 *
 * @param string $c UTF-8 encoded text
 * @param bool $firstOnly True to return just the first code point
 * @return int|int[] The first code point when $firstOnly is true; otherwise
 *  the array produced by unpack(), whose keys start at 1 (not 0)
 * @throws InvalidArgumentException When $firstOnly is true but $c yields no
 *  code point (e.g. an empty string)
 */
function uord( $c, $firstOnly ) {
	$codepoints = unpack( 'N*', mb_convert_encoding( $c, 'UTF-32BE', 'UTF-8' ) );

	if ( !$firstOnly ) {
		// Callers index this array from 1, so it is returned untouched.
		return $codepoints;
	}

	// Without this guard an empty input would read an undefined offset and
	// the resulting null would be formatted as code point 0 in the output.
	if ( !isset( $codepoints[1] ) ) {
		throw new InvalidArgumentException( 'uord() needs at least one character' );
	}

	return $codepoints[1];
}
// Create (or truncate) normalization-data.lua next to this script; $X is the
// write handle used by every fprintf() that follows.
84 echo
"Creating normalization table...\n";
85 $X = fopen( __DIR__ .
'/normalization-data.lua',
'w' );
87 die(
"Failed to open normalization-data.lua\n" );
// File header, then the opening of the Lua table "normal" and of its first
// sub-table, "check".
89 fprintf(
$X,
"-- This file is automatically generated by make-normalization-table.php\n" );
90 fprintf(
$X,
"local normal = {\n" );
91 fprintf(
$X,
"\t-- Characters that might change depending on the following combiner\n" );
92 fprintf(
$X,
"\t-- (minus any that are themselves combiners, those are added later)\n" );
93 fprintf(
$X,
"\tcheck = {\n" );
// One "[0xNNNNNN] = 1" entry per key of $utfCheckNFC. Keys that also appear
// in $utfCombiningClass are tested for first; per the header text written
// above, combiners are left out of this table.
94 foreach ( Validator::$utfCheckNFC as $k => $v ) {
95 if ( isset( Validator::$utfCombiningClass[$k] ) ) {
99 fprintf(
$X,
"\t\t[0x%06x] = 1,\n",
uord( $k,
true ) );
// Close the "check" sub-table.
101 fprintf(
$X,
"\t},\n\n" );
// Emit the "combclass" sub-table: each key of $utfCombiningClass is converted
// to its code point and written as "[0xNNNNNN] = <class>", with the class
// value formatted as a decimal integer.
102 fprintf(
$X,
"\t-- Combining characters, mapped to combining class\n" );
103 fprintf(
$X,
"\tcombclass = {\n" );
105 foreach ( Validator::$utfCombiningClass as $k => $v ) {
106 $cp =
uord( $k,
true );
108 fprintf(
$X,
"\t\t[0x%06x] = %d,\n", $cp, $v );
// Close the "combclass" sub-table.
110 fprintf(
$X,
"\t},\n\n" );
// Emit the "decomp" sub-table from $utfCanonicalDecomp. Each entry maps the
// key's code point to a Lua list holding the code points of the value.
111 fprintf(
$X,
"\t-- Characters mapped to what they decompose to\n" );
112 fprintf(
$X,
"\t-- Note Hangul to Jamo is done separately below\n" );
113 fprintf(
$X,
"\tdecomp = {\n" );
114 foreach ( Validator::$utfCanonicalDecomp as $k => $v ) {
// Open the entry: "[0xNNNNNN] = { ".
115 fprintf(
$X,
"\t\t[0x%06x] = { ",
uord( $k,
true ) );
// Write every code point of the decomposition; $fmt is the per-element
// format string.
117 foreach (
uord( $v,
false ) as $c ) {
118 fprintf(
$X, $fmt, $c );
// Close the entry.
121 fprintf(
$X,
" },\n" );
// Close the "decomp" sub-table.
123 fprintf(
$X,
"\t},\n\n" );
// Emit the "decompK" sub-table from $utfCompatibilityDecomp, using the same
// entry layout as "decomp".
125 fprintf(
$X,
"\tdecompK = {\n" );
126 foreach ( Validator::$utfCompatibilityDecomp as $k => $v ) {
// Detect keys whose compatibility decomposition is byte-for-byte identical to
// their canonical decomposition.
127 if ( isset( Validator::$utfCanonicalDecomp[$k] ) && Validator::$utfCanonicalDecomp[$k] === $v ) {
// Open the entry: "[0xNNNNNN] = { ".
131 fprintf(
$X,
"\t\t[0x%06x] = { ",
uord( $k,
true ) );
// Write every code point of the decomposition; $fmt is the per-element
// format string.
133 foreach (
uord( $v,
false ) as $c ) {
134 fprintf(
$X, $fmt, $c );
// Close the entry.
137 fprintf(
$X,
" },\n" );
// Close the "decompK" sub-table.
139 fprintf(
$X,
"\t},\n\n" );
// Composition data. First the explanatory Lua comments are written, then
// $utfCanonicalComp is regrouped into the nested array $t before any table
// rows are emitted.
141 fprintf(
$X,
"\t-- Character-pairs mapped to what they compose to\n" );
142 fprintf(
$X,
"\t-- Note Jamo to Hangul is done separately below\n" );
144 foreach ( Validator::$utfCanonicalComp as $k => $v ) {
// Replace the key string with its list of code points (keys start at 1).
145 $k =
uord( $k,
false );
// Keys made of a single code point are tested for separately from pairs.
146 if ( count( $k ) == 1 ) {
// The first code point of the key is looked up in $comb.
150 if ( isset(
$comb[$k[1]] ) ) {
// Index the composed code point by first, then second, code point of the pair.
154 $t[$k[1]][$k[2]] =
uord( $v,
true );
// Emit the "comp" sub-table: one nested Lua table per first code point.
156 fprintf(
$X,
"\tcomp = {\n" );
158 foreach (
$t as $k1 => $v1 ) {
159 fprintf(
$X,
"\t\t[0x%06x] = {\n", $k1 );
// Inner rows come in two forms: a literal "[-1]" key carrying only the value,
// or "[second code point] = composed code point".
161 foreach ( $v1 as $k2 => $v2 ) {
163 fprintf(
$X,
"\t\t\t[-1] = 0x%06x,\n", $v2 );
165 fprintf(
$X,
"\t\t\t[0x%06x] = 0x%06x,\n", $k2, $v2 );
// Close the nested table for this first code point.
168 fprintf(
$X,
"\t\t},\n" );
// Close the "comp" sub-table, then the enclosing "normal" table.
170 fprintf(
$X,
"\t},\n" );
172 fprintf(
$X,
"}\n" );
174 fprintf(
$X,
"\n%s\n", <<<LUA
175 -- All combining characters need to be checked, so just
do that
176 setmetatable( normal.check, { __index = normal.combclass } )
178 -- Handle Hangul to Jamo decomposition
179 setmetatable( normal.decomp, { __index = function ( _, k )
180 if k >= 0xac00 and k <= 0xd7a3 then
181 -- Decompose a Hangul syllable into Jamo
184 0x1100 + math.floor( k / 588 ),
185 0x1161 + math.floor( ( k % 588 ) / 28 )
188 ret[3] = 0x11a7 + ( k % 28 )
195 -- Handle Jamo to Hangul composition
196 local jamo_l_v_mt = { __index =
function ( t, k )
197 if k >= 0x1161 and k <= 0x1175 then
198 -- Jamo leading + Jamo vowel
199 return t.base + 28 * ( k - 0x1161 )
203 local hangul_jamo_mt = { __index =
function ( t, k )
204 if k >= 0x11a7 and k <= 0x11c2 then
205 -- Hangul + jamo
final
206 return t.base + k - 0x11a7
210 setmetatable( normal.comp, { __index = function ( t, k )
211 if k >= 0x1100 and k <= 0x1112 then
212 -- Jamo leading, return a second table that combines with a Jamo vowel
213 local t2 = { base = 0xac00 + 588 * ( k - 0x1100 ) }
214 setmetatable( t2, jamo_l_v_mt )
215 t[k] = t2 -- cache it
217 elseif k >= 0xac00 and k <= 0xd7a3 and k % 28 == 16 then
218 -- Hangul.
"k % 28 == 16" picks out just the ones that are
219 -- Jamo leading + vowel, no
final. Return a second table that combines
220 -- with a Jamo
final.
221 local t2 = { base = k }
222 setmetatable( t2, hangul_jamo_mt )
223 t[k] = t2 -- cache it
229 -- Compatibility decomposition falls back to the normal decomposition
230 setmetatable( normal.decompK, { __index = normal.decomp } )