Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 112 |
|
0.00% |
0 / 1 |
CRAP | n/a |
0 / 0 |
|
uord | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 |
1 | #!/usr/bin/php |
2 | <?php |
3 | |
4 | use UtfNormal\Validator; |
5 | |
// Refuse to run under anything but the CLI or phpdbg SAPIs. die() is kept here on purpose:
// the STDERR constant used for the later error paths only exists in command-line SAPIs.
if ( PHP_SAPI !== 'cli' && PHP_SAPI !== 'phpdbg' ) {
	die( "This script may only be executed from the command line.\n" );
}

// Locate the directory holding Validator.php and the UtfNormal data files.
// Fatal conditions below are reported on STDERR with exit status 1; die( "string" ) would
// print to stdout and exit with status 0, which hides the failure from calling scripts.
$utfnormalDir = null;
if ( count( $argv ) > 1 ) {
	// An explicit path was supplied as the first command-line argument.
	$utfnormalDir = rtrim( $argv[1], '/' );
	if ( !is_dir( $utfnormalDir ) ) {
		// @phan-suppress-next-line SecurityCheck-XSS
		fwrite( STDERR, "The specified UtfNormal directory '$utfnormalDir' does not exist\n" );
		exit( 1 );
	}
	if ( !file_exists( "$utfnormalDir/Validator.php" ) ) {
		if ( file_exists( "$utfnormalDir/src/Validator.php" ) ) {
			// The caller pointed at the package root; add the 'src' dir
			$utfnormalDir = "$utfnormalDir/src";
		} else {
			// Not fatal: the require_once calls below get the final say.
			fprintf(
				STDERR,
				"Warning: Supplied path \"%s\" does not seem to contain UtfNormal. Trying it anyway.\n",
				$utfnormalDir
			);
		}
	}
} else {
	// No argument: probe a few conventional locations, first match wins.
	$trydirs = [
		// Checkouts of mediawiki/core and mediawiki/extensions in the same directory
		__DIR__ . '/../../../../../../../core/vendor/wikimedia/utfnormal/src',
		// Scribunto checked out inside the 'extensions' directory of mediawiki/core
		__DIR__ . '/../../../../../../../vendor/wikimedia/utfnormal/src',
	];
	$installPath = getenv( 'MW_INSTALL_PATH' );
	if ( $installPath ) {
		// An explicitly configured install path takes priority over the guesses above.
		array_unshift( $trydirs, $installPath . '/vendor/wikimedia/utfnormal/src' );
	}
	foreach ( $trydirs as $trydir ) {
		// realpath() returns false for paths that do not exist
		$trydir = realpath( $trydir );
		if ( $trydir !== false && is_dir( $trydir ) && file_exists( "$trydir/UtfNormalData.inc" ) ) {
			$utfnormalDir = $trydir;
			break;
		}
	}
	if ( !$utfnormalDir ) {
		fwrite( STDERR, "Cannot find UtfNormal. Please specify the path explicitly.\n" );
		exit( 1 );
	}
}

// @phan-suppress-next-line SecurityCheck-XSS
echo "Loading UtfNormal from $utfnormalDir...\n";
// @phan-suppress-next-line SecurityCheck-PathTraversal
require_once "$utfnormalDir/Validator.php";
// @phan-suppress-next-line SecurityCheck-PathTraversal
require_once "$utfnormalDir/UtfNormalData.inc";
// @phan-suppress-next-line SecurityCheck-PathTraversal
require_once "$utfnormalDir/UtfNormalDataK.inc";

// Every table read by the rest of the script must have been populated by the files above.
if ( !Validator::$utfCheckNFC ||
	!Validator::$utfCombiningClass ||
	!Validator::$utfCanonicalDecomp ||
	!Validator::$utfCanonicalComp ||
	!Validator::$utfCompatibilityDecomp
) {
	fwrite( STDERR, "UtfNormal data files did not contain needed data.\n" );
	exit( 1 );
}
68 | |
/**
 * Convert a UTF-8 string to its Unicode code point(s).
 *
 * The string is re-encoded as UTF-32BE and unpacked as unsigned 32-bit big-endian
 * integers, so every character becomes exactly one integer.
 *
 * @param string $c UTF-8 encoded text
 * @param bool $firstOnly Whether to return only the first code point
 * @return int[]|int With $firstOnly, the first code point as an integer; otherwise the
 *  list of code points, indexed from 1 (the indexing unpack() uses). An empty $c
 *  yields an empty array when $firstOnly is false.
 * @throws InvalidArgumentException If $c is not valid UTF-8, or if it is empty and
 *  $firstOnly is set (there is no first code point to return)
 */
function uord( $c, $firstOnly ) { // phpcs:ignore MediaWiki.NamingConventions.PrefixedGlobalFunctions
	if ( $c === '' ) {
		if ( $firstOnly ) {
			throw new InvalidArgumentException( 'uord(): cannot take the first code point of an empty string' );
		}
		return [];
	}
	// mb_convert_encoding() would silently substitute invalid sequences, which would put
	// wrong code points into the generated table; fail loudly instead.
	if ( !mb_check_encoding( $c, 'UTF-8' ) ) {
		throw new InvalidArgumentException( 'uord(): input is not valid UTF-8' );
	}
	$ret = unpack( 'N*', mb_convert_encoding( $c, 'UTF-32BE', 'UTF-8' ) );
	return $firstOnly ? $ret[1] : $ret;
}
78 | |
// Generate the Lua table literal "normal" into normalization-data.lua next to this script.
echo "Creating normalization table...\n";
$X = fopen( __DIR__ . '/normalization-data.lua', 'w' );
if ( !$X ) {
	// Report on STDERR with a non-zero status; die( "string" ) would exit with status 0.
	fwrite( STDERR, "Failed to open normalization-data.lua\n" );
	exit( 1 );
}

/**
 * Write the entries of one decomposition table, one "[0xKEY] = { 0x..., ... }," line each.
 *
 * @param resource $out Handle to write to
 * @param array $decomp Map of UTF-8 character => UTF-8 decomposition string
 * @param array $alreadyWritten Entries of $decomp that are identical (same key, same
 *  value) to an entry in this map are skipped
 */
$writeDecompEntries = static function ( $out, array $decomp, array $alreadyWritten ) {
	foreach ( $decomp as $char => $expansion ) {
		if ( isset( $alreadyWritten[$char] ) && $alreadyWritten[$char] === $expansion ) {
			// Skip duplicates
			continue;
		}
		$items = [];
		foreach ( uord( $expansion, false ) as $cp ) {
			$items[] = sprintf( '0x%06x', $cp );
		}
		fprintf( $out, "\t\t[0x%06x] = { %s },\n", uord( $char, true ), implode( ', ', $items ) );
	}
};

fprintf( $X, "-- This file is automatically generated by make-normalization-table.php\n" );
fprintf( $X, "local normal = {\n" );

// "check": code points listed in $utfCheckNFC that are not also combining characters.
fprintf( $X, "\t-- Characters that might change depending on the following combiner\n" );
fprintf( $X, "\t-- (minus any that are themselves combiners, those are added later)\n" );
fprintf( $X, "\tcheck = {\n" );
foreach ( Validator::$utfCheckNFC as $char => $unused ) {
	if ( isset( Validator::$utfCombiningClass[$char] ) ) {
		// Skip, because it's in the other table already
		continue;
	}
	fprintf( $X, "\t\t[0x%06x] = 1,\n", uord( $char, true ) );
}
fprintf( $X, "\t},\n\n" );

// "combclass": combining class per code point. $comb remembers which code points are
// combiners so that the composition pass below can filter on them.
fprintf( $X, "\t-- Combining characters, mapped to combining class\n" );
fprintf( $X, "\tcombclass = {\n" );
$comb = [];
foreach ( Validator::$utfCombiningClass as $char => $class ) {
	$cp = uord( $char, true );
	$comb[$cp] = 1;
	fprintf( $X, "\t\t[0x%06x] = %d,\n", $cp, $class );
}
fprintf( $X, "\t},\n\n" );

// "decomp": canonical decompositions, nothing skipped.
fprintf( $X, "\t-- Characters mapped to what they decompose to\n" );
fprintf( $X, "\t-- Note Hangul to Jamo is done separately below\n" );
fprintf( $X, "\tdecomp = {\n" );
$writeDecompEntries( $X, Validator::$utfCanonicalDecomp, [] );
fprintf( $X, "\t},\n\n" );

// "decompK": compatibility decompositions, minus those identical to the canonical entry.
fprintf( $X, "\tdecompK = {\n" );
$writeDecompEntries( $X, Validator::$utfCompatibilityDecomp, Validator::$utfCanonicalDecomp );
fprintf( $X, "\t},\n\n" );

// "comp": two-level map, first code point => second code point => composed code point.
fprintf( $X, "\t-- Character-pairs mapped to what they compose to\n" );
fprintf( $X, "\t-- Note Jamo to Hangul is done separately below\n" );
$t = [];
foreach ( Validator::$utfCanonicalComp as $sequence => $composed ) {
	// uord() returns a 1-indexed list, so [1] and [2] are the first and second code points.
	$pair = uord( $sequence, false );
	if ( count( $pair ) == 1 ) {
		// No idea why these are in the file
		continue;
	}
	if ( isset( $comb[$pair[1]] ) ) {
		// Non-starter, no idea why these are in the file either
		continue;
	}
	$t[$pair[1]][$pair[2]] = uord( $composed, true );
}
fprintf( $X, "\tcomp = {\n" );
ksort( $t );
foreach ( $t as $first => $seconds ) {
	fprintf( $X, "\t\t[0x%06x] = {\n", $first );
	ksort( $seconds );
	// Keys are code points produced by unpack( 'N*' ) and so are never negative; the
	// former "[-1]" special case for negative keys could not be reached and is gone.
	foreach ( $seconds as $second => $result ) {
		fprintf( $X, "\t\t\t[0x%06x] = 0x%06x,\n", $second, $result );
	}
	fprintf( $X, "\t\t},\n" );
}
fprintf( $X, "\t},\n" );

fprintf( $X, "}\n" );
168 | |
// Append the hand-written Lua epilogue to the output file and close it.
//
// The Lua text is passed as the argument of a "%s" directive rather than used as the format
// string itself, so the "%" modulo operators inside it are written out literally.
//
// As written below, the Lua code:
//  - makes normal.check fall back to normal.combclass through an __index metatable;
//  - gives normal.decomp an __index function that, for keys 0xac00..0xd7a3, computes the
//    Jamo sequence arithmetically (leading, vowel, and a final only when k % 28 is non-zero
//    after subtracting 0xac00);
//  - gives normal.comp an __index function that, for a leading Jamo (0x1100..0x1112) or a
//    Hangul key with k % 28 == 16, builds a small table carrying a "base" value, attaches the
//    matching metatable (jamo_l_v_mt or hangul_jamo_mt), stores it in t[k] and returns it;
//  - makes normal.decompK fall back to normal.decomp;
//  - returns the "normal" table.
169 | fprintf( $X, "\n%s\n", <<<LUA |
170 | -- All combining characters need to be checked, so just do that |
171 | setmetatable( normal.check, { __index = normal.combclass } ) |
172 | |
173 | -- Handle Hangul to Jamo decomposition |
174 | setmetatable( normal.decomp, { __index = function ( _, k ) |
175 | if k >= 0xac00 and k <= 0xd7a3 then |
176 | -- Decompose a Hangul syllable into Jamo |
177 | k = k - 0xac00 |
178 | local ret = { |
179 | 0x1100 + math.floor( k / 588 ), |
180 | 0x1161 + math.floor( ( k % 588 ) / 28 ) |
181 | } |
182 | if k % 28 ~= 0 then |
183 | ret[3] = 0x11a7 + ( k % 28 ) |
184 | end |
185 | return ret |
186 | end |
187 | return nil |
188 | end } ) |
189 | |
190 | -- Handle Jamo to Hangul composition |
191 | local jamo_l_v_mt = { __index = function ( t, k ) |
192 | if k >= 0x1161 and k <= 0x1175 then |
193 | -- Jamo leading + Jamo vowel |
194 | return t.base + 28 * ( k - 0x1161 ) |
195 | end |
196 | return nil |
197 | end } |
198 | local hangul_jamo_mt = { __index = function ( t, k ) |
199 | if k >= 0x11a7 and k <= 0x11c2 then |
200 | -- Hangul + jamo final |
201 | return t.base + k - 0x11a7 |
202 | end |
203 | return nil |
204 | end } |
205 | setmetatable( normal.comp, { __index = function ( t, k ) |
206 | if k >= 0x1100 and k <= 0x1112 then |
207 | -- Jamo leading, return a second table that combines with a Jamo vowel |
208 | local t2 = { base = 0xac00 + 588 * ( k - 0x1100 ) } |
209 | setmetatable( t2, jamo_l_v_mt ) |
210 | t[k] = t2 -- cache it |
211 | return t2 |
212 | elseif k >= 0xac00 and k <= 0xd7a3 and k % 28 == 16 then |
213 | -- Hangul. "k % 28 == 16" picks out just the ones that are |
214 | -- Jamo leading + vowel, no final. Return a second table that combines |
215 | -- with a Jamo final. |
216 | local t2 = { base = k } |
217 | setmetatable( t2, hangul_jamo_mt ) |
218 | t[k] = t2 -- cache it |
219 | return t2 |
220 | end |
221 | return nil |
222 | end } ) |
223 | |
224 | -- Compatibility decomposition falls back to the normal decomposition |
225 | setmetatable( normal.decompK, { __index = normal.decomp } ) |
226 | |
227 | return normal |
228 | LUA
229 | ); |
230 | |
// Close the handle the generated table and the epilogue were written to.
231 | fclose( $X ); |