MediaWiki  1.34.0
make-normalization-table.php
Go to the documentation of this file.
1 #!/usr/bin/php
2 <?php
3 
4 use UtfNormal\Validator;
5 
// Only the 'cli' and 'phpdbg' SAPIs are allowed to run this script.
if ( !in_array( PHP_SAPI, [ 'cli', 'phpdbg' ], true ) ) {
	die( "This script may only be executed from the command line.\n" );
}
9 
// Directory holding UtfNormal's Validator.php and data files. Initialised up
// front so the "not found" check at the end never reads an undefined variable.
$utfnormalDir = null;

if ( count( $argv ) > 1 ) {
	// A path was supplied as the first command-line argument.
	$utfnormalDir = rtrim( $argv[1], '/' );
	if ( !is_dir( $utfnormalDir ) ) {
		die( "The specified UtfNormal directory '$utfnormalDir' does not exist\n" );
	}
	if ( file_exists( "$utfnormalDir/Validator.php" ) ) {
		// Probably ok
	} elseif ( file_exists( "$utfnormalDir/src/Validator.php" ) ) {
		// Add the 'src' dir
		$utfnormalDir = "$utfnormalDir/src";
	} else {
		// Not fatal: warn on stderr and carry on with the path as given.
		// The path must be passed explicitly to fill the %s placeholder.
		fprintf(
			STDERR,
			"Warning: Supplied path \"%s\" does not seem to contain UtfNormal. Trying it anyway.\n",
			$utfnormalDir
		);
	}
} else {
	// No path given: probe a few conventional locations.
	$trydirs = [
		// Checkouts of mediawiki/core and mediawiki/extensions in the same directory
		__DIR__ . '/../../../../../../../core/vendor/wikimedia/utfnormal/src',
		// Scribunto checked out inside the 'extensions' directory of mediawiki/core
		__DIR__ . '/../../../../../../../vendor/wikimedia/utfnormal/src',
	];
	// A configured MW_INSTALL_PATH takes priority over the relative guesses.
	$installPath = getenv( 'MW_INSTALL_PATH' );
	if ( $installPath ) {
		array_unshift( $trydirs, $installPath . '/vendor/wikimedia/utfnormal/src' );
	}
	foreach ( $trydirs as $trydir ) {
		// realpath() returns false for paths that do not exist
		$trydir = realpath( $trydir );
		if ( $trydir !== false && is_dir( $trydir ) && file_exists( "$trydir/UtfNormalData.inc" ) ) {
			$utfnormalDir = $trydir;
			break;
		}
	}
	if ( !$utfnormalDir ) {
		die( "Cannot find UtfNormal. Please specify the path explicitly.\n" );
	}
}
49 
50 //phpcs:disable MediaWiki.NamingConventions
51 
/**
 * Require the UtfNormal Validator class and its two data files.
 *
 * The directory is read from the global $utfnormalDir, and a progress line
 * naming that directory is printed first. Each file is required at most once.
 */
function loadDataFiles() {
	global $utfnormalDir;
	echo 'Loading UtfNormal from ' . $utfnormalDir . "...\n";
	require_once $utfnormalDir . '/Validator.php';
	require_once $utfnormalDir . '/UtfNormalData.inc';
	require_once $utfnormalDir . '/UtfNormalDataK.inc';
}
65 
66 //phpcs:enable MediaWiki.NamingConventions
68 
// The checks below read Validator statics, so the UtfNormal class and data
// files have to be required before this point.
loadDataFiles();

// Every table consumed further down must be present and non-empty.
if ( !Validator::$utfCheckNFC ||
	!Validator::$utfCombiningClass ||
	!Validator::$utfCanonicalDecomp ||
	!Validator::$utfCanonicalComp ||
	!Validator::$utfCompatibilityDecomp
) {
	die( "UtfNormal data files did not contain needed data.\n" );
}
77 
/**
 * Convert a UTF-8 string into Unicode code points.
 *
 * The string is re-encoded as big-endian UTF-32 and unpacked as a sequence of
 * unsigned 32-bit integers, one per character.
 *
 * @param string $c UTF-8 encoded text
 * @param bool $firstOnly Whether to return just the first code point
 * @return int|int[] The first code point when $firstOnly is true, otherwise
 *  all code points in an array indexed from 1
 */
// @codingStandardsIgnoreLine MediaWiki.NamingConventions.PrefixedGlobalFunctions
function uord( $c, $firstOnly ) {
	$utf32 = mb_convert_encoding( $c, 'UTF-32BE', 'UTF-8' );
	$codepoints = unpack( 'N*', $utf32 );
	if ( $firstOnly ) {
		return $codepoints[1];
	}
	return $codepoints;
}
83 
echo "Creating normalization table...\n";

// The generated Lua file is written next to this script.
$X = fopen( __DIR__ . '/normalization-data.lua', 'w' );
if ( $X === false ) {
	die( "Failed to open normalization-data.lua\n" );
}

// File banner, start of the 'normal' table and start of its 'check' subtable.
fwrite(
	$X,
	"-- This file is automatically generated by make-normalization-table.php\n" .
	"local normal = {\n" .
	"\t-- Characters that might change depending on the following combiner\n" .
	"\t-- (minus any that are themselves combiners, those are added later)\n" .
	"\tcheck = {\n"
);
foreach ( Validator::$utfCheckNFC as $char => $unused ) {
	// Characters that have a combining class are left out of this table
	if ( !isset( Validator::$utfCombiningClass[$char] ) ) {
		fprintf( $X, "\t\t[0x%06x] = 1,\n", uord( $char, true ) );
	}
}
fwrite( $X, "\t},\n\n" );
fwrite(
	$X,
	"\t-- Combining characters, mapped to combining class\n" .
	"\tcombclass = {\n"
);
// Set of combining code points (code point => 1), filled while the table is
// written; the composition-table code later in this script looks entries up in it.
$comb = [];
foreach ( Validator::$utfCombiningClass as $char => $class ) {
	$codepoint = uord( $char, true );
	$comb[$codepoint] = 1;
	fprintf( $X, "\t\t[0x%06x] = %d,\n", $codepoint, $class );
}
fwrite( $X, "\t},\n\n" );
// Builds one Lua table entry of the form "[0xKKKKKK] = { 0x..., 0x... },"
// mapping a character to the code points of its decomposition.
$formatDecompEntry = static function ( $char, $expansion ) {
	$head = sprintf( "\t\t[0x%06x] = { ", uord( $char, true ) );
	$hex = array_map(
		static function ( $codepoint ) {
			return sprintf( '0x%06x', $codepoint );
		},
		uord( $expansion, false )
	);
	return $head . implode( ', ', $hex ) . " },\n";
};

fwrite(
	$X,
	"\t-- Characters mapped to what they decompose to\n" .
	"\t-- Note Hangul to Jamo is done separately below\n" .
	"\tdecomp = {\n"
);
foreach ( Validator::$utfCanonicalDecomp as $char => $expansion ) {
	fwrite( $X, $formatDecompEntry( $char, $expansion ) );
}
fwrite( $X, "\t},\n\n" );

fwrite( $X, "\tdecompK = {\n" );
foreach ( Validator::$utfCompatibilityDecomp as $char => $expansion ) {
	// Only emit entries that differ from (or are missing in) the canonical
	// table; identical ones would just be duplicates.
	if ( !isset( Validator::$utfCanonicalDecomp[$char] )
		|| Validator::$utfCanonicalDecomp[$char] !== $expansion
	) {
		fwrite( $X, $formatDecompEntry( $char, $expansion ) );
	}
}
fwrite( $X, "\t},\n\n" );
140 
fwrite(
	$X,
	"\t-- Character-pairs mapped to what they compose to\n" .
	"\t-- Note Jamo to Hangul is done separately below\n"
);

// Two-level map: first code point => second code point => composed code point
$t = [];
foreach ( Validator::$utfCanonicalComp as $pair => $composed ) {
	$codepoints = uord( $pair, false );
	// Skip single-character keys, and pairs whose first character is a
	// combining character (a non-starter).
	if ( count( $codepoints ) === 1 || isset( $comb[$codepoints[1]] ) ) {
		continue;
	}
	$t[$codepoints[1]][$codepoints[2]] = uord( $composed, true );
}

fwrite( $X, "\tcomp = {\n" );
// Sort both levels by code point before writing them out
ksort( $t );
foreach ( $t as $first => $seconds ) {
	fprintf( $X, "\t\t[0x%06x] = {\n", $first );
	ksort( $seconds );
	foreach ( $seconds as $second => $result ) {
		// Negative keys are written as a literal -1 index
		$line = $second < 0
			? sprintf( "\t\t\t[-1] = 0x%06x,\n", $result )
			: sprintf( "\t\t\t[0x%06x] = 0x%06x,\n", $second, $result );
		fwrite( $X, $line );
	}
	fwrite( $X, "\t\t},\n" );
}
// Close the 'comp' subtable, then the enclosing 'normal' table
fwrite( $X, "\t},\n" . "}\n" );
173 
// Static Lua code appended after the generated tables. It attaches metatables
// to the tables and returns 'normal'. Kept as a nowdoc so the text is written
// out verbatim.
$luaFooter = <<<'LUA'
-- All combining characters need to be checked, so just do that
setmetatable( normal.check, { __index = normal.combclass } )

-- Handle Hangul to Jamo decomposition
setmetatable( normal.decomp, { __index = function ( _, k )
 if k >= 0xac00 and k <= 0xd7a3 then
 -- Decompose a Hangul syllable into Jamo
 k = k - 0xac00
 local ret = {
 0x1100 + math.floor( k / 588 ),
 0x1161 + math.floor( ( k % 588 ) / 28 )
 }
 if k % 28 ~= 0 then
 ret[3] = 0x11a7 + ( k % 28 )
 end
 return ret
 end
 return nil
end } )

-- Handle Jamo to Hangul composition
local jamo_l_v_mt = { __index = function ( t, k )
 if k >= 0x1161 and k <= 0x1175 then
 -- Jamo leading + Jamo vowel
 return t.base + 28 * ( k - 0x1161 )
 end
 return nil
end }
local hangul_jamo_mt = { __index = function ( t, k )
 if k >= 0x11a7 and k <= 0x11c2 then
 -- Hangul + jamo final
 return t.base + k - 0x11a7
 end
 return nil
end }
setmetatable( normal.comp, { __index = function ( t, k )
 if k >= 0x1100 and k <= 0x1112 then
 -- Jamo leading, return a second table that combines with a Jamo vowel
 local t2 = { base = 0xac00 + 588 * ( k - 0x1100 ) }
 setmetatable( t2, jamo_l_v_mt )
 t[k] = t2 -- cache it
 return t2
 elseif k >= 0xac00 and k <= 0xd7a3 and k % 28 == 16 then
 -- Hangul. "k % 28 == 16" picks out just the ones that are
 -- Jamo leading + vowel, no final. Return a second table that combines
 -- with a Jamo final.
 local t2 = { base = k }
 setmetatable( t2, hangul_jamo_mt )
 t[k] = t2 -- cache it
 return t2
 end
 return nil
end } )

-- Compatibility decomposition falls back to the normal decomposition
setmetatable( normal.decompK, { __index = normal.decomp } )

return normal
LUA;

// One blank line before the footer and a trailing newline after it
fwrite( $X, "\n" . $luaFooter . "\n" );

fclose( $X );
uord
if(!Validator::$utfCheckNFC||!Validator::$utfCombiningClass||!Validator::$utfCanonicalDecomp||!Validator::$utfCanonicalComp||!Validator::$utfCompatibilityDecomp) uord( $c, $firstOnly)
Definition: make-normalization-table.php:79
$comb
$comb
Definition: make-normalization-table.php:104
$t
$t
Definition: make-normalization-table.php:143
return
return[ 'OATHAuth'=> function(MediaWikiServices $services) { return new OATHAuth($services->getMainConfig(), $services->getDBLoadBalancerFactory());}, 'OATHUserRepository'=> function(MediaWikiServices $services) { global $wgOATHAuthDatabase;$auth=$services->getService( 'OATHAuth');return new OATHUserRepository($services->getDBLoadBalancerFactory() ->getMainLB( $wgOATHAuthDatabase), new \HashBagOStuff(['maxKey'=> 5]), $auth);}]
Definition: ServiceWiring.php:25
$utfnormalDir
if(PHP_SAPI !=='cli' &&PHP_SAPI !=='phpdbg') $utfnormalDir
Definition: make-normalization-table.php:10
loadDataFiles
if(getenv( 'MW_INSTALL_PATH')) foreach( $trydirs as $trydir) if(! $utfnormalDir) loadDataFiles()
This function exists solely so we can suppress errors.
Definition: make-normalization-table.php:58
$X
$X
Definition: make-normalization-table.php:85