MediaWiki REL1_39
generateCollationData.php
Go to the documentation of this file.
1<?php
24require_once __DIR__ . '/../Maintenance.php';
25
27
35 public $dataDir;
36
38 public $weights;
39
46
48
/**
 * Register the command line options understood by this script.
 *
 * Both options are optional and both take an argument.
 */
public function __construct() {
	parent::__construct();

	$dataDirHelp = 'A directory on the local filesystem ' .
		'containing allkeys.txt and ucd.all.grouped.xml from unicode.org';
	$debugOutputHelp = 'Filename for sending debug output to';

	// Third argument: not required. Fourth argument: option takes a value.
	$this->addOption( 'data-dir', $dataDirHelp, false, true );
	$this->addOption( 'debug-output', $debugOutputHelp, false, true );
}
57
/**
 * Run the script.
 *
 * Checks that allkeys.txt and ucd.all.grouped.xml exist in the data
 * directory (option 'data-dir', default '.'). If either is missing, a
 * message with download hints is passed to fatalError(). Otherwise the
 * optional debug output file is opened, the UCD data is loaded, the first
 * letter data is generated, and the debug output file is closed again.
 */
public function execute() {
	$this->dataDir = $this->getOption( 'data-dir', '.' );

	$allkeysPresent = file_exists( "{$this->dataDir}/allkeys.txt" );
	$ucdallPresent = file_exists( "{$this->dataDir}/ucd.all.grouped.xml" );

	// As of January 2013, these links work for all versions of Unicode
	// between 5.1 and 6.2, inclusive.
	$allkeysURL = "https://www.unicode.org/Public/UCA/<Unicode version>/allkeys.txt";
	$ucdallURL = "https://www.unicode.org/Public/<Unicode version>/ucdxml/ucd.all.grouped.zip";

	if ( !$allkeysPresent || !$ucdallPresent ) {
		$icuVersion = INTL_ICU_VERSION;
		$unicodeVersion = IcuCollation::getUnicodeVersionForICU();

		$error = "";

		if ( !$allkeysPresent ) {
			$error .= "Unable to find allkeys.txt. "
				. "Download it and specify its location with --data-dir=<DIR>. "
				. "\n\n";
		}
		if ( !$ucdallPresent ) {
			$error .= "Unable to find ucd.all.grouped.xml. "
				. "Download it, unzip, and specify its location with --data-dir=<DIR>. "
				. "\n\n";
		}

		// Only when the ICU version is in the known range do we substitute
		// a concrete Unicode version into the download URLs below.
		$versionKnown = false;
		if ( version_compare( $icuVersion, "4.0", "<" ) ) {
			// Extra old version
			$error .= "You are using outdated version of ICU ($icuVersion), intended for "
				. ( $unicodeVersion ? "Unicode $unicodeVersion" : "an unknown version of Unicode" )
				. "; this file might not be available for it, and it's not supported by MediaWiki. "
				. " You are on your own; consider upgrading PHP's intl extension or try "
				. "one of the files available at:";
		} elseif ( version_compare( $icuVersion, "51.0", ">=" ) ) {
			// Extra recent version
			$error .= "You are using ICU $icuVersion, released after this script was last updated. "
				. "Check what is the Unicode version it is using at http://site.icu-project.org/download . "
				. "It can't be guaranteed everything will work, but appropriate file(s) should "
				. "be available at:";
		} else {
			// ICU 4.0 to 50.x
			$versionKnown = true;
			$error .= "You are using ICU $icuVersion, intended for "
				. ( $unicodeVersion ? "Unicode $unicodeVersion" : "an unknown version of Unicode" )
				. ". Appropriate file(s) should be available at:";
		}
		$error .= "\n";

		if ( $versionKnown && $unicodeVersion ) {
			$allkeysURL = str_replace( "<Unicode version>", "$unicodeVersion.0", $allkeysURL );
			$ucdallURL = str_replace( "<Unicode version>", "$unicodeVersion.0", $ucdallURL );
		}

		// List only the URLs of the files that are actually missing.
		if ( !$allkeysPresent ) {
			$error .= "* $allkeysURL\n";
		}
		if ( !$ucdallPresent ) {
			$error .= "* $ucdallURL\n";
		}

		$this->fatalError( $error );
	}

	$debugOutFileName = $this->getOption( 'debug-output' );
	if ( $debugOutFileName ) {
		$this->debugOutFile = fopen( $debugOutFileName, 'w' );
		if ( !$this->debugOutFile ) {
			$this->fatalError( "Unable to open debug output file for writing" );
		}
	}
	$this->loadUcd();
	$this->generateFirstChars();

	// Release the debug output handle opened above; it is only written to
	// from generateFirstChars(), which has finished by now.
	if ( $debugOutFileName ) {
		fclose( $this->debugOutFile );
		$this->debugOutFile = null;
	}
}
134
/**
 * Read ucd.all.grouped.xml from the data directory and feed every
 * character record to charCallback().
 */
private function loadUcd() {
	$uxr = new UcdXmlReader( "{$this->dataDir}/ucd.all.grouped.xml" );
	// charCallback() is private, so an array callable [ $this, 'charCallback' ]
	// cannot be invoked from inside UcdXmlReader. A closure created here keeps
	// this class's scope and can therefore reach the private method.
	$uxr->readChars( function ( $data ) {
		$this->charCallback( $data );
	} );
}
139
/**
 * Record the implicit collation weight for one character read from the UCD.
 *
 * Stores a ".XXXX.XXXX" weight string in $this->weights keyed by the integer
 * code point, and flags the code point in $this->mappedChars when its 'dm'
 * attribute is anything other than '#'.
 *
 * @param array $data Character attributes; the keys read here are 'gc',
 *   'cp' (hex string), 'block', 'UIdeo' and 'dm'
 */
private function charCallback( $data ) {
	// Skip non-printable characters,
	// but do not skip a normal space (U+0020) since
	// people like to use that as a fake no header symbol.
	$majorCategory = substr( $data['gc'], 0, 1 );
	$isWantedCategory = strpos( 'LNPS', $majorCategory ) !== false;
	if ( !$isWantedCategory && $data['cp'] !== '0020' ) {
		return;
	}
	$codepoint = hexdec( $data['cp'] );

	// Skip the CJK ideograph blocks as an optimisation measure (UCA doesn't
	// sort them properly anyway, without tailoring), and skip the composed
	// Hangul syllables, since the bare Jamo are used as first letters.
	if ( IcuCollation::isCjk( $codepoint ) || $data['block'] == 'Hangul Syllables' ) {
		return;
	}

	// Calculate implicit weight per UTS #10 v6.0.0, sec 7.1.3
	if ( $data['UIdeo'] !== 'Y' ) {
		$base = 0xFBC0;
	} elseif ( $data['block'] == 'CJK Unified Ideographs'
		|| $data['block'] == 'CJK Compatibility Ideographs'
	) {
		$base = 0xFB40;
	} else {
		$base = 0xFB80;
	}
	$upper = $base + ( $codepoint >> 15 );
	$lower = ( $codepoint & 0x7fff ) | 0x8000;
	$this->weights[$codepoint] = sprintf( ".%04X.%04X", $upper, $lower );

	if ( $data['dm'] !== '#' ) {
		$this->mappedChars[$codepoint] = true;
	}

	// Progress indicator: one line of output per 4096 code points.
	if ( $codepoint % 4096 === 0 ) {
		echo $data['cp'] . "\n";
	}
}
189
/**
 * Build the list of group header characters and write it to
 * includes/collation/data/first-letters-root.php.
 *
 * Steps:
 *  1. Overwrite the implicit weights in $this->weights with the primary
 *     weights found in allkeys.txt.
 *  2. Group code points that share a primary weight.
 *  3. Drop groups whose weight sequence starts with another group's weight.
 *  4. Pick one header character per group and write the result.
 *
 * Calls fatalError() if allkeys.txt cannot be opened or the output file
 * cannot be written.
 */
private function generateFirstChars() {
	$file = fopen( "{$this->dataDir}/allkeys.txt", 'r' );
	if ( !$file ) {
		$this->fatalError( "Unable to open allkeys.txt" );
	}

	$goodTertiaryChars = [];

	// For each character with an entry in allkeys.txt, overwrite the implicit
	// entry in $this->weights that came from the UCD.
	// Also gather a list of tertiary weights, for use in selecting the group header
	while ( ( $line = fgets( $file ) ) !== false ) {
		// We're only interested in single-character weights, pick them out with a regex
		$line = trim( $line );
		if ( !preg_match( '/^([0-9A-F]+)\s*;\s*([^#]*)/', $line, $m ) ) {
			continue;
		}

		$cp = hexdec( $m[1] );
		$allWeights = trim( $m[2] );
		$primary = '';
		$tertiary = '';

		if ( !isset( $this->weights[$cp] ) ) {
			// Non-printable, ignore
			continue;
		}
		// Each "[...]" collation element contributes its first (primary) and
		// third (tertiary) weight, unless that weight is 0000.
		foreach ( StringUtils::explode( '[', $allWeights ) as $weightStr ) {
			if ( preg_match_all( '/[*.]([0-9A-F]+)/', $weightStr, $m ) ) {
				if ( $m[1][0] !== '0000' ) {
					$primary .= '.' . $m[1][0];
				}
				if ( $m[1][2] !== '0000' ) {
					$tertiary .= '.' . $m[1][2];
				}
			}
		}
		$this->weights[$cp] = $primary;
		if ( $tertiary === '.0008'
			|| $tertiary === '.000E'
		) {
			$goodTertiaryChars[$cp] = true;
		}
	}
	fclose( $file );

	// Identify groups of characters with the same primary weight
	$this->groups = [];
	asort( $this->weights, SORT_STRING );
	$prevWeight = reset( $this->weights );
	$group = [];
	foreach ( $this->weights as $cp => $weight ) {
		if ( $weight !== $prevWeight ) {
			$this->groups[$prevWeight] = $group;
			$prevWeight = $weight;
			$group = $this->groups[$weight] ?? [];
		}
		$group[] = $cp;
	}
	if ( $group ) {
		$this->groups[$prevWeight] = $group;
	}

	// If one character has a given primary weight sequence, and a second
	// character has a longer primary weight sequence with an initial
	// portion equal to the first character, then remove the second
	// character. This avoids having characters like U+A732 (double A)
	// polluting the basic Latin sort area.

	foreach ( $this->groups as $weight => $group ) {
		if ( preg_match( '/(\.[0-9A-F]*)\./', $weight, $m ) ) {
			if ( isset( $this->groups[$m[1]] ) ) {
				unset( $this->groups[$weight] );
			}
		}
	}

	ksort( $this->groups, SORT_STRING );

	// Identify the header character in each group
	$headerChars = [];
	$prevChar = "\000";
	$tertiaryCollator = new Collator( 'root' );
	$primaryCollator = new Collator( 'root' );
	$primaryCollator->setStrength( Collator::PRIMARY );
	$numOutOfOrder = 0;
	foreach ( $this->groups as $weight => $group ) {
		$uncomposedChars = [];
		$goodChars = [];
		foreach ( $group as $cp ) {
			if ( isset( $goodTertiaryChars[$cp] ) ) {
				$goodChars[] = $cp;
			}
			if ( !isset( $this->mappedChars[$cp] ) ) {
				$uncomposedChars[] = $cp;
			}
		}
		// Preference order: good tertiary weight and no decomposition mapping,
		// then any character without a decomposition mapping, then the whole group.
		$x = array_intersect( $goodChars, $uncomposedChars );
		if ( !$x ) {
			$x = $uncomposedChars;
			if ( !$x ) {
				$x = $group;
			}
		}

		// Use ICU to pick the lowest sorting character in the selection
		$tertiaryCollator->sort( $x );
		$cp = $x[0];

		$char = UtfNormal\Utils::codepointToUtf8( $cp );
		$headerChars[] = $char;
		// Count headers that do not sort strictly after their predecessor.
		if ( $primaryCollator->compare( $char, $prevChar ) <= 0 ) {
			$numOutOfOrder++;
		}
		$prevChar = $char;

		if ( $this->debugOutFile ) {
			fwrite( $this->debugOutFile, sprintf( "%05X %s %s (%s)\n", $cp, $weight, $char,
				implode( ' ', array_map( [ UtfNormal\Utils::class, 'codepointToUtf8' ], $group ) ) ) );
		}
	}

	print "Out of order: $numOutOfOrder / " . count( $headerChars ) . "\n";

	global $IP;
	$writer = new StaticArrayWriter();
	$outputPath = "$IP/includes/collation/data/first-letters-root.php";
	$bytesWritten = file_put_contents(
		$outputPath,
		$writer->create( $headerChars, 'File created by generateCollationData.php' )
	);
	// Do not report success when nothing could be written.
	if ( $bytesWritten === false ) {
		$this->fatalError( "Unable to write $outputPath" );
	}
	echo "first-letters-root: file written.\n";
}
322}
323
325 public $fileName;
326 public $callback;
328 public $xml;
329 public $blocks = [];
331
/**
 * Remember which file to read; the file is not opened here.
 *
 * @param string $fileName Path later handed to XMLReader::open()
 */
public function __construct( $fileName ) {
	$this->fileName = $fileName;
}
335
/**
 * Walk the file and invoke the callback for every character described by
 * a 'char' element, with 'group' attributes applied as defaults.
 *
 * @param callable $callback Receives one attribute array per code point
 */
public function readChars( $callback ) {
	// The block table must be loaded first so handleChar() can tag each
	// code point with its block name.
	$this->getBlocks();
	$this->currentBlock = reset( $this->blocks );
	$reader = $this->open();
	$this->callback = $callback;

	// Skip sibling nodes until the 'repertoire' element (or end of input).
	while ( $reader->name !== 'repertoire' ) {
		if ( !$reader->next() ) {
			break;
		}
	}

	while ( $reader->read() ) {
		switch ( $reader->nodeType ) {
			case XMLReader::ELEMENT:
				if ( $reader->name === 'group' ) {
					$this->groupAttrs = $this->readAttributes();
				} elseif ( $reader->name === 'char' ) {
					$this->handleChar();
				}
				break;

			case XMLReader::END_ELEMENT:
				// Leaving a group: its attributes no longer apply.
				if ( $reader->name === 'group' ) {
					$this->groupAttrs = [];
				}
				break;
		}
	}
	$reader->close();
}
359
/**
 * Open the XML file and position the reader on the node that follows the
 * opening 'ucd' element.
 *
 * @return XMLReader The reader, also stored in $this->xml
 * @throws MWException If the file cannot be opened
 */
protected function open() {
	$this->xml = new XMLReader;
	// Test the result of open(): the XMLReader object itself is always
	// truthy, so checking the object would never detect a failure.
	if ( !$this->xml->open( $this->fileName ) ) {
		throw new MWException( __METHOD__ . ": unable to open {$this->fileName}" );
	}
	// Advance to the 'ucd' element, then step past it.
	while ( $this->xml->name !== 'ucd' && $this->xml->read() );
	$this->xml->read();

	return $this->xml;
}
371
/**
 * Collect the attributes of the element the reader is currently on.
 *
 * The reader is left positioned on the last attribute visited.
 *
 * @return array Attribute values keyed by attribute name
 */
protected function readAttributes() {
	$reader = $this->xml;
	$collected = [];
	while ( true ) {
		if ( !$reader->moveToNextAttribute() ) {
			break;
		}
		$collected[$reader->name] = $reader->value;
	}

	return $collected;
}
385
/**
 * Handle the current 'char' element: expand it to one attribute array per
 * code point and pass each array to the registered callback.
 *
 * The element's own attributes take precedence over those of the enclosing
 * group. An element carries either 'cp' (a single code point) or
 * 'first-cp'/'last-cp' (an inclusive range); all are hex strings.
 *
 * For every code point the callback receives a fresh copy of the attributes
 * with 'cp' set to the hex code point, '#' in 'na'/'na1' replaced by that hex
 * code point, and 'block' set to the containing block's name when one is found.
 */
protected function handleChar() {
	$attrs = $this->readAttributes() + $this->groupAttrs;
	if ( isset( $attrs['cp'] ) ) {
		$first = $last = hexdec( $attrs['cp'] );
	} else {
		$first = hexdec( $attrs['first-cp'] );
		$last = hexdec( $attrs['last-cp'] );
		unset( $attrs['first-cp'] );
		unset( $attrs['last-cp'] );
	}

	for ( $cp = $first; $cp <= $last; $cp++ ) {
		$hexCp = sprintf( "%04X", $cp );

		// Work on a per-code-point copy. Substituting into the shared array
		// would consume the '#' placeholder on the first iteration, giving
		// every later code point in the range the first one's name, and
		// could carry a stale 'block' value over to a later code point.
		$charAttrs = $attrs;
		foreach ( [ 'na', 'na1' ] as $nameProp ) {
			if ( isset( $charAttrs[$nameProp] ) ) {
				$charAttrs[$nameProp] = str_replace( '#', $hexCp, $charAttrs[$nameProp] );
			}
		}

		// Advance through the block table (via the array's internal pointer)
		// until reaching a block that contains $cp or starts after it.
		while ( $this->currentBlock ) {
			if ( $cp < $this->currentBlock[0] ) {
				break;
			} elseif ( $cp <= $this->currentBlock[1] ) {
				$charAttrs['block'] = key( $this->blocks );
				break;
			} else {
				$this->currentBlock = next( $this->blocks );
			}
		}

		$charAttrs['cp'] = $hexCp;
		call_user_func( $this->callback, $charAttrs );
	}
}
420
/**
 * Return the block table, reading it from the file on first use.
 *
 * @return array Map of block name to [ first code point, last code point ]
 *   (integers); the result is cached in $this->blocks
 */
public function getBlocks() {
	// Already loaded (non-empty): reuse the cached table.
	if ( $this->blocks ) {
		return $this->blocks;
	}

	$reader = $this->open();
	// Advance to the 'blocks' element (or the end of input).
	while ( $reader->name !== 'blocks' && $reader->read() );

	while ( $reader->read() ) {
		$isBlockElement = $reader->nodeType == XMLReader::ELEMENT
			&& $reader->name === 'block';
		if ( !$isBlockElement ) {
			continue;
		}
		$blockAttrs = $this->readAttributes();
		$rangeStart = hexdec( $blockAttrs['first-cp'] );
		$rangeEnd = hexdec( $blockAttrs['last-cp'] );
		$this->blocks[$blockAttrs['name']] = [ $rangeStart, $rangeEnd ];
	}
	$reader->close();

	return $this->blocks;
}
443}
444
445$maintClass = GenerateCollationData::class;
446require_once RUN_MAINTENANCE_IF_MAIN;
if(!defined( 'MEDIAWIKI')) if(ini_get('mbstring.func_overload')) if(!defined( 'MW_ENTRY_POINT')) global $IP
Environment checks.
Definition Setup.php:91
Generate first letter data files for Collation.php.
__construct()
Default constructor.
$weights
The primary weights, indexed by codepoint.
execute()
Do the actual work.
$dataDir
The directory with source data files in it.
$mappedChars
A hashtable keyed by codepoint, where presence indicates that a character has a decomposition mapping.
static isCjk( $codepoint)
Test if a code point is a CJK (Chinese, Japanese, Korean) character.
static getUnicodeVersionForICU()
Return the version of Unicode appropriate for the version of ICU library currently in use,...
MediaWiki exception.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
static explode( $separator, $subject)
Workalike for explode() with limited memory usage.
readAttributes()
Read the attributes of the current element node and return them as an array.
Format a static PHP array to be written to a file.
while(( $__line=Maintenance::readconsole()) !==false) print
Definition eval.php:69
$line
Definition mcc.php:119
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Item class for a filearchive table row.
Definition router.php:42