MediaWiki REL1_30
generateCollationData.php
Go to the documentation of this file.
1<?php
24require_once __DIR__ . '/../Maintenance.php';
25
33 public $dataDir;
34
36 public $weights;
37
44
46
50 const NORMAL_UPPERCASE = 0x08;
51 const NORMAL_HIRAGANA = 0x0E;
52
/**
 * Register the command-line options understood by this script.
 */
public function __construct() {
	parent::__construct();

	$dataDirHelp = 'A directory on the local filesystem ' .
		'containing allkeys.txt and ucd.all.grouped.xml from unicode.org';
	$debugOutputHelp = 'Filename for sending debug output to';

	// Both options are optional (required = false) and take a value (withArg = true).
	$this->addOption( 'data-dir', $dataDirHelp, false, true );
	$this->addOption( 'debug-output', $debugOutputHelp, false, true );
}
61
/**
 * Script entry point.
 *
 * Checks that allkeys.txt and ucd.all.grouped.xml exist in the data
 * directory (option --data-dir, default '.'). If either is missing, an
 * explanatory message is reported, including download URLs with the Unicode
 * version filled in when it can be derived from the ICU version, and the
 * script exits with status 1.
 *
 * Otherwise the optional debug output file is opened, the UCD is loaded
 * and the first-letter data is generated.
 */
public function execute() {
	$this->dataDir = $this->getOption( 'data-dir', '.' );

	$allkeysPresent = file_exists( "{$this->dataDir}/allkeys.txt" );
	$ucdallPresent = file_exists( "{$this->dataDir}/ucd.all.grouped.xml" );

	// As of January 2013, these links work for all versions of Unicode
	// between 5.1 and 6.2, inclusive.
	$allkeysURL = "http://www.unicode.org/Public/UCA/<Unicode version>/allkeys.txt";
	$ucdallURL = "http://www.unicode.org/Public/<Unicode version>/ucdxml/ucd.all.grouped.zip";

	if ( !$allkeysPresent || !$ucdallPresent ) {
		$icuVersion = IcuCollation::getICUVersion();
		$unicodeVersion = IcuCollation::getUnicodeVersionForICU();

		$error = "";

		if ( !$allkeysPresent ) {
			$error .= "Unable to find allkeys.txt. "
				. "Download it and specify its location with --data-dir=<DIR>. "
				. "\n\n";
		}
		if ( !$ucdallPresent ) {
			$error .= "Unable to find ucd.all.grouped.xml. "
				. "Download it, unzip, and specify its location with --data-dir=<DIR>. "
				. "\n\n";
		}

		// Only a recognised ICU version lets us fill in the download URLs below
		$versionKnown = false;
		if ( !$icuVersion ) {
			// Unknown version - either very old intl,
			// or PHP < 5.3.7 which does not expose this information
			$error .= "As MediaWiki could not determine the version of ICU library used by your PHP's "
				. "intl extension it can't suggest which file version to download. "
				. "This can be caused by running a very old version of intl or PHP < 5.3.7. "
				. "If you are sure everything is all right, find out the ICU version "
				. "by running phpinfo(), check what is the Unicode version it is using "
				. "at http://site.icu-project.org/download, then try finding appropriate data file(s) at:";
		} elseif ( version_compare( $icuVersion, "4.0", "<" ) ) {
			// Extra old version
			$error .= "You are using outdated version of ICU ($icuVersion), intended for "
				. ( $unicodeVersion ? "Unicode $unicodeVersion" : "an unknown version of Unicode" )
				. "; this file might not be available for it, and it's not supported by MediaWiki. "
				. " You are on your own; consider upgrading PHP's intl extension or try "
				. "one of the files available at:";
		} elseif ( version_compare( $icuVersion, "51.0", ">=" ) ) {
			// Extra recent version
			$error .= "You are using ICU $icuVersion, released after this script was last updated. "
				. "Check what is the Unicode version it is using at http://site.icu-project.org/download . "
				. "It can't be guaranteed everything will work, but appropriate file(s) should "
				. "be available at:";
		} else {
			// ICU 4.0 to 50.x
			$versionKnown = true;
			$error .= "You are using ICU $icuVersion, intended for "
				. ( $unicodeVersion ? "Unicode $unicodeVersion" : "an unknown version of Unicode" )
				. ". Appropriate file(s) should be available at:";
		}
		$error .= "\n";

		// Otherwise the "<Unicode version>" placeholder is left in the URLs as a hint
		if ( $versionKnown && $unicodeVersion ) {
			$allkeysURL = str_replace( "<Unicode version>", "$unicodeVersion.0", $allkeysURL );
			$ucdallURL = str_replace( "<Unicode version>", "$unicodeVersion.0", $ucdallURL );
		}

		if ( !$allkeysPresent ) {
			$error .= "* $allkeysURL\n";
		}
		if ( !$ucdallPresent ) {
			$error .= "* $ucdallURL\n";
		}

		$this->error( $error );
		exit( 1 );
	}

	$debugOutFileName = $this->getOption( 'debug-output' );
	if ( $debugOutFileName ) {
		$this->debugOutFile = fopen( $debugOutFileName, 'w' );
		if ( !$this->debugOutFile ) {
			$this->error( "Unable to open debug output file for writing" );
			exit( 1 );
		}
	}
	$this->loadUcd();
	$this->generateFirstChars();

	// Release the debug output handle opened above, if any
	if ( $debugOutFileName && is_resource( $this->debugOutFile ) ) {
		fclose( $this->debugOutFile );
		$this->debugOutFile = null;
	}
}
149
/**
 * Read every character from ucd.all.grouped.xml in the data directory,
 * handing each one to charCallback().
 */
function loadUcd() {
	$ucdPath = "{$this->dataDir}/ucd.all.grouped.xml";
	$reader = new UcdXmlReader( $ucdPath );
	$reader->readChars( [ $this, 'charCallback' ] );
}
154
/**
 * Callback invoked by UcdXmlReader::readChars() for each character.
 *
 * Records an implicit weight string for the character in $this->weights and
 * flags it in $this->mappedChars when its 'dm' attribute is not '#'.
 * Characters that are filtered out (see below) are not recorded at all.
 *
 * @param array $data Character attributes; this method reads the keys
 *   'gc', 'cp' (hexadecimal string), 'block', 'UIdeo' and 'dm'
 */
function charCallback( $data ) {
	// Only keep characters whose general category starts with L, N, P or S,
	// plus the normal space (U+0020), since people like to use that as a
	// fake no header symbol.
	$majorCategory = substr( $data['gc'], 0, 1 );
	$isPrintable = strpos( 'LNPS', $majorCategory ) !== false;
	if ( !$isPrintable && $data['cp'] !== '0020' ) {
		return;
	}

	$cp = hexdec( $data['cp'] );

	// The CJK ideograph blocks are skipped as an optimisation measure;
	// UCA doesn't sort them properly anyway, without tailoring.
	if ( IcuCollation::isCjk( $cp ) ) {
		return;
	}

	// Composed Hangul syllables are skipped too: the bare Jamo serve as
	// first letters instead.
	if ( $data['block'] == 'Hangul Syllables' ) {
		return;
	}

	// Implicit weight per UTS #10 v6.0.0, sec 7.1.3
	if ( $data['UIdeo'] !== 'Y' ) {
		$base = 0xFBC0;
	} elseif ( $data['block'] == 'CJK Unified Ideographs'
		|| $data['block'] == 'CJK Compatibility Ideographs'
	) {
		$base = 0xFB40;
	} else {
		$base = 0xFB80;
	}
	$high = $base + ( $cp >> 15 );
	$low = ( $cp & 0x7fff ) | 0x8000;
	$this->weights[$cp] = sprintf( ".%04X.%04X", $high, $low );

	if ( $data['dm'] !== '#' ) {
		$this->mappedChars[$cp] = true;
	}

	// Progress indicator: one line for every 4096th code point value
	if ( $cp % 4096 == 0 ) {
		print "{$data['cp']}\n";
	}
}
204
/**
 * Build the list of first-letter header characters and write it, serialized,
 * to $IP/serialized/first-letters-root.ser.
 *
 * Uses the weights collected by charCallback(), overridden by the explicit
 * entries found in allkeys.txt in the data directory. Exits with status 1
 * if allkeys.txt cannot be opened or the output file cannot be opened or
 * written.
 */
function generateFirstChars() {
	$file = fopen( "{$this->dataDir}/allkeys.txt", 'r' );
	if ( !$file ) {
		$this->error( "Unable to open allkeys.txt" );
		exit( 1 );
	}
	global $IP;
	$outFile = fopen( "$IP/serialized/first-letters-root.ser", 'w' );
	if ( !$outFile ) {
		fclose( $file );
		$this->error( "Unable to open output file first-letters-root.ser" );
		exit( 1 );
	}

	// Tertiary weight strings that make a character a preferred group header
	$goodTertiaryWeights = [
		sprintf( '.%04X', self::NORMAL_UPPERCASE ) => true,
		sprintf( '.%04X', self::NORMAL_HIRAGANA ) => true,
	];
	$goodTertiaryChars = [];

	// For each character with an entry in allkeys.txt, overwrite the implicit
	// entry in $this->weights that came from the UCD.
	// Also gather a list of tertiary weights, for use in selecting the group header
	while ( false !== ( $line = fgets( $file ) ) ) {
		// We're only interested in single-character weights, pick them out with a regex
		$line = trim( $line );
		if ( !preg_match( '/^([0-9A-F]+)\s*;\s*([^#]*)/', $line, $lineMatch ) ) {
			continue;
		}

		$cp = hexdec( $lineMatch[1] );
		if ( !isset( $this->weights[$cp] ) ) {
			// Non-printable, ignore
			continue;
		}

		$allWeights = trim( $lineMatch[2] );
		$primary = '';
		$tertiary = '';
		foreach ( StringUtils::explode( '[', $allWeights ) as $weightStr ) {
			preg_match_all( '/[*.]([0-9A-F]+)/', $weightStr, $weightMatch );
			$levels = $weightMatch[1];
			if ( !$levels ) {
				continue;
			}
			if ( $levels[0] !== '0000' ) {
				$primary .= '.' . $levels[0];
			}
			// Guard against a collation element with fewer than three levels
			if ( isset( $levels[2] ) && $levels[2] !== '0000' ) {
				$tertiary .= '.' . $levels[2];
			}
		}
		$this->weights[$cp] = $primary;
		if ( isset( $goodTertiaryWeights[$tertiary] ) ) {
			$goodTertiaryChars[$cp] = true;
		}
	}
	fclose( $file );

	// Identify groups of characters with the same primary weight.
	// The weights are sorted first so that the members of each group are
	// appended in sorted order.
	$this->groups = [];
	asort( $this->weights, SORT_STRING );
	foreach ( $this->weights as $cp => $weight ) {
		$this->groups[$weight][] = $cp;
	}

	// If one character has a given primary weight sequence, and a second
	// character has a longer primary weight sequence with an initial
	// portion equal to the first character, then remove the second
	// character. This avoids having characters like U+A732 (double A)
	// polluting the basic latin sort area.
	foreach ( $this->groups as $weight => $group ) {
		if ( preg_match( '/(\.[0-9A-F]*)\./', $weight, $prefixMatch )
			&& isset( $this->groups[$prefixMatch[1]] )
		) {
			unset( $this->groups[$weight] );
		}
	}

	ksort( $this->groups, SORT_STRING );

	// Identify the header character in each group
	$headerChars = [];
	$prevChar = "\000";
	$tertiaryCollator = new Collator( 'root' );
	$primaryCollator = new Collator( 'root' );
	$primaryCollator->setStrength( Collator::PRIMARY );
	$numOutOfOrder = 0;
	foreach ( $this->groups as $weight => $group ) {
		$uncomposedChars = [];
		$goodChars = [];
		foreach ( $group as $cp ) {
			if ( isset( $goodTertiaryChars[$cp] ) ) {
				$goodChars[] = $cp;
			}
			if ( !isset( $this->mappedChars[$cp] ) ) {
				$uncomposedChars[] = $cp;
			}
		}

		// Preference order: good tertiary weight and no decomposition mapping,
		// then any character without a decomposition mapping, then anything
		$x = array_intersect( $goodChars, $uncomposedChars );
		if ( !$x ) {
			$x = $uncomposedChars;
			if ( !$x ) {
				$x = $group;
			}
		}

		// Use ICU to pick the lowest sorting character in the selection
		// NOTE(review): $x holds integer code points rather than strings;
		// confirm that Collator::sort() orders them as intended here.
		$tertiaryCollator->sort( $x );
		$cp = $x[0];

		$char = UtfNormal\Utils::codepointToUtf8( $cp );
		$headerChars[] = $char;
		// Count headers that do not sort strictly after their predecessor
		if ( $primaryCollator->compare( $char, $prevChar ) <= 0 ) {
			$numOutOfOrder++;
		}
		$prevChar = $char;

		if ( $this->debugOutFile ) {
			fwrite( $this->debugOutFile, sprintf( "%05X %s %s (%s)\n", $cp, $weight, $char,
				implode( ' ', array_map( 'UtfNormal\Utils::codepointToUtf8', $group ) ) ) );
		}
	}

	print "Out of order: $numOutOfOrder / " . count( $headerChars ) . "\n";

	// Write the result and release the handle; report a failed write
	// instead of silently leaving a truncated file behind
	$written = fwrite( $outFile, serialize( $headerChars ) );
	fclose( $outFile );
	if ( $written === false ) {
		$this->error( "Unable to write output file first-letters-root.ser" );
		exit( 1 );
	}
}
348}
349
351 public $fileName;
352 public $callback;
354 public $xml;
355 public $blocks = [];
357
/**
 * @param string $fileName Path of the UCD XML file; it is opened later,
 *   each time open() is called
 */
public function __construct( $fileName ) {
	$this->fileName = $fileName;
}
361
/**
 * Walk the <repertoire> section of the file and invoke the callback once
 * per code point described by each <char> element (see handleChar()).
 *
 * Attributes of an enclosing <group> element are remembered while inside
 * that group so that handleChar() can merge them into each character.
 *
 * @param callable $callback Called with one argument, the attribute array
 *   of a single code point
 */
public function readChars( $callback ) {
	$this->getBlocks();
	$this->currentBlock = reset( $this->blocks );
	// Start with no inherited attributes, so that a <char> met before any
	// <group> still merges with an array in handleChar()
	$this->groupAttrs = [];
	$xml = $this->open();
	$this->callback = $callback;

	// Skip sibling sections until the <repertoire> element is reached
	while ( $xml->name !== 'repertoire' && $xml->next() ) {
		// advancing only
	}

	while ( $xml->read() ) {
		if ( $xml->nodeType === XMLReader::ELEMENT ) {
			if ( $xml->name === 'group' ) {
				$this->groupAttrs = $this->readAttributes();
			} elseif ( $xml->name === 'char' ) {
				$this->handleChar();
			}
		} elseif ( $xml->nodeType === XMLReader::END_ELEMENT ) {
			if ( $xml->name === 'group' ) {
				// Leaving the group: its attributes no longer apply
				$this->groupAttrs = [];
			}
		}
	}
	$xml->close();
}
385
/**
 * Open the file with a fresh XMLReader and position it just inside the
 * <ucd> root element.
 *
 * @return XMLReader The reader, also stored in $this->xml
 * @throws MWException If the file cannot be opened
 */
protected function open() {
	$this->xml = new XMLReader;
	// The XMLReader object itself is always truthy, so the result of open()
	// is what has to be checked to detect a failure
	if ( !$this->xml->open( $this->fileName ) ) {
		throw new MWException( __METHOD__ . ": unable to open {$this->fileName}" );
	}
	// Advance to the <ucd> element, then step past it
	while ( $this->xml->name !== 'ucd' && $this->xml->read() ) {
		// advancing only
	}
	$this->xml->read();

	return $this->xml;
}
397
/**
 * Collect the attributes of the node the reader is currently positioned on.
 *
 * The reader is left on the last attribute visited.
 *
 * @return array Attribute values keyed by attribute name
 */
protected function readAttributes() {
	$reader = $this->xml;
	for ( $collected = []; $reader->moveToNextAttribute(); ) {
		$collected[$reader->name] = $reader->value;
	}

	return $collected;
}
411
/**
 * Process the <char> element the reader is positioned on.
 *
 * The element's own attributes are merged over those of the enclosing
 * group (own attributes win). The element describes either a single code
 * point ('cp') or a range ('first-cp' to 'last-cp'); the callback is
 * invoked once per code point with:
 *  - 'cp' set to the code point as an upper-case hexadecimal string of at
 *    least four digits,
 *  - '#' in the 'na' and 'na1' attributes replaced by that string,
 *  - 'block' set to the name of the containing block, when one is found.
 *
 * Elements carrying neither 'cp' nor a complete range are ignored.
 */
protected function handleChar() {
	$groupAttrs = is_array( $this->groupAttrs ) ? $this->groupAttrs : [];
	$attrs = $this->readAttributes() + $groupAttrs;
	if ( isset( $attrs['cp'] ) ) {
		$first = $last = hexdec( $attrs['cp'] );
	} elseif ( isset( $attrs['first-cp'] ) && isset( $attrs['last-cp'] ) ) {
		$first = hexdec( $attrs['first-cp'] );
		$last = hexdec( $attrs['last-cp'] );
		unset( $attrs['first-cp'], $attrs['last-cp'] );
	} else {
		// No code point information at all, nothing to report
		return;
	}

	for ( $cp = $first; $cp <= $last; $cp++ ) {
		$hexCp = sprintf( "%04X", $cp );

		// Work on a per-code-point copy: substituting '#' in the shared
		// array would consume the placeholder on the first iteration and
		// give every later code point of a range the first one's name
		$charAttrs = $attrs;
		foreach ( [ 'na', 'na1' ] as $nameProp ) {
			if ( isset( $charAttrs[$nameProp] ) ) {
				$charAttrs[$nameProp] = str_replace( '#', $hexCp, $charAttrs[$nameProp] );
			}
		}

		// Advance the block cursor until it reaches or passes this code point.
		// This relies on code points arriving in ascending order.
		while ( $this->currentBlock ) {
			if ( $cp < $this->currentBlock[0] ) {
				// In a gap before the next block: no block for this code point
				break;
			} elseif ( $cp <= $this->currentBlock[1] ) {
				$charAttrs['block'] = key( $this->blocks );
				break;
			} else {
				$this->currentBlock = next( $this->blocks );
			}
		}

		$charAttrs['cp'] = $hexCp;
		call_user_func( $this->callback, $charAttrs );
	}
}
446
/**
 * Return the block table, reading it from the file on first use.
 *
 * @return array Map of block name to [ first code point, last code point ],
 *   both as numbers
 */
public function getBlocks() {
	if ( !$this->blocks ) {
		$reader = $this->open();

		// Move forward to the <blocks> element
		while ( $reader->name !== 'blocks' && $reader->read() ) {
			// advancing only
		}

		while ( $reader->read() ) {
			$isBlockElement = $reader->nodeType == XMLReader::ELEMENT
				&& $reader->name === 'block';
			if ( !$isBlockElement ) {
				continue;
			}
			$blockAttrs = $this->readAttributes();
			$range = [
				hexdec( $blockAttrs['first-cp'] ),
				hexdec( $blockAttrs['last-cp'] ),
			];
			$this->blocks[$blockAttrs['name']] = $range;
		}
		$reader->close();
	}

	return $this->blocks;
}
469}
470
471$maintClass = 'GenerateCollationData';
472require_once RUN_MAINTENANCE_IF_MAIN;
serialize()
$line
Definition cdb.php:58
Generate first letter data files for Collation.php.
__construct()
Default constructor.
$weights
The primary weights, indexed by codepoint.
execute()
Do the actual work.
$dataDir
The directory with source data files in it.
$mappedChars
A hashtable keyed by codepoint, where presence indicates that a character has a decomposition mapping...
const NORMAL_UPPERCASE
Important tertiary weights from UTS #10 section 7.2.
static getICUVersion()
Return the version of ICU library used by PHP's intl extension, or false when the extension is not in...
static isCjk( $codepoint)
Test if a code point is a CJK (Chinese, Japanese, Korean) character.
static getUnicodeVersionForICU()
Return the version of Unicode appropriate for the version of ICU library currently in use,...
MediaWiki exception.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effo...
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
static explode( $separator, $subject)
Workalike for explode() with limited memory usage.
readAttributes()
Read the attributes of the current element node and return them as an array.
print
Definition cleanup.php:99
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add in any and then calling but I prefer the flexibility This should also do the output encoding The system allocates a global one in $wgOut Title Represents the title of an and does all the work of translating among various forms such as plain database key
Definition design.txt:26
do that in ParserLimitReportFormat instead use this to modify the parameters of the image all existing parser cache entries will be invalid To avoid you ll need to handle that somehow(e.g. with the RejectParserCacheValue hook) because MediaWiki won 't do it for you. & $defaults error
Definition hooks.txt:2581
this hook is for auditing only RecentChangesLinked and Watchlist RecentChangesLinked and Watchlist Do not use this to implement individual filters if they are compatible with the ChangesListFilter and ChangesListFilterGroup structure use sub classes of those in conjunction with the ChangesListSpecialPageStructuredFilters hook This hook can be used to implement filters that do not implement that or custom behavior that is not an individual filter e g Watchlist and Watchlist you will want to construct new ChangesListBooleanFilter or ChangesListStringOptionsFilter objects When constructing you specify which group they belong to You can reuse existing groups(accessed through $special->getFilterGroup)
$IP
Definition update.php:3
require_once RUN_MAINTENANCE_IF_MAIN
$last