MediaWiki REL1_31
generateCollationData.php
Go to the documentation of this file.
1<?php
24require_once __DIR__ . '/../Maintenance.php';
25
33 public $dataDir;
34
36 public $weights;
37
44
46
50 const NORMAL_UPPERCASE = 0x08;
51 const NORMAL_HIRAGANA = 0x0E;
52
/**
 * Register the command-line options understood by this script.
 *
 * Both options are optional and each takes an argument.
 */
public function __construct() {
	parent::__construct();

	$dataDirHelp = 'A directory on the local filesystem ' .
		'containing allkeys.txt and ucd.all.grouped.xml from unicode.org';
	$debugOutputHelp = 'Filename for sending debug output to';

	// Arguments: name, description, required, withArg
	$this->addOption( 'data-dir', $dataDirHelp, false, true );
	$this->addOption( 'debug-output', $debugOutputHelp, false, true );
}
61
/**
 * Script entry point.
 *
 * Checks that allkeys.txt and ucd.all.grouped.xml exist in the data
 * directory. If either is missing, a message is built that explains what
 * to download (with URLs tailored to the detected ICU/Unicode version
 * when possible) and the script terminates via fatalError(). Otherwise
 * the optional debug output file is opened, the UCD is loaded and the
 * first-letter data is generated.
 */
public function execute() {
	$this->dataDir = $this->getOption( 'data-dir', '.' );

	$haveAllkeys = file_exists( "{$this->dataDir}/allkeys.txt" );
	$haveUcdall = file_exists( "{$this->dataDir}/ucd.all.grouped.xml" );

	if ( !( $haveAllkeys && $haveUcdall ) ) {
		// As of January 2013, these links work for all versions of Unicode
		// between 5.1 and 6.2, inclusive.
		$allkeysLink = "http://www.unicode.org/Public/UCA/<Unicode version>/allkeys.txt";
		$ucdallLink = "http://www.unicode.org/Public/<Unicode version>/ucdxml/ucd.all.grouped.zip";

		$icu = IcuCollation::getICUVersion();
		$unicode = IcuCollation::getUnicodeVersionForICU();
		$unicodeLabel = $unicode ? "Unicode $unicode" : "an unknown version of Unicode";

		$message = "";

		// First say which of the files are missing.
		if ( !$haveAllkeys ) {
			$message .= "Unable to find allkeys.txt. "
				. "Download it and specify its location with --data-dir=<DIR>. "
				. "\n\n";
		}
		if ( !$haveUcdall ) {
			$message .= "Unable to find ucd.all.grouped.xml. "
				. "Download it, unzip, and specify its location with --data-dir=<DIR>. "
				. "\n\n";
		}

		// Then give version-specific advice on where to get them.
		if ( !$icu ) {
			// Unknown version - either very old intl,
			// or PHP < 5.3.7 which does not expose this information
			$message .= "As MediaWiki could not determine the version of ICU library used by your PHP's "
				. "intl extension it can't suggest which file version to download. "
				. "This can be caused by running a very old version of intl or PHP < 5.3.7. "
				. "If you are sure everything is all right, find out the ICU version "
				. "by running phpinfo(), check what is the Unicode version it is using "
				. "at http://site.icu-project.org/download, then try finding appropriate data file(s) at:";
		} elseif ( version_compare( $icu, "4.0", "<" ) ) {
			// Extra old version
			$message .= "You are using outdated version of ICU ($icu), intended for "
				. $unicodeLabel
				. "; this file might not be avalaible for it, and it's not supported by MediaWiki. "
				. " You are on your own; consider upgrading PHP's intl extension or try "
				. "one of the files available at:";
		} elseif ( version_compare( $icu, "51.0", ">=" ) ) {
			// Extra recent version
			$message .= "You are using ICU $icu, released after this script was last updated. "
				. "Check what is the Unicode version it is using at http://site.icu-project.org/download . "
				. "It can't be guaranteed everything will work, but appropriate file(s) should "
				. "be available at:";
		} else {
			// ICU 4.0 to 50.x: the only case in which the Unicode version
			// is trusted enough to be filled into the download URLs.
			$message .= "You are using ICU $icu, intended for "
				. $unicodeLabel
				. ". Appropriate file(s) should be available at:";

			if ( $unicode ) {
				list( $allkeysLink, $ucdallLink ) = str_replace(
					"<Unicode version>",
					$unicode . ".0",
					[ $allkeysLink, $ucdallLink ]
				);
			}
		}
		$message .= "\n";

		// Finally list a URL for each missing file.
		if ( !$haveAllkeys ) {
			$message .= "* $allkeysLink\n";
		}
		if ( !$haveUcdall ) {
			$message .= "* $ucdallLink\n";
		}

		$this->fatalError( $message );
	}

	$debugPath = $this->getOption( 'debug-output' );
	if ( $debugPath ) {
		$handle = fopen( $debugPath, 'w' );
		$this->debugOutFile = $handle;
		if ( !$handle ) {
			$this->fatalError( "Unable to open debug output file for writing" );
		}
	}

	$this->loadUcd();
	$this->generateFirstChars();
}
147
/**
 * Read ucd.all.grouped.xml from the data directory, passing every
 * character it describes to charCallback().
 */
function loadUcd() {
	$ucdPath = "{$this->dataDir}/ucd.all.grouped.xml";
	$reader = new UcdXmlReader( $ucdPath );
	$reader->readChars( [ $this, 'charCallback' ] );
}
152
/**
 * Per-character callback registered with UcdXmlReader by loadUcd().
 *
 * Records an implicit weight for the character in $this->weights and
 * notes in $this->mappedChars whether its 'dm' attribute is anything
 * other than '#'. Characters that are filtered out are ignored.
 *
 * @param array $data Attributes of one character; this method reads the
 *   keys 'gc', 'cp' (hex string), 'block', 'UIdeo' and 'dm'
 */
function charCallback( $data ) {
	// Keep only printable characters (category letters L, N, P, S),
	// plus the normal space (U+0020), since people like to use that
	// as a fake "no header" symbol.
	$category = substr( $data['gc'], 0, 1 );
	$wanted = strpos( 'LNPS', $category ) !== false
		|| $data['cp'] === '0020';
	if ( !$wanted ) {
		return;
	}

	$cp = hexdec( $data['cp'] );

	// The CJK ideograph blocks are skipped as an optimisation measure;
	// UCA doesn't sort them properly anyway, without tailoring.
	if ( IcuCollation::isCjk( $cp ) ) {
		return;
	}

	// Composed Hangul syllables are skipped too: the bare Jamo are used
	// as first letters instead.
	if ( $data['block'] == 'Hangul Syllables' ) {
		return;
	}

	// Implicit weight calculation per UTS #10 v6.0.0, sec 7.1.3
	if ( $data['UIdeo'] !== 'Y' ) {
		$base = 0xFBC0;
	} elseif ( $data['block'] == 'CJK Unified Ideographs'
		|| $data['block'] == 'CJK Compatibility Ideographs'
	) {
		$base = 0xFB40;
	} else {
		$base = 0xFB80;
	}
	$high = $base + ( $cp >> 15 );
	$low = ( $cp & 0x7fff ) | 0x8000;
	$this->weights[$cp] = sprintf( ".%04X.%04X", $high, $low );

	if ( $data['dm'] !== '#' ) {
		$this->mappedChars[$cp] = true;
	}

	// Progress indicator: one line of output per 4096 code points.
	if ( $cp % 4096 == 0 ) {
		print $data['cp'] . "\n";
	}
}
202
204 $file = fopen( "{$this->dataDir}/allkeys.txt", 'r' );
205 if ( !$file ) {
206 $this->fatalError( "Unable to open allkeys.txt" );
207 }
208 global $IP;
209 $outFile = fopen( "$IP/serialized/first-letters-root.ser", 'w' );
210 if ( !$outFile ) {
211 $this->fatalError( "Unable to open output file first-letters-root.ser" );
212 }
213
214 $goodTertiaryChars = [];
215
216 // For each character with an entry in allkeys.txt, overwrite the implicit
217 // entry in $this->weights that came from the UCD.
218 // Also gather a list of tertiary weights, for use in selecting the group header
219 while ( false !== ( $line = fgets( $file ) ) ) {
220 // We're only interested in single-character weights, pick them out with a regex
221 $line = trim( $line );
222 if ( !preg_match( '/^([0-9A-F]+)\s*;\s*([^#]*)/', $line, $m ) ) {
223 continue;
224 }
225
226 $cp = hexdec( $m[1] );
227 $allWeights = trim( $m[2] );
228 $primary = '';
229 $tertiary = '';
230
231 if ( !isset( $this->weights[$cp] ) ) {
232 // Non-printable, ignore
233 continue;
234 }
235 foreach ( StringUtils::explode( '[', $allWeights ) as $weightStr ) {
236 preg_match_all( '/[*.]([0-9A-F]+)/', $weightStr, $m );
237 if ( !empty( $m[1] ) ) {
238 if ( $m[1][0] !== '0000' ) {
239 $primary .= '.' . $m[1][0];
240 }
241 if ( $m[1][2] !== '0000' ) {
242 $tertiary .= '.' . $m[1][2];
243 }
244 }
245 }
246 $this->weights[$cp] = $primary;
247 if ( $tertiary === '.0008'
248 || $tertiary === '.000E'
249 ) {
250 $goodTertiaryChars[$cp] = true;
251 }
252 }
253 fclose( $file );
254
255 // Identify groups of characters with the same primary weight
256 $this->groups = [];
257 asort( $this->weights, SORT_STRING );
258 $prevWeight = reset( $this->weights );
259 $group = [];
260 foreach ( $this->weights as $cp => $weight ) {
261 if ( $weight !== $prevWeight ) {
262 $this->groups[$prevWeight] = $group;
263 $prevWeight = $weight;
264 if ( isset( $this->groups[$weight] ) ) {
265 $group = $this->groups[$weight];
266 } else {
267 $group = [];
268 }
269 }
270 $group[] = $cp;
271 }
272 if ( $group ) {
273 $this->groups[$prevWeight] = $group;
274 }
275
276 // If one character has a given primary weight sequence, and a second
277 // character has a longer primary weight sequence with an initial
278 // portion equal to the first character, then remove the second
279 // character. This avoids having characters like U+A732 (double A)
280 // polluting the basic latin sort area.
281
282 foreach ( $this->groups as $weight => $group ) {
283 if ( preg_match( '/(\.[0-9A-F]*)\./', $weight, $m ) ) {
284 if ( isset( $this->groups[$m[1]] ) ) {
285 unset( $this->groups[$weight] );
286 }
287 }
288 }
289
290 ksort( $this->groups, SORT_STRING );
291
292 // Identify the header character in each group
293 $headerChars = [];
294 $prevChar = "\000";
295 $tertiaryCollator = new Collator( 'root' );
296 $primaryCollator = new Collator( 'root' );
297 $primaryCollator->setStrength( Collator::PRIMARY );
298 $numOutOfOrder = 0;
299 foreach ( $this->groups as $weight => $group ) {
300 $uncomposedChars = [];
301 $goodChars = [];
302 foreach ( $group as $cp ) {
303 if ( isset( $goodTertiaryChars[$cp] ) ) {
304 $goodChars[] = $cp;
305 }
306 if ( !isset( $this->mappedChars[$cp] ) ) {
307 $uncomposedChars[] = $cp;
308 }
309 }
310 $x = array_intersect( $goodChars, $uncomposedChars );
311 if ( !$x ) {
312 $x = $uncomposedChars;
313 if ( !$x ) {
314 $x = $group;
315 }
316 }
317
318 // Use ICU to pick the lowest sorting character in the selection
319 $tertiaryCollator->sort( $x );
320 $cp = $x[0];
321
322 $char = UtfNormal\Utils::codepointToUtf8( $cp );
323 $headerChars[] = $char;
324 if ( $primaryCollator->compare( $char, $prevChar ) <= 0 ) {
325 $numOutOfOrder++;
326 }
327 $prevChar = $char;
328
329 if ( $this->debugOutFile ) {
330 fwrite( $this->debugOutFile, sprintf( "%05X %s %s (%s)\n", $cp, $weight, $char,
331 implode( ' ', array_map( 'UtfNormal\Utils::codepointToUtf8', $group ) ) ) );
332 }
333 }
334
335 print "Out of order: $numOutOfOrder / " . count( $headerChars ) . "\n";
336
337 fwrite( $outFile, serialize( $headerChars ) );
338 }
339}
340
342 public $fileName;
343 public $callback;
345 public $xml;
346 public $blocks = [];
348
/**
 * @param string $fileName Path of the UCD XML file that open() will read
 */
function __construct( $fileName ) {
	// Only remembered here; the file itself is opened lazily by open().
	$this->fileName = $fileName;
}
352
/**
 * Walk the repertoire section of the file and report every character.
 *
 * Attributes of an enclosing "group" element are remembered while the
 * group is open; each "char" element is passed to handleChar(), which
 * invokes the callback.
 *
 * @param callable $callback Invoked by handleChar() with an attribute array
 */
public function readChars( $callback ) {
	// Block ranges are loaded first so that handleChar() can tag
	// each code point with the block it falls in.
	$this->getBlocks();
	$this->currentBlock = reset( $this->blocks );
	$reader = $this->open();
	$this->callback = $callback;

	// Advance to the "repertoire" element (or to the end of the document).
	while ( $reader->name !== 'repertoire' && $reader->next() ) {
		// Nothing to do; the condition moves the cursor.
	}

	while ( $reader->read() ) {
		$onGroup = $reader->name === 'group';

		if ( $reader->nodeType == XMLReader::ELEMENT ) {
			if ( $onGroup ) {
				$this->groupAttrs = $this->readAttributes();
			} elseif ( $reader->name === 'char' ) {
				$this->handleChar();
			}
		} elseif ( $reader->nodeType === XMLReader::END_ELEMENT && $onGroup ) {
			// Leaving a group: its attributes no longer apply.
			$this->groupAttrs = [];
		}
	}

	$reader->close();
}
376
/**
 * Open the XML file and position the reader just past the "ucd" element.
 *
 * A new XMLReader is created on every call and stored in $this->xml.
 *
 * @return XMLReader The freshly opened reader
 * @throws MWException If the file cannot be opened
 */
protected function open() {
	$this->xml = new XMLReader;

	// XMLReader::open() signals failure through its return value. The
	// reader object itself is always truthy, so testing the object
	// (as was done before) could never detect a failed open.
	if ( !$this->xml->open( $this->fileName ) ) {
		throw new MWException( __METHOD__ . ": unable to open {$this->fileName}" );
	}

	// Advance to the "ucd" element, then step once more past it.
	while ( $this->xml->name !== 'ucd' && $this->xml->read() ) {
		// Nothing to do; the condition moves the cursor.
	}
	$this->xml->read();

	return $this->xml;
}
388
/**
 * Read the attributes of the current element node and return them as an
 * array.
 *
 * The reader is left positioned on the last attribute visited.
 *
 * @return array Attribute values keyed by attribute name
 */
protected function readAttributes() {
	$reader = $this->xml;
	$collected = [];

	while ( true ) {
		if ( !$reader->moveToNextAttribute() ) {
			break;
		}
		$collected[$reader->name] = $reader->value;
	}

	return $collected;
}
402
/**
 * Process the current "char" element, invoking the callback once for
 * every code point it covers.
 *
 * The element's own attributes take precedence over those of the
 * enclosing group. An element either names a single code point ('cp')
 * or a range ('first-cp' to 'last-cp', inclusive). For each code point
 * the callback receives the attribute array with:
 *  - 'cp' set to the code point as an upper-case hex string of at
 *    least four digits,
 *  - '#' in 'na' / 'na1' replaced by that hex string,
 *  - 'block' set to the name of the containing block, when one is found.
 */
protected function handleChar() {
	$attrs = $this->readAttributes() + $this->groupAttrs;
	if ( isset( $attrs['cp'] ) ) {
		$first = $last = hexdec( $attrs['cp'] );
	} else {
		$first = hexdec( $attrs['first-cp'] );
		$last = hexdec( $attrs['last-cp'] );
		unset( $attrs['first-cp'], $attrs['last-cp'] );
	}

	// Keep the unexpanded name templates. Substituting directly into
	// $attrs would consume the '#' placeholder on the first iteration,
	// so every later code point of a range would get the first one's name.
	$nameTemplates = [];
	foreach ( [ 'na', 'na1' ] as $nameProp ) {
		if ( isset( $attrs[$nameProp] ) ) {
			$nameTemplates[$nameProp] = $attrs[$nameProp];
		}
	}

	for ( $cp = $first; $cp <= $last; $cp++ ) {
		$hexCp = sprintf( "%04X", $cp );
		foreach ( $nameTemplates as $nameProp => $template ) {
			$attrs[$nameProp] = str_replace( '#', $hexCp, $template );
		}

		// Move the block cursor forward until it reaches a block that
		// contains $cp or starts after it. The cursor only ever moves
		// forward, so this relies on code points arriving in ascending order.
		while ( $this->currentBlock ) {
			if ( $cp < $this->currentBlock[0] ) {
				break;
			} elseif ( $cp <= $this->currentBlock[1] ) {
				$attrs['block'] = key( $this->blocks );
				break;
			} else {
				$this->currentBlock = next( $this->blocks );
			}
		}

		$attrs['cp'] = $hexCp;
		call_user_func( $this->callback, $attrs );
	}
}
437
/**
 * Get the block ranges declared in the file, reading them on first use.
 *
 * The result is cached in $this->blocks; later calls return the cached
 * array without reopening the file.
 *
 * @return array Map of block name to [ first code point, last code point ]
 */
public function getBlocks() {
	if ( !$this->blocks ) {
		$reader = $this->open();

		// Advance to the "blocks" element (or to the end of the document).
		while ( $reader->name !== 'blocks' && $reader->read() ) {
			// Nothing to do; the condition moves the cursor.
		}

		while ( $reader->read() ) {
			$isBlockElement = $reader->nodeType == XMLReader::ELEMENT
				&& $reader->name === 'block';
			if ( !$isBlockElement ) {
				continue;
			}

			$blockAttrs = $this->readAttributes();
			$this->blocks[$blockAttrs['name']] = [
				hexdec( $blockAttrs['first-cp'] ),
				hexdec( $blockAttrs['last-cp'] ),
			];
		}

		$reader->close();
	}

	return $this->blocks;
}
460}
461
462$maintClass = GenerateCollationData::class;
463require_once RUN_MAINTENANCE_IF_MAIN;
serialize()
$line
Definition cdb.php:59
Generate first letter data files for Collation.php.
__construct()
Default constructor.
$weights
The primary weights, indexed by codepoint.
execute()
Do the actual work.
$dataDir
The directory with source data files in it.
$mappedChars
A hashtable keyed by codepoint, where presence indicates that a character has a decomposition mapping.
const NORMAL_UPPERCASE
Important tertiary weights from UTS #10 section 7.2.
static getICUVersion()
Return the version of ICU library used by PHP's intl extension, or false when the extension is not in...
static isCjk( $codepoint)
Test if a code point is a CJK (Chinese, Japanese, Korean) character.
static getUnicodeVersionForICU()
Return the version of Unicode appropriate for the version of ICU library currently in use,...
MediaWiki exception.
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
static explode( $separator, $subject)
Workalike for explode() with limited memory usage.
readAttributes()
Read the attributes of the current element node and return them as an array.
design txt This is a brief overview of the new design More thorough and up to date information is available on the documentation wiki at etc Handles the details of getting and saving to the user table of the and dealing with sessions and cookies OutputPage Encapsulates the entire HTML page that will be sent in response to any server request It is used by calling its functions to add in any and then calling but I prefer the flexibility This should also do the output encoding The system allocates a global one in $wgOut Title Represents the title of an and does all the work of translating among various forms such as plain database key
Definition design.txt:26
this hook is for auditing only RecentChangesLinked and Watchlist RecentChangesLinked and Watchlist Do not use this to implement individual filters if they are compatible with the ChangesListFilter and ChangesListFilterGroup structure use sub classes of those in conjunction with the ChangesListSpecialPageStructuredFilters hook This hook can be used to implement filters that do not implement that or custom behavior that is not an individual filter e g Watchlist and Watchlist you will want to construct new ChangesListBooleanFilter or ChangesListStringOptionsFilter objects When constructing you specify which group they belong to You can reuse existing groups(accessed through $special->getFilterGroup)
while(( $__line=Maintenance::readconsole()) !==false) print
Definition eval.php:64
$IP
Definition update.php:3
require_once RUN_MAINTENANCE_IF_MAIN
$last