24 require_once __DIR__ .
'/../Maintenance.php';
50 parent::__construct();
51 $this->
addOption(
'data-dir',
'A directory on the local filesystem ' .
52 'containing allkeys.txt and ucd.all.grouped.xml from unicode.org',
54 $this->
addOption(
'debug-output',
'Filename for sending debug output to',
59 $this->dataDir = $this->
getOption(
'data-dir',
'.' );
61 $allkeysPresent = file_exists(
"{$this->dataDir}/allkeys.txt" );
62 $ucdallPresent = file_exists(
"{$this->dataDir}/ucd.all.grouped.xml" );
66 $allkeysURL =
"https://www.unicode.org/Public/UCA/<Unicode version>/allkeys.txt";
67 $ucdallURL =
"https://www.unicode.org/Public/<Unicode version>/ucdxml/ucd.all.grouped.zip";
69 if ( !$allkeysPresent || !$ucdallPresent ) {
70 $icuVersion = INTL_ICU_VERSION;
75 if ( !$allkeysPresent ) {
76 $error .=
"Unable to find allkeys.txt. "
77 .
"Download it and specify its location with --data-dir=<DIR>. "
80 if ( !$ucdallPresent ) {
81 $error .=
"Unable to find ucd.all.grouped.xml. "
82 .
"Download it, unzip, and specify its location with --data-dir=<DIR>. "
86 $versionKnown =
false;
87 if ( version_compare( $icuVersion,
"4.0",
"<" ) ) {
89 $error .=
"You are using outdated version of ICU ($icuVersion), intended for "
90 . ( $unicodeVersion ?
"Unicode $unicodeVersion" :
"an unknown version of Unicode" )
91 .
"; this file might not be available for it, and it's not supported by MediaWiki. "
92 .
" You are on your own; consider upgrading PHP's intl extension or try "
93 .
"one of the files available at:";
94 } elseif ( version_compare( $icuVersion,
"51.0",
">=" ) ) {
96 $error .=
"You are using ICU $icuVersion, released after this script was last updated. "
97 .
"Check what is the Unicode version it is using at http://site.icu-project.org/download . "
98 .
"It can't be guaranteed everything will work, but appropriate file(s) should "
102 $versionKnown =
true;
103 $error .=
"You are using ICU $icuVersion, intended for "
104 . ( $unicodeVersion ?
"Unicode $unicodeVersion" :
"an unknown version of Unicode" )
105 .
". Appropriate file(s) should be available at:";
109 if ( $versionKnown && $unicodeVersion ) {
110 $allkeysURL = str_replace(
"<Unicode version>",
"$unicodeVersion.0", $allkeysURL );
111 $ucdallURL = str_replace(
"<Unicode version>",
"$unicodeVersion.0", $ucdallURL );
114 if ( !$allkeysPresent ) {
115 $error .=
"* $allkeysURL\n";
117 if ( !$ucdallPresent ) {
118 $error .=
"* $ucdallURL\n";
124 $debugOutFileName = $this->
getOption(
'debug-output' );
125 if ( $debugOutFileName ) {
126 $this->debugOutFile = fopen( $debugOutFileName,
'w' );
127 if ( !$this->debugOutFile ) {
128 $this->
fatalError(
"Unable to open debug output file for writing" );
136 $uxr =
new UcdXmlReader(
"{$this->dataDir}/ucd.all.grouped.xml" );
137 $uxr->readChars( [ $this,
'charCallback' ] );
144 $category = substr( $data[
'gc'], 0, 1 );
145 if ( strpos(
'LNPS', $category ) ===
false
146 && $data[
'cp'] !==
'0020'
150 $cp = hexdec( $data[
'cp'] );
160 if ( $data[
'block'] ==
'Hangul Syllables' ) {
165 if ( $data[
'UIdeo'] ===
'Y' ) {
166 if ( $data[
'block'] ==
'CJK Unified Ideographs'
167 || $data[
'block'] ==
'CJK Compatibility Ideographs'
176 $a =
$base + ( $cp >> 15 );
177 $b = ( $cp & 0x7fff ) | 0x8000;
179 $this->weights[$cp] = sprintf(
".%04X.%04X", $a, $b );
181 if ( $data[
'dm'] !==
'#' ) {
182 $this->mappedChars[$cp] =
true;
185 if ( $cp % 4096 == 0 ) {
186 print
"{$data['cp']}\n";
191 $file = fopen(
"{$this->dataDir}/allkeys.txt",
'r' );
193 $this->
fatalError(
"Unable to open allkeys.txt" );
196 $goodTertiaryChars = [];
201 while ( (
$line = fgets(
$file ) ) !==
false ) {
204 if ( !preg_match(
'/^([0-9A-F]+)\s*;\s*([^#]*)/',
$line, $m ) ) {
208 $cp = hexdec( $m[1] );
209 $allWeights = trim( $m[2] );
213 if ( !isset( $this->weights[$cp] ) ) {
218 if ( preg_match_all(
'/[*.]([0-9A-F]+)/', $weightStr, $m ) ) {
219 if ( $m[1][0] !==
'0000' ) {
220 $primary .=
'.' . $m[1][0];
222 if ( $m[1][2] !==
'0000' ) {
223 $tertiary .=
'.' . $m[1][2];
227 $this->weights[$cp] = $primary;
228 if ( $tertiary ===
'.0008'
229 || $tertiary ===
'.000E'
231 $goodTertiaryChars[$cp] =
true;
238 asort( $this->weights, SORT_STRING );
239 $prevWeight = reset( $this->weights );
241 foreach ( $this->weights as $cp => $weight ) {
242 if ( $weight !== $prevWeight ) {
243 $this->groups[$prevWeight] = $group;
244 $prevWeight = $weight;
245 $group = $this->groups[$weight] ?? [];
250 $this->groups[$prevWeight] = $group;
259 foreach ( $this->groups as $weight => $group ) {
260 if ( preg_match(
'/(\.[0-9A-F]*)\./', $weight, $m ) ) {
261 if ( isset( $this->groups[$m[1]] ) ) {
262 unset( $this->groups[$weight] );
267 ksort( $this->groups, SORT_STRING );
272 $tertiaryCollator =
new Collator(
'root' );
273 $primaryCollator =
new Collator(
'root' );
274 $primaryCollator->setStrength( Collator::PRIMARY );
276 foreach ( $this->groups as $weight => $group ) {
277 $uncomposedChars = [];
279 foreach ( $group as $cp ) {
280 if ( isset( $goodTertiaryChars[$cp] ) ) {
283 if ( !isset( $this->mappedChars[$cp] ) ) {
284 $uncomposedChars[] = $cp;
287 $x = array_intersect( $goodChars, $uncomposedChars );
289 $x = $uncomposedChars;
296 $tertiaryCollator->sort( $x );
299 $char = UtfNormal\Utils::codepointToUtf8( $cp );
300 $headerChars[] = $char;
301 if ( $primaryCollator->compare( $char, $prevChar ) <= 0 ) {
306 if ( $this->debugOutFile ) {
307 fwrite( $this->debugOutFile, sprintf(
"%05X %s %s (%s)\n", $cp, $weight, $char,
308 implode(
' ', array_map( [ UtfNormal\Utils::class,
'codepointToUtf8' ], $group ) ) ) );
312 print
"Out of order: $numOutOfOrder / " . count( $headerChars ) .
"\n";
317 "$IP/includes/collation/data/first-letters-root.php",
318 $writer->create( $headerChars,
'File created by generateCollationData.php' )
320 echo
"first-letters-root: file written.\n";
338 $this->currentBlock = reset( $this->blocks );
342 while (
$xml->name !==
'repertoire' &&
$xml->next() );
344 while (
$xml->read() ) {
345 if (
$xml->nodeType == XMLReader::ELEMENT ) {
346 if (
$xml->name ===
'group' ) {
348 } elseif (
$xml->name ===
'char' ) {
351 } elseif (
$xml->nodeType === XMLReader::END_ELEMENT ) {
352 if (
$xml->name ===
'group' ) {
353 $this->groupAttrs = [];
361 $this->xml =
new XMLReader;
362 $this->xml->open( $this->fileName );
364 throw new MWException( __METHOD__ .
": unable to open {$this->fileName}" );
366 while ( $this->xml->name !==
'ucd' && $this->xml->read() );
379 while ( $this->xml->moveToNextAttribute() ) {
380 $attrs[$this->xml->name] = $this->xml->value;
388 if ( isset( $attrs[
'cp'] ) ) {
389 $first = $last = hexdec( $attrs[
'cp'] );
391 $first = hexdec( $attrs[
'first-cp'] );
392 $last = hexdec( $attrs[
'last-cp'] );
393 unset( $attrs[
'first-cp'] );
394 unset( $attrs[
'last-cp'] );
397 for ( $cp = $first; $cp <= $last; $cp++ ) {
398 $hexCp = sprintf(
"%04X", $cp );
399 foreach ( [
'na',
'na1' ] as $nameProp ) {
400 if ( isset( $attrs[$nameProp] ) ) {
401 $attrs[$nameProp] = str_replace(
'#', $hexCp, $attrs[$nameProp] );
405 while ( $this->currentBlock ) {
406 if ( $cp < $this->currentBlock[0] ) {
408 } elseif ( $cp <= $this->currentBlock[1] ) {
409 $attrs[
'block'] = key( $this->blocks );
412 $this->currentBlock = next( $this->blocks );
416 $attrs[
'cp'] = $hexCp;
417 call_user_func( $this->callback, $attrs );
422 if ( $this->blocks ) {
427 while (
$xml->name !==
'blocks' &&
$xml->read() );
429 while (
$xml->read() ) {
430 if (
$xml->nodeType == XMLReader::ELEMENT ) {
431 if (
$xml->name ===
'block' ) {
433 $first = hexdec( $attrs[
'first-cp'] );
434 $last = hexdec( $attrs[
'last-cp'] );
435 $this->blocks[$attrs[
'name']] = [ $first, $last ];
446 require_once RUN_MAINTENANCE_IF_MAIN;
if(!defined( 'MEDIAWIKI')) if(ini_get( 'mbstring.func_overload')) if(!defined( 'MW_ENTRY_POINT')) global $IP
Environment checks.
Generate first letter data files for Collation.php.
__construct()
Default constructor.
$weights
The primary weights, indexed by codepoint.
execute()
Do the actual work.
$dataDir
The directory with source data files in it.
$mappedChars
A hashtable keyed by codepoint, where presence indicates that a character has a decomposition mapping...
static isCjk( $codepoint)
Test if a code point is a CJK (Chinese, Japanese, Korean) character.
static getUnicodeVersionForICU()
Return the version of Unicode appropriate for the version of ICU library currently in use,...
Abstract maintenance class for quickly writing and churning out maintenance scripts with minimal effort.
addOption( $name, $description, $required=false, $withArg=false, $shortName=false, $multiOccurrence=false)
Add a parameter to the script.
getOption( $name, $default=null)
Get an option, or return the default.
fatalError( $msg, $exitCode=1)
Output a message and terminate the current script.
static explode( $separator, $subject)
Workalike for explode() with limited memory usage.
readAttributes()
Read the attributes of the current element node and return them as an array.
if(PHP_SAPI !='cli-server') if(!isset( $_SERVER['SCRIPT_FILENAME'])) $file
Item class for a filearchive table row.