Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 197 |
|
0.00% |
0 / 11 |
CRAP | |
0.00% |
0 / 2 |
| GenerateCollationData | |
0.00% |
0 / 137 |
|
0.00% |
0 / 5 |
2162 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
| execute | |
0.00% |
0 / 31 |
|
0.00% |
0 / 1 |
90 | |||
| loadUcd | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
| charCallback | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
110 | |||
| generateFirstChars | |
0.00% |
0 / 79 |
|
0.00% |
0 / 1 |
650 | |||
| UcdXmlReader | |
0.00% |
0 / 60 |
|
0.00% |
0 / 6 |
992 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| readChars | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
90 | |||
| open | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
20 | |||
| readAttributes | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| handleChar | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
72 | |||
| getBlocks | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
56 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * Maintenance script to generate first letter data files for Collation.php. |
| 4 | * |
| 5 | * @license GPL-2.0-or-later |
| 6 | * @file |
| 7 | * @ingroup MaintenanceLanguage |
| 8 | */ |
| 9 | |
| 10 | // @codeCoverageIgnoreStart |
| 11 | require_once __DIR__ . '/../Maintenance.php'; |
| 12 | // @codeCoverageIgnoreEnd |
| 13 | |
| 14 | use MediaWiki\Maintenance\Maintenance; |
| 15 | use Wikimedia\StaticArrayWriter; |
| 16 | use Wikimedia\StringUtils\StringUtils; |
| 17 | |
/**
 * Generate first letter data files for Collation.php
 *
 * Reads ucd.all.grouped.xml and allkeys.txt from the data directory, groups
 * characters by their primary collation weight, selects one header character
 * per group and writes the resulting list to first-letters-root.php.
 *
 * @ingroup MaintenanceLanguage
 */
class GenerateCollationData extends Maintenance {
	/** @var string The directory with source data files in it */
	public $dataDir;

	/** @var string[] The primary weights (dotted hex strings), indexed by codepoint */
	public $weights = [];

	/**
	 * A hashtable keyed by codepoint, where presence indicates that a character
	 * has a decomposition mapping. This makes it non-preferred for group header
	 * selection.
	 * @var bool[]
	 */
	public $mappedChars = [];

	/** @var resource|null Handle of the debug output file, if one was requested */
	public $debugOutFile;

	/** @var int[][] Lists of codepoints, keyed by primary weight string */
	private $groups;

	public function __construct() {
		parent::__construct();
		$this->addOption( 'data-dir', 'A directory on the local filesystem ' .
			'containing allkeys.txt and ucd.all.grouped.xml from unicode.org',
			false, true );
		$this->addOption( 'debug-output', 'Filename for sending debug output to',
			false, true );
	}

	/**
	 * Check that both input files exist, open the optional debug output file,
	 * then load the UCD and generate the first-letter data.
	 */
	public function execute() {
		$this->dataDir = $this->getOption( 'data-dir', '.' );

		$allkeysPresent = file_exists( "{$this->dataDir}/allkeys.txt" );
		$ucdallPresent = file_exists( "{$this->dataDir}/ucd.all.grouped.xml" );

		if ( !$allkeysPresent || !$ucdallPresent ) {
			$icuVersion = INTL_ICU_VERSION;
			$unicodeVersion = implode( '.', array_slice( IntlChar::getUnicodeVersion(), 0, 3 ) );

			$error = "";

			if ( !$allkeysPresent ) {
				$error .= "Unable to find allkeys.txt. "
					. "Download it and specify its location with --data-dir=<DIR>. "
					. "\n\n";
			}
			if ( !$ucdallPresent ) {
				$error .= "Unable to find ucd.all.grouped.xml. "
					. "Download it, unzip, and specify its location with --data-dir=<DIR>. "
					. "\n\n";
			}

			$error .= "You are using ICU $icuVersion, intended for Unicode $unicodeVersion. "
				. "Appropriate file(s) should be available at:\n";

			$allkeysURL = "https://www.unicode.org/Public/UCA/$unicodeVersion/allkeys.txt";
			$ucdallURL = "https://www.unicode.org/Public/$unicodeVersion/ucdxml/ucd.all.grouped.zip";

			if ( !$allkeysPresent ) {
				$error .= "* $allkeysURL\n";
			}
			if ( !$ucdallPresent ) {
				$error .= "* $ucdallURL\n";
			}

			$this->fatalError( $error );
		}

		$debugOutFileName = $this->getOption( 'debug-output' );
		if ( $debugOutFileName ) {
			$this->debugOutFile = fopen( $debugOutFileName, 'w' );
			if ( !$this->debugOutFile ) {
				$this->fatalError( "Unable to open debug output file for writing" );
			}
		}
		$this->loadUcd();
		$this->generateFirstChars();

		// Release the debug handle once all output has been written.
		if ( $this->debugOutFile ) {
			fclose( $this->debugOutFile );
			$this->debugOutFile = null;
		}
	}

	/**
	 * Stream every character of ucd.all.grouped.xml through charCallback().
	 */
	private function loadUcd() {
		$uxr = new UcdXmlReader( "{$this->dataDir}/ucd.all.grouped.xml" );
		$uxr->readChars( $this->charCallback( ... ) );
	}

	/**
	 * Record an implicit primary weight for one character, unless it is filtered out.
	 *
	 * @param array $data Attributes of the character as passed by UcdXmlReader;
	 *  the keys 'gc', 'cp', 'dm' and (optionally) 'block' are read here.
	 */
	private function charCallback( array $data ) {
		// Skip non-printable characters,
		// but do not skip a normal space (U+0020) since
		// people like to use that as a fake no header symbol.
		$category = substr( $data['gc'], 0, 1 );
		if ( !str_contains( 'LNPS', $category )
			&& $data['cp'] !== '0020'
		) {
			return;
		}
		$cp = hexdec( $data['cp'] );

		// Skip the CJK ideograph blocks, as an optimisation measure.
		// UCA doesn't sort them properly anyway, without tailoring.
		if ( IcuCollation::isCjk( $cp ) ) {
			return;
		}

		// Skip the composed Hangul syllables, we will use the bare Jamo
		// as first letters. The reader omits 'block' for codepoints that
		// fall outside every known block, hence the null fallback.
		if ( ( $data['block'] ?? null ) === 'Hangul Syllables' ) {
			return;
		}

		// Skip characters that mapped to a single character we skipped above.
		// e.g. U+2329 -> U+3008 (from CJK Symbols and Punctuation)
		if ( $data['dm'] !== '#' && !str_contains( $data['dm'], ' ' ) &&
			!isset( $this->weights[ hexdec( $data['dm'] ) ] )
		) {
			return;
		}

		// Calculate implicit weight per UTS #10 v6.0.0, sec 7.1.3
		$a = 0xFBC0 + ( $cp >> 15 );
		$b = ( $cp & 0x7fff ) | 0x8000;

		$this->weights[$cp] = sprintf( ".%04X.%04X", $a, $b );

		if ( $data['dm'] !== '#' ) {
			$this->mappedChars[$cp] = true;
		}

		// Progress indicator: one line every 4096 codepoints
		if ( $cp % 4096 == 0 ) {
			print "{$data['cp']}\n";
		}
	}

	/**
	 * Merge the explicit weights from allkeys.txt into $this->weights, group the
	 * characters by primary weight, choose a header character for each group and
	 * write the list of header characters to first-letters-root.php.
	 */
	private function generateFirstChars() {
		$file = fopen( "{$this->dataDir}/allkeys.txt", 'r' );
		if ( !$file ) {
			$this->fatalError( "Unable to open allkeys.txt" );
		}

		$goodTertiaryChars = [];

		// For each character with an entry in allkeys.txt, overwrite the implicit
		// entry in $this->weights that came from the UCD.
		// Also gather a list of tertiary weights, for use in selecting the group header
		// phpcs:ignore Generic.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition
		while ( ( $line = fgets( $file ) ) !== false ) {
			// We're only interested in single-character weights, pick them out with a regex
			$line = trim( $line );
			if ( !preg_match( '/^([0-9A-F]+)\s*;\s*([^#]*)/', $line, $m ) ) {
				continue;
			}

			$cp = hexdec( $m[1] );
			$allWeights = trim( $m[2] );
			$primary = '';
			$tertiary = '';

			if ( !isset( $this->weights[$cp] ) ) {
				// Non-printable, ignore
				continue;
			}
			foreach ( StringUtils::explode( '[', $allWeights ) as $weightStr ) {
				// Separate match variable so the line-level $m is not clobbered
				if ( preg_match_all( '/[*.]([0-9A-F]+)/', $weightStr, $levelMatch ) ) {
					$levels = $levelMatch[1];
					if ( $levels[0] !== '0000' ) {
						$primary .= '.' . $levels[0];
					}
					// Treat a missing third level like an ignorable one
					if ( ( $levels[2] ?? '0000' ) !== '0000' ) {
						$tertiary .= '.' . $levels[2];
					}
				}
			}
			$this->weights[$cp] = $primary;
			if ( $tertiary === '.0008'
				|| $tertiary === '.000E'
			) {
				$goodTertiaryChars[$cp] = true;
			}
		}
		fclose( $file );

		// Identify groups of characters with the same primary weight
		$this->groups = [];
		asort( $this->weights, SORT_STRING );
		$prevWeight = reset( $this->weights );
		$group = [];
		foreach ( $this->weights as $cp => $weight ) {
			if ( $weight !== $prevWeight ) {
				$this->groups[$prevWeight] = $group;
				$prevWeight = $weight;
				$group = $this->groups[$weight] ?? [];
			}
			$group[] = $cp;
		}
		if ( $group ) {
			$this->groups[$prevWeight] = $group;
		}

		// If one character has a given primary weight sequence, and a second
		// character has a longer primary weight sequence with an initial
		// portion equal to the first character, then remove the second
		// character. This avoids having characters like U+A732 (double A)
		// polluting the basic Latin sort area.

		foreach ( $this->groups as $weight => $group ) {
			if ( preg_match( '/(\.[0-9A-F]*)\./', $weight, $m ) ) {
				if ( isset( $this->groups[$m[1]] ) ) {
					unset( $this->groups[$weight] );
				}
			}
		}

		ksort( $this->groups, SORT_STRING );

		// Identify the header character in each group
		$headerChars = [];
		$prevChar = "\000";
		$tertiaryCollator = new Collator( 'root' );
		$primaryCollator = new Collator( 'root' );
		$primaryCollator->setStrength( Collator::PRIMARY );
		$numOutOfOrder = 0;
		foreach ( $this->groups as $weight => $group ) {
			$uncomposedChars = [];
			$goodChars = [];
			foreach ( $group as $cp ) {
				if ( isset( $goodTertiaryChars[$cp] ) ) {
					$goodChars[] = $cp;
				}
				if ( !isset( $this->mappedChars[$cp] ) ) {
					$uncomposedChars[] = $cp;
				}
			}
			// Preference order: good tertiary weight and uncomposed, then any
			// uncomposed character, then anything in the group.
			$x = array_intersect( $goodChars, $uncomposedChars );
			if ( !$x ) {
				$x = $uncomposedChars;
				if ( !$x ) {
					$x = $group;
				}
			}

			// Use ICU to pick the lowest sorting character in the selection
			// NOTE(review): $x holds integer codepoints rather than strings;
			// confirm that Collator::sort() orders them as intended.
			$tertiaryCollator->sort( $x );
			$cp = $x[0];

			$char = UtfNormal\Utils::codepointToUtf8( $cp );
			$headerChars[] = $char;
			if ( $primaryCollator->compare( $char, $prevChar ) <= 0 ) {
				$numOutOfOrder++;
			}
			$prevChar = $char;

			if ( $this->debugOutFile ) {
				fwrite( $this->debugOutFile, sprintf( "%05X %s %s (%s)\n", $cp, $weight, $char,
					implode( ' ', array_map( [ UtfNormal\Utils::class, 'codepointToUtf8' ], $group ) ) ) );
			}
		}

		print "Out of order: $numOutOfOrder / " . count( $headerChars ) . "\n";

		global $IP;
		$writer = new StaticArrayWriter();
		$outFile = "$IP/languages/data/first-letters-root.php";
		$written = file_put_contents(
			$outFile,
			$writer->create( $headerChars, 'File created by generateCollationData.php' )
		);
		// Do not claim success when the write failed
		if ( $written === false ) {
			$this->fatalError( "Unable to write $outFile" );
		}
		echo "first-letters-root: file written.\n";
	}
}
| 289 | |
/**
 * Streaming reader for the grouped UCD XML file.
 *
 * Collects the block ranges from the <blocks> section and passes the
 * attributes of every character in the <repertoire> section to a callback,
 * expanding first-cp/last-cp ranges into individual codepoints.
 */
class UcdXmlReader {
	/** @var string Path of the XML file to read */
	public $fileName;
	/** @var callable Invoked with the attribute array of each character */
	public $callback;
	/** @var array Attributes of the <group> element currently being read */
	public $groupAttrs = [];
	/** @var XMLReader */
	public $xml;
	/** @var array[] Map of block name to [ first codepoint, last codepoint ] */
	public $blocks = [];
	/** @var array|false Range of the block under the internal pointer of $blocks */
	public $currentBlock;

	public function __construct( string $fileName ) {
		$this->fileName = $fileName;
	}

	/**
	 * Read all characters of the repertoire and pass each one to the callback.
	 *
	 * @param callable $callback Receives one array of attributes per codepoint.
	 *  Group attributes are merged in, 'cp' is the 4+ digit uppercase hex
	 *  codepoint, and 'block' is set when the codepoint lies within a known block.
	 */
	public function readChars( callable $callback ) {
		$this->getBlocks();
		$this->currentBlock = reset( $this->blocks );
		$xml = $this->open();
		$this->callback = $callback;

		// Skip over sibling subtrees until the <repertoire> element is reached
		while ( $xml->name !== 'repertoire' && $xml->next() ) {
			// nothing to do, the condition advances the reader
		}

		while ( $xml->read() ) {
			if ( $xml->nodeType === XMLReader::ELEMENT ) {
				if ( $xml->name === 'group' ) {
					$this->groupAttrs = $this->readAttributes();
				} elseif ( $xml->name === 'char' ) {
					$this->handleChar();
				}
			} elseif ( $xml->nodeType === XMLReader::END_ELEMENT ) {
				if ( $xml->name === 'group' ) {
					$this->groupAttrs = [];
				}
			}
		}
		$xml->close();
	}

	/**
	 * Open the file and position the reader on the first node inside <ucd>.
	 *
	 * @return XMLReader
	 * @throws RuntimeException If the file cannot be opened
	 */
	protected function open(): XMLReader {
		$this->xml = new XMLReader;
		if ( !$this->xml->open( $this->fileName ) ) {
			throw new RuntimeException( __METHOD__ . ": unable to open {$this->fileName}" );
		}
		while ( $this->xml->name !== 'ucd' && $this->xml->read() ) {
			// nothing to do, the condition advances the reader
		}
		$this->xml->read();

		return $this->xml;
	}

	/**
	 * Read the attributes of the current element node and return them
	 * as an array
	 * @return array
	 */
	protected function readAttributes() {
		$attrs = [];
		while ( $this->xml->moveToNextAttribute() ) {
			$attrs[$this->xml->name] = $this->xml->value;
		}

		return $attrs;
	}

	/**
	 * Expand the current <char> element into one callback invocation per codepoint.
	 */
	protected function handleChar() {
		// Attributes on the element itself take precedence over group defaults
		$baseAttrs = $this->readAttributes() + $this->groupAttrs;
		if ( isset( $baseAttrs['cp'] ) ) {
			$first = $last = hexdec( $baseAttrs['cp'] );
		} else {
			$first = hexdec( $baseAttrs['first-cp'] );
			$last = hexdec( $baseAttrs['last-cp'] );
			unset( $baseAttrs['first-cp'], $baseAttrs['last-cp'] );
		}

		for ( $cp = $first; $cp <= $last; $cp++ ) {
			$hexCp = sprintf( "%04X", $cp );

			// Start from the unmodified attributes for every codepoint, so that
			// the '#' placeholder in names is still there to be substituted and
			// a block found for an earlier codepoint does not carry over.
			$attrs = $baseAttrs;
			foreach ( [ 'na', 'na1' ] as $nameProp ) {
				if ( isset( $attrs[$nameProp] ) ) {
					$attrs[$nameProp] = str_replace( '#', $hexCp, $attrs[$nameProp] );
				}
			}

			// Advance through the blocks until one contains $cp or starts after it
			while ( $this->currentBlock ) {
				if ( $cp < $this->currentBlock[0] ) {
					break;
				} elseif ( $cp <= $this->currentBlock[1] ) {
					$attrs['block'] = key( $this->blocks );
					break;
				} else {
					$this->currentBlock = next( $this->blocks );
				}
			}

			$attrs['cp'] = $hexCp;
			( $this->callback )( $attrs );
		}
	}

	/**
	 * Get the block ranges defined in the file, reading them on first use.
	 *
	 * @return array[] Map of block name to [ first codepoint, last codepoint ]
	 */
	public function getBlocks(): array {
		if ( $this->blocks ) {
			return $this->blocks;
		}

		$xml = $this->open();
		while ( $xml->name !== 'blocks' && $xml->read() ) {
			// nothing to do, the condition advances the reader
		}

		while ( $xml->read() ) {
			if ( $xml->nodeType === XMLReader::ELEMENT && $xml->name === 'block' ) {
				$attrs = $this->readAttributes();
				$first = hexdec( $attrs['first-cp'] );
				$last = hexdec( $attrs['last-cp'] );
				$this->blocks[$attrs['name']] = [ $first, $last ];
			}
		}
		$xml->close();

		return $this->blocks;
	}
}
| 415 | |
| 416 | // @codeCoverageIgnoreStart
// Name the maintenance class defined in this file, then include the file named by
// RUN_MAINTENANCE_IF_MAIN (presumably runs the script when invoked directly; confirm in Maintenance.php).
| 417 | $maintClass = GenerateCollationData::class;
| 418 | require_once RUN_MAINTENANCE_IF_MAIN;
| 419 | // @codeCoverageIgnoreEnd