Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
0.00% covered (danger)
0.00%
0 / 197
0.00% covered (danger)
0.00%
0 / 11
CRAP
0.00% covered (danger)
0.00%
0 / 2
GenerateCollationData
0.00% covered (danger)
0.00%
0 / 137
0.00% covered (danger)
0.00%
0 / 5
2162
0.00% covered (danger)
0.00%
0 / 1
 __construct
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
2
 execute
0.00% covered (danger)
0.00%
0 / 31
0.00% covered (danger)
0.00%
0 / 1
90
 loadUcd
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
2
 charCallback
0.00% covered (danger)
0.00%
0 / 19
0.00% covered (danger)
0.00%
0 / 1
110
 generateFirstChars
0.00% covered (danger)
0.00%
0 / 79
0.00% covered (danger)
0.00%
0 / 1
650
UcdXmlReader
0.00% covered (danger)
0.00%
0 / 60
0.00% covered (danger)
0.00%
0 / 6
992
0.00% covered (danger)
0.00%
0 / 1
 __construct
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 readChars
0.00% covered (danger)
0.00%
0 / 15
0.00% covered (danger)
0.00%
0 / 1
90
 open
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
20
 readAttributes
0.00% covered (danger)
0.00%
0 / 4
0.00% covered (danger)
0.00%
0 / 1
6
 handleChar
0.00% covered (danger)
0.00%
0 / 21
0.00% covered (danger)
0.00%
0 / 1
72
 getBlocks
0.00% covered (danger)
0.00%
0 / 13
0.00% covered (danger)
0.00%
0 / 1
56
1<?php
2/**
3 * Maintenance script to generate first letter data files for Collation.php.
4 *
5 * @license GPL-2.0-or-later
6 * @file
7 * @ingroup MaintenanceLanguage
8 */
9
10// @codeCoverageIgnoreStart
11require_once __DIR__ . '/../Maintenance.php';
12// @codeCoverageIgnoreEnd
13
14use MediaWiki\Maintenance\Maintenance;
15use Wikimedia\StaticArrayWriter;
16use Wikimedia\StringUtils\StringUtils;
17
/**
 * Generate first letter data files for Collation.php
 *
 * Loads the character repertoire from ucd.all.grouped.xml and the collation
 * weights from allkeys.txt (both looked up in --data-dir), groups characters
 * by primary weight, selects one header character per group and writes the
 * resulting list to languages/data/first-letters-root.php.
 *
 * @ingroup MaintenanceLanguage
 */
class GenerateCollationData extends Maintenance {
    /** @var string The directory with source data files in it */
    public $dataDir;

    /**
     * The primary weights, indexed by codepoint. Each value is a string of
     * dot-prefixed hexadecimal weights, e.g. ".FBC0.8041".
     * @var string[]
     */
    public $weights = [];

    /**
     * A hashtable keyed by codepoint, where presence indicates that a character
     * has a decomposition mapping. This makes it non-preferred for group header
     * selection.
     * @var bool[]
     */
    public $mappedChars = [];

    /** @var resource|null Open stream for debug output, set when --debug-output is given */
    public $debugOutFile;

    /** @var int[][] Lists of codepoints sharing a primary weight, keyed by that weight */
    private $groups;

    public function __construct() {
        parent::__construct();
        $this->addOption( 'data-dir', 'A directory on the local filesystem ' .
            'containing allkeys.txt and ucd.all.grouped.xml from unicode.org',
            false, true );
        $this->addOption( 'debug-output', 'Filename for sending debug output to',
            false, true );
    }

    /**
     * Entry point: verify that both input files exist, open the optional
     * debug output file, then load the UCD and generate the header list.
     */
    public function execute() {
        $this->dataDir = $this->getOption( 'data-dir', '.' );

        $allkeysPresent = file_exists( "{$this->dataDir}/allkeys.txt" );
        $ucdallPresent = file_exists( "{$this->dataDir}/ucd.all.grouped.xml" );

        if ( !$allkeysPresent || !$ucdallPresent ) {
            $icuVersion = INTL_ICU_VERSION;
            $unicodeVersion = implode( '.', array_slice( IntlChar::getUnicodeVersion(), 0, 3 ) );

            $error = "";

            if ( !$allkeysPresent ) {
                $error .= "Unable to find allkeys.txt. "
                    . "Download it and specify its location with --data-dir=<DIR>. "
                    . "\n\n";
            }
            if ( !$ucdallPresent ) {
                $error .= "Unable to find ucd.all.grouped.xml. "
                    . "Download it, unzip, and specify its location with --data-dir=<DIR>. "
                    . "\n\n";
            }

            // The trailing ". " keeps the version and the next sentence apart.
            $error .= "You are using ICU $icuVersion, intended for Unicode $unicodeVersion. "
                . "Appropriate file(s) should be available at:\n";

            $allkeysURL = "https://www.unicode.org/Public/UCA/$unicodeVersion/allkeys.txt";
            $ucdallURL = "https://www.unicode.org/Public/$unicodeVersion/ucdxml/ucd.all.grouped.zip";

            if ( !$allkeysPresent ) {
                $error .= "$allkeysURL\n";
            }
            if ( !$ucdallPresent ) {
                $error .= "$ucdallURL\n";
            }

            $this->fatalError( $error );
        }

        $debugOutFileName = $this->getOption( 'debug-output' );
        if ( $debugOutFileName ) {
            $this->debugOutFile = fopen( $debugOutFileName, 'w' );
            if ( !$this->debugOutFile ) {
                $this->fatalError( "Unable to open debug output file for writing" );
            }
        }
        $this->loadUcd();
        $this->generateFirstChars();

        // Release the debug stream once all output has been produced.
        if ( $this->debugOutFile ) {
            fclose( $this->debugOutFile );
            $this->debugOutFile = null;
        }
    }

    /**
     * Stream every character of the UCD XML file through charCallback().
     */
    private function loadUcd() {
        $uxr = new UcdXmlReader( "{$this->dataDir}/ucd.all.grouped.xml" );
        $uxr->readChars( $this->charCallback( ... ) );
    }

    /**
     * Record an implicit primary weight for one character, unless the
     * character is filtered out as a header candidate.
     *
     * @param array $data Attributes of the character as passed by
     *   UcdXmlReader; this method reads 'gc', 'cp', 'dm' and 'block'.
     */
    private function charCallback( array $data ) {
        // Skip non-printable characters,
        // but do not skip a normal space (U+0020) since
        // people like to use that as a fake no header symbol.
        $category = substr( $data['gc'], 0, 1 );
        if ( !str_contains( 'LNPS', $category )
            && $data['cp'] !== '0020'
        ) {
            return;
        }
        $cp = hexdec( $data['cp'] );

        // Skip the CJK ideograph blocks, as an optimisation measure.
        // UCA doesn't sort them properly anyway, without tailoring.
        if ( IcuCollation::isCjk( $cp ) ) {
            return;
        }

        // Skip the composed Hangul syllables, we will use the bare Jamo
        // as first letters. The reader leaves 'block' unset for codepoints
        // that are not inside any block, hence the null-coalescing guard.
        if ( ( $data['block'] ?? '' ) === 'Hangul Syllables' ) {
            return;
        }

        // Skip characters that mapped to a single character we skipped above.
        // e.g. U+2329 -> U+3008 (from CJK Symbols and Punctuation)
        if ( $data['dm'] !== '#' && !str_contains( $data['dm'], ' ' ) &&
            !isset( $this->weights[ hexdec( $data['dm'] ) ] )
        ) {
            return;
        }

        // Calculate implicit weight per UTS #10 v6.0.0, sec 7.1.3
        $a = 0xFBC0 + ( $cp >> 15 );
        $b = ( $cp & 0x7fff ) | 0x8000;

        $this->weights[$cp] = sprintf( ".%04X.%04X", $a, $b );

        if ( $data['dm'] !== '#' ) {
            $this->mappedChars[$cp] = true;
        }

        // Progress indicator: one line per 4096 codepoints.
        if ( $cp % 4096 === 0 ) {
            print "{$data['cp']}\n";
        }
    }

    /**
     * Replace the implicit weights with the ones from allkeys.txt, group the
     * characters by primary weight, choose a header character for each group
     * and write the list of headers to first-letters-root.php.
     */
    private function generateFirstChars() {
        $file = fopen( "{$this->dataDir}/allkeys.txt", 'r' );
        if ( !$file ) {
            $this->fatalError( "Unable to open allkeys.txt" );
        }

        $goodTertiaryChars = [];

        // For each character with an entry in allkeys.txt, overwrite the implicit
        // entry in $this->weights that came from the UCD.
        // Also gather a list of tertiary weights, for use in selecting the group header
        // phpcs:ignore Generic.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition
        while ( ( $line = fgets( $file ) ) !== false ) {
            // We're only interested in single-character weights, pick them out with a regex
            $line = trim( $line );
            if ( !preg_match( '/^([0-9A-F]+)\s*;\s*([^#]*)/', $line, $m ) ) {
                continue;
            }

            $cp = hexdec( $m[1] );
            $allWeights = trim( $m[2] );
            $primary = '';
            $tertiary = '';

            if ( !isset( $this->weights[$cp] ) ) {
                // Non-printable, ignore
                continue;
            }
            foreach ( StringUtils::explode( '[', $allWeights ) as $weightStr ) {
                if ( preg_match_all( '/[*.]([0-9A-F]+)/', $weightStr, $m ) ) {
                    if ( $m[1][0] !== '0000' ) {
                        $primary .= '.' . $m[1][0];
                    }
                    if ( $m[1][2] !== '0000' ) {
                        $tertiary .= '.' . $m[1][2];
                    }
                }
            }
            $this->weights[$cp] = $primary;
            if ( $tertiary === '.0008'
                || $tertiary === '.000E'
            ) {
                $goodTertiaryChars[$cp] = true;
            }
        }
        fclose( $file );

        // Identify groups of characters with the same primary weight
        $this->groups = [];
        asort( $this->weights, SORT_STRING );
        $prevWeight = reset( $this->weights );
        $group = [];
        foreach ( $this->weights as $cp => $weight ) {
            if ( $weight !== $prevWeight ) {
                $this->groups[$prevWeight] = $group;
                $prevWeight = $weight;
                $group = $this->groups[$weight] ?? [];
            }
            $group[] = $cp;
        }
        if ( $group ) {
            $this->groups[$prevWeight] = $group;
        }

        // If one character has a given primary weight sequence, and a second
        // character has a longer primary weight sequence with an initial
        // portion equal to the first character, then remove the second
        // character. This avoids having characters like U+A732 (double A)
        // polluting the basic Latin sort area.

        foreach ( $this->groups as $weight => $group ) {
            if ( preg_match( '/(\.[0-9A-F]*)\./', $weight, $m ) ) {
                if ( isset( $this->groups[$m[1]] ) ) {
                    unset( $this->groups[$weight] );
                }
            }
        }

        ksort( $this->groups, SORT_STRING );

        // Identify the header character in each group
        $headerChars = [];
        $prevChar = "\000";
        $tertiaryCollator = new Collator( 'root' );
        $primaryCollator = new Collator( 'root' );
        $primaryCollator->setStrength( Collator::PRIMARY );
        $numOutOfOrder = 0;
        foreach ( $this->groups as $weight => $group ) {
            $uncomposedChars = [];
            $goodChars = [];
            foreach ( $group as $cp ) {
                if ( isset( $goodTertiaryChars[$cp] ) ) {
                    $goodChars[] = $cp;
                }
                if ( !isset( $this->mappedChars[$cp] ) ) {
                    $uncomposedChars[] = $cp;
                }
            }
            // Preference order: good tertiary weight and no decomposition,
            // then any character without decomposition, then anything.
            $x = array_intersect( $goodChars, $uncomposedChars );
            if ( !$x ) {
                $x = $uncomposedChars;
                if ( !$x ) {
                    $x = $group;
                }
            }

            // Use ICU to pick the lowest sorting character in the selection
            $tertiaryCollator->sort( $x );
            $cp = $x[0];

            $char = UtfNormal\Utils::codepointToUtf8( $cp );
            $headerChars[] = $char;
            if ( $primaryCollator->compare( $char, $prevChar ) <= 0 ) {
                $numOutOfOrder++;
            }
            $prevChar = $char;

            if ( $this->debugOutFile ) {
                fwrite( $this->debugOutFile, sprintf( "%05X %s %s (%s)\n", $cp, $weight, $char,
                    implode( ' ', array_map( [ UtfNormal\Utils::class, 'codepointToUtf8' ], $group ) ) ) );
            }
        }

        print "Out of order: $numOutOfOrder / " . count( $headerChars ) . "\n";

        global $IP;
        $writer = new StaticArrayWriter();
        $outFile = "$IP/languages/data/first-letters-root.php";
        $written = file_put_contents(
            $outFile,
            $writer->create( $headerChars, 'File created by generateCollationData.php' )
        );
        // Do not claim success when the write failed.
        if ( $written === false ) {
            $this->fatalError( "Unable to write $outFile" );
        }
        echo "first-letters-root: file written.\n";
    }
}
289
/**
 * Streaming reader for a grouped UCD XML file.
 *
 * readChars() walks the <repertoire> section and invokes a callback once per
 * codepoint with the merged attributes of the <char> element and its
 * enclosing <group>. getBlocks() collects the <block> ranges of the file.
 */
class UcdXmlReader {
    /** @var string Path of the XML file to read */
    public $fileName;
    /** @var callable Invoked with the attribute array of each codepoint */
    public $callback;
    /** @var array Attributes of the <group> currently being read, empty outside a group */
    public $groupAttrs = [];
    /** @var XMLReader */
    public $xml;
    /** @var array[] Map of block name to [ first codepoint, last codepoint ] */
    public $blocks = [];
    /** @var array|false Entry of $blocks the internal array pointer is on, false past the end */
    public $currentBlock;

    public function __construct( string $fileName ) {
        $this->fileName = $fileName;
    }

    /**
     * Read all <char> elements below <repertoire> and pass every codepoint
     * they describe to the callback.
     *
     * @param callable $callback Receives one attribute array per codepoint
     */
    public function readChars( callable $callback ) {
        $this->getBlocks();
        $this->currentBlock = reset( $this->blocks );
        $this->groupAttrs = [];
        $xml = $this->open();
        $this->callback = $callback;

        // Skip sibling elements until the repertoire is reached.
        while ( $xml->name !== 'repertoire' && $xml->next() );

        while ( $xml->read() ) {
            if ( $xml->nodeType === XMLReader::ELEMENT ) {
                if ( $xml->name === 'group' ) {
                    $this->groupAttrs = $this->readAttributes();
                } elseif ( $xml->name === 'char' ) {
                    $this->handleChar();
                }
            } elseif ( $xml->nodeType === XMLReader::END_ELEMENT ) {
                if ( $xml->name === 'group' ) {
                    $this->groupAttrs = [];
                }
            }
        }
        $xml->close();
    }

    /**
     * Open the file and position the reader on the first node inside <ucd>.
     *
     * @return XMLReader
     * @throws RuntimeException If the file cannot be opened
     */
    protected function open(): XMLReader {
        $this->xml = new XMLReader;
        if ( !$this->xml->open( $this->fileName ) ) {
            throw new RuntimeException( __METHOD__ . ": unable to open {$this->fileName}" );
        }
        while ( $this->xml->name !== 'ucd' && $this->xml->read() );
        $this->xml->read();

        return $this->xml;
    }

    /**
     * Read the attributes of the current element node and return them
     * as an array
     * @return array
     */
    protected function readAttributes() {
        $attrs = [];
        while ( $this->xml->moveToNextAttribute() ) {
            $attrs[$this->xml->name] = $this->xml->value;
        }

        return $attrs;
    }

    /**
     * Expand the current <char> element, which describes either a single
     * codepoint ('cp') or a range ('first-cp' to 'last-cp'), and call the
     * callback once per codepoint.
     *
     * Every codepoint gets its own copy of the attributes, so that the '#'
     * placeholder in names is substituted for each codepoint of a range and
     * a 'block' value never carries over to a codepoint outside that block.
     */
    protected function handleChar() {
        // Attributes on the <char> itself take precedence over the group's.
        $attrs = $this->readAttributes() + $this->groupAttrs;
        if ( isset( $attrs['cp'] ) ) {
            $first = $last = hexdec( $attrs['cp'] );
        } else {
            $first = hexdec( $attrs['first-cp'] );
            $last = hexdec( $attrs['last-cp'] );
            unset( $attrs['first-cp'], $attrs['last-cp'] );
        }

        for ( $cp = $first; $cp <= $last; $cp++ ) {
            $charAttrs = $attrs;
            $hexCp = sprintf( "%04X", $cp );
            foreach ( [ 'na', 'na1' ] as $nameProp ) {
                if ( isset( $charAttrs[$nameProp] ) ) {
                    $charAttrs[$nameProp] = str_replace( '#', $hexCp, $charAttrs[$nameProp] );
                }
            }

            // Advance through the blocks, relying on codepoints arriving in
            // ascending order; the array pointer of $this->blocks tracks
            // the block that $this->currentBlock was taken from.
            while ( $this->currentBlock ) {
                if ( $cp < $this->currentBlock[0] ) {
                    break;
                } elseif ( $cp <= $this->currentBlock[1] ) {
                    $charAttrs['block'] = key( $this->blocks );
                    break;
                } else {
                    $this->currentBlock = next( $this->blocks );
                }
            }

            $charAttrs['cp'] = $hexCp;
            ( $this->callback )( $charAttrs );
        }
    }

    /**
     * Collect the <block> elements of the file. The result is cached, so the
     * file is only parsed on the first call that finds any block.
     *
     * @return array[] Map of block name to [ first codepoint, last codepoint ]
     */
    public function getBlocks(): array {
        if ( $this->blocks ) {
            return $this->blocks;
        }

        $xml = $this->open();
        while ( $xml->name !== 'blocks' && $xml->read() );

        while ( $xml->read() ) {
            if ( $xml->nodeType === XMLReader::ELEMENT ) {
                if ( $xml->name === 'block' ) {
                    $attrs = $this->readAttributes();
                    $first = hexdec( $attrs['first-cp'] );
                    $last = hexdec( $attrs['last-cp'] );
                    $this->blocks[$attrs['name']] = [ $first, $last ];
                }
            }
        }
        $xml->close();

        return $this->blocks;
    }
}
415
416// @codeCoverageIgnoreStart
417$maintClass = GenerateCollationData::class;
418require_once RUN_MAINTENANCE_IF_MAIN;
419// @codeCoverageIgnoreEnd