Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 72 |
|
0.00% |
0 / 3 |
CRAP | |
0.00% |
0 / 1 |
GenerateNormalizerDataAr | |
0.00% |
0 / 69 |
|
0.00% |
0 / 3 |
272 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
getDbType | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 64 |
|
0.00% |
0 / 1 |
210 |
1 | <?php |
2 | /** |
3 | * Generates the normalizer data file for Arabic. |
4 | * |
5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation; either version 2 of the License, or |
8 | * (at your option) any later version. |
9 | * |
10 | * This program is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | * GNU General Public License for more details. |
14 | * |
15 | * You should have received a copy of the GNU General Public License along |
16 | * with this program; if not, write to the Free Software Foundation, Inc., |
17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
18 | * http://www.gnu.org/copyleft/gpl.html |
19 | * |
20 | * @file |
21 | * @ingroup MaintenanceLanguage |
22 | */ |
23 | |
24 | require_once __DIR__ . '/../Maintenance.php'; |
25 | |
26 | use Wikimedia\StaticArrayWriter; |
27 | |
28 | /** |
29 | * Generates the normalizer data file for Arabic. |
30 | * |
31 | * This data file is used after normalizing to NFC. |
32 | * |
33 | * Example usage: |
34 | * |
35 | * curl 'https://unicode.org/Public/6.0.0/ucd/UnicodeData.txt' > /tmp/UnicodeData.txt |
36 | * php generateNormalizerDataAr.php --unicode-data-file /tmp/UnicodeData.txt |
37 | * |
38 | * @ingroup MaintenanceLanguage |
39 | */ |
class GenerateNormalizerDataAr extends Maintenance {
	/**
	 * Sets the script description and registers the --unicode-data-file
	 * option, which execute() treats as optional and reads as a file path.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Generate the normalizer data file for Arabic' );
		$this->addOption( 'unicode-data-file', 'The local location of the data file ' .
			'from https://unicode.org/Public/6.0.0/ucd/UnicodeData.txt', false, true );
	}

	/**
	 * @return int Always Maintenance::DB_NONE
	 */
	public function getDbType() {
		return Maintenance::DB_NONE;
	}

	/**
	 * Reads UnicodeData.txt, collects the decomposition mapping of every code
	 * point in the two Arabic presentation form ranges (U+FB50..U+FDFF and
	 * U+FE70..U+FEFF), and writes the resulting source => destination pairs
	 * to includes/languages/data/NormalizeAr.php under $IP.
	 *
	 * The data file is taken from --unicode-data-file if given, otherwise
	 * from "UnicodeData.txt" in the current working directory. Lines that
	 * cannot be parsed are reported through error() and skipped.
	 */
	public function execute() {
		if ( !$this->hasOption( 'unicode-data-file' ) ) {
			$dataFile = 'UnicodeData.txt';
			if ( !file_exists( $dataFile ) ) {
				$this->fatalError( "Unable to find UnicodeData.txt. Please specify " .
					"its location with --unicode-data-file=<FILE>" );
			}
		} else {
			$dataFile = $this->getOption( 'unicode-data-file' );
			if ( !file_exists( $dataFile ) ) {
				$this->fatalError( 'Unable to find the specified data file.' );
			}
		}

		$file = fopen( $dataFile, 'r' );
		if ( !$file ) {
			$this->fatalError( 'Unable to open the data file.' );
		}

		// For the file format, see https://www.unicode.org/reports/tr44/
		$fieldNames = [
			'Code',
			'Name',
			'General_Category',
			'Canonical_Combining_Class',
			'Bidi_Class',
			'Decomposition_Type_Mapping',
			'Numeric_Type_Value_6',
			'Numeric_Type_Value_7',
			'Numeric_Type_Value_8',
			'Bidi_Mirrored',
			'Unicode_1_Name',
			'ISO_Comment',
			'Simple_Uppercase_Mapping',
			'Simple_Lowercase_Mapping',
			'Simple_Titlecase_Mapping'
		];
		$fieldCount = count( $fieldNames );

		$pairs = [];

		$lineNum = 0;
		while ( ( $line = fgets( $file ) ) !== false ) {
			++$lineNum;

			# Strip comments
			$line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
			if ( $line === '' ) {
				continue;
			}

			# Split fields
			$numberedData = explode( ';', $line );
			if ( count( $numberedData ) < $fieldCount ) {
				// A truncated line would otherwise trigger undefined-offset
				// warnings and be misreported as a decomposition parse failure.
				$this->error( "Too few fields on line $lineNum" );
				$this->error( $line );
				continue;
			}
			// Any fields beyond the known ones are ignored, as before.
			$data = array_combine( $fieldNames, array_slice( $numberedData, 0, $fieldCount ) );

			// hexdec() yields a number directly, so the range checks below
			// compare numbers rather than a numeric string against integers.
			$code = hexdec( $data['Code'] );
			if ( ( $code >= 0xFB50 && $code <= 0xFDFF ) # Arabic presentation forms A
				|| ( $code >= 0xFE70 && $code <= 0xFEFF ) # Arabic presentation forms B
			) {
				if ( $data['Decomposition_Type_Mapping'] === '' ) {
					// No decomposition
					continue;
				}
				// $m[1] is the "<type>" tag, $m[2] the space-separated hex code points.
				if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/',
					$data['Decomposition_Type_Mapping'], $m )
				) {
					$this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
					$this->error( $line );
					continue;
				}

				$source = UtfNormal\Utils::hexSequenceToUtf8( $data['Code'] );
				$dest = UtfNormal\Utils::hexSequenceToUtf8( $m[2] );
				$pairs[$source] = $dest;
			}
		}

		// The input is fully consumed; release the handle before writing output.
		fclose( $file );

		global $IP;
		$outputFile = "$IP/includes/languages/data/NormalizeAr.php";
		$writer = new StaticArrayWriter();
		$written = file_put_contents( $outputFile, $writer->writeClass(
			$pairs,
			[
				'header' => 'Generated by generateNormalizerDataAr.php. Do not modify!',
				'namespace' => 'MediaWiki\\Languages\\Data',
				'class' => 'NormalizeAr',
				'const' => 'PAIRS',
			]
		) );
		if ( $written === false ) {
			// Do not claim success when nothing reached the disk.
			$this->fatalError( "Unable to write $outputFile" );
		}

		echo "ar: " . count( $pairs ) . " pairs written.\n";
	}
}
146 | |
// Store this script's class name in $maintClass, then include the file named by the
// RUN_MAINTENANCE_IF_MAIN constant (defined outside this file; presumably it runs that class — TODO confirm).
147 | $maintClass = GenerateNormalizerDataAr::class; |
148 | require_once RUN_MAINTENANCE_IF_MAIN; |