Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 69 |
|
0.00% |
0 / 3 |
CRAP | |
0.00% |
0 / 1 |
GenerateNormalizerDataAr | |
0.00% |
0 / 69 |
|
0.00% |
0 / 3 |
272 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
getDbType | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
execute | |
0.00% |
0 / 64 |
|
0.00% |
0 / 1 |
210 |
1 | <?php |
2 | /** |
3 | * Generates the normalizer data file for Arabic. |
4 | * |
5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by |
7 | * the Free Software Foundation; either version 2 of the License, or |
8 | * (at your option) any later version. |
9 | * |
10 | * This program is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | * GNU General Public License for more details. |
14 | * |
15 | * You should have received a copy of the GNU General Public License along |
16 | * with this program; if not, write to the Free Software Foundation, Inc., |
17 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
18 | * http://www.gnu.org/copyleft/gpl.html |
19 | * |
20 | * @file |
21 | * @ingroup MaintenanceLanguage |
22 | */ |
23 | |
24 | // @codeCoverageIgnoreStart |
25 | require_once __DIR__ . '/../Maintenance.php'; |
26 | // @codeCoverageIgnoreEnd |
27 | |
28 | use Wikimedia\StaticArrayWriter; |
29 | |
/**
 * Generates the normalizer data file for Arabic.
 *
 * Reads a local copy of UnicodeData.txt, collects the decomposition mappings of
 * the Arabic presentation forms (U+FB50..U+FDFF and U+FE70..U+FEFF) and writes
 * them, as a static array class, to includes/languages/data/NormalizeAr.php.
 *
 * This data file is used after normalizing to NFC.
 *
 * Example usage:
 *
 *    curl 'https://unicode.org/Public/6.0.0/ucd/UnicodeData.txt' > /tmp/UnicodeData.txt
 *    php generateNormalizerDataAr.php --unicode-data-file /tmp/UnicodeData.txt
 *
 * @ingroup MaintenanceLanguage
 */
class GenerateNormalizerDataAr extends Maintenance {
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Generate the normalizer data file for Arabic' );
		$this->addOption( 'unicode-data-file', 'The local location of the data file ' .
			'from https://unicode.org/Public/6.0.0/ucd/UnicodeData.txt', false, true );
	}

	/**
	 * This script only reads and writes local files, so it asks for no database.
	 *
	 * @return int Maintenance::DB_NONE
	 */
	public function getDbType() {
		return Maintenance::DB_NONE;
	}

	/**
	 * Parse the Unicode data file and (re)generate NormalizeAr.php.
	 *
	 * Any failure to locate, open or write a file is reported through
	 * fatalError().
	 */
	public function execute() {
		$dataFile = $this->resolveDataFile();

		$file = fopen( $dataFile, 'r' );
		if ( !$file ) {
			$this->fatalError( 'Unable to open the data file.' );
		}

		// Always release the handle, even if parsing throws.
		try {
			$pairs = $this->readPairs( $file );
		} finally {
			fclose( $file );
		}

		global $IP;
		$writer = new StaticArrayWriter();
		$written = file_put_contents( "$IP/includes/languages/data/NormalizeAr.php", $writer->writeClass(
			$pairs,
			[
				'header' => 'Generated by generateNormalizerDataAr.php. Do not modify!',
				'namespace' => 'MediaWiki\\Languages\\Data',
				'class' => 'NormalizeAr',
				'const' => 'PAIRS',
			]
		) );
		// Do not claim success when nothing could be written.
		if ( $written === false ) {
			$this->fatalError( 'Unable to write the normalizer data file.' );
		}

		echo "ar: " . count( $pairs ) . " pairs written.\n";
	}

	/**
	 * Determine which Unicode data file to read.
	 *
	 * Uses the --unicode-data-file option when given, otherwise UnicodeData.txt
	 * in the current working directory. A missing file is reported through
	 * fatalError().
	 *
	 * @return string Path of the data file
	 */
	private function resolveDataFile() {
		if ( $this->hasOption( 'unicode-data-file' ) ) {
			$dataFile = $this->getOption( 'unicode-data-file' );
			$notFound = 'Unable to find the specified data file.';
		} else {
			$dataFile = 'UnicodeData.txt';
			$notFound = "Unable to find UnicodeData.txt. Please specify " .
				"its location with --unicode-data-file=<FILE>";
		}

		if ( !file_exists( $dataFile ) ) {
			$this->fatalError( $notFound );
		}

		return $dataFile;
	}

	/**
	 * Read every record of an open UnicodeData.txt stream and collect the
	 * decomposition mappings of the Arabic presentation forms.
	 *
	 * Records that cannot be parsed are reported with error() and skipped.
	 *
	 * @param resource $file Stream opened for reading
	 * @return string[] Map of UTF-8 source character => UTF-8 replacement
	 */
	private function readPairs( $file ) {
		// For the file format, see https://www.unicode.org/reports/tr44/
		$fieldNames = [
			'Code',
			'Name',
			'General_Category',
			'Canonical_Combining_Class',
			'Bidi_Class',
			'Decomposition_Type_Mapping',
			'Numeric_Type_Value_6',
			'Numeric_Type_Value_7',
			'Numeric_Type_Value_8',
			'Bidi_Mirrored',
			'Unicode_1_Name',
			'ISO_Comment',
			'Simple_Uppercase_Mapping',
			'Simple_Lowercase_Mapping',
			'Simple_Titlecase_Mapping'
		];
		$fieldCount = count( $fieldNames );

		$pairs = [];

		$lineNum = 0;
		// phpcs:ignore Generic.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition
		while ( ( $line = fgets( $file ) ) !== false ) {
			++$lineNum;

			# Strip comments
			$line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
			if ( $line === '' ) {
				continue;
			}

			# Split fields; a truncated record would otherwise cause undefined offsets
			$numberedData = explode( ';', $line );
			if ( count( $numberedData ) < $fieldCount ) {
				$this->error( "Too few fields on line $lineNum" );
				$this->error( $line );
				continue;
			}
			$data = array_combine( $fieldNames, array_slice( $numberedData, 0, $fieldCount ) );

			// hexdec() yields a number, so the range checks below compare numbers
			// rather than a numeric string against integers.
			$code = hexdec( $data['Code'] );
			$isPresentationForm =
				( $code >= 0xFB50 && $code <= 0xFDFF ) # Arabic presentation forms A
				|| ( $code >= 0xFE70 && $code <= 0xFEFF ); # Arabic presentation forms B
			if ( !$isPresentationForm ) {
				continue;
			}

			if ( $data['Decomposition_Type_Mapping'] === '' ) {
				// No decomposition
				continue;
			}
			// Expected shape: "<tag> HEX HEX ..."; $m[2] is the hex sequence.
			if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/',
				$data['Decomposition_Type_Mapping'], $m )
			) {
				$this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
				$this->error( $line );
				continue;
			}

			$source = UtfNormal\Utils::hexSequenceToUtf8( $data['Code'] );
			$dest = UtfNormal\Utils::hexSequenceToUtf8( $m[2] );
			$pairs[$source] = $dest;
		}

		return $pairs;
	}
}
149 | |
// Entry-point stub: name the maintenance class defined in this file, then include
// the file whose path is held in the RUN_MAINTENANCE_IF_MAIN constant.
// NOTE(review): presumably that include runs $maintClass when this script is
// invoked directly from the command line - confirm against Maintenance.php.
// @codeCoverageIgnoreStart
$maintClass = GenerateNormalizerDataAr::class;
require_once RUN_MAINTENANCE_IF_MAIN;
// @codeCoverageIgnoreEnd