Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
85.82% |
115 / 134 |
|
0.00% |
0 / 3 |
CRAP | |
0.00% |
0 / 1 |
CLDRParser | |
85.82% |
115 / 134 |
|
0.00% |
0 / 3 |
64.94 | |
0.00% |
0 / 1 |
parseMain | |
79.03% |
49 / 62 |
|
0.00% |
0 / 1 |
26.46 | |||
parseSupplemental | |
92.31% |
24 / 26 |
|
0.00% |
0 / 1 |
11.06 | |||
parseCurrencySymbols | |
91.30% |
42 / 46 |
|
0.00% |
0 / 1 |
23.35 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\CLDR; |
4 | |
5 | use RuntimeException; |
6 | use SimpleXMLElement; |
7 | |
8 | /** |
9 | * Extract data from cldr XML. |
10 | * |
11 | * @author Niklas Laxström |
12 | * @author Ryan Kaldari |
13 | * @author Santhosh Thottingal |
14 | * @author Sam Reed |
15 | * @copyright Copyright © 2007-2015 |
16 | * @license GPL-2.0-or-later |
17 | */ |
class CLDRParser {

	/** Territory key used when a locale file does not name a territory. */
	public const LOCALITY_DEFAULT = '!DEFAULT';
	/** Language key used for the CLDR "root" locale. */
	public const LANGUAGE_DEFAULT = '!root';
	/** Currency key used for the "DEFAULT" entry of the currency fractions table. */
	public const CURRENCY_DEFAULT = '!DEFAULT';

	/**
	 * Read a file from disk and parse it as XML.
	 *
	 * @param string $path File to load
	 * @return SimpleXMLElement
	 * @throws RuntimeException If the file cannot be read
	 */
	private function loadXml( string $path ): SimpleXMLElement {
		$contents = file_get_contents( $path );
		if ( $contents === false ) {
			// Fail with a clear message instead of letting SimpleXMLElement
			// complain about an unparsable (empty) string.
			throw new RuntimeException( "Unable to read input file: $path" );
		}

		return new SimpleXMLElement( $contents );
	}

	/**
	 * Read the main/<locale>.xml file from CLDR core and convert to PHP
	 *
	 * The result has the keys 'indexCharacters', 'languageNames',
	 * 'currencyNames', 'currencySymbols', 'countryNames' and 'timeUnits'.
	 *
	 * @param string $inputFile filename
	 * @return array
	 * @throws RuntimeException If the file cannot be read
	 */
	public function parseMain( $inputFile ): array {
		$doc = $this->loadXml( $inputFile );

		$data = [
			'indexCharacters' => [],
			'languageNames' => [],
			'currencyNames' => [],
			'currencySymbols' => [],
			'countryNames' => [],
			'timeUnits' => [],
		];

		// Take a Unicode Set for an alphabet and extract simple example characters.
		// For example, "[aàâ b {ch}]" is extracted as `["a", "b", "ch"]`.
		// TODO: Unicode Set allows for more complex syntax, but we support only
		// the subset currently used here. Should rely on a library instead.
		$indexCharacters = $doc->xpath( '//characters/exemplarCharacters[@type="index"]' );
		if ( $indexCharacters && count( $indexCharacters ) === 1 ) {
			[ $characters ] = $indexCharacters;
			// Split on runs of whitespace and drop empty pieces, so that
			// repeated or leading/trailing spaces never yield an empty entry.
			$splitSequence = preg_split(
				'/\s+/',
				trim( (string)$characters, '[]' ),
				-1,
				PREG_SPLIT_NO_EMPTY
			) ?: [];
			$data['indexCharacters'] = array_map(
				static fn ( $letter ) => preg_replace_callback_array( [
					// Convert unicode literals to characters.
					'/^\\\\u([\da-f]{4})/i' => static fn ( $m ) => mb_chr( hexdec( $m[1] ) ),

					// Take only the first character from a set like "aàâ".
					// When the character is made up of multiple symbols, it
					// will be enclosed in curly braces like "{ch}", and in this
					// case we want the entire group. It's possible that the
					// two cases are combined like "{ch}ç".
					'/^(?:([^{])|\{([^}]+)\}).*$/u' => static fn ( $m ) => $m[2] ?? $m[1],
				], $letter ),
				$splitSequence
			);
		}

		foreach ( $doc->xpath( '//languages/language' ) as $elem ) {
			// Skip alternative names and the root pseudo-language.
			if ( (string)$elem['alt'] !== '' ) {
				continue;
			}

			$langType = (string)$elem['type'];
			if ( $langType === 'root' ) {
				continue;
			}

			$key = str_replace( '_', '-', strtolower( $langType ) );

			$data['languageNames'][$key] = (string)$elem;
		}

		foreach ( $doc->xpath( '//currencies/currency' ) as $elem ) {
			$displayName = (string)$elem->displayName[0];
			if ( $displayName === '' ) {
				continue;
			}

			$code = (string)$elem['type'];
			$data['currencyNames'][$code] = $displayName;

			$symbol = (string)$elem->symbol[0];
			if ( $symbol !== '' ) {
				$data['currencySymbols'][$code] = $symbol;
			}
		}

		foreach ( $doc->xpath( '//territories/territory' ) as $elem ) {
			// Only the plain name and the "short" alternative are accepted; a
			// later accepted entry overwrites an earlier one for the same code.
			$alt = (string)$elem['alt'];
			if ( $alt !== '' && $alt !== 'short' ) {
				continue;
			}

			// Keep two-letter codes only, excluding the "ZZ" code.
			$territoryType = (string)$elem['type'];
			if ( $territoryType === 'ZZ' || !preg_match( '/^[A-Z][A-Z]$/', $territoryType ) ) {
				continue;
			}

			$data['countryNames'][$territoryType] = (string)$elem;
		}

		$durationPrefix = 'duration-';
		foreach ( $doc->xpath( '//units/unitLength' ) as $unitLength ) {
			if ( (string)$unitLength['type'] !== 'long' ) {
				continue;
			}
			foreach ( $unitLength->unit as $elem ) {
				$type = (string)$elem['type'];
				// Only "duration-*" units are time units. Require the prefix
				// so that stripping it below can never mangle another type.
				if ( strpos( $type, $durationPrefix ) !== 0 ) {
					continue;
				}
				$unit = substr( $type, strlen( $durationPrefix ) );
				foreach ( $elem->unitPattern as $pattern ) {
					$data['timeUnits'][$unit . '-' . (string)$pattern['count']] = (string)$pattern;
				}
			}
		}

		foreach ( $doc->xpath( '//fields/field' ) as $field ) {
			$fieldType = (string)$field['type'];

			foreach ( $field->relativeTime as $relative ) {
				$type = (string)$relative['type'];
				foreach ( $relative->relativeTimePattern as $pattern ) {
					$data['timeUnits'][$fieldType . '-' . $type
						. '-' . (string)$pattern['count']] = (string)$pattern;
				}
			}
		}

		ksort( $data['timeUnits'] );
		return $data;
	}

	/**
	 * Parse method for the file structure found in common/supplemental/supplementalData.xml
	 *
	 * The result has the keys 'currencyFractions' (currency code => attribute
	 * map) and 'localeCurrencies' (region code => list of currency codes).
	 *
	 * @param string $inputFile
	 * @return array
	 * @throws RuntimeException If the file cannot be read
	 */
	public function parseSupplemental( $inputFile ): array {
		$doc = $this->loadXml( $inputFile );

		$data = [
			'currencyFractions' => [],
			'localeCurrencies' => [],
		];

		// Pull currency attributes - digits, rounding, cashDigits and cashRounding.
		// This will tell us how many decimal places make sense to use with any currency,
		// or if the currency is totally non-fractional
		$attributes = [ 'digits', 'rounding', 'cashDigits', 'cashRounding' ];
		foreach ( $doc->xpath( '//currencyData/fractions/info' ) as $elem ) {
			$iso4217 = (string)$elem['iso4217'];
			if ( $iso4217 === '' ) {
				continue;
			}
			if ( $iso4217 === 'DEFAULT' ) {
				$iso4217 = self::CURRENCY_DEFAULT;
			}

			foreach ( $attributes as $att ) {
				$value = (string)$elem[$att];
				if ( $value !== '' ) {
					$data['currencyFractions'][$iso4217][$att] = $value;
				}
			}
		}

		ksort( $data['currencyFractions'] );

		// Pull a map of regions to currencies in order of preference.
		foreach ( $doc->xpath( '//currencyData/region' ) as $elem ) {
			$region = (string)$elem['iso3166'];
			if ( $region === '' ) {
				continue;
			}

			foreach ( $elem->currency as $currencynode ) {
				// Keep only currencies that have no "to" date and are not
				// explicitly marked tender="false".
				if ( (string)$currencynode['to'] === '' && (string)$currencynode['tender'] !== 'false' ) {
					$data['localeCurrencies'][$region][] = (string)$currencynode['iso4217'];
				}
			}
		}

		ksort( $data['localeCurrencies'] );
		return $data;
	}

	/**
	 * Parse method for the currency section in the names files.
	 * This is separate from the regular parse function, because we need all of
	 * the currency locale information, even if mediawiki doesn't support the language.
	 * (For instance: en_AU uses '$' for AUD, not USD, but it's not a supported mediawiki locality)
	 *
	 * The result has the single key 'currencySymbols', mapping a currency code
	 * to a map of language => symbol, or language => ( territory => symbol ).
	 *
	 * @param string $inputDir the directory, in which we will parse everything.
	 * @return array
	 * @throws RuntimeException If the directory does not exist or cannot be
	 *  listed, or if one of its XML files cannot be read
	 */
	public function parseCurrencySymbols( $inputDir ): array {
		// A regular file is not a usable input directory either.
		if ( !is_dir( $inputDir ) ) {
			throw new RuntimeException( 'Input directory not found.' );
		}
		$files = scandir( $inputDir );
		if ( $files === false ) {
			throw new RuntimeException( 'Input directory could not be read.' );
		}

		$data = [
			'currencySymbols' => [],
		];

		foreach ( $files as $inputFile ) {
			// Only "<name>.xml" files are locale files. Checking the real
			// extension skips leftovers such as "en.xml.bak" or "en.xml~".
			if ( pathinfo( $inputFile, PATHINFO_EXTENSION ) !== 'xml'
				|| pathinfo( $inputFile, PATHINFO_FILENAME ) === ''
			) {
				continue;
			}

			$doc = $this->loadXml( $inputDir . '/' . $inputFile );

			// Tags in the <identity> section are guaranteed to appear once
			$languages = $doc->xpath( '//identity/language/@type' );
			$language = $languages
				? (string)$languages[0]
				: pathinfo( $inputFile, PATHINFO_FILENAME );

			// The <script> element is optional
			$scripts = $doc->xpath( '//identity/script/@type' );
			$script = $scripts ? (string)$scripts[0] : '';
			// expand the language
			if ( $script !== '' ) {
				$language .= '-' . strtolower( $script );
			}

			// The <territory> element is optional
			$territories = $doc->xpath( '//identity/territory/@type' );
			$territory = $territories ? (string)$territories[0] : self::LOCALITY_DEFAULT;

			if ( $language === 'root' ) {
				$language = self::LANGUAGE_DEFAULT;
			}

			foreach ( $doc->xpath( '//currencies/currency' ) as $elem ) {
				$symbol = (string)$elem->symbol[0];
				if ( $symbol !== '' ) {
					$data['currencySymbols'][(string)$elem['type']][$language][$territory] = $symbol;
				}
			}
		}

		// now massage the data somewhat. It's pretty blown up at this point.

		/**
		 * Part 1: Stop blowing up on defaults.
		 * Defaults apparently come in many forms. Listed below in order of scope
		 * (widest to narrowest)
		 * 1) The ISO code itself, in the absence of any other defaults
		 * 2) The 'root' language file definition
		 * 3) Language with no locality - locality will come in as LOCALITY_DEFAULT
		 *
		 * Intended behavior:
		 * From narrowest scope to widest, collapse the defaults
		 */
		foreach ( $data['currencySymbols'] as $currency => $symbolsByLanguage ) {
			// get the currency default symbol. This will either be defined in the
			// 'root' language file, or taken from the ISO code.
			$default = $symbolsByLanguage[self::LANGUAGE_DEFAULT][self::LOCALITY_DEFAULT] ?? $currency;

			foreach ( $symbolsByLanguage as $lang => $symbolsByTerritory ) {
				if ( !is_array( $symbolsByTerritory ) ) {
					continue;
				}

				$hasLocalityDefault = array_key_exists( self::LOCALITY_DEFAULT, $symbolsByTerritory );
				// True when a non-root language merely repeats the currency default.
				$repeatsDefault = $hasLocalityDefault
					&& $symbolsByTerritory[self::LOCALITY_DEFAULT] === $default
					&& $lang !== self::LANGUAGE_DEFAULT;

				if ( $hasLocalityDefault && count( $symbolsByTerritory ) === 1 ) {
					// Collapse a language (no locality) array if it's just the default.
					// One value will do fine, and none at all if it repeats the default.
					if ( $repeatsDefault ) {
						unset( $data['currencySymbols'][$currency][$lang] );
					} else {
						$data['currencySymbols'][$currency][$lang] = $symbolsByTerritory[self::LOCALITY_DEFAULT];
					}
					continue;
				}

				// Collapse a language (with locality) array if its default is just the default
				if ( !$hasLocalityDefault || $repeatsDefault ) {
					foreach ( $symbolsByTerritory as $territoryCode => $symbol ) {
						if ( $symbol === $default ) {
							unset( $data['currencySymbols'][$currency][$lang][$territoryCode] );
						}
					}
				}
				ksort( $data['currencySymbols'][$currency][$lang] );
			}

			ksort( $data['currencySymbols'][$currency] );
		}

		ksort( $data['currencySymbols'] );
		return $data;
	}

}