Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
97.50% |
39 / 40 |
|
50.00% |
1 / 2 |
CRAP | |
0.00% |
0 / 1 |
Encoder | |
97.50% |
39 / 40 |
|
50.00% |
1 / 2 |
19 | |
0.00% |
0 / 1 |
convert | |
95.83% |
23 / 24 |
|
0.00% |
0 / 1 |
14 | |||
doConvert | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
5 |
1 | <?php |
2 | /** |
3 | * @file |
4 | * @license https://opensource.org/licenses/Apache-2.0 Apache-2.0 |
5 | */ |
6 | |
7 | namespace Wikimedia\CSS\Parser; |
8 | |
9 | use RuntimeException; |
10 | use UtfNormal\Constants; |
11 | use UtfNormal\Utils; |
12 | |
/**
 * Character set conversion for CSS
 *
 * Determines which encoding a CSS byte stream is in (BOM, transport
 * encoding, `@charset` rule, environment encoding, then UTF-8 as the last
 * resort) and converts the stream to UTF-8.
 *
 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#input-byte-stream
 */
class Encoder {

	/**
	 * @var array Mapping from CSS encoding tags to mbstring/iconv encodings
	 * @see https://encoding.spec.whatwg.org/#concept-encoding-get
	 */
	protected static $encodings = [
		'unicode-1-1-utf-8' => 'UTF-8',
		'utf-8' => 'UTF-8',
		'utf8' => 'UTF-8',
		'866' => 'CP866',
		'cp866' => 'CP866',
		'csibm866' => 'CP866',
		'ibm866' => 'CP866',
		'csisolatin2' => 'ISO-8859-2',
		'iso-8859-2' => 'ISO-8859-2',
		'iso-ir-101' => 'ISO-8859-2',
		'iso8859-2' => 'ISO-8859-2',
		'iso88592' => 'ISO-8859-2',
		'iso_8859-2' => 'ISO-8859-2',
		'iso_8859-2:1987' => 'ISO-8859-2',
		'l2' => 'ISO-8859-2',
		'latin2' => 'ISO-8859-2',
		'csisolatin3' => 'ISO-8859-3',
		'iso-8859-3' => 'ISO-8859-3',
		'iso-ir-109' => 'ISO-8859-3',
		'iso8859-3' => 'ISO-8859-3',
		'iso88593' => 'ISO-8859-3',
		'iso_8859-3' => 'ISO-8859-3',
		'iso_8859-3:1988' => 'ISO-8859-3',
		'l3' => 'ISO-8859-3',
		'latin3' => 'ISO-8859-3',
		'csisolatin4' => 'ISO-8859-4',
		'iso-8859-4' => 'ISO-8859-4',
		'iso-ir-110' => 'ISO-8859-4',
		'iso8859-4' => 'ISO-8859-4',
		'iso88594' => 'ISO-8859-4',
		'iso_8859-4' => 'ISO-8859-4',
		'iso_8859-4:1988' => 'ISO-8859-4',
		'l4' => 'ISO-8859-4',
		'latin4' => 'ISO-8859-4',
		'csisolatincyrillic' => 'ISO-8859-5',
		'cyrillic' => 'ISO-8859-5',
		'iso-8859-5' => 'ISO-8859-5',
		'iso-ir-144' => 'ISO-8859-5',
		'iso8859-5' => 'ISO-8859-5',
		'iso88595' => 'ISO-8859-5',
		'iso_8859-5' => 'ISO-8859-5',
		'iso_8859-5:1988' => 'ISO-8859-5',
		'arabic' => 'ISO-8859-6',
		'asmo-708' => 'ISO-8859-6',
		'csiso88596e' => 'ISO-8859-6',
		'csiso88596i' => 'ISO-8859-6',
		'csisolatinarabic' => 'ISO-8859-6',
		'ecma-114' => 'ISO-8859-6',
		'iso-8859-6' => 'ISO-8859-6',
		'iso-8859-6-e' => 'ISO-8859-6',
		'iso-8859-6-i' => 'ISO-8859-6',
		'iso-ir-127' => 'ISO-8859-6',
		'iso8859-6' => 'ISO-8859-6',
		'iso88596' => 'ISO-8859-6',
		'iso_8859-6' => 'ISO-8859-6',
		'iso_8859-6:1987' => 'ISO-8859-6',
		'csisolatingreek' => 'ISO-8859-7',
		'ecma-118' => 'ISO-8859-7',
		'elot_928' => 'ISO-8859-7',
		'greek' => 'ISO-8859-7',
		'greek8' => 'ISO-8859-7',
		'iso-8859-7' => 'ISO-8859-7',
		'iso-ir-126' => 'ISO-8859-7',
		'iso8859-7' => 'ISO-8859-7',
		'iso88597' => 'ISO-8859-7',
		'iso_8859-7' => 'ISO-8859-7',
		'iso_8859-7:1987' => 'ISO-8859-7',
		'sun_eu_greek' => 'ISO-8859-7',
		'csiso88598e' => 'ISO-8859-8',
		'csisolatinhebrew' => 'ISO-8859-8',
		'hebrew' => 'ISO-8859-8',
		'iso-8859-8' => 'ISO-8859-8',
		'iso-8859-8-e' => 'ISO-8859-8',
		'iso-ir-138' => 'ISO-8859-8',
		'iso8859-8' => 'ISO-8859-8',
		'iso88598' => 'ISO-8859-8',
		'iso_8859-8' => 'ISO-8859-8',
		'iso_8859-8:1988' => 'ISO-8859-8',
		'visual' => 'ISO-8859-8',
		// ISO-8859-8-I?
		'csiso88598i' => 'ISO-8859-8',
		// ISO-8859-8-I?
		'iso-8859-8-i' => 'ISO-8859-8',
		// ISO-8859-8-I?
		'logical' => 'ISO-8859-8',
		'csisolatin6' => 'ISO-8859-10',
		'iso-8859-10' => 'ISO-8859-10',
		'iso-ir-157' => 'ISO-8859-10',
		'iso8859-10' => 'ISO-8859-10',
		'iso885910' => 'ISO-8859-10',
		'l6' => 'ISO-8859-10',
		'latin6' => 'ISO-8859-10',
		'iso-8859-13' => 'ISO-8859-13',
		'iso8859-13' => 'ISO-8859-13',
		'iso885913' => 'ISO-8859-13',
		'iso-8859-14' => 'ISO-8859-14',
		'iso8859-14' => 'ISO-8859-14',
		'iso885914' => 'ISO-8859-14',
		'csisolatin9' => 'ISO-8859-15',
		'iso-8859-15' => 'ISO-8859-15',
		'iso8859-15' => 'ISO-8859-15',
		'iso885915' => 'ISO-8859-15',
		'iso_8859-15' => 'ISO-8859-15',
		'l9' => 'ISO-8859-15',
		'iso-8859-16' => 'ISO-8859-16',
		'cskoi8r' => 'KOI8-R',
		'koi' => 'KOI8-R',
		'koi8' => 'KOI8-R',
		'koi8-r' => 'KOI8-R',
		'koi8_r' => 'KOI8-R',
		'koi8-ru' => 'KOI8-U',
		'koi8-u' => 'KOI8-U',
		'csmacintosh' => 'macintosh',
		'mac' => 'macintosh',
		'macintosh' => 'macintosh',
		'x-mac-roman' => 'macintosh',
		'dos-874' => 'Windows-874',
		'iso-8859-11' => 'Windows-874',
		'iso8859-11' => 'Windows-874',
		'iso885911' => 'Windows-874',
		'tis-620' => 'Windows-874',
		'windows-874' => 'Windows-874',
		'cp1250' => 'Windows-1250',
		'windows-1250' => 'Windows-1250',
		'x-cp1250' => 'Windows-1250',
		'cp1251' => 'Windows-1251',
		'windows-1251' => 'Windows-1251',
		'x-cp1251' => 'Windows-1251',
		'ansi_x3.4-1968' => 'Windows-1252',
		'ascii' => 'Windows-1252',
		'cp1252' => 'Windows-1252',
		'cp819' => 'Windows-1252',
		'csisolatin1' => 'Windows-1252',
		'ibm819' => 'Windows-1252',
		'iso-8859-1' => 'Windows-1252',
		'iso-ir-100' => 'Windows-1252',
		'iso8859-1' => 'Windows-1252',
		'iso88591' => 'Windows-1252',
		'iso_8859-1' => 'Windows-1252',
		'iso_8859-1:1987' => 'Windows-1252',
		'l1' => 'Windows-1252',
		'latin1' => 'Windows-1252',
		'us-ascii' => 'Windows-1252',
		'windows-1252' => 'Windows-1252',
		'x-cp1252' => 'Windows-1252',
		'cp1253' => 'Windows-1253',
		'windows-1253' => 'Windows-1253',
		'x-cp1253' => 'Windows-1253',
		'cp1254' => 'Windows-1254',
		'csisolatin5' => 'Windows-1254',
		'iso-8859-9' => 'Windows-1254',
		'iso-ir-148' => 'Windows-1254',
		'iso8859-9' => 'Windows-1254',
		'iso88599' => 'Windows-1254',
		'iso_8859-9' => 'Windows-1254',
		'iso_8859-9:1989' => 'Windows-1254',
		'l5' => 'Windows-1254',
		'latin5' => 'Windows-1254',
		'windows-1254' => 'Windows-1254',
		'x-cp1254' => 'Windows-1254',
		'cp1255' => 'Windows-1255',
		'windows-1255' => 'Windows-1255',
		'x-cp1255' => 'Windows-1255',
		'cp1256' => 'Windows-1256',
		'windows-1256' => 'Windows-1256',
		'x-cp1256' => 'Windows-1256',
		'cp1257' => 'Windows-1257',
		'windows-1257' => 'Windows-1257',
		'x-cp1257' => 'Windows-1257',
		'cp1258' => 'Windows-1258',
		'windows-1258' => 'Windows-1258',
		'x-cp1258' => 'Windows-1258',
		'x-mac-cyrillic' => 'mac-cyrillic',
		'x-mac-ukrainian' => 'mac-cyrillic',
		// GBK
		'chinese' => 'GB18030',
		// GBK
		'csgb2312' => 'GB18030',
		// GBK
		'csiso58gb231280' => 'GB18030',
		// GBK
		'gb2312' => 'GB18030',
		// GBK
		'gb_2312' => 'GB18030',
		// GBK
		'gb_2312-80' => 'GB18030',
		// GBK
		'gbk' => 'GB18030',
		// GBK
		'iso-ir-58' => 'GB18030',
		// GBK
		'x-gbk' => 'GB18030',
		'gb18030' => 'GB18030',
		'big5' => 'BIG-5',
		'big5-hkscs' => 'BIG-5',
		'cn-big5' => 'BIG-5',
		'csbig5' => 'BIG-5',
		'x-x-big5' => 'BIG-5',
		'cseucpkdfmtjapanese' => 'EUC-JP',
		'euc-jp' => 'EUC-JP',
		'x-euc-jp' => 'EUC-JP',
		'csiso2022jp' => 'ISO-2022-JP',
		'iso-2022-jp' => 'ISO-2022-JP',
		'csshiftjis' => 'SJIS',
		'ms932' => 'SJIS',
		'ms_kanji' => 'SJIS',
		'shift-jis' => 'SJIS',
		'shift_jis' => 'SJIS',
		'sjis' => 'SJIS',
		'windows-31j' => 'SJIS',
		'x-sjis' => 'SJIS',
		'cseuckr' => 'EUC-KR',
		'csksc56011987' => 'EUC-KR',
		'euc-kr' => 'EUC-KR',
		'iso-ir-149' => 'EUC-KR',
		'korean' => 'EUC-KR',
		'ks_c_5601-1987' => 'EUC-KR',
		'ks_c_5601-1989' => 'EUC-KR',
		'ksc5601' => 'EUC-KR',
		'ksc_5601' => 'EUC-KR',
		'windows-949' => 'EUC-KR',
		'csiso2022kr' => 'replacement',
		'hz-gb-2312' => 'replacement',
		'iso-2022-cn' => 'replacement',
		'iso-2022-cn-ext' => 'replacement',
		'iso-2022-kr' => 'replacement',
		'replacement' => 'replacement',
		'utf-16be' => 'UTF-16BE',
		'utf-16' => 'UTF-16LE',
		'utf-16le' => 'UTF-16LE',
		'x-user-defined' => 'x-user-defined',
	];

	/**
	 * Convert CSS text to UTF-8
	 *
	 * The encoding is chosen in this order, the first match winning:
	 *  - a UTF-8 or UTF-16 byte order mark at the start of $text (the mark
	 *    itself is removed from the result),
	 *  - $encodings['transport'], if it is a known label,
	 *  - an `@charset "...";` rule at the very start of $text, if it names a
	 *    known label,
	 *  - $encodings['environment'], if it is a known label,
	 *  - UTF-8.
	 *
	 * @param string $text Text being detected
	 * @param string[] $encodings Encodings to use at various points in the algorithm:
	 *  - transport: Encoding from HTTP or the like
	 *  - environment: Encoding from HTML `<link>` or the like
	 * @return string
	 * @throws RuntimeException If the conversion itself fails, see doConvert()
	 */
	public static function convert( $text, $encodings = [] ) {
		// First, check for a BOM and honor that if it's present. Only the
		// leading bytes are compared, so the rest of the input is not scanned.
		if ( strncmp( $text, "\xef\xbb\xbf", 3 ) === 0 ) {
			// UTF-8 with BOM (convert it anyway in case the BOM is a lie)
			return self::doConvert( 'UTF-8', substr( $text, 3 ) );
		}
		$start = substr( $text, 0, 2 );
		if ( $start === "\xfe\xff" ) {
			return self::doConvert( 'UTF-16BE', substr( $text, 2 ) );
		}
		if ( $start === "\xff\xfe" ) {
			return self::doConvert( 'UTF-16LE', substr( $text, 2 ) );
		}

		// 1. Transport encoding
		if ( isset( $encodings['transport'] ) ) {
			$encoding = self::resolveLabel( $encodings['transport'] );
			if ( $encoding !== null ) {
				return self::doConvert( $encoding, $text );
			}
		}

		// 2. @charset rule
		if ( preg_match( '/^@charset "([\x00-\x21\x23-\x7f]{0,1012})";/', $text, $m ) ) {
			$encoding = self::resolveLabel( $m[1] );
			if ( $encoding === 'UTF-16BE' || $encoding === 'UTF-16LE' ) {
				// It's obviously lying: the rule was just matched byte-wise as
				// ASCII, so the stream cannot be UTF-16. The resolved encoding
				// is tested rather than the raw label so that aliases such as
				// "utf-16" are caught as well.
				$encoding = 'UTF-8';
			}
			if ( $encoding !== null ) {
				return self::doConvert( $encoding, $text );
			}
		}

		// 3. Environment encoding
		if ( isset( $encodings['environment'] ) ) {
			$encoding = self::resolveLabel( $encodings['environment'] );
			if ( $encoding !== null ) {
				return self::doConvert( $encoding, $text );
			}
		}

		// 4. Just use UTF-8
		return self::doConvert( 'UTF-8', $text );
	}

	/**
	 * Look up an encoding label in self::$encodings
	 *
	 * The label is lower-cased and stripped of leading and trailing tab,
	 * newline, form feed, carriage return and space before the lookup.
	 *
	 * @param string $label Encoding label as supplied by the caller or stylesheet
	 * @return string|null Encoding name to pass to doConvert(), or null if
	 *  the label is not in the table
	 */
	private static function resolveLabel( $label ) {
		$label = trim( strtolower( $label ), "\t\n\f\r " );
		return isset( self::$encodings[$label] ) ? self::$encodings[$label] : null;
	}

	/**
	 * Actually perform the conversion
	 * @param string $encoding Encoding name, one of the values of self::$encodings
	 * @param string $text Bytes to convert
	 * @return string $text converted to UTF-8
	 * @throws RuntimeException If mbstring does not list the encoding and iconv
	 *  fails to convert the text
	 */
	protected static function doConvert( $encoding, $text ) {
		// Pseudo-encoding that just outputs one replacement character. The
		// replacement decoder only reports its single error once it has read
		// a byte, so empty input stays empty.
		if ( $encoding === 'replacement' ) {
			return $text === '' ? '' : Constants::UTF8_REPLACEMENT;
		}

		// Pseudo-encoding that shifts non-ASCII bytes to the BMP private use area
		if ( $encoding === 'x-user-defined' ) {
			return preg_replace_callback( '/[\x80-\xff]/', static function ( $m ) {
				return Utils::codepointToUtf8( 0xf700 + ord( $m[0] ) );
			}, $text );
		}

		// We prefer mbstring because it has sane handling of invalid input,
		// where iconv just chokes and returns false. But we need iconv for
		// some encodings mbstring doesn't support.
		if ( in_array( $encoding, mb_list_encodings(), true ) ) {
			$old = mb_substitute_character();
			mb_substitute_character( Constants::UNICODE_REPLACEMENT );
			try {
				return mb_convert_encoding( $text, 'UTF-8', $encoding );
			} finally {
				// The substitute character is process-wide state, so put it
				// back even if the conversion throws.
				mb_substitute_character( $old );
			}
		}

		// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
		$ret = @iconv( $encoding, 'UTF-8', $text );
		if ( $ret === false ) {
			throw new RuntimeException( "Cannot convert '$text' from $encoding" );
		}
		return $ret;
	}
}