Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
100.00% covered (success)
100.00%
40 / 40
100.00% covered (success)
100.00%
2 / 2
CRAP
100.00% covered (success)
100.00%
1 / 1
Encoder
100.00% covered (success)
100.00%
40 / 40
100.00% covered (success)
100.00%
2 / 2
19
100.00% covered (success)
100.00%
1 / 1
 convert
100.00% covered (success)
100.00%
24 / 24
100.00% covered (success)
100.00%
1 / 1
14
 doConvert
100.00% covered (success)
100.00%
16 / 16
100.00% covered (success)
100.00%
1 / 1
5
1<?php
2/**
3 * @file
4 * @license https://opensource.org/licenses/Apache-2.0 Apache-2.0
5 */
6
7namespace Wikimedia\CSS\Parser;
8
9use RuntimeException;
10use UtfNormal\Constants;
11use UtfNormal\Utils;
12
/**
 * Character set conversion for CSS
 *
 * Determines which encoding a CSS byte stream is in (BOM, transport-level
 * label, `@charset` rule, environment label, then UTF-8 as the last resort)
 * and converts the stream to UTF-8.
 *
 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#input-byte-stream
 */
class Encoder {

    /**
     * @var array Mapping from CSS encoding tags to mbstring/iconv encodings.
     *  Keys are lowercase labels; values are the names handed to
     *  doConvert(), including the pseudo-encodings 'replacement' and
     *  'x-user-defined' that doConvert() handles itself.
     * @see https://encoding.spec.whatwg.org/#concept-encoding-get
     */
    protected static $encodings = [
        'unicode-1-1-utf-8'     => 'UTF-8',
        'utf-8'                 => 'UTF-8',
        'utf8'                  => 'UTF-8',
        '866'                   => 'CP866',
        'cp866'                 => 'CP866',
        'csibm866'              => 'CP866',
        'ibm866'                => 'CP866',
        'csisolatin2'           => 'ISO-8859-2',
        'iso-8859-2'            => 'ISO-8859-2',
        'iso-ir-101'            => 'ISO-8859-2',
        'iso8859-2'             => 'ISO-8859-2',
        'iso88592'              => 'ISO-8859-2',
        'iso_8859-2'            => 'ISO-8859-2',
        'iso_8859-2:1987'       => 'ISO-8859-2',
        'l2'                    => 'ISO-8859-2',
        'latin2'                => 'ISO-8859-2',
        'csisolatin3'           => 'ISO-8859-3',
        'iso-8859-3'            => 'ISO-8859-3',
        'iso-ir-109'            => 'ISO-8859-3',
        'iso8859-3'             => 'ISO-8859-3',
        'iso88593'              => 'ISO-8859-3',
        'iso_8859-3'            => 'ISO-8859-3',
        'iso_8859-3:1988'       => 'ISO-8859-3',
        'l3'                    => 'ISO-8859-3',
        'latin3'                => 'ISO-8859-3',
        'csisolatin4'           => 'ISO-8859-4',
        'iso-8859-4'            => 'ISO-8859-4',
        'iso-ir-110'            => 'ISO-8859-4',
        'iso8859-4'             => 'ISO-8859-4',
        'iso88594'              => 'ISO-8859-4',
        'iso_8859-4'            => 'ISO-8859-4',
        'iso_8859-4:1988'       => 'ISO-8859-4',
        'l4'                    => 'ISO-8859-4',
        'latin4'                => 'ISO-8859-4',
        'csisolatincyrillic'    => 'ISO-8859-5',
        'cyrillic'              => 'ISO-8859-5',
        'iso-8859-5'            => 'ISO-8859-5',
        'iso-ir-144'            => 'ISO-8859-5',
        'iso8859-5'             => 'ISO-8859-5',
        'iso88595'              => 'ISO-8859-5',
        'iso_8859-5'            => 'ISO-8859-5',
        'iso_8859-5:1988'       => 'ISO-8859-5',
        'arabic'                => 'ISO-8859-6',
        'asmo-708'              => 'ISO-8859-6',
        'csiso88596e'           => 'ISO-8859-6',
        'csiso88596i'           => 'ISO-8859-6',
        'csisolatinarabic'      => 'ISO-8859-6',
        'ecma-114'              => 'ISO-8859-6',
        'iso-8859-6'            => 'ISO-8859-6',
        'iso-8859-6-e'          => 'ISO-8859-6',
        'iso-8859-6-i'          => 'ISO-8859-6',
        'iso-ir-127'            => 'ISO-8859-6',
        'iso8859-6'             => 'ISO-8859-6',
        'iso88596'              => 'ISO-8859-6',
        'iso_8859-6'            => 'ISO-8859-6',
        'iso_8859-6:1987'       => 'ISO-8859-6',
        'csisolatingreek'       => 'ISO-8859-7',
        'ecma-118'              => 'ISO-8859-7',
        'elot_928'              => 'ISO-8859-7',
        'greek'                 => 'ISO-8859-7',
        'greek8'                => 'ISO-8859-7',
        'iso-8859-7'            => 'ISO-8859-7',
        'iso-ir-126'            => 'ISO-8859-7',
        'iso8859-7'             => 'ISO-8859-7',
        'iso88597'              => 'ISO-8859-7',
        'iso_8859-7'            => 'ISO-8859-7',
        'iso_8859-7:1987'       => 'ISO-8859-7',
        'sun_eu_greek'          => 'ISO-8859-7',
        'csiso88598e'           => 'ISO-8859-8',
        'csisolatinhebrew'      => 'ISO-8859-8',
        'hebrew'                => 'ISO-8859-8',
        'iso-8859-8'            => 'ISO-8859-8',
        'iso-8859-8-e'          => 'ISO-8859-8',
        'iso-ir-138'            => 'ISO-8859-8',
        'iso8859-8'             => 'ISO-8859-8',
        'iso88598'              => 'ISO-8859-8',
        'iso_8859-8'            => 'ISO-8859-8',
        'iso_8859-8:1988'       => 'ISO-8859-8',
        'visual'                => 'ISO-8859-8',
        // The next three labels are possibly ISO-8859-8-I rather than
        // ISO-8859-8 (open question carried over from the original table).
        'csiso88598i'           => 'ISO-8859-8',
        'iso-8859-8-i'          => 'ISO-8859-8',
        'logical'               => 'ISO-8859-8',
        'csisolatin6'           => 'ISO-8859-10',
        'iso-8859-10'           => 'ISO-8859-10',
        'iso-ir-157'            => 'ISO-8859-10',
        'iso8859-10'            => 'ISO-8859-10',
        'iso885910'             => 'ISO-8859-10',
        'l6'                    => 'ISO-8859-10',
        'latin6'                => 'ISO-8859-10',
        'iso-8859-13'           => 'ISO-8859-13',
        'iso8859-13'            => 'ISO-8859-13',
        'iso885913'             => 'ISO-8859-13',
        'iso-8859-14'           => 'ISO-8859-14',
        'iso8859-14'            => 'ISO-8859-14',
        'iso885914'             => 'ISO-8859-14',
        'csisolatin9'           => 'ISO-8859-15',
        'iso-8859-15'           => 'ISO-8859-15',
        'iso8859-15'            => 'ISO-8859-15',
        'iso885915'             => 'ISO-8859-15',
        'iso_8859-15'           => 'ISO-8859-15',
        'l9'                    => 'ISO-8859-15',
        'iso-8859-16'           => 'ISO-8859-16',
        'cskoi8r'               => 'KOI8-R',
        'koi'                   => 'KOI8-R',
        'koi8'                  => 'KOI8-R',
        'koi8-r'                => 'KOI8-R',
        'koi8_r'                => 'KOI8-R',
        'koi8-ru'               => 'KOI8-U',
        'koi8-u'                => 'KOI8-U',
        'csmacintosh'           => 'macintosh',
        'mac'                   => 'macintosh',
        'macintosh'             => 'macintosh',
        'x-mac-roman'           => 'macintosh',
        'dos-874'               => 'Windows-874',
        'iso-8859-11'           => 'Windows-874',
        'iso8859-11'            => 'Windows-874',
        'iso885911'             => 'Windows-874',
        'tis-620'               => 'Windows-874',
        'windows-874'           => 'Windows-874',
        'cp1250'                => 'Windows-1250',
        'windows-1250'          => 'Windows-1250',
        'x-cp1250'              => 'Windows-1250',
        'cp1251'                => 'Windows-1251',
        'windows-1251'          => 'Windows-1251',
        'x-cp1251'              => 'Windows-1251',
        'ansi_x3.4-1968'        => 'Windows-1252',
        'ascii'                 => 'Windows-1252',
        'cp1252'                => 'Windows-1252',
        'cp819'                 => 'Windows-1252',
        'csisolatin1'           => 'Windows-1252',
        'ibm819'                => 'Windows-1252',
        'iso-8859-1'            => 'Windows-1252',
        'iso-ir-100'            => 'Windows-1252',
        'iso8859-1'             => 'Windows-1252',
        'iso88591'              => 'Windows-1252',
        'iso_8859-1'            => 'Windows-1252',
        'iso_8859-1:1987'       => 'Windows-1252',
        'l1'                    => 'Windows-1252',
        'latin1'                => 'Windows-1252',
        'us-ascii'              => 'Windows-1252',
        'windows-1252'          => 'Windows-1252',
        'x-cp1252'              => 'Windows-1252',
        'cp1253'                => 'Windows-1253',
        'windows-1253'          => 'Windows-1253',
        'x-cp1253'              => 'Windows-1253',
        'cp1254'                => 'Windows-1254',
        'csisolatin5'           => 'Windows-1254',
        'iso-8859-9'            => 'Windows-1254',
        'iso-ir-148'            => 'Windows-1254',
        'iso8859-9'             => 'Windows-1254',
        'iso88599'              => 'Windows-1254',
        'iso_8859-9'            => 'Windows-1254',
        'iso_8859-9:1989'       => 'Windows-1254',
        'l5'                    => 'Windows-1254',
        'latin5'                => 'Windows-1254',
        'windows-1254'          => 'Windows-1254',
        'x-cp1254'              => 'Windows-1254',
        'cp1255'                => 'Windows-1255',
        'windows-1255'          => 'Windows-1255',
        'x-cp1255'              => 'Windows-1255',
        'cp1256'                => 'Windows-1256',
        'windows-1256'          => 'Windows-1256',
        'x-cp1256'              => 'Windows-1256',
        'cp1257'                => 'Windows-1257',
        'windows-1257'          => 'Windows-1257',
        'x-cp1257'              => 'Windows-1257',
        'cp1258'                => 'Windows-1258',
        'windows-1258'          => 'Windows-1258',
        'x-cp1258'              => 'Windows-1258',
        'x-mac-cyrillic'        => 'mac-cyrillic',
        'x-mac-ukrainian'       => 'mac-cyrillic',
        // The next nine labels were each annotated "GBK" in the original
        // table; all of them are converted as GB18030 here.
        'chinese'               => 'GB18030',
        'csgb2312'              => 'GB18030',
        'csiso58gb231280'       => 'GB18030',
        'gb2312'                => 'GB18030',
        'gb_2312'               => 'GB18030',
        'gb_2312-80'            => 'GB18030',
        'gbk'                   => 'GB18030',
        'iso-ir-58'             => 'GB18030',
        'x-gbk'                 => 'GB18030',
        'gb18030'               => 'GB18030',
        'big5'                  => 'BIG-5',
        'big5-hkscs'            => 'BIG-5',
        'cn-big5'               => 'BIG-5',
        'csbig5'                => 'BIG-5',
        'x-x-big5'              => 'BIG-5',
        'cseucpkdfmtjapanese'   => 'EUC-JP',
        'euc-jp'                => 'EUC-JP',
        'x-euc-jp'              => 'EUC-JP',
        'csiso2022jp'           => 'ISO-2022-JP',
        'iso-2022-jp'           => 'ISO-2022-JP',
        'csshiftjis'            => 'SJIS',
        'ms932'                 => 'SJIS',
        'ms_kanji'              => 'SJIS',
        'shift-jis'             => 'SJIS',
        'shift_jis'             => 'SJIS',
        'sjis'                  => 'SJIS',
        'windows-31j'           => 'SJIS',
        'x-sjis'                => 'SJIS',
        'cseuckr'               => 'EUC-KR',
        'csksc56011987'         => 'EUC-KR',
        'euc-kr'                => 'EUC-KR',
        'iso-ir-149'            => 'EUC-KR',
        'korean'                => 'EUC-KR',
        'ks_c_5601-1987'        => 'EUC-KR',
        'ks_c_5601-1989'        => 'EUC-KR',
        'ksc5601'               => 'EUC-KR',
        'ksc_5601'              => 'EUC-KR',
        'windows-949'           => 'EUC-KR',
        'csiso2022kr'           => 'replacement',
        'hz-gb-2312'            => 'replacement',
        'iso-2022-cn'           => 'replacement',
        'iso-2022-cn-ext'       => 'replacement',
        'iso-2022-kr'           => 'replacement',
        'replacement'           => 'replacement',
        'utf-16be'              => 'UTF-16BE',
        'utf-16'                => 'UTF-16LE',
        'utf-16le'              => 'UTF-16LE',
        'x-user-defined'        => 'x-user-defined',
    ];

    /**
     * Convert CSS text to UTF-8
     *
     * The encoding is chosen in this order: byte order mark, transport
     * label, `@charset` rule at the very start of the text, environment
     * label, and finally UTF-8. Labels that are missing from the mapping
     * table are skipped and the next source is tried.
     *
     * @param string $text Text being detected
     * @param string[] $encodings Encodings to use at various points in the algorithm:
     *  - transport: Encoding from HTTP or the like
     *  - environment: Encoding from HTML `<link>` or the like
     * @return string
     * @throws RuntimeException If the iconv fallback in doConvert() fails
     */
    public static function convert( $text, $encodings = [] ) {
        // First, check for a BOM and honor that if it's present. Only the
        // leading bytes are inspected, so a BOM-less input is never scanned.
        if ( strncmp( $text, "\xef\xbb\xbf", 3 ) === 0 ) {
            // UTF-8 with BOM (convert it anyway in case the BOM is a lie)
            return self::doConvert( 'UTF-8', substr( $text, 3 ) );
        }
        $start = substr( $text, 0, 2 );
        if ( $start === "\xfe\xff" ) {
            return self::doConvert( 'UTF-16BE', substr( $text, 2 ) );
        }
        if ( $start === "\xff\xfe" ) {
            return self::doConvert( 'UTF-16LE', substr( $text, 2 ) );
        }

        // 1. Transport encoding
        $encoding = self::lookupEncoding(
            isset( $encodings['transport'] ) ? $encodings['transport'] : null
        );
        if ( $encoding !== null ) {
            return self::doConvert( $encoding, $text );
        }

        // 2. @charset rule
        if ( preg_match( '/^@charset "([\x00-\x21\x23-\x7f]{0,1012})";/', $text, $m ) ) {
            $encoding = self::lookupEncoding( $m[1] );
            // The rule was just matched as single-byte ASCII, so a claim of
            // UTF-16 is obviously a lie. Compare the *resolved* encoding, not
            // the raw label, so that aliases such as "utf-16" are caught too.
            if ( $encoding === 'UTF-16BE' || $encoding === 'UTF-16LE' ) {
                $encoding = 'UTF-8';
            }
            if ( $encoding !== null ) {
                return self::doConvert( $encoding, $text );
            }
        }

        // 3. Environment encoding
        $encoding = self::lookupEncoding(
            isset( $encodings['environment'] ) ? $encodings['environment'] : null
        );
        if ( $encoding !== null ) {
            return self::doConvert( $encoding, $text );
        }

        // 4. Just use UTF-8
        return self::doConvert( 'UTF-8', $text );
    }

    /**
     * Resolve an encoding label to the encoding name used for conversion
     * @param string|null $label Raw label, or null if none was supplied
     * @return string|null Value from self::$encodings, or null when the
     *  label is null or not present in the table
     */
    private static function lookupEncoding( $label ) {
        if ( $label === null ) {
            return null;
        }
        // Labels are matched case-insensitively, ignoring surrounding
        // tab/LF/FF/CR/space.
        $label = trim( strtolower( $label ), "\t\n\f\r " );
        return isset( self::$encodings[$label] ) ? self::$encodings[$label] : null;
    }

    /**
     * Actually perform the conversion
     * @param string $encoding A value from self::$encodings
     * @param string $text
     * @return string
     * @throws RuntimeException If mbstring does not list the encoding and
     *  iconv fails to convert the text
     */
    protected static function doConvert( $encoding, $text ) {
        // Pseudo-encoding that just outputs one replacement character
        if ( $encoding === 'replacement' ) {
            return Constants::UTF8_REPLACEMENT;
        }

        // Pseudo-encoding that shifts non-ASCII bytes to the BMP private use area
        if ( $encoding === 'x-user-defined' ) {
            return preg_replace_callback( '/[\x80-\xff]/', static function ( $m ) {
                return Utils::codepointToUtf8( 0xf700 + ord( $m[0] ) );
            }, $text );
        }

        // We prefer mbstring because it has sane handling of invalid input,
        // where iconv just chokes and returns false. But we need iconv for
        // some encodings mbstring doesn't support.
        if ( in_array( $encoding, mb_list_encodings(), true ) ) {
            // The substitute character is process-wide mbstring state, so put
            // the caller's setting back even if the conversion throws.
            $old = mb_substitute_character();
            mb_substitute_character( Constants::UNICODE_REPLACEMENT );
            try {
                return mb_convert_encoding( $text, 'UTF-8', $encoding );
            } finally {
                mb_substitute_character( $old );
            }
        }

        // phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
        $ret = @iconv( $encoding, 'UTF-8', $text );
        if ( $ret === false ) {
            throw new RuntimeException( "Cannot convert '$text' from $encoding" );
        }
        return $ret;
    }
}