Code Coverage
 
Classes and Traits
Functions and Methods
Lines
Total
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
2 / 2
CRAP
100.00% covered (success)
100.00%
39 / 39
Encoder
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
2 / 2
19
100.00% covered (success)
100.00%
39 / 39
 convert
100.00% covered (success)
100.00%
1 / 1
14
100.00% covered (success)
100.00%
24 / 24
 doConvert
100.00% covered (success)
100.00%
1 / 1
5
100.00% covered (success)
100.00%
15 / 15
<?php
/**
 * @file
 * @license https://opensource.org/licenses/Apache-2.0 Apache-2.0
 */
namespace Wikimedia\CSS\Parser;
/**
 * Character set conversion for CSS
 * @see https://www.w3.org/TR/2014/CR-css-syntax-3-20140220/#input-byte-stream
 */
class Encoder {
    /**
     * @var array Mapping from CSS encoding tags to mbstring/iconv encodings.
     *  Keys are lowercase encoding labels; values are the encoding names handed
     *  to doConvert(), including the pseudo-encodings 'replacement' and
     *  'x-user-defined' that doConvert() handles itself.
     * @see https://encoding.spec.whatwg.org/#concept-encoding-get
     */
    protected static $encodings = [
        'unicode-1-1-utf-8'     => 'UTF-8',
        'utf-8'                 => 'UTF-8',
        'utf8'                  => 'UTF-8',
        '866'                   => 'CP866',
        'cp866'                 => 'CP866',
        'csibm866'              => 'CP866',
        'ibm866'                => 'CP866',
        'csisolatin2'           => 'ISO-8859-2',
        'iso-8859-2'            => 'ISO-8859-2',
        'iso-ir-101'            => 'ISO-8859-2',
        'iso8859-2'             => 'ISO-8859-2',
        'iso88592'              => 'ISO-8859-2',
        'iso_8859-2'            => 'ISO-8859-2',
        'iso_8859-2:1987'       => 'ISO-8859-2',
        'l2'                    => 'ISO-8859-2',
        'latin2'                => 'ISO-8859-2',
        'csisolatin3'           => 'ISO-8859-3',
        'iso-8859-3'            => 'ISO-8859-3',
        'iso-ir-109'            => 'ISO-8859-3',
        'iso8859-3'             => 'ISO-8859-3',
        'iso88593'              => 'ISO-8859-3',
        'iso_8859-3'            => 'ISO-8859-3',
        'iso_8859-3:1988'       => 'ISO-8859-3',
        'l3'                    => 'ISO-8859-3',
        'latin3'                => 'ISO-8859-3',
        'csisolatin4'           => 'ISO-8859-4',
        'iso-8859-4'            => 'ISO-8859-4',
        'iso-ir-110'            => 'ISO-8859-4',
        'iso8859-4'             => 'ISO-8859-4',
        'iso88594'              => 'ISO-8859-4',
        'iso_8859-4'            => 'ISO-8859-4',
        'iso_8859-4:1988'       => 'ISO-8859-4',
        'l4'                    => 'ISO-8859-4',
        'latin4'                => 'ISO-8859-4',
        'csisolatincyrillic'    => 'ISO-8859-5',
        'cyrillic'              => 'ISO-8859-5',
        'iso-8859-5'            => 'ISO-8859-5',
        'iso-ir-144'            => 'ISO-8859-5',
        'iso8859-5'             => 'ISO-8859-5',
        'iso88595'              => 'ISO-8859-5',
        'iso_8859-5'            => 'ISO-8859-5',
        'iso_8859-5:1988'       => 'ISO-8859-5',
        'arabic'                => 'ISO-8859-6',
        'asmo-708'              => 'ISO-8859-6',
        'csiso88596e'           => 'ISO-8859-6',
        'csiso88596i'           => 'ISO-8859-6',
        'csisolatinarabic'      => 'ISO-8859-6',
        'ecma-114'              => 'ISO-8859-6',
        'iso-8859-6'            => 'ISO-8859-6',
        'iso-8859-6-e'          => 'ISO-8859-6',
        'iso-8859-6-i'          => 'ISO-8859-6',
        'iso-ir-127'            => 'ISO-8859-6',
        'iso8859-6'             => 'ISO-8859-6',
        'iso88596'              => 'ISO-8859-6',
        'iso_8859-6'            => 'ISO-8859-6',
        'iso_8859-6:1987'       => 'ISO-8859-6',
        'csisolatingreek'       => 'ISO-8859-7',
        'ecma-118'              => 'ISO-8859-7',
        'elot_928'              => 'ISO-8859-7',
        'greek'                 => 'ISO-8859-7',
        'greek8'                => 'ISO-8859-7',
        'iso-8859-7'            => 'ISO-8859-7',
        'iso-ir-126'            => 'ISO-8859-7',
        'iso8859-7'             => 'ISO-8859-7',
        'iso88597'              => 'ISO-8859-7',
        'iso_8859-7'            => 'ISO-8859-7',
        'iso_8859-7:1987'       => 'ISO-8859-7',
        'sun_eu_greek'          => 'ISO-8859-7',
        'csiso88598e'           => 'ISO-8859-8',
        'csisolatinhebrew'      => 'ISO-8859-8',
        'hebrew'                => 'ISO-8859-8',
        'iso-8859-8'            => 'ISO-8859-8',
        'iso-8859-8-e'          => 'ISO-8859-8',
        'iso-ir-138'            => 'ISO-8859-8',
        'iso8859-8'             => 'ISO-8859-8',
        'iso88598'              => 'ISO-8859-8',
        'iso_8859-8'            => 'ISO-8859-8',
        'iso_8859-8:1988'       => 'ISO-8859-8',
        'visual'                => 'ISO-8859-8',
        'csiso88598i'           => 'ISO-8859-8', // ISO-8859-8-I?
        'iso-8859-8-i'          => 'ISO-8859-8', // ISO-8859-8-I?
        'logical'               => 'ISO-8859-8', // ISO-8859-8-I?
        'csisolatin6'           => 'ISO-8859-10',
        'iso-8859-10'           => 'ISO-8859-10',
        'iso-ir-157'            => 'ISO-8859-10',
        'iso8859-10'            => 'ISO-8859-10',
        'iso885910'             => 'ISO-8859-10',
        'l6'                    => 'ISO-8859-10',
        'latin6'                => 'ISO-8859-10',
        'iso-8859-13'           => 'ISO-8859-13',
        'iso8859-13'            => 'ISO-8859-13',
        'iso885913'             => 'ISO-8859-13',
        'iso-8859-14'           => 'ISO-8859-14',
        'iso8859-14'            => 'ISO-8859-14',
        'iso885914'             => 'ISO-8859-14',
        'csisolatin9'           => 'ISO-8859-15',
        'iso-8859-15'           => 'ISO-8859-15',
        'iso8859-15'            => 'ISO-8859-15',
        'iso885915'             => 'ISO-8859-15',
        'iso_8859-15'           => 'ISO-8859-15',
        'l9'                    => 'ISO-8859-15',
        'iso-8859-16'           => 'ISO-8859-16',
        'cskoi8r'               => 'KOI8-R',
        'koi'                   => 'KOI8-R',
        'koi8'                  => 'KOI8-R',
        'koi8-r'                => 'KOI8-R',
        'koi8_r'                => 'KOI8-R',
        'koi8-ru'               => 'KOI8-U',
        'koi8-u'                => 'KOI8-U',
        'csmacintosh'           => 'macintosh',
        'mac'                   => 'macintosh',
        'macintosh'             => 'macintosh',
        'x-mac-roman'           => 'macintosh',
        'dos-874'               => 'Windows-874',
        'iso-8859-11'           => 'Windows-874',
        'iso8859-11'            => 'Windows-874',
        'iso885911'             => 'Windows-874',
        'tis-620'               => 'Windows-874',
        'windows-874'           => 'Windows-874',
        'cp1250'                => 'Windows-1250',
        'windows-1250'          => 'Windows-1250',
        'x-cp1250'              => 'Windows-1250',
        'cp1251'                => 'Windows-1251',
        'windows-1251'          => 'Windows-1251',
        'x-cp1251'              => 'Windows-1251',
        'ansi_x3.4-1968'        => 'Windows-1252',
        'ascii'                 => 'Windows-1252',
        'cp1252'                => 'Windows-1252',
        'cp819'                 => 'Windows-1252',
        'csisolatin1'           => 'Windows-1252',
        'ibm819'                => 'Windows-1252',
        'iso-8859-1'            => 'Windows-1252',
        'iso-ir-100'            => 'Windows-1252',
        'iso8859-1'             => 'Windows-1252',
        'iso88591'              => 'Windows-1252',
        'iso_8859-1'            => 'Windows-1252',
        'iso_8859-1:1987'       => 'Windows-1252',
        'l1'                    => 'Windows-1252',
        'latin1'                => 'Windows-1252',
        'us-ascii'              => 'Windows-1252',
        'windows-1252'          => 'Windows-1252',
        'x-cp1252'              => 'Windows-1252',
        'cp1253'                => 'Windows-1253',
        'windows-1253'          => 'Windows-1253',
        'x-cp1253'              => 'Windows-1253',
        'cp1254'                => 'Windows-1254',
        'csisolatin5'           => 'Windows-1254',
        'iso-8859-9'            => 'Windows-1254',
        'iso-ir-148'            => 'Windows-1254',
        'iso8859-9'             => 'Windows-1254',
        'iso88599'              => 'Windows-1254',
        'iso_8859-9'            => 'Windows-1254',
        'iso_8859-9:1989'       => 'Windows-1254',
        'l5'                    => 'Windows-1254',
        'latin5'                => 'Windows-1254',
        'windows-1254'          => 'Windows-1254',
        'x-cp1254'              => 'Windows-1254',
        'cp1255'                => 'Windows-1255',
        'windows-1255'          => 'Windows-1255',
        'x-cp1255'              => 'Windows-1255',
        'cp1256'                => 'Windows-1256',
        'windows-1256'          => 'Windows-1256',
        'x-cp1256'              => 'Windows-1256',
        'cp1257'                => 'Windows-1257',
        'windows-1257'          => 'Windows-1257',
        'x-cp1257'              => 'Windows-1257',
        'cp1258'                => 'Windows-1258',
        'windows-1258'          => 'Windows-1258',
        'x-cp1258'              => 'Windows-1258',
        'x-mac-cyrillic'        => 'mac-cyrillic',
        'x-mac-ukrainian'       => 'mac-cyrillic',
        'chinese'               => 'GB18030', // GBK
        'csgb2312'              => 'GB18030', // GBK
        'csiso58gb231280'       => 'GB18030', // GBK
        'gb2312'                => 'GB18030', // GBK
        'gb_2312'               => 'GB18030', // GBK
        'gb_2312-80'            => 'GB18030', // GBK
        'gbk'                   => 'GB18030', // GBK
        'iso-ir-58'             => 'GB18030', // GBK
        'x-gbk'                 => 'GB18030', // GBK
        'gb18030'               => 'GB18030',
        'big5'                  => 'BIG-5',
        'big5-hkscs'            => 'BIG-5',
        'cn-big5'               => 'BIG-5',
        'csbig5'                => 'BIG-5',
        'x-x-big5'              => 'BIG-5',
        'cseucpkdfmtjapanese'   => 'EUC-JP',
        'euc-jp'                => 'EUC-JP',
        'x-euc-jp'              => 'EUC-JP',
        'csiso2022jp'           => 'ISO-2022-JP',
        'iso-2022-jp'           => 'ISO-2022-JP',
        'csshiftjis'            => 'SJIS',
        'ms932'                 => 'SJIS',
        'ms_kanji'              => 'SJIS',
        'shift-jis'             => 'SJIS',
        'shift_jis'             => 'SJIS',
        'sjis'                  => 'SJIS',
        'windows-31j'           => 'SJIS',
        'x-sjis'                => 'SJIS',
        'cseuckr'               => 'EUC-KR',
        'csksc56011987'         => 'EUC-KR',
        'euc-kr'                => 'EUC-KR',
        'iso-ir-149'            => 'EUC-KR',
        'korean'                => 'EUC-KR',
        'ks_c_5601-1987'        => 'EUC-KR',
        'ks_c_5601-1989'        => 'EUC-KR',
        'ksc5601'               => 'EUC-KR',
        'ksc_5601'              => 'EUC-KR',
        'windows-949'           => 'EUC-KR',
        'csiso2022kr'           => 'replacement',
        'hz-gb-2312'            => 'replacement',
        'iso-2022-cn'           => 'replacement',
        'iso-2022-cn-ext'       => 'replacement',
        'iso-2022-kr'           => 'replacement',
        'utf-16be'              => 'UTF-16BE',
        'utf-16'                => 'UTF-16LE',
        'utf-16le'              => 'UTF-16LE',
        'x-user-defined'        => 'x-user-defined',
    ];
    /**
     * Convert CSS text to UTF-8
     *
     * The source encoding is chosen by the first of these that applies:
     *  - a UTF-8 or UTF-16 byte-order mark (which is stripped from the output),
     *  - the 'transport' encoding, if its label is known,
     *  - a leading `@charset "...";` rule, if its label is known,
     *  - the 'environment' encoding, if its label is known,
     *  - UTF-8.
     *
     * @param string $text Text being detected
     * @param string[] $encodings Encodings to use at various points in the algorithm:
     *  - transport: Encoding from HTTP or the like
     *  - environment: Encoding from HTML `<link>` or the like
     * @return string
     * @throws \RuntimeException If doConvert() cannot convert from the chosen encoding
     */
    public static function convert( $text, $encodings = [] ) {
        // First, check for a BOM and honor that if it's present.
        if ( substr( $text, 0, 3 ) === "\xef\xbb\xbf" ) {
            // UTF-8 with BOM (convert it anyway in case the BOM is a lie)
            return self::doConvert( 'UTF-8', substr( $text, 3 ) );
        }
        $start = substr( $text, 0, 2 );
        if ( $start === "\xfe\xff" ) {
            return self::doConvert( 'UTF-16BE', substr( $text, 2 ) );
        }
        if ( $start === "\xff\xfe" ) {
            return self::doConvert( 'UTF-16LE', substr( $text, 2 ) );
        }
        // 1. Transport encoding
        $encoding = isset( $encodings['transport'] )
            ? self::lookupEncoding( $encodings['transport'] )
            : null;
        if ( $encoding !== null ) {
            return self::doConvert( $encoding, $text );
        }
        // 2. @charset rule
        if ( preg_match( '/^@charset "([\x00-\x21\x23-\x7f]{0,1012})";/', $text, $m ) ) {
            $encoding = self::lookupEncoding( $m[1] );
            // The rule was just matched as single-byte ASCII, so a claim that the
            // stylesheet is UTF-16 is obviously a lie. Test the resolved encoding
            // rather than the raw label so that aliases such as "utf-16" are
            // caught as well.
            if ( $encoding === 'UTF-16BE' || $encoding === 'UTF-16LE' ) {
                $encoding = 'UTF-8';
            }
            if ( $encoding !== null ) {
                return self::doConvert( $encoding, $text );
            }
        }
        // 3. Environment encoding
        $encoding = isset( $encodings['environment'] )
            ? self::lookupEncoding( $encodings['environment'] )
            : null;
        if ( $encoding !== null ) {
            return self::doConvert( $encoding, $text );
        }
        // 4. Just use UTF-8
        return self::doConvert( 'UTF-8', $text );
    }
    /**
     * Resolve an encoding label to an entry of self::$encodings
     *
     * The label is lowercased and stripped of leading/trailing tab, newline,
     * form feed, carriage return and space before being looked up.
     *
     * @param string $label Encoding label as supplied by the caller or the stylesheet
     * @return string|null Encoding name for doConvert(), or null if the label is unknown
     */
    private static function lookupEncoding( $label ) {
        $label = trim( strtolower( $label ), "\t\n\f\r " );
        return isset( self::$encodings[$label] ) ? self::$encodings[$label] : null;
    }
    /**
     * Actually perform the conversion
     * @param string $encoding Encoding name, normally a value of self::$encodings
     * @param string $text Bytes to convert
     * @return string UTF-8 text
     * @throws \RuntimeException If mbstring does not list the encoding and iconv
     *  returns false
     */
    protected static function doConvert( $encoding, $text ) {
        // Pseudo-encoding that just outputs one replacement character
        if ( $encoding === 'replacement' ) {
            return \UtfNormal\Constants::UTF8_REPLACEMENT;
        }
        // Pseudo-encoding that shifts non-ASCII bytes to the BMP private use area
        if ( $encoding === 'x-user-defined' ) {
            return preg_replace_callback( '/[\x80-\xff]/', function ( $m ) {
                return \UtfNormal\Utils::codepointToUtf8( 0xf700 + ord( $m[0] ) );
            }, $text );
        }
        // We prefer mbstring because it has sane handling of invalid input,
        // where iconv just chokes and returns false. But we need iconv for
        // some encodings mbstring doesn't support.
        if ( in_array( $encoding, mb_list_encodings(), true ) ) {
            $old = mb_substitute_character();
            mb_substitute_character( \UtfNormal\Constants::UNICODE_REPLACEMENT );
            try {
                return mb_convert_encoding( $text, 'UTF-8', $encoding );
            } finally {
                // The substitute character is process-wide mbstring state, so
                // put it back even if the conversion throws.
                mb_substitute_character( $old );
            }
        }
        $ret = \Wikimedia\AtEase\AtEase::quietCall( 'iconv', $encoding, 'UTF-8', $text );
        if ( $ret === false ) {
            throw new \RuntimeException( "Cannot convert '$text' from $encoding" );
        }
        return $ret;
    }
}