Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
100.00% covered (success)
100.00%
339 / 339
100.00% covered (success)
100.00%
26 / 26
CRAP
100.00% covered (success)
100.00%
1 / 1
DataSourceTokenizer
100.00% covered (success)
100.00%
339 / 339
100.00% covered (success)
100.00%
26 / 26
201
100.00% covered (success)
100.00%
1 / 1
 __construct
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 nextChar
100.00% covered (success)
100.00%
11 / 11
100.00% covered (success)
100.00%
1 / 1
7
 consumeCharacter
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
3
 reconsumeCharacter
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
3
 lookAhead
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
1
 getParseErrors
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 clearParseErrors
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 parseError
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
6
 consumeToken
100.00% covered (success)
100.00%
108 / 108
100.00% covered (success)
100.00%
1 / 1
55
 consumeNumericToken
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
3
 consumeIdentLikeToken
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
11
 consumeStringToken
100.00% covered (success)
100.00%
22 / 22
100.00% covered (success)
100.00%
1 / 1
10
 consumeUrlToken
100.00% covered (success)
100.00%
36 / 36
100.00% covered (success)
100.00%
1 / 1
18
 consumeBadUrlRemnants
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
5
 isWhitespace
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
3
 isNameStartCharacter
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
6
 isNameCharacter
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
9
 isNonPrintable
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
6
 isDigit
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
2
 isHexDigit
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
6
 isValidEscape
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
2
 wouldStartIdentifier
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
6
 wouldStartNumber
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
6
 consumeEscape
100.00% covered (success)
100.00%
17 / 17
100.00% covered (success)
100.00%
1 / 1
10
 consumeName
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
4
 consumeNumber
100.00% covered (success)
100.00%
39 / 39
100.00% covered (success)
100.00%
1 / 1
16
1<?php
2/**
3 * @file
4 * @license https://opensource.org/licenses/Apache-2.0 Apache-2.0
5 */
6
7namespace Wikimedia\CSS\Parser;
8
9use InvalidArgumentException;
10use UnexpectedValueException;
11use UtfNormal\Constants;
12use UtfNormal\Utils;
13use Wikimedia\CSS\Objects\Token;
14
15/**
16 * Parse CSS into tokens
17 *
18 * This implements the tokenizer from the CSS Syntax Module Level 3 candidate recommendation.
19 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/
20 */
21class DataSourceTokenizer implements Tokenizer {
22
23    /** @var DataSource */
24    protected $source;
25
26    /** @var int line in the input */
27    protected $line = 1;
28
29    /** @var int position in the line in the input */
30    protected $pos = 0;
31
32    /** @var string|null|object The most recently consumed character */
33    protected $currentCharacter = null;
34
35    /** @var string|null The next character to be consumed */
36    protected $nextCharacter = null;
37
38    /** @var array Parse errors. Each error is [ string $tag, int $line, int $pos ] */
39    protected $parseErrors = [];
40
41    /**
42     * @param DataSource $source
43     * @param array $options Configuration options.
44     *  (none currently defined)
45     */
46    public function __construct( DataSource $source, array $options = [] ) {
47        $this->source = $source;
48    }
49
50    /**
51     * Read a character from the data source
52     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#input-preprocessing
53     * @return string One UTF-8 character, or empty string on EOF
54     */
55    protected function nextChar() {
56        $char = $this->source->readCharacter();
57
58        // Perform transformations per the spec
59
60        // Any U+0000 or surrogate code point becomes U+FFFD
61        if ( $char === "\0" || ( $char >= "\u{D800}" && $char <= "\u{DFFF}" ) ) {
62            return Constants::UTF8_REPLACEMENT;
63        }
64
65        // Any U+000D, U+000C, or pair of U+000D + U+000A becomes U+000A
66        if ( $char === "\f" ) {
67            // U+000C
68            return "\n";
69        }
70
71        if ( $char === "\r" ) {
72            // Either U+000D + U+000A or a lone U+000D
73            $char2 = $this->source->readCharacter();
74            if ( $char2 !== "\n" ) {
75                $this->source->putBackCharacter( $char2 );
76            }
77            return "\n";
78        }
79
80        return $char;
81    }
82
83    /**
84     * Update the current and next character fields
85     */
86    protected function consumeCharacter() {
87        if ( $this->currentCharacter === "\n" ) {
88            $this->line++;
89            $this->pos = 1;
90        } elseif ( $this->currentCharacter !== DataSource::EOF ) {
91            $this->pos++;
92        }
93
94        $this->currentCharacter = $this->nextChar();
95        $this->nextCharacter = $this->nextChar();
96        $this->source->putBackCharacter( $this->nextCharacter );
97    }
98
99    /**
100     * Reconsume the next character
101     *
102     * In more normal terms, this pushes a character back onto the data source,
103     * so it will be read again for the next call to self::consumeCharacter().
104     */
105    protected function reconsumeCharacter() {
106        // @codeCoverageIgnoreStart
107        if ( !is_string( $this->currentCharacter ) ) {
108            throw new UnexpectedValueException( "[$this->line:$this->pos] Can't reconsume" );
109        }
110        // @codeCoverageIgnoreEnd
111
112        if ( $this->currentCharacter === DataSource::EOF ) {
113            // Huh?
114            return;
115        }
116
117        $this->source->putBackCharacter( $this->currentCharacter );
118        $this->nextCharacter = $this->currentCharacter;
119        $this->currentCharacter = (object)[];
120        $this->pos--;
121    }
122
123    /**
124     * Look ahead at the next three characters
125     * @return string[] Three characters
126     */
127    protected function lookAhead() {
128        $ret = [
129            $this->nextChar(),
130            $this->nextChar(),
131            $this->nextChar(),
132        ];
133        $this->source->putBackCharacter( $ret[2] );
134        $this->source->putBackCharacter( $ret[1] );
135        $this->source->putBackCharacter( $ret[0] );
136
137        return $ret;
138    }
139
140    /** @inheritDoc */
141    public function getParseErrors() {
142        return $this->parseErrors;
143    }
144
145    /** @inheritDoc */
146    public function clearParseErrors() {
147        $this->parseErrors = [];
148    }
149
150    /**
151     * Record a parse error
152     * @param string $tag Error tag
153     * @param array|null $position Report the error as starting at this
154     *  position instead of at the current position.
155     * @param array $data Extra data about the error.
156     */
157    protected function parseError( $tag, array $position = null, array $data = [] ) {
158        if ( $position ) {
159            if ( isset( $position['position'] ) ) {
160                $position = $position['position'];
161            }
162            if ( count( $position ) !== 2 || !is_int( $position[0] ) || !is_int( $position[1] ) ) {
163                // @codeCoverageIgnoreStart
164                throw new InvalidArgumentException( 'Invalid position' );
165                // @codeCoverageIgnoreEnd
166            }
167            $err = [ $tag, $position[0], $position[1] ];
168        } else {
169            $err = [ $tag, $this->line, $this->pos ];
170        }
171        $this->parseErrors[] = array_merge( $err, $data );
172    }
173
174    /**
175     * Read a token from the data source
176     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-token
177     * @return Token
178     * @suppress PhanPluginDuplicateAdjacentStatement,PhanPluginDuplicateSwitchCaseLooseEquality
179     */
180    public function consumeToken() {
181        // We "consume comments" inline below, see `case '/'`.
182
183        $this->consumeCharacter();
184        $pos = [ 'position' => [ $this->line, $this->pos ] ];
185
186        switch ( (string)$this->currentCharacter ) {
187            case "\n":
188            case "\t":
189            case ' ':
190                // Whitespace token
191                while ( self::isWhitespace( $this->nextCharacter ) ) {
192                    $this->consumeCharacter();
193                }
194                return new Token( Token::T_WHITESPACE, $pos );
195
196            case '"':
197            case '\'':
198                // String token
199                return $this->consumeStringToken( $this->currentCharacter, $pos );
200
201            case '#':
202                [ $next, $next2, $next3 ] = $this->lookAhead();
203                if ( self::isNameCharacter( $this->nextCharacter ) ||
204                    self::isValidEscape( $next, $next2 )
205                ) {
206                    return new Token( Token::T_HASH, $pos + [
207                        'typeFlag' => self::wouldStartIdentifier( $next, $next2, $next3 ) ? 'id' : 'unrestricted',
208                        'value' => $this->consumeName(),
209                    ] );
210                }
211
212                return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] );
213
214            case '(':
215                return new Token( Token::T_LEFT_PAREN, $pos );
216
217            case ')':
218                return new Token( Token::T_RIGHT_PAREN, $pos );
219
220            case '+':
221            case '.':
222                [ $next, $next2, ] = $this->lookAhead();
223                if ( self::wouldStartNumber( $this->currentCharacter, $next, $next2 ) ) {
224                    $this->reconsumeCharacter();
225                    return $this->consumeNumericToken( $pos );
226                }
227
228                return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] );
229
230            case ',':
231                return new Token( Token::T_COMMA, $pos );
232
233            case '-':
234                [ $next, $next2, ] = $this->lookAhead();
235                if ( self::wouldStartNumber( $this->currentCharacter, $next, $next2 ) ) {
236                    $this->reconsumeCharacter();
237                    return $this->consumeNumericToken( $pos );
238                }
239
240                if ( $next === '-' && $next2 === '>' ) {
241                    $this->consumeCharacter();
242                    $this->consumeCharacter();
243                    return new Token( Token::T_CDC, $pos );
244                }
245
246                if ( self::wouldStartIdentifier( $this->currentCharacter, $next, $next2 ) ) {
247                    $this->reconsumeCharacter();
248                    return $this->consumeIdentLikeToken( $pos );
249                }
250
251                return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] );
252
253            case '/':
254                if ( $this->nextCharacter === '*' ) {
255                    $this->consumeCharacter();
256                    $this->consumeCharacter();
257                    while ( $this->currentCharacter !== DataSource::EOF &&
258                        // @phan-suppress-next-line PhanSuspiciousValueComparisonInLoop
259                        !( $this->currentCharacter === '*' && $this->nextCharacter === '/' )
260                    ) {
261                        $this->consumeCharacter();
262                    }
263                    if ( $this->currentCharacter === DataSource::EOF ) {
264                        $this->parseError( 'unclosed-comment', $pos );
265                    }
266                    $this->consumeCharacter();
267                    // @phan-suppress-next-line PhanPossiblyInfiniteRecursionSameParams
268                    return $this->consumeToken();
269                }
270
271                return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] );
272
273            case ':':
274                return new Token( Token::T_COLON, $pos );
275
276            case ';':
277                return new Token( Token::T_SEMICOLON, $pos );
278
279            case '<':
280                [ $next, $next2, $next3 ] = $this->lookAhead();
281                if ( $next === '!' && $next2 === '-' && $next3 === '-' ) {
282                    $this->consumeCharacter();
283                    $this->consumeCharacter();
284                    $this->consumeCharacter();
285                    return new Token( Token::T_CDO, $pos );
286                }
287
288                return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] );
289
290            case '@':
291                [ $next, $next2, $next3 ] = $this->lookAhead();
292                if ( self::wouldStartIdentifier( $next, $next2, $next3 ) ) {
293                    return new Token( Token::T_AT_KEYWORD, $pos + [ 'value' => $this->consumeName() ] );
294                }
295
296                return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] );
297
298            case '[':
299                return new Token( Token::T_LEFT_BRACKET, $pos );
300
301            case '\\':
302                if ( self::isValidEscape( $this->currentCharacter, $this->nextCharacter ) ) {
303                    $this->reconsumeCharacter();
304                    return $this->consumeIdentLikeToken( $pos );
305                }
306
307                $this->parseError( 'bad-escape' );
308                return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] );
309
310            case ']':
311                return new Token( Token::T_RIGHT_BRACKET, $pos );
312
313            case '{':
314                return new Token( Token::T_LEFT_BRACE, $pos );
315
316            case '}':
317                return new Token( Token::T_RIGHT_BRACE, $pos );
318
319            case '0':
320            case '1':
321            case '2':
322            case '3':
323            case '4':
324            case '5':
325            case '6':
326            case '7':
327            case '8':
328            case '9':
329                $this->reconsumeCharacter();
330                return $this->consumeNumericToken( $pos );
331
332            case DataSource::EOF:
333                return new Token( Token::T_EOF, $pos );
334
335            default:
336                if ( self::isNameStartCharacter( $this->currentCharacter ) ) {
337                    $this->reconsumeCharacter();
338                    return $this->consumeIdentLikeToken( $pos );
339                }
340
341                return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] );
342        }
343    }
344
345    /**
346     * Consume a numeric token
347     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-numeric-token
348     * @param array $data Data for the new token (typically contains just 'position')
349     * @return Token
350     */
351    protected function consumeNumericToken( array $data ) {
352        [ $data['representation'], $data['value'], $data['typeFlag'] ] = $this->consumeNumber();
353
354        [ $next, $next2, $next3 ] = $this->lookAhead();
355        if ( self::wouldStartIdentifier( $next, $next2, $next3 ) ) {
356            return new Token( Token::T_DIMENSION, $data + [ 'unit' => $this->consumeName() ] );
357        } elseif ( $this->nextCharacter === '%' ) {
358            $this->consumeCharacter();
359            return new Token( Token::T_PERCENTAGE, $data );
360        } else {
361            return new Token( Token::T_NUMBER, $data );
362        }
363    }
364
365    /**
366     * Consume an ident-like token
367     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-ident-like-token
368     * @param array $data Data for the new token (typically contains just 'position')
369     * @return Token
370     */
371    protected function consumeIdentLikeToken( array $data ) {
372        $name = $this->consumeName();
373
374        if ( $this->nextCharacter === '(' ) {
375            $this->consumeCharacter();
376
377            if ( !strcasecmp( $name, 'url' ) ) {
378                while ( true ) {
379                    [ $next, $next2 ] = $this->lookAhead();
380                    if ( !self::isWhitespace( $next ) || !self::isWhitespace( $next2 ) ) {
381                        break;
382                    }
383                    $this->consumeCharacter();
384                }
385                if ( $next !== '"' && $next !== '\'' &&
386                    !( self::isWhitespace( $next ) && ( $next2 === '"' || $next2 === '\'' ) )
387                ) {
388                    return $this->consumeUrlToken( $data );
389                }
390            }
391
392            return new Token( Token::T_FUNCTION, $data + [ 'value' => $name ] );
393        }
394
395        return new Token( Token::T_IDENT, $data + [ 'value' => $name ] );
396    }
397
398    /**
399     * Consume a string token
400     *
401     * This assumes the leading quote or apostrophe has already been consumed.
402     *
403     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-string-token
404     * @param string $endChar Ending character of the string
405     * @param array $data Data for the new token (typically contains just 'position')
406     * @return Token
407     */
408    protected function consumeStringToken( $endChar, array $data ) {
409        $data['value'] = '';
410
411        while ( true ) {
412            $this->consumeCharacter();
413            switch ( $this->currentCharacter ) {
414                case DataSource::EOF:
415                    $this->parseError( 'unclosed-string', $data );
416                    break 2;
417
418                case $endChar:
419                    break 2;
420
421                case "\n":
422                    $this->parseError( 'newline-in-string' );
423                    $this->reconsumeCharacter();
424                    return new Token( Token::T_BAD_STRING, [ 'value' => '' ] + $data );
425
426                case '\\':
427                    if ( $this->nextCharacter === DataSource::EOF ) {
428                        // Do nothing
429                    } elseif ( $this->nextCharacter === "\n" ) {
430                        // Consume it
431                        $this->consumeCharacter();
432                    } elseif ( self::isValidEscape( $this->currentCharacter, $this->nextCharacter ) ) {
433                        $data['value'] .= $this->consumeEscape();
434                    } else {
435                        // @codeCoverageIgnoreStart
436                        throw new UnexpectedValueException( "[$this->line:$this->pos] Unexpected state" );
437                        // @codeCoverageIgnoreEnd
438                    }
439                    break;
440
441                default:
442                    $data['value'] .= $this->currentCharacter;
443                    break;
444            }
445        }
446
447        // @phan-suppress-next-line PhanPluginUnreachableCode Reached by break 2
448        return new Token( Token::T_STRING, $data );
449    }
450
451    /**
452     * Consume a URL token
453     *
454     * This assumes the leading "url(" has already been consumed.
455     *
456     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-url-token
457     * @param array $data Data for the new token (typically contains just 'position')
458     * @return Token
459     */
460    protected function consumeUrlToken( array $data ) {
461        // 1.
462        $data['value'] = '';
463
464        // 2.
465        while ( self::isWhitespace( $this->nextCharacter ) ) {
466            $this->consumeCharacter();
467        }
468
469        // 3.
470        while ( true ) {
471            $this->consumeCharacter();
472            switch ( $this->currentCharacter ) {
473                case DataSource::EOF:
474                    $this->parseError( 'unclosed-url', $data );
475                    break 2;
476
477                // @codeCoverageIgnoreStart
478                case ')':
479                // @codeCoverageIgnoreEnd
480                    break 2;
481
482                // @codeCoverageIgnoreStart
483                case "\n":
484                case "\t":
485                case ' ':
486                // @codeCoverageIgnoreEnd
487                    while ( self::isWhitespace( $this->nextCharacter ) ) {
488                        $this->consumeCharacter();
489                    }
490                    if ( $this->nextCharacter === ')' ) {
491                        $this->consumeCharacter();
492                        break 2;
493                    } elseif ( $this->nextCharacter === DataSource::EOF ) {
494                        $this->consumeCharacter();
495                        $this->parseError( 'unclosed-url', $data );
496                        break 2;
497                    } else {
498                        $this->consumeBadUrlRemnants();
499                        return new Token( Token::T_BAD_URL, [ 'value' => '' ] + $data );
500                    }
501
502                // @codeCoverageIgnoreStart
503                case '"':
504                case '\'':
505                case '(':
506                // @codeCoverageIgnoreEnd
507                    $this->parseError( 'bad-character-in-url' );
508                    $this->consumeBadUrlRemnants();
509                    return new Token( Token::T_BAD_URL, [ 'value' => '' ] + $data );
510
511                // @codeCoverageIgnoreStart
512                case '\\':
513                // @codeCoverageIgnoreEnd
514                    if ( self::isValidEscape( $this->currentCharacter, $this->nextCharacter ) ) {
515                        $data['value'] .= $this->consumeEscape();
516                    } else {
517                        $this->parseError( 'bad-escape' );
518                        $this->consumeBadUrlRemnants();
519                        return new Token( Token::T_BAD_URL, [ 'value' => '' ] + $data );
520                    }
521                    break;
522
523                default:
524                    if ( self::isNonPrintable( $this->currentCharacter ) ) {
525                        $this->parseError( 'bad-character-in-url' );
526                        $this->consumeBadUrlRemnants();
527                        return new Token( Token::T_BAD_URL, [ 'value' => '' ] + $data );
528                    }
529
530                    $data['value'] .= $this->currentCharacter;
531                    break;
532            }
533        }
534
535        // @phan-suppress-next-line PhanPluginUnreachableCode Reached by break 2
536        return new Token( Token::T_URL, $data );
537    }
538
539    /**
540     * Clean up after finding an error in a URL
541     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-remnants-of-bad-url
542     */
543    protected function consumeBadUrlRemnants() {
544        while ( true ) {
545            $this->consumeCharacter();
546            if ( $this->currentCharacter === ')' || $this->currentCharacter === DataSource::EOF ) {
547                break;
548            }
549            if ( self::isValidEscape( $this->currentCharacter, $this->nextCharacter ) ) {
550                $this->consumeEscape();
551            }
552        }
553    }
554
555    /**
556     * Indicate if a character is whitespace
557     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#whitespace
558     * @param string $char A single UTF-8 character
559     * @return bool
560     */
561    protected static function isWhitespace( $char ) {
562        return $char === "\n" || $char === "\t" || $char === " ";
563    }
564
565    /**
566     * Indicate if a character is a name-start code point
567     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#name-start-code-point
568     * @param string $char A single UTF-8 character
569     * @return bool
570     */
571    protected static function isNameStartCharacter( $char ) {
572        // Every non-ASCII character is a name start character, so we can just
573        // check the first byte.
574        $char = ord( $char );
575        return ( $char >= 0x41 && $char <= 0x5a ) ||
576            ( $char >= 0x61 && $char <= 0x7a ) ||
577            $char >= 0x80 || $char === 0x5f;
578    }
579
580    /**
581     * Indicate if a character is a name code point
582     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#name-code-point
583     * @param string $char A single UTF-8 character
584     * @return bool
585     */
586    protected static function isNameCharacter( $char ) {
587        // Every non-ASCII character is a name character, so we can just check
588        // the first byte.
589        $char = ord( $char );
590        return ( $char >= 0x41 && $char <= 0x5a ) ||
591            ( $char >= 0x61 && $char <= 0x7a ) ||
592            ( $char >= 0x30 && $char <= 0x39 ) ||
593            $char >= 0x80 || $char === 0x5f || $char === 0x2d;
594    }
595
596    /**
597     * Indicate if a character is non-printable
598     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#non-printable-code-point
599     * @param string $char A single UTF-8 character
600     * @return bool
601     */
602    protected static function isNonPrintable( $char ) {
603        // No non-ASCII character is non-printable, so we can just check the
604        // first byte.
605        $char = ord( $char );
606        return ( $char >= 0x00 && $char <= 0x08 ) ||
607            $char === 0x0b ||
608            ( $char >= 0x0e && $char <= 0x1f ) ||
609            $char === 0x7f;
610    }
611
612    /**
613     * Indicate if a character is a digit
614     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#digit
615     * @param string $char A single UTF-8 character
616     * @return bool
617     */
618    protected static function isDigit( $char ) {
619        // No non-ASCII character is a digit, so we can just check the first
620        // byte.
621        $char = ord( $char );
622        return $char >= 0x30 && $char <= 0x39;
623    }
624
625    /**
626     * Indicate if a character is a hex digit
627     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#hex-digit
628     * @param string $char A single UTF-8 character
629     * @return bool
630     */
631    protected static function isHexDigit( $char ) {
632        // No non-ASCII character is a hex digit, so we can just check the
633        // first byte.
634        $char = ord( $char );
635        return ( $char >= 0x30 && $char <= 0x39 ) ||
636            ( $char >= 0x41 && $char <= 0x46 ) ||
637            ( $char >= 0x61 && $char <= 0x66 );
638    }
639
640    /**
641     * Determine if two characters constitute a valid escape
642     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#starts-with-a-valid-escape
643     * @param string $char1
644     * @param string $char2
645     * @return bool
646     */
647    protected static function isValidEscape( $char1, $char2 ) {
648        return $char1 === '\\' && $char2 !== "\n";
649    }
650
651    /**
652     * Determine if three characters would start an identifier
653     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#would-start-an-identifier
654     * @param string $char1
655     * @param string $char2
656     * @param string $char3
657     * @return bool
658     */
659    protected static function wouldStartIdentifier( $char1, $char2, $char3 ) {
660        if ( $char1 === '-' ) {
661            return self::isNameStartCharacter( $char2 ) || $char2 === '-' ||
662                self::isValidEscape( $char2, $char3 );
663        } elseif ( self::isNameStartCharacter( $char1 ) ) {
664            return true;
665        } elseif ( $char1 === '\\' ) {
666            return self::isValidEscape( $char1, $char2 );
667        } else {
668            return false;
669        }
670    }
671
672    /**
673     * Determine if three characters would start a number
674     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#starts-with-a-number
675     * @param string $char1
676     * @param string $char2
677     * @param string $char3
678     * @return bool
679     */
680    protected static function wouldStartNumber( $char1, $char2, $char3 ) {
681        if ( $char1 === '+' || $char1 === '-' ) {
682            return self::isDigit( $char2 ) ||
683                ( $char2 === '.' && self::isDigit( $char3 ) );
684        } elseif ( $char1 === '.' ) {
685            return self::isDigit( $char2 );
686        // @codeCoverageIgnoreStart
687        // Nothing reaches this code
688        } else {
689            return self::isDigit( $char1 );
690        }
691        // @codeCoverageIgnoreEnd
692    }
693
694    /**
695     * Consume a valid escape
696     *
697     * This assumes the leading backslash is consumed.
698     *
699     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-escaped-code-point
700     * @return string Escaped character
701     */
702    protected function consumeEscape() {
703        $position = [ 'position' => [ $this->line, $this->pos ] ];
704
705        $this->consumeCharacter();
706
707        // 1-6 hexits, plus one optional whitespace character
708        if ( self::isHexDigit( $this->currentCharacter ) ) {
709            $num = $this->currentCharacter;
710            while ( strlen( $num ) < 6 && self::isHexDigit( $this->nextCharacter ) ) {
711                $this->consumeCharacter();
712                $num .= $this->currentCharacter;
713            }
714            if ( self::isWhitespace( $this->nextCharacter ) ) {
715                $this->consumeCharacter();
716            }
717
718            $num = intval( $num, 16 );
719            if ( $num === 0 || ( $num >= 0xd800 && $num <= 0xdfff ) || $num > 0x10ffff ) {
720                return Constants::UTF8_REPLACEMENT;
721            }
722            return Utils::codepointToUtf8( $num );
723        }
724
725        if ( $this->currentCharacter === DataSource::EOF ) {
726            $this->parseError( 'bad-escape', $position );
727            return Constants::UTF8_REPLACEMENT;
728        }
729
730        return $this->currentCharacter;
731    }
732
733    /**
734     * Consume a name
735     *
736     * Note this does not do validation on the input stream. Call
737     * self::wouldStartIdentifier() or the like before calling the method if
738     * necessary.
739     *
740     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-name
741     * @return string Name
742     */
743    protected function consumeName() {
744        $name = '';
745
746        while ( true ) {
747            $this->consumeCharacter();
748
749            if ( self::isNameCharacter( $this->currentCharacter ) ) {
750                $name .= $this->currentCharacter;
751            } elseif ( self::isValidEscape( $this->currentCharacter, $this->nextCharacter ) ) {
752                $name .= $this->consumeEscape();
753            } else {
754                $this->reconsumeCharacter();
755                break;
756            }
757        }
758
759        return $name;
760    }
761
762    /**
763     * Consume a number
764     *
765     * Note this does not do validation on the input stream. Call
766     * self::wouldStartNumber() before calling the method if necessary.
767     *
768     * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-number
769     * @return array [ string $value, int|float $number, string $type ('integer' or 'number') ]
770     * @suppress PhanPluginDuplicateAdjacentStatement
771     */
772    protected function consumeNumber() {
773        // 1.
774        $repr = '';
775        $type = 'integer';
776
777        // 2.
778        if ( $this->nextCharacter === '+' || $this->nextCharacter === '-' ) {
779            $this->consumeCharacter();
780            $repr .= $this->currentCharacter;
781        }
782
783        // 3.
784        while ( self::isDigit( $this->nextCharacter ) ) {
785            $this->consumeCharacter();
786            $repr .= $this->currentCharacter;
787        }
788
789        // 4.
790        if ( $this->nextCharacter === '.' ) {
791            [ $next, $next2, ] = $this->lookAhead();
792            if ( self::isDigit( $next2 ) ) {
793                // 4.1.
794                $this->consumeCharacter();
795                $this->consumeCharacter();
796                // 4.2.
797                $repr .= $next . $next2;
798                // 4.3.
799                $type = 'number';
800                // 4.4.
801                while ( self::isDigit( $this->nextCharacter ) ) {
802                    $this->consumeCharacter();
803                    $repr .= $this->currentCharacter;
804                }
805            }
806        }
807
808        // 5.
809        if ( $this->nextCharacter === 'e' || $this->nextCharacter === 'E' ) {
810            [ $next, $next2, $next3 ] = $this->lookAhead();
811            $ok = false;
812            if ( ( $next2 === '+' || $next2 === '-' ) && self::isDigit( $next3 ) ) {
813                $ok = true;
814                // 5.1.
815                $this->consumeCharacter();
816                $this->consumeCharacter();
817                $this->consumeCharacter();
818                // 5.2.
819                $repr .= $next . $next2 . $next3;
820            } elseif ( self::isDigit( $next2 ) ) {
821                $ok = true;
822                // 5.1.
823                $this->consumeCharacter();
824                $this->consumeCharacter();
825                // 5.2.
826                $repr .= $next . $next2;
827            }
828            if ( $ok ) {
829                // 5.3.
830                $type = 'number';
831                // 5.4.
832                while ( self::isDigit( $this->nextCharacter ) ) {
833                    $this->consumeCharacter();
834                    $repr .= $this->currentCharacter;
835                }
836            }
837        }
838
839        // 6. We assume PHP's casting follows the same rules as
840        // https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#convert-string-to-number
841        $value = $type === 'integer' ? (int)$repr : (float)$repr;
842
843        // 7.
844        return [ $repr, $value, $type ];
845    }
846}