Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
339 / 339 |
|
100.00% |
26 / 26 |
CRAP | |
100.00% |
1 / 1 |
DataSourceTokenizer | |
100.00% |
339 / 339 |
|
100.00% |
26 / 26 |
201 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
nextChar | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
7 | |||
consumeCharacter | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
3 | |||
reconsumeCharacter | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
lookAhead | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
getParseErrors | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
clearParseErrors | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
parseError | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
6 | |||
consumeToken | |
100.00% |
108 / 108 |
|
100.00% |
1 / 1 |
55 | |||
consumeNumericToken | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
3 | |||
consumeIdentLikeToken | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
11 | |||
consumeStringToken | |
100.00% |
22 / 22 |
|
100.00% |
1 / 1 |
10 | |||
consumeUrlToken | |
100.00% |
36 / 36 |
|
100.00% |
1 / 1 |
18 | |||
consumeBadUrlRemnants | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
5 | |||
isWhitespace | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
3 | |||
isNameStartCharacter | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
6 | |||
isNameCharacter | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
9 | |||
isNonPrintable | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
6 | |||
isDigit | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
isHexDigit | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
6 | |||
isValidEscape | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
wouldStartIdentifier | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
6 | |||
wouldStartNumber | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
6 | |||
consumeEscape | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
10 | |||
consumeName | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
consumeNumber | |
100.00% |
39 / 39 |
|
100.00% |
1 / 1 |
16 |
1 | <?php |
2 | /** |
3 | * @file |
4 | * @license https://opensource.org/licenses/Apache-2.0 Apache-2.0 |
5 | */ |
6 | |
7 | namespace Wikimedia\CSS\Parser; |
8 | |
9 | use InvalidArgumentException; |
10 | use UnexpectedValueException; |
11 | use UtfNormal\Constants; |
12 | use UtfNormal\Utils; |
13 | use Wikimedia\CSS\Objects\Token; |
14 | |
/**
 * Parse CSS into tokens
 *
 * This implements the tokenizer from the CSS Syntax Module Level 3 candidate recommendation.
 * Characters are pulled one at a time from a DataSource; the tokenizer keeps track of the
 * most recently consumed character, the character that will be consumed next, and the
 * line/position used for token positions and parse errors.
 *
 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/
 */
class DataSourceTokenizer implements Tokenizer {

	/** @var DataSource */
	protected $source;

	/** @var int line in the input */
	protected $line = 1;

	/** @var int position in the line in the input */
	protected $pos = 0;

	/**
	 * @var string|null|object The most recently consumed character. It is null before
	 *  anything has been consumed, and a placeholder object right after
	 *  self::reconsumeCharacter() has pushed the character back.
	 */
	protected $currentCharacter = null;

	/** @var string|null The next character to be consumed */
	protected $nextCharacter = null;

	/** @var array Parse errors. Each error is [ string $tag, int $line, int $pos ] */
	protected $parseErrors = [];

	/**
	 * @param DataSource $source
	 * @param array $options Configuration options.
	 *  (none currently defined)
	 */
	public function __construct( DataSource $source, array $options = [] ) {
		$this->source = $source;
	}

	/**
	 * Read a character from the data source
	 *
	 * Applies the input preprocessing transformations to the character read:
	 * U+0000 and surrogates become U+FFFD, and U+000C, U+000D and the pair
	 * U+000D U+000A all become U+000A.
	 *
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#input-preprocessing
	 * @return string One UTF-8 character, or empty string on EOF
	 */
	protected function nextChar() {
		$char = $this->source->readCharacter();

		// Perform transformations per the spec

		// Any U+0000 or surrogate code point becomes U+FFFD
		if ( $char === "\0" || ( $char >= "\u{D800}" && $char <= "\u{DFFF}" ) ) {
			return Constants::UTF8_REPLACEMENT;
		}

		// Any U+000D, U+000C, or pair of U+000D + U+000A becomes U+000A
		if ( $char === "\f" ) {
			// U+000C
			return "\n";
		}

		if ( $char === "\r" ) {
			// Either U+000D + U+000A or a lone U+000D
			$char2 = $this->source->readCharacter();
			if ( $char2 !== "\n" ) {
				// Lone U+000D: whatever followed it is not part of the newline
				$this->source->putBackCharacter( $char2 );
			}
			return "\n";
		}

		return $char;
	}

	/**
	 * Update the current and next character fields
	 *
	 * The line/position counters are advanced based on the character that was
	 * current before this call: a newline starts a new line at position 1,
	 * anything other than EOF moves one position forward.
	 */
	protected function consumeCharacter() {
		if ( $this->currentCharacter === "\n" ) {
			$this->line++;
			$this->pos = 1;
		} elseif ( $this->currentCharacter !== DataSource::EOF ) {
			$this->pos++;
		}

		$this->currentCharacter = $this->nextChar();
		// Peek at the following character: read it, then hand it straight back
		$this->nextCharacter = $this->nextChar();
		$this->source->putBackCharacter( $this->nextCharacter );
	}

	/**
	 * Reconsume the next character
	 *
	 * In more normal terms, this pushes a character back onto the data source,
	 * so it will be read again for the next call to self::consumeCharacter().
	 *
	 * @throws UnexpectedValueException If there is no string character to push
	 *  back (nothing consumed yet, or already reconsumed)
	 */
	protected function reconsumeCharacter() {
		// @codeCoverageIgnoreStart
		if ( !is_string( $this->currentCharacter ) ) {
			throw new UnexpectedValueException( "[$this->line:$this->pos] Can't reconsume" );
		}
		// @codeCoverageIgnoreEnd

		if ( $this->currentCharacter === DataSource::EOF ) {
			// Huh?
			return;
		}

		$this->source->putBackCharacter( $this->currentCharacter );
		$this->nextCharacter = $this->currentCharacter;
		// Placeholder that is neither a string nor EOF, so a second reconsume
		// in a row is caught by the is_string() check above
		$this->currentCharacter = (object)[];
		$this->pos--;
	}

	/**
	 * Look ahead at the next three characters
	 *
	 * The characters are read and then put back in reverse order, so the
	 * data source is left as it was found.
	 *
	 * @return string[] Three characters
	 */
	protected function lookAhead() {
		$ret = [
			$this->nextChar(),
			$this->nextChar(),
			$this->nextChar(),
		];
		$this->source->putBackCharacter( $ret[2] );
		$this->source->putBackCharacter( $ret[1] );
		$this->source->putBackCharacter( $ret[0] );

		return $ret;
	}

	/** @inheritDoc */
	public function getParseErrors() {
		return $this->parseErrors;
	}

	/** @inheritDoc */
	public function clearParseErrors() {
		$this->parseErrors = [];
	}

	/**
	 * Record a parse error
	 * @param string $tag Error tag
	 * @param array|null $position Report the error as starting at this
	 *  position instead of at the current position. Either [ int $line, int $pos ],
	 *  or an array holding such a pair under the key 'position'.
	 * @param array $data Extra data about the error.
	 * @throws InvalidArgumentException If $position is given but is not a pair of integers
	 */
	protected function parseError( $tag, ?array $position = null, array $data = [] ) {
		if ( $position ) {
			if ( isset( $position['position'] ) ) {
				$position = $position['position'];
			}
			if ( count( $position ) !== 2 || !is_int( $position[0] ) || !is_int( $position[1] ) ) {
				// @codeCoverageIgnoreStart
				throw new InvalidArgumentException( 'Invalid position' );
				// @codeCoverageIgnoreEnd
			}
			$err = [ $tag, $position[0], $position[1] ];
		} else {
			$err = [ $tag, $this->line, $this->pos ];
		}
		$this->parseErrors[] = array_merge( $err, $data );
	}

	/**
	 * Read a token from the data source
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-token
	 * @return Token
	 * @suppress PhanPluginDuplicateAdjacentStatement,PhanPluginDuplicateSwitchCaseLooseEquality
	 */
	public function consumeToken() {
		// We "consume comments" inline below, see `case '/'`.

		$this->consumeCharacter();
		// Every token created below is positioned at its first character
		$pos = [ 'position' => [ $this->line, $this->pos ] ];

		switch ( (string)$this->currentCharacter ) {
			case "\n":
			case "\t":
			case ' ':
				// Whitespace token
				while ( self::isWhitespace( $this->nextCharacter ) ) {
					$this->consumeCharacter();
				}
				return new Token( Token::T_WHITESPACE, $pos );

			case '"':
			case '\'':
				// String token
				return $this->consumeStringToken( $this->currentCharacter, $pos );

			case '#':
				[ $next, $next2, $next3 ] = $this->lookAhead();
				if ( self::isNameCharacter( $this->nextCharacter ) ||
					self::isValidEscape( $next, $next2 )
				) {
					return new Token( Token::T_HASH, $pos + [
						'typeFlag' => self::wouldStartIdentifier( $next, $next2, $next3 ) ? 'id' : 'unrestricted',
						'value' => $this->consumeName(),
					] );
				}

				return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] );

			case '(':
				return new Token( Token::T_LEFT_PAREN, $pos );

			case ')':
				return new Token( Token::T_RIGHT_PAREN, $pos );

			case '+':
			case '.':
				[ $next, $next2, ] = $this->lookAhead();
				if ( self::wouldStartNumber( $this->currentCharacter, $next, $next2 ) ) {
					$this->reconsumeCharacter();
					return $this->consumeNumericToken( $pos );
				}

				return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] );

			case ',':
				return new Token( Token::T_COMMA, $pos );

			case '-':
				[ $next, $next2, ] = $this->lookAhead();
				if ( self::wouldStartNumber( $this->currentCharacter, $next, $next2 ) ) {
					$this->reconsumeCharacter();
					return $this->consumeNumericToken( $pos );
				}

				if ( $next === '-' && $next2 === '>' ) {
					$this->consumeCharacter();
					$this->consumeCharacter();
					return new Token( Token::T_CDC, $pos );
				}

				if ( self::wouldStartIdentifier( $this->currentCharacter, $next, $next2 ) ) {
					$this->reconsumeCharacter();
					return $this->consumeIdentLikeToken( $pos );
				}

				return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] );

			case '/':
				if ( $this->nextCharacter === '*' ) {
					// Comment: skip to the closing "*/" (or EOF), then return
					// whatever token follows it
					$this->consumeCharacter();
					$this->consumeCharacter();
					while ( $this->currentCharacter !== DataSource::EOF &&
						// @phan-suppress-next-line PhanSuspiciousValueComparisonInLoop
						!( $this->currentCharacter === '*' && $this->nextCharacter === '/' )
					) {
						$this->consumeCharacter();
					}
					if ( $this->currentCharacter === DataSource::EOF ) {
						$this->parseError( 'unclosed-comment', $pos );
					}
					$this->consumeCharacter();
					// @phan-suppress-next-line PhanPossiblyInfiniteRecursionSameParams
					return $this->consumeToken();
				}

				return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] );

			case ':':
				return new Token( Token::T_COLON, $pos );

			case ';':
				return new Token( Token::T_SEMICOLON, $pos );

			case '<':
				[ $next, $next2, $next3 ] = $this->lookAhead();
				if ( $next === '!' && $next2 === '-' && $next3 === '-' ) {
					$this->consumeCharacter();
					$this->consumeCharacter();
					$this->consumeCharacter();
					return new Token( Token::T_CDO, $pos );
				}

				return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] );

			case '@':
				[ $next, $next2, $next3 ] = $this->lookAhead();
				if ( self::wouldStartIdentifier( $next, $next2, $next3 ) ) {
					return new Token( Token::T_AT_KEYWORD, $pos + [ 'value' => $this->consumeName() ] );
				}

				return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] );

			case '[':
				return new Token( Token::T_LEFT_BRACKET, $pos );

			case '\\':
				if ( self::isValidEscape( $this->currentCharacter, $this->nextCharacter ) ) {
					$this->reconsumeCharacter();
					return $this->consumeIdentLikeToken( $pos );
				}

				$this->parseError( 'bad-escape' );
				return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] );

			case ']':
				return new Token( Token::T_RIGHT_BRACKET, $pos );

			case '{':
				return new Token( Token::T_LEFT_BRACE, $pos );

			case '}':
				return new Token( Token::T_RIGHT_BRACE, $pos );

			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			case '8':
			case '9':
				$this->reconsumeCharacter();
				return $this->consumeNumericToken( $pos );

			case DataSource::EOF:
				return new Token( Token::T_EOF, $pos );

			default:
				if ( self::isNameStartCharacter( $this->currentCharacter ) ) {
					$this->reconsumeCharacter();
					return $this->consumeIdentLikeToken( $pos );
				}

				return new Token( Token::T_DELIM, $pos + [ 'value' => $this->currentCharacter ] );
		}
	}

	/**
	 * Consume a numeric token
	 *
	 * Produces a dimension token when the number is followed by something that
	 * would start an identifier, a percentage token when it is followed by '%',
	 * and a plain number token otherwise.
	 *
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-numeric-token
	 * @param array $data Data for the new token (typically contains just 'position')
	 * @return Token
	 */
	protected function consumeNumericToken( array $data ) {
		[ $data['representation'], $data['value'], $data['typeFlag'] ] = $this->consumeNumber();

		[ $next, $next2, $next3 ] = $this->lookAhead();
		if ( self::wouldStartIdentifier( $next, $next2, $next3 ) ) {
			return new Token( Token::T_DIMENSION, $data + [ 'unit' => $this->consumeName() ] );
		} elseif ( $this->nextCharacter === '%' ) {
			$this->consumeCharacter();
			return new Token( Token::T_PERCENTAGE, $data );
		} else {
			return new Token( Token::T_NUMBER, $data );
		}
	}

	/**
	 * Consume an ident-like token
	 *
	 * Returns an ident token, a function token, or (for an unquoted "url(")
	 * whatever self::consumeUrlToken() returns.
	 *
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-ident-like-token
	 * @param array $data Data for the new token (typically contains just 'position')
	 * @return Token
	 */
	protected function consumeIdentLikeToken( array $data ) {
		$name = $this->consumeName();

		if ( $this->nextCharacter === '(' ) {
			$this->consumeCharacter();

			if ( !strcasecmp( $name, 'url' ) ) {
				// Skip leading whitespace, but leave one whitespace character
				// in place when more than one is present
				while ( true ) {
					[ $next, $next2 ] = $this->lookAhead();
					if ( !self::isWhitespace( $next ) || !self::isWhitespace( $next2 ) ) {
						break;
					}
					$this->consumeCharacter();
				}
				// A quoted argument makes this an ordinary function token;
				// anything else is consumed as a URL token
				if ( $next !== '"' && $next !== '\'' &&
					!( self::isWhitespace( $next ) && ( $next2 === '"' || $next2 === '\'' ) )
				) {
					return $this->consumeUrlToken( $data );
				}
			}

			return new Token( Token::T_FUNCTION, $data + [ 'value' => $name ] );
		}

		return new Token( Token::T_IDENT, $data + [ 'value' => $name ] );
	}

	/**
	 * Consume a string token
	 *
	 * This assumes the leading quote or apostrophe has already been consumed.
	 *
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-string-token
	 * @param string $endChar Ending character of the string
	 * @param array $data Data for the new token (typically contains just 'position')
	 * @return Token
	 * @throws UnexpectedValueException If a backslash is followed by something
	 *  that is neither EOF, a newline, nor a valid escape
	 */
	protected function consumeStringToken( $endChar, array $data ) {
		$data['value'] = '';

		while ( true ) {
			$this->consumeCharacter();
			switch ( $this->currentCharacter ) {
				case DataSource::EOF:
					$this->parseError( 'unclosed-string', $data );
					break 2;

				case $endChar:
					break 2;

				case "\n":
					// Unescaped newline: leave it for the next token and
					// return a bad-string token with an empty value
					$this->parseError( 'newline-in-string' );
					$this->reconsumeCharacter();
					return new Token( Token::T_BAD_STRING, [ 'value' => '' ] + $data );

				case '\\':
					if ( $this->nextCharacter === DataSource::EOF ) {
						// Do nothing
					} elseif ( $this->nextCharacter === "\n" ) {
						// Consume it
						$this->consumeCharacter();
					} elseif ( self::isValidEscape( $this->currentCharacter, $this->nextCharacter ) ) {
						$data['value'] .= $this->consumeEscape();
					} else {
						// @codeCoverageIgnoreStart
						throw new UnexpectedValueException( "[$this->line:$this->pos] Unexpected state" );
						// @codeCoverageIgnoreEnd
					}
					break;

				default:
					$data['value'] .= $this->currentCharacter;
					break;
			}
		}

		// @phan-suppress-next-line PhanPluginUnreachableCode Reached by break 2
		return new Token( Token::T_STRING, $data );
	}

	/**
	 * Consume a URL token
	 *
	 * This assumes the leading "url(" has already been consumed.
	 *
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-url-token
	 * @param array $data Data for the new token (typically contains just 'position')
	 * @return Token A URL token, or a bad-URL token with an empty value
	 */
	protected function consumeUrlToken( array $data ) {
		// 1.
		$data['value'] = '';

		// 2.
		while ( self::isWhitespace( $this->nextCharacter ) ) {
			$this->consumeCharacter();
		}

		// 3.
		while ( true ) {
			$this->consumeCharacter();
			switch ( $this->currentCharacter ) {
				case DataSource::EOF:
					$this->parseError( 'unclosed-url', $data );
					break 2;

				// @codeCoverageIgnoreStart
				case ')':
				// @codeCoverageIgnoreEnd
					break 2;

				// @codeCoverageIgnoreStart
				case "\n":
				case "\t":
				case ' ':
				// @codeCoverageIgnoreEnd
					// Whitespace is only allowed directly before the closing
					// parenthesis or EOF
					while ( self::isWhitespace( $this->nextCharacter ) ) {
						$this->consumeCharacter();
					}
					if ( $this->nextCharacter === ')' ) {
						$this->consumeCharacter();
						break 2;
					} elseif ( $this->nextCharacter === DataSource::EOF ) {
						$this->consumeCharacter();
						$this->parseError( 'unclosed-url', $data );
						break 2;
					} else {
						$this->consumeBadUrlRemnants();
						return new Token( Token::T_BAD_URL, [ 'value' => '' ] + $data );
					}

				// @codeCoverageIgnoreStart
				case '"':
				case '\'':
				case '(':
				// @codeCoverageIgnoreEnd
					$this->parseError( 'bad-character-in-url' );
					$this->consumeBadUrlRemnants();
					return new Token( Token::T_BAD_URL, [ 'value' => '' ] + $data );

				// @codeCoverageIgnoreStart
				case '\\':
				// @codeCoverageIgnoreEnd
					if ( self::isValidEscape( $this->currentCharacter, $this->nextCharacter ) ) {
						$data['value'] .= $this->consumeEscape();
					} else {
						$this->parseError( 'bad-escape' );
						$this->consumeBadUrlRemnants();
						return new Token( Token::T_BAD_URL, [ 'value' => '' ] + $data );
					}
					break;

				default:
					if ( self::isNonPrintable( $this->currentCharacter ) ) {
						$this->parseError( 'bad-character-in-url' );
						$this->consumeBadUrlRemnants();
						return new Token( Token::T_BAD_URL, [ 'value' => '' ] + $data );
					}

					$data['value'] .= $this->currentCharacter;
					break;
			}
		}

		// @phan-suppress-next-line PhanPluginUnreachableCode Reached by break 2
		return new Token( Token::T_URL, $data );
	}

	/**
	 * Clean up after finding an error in a URL
	 *
	 * Consumes characters up to and including the next ')' or until EOF.
	 * Valid escapes are consumed as a unit, so an escaped ')' does not end
	 * the URL.
	 *
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-remnants-of-bad-url
	 */
	protected function consumeBadUrlRemnants() {
		while ( true ) {
			$this->consumeCharacter();
			if ( $this->currentCharacter === ')' || $this->currentCharacter === DataSource::EOF ) {
				break;
			}
			if ( self::isValidEscape( $this->currentCharacter, $this->nextCharacter ) ) {
				$this->consumeEscape();
			}
		}
	}

	/**
	 * Indicate if a character is whitespace
	 *
	 * Only newline, tab and space are checked; self::nextChar() has already
	 * turned the other newline forms into "\n".
	 *
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#whitespace
	 * @param string $char A single UTF-8 character
	 * @return bool
	 */
	protected static function isWhitespace( $char ) {
		return $char === "\n" || $char === "\t" || $char === " ";
	}

	/**
	 * Indicate if a character is a name-start code point
	 *
	 * That is an ASCII letter, an underscore, or any non-ASCII character.
	 *
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#name-start-code-point
	 * @param string $char A single UTF-8 character
	 * @return bool
	 */
	protected static function isNameStartCharacter( $char ) {
		// Every non-ASCII character is a name start character, so we can just
		// check the first byte.
		$char = ord( $char );
		return ( $char >= 0x41 && $char <= 0x5a ) ||
			( $char >= 0x61 && $char <= 0x7a ) ||
			$char >= 0x80 || $char === 0x5f;
	}

	/**
	 * Indicate if a character is a name code point
	 *
	 * That is a name-start code point, an ASCII digit, or a hyphen-minus.
	 *
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#name-code-point
	 * @param string $char A single UTF-8 character
	 * @return bool
	 */
	protected static function isNameCharacter( $char ) {
		// Every non-ASCII character is a name character, so we can just check
		// the first byte.
		$char = ord( $char );
		return ( $char >= 0x41 && $char <= 0x5a ) ||
			( $char >= 0x61 && $char <= 0x7a ) ||
			( $char >= 0x30 && $char <= 0x39 ) ||
			$char >= 0x80 || $char === 0x5f || $char === 0x2d;
	}

	/**
	 * Indicate if a character is non-printable
	 *
	 * That is U+0000 to U+0008, U+000B, U+000E to U+001F, or U+007F.
	 *
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#non-printable-code-point
	 * @param string $char A single UTF-8 character
	 * @return bool
	 */
	protected static function isNonPrintable( $char ) {
		// No non-ASCII character is non-printable, so we can just check the
		// first byte.
		$char = ord( $char );
		return ( $char >= 0x00 && $char <= 0x08 ) ||
			$char === 0x0b ||
			( $char >= 0x0e && $char <= 0x1f ) ||
			$char === 0x7f;
	}

	/**
	 * Indicate if a character is a digit
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#digit
	 * @param string $char A single UTF-8 character
	 * @return bool
	 */
	protected static function isDigit( $char ) {
		// No non-ASCII character is a digit, so we can just check the first
		// byte.
		$char = ord( $char );
		return $char >= 0x30 && $char <= 0x39;
	}

	/**
	 * Indicate if a character is a hex digit
	 *
	 * That is 0-9, A-F or a-f.
	 *
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#hex-digit
	 * @param string $char A single UTF-8 character
	 * @return bool
	 */
	protected static function isHexDigit( $char ) {
		// No non-ASCII character is a hex digit, so we can just check the
		// first byte.
		$char = ord( $char );
		return ( $char >= 0x30 && $char <= 0x39 ) ||
			( $char >= 0x41 && $char <= 0x46 ) ||
			( $char >= 0x61 && $char <= 0x66 );
	}

	/**
	 * Determine if two characters constitute a valid escape
	 *
	 * A backslash followed by anything other than a newline is a valid escape.
	 *
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#starts-with-a-valid-escape
	 * @param string $char1
	 * @param string $char2
	 * @return bool
	 */
	protected static function isValidEscape( $char1, $char2 ) {
		return $char1 === '\\' && $char2 !== "\n";
	}

	/**
	 * Determine if three characters would start an identifier
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#would-start-an-identifier
	 * @param string $char1
	 * @param string $char2
	 * @param string $char3
	 * @return bool
	 */
	protected static function wouldStartIdentifier( $char1, $char2, $char3 ) {
		if ( $char1 === '-' ) {
			return self::isNameStartCharacter( $char2 ) || $char2 === '-' ||
				self::isValidEscape( $char2, $char3 );
		} elseif ( self::isNameStartCharacter( $char1 ) ) {
			return true;
		} elseif ( $char1 === '\\' ) {
			return self::isValidEscape( $char1, $char2 );
		} else {
			return false;
		}
	}

	/**
	 * Determine if three characters would start a number
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#starts-with-a-number
	 * @param string $char1
	 * @param string $char2
	 * @param string $char3
	 * @return bool
	 */
	protected static function wouldStartNumber( $char1, $char2, $char3 ) {
		if ( $char1 === '+' || $char1 === '-' ) {
			return self::isDigit( $char2 ) ||
				( $char2 === '.' && self::isDigit( $char3 ) );
		} elseif ( $char1 === '.' ) {
			return self::isDigit( $char2 );
			// @codeCoverageIgnoreStart
			// Nothing reaches this code
		} else {
			return self::isDigit( $char1 );
		}
		// @codeCoverageIgnoreEnd
	}

	/**
	 * Consume a valid escape
	 *
	 * This assumes the leading backslash is consumed.
	 *
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-escaped-code-point
	 * @return string Escaped character
	 */
	protected function consumeEscape() {
		// Position of the backslash, used if the escape runs into EOF
		$position = [ 'position' => [ $this->line, $this->pos ] ];

		$this->consumeCharacter();

		// 1-6 hexits, plus one optional whitespace character
		if ( self::isHexDigit( $this->currentCharacter ) ) {
			$num = $this->currentCharacter;
			while ( strlen( $num ) < 6 && self::isHexDigit( $this->nextCharacter ) ) {
				$this->consumeCharacter();
				$num .= $this->currentCharacter;
			}
			if ( self::isWhitespace( $this->nextCharacter ) ) {
				$this->consumeCharacter();
			}

			$num = intval( $num, 16 );
			// Zero, surrogates and out-of-range values become U+FFFD
			if ( $num === 0 || ( $num >= 0xd800 && $num <= 0xdfff ) || $num > 0x10ffff ) {
				return Constants::UTF8_REPLACEMENT;
			}
			return Utils::codepointToUtf8( $num );
		}

		if ( $this->currentCharacter === DataSource::EOF ) {
			$this->parseError( 'bad-escape', $position );
			return Constants::UTF8_REPLACEMENT;
		}

		return $this->currentCharacter;
	}

	/**
	 * Consume a name
	 *
	 * Note this does not do validation on the input stream. Call
	 * self::wouldStartIdentifier() or the like before calling the method if
	 * necessary.
	 *
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-name
	 * @return string Name
	 */
	protected function consumeName() {
		$name = '';

		while ( true ) {
			$this->consumeCharacter();

			if ( self::isNameCharacter( $this->currentCharacter ) ) {
				$name .= $this->currentCharacter;
			} elseif ( self::isValidEscape( $this->currentCharacter, $this->nextCharacter ) ) {
				$name .= $this->consumeEscape();
			} else {
				// First character that is not part of the name: push it back
				$this->reconsumeCharacter();
				break;
			}
		}

		return $name;
	}

	/**
	 * Consume a number
	 *
	 * Note this does not do validation on the input stream. Call
	 * self::wouldStartNumber() before calling the method if necessary.
	 *
	 * @see https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#consume-number
	 * @return array [ string $value, int|float $number, string $type ('integer' or 'number') ]
	 * @suppress PhanPluginDuplicateAdjacentStatement
	 */
	protected function consumeNumber() {
		// 1.
		$repr = '';
		$type = 'integer';

		// 2.
		if ( $this->nextCharacter === '+' || $this->nextCharacter === '-' ) {
			$this->consumeCharacter();
			$repr .= $this->currentCharacter;
		}

		// 3.
		while ( self::isDigit( $this->nextCharacter ) ) {
			$this->consumeCharacter();
			$repr .= $this->currentCharacter;
		}

		// 4.
		if ( $this->nextCharacter === '.' ) {
			[ $next, $next2, ] = $this->lookAhead();
			if ( self::isDigit( $next2 ) ) {
				// 4.1.
				$this->consumeCharacter();
				$this->consumeCharacter();
				// 4.2.
				$repr .= $next . $next2;
				// 4.3.
				$type = 'number';
				// 4.4.
				while ( self::isDigit( $this->nextCharacter ) ) {
					$this->consumeCharacter();
					$repr .= $this->currentCharacter;
				}
			}
		}

		// 5.
		if ( $this->nextCharacter === 'e' || $this->nextCharacter === 'E' ) {
			[ $next, $next2, $next3 ] = $this->lookAhead();
			$ok = false;
			if ( ( $next2 === '+' || $next2 === '-' ) && self::isDigit( $next3 ) ) {
				$ok = true;
				// 5.1.
				$this->consumeCharacter();
				$this->consumeCharacter();
				$this->consumeCharacter();
				// 5.2.
				$repr .= $next . $next2 . $next3;
			} elseif ( self::isDigit( $next2 ) ) {
				$ok = true;
				// 5.1.
				$this->consumeCharacter();
				$this->consumeCharacter();
				// 5.2.
				$repr .= $next . $next2;
			}
			if ( $ok ) {
				// 5.3.
				$type = 'number';
				// 5.4.
				while ( self::isDigit( $this->nextCharacter ) ) {
					$this->consumeCharacter();
					$repr .= $this->currentCharacter;
				}
			}
		}

		// 6. We assume PHP's casting follows the same rules as
		// https://www.w3.org/TR/2019/CR-css-syntax-3-20190716/#convert-string-to-number
		$value = $type === 'integer' ? (int)$repr : (float)$repr;

		// 7.
		return [ $repr, $value, $type ];
	}
}