Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
97 / 97 |
|
100.00% |
6 / 6 |
CRAP | |
100.00% |
1 / 1 |
AbuseFilterTokenizer | |
100.00% |
97 / 97 |
|
100.00% |
6 / 6 |
32 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getCacheKey | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getTokens | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
tokenize | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
nextToken | |
100.00% |
42 / 42 |
|
100.00% |
1 / 1 |
15 | |||
readStringLiteral | |
100.00% |
39 / 39 |
|
100.00% |
1 / 1 |
13 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\AbuseFilter\Parser; |
4 | |
5 | use BagOStuff; |
6 | use MediaWiki\Extension\AbuseFilter\Parser\Exception\UserVisibleException; |
7 | |
/**
 * Tokenizer for AbuseFilter rules.
 *
 * Turns the source text of a rule into AFPToken objects. Tokenization results
 * are stored in the BagOStuff passed to the constructor, so the same code does
 * not have to be tokenized again while the cache entry is alive.
 */
class AbuseFilterTokenizer {
	/** @var int Tokenizer cache version. Increment this when changing the syntax. */
	public const CACHE_VERSION = 4;
	/** Anchored match for optional whitespace followed by a comment opener. */
	private const COMMENT_START_RE = '/\s*\/\*/A';
	/** Anchored match for an identifier or keyword. */
	private const ID_SYMBOL_RE = '/[0-9A-Za-z_]+/A';
	/** Anchored match for any operator; longer operators are listed before their prefixes. */
	public const OPERATOR_RE =
		'/(\!\=\=|\!\=|\!|\*\*|\*|\/|\+|\-|%|&|\||\^|\:\=|\?|\:|\<\=|\<|\>\=|\>|\=\=\=|\=\=|\=)/A';
	/** Optional numeric base prefix: 0x, 0b or 0o. The letter is captured as "base". */
	private const BASE = '0(?<base>[xbo])';
	private const DIGIT = '[0-9A-Fa-f]';
	/** Either digits with an optional fractional part, or a fractional part alone. */
	private const DIGITS = self::DIGIT . '+' . '(?:\.\d*)?|\.\d+';
	/** Anchored match for a number literal that is not directly followed by a word character. */
	private const RADIX_RE = '/(?:' . self::BASE . ')?(?<input>' . self::DIGITS . ')(?!\w)/Au';
	/** Characters skipped between tokens: tab, LF, VT, FF, CR and space. */
	private const WHITESPACE = "\011\012\013\014\015\040";

	// Order is important. The punctuation-matching regex requires that
	// ** comes before *, etc. They are sorted to make it easy to spot
	// such errors.
	public const OPERATORS = [
		// Inequality
		'!==', '!=', '!',
		// Multiplication/exponentiation
		'**', '*',
		// Other arithmetic
		'/', '+', '-', '%',
		// Logic
		'&', '|', '^',
		// Setting
		':=',
		// Ternary
		'?', ':',
		// Less than
		'<=', '<',
		// Greater than
		'>=', '>',
		// Equality
		'===', '==', '=',
	];

	/** Single-character punctuation, mapped to the token type it produces. */
	public const PUNCTUATION = [
		',' => AFPToken::TCOMMA,
		'(' => AFPToken::TBRACE,
		')' => AFPToken::TBRACE,
		'[' => AFPToken::TSQUAREBRACKET,
		']' => AFPToken::TSQUAREBRACKET,
		';' => AFPToken::TSTATEMENTSEPARATOR,
	];

	/** Base prefix letter (as captured by self::BASE) mapped to the numeric base. */
	public const BASES = [
		'b' => 2,
		'x' => 16,
		'o' => 8
	];

	/** Numeric base mapped to the regex that the digits of a literal in that base must satisfy. */
	public const BASE_CHARS_RES = [
		2 => '/^[01]+$/',
		8 => '/^[0-7]+$/',
		16 => '/^[0-9A-Fa-f]+$/',
		10 => '/^[0-9.]+$/',
	];

	/** Identifiers that are reported as keywords rather than plain IDs. */
	public const KEYWORDS = [
		'in', 'like', 'true', 'false', 'null', 'contains', 'matches',
		'rlike', 'irlike', 'regex', 'if', 'then', 'else', 'end',
	];

	/**
	 * @var BagOStuff Cache in which tokenization results are stored
	 */
	private $cache;

	/**
	 * @param BagOStuff $cache
	 */
	public function __construct( BagOStuff $cache ) {
		$this->cache = $cache;
	}

	/**
	 * Get a cache key used to store the tokenized code
	 *
	 * The key is built from the class name, self::CACHE_VERSION and the CRC32
	 * checksum of the code.
	 *
	 * NOTE(review): CRC32 is only 32 bits wide, so two different rules with the
	 * same checksum would share a cache entry. Switching to a wider hash would
	 * change every key, so that is left as a separate decision.
	 *
	 * @param string $code Not yet tokenized
	 * @return string
	 * @internal
	 */
	public function getCacheKey( $code ) {
		return $this->cache->makeGlobalKey( __CLASS__, self::CACHE_VERSION, crc32( $code ) );
	}

	/**
	 * Get the tokens for the given code.
	 *
	 * The result is looked up in the cache under self::getCacheKey(); on a miss
	 * the code is tokenized and the result is stored with a TTL of
	 * BagOStuff::TTL_DAY.
	 *
	 * @param string $code
	 * @return array[] Indexed by each token's `pos`; every value is a pair of
	 *  the token and the offset reached after reading it
	 * @phan-return array<int,array{0:AFPToken,1:int}>
	 */
	public function getTokens( string $code ): array {
		return $this->cache->getWithSetCallback(
			$this->getCacheKey( $code ),
			BagOStuff::TTL_DAY,
			function () use ( $code ) {
				return $this->tokenize( $code );
			}
		);
	}

	/**
	 * Tokenize the whole code, bypassing the cache.
	 *
	 * @param string $code
	 * @return array[] Same shape as the return value of self::getTokens()
	 * @phan-return array<int,array{0:AFPToken,1:int}>
	 */
	private function tokenize( string $code ): array {
		$tokens = [];
		$curPos = 0;

		// nextToken() only leaves the offset untouched when it is called at the
		// very end of the input, so an unchanged offset means we are done.
		do {
			$prevPos = $curPos;
			$token = $this->nextToken( $code, $curPos );
			$tokens[ $token->pos ] = [ $token, $curPos ];
		} while ( $curPos !== $prevPos );

		return $tokens;
	}

	/**
	 * Read a single token starting at $offset.
	 *
	 * Leading comments and whitespace are skipped. The returned token is
	 * created with the offset at which this method was entered, i.e. the
	 * position before anything was skipped. A token of type AFPToken::TNONE is
	 * returned when nothing but comments and whitespace is left.
	 *
	 * @param string $code
	 * @param int &$offset Where to start reading; moved past the token on return
	 * @return AFPToken
	 * @throws UserVisibleException On an unclosed comment, an unclosed string
	 *  or input that matches no token type
	 */
	private function nextToken( string $code, int &$offset ): AFPToken {
		$matches = [];
		$start = $offset;

		// Read past comments
		// NOTE(review): the terminator is searched from $offset, which is before
		// the opener, so the three characters "/*/" are accepted as a complete
		// comment. Kept as is, since changing it would alter the accepted syntax.
		while ( preg_match( self::COMMENT_START_RE, $code, $matches, 0, $offset ) ) {
			$commentEnd = strpos( $code, '*/', $offset );
			if ( $commentEnd === false ) {
				throw new UserVisibleException(
					'unclosedcomment', $offset, [] );
			}
			$offset = $commentEnd + 2;
		}

		// Spaces
		$offset += strspn( $code, self::WHITESPACE, $offset );
		if ( $offset >= strlen( $code ) ) {
			return new AFPToken( AFPToken::TNONE, '', $start );
		}

		$chr = $code[$offset];

		// Punctuation
		if ( isset( self::PUNCTUATION[$chr] ) ) {
			$offset++;
			return new AFPToken( self::PUNCTUATION[$chr], $chr, $start );
		}

		// String literal
		if ( $chr === '"' || $chr === "'" ) {
			return self::readStringLiteral( $code, $offset, $start );
		}

		$matches = [];

		// Operators
		if ( preg_match( self::OPERATOR_RE, $code, $matches, 0, $offset ) ) {
			$token = $matches[0];
			$offset += strlen( $token );
			return new AFPToken( AFPToken::TOP, $token, $start );
		}

		// Numbers
		$numberMatch = [];
		if ( preg_match( self::RADIX_RE, $code, $numberMatch, 0, $offset ) ) {
			$token = $numberMatch[0];
			// Empty string when the literal has no base prefix
			$baseChar = $numberMatch['base'];
			$input = $numberMatch['input'];
			$base = $baseChar ? self::BASES[$baseChar] : 10;
			// RADIX_RE accepts hex digits for every base, so check that the
			// digits are valid for the actual base. If they are not, fall
			// through and let the ID matching below have a go.
			if ( preg_match( self::BASE_CHARS_RES[$base], $input ) ) {
				$num = $base !== 10 ? base_convert( $input, $base, 10 ) : $input;
				$offset += strlen( $token );
				return ( strpos( $input, '.' ) !== false )
					? new AFPToken( AFPToken::TFLOAT, floatval( $num ), $start )
					: new AFPToken( AFPToken::TINT, intval( $num ), $start );
			}
		}

		// IDs / Keywords
		if ( preg_match( self::ID_SYMBOL_RE, $code, $matches, 0, $offset ) ) {
			$token = $matches[0];
			$offset += strlen( $token );
			$type = in_array( $token, self::KEYWORDS, true )
				? AFPToken::TKEYWORD
				: AFPToken::TID;
			return new AFPToken( $type, $token, $start );
		}

		throw new UserVisibleException(
			'unrecognisedtoken', $start, [ substr( $code, $start ) ] );
	}

	/**
	 * Read a quoted string literal whose opening quote is at $offset.
	 *
	 * Recognised escape sequences are \\, the escaped quote character, \n, \r,
	 * \t and \xXX (two hex digits). Any other backslash sequence is copied to
	 * the token unchanged, backslash included.
	 *
	 * @param string $code
	 * @param int &$offset Position of the opening quote; moved past the
	 *  closing quote on return
	 * @param int $start Position to create the token with
	 * @return AFPToken
	 * @throws UserVisibleException If the end of the code is reached before
	 *  the closing quote
	 */
	private static function readStringLiteral( string $code, int &$offset, int $start ): AFPToken {
		$type = $code[$offset];
		$offset++;
		$length = strlen( $code );
		$token = '';
		while ( $offset < $length ) {
			if ( $code[$offset] === $type ) {
				$offset++;
				return new AFPToken( AFPToken::TSTRING, $token, $start );
			}

			// Performance: Use a PHP function (implemented in C)
			// to scan ahead.
			$addLength = strcspn( $code, $type . "\\", $offset );
			if ( $addLength ) {
				$token .= substr( $code, $offset, $addLength );
				$offset += $addLength;
			} elseif ( $code[$offset] === '\\' ) {
				// The backslash may be the very last character of the code.
				// Treat the missing character as an empty string rather than
				// reading past the end of the string, which would raise an
				// "Uninitialized string offset" notice/warning.
				$escaped = $code[$offset + 1] ?? '';
				switch ( $escaped ) {
					case '\\':
						$token .= '\\';
						break;
					case $type:
						$token .= $type;
						break;
					case 'n':
						$token .= "\n";
						break;
					case 'r':
						$token .= "\r";
						break;
					case 't':
						$token .= "\t";
						break;
					case 'x':
						$chr = substr( $code, $offset + 2, 2 );

						if ( preg_match( '/^[0-9A-Fa-f]{2}$/', $chr ) ) {
							$token .= chr( hexdec( $chr ) );
							// Skip the two hex digits here; the "\x" itself
							// is skipped below like any other escape.
							$offset += 2;
						} else {
							$token .= '\\x';
						}
						break;
					default:
						$token .= "\\" . $escaped;
				}

				// Skip the backslash and the character following it
				$offset += 2;

			} else {
				// Should never happen
				// @codeCoverageIgnoreStart
				$token .= $code[$offset];
				$offset++;
				// @codeCoverageIgnoreEnd
			}
		}
		throw new UserVisibleException( 'unclosedstring', $offset, [] );
	}
}