Code Coverage for /workspace/src/extensions/AbuseFilter/includes/Parser/AbuseFilterTokenizer.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	100.00% covered (success)	100.00%	97 / 97	100.00% covered (success)	100.00%	6 / 6	CRAP	100.00% covered (success)	100.00%	1 / 1
AbuseFilterTokenizer	100.00% covered (success)	100.00%	97 / 97	100.00% covered (success)	100.00%	6 / 6	32	100.00% covered (success)	100.00%	1 / 1
__construct	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
getCacheKey	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
getTokens	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	1
tokenize	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	1
nextToken	100.00% covered (success)	100.00%	42 / 42	100.00% covered (success)	100.00%	1 / 1	15
readStringLiteral	100.00% covered (success)	100.00%	39 / 39	100.00% covered (success)	100.00%	1 / 1	13

1	<?php
2
3	namespace MediaWiki\Extension\AbuseFilter\Parser;
4
5	use BagOStuff;
6	use MediaWiki\Extension\AbuseFilter\Parser\Exception\UserVisibleException;
7
/**
 * Tokenizer for AbuseFilter rules.
 *
 * Splits the source code of a filter into AFPToken objects. Tokenized output is
 * stored in the injected BagOStuff, keyed on CACHE_VERSION and a checksum of the code.
 */
class AbuseFilterTokenizer {
	/** @var int Tokenizer cache version. Increment this when changing the syntax. */
	public const CACHE_VERSION = 4;
	/** Optional whitespace followed by the opening delimiter of a block comment. */
	private const COMMENT_START_RE = '/\s*\/\*/A';
	/** A run of characters that may form an identifier or a keyword. */
	private const ID_SYMBOL_RE = '/[0-9A-Za-z_]+/A';
	/** Anchored alternation of all operators; longer operators come first (see OPERATORS). */
	public const OPERATOR_RE =
		'/(\!\=\=|\!\=|\!|\*\*|\*|\/|\+|\-|%|&|\||\^|\:\=|\?|\:|\<\=|\<|\>\=|\>|\=\=\=|\=\=|\=)/A';
	/** Optional radix prefix: 0x, 0b or 0o. */
	private const BASE = '0(?<base>[xbo])';
	private const DIGIT = '[0-9A-Fa-f]';
	/** Digits with an optional fractional part, or a fractional part alone (e.g. ".5"). */
	private const DIGITS = self::DIGIT . '+' . '(?:\.\d*)?|\.\d+';
	/** A complete numeric literal; it must not be directly followed by a word character. */
	private const RADIX_RE = '/(?:' . self::BASE . ')?(?<input>' . self::DIGITS . ')(?!\w)/Au';
	/** Characters skipped between tokens: tab, LF, VT, FF, CR and space. */
	private const WHITESPACE = "\011\012\013\014\015\040";

	// Order is important. The punctuation-matching regex requires that
	// ** comes before *, etc. They are sorted to make it easy to spot
	// such errors.
	public const OPERATORS = [
		// Inequality
		'!==', '!=', '!',
		// Multiplication/exponentiation
		'**', '*',
		// Other arithmetic
		'/', '+', '-', '%',
		// Logic
		'&', '|', '^',
		// Setting
		':=',
		// Ternary
		'?', ':',
		// Less than
		'<=', '<',
		// Greater than
		'>=', '>',
		// Equality
		'===', '==', '=',
	];

	/** Single-character punctuation mapped to the token type it produces. */
	public const PUNCTUATION = [
		',' => AFPToken::TCOMMA,
		'(' => AFPToken::TBRACE,
		')' => AFPToken::TBRACE,
		'[' => AFPToken::TSQUAREBRACKET,
		']' => AFPToken::TSQUAREBRACKET,
		';' => AFPToken::TSTATEMENTSEPARATOR,
	];

	/** Radix prefix character (the letter after the leading "0") mapped to its numeric base. */
	public const BASES = [
		'b' => 2,
		'x' => 16,
		'o' => 8
	];

	/** For each supported base, a regex validating the digits of a literal in that base. */
	public const BASE_CHARS_RES = [
		2 => '/^[01]+$/',
		8 => '/^[0-7]+$/',
		16 => '/^[0-9A-Fa-f]+$/',
		10 => '/^[0-9.]+$/',
	];

	/** Identifiers that are emitted as TKEYWORD instead of TID. */
	public const KEYWORDS = [
		'in', 'like', 'true', 'false', 'null', 'contains', 'matches',
		'rlike', 'irlike', 'regex', 'if', 'then', 'else', 'end',
	];

	/**
	 * @var BagOStuff Cache in which tokenized code is stored
	 */
	private $cache;

	/**
	 * @param BagOStuff $cache
	 */
	public function __construct( BagOStuff $cache ) {
		$this->cache = $cache;
	}

	/**
	 * Get a cache key used to store the tokenized code
	 *
	 * The key is built from the class name, CACHE_VERSION and the CRC32 checksum of the
	 * code. Note that CRC32 is only 32 bits wide, so distinct code can share a key.
	 *
	 * @param string $code Not yet tokenized
	 * @return string
	 * @internal
	 */
	public function getCacheKey( $code ) {
		return $this->cache->makeGlobalKey( __CLASS__, self::CACHE_VERSION, crc32( $code ) );
	}

	/**
	 * Get the tokens for the given code.
	 *
	 * The result is fetched from the cache; on a miss the code is tokenized and the
	 * result is stored for one day.
	 *
	 * @param string $code
	 * @return array[]
	 * @phan-return array<int,array{0:AFPToken,1:int}>
	 */
	public function getTokens( string $code ): array {
		return $this->cache->getWithSetCallback(
			$this->getCacheKey( $code ),
			BagOStuff::TTL_DAY,
			function () use ( $code ) {
				return $this->tokenize( $code );
			}
		);
	}

	/**
	 * Tokenize the whole code.
	 *
	 * Each entry of the result is keyed by the position stored in the token and holds
	 * the token together with the offset reached after reading it.
	 *
	 * @param string $code
	 * @return array[]
	 * @phan-return array<int,array{0:AFPToken,1:int}>
	 */
	private function tokenize( string $code ): array {
		$tokens = [];
		$curPos = 0;

		// nextToken() stops advancing the offset once the end of the code has been
		// reached, which is what terminates the loop.
		do {
			$prevPos = $curPos;
			$token = $this->nextToken( $code, $curPos );
			$tokens[ $token->pos ] = [ $token, $curPos ];
		} while ( $curPos !== $prevPos );

		return $tokens;
	}

	/**
	 * Read a single token starting at $offset, skipping leading comments and whitespace.
	 *
	 * @param string $code
	 * @param int &$offset Position to read from; updated to point right after the token
	 * @return AFPToken Token whose position is the value of $offset on entry. A TNONE
	 *   token is returned when only comments/whitespace are left.
	 * @throws UserVisibleException For an unclosed comment, an unclosed string literal
	 *   or an unrecognised token
	 */
	private function nextToken( string $code, int &$offset ): AFPToken {
		$matches = [];
		$start = $offset;

		// Read past comments. The terminator is searched from $offset (i.e. from before
		// the opening delimiter), as it has always been; this is kept unchanged on purpose.
		while ( preg_match( self::COMMENT_START_RE, $code, $matches, 0, $offset ) ) {
			$commentEnd = strpos( $code, '*/', $offset );
			if ( $commentEnd === false ) {
				throw new UserVisibleException(
					'unclosedcomment', $offset, [] );
			}
			$offset = $commentEnd + 2;
		}

		// Spaces
		$offset += strspn( $code, self::WHITESPACE, $offset );
		if ( $offset >= strlen( $code ) ) {
			return new AFPToken( AFPToken::TNONE, '', $start );
		}

		$chr = $code[$offset];

		// Punctuation
		if ( isset( self::PUNCTUATION[$chr] ) ) {
			$offset++;
			return new AFPToken( self::PUNCTUATION[$chr], $chr, $start );
		}

		// String literal
		if ( $chr === '"' || $chr === "'" ) {
			return self::readStringLiteral( $code, $offset, $start );
		}

		// Operators
		if ( preg_match( self::OPERATOR_RE, $code, $matches, 0, $offset ) ) {
			$token = $matches[0];
			$offset += strlen( $token );
			return new AFPToken( AFPToken::TOP, $token, $start );
		}

		// Numbers
		if ( preg_match( self::RADIX_RE, $code, $matches, 0, $offset ) ) {
			$token = $matches[0];
			$baseChar = $matches['base'];
			$input = $matches['input'];
			// The 'base' group is empty when the literal has no radix prefix
			$base = $baseChar !== '' ? self::BASES[$baseChar] : 10;
			// If the digits are not valid for the base, this is not a number: fall
			// through and let it be read as an identifier.
			if ( preg_match( self::BASE_CHARS_RES[$base], $input ) ) {
				$num = $base !== 10 ? base_convert( $input, $base, 10 ) : $input;
				$offset += strlen( $token );
				return ( strpos( $input, '.' ) !== false )
					? new AFPToken( AFPToken::TFLOAT, floatval( $num ), $start )
					: new AFPToken( AFPToken::TINT, intval( $num ), $start );
			}
		}

		// IDs / Keywords
		if ( preg_match( self::ID_SYMBOL_RE, $code, $matches, 0, $offset ) ) {
			$token = $matches[0];
			$offset += strlen( $token );
			$type = in_array( $token, self::KEYWORDS, true )
				? AFPToken::TKEYWORD
				: AFPToken::TID;
			return new AFPToken( $type, $token, $start );
		}

		throw new UserVisibleException(
			'unrecognisedtoken', $start, [ substr( $code, $start ) ] );
	}

	/**
	 * Read a quoted string literal and resolve its escape sequences.
	 *
	 * Recognised escapes are \\, the escaped quote character, \n, \r, \t and \xXX (two
	 * hexadecimal digits). Any other backslash sequence is copied verbatim.
	 *
	 * @param string $code
	 * @param int &$offset Position of the opening quote; updated to point right after
	 *   the closing quote
	 * @param int $start Position to store in the returned token
	 * @return AFPToken A TSTRING token holding the unescaped content
	 * @throws UserVisibleException If the end of the code is reached before the closing quote
	 */
	private static function readStringLiteral( string $code, int &$offset, int $start ): AFPToken {
		$type = $code[$offset];
		$offset++;
		$length = strlen( $code );
		$token = '';
		while ( $offset < $length ) {
			if ( $code[$offset] === $type ) {
				$offset++;
				return new AFPToken( AFPToken::TSTRING, $token, $start );
			}

			// Performance: Use a PHP function (implemented in C)
			// to scan ahead.
			$addLength = strcspn( $code, $type . "\\", $offset );
			if ( $addLength ) {
				$token .= substr( $code, $offset, $addLength );
				$offset += $addLength;
				continue;
			}

			// strcspn() returned 0 and the current character is not the closing quote,
			// hence it can only be a backslash. The character after it may be missing
			// if the code ends here; treat that as an empty character, so that we fall
			// out of the loop and report an unclosed string without reading out of bounds.
			$escaped = $code[$offset + 1] ?? '';
			switch ( $escaped ) {
				case '\\':
					$token .= '\\';
					break;
				case $type:
					$token .= $type;
					break;
				case 'n':
					$token .= "\n";
					break;
				case 'r':
					$token .= "\r";
					break;
				case 't':
					$token .= "\t";
					break;
				case 'x':
					$chr = substr( $code, $offset + 2, 2 );

					if ( preg_match( '/^[0-9A-Fa-f]{2}$/', $chr ) ) {
						$token .= chr( hexdec( $chr ) );
						// \xXX -- 2 done later
						$offset += 2;
					} else {
						$token .= '\\x';
					}
					break;
				default:
					$token .= "\\" . $escaped;
			}

			// Skip the backslash and the escaped character
			$offset += 2;
		}
		throw new UserVisibleException( 'unclosedstring', $offset, [] );
	}
}