Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
100.00% |
101 / 101 |
|
100.00% |
6 / 6 |
CRAP | |
100.00% |
1 / 1 |
| AbuseFilterTokenizer | |
100.00% |
101 / 101 |
|
100.00% |
6 / 6 |
33 | |
100.00% |
1 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getCacheKey | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| getTokens | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
| tokenize | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
| nextToken | |
100.00% |
42 / 42 |
|
100.00% |
1 / 1 |
15 | |||
| readStringLiteral | |
100.00% |
41 / 41 |
|
100.00% |
1 / 1 |
14 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace MediaWiki\Extension\AbuseFilter\Parser; |
| 4 | |
| 5 | use MediaWiki\Extension\AbuseFilter\Parser\Exception\UserVisibleException; |
| 6 | use Wikimedia\ObjectCache\BagOStuff; |
| 7 | |
| 8 | /** |
| 9 | * Tokenizer for AbuseFilter rules. |
| 10 | */ |
| 11 | class AbuseFilterTokenizer { |
/** @var int Tokenizer cache version. Increment this when changing the syntax. */
public const CACHE_VERSION = 4;

/** Anchored: optional whitespace followed by the opening of a block comment. */
private const COMMENT_START_RE = '/\s*\/\*/A';

/** Anchored: a run of ASCII letters, digits and underscores (identifiers and keywords). */
private const ID_SYMBOL_RE = '/[0-9A-Za-z_]+/A';

/** Anchored alternation of every operator; longer operators precede their own prefixes. */
public const OPERATOR_RE =
	'/(\!\=\=|\!\=|\!|\*\*|\*|\/|\+|\-|%|&|\||\^|\:\=|\?|\:|\<\=|\<|\>\=|\>|\=\=\=|\=\=|\=)/A';

/** Regex fragment: a "0x" / "0b" / "0o" prefix, capturing the letter as "base". */
private const BASE = '0(?<base>[xbo])';

/** Regex fragment: a single hexadecimal digit. */
private const DIGIT = '[0-9A-Fa-f]';

/**
 * Regex fragment: hex digits with an optional fractional part made of decimal digits,
 * or a fraction that starts with the dot.
 */
private const DIGITS = self::DIGIT . '+' . '(?:\.\d*)?|\.\d+';

/**
 * Anchored, UTF-8 mode: optional base prefix, then the digits captured as "input",
 * not directly followed by a word character.
 */
private const RADIX_RE = '/(?:' . self::BASE . ')?(?<input>' . self::DIGITS . ')(?!\w)/Au';

/** Bytes skipped between tokens: tab, LF, VT, FF, CR and space (octal escapes). */
private const WHITESPACE = "\011\012\013\014\015\040";

// Order is important. The punctuation-matching regex requires that
// ** comes before *, etc. They are sorted to make it easy to spot
// such errors.
public const OPERATORS = [
	// Inequality
	'!==', '!=', '!',
	// Multiplication/exponentiation
	'**', '*',
	// Other arithmetic
	'/', '+', '-', '%',
	// Logic
	'&', '|', '^',
	// Setting
	':=',
	// Ternary
	'?', ':',
	// Less than
	'<=', '<',
	// Greater than
	'>=', '>',
	// Equality
	'===', '==', '=',
];

/** Single-character punctuation mapped to the AFPToken type emitted for it. */
public const PUNCTUATION = [
	',' => AFPToken::TCOMMA,
	'(' => AFPToken::TBRACE,
	')' => AFPToken::TBRACE,
	'[' => AFPToken::TSQUAREBRACKET,
	']' => AFPToken::TSQUAREBRACKET,
	';' => AFPToken::TSTATEMENTSEPARATOR,
];

/** Base-prefix letter (as captured by BASE) mapped to its numeric base. */
public const BASES = [
	'b' => 2,
	'x' => 16,
	'o' => 8
];

/** Numeric base mapped to the regex a literal's digits must satisfy in that base. */
public const BASE_CHARS_RES = [
	2 => '/^[01]+$/',
	8 => '/^[0-7]+$/',
	16 => '/^[0-9A-Fa-f]+$/',
	10 => '/^[0-9.]+$/',
];

/** Identifier-shaped words that are emitted as TKEYWORD instead of TID. */
public const KEYWORDS = [
	'in', 'like', 'true', 'false', 'null', 'contains', 'matches',
	'rlike', 'irlike', 'regex', 'if', 'then', 'else', 'end',
];
| 74 | |
/**
 * @param BagOStuff $cache Cache used by getCacheKey() to build keys and by getTokens()
 *  to store tokenized code
 */
public function __construct(
	private readonly BagOStuff $cache
) {
}
| 77 | |
/**
 * Get a cache key used to store the tokenized code
 *
 * The key is a global cache key built from this class name, CACHE_VERSION and a
 * SHA-256 digest of the code. A collision-resistant digest is used instead of a
 * 32-bit checksum because getTokens() returns whatever is stored under this key:
 * two different rules mapping to the same key would silently share one token list.
 *
 * @param string $code Not yet tokenized
 * @return string
 * @internal
 */
public function getCacheKey( $code ) {
	return $this->cache->makeGlobalKey(
		'abusefilter-tokens',
		__CLASS__,
		self::CACHE_VERSION,
		hash( 'sha256', $code )
	);
}
| 90 | |
/**
 * Return the token list for a piece of code, going through the cache.
 *
 * The list is looked up under getCacheKey( $code ); on a miss the code is tokenized
 * and the result is stored with a TTL of BagOStuff::TTL_DAY.
 *
 * @param string $code
 * @return array<int,array{0:AFPToken,1:int}>
 */
public function getTokens( string $code ): array {
	$key = $this->getCacheKey( $code );

	return $this->cache->getWithSetCallback(
		$key,
		BagOStuff::TTL_DAY,
		fn () => $this->tokenize( $code )
	);
}
| 106 | |
/**
 * Split the code into tokens by calling nextToken() until it stops consuming input.
 *
 * Each entry is keyed by the token's position and holds the token together with the
 * offset reached after reading it.
 *
 * @param string $code
 * @return array<int,array{0:AFPToken,1:int}>
 */
private function tokenize( string $code ): array {
	$result = [];
	$offset = 0;

	while ( true ) {
		$before = $offset;
		$token = $this->nextToken( $code, $offset );
		$result[ $token->pos ] = [ $token, $offset ];

		// A call that did not advance the offset means the input is exhausted.
		if ( $offset === $before ) {
			break;
		}
	}

	return $result;
}
| 123 | |
/**
 * Read the next token from $code, starting at $offset.
 *
 * Leading comments and whitespace are skipped. $offset is advanced past everything
 * that was consumed, while the returned token records the offset this call started
 * from (i.e. before any skipped comments or whitespace) as its position.
 *
 * @param string $code
 * @param int &$offset Where to start reading; updated to just past the token
 * @return AFPToken A TNONE token with an empty value when nothing but comments and
 *  whitespace is left
 * @throws UserVisibleException For an unclosed comment, an unclosed string, or input
 *  that matches no token type
 */
private function nextToken( string $code, int &$offset ): AFPToken {
	$start = $offset;

	// Read past comments
	// NOTE(review): the search for the closing marker starts at $offset, i.e. before
	// the matched opener, so the three characters of the opener plus a slash are
	// accepted as a complete comment. Tightening this changes the accepted syntax and
	// would need a CACHE_VERSION bump, so it is left as is here.
	while ( preg_match( self::COMMENT_START_RE, $code, $matches, 0, $offset ) ) {
		$pos = strpos( $code, '*/', $offset );
		if ( $pos === false ) {
			throw new UserVisibleException( 'unclosedcomment', $offset, [] );
		}
		$offset = $pos + 2;
	}

	// Spaces
	$offset += strspn( $code, self::WHITESPACE, $offset );
	if ( $offset >= strlen( $code ) ) {
		return new AFPToken( AFPToken::TNONE, '', $start );
	}

	$chr = $code[$offset];

	// Punctuation
	if ( isset( self::PUNCTUATION[$chr] ) ) {
		$offset++;
		return new AFPToken( self::PUNCTUATION[$chr], $chr, $start );
	}

	// String literal
	if ( $chr === '"' || $chr === "'" ) {
		return self::readStringLiteral( $code, $offset, $start );
	}

	// Operators
	if ( preg_match( self::OPERATOR_RE, $code, $matches, 0, $offset ) ) {
		$token = $matches[0];
		$offset += strlen( $token );
		return new AFPToken( AFPToken::TOP, $token, $start );
	}

	// Numbers
	if ( preg_match( self::RADIX_RE, $code, $number, 0, $offset ) ) {
		$token = $number[0];
		$baseChar = $number['base'];
		$input = $number['input'];
		// No base prefix means a decimal literal.
		$base = $baseChar ? self::BASES[$baseChar] : 10;
		// Digits that are not valid in the detected base are not a number; fall
		// through so the text can still be read as an identifier.
		if ( preg_match( self::BASE_CHARS_RES[$base], $input ) ) {
			$num = $base !== 10 ? base_convert( $input, $base, 10 ) : $input;
			$offset += strlen( $token );
			return str_contains( $input, '.' )
				? new AFPToken( AFPToken::TFLOAT, floatval( $num ), $start )
				: new AFPToken( AFPToken::TINT, intval( $num ), $start );
		}
	}

	// IDs / Keywords
	if ( preg_match( self::ID_SYMBOL_RE, $code, $matches, 0, $offset ) ) {
		$token = $matches[0];
		$offset += strlen( $token );
		$type = in_array( $token, self::KEYWORDS, true )
			? AFPToken::TKEYWORD
			: AFPToken::TID;
		return new AFPToken( $type, $token, $start );
	}

	throw new UserVisibleException(
		'unrecognisedtoken', $start, [ substr( $code, $start ) ] );
}
| 201 | |
/**
 * Read a quoted string literal and resolve its escape sequences.
 *
 * The character at $offset is taken as the quote character. Recognised escapes are
 * a backslash, the quote character, \n, \r, \t and \xHH with two hex digits; any
 * other backslash sequence is kept verbatim, backslash included.
 *
 * @param string $code
 * @param int &$offset Position of the opening quote; on success, moved to just past
 *  the closing quote
 * @param int $start Position to record on the returned token
 * @return AFPToken A TSTRING token holding the unescaped contents
 * @throws UserVisibleException When the input ends before the closing quote
 */
private static function readStringLiteral( $code, &$offset, $start ) {
	$quote = $code[$offset];
	$offset++;
	$end = strlen( $code );
	$stopChars = $quote . "\\";
	$value = '';

	while ( $offset < $end ) {
		$current = $code[$offset];

		if ( $current === $quote ) {
			$offset++;
			return new AFPToken( AFPToken::TSTRING, $value, $start );
		}

		// Performance: let a PHP function (implemented in C) find the next
		// quote or backslash and copy everything before it in one go.
		$plainLength = strcspn( $code, $stopChars, $offset );
		if ( $plainLength ) {
			$value .= substr( $code, $offset, $plainLength );
			$offset += $plainLength;
			continue;
		}

		if ( $current !== '\\' ) {
			// Should never happen
			// @codeCoverageIgnoreStart
			$value .= $current;
			$offset++;
			continue;
			// @codeCoverageIgnoreEnd
		}

		if ( !isset( $code[$offset + 1] ) ) {
			// Unterminated escape sequence, hence unterminated string. (T390416)
			throw new UserVisibleException( 'unclosedstring', $offset + 1, [] );
		}

		$escaped = $code[$offset + 1];
		if ( $escaped === 'x' ) {
			$hex = substr( $code, $offset + 2, 2 );
			if ( preg_match( '/^[0-9A-Fa-f]{2}$/', $hex ) ) {
				$value .= chr( hexdec( $hex ) );
				// Skip the two hex digits; the backslash and "x" are skipped below.
				$offset += 2;
			} else {
				$value .= '\\x';
			}
		} else {
			$value .= match ( $escaped ) {
				'\\' => '\\',
				$quote => $quote,
				'n' => "\n",
				'r' => "\r",
				't' => "\t",
				default => "\\" . $escaped,
			};
		}

		// The backslash and the character following it.
		$offset += 2;
	}

	throw new UserVisibleException( 'unclosedstring', $offset, [] );
}
| 275 | } |