Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
61.11% |
297 / 486 |
|
15.38% |
4 / 26 |
CRAP | |
0.00% |
0 / 1 |
UstringLibrary | |
61.11% |
297 / 486 |
|
15.38% |
4 / 26 |
1997.81 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
register | |
39.39% |
13 / 33 |
|
0.00% |
0 / 1 |
2.89 | |||
checkString | |
88.89% |
8 / 9 |
|
0.00% |
0 / 1 |
5.03 | |||
ustringIsUtf8 | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
ustringByteoffset | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
90 | |||
ustringCodepoint | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
20 | |||
ustringGcodepointInit | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
ustringToNFC | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
ustringToNFD | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
ustringToNFKC | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
ustringToNFKD | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
ustringChar | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
56 | |||
ustringLen | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
ustringSub | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
20 | |||
ustringUpper | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
ustringLower | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
checkPattern | |
88.89% |
8 / 9 |
|
0.00% |
0 / 1 |
4.02 | |||
patternToRegex | |
70.90% |
95 / 134 |
|
0.00% |
0 / 1 |
79.45 | |||
bracketedCharSetToRegex | |
96.43% |
27 / 28 |
|
0.00% |
0 / 1 |
16 | |||
addCapturesFromMatch | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
5 | |||
ustringFind | |
96.15% |
25 / 26 |
|
0.00% |
0 / 1 |
8 | |||
ustringMatch | |
92.86% |
13 / 14 |
|
0.00% |
0 / 1 |
5.01 | |||
ustringGmatchInit | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
ustringGmatchCallback | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
ustringGsub | |
79.38% |
77 / 97 |
|
0.00% |
0 / 1 |
52.33 | |||
handlePCREError | |
37.04% |
10 / 27 |
|
0.00% |
0 / 1 |
34.96 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\Scribunto\Engines\LuaCommon; |
4 | |
5 | use LogicException; |
6 | use MapCacheLRU; |
7 | use MediaWiki\MainConfigNames; |
8 | use MediaWiki\MediaWikiServices; |
9 | use UtfNormal\Validator; |
10 | |
11 | class UstringLibrary extends LibraryBase { |
12 | /** |
13 | * Limit on pattern lengths, in bytes not characters |
14 | * @var int |
15 | */ |
16 | private $patternLengthLimit = 10000; |
17 | |
18 | /** |
19 | * Limit on string lengths, in bytes not characters |
20 | * @var int |
21 | */ |
22 | private $stringLengthLimit; |
23 | |
24 | /** |
25 | * PHP versions until 5.6.9 are buggy when the regex in preg_replace and |
26 | * preg_match_all matches the empty string. |
27 | * @var bool |
28 | */ |
29 | private $phpBug53823; |
30 | |
31 | /** |
32 | * A cache of patterns and the regexes they generate. |
33 | * @var MapCacheLRU |
34 | */ |
35 | private $patternRegexCache; |
36 | |
/**
 * @inheritDoc
 */
public function __construct( $engine ) {
	// The configured MaxArticleSize is scaled by 1024 to obtain the
	// byte limit applied to subject strings.
	$config = MediaWikiServices::getInstance()->getMainConfig();
	$this->stringLengthLimit = $config->get( MainConfigNames::MaxArticleSize ) * 1024;

	// Probe for PHP bug 53823: replace the empty pattern in a single
	// two-byte UTF-8 character. An affected PHP also "matches" between the
	// two bytes, producing "x\xc3x\xa1x".
	$probe = preg_replace( '//us', 'x', "\xc3\xa1" );
	$this->phpBug53823 = ( $probe === "x\xc3x\xa1x" );

	$this->patternRegexCache = new MapCacheLRU( 100 );

	parent::__construct( $engine );
}
46 | |
/**
 * Register the mw.ustring.lua interface with the engine.
 *
 * When PHP calls require serialization only the pattern-matching functions
 * are exported; otherwise every handler in this class is exported.
 *
 * @return mixed Result of the engine's registerInterface() call
 */
public function register() {
	// Pattern matching is still much faster in PHP, even with the
	// overhead of serialization, so these are exported in both modes.
	$patternFunctions = [
		'find' => [ $this, 'ustringFind' ],
		'match' => [ $this, 'ustringMatch' ],
		'gmatch_init' => [ $this, 'ustringGmatchInit' ],
		'gmatch_callback' => [ $this, 'ustringGmatchCallback' ],
		'gsub' => [ $this, 'ustringGsub' ],
	];

	$perf = $this->getEngine()->getPerformanceCharacteristics();
	if ( $perf['phpCallsRequireSerialization'] ) {
		$lib = $patternFunctions;
	} else {
		// Array union keeps the left-hand entries first, followed by the
		// pattern functions (the key sets are disjoint).
		$lib = [
			'isutf8' => [ $this, 'ustringIsUtf8' ],
			'byteoffset' => [ $this, 'ustringByteoffset' ],
			'codepoint' => [ $this, 'ustringCodepoint' ],
			'gcodepoint_init' => [ $this, 'ustringGcodepointInit' ],
			'toNFC' => [ $this, 'ustringToNFC' ],
			'toNFD' => [ $this, 'ustringToNFD' ],
			'toNFKC' => [ $this, 'ustringToNFKC' ],
			'toNFKD' => [ $this, 'ustringToNFKD' ],
			'char' => [ $this, 'ustringChar' ],
			'len' => [ $this, 'ustringLen' ],
			'sub' => [ $this, 'ustringSub' ],
			'upper' => [ $this, 'ustringUpper' ],
			'lower' => [ $this, 'ustringLower' ],
		] + $patternFunctions;
	}

	$setupOptions = [
		'stringLengthLimit' => $this->stringLengthLimit,
		'patternLengthLimit' => $this->patternLengthLimit,
	];
	return $this->getEngine()->registerInterface( 'mw.ustring.lua', $lib, $setupOptions );
}
87 | |
/**
 * Validate (and normalise) the string passed as the first argument
 *
 * A Lua number is converted to its string form in place and accepted
 * without further checks. Anything else must be a string, optionally valid
 * UTF-8, and no longer than the string length limit.
 *
 * @param string $name Function name, for errors
 * @param mixed &$s Value to check
 * @param bool $checkEncoding Whether to validate UTF-8 encoding.
 * @throws LuaError If the encoding or length check fails
 */
private function checkString( $name, &$s, $checkEncoding = true ) {
	if ( $this->getLuaType( $s ) == 'number' ) {
		$s = (string)$s;
		return;
	}

	$this->checkType( $name, 1, $s, 'string' );

	$badEncoding = $checkEncoding && !mb_check_encoding( $s, 'UTF-8' );
	if ( $badEncoding ) {
		throw new LuaError( "bad argument #1 to '$name' (string is not UTF-8)" );
	}

	$tooLong = strlen( $s ) > $this->stringLengthLimit;
	if ( $tooLong ) {
		throw new LuaError(
			"bad argument #1 to '$name' (string is longer than $this->stringLengthLimit bytes)"
		);
	}
}
109 | |
/**
 * Handler for isUtf8
 *
 * The encoding is deliberately not validated by checkString(), since
 * reporting on it is the purpose of this function.
 *
 * @internal
 * @param string $s
 * @return bool[] Single-element array: whether $s is valid UTF-8
 */
public function ustringIsUtf8( $s ) {
	$this->checkString( 'isutf8', $s, false );
	$isValid = mb_check_encoding( $s, 'UTF-8' );
	return [ $isValid ];
}
120 | |
/**
 * Handler for byteoffset
 *
 * Finds the 1-based byte offset of the character that lies $l characters
 * away from the character containing byte $i.
 *
 * @internal
 * @param string $s
 * @param int $l Character offset relative to the character at byte $i
 * @param int $i 1-based byte position; negative values count from the end
 * @return int[]|null[] Single-element array: the byte offset, or null when
 *  either position falls outside the string
 */
public function ustringByteoffset( $s, $l = 1, $i = 1 ) {
	$this->checkString( 'byteoffset', $s );
	$this->checkTypeOptional( 'byteoffset', 2, $l, 'number', 1 );
	$this->checkTypeOptional( 'byteoffset', 3, $i, 'number', 1 );

	$byteCount = strlen( $s );
	$bytePos = $i < 0 ? $byteCount + $i + 1 : $i;
	if ( $bytePos < 1 || $bytePos > $byteCount ) {
		return [ null ];
	}

	// Switch to a 0-based index, then back up over UTF-8 continuation
	// bytes (10xxxxxx) to the lead byte of the character.
	$requested = $bytePos - 1;
	$lead = $requested;
	while ( ( ord( $s[$lead] ) & 0xc0 ) === 0x80 ) {
		$lead--;
	}

	// When the requested byte already was a lead byte, a positive $l is
	// reduced by one.
	$delta = ( $l > 0 && $requested === $lead ) ? $l - 1 : $l;

	$targetChar = mb_strlen( substr( $s, 0, $lead ), 'UTF-8' ) + $delta;
	$charCount = mb_strlen( $s, 'UTF-8' );
	if ( $targetChar < 0 || $targetChar >= $charCount ) {
		return [ null ];
	}

	$prefix = mb_substr( $s, 0, $targetChar, 'UTF-8' );
	return [ strlen( $prefix ) + 1 ];
}
156 | |
/**
 * Handler for codepoint
 *
 * @internal
 * @param string $s
 * @param int $i 1-based index of the first character; negative values
 *  count from the end
 * @param int|null $j 1-based index of the last character, defaulting to $i
 * @return int[] Codepoints of the selected characters, as produced by
 *  unpack() (empty when $j is before $i)
 */
public function ustringCodepoint( $s, $i = 1, $j = null ) {
	$this->checkString( 'codepoint', $s );
	$this->checkTypeOptional( 'codepoint', 2, $i, 'number', 1 );
	$this->checkTypeOptional( 'codepoint', 3, $j, 'number', $i );

	$charCount = mb_strlen( $s, 'UTF-8' );

	// Resolve negative (from-the-end) indexes.
	$first = $i < 0 ? $charCount + $i + 1 : $i;
	$last = $j < 0 ? $charCount + $j + 1 : $j;
	if ( $last < $first ) {
		return [];
	}

	// Clamp both ends into 1 .. $charCount + 1.
	$first = max( 1, min( $first, $charCount + 1 ) );
	$last = max( 1, min( $last, $charCount + 1 ) );

	$slice = mb_substr( $s, $first - 1, $last - $first + 1, 'UTF-8' );
	$utf32 = mb_convert_encoding( $slice, 'UTF-32BE', 'UTF-8' );
	return unpack( 'N*', $utf32 );
}
185 | |
/**
 * Handler for gcodepointInit
 *
 * Wraps the result of ustringCodepoint() in a single-element array.
 *
 * @internal
 * @param string $s
 * @param int $i
 * @param int|null $j
 * @return int[][]
 */
public function ustringGcodepointInit( $s, $i = 1, $j = null ) {
	$codepoints = $this->ustringCodepoint( $s, $i, $j );
	return [ $codepoints ];
}
197 | |
/**
 * Handler for toNFC
 *
 * @internal
 * @param string $s
 * @return string[]|null[] Single-element array: the result of
 *  Validator::toNFC(), or null when $s is not valid UTF-8
 */
public function ustringToNFC( $s ) {
	$this->checkString( 'toNFC', $s, false );
	$normalized = mb_check_encoding( $s, 'UTF-8' ) ? Validator::toNFC( $s ) : null;
	return [ $normalized ];
}
211 | |
/**
 * Handler for toNFD
 *
 * @internal
 * @param string $s
 * @return string[]|null[] Single-element array: the result of
 *  Validator::toNFD(), or null when $s is not valid UTF-8
 */
public function ustringToNFD( $s ) {
	$this->checkString( 'toNFD', $s, false );
	$normalized = mb_check_encoding( $s, 'UTF-8' ) ? Validator::toNFD( $s ) : null;
	return [ $normalized ];
}
225 | |
/**
 * Handler for toNFKC
 *
 * @internal
 * @param string $s
 * @return string[]|null[] Single-element array: the result of
 *  Validator::toNFKC(), or null when $s is not valid UTF-8
 */
public function ustringToNFKC( $s ) {
	$this->checkString( 'toNFKC', $s, false );
	$normalized = mb_check_encoding( $s, 'UTF-8' ) ? Validator::toNFKC( $s ) : null;
	return [ $normalized ];
}
239 | |
/**
 * Handler for toNFKD
 *
 * @internal
 * @param string $s
 * @return string[]|null[] Single-element array: the result of
 *  Validator::toNFKD(), or null when $s is not valid UTF-8
 */
public function ustringToNFKD( $s ) {
	$this->checkString( 'toNFKD', $s, false );
	$normalized = mb_check_encoding( $s, 'UTF-8' ) ? Validator::toNFKD( $s ) : null;
	return [ $normalized ];
}
253 | |
/**
 * Handler for char
 *
 * Takes any number of codepoints as arguments and builds the UTF-8 string
 * made of those characters.
 *
 * @internal
 * @return string[] Single-element array holding the resulting string
 * @throws LuaError If there are too many arguments, a codepoint is outside
 *  0 .. 0x10ffff, or the result exceeds the string length limit
 */
public function ustringChar() {
	$args = func_get_args();
	if ( count( $args ) > $this->stringLengthLimit ) {
		throw new LuaError( "too many arguments to 'char'" );
	}

	$codepoints = [];
	foreach ( $args as $index => $value ) {
		// Lua argument numbers are 1-based.
		$k = $index + 1;
		if ( !is_numeric( $value ) ) {
			$this->checkType( 'char', $k, $value, 'number' );
		}
		$codepoint = (int)floor( $value );
		if ( $codepoint < 0 || $codepoint > 0x10ffff ) {
			throw new LuaError( "bad argument #$k to 'char' (value out of range)" );
		}
		$codepoints[] = $codepoint;
	}

	// Pack as big-endian 32-bit integers (i.e. UTF-32BE), then transcode.
	$utf32 = pack( 'N*', ...$codepoints );
	$result = mb_convert_encoding( $utf32, 'UTF-8', 'UTF-32BE' );
	if ( strlen( $result ) > $this->stringLengthLimit ) {
		throw new LuaError( "result to long for 'char'" );
	}
	return [ $result ];
}
281 | |
/**
 * Handler for len
 *
 * @internal
 * @param string $s
 * @return int[]|null[] Single-element array: the length of $s in
 *  characters, or null when $s is not valid UTF-8
 */
public function ustringLen( $s ) {
	$this->checkString( 'len', $s, false );
	$length = mb_check_encoding( $s, 'UTF-8' ) ? mb_strlen( $s, 'UTF-8' ) : null;
	return [ $length ];
}
295 | |
/**
 * Handler for sub
 *
 * @internal
 * @param string $s
 * @param int $i 1-based index of the first character; negative values
 *  count from the end
 * @param int $j 1-based index of the last character; negative values
 *  count from the end
 * @return string[] Single-element array holding the substring (empty when
 *  $j is before $i)
 */
public function ustringSub( $s, $i = 1, $j = -1 ) {
	$this->checkString( 'sub', $s );
	$this->checkTypeOptional( 'sub', 2, $i, 'number', 1 );
	$this->checkTypeOptional( 'sub', 3, $j, 'number', -1 );

	$charCount = mb_strlen( $s, 'UTF-8' );

	// Resolve negative (from-the-end) indexes.
	$from = $i < 0 ? $charCount + $i + 1 : $i;
	$to = $j < 0 ? $charCount + $j + 1 : $j;
	if ( $to < $from ) {
		return [ '' ];
	}

	// Clamp both ends into 1 .. $charCount + 1.
	$from = max( 1, min( (int)$from, $charCount + 1 ) );
	$to = max( 1, min( (int)$to, $charCount + 1 ) );

	return [ mb_substr( $s, $from - 1, $to - $from + 1, 'UTF-8' ) ];
}
324 | |
/**
 * Handler for upper
 *
 * @internal
 * @param string $s
 * @return string[] Single-element array holding $s converted with
 *  mb_strtoupper()
 */
public function ustringUpper( $s ) {
	$this->checkString( 'upper', $s );
	$upper = mb_strtoupper( $s, 'UTF-8' );
	return [ $upper ];
}
335 | |
/**
 * Handler for lower
 *
 * @internal
 * @param string $s
 * @return string[] Single-element array holding $s converted with
 *  mb_strtolower()
 */
public function ustringLower( $s ) {
	$this->checkString( 'lower', $s );
	$lower = mb_strtolower( $s, 'UTF-8' );
	return [ $lower ];
}
346 | |
/**
 * Validate a pattern passed as the second argument
 *
 * Numbers are accepted and checked in their string form. The pattern is
 * taken by value, so the caller's variable is left untouched.
 *
 * @param string $name Lua function name, for errors
 * @param mixed $pattern Lua pattern
 * @throws LuaError If the pattern is not valid UTF-8 or is longer than the
 *  pattern length limit
 */
private function checkPattern( $name, $pattern ) {
	$isNumber = $this->getLuaType( $pattern ) == 'number';
	$candidate = $isNumber ? (string)$pattern : $pattern;

	$this->checkType( $name, 2, $candidate, 'string' );

	if ( !mb_check_encoding( $candidate, 'UTF-8' ) ) {
		throw new LuaError( "bad argument #2 to '$name' (string is not UTF-8)" );
	}

	$tooLong = strlen( $candidate ) > $this->patternLengthLimit;
	if ( $tooLong ) {
		throw new LuaError(
			"bad argument #2 to '$name' (pattern is longer than $this->patternLengthLimit bytes)"
		);
	}
}
366 | |
/**
 * Convert a Lua pattern into a PCRE regex
 *
 * Results are memoised in $this->patternRegexCache, keyed on the pattern
 * and anchor, so a pattern is only validated and translated on a cache miss.
 *
 * @param string $pattern Lua pattern to convert
 * @param string|false $anchor Regex fragment (`^` or `\G`) to use
 *  when anchoring the start of the regex, or false to disable start-anchoring.
 * @param string $name Lua function name, for errors
 * @return array [ string $re, array $capt, bool $anypos ]
 *  - $re: The regular expression
 *  - $capt: Definition of capturing groups, see addCapturesFromMatch()
 *  - $anypos: Whether any positional captures were encountered in the pattern.
 * @return-taint none
 * @throws LuaError If the pattern is malformed
 */
private function patternToRegex( $pattern, $anchor, $name ) {
	$cacheKey = serialize( [ $pattern, $anchor ] );
	if ( $this->patternRegexCache->has( $cacheKey ) ) {
		return $this->patternRegexCache->get( $cacheKey );
	}

	$this->checkPattern( $name, $pattern );
	// Split the pattern into an array of individual UTF-8 characters.
	$pat = preg_split( '//us', $pattern, -1, PREG_SPLIT_NO_EMPTY );

	static $charsets = null, $brcharsets = null;
	if ( $charsets === null ) {
		// Hexadecimal digits: ASCII 0-9, A-F, a-f plus their fullwidth
		// forms U+FF10-FF19, U+FF21-FF26 and U+FF41-FF46. Written as PCRE
		// escapes so the set does not depend on the file's encoding, and
		// chosen to be the exact complement of the bracketed 'X' set below.
		$hex = '0-9A-Fa-f\x{ff10}-\x{ff19}\x{ff21}-\x{ff26}\x{ff41}-\x{ff46}';

		$charsets = [
			// If you change these, also change lualib/ustring/make-tables.php
			// (and run it to regenerate charsets.lua)
			'a' => '\p{L}',
			'c' => '\p{Cc}',
			'd' => '\p{Nd}',
			'l' => '\p{Ll}',
			'p' => '\p{P}',
			's' => '\p{Xps}',
			'u' => '\p{Lu}',
			'w' => '[\p{L}\p{Nd}]',
			'x' => "[$hex]",
			'z' => '\0',

			// These *must* be the inverse of the above
			'A' => '\P{L}',
			'C' => '\P{Cc}',
			'D' => '\P{Nd}',
			'L' => '\P{Ll}',
			'P' => '\P{P}',
			'S' => '\P{Xps}',
			'U' => '\P{Lu}',
			'W' => '[^\p{L}\p{Nd}]',
			'X' => "[^$hex]",
			'Z' => '[^\0]',
		];
		// Variants usable inside a bracketed set, where a nested "[...]"
		// cannot be used. Entries not overridden here fall back to $charsets.
		$brcharsets = [
			'w' => '\p{L}\p{Nd}',
			'x' => $hex,

			// Negated sets that are not expressible as a simple \P{} are
			// unfortunately complicated.

			// Xan is L plus N, so ^Xan plus Nl plus No is anything that's not L or Nd
			'W' => '\P{Xan}\p{Nl}\p{No}',

			// Manually constructed. Fun.
			'X' => '\x00-\x2f\x3a-\x40\x47-\x60\x67-\x{ff0f}'
				. '\x{ff1a}-\x{ff20}\x{ff27}-\x{ff40}\x{ff47}-\x{10ffff}',

			// Ha!
			'Z' => '\x01-\x{10ffff}',
		] + $charsets;
	}

	$re = '/';
	$len = count( $pat );
	// Capture number => whether it is a position capture "()"
	$capt = [];
	$anypos = false;
	// Capture number => 1-based pattern position of its open-paren
	$captparen = [];
	// Stack of capture numbers that are currently open
	$opencapt = [];
	// Number of recursive %b groups emitted so far
	$bct = 0;

	for ( $i = 0; $i < $len; $i++ ) {
		// 1-based position of the current item, for error messages
		$ii = $i + 1;
		// Whether the item just emitted may be followed by a quantifier
		$q = false;
		switch ( $pat[$i] ) {
			case '^':
				// Only a leading "^" anchors; elsewhere it is a literal
				// (and therefore quantifiable, since $i is then non-zero).
				$q = $i;
				$re .= ( $anchor === false || $q ) ? '\\^' : $anchor;
				break;

			case '$':
				// Only a trailing "$" anchors; elsewhere it is a literal.
				$q = ( $i < $len - 1 );
				$re .= $q ? '\\$' : '$';
				break;

			case '(':
				if ( $i + 1 >= $len ) {
					throw new LuaError( "Unmatched open-paren at pattern character $ii" );
				}
				$n = count( $capt ) + 1;
				// "()" is a position capture
				$capt[$n] = ( $pat[$i + 1] === ')' );
				if ( $capt[$n] ) {
					$anypos = true;
				}
				$re .= "(?<m$n>";
				$opencapt[] = $n;
				$captparen[$n] = $ii;
				break;

			case ')':
				if ( count( $opencapt ) <= 0 ) {
					throw new LuaError( "Unmatched close-paren at pattern character $ii" );
				}
				array_pop( $opencapt );
				$re .= $pat[$i];
				break;

			case '%':
				$i++;
				if ( $i >= $len ) {
					throw new LuaError( "malformed pattern (ends with '%')" );
				}
				if ( isset( $charsets[$pat[$i]] ) ) {
					$re .= $charsets[$pat[$i]];
					$q = true;
				} elseif ( $pat[$i] === 'b' ) {
					// %bxy: balanced match between delimiters x and y
					if ( $i + 2 >= $len ) {
						throw new LuaError( "malformed pattern (missing arguments to '%b')" );
					}
					$d1 = preg_quote( $pat[++$i], '/' );
					$d2 = preg_quote( $pat[++$i], '/' );
					if ( $d1 === $d2 ) {
						$re .= "{$d1}[^$d1]*$d1";
					} else {
						$bct++;
						$re .= "(?<b$bct>$d1(?:(?>[^$d1$d2]+)|(?P>b$bct))*$d2)";
					}
				} elseif ( $pat[$i] === 'f' ) {
					// %f[set]: frontier pattern
					if ( $i + 1 >= $len || $pat[++$i] !== '[' ) {
						throw new LuaError( "missing '[' after %f in pattern at pattern character $ii" );
					}
					[ $i, $re2 ] = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets );
					// Because %f considers the beginning and end of the string
					// to be \0, determine if $re2 matches that and take it
					// into account with "^" and "$".
					// @phan-suppress-next-line PhanParamSuspiciousOrder
					if ( preg_match( "/$re2/us", "\0" ) ) {
						$re .= "(?<!^)(?<!$re2)(?=$re2|$)";
					} else {
						$re .= "(?<!$re2)(?=$re2)";
					}
				} elseif ( $pat[$i] >= '0' && $pat[$i] <= '9' ) {
					// %1-%9: back-reference to a closed capture
					$n = ord( $pat[$i] ) - 0x30;
					if ( $n === 0 || $n > count( $capt ) || in_array( $n, $opencapt, true ) ) {
						throw new LuaError( "invalid capture index %$n at pattern character $ii" );
					}
					$re .= "\\g{m$n}";
				} else {
					// Any other escaped character is a literal
					$re .= preg_quote( $pat[$i], '/' );
					$q = true;
				}
				break;

			case '[':
				[ $i, $re2 ] = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets );
				$re .= $re2;
				$q = true;
				break;

			case ']':
				throw new LuaError( "Unmatched close-bracket at pattern character $ii" );

			case '.':
				$re .= $pat[$i];
				$q = true;
				break;

			default:
				$re .= preg_quote( $pat[$i], '/' );
				$q = true;
				break;
		}
		if ( $q && $i + 1 < $len ) {
			switch ( $pat[$i + 1] ) {
				case '*':
				case '+':
				case '?':
					$re .= $pat[++$i];
					break;
				case '-':
					// Lua's "-" is the non-greedy form of "*"
					$re .= '*?';
					$i++;
					break;
			}
		}
	}
	if ( count( $opencapt ) ) {
		$ii = $captparen[$opencapt[0]];
		throw new LuaError( "Unclosed capture beginning at pattern character $ii" );
	}
	$re .= '/us';

	$result = [ $re, $capt, $anypos ];
	$this->patternRegexCache->set( $cacheKey, $result );
	return $result;
}
565 | |
/**
 * Convert a Lua pattern bracketed character set to a PCRE regex fragment
 *
 * @param string[] $pat Pattern being processed, split into individual characters.
 * @param int $i Offset of the start of the bracketed character set in $pat.
 * @param int $len Length of $pat.
 * @param array $brcharsets Mapping from Lua pattern percent escapes to
 *  regex-style character ranges.
 * @return array [ int $new_i, string $re_fragment ], where $new_i is the
 *  offset of the closing bracket
 * @throws LuaError If the set has no closing bracket
 */
private function bracketedCharSetToRegex( $pat, $i, $len, $brcharsets ) {
	// 1-based position of the opening bracket, for the error message
	$ii = $i + 1;
	$re = '[';
	$i++;
	if ( $i < $len && $pat[$i] === '^' ) {
		$re .= '^';
		$i++;
	}

	// A "]" in the very first position of the set is a literal member
	// rather than the terminator.
	$firstMember = $i;
	while ( $i < $len && ( $i == $firstMember || $pat[$i] !== ']' ) ) {
		$ch = $pat[$i];
		if ( $ch === '%' ) {
			// Percent escape: a character class or a quoted literal
			$i++;
			if ( $i >= $len ) {
				break;
			}
			$escaped = $pat[$i];
			$re .= $brcharsets[$escaped] ?? preg_quote( $escaped, '/' );
		} elseif ( $i + 2 < $len &&
			$pat[$i + 1] === '-' && $pat[$i + 2] !== ']' && $pat[$i + 2] !== '%'
		) {
			// Range "a-z"; a reversed range is dropped entirely
			$upper = $pat[$i + 2];
			if ( $ch <= $upper ) {
				$re .= preg_quote( $ch, '/' ) . '-' . preg_quote( $upper, '/' );
			}
			$i += 2;
		} else {
			$re .= preg_quote( $ch, '/' );
		}
		$i++;
	}

	if ( $i >= $len ) {
		throw new LuaError(
			"Missing close-bracket for character set beginning at pattern character $ii"
		);
	}
	$re .= ']';

	// Lua just ignores invalid ranges, while pcre throws an error.
	// We filter them out above, but then we need to special-case empty sets
	switch ( $re ) {
		case '[]':
			// Can't directly quantify (*FAIL), so wrap it.
			// "(?!)" would be simpler and could be quantified if not for a bug in PCRE 8.13 to 8.33
			$re = '(?:(*FAIL))';
			break;
		case '[^]':
			// 's' modifier is always used, so this works
			$re = '.';
			break;
	}

	return [ $i, $re ];
}
621 | |
/**
 * Append captured groups to a result array
 *
 * Position captures are appended as 1-based character offsets into $s;
 * ordinary captures are appended as the captured text.
 *
 * @param array $arr Result array to append to.
 * @param string $s String matched against.
 * @param array $m Matches, from preg_match with PREG_OFFSET_CAPTURE.
 * @param array $capt Capture groups (in $m) to process, see patternToRegex()
 * @param bool $m0_if_no_captures Whether to append "$0" if $capt is empty.
 * @return array
 */
private function addCapturesFromMatch( $arr, $s, $m, $capt, $m0_if_no_captures ) {
	if ( !count( $capt ) ) {
		if ( $m0_if_no_captures ) {
			$arr[] = $m[0][0];
		}
		return $arr;
	}

	foreach ( $capt as $n => $isPosition ) {
		$group = "m$n";
		if ( $isPosition ) {
			// Convert the byte offset of the capture into a character offset
			$before = substr( $s, 0, $m[$group][1] );
			$arr[] = mb_strlen( $before, 'UTF-8' ) + 1;
		} else {
			$arr[] = $m[$group][0];
		}
	}
	return $arr;
}
646 | |
/**
 * Handler for find
 *
 * @internal
 * @param string $s
 * @param string $pattern
 * @param int $init 1-based character position to start searching from;
 *  negative values count from the end
 * @param bool $plain Whether $pattern is a plain substring rather than a
 *  Lua pattern
 * @return array Format is [ null ], or [ int, int ], or [ int, int, (string|int)... ]
 */
public function ustringFind( $s, $pattern, $init = 1, $plain = false ) {
	$this->checkString( 'find', $s );
	$this->checkTypeOptional( 'find', 3, $init, 'number', 1 );
	$this->checkTypeOptional( 'find', 4, $plain, 'boolean', false );

	$len = mb_strlen( $s, 'UTF-8' );
	if ( $init < 0 ) {
		$init = $len + $init + 1;
	} elseif ( $init > $len + 1 ) {
		$init = $len + 1;
	}
	// Anything that is not past the first character starts at the beginning.
	$init = $init > 1 ? $init : 1;

	if ( $plain ) {
		$this->checkPattern( 'find', $pattern );
		if ( $pattern !== '' ) {
			$ret = mb_strpos( $s, $pattern, $init - 1, 'UTF-8' );
		} else {
			// The empty string is found immediately at the start position
			$ret = $init - 1;
		}
		if ( $ret === false ) {
			return [ null ];
		}
		// Measure the pattern explicitly as UTF-8, like every other mb_*
		// call here, instead of relying on mb_internal_encoding().
		return [ $ret + 1, $ret + mb_strlen( $pattern, 'UTF-8' ) ];
	}

	// preg_match() needs a byte offset; only the pattern branch uses it.
	$offset = $init > 1 ? strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) ) : 0;

	[ $re, $capt ] = $this->patternToRegex( $pattern, '\G', 'find' );
	if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $offset ) ) {
		return [ null ];
	}
	// Convert the byte offset of the match into a character offset
	$o = mb_strlen( substr( $s, 0, $m[0][1] ), 'UTF-8' );
	$ret = [ $o + 1, $o + mb_strlen( $m[0][0], 'UTF-8' ) ];
	return $this->addCapturesFromMatch( $ret, $s, $m, $capt, false );
}
697 | |
/**
 * Handler for match
 *
 * @internal
 * @param string $s
 * @param string $pattern
 * @param int $init 1-based character position to start matching from;
 *  negative values count from the end
 * @return array Format is [ null ] or [ (string|int)... ]
 */
public function ustringMatch( $s, $pattern, $init = 1 ) {
	$this->checkString( 'match', $s );
	$this->checkTypeOptional( 'match', 3, $init, 'number', 1 );

	$charCount = mb_strlen( $s, 'UTF-8' );
	if ( $init < 0 ) {
		$init = $charCount + $init + 1;
	} elseif ( $init > $charCount + 1 ) {
		$init = $charCount + 1;
	}

	// preg_match() needs a byte offset rather than a character position.
	$byteOffset = $init > 1
		? strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) )
		: 0;

	[ $re, $capt ] = $this->patternToRegex( $pattern, '\G', 'match' );
	$found = preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $byteOffset );
	if ( $found ) {
		return $this->addCapturesFromMatch( [], $s, $m, $capt, true );
	}
	return [ null ];
}
728 | |
/**
 * Handler for gmatchInit
 *
 * Validates the subject string and translates the pattern into an
 * unanchored regex.
 *
 * @internal
 * @param string $s
 * @param string $pattern
 * @return array Format is [ string, bool[] ]
 */
public function ustringGmatchInit( $s, $pattern ) {
	$this->checkString( 'gmatch', $s );

	$compiled = $this->patternToRegex( $pattern, false, 'gmatch' );
	// Only the regex and the capture definitions are returned.
	return [ $compiled[0], $compiled[1] ];
}
742 | |
/**
 * Handler for gmatchCallback
 *
 * @internal
 * @param string $s
 * @param string $re
 * @param bool[] $capt
 * @param int $pos Byte offset to start matching at
 * @return array Format is [ int, [ null, (string|int)... ] ]; the second
 *  element is an empty array when nothing matched
 */
public function ustringGmatchCallback( $s, $re, $capt, $pos ) {
	if ( preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $pos ) ) {
		// The next position is the byte just past the end of this match.
		$matchStart = $m[0][1];
		$matchBytes = strlen( $m[0][0] );
		$captures = $this->addCapturesFromMatch( [ null ], $s, $m, $capt, true );
		return [ $matchStart + $matchBytes, $captures ];
	}
	return [ $pos, [] ];
}
759 | |
/**
 * Handler for gsub
 *
 * @internal
 * @param string $s
 * @param string $pattern
 * @param mixed $repl Replacement: a string or number (with %0-%9 and %%
 *  escapes), a table looked up by the first capture, or a function called
 *  with the captures
 * @param string|int|null $n Maximum number of replacements, or null for
 *  no limit
 * @return array Format is [ string, int ]
 * @throws LuaError On an invalid replacement or a PCRE failure
 */
public function ustringGsub( $s, $pattern, $repl, $n = null ) {
	$this->checkString( 'gsub', $s );
	$this->checkTypeOptional( 'gsub', 4, $n, 'number', null );

	if ( $n === null ) {
		$n = -1;
	} elseif ( $n < 1 ) {
		return [ $s, 0 ];
	}

	[ $re, $capt, $anypos ] = $this->patternToRegex( $pattern, '^', 'gsub' );
	// Pre-computed capture sets, consumed in order by the callbacks when
	// the pattern contains position captures.
	$captures = [];

	if ( $this->phpBug53823 ) {
		// PHP bug 53823 means that a zero-length match before a UTF-8
		// character will match again before every byte of that character.
		// The workaround is to capture the first "character" of/after the
		// match and verify that its first byte is legal to start a UTF-8
		// character.
		$re = '/(?=(?<phpBug53823>.|$))' . substr( $re, 1 );
	}

	if ( $anypos ) {
		// preg_replace_callback doesn't take a "flags" argument, so we
		// can't pass PREG_OFFSET_CAPTURE to it, which is needed to handle
		// position captures. So instead we have to do a preg_match_all and
		// handle the captures ourselves.
		$ct = preg_match_all( $re, $s, $mm, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );
		if ( $ct === false ) {
			// Without this the callbacks below would run with no
			// pre-computed captures at all.
			$this->handlePCREError( preg_last_error(), $pattern );
			$ct = 0;
		}
		for ( $i = 0; $i < $ct; $i++ ) {
			$m = $mm[$i];
			if ( $this->phpBug53823 ) {
				// Skip bogus matches that start on a continuation byte
				$c = ord( $m['phpBug53823'][0] );
				if ( $c >= 0x80 && $c <= 0xbf ) {
					continue;
				}
			}
			$c = [ $m[0][0] ];
			foreach ( $this->addCapturesFromMatch( [], $s, $m, $capt, false ) as $k => $v ) {
				$k++;
				$c["m$k"] = $v;
			}
			$captures[] = $c;
			if ( $n >= 0 && count( $captures ) >= $n ) {
				break;
			}
		}
	}

	switch ( $this->getLuaType( $repl ) ) {
		case 'string':
		case 'number':
			$cb = static function ( $m ) use ( $repl, $anypos, &$captures ) {
				if ( $anypos ) {
					$m = array_shift( $captures );
				}
				return preg_replace_callback( '/%([%0-9])/', static function ( $m2 ) use ( $m ) {
					$x = $m2[1];
					if ( $x === '%' ) {
						return '%';
					} elseif ( $x === '0' ) {
						return $m[0];
					} elseif ( isset( $m["m$x"] ) ) {
						return $m["m$x"];
					} elseif ( $x === '1' ) {
						// Match undocumented Lua string.gsub behavior
						return $m[0];
					} else {
						throw new LuaError( "invalid capture index %$x in replacement string" );
					}
				}, $repl );
			};
			break;

		case 'table':
			$cb = function ( $m ) use ( $repl, $anypos, &$captures ) {
				if ( $anypos ) {
					$m = array_shift( $captures );
				}
				// The lookup key is the first capture, or the whole match
				$x = $m['m1'] ?? $m[0];
				// As in Lua's string.gsub, a false or nil value means
				// "no replacement": the original match is kept.
				if ( !isset( $repl[$x] ) || $repl[$x] === false ) {
					return $m[0];
				}
				$type = $this->getLuaType( $repl[$x] );
				if ( $type !== 'string' && $type !== 'number' ) {
					throw new LuaError( "invalid replacement value (a $type)" );
				}
				return $repl[$x];
			};
			break;

		case 'function':
			$interpreter = $this->getInterpreter();
			$cb = function ( $m ) use ( $interpreter, $capt, $repl, $anypos, &$captures ) {
				if ( $anypos ) {
					$m = array_shift( $captures );
				}
				$args = [];
				if ( count( $capt ) ) {
					foreach ( $capt as $i => $pos ) {
						// @phan-suppress-next-line PhanTypeArraySuspiciousNullable
						$args[] = $m["m$i"];
					}
				} else {
					$args[] = $m[0];
				}
				$ret = $interpreter->callFunction( $repl, ...$args );
				// As in Lua's string.gsub, a false or nil result means
				// "no replacement": the original match is kept.
				if ( count( $ret ) === 0 || $ret[0] === null || $ret[0] === false ) {
					return $m[0];
				}
				$type = $this->getLuaType( $ret[0] );
				if ( $type !== 'string' && $type !== 'number' ) {
					throw new LuaError( "invalid replacement value (a $type)" );
				}
				return $ret[0];
			};
			break;

		default:
			$this->checkType( 'gsub', 3, $repl, 'function or table or string' );
			throw new LogicException( 'checkType above should have failed' );
	}

	$skippedMatches = 0;
	if ( $this->phpBug53823 ) {
		// Since we're having bogus matches, we need to keep track of the
		// necessary adjustment and stop manually once we hit the limit.
		$maxMatches = $n < 0 ? INF : $n;
		$n = -1;
		$realCallback = $cb;
		$cb = static function ( $m ) use ( $realCallback, &$skippedMatches, &$maxMatches ) {
			$c = ord( $m['phpBug53823'] );
			if ( ( $c >= 0x80 && $c <= 0xbf ) || $maxMatches <= 0 ) {
				// Leave the text untouched, and remember not to count it
				$skippedMatches++;
				return $m[0];
			} else {
				$maxMatches--;
				return $realCallback( $m );
			}
		};
	}

	$count = 0;
	$s2 = preg_replace_callback( $re, $cb, $s, $n, $count );
	if ( $s2 === null ) {
		$this->handlePCREError( preg_last_error(), $pattern );
	}
	return [ $s2, $count - $skippedMatches ];
}
917 | |
/**
 * Handle a PCRE error
 *
 * Translates a PCRE error code into a LuaError. Returns normally only when
 * the code is PREG_NO_ERROR.
 *
 * @param int $error From preg_last_error()
 * @param string $pattern Pattern being matched
 * @throws LuaError
 */
private function handlePCREError( $error, $pattern ) {
	// Use the code supplied by the caller rather than re-reading
	// preg_last_error(), so the parameter is actually honoured.
	switch ( $error ) {
		case PREG_NO_ERROR:
			// Huh?
			break;
		case PREG_INTERNAL_ERROR:
			throw new LuaError( "PCRE internal error" );
		case PREG_BACKTRACK_LIMIT_ERROR:
			throw new LuaError(
				"PCRE backtrack limit reached while matching pattern '$pattern'"
			);
		case PREG_RECURSION_LIMIT_ERROR:
			throw new LuaError(
				"PCRE recursion limit reached while matching pattern '$pattern'"
			);
		case PREG_BAD_UTF8_ERROR:
			// Should have already been caught, but just in case
			throw new LuaError( "PCRE bad UTF-8 error" );
		case PREG_BAD_UTF8_OFFSET_ERROR:
			// Shouldn't happen, but just in case
			throw new LuaError( "PCRE bad UTF-8 offset error" );
		// PREG_JIT_STACKLIMIT_ERROR exists since PHP 7.0, which this file
		// already requires (it uses short array destructuring), so no
		// defined() fallback is needed.
		case PREG_JIT_STACKLIMIT_ERROR:
			throw new LuaError(
				"PCRE JIT stack limit reached while matching pattern '$pattern'"
			);
		default:
			throw new LuaError(
				"PCRE error code $error while matching pattern '$pattern'"
			);
	}
}
960 | } |