MediaWiki REL1_34
UstringLibrary.php
Go to the documentation of this file.
1<?php
2
3use UtfNormal\Validator;
4
/**
 * Limit on pattern lengths, in bytes not characters.
 * @var int
 */
private $patternLengthLimit = 10000;

/**
 * Limit on string lengths, in bytes not characters.
 * If null, the constructor replaces it with $wgMaxArticleSize * 1024.
 * @var int|null
 */
private $stringLengthLimit;

/**
 * Whether this PHP exhibits the empty-match bug that the constructor probes
 * for (a zero-length match being reported before every byte of a multibyte
 * character).
 * @var bool
 */
private $phpBug53823 = false;

/**
 * A cache of patterns and the regexes they generate.
 * @var MapCacheLRU|null
 */
private $patternRegexCache;
31
/**
 * Set up the length limit, the PHP bug probe and the pattern cache, then
 * hand over to the parent constructor.
 *
 * @param Scribunto_LuaEngine $engine
 */
public function __construct( $engine ) {
	// A preset limit wins; otherwise derive it from the article size limit
	// (which is configured in kilobytes).
	if ( $this->stringLengthLimit === null ) {
		global $wgMaxArticleSize;
		$this->stringLengthLimit = $wgMaxArticleSize * 1024;
	}

	// Probe for PHP bug 53823: an affected PHP reports an empty match
	// between the two bytes of U+00E1.
	$probe = preg_replace( '//us', 'x', "\xc3\xa1" );
	$this->phpBug53823 = ( $probe === "x\xc3x\xa1x" );

	$this->patternRegexCache = new MapCacheLRU( 100 );

	parent::__construct( $engine );
}
43
/**
 * Register the PHP callbacks behind mw.ustring.lua with the engine.
 *
 * When PHP calls require serialization only the pattern-matching functions
 * are exported; otherwise the full set is.
 *
 * @return mixed Whatever the engine's registerInterface() returns
 */
public function register() {
	// Pattern matching is still much faster in PHP, even with the
	// overhead of serialization, so these are exported in both modes.
	$patternFunctions = [
		'find' => [ $this, 'ustringFind' ],
		'match' => [ $this, 'ustringMatch' ],
		'gmatch_init' => [ $this, 'ustringGmatchInit' ],
		'gmatch_callback' => [ $this, 'ustringGmatchCallback' ],
		'gsub' => [ $this, 'ustringGsub' ],
	];

	$perf = $this->getEngine()->getPerformanceCharacteristics();
	if ( $perf['phpCallsRequireSerialization'] ) {
		$lib = $patternFunctions;
	} else {
		// Array union keeps these keys first and appends the pattern functions.
		$lib = [
			'isutf8' => [ $this, 'ustringIsUtf8' ],
			'byteoffset' => [ $this, 'ustringByteoffset' ],
			'codepoint' => [ $this, 'ustringCodepoint' ],
			'gcodepoint_init' => [ $this, 'ustringGcodepointInit' ],
			'toNFC' => [ $this, 'ustringToNFC' ],
			'toNFD' => [ $this, 'ustringToNFD' ],
			'toNFKC' => [ $this, 'ustringToNFKC' ],
			'toNFKD' => [ $this, 'ustringToNFKD' ],
			'char' => [ $this, 'ustringChar' ],
			'len' => [ $this, 'ustringLen' ],
			'sub' => [ $this, 'ustringSub' ],
			'upper' => [ $this, 'ustringUpper' ],
			'lower' => [ $this, 'ustringLower' ],
		] + $patternFunctions;
	}

	$setupOptions = [
		'stringLengthLimit' => $this->stringLengthLimit,
		'patternLengthLimit' => $this->patternLengthLimit,
	];
	return $this->getEngine()->registerInterface( 'mw.ustring.lua', $lib, $setupOptions );
}
84
/**
 * Validate the first ("string") argument of a ustring function.
 *
 * Lua numbers are accepted without further checks. Anything else must be a
 * string, optionally valid UTF-8, and no longer than the string length limit.
 *
 * @param string $name Lua function name, used in error messages
 * @param mixed $s Value to check
 * @param bool $checkEncoding Whether to reject strings that are not valid UTF-8
 * @throws Scribunto_LuaError If the encoding or length check fails
 */
private function checkString( $name, $s, $checkEncoding = true ) {
	if ( $this->getLuaType( $s ) == 'number' ) {
		// Numbers are acceptable as-is; there is nothing further to validate.
		return;
	}

	$this->checkType( $name, 1, $s, 'string' );

	if ( $checkEncoding && !mb_check_encoding( $s, 'UTF-8' ) ) {
		throw new Scribunto_LuaError( "bad argument #1 to '$name' (string is not UTF-8)" );
	}

	// The limit is in bytes, hence strlen() rather than mb_strlen().
	if ( strlen( $s ) > $this->stringLengthLimit ) {
		throw new Scribunto_LuaError(
			"bad argument #1 to '$name' (string is longer than $this->stringLengthLimit bytes)"
		);
	}
}
100
/**
 * Handler for isutf8.
 *
 * @param string $s
 * @return bool[] Single-element list: whether $s is valid UTF-8
 */
public function ustringIsUtf8( $s ) {
	// Encoding check is disabled here because reporting it is the whole point.
	$this->checkString( 'isutf8', $s, false );
	$isValid = mb_check_encoding( $s, 'UTF-8' );
	return [ $isValid ];
}
111
/**
 * Handler for byteoffset.
 *
 * @param string $s
 * @param int $l Character offset relative to the character containing byte $i
 * @param int $i 1-based byte position; negative values count from the end
 * @return array Single-element list: the 1-based byte offset of the target
 *  character, or null when the position falls outside the string
 */
public function ustringByteoffset( $s, $l = 1, $i = 1 ) {
	$this->checkString( 'byteoffset', $s );
	$this->checkTypeOptional( 'byteoffset', 2, $l, 'number', 1 );
	$this->checkTypeOptional( 'byteoffset', 3, $i, 'number', 1 );

	$byteCount = strlen( $s );
	if ( $i < 0 ) {
		$i = $byteCount + $i + 1;
	}
	if ( $i < 1 || $i > $byteCount ) {
		return [ null ];
	}

	// Switch to a 0-based index and back up over continuation bytes
	// (10xxxxxx) to the first byte of the character.
	$i--;
	$requested = $i;
	while ( ( ord( $s[$i] ) & 0xc0 ) === 0x80 ) {
		$i--;
	}
	// If $i already pointed at the start of a character, a positive $l
	// counts that character as the first one.
	if ( $l > 0 && $requested === $i ) {
		$l--;
	}

	$targetChar = mb_strlen( substr( $s, 0, $i ), 'UTF-8' ) + $l;
	if ( $targetChar < 0 || $targetChar >= mb_strlen( $s, 'UTF-8' ) ) {
		return [ null ];
	}
	$bytesBefore = strlen( mb_substr( $s, 0, $targetChar, 'UTF-8' ) );
	return [ $bytesBefore + 1 ];
}
147
/**
 * Handler for codepoint.
 *
 * @param string $s
 * @param int $i 1-based index of the first character; negative counts from the end
 * @param int|null $j 1-based index of the last character; defaults to $i
 * @return int[] Code points of characters $i..$j, as returned by unpack()
 *  (keys start at 1); an empty array when $j < $i
 */
public function ustringCodepoint( $s, $i = 1, $j = null ) {
	$this->checkString( 'codepoint', $s );
	$this->checkTypeOptional( 'codepoint', 2, $i, 'number', 1 );
	$this->checkTypeOptional( 'codepoint', 3, $j, 'number', $i );

	$charCount = mb_strlen( $s, 'UTF-8' );
	if ( $i < 0 ) {
		$i = $charCount + $i + 1;
	}
	if ( $j < 0 ) {
		$j = $charCount + $j + 1;
	}
	if ( $j < $i ) {
		return [];
	}

	// Clamp both ends into 1 .. length + 1 before slicing.
	$first = max( 1, min( $i, $charCount + 1 ) );
	$last = max( 1, min( $j, $charCount + 1 ) );
	$slice = mb_substr( $s, $first - 1, $last - $first + 1, 'UTF-8' );

	// UTF-32BE is exactly one big-endian 32-bit integer per code point.
	$utf32 = mb_convert_encoding( $slice, 'UTF-32BE', 'UTF-8' );
	return unpack( 'N*', $utf32 );
}
176
/**
 * Handler for gcodepointInit.
 *
 * @param string $s
 * @param int $i
 * @param int|null $j
 * @return array Single-element list holding the result of ustringCodepoint()
 */
public function ustringGcodepointInit( $s, $i = 1, $j = null ) {
	$codepoints = $this->ustringCodepoint( $s, $i, $j );
	return [ $codepoints ];
}
188
/**
 * Handler for toNFC.
 *
 * @param string $s
 * @return array Single-element list: Validator::toNFC( $s ), or null when
 *  $s is not valid UTF-8
 */
public function ustringToNFC( $s ) {
	$this->checkString( 'toNFC', $s, false );
	$normalized = mb_check_encoding( $s, 'UTF-8' ) ? Validator::toNFC( $s ) : null;
	return [ $normalized ];
}
202
/**
 * Handler for toNFD.
 *
 * @param string $s
 * @return array Single-element list: Validator::toNFD( $s ), or null when
 *  $s is not valid UTF-8
 */
public function ustringToNFD( $s ) {
	$this->checkString( 'toNFD', $s, false );
	$normalized = mb_check_encoding( $s, 'UTF-8' ) ? Validator::toNFD( $s ) : null;
	return [ $normalized ];
}
216
/**
 * Handler for toNFKC.
 *
 * @param string $s
 * @return array Single-element list: Validator::toNFKC( $s ), or null when
 *  $s is not valid UTF-8
 */
public function ustringToNFKC( $s ) {
	$this->checkString( 'toNFKC', $s, false );
	$normalized = mb_check_encoding( $s, 'UTF-8' ) ? Validator::toNFKC( $s ) : null;
	return [ $normalized ];
}
230
/**
 * Handler for toNFKD.
 *
 * @param string $s
 * @return array Single-element list: Validator::toNFKD( $s ), or null when
 *  $s is not valid UTF-8
 */
public function ustringToNFKD( $s ) {
	$this->checkString( 'toNFKD', $s, false );
	$normalized = mb_check_encoding( $s, 'UTF-8' ) ? Validator::toNFKD( $s ) : null;
	return [ $normalized ];
}
244
/**
 * Handler for char.
 *
 * Takes any number of code points (as variadic arguments) and returns the
 * UTF-8 string they encode.
 *
 * @return string[] Single-element list holding the resulting string
 * @throws Scribunto_LuaError On too many arguments, a code point outside
 *  0..0x10ffff, or a result longer than the string length limit
 */
public function ustringChar() {
	$args = func_get_args();
	if ( count( $args ) > $this->stringLengthLimit ) {
		throw new Scribunto_LuaError( "too many arguments to 'char'" );
	}

	$codepoints = [];
	foreach ( $args as $k => $v ) {
		if ( !is_numeric( $v ) ) {
			$this->checkType( 'char', $k + 1, $v, 'number' );
		}
		// Range-check the float *before* casting: (int)INF and (int)NAN
		// are 0 on PHP 7+, which would otherwise silently turn an
		// out-of-range value into U+0000.
		$v = floor( $v );
		if ( !( $v >= 0 && $v <= 0x10ffff ) ) {
			$argIdx = $k + 1;
			throw new Scribunto_LuaError( "bad argument #$argIdx to 'char' (value out of range)" );
		}
		$codepoints[] = (int)$v;
	}

	// Pack as big-endian 32-bit integers (i.e. UTF-32BE), then transcode.
	$s = pack( 'N*', ...$codepoints );
	$s = mb_convert_encoding( $s, 'UTF-8', 'UTF-32BE' );
	if ( strlen( $s ) > $this->stringLengthLimit ) {
		throw new Scribunto_LuaError( "result too long for 'char'" );
	}
	return [ $s ];
}
272
/**
 * Handler for len.
 *
 * @param string $s
 * @return array Single-element list: the length of $s in characters, or
 *  null when $s is not valid UTF-8
 */
public function ustringLen( $s ) {
	$this->checkString( 'len', $s, false );
	$length = mb_check_encoding( $s, 'UTF-8' ) ? mb_strlen( $s, 'UTF-8' ) : null;
	return [ $length ];
}
286
/**
 * Handler for sub.
 *
 * @param string $s
 * @param int $i 1-based index of the first character; negative counts from the end
 * @param int $j 1-based index of the last character; negative counts from the end
 * @return string[] Single-element list holding characters $i..$j of $s
 */
public function ustringSub( $s, $i = 1, $j = -1 ) {
	$this->checkString( 'sub', $s );
	$this->checkTypeOptional( 'sub', 2, $i, 'number', 1 );
	$this->checkTypeOptional( 'sub', 3, $j, 'number', -1 );

	$charCount = mb_strlen( $s, 'UTF-8' );
	if ( $i < 0 ) {
		$i = $charCount + $i + 1;
	}
	if ( $j < 0 ) {
		$j = $charCount + $j + 1;
	}
	if ( $j < $i ) {
		return [ '' ];
	}

	// Clamp both ends into 1 .. length + 1 before slicing.
	$first = max( 1, min( $i, $charCount + 1 ) );
	$last = max( 1, min( $j, $charCount + 1 ) );
	return [ mb_substr( $s, $first - 1, $last - $first + 1, 'UTF-8' ) ];
}
315
/**
 * Handler for upper.
 *
 * @param string $s
 * @return string[] Single-element list: $s converted with mb_strtoupper()
 */
public function ustringUpper( $s ) {
	$this->checkString( 'upper', $s );
	$upper = mb_strtoupper( $s, 'UTF-8' );
	return [ $upper ];
}
326
/**
 * Handler for lower.
 *
 * @param string $s
 * @return string[] Single-element list: $s converted with mb_strtolower()
 */
public function ustringLower( $s ) {
	$this->checkString( 'lower', $s );
	$lower = mb_strtolower( $s, 'UTF-8' );
	return [ $lower ];
}
337
/**
 * Validate the second ("pattern") argument of a ustring function.
 *
 * @param string $name Lua function name, used in error messages
 * @param mixed $pattern Value to check; Lua numbers are treated as strings
 * @throws Scribunto_LuaError If the pattern is not valid UTF-8 or exceeds
 *  the pattern length limit
 */
private function checkPattern( $name, $pattern ) {
	// Numbers are stringified so that they pass the type check below.
	if ( $this->getLuaType( $pattern ) == 'number' ) {
		$pattern = (string)$pattern;
	}
	$this->checkType( $name, 2, $pattern, 'string' );

	$isUtf8 = mb_check_encoding( $pattern, 'UTF-8' );
	if ( !$isUtf8 ) {
		throw new Scribunto_LuaError( "bad argument #2 to '$name' (string is not UTF-8)" );
	}

	// The limit is in bytes, hence strlen() rather than mb_strlen().
	$byteLength = strlen( $pattern );
	if ( $byteLength > $this->patternLengthLimit ) {
		throw new Scribunto_LuaError(
			"bad argument #2 to '$name' (pattern is longer than $this->patternLengthLimit bytes)"
		);
	}
}
352
/**
 * Convert a Lua pattern into a PCRE regex.
 *
 * Results are memoised in $this->patternRegexCache, keyed on the pattern
 * and anchor.
 *
 * @param string $pattern Lua pattern
 * @param string|bool $anchor Regex fragment emitted for a '^' at the start of
 *  the pattern, or false to always treat '^' as a literal character
 * @param string $name Lua function name, used in error messages
 * @return array [ $re, $capt, $anypos ]: the regex, a map of capture number
 *  to "is a position capture", and whether any position capture exists
 * @throws Scribunto_LuaError On a malformed pattern
 */
private function patternToRegex( $pattern, $anchor, $name ) {
	$cacheKey = serialize( [ $pattern, $anchor ] );
	if ( !$this->patternRegexCache->has( $cacheKey ) ) {
		$this->checkPattern( $name, $pattern );
		// Split into an array of single UTF-8 characters.
		$pat = preg_split( '//us', $pattern, -1, PREG_SPLIT_NO_EMPTY );

		static $charsets = null, $brcharsets = null;
		if ( $charsets === null ) {
			// Hex digits: ASCII plus the fullwidth forms U+FF10-FF19,
			// U+FF21-FF26 and U+FF41-FF46. Written with escapes so the set
			// stays the exact inverse of the manually constructed 'X'
			// bracket set below.
			$hexDigits = '0-9A-Fa-f\x{ff10}-\x{ff19}\x{ff21}-\x{ff26}\x{ff41}-\x{ff46}';

			$charsets = [
				// If you change these, also change lualib/ustring/make-tables.php
				// (and run it to regenerate charsets.lua)
				'a' => '\p{L}',
				'c' => '\p{Cc}',
				'd' => '\p{Nd}',
				'l' => '\p{Ll}',
				'p' => '\p{P}',
				's' => '\p{Xps}',
				'u' => '\p{Lu}',
				'w' => '[\p{L}\p{Nd}]',
				'x' => '[' . $hexDigits . ']',
				'z' => '\0',

				// These *must* be the inverse of the above
				'A' => '\P{L}',
				'C' => '\P{Cc}',
				'D' => '\P{Nd}',
				'L' => '\P{Ll}',
				'P' => '\P{P}',
				'S' => '\P{Xps}',
				'U' => '\P{Lu}',
				'W' => '[^\p{L}\p{Nd}]',
				'X' => '[^' . $hexDigits . ']',
				'Z' => '[^\0]',
			];
			$brcharsets = [
				'w' => '\p{L}\p{Nd}',
				'x' => $hexDigits,

				// Negated sets that are not expressable as a simple \P{} are
				// unfortunately complicated.

				// Xan is L plus N, so ^Xan plus Nl plus No is anything that's not L or Nd
				'W' => '\P{Xan}\p{Nl}\p{No}',

				// Manually constructed. Fun.
				'X' => '\x00-\x2f\x3a-\x40\x47-\x60\x67-\x{ff0f}'
					. '\x{ff1a}-\x{ff20}\x{ff27}-\x{ff40}\x{ff47}-\x{10ffff}',

				// Ha!
				'Z' => '\x01-\x{10ffff}',
			] + $charsets;
		}

		$re = '/';
		$len = count( $pat );
		// Capture number => whether it is a position capture "()"
		$capt = [];
		$anypos = false;
		// Capture number => 1-based pattern position of its open-paren
		$captparen = [];
		// Stack of capture numbers that are currently open
		$opencapt = [];
		// Counter used to name the recursive groups generated for %b
		$bct = 0;

		for ( $i = 0; $i < $len; $i++ ) {
			// 1-based position, for error messages
			$ii = $i + 1;
			// Whether the item just emitted may be followed by a quantifier
			$q = false;
			switch ( $pat[$i] ) {
				case '^':
					// Only a '^' at index 0 is an anchor; elsewhere it is literal.
					$q = $i;
					$re .= ( $anchor === false || $q ) ? '\\^' : $anchor;
					break;

				case '$':
					// Only a '$' at the very end is an anchor.
					$q = ( $i < $len - 1 );
					$re .= $q ? '\\$' : '$';
					break;

				case '(':
					if ( $i + 1 >= $len ) {
						throw new Scribunto_LuaError( "Unmatched open-paren at pattern character $ii" );
					}
					$n = count( $capt ) + 1;
					$capt[$n] = ( $pat[$i + 1] === ')' );
					if ( $capt[$n] ) {
						$anypos = true;
					}
					$re .= "(?<m$n>";
					$opencapt[] = $n;
					$captparen[$n] = $ii;
					break;

				case ')':
					if ( count( $opencapt ) <= 0 ) {
						throw new Scribunto_LuaError( "Unmatched close-paren at pattern character $ii" );
					}
					array_pop( $opencapt );
					$re .= $pat[$i];
					break;

				case '%':
					$i++;
					if ( $i >= $len ) {
						throw new Scribunto_LuaError( "malformed pattern (ends with '%')" );
					}
					if ( isset( $charsets[$pat[$i]] ) ) {
						$re .= $charsets[$pat[$i]];
						$q = true;
					} elseif ( $pat[$i] === 'b' ) {
						// %bxy: balanced match between delimiters x and y
						if ( $i + 2 >= $len ) {
							throw new Scribunto_LuaError( "malformed pattern (missing arguments to '%b')" );
						}
						$d1 = preg_quote( $pat[++$i], '/' );
						$d2 = preg_quote( $pat[++$i], '/' );
						if ( $d1 === $d2 ) {
							$re .= "{$d1}[^$d1]*$d1";
						} else {
							$bct++;
							$re .= "(?<b$bct>$d1(?:(?>[^$d1$d2]+)|(?P>b$bct))*$d2)";
						}
					} elseif ( $pat[$i] === 'f' ) {
						// %f[set]: frontier pattern
						if ( $i + 1 >= $len || $pat[++$i] !== '[' ) {
							throw new Scribunto_LuaError( "missing '[' after %f in pattern at pattern character $ii" );
						}
						list( $i, $re2 ) = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets );
						// Because %f considers the beginning and end of the string
						// to be \0, determine if $re2 matches that and take it
						// into account with "^" and "$".
						// @phan-suppress-next-line PhanParamSuspiciousOrder
						if ( preg_match( "/$re2/us", "\0" ) ) {
							$re .= "(?<!^)(?<!$re2)(?=$re2|$)";
						} else {
							$re .= "(?<!$re2)(?=$re2)";
						}
					} elseif ( $pat[$i] >= '0' && $pat[$i] <= '9' ) {
						// %1 .. %9: back-reference to a closed capture
						$n = ord( $pat[$i] ) - 0x30;
						if ( $n === 0 || $n > count( $capt ) || in_array( $n, $opencapt, true ) ) {
							throw new Scribunto_LuaError( "invalid capture index %$n at pattern character $ii" );
						}
						$re .= "\\g{m$n}";
					} else {
						$re .= preg_quote( $pat[$i], '/' );
						$q = true;
					}
					break;

				case '[':
					list( $i, $re2 ) = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets );
					$re .= $re2;
					$q = true;
					break;

				case ']':
					throw new Scribunto_LuaError( "Unmatched close-bracket at pattern character $ii" );

				case '.':
					$re .= $pat[$i];
					$q = true;
					break;

				default:
					$re .= preg_quote( $pat[$i], '/' );
					$q = true;
					break;
			}
			if ( $q && $i + 1 < $len ) {
				switch ( $pat[$i + 1] ) {
					case '*':
					case '+':
					case '?':
						$re .= $pat[++$i];
						break;
					case '-':
						// Lua's '-' is the non-greedy form of '*'
						$re .= '*?';
						$i++;
						break;
				}
			}
		}
		if ( count( $opencapt ) ) {
			$ii = $captparen[$opencapt[0]];
			throw new Scribunto_LuaError( "Unclosed capture beginning at pattern character $ii" );
		}
		$re .= '/us';

		$this->patternRegexCache->set( $cacheKey, [ $re, $capt, $anypos ] );
	}
	return $this->patternRegexCache->get( $cacheKey );
}
540
/**
 * Convert a Lua bracketed character set ("[...]") into a PCRE character class.
 *
 * @param string[] $pat Pattern, split into characters
 * @param int $i Index of the opening '[' in $pat
 * @param int $len Number of entries in $pat
 * @param string[] $brcharsets Map of %-class letter to PCRE class contents
 * @return array [ index of the closing ']', regex fragment ]
 * @throws Scribunto_LuaError If the set is never closed
 */
private function bracketedCharSetToRegex( $pat, $i, $len, $brcharsets ) {
	// 1-based position of the '[', for the error message
	$openPos = $i + 1;
	$i++;

	$re = '[';
	if ( $i < $len && $pat[$i] === '^' ) {
		$re .= '^';
		$i++;
	}

	// A ']' in the very first position is a literal, not the terminator.
	$firstMember = $i;
	while ( $i < $len && ( $i == $firstMember || $pat[$i] !== ']' ) ) {
		$ch = $pat[$i];
		if ( $ch === '%' ) {
			$i++;
			if ( $i >= $len ) {
				// Dangling '%': reported below as a missing close-bracket.
				break;
			}
			$escaped = $pat[$i];
			$re .= $brcharsets[$escaped] ?? preg_quote( $escaped, '/' );
		} elseif ( $i + 2 < $len &&
			$pat[$i + 1] === '-' && $pat[$i + 2] !== ']' && $pat[$i + 2] !== '%'
		) {
			// Range "a-z"; reversed ranges are dropped (see below).
			$rangeEnd = $pat[$i + 2];
			if ( $ch <= $rangeEnd ) {
				$re .= preg_quote( $ch, '/' ) . '-' . preg_quote( $rangeEnd, '/' );
			}
			$i += 2;
		} else {
			$re .= preg_quote( $ch, '/' );
		}
		$i++;
	}

	if ( $i >= $len ) {
		throw new Scribunto_LuaError(
			"Missing close-bracket for character set beginning at pattern character $openPos"
		);
	}
	$re .= ']';

	// Lua just ignores invalid ranges, while pcre throws an error.
	// We filter them out above, but then we need to special-case empty sets
	switch ( $re ) {
		case '[]':
			// Can't directly quantify (*FAIL), so wrap it.
			// "(?!)" would be simpler and could be quantified if not for a bug in PCRE 8.13 to 8.33
			$re = '(?:(*FAIL))';
			break;
		case '[^]':
			// 's' modifier is always used, so this works
			$re = '.';
			break;
	}

	return [ $i, $re ];
}
590
/**
 * Append the captures of a single match to a result list.
 *
 * @param array $arr List to append to
 * @param string $s Subject string the match was made against
 * @param array $m Match array produced with PREG_OFFSET_CAPTURE
 * @param bool[] $capt Map of capture number to "is a position capture"
 * @param bool $m0_if_no_captures Whether to append the whole match when the
 *  pattern defines no captures
 * @return array $arr with one value per capture appended: the 1-based
 *  character position for position captures, the captured text otherwise
 */
private function addCapturesFromMatch( $arr, $s, $m, $capt, $m0_if_no_captures ) {
	if ( !count( $capt ) ) {
		if ( $m0_if_no_captures ) {
			$arr[] = $m[0][0];
		}
		return $arr;
	}

	foreach ( $capt as $n => $isPosition ) {
		$group = $m["m$n"];
		if ( $isPosition ) {
			// Convert the byte offset into a 1-based character position.
			$arr[] = mb_strlen( substr( $s, 0, $group[1] ), 'UTF-8' ) + 1;
		} else {
			$arr[] = $group[0];
		}
	}
	return $arr;
}
606
/**
 * Handler for find.
 *
 * @param string $s Subject string
 * @param string $pattern Lua pattern, or a literal string when $plain is true
 * @param int $init 1-based character position to start at; negative counts
 *  from the end
 * @param bool $plain Whether to do a plain substring search
 * @return array [ null ] when nothing is found; otherwise the 1-based start
 *  and end character positions, followed (for pattern searches) by captures
 */
public function ustringFind( $s, $pattern, $init = 1, $plain = false ) {
	$this->checkString( 'find', $s );
	$this->checkTypeOptional( 'find', 3, $init, 'number', 1 );
	$this->checkTypeOptional( 'find', 4, $plain, 'boolean', false );

	$len = mb_strlen( $s, 'UTF-8' );
	if ( $init < 0 ) {
		$init = $len + $init + 1;
	} elseif ( $init > $len + 1 ) {
		$init = $len + 1;
	}

	// Translate the character position into a byte offset for preg_match().
	if ( $init > 1 ) {
		$offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) );
	} else {
		$init = 1;
		$offset = 0;
	}

	if ( $plain ) {
		$this->checkPattern( 'find', $pattern );
		if ( $pattern !== '' ) {
			$ret = mb_strpos( $s, $pattern, $init - 1, 'UTF-8' );
		} else {
			// The empty string matches right at the starting position.
			$ret = $init - 1;
		}
		if ( $ret === false ) {
			return [ null ];
		}
		// Measure the needle explicitly as UTF-8 rather than relying on
		// mb_internal_encoding(), consistent with the mb_strpos() call above.
		return [ $ret + 1, $ret + mb_strlen( $pattern, 'UTF-8' ) ];
	}

	list( $re, $capt ) = $this->patternToRegex( $pattern, '\G', 'find' );
	if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $offset ) ) {
		return [ null ];
	}
	// Number of characters before the match
	$o = mb_strlen( substr( $s, 0, $m[0][1] ), 'UTF-8' );
	$ret = [ $o + 1, $o + mb_strlen( $m[0][0], 'UTF-8' ) ];
	return $this->addCapturesFromMatch( $ret, $s, $m, $capt, false );
}
657
/**
 * Handler for match.
 *
 * @param string $s Subject string
 * @param string $pattern Lua pattern
 * @param int $init 1-based character position to start at; negative counts
 *  from the end
 * @return array [ null ] when there is no match; otherwise the captures, or
 *  the whole match when the pattern has no captures
 */
public function ustringMatch( $s, $pattern, $init = 1 ) {
	$this->checkString( 'match', $s );
	$this->checkTypeOptional( 'match', 3, $init, 'number', 1 );

	$charCount = mb_strlen( $s, 'UTF-8' );
	if ( $init < 0 ) {
		$init = $charCount + $init + 1;
	} elseif ( $init > $charCount + 1 ) {
		$init = $charCount + 1;
	}

	// Translate the character position into a byte offset for preg_match().
	$byteOffset = 0;
	if ( $init > 1 ) {
		$byteOffset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) );
	}

	list( $re, $capt ) = $this->patternToRegex( $pattern, '\G', 'match' );
	$found = preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $byteOffset );
	if ( !$found ) {
		return [ null ];
	}
	return $this->addCapturesFromMatch( [], $s, $m, $capt, true );
}
688
/**
 * Handler for gmatchInit.
 *
 * @param string $s Subject string
 * @param string $pattern Lua pattern
 * @return array [ regex, capture map ] as produced by patternToRegex()
 */
public function ustringGmatchInit( $s, $pattern ) {
	$this->checkString( 'gmatch', $s );

	// '^' is not an anchor for gmatch, hence $anchor = false.
	$compiled = $this->patternToRegex( $pattern, false, 'gmatch' );
	return [ $compiled[0], $compiled[1] ];
}
702
/**
 * Handler for gmatchCallback.
 *
 * @param string $s Subject string
 * @param string $re Regex from ustringGmatchInit()
 * @param array $capt Capture map from ustringGmatchInit()
 * @param int $pos Byte offset to resume matching at
 * @return array [ new byte offset, captures ]; captures is an empty array
 *  when nothing more matches, otherwise a list whose first entry is null
 */
public function ustringGmatchCallback( $s, $re, $capt, $pos ) {
	$found = preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $pos );
	if ( !$found ) {
		return [ $pos, [] ];
	}

	// Resume right after the end of this match (byte offsets throughout).
	$nextPos = $m[0][1] + strlen( $m[0][0] );
	$captures = $this->addCapturesFromMatch( [ null ], $s, $m, $capt, true );
	return [ $nextPos, $captures ];
}
719
/**
 * Handler for gsub.
 *
 * @param string $s Subject string
 * @param string $pattern Lua pattern
 * @param string|int|float|array|callable $repl Replacement: a string/number
 *  template (with %0-%9 and %%), a lookup table, or a Lua function
 * @param int|null $n Maximum number of replacements; null for no limit
 * @return array [ resulting string, number of replacements made ]
 * @throws Scribunto_LuaError On an invalid replacement or a PCRE failure
 */
public function ustringGsub( $s, $pattern, $repl, $n = null ) {
	$this->checkString( 'gsub', $s );
	$this->checkTypeOptional( 'gsub', 4, $n, 'number', null );

	if ( $n === null ) {
		$n = -1;
	} elseif ( $n < 1 ) {
		return [ $s, 0 ];
	}

	list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern, '^', 'gsub' );
	$captures = [];

	if ( $this->phpBug53823 ) {
		// PHP bug 53823 means that a zero-length match before a UTF-8
		// character will match again before every byte of that character.
		// The workaround is to capture the first "character" of/after the
		// match and verify that its first byte is legal to start a UTF-8
		// character.
		$re = '/(?=(?<phpBug53823>.|$))' . substr( $re, 1 );
	}

	if ( $anypos ) {
		// preg_replace_callback doesn't take a "flags" argument, so we
		// can't pass PREG_OFFSET_CAPTURE to it, which is needed to handle
		// position captures. So instead we have to do a preg_match_all and
		// handle the captures ourself.
		$ct = preg_match_all( $re, $s, $mm, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );
		for ( $i = 0; $i < $ct; $i++ ) {
			$m = $mm[$i];
			if ( $this->phpBug53823 ) {
				// Skip bogus matches that start on a continuation byte.
				$c = ord( $m['phpBug53823'][0] );
				if ( $c >= 0x80 && $c <= 0xbf ) {
					continue;
				}
			}
			$c = [ $m[0][0] ];
			foreach ( $this->addCapturesFromMatch( [], $s, $m, $capt, false ) as $k => $v ) {
				$k++;
				$c["m$k"] = $v;
			}
			$captures[] = $c;
			if ( $n >= 0 && count( $captures ) >= $n ) {
				break;
			}
		}
	}

	// In every callback below, when $anypos is set the pre-computed entry
	// from $captures replaces the match array PCRE hands us.
	switch ( $this->getLuaType( $repl ) ) {
		case 'string':
		case 'number':
			$cb = function ( $m ) use ( $repl, $anypos, &$captures ) {
				if ( $anypos ) {
					$m = array_shift( $captures );
				}
				return preg_replace_callback( '/%([%0-9])/', function ( $m2 ) use ( $m ) {
					$x = $m2[1];
					if ( $x === '%' ) {
						return '%';
					} elseif ( $x === '0' ) {
						return $m[0];
					} elseif ( isset( $m["m$x"] ) ) {
						return $m["m$x"];
					} elseif ( $x === '1' ) {
						// Match undocumented Lua string.gsub behavior
						return $m[0];
					} else {
						throw new Scribunto_LuaError( "invalid capture index %$x in replacement string" );
					}
				}, $repl );
			};
			break;

		case 'table':
			$cb = function ( $m ) use ( $repl, $anypos, &$captures ) {
				if ( $anypos ) {
					$m = array_shift( $captures );
				}
				// The lookup key is the first capture, or the whole match.
				$x = $m['m1'] ?? $m[0];
				// isset() is also false for null values, so a missing or
				// null entry leaves the match unchanged.
				if ( !isset( $repl[$x] ) ) {
					return $m[0];
				}
				$type = $this->getLuaType( $repl[$x] );
				if ( $type !== 'string' && $type !== 'number' ) {
					throw new Scribunto_LuaError( "invalid replacement value (a $type)" );
				}
				return $repl[$x];
			};
			break;

		case 'function':
			$interpreter = $this->getInterpreter();
			$cb = function ( $m ) use ( $interpreter, $capt, $repl, $anypos, &$captures ) {
				if ( $anypos ) {
					$m = array_shift( $captures );
				}
				// The function receives all captures, or the whole match.
				$args = [];
				if ( count( $capt ) ) {
					foreach ( $capt as $i => $pos ) {
						$args[] = $m["m$i"];
					}
				} else {
					$args[] = $m[0];
				}
				$ret = $interpreter->callFunction( $repl, ...$args );
				if ( count( $ret ) === 0 || $ret[0] === null ) {
					return $m[0];
				}
				$type = $this->getLuaType( $ret[0] );
				if ( $type !== 'string' && $type !== 'number' ) {
					throw new Scribunto_LuaError( "invalid replacement value (a $type)" );
				}
				return $ret[0];
			};
			break;

		default:
			// NOTE(review): presumably checkType() throws for these types;
			// otherwise $cb would be undefined below - confirm.
			$this->checkType( 'gsub', 3, $repl, 'function or table or string' );
	}

	$skippedMatches = 0;
	if ( $this->phpBug53823 ) {
		// Since we're having bogus matches, we need to keep track of the
		// necessary adjustment and stop manually once we hit the limit.
		$maxMatches = $n < 0 ? INF : $n;
		$n = -1;
		$realCallback = $cb;
		$cb = function ( $m ) use ( $realCallback, &$skippedMatches, &$maxMatches ) {
			$c = ord( $m['phpBug53823'] );
			if ( $c >= 0x80 && $c <= 0xbf || $maxMatches <= 0 ) {
				$skippedMatches++;
				return $m[0];
			} else {
				$maxMatches--;
				return $realCallback( $m );
			}
		};
	}

	$count = 0;
	$s2 = preg_replace_callback( $re, $cb, $s, $n, $count );
	if ( $s2 === null ) {
		// handlePCREError() is an instance method, so call it on $this.
		$this->handlePCREError( preg_last_error(), $pattern );
	}
	return [ $s2, $count - $skippedMatches ];
}
875
/**
 * Handle a PCRE error by throwing a matching Scribunto_LuaError.
 *
 * @param int|null $error Error code as returned by preg_last_error(); when
 *  null, preg_last_error() is queried instead
 * @param string $pattern Lua pattern being matched, for the error message
 * @throws Scribunto_LuaError For every code other than PREG_NO_ERROR
 */
private function handlePCREError( $error, $pattern ) {
	// Fall back to a string that no integer error code can equal when the
	// constant is not defined by the running PHP.
	$PREG_JIT_STACKLIMIT_ERROR = defined( 'PREG_JIT_STACKLIMIT_ERROR' )
		? PREG_JIT_STACKLIMIT_ERROR
		: 'PREG_JIT_STACKLIMIT_ERROR';

	// Honour the code supplied by the caller; only query PCRE when none
	// was given.
	if ( $error === null ) {
		$error = preg_last_error();
	}

	switch ( $error ) {
		case PREG_NO_ERROR:
			// Huh?
			break;
		case PREG_INTERNAL_ERROR:
			throw new Scribunto_LuaError( "PCRE internal error" );
		case PREG_BACKTRACK_LIMIT_ERROR:
			throw new Scribunto_LuaError(
				"PCRE backtrack limit reached while matching pattern '$pattern'"
			);
		case PREG_RECURSION_LIMIT_ERROR:
			throw new Scribunto_LuaError(
				"PCRE recursion limit reached while matching pattern '$pattern'"
			);
		case PREG_BAD_UTF8_ERROR:
			// Should have already been caught, but just in case
			throw new Scribunto_LuaError( "PCRE bad UTF-8 error" );
		case PREG_BAD_UTF8_OFFSET_ERROR:
			// Shouldn't happen, but just in case
			throw new Scribunto_LuaError( "PCRE bad UTF-8 offset error" );
		case $PREG_JIT_STACKLIMIT_ERROR:
			throw new Scribunto_LuaError(
				"PCRE JIT stack limit reached while matching pattern '$pattern'"
			);
		default:
			throw new Scribunto_LuaError(
				"PCRE error code $error while matching pattern '$pattern'"
			);
	}
}
918}
serialize()
$wgMaxArticleSize
Maximum article size in kilobytes.
if( $line===false) $args
Definition cdb.php:64
Handles a simple LRU key/value map with a maximum number of entries.
This class provides some basic services that Lua libraries will probably need.
getLuaType( $var)
Get the Lua type corresponding to the type of the variable.
getInterpreter()
Get the interpreter.
checkType( $name, $argIdx, $arg, $expectType)
Check the type of a variable.
Scribunto_LuaEngine $engine
getEngine()
Get the engine.
checkTypeOptional( $name, $argIdx, &$arg, $expectType, $default)
Check the type of a variable, with default if null.
ustringGsub( $s, $pattern, $repl, $n=null)
Handler for gsub.
ustringSub( $s, $i=1, $j=-1)
Handler for sub.
ustringUpper( $s)
Handler for upper.
boolean $phpBug53823
PHP versions until 5.6.9 are buggy when the regex passed to preg_replace and preg_match_all matches the empty string.
MapCacheLRU $patternRegexCache
A cache of patterns and the regexes they generate.
handlePCREError( $error, $pattern)
Handle a PCRE error.
ustringMatch( $s, $pattern, $init=1)
Handler for match.
integer null $stringLengthLimit
Limit on string lengths, in bytes not characters. If null, $wgMaxArticleSize * 1024 will be used.
ustringLen( $s)
Handler for len.
ustringIsUtf8( $s)
Handler for isUtf8.
ustringToNFC( $s)
Handler for toNFC.
ustringLower( $s)
Handler for lower.
checkString( $name, $s, $checkEncoding=true)
ustringByteoffset( $s, $l=1, $i=1)
Handler for byteoffset.
bracketedCharSetToRegex( $pat, $i, $len, $brcharsets)
ustringCodepoint( $s, $i=1, $j=null)
Handler for codepoint.
ustringToNFKD( $s)
Handler for toNFKD.
patternToRegex( $pattern, $anchor, $name)
ustringGmatchInit( $s, $pattern)
Handler for gmatchInit.
ustringToNFD( $s)
Handler for toNFD.
ustringToNFKC( $s)
Handler for toNFKC.
ustringChar()
Handler for char.
integer $patternLengthLimit
Limit on pattern lengths, in bytes not characters.
ustringGcodepointInit( $s, $i=1, $j=null)
Handler for gcodepointInit.
addCapturesFromMatch( $arr, $s, $m, $capt, $m0_if_no_captures)
ustringFind( $s, $pattern, $init=1, $plain=false)
Handler for find.
ustringGmatchCallback( $s, $re, $capt, $pos)
Handler for gmatchCallback.