3 use UtfNormal\Validator;
10 private $patternLengthLimit = 10000;
17 private $stringLengthLimit =
null;
24 private $phpBug53823 =
false;
30 private $patternRegexCache =
null;
33 if ( $this->stringLengthLimit ===
null ) {
38 $this->phpBug53823 = preg_replace(
'//us',
'x',
"\xc3\xa1" ) ===
"x\xc3x\xa1x";
41 parent::__construct( $engine );
44 public function register() {
45 $perf = $this->
getEngine()->getPerformanceCharacteristics();
47 if ( $perf[
'phpCallsRequireSerialization'] ) {
51 'find' => [ $this,
'ustringFind' ],
52 'match' => [ $this,
'ustringMatch' ],
53 'gmatch_init' => [ $this,
'ustringGmatchInit' ],
54 'gmatch_callback' => [ $this,
'ustringGmatchCallback' ],
55 'gsub' => [ $this,
'ustringGsub' ],
59 'isutf8' => [ $this,
'ustringIsUtf8' ],
60 'byteoffset' => [ $this,
'ustringByteoffset' ],
61 'codepoint' => [ $this,
'ustringCodepoint' ],
62 'gcodepoint_init' => [ $this,
'ustringGcodepointInit' ],
63 'toNFC' => [ $this,
'ustringToNFC' ],
64 'toNFD' => [ $this,
'ustringToNFD' ],
65 'toNFKC' => [ $this,
'ustringToNFKC' ],
66 'toNFKD' => [ $this,
'ustringToNFKD' ],
67 'char' => [ $this,
'ustringChar' ],
68 'len' => [ $this,
'ustringLen' ],
69 'sub' => [ $this,
'ustringSub' ],
70 'upper' => [ $this,
'ustringUpper' ],
71 'lower' => [ $this,
'ustringLower' ],
72 'find' => [ $this,
'ustringFind' ],
73 'match' => [ $this,
'ustringMatch' ],
74 'gmatch_init' => [ $this,
'ustringGmatchInit' ],
75 'gmatch_callback' => [ $this,
'ustringGmatchCallback' ],
76 'gsub' => [ $this,
'ustringGsub' ],
79 return $this->
getEngine()->registerInterface(
'mw.ustring.lua', $lib, [
80 'stringLengthLimit' => $this->stringLengthLimit,
81 'patternLengthLimit' => $this->patternLengthLimit,
85 private function checkString( $name,
$s, $checkEncoding =
true ) {
90 if ( $checkEncoding && !mb_check_encoding(
$s,
'UTF-8' ) ) {
93 if ( strlen(
$s ) > $this->stringLengthLimit ) {
95 "bad argument #1 to '$name' (string is longer than $this->stringLengthLimit bytes)"
107 public function ustringIsUtf8(
$s ) {
108 $this->checkString(
'isutf8',
$s,
false );
109 return [ mb_check_encoding(
$s,
'UTF-8' ) ];
120 public function ustringByteoffset(
$s, $l = 1, $i = 1 ) {
121 $this->checkString(
'byteoffset',
$s );
125 $bytelen = strlen(
$s );
127 $i = $bytelen + $i + 1;
129 if ( $i < 1 || $i > $bytelen ) {
134 while ( ( ord(
$s[$i] ) & 0xc0 ) === 0x80 ) {
137 if ( $l > 0 && $j === $i ) {
140 $char = mb_strlen( substr(
$s, 0, $i ),
'UTF-8' ) + $l;
141 if ( $char < 0 || $char >= mb_strlen(
$s,
'UTF-8' ) ) {
144 return [ strlen( mb_substr(
$s, 0, $char,
'UTF-8' ) ) + 1 ];
156 public function ustringCodepoint(
$s, $i = 1, $j =
null ) {
157 $this->checkString(
'codepoint',
$s );
161 $l = mb_strlen(
$s,
'UTF-8' );
171 $i = max( 1, min( $i, $l + 1 ) );
172 $j = max( 1, min( $j, $l + 1 ) );
173 $s = mb_substr(
$s, $i - 1, $j - $i + 1,
'UTF-8' );
174 return unpack(
'N*', mb_convert_encoding(
$s,
'UTF-32BE',
'UTF-8' ) );
185 public function ustringGcodepointInit(
$s, $i = 1, $j =
null ) {
186 return [ $this->ustringCodepoint(
$s, $i, $j ) ];
195 public function ustringToNFC(
$s ) {
196 $this->checkString(
'toNFC',
$s,
false );
197 if ( !mb_check_encoding(
$s,
'UTF-8' ) ) {
200 return [ Validator::toNFC(
$s ) ];
209 public function ustringToNFD(
$s ) {
210 $this->checkString(
'toNFD',
$s,
false );
211 if ( !mb_check_encoding(
$s,
'UTF-8' ) ) {
214 return [ Validator::toNFD(
$s ) ];
223 public function ustringToNFKC(
$s ) {
224 $this->checkString(
'toNFKC',
$s,
false );
225 if ( !mb_check_encoding(
$s,
'UTF-8' ) ) {
228 return [ Validator::toNFKC(
$s ) ];
237 public function ustringToNFKD(
$s ) {
238 $this->checkString(
'toNFKD',
$s,
false );
239 if ( !mb_check_encoding(
$s,
'UTF-8' ) ) {
242 return [ Validator::toNFKD(
$s ) ];
250 public function ustringChar() {
251 $args = func_get_args();
252 if ( count(
$args ) > $this->stringLengthLimit ) {
255 foreach (
$args as $k => &$v ) {
256 if ( !is_numeric( $v ) ) {
257 $this->
checkType(
'char', $k + 1, $v,
'number' );
259 $v = (int)floor( $v );
260 if ( $v < 0 || $v > 0x10ffff ) {
266 $s = mb_convert_encoding(
$s,
'UTF-8',
'UTF-32BE' );
267 if ( strlen(
$s ) > $this->stringLengthLimit ) {
279 public function ustringLen(
$s ) {
280 $this->checkString(
'len',
$s,
false );
281 if ( !mb_check_encoding(
$s,
'UTF-8' ) ) {
284 return [ mb_strlen(
$s,
'UTF-8' ) ];
295 public function ustringSub(
$s, $i = 1, $j = -1 ) {
296 $this->checkString(
'sub',
$s );
300 $len = mb_strlen(
$s,
'UTF-8' );
310 $i = max( 1, min( $i, $len + 1 ) );
311 $j = max( 1, min( $j, $len + 1 ) );
312 $s = mb_substr(
$s, $i - 1, $j - $i + 1,
'UTF-8' );
322 public function ustringUpper(
$s ) {
323 $this->checkString(
'upper',
$s );
324 return [ mb_strtoupper(
$s,
'UTF-8' ) ];
333 public function ustringLower(
$s ) {
334 $this->checkString(
'lower',
$s );
335 return [ mb_strtolower(
$s,
'UTF-8' ) ];
338 private function checkPattern( $name, $pattern ) {
339 if ( $this->
getLuaType( $pattern ) ==
'number' ) {
340 $pattern = (string)$pattern;
342 $this->
checkType( $name, 2, $pattern,
'string' );
343 if ( !mb_check_encoding( $pattern,
'UTF-8' ) ) {
346 if ( strlen( $pattern ) > $this->patternLengthLimit ) {
348 "bad argument #2 to '$name' (pattern is longer than $this->patternLengthLimit bytes)"
354 private function patternToRegex( $pattern, $anchor, $name ) {
355 $cacheKey =
serialize( [ $pattern, $anchor ] );
356 if ( !$this->patternRegexCache->has( $cacheKey ) ) {
357 $this->checkPattern( $name, $pattern );
358 $pat = preg_split(
'//us', $pattern,
null, PREG_SPLIT_NO_EMPTY );
360 static $charsets =
null, $brcharsets =
null;
361 if ( $charsets ===
null ) {
372 'w' =>
'[\p{L}\p{Nd}]',
373 'x' =>
'[0-9A-Fa-f0-9A-Fa-f]',
384 'W' =>
'[^\p{L}\p{Nd}]',
385 'X' =>
'[^0-9A-Fa-f0-9A-Fa-f]',
389 'w' =>
'\p{L}\p{Nd}',
390 'x' =>
'0-9A-Fa-f0-9A-Fa-f',
396 'W' =>
'\P{Xan}\p{Nl}\p{No}',
399 'X' =>
'\x00-\x2f\x3a-\x40\x47-\x60\x67-\x{ff0f}'
400 .
'\x{ff1a}-\x{ff20}\x{ff27}-\x{ff40}\x{ff47}-\x{10ffff}',
403 'Z' =>
'\x01-\x{10ffff}',
408 $len = count( $pat );
415 for ( $i = 0; $i < $len; $i++ ) {
418 switch ( $pat[$i] ) {
421 $re .= ( $anchor ===
false || $q ) ?
'\\^' : $anchor;
425 $q = ( $i < $len - 1 );
426 $re .= $q ?
'\\$' :
'$';
430 if ( $i + 1 >= $len ) {
433 $n = count( $capt ) + 1;
434 $capt[$n] = ( $pat[$i + 1] ===
')' );
440 $captparen[$n] = $ii;
444 if ( count( $opencapt ) <= 0 ) {
447 array_pop( $opencapt );
456 if ( isset( $charsets[$pat[$i]] ) ) {
457 $re .= $charsets[$pat[$i]];
459 } elseif ( $pat[$i] ===
'b' ) {
460 if ( $i + 2 >= $len ) {
463 $d1 = preg_quote( $pat[++$i],
'/' );
464 $d2 = preg_quote( $pat[++$i],
'/' );
466 $re .=
"{$d1}[^$d1]*$d1";
469 $re .=
"(?<b$bct>$d1(?:(?>[^$d1$d2]+)|(?P>b$bct))*$d2)";
471 } elseif ( $pat[$i] ===
'f' ) {
472 if ( $i + 1 >= $len || $pat[++$i] !==
'[' ) {
473 throw new Scribunto_LuaError(
"missing '[' after %f in pattern at pattern character $ii" );
475 list( $i, $re2 ) = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets );
480 if ( preg_match(
"/$re2/us",
"\0" ) ) {
481 $re .=
"(?<!^)(?<!$re2)(?=$re2|$)";
483 $re .=
"(?<!$re2)(?=$re2)";
485 } elseif ( $pat[$i] >=
'0' && $pat[$i] <=
'9' ) {
486 $n = ord( $pat[$i] ) - 0x30;
487 if ( $n === 0 || $n > count( $capt ) || in_array( $n, $opencapt ) ) {
492 $re .= preg_quote( $pat[$i],
'/' );
498 list( $i, $re2 ) = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets );
512 $re .= preg_quote( $pat[$i],
'/' );
516 if ( $q && $i + 1 < $len ) {
517 switch ( $pat[$i + 1] ) {
530 if ( count( $opencapt ) ) {
531 $ii = $captparen[$opencapt[0]];
532 throw new Scribunto_LuaError(
"Unclosed capture beginning at pattern character $ii" );
536 $this->patternRegexCache->set( $cacheKey, [ $re, $capt, $anypos ] );
538 return $this->patternRegexCache->get( $cacheKey );
541 private function bracketedCharSetToRegex( $pat, $i, $len, $brcharsets ) {
545 if ( $i < $len && $pat[$i] ===
'^' ) {
549 for ( $j = $i; $i < $len && ( $j == $i || $pat[$i] !==
']' ); $i++ ) {
550 if ( $pat[$i] ===
'%' ) {
555 if ( isset( $brcharsets[$pat[$i]] ) ) {
556 $re .= $brcharsets[$pat[$i]];
558 $re .= preg_quote( $pat[$i],
'/' );
560 } elseif ( $i + 2 < $len &&
561 $pat[$i + 1] ===
'-' && $pat[$i + 2] !==
']' && $pat[$i + 2] !==
'%'
563 if ( $pat[$i] <= $pat[$i + 2] ) {
564 $re .= preg_quote( $pat[$i],
'/' ) .
'-' . preg_quote( $pat[$i + 2],
'/' );
568 $re .= preg_quote( $pat[$i],
'/' );
573 "Missing close-bracket for character set beginning at pattern character $ii"
580 if ( $re ===
'[]' ) {
584 } elseif ( $re ===
'[^]' ) {
591 private function addCapturesFromMatch( $arr,
$s, $m, $capt, $m0_if_no_captures ) {
592 if ( count( $capt ) ) {
593 foreach ( $capt as $n => $pos ) {
595 $o = mb_strlen( substr(
$s, 0, $m[
"m$n"][1] ),
'UTF-8' ) + 1;
598 $arr[] = $m[
"m$n"][0];
601 } elseif ( $m0_if_no_captures ) {
616 public function ustringFind(
$s, $pattern, $init = 1, $plain =
false ) {
617 $this->checkString(
'find',
$s );
621 $len = mb_strlen(
$s,
'UTF-8' );
623 $init = $len + $init + 1;
624 } elseif ( $init > $len + 1 ) {
629 $offset = strlen( mb_substr(
$s, 0, $init - 1,
'UTF-8' ) );
636 $this->checkPattern(
'find', $pattern );
637 if ( $pattern !==
'' ) {
638 $ret = mb_strpos(
$s, $pattern, $init - 1,
'UTF-8' );
642 if ( $ret ===
false ) {
645 return [ $ret + 1, $ret + mb_strlen( $pattern ) ];
648 list( $re, $capt ) = $this->patternToRegex( $pattern,
'\G',
'find' );
649 if ( !preg_match( $re,
$s, $m, PREG_OFFSET_CAPTURE, $offset ) ) {
652 $o = mb_strlen( substr(
$s, 0, $m[0][1] ),
'UTF-8' );
653 $ret = [ $o + 1, $o + mb_strlen( $m[0][0],
'UTF-8' ) ];
654 return $this->addCapturesFromMatch( $ret,
$s, $m, $capt,
false );
666 public function ustringMatch(
$s, $pattern, $init = 1 ) {
667 $this->checkString(
'match',
$s );
670 $len = mb_strlen(
$s,
'UTF-8' );
672 $init = $len + $init + 1;
673 } elseif ( $init > $len + 1 ) {
677 $offset = strlen( mb_substr(
$s, 0, $init - 1,
'UTF-8' ) );
682 list( $re, $capt ) = $this->patternToRegex( $pattern,
'\G',
'match' );
683 if ( !preg_match( $re,
$s, $m, PREG_OFFSET_CAPTURE, $offset ) ) {
686 return $this->addCapturesFromMatch( [],
$s, $m, $capt,
true );
696 public function ustringGmatchInit(
$s, $pattern ) {
697 $this->checkString(
'gmatch',
$s );
699 list( $re, $capt ) = $this->patternToRegex( $pattern,
false,
'gmatch' );
700 return [ $re, $capt ];
712 public function ustringGmatchCallback(
$s, $re, $capt, $pos ) {
713 if ( !preg_match( $re,
$s, $m, PREG_OFFSET_CAPTURE, $pos ) ) {
716 $pos = $m[0][1] + strlen( $m[0][0] );
717 return [ $pos, $this->addCapturesFromMatch( [
null ],
$s, $m, $capt,
true ) ];
729 public function ustringGsub(
$s, $pattern, $repl, $n =
null ) {
730 $this->checkString(
'gsub',
$s );
735 } elseif ( $n < 1 ) {
739 list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern,
'^',
'gsub' );
742 if ( $this->phpBug53823 ) {
748 $re =
'/(?=(?<phpBug53823>.|$))' . substr( $re, 1 );
756 $ct = preg_match_all( $re,
$s, $mm, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );
757 for ( $i = 0; $i < $ct; $i++ ) {
759 if ( $this->phpBug53823 ) {
760 $c = ord( $m[
'phpBug53823'][0] );
761 if ( $c >= 0x80 && $c <= 0xbf ) {
766 foreach ( $this->addCapturesFromMatch( [],
$s, $m, $capt,
false ) as $k => $v ) {
771 if ( $n >= 0 && count( $captures ) >= $n ) {
780 $cb =
function ( $m ) use ( $repl, $anypos, &$captures ) {
782 $m = array_shift( $captures );
784 return preg_replace_callback(
'/%([%0-9])/',
function ( $m2 ) use ( $m ) {
788 } elseif ( $x ===
'0' ) {
790 } elseif ( isset( $m[
"m$x"] ) ) {
792 } elseif ( $x ===
'1' ) {
803 $cb =
function ( $m ) use ( $repl, $anypos, &$captures ) {
805 $m = array_shift( $captures );
807 $x = $m[
'm1'] ?? $m[0];
808 if ( !isset( $repl[$x] ) || $repl[$x] ===
null ) {
812 if (
$type !==
'string' &&
$type !==
'number' ) {
821 $cb =
function ( $m ) use ( $interpreter, $capt, $repl, $anypos, &$captures ) {
823 $m = array_shift( $captures );
826 if ( count( $capt ) ) {
827 foreach ( $capt as $i => $pos ) {
833 $ret = $interpreter->callFunction( $repl, ...
$args );
834 if ( count( $ret ) === 0 || $ret[0] ===
null ) {
838 if (
$type !==
'string' &&
$type !==
'number' ) {
846 $this->
checkType(
'gsub', 3, $repl,
'function or table or string' );
850 if ( $this->phpBug53823 ) {
853 $maxMatches = $n < 0 ? INF : $n;
856 $cb =
function ( $m ) use ( $realCallback, &$skippedMatches, &$maxMatches ) {
857 $c = ord( $m[
'phpBug53823'] );
858 if ( $c >= 0x80 && $c <= 0xbf || $maxMatches <= 0 ) {
863 return $realCallback( $m );
869 $s2 = preg_replace_callback( $re, $cb,
$s, $n, $count );
870 if ( $s2 ===
null ) {
871 self::handlePCREError( preg_last_error(), $pattern );
873 return [ $s2, $count - $skippedMatches ];
882 private function handlePCREError( $error, $pattern ) {
883 $PREG_JIT_STACKLIMIT_ERROR = defined(
'PREG_JIT_STACKLIMIT_ERROR' )
884 ? PREG_JIT_STACKLIMIT_ERROR
885 :
'PREG_JIT_STACKLIMIT_ERROR';
887 $error = preg_last_error();
892 case PREG_INTERNAL_ERROR:
894 case PREG_BACKTRACK_LIMIT_ERROR:
896 "PCRE backtrack limit reached while matching pattern '$pattern'"
898 case PREG_RECURSION_LIMIT_ERROR:
900 "PCRE recursion limit reached while matching pattern '$pattern'"
902 case PREG_BAD_UTF8_ERROR:
905 case PREG_BAD_UTF8_OFFSET_ERROR:
908 case $PREG_JIT_STACKLIMIT_ERROR:
910 "PCRE JIT stack limit reached while matching pattern '$pattern'"
914 "PCRE error code $error while matching pattern '$pattern'"