MediaWiki  1.34.4
UstringLibrary.php
Go to the documentation of this file.
1 <?php
2 
3 use UtfNormal\Validator;
4 
10  private $patternLengthLimit = 10000;
11 
17  private $stringLengthLimit = null;
18 
24  private $phpBug53823 = false;
25 
30  private $patternRegexCache = null;
31 
/**
 * Prepare the library's limits and caches, then hand off to the parent
 * constructor.
 *
 * @param Scribunto_LuaEngine $engine Engine passed through to the parent
 */
public function __construct( $engine ) {
	global $wgMaxArticleSize;

	// Fall back to the wiki's article size limit when no explicit string
	// limit was configured. $wgMaxArticleSize is in kilobytes.
	if ( $this->stringLengthLimit === null ) {
		$this->stringLengthLimit = $wgMaxArticleSize * 1024;
	}

	// Probe for PHP bug 53823: a buggy PCRE binding inserts the replacement
	// between the two bytes of the UTF-8 character as well.
	$probe = preg_replace( '//us', 'x', "\xc3\xa1" );
	$this->phpBug53823 = ( $probe === "x\xc3x\xa1x" );

	$this->patternRegexCache = new MapCacheLRU( 100 );

	parent::__construct( $engine );
}
43 
/**
 * Register the mw.ustring interface with the engine.
 *
 * When calls into PHP require serialization, only the pattern-matching
 * functions are exported; otherwise every handler in this class is exported.
 *
 * @return mixed Whatever the engine's registerInterface() returns
 */
public function register() {
	// Pattern matching is still much faster in PHP, even with the
	// overhead of serialization
	$patternLib = [
		'find' => [ $this, 'ustringFind' ],
		'match' => [ $this, 'ustringMatch' ],
		'gmatch_init' => [ $this, 'ustringGmatchInit' ],
		'gmatch_callback' => [ $this, 'ustringGmatchCallback' ],
		'gsub' => [ $this, 'ustringGsub' ],
	];

	$perf = $this->getEngine()->getPerformanceCharacteristics();
	if ( $perf['phpCallsRequireSerialization'] ) {
		$lib = $patternLib;
	} else {
		// Array union keeps these entries first, followed by the
		// pattern-matching entries.
		$lib = [
			'isutf8' => [ $this, 'ustringIsUtf8' ],
			'byteoffset' => [ $this, 'ustringByteoffset' ],
			'codepoint' => [ $this, 'ustringCodepoint' ],
			'gcodepoint_init' => [ $this, 'ustringGcodepointInit' ],
			'toNFC' => [ $this, 'ustringToNFC' ],
			'toNFD' => [ $this, 'ustringToNFD' ],
			'toNFKC' => [ $this, 'ustringToNFKC' ],
			'toNFKD' => [ $this, 'ustringToNFKD' ],
			'char' => [ $this, 'ustringChar' ],
			'len' => [ $this, 'ustringLen' ],
			'sub' => [ $this, 'ustringSub' ],
			'upper' => [ $this, 'ustringUpper' ],
			'lower' => [ $this, 'ustringLower' ],
		] + $patternLib;
	}

	$setupOptions = [
		'stringLengthLimit' => $this->stringLengthLimit,
		'patternLengthLimit' => $this->patternLengthLimit,
	];
	return $this->getEngine()->registerInterface( 'mw.ustring.lua', $lib, $setupOptions );
}
84 
/**
 * Validate the first (subject string) argument of a ustring function.
 *
 * Numbers are accepted without further checks. Anything else must be a
 * string, optionally valid UTF-8, and no longer than the string length limit.
 *
 * @param string $name Lua function name, used in error messages
 * @param mixed $s Value to check
 * @param bool $checkEncoding Whether to require valid UTF-8
 * @throws Scribunto_LuaError If the string is not UTF-8 or is too long
 */
private function checkString( $name, $s, $checkEncoding = true ) {
	if ( $this->getLuaType( $s ) == 'number' ) {
		// $s is received by value, so there is nothing to convert for the
		// caller; numbers simply bypass the string checks.
		return;
	}

	$this->checkType( $name, 1, $s, 'string' );

	$encodingOk = !$checkEncoding || mb_check_encoding( $s, 'UTF-8' );
	if ( !$encodingOk ) {
		throw new Scribunto_LuaError( "bad argument #1 to '$name' (string is not UTF-8)" );
	}

	if ( strlen( $s ) > $this->stringLengthLimit ) {
		throw new Scribunto_LuaError(
			"bad argument #1 to '$name' (string is longer than $this->stringLengthLimit bytes)"
		);
	}
}
100 
/**
 * Handler for isutf8.
 *
 * @param string $s
 * @return bool[] Single-element array: whether $s is valid UTF-8
 */
public function ustringIsUtf8( $s ) {
	// Encoding is deliberately not enforced here; it is what we report.
	$this->checkString( 'isutf8', $s, false );
	$valid = mb_check_encoding( $s, 'UTF-8' );
	return [ $valid ];
}
111 
/**
 * Handler for byteoffset.
 *
 * Starting from byte position $i (1-based, negative counts from the end),
 * locate the character $l characters away and return its 1-based byte offset.
 *
 * @param string $s
 * @param int $l Character offset relative to the character at byte $i
 * @param int $i Byte position to start from
 * @return array Single-element array: the byte offset, or null if out of range
 */
public function ustringByteoffset( $s, $l = 1, $i = 1 ) {
	$this->checkString( 'byteoffset', $s );
	$this->checkTypeOptional( 'byteoffset', 2, $l, 'number', 1 );
	$this->checkTypeOptional( 'byteoffset', 3, $i, 'number', 1 );

	$byteCount = strlen( $s );
	if ( $i < 0 ) {
		$i = $byteCount + $i + 1;
	}
	if ( $i < 1 || $i > $byteCount ) {
		return [ null ];
	}

	// Convert to a 0-based index, then back up over UTF-8 continuation
	// bytes (10xxxxxx) to the first byte of the containing character.
	$requested = $i - 1;
	$charStart = $requested;
	while ( ( ord( $s[$charStart] ) & 0xc0 ) === 0x80 ) {
		$charStart--;
	}

	// If $i already pointed at the start of a character, that character
	// counts as the first one when moving forward.
	if ( $l > 0 && $requested === $charStart ) {
		$l--;
	}

	$charCount = mb_strlen( $s, 'UTF-8' );
	$target = mb_strlen( substr( $s, 0, $charStart ), 'UTF-8' ) + $l;
	if ( $target < 0 || $target >= $charCount ) {
		return [ null ];
	}

	$prefix = mb_substr( $s, 0, $target, 'UTF-8' );
	return [ strlen( $prefix ) + 1 ];
}
147 
/**
 * Handler for codepoint.
 *
 * @param string $s
 * @param int $i First character position (1-based, negative counts from the end)
 * @param int|null $j Last character position; defaults to $i
 * @return array Code points of characters $i through $j, as produced by
 *  unpack( 'N*', ... ); empty if $j is before $i
 */
public function ustringCodepoint( $s, $i = 1, $j = null ) {
	$this->checkString( 'codepoint', $s );
	$this->checkTypeOptional( 'codepoint', 2, $i, 'number', 1 );
	$this->checkTypeOptional( 'codepoint', 3, $j, 'number', $i );

	$charCount = mb_strlen( $s, 'UTF-8' );

	// Negative positions are relative to the end of the string
	if ( $i < 0 ) {
		$i = $charCount + $i + 1;
	}
	if ( $j < 0 ) {
		$j = $charCount + $j + 1;
	}
	if ( $j < $i ) {
		return [];
	}

	// Clamp both ends into [1, length + 1]
	$first = max( 1, min( $i, $charCount + 1 ) );
	$last = max( 1, min( $j, $charCount + 1 ) );

	$slice = mb_substr( $s, $first - 1, $last - $first + 1, 'UTF-8' );
	$utf32 = mb_convert_encoding( $slice, 'UTF-32BE', 'UTF-8' );
	return unpack( 'N*', $utf32 );
}
176 
/**
 * Handler for gcodepointInit.
 *
 * @param string $s
 * @param int $i
 * @param int|null $j
 * @return array Single-element array wrapping the result of ustringCodepoint()
 */
public function ustringGcodepointInit( $s, $i = 1, $j = null ) {
	$codepoints = $this->ustringCodepoint( $s, $i, $j );
	return [ $codepoints ];
}
188 
/**
 * Handler for toNFC.
 *
 * @param string $s
 * @return array Single-element array: Validator::toNFC( $s ), or null if
 *  $s is not valid UTF-8
 */
public function ustringToNFC( $s ) {
	$this->checkString( 'toNFC', $s, false );
	$isUtf8 = mb_check_encoding( $s, 'UTF-8' );
	return [ $isUtf8 ? Validator::toNFC( $s ) : null ];
}
202 
/**
 * Handler for toNFD.
 *
 * @param string $s
 * @return array Single-element array: Validator::toNFD( $s ), or null if
 *  $s is not valid UTF-8
 */
public function ustringToNFD( $s ) {
	$this->checkString( 'toNFD', $s, false );
	$isUtf8 = mb_check_encoding( $s, 'UTF-8' );
	return [ $isUtf8 ? Validator::toNFD( $s ) : null ];
}
216 
/**
 * Handler for toNFKC.
 *
 * @param string $s
 * @return array Single-element array: Validator::toNFKC( $s ), or null if
 *  $s is not valid UTF-8
 */
public function ustringToNFKC( $s ) {
	$this->checkString( 'toNFKC', $s, false );
	$isUtf8 = mb_check_encoding( $s, 'UTF-8' );
	return [ $isUtf8 ? Validator::toNFKC( $s ) : null ];
}
230 
/**
 * Handler for toNFKD.
 *
 * @param string $s
 * @return array Single-element array: Validator::toNFKD( $s ), or null if
 *  $s is not valid UTF-8
 */
public function ustringToNFKD( $s ) {
	$this->checkString( 'toNFKD', $s, false );
	$isUtf8 = mb_check_encoding( $s, 'UTF-8' );
	return [ $isUtf8 ? Validator::toNFKD( $s ) : null ];
}
244 
/**
 * Handler for char.
 *
 * Takes any number of code points as arguments and returns the UTF-8 string
 * made of those characters.
 *
 * @return string[] Single-element array holding the resulting string
 * @throws Scribunto_LuaError If there are too many arguments, an argument is
 *  not a number or is outside 0..0x10ffff, or the result exceeds the string
 *  length limit
 */
public function ustringChar() {
	$args = func_get_args();
	if ( count( $args ) > $this->stringLengthLimit ) {
		throw new Scribunto_LuaError( "too many arguments to 'char'" );
	}

	// Collect the validated code points in a fresh array instead of
	// iterating by reference, so no reference into $args outlives the loop.
	$codepoints = [];
	foreach ( $args as $k => $v ) {
		if ( !is_numeric( $v ) ) {
			$this->checkType( 'char', $k + 1, $v, 'number' );
		}
		$v = (int)floor( $v );
		if ( $v < 0 || $v > 0x10ffff ) {
			$k++;
			throw new Scribunto_LuaError( "bad argument #$k to 'char' (value out of range)" );
		}
		$codepoints[] = $v;
	}

	// Pack as big-endian 32-bit integers (i.e. UTF-32BE), then transcode
	$s = pack( 'N*', ...$codepoints );
	$s = mb_convert_encoding( $s, 'UTF-8', 'UTF-32BE' );
	if ( strlen( $s ) > $this->stringLengthLimit ) {
		throw new Scribunto_LuaError( "result to long for 'char'" );
	}
	return [ $s ];
}
272 
/**
 * Handler for len.
 *
 * @param string $s
 * @return array Single-element array: the length of $s in characters, or
 *  null if $s is not valid UTF-8
 */
public function ustringLen( $s ) {
	$this->checkString( 'len', $s, false );
	$isUtf8 = mb_check_encoding( $s, 'UTF-8' );
	return [ $isUtf8 ? mb_strlen( $s, 'UTF-8' ) : null ];
}
286 
/**
 * Handler for sub.
 *
 * @param string $s
 * @param int $i First character position (1-based, negative counts from the end)
 * @param int $j Last character position (negative counts from the end)
 * @return string[] Single-element array: characters $i through $j of $s, or
 *  the empty string if $j is before $i
 */
public function ustringSub( $s, $i = 1, $j = -1 ) {
	$this->checkString( 'sub', $s );
	$this->checkTypeOptional( 'sub', 2, $i, 'number', 1 );
	$this->checkTypeOptional( 'sub', 3, $j, 'number', -1 );

	$charCount = mb_strlen( $s, 'UTF-8' );

	// Negative positions are relative to the end of the string
	if ( $i < 0 ) {
		$i = $charCount + $i + 1;
	}
	if ( $j < 0 ) {
		$j = $charCount + $j + 1;
	}
	if ( $j < $i ) {
		return [ '' ];
	}

	// Clamp both ends into [1, length + 1]
	$first = max( 1, min( $i, $charCount + 1 ) );
	$last = max( 1, min( $j, $charCount + 1 ) );

	return [ mb_substr( $s, $first - 1, $last - $first + 1, 'UTF-8' ) ];
}
315 
/**
 * Handler for upper.
 *
 * @param string $s
 * @return string[] Single-element array: $s converted with mb_strtoupper()
 */
public function ustringUpper( $s ) {
	$this->checkString( 'upper', $s );
	$upper = mb_strtoupper( $s, 'UTF-8' );
	return [ $upper ];
}
326 
/**
 * Handler for lower.
 *
 * @param string $s
 * @return string[] Single-element array: $s converted with mb_strtolower()
 */
public function ustringLower( $s ) {
	$this->checkString( 'lower', $s );
	$lower = mb_strtolower( $s, 'UTF-8' );
	return [ $lower ];
}
337 
/**
 * Validate the second (pattern) argument of a ustring function.
 *
 * Numbers are converted to strings before checking. The pattern must be
 * valid UTF-8 and no longer than the pattern length limit.
 *
 * @param string $name Lua function name, used in error messages
 * @param mixed $pattern Value to check
 * @throws Scribunto_LuaError If the pattern is not UTF-8 or is too long
 */
private function checkPattern( $name, $pattern ) {
	$isNumber = $this->getLuaType( $pattern ) == 'number';
	if ( $isNumber ) {
		$pattern = (string)$pattern;
	}

	$this->checkType( $name, 2, $pattern, 'string' );

	$validUtf8 = mb_check_encoding( $pattern, 'UTF-8' );
	if ( !$validUtf8 ) {
		throw new Scribunto_LuaError( "bad argument #2 to '$name' (string is not UTF-8)" );
	}

	$byteLength = strlen( $pattern );
	if ( $byteLength > $this->patternLengthLimit ) {
		throw new Scribunto_LuaError(
			"bad argument #2 to '$name' (pattern is longer than $this->patternLengthLimit bytes)"
		);
	}
}
352 
/**
 * Convert a Lua pattern into a PCRE regex.
 *
 * Results are memoized in $this->patternRegexCache, keyed on the pattern
 * and the anchor.
 *
 * @param string|int|float $pattern Lua pattern
 * @param string|false $anchor Regex fragment emitted for a leading '^', or
 *  false to treat a leading '^' as a literal character
 * @param string $name Lua function name, used in error messages
 * @return array [ $re, $capt, $anypos ] where $re is the PCRE regex, $capt
 *  maps each capture number to whether it is a position capture "()", and
 *  $anypos is whether any position capture is present
 * @throws Scribunto_LuaError If the pattern is invalid
 */
private function patternToRegex( $pattern, $anchor, $name ) {
	$cacheKey = serialize( [ $pattern, $anchor ] );
	if ( !$this->patternRegexCache->has( $cacheKey ) ) {
		$this->checkPattern( $name, $pattern );
		// Split into individual UTF-8 characters. A limit of -1 means
		// "no limit"; passing null here is deprecated as of PHP 8.1.
		$pat = preg_split( '//us', $pattern, -1, PREG_SPLIT_NO_EMPTY );

		static $charsets = null, $brcharsets = null;
		if ( $charsets === null ) {
			$charsets = [
				// If you change these, also change lualib/ustring/make-tables.php
				// (and run it to regenerate charsets.lua)
				'a' => '\p{L}',
				'c' => '\p{Cc}',
				'd' => '\p{Nd}',
				'l' => '\p{Ll}',
				'p' => '\p{P}',
				's' => '\p{Xps}',
				'u' => '\p{Lu}',
				'w' => '[\p{L}\p{Nd}]',
				// ASCII hex digits plus the fullwidth forms U+FF10-FF19,
				// U+FF21-FF26 and U+FF41-FF46, i.e. the exact complement
				// of the bracketed 'X' set below.
				'x' => '[0-9A-Fa-f\x{ff10}-\x{ff19}\x{ff21}-\x{ff26}\x{ff41}-\x{ff46}]',
				'z' => '\0',

				// These *must* be the inverse of the above
				'A' => '\P{L}',
				'C' => '\P{Cc}',
				'D' => '\P{Nd}',
				'L' => '\P{Ll}',
				'P' => '\P{P}',
				'S' => '\P{Xps}',
				'U' => '\P{Lu}',
				'W' => '[^\p{L}\p{Nd}]',
				'X' => '[^0-9A-Fa-f\x{ff10}-\x{ff19}\x{ff21}-\x{ff26}\x{ff41}-\x{ff46}]',
				'Z' => '[^\0]',
			];
			$brcharsets = [
				'w' => '\p{L}\p{Nd}',
				'x' => '0-9A-Fa-f\x{ff10}-\x{ff19}\x{ff21}-\x{ff26}\x{ff41}-\x{ff46}',

				// Negated sets that are not expressable as a simple \P{} are
				// unfortunately complicated.

				// Xan is L plus N, so ^Xan plus Nl plus No is anything that's not L or Nd
				'W' => '\P{Xan}\p{Nl}\p{No}',

				// Manually constructed. Fun.
				'X' => '\x00-\x2f\x3a-\x40\x47-\x60\x67-\x{ff0f}'
					. '\x{ff1a}-\x{ff20}\x{ff27}-\x{ff40}\x{ff47}-\x{10ffff}',

				// Ha!
				'Z' => '\x01-\x{10ffff}',
			] + $charsets;
		}

		$re = '/';
		$len = count( $pat );
		// Capture number => true if it is a position capture "()"
		$capt = [];
		$anypos = false;
		// Capture number => 1-based pattern position of its open-paren
		$captparen = [];
		// Stack of capture numbers that are currently open
		$opencapt = [];
		// Counter used to give each %b construct a unique group name
		$bct = 0;

		for ( $i = 0; $i < $len; $i++ ) {
			// 1-based position, for error messages
			$ii = $i + 1;
			// Whether the item just emitted may take a quantifier
			$q = false;
			switch ( $pat[$i] ) {
				case '^':
					// Only a '^' at the very start is an anchor; elsewhere
					// it is a literal (and quantifiable) character.
					$q = $i;
					$re .= ( $anchor === false || $q ) ? '\\^' : $anchor;
					break;

				case '$':
					// Only a '$' at the very end is an anchor
					$q = ( $i < $len - 1 );
					$re .= $q ? '\\$' : '$';
					break;

				case '(':
					if ( $i + 1 >= $len ) {
						throw new Scribunto_LuaError( "Unmatched open-paren at pattern character $ii" );
					}
					$n = count( $capt ) + 1;
					$capt[$n] = ( $pat[$i + 1] === ')' );
					if ( $capt[$n] ) {
						$anypos = true;
					}
					$re .= "(?<m$n>";
					$opencapt[] = $n;
					$captparen[$n] = $ii;
					break;

				case ')':
					if ( count( $opencapt ) <= 0 ) {
						throw new Scribunto_LuaError( "Unmatched close-paren at pattern character $ii" );
					}
					array_pop( $opencapt );
					$re .= $pat[$i];
					break;

				case '%':
					$i++;
					if ( $i >= $len ) {
						throw new Scribunto_LuaError( "malformed pattern (ends with '%')" );
					}
					if ( isset( $charsets[$pat[$i]] ) ) {
						$re .= $charsets[$pat[$i]];
						$q = true;
					} elseif ( $pat[$i] === 'b' ) {
						// %bxy: balanced match between delimiters x and y
						if ( $i + 2 >= $len ) {
							throw new Scribunto_LuaError( "malformed pattern (missing arguments to '%b')" );
						}
						$d1 = preg_quote( $pat[++$i], '/' );
						$d2 = preg_quote( $pat[++$i], '/' );
						if ( $d1 === $d2 ) {
							$re .= "{$d1}[^$d1]*$d1";
						} else {
							$bct++;
							$re .= "(?<b$bct>$d1(?:(?>[^$d1$d2]+)|(?P>b$bct))*$d2)";
						}
					} elseif ( $pat[$i] === 'f' ) {
						// %f[set]: frontier pattern
						if ( $i + 1 >= $len || $pat[++$i] !== '[' ) {
							throw new Scribunto_LuaError( "missing '[' after %f in pattern at pattern character $ii" );
						}
						list( $i, $re2 ) = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets );
						// Because %f considers the beginning and end of the string
						// to be \0, determine if $re2 matches that and take it
						// into account with "^" and "$".
						// @phan-suppress-next-line PhanParamSuspiciousOrder
						if ( preg_match( "/$re2/us", "\0" ) ) {
							$re .= "(?<!^)(?<!$re2)(?=$re2|$)";
						} else {
							$re .= "(?<!$re2)(?=$re2)";
						}
					} elseif ( $pat[$i] >= '0' && $pat[$i] <= '9' ) {
						// %1-%9: back-reference to an already-closed capture
						$n = ord( $pat[$i] ) - 0x30;
						if ( $n === 0 || $n > count( $capt ) || in_array( $n, $opencapt, true ) ) {
							throw new Scribunto_LuaError( "invalid capture index %$n at pattern character $ii" );
						}
						$re .= "\\g{m$n}";
					} else {
						$re .= preg_quote( $pat[$i], '/' );
						$q = true;
					}
					break;

				case '[':
					list( $i, $re2 ) = $this->bracketedCharSetToRegex( $pat, $i, $len, $brcharsets );
					$re .= $re2;
					$q = true;
					break;

				case ']':
					throw new Scribunto_LuaError( "Unmatched close-bracket at pattern character $ii" );

				case '.':
					$re .= $pat[$i];
					$q = true;
					break;

				default:
					$re .= preg_quote( $pat[$i], '/' );
					$q = true;
					break;
			}
			if ( $q && $i + 1 < $len ) {
				switch ( $pat[$i + 1] ) {
					case '*':
					case '+':
					case '?':
						$re .= $pat[++$i];
						break;
					case '-':
						// Lua's '-' is the lazy version of '*'
						$re .= '*?';
						$i++;
						break;
				}
			}
		}
		if ( count( $opencapt ) ) {
			$ii = $captparen[$opencapt[0]];
			throw new Scribunto_LuaError( "Unclosed capture beginning at pattern character $ii" );
		}
		$re .= '/us';

		$this->patternRegexCache->set( $cacheKey, [ $re, $capt, $anypos ] );
	}
	return $this->patternRegexCache->get( $cacheKey );
}
540 
/**
 * Convert a Lua bracketed character set ("[...]") into a PCRE fragment.
 *
 * @param string[] $pat Pattern split into characters
 * @param int $i Index of the opening '[' in $pat
 * @param int $len Number of entries in $pat
 * @param string[] $brcharsets Map of %-class letter to the PCRE text to use
 *  inside a bracket expression
 * @return array [ $i, $re ]: index of the closing ']' and the PCRE fragment
 * @throws Scribunto_LuaError If the set is never closed
 */
private function bracketedCharSetToRegex( $pat, $i, $len, $brcharsets ) {
	// 1-based position of the '[', for the error message
	$openedAt = $i + 1;
	$set = '[';
	$i++;
	if ( $i < $len && $pat[$i] === '^' ) {
		$set .= '^';
		$i++;
	}

	// A ']' in the very first position is a literal member of the set, so
	// the terminator check is skipped while we are still at $firstMember.
	$firstMember = $i;
	while ( $i < $len && ( $i == $firstMember || $pat[$i] !== ']' ) ) {
		if ( $pat[$i] === '%' ) {
			$i++;
			if ( $i >= $len ) {
				// Dangling '%': reported below as a missing close-bracket
				break;
			}
			$escaped = $pat[$i];
			$set .= isset( $brcharsets[$escaped] )
				? $brcharsets[$escaped]
				: preg_quote( $escaped, '/' );
		} elseif ( $i + 2 < $len &&
			$pat[$i + 1] === '-' && $pat[$i + 2] !== ']' && $pat[$i + 2] !== '%'
		) {
			// Range "a-z". Reversed ranges are dropped rather than emitted.
			if ( $pat[$i] <= $pat[$i + 2] ) {
				$set .= preg_quote( $pat[$i], '/' ) . '-' . preg_quote( $pat[$i + 2], '/' );
			}
			$i += 2;
		} else {
			$set .= preg_quote( $pat[$i], '/' );
		}
		$i++;
	}

	if ( $i >= $len ) {
		throw new Scribunto_LuaError(
			"Missing close-bracket for character set beginning at pattern character $openedAt"
		);
	}
	$set .= ']';

	// Lua just ignores invalid ranges, while pcre throws an error.
	// We filter them out above, but then we need to special-case empty sets
	if ( $set === '[]' ) {
		// Can't directly quantify (*FAIL), so wrap it.
		// "(?!)" would be simpler and could be quantified if not for a bug in PCRE 8.13 to 8.33
		return [ $i, '(?:(*FAIL))' ];
	}
	if ( $set === '[^]' ) {
		// 's' modifier is always used, so this works
		return [ $i, '.' ];
	}

	return [ $i, $set ];
}
590 
/**
 * Append the captures of a regex match to an array.
 *
 * @param array $arr Array to append to
 * @param string $s Subject string that was matched
 * @param array $m Match data obtained with PREG_OFFSET_CAPTURE
 * @param bool[] $capt Capture number => whether it is a position capture
 * @param bool $m0_if_no_captures Whether to append the whole match when the
 *  pattern has no captures
 * @return array $arr with the captures appended
 */
private function addCapturesFromMatch( $arr, $s, $m, $capt, $m0_if_no_captures ) {
	if ( !count( $capt ) ) {
		if ( $m0_if_no_captures ) {
			$arr[] = $m[0][0];
		}
		return $arr;
	}

	foreach ( $capt as $num => $isPosition ) {
		if ( !$isPosition ) {
			$arr[] = $m["m$num"][0];
			continue;
		}
		// Position capture: turn the byte offset into a 1-based
		// character position.
		$before = substr( $s, 0, $m["m$num"][1] );
		$arr[] = mb_strlen( $before, 'UTF-8' ) + 1;
	}
	return $arr;
}
606 
/**
 * Handler for find.
 *
 * @param string $s Subject string
 * @param string $pattern Lua pattern, or literal text when $plain is true
 * @param int $init Character position to start at (1-based, negative counts
 *  from the end)
 * @param bool $plain Whether to do a plain substring search
 * @return array [ null ] if nothing was found; otherwise the 1-based start
 *  and end character positions of the match, followed by any captures
 */
public function ustringFind( $s, $pattern, $init = 1, $plain = false ) {
	$this->checkString( 'find', $s );
	$this->checkTypeOptional( 'find', 3, $init, 'number', 1 );
	$this->checkTypeOptional( 'find', 4, $plain, 'boolean', false );

	$len = mb_strlen( $s, 'UTF-8' );
	if ( $init < 0 ) {
		$init = $len + $init + 1;
	} elseif ( $init > $len + 1 ) {
		$init = $len + 1;
	}

	if ( $init > 1 ) {
		// Byte offset corresponding to character position $init
		$offset = strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) );
	} else {
		$init = 1;
		$offset = 0;
	}

	if ( $plain ) {
		$this->checkPattern( 'find', $pattern );
		if ( $pattern !== '' ) {
			$ret = mb_strpos( $s, $pattern, $init - 1, 'UTF-8' );
		} else {
			// The empty string is found right at the starting position
			$ret = $init - 1;
		}
		if ( $ret === false ) {
			return [ null ];
		}
		// Count the pattern's characters explicitly as UTF-8, like every
		// other mb_* call here, instead of relying on mb_internal_encoding.
		return [ $ret + 1, $ret + mb_strlen( $pattern, 'UTF-8' ) ];
	}

	list( $re, $capt ) = $this->patternToRegex( $pattern, '\G', 'find' );
	if ( !preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $offset ) ) {
		return [ null ];
	}
	// Convert the match's byte offset into a character offset
	$o = mb_strlen( substr( $s, 0, $m[0][1] ), 'UTF-8' );
	$ret = [ $o + 1, $o + mb_strlen( $m[0][0], 'UTF-8' ) ];
	return $this->addCapturesFromMatch( $ret, $s, $m, $capt, false );
}
657 
/**
 * Handler for match.
 *
 * @param string $s Subject string
 * @param string $pattern Lua pattern
 * @param int $init Character position to start at (1-based, negative counts
 *  from the end)
 * @return array [ null ] if nothing matched; otherwise the captures, or the
 *  whole match when the pattern has no captures
 */
public function ustringMatch( $s, $pattern, $init = 1 ) {
	$this->checkString( 'match', $s );
	$this->checkTypeOptional( 'match', 3, $init, 'number', 1 );

	$charCount = mb_strlen( $s, 'UTF-8' );
	if ( $init < 0 ) {
		$init = $charCount + $init + 1;
	} elseif ( $init > $charCount + 1 ) {
		$init = $charCount + 1;
	}

	// Byte offset corresponding to character position $init
	$offset = $init > 1
		? strlen( mb_substr( $s, 0, $init - 1, 'UTF-8' ) )
		: 0;

	list( $re, $capt ) = $this->patternToRegex( $pattern, '\G', 'match' );
	$matched = preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $offset );
	if ( !$matched ) {
		return [ null ];
	}
	return $this->addCapturesFromMatch( [], $s, $m, $capt, true );
}
688 
/**
 * Handler for gmatchInit.
 *
 * @param string $s Subject string
 * @param string $pattern Lua pattern
 * @return array [ $re, $capt ]: the compiled regex and its capture map, as
 *  later passed to ustringGmatchCallback()
 */
public function ustringGmatchInit( $s, $pattern ) {
	$this->checkString( 'gmatch', $s );

	// No anchor: a leading '^' is treated as a literal character here
	$compiled = $this->patternToRegex( $pattern, false, 'gmatch' );
	return [ $compiled[0], $compiled[1] ];
}
702 
/**
 * Handler for gmatchCallback.
 *
 * @param string $s Subject string
 * @param string $re Regex from ustringGmatchInit()
 * @param bool[] $capt Capture map from ustringGmatchInit()
 * @param int $pos Byte offset to start matching at
 * @return array [ $pos, $captures ]: the byte offset just past the match and
 *  the captures (preceded by a null entry), or the unchanged $pos and an
 *  empty array when there is no further match
 */
public function ustringGmatchCallback( $s, $re, $capt, $pos ) {
	$found = preg_match( $re, $s, $m, PREG_OFFSET_CAPTURE, $pos );
	if ( !$found ) {
		return [ $pos, [] ];
	}

	// Resume right after the end of this match next time
	$next = $m[0][1] + strlen( $m[0][0] );
	$captures = $this->addCapturesFromMatch( [ null ], $s, $m, $capt, true );
	return [ $next, $captures ];
}
719 
/**
 * Handler for gsub.
 *
 * @param string $s Subject string
 * @param string $pattern Lua pattern
 * @param string|int|float|array|callable $repl Replacement: a string/number
 *  template, a lookup table, or a Lua function
 * @param int|null $n Maximum number of replacements, or null for no limit
 * @return array [ $result, $count ]: the resulting string and the number of
 *  replacements made
 * @throws Scribunto_LuaError On an invalid capture index or replacement value
 */
public function ustringGsub( $s, $pattern, $repl, $n = null ) {
	$this->checkString( 'gsub', $s );
	$this->checkTypeOptional( 'gsub', 4, $n, 'number', null );

	if ( $n === null ) {
		$n = -1;
	} elseif ( $n < 1 ) {
		return [ $s, 0 ];
	}

	list( $re, $capt, $anypos ) = $this->patternToRegex( $pattern, '^', 'gsub' );
	$captures = [];
	$buggy = $this->phpBug53823;

	if ( $buggy ) {
		// PHP bug 53823 means that a zero-length match before a UTF-8
		// character will match again before every byte of that character.
		// The workaround is to capture the first "character" of/after the
		// match and verify that its first byte is legal to start a UTF-8
		// character.
		$re = '/(?=(?<phpBug53823>.|$))' . substr( $re, 1 );
	}

	if ( $anypos ) {
		// preg_replace_callback doesn't take a "flags" argument, so we
		// can't pass PREG_OFFSET_CAPTURE to it, which is needed to handle
		// position captures. So instead we have to do a preg_match_all and
		// handle the captures ourself.
		$total = preg_match_all( $re, $s, $allMatches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER );
		for ( $idx = 0; $idx < $total; $idx++ ) {
			$match = $allMatches[$idx];
			if ( $buggy ) {
				// Skip bogus matches that start on a continuation byte
				$lead = ord( $match['phpBug53823'][0] );
				if ( $lead >= 0x80 && $lead <= 0xbf ) {
					continue;
				}
			}
			// Same shape the replacement callbacks expect: the whole match
			// at 0, then captures under 'm1', 'm2', ...
			$entry = [ $match[0][0] ];
			foreach ( $this->addCapturesFromMatch( [], $s, $match, $capt, false ) as $k => $v ) {
				$entry['m' . ( $k + 1 )] = $v;
			}
			$captures[] = $entry;
			if ( $n >= 0 && count( $captures ) >= $n ) {
				break;
			}
		}
	}

	switch ( $this->getLuaType( $repl ) ) {
		case 'string':
		case 'number':
			$replacer = function ( $m ) use ( $repl, $anypos, &$captures ) {
				if ( $anypos ) {
					$m = array_shift( $captures );
				}
				// Expand %0-%9 and %% in the replacement template
				$expand = function ( $ref ) use ( $m ) {
					$x = $ref[1];
					if ( $x === '%' ) {
						return '%';
					}
					if ( $x === '0' ) {
						return $m[0];
					}
					if ( isset( $m["m$x"] ) ) {
						return $m["m$x"];
					}
					if ( $x === '1' ) {
						// Match undocumented Lua string.gsub behavior
						return $m[0];
					}
					throw new Scribunto_LuaError( "invalid capture index %$x in replacement string" );
				};
				return preg_replace_callback( '/%([%0-9])/', $expand, $repl );
			};
			break;

		case 'table':
			$replacer = function ( $m ) use ( $repl, $anypos, &$captures ) {
				if ( $anypos ) {
					$m = array_shift( $captures );
				}
				// Look up by the first capture, or by the whole match
				$key = $m['m1'] ?? $m[0];
				if ( !isset( $repl[$key] ) ) {
					// No entry (or a null one): keep the original text
					return $m[0];
				}
				$type = $this->getLuaType( $repl[$key] );
				if ( $type !== 'string' && $type !== 'number' ) {
					throw new Scribunto_LuaError( "invalid replacement value (a $type)" );
				}
				return $repl[$key];
			};
			break;

		case 'function':
			$interpreter = $this->getInterpreter();
			$replacer = function ( $m ) use ( $interpreter, $capt, $repl, $anypos, &$captures ) {
				if ( $anypos ) {
					$m = array_shift( $captures );
				}
				// Pass the captures, or the whole match if there are none
				$args = [];
				if ( count( $capt ) ) {
					foreach ( $capt as $num => $isPosition ) {
						$args[] = $m["m$num"];
					}
				} else {
					$args[] = $m[0];
				}
				$ret = $interpreter->callFunction( $repl, ...$args );
				if ( count( $ret ) === 0 || $ret[0] === null ) {
					// Nothing returned: keep the original text
					return $m[0];
				}
				$type = $this->getLuaType( $ret[0] );
				if ( $type !== 'string' && $type !== 'number' ) {
					throw new Scribunto_LuaError( "invalid replacement value (a $type)" );
				}
				return $ret[0];
			};
			break;

		default:
			$this->checkType( 'gsub', 3, $repl, 'function or table or string' );
	}

	$skippedMatches = 0;
	if ( $buggy ) {
		// Since we're having bogus matches, we need to keep track of the
		// necessary adjustment and stop manually once we hit the limit.
		$maxMatches = $n < 0 ? INF : $n;
		$n = -1;
		$realCallback = $replacer;
		$replacer = function ( $m ) use ( $realCallback, &$skippedMatches, &$maxMatches ) {
			$lead = ord( $m['phpBug53823'] );
			$bogus = ( $lead >= 0x80 && $lead <= 0xbf );
			if ( $bogus || $maxMatches <= 0 ) {
				$skippedMatches++;
				return $m[0];
			}
			$maxMatches--;
			return $realCallback( $m );
		};
	}

	$count = 0;
	$result = preg_replace_callback( $re, $replacer, $s, $n, $count );
	if ( $result === null ) {
		$this->handlePCREError( preg_last_error(), $pattern );
	}
	return [ $result, $count - $skippedMatches ];
}
875 
/**
 * Handle a PCRE error by throwing a matching Scribunto_LuaError.
 *
 * @param int $error Error code, as returned by preg_last_error()
 * @param string $pattern Lua pattern being processed, for the error message
 * @throws Scribunto_LuaError For every code other than PREG_NO_ERROR
 */
private function handlePCREError( $error, $pattern ) {
	// Use the real constant when it is defined; otherwise fall back to a
	// placeholder string.
	$PREG_JIT_STACKLIMIT_ERROR = defined( 'PREG_JIT_STACKLIMIT_ERROR' )
		? PREG_JIT_STACKLIMIT_ERROR
		: 'PREG_JIT_STACKLIMIT_ERROR';

	// Act on the code the caller passed in rather than re-reading
	// preg_last_error(), so the $error parameter is actually honoured.
	switch ( $error ) {
		case PREG_NO_ERROR:
			// Huh?
			break;
		case PREG_INTERNAL_ERROR:
			throw new Scribunto_LuaError( "PCRE internal error" );
		case PREG_BACKTRACK_LIMIT_ERROR:
			throw new Scribunto_LuaError(
				"PCRE backtrack limit reached while matching pattern '$pattern'"
			);
		case PREG_RECURSION_LIMIT_ERROR:
			throw new Scribunto_LuaError(
				"PCRE recursion limit reached while matching pattern '$pattern'"
			);
		case PREG_BAD_UTF8_ERROR:
			// Should have already been caught, but just in case
			throw new Scribunto_LuaError( "PCRE bad UTF-8 error" );
		case PREG_BAD_UTF8_OFFSET_ERROR:
			// Shouldn't happen, but just in case
			throw new Scribunto_LuaError( "PCRE bad UTF-8 offset error" );
		case $PREG_JIT_STACKLIMIT_ERROR:
			throw new Scribunto_LuaError(
				"PCRE JIT stack limit reached while matching pattern '$pattern'"
			);
		default:
			throw new Scribunto_LuaError(
				"PCRE error code $error while matching pattern '$pattern'"
			);
	}
}
918 }
Scribunto_LuaUstringLibrary\$patternRegexCache
MapCacheLRU $patternRegexCache
A cache of patterns and the regexes they generate.
Definition: UstringLibrary.php:30
$wgMaxArticleSize
$wgMaxArticleSize
Maximum article size in kilobytes.
Definition: DefaultSettings.php:2316
Scribunto_LuaError
Definition: LuaCommon.php:992
Scribunto_LuaUstringLibrary\ustringByteoffset
ustringByteoffset( $s, $l=1, $i=1)
Handler for byteoffset.
Definition: UstringLibrary.php:120
Scribunto_LuaUstringLibrary\ustringLower
ustringLower( $s)
Handler for lower.
Definition: UstringLibrary.php:333
Scribunto_LuaUstringLibrary\ustringUpper
ustringUpper( $s)
Handler for upper.
Definition: UstringLibrary.php:322
$s
$s
Definition: mergeMessageFileList.php:185
Scribunto_LuaUstringLibrary\ustringToNFKD
ustringToNFKD( $s)
Handler for toNFKD.
Definition: UstringLibrary.php:237
Scribunto_LuaUstringLibrary\ustringChar
ustringChar()
Handler for char.
Definition: UstringLibrary.php:250
serialize
serialize()
Definition: ApiMessageTrait.php:138
Scribunto_LuaLibraryBase\getEngine
getEngine()
Get the engine.
Definition: LibraryBase.php:56
Scribunto_LuaUstringLibrary\checkPattern
checkPattern( $name, $pattern)
Definition: UstringLibrary.php:338
Scribunto_LuaUstringLibrary\ustringGmatchCallback
ustringGmatchCallback( $s, $re, $capt, $pos)
Handler for gmatchCallback.
Definition: UstringLibrary.php:712
Scribunto_LuaLibraryBase\checkType
checkType( $name, $argIdx, $arg, $expectType)
Check the type of a variable.
Definition: LibraryBase.php:141
Scribunto_LuaLibraryBase\getInterpreter
getInterpreter()
Get the interpreter.
Definition: LibraryBase.php:65
Scribunto_LuaUstringLibrary\addCapturesFromMatch
addCapturesFromMatch( $arr, $s, $m, $capt, $m0_if_no_captures)
Definition: UstringLibrary.php:591
Scribunto_LuaUstringLibrary\ustringGsub
ustringGsub( $s, $pattern, $repl, $n=null)
Handler for gsub.
Definition: UstringLibrary.php:729
Scribunto_LuaUstringLibrary\ustringToNFKC
ustringToNFKC( $s)
Handler for toNFKC.
Definition: UstringLibrary.php:223
MapCacheLRU
Handles a simple LRU key/value map with a maximum number of entries.
Definition: MapCacheLRU.php:37
Scribunto_LuaUstringLibrary\$stringLengthLimit
integer|null $stringLengthLimit
Limit on string lengths, in bytes not characters. If null, $wgMaxArticleSize * 1024 will be used.
Definition: UstringLibrary.php:17
Scribunto_LuaLibraryBase\checkTypeOptional
checkTypeOptional( $name, $argIdx, &$arg, $expectType, $default)
Check the type of a variable, with default if null.
Definition: LibraryBase.php:164
Scribunto_LuaUstringLibrary\handlePCREError
handlePCREError( $error, $pattern)
Handle a PCRE error.
Definition: UstringLibrary.php:882
Scribunto_LuaLibraryBase\$engine
Scribunto_LuaEngine $engine
Definition: LibraryBase.php:31
Scribunto_LuaUstringLibrary\$patternLengthLimit
integer $patternLengthLimit
Limit on pattern lengths, in bytes not characters.
Definition: UstringLibrary.php:10
Scribunto_LuaUstringLibrary\ustringToNFD
ustringToNFD( $s)
Handler for toNFD.
Definition: UstringLibrary.php:209
Scribunto_LuaLibraryBase
This class provides some basic services that Lua libraries will probably need.
Definition: LibraryBase.php:27
Scribunto_LuaUstringLibrary\__construct
__construct( $engine)
Definition: UstringLibrary.php:32
Scribunto_LuaUstringLibrary\ustringToNFC
ustringToNFC( $s)
Handler for toNFC.
Definition: UstringLibrary.php:195
Scribunto_LuaUstringLibrary\ustringCodepoint
ustringCodepoint( $s, $i=1, $j=null)
Handler for codepoint.
Definition: UstringLibrary.php:156
Scribunto_LuaUstringLibrary\patternToRegex
patternToRegex( $pattern, $anchor, $name)
Definition: UstringLibrary.php:354
Scribunto_LuaUstringLibrary\ustringMatch
ustringMatch( $s, $pattern, $init=1)
Handler for match.
Definition: UstringLibrary.php:666
Scribunto_LuaUstringLibrary
Definition: UstringLibrary.php:5
Scribunto_LuaUstringLibrary\ustringGmatchInit
ustringGmatchInit( $s, $pattern)
Handler for gmatchInit.
Definition: UstringLibrary.php:696
Scribunto_LuaUstringLibrary\ustringIsUtf8
ustringIsUtf8( $s)
Handler for isUtf8.
Definition: UstringLibrary.php:107
Scribunto_LuaUstringLibrary\ustringSub
ustringSub( $s, $i=1, $j=-1)
Handler for sub.
Definition: UstringLibrary.php:295
$args
if( $line===false) $args
Definition: cdb.php:64
Scribunto_LuaUstringLibrary\ustringLen
ustringLen( $s)
Handler for len.
Definition: UstringLibrary.php:279
Scribunto_LuaUstringLibrary\ustringGcodepointInit
ustringGcodepointInit( $s, $i=1, $j=null)
Handler for gcodepointInit.
Definition: UstringLibrary.php:185
Scribunto_LuaUstringLibrary\checkString
checkString( $name, $s, $checkEncoding=true)
Definition: UstringLibrary.php:85
Scribunto_LuaUstringLibrary\$phpBug53823
boolean $phpBug53823
PHP versions until 5.6.9 are buggy when the regex in preg_replace and preg_match_all matches the empty string.
Definition: UstringLibrary.php:24
Scribunto_LuaUstringLibrary\bracketedCharSetToRegex
bracketedCharSetToRegex( $pat, $i, $len, $brcharsets)
Definition: UstringLibrary.php:541
Scribunto_LuaLibraryBase\getLuaType
getLuaType( $var)
Get the Lua type corresponding to the type of the variable.
Definition: LibraryBase.php:106
Scribunto_LuaUstringLibrary\ustringFind
ustringFind( $s, $pattern, $init=1, $plain=false)
Handler for find.
Definition: UstringLibrary.php:616
$type
$type
Definition: testCompression.php:48