MediaWiki  master
MagicWordArray.php
Go to the documentation of this file.
1 <?php
2 
25 namespace MediaWiki\Parser;
26 
27 use Exception;
30 use MWException;
31 
/** Names of the magic words held by this array; appended to by add() and addArray(). */
38  public $names = [];
39 
/** @var MagicWordFactory Used to resolve names to magic word objects and to reach the content language. */
41  private $factory;
42 
/** Lookup table cached by getHash(); reset to null whenever names are added. */
44  private $hash;
45 
/** Pattern fragments cached by getBaseRegex(); reset to null whenever names are added. */
47  private $baseRegex;
48 
/** Complete regexes cached by getRegex(); reset to null whenever names are added. */
50  private $regex;
51 
/**
 * Create a magic word array.
 *
 * @param array $names Magic word names to start with
 * @param MagicWordFactory|null $factory Factory used to resolve the names;
 *   when omitted, the factory of the global MediaWikiServices instance is used
 */
public function __construct( $names = [], ?MagicWordFactory $factory = null ) {
	$this->names = $names;
	// The nullable type is spelled out: the implicit "Type $x = null" form is
	// deprecated as of PHP 8.4. An object is never falsy, so ?? matches the old ?:.
	$this->factory = $factory ?? MediaWikiServices::getInstance()->getMagicWordFactory();
}
60 
/**
 * Add a magic word by name.
 *
 * @param mixed $name Name later handed to the factory's get() when the
 *   lookup structures are built
 */
public function add( $name ) {
	$this->names[] = $name;

	// Everything derived from the name list is stale now; drop the caches so
	// they are rebuilt on next use.
	$this->hash = null;
	$this->baseRegex = null;
	$this->regex = null;
}
70 
/**
 * Add a number of magic words by name.
 *
 * @param array $names Names to append; the keys of this array are ignored
 */
public function addArray( $names ) {
	$appended = array_values( $names );
	$this->names = array_merge( $this->names, $appended );

	// Everything derived from the name list is stale now; drop the caches so
	// they are rebuilt on next use.
	$this->hash = null;
	$this->baseRegex = null;
	$this->regex = null;
}
80 
/**
 * Get a 2-d hashtable for this array.
 *
 * Index 1 maps each synonym of a case-sensitive magic word to the word's
 * name. Index 0 does the same for case-insensitive words, keyed by the
 * synonym lowercased with the content language.
 *
 * @return array
 */
public function getHash() {
	if ( $this->hash !== null ) {
		return $this->hash;
	}

	// Build into a local first: if a lookup throws part-way through, the
	// cache stays null instead of holding a partial table that later calls
	// would return as if it were complete.
	$hash = [ 0 => [], 1 => [] ];
	foreach ( $this->names as $name ) {
		$magic = $this->factory->get( $name );
		$case = intval( $magic->isCaseSensitive() );
		foreach ( $magic->getSynonyms() as $syn ) {
			if ( !$case ) {
				$syn = $this->factory->getContentLanguage()->lc( $syn );
			}
			$hash[$case][$syn] = $name;
		}
	}

	$this->hash = $hash;
	return $this->hash;
}
101 
/**
 * Get the base regex.
 *
 * Builds one alternation of all synonyms per case mode, without delimiters
 * or anchors. The case-insensitive alternation (index 0) is wrapped in
 * "(?i:...)". A mode with no synonyms yields "(?!)", which never matches.
 * The result is cached only for the default arguments.
 *
 * @param bool $capture Wrap every synonym in a named group
 *   "<index letters>_<magic word name>" so that parseMatch() can tell which
 *   word matched
 * @param string $delimiter Delimiter passed to preg_quote()
 * @return array Index 0: case-insensitive pattern, index 1: case-sensitive pattern
 * @throws MWException If two synonyms would produce the same group name
 */
public function getBaseRegex( bool $capture = true, string $delimiter = '/' ): array {
	$cacheable = $capture && $delimiter === '/';
	if ( $cacheable && $this->baseRegex !== null ) {
		return $this->baseRegex;
	}

	$regex = [ 0 => [], 1 => [] ];
	$allGroups = [];
	foreach ( $this->names as $name ) {
		$magic = $this->factory->get( $name );
		$case = $magic->isCaseSensitive() ? 1 : 0;
		foreach ( $magic->getSynonyms() as $i => $syn ) {
			if ( !$capture ) {
				$regex[$case][] = preg_quote( $syn, $delimiter );
				continue;
			}
			// Group name must start with a non-digit in PCRE 8.34+
			// The synonym index is cast explicitly because strtr() takes a string.
			$it = strtr( (string)$i, '0123456789', 'abcdefghij' );
			$groupName = $it . '_' . $name;
			// look for same group names to avoid same named subpatterns in the regex
			if ( isset( $allGroups[$groupName] ) ) {
				throw new MWException(
					__METHOD__ . ': duplicate internal name in magic word array: ' . $name
				);
			}
			$allGroups[$groupName] = true;
			$regex[$case][] = '(?P<' . $groupName . '>' . preg_quote( $syn, $delimiter ) . ')';
		}
	}
	'@phan-var array<int,string[]> $regex';
	foreach ( $regex as $case => &$re ) {
		$re = count( $re ) ? implode( '|', $re ) : '(?!)';
		if ( !$case ) {
			$re = "(?i:{$re})";
		}
	}
	// Break the reference left by the by-reference loop so that the array
	// cached and returned below does not keep a reference slot.
	unset( $re );
	'@phan-var array<int,string> $regex';

	if ( $cacheable ) {
		$this->baseRegex = $regex;
	}
	return $regex;
}
154 
/**
 * Get an unanchored regex that does not match parameters.
 *
 * @return array Index 0: case-insensitive regex, index 1: case-sensitive regex
 */
public function getRegex() {
	if ( $this->regex === null ) {
		// Build into a local and publish it only when complete. Assigning an
		// empty array up front would leave that empty array cached if
		// getBaseRegex() threw, and later calls would silently return it.
		$regex = [];
		foreach ( $this->getBaseRegex( true, '/' ) as $case => $re ) {
			$regex[$case] = "/{$re}/S";
		}
		// As a performance optimization, turn on unicode mode only for
		// case-insensitive matching.
		$regex[0] .= 'u';
		$this->regex = $regex;
	}
	return $this->regex;
}
173 
/**
 * Get a regex for matching variables with parameters.
 *
 * Every quoted "$1" placeholder in the regexes from getRegex() is replaced
 * by a lazy capture group.
 *
 * @deprecated since 1.36
 * @return array Same layout as the result of getRegex()
 */
public function getVariableRegex() {
	wfDeprecated( __METHOD__, '1.36' );
	$plainRegexes = $this->getRegex();
	return str_replace( "\\$1", "(.*?)", $plainRegexes );
}
185 
/**
 * Get a regex anchored to the start of the string that does not match parameters.
 *
 * @return array Index 0: case-insensitive regex, index 1: case-sensitive regex
 */
public function getRegexStart() {
	$anchored = [];
	foreach ( $this->getBaseRegex( true, '/' ) as $mode => $pattern ) {
		$anchored[$mode] = '/^(?:' . $pattern . ')/S';
	}
	// Unicode mode is only needed, and only paid for, by the
	// case-insensitive regex.
	$anchored[0] .= 'u';
	return $anchored;
}
203 
/**
 * Get an anchored regex for matching variables with parameters.
 *
 * Each base pattern is anchored at both ends, and every quoted "$1"
 * placeholder in it is replaced by a lazy capture group.
 *
 * @return array Index 0: case-insensitive regex, index 1: case-sensitive regex
 */
public function getVariableStartToEndRegex() {
	// array_map() keeps the keys of a single input array, so the 0/1 case
	// indexes carry over.
	$anchored = array_map(
		static function ( $re ) {
			return str_replace( "\\$1", "(.*?)", "/^(?:{$re})$/S" );
		},
		$this->getBaseRegex( true, '/' )
	);
	// Unicode mode is only needed, and only paid for, by the
	// case-insensitive regex.
	$anchored[0] .= 'u';
	return $anchored;
}
221 
/**
 * Get the magic word names currently held by this array.
 *
 * @return array The names given to the constructor plus any added later
 */
public function getNames() {
	return $this->names;
}
229 
/**
 * Parse a match array from preg_match.
 *
 * Finds the first non-empty entry other than the whole match, reads the
 * magic word name from its key ("<prefix>_<name>") and takes the entry two
 * positions further on as the parameter value.
 *
 * @param array $m Match array
 * @return array [ magic word name, parameter value ]; the parameter value
 *   is false when the array has no entry at that position
 * @throws MWException If the entry's key has no "_" separator, or if no
 *   non-empty entry is found
 */
public function parseMatch( $m ) {
	for ( reset( $m ); ( $groupKey = key( $m ) ) !== null; ) {
		$captured = current( $m );
		// Step past this entry before deciding anything; the parameter
		// lookup below depends on the pointer already being one ahead.
		next( $m );
		if ( $groupKey !== 0 && $captured !== '' ) {
			$pieces = explode( '_', $groupKey, 2 );
			if ( count( $pieces ) != 2 ) {
				// This shouldn't happen
				throw new MWException( __METHOD__ . ': bad parameter name' );
			}
			// $pieces[0] is the synonym index prefix and is not needed here.
			$magicName = $pieces[1];
			$paramValue = next( $m );
			return [ $magicName, $paramValue ];
		}
	}
	// This shouldn't happen either
	throw new MWException( __METHOD__ . ': parameter not found' );
}
261 
/**
 * Match some text, with parameter capture.
 *
 * @param string $text
 * @return array [ magic word name, parameter value ] from parseMatch() for
 *   the first regex that matches, or [ false, false ] if none matches
 */
public function matchVariableStartToEnd( $text ) {
	foreach ( $this->getVariableStartToEndRegex() as $pattern ) {
		$groups = [];
		if ( !preg_match( $pattern, $text, $groups ) ) {
			continue;
		}
		return $this->parseMatch( $groups );
	}
	return [ false, false ];
}
282 
/**
 * Match some text, without parameter capture.
 *
 * The exact text is looked up among the case-sensitive synonyms first.
 * Failing that, its content-language lowercase form is looked up among the
 * case-insensitive ones.
 *
 * @param string $text
 * @return string|false The magic word name, or false if nothing matched
 */
public function matchStartToEnd( $text ) {
	$table = $this->getHash();
	$caseSensitive = $table[1];
	if ( isset( $caseSensitive[$text] ) ) {
		return $caseSensitive[$text];
	}
	$folded = $this->factory->getContentLanguage()->lc( $text );
	$caseInsensitive = $table[0];
	return $caseInsensitive[$folded] ?? false;
}
299 
/**
 * Find every magic word of this array in the text and remove it.
 *
 * @param string &$text Text to search; each match is removed from it
 * @return array Associative array, magic word name => parameter value as
 *   returned by parseMatch(), for all items that match
 * @throws Exception If preg_match_all() or preg_replace() reports an error
 */
public function matchAndRemove( &$text ) {
	$found = [];
	foreach ( $this->getRegex() as $regex ) {
		$matches = [];
		$res = preg_match_all( $regex, $text, $matches, PREG_SET_ORDER );
		if ( $res === false ) {
			$this->throwPregError( 'preg_match_all', $regex, $text );
		}
		// With zero matches $matches is empty, so this loop does nothing.
		foreach ( $matches as $m ) {
			[ $name, $param ] = $this->parseMatch( $m );
			$found[$name] = $param;
		}

		$res = preg_replace( $regex, '', $text );
		if ( $res === null ) {
			$this->throwPregError( 'preg_replace', $regex, $text );
		}
		$text = $res;
	}
	return $found;
}

/**
 * Log the last PCRE error to the 'parser' channel and throw it as an exception.
 *
 * @param string $function Name of the preg_* function that failed
 * @param string $regex Regex that was being applied
 * @param string $text Subject text that was being processed
 * @return never
 * @throws Exception Always
 */
private function throwPregError( $function, $regex, $text ) {
	$error = preg_last_error();
	// TODO: Remove function_exists when we require PHP8
	$errorText = function_exists( 'preg_last_error_msg' ) ? preg_last_error_msg() : '';
	LoggerFactory::getInstance( 'parser' )->warning( $function . ' error: {code} {errorText}', [
		'code' => $error,
		'regex' => $regex,
		'text' => $text,
		'errorText' => $errorText
	] );
	throw new Exception( "$function error $error: $errorText" );
}
348 
/**
 * Return the ID of the magic word at the start of $text, and remove
 * the prefix from $text.
 *
 * @param string &$text Text to inspect; the matched prefix is cut off
 * @return string|false The magic word ID from parseMatch(), or false if no
 *   regex matched (in which case $text is left alone)
 */
public function matchStartAndRemove( &$text ) {
	foreach ( $this->getRegexStart() as $pattern ) {
		if ( !preg_match( $pattern, $text, $m ) ) {
			continue;
		}
		$id = $this->parseMatch( $m )[0];
		$prefixLength = strlen( $m[0] );
		// When the match covers the whole text, set the empty string directly
		// rather than calling substr() at the end of the string.
		$text = $prefixLength >= strlen( $text ) ? '' : substr( $text, $prefixLength );
		return $id;
	}
	return false;
}
374 }
375 
// Register the global name 'MagicWordArray' as an alias of the namespaced class
// (presumably so that code using the un-namespaced name keeps working; confirm).
376 class_alias( MagicWordArray::class, 'MagicWordArray' );
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that a deprecated feature was used.
$matches
if(!defined('MW_SETUP_CALLBACK'))
Definition: WebStart.php:88
MediaWiki exception.
Definition: MWException.php:32
PSR-3 logger instance factory.
static getInstance( $channel)
Get a named logger instance from the currently configured logger factory.
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
Class for handling an array of magic words.
__construct( $names=[], MagicWordFactory $factory=null)
addArray( $names)
Add a number of magic words by name.
getRegex()
Get an unanchored regex that does not match parameters.
matchVariableStartToEnd( $text)
Match some text, with parameter capture. Returns an array with the magic word name in the first element and the parameter value in the second; both elements are false if there was no match.
getVariableStartToEndRegex()
Get an anchored regex for matching variables with parameters.
matchAndRemove(&$text)
Returns an associative array, ID => param value, for all items that match. Removes the matched items from the input string (passed by reference).
add( $name)
Add a magic word by name.
matchStartToEnd( $text)
Match some text, without parameter capture. Returns the magic word name, or false if there was no match.
getRegexStart()
Get a regex anchored to the start of the string that does not match parameters.
getVariableRegex()
Get a regex for matching variables with parameters.
getHash()
Get a 2-d hashtable for this array.
parseMatch( $m)
Parse a match array from preg_match. Returns array(magic word ID, parameter value). If there is no parameter value, the second element is false.
getBaseRegex(bool $capture=true, string $delimiter='/')
Get the base regex.
matchStartAndRemove(&$text)
Return the ID of the magic word at the start of $text, and remove the prefix from $text.
A factory that stores information about MagicWords, and creates them on demand with caching.