MediaWiki REL1_40
MagicWordArray.php
Go to the documentation of this file.
1<?php
2
25namespace MediaWiki\Parser;
26
27use Exception;
30use MWException;
31
/** @var string[] Names of the magic words held by this array */
38 public $names = [];
39
/** @var MagicWordFactory Factory used to resolve a name to its magic word object */
41 private $factory;
42
/** @var array[]|null Synonym lookup table cached by getHash(); null until built */
44 private $hash;
45
/** @var string[]|null Result of getBaseRegex() cached for its default arguments; null until built */
47 private $baseRegex;
48
/** @var string[]|null Result of getRegex(), cached; null until built */
50 private $regex;
51
/**
 * @param string[] $names Names of the magic words to start with
 * @param MagicWordFactory|null $factory Factory used to look up magic words by name.
 *   When null, the factory of the global MediaWikiServices instance is used.
 */
public function __construct( $names = [], ?MagicWordFactory $factory = null ) {
	$this->names = $names;
	// The nullable type is spelled out because an implicit "Type $x = null" is
	// deprecated as of PHP 8.4; ?? expresses the intended null check directly.
	$this->factory = $factory ?? MediaWikiServices::getInstance()->getMagicWordFactory();
}
60
/**
 * Add a magic word by name.
 *
 * @param string $name Name of the magic word to append
 */
public function add( $name ) {
	$this->names[] = $name;

	// The name list changed, so every lazily built structure is stale.
	$this->hash = null;
	$this->baseRegex = null;
	$this->regex = null;
}
70
/**
 * Add a number of magic words by name.
 *
 * @param string[] $names Names to append; the keys of this array are ignored
 */
public function addArray( $names ) {
	$appended = array_values( $names );
	$this->names = array_merge( $this->names, $appended );

	// The name list changed, so every lazily built structure is stale.
	$this->hash = null;
	$this->baseRegex = null;
	$this->regex = null;
}
80
/**
 * Get a 2-d hashtable for this array.
 *
 * @return array[] Index 1 maps the synonyms of case-sensitive magic words to the
 *   magic word name. Index 0 does the same for case-insensitive magic words, with
 *   each synonym passed through the content language's lc() first.
 */
public function getHash() {
	if ( $this->hash !== null ) {
		return $this->hash;
	}

	// Build into a local and publish only once complete (as getBaseRegex() does),
	// so an exception thrown while building cannot leave a partial table cached.
	$hash = [ 0 => [], 1 => [] ];
	foreach ( $this->names as $name ) {
		$magic = $this->factory->get( $name );
		$case = intval( $magic->isCaseSensitive() );
		foreach ( $magic->getSynonyms() as $syn ) {
			if ( !$case ) {
				$syn = $this->factory->getContentLanguage()->lc( $syn );
			}
			$hash[$case][$syn] = $name;
		}
	}

	$this->hash = $hash;
	return $hash;
}
101
/**
 * Get the base regex.
 *
 * Builds one alternation of all synonyms per case-sensitivity class. The result
 * is cached only for the default arguments (capturing, '/' delimiter).
 *
 * @param bool $capture Whether to wrap each synonym in a named capture group. The
 *   group name is the synonym index (digits mapped to the letters a-j), an
 *   underscore, and the magic word name.
 * @param string $delimiter Regex delimiter to escape in the synonyms (given to preg_quote)
 * @return string[] Index 0 is the case-insensitive pattern, wrapped in (?i:...);
 *   index 1 is the case-sensitive pattern. Neither has delimiters.
 * @throws MWException If two capture groups would get the same name
 */
public function getBaseRegex( bool $capture = true, string $delimiter = '/' ): array {
	$cacheable = $capture && $delimiter === '/';
	if ( $cacheable && $this->baseRegex !== null ) {
		return $this->baseRegex;
	}
	$regex = [ 0 => [], 1 => [] ];
	$allGroups = [];
	foreach ( $this->names as $name ) {
		$magic = $this->factory->get( $name );
		$case = $magic->isCaseSensitive() ? 1 : 0;
		foreach ( $magic->getSynonyms() as $i => $syn ) {
			if ( $capture ) {
				// Group name must start with a non-digit in PCRE 8.34+
				$it = strtr( $i, '0123456789', 'abcdefghij' );
				$groupName = $it . '_' . $name;
				$group = '(?P<' . $groupName . '>' . preg_quote( $syn, $delimiter ) . ')';
				// look for same group names to avoid same named subpatterns in the regex
				if ( isset( $allGroups[$groupName] ) ) {
					throw new MWException(
						__METHOD__ . ': duplicate internal name in magic word array: ' . $name
					);
				}
				$allGroups[$groupName] = true;
				$regex[$case][] = $group;
			} else {
				$regex[$case][] = preg_quote( $syn, $delimiter );
			}
		}
	}
	'@phan-var array<int,string[]> $regex';
	// Assign by key instead of iterating by reference: a by-reference loop variable
	// that is never unset keeps the last element bound as a reference, and that
	// reference would end up in the cached and returned array.
	foreach ( $regex as $case => $alternatives ) {
		// With no synonyms, use (?!), an empty negative lookahead that never matches.
		$re = count( $alternatives ) ? implode( '|', $alternatives ) : '(?!)';
		if ( !$case ) {
			$re = "(?i:{$re})";
		}
		$regex[$case] = $re;
	}
	'@phan-var array<int,string> $regex';

	if ( $cacheable ) {
		$this->baseRegex = $regex;
	}
	return $regex;
}
154
/**
 * Get an unanchored regex that does not match parameters.
 *
 * @return string[] Complete patterns with delimiters and modifiers: index 0 is
 *   the case-insensitive pattern, index 1 the case-sensitive one
 * @throws MWException If getBaseRegex() finds duplicate capture group names
 */
public function getRegex() {
	if ( $this->regex === null ) {
		// Build into a local and publish only on success. getBaseRegex() can throw,
		// and assigning the cache first would leave an empty array behind that
		// later calls would return as if it were a valid result.
		$regex = [];
		foreach ( $this->getBaseRegex( true, '/' ) as $case => $re ) {
			$regex[$case] = "/{$re}/S";
		}
		// As a performance optimization, turn on unicode mode only for
		// case-insensitive matching.
		$regex[0] .= 'u';
		$this->regex = $regex;
	}
	return $this->regex;
}
173
/**
 * Get a regex for matching variables with parameters.
 *
 * Emits a deprecation warning (deprecated since 1.36) on every call.
 *
 * @return string[] The patterns from getRegex() with each escaped $1 placeholder
 *   replaced by a lazy capture group
 */
public function getVariableRegex() {
	wfDeprecated( __METHOD__, '1.36' );
	$plain = $this->getRegex();
	// str_replace() on an array subject rewrites every entry and keeps the keys.
	return str_replace( "\\$1", "(.*?)", $plain );
}
185
/**
 * Get a regex anchored to the start of the string that does not match parameters.
 *
 * @return string[] Complete patterns with delimiters and modifiers: index 0 is
 *   the case-insensitive pattern, index 1 the case-sensitive one
 */
public function getRegexStart() {
	// array_map() over a single array keeps the keys of the base regex array.
	$newRegex = array_map(
		static function ( $re ) {
			return "/^(?:{$re})/S";
		},
		$this->getBaseRegex( true, '/' )
	);
	// As a performance optimization, turn on unicode mode only for
	// case-insensitive matching.
	$newRegex[0] .= 'u';
	return $newRegex;
}
203
/**
 * Get an anchored regex for matching variables with parameters.
 *
 * @return string[] Patterns anchored at both ends, with each escaped $1
 *   placeholder replaced by a lazy capture group: index 0 is the
 *   case-insensitive pattern, index 1 the case-sensitive one
 */
public function getVariableStartToEndRegex() {
	// array_map() over a single array keeps the keys of the base regex array.
	$anchored = array_map(
		static function ( $re ) {
			return "/^(?:{$re})$/S";
		},
		$this->getBaseRegex( true, '/' )
	);
	// str_replace() on an array subject rewrites every entry and keeps the keys.
	$newRegex = str_replace( "\\$1", "(.*?)", $anchored );
	// As a performance optimization, turn on unicode mode only for
	// case-insensitive matching.
	$newRegex[0] .= 'u';
	return $newRegex;
}
221
/**
 * Get the names of the magic words currently in this array.
 *
 * @return string[]
 */
public function getNames() {
	return $this->names;
}
229
/**
 * Parse a match array from preg_match.
 *
 * Finds the first entry whose key is not 0 and whose value is not the empty
 * string, and takes the magic word name from the part of that key after the
 * first underscore.
 *
 * @param array $m Match array as filled in by preg_match
 * @return array [ magic word name, parameter value ]. The parameter value is the
 *   entry two positions after the matching one, or false if there is none.
 * @throws MWException If the matching key contains no underscore, or if no
 *   entry qualifies
 */
public function parseMatch( $m ) {
	$keys = array_keys( $m );
	$values = array_values( $m );
	$total = count( $keys );

	for ( $pos = 0; $pos < $total; $pos++ ) {
		$key = $keys[$pos];
		if ( $key === 0 || $values[$pos] === '' ) {
			continue;
		}

		$parts = explode( '_', $key, 2 );
		if ( count( $parts ) != 2 ) {
			// This shouldn't happen
			throw new MWException( __METHOD__ . ': bad parameter name' );
		}
		$magicName = $parts[1];

		// Skip the entry right after the named group and read the one after
		// that. NOTE(review): presumably the skipped entry is the numeric
		// duplicate preg_match emits for a named group - confirm.
		$paramPos = $pos + 2;
		$paramValue = $paramPos < $total ? $values[$paramPos] : false;
		return [ $magicName, $paramValue ];
	}

	// This shouldn't happen either
	throw new MWException( __METHOD__ . ': parameter not found' );
}
261
/**
 * Match some text, with parameter capture.
 *
 * @param string $text Text that must match from start to end
 * @return array [ magic word name, parameter value ] as returned by parseMatch()
 *   for the first pattern that matches, or [ false, false ] if none matches
 */
public function matchVariableStartToEnd( $text ) {
	foreach ( $this->getVariableStartToEndRegex() as $pattern ) {
		$groups = [];
		if ( !preg_match( $pattern, $text, $groups ) ) {
			continue;
		}
		return $this->parseMatch( $groups );
	}
	return [ false, false ];
}
282
/**
 * Match some text, without parameter capture.
 *
 * Looks the text up in the table from getHash(): first as given among the
 * case-sensitive synonyms, then after the content language's lc() among the
 * case-insensitive ones.
 *
 * @param string $text Text that must equal a synonym
 * @return string|false The magic word name, or false if no synonym matches
 */
public function matchStartToEnd( $text ) {
	$table = $this->getHash();

	$exact = $table[1][$text] ?? null;
	if ( $exact !== null ) {
		return $exact;
	}

	$folded = $this->factory->getContentLanguage()->lc( $text );
	return $table[0][$folded] ?? false;
}
299
/**
 * Collect every magic word occurring in $text and strip the occurrences from it.
 *
 * Each pattern from getRegex() is applied in turn. Failures of the preg
 * functions are logged to the 'parser' log channel before being handled.
 *
 * @param string &$text Text to search; the matched parts are removed from it
 * @return array Magic word name => parameter value, for all items that match
 * @throws Exception If preg_match_all fails with anything other than a bad
 *   UTF-8 error, or if preg_replace fails
 */
public function matchAndRemove( &$text ) {
	$found = [];
	foreach ( $this->getRegex() as $pattern ) {
		$sets = [];
		$count = preg_match_all( $pattern, $text, $sets, PREG_SET_ORDER );

		if ( $count === false ) {
			$error = preg_last_error();
			// TODO: Remove function_exists when we require PHP8
			$errorText = function_exists( 'preg_last_error_msg' ) ? preg_last_error_msg() : '';
			LoggerFactory::getInstance( 'parser' )->warning( 'preg_match_all error: {code} {errorText}', [
				'code' => $error,
				'regex' => $pattern,
				'text' => $text,
				'errorText' => $errorText
			] );
			// T321234: Don't try to fix old revisions with broken UTF-8, just return as is
			if ( $error === PREG_BAD_UTF8_ERROR ) {
				continue;
			}
			throw new Exception( "preg_match_all error $error: $errorText" );
		}

		if ( $count ) {
			foreach ( $sets as $set ) {
				[ $name, $param ] = $this->parseMatch( $set );
				// A later occurrence of the same magic word overwrites an earlier one.
				$found[$name] = $param;
			}
		}

		$stripped = preg_replace( $pattern, '', $text );
		if ( $stripped === null ) {
			$error = preg_last_error();
			// TODO: Remove function_exists when we require PHP8
			$errorText = function_exists( 'preg_last_error_msg' ) ? preg_last_error_msg() : '';
			LoggerFactory::getInstance( 'parser' )->warning( 'preg_replace error: {code} {errorText}', [
				'code' => $error,
				'regex' => $pattern,
				'text' => $text,
				'errorText' => $errorText
			] );
			throw new Exception( "preg_replace error $error: $errorText" );
		}
		$text = $stripped;
	}
	return $found;
}
352
/**
 * Return the ID of the magic word at the start of $text, and remove the prefix
 * from $text.
 *
 * @param string &$text Text to inspect; on a match the matched prefix is cut off
 * @return string|false The magic word name, or false if no pattern matches at
 *   the start of $text
 */
public function matchStartAndRemove( &$text ) {
	foreach ( $this->getRegexStart() as $pattern ) {
		if ( !preg_match( $pattern, $text, $m ) ) {
			continue;
		}
		[ $id, ] = $this->parseMatch( $m );

		$prefixLength = strlen( $m[0] );
		// When the match covers the whole text, set '' explicitly instead of
		// relying on what substr() returns at the end of the string.
		$text = $prefixLength >= strlen( $text ) ? '' : substr( $text, $prefixLength );
		return $id;
	}
	return false;
}
378}
379
// Register 'MagicWordArray' in the global namespace as an alias of this namespaced class.
380class_alias( MagicWordArray::class, 'MagicWordArray' );
wfDeprecated( $function, $version=false, $component=false, $callerOffset=2)
Logs a warning that a deprecated feature was used.
if(!defined('MW_SETUP_CALLBACK'))
The persistent session ID (if any) loaded at startup.
Definition WebStart.php:88
MediaWiki exception.
PSR-3 logger instance factory.
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
Class for handling an array of magic words.
getRegexStart()
Get a regex anchored to the start of the string that does not match parameters.
matchVariableStartToEnd( $text)
Match some text, with parameter capture. Returns an array with the magic word name in the first element and the parameter in the second, or [ false, false ] if nothing matches.
matchAndRemove(&$text)
Returns an associative array, ID => param value, for all items that match. Removes the matched items from $text.
matchStartAndRemove(&$text)
Return the ID of the magic word at the start of $text, and remove the prefix from $text.
getBaseRegex(bool $capture=true, string $delimiter='/')
Get the base regex.
__construct( $names=[], MagicWordFactory $factory=null)
matchStartToEnd( $text)
Match some text, without parameter capture. Returns the magic word name, or false if there was no capture.
getVariableRegex()
Get a regex for matching variables with parameters.
parseMatch( $m)
Parse a match array from preg_match. Returns array(magic word ID, parameter value). If there is no parameter value, that element will be false.
addArray( $names)
Add a number of magic words by name.
add( $name)
Add a magic word by name.
getRegex()
Get an unanchored regex that does not match parameters.
getVariableStartToEndRegex()
Get an anchored regex for matching variables with parameters.
getHash()
Get a 2-d hashtable for this array.
A factory that stores information about MagicWords, and creates them on demand with caching.