Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
97.14% |
102 / 105 |
|
92.31% |
12 / 13 |
CRAP | |
0.00% |
0 / 1 |
MagicWordArray | |
98.08% |
102 / 104 |
|
92.31% |
12 / 13 |
47 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
add | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getHash | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
5 | |||
getBaseRegex | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
13 | |||
getRegex | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
3 | |||
getRegexStart | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
getVariableStartToEndRegex | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
getNames | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
parseMatch | |
80.00% |
8 / 10 |
|
0.00% |
0 / 1 |
6.29 | |||
matchVariableStartToEnd | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
matchStartToEnd | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
matchAndRemove | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
3 | |||
matchStartAndRemove | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
4 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | * @file |
19 | */ |
20 | |
21 | namespace MediaWiki\Parser; |
22 | |
23 | use LogicException; |
24 | use MediaWiki\Logger\LoggerFactory; |
25 | use MediaWiki\MediaWikiServices; |
26 | |
27 | /** |
28 | * Class for handling an array of magic words |
29 | * |
30 | * See docs/magicword.md. |
31 | * |
32 | * @since 1.11 |
33 | * @ingroup Parser |
34 | */ |
class MagicWordArray {

	/** @var string[] Names of the magic words held by this array */
	public $names = [];

	/** Factory used to look up magic words by name and to fetch the content language */
	private MagicWordFactory $factory;

	/** @var array<int,array<string,string>>|null Result cache for getHash(); null until built */
	private $hash;

	/** @var string[]|null Result cache for getBaseRegex() with its default arguments; null until built */
	private $baseRegex;

	/** @var string[]|null Result cache for getRegex(); null until built */
	private $regex;

	/**
	 * @param string[] $names Magic word names to start with
	 * @param MagicWordFactory|null $factory Factory to use. When null, the factory
	 *   is taken from MediaWikiServices::getInstance().
	 */
	public function __construct( $names = [], ?MagicWordFactory $factory = null ) {
		$this->names = $names;
		$this->factory = $factory ?? MediaWikiServices::getInstance()->getMagicWordFactory();
	}

	/**
	 * Add a magic word by name
	 *
	 * All cached lookup structures are dropped so that they get rebuilt
	 * with the new name on next use.
	 *
	 * @param string $name
	 */
	public function add( $name ): void {
		$this->names[] = $name;
		$this->hash = null;
		$this->baseRegex = null;
		$this->regex = null;
	}

	/**
	 * Get a 2-d hashtable for this array
	 *
	 * The first level is keyed by case sensitivity (0 = case-insensitive,
	 * 1 = case-sensitive), the second level maps a synonym to the magic word
	 * name. Case-insensitive synonyms are stored lower-cased using the content
	 * language.
	 *
	 * @return array<int,array<string,string>>
	 */
	public function getHash(): array {
		if ( $this->hash !== null ) {
			return $this->hash;
		}
		// Build into a local variable and publish it only once complete, so an
		// exception thrown by the factory cannot leave a partial table cached.
		$hash = [ 0 => [], 1 => [] ];
		foreach ( $this->names as $name ) {
			$magic = $this->factory->get( $name );
			$case = intval( $magic->isCaseSensitive() );
			foreach ( $magic->getSynonyms() as $syn ) {
				if ( !$case ) {
					$syn = $this->factory->getContentLanguage()->lc( $syn );
				}
				$hash[$case][$syn] = $name;
			}
		}
		$this->hash = $hash;
		return $hash;
	}

	/**
	 * Get the base regex
	 *
	 * The result holds one alternation per case sensitivity: index 0 is the
	 * case-insensitive one (wrapped in "(?i:...)"), index 1 the case-sensitive
	 * one. Neither has delimiters or anchors yet.
	 *
	 * @internal For use in {@see Parser} only
	 * @param bool $capture Set to false to suppress the capture groups,
	 *  which can cause unexpected conflicts when this regexp is embedded in
	 *  other regexps with similar constructs.
	 * @param string $delimiter The delimiter which will be used for the
	 *  eventual regexp.
	 * @return array<int,string>
	 */
	public function getBaseRegex( bool $capture = true, string $delimiter = '/' ): array {
		// Only the default flavour is cached
		$cacheable = $capture && $delimiter === '/';
		if ( $cacheable && $this->baseRegex !== null ) {
			return $this->baseRegex;
		}

		/** @var array<int,string[]> $alternatives */
		$alternatives = [ 0 => [], 1 => [] ];
		foreach ( $this->names as $name ) {
			$magic = $this->factory->get( $name );
			$case = $magic->isCaseSensitive() ? 1 : 0;
			foreach ( $magic->getSynonyms() as $i => $syn ) {
				$pattern = preg_quote( $syn, $delimiter );
				if ( $capture ) {
					// Group name must start with a non-digit in PCRE 8.34+
					$prefix = strtr( (string)$i, '0123456789', 'abcdefghij' );
					$pattern = '(?P<' . $prefix . '_' . $name . '>' . $pattern . ')';
				}
				$alternatives[$case][] = $pattern;
			}
		}

		/** @var array<int,string> $regex */
		$regex = [];
		foreach ( $alternatives as $case => $parts ) {
			// "(?!)" is an empty negative lookahead, i.e. a pattern that never matches
			$re = $parts ? implode( '|', $parts ) : '(?!)';
			$regex[$case] = $case ? $re : "(?i:{$re})";
		}

		if ( $cacheable ) {
			$this->baseRegex = $regex;
		}
		return $regex;
	}

	/**
	 * Wrap both base regexes (with capture groups, "/" delimiter) between the
	 * given opening and closing fragments.
	 *
	 * @param string $open Delimiter plus anything that goes before the alternation
	 * @param string $close Anything that goes after the alternation, plus delimiter and modifiers
	 * @return array<int,string>
	 */
	private function wrapBaseRegex( string $open, string $close ): array {
		$wrapped = [];
		foreach ( $this->getBaseRegex( true, '/' ) as $case => $re ) {
			$wrapped[$case] = $open . $re . $close;
		}
		// As a performance optimization, turn on unicode mode only for
		// case-insensitive matching.
		$wrapped[0] .= 'u';
		return $wrapped;
	}

	/**
	 * Get an unanchored regex that does not match parameters
	 *
	 * @return array<int,string>
	 */
	private function getRegex(): array {
		// Assigned only after a successful build, so a failure in the factory
		// is not remembered as an empty pattern list.
		$this->regex ??= $this->wrapBaseRegex( '/', '/JS' );
		return $this->regex;
	}

	/**
	 * Get a regex anchored to the start of the string that does not match parameters
	 *
	 * @return array<int,string>
	 */
	private function getRegexStart(): array {
		return $this->wrapBaseRegex( '/^(?:', ')/JS' );
	}

	/**
	 * Get an anchored regex for matching variables with parameters
	 *
	 * Every quoted "$1" placeholder inside a synonym becomes a lazy capture
	 * group for the parameter value.
	 *
	 * @return array<int,string>
	 */
	private function getVariableStartToEndRegex(): array {
		return str_replace( '\$1', '(.*?)', $this->wrapBaseRegex( '/^(?:', ')$/JS' ) );
	}

	/**
	 * @since 1.20
	 * @return string[]
	 */
	public function getNames() {
		return $this->names;
	}

	/**
	 * Parse a match array from preg_match
	 *
	 * @param array<string|int,string> $matches
	 * @return array{0:string,1:string|false} Pair of (magic word ID, parameter value),
	 *  where the latter is instead false if there is no parameter value.
	 * @throws LogicException If no named group matched, or a matching group
	 *  has a name without the "<prefix>_" part
	 */
	private function parseMatch( array $matches ): array {
		$magicName = null;
		foreach ( $matches as $key => $match ) {
			if ( $magicName !== null ) {
				// The structure we found at this point is [ …,
				//   'a_magicWordName' => 'matchedSynonym',
				//   n                 => 'matchedSynonym (again)',
				//   n + 1             => 'parameterValue',
				// … ]
				return [ $magicName, $matches[$key + 1] ?? false ];
			}
			// Skip the initial full match and any non-matching group
			if ( $match === '' || $key === 0 ) {
				continue;
			}
			// Group names look like "<prefix>_<magic word name>", see getBaseRegex()
			$parts = explode( '_', (string)$key, 2 );
			if ( !isset( $parts[1] ) ) {
				throw new LogicException( 'Unexpected group name' );
			}
			$magicName = $parts[1];
		}
		throw new LogicException( 'Unexpected $m array with no match' );
	}

	/**
	 * Match some text, with parameter capture
	 *
	 * The whole text has to match a synonym; the case-insensitive pattern is
	 * tried first.
	 *
	 * @param string $text
	 * @return (string|false)[] Magic word name in the first element and the parameter in the second
	 *  element. Both elements are false if there was no match.
	 */
	public function matchVariableStartToEnd( $text ): array {
		foreach ( $this->getVariableStartToEndRegex() as $regex ) {
			$m = [];
			if ( preg_match( $regex, $text, $m ) ) {
				return $this->parseMatch( $m );
			}
		}
		return [ false, false ];
	}

	/**
	 * Match some text, without parameter capture
	 *
	 * An exact case-sensitive synonym wins; otherwise the text is lower-cased
	 * with the content language and looked up among the case-insensitive ones.
	 *
	 * @see MagicWord::matchStartToEnd
	 * @param string $text
	 * @return string|false The magic word name, or false if there was no capture
	 */
	public function matchStartToEnd( $text ) {
		$hash = $this->getHash();
		if ( isset( $hash[1][$text] ) ) {
			return $hash[1][$text];
		}
		$lowered = $this->factory->getContentLanguage()->lc( $text );
		return $hash[0][$lowered] ?? false;
	}

	/**
	 * Return an associative array for all items that match.
	 *
	 * Cannot be used for magic words with parameters.
	 * Removes the matched items from the input string (passed by reference)
	 *
	 * @see MagicWord::matchAndRemove
	 * @param string &$text Left untouched when the replacement fails
	 * @return array<string,false> Keyed by magic word ID
	 * @throws LogicException On any PCRE failure other than invalid UTF-8
	 */
	public function matchAndRemove( &$text ): array {
		$found = [];
		$regexes = $this->getRegex();
		$res = preg_replace_callback( $regexes, function ( $m ) use ( &$found ) {
			[ $name, $param ] = $this->parseMatch( $m );
			$found[$name] = $param;
			return '';
		}, $text );

		if ( $res !== null ) {
			$text = $res;
			return $found;
		}

		// NOTE(review): the messages below say "preg_match_all" although the failing
		// call is preg_replace_callback(); the wording is kept as it was.
		$error = preg_last_error();
		$errorText = preg_last_error_msg();
		LoggerFactory::getInstance( 'parser' )->warning( 'preg_match_all error: {code} {errorText}', [
			'code' => $error,
			'regex' => $regexes,
			'text' => $text,
			'errorText' => $errorText
		] );
		// T321234: Don't try to fix old revisions with broken UTF-8, just return $text as is
		if ( $error !== PREG_BAD_UTF8_ERROR ) {
			throw new LogicException( "preg_match_all error $error: $errorText" );
		}
		return $found;
	}

	/**
	 * Return the ID of the magic word at the start of $text, and remove
	 * the prefix from $text.
	 *
	 * Does not match parameters.
	 *
	 * @see MagicWord::matchStartAndRemove
	 * @param string &$text Unmodified if no match is found.
	 * @return string|false False if no match is found.
	 */
	public function matchStartAndRemove( &$text ) {
		foreach ( $this->getRegexStart() as $regex ) {
			$m = [];
			if ( !preg_match( $regex, $text, $m ) ) {
				continue;
			}
			[ $id, ] = $this->parseMatch( $m );
			// Byte lengths on purpose: $m[0] is a byte-wise prefix of $text
			$matchedLength = strlen( $m[0] );
			$text = $matchedLength >= strlen( $text ) ? '' : substr( $text, $matchedLength );
			return $id;
		}
		return false;
	}
}
325 | |
/**
 * Register the un-namespaced name "MagicWordArray" as an alias of this class.
 *
 * @deprecated class alias since 1.40
 */
\class_alias( MagicWordArray::class, 'MagicWordArray' );