Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
5.60% |
7 / 125 |
|
30.00% |
3 / 10 |
CRAP | |
0.00% |
0 / 1 |
StringUtils | |
5.60% |
7 / 125 |
|
30.00% |
3 / 10 |
1065.51 | |
0.00% |
0 / 1 |
isUtf8 | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
delimiterExplode | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
72 | |||
hungryDelimiterReplace | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
delimiterReplaceCallback | |
0.00% |
0 / 45 |
|
0.00% |
0 / 1 |
182 | |||
delimiterReplace | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
replaceMarkup | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
isValidPCRERegex | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
escapeRegexReplacement | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
explode | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
unpack | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
20 |
1 | <?php |
2 | |
3 | use MediaWiki\Libs\UnpackFailedException; |
4 | use Wikimedia\Assert\Assert; |
5 | use Wikimedia\AtEase\AtEase; |
6 | |
7 | /** |
8 | * Methods to play with strings. |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License as published by |
12 | * the Free Software Foundation; either version 2 of the License, or |
13 | * (at your option) any later version. |
14 | * |
15 | * This program is distributed in the hope that it will be useful, |
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
18 | * GNU General Public License for more details. |
19 | * |
20 | * You should have received a copy of the GNU General Public License along |
21 | * with this program; if not, write to the Free Software Foundation, Inc., |
22 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
23 | * http://www.gnu.org/copyleft/gpl.html |
24 | * |
25 | * @file |
26 | */ |
27 | |
/**
 * A collection of static methods to play with strings.
 */
class StringUtils {
	/**
	 * Test whether a string is valid UTF-8.
	 *
	 * The function checks for invalid byte sequences and overlong encodings,
	 * but not for different normalisations.
	 *
	 * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
	 * In particular, the pure PHP code path did not in fact check for overlong forms.
	 * Beware of this when backporting code to that version of MediaWiki.
	 *
	 * @since 1.21
	 * @param string $value String to check
	 * @return bool Whether the given $value is a valid UTF-8 encoded string
	 */
	public static function isUtf8( $value ) {
		return mb_check_encoding( (string)$value, 'UTF-8' );
	}

	/**
	 * Explode a string, but ignore any instances of the separator inside
	 * the given start and end delimiters, which may optionally nest.
	 * The delimiters are literal strings, not regular expressions.
	 *
	 * An end delimiter that has no matching start delimiter is treated as
	 * ordinary text: it does not affect how later separators are handled.
	 *
	 * @param string $startDelim Start delimiter
	 * @param string $endDelim End delimiter
	 * @param string $separator Separator string for the explode.
	 * @param string $subject Subject string to explode.
	 * @param bool $nested True iff the delimiters are allowed to nest.
	 * @throws InvalidArgumentException If a delimiter or the separator is an
	 *  empty string and the subject is non-empty
	 * @return ArrayIterator
	 */
	public static function delimiterExplode( $startDelim, $endDelim, $separator,
		$subject, $nested = false ) {
		$inputPos = 0;
		$lastPos = 0;
		$depth = 0;
		$encStart = preg_quote( $startDelim, '!' );
		$encEnd = preg_quote( $endDelim, '!' );
		$encSep = preg_quote( $separator, '!' );
		// The pattern does not change between iterations, so build it once.
		$regex = "!$encStart|$encEnd|$encSep!S";
		$len = strlen( $subject );
		$m = [];
		$exploded = [];
		while (
			$inputPos < $len &&
			preg_match( $regex, $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
		) {
			$match = $m[0][0];
			$matchPos = $m[0][1];
			if ( $match === '' ) {
				// A zero-length match can only come from an empty delimiter or
				// separator. It would never advance $inputPos, so bail out
				// instead of looping forever.
				throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
			}
			$inputPos = $matchPos + strlen( $match );
			if ( $match === $separator ) {
				if ( $depth === 0 ) {
					// Top-level separator: emit the piece since the last split
					$exploded[] = substr(
						$subject, $lastPos, $matchPos - $lastPos
					);
					$lastPos = $inputPos;
				}
			} elseif ( $match === $startDelim ) {
				// Without nesting, a start delimiter inside an already open
				// section does not open another level.
				if ( $depth === 0 || $nested ) {
					$depth++;
				}
			} elseif ( $depth > 0 ) {
				// End delimiter closing an open section. An unmatched end
				// delimiter must not push the depth below zero, otherwise all
				// following top-level separators would be ignored.
				$depth--;
			}
		}
		// Whatever follows the last split point is the final piece
		$exploded[] = substr( $subject, $lastPos );
		// This method could be rewritten in the future to avoid creating an
		// intermediate array, since the return type is just an iterator.
		return new ArrayIterator( $exploded );
	}

	/**
	 * Perform an operation equivalent to `preg_replace()`
	 *
	 * Matches this code:
	 *
	 *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
	 *
	 * ..except that it's worst-case O(N) instead of O(N^2). Compared to delimiterReplace(), this
	 * implementation is fast but memory-hungry and inflexible. The memory requirements are such
	 * that I don't recommend using it on anything but guaranteed small chunks of text.
	 *
	 * The delimiters are literal strings and $replace is inserted verbatim.
	 *
	 * @param string $startDelim
	 * @param string $endDelim
	 * @param string $replace
	 * @param string $subject
	 * @return string
	 */
	public static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
		$segments = explode( $startDelim, $subject );
		// Everything before the first start delimiter is copied as is
		$output = array_shift( $segments );
		foreach ( $segments as $s ) {
			$endDelimPos = strpos( $s, $endDelim );
			if ( $endDelimPos === false ) {
				// Unterminated start delimiter: restore it together with its text
				$output .= $startDelim . $s;
			} else {
				// Drop the delimited content and keep what follows the end delimiter
				$output .= $replace . substr( $s, $endDelimPos + strlen( $endDelim ) );
			}
		}

		return $output;
	}

	/**
	 * Perform an operation equivalent to `preg_replace_callback()`
	 *
	 * Matches this code:
	 *
	 *     preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject );
	 *
	 * If the start delimiter ends with an initial substring of the end delimiter,
	 * e.g. in the case of C-style comments, the behavior differs from the model
	 * regex. In this implementation, the end must share no characters with the
	 * start, so e.g. `/*\/` is not considered to be both the start and end of a
	 * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
	 *
	 * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
	 * but uses far less memory. The delimiters are literal strings, not regular expressions.
	 *
	 * @param string $startDelim Start delimiter
	 * @param string $endDelim End delimiter
	 * @param callable $callback Function to call on each match. It receives an
	 *  array holding the full match (delimiters included) at index 0 and the
	 *  text between the delimiters at index 1, and returns the replacement.
	 * @param string $subject
	 * @param string $flags Regular expression flags
	 * @throws InvalidArgumentException If a match yields neither a start nor an
	 *  end delimiter (e.g. because a delimiter is empty)
	 * @return string
	 */
	private static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
		$subject, $flags = ''
	) {
		$inputPos = 0;
		$outputPos = 0;
		$contentPos = 0;
		$output = '';
		$foundStart = false;
		$encStart = preg_quote( $startDelim, '!' );
		$encEnd = preg_quote( $endDelim, '!' );
		// Honour the case-insensitivity flag when re-checking for an end delimiter
		$strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
		$endLength = strlen( $endDelim );
		// Neither the subject length nor the pattern changes inside the loop
		$subjectLength = strlen( $subject );
		$regex = "!($encStart)|($encEnd)!S$flags";
		$m = [];

		while ( $inputPos < $subjectLength &&
			preg_match( $regex, $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
		) {
			$tokenOffset = $m[0][1];
			if ( $m[1][0] !== '' ) {
				if ( $foundStart &&
					$strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) === 0
				) {
					# An end match is present at the same location
					$tokenType = 'end';
					$tokenLength = $endLength;
				} else {
					$tokenType = 'start';
					$tokenLength = strlen( $m[0][0] );
				}
			} elseif ( isset( $m[2] ) && $m[2][0] !== '' ) {
				$tokenType = 'end';
				$tokenLength = strlen( $m[0][0] );
			} else {
				# Neither group captured any text
				throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
			}

			if ( $tokenType === 'start' ) {
				# Only move the start position if we haven't already found a start
				# This means that START START END matches outer pair
				if ( !$foundStart ) {
					# Found start
					$inputPos = $tokenOffset + $tokenLength;
					# Write out the non-matching section
					$output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
					$outputPos = $tokenOffset;
					$contentPos = $inputPos;
					$foundStart = true;
				} else {
					# Move the input position past the *first character* of START,
					# to protect against missing END when it overlaps with START
					$inputPos = $tokenOffset + 1;
				}
			} else {
				# $tokenType is 'end'
				if ( $foundStart ) {
					# Found match
					$output .= $callback( [
						substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
						substr( $subject, $contentPos, $tokenOffset - $contentPos )
					] );
					$foundStart = false;
				} else {
					# Non-matching end, write it out together with the text before it.
					# While no start is pending, $inputPos equals $outputPos; use
					# $outputPos for both the offset and the length for clarity.
					$output .= substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos );
				}
				$inputPos = $outputPos = $tokenOffset + $tokenLength;
			}
		}
		# Copy the tail, including an unterminated START and its content
		if ( $outputPos < $subjectLength ) {
			$output .= substr( $subject, $outputPos );
		}

		return $output;
	}

	/**
	 * Perform an operation equivalent to `preg_replace()` with flags.
	 *
	 * Matches this code:
	 *
	 *     preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject );
	 *
	 * The delimiters are literal strings, not regular expressions: they are
	 * quoted before being used in a pattern.
	 *
	 * @param string $startDelim Start delimiter
	 * @param string $endDelim End delimiter
	 * @param string $replace Replacement string. May contain $0, which will be
	 *  replaced by the whole match including the delimiters, and $1, which will
	 *  be replaced by the text between the delimiters
	 * @param string $subject String to search
	 * @param string $flags Regular expression flags
	 * @return string The string with the matches replaced
	 */
	public static function delimiterReplace(
		$startDelim, $endDelim, $replace, $subject, $flags = ''
	) {
		return self::delimiterReplaceCallback(
			$startDelim, $endDelim,
			static function ( array $matches ) use ( $replace ) {
				return strtr( $replace, [ '$0' => $matches[0], '$1' => $matches[1] ] );
			},
			$subject, $flags
		);
	}

	/**
	 * More or less "markup-safe" str_replace()
	 * Ignores any instances of the search string inside `<...>`
	 *
	 * NUL bytes are used internally as a placeholder and are therefore
	 * stripped from $text.
	 *
	 * @param string $search
	 * @param string $replace
	 * @param string $text
	 * @return string
	 */
	public static function replaceMarkup( $search, $replace, $text ) {
		$placeholder = "\x00";

		// Remove placeholder instances
		$text = str_replace( $placeholder, '', $text );

		// Replace instances of the search string inside HTML-like tags with the placeholder
		$cleaned = self::delimiterReplaceCallback(
			'<', '>',
			static function ( array $matches ) use ( $search, $placeholder ) {
				return str_replace( $search, $placeholder, $matches[0] );
			},
			$text
		);

		// Do the real replacement, then put the protected search strings back in
		$cleaned = str_replace( $search, $replace, $cleaned );
		$text = str_replace( $placeholder, $search, $cleaned );

		return $text;
	}

	/**
	 * Utility function to check if the given string is a valid PCRE regex. Avoids
	 * manually calling suppressWarnings and restoreWarnings, and provides a
	 * one-line solution without the need to use @.
	 *
	 * @since 1.34
	 * @param string $string The string you want to check being a valid regex
	 * @return bool
	 */
	public static function isValidPCRERegex( $string ) {
		AtEase::suppressWarnings();
		try {
			// @phan-suppress-next-line PhanParamSuspiciousOrder False positive
			$isValid = preg_match( $string, '' );
		} finally {
			// Always undo the suppression, even if preg_match() throws
			AtEase::restoreWarnings();
		}
		return $isValid !== false;
	}

	/**
	 * Escape a string to make it suitable for inclusion in a preg_replace()
	 * replacement parameter.
	 *
	 * @param string $string
	 * @return string
	 */
	public static function escapeRegexReplacement( $string ) {
		// Backslashes first, so the ones added for "$" are not doubled
		$string = str_replace( '\\', '\\\\', $string );
		return str_replace( '$', '\\$', $string );
	}

	/**
	 * Workalike for explode() with limited memory usage.
	 *
	 * Subjects with more than 1000 separators are handed to ExplodeIterator;
	 * anything smaller is split with explode() and wrapped in an ArrayIterator.
	 *
	 * @param string $separator
	 * @param string $subject
	 * @return ArrayIterator|ExplodeIterator
	 */
	public static function explode( $separator, $subject ) {
		if ( substr_count( $subject, $separator ) > 1000 ) {
			return new ExplodeIterator( $separator, $subject );
		} else {
			return new ArrayIterator( explode( $separator, $subject ) );
		}
	}

	/**
	 * Wrapper around php's unpack.
	 *
	 * @param string $format The format string (See php's docs)
	 * @param string $data A binary string of binary data
	 * @param int|false $length The minimum length of $data or false. This is to
	 *  prevent reading beyond the end of $data. false to disable the check.
	 *
	 * Also be careful when using this function to read unsigned 32 bit integer
	 * because php might make it negative.
	 *
	 * @throws UnpackFailedException If $data not long enough, or if unpack fails
	 * @return array Associative array of the extracted data
	 * @since 1.42
	 */
	public static function unpack( string $format, string $data, $length = false ): array {
		Assert::parameterType( [ 'integer', 'false' ], $length, '$length' );
		if ( $length !== false ) {
			$realLen = strlen( $data );
			if ( $realLen < $length ) {
				throw new UnpackFailedException( "Tried to unpack a "
					. "string of length $realLen, but needed one "
					. "of at least length $length."
				);
			}
		}

		AtEase::suppressWarnings();
		try {
			$result = unpack( $format, $data );
		} finally {
			// Always undo the suppression, even if unpack() throws
			AtEase::restoreWarnings();
		}

		if ( $result === false ) {
			// If it cannot extract the packed data.
			throw new UnpackFailedException( "unpack could not unpack binary data" );
		}
		return $result;
	}
}