Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
72.17% |
83 / 115 |
|
20.00% |
3 / 15 |
CRAP | |
0.00% |
0 / 1 |
PHPUtils | |
72.17% |
83 / 115 |
|
20.00% |
3 / 15 |
69.11 | |
0.00% |
0 / 1 |
counterToBase64 | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
jsonEncode | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
jsonDecode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
makeSet | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
lastItem | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
pushArray | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
5 | |||
safeSubstr | |
90.48% |
38 / 42 |
|
0.00% |
0 / 1 |
9.07 | |||
assertValidUTF8 | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
reStrip | |
100.00% |
31 / 31 |
|
100.00% |
1 / 1 |
7 | |||
encodeURIComponent | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
sortArray | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
iterable_to_array | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
unreachable | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
stripPrefix | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
stripSuffix | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Assert\UnreachableException; |
8 | |
9 | /** |
10 | * This file contains Parsoid-independent PHP helper functions. |
11 | * Over time, more functions can be migrated out of various other files here. |
12 | * @module |
13 | */ |
14 | |
15 | class PHPUtils { |
16 | /** |
17 | * Convert a counter to a Base64 encoded string. |
18 | * Padding is stripped. /,+ are replaced with _,- respectively. |
19 | * Warning: Max integer is 2^31 - 1 for bitwise operations. |
20 | * @param int $n |
21 | * @return string |
22 | */ |
23 | public static function counterToBase64( int $n ): string { |
24 | $str = ''; |
25 | do { |
26 | $str = chr( $n & 0xff ) . $str; |
27 | $n >>= 8; |
28 | } while ( $n > 0 ); |
29 | return rtrim( strtr( base64_encode( $str ), '+/', '-_' ), '=' ); |
30 | } |
31 | |
32 | /**
33 | * FIXME: Copied from FormatJson.php in core
34 | *
35 | * Characters problematic in JavaScript.
36 | *
37 | * @note These are listed in ECMA-262 (5.1 Ed.), §7.3 Line Terminators along with U+000A (LF)
38 | * and U+000D (CR). However, PHP already escapes LF and CR according to RFC 4627.
39 | */
40 | private const BAD_CHARS = [
41 | "\u{2028}", // U+2028 LINE SEPARATOR
42 | "\u{2029}", // U+2029 PARAGRAPH SEPARATOR
43 | ];
44 | |
45 | /**
46 | * FIXME: Copied from FormatJson.php in core
47 | *
48 | * Escape sequences for the characters listed in self::BAD_CHARS, in the same order.
49 | */
50 | private const BAD_CHARS_ESCAPED = [
51 | '\u2028', // U+2028 LINE SEPARATOR
52 | '\u2029', // U+2029 PARAGRAPH SEPARATOR
53 | ];
54 | |
55 | /** |
56 | * FIXME: Core has FormatJson::encode that does a more comprehensive job |
57 | * |
58 | * json_encode wrapper function |
59 | * - unscapes slashes and unicode |
60 | * |
61 | * @param mixed $o |
62 | * @return string |
63 | */ |
64 | public static function jsonEncode( $o ): string { |
65 | $str = json_encode( $o, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR ); |
66 | $str = str_replace( self::BAD_CHARS, self::BAD_CHARS_ESCAPED, $str ); |
67 | return $str; |
68 | } |
69 | |
70 | /** |
71 | * FIXME: Core has FormatJson::parse that does a more comprehensive job |
72 | * json_decode wrapper function |
73 | * @param string $str String to decode into the json object |
74 | * @param bool $assoc Controls whether to parse as an an associative array - defaults to true |
75 | * @return mixed |
76 | */ |
77 | public static function jsonDecode( string $str, bool $assoc = true ) { |
78 | return json_decode( $str, $assoc ); |
79 | } |
80 | |
81 | /** |
82 | * Convert array to associative array usable as a read-only Set. |
83 | * |
84 | * @param array $a |
85 | * @return array |
86 | */ |
87 | public static function makeSet( array $a ): array { |
88 | return array_fill_keys( $a, true ); |
89 | } |
90 | |
91 | /** |
92 | * Helper to get last item of the array |
93 | * @param mixed[] $a |
94 | * @return mixed |
95 | */ |
96 | public static function lastItem( array $a ) { |
97 | // Tim Starling recommends not using end() for perf reasons |
98 | // since apparently it can be O(n) where the refcount on the |
99 | // array is > 1. |
100 | // |
101 | // Note that end() is usable in non-array scenarios. But, in our case, |
102 | // we are almost always dealing with arrays, so this helper probably |
103 | // better for cases where we aren't sure the array isn't shared. |
104 | return $a[count( $a ) - 1] ?? null; |
105 | } |
106 | |
107 | /** |
108 | * Append an array to an accumulator using the most efficient method |
109 | * available. Pushing N elements onto $dest is guaranteed to be O(N). |
110 | * |
111 | * See https://w.wiki/3zvE |
112 | * |
113 | * @param array &$dest Destination array |
114 | * @param array ...$sources Arrays to merge |
115 | */ |
116 | public static function pushArray( array &$dest, array ...$sources ): void { |
117 | if ( count( $sources ) === 0 ) { |
118 | return; |
119 | } |
120 | // If the number of elements to be pushed is greater than the size |
121 | // of the destination, then we can just use PHP's native array_merge |
122 | // since the size of $dest is also O(N). |
123 | $sourceCount = array_sum( array_map( fn ( $s ) => count( $s ), $sources ) ); |
124 | if ( count( $dest ) < $sourceCount ) { |
125 | $dest = array_merge( $dest, ...$sources ); |
126 | return; |
127 | } |
128 | // ...otherwise append each item in turn to $dest. |
129 | foreach ( $sources as $source ) { |
130 | foreach ( $source as $item ) { |
131 | $dest[] = $item; |
132 | } |
133 | } |
134 | } |
135 | |
136 | /** |
137 | * Return a substring, asserting that it is valid UTF-8. |
138 | * By default we assume the full string was valid UTF-8, which allows |
139 | * us to look at the first and last bytes to make this check. |
140 | * You can check the entire string if you are feeling paranoid; it |
141 | * will take O(N) time (where N is the length of the substring) but |
142 | * so does the substring operation. |
143 | * |
144 | * If the substring would start beyond the end of the string or |
145 | * end before the start of the string, then this function will |
146 | * return the empty string (as would JavaScript); note that the |
147 | * native `substr` would return `false` in this case. |
148 | * |
149 | * Using this helper instead of native `substr` is |
150 | * useful during the PHP port to verify that we don't break up |
151 | * Unicode codepoints by the switch from JavaScript UCS-2 offsets |
152 | * to PHP UTF-8 byte offsets. |
153 | * |
154 | * @param string $s The (sub)string to check |
155 | * @param int $start The starting offset (in bytes). If negative, the |
156 | * offset is counted from the end of the string. |
157 | * @param ?int $length (optional) The maximum length of the returned |
158 | * string. If negative, the end position is counted from the end of |
159 | * the string. |
160 | * @param bool $checkEntireString Whether to do a slower verification |
161 | * of the entire string, not just the edges. Defaults to false. |
162 | * @return string The checked substring |
163 | */ |
164 | public static function safeSubstr( |
165 | string $s, int $start, ?int $length = null, |
166 | bool $checkEntireString = false |
167 | ): string { |
168 | if ( $length === null ) { |
169 | $ss = substr( $s, $start ); |
170 | } else { |
171 | $ss = substr( $s, $start, $length ); |
172 | } |
173 | if ( $ss === false ) { |
174 | $ss = ''; |
175 | } |
176 | if ( strlen( $ss ) === 0 ) { |
177 | return $ss; |
178 | } |
179 | $firstChar = ord( $ss ); |
180 | Assert::invariant( |
181 | ( $firstChar & 0xC0 ) !== 0x80, |
182 | 'Bad UTF-8 at start of string' |
183 | ); |
184 | $i = 0; |
185 | // This next loop won't step off the front of the string because we've |
186 | // already asserted that the first character is not 10xx xxxx |
187 | do { |
188 | $i--; |
189 | Assert::invariant( |
190 | $i > -5, |
191 | // This should never happen, assuming the original string |
192 | // was valid UTF-8 |
193 | 'Bad UTF-8 at end of string (>4 byte sequence)' |
194 | ); |
195 | $lastChar = ord( $ss[$i] ); |
196 | } while ( ( $lastChar & 0xC0 ) === 0x80 ); |
197 | if ( ( $lastChar & 0x80 ) === 0 ) { |
198 | Assert::invariant( |
199 | // This shouldn't happen, assuming original string was valid |
200 | $i === -1, 'Bad UTF-8 at end of string (1 byte sequence)' |
201 | ); |
202 | } elseif ( ( $lastChar & 0xE0 ) === 0xC0 ) { |
203 | Assert::invariant( |
204 | $i === -2, 'Bad UTF-8 at end of string (2 byte sequence)' |
205 | ); |
206 | } elseif ( ( $lastChar & 0xF0 ) === 0xE0 ) { |
207 | Assert::invariant( |
208 | $i === -3, 'Bad UTF-8 at end of string (3 byte sequence)' |
209 | ); |
210 | } elseif ( ( $lastChar & 0xF8 ) === 0xF0 ) { |
211 | Assert::invariant( |
212 | $i === -4, 'Bad UTF-8 at end of string (4 byte sequence)' |
213 | ); |
214 | } else { |
215 | throw new UnreachableException( |
216 | // This shouldn't happen, assuming original string was valid |
217 | 'Bad UTF-8 at end of string' |
218 | ); |
219 | } |
220 | if ( $checkEntireString ) { |
221 | // We did the head/tail checks first because they give better |
222 | // diagnostics in the common case where we broke UTF-8 by |
223 | // the substring operation. |
224 | self::assertValidUTF8( $ss ); |
225 | } |
226 | return $ss; |
227 | } |
228 | |
229 | /** |
230 | * Helper for verifying a valid UTF-8 encoding. Using |
231 | * safeSubstr() is a more efficient way of doing this check in |
232 | * most places, where you can assume that the original string was |
233 | * valid UTF-8. This function does a complete traversal of the |
234 | * string, in time proportional to the length of the string. |
235 | * |
236 | * @param string $s The string to check. |
237 | */ |
238 | public static function assertValidUTF8( string $s ): void { |
239 | // Slow complete O(N) check for UTF-8 validity |
240 | $r = preg_match( '//u', $s ); |
241 | Assert::invariant( |
242 | $r === 1, |
243 | 'Bad UTF-8 (full string verification)' |
244 | ); |
245 | } |
246 | |
247 | /** |
248 | * Helper for joining pieces of regular expressions together. This |
249 | * safely strips delimiters from regular expression strings, while |
250 | * ensuring that the result is safely escaped for the new delimiter |
251 | * you plan to use (see the `$delimiter` argument to `preg_quote`). |
252 | * Note that using a meta-character for the new delimiter can lead to |
253 | * unexpected results; for example, if you use `!` then escaping |
254 | * `(?!foo)` will break the regular expression. |
255 | * |
256 | * @param string $re The regular expression to strip |
257 | * @param ?string $newDelimiter Optional delimiter which will be |
258 | * used when recomposing this stripped regular expression into a |
259 | * new regular expression. |
260 | * @return string The regular expression without delimiters or flags |
261 | */ |
262 | public static function reStrip( |
263 | string $re, ?string $newDelimiter = null |
264 | ): string { |
265 | static $delimiterPairs = [ |
266 | '(' => ')', |
267 | '[' => ']', |
268 | '{' => '}', |
269 | '<' => '>', |
270 | ]; |
271 | // Believe it or not, PHP allows leading whitespace in the $re |
272 | // tested with C's "isspace", which is [ \f\n\r\t\v] |
273 | $re = preg_replace( '/^[ \f\n\r\t\v]+/', '', $re ); |
274 | Assert::invariant( strlen( $re ) > 0, "empty regexp" ); |
275 | $startDelimiter = $re[0]; |
276 | // PHP actually supports balanced delimiters (ie open paren on left |
277 | // and close paren on right). |
278 | $endDelimiter = $delimiterPairs[$startDelimiter] ?? $startDelimiter; |
279 | $endDelimiterPos = strrpos( $re, $endDelimiter ); |
280 | Assert::invariant( |
281 | $endDelimiterPos !== false && $endDelimiterPos > 0, |
282 | "can't find end delimiter" |
283 | ); |
284 | $flags = substr( $re, $endDelimiterPos + 1 ); |
285 | Assert::invariant( |
286 | preg_match( '/^[imsxADSUXJu \n]*$/D', $flags ) === 1, |
287 | "unexpected flags" |
288 | ); |
289 | $stripped = substr( $re, 1, $endDelimiterPos - 1 ); |
290 | if ( |
291 | $newDelimiter === null || |
292 | $startDelimiter === $newDelimiter || |
293 | $endDelimiter === $newDelimiter |
294 | ) { |
295 | return $stripped; // done! |
296 | } |
297 | $newCloseDelimiter = $delimiterPairs[$startDelimiter] ?? $startDelimiter; |
298 | // escape the new delimiter |
299 | preg_match_all( '/[^\\\\]|\\\\./s', $stripped, $matches ); |
300 | return implode( '', array_map( static function ( $c ) use ( $newDelimiter, $newCloseDelimiter ) { |
301 | return ( $c === $newDelimiter || $c === $newCloseDelimiter ) |
302 | ? ( '\\' . $c ) : $c; |
303 | }, $matches[0] ) ); |
304 | } |
305 | |
306 | /** |
307 | * JS-compatible encodeURIComponent function |
308 | * FIXME: See T221147 (for a post-port update) |
309 | * |
310 | * @param string $str |
311 | * @return string |
312 | */ |
313 | public static function encodeURIComponent( string $str ): string { |
314 | $revert = [ '%21' => '!', '%2A' => '*', '%27' => "'", '%28' => '(', '%29' => ')' ]; |
315 | return strtr( rawurlencode( $str ), $revert ); |
316 | } |
317 | |
318 | /** |
319 | * Sort keys in an array, recursively, for better reproducibility. |
320 | * (This is especially useful before serializing as JSON.) |
321 | * |
322 | * @param mixed &$array |
323 | */ |
324 | public static function sortArray( &$array ): void { |
325 | if ( !is_array( $array ) ) { |
326 | return; |
327 | } |
328 | ksort( $array ); |
329 | foreach ( $array as $k => $v ) { |
330 | self::sortArray( $array[$k] ); |
331 | } |
332 | } |
333 | |
334 | /** |
335 | * Convert an iterable to an array. |
336 | * |
337 | * This function is similar to *but not the same as* the built-in |
338 | * iterator_to_array, because arrays are iterable but not Traversable! |
339 | * |
340 | * This function is also present in the wmde/iterable-functions library, |
341 | * but it's short enough that we don't need to pull in an entire new |
342 | * dependency here. |
343 | * |
344 | * @see https://stackoverflow.com/questions/44587973/php-iterable-to-array-or-traversable |
345 | * @see https://github.com/wmde/iterable-functions/blob/master/src/functions.php |
346 | * |
347 | * @phan-template T |
348 | * @param iterable<T> $iterable |
349 | * @return array<T> |
350 | */ |
351 | public static function iterable_to_array( iterable $iterable ): array { // phpcs:ignore MediaWiki.NamingConventions.LowerCamelFunctionsName.FunctionName,Generic.Files.LineLength.TooLong |
352 | if ( is_array( $iterable ) ) { |
353 | return $iterable; |
354 | } |
355 | '@phan-var \Traversable $iterable'; // @var \Traversable $iterable |
356 | return iterator_to_array( $iterable ); |
357 | } |
358 | |
359 | /** |
360 | * Indicate that the code which calls this function is intended to be |
361 | * unreachable. |
362 | * |
363 | * This is a workaround for T247093; this has been moved upstream |
364 | * into wikimedia/assert. |
365 | * |
366 | * @param string $reason |
367 | * @return never |
368 | * @deprecated Just throw an UnreachableException instead. |
369 | */ |
370 | public static function unreachable( string $reason = "should never happen" ) { |
371 | throw new UnreachableException( $reason ); |
372 | } |
373 | |
374 | /** |
375 | * If a string starts with a given prefix, remove the prefix. Otherwise, |
376 | * return the original string. Like preg_replace( "/^$prefix/", '', $subject ) |
377 | * except about 1.14x faster in the replacement case and 2x faster in |
378 | * the no-op case. |
379 | * |
380 | * Note: adding type declarations to the parameters adds an overhead of 3%. |
381 | * The benchmark above was without type declarations. |
382 | * |
383 | * @param string $subject |
384 | * @param string $prefix |
385 | * @return string |
386 | */ |
387 | public static function stripPrefix( $subject, $prefix ) { |
388 | if ( str_starts_with( $subject, $prefix ) ) { |
389 | return substr( $subject, strlen( $prefix ) ); |
390 | } else { |
391 | return $subject; |
392 | } |
393 | } |
394 | |
395 | /** |
396 | * If a string ends with a given suffix, remove the suffix. Otherwise, |
397 | * return the original string. Like preg_replace( "/$suffix$/", '', $subject ) |
398 | * except faster. |
399 | * |
400 | * @param string $subject |
401 | * @param string $suffix |
402 | * @return string |
403 | */ |
404 | public static function stripSuffix( $subject, $suffix ) { |
405 | if ( str_ends_with( $subject, $suffix ) ) { |
406 | return substr( $subject, 0, -strlen( $suffix ) ); |
407 | } else { |
408 | return $subject; |
409 | } |
410 | } |
411 | |
412 | } |