Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
67.27% |
74 / 110 |
|
13.33% |
2 / 15 |
CRAP | |
0.00% |
0 / 1 |
PHPUtils | |
67.27% |
74 / 110 |
|
13.33% |
2 / 15 |
81.43 | |
0.00% |
0 / 1 |
counterToBase64 | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
jsonEncode | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
jsonDecode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
makeSet | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
lastItem | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
pushArray | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
safeSubstr | |
90.48% |
38 / 42 |
|
0.00% |
0 / 1 |
9.07 | |||
assertValidUTF8 | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
reStrip | |
100.00% |
31 / 31 |
|
100.00% |
1 / 1 |
7 | |||
encodeURIComponent | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
sortArray | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
iterable_to_array | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
unreachable | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
stripPrefix | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
stripSuffix | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Assert\UnreachableException; |
8 | |
9 | /** |
10 | * This file contains Parsoid-independent PHP helper functions. |
11 | * Over time, more functions can be migrated out of various other files here. |
12 | * @module |
13 | */ |
14 | |
15 | class PHPUtils { |
/**
 * Encode a counter as a compact, URL-safe Base64 token.
 *
 * The counter is written out as big-endian bytes (always at least one
 * byte, so 0 becomes a single NUL byte) and Base64-encoded. The result
 * is then made URL-safe: '+' becomes '-', '/' becomes '_', and the
 * trailing '=' padding is removed.
 *
 * The byte loop only continues while the shifted value is positive, so
 * for a negative $n just the lowest byte is encoded.
 *
 * @param int $n Counter value, expected to be non-negative
 * @return string
 */
public static function counterToBase64( int $n ): string {
	// The lowest byte is always emitted, even when $n is 0.
	$bytes = chr( $n & 0xff );
	// Prepend the remaining bytes, most significant byte ending up first.
	for ( $rest = $n >> 8; $rest > 0; $rest >>= 8 ) {
		$bytes = chr( $rest & 0xff ) . $bytes;
	}
	$urlSafe = strtr( base64_encode( $bytes ), [ '+' => '-', '/' => '_' ] );
	return rtrim( $urlSafe, '=' );
}
31 | |
32 | /** |
33 | * FIXME: Copied from FormatJson.php in core |
34 | * |
35 | * Characters problematic in JavaScript. |
36 | * |
37 | * @note These are listed in ECMA-262 (5.1 Ed.), §7.3 Line Terminators along with U+000A (LF)
38 | * and U+000D (CR). However, PHP already escapes LF and CR according to RFC 4627. |
39 | */ |
40 | private const BAD_CHARS = [ |
41 | "\u{2028}", // U+2028 LINE SEPARATOR |
42 | "\u{2029}", // U+2029 PARAGRAPH SEPARATOR |
43 | ]; |
44 | |
45 | /** |
46 | * FIXME: Copied from FormatJson.php in core |
47 | * |
48 | * Escape sequences for characters listed in self::BAD_CHARS.
49 | */ |
50 | private const BAD_CHARS_ESCAPED = [ |
51 | '\u2028', // U+2028 LINE SEPARATOR |
52 | '\u2029', // U+2029 PARAGRAPH SEPARATOR |
53 | ]; |
54 | |
/**
 * Serialise a value as JSON, leaving slashes and non-ASCII characters
 * unescaped.
 *
 * The characters in self::BAD_CHARS (U+2028 and U+2029) are the one
 * exception: after encoding they are rewritten to the matching escape
 * sequences from self::BAD_CHARS_ESCAPED.
 *
 * FIXME: Core has FormatJson::encode that does a more comprehensive job
 *
 * @param mixed $o Value to encode
 * @return string
 * @throws \JsonException If the value cannot be encoded
 *  (JSON_THROW_ON_ERROR is set)
 */
public static function jsonEncode( $o ): string {
	$flags = JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR;
	$json = json_encode( $o, $flags );
	return str_replace( self::BAD_CHARS, self::BAD_CHARS_ESCAPED, $json );
}
69 | |
/**
 * Thin wrapper around json_decode().
 *
 * No error flags are passed, so the result is exactly what json_decode()
 * returns for the given input.
 *
 * FIXME: Core has FormatJson::parse that does a more comprehensive job
 *
 * @param string $str JSON text to decode
 * @param bool $assoc Whether JSON objects are decoded as associative
 *  arrays (true, the default) rather than as objects
 * @return mixed
 */
public static function jsonDecode( string $str, bool $assoc = true ) {
	$decoded = json_decode( $str, $assoc );
	return $decoded;
}
80 | |
/**
 * Build a lookup table from a list of values: every value of the input
 * becomes a key of the result, mapped to `true`. The result is meant
 * to be used as a read-only Set (membership test via isset()).
 *
 * @param array $a Values to use as set members
 * @return array Map of member => true
 */
public static function makeSet( array $a ): array {
	$set = array_fill_keys( $a, true );
	return $set;
}
90 | |
/**
 * Return the final element of a list-style array, or null when there is
 * no element at index `count - 1` (which includes the empty array).
 *
 * @param mixed[] $a
 * @return mixed
 */
public static function lastItem( array $a ) {
	// end() is deliberately avoided: per Tim Starling it can be O(n)
	// when the refcount on the array is > 1. Plain index access is a
	// better fit here, because we are almost always dealing with real
	// arrays and cannot be sure that the array isn't shared.
	$lastIndex = count( $a ) - 1;
	return $a[$lastIndex] ?? null;
}
106 | |
/**
 * Append the items of $source to the accumulator $dest, choosing the
 * cheaper strategy so that repeated accumulation stays O(n).
 *
 * See https://w.wiki/3zvE
 *
 * @param array &$dest Destination array, modified in place
 * @param array $source Array whose items are appended
 */
public static function pushArray( array &$dest, array $source ): void {
	if ( count( $dest ) >= count( $source ) ) {
		// The accumulator is at least as large as the addition:
		// push item by item rather than building a merged copy.
		foreach ( $source as $item ) {
			$dest[] = $item;
		}
		return;
	}
	// The accumulator is the smaller side, so one merge is used instead.
	$dest = array_merge( $dest, $source );
}
125 | |
/**
 * Byte-offset substring that asserts the result is still valid UTF-8.
 *
 * By default the full string is assumed to have been valid UTF-8, so
 * only the edges of the result are inspected: the first byte must not
 * be a continuation byte, and the final character must be complete
 * (its lead byte must announce exactly as many bytes as are present).
 * Pass $checkEntireString to additionally validate every byte; that
 * takes O(N) time in the length of the substring, but so does the
 * substring operation itself.
 *
 * If the substring would start beyond the end of the string or end
 * before its start, the empty string is returned (as JavaScript
 * would); native `substr` may return `false` in that situation.
 *
 * Using this helper instead of native `substr` is useful to verify
 * that we don't break up Unicode codepoints by the switch from
 * JavaScript UCS-2 offsets to PHP UTF-8 byte offsets.
 *
 * @param string $s The string to take the substring from
 * @param int $start The starting offset (in bytes). If negative, the
 *   offset is counted from the end of the string.
 * @param ?int $length (optional) The maximum length of the returned
 *   string. If negative, the end position is counted from the end of
 *   the string.
 * @param bool $checkEntireString Whether to do a slower verification
 *   of the entire substring, not just the edges. Defaults to false.
 * @return string The checked substring
 */
public static function safeSubstr(
	string $s, int $start, ?int $length = null,
	bool $checkEntireString = false
): string {
	// Lead-byte forms: [ mask, expected bits, offset (from the end) at
	// which a lead byte of that form must sit, failure message ].
	static $leadByteForms = [
		[ 0x80, 0x00, -1, 'Bad UTF-8 at end of string (1 byte sequence)' ],
		[ 0xE0, 0xC0, -2, 'Bad UTF-8 at end of string (2 byte sequence)' ],
		[ 0xF0, 0xE0, -3, 'Bad UTF-8 at end of string (3 byte sequence)' ],
		[ 0xF8, 0xF0, -4, 'Bad UTF-8 at end of string (4 byte sequence)' ],
	];

	$piece = $length === null
		? substr( $s, $start )
		: substr( $s, $start, $length );
	if ( $piece === false || $piece === '' ) {
		// Nothing to verify in an empty result.
		return '';
	}

	// Head check: the result must not begin with a continuation byte
	// (10xx xxxx).
	Assert::invariant(
		( ord( $piece[0] ) & 0xC0 ) !== 0x80,
		'Bad UTF-8 at start of string'
	);

	// Tail check: walk backwards over continuation bytes until the lead
	// byte of the final character is found. This cannot step off the
	// front of the string, because the first byte was just shown not to
	// be a continuation byte.
	$offset = -1;
	$tail = ord( $piece[$offset] );
	while ( ( $tail & 0xC0 ) === 0x80 ) {
		$offset--;
		Assert::invariant(
			$offset > -5,
			// This should never happen, assuming the original string
			// was valid UTF-8
			'Bad UTF-8 at end of string (>4 byte sequence)'
		);
		$tail = ord( $piece[$offset] );
	}

	// The lead byte must sit exactly where its announced length says.
	$recognised = false;
	foreach ( $leadByteForms as [ $mask, $bits, $expectedOffset, $message ] ) {
		if ( ( $tail & $mask ) === $bits ) {
			Assert::invariant( $offset === $expectedOffset, $message );
			$recognised = true;
			break;
		}
	}
	if ( !$recognised ) {
		throw new UnreachableException(
			// This shouldn't happen, assuming original string was valid
			'Bad UTF-8 at end of string'
		);
	}

	if ( $checkEntireString ) {
		// The head/tail checks run first because they give better
		// diagnostics in the common case where the substring operation
		// itself is what broke the UTF-8.
		self::assertValidUTF8( $piece );
	}
	return $piece;
}
218 | |
/**
 * Assert that an entire string is valid UTF-8.
 *
 * This traverses the complete string, so it takes time proportional to
 * its length. Where the original string can be assumed to be valid
 * UTF-8, safeSubstr() is a more efficient way of doing this check.
 *
 * @param string $s The string to check.
 */
public static function assertValidUTF8( string $s ): void {
	// Matching the empty pattern in /u mode succeeds (returns 1) only
	// when the whole subject is well-formed UTF-8.
	$isValid = ( preg_match( '//u', $s ) === 1 );
	Assert::invariant( $isValid, 'Bad UTF-8 (full string verification)' );
}
236 | |
/**
 * Helper for joining pieces of regular expressions together. This
 * strips the delimiters and trailing flags from a regular expression
 * string, and ensures that the result is safely escaped for the new
 * delimiter you plan to use (see the `$delimiter` argument to
 * `preg_quote`).
 *
 * Leading whitespace is ignored and bracket-style delimiter pairs
 * (`()`, `[]`, `{}`, `<>`) are recognised, both for the delimiter being
 * stripped and for the new one: when `$newDelimiter` is an opening
 * bracket, its closing counterpart is escaped as well.
 *
 * Note that using a meta-character for the new delimiter can lead to
 * unexpected results; for example, if you use `!` then escaping
 * `(?!foo)` will break the regular expression.
 *
 * @param string $re The regular expression to strip
 * @param ?string $newDelimiter Optional delimiter which will be
 *   used when recomposing this stripped regular expression into a
 *   new regular expression.
 * @return string The regular expression without delimiters or flags
 */
public static function reStrip(
	string $re, ?string $newDelimiter = null
): string {
	static $delimiterPairs = [
		'(' => ')',
		'[' => ']',
		'{' => '}',
		'<' => '>',
	];
	// Believe it or not, PHP allows leading whitespace in the $re
	// tested with C's "isspace", which is [ \f\n\r\t\v]
	$re = preg_replace( '/^[ \f\n\r\t\v]+/', '', $re );
	Assert::invariant( strlen( $re ) > 0, "empty regexp" );
	$startDelimiter = $re[0];
	// PHP actually supports balanced delimiters (ie open paren on left
	// and close paren on right).
	$endDelimiter = $delimiterPairs[$startDelimiter] ?? $startDelimiter;
	$endDelimiterPos = strrpos( $re, $endDelimiter );
	Assert::invariant(
		$endDelimiterPos !== false && $endDelimiterPos > 0,
		"can't find end delimiter"
	);
	// Everything after the closing delimiter must be pattern modifiers.
	$flags = substr( $re, $endDelimiterPos + 1 );
	Assert::invariant(
		preg_match( '/^[imsxADSUXJu \n]*$/D', $flags ) === 1,
		"unexpected flags"
	);
	$stripped = substr( $re, 1, $endDelimiterPos - 1 );
	if (
		$newDelimiter === null ||
		$startDelimiter === $newDelimiter ||
		$endDelimiter === $newDelimiter
	) {
		// The pattern is already safe for this delimiter.
		return $stripped; // done!
	}
	// The closing counterpart must be looked up from the *new*
	// delimiter. Deriving it from the old start delimiter would escape
	// the old closing delimiter (corrupting e.g. groups in a pattern
	// that used to be delimited by parentheses) and would leave the new
	// closing bracket unescaped.
	$newCloseDelimiter = $delimiterPairs[$newDelimiter] ?? $newDelimiter;
	// Split the pattern into single characters and backslash escape
	// pairs, so that only unescaped occurrences get a backslash added.
	preg_match_all( '/[^\\\\]|\\\\./s', $stripped, $matches );
	return implode( '', array_map( static function ( $c ) use ( $newDelimiter, $newCloseDelimiter ) {
		return ( $c === $newDelimiter || $c === $newCloseDelimiter )
			? ( '\\' . $c ) : $c;
	}, $matches[0] ) );
}
295 | |
/**
 * JS-compatible encodeURIComponent function.
 *
 * The string is percent-encoded with rawurlencode(), and then the
 * escapes for `!`, `*`, `'`, `(` and `)` are turned back into the
 * literal characters.
 *
 * FIXME: See T221147 (for a post-port update)
 *
 * @param string $str
 * @return string
 */
public static function encodeURIComponent( string $str ): string {
	// Escape sequences produced by rawurlencode() that are reverted.
	static $keptLiteral = [
		'%21' => '!',
		'%2A' => '*',
		'%27' => "'",
		'%28' => '(',
		'%29' => ')',
	];
	$encoded = rawurlencode( $str );
	return strtr( $encoded, $keptLiteral );
}
307 | |
/**
 * Recursively sort an array by key, in place, for better
 * reproducibility. (This is especially useful before serializing as
 * JSON.) Values that are not arrays are left untouched.
 *
 * @param mixed &$array Value to sort in place
 */
public static function sortArray( &$array ): void {
	if ( is_array( $array ) ) {
		ksort( $array );
		// Descend by key so that each nested value is sorted in place
		// inside $array rather than on a copy.
		foreach ( array_keys( $array ) as $key ) {
			self::sortArray( $array[$key] );
		}
	}
}
323 | |
/**
 * Turn any iterable into an array.
 *
 * Arrays are returned as they are; anything else is handed to the
 * built-in iterator_to_array. The built-in cannot be used on its own,
 * because arrays are iterable but not Traversable!
 *
 * This function is also present in the wmde/iterable-functions library,
 * but it's short enough that we don't need to pull in an entire new
 * dependency here.
 *
 * @see https://stackoverflow.com/questions/44587973/php-iterable-to-array-or-traversable
 * @see https://github.com/wmde/iterable-functions/blob/master/src/functions.php
 *
 * @phan-template T
 * @param iterable<T> $iterable
 * @return array<T>
 */
public static function iterable_to_array( iterable $iterable ): array { // phpcs:ignore MediaWiki.NamingConventions.LowerCamelFunctionsName.FunctionName,Generic.Files.LineLength.TooLong
	if ( !is_array( $iterable ) ) {
		'@phan-var \Traversable $iterable'; // @var \Traversable $iterable
		return iterator_to_array( $iterable );
	}
	return $iterable;
}
348 | |
/**
 * Mark the calling code path as one that is never supposed to execute:
 * calling this always throws an UnreachableException.
 *
 * This is a workaround for T247093; this has been moved upstream
 * into wikimedia/assert.
 *
 * @param string $reason Message for the thrown exception
 * @return never
 * @deprecated Just throw an UnreachableException instead.
 */
public static function unreachable( string $reason = "should never happen" ) {
	$exception = new UnreachableException( $reason );
	throw $exception;
}
363 | |
/**
 * Remove a prefix from a string if the string starts with it;
 * otherwise return the string unchanged.
 *
 * Like preg_replace( "/^$prefix/", '', $subject ), except (according
 * to the original benchmark) about 1.14x faster in the replacement case
 * and 2x faster in the no-op case.
 *
 * Note: the parameters intentionally carry no type declarations; these
 * were measured to add an overhead of 3%, and the benchmark above was
 * taken without them.
 *
 * @param string $subject
 * @param string $prefix
 * @return string
 */
public static function stripPrefix( $subject, $prefix ) {
	if ( !str_starts_with( $subject, $prefix ) ) {
		return $subject;
	}
	$prefixLength = strlen( $prefix );
	return substr( $subject, $prefixLength );
}
384 | |
/**
 * If a string ends with a given suffix, remove the suffix. Otherwise,
 * return the original string. Like preg_replace( "/$suffix$/", '', $subject )
 * except faster.
 *
 * An empty suffix leaves the subject unchanged.
 *
 * @param string $subject
 * @param string $suffix
 * @return string
 */
public static function stripSuffix( $subject, $suffix ) {
	$suffixLength = strlen( $suffix );
	// The empty suffix needs its own guard: every string "ends with" it,
	// but -0 is just 0, and substr( $subject, 0, 0 ) would return the
	// empty string instead of the unchanged subject.
	if ( $suffixLength === 0 || !str_ends_with( $subject, $suffix ) ) {
		return $subject;
	}
	return substr( $subject, 0, -$suffixLength );
}
401 | |
402 | } |