Code Coverage for /src/src/Utils/PHPUtils.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	72.17% covered (warning)	72.17%	83 / 115	20.00% covered (danger)	20.00%	3 / 15	CRAP	0.00% covered (danger)	0.00%	0 / 1
PHPUtils	72.17% covered (warning)	72.17%	83 / 115	20.00% covered (danger)	20.00%	3 / 15	69.11	0.00% covered (danger)	0.00%	0 / 1
counterToBase64	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	2
jsonEncode	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	2
jsonDecode	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
makeSet	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
lastItem	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
pushArray	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	5
safeSubstr	90.48% covered (success)	90.48%	38 / 42	0.00% covered (danger)	0.00%	0 / 1	9.07
assertValidUTF8	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	1
reStrip	100.00% covered (success)	100.00%	31 / 31	100.00% covered (success)	100.00%	1 / 1	7
encodeURIComponent	0.00% covered (danger)	0.00%	0 / 2	0.00% covered (danger)	0.00%	0 / 1	2
sortArray	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	12
iterable_to_array	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	6
unreachable	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
stripPrefix	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	6
stripSuffix	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	6

1	<?php
2	declare( strict_types = 1 );
3
4	namespace Wikimedia\Parsoid\Utils;
5
6	use Wikimedia\Assert\Assert;
7	use Wikimedia\Assert\UnreachableException;
8
9	/**
10	* This file contains Parsoid-independent PHP helper functions.
11	* Over time, more functions can be migrated out of various other files here.
12	* @module
13	*/
14
15	class PHPUtils {
/**
 * Encode a counter as a short, URL-safe Base64 token.
 *
 * The counter is split into bytes, most significant byte first, using
 * as few bytes as possible (zero still produces a single zero byte).
 * Those bytes are Base64 encoded, `+` and `/` are replaced with `-` and
 * `_` respectively, and the trailing `=` padding is stripped.
 *
 * Warning: Max integer is 2^31 - 1 for bitwise operations.
 *
 * @param int $n
 * @return string
 */
public static function counterToBase64( int $n ): string {
	$bytes = [];
	do {
		// Peel off the low byte; bytes peeled later are more significant,
		// so they go in front of the ones collected so far.
		array_unshift( $bytes, chr( $n & 0xff ) );
		$n >>= 8;
	} while ( $n > 0 );
	$encoded = base64_encode( implode( '', $bytes ) );
	return rtrim( strtr( $encoded, '+/', '-_' ), '=' );
}
31
/**
 * FIXME: Copied from FormatJson.php in core
 *
 * Characters problematic in JavaScript.
 *
 * @note These are listed in ECMA-262 (5.1 Ed.), §7.3 Line Terminators along with U+000A (LF)
 * and U+000D (CR). However, PHP already escapes LF and CR according to RFC 4627.
 */
private const BAD_CHARS = [
	"\u{2028}", // U+2028 LINE SEPARATOR
	"\u{2029}", // U+2029 PARAGRAPH SEPARATOR
];

/**
 * FIXME: Copied from FormatJson.php in core
 *
 * Escape sequences for the characters listed in self::BAD_CHARS, in the
 * same order.
 */
private const BAD_CHARS_ESCAPED = [
	'\u2028', // U+2028 LINE SEPARATOR
	'\u2029', // U+2029 PARAGRAPH SEPARATOR
];

/**
 * FIXME: Core has FormatJson::encode that does a more comprehensive job
 *
 * Thin wrapper around json_encode:
 * - slashes and non-ASCII characters are left unescaped
 * - U+2028 and U+2029 are nevertheless emitted as escape sequences
 * - encoding failures raise an exception (JSON_THROW_ON_ERROR)
 *
 * @param mixed $o
 * @return string
 */
public static function jsonEncode( $o ): string {
	$flags = JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR;
	return str_replace(
		self::BAD_CHARS,
		self::BAD_CHARS_ESCAPED,
		json_encode( $o, $flags )
	);
}
69
/**
 * FIXME: Core has FormatJson::parse that does a more comprehensive job
 *
 * Thin wrapper around json_decode. No error flag is passed, so input
 * that cannot be decoded yields null rather than an exception.
 *
 * @param string $str JSON text to decode
 * @param bool $assoc Whether JSON objects are returned as associative
 *   arrays (true, the default) or as objects (false)
 * @return mixed
 */
public static function jsonDecode( string $str, bool $assoc = true ) {
	$decoded = json_decode( $str, $assoc );
	return $decoded;
}
80
/**
 * Build a read-only "set" from a list of values: the result maps every
 * value of the input to `true`, so membership can be tested with isset().
 *
 * @param array $a Values that will become the keys of the set
 * @return array
 */
public static function makeSet( array $a ): array {
	$set = array_fill_keys( $a, true );
	return $set;
}
90
/**
 * Helper to get the last item of an array, or null if the array is empty.
 *
 * Works for any array, not only 0-based lists: sparse arrays (for
 * example the result of array_filter) and string-keyed arrays return
 * their final element as well.
 *
 * @param mixed[] $a
 * @return mixed
 */
public static function lastItem( array $a ) {
	// Tim Starling recommends not using end() for perf reasons
	// since apparently it can be O(n) where the refcount on the
	// array is > 1.
	//
	// array_key_last() takes the array by value and does not move the
	// internal pointer, so it avoids that problem while still finding
	// the real last key; indexing with count() - 1 would only be right
	// for lists whose keys are exactly 0..n-1.
	$lastKey = array_key_last( $a );
	return $lastKey === null ? null : $a[$lastKey];
}
106
/**
 * Append the contents of one or more arrays to an accumulator, picking
 * whichever strategy keeps the cost proportional to the number of
 * elements being pushed (N), not to the size of the accumulator.
 *
 * See https://w.wiki/3zvE
 *
 * @param array &$dest Destination array
 * @param array ...$sources Arrays to merge
 */
public static function pushArray( array &$dest, array ...$sources ): void {
	if ( !$sources ) {
		return;
	}
	$incoming = 0;
	foreach ( $sources as $source ) {
		$incoming += count( $source );
	}
	if ( $incoming > count( $dest ) ) {
		// $dest is smaller than what is being added, so rebuilding it
		// with the native array_merge is still O(N).
		$dest = array_merge( $dest, ...$sources );
		return;
	}
	// $dest is at least as large as the additions: append in place so
	// the existing elements are not copied.
	foreach ( $sources as $source ) {
		foreach ( $source as $item ) {
			$dest[] = $item;
		}
	}
}
135
/**
 * Take a substring and assert that the cut did not split a UTF-8
 * sequence.
 *
 * The source string is assumed to be valid UTF-8, so by default only
 * the first byte and the final character of the result are inspected.
 * Pass $checkEntireString to additionally validate every byte; that is
 * O(N) in the length of the substring, as is the substring operation
 * itself.
 *
 * If the substring would start beyond the end of the string or
 * end before the start of the string, then this function will
 * return the empty string (as would JavaScript); note that the
 * native `substr` would return `false` in this case.
 *
 * Using this helper instead of native `substr` is
 * useful during the PHP port to verify that we don't break up
 * Unicode codepoints by the switch from JavaScript UCS-2 offsets
 * to PHP UTF-8 byte offsets.
 *
 * @param string $s The (sub)string to check
 * @param int $start The starting offset (in bytes). If negative, the
 *   offset is counted from the end of the string.
 * @param ?int $length (optional) The maximum length of the returned
 *   string. If negative, the end position is counted from the end of
 *   the string.
 * @param bool $checkEntireString Whether to do a slower verification
 *   of the entire string, not just the edges. Defaults to false.
 * @return string The checked substring
 */
public static function safeSubstr(
	string $s, int $start, ?int $length = null,
	bool $checkEntireString = false
): string {
	$piece = $length === null
		? substr( $s, $start )
		: substr( $s, $start, $length );
	// An out-of-range request (reported as `false` by substr) and a
	// genuinely empty result are both returned as the empty string.
	if ( $piece === false || $piece === '' ) {
		return '';
	}

	// A continuation byte (10xx xxxx) in front means the cut landed in
	// the middle of a character.
	Assert::invariant(
		( ord( $piece[0] ) & 0xC0 ) !== 0x80,
		'Bad UTF-8 at start of string'
	);

	// Walk backwards over trailing continuation bytes to the lead byte
	// of the final character. This cannot run off the front of the
	// string: the first byte was just shown not to be a continuation.
	$pos = -1;
	while ( ( ord( $piece[$pos] ) & 0xC0 ) === 0x80 ) {
		$pos--;
		Assert::invariant(
			$pos > -5,
			// This should never happen, assuming the original string
			// was valid UTF-8
			'Bad UTF-8 at end of string (>4 byte sequence)'
		);
	}
	$lead = ord( $piece[$pos] );

	// The lead byte announces how long the final character must be.
	if ( ( $lead & 0x80 ) === 0 ) {
		$seqLength = 1;
	} elseif ( ( $lead & 0xE0 ) === 0xC0 ) {
		$seqLength = 2;
	} elseif ( ( $lead & 0xF0 ) === 0xE0 ) {
		$seqLength = 3;
	} elseif ( ( $lead & 0xF8 ) === 0xF0 ) {
		$seqLength = 4;
	} else {
		throw new UnreachableException(
			// This shouldn't happen, assuming original string was valid
			'Bad UTF-8 at end of string'
		);
	}
	// The number of bytes actually present must match the announcement.
	Assert::invariant(
		$pos === -$seqLength,
		'Bad UTF-8 at end of string (' . $seqLength . ' byte sequence)'
	);

	if ( $checkEntireString ) {
		// The edge checks run first because they give better
		// diagnostics in the common case where the substring operation
		// itself is what broke the UTF-8.
		self::assertValidUTF8( $piece );
	}
	return $piece;
}
228
/**
 * Assert that an entire string is valid UTF-8.
 *
 * This traverses the whole string, so it takes time proportional to
 * its length. Where the original string can be assumed to be valid
 * UTF-8, safeSubstr() is a more efficient way of checking a substring.
 *
 * @param string $s The string to check.
 */
public static function assertValidUTF8( string $s ): void {
	// Slow complete O(N) check: an empty pattern in /u mode matches
	// only if PCRE accepts the subject as UTF-8.
	Assert::invariant(
		preg_match( '//u', $s ) === 1,
		'Bad UTF-8 (full string verification)'
	);
}
246
/**
 * Helper for joining pieces of regular expressions together. This
 * safely strips delimiters from regular expression strings, while
 * ensuring that the result is safely escaped for the new delimiter
 * you plan to use (see the `$delimiter` argument to `preg_quote`).
 * Note that using a meta-character for the new delimiter can lead to
 * unexpected results; for example, if you use `!` then escaping
 * `(?!foo)` will break the regular expression.
 *
 * When the new delimiter is a bracket-style opener (`(`, `[`, `{`,
 * `<`), both it and its matching closer are escaped in the result.
 *
 * @param string $re The regular expression to strip
 * @param ?string $newDelimiter Optional delimiter which will be
 *   used when recomposing this stripped regular expression into a
 *   new regular expression.
 * @return string The regular expression without delimiters or flags
 */
public static function reStrip(
	string $re, ?string $newDelimiter = null
): string {
	static $delimiterPairs = [
		'(' => ')',
		'[' => ']',
		'{' => '}',
		'<' => '>',
	];
	// Believe it or not, PHP allows leading whitespace in the $re
	// tested with C's "isspace", which is [ \f\n\r\t\v]
	$re = preg_replace( '/^[ \f\n\r\t\v]+/', '', $re );
	Assert::invariant( strlen( $re ) > 0, "empty regexp" );
	$startDelimiter = $re[0];
	// PHP actually supports balanced delimiters (ie open paren on left
	// and close paren on right).
	$endDelimiter = $delimiterPairs[$startDelimiter] ?? $startDelimiter;
	$endDelimiterPos = strrpos( $re, $endDelimiter );
	Assert::invariant(
		$endDelimiterPos !== false && $endDelimiterPos > 0,
		"can't find end delimiter"
	);
	// Everything after the closing delimiter must be pattern modifiers.
	$flags = substr( $re, $endDelimiterPos + 1 );
	Assert::invariant(
		preg_match( '/^[imsxADSUXJu \n]*$/D', $flags ) === 1,
		"unexpected flags"
	);
	$stripped = substr( $re, 1, $endDelimiterPos - 1 );
	if (
		$newDelimiter === null ||
		$startDelimiter === $newDelimiter ||
		$endDelimiter === $newDelimiter
	) {
		return $stripped; // done!
	}
	// The characters that need escaping are the two halves of the *new*
	// delimiter. (Deriving the closer from the old start delimiter
	// would escape the old closing bracket inside the pattern, e.g.
	// turning `a{1,2}` into `a{1,2\}`, and would leave the closer of a
	// bracket-style new delimiter unescaped.)
	$newCloseDelimiter = $delimiterPairs[$newDelimiter] ?? $newDelimiter;
	// Split into single characters and backslash-escaped pairs, so that
	// a delimiter which is already escaped is left alone.
	preg_match_all( '/[^\\\\]|\\\\./s', $stripped, $matches );
	return implode( '', array_map( static function ( $c ) use ( $newDelimiter, $newCloseDelimiter ) {
		return ( $c === $newDelimiter || $c === $newCloseDelimiter )
			? ( '\\' . $c ) : $c;
	}, $matches[0] ) );
}
305
/**
 * JS-compatible encodeURIComponent function
 * FIXME: See T221147 (for a post-port update)
 *
 * The string is percent-encoded with rawurlencode(), after which the
 * encodings of `!`, `*`, `'`, `(` and `)` are turned back into the
 * literal characters.
 *
 * @param string $str
 * @return string
 */
public static function encodeURIComponent( string $str ): string {
	return str_replace(
		[ '%21', '%2A', '%27', '%28', '%29' ],
		[ '!', '*', "'", '(', ')' ],
		rawurlencode( $str )
	);
}
317
/**
 * Recursively sort an array by key, in place, for better
 * reproducibility. (This is especially useful before serializing as
 * JSON.) Values that are not arrays are left untouched.
 *
 * @param mixed &$array
 */
public static function sortArray( &$array ): void {
	if ( is_array( $array ) ) {
		ksort( $array );
		// Descend by key so each nested array is sorted in place.
		foreach ( array_keys( $array ) as $key ) {
			self::sortArray( $array[$key] );
		}
	}
}
333
/**
 * Convert an iterable to an array.
 *
 * Arrays are returned as they are; anything else is handed to the
 * built-in iterator_to_array. The built-in alone is not enough here,
 * because arrays are iterable but not Traversable!
 *
 * This function is also present in the wmde/iterable-functions library,
 * but it's short enough that we don't need to pull in an entire new
 * dependency here.
 *
 * @see https://stackoverflow.com/questions/44587973/php-iterable-to-array-or-traversable
 * @see https://github.com/wmde/iterable-functions/blob/master/src/functions.php
 *
 * @phan-template T
 * @param iterable<T> $iterable
 * @return array<T>
 */
public static function iterable_to_array( iterable $iterable ): array { // phpcs:ignore MediaWiki.NamingConventions.LowerCamelFunctionsName.FunctionName,Generic.Files.LineLength.TooLong
	if ( !is_array( $iterable ) ) {
		'@phan-var \Traversable $iterable'; // @var \Traversable $iterable
		return iterator_to_array( $iterable );
	}
	return $iterable;
}
358
/**
 * Mark the calling code path as one that is never supposed to execute:
 * calling this always throws an UnreachableException.
 *
 * This is a workaround for T247093; this has been moved upstream
 * into wikimedia/assert.
 *
 * @param string $reason Message for the thrown exception
 * @return never
 * @deprecated Just throw an UnreachableException instead.
 */
public static function unreachable( string $reason = "should never happen" ) {
	$failure = new UnreachableException( $reason );
	throw $failure;
}
373
/**
 * Remove a prefix from a string if it is present; a string that does
 * not start with the prefix is returned unchanged.
 * Like preg_replace( "/^$prefix/", '', $subject )
 * except about 1.14x faster in the replacement case and 2x faster in
 * the no-op case.
 *
 * Note: adding type declarations to the parameters adds an overhead of 3%.
 * The benchmark above was without type declarations.
 *
 * @param string $subject
 * @param string $prefix
 * @return string
 */
public static function stripPrefix( $subject, $prefix ) {
	if ( !str_starts_with( $subject, $prefix ) ) {
		return $subject;
	}
	return substr( $subject, strlen( $prefix ) );
}
394
/**
 * If a string ends with a given suffix, remove the suffix. Otherwise,
 * return the original string. Like preg_replace( "/$suffix$/", '', $subject )
 * except faster.
 *
 * An empty suffix leaves the subject unchanged.
 *
 * @param string $subject
 * @param string $suffix
 * @return string
 */
public static function stripSuffix( $subject, $suffix ) {
	if ( str_ends_with( $subject, $suffix ) ) {
		// Use an explicit result length rather than a negative one:
		// for an empty suffix the negative form would be -0, i.e. a
		// length of 0, which would wrongly discard the whole subject.
		return substr( $subject, 0, strlen( $subject ) - strlen( $suffix ) );
	} else {
		return $subject;
	}
}
411
412	}