MediaWiki master
StringUtils.php
Go to the documentation of this file.
1<?php
2
4use Wikimedia\Assert\Assert;
5use Wikimedia\AtEase\AtEase;
6
/**
 * Test whether a value, once cast to string, is valid UTF-8.
 *
 * @param mixed $value Value to check; it is cast to string before the check
 * @return bool Result of mb_check_encoding() for the 'UTF-8' encoding
 */
public static function isUtf8( $value ) {
	$candidate = (string)$value;

	return mb_check_encoding( $candidate, 'UTF-8' );
}
49
/**
 * Explode a string on a separator, ignoring any separator that occurs
 * between a start delimiter and an end delimiter.
 *
 * An end delimiter that has no matching start delimiter is treated as
 * ordinary text: it never pushes the nesting depth below zero, so it cannot
 * disable splitting for the rest of the string or unbalance later regions.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param string $separator Separator string to split on
 * @param string $subject Subject string
 * @param bool $nested Whether a start delimiter found inside an already
 *   open region increases the nesting depth
 * @return ArrayIterator Iterator over the exploded segments
 */
public static function delimiterExplode( $startDelim, $endDelim, $separator,
	$subject, $nested = false
) {
	$inputPos = 0;
	$lastPos = 0;
	$depth = 0;
	$encStart = preg_quote( $startDelim, '!' );
	$encEnd = preg_quote( $endDelim, '!' );
	$encSep = preg_quote( $separator, '!' );
	// The pattern does not change between iterations, so build it once.
	$pattern = "!$encStart|$encEnd|$encSep!S";
	$len = strlen( $subject );
	$m = [];
	$exploded = [];
	while (
		$inputPos < $len &&
		preg_match( $pattern, $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
	) {
		$match = $m[0][0];
		$matchPos = $m[0][1];
		$inputPos = $matchPos + strlen( $match );
		if ( $match === $separator ) {
			// Only split on separators outside of any delimited region
			if ( $depth === 0 ) {
				$exploded[] = substr(
					$subject, $lastPos, $matchPos - $lastPos
				);
				$lastPos = $inputPos;
			}
		} elseif ( $match === $startDelim ) {
			if ( $depth === 0 || $nested ) {
				$depth++;
			}
		} elseif ( $depth > 0 ) {
			// End delimiter closing an open region. An unmatched end
			// delimiter (depth already 0) is deliberately ignored.
			$depth--;
		}
	}
	$exploded[] = substr( $subject, $lastPos );
	// This method could be rewritten in the future to avoid creating an
	// intermediate array, since the return type is just an iterator.
	return new ArrayIterator( $exploded );
}
102
/**
 * Replace delimited sections of a string with a fixed replacement.
 *
 * The subject is split on the start delimiter. For every piece that follows
 * a start delimiter, everything up to and including the first end delimiter
 * in that piece is replaced by $replace. A piece with no end delimiter is
 * kept unchanged, together with the start delimiter that preceded it.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param string $replace Replacement text, inserted literally
 * @param string $subject String to search
 * @return string The subject with replacements applied
 */
public static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
	$pieces = explode( $startDelim, $subject );
	// The text before the first start delimiter is always copied as-is
	$result = $pieces[0];
	$pieceCount = count( $pieces );

	for ( $i = 1; $i < $pieceCount; $i++ ) {
		$piece = $pieces[$i];
		$endAt = strpos( $piece, $endDelim );
		$result .= $endAt === false
			// No end delimiter: restore the start delimiter that explode() removed
			? $startDelim . $piece
			// Drop everything through the end delimiter, keep the remainder
			: $replace . substr( $piece, $endAt + strlen( $endDelim ) );
	}

	return $result;
}
134
/**
 * Replace every section of $subject that lies between a start delimiter and
 * the next end delimiter with the return value of a callback.
 *
 * When several start delimiters precede an end delimiter, the first one
 * opens the section (START START END matches the outer pair). An end
 * delimiter with no open section, and a start delimiter that is never
 * closed, are copied to the output unchanged.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param callable $callback Called with a two-element array: [0] is the
 *   whole section including both delimiters, [1] is the text between them.
 *   Its return value is appended to the output.
 * @param string $subject String to search
 * @param string $flags Regular expression modifiers appended to the search
 *   pattern. If it contains 'i', the end delimiter is also compared
 *   case-insensitively.
 * @return string The subject with all matched sections replaced
 * @throws InvalidArgumentException If the pattern matched but neither
 *   delimiter group captured any text
 */
private static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
	$subject, $flags = ''
) {
	$inputPos = 0;
	$outputPos = 0;
	$contentPos = 0;
	$output = '';
	$foundStart = false;
	$encStart = preg_quote( $startDelim, '!' );
	$encEnd = preg_quote( $endDelim, '!' );
	// Neither the pattern nor the subject length changes inside the loop
	$pattern = "!($encStart)|($encEnd)!S$flags";
	$strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
	$endLength = strlen( $endDelim );
	$subjectLength = strlen( $subject );
	$m = [];

	while ( $inputPos < $subjectLength &&
		preg_match( $pattern, $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
	) {
		$tokenOffset = $m[0][1];
		$tokenLength = strlen( $m[0][0] );

		if ( $m[1][0] !== '' ) {
			if ( $foundStart &&
				$strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) === 0
			) {
				# An end match is present at the same location
				$isEnd = true;
				$tokenLength = $endLength;
			} else {
				$isEnd = false;
			}
		} elseif ( ( $m[2][0] ?? '' ) !== '' ) {
			# Group 2 is absent from $m when only group 1 took part in the
			# match, hence the null-coalescing read.
			$isEnd = true;
		} else {
			throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
		}

		if ( $isEnd ) {
			if ( $foundStart ) {
				# Found match
				$output .= $callback( [
					substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
					substr( $subject, $contentPos, $tokenOffset - $contentPos )
				] );
				$foundStart = false;
			} else {
				# Non-matching end, write it out together with the text before it
				$output .= substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos );
			}
			$inputPos = $outputPos = $tokenOffset + $tokenLength;
		} elseif ( !$foundStart ) {
			# Found start. Only the first start moves the section boundaries,
			# which means that START START END matches the outer pair.
			# Write out the non-matching section
			$output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
			$outputPos = $tokenOffset;
			$inputPos = $contentPos = $tokenOffset + $tokenLength;
			$foundStart = true;
		} else {
			# Move the input position past the *first character* of START,
			# to protect against missing END when it overlaps with START
			$inputPos = $tokenOffset + 1;
		}
	}
	# Copy whatever is left, including an unclosed start and its content
	if ( $outputPos < $subjectLength ) {
		$output .= substr( $subject, $outputPos );
	}

	return $output;
}
233
/**
 * Replace every section between a start and an end delimiter with a
 * replacement template.
 *
 * Within $replace, '$0' is substituted with the whole matched section and
 * '$1' with the text between the delimiters, as passed by
 * delimiterReplaceCallback().
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param string $replace Replacement template; may contain '$0' and '$1'
 * @param string $subject String to search
 * @param string $flags Flags forwarded to delimiterReplaceCallback()
 * @return string The subject with replacements applied
 */
public static function delimiterReplace(
	$startDelim, $endDelim, $replace, $subject, $flags = ''
) {
	$expandTemplate = static function ( array $matches ) use ( $replace ) {
		$substitutions = [
			'$0' => $matches[0],
			'$1' => $matches[1],
		];

		return strtr( $replace, $substitutions );
	};

	return self::delimiterReplaceCallback(
		$startDelim, $endDelim, $expandTemplate, $subject, $flags
	);
}
260
/**
 * str_replace() variant that leaves occurrences of $search untouched when
 * they appear inside a '<' ... '>' section.
 *
 * Occurrences inside such sections are temporarily swapped for a NUL byte
 * placeholder, the replacement is applied to the rest, and the placeholders
 * are then turned back into $search. Any NUL bytes already present in $text
 * are removed first.
 *
 * @param string $search String to look for
 * @param string $replace Replacement for occurrences outside '<' ... '>'
 * @param string $text Text to process
 * @return string The processed text
 */
public static function replaceMarkup( $search, $replace, $text ) {
	$marker = "\x00";

	// The marker must not occur in the input, or it would be confused
	// with a protected occurrence later on
	$withoutMarkers = str_replace( $marker, '', $text );

	// Hide every occurrence of the search string that sits inside an
	// HTML-like tag behind the marker
	$hideInsideTag = static function ( array $matches ) use ( $search, $marker ) {
		return str_replace( $search, $marker, $matches[0] );
	};
	$masked = self::delimiterReplaceCallback( '<', '>', $hideInsideTag, $withoutMarkers );

	// Replace what is still visible, then restore the hidden occurrences
	$replaced = str_replace( $search, $replace, $masked );

	return str_replace( $marker, $search, $replaced );
}
290
/**
 * Check whether a string is accepted by preg_match() as a pattern.
 *
 * The pattern is run against an empty subject with warnings suppressed, and
 * is considered valid unless preg_match() returns false.
 *
 * Warnings are restored even when preg_match() throws, so a failing call
 * cannot leave warning suppression switched on for the caller.
 *
 * @param string $string Pattern to test, including delimiters
 * @return bool True unless preg_match() reported failure
 */
public static function isValidPCRERegex( $string ) {
	AtEase::suppressWarnings();
	try {
		// @phan-suppress-next-line PhanParamSuspiciousOrder False positive
		$isValid = preg_match( $string, '' );
	} finally {
		AtEase::restoreWarnings();
	}
	return $isValid !== false;
}
307
/**
 * Escape backslashes and dollar signs in a string by prefixing each with a
 * backslash.
 *
 * @param string $string String to escape
 * @return string The escaped string
 */
public static function escapeRegexReplacement( $string ) {
	// str_replace() applies the pairs in order: backslashes are doubled
	// first, so the backslash added in front of '$' is not doubled again.
	$needles = [ '\\', '$' ];
	$escaped = [ '\\\\', '\\$' ];

	return str_replace( $needles, $escaped, $string );
}
319
/**
 * Split a string on a separator and return the parts as an iterator.
 *
 * When the separator occurs more than 1000 times an ExplodeIterator is
 * returned; otherwise the result of PHP's explode() is wrapped in an
 * ArrayIterator.
 *
 * @param string $separator Separator to split on
 * @param string $subject String to split
 * @return ArrayIterator|ExplodeIterator
 */
public static function explode( $separator, $subject ) {
	$hasManyParts = substr_count( $subject, $separator ) > 1000;

	return $hasManyParts
		? new ExplodeIterator( $separator, $subject )
		: new ArrayIterator( explode( $separator, $subject ) );
}
334
/**
 * Wrapper around PHP's unpack() that reports failure with an exception.
 *
 * Warnings are suppressed around the unpack() call and are restored even
 * when unpack() throws, so a failing call cannot leave warning suppression
 * switched on for the caller. An exception thrown by unpack() itself
 * propagates unchanged.
 *
 * @param string $format Format string passed to unpack()
 * @param string $data Binary data passed to unpack()
 * @param int|false $length Minimum required length of $data in bytes, or
 *   false to skip the length check
 * @return array The array returned by unpack()
 * @throws UnpackFailedException If $data is shorter than $length, or if
 *   unpack() returns false
 */
public static function unpack( string $format, string $data, $length = false ): array {
	Assert::parameterType( [ 'integer', 'false' ], $length, '$length' );
	if ( $length !== false ) {
		$realLen = strlen( $data );
		if ( $realLen < $length ) {
			throw new UnpackFailedException( "Tried to unpack a "
				. "string of length $realLen, but needed one "
				. "of at least length $length."
			);
		}
	}

	AtEase::suppressWarnings();
	try {
		$result = unpack( $format, $data );
	} finally {
		AtEase::restoreWarnings();
	}

	if ( $result === false ) {
		// If it cannot extract the packed data.
		throw new UnpackFailedException( "unpack could not unpack binary data" );
	}
	return $result;
}
372}
An iterator which works exactly like:
A collection of static methods to play with strings.
static hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject)
Perform an operation equivalent to preg_replace()
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
static isValidPCRERegex( $string)
Utility function to check if the given string is a valid PCRE regex.
static unpack(string $format, string $data, $length=false)
Wrapper around php's unpack.
static escapeRegexReplacement( $string)
Escape a string to make it suitable for inclusion in a preg_replace() replacement parameter.
static isUtf8( $value)
Test whether a string is valid UTF-8.
static replaceMarkup( $search, $replace, $text)
More or less "markup-safe" str_replace(). Ignores any instances of the separator inside <...> tags.
static explode( $separator, $subject)
Workalike for explode() with limited memory usage.
static delimiterExplode( $startDelim, $endDelim, $separator, $subject, $nested=false)
Explode a string, but ignore any instances of the separator inside the given start and end delimiters...