MediaWiki master
StringUtils.php
Go to the documentation of this file.
1<?php
2
4
5use ArrayIterator;
6use InvalidArgumentException;
8use Wikimedia\Assert\Assert;
9use Wikimedia\AtEase\AtEase;
10
/**
 * Check whether a value, once cast to a string, is well-formed UTF-8.
 *
 * @param mixed $value Cast to string before it is checked
 * @return bool Whatever mb_check_encoding() reports for the 'UTF-8' encoding
 */
public static function isUtf8( $value ) {
	$asString = (string)$value;

	return mb_check_encoding( $asString, 'UTF-8' );
}
53
/**
 * Split $subject on $separator, leaving untouched any separator that lies
 * between a start delimiter and its end delimiter.
 *
 * The delimiters themselves stay in the returned pieces. An end delimiter
 * that has no open start delimiter is treated as ordinary text: it does not
 * change the nesting depth, so it cannot disable splitting for the rest of
 * the string, nor cause a later delimited region to be split.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param string $separator Separator to split on
 * @param string $subject String to split
 * @param bool $nested If true, every start delimiter opens a further level;
 *   if false, start delimiters seen while a region is already open are ignored
 * @return ArrayIterator Iterator over the resulting pieces (always at least one)
 */
public static function delimiterExplode( $startDelim, $endDelim, $separator,
	$subject, $nested = false
) {
	$inputPos = 0;
	$lastPos = 0;
	$depth = 0;
	$encStart = preg_quote( $startDelim, '!' );
	$encEnd = preg_quote( $endDelim, '!' );
	$encSep = preg_quote( $separator, '!' );
	$len = strlen( $subject );
	$m = [];
	$exploded = [];
	while (
		$inputPos < $len &&
		preg_match(
			"!$encStart|$encEnd|$encSep!S", $subject, $m,
			PREG_OFFSET_CAPTURE, $inputPos
		)
	) {
		$match = $m[0][0];
		$matchPos = $m[0][1];
		// Continue scanning right after the token that was just found
		$inputPos = $matchPos + strlen( $match );
		if ( $match === $separator ) {
			// Only separators outside of any delimited region split the subject
			if ( $depth === 0 ) {
				$exploded[] = substr(
					$subject, $lastPos, $matchPos - $lastPos
				);
				$lastPos = $inputPos;
			}
		} elseif ( $match === $startDelim ) {
			if ( $depth === 0 || $nested ) {
				$depth++;
			}
		} elseif ( $depth > 0 ) {
			// End delimiter: close a region only when one is actually open.
			// Letting the depth go negative on a stray end delimiter would
			// shift every later depth check by one.
			$depth--;
		}
	}
	// Whatever follows the last split point is the final piece
	$exploded[] = substr( $subject, $lastPos );
	// This method could be rewritten in the future to avoid creating an
	// intermediate array, since the return type is just an iterator.
	return new ArrayIterator( $exploded );
}
106
/**
 * Replace delimited sections of $subject with $replace.
 *
 * The subject is cut at every start delimiter. For each piece after the
 * first, everything up to and including the first end delimiter is dropped
 * and $replace is emitted in its place; a piece that contains no end
 * delimiter is emitted unchanged together with its start delimiter.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param string $replace Literal replacement text
 * @param string $subject String to process
 * @return string
 */
public static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
	$pieces = explode( $startDelim, $subject );
	// Text before the first start delimiter is always kept verbatim
	$result = array_shift( $pieces );

	foreach ( $pieces as $piece ) {
		$closeAt = strpos( $piece, $endDelim );
		$result .= $closeAt !== false
			? $replace . substr( $piece, $closeAt + strlen( $endDelim ) )
			: $startDelim . $piece;
	}

	return $result;
}
138
/**
 * Scan $subject for START ... END pairs and replace each pair with the
 * return value of $callback.
 *
 * When several start delimiters precede an end delimiter, the outermost
 * (first) start is the one that is paired. An end delimiter without a
 * preceding start, and a start without a following end, are copied to the
 * output unchanged.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param callable $callback Called with a two-element array: [0] is the
 *   matched text including both delimiters, [1] is the text between them.
 *   Its return value is appended to the output.
 * @param string $subject String to process
 * @param string $flags Regex modifiers appended to the search pattern; when
 *   they contain 'i' the end delimiter is also compared case-insensitively
 * @return string
 * @throws InvalidArgumentException If a match captured neither a non-empty
 *   start nor a non-empty end delimiter
 */
private static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
	$subject, $flags = ''
) {
	$quotedStart = preg_quote( $startDelim, '!' );
	$quotedEnd = preg_quote( $endDelim, '!' );
	$pattern = "!($quotedStart)|($quotedEnd)!S$flags";
	$compare = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
	$endLen = strlen( $endDelim );
	$subjectLen = strlen( $subject );

	// Where the next regex search begins
	$scanPos = 0;
	// Everything before this offset has already been written to $result
	$copiedTo = 0;
	// Offset of the first byte after the currently open start delimiter
	$innerStart = 0;
	// Whether a start delimiter is currently open
	$inside = false;
	$result = '';
	$hit = [];

	while ( $scanPos < $subjectLen &&
		preg_match( $pattern, $subject, $hit, PREG_OFFSET_CAPTURE, $scanPos )
	) {
		$at = $hit[0][1];

		// Classify the token that was found
		if ( $hit[1][0] != '' ) {
			// The start alternative matched. If a region is open and the end
			// delimiter is also present at this offset, the end wins.
			$isEnd = $inside &&
				$compare( $endDelim, substr( $subject, $at, $endLen ) ) == 0;
			$tokenLen = $isEnd ? $endLen : strlen( $hit[0][0] );
		} elseif ( $hit[2][0] != '' ) {
			$isEnd = true;
			$tokenLen = strlen( $hit[0][0] );
		} else {
			throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
		}
		$afterToken = $at + $tokenLen;

		if ( !$isEnd ) {
			if ( $inside ) {
				// A region is already open, so START START END pairs the
				// outer start. Step past only the first character of this
				// start, so an END overlapping it is not missed.
				$scanPos = $at + 1;
			} else {
				// Opening a region: flush the plain text that precedes it
				$result .= substr( $subject, $copiedTo, $at - $copiedTo );
				$copiedTo = $at;
				$scanPos = $afterToken;
				$innerStart = $afterToken;
				$inside = true;
			}
			continue;
		}

		if ( $inside ) {
			// Complete pair: hand the whole match and its content to the callback
			$result .= $callback( [
				substr( $subject, $copiedTo, $afterToken - $copiedTo ),
				substr( $subject, $innerStart, $at - $innerStart )
			] );
			$inside = false;
		} else {
			// End delimiter with no open start: copy it through unchanged
			$result .= substr( $subject, $scanPos, $afterToken - $copiedTo );
		}
		$scanPos = $afterToken;
		$copiedTo = $afterToken;
	}

	// Copy whatever was not consumed, including an unterminated start
	if ( $copiedTo < $subjectLen ) {
		$result .= substr( $subject, $copiedTo );
	}

	return $result;
}
237
/**
 * Replace every START ... END pair in $subject with $replace.
 *
 * Inside $replace, the literal token '$0' is substituted with the whole
 * match including both delimiters and '$1' with the text between them.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param string $replace Replacement template; may contain '$0' and '$1'
 * @param string $subject String to process
 * @param string $flags Regex modifiers, passed through unchanged
 * @return string
 */
public static function delimiterReplace(
	$startDelim, $endDelim, $replace, $subject, $flags = ''
) {
	$expandTemplate = static function ( array $found ) use ( $replace ) {
		$substitutions = [
			'$0' => $found[0],
			'$1' => $found[1],
		];

		return strtr( $replace, $substitutions );
	};

	return self::delimiterReplaceCallback(
		$startDelim, $endDelim, $expandTemplate, $subject, $flags
	);
}
264
/**
 * str_replace() variant that leaves occurrences of $search untouched when
 * they appear inside a '<' ... '>' pair.
 *
 * NUL bytes ("\x00") are used internally as a placeholder, so any NUL bytes
 * present in $text are removed from the result.
 *
 * @param string $search Text to look for
 * @param string $replace Text to substitute outside of '<' ... '>' pairs
 * @param string $text String to process
 * @return string
 */
public static function replaceMarkup( $search, $replace, $text ) {
	$marker = "\x00";

	// Drop pre-existing markers so the ones inserted below are unambiguous
	$withoutMarkers = str_replace( $marker, '', $text );

	// Hide every occurrence of $search that sits inside a '<' ... '>' pair
	$hideInsideTag = static function ( array $tag ) use ( $search, $marker ) {
		return str_replace( $search, $marker, $tag[0] );
	};
	$masked = self::delimiterReplaceCallback( '<', '>', $hideInsideTag, $withoutMarkers );

	// Replace the occurrences that are still visible
	$replaced = str_replace( $search, $replace, $masked );

	// Restore the hidden occurrences to their original text
	return str_replace( $marker, $search, $replaced );
}
294
/**
 * Report whether a string can be used as a PCRE pattern.
 *
 * The pattern is run once against an empty subject with warnings
 * suppressed; it counts as valid unless preg_match() returns false.
 *
 * @param string $string Candidate pattern, including its delimiters
 * @return bool
 */
public static function isValidPCRERegex( $string ) {
	AtEase::suppressWarnings();
	// @phan-suppress-next-line PhanParamSuspiciousOrder False positive
	$usable = preg_match( $string, '' ) !== false;
	AtEase::restoreWarnings();

	return $usable;
}
311
/**
 * Escape backslashes and dollar signs by prefixing each with a backslash.
 *
 * @param string $string Text to escape
 * @return string
 */
public static function escapeRegexReplacement( $string ) {
	// str_replace() applies array pairs one after another, so backslashes
	// are doubled first and the backslash added before '$' is not doubled.
	return str_replace(
		[ '\\', '$' ],
		[ '\\\\', '\\$' ],
		$string
	);
}
323
/**
 * Split $subject on $separator and return the pieces as an iterator.
 *
 * Subjects containing more than 1000 separators are handed to an
 * ExplodeIterator; anything smaller is split with explode() and wrapped
 * in an ArrayIterator.
 *
 * @param string $separator Separator to split on
 * @param string $subject String to split
 * @return ArrayIterator|ExplodeIterator
 */
public static function explode( $separator, $subject ) {
	$separatorCount = substr_count( $subject, $separator );

	if ( $separatorCount <= 1000 ) {
		return new ArrayIterator( explode( $separator, $subject ) );
	}

	return new ExplodeIterator( $separator, $subject );
}
338
/**
 * Wrapper around PHP's unpack() that reports failure with an exception
 * instead of a warning and a false return value.
 *
 * @param string $format Format string passed to unpack()
 * @param string $data Binary data to unpack
 * @param int|false $length Minimum number of bytes $data must contain, or
 *   false to skip the length check
 * @return array Result of unpack()
 * @throws UnpackFailedException If $data is shorter than $length, or if
 *   unpack() returns false
 */
public static function unpack( string $format, string $data, $length = false ): array {
	Assert::parameterType( [ 'integer', 'false' ], $length, '$length' );
	if ( $length !== false ) {
		$realLen = strlen( $data );
		if ( $realLen < $length ) {
			throw new UnpackFailedException( "Tried to unpack a "
				. "string of length $realLen, but needed one "
				. "of at least length $length."
			);
		}
	}

	AtEase::suppressWarnings();
	try {
		$result = unpack( $format, $data );
	} finally {
		// Always undo the suppression, even if unpack() throws. NOTE(review):
		// PHP 8 is believed to throw ValueError for an invalid format code
		// rather than warn - confirm. Without the finally, the suppress and
		// restore calls would be left unbalanced in that case.
		AtEase::restoreWarnings();
	}

	if ( $result === false ) {
		// If it cannot extract the packed data.
		throw new UnpackFailedException( "unpack could not unpack binary data" );
	}
	return $result;
}
376}
377
// Also expose this class under the name 'StringUtils'.
379class_alias( StringUtils::class, 'StringUtils' );
An iterator which works exactly like:
A collection of static methods to play with strings.
static replaceMarkup( $search, $replace, $text)
More or less "markup-safe" str_replace(). Ignores any instances of the separator inside <...>.
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
static delimiterExplode( $startDelim, $endDelim, $separator, $subject, $nested=false)
Explode a string, but ignore any instances of the separator inside the given start and end delimiters...
static explode( $separator, $subject)
Workalike for explode() with limited memory usage.
static hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject)
Perform an operation equivalent to preg_replace()
static escapeRegexReplacement( $string)
Escape a string to make it suitable for inclusion in a preg_replace() replacement parameter.
static isValidPCRERegex( $string)
Utility function to check if the given string is a valid PCRE regex.
static isUtf8( $value)
Test whether a string is valid UTF-8.
static unpack(string $format, string $data, $length=false)
Wrapper around php's unpack.