MediaWiki master
StringUtils.php
Go to the documentation of this file.
1<?php
2
4use Wikimedia\Assert\Assert;
5use Wikimedia\AtEase\AtEase;
6
/**
 * Test whether a value, once cast to string, is valid UTF-8.
 *
 * @param mixed $value Value to check; it is cast to string first
 * @return bool Result of mb_check_encoding() for the 'UTF-8' encoding
 */
public static function isUtf8( $value ) {
	$asString = (string)$value;

	return mb_check_encoding( $asString, 'UTF-8' );
}
49
/**
 * Explode a string, but ignore any instances of the separator that lie
 * between a start delimiter and its end delimiter.
 *
 * The subject is scanned left to right for the next start delimiter, end
 * delimiter or separator. A separator only splits the string while the
 * delimiter depth is zero.
 *
 * An end delimiter with no open start delimiter is treated as ordinary text:
 * the depth is never allowed to drop below zero. Otherwise a single stray
 * end delimiter would leave the depth negative and silently disable
 * splitting for the remainder of the subject.
 *
 * @param string $startDelim Delimiter that opens a protected region
 * @param string $endDelim Delimiter that closes a protected region
 * @param string $separator Separator to split on outside protected regions
 * @param string $subject Text to split
 * @param bool $nested If true, start delimiters found inside a protected
 *   region increase the depth; if false they are ignored
 * @return ArrayIterator The pieces, in order; always at least one element
 */
public static function delimiterExplode( $startDelim, $endDelim, $separator,
	$subject, $nested = false ) {
	$inputPos = 0;
	$lastPos = 0;
	$depth = 0;
	$len = strlen( $subject );
	// The pattern does not change between iterations, so build it once.
	$pattern = '!' . preg_quote( $startDelim, '!' )
		. '|' . preg_quote( $endDelim, '!' )
		. '|' . preg_quote( $separator, '!' ) . '!S';
	$m = [];
	$exploded = [];

	while (
		$inputPos < $len &&
		preg_match( $pattern, $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
	) {
		$match = $m[0][0];
		$matchPos = $m[0][1];
		// Resume scanning right after this token.
		$inputPos = $matchPos + strlen( $match );

		if ( $match === $separator ) {
			if ( $depth === 0 ) {
				$exploded[] = substr( $subject, $lastPos, $matchPos - $lastPos );
				$lastPos = $inputPos;
			}
		} elseif ( $match === $startDelim ) {
			if ( $depth === 0 || $nested ) {
				$depth++;
			}
		} elseif ( $depth > 0 ) {
			// End delimiter closing an open region. An unmatched end
			// delimiter (depth already zero) is deliberately ignored.
			$depth--;
		}
	}

	// Whatever follows the last split point is the final piece.
	$exploded[] = substr( $subject, $lastPos );

	// This method could be rewritten in the future to avoid creating an
	// intermediate array, since the return type is just an iterator.
	return new ArrayIterator( $exploded );
}
102
/**
 * Replace every "start delimiter ... end delimiter" span with $replace.
 *
 * The subject is split on the start delimiter up front. In each piece after
 * the first, everything up to and including the first end delimiter is
 * swapped for $replace. A piece with no end delimiter is emitted unchanged,
 * with its start delimiter put back.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param string $replace Literal replacement text
 * @param string $subject Text to process
 * @return string
 */
public static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
	$pieces = explode( $startDelim, $subject );
	$pieceCount = count( $pieces );
	$endDelimLength = strlen( $endDelim );

	// Text before the first start delimiter is copied through as-is.
	$result = $pieces[0];

	for ( $i = 1; $i < $pieceCount; $i++ ) {
		$piece = $pieces[$i];
		$closeAt = strpos( $piece, $endDelim );

		if ( $closeAt === false ) {
			// Unterminated: restore the start delimiter consumed by explode().
			$result .= $startDelim . $piece;
			continue;
		}

		$result .= $replace . substr( $piece, $closeAt + $endDelimLength );
	}

	return $result;
}
134
/**
 * Replace each "start delimiter ... end delimiter" span in $subject with the
 * return value of $callback.
 *
 * The callback receives a two-element array: element 0 is the whole span
 * including both delimiters, element 1 is the text between them. When several
 * start delimiters precede an end delimiter, the outermost (first) one is
 * paired with it. End delimiters with no open start delimiter, and a start
 * delimiter that is never closed, are copied to the output unchanged.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param callable $callback Called with the array described above
 * @param string $subject Text to process
 * @param string $flags Regular expression modifiers appended to the search
 *   pattern; if it contains 'i' the delimiters are compared case-insensitively
 * @return string
 * @throws InvalidArgumentException If a match captures neither delimiter
 */
private static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
	$subject, $flags = ''
) {
	$pattern = '!(' . preg_quote( $startDelim, '!' ) . ')|('
		. preg_quote( $endDelim, '!' ) . ')!S' . $flags;
	$compare = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
	$subjectLen = strlen( $subject );
	$endLen = strlen( $endDelim );

	// Where the next search begins
	$scanFrom = 0;
	// Everything before this offset has already been emitted (or belongs to
	// the currently open span)
	$copiedUpTo = 0;
	// First byte after the currently open start delimiter
	$innerFrom = 0;
	$inPair = false;
	$result = '';
	$hit = [];

	while ( $scanFrom < $subjectLen ) {
		if ( !preg_match( $pattern, $subject, $hit, PREG_OFFSET_CAPTURE, $scanFrom ) ) {
			break;
		}

		$token = $hit[0][0];
		$at = $hit[0][1];

		if ( $hit[1][0] != '' ) {
			# The start alternative matched, but when a span is open an end
			# delimiter at this same offset takes priority
			$isEnd = $inPair &&
				$compare( $endDelim, substr( $subject, $at, $endLen ) ) == 0;
			$tokenLen = $isEnd ? $endLen : strlen( $token );
		} elseif ( $hit[2][0] != '' ) {
			$isEnd = true;
			$tokenLen = strlen( $token );
		} else {
			throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
		}

		$tokenEnd = $at + $tokenLen;

		if ( !$isEnd ) {
			if ( $inPair ) {
				# Already inside a span, so START START END pairs the outer
				# START. Step past only the first character of this START so
				# an END overlapping it is not missed.
				$scanFrom = $at + 1;
				continue;
			}

			# Opening a span: flush the plain text that precedes it
			$result .= substr( $subject, $copiedUpTo, $at - $copiedUpTo );
			$copiedUpTo = $at;
			$innerFrom = $tokenEnd;
			$scanFrom = $tokenEnd;
			$inPair = true;
			continue;
		}

		if ( $inPair ) {
			# Closing a span: hand it to the callback
			$result .= $callback( [
				substr( $subject, $copiedUpTo, $tokenEnd - $copiedUpTo ),
				substr( $subject, $innerFrom, $at - $innerFrom )
			] );
			$inPair = false;
		} else {
			# Stray end delimiter: copy it, and the text before it, verbatim
			$result .= substr( $subject, $scanFrom, $tokenEnd - $copiedUpTo );
		}
		$scanFrom = $tokenEnd;
		$copiedUpTo = $tokenEnd;
	}

	# Emit the tail, which includes any span that was opened but never closed
	if ( $copiedUpTo < $subjectLen ) {
		$result .= substr( $subject, $copiedUpTo );
	}

	return $result;
}
234
/**
 * Replace each "start delimiter ... end delimiter" span in $subject with
 * $replace, delegating the scanning to delimiterReplaceCallback().
 *
 * Within $replace, "$0" is substituted with the whole matched span
 * (delimiters included) and "$1" with the text between the delimiters.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param string $replace Replacement template, may contain $0 and $1
 * @param string $subject Text to process
 * @param string $flags Regular expression modifiers, passed through
 * @return string
 */
public static function delimiterReplace(
	$startDelim, $endDelim, $replace, $subject, $flags = ''
) {
	$expandTemplate = static function ( array $span ) use ( $replace ) {
		$substitutions = [
			'$0' => $span[0],
			'$1' => $span[1],
		];

		return strtr( $replace, $substitutions );
	};

	return self::delimiterReplaceCallback(
		$startDelim, $endDelim, $expandTemplate, $subject, $flags
	);
}
261
/**
 * str_replace() variant that leaves occurrences of $search untouched when
 * they appear inside a "<" ... ">" span.
 *
 * Occurrences inside such spans are temporarily swapped for a NUL byte,
 * the replacement is applied to the remaining text, and the NUL bytes are
 * then turned back into $search. Any NUL bytes already present in $text are
 * removed first so they cannot be confused with the placeholder.
 *
 * @param string $search Text to look for
 * @param string $replace Text to substitute outside of "<" ... ">" spans
 * @param string $text Text to process
 * @return string
 */
public static function replaceMarkup( $search, $replace, $text ) {
	$marker = "\x00";

	// Hide the search text inside a tag-like span behind the marker
	$shieldSpan = static function ( array $span ) use ( $search, $marker ) {
		return str_replace( $search, $marker, $span[0] );
	};

	// Strip pre-existing markers from the input
	$withoutMarkers = str_replace( $marker, '', $text );

	$shielded = self::delimiterReplaceCallback( '<', '>', $shieldSpan, $withoutMarkers );

	// Replace what is still visible, then restore the hidden occurrences
	$replaced = str_replace( $search, $replace, $shielded );

	return str_replace( $marker, $search, $replaced );
}
291
/**
 * Check whether the given string can be used as a PCRE pattern.
 *
 * The pattern is tried against an empty subject with warnings suppressed;
 * it is considered valid unless preg_match() returns false.
 *
 * @param string $string Candidate pattern, including delimiters
 * @return bool
 */
public static function isValidPCRERegex( $string ) {
	AtEase::suppressWarnings();
	// @phan-suppress-next-line PhanParamSuspiciousOrder False positive
	$matchResult = preg_match( $string, '' );
	AtEase::restoreWarnings();

	// 0 (no match) and 1 (match) both mean the pattern compiled
	$compiled = ( $matchResult !== false );

	return $compiled;
}
308
/**
 * Escape a string so that it is taken literally when used as the
 * replacement argument of preg_replace().
 *
 * Backslashes are doubled first, then every dollar sign is prefixed with a
 * backslash.
 *
 * @param string $string Raw replacement text
 * @return string Escaped text
 */
public static function escapeRegexReplacement( $string ) {
	// str_replace() applies array pairs in order, so backslashes are doubled
	// before the ones that escape "$" are introduced.
	$needles = [ '\\', '$' ];
	$escaped = [ '\\\\', '\\$' ];

	return str_replace( $needles, $escaped, $string );
}
320
/**
 * Workalike for explode() that returns an iterator.
 *
 * When the separator occurs more than 1000 times an ExplodeIterator is
 * returned; otherwise the result of explode() is wrapped in an
 * ArrayIterator.
 *
 * @param string $separator Separator to split on
 * @param string $subject Text to split
 * @return ArrayIterator|ExplodeIterator
 */
public static function explode( $separator, $subject ) {
	$occurrences = substr_count( $subject, $separator );

	if ( $occurrences <= 1000 ) {
		$pieces = explode( $separator, $subject );

		return new ArrayIterator( $pieces );
	}

	return new ExplodeIterator( $separator, $subject );
}
335
/**
 * Wrapper around PHP's unpack() that reports failure by exception.
 *
 * @param string $format Format string, as accepted by unpack()
 * @param string $data Binary data to unpack
 * @param int|false $length Minimum length that $data must have, or false
 *   to skip the length check
 * @return array Unpacked data, as returned by unpack()
 * @throws UnpackFailedException If $data is shorter than $length, or if
 *   unpack() returns false
 */
public static function unpack( string $format, string $data, $length = false ): array {
	Assert::parameterType( [ 'integer', 'false' ], $length, '$length' );
	if ( $length !== false ) {
		$realLen = strlen( $data );
		if ( $realLen < $length ) {
			throw new UnpackFailedException( "Tried to unpack a "
				. "string of length $realLen, but needed one "
				. "of at least length $length."
			);
		}
	}

	AtEase::suppressWarnings();
	try {
		$result = unpack( $format, $data );
	} finally {
		// Always undo the suppression, even if unpack() throws (for
		// example on an invalid format code), so that warnings are not
		// left suppressed for the caller.
		AtEase::restoreWarnings();
	}

	if ( $result === false ) {
		// If it cannot extract the packed data.
		throw new UnpackFailedException( "unpack could not unpack binary data" );
	}
	return $result;
}
373}
An iterator which works exactly like iterating over the result of explode(), but with limited memory usage.
A collection of static methods to play with strings.
static hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject)
Perform an operation equivalent to preg_replace()
static delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags='')
Perform an operation equivalent to preg_replace() with flags.
static isValidPCRERegex( $string)
Utility function to check if the given string is a valid PCRE regex.
static unpack(string $format, string $data, $length=false)
Wrapper around php's unpack.
static escapeRegexReplacement( $string)
Escape a string to make it suitable for inclusion in a preg_replace() replacement parameter.
static isUtf8( $value)
Test whether a string is valid UTF-8.
static replaceMarkup( $search, $replace, $text)
More or less "markup-safe" str_replace(). Ignores any instances of the separator inside <...>.
static explode( $separator, $subject)
Workalike for explode() with limited memory usage.
static delimiterExplode( $startDelim, $endDelim, $separator, $subject, $nested=false)
Explode a string, but ignore any instances of the separator inside the given start and end delimiters...