Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
5.56% |
7 / 126 |
|
30.00% |
3 / 10 |
CRAP | |
0.00% |
0 / 1 |
| StringUtils | |
5.60% |
7 / 125 |
|
30.00% |
3 / 10 |
1065.51 | |
0.00% |
0 / 1 |
| isUtf8 | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| delimiterExplode | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
72 | |||
| hungryDelimiterReplace | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
| delimiterReplaceCallback | |
0.00% |
0 / 45 |
|
0.00% |
0 / 1 |
182 | |||
| delimiterReplace | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
| replaceMarkup | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
| isValidPCRERegex | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
| escapeRegexReplacement | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| explode | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
| unpack | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
20 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace Wikimedia\StringUtils; |
| 4 | |
| 5 | use ArrayIterator; |
| 6 | use InvalidArgumentException; |
| 7 | use Wikimedia\Assert\Assert; |
| 8 | use Wikimedia\AtEase\AtEase; |
| 9 | use Wikimedia\UnpackFailedException; |
| 10 | |
| 11 | /** |
| 12 | * Methods to play with strings. |
| 13 | * |
| 14 | * @license GPL-2.0-or-later |
| 15 | * @file |
| 16 | */ |
| 17 | |
| 18 | /** |
| 19 | * A collection of static methods to play with strings. |
| 20 | */ |
| 21 | class StringUtils { |
/**
 * Check whether a value, cast to string, is well-formed UTF-8.
 *
 * Validation is delegated to mb_check_encoding(). Invalid byte sequences
 * and overlong encodings are rejected; differing Unicode normalisation
 * forms are not detected.
 *
 * @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
 * In particular, the pure PHP code path did not in fact check for overlong forms.
 * Beware of this when backporting code to that version of MediaWiki.
 *
 * @since 1.21
 * @param string $value String to check
 * @return bool Whether the given $value is a valid UTF-8 encoded string
 */
public static function isUtf8( $value ) {
	$bytes = (string)$value;

	return mb_check_encoding( $bytes, 'UTF-8' );
}
| 39 | |
/**
 * Explode a string, but ignore any instances of the separator inside
 * the given start and end delimiters, which may optionally nest.
 * The delimiters are literal strings, not regular expressions.
 *
 * An end delimiter that has no matching start delimiter is treated as
 * ordinary text: it never makes the nesting depth negative, so it cannot
 * prevent later top-level separators from splitting the string.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param string $separator Separator string for the explode.
 * @param string $subject Subject string to explode.
 * @param bool $nested True iff the delimiters are allowed to nest.
 * @return ArrayIterator Iterator over the pieces; always yields at least one element
 * @throws InvalidArgumentException If $subject is non-empty and a delimiter
 *  or the separator is an empty string
 */
public static function delimiterExplode( $startDelim, $endDelim, $separator,
	$subject, $nested = false
) {
	$len = strlen( $subject );
	if ( $len > 0 &&
		( (string)$startDelim === '' || (string)$endDelim === '' || (string)$separator === '' )
	) {
		// An empty alternative in the pattern matches the empty string at the
		// current offset, so the scan position would never advance.
		throw new InvalidArgumentException(
			'Empty delimiter or separator given to ' . __METHOD__
		);
	}

	// Build the token pattern once; it does not change between iterations.
	$pattern = '!' . preg_quote( $startDelim, '!' )
		. '|' . preg_quote( $endDelim, '!' )
		. '|' . preg_quote( $separator, '!' )
		. '!S';

	$inputPos = 0;
	$lastPos = 0;
	$depth = 0;
	$m = [];
	$exploded = [];
	while ( $inputPos < $len &&
		preg_match( $pattern, $subject, $m, PREG_OFFSET_CAPTURE, $inputPos )
	) {
		[ $match, $matchPos ] = $m[0];
		$inputPos = $matchPos + strlen( $match );
		if ( $match === $separator ) {
			// Only separators outside of any delimited region split the string.
			if ( $depth === 0 ) {
				$exploded[] = substr( $subject, $lastPos, $matchPos - $lastPos );
				$lastPos = $inputPos;
			}
		} elseif ( $match === $startDelim ) {
			// Without nesting, a start delimiter inside a region is plain text.
			if ( $depth === 0 || $nested ) {
				$depth++;
			}
		} elseif ( $depth > 0 ) {
			// End delimiter closing an open region. Unbalanced end delimiters
			// (depth already 0) are ignored rather than counted.
			$depth--;
		}
	}
	$exploded[] = substr( $subject, $lastPos );
	// This method could be rewritten in the future to avoid creating an
	// intermediate array, since the return type is just an iterator.
	return new ArrayIterator( $exploded );
}
| 92 | |
/**
 * Replace every delimited span in a string with a fixed replacement.
 *
 * Intended as a cheaper stand-in for:
 *
 *     preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
 *
 * The subject is split on $startDelim. In each piece after the first, the
 * text up to and including the first $endDelim is swapped for $replace; a
 * piece without an $endDelim is emitted unchanged with its start delimiter
 * put back. $replace is inserted verbatim (no `$1` substitution).
 *
 * The whole subject is split into an array up front, so compared to
 * delimiterReplace() this is memory-hungry and inflexible; it is best kept
 * to small chunks of text.
 *
 * @param string $startDelim
 * @param string $endDelim
 * @param string $replace
 * @param string $subject
 * @return string
 */
public static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
	$pieces = explode( $startDelim, $subject );
	// Text before the first start delimiter is copied through untouched.
	$result = array_shift( $pieces );
	$endDelimLength = strlen( $endDelim );

	foreach ( $pieces as $piece ) {
		$endAt = strpos( $piece, $endDelim );
		$result .= $endAt === false
			? $startDelim . $piece
			: $replace . substr( $piece, $endAt + $endDelimLength );
	}

	return $result;
}
| 124 | |
/**
 * Perform an operation equivalent to `preg_replace_callback()`
 *
 * Matches this code:
 *
 *     preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject );
 *
 * If the start delimiter ends with an initial substring of the end delimiter,
 * e.g. in the case of C-style comments, the behavior differs from the model
 * regex. In this implementation, the end must share no characters with the
 * start, so e.g. `/*\/` is not considered to be both the start and end of a
 * comment. `/*\/xy/*\/` is considered to be a single comment with contents `/xy/`.
 *
 * When several start delimiters precede an end delimiter, the outermost
 * (first) start is the one that is paired with it. End delimiters with no
 * open start, and a start with no following end, are copied to the output
 * unchanged.
 *
 * The implementation of delimiterReplaceCallback() is slower than hungryDelimiterReplace()
 * but uses far less memory. The delimiters are literal strings, not regular expressions.
 *
 * @param string $startDelim Start delimiter
 * @param string $endDelim End delimiter
 * @param callable $callback Function to call on each match. It receives an
 *  array whose element 0 is the full delimited span and element 1 the text
 *  between the delimiters; its return value is appended to the output.
 * @param string $subject
 * @param string $flags Regular expression flags
 * @return string
 * @throws InvalidArgumentException If a match is neither a non-empty start
 *  nor a non-empty end delimiter
 */
private static function delimiterReplaceCallback( $startDelim, $endDelim, $callback,
	$subject, $flags = ''
) {
	$quotedStart = preg_quote( $startDelim, '!' );
	$quotedEnd = preg_quote( $endDelim, '!' );
	$pattern = "!($quotedStart)|($quotedEnd)!S$flags";
	// Mirror the regex's case sensitivity when re-checking for an end delimiter.
	$compare = str_contains( $flags, 'i' ) ? 'strcasecmp' : 'strcmp';
	$endLength = strlen( $endDelim );
	$subjectLength = strlen( $subject );

	// Where the next regex search begins.
	$scanPos = 0;
	// Everything before this offset has already been written to $result.
	$copiedUpTo = 0;
	// Offset of the first byte after the currently open start delimiter.
	$innerStart = 0;
	// Whether a start delimiter is open and waiting for its end.
	$inside = false;
	$result = '';
	$m = [];

	while ( $scanPos < $subjectLength &&
		preg_match( $pattern, $subject, $m, PREG_OFFSET_CAPTURE, $scanPos )
	) {
		$tokenOffset = $m[0][1];

		if ( $m[1][0] != '' ) {
			// The start alternative matched. If a start is already open and an
			// end delimiter is present at the same location, treat it as the end.
			$isEnd = $inside &&
				$compare( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0;
			$tokenLength = $isEnd ? $endLength : strlen( $m[0][0] );
		} elseif ( $m[2][0] != '' ) {
			$isEnd = true;
			$tokenLength = strlen( $m[0][0] );
		} else {
			throw new InvalidArgumentException( 'Invalid delimiter given to ' . __METHOD__ );
		}

		if ( !$isEnd ) {
			if ( $inside ) {
				// START START END matches the outer pair, so keep the first start.
				// Step past only the *first character* of this START, to protect
				// against missing an END that overlaps with it.
				$scanPos = $tokenOffset + 1;
				continue;
			}

			// First start delimiter: write out the non-matching section before it.
			$result .= substr( $subject, $copiedUpTo, $tokenOffset - $copiedUpTo );
			$copiedUpTo = $tokenOffset;
			$scanPos = $innerStart = $tokenOffset + $tokenLength;
			$inside = true;
			continue;
		}

		$tokenEnd = $tokenOffset + $tokenLength;
		if ( $inside ) {
			// Complete match: hand the full span and its contents to the callback.
			$result .= $callback( [
				substr( $subject, $copiedUpTo, $tokenEnd - $copiedUpTo ),
				substr( $subject, $innerStart, $tokenOffset - $innerStart )
			] );
			$inside = false;
		} else {
			// Non-matching end, write it out together with the text before it.
			// ($scanPos and $copiedUpTo are equal whenever no start is open.)
			$result .= substr( $subject, $scanPos, $tokenEnd - $copiedUpTo );
		}
		$scanPos = $copiedUpTo = $tokenEnd;
	}

	// Copy whatever is left, including an unterminated start delimiter.
	if ( $copiedUpTo < $subjectLength ) {
		$result .= substr( $subject, $copiedUpTo );
	}

	return $result;
}
| 223 | |
/**
 * Perform an operation equivalent to `preg_replace()` with flags.
 *
 * Matches this code:
 *
 *     preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject );
 *
 * The work is done by delimiterReplaceCallback(), which quotes both
 * delimiters before building its pattern, so they are matched as literal
 * strings.
 *
 * @param string $startDelim Start delimiter (literal string)
 * @param string $endDelim End delimiter (literal string)
 * @param string $replace Replacement string. May contain $0, which will be
 *  replaced by the whole delimited span, and $1, which will be replaced by
 *  the text between the delimiters
 * @param string $subject String to search
 * @param string $flags Regular expression flags
 * @return string The string with the matches replaced
 */
public static function delimiterReplace(
	$startDelim, $endDelim, $replace, $subject, $flags = ''
) {
	// strtr() substitutes both placeholders in one pass, so text inserted for
	// one placeholder is never re-scanned for the other.
	$substitute = static fn ( array $matches ) => strtr( $replace, [
		'$0' => $matches[0],
		'$1' => $matches[1],
	] );

	return self::delimiterReplaceCallback( $startDelim, $endDelim, $substitute, $subject, $flags );
}
| 250 | |
/**
 * More or less "markup-safe" str_replace()
 *
 * Replaces $search with $replace everywhere except inside `<...>` spans.
 * NUL bytes are used internally as a placeholder, so any NUL bytes already
 * present in $text are removed from the result.
 *
 * @param string $search
 * @param string $replace
 * @param string $text
 * @return string
 */
public static function replaceMarkup( $search, $replace, $text ) {
	$marker = "\x00";

	// Make sure the marker cannot already occur in the input.
	$text = str_replace( $marker, '', $text );

	// Hide occurrences of $search that sit inside HTML-like tags behind the marker.
	$shielded = self::delimiterReplaceCallback(
		'<', '>',
		static fn ( array $matches ) => str_replace( $search, $marker, $matches[0] ),
		$text
	);

	// Replace what is still visible, then turn the markers back into $search.
	$replaced = str_replace( $search, $replace, $shielded );

	return str_replace( $marker, $search, $replaced );
}
| 280 | |
/**
 * Utility function to check if the given string is a valid PCRE regex. Avoids
 * manually calling suppressWarnings and restoreWarnings, and provides a
 * one-line solution without the need to use @.
 *
 * The pattern is tried against an empty subject; it is considered valid
 * unless preg_match() reports failure by returning false.
 *
 * @since 1.34
 * @param string $string The string you want to check being a valid regex
 * @return bool
 */
public static function isValidPCRERegex( $string ) {
	AtEase::suppressWarnings();
	try {
		// @phan-suppress-next-line PhanParamSuspiciousOrder False positive
		return preg_match( $string, '' ) !== false;
	} finally {
		// Always undo the suppression, even if preg_match() throws (for
		// example a TypeError for a non-string argument).
		AtEase::restoreWarnings();
	}
}
| 297 | |
/**
 * Escape a string to make it suitable for inclusion in a preg_replace()
 * replacement parameter.
 *
 * Every backslash and every dollar sign is prefixed with a backslash.
 *
 * @param string $string
 * @return string
 */
public static function escapeRegexReplacement( $string ) {
	// A single pass is enough: neither replacement produces a character that
	// the other one would need to escape again.
	return strtr( $string, [
		'\\' => '\\\\',
		'$' => '\\$',
	] );
}
| 309 | |
/**
 * Workalike for explode() with limited memory usage.
 *
 * Subjects containing at most 1000 separators are split eagerly with the
 * native explode() and wrapped in an ArrayIterator; anything with more
 * separators is handed to an ExplodeIterator instead.
 *
 * @param string $separator
 * @param string $subject
 * @return ArrayIterator|ExplodeIterator
 */
public static function explode( $separator, $subject ) {
	if ( substr_count( $subject, $separator ) <= 1000 ) {
		return new ArrayIterator( explode( $separator, $subject ) );
	}

	return new ExplodeIterator( $separator, $subject );
}
| 324 | |
/**
 * Wrapper around php's unpack.
 *
 * Warnings raised by unpack() are suppressed; a false result is reported
 * as an UnpackFailedException instead.
 *
 * Be careful when using this function to read unsigned 32 bit integer
 * because php might make it negative.
 *
 * @param string $format The format string (See php's docs)
 * @param string $data A binary string of binary data
 * @param int|false $length The minimum length of $data or false. This is to
 *  prevent reading beyond the end of $data. false to disable the check.
 *
 * @throws UnpackFailedException If $data not long enough, or if unpack fails
 * @return array Associative array of the extracted data
 * @since 1.42
 */
public static function unpack( string $format, string $data, $length = false ): array {
	Assert::parameterType( [ 'integer', 'false' ], $length, '$length' );
	if ( $length !== false ) {
		$realLen = strlen( $data );
		if ( $realLen < $length ) {
			throw new UnpackFailedException( "Tried to unpack a "
				. "string of length $realLen, but needed one "
				. "of at least length $length."
			);
		}
	}

	AtEase::suppressWarnings();
	try {
		$result = unpack( $format, $data );
	} finally {
		// unpack() can throw (PHP 8 raises ValueError for an invalid format
		// code); warnings must be re-enabled on that path too.
		AtEase::restoreWarnings();
	}

	if ( $result === false ) {
		// If it cannot extract the packed data.
		throw new UnpackFailedException( "unpack could not unpack binary data" );
	}
	return $result;
}
| 362 | } |
| 363 | |
| 364 | /** @deprecated since 1.44: the un-namespaced 'StringUtils' name is only an alias; use Wikimedia\StringUtils\StringUtils instead */
| 365 | class_alias( StringUtils::class, 'StringUtils' );