Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
93.28% |
111 / 119 |
|
80.00% |
4 / 5 |
CRAP | |
0.00% |
0 / 1 |
| UrlUtils | |
93.28% |
111 / 119 |
|
80.00% |
4 / 5 |
42.54 | |
0.00% |
0 / 1 |
| parseUrl | |
100.00% |
27 / 27 |
|
100.00% |
1 / 1 |
6 | |||
| assembleUrl | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
6 | |||
| removeDotSegments | |
100.00% |
43 / 43 |
|
100.00% |
1 / 1 |
18 | |||
| expandUrl | |
100.00% |
29 / 29 |
|
100.00% |
1 / 1 |
8 | |||
| matchesDomainList | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
20 | |||
| 1 | <?php |
| 2 | |
| 3 | declare( strict_types = 1 ); |
| 4 | |
| 5 | namespace Wikimedia\Parsoid\Utils; |
| 6 | |
/**
 * Utilities for manipulating URLs.
 *
 * All helpers are purely syntactic: they split, join and resolve URL
 * references along the component boundaries defined by RFC 3986, without
 * percent-encoding or percent-decoding anything.
 *
 * @see https://datatracker.ietf.org/doc/html/rfc3986
 */
class UrlUtils {

	/**
	 * Parse a possibly-relative URL into components
	 *
	 * Note no percent-decoding is performed, and only minimal syntax validation.
	 *
	 * @param string $url
	 * @return array{scheme: ?string, authority: ?string, path: string, query: ?string, fragment: ?string}
	 *  - 'scheme': Scheme of the url, if any.
	 *  - 'authority': Authority part of the url, if any.
	 *    This is the part in between the "//" and the path. For http, this is the "user@host:port".
	 *  - 'path': Path part of the URL. Never null, but may be the empty string.
	 *  - 'query': Query part of the URL, if any.
	 *  - 'fragment': Fragment part of the URL, if any.
	 */
	public static function parseUrl( string $url ): array {
		$ret = [
			'scheme' => null,
			'authority' => null,
			'path' => '',
			'query' => null,
			'fragment' => null,
		];

		// Scheme? (ALPHA followed by ALPHA / DIGIT / "+" / "-" / ".", then ":")
		if ( preg_match( '!^([a-z][a-z0-9+.-]*):!i', $url, $m ) ) {
			$ret['scheme'] = $m[1];
			$url = substr( $url, strlen( $m[0] ) );
		}

		// Fragment? Must be split off before the query, because a "?" that
		// appears after the first "#" belongs to the fragment.
		$i = strpos( $url, '#' );
		if ( $i !== false ) {
			$ret['fragment'] = substr( $url, $i + 1 );
			$url = substr( $url, 0, $i );
		}

		// Query?
		$i = strpos( $url, '?' );
		if ( $i !== false ) {
			$ret['query'] = substr( $url, $i + 1 );
			$url = substr( $url, 0, $i );
		}

		// Split authority and path
		if ( str_starts_with( $url, '//' ) ) {
			// The authority runs from after the "//" to the next "/", or to
			// the end of the string, in which case the path stays empty.
			$i = strpos( $url, '/', 2 );
			if ( $i === false ) {
				$ret['authority'] = substr( $url, 2 );
			} else {
				$ret['authority'] = substr( $url, 2, $i - 2 );
				$ret['path'] = substr( $url, $i );
			}
		} else {
			$ret['path'] = $url;
		}

		return $ret;
	}

	/**
	 * This function will reassemble a URL parsed with self::parseUrl().
	 *
	 * Note no percent-encoding or syntax validation is performed.
	 * Components that are missing or null are omitted together with
	 * their delimiter.
	 *
	 * @param array $urlParts URL parts, as output from self::parseUrl
	 * @return string URL assembled from its component parts
	 */
	public static function assembleUrl( array $urlParts ): string {
		$ret = '';

		if ( isset( $urlParts['scheme'] ) ) {
			$ret .= $urlParts['scheme'] . ':';
		}

		if ( isset( $urlParts['authority'] ) ) {
			$ret .= '//' . $urlParts['authority'];
		}

		if ( isset( $urlParts['path'] ) ) {
			$ret .= $urlParts['path'];
		}

		if ( isset( $urlParts['query'] ) ) {
			$ret .= '?' . $urlParts['query'];
		}

		if ( isset( $urlParts['fragment'] ) ) {
			$ret .= '#' . $urlParts['fragment'];
		}

		return $ret;
	}

	/**
	 * Remove all dot-segments in the provided URL path. For example,
	 * '/a/./b/../c/' becomes '/a/c/'.
	 *
	 * The steps labelled A to E below follow the numbering of the
	 * algorithm in RFC 3986 section 5.2.4. Instead of repeatedly chopping
	 * the input string, an offset into it is advanced.
	 *
	 * @see https://tools.ietf.org/html/rfc3986#section-5.2.4
	 * @note Copied from MediaWiki's UrlUtils::removeDotSegments()
	 * @param string $urlPath URL path, potentially containing dot-segments
	 * @return string URL path with all dot-segments removed
	 */
	public static function removeDotSegments( string $urlPath ): string {
		$output = '';
		$inputOffset = 0;
		$inputLength = strlen( $urlPath );

		while ( $inputOffset < $inputLength ) {
			$prefixLengthOne = substr( $urlPath, $inputOffset, 1 );
			$prefixLengthTwo = substr( $urlPath, $inputOffset, 2 );
			$prefixLengthThree = substr( $urlPath, $inputOffset, 3 );
			$prefixLengthFour = substr( $urlPath, $inputOffset, 4 );
			$trimOutput = false;

			if ( $prefixLengthTwo === './' ) {
				// Step A, remove leading "./"
				$inputOffset += 2;
			} elseif ( $prefixLengthThree === '../' ) {
				// Step A, remove leading "../"
				$inputOffset += 3;
			} elseif ( $prefixLengthTwo === '/.' && $inputOffset + 2 === $inputLength ) {
				// Step B, replace leading "/.$" with "/": skip the slash and
				// overwrite the final "." so that a lone "/" remains as input.
				$inputOffset += 1;
				$urlPath[$inputOffset] = '/';
			} elseif ( $prefixLengthThree === '/./' ) {
				// Step B, replace leading "/./" with "/"
				$inputOffset += 2;
			} elseif ( $prefixLengthThree === '/..' && $inputOffset + 3 === $inputLength ) {
				// Step C, replace leading "/..$" with "/" and
				// remove last path component in output
				$inputOffset += 2;
				$urlPath[$inputOffset] = '/';
				$trimOutput = true;
			} elseif ( $prefixLengthFour === '/../' ) {
				// Step C, replace leading "/../" with "/" and
				// remove last path component in output
				$inputOffset += 3;
				$trimOutput = true;
			} elseif ( $prefixLengthOne === '.' && $inputOffset + 1 === $inputLength ) {
				// Step D, remove "^.$"
				$inputOffset += 1;
			} elseif ( $prefixLengthTwo === '..' && $inputOffset + 2 === $inputLength ) {
				// Step D, remove "^..$"
				$inputOffset += 2;
			} else {
				// Step E, move leading path segment to output. The segment
				// includes its leading "/" (if any) and ends before the next "/".
				if ( $prefixLengthOne === '/' ) {
					$slashPos = strpos( $urlPath, '/', $inputOffset + 1 );
				} else {
					$slashPos = strpos( $urlPath, '/', $inputOffset );
				}
				if ( $slashPos === false ) {
					$output .= substr( $urlPath, $inputOffset );
					$inputOffset = $inputLength;
				} else {
					$output .= substr( $urlPath, $inputOffset, $slashPos - $inputOffset );
					$inputOffset = $slashPos;
				}
			}

			if ( $trimOutput ) {
				// Drop the last segment of the output along with its preceding "/"
				$slashPos = strrpos( $output, '/' );
				if ( $slashPos === false ) {
					$output = '';
				} else {
					$output = substr( $output, 0, $slashPos );
				}
			}
		}

		return $output;
	}

	/**
	 * Expand a relative URL using a base URL
	 *
	 * Implements the reference transformation of RFC 3986 section 5.2.2:
	 * the most specific component present in $url decides how much is
	 * inherited from $base. The fragment is always taken from $url.
	 *
	 * @see https://tools.ietf.org/html/rfc3986#section-5.2.2
	 * @param string $url Relative URL to expand
	 * @param string $base Base URL to expand relative to
	 * @return string Expanded URL
	 */
	public static function expandUrl( string $url, string $base ): string {
		$b = self::parseUrl( $base );
		$r = self::parseUrl( $url );

		$t = [];
		if ( isset( $r['scheme'] ) ) {
			// The reference is absolute: nothing is inherited from the base.
			$t['scheme'] = $r['scheme'];
			$t['authority'] = $r['authority'];
			$t['path'] = self::removeDotSegments( $r['path'] );
			$t['query'] = $r['query'];
		} elseif ( isset( $r['authority'] ) ) {
			// Network-path reference ("//host/..."): inherit the scheme only.
			$t['scheme'] = $b['scheme'];
			$t['authority'] = $r['authority'];
			$t['path'] = self::removeDotSegments( $r['path'] );
			$t['query'] = $r['query'];
		} elseif ( $r['path'] === '' ) {
			// Same-document reference: keep the base path, and keep the base
			// query too unless the reference supplies its own.
			$t['scheme'] = $b['scheme'];
			$t['authority'] = $b['authority'];
			$t['path'] = $b['path'];
			$t['query'] = $r['query'] ?? $b['query'];
		} else {
			$t['scheme'] = $b['scheme'];
			$t['authority'] = $b['authority'];
			if ( $r['path'][0] === '/' ) {
				// Absolute-path reference
				$merged = $r['path'];
			} elseif ( isset( $b['authority'] ) && $b['path'] === '' ) {
				// merge(), see RFC 3986 §5.2.3: base has an authority but no path
				$merged = '/' . $r['path'];
			} else {
				// merge(), see RFC 3986 §5.2.3: replace the last segment of the
				// base path (everything after its final "/") by the reference.
				$i = strrpos( $b['path'], '/' );
				$merged = $i === false
					? $r['path']
					: substr( $b['path'], 0, $i + 1 ) . $r['path'];
			}
			$t['path'] = self::removeDotSegments( $merged );
			$t['query'] = $r['query'];
		}
		$t['fragment'] = $r['fragment'];

		return self::assembleUrl( $t );
	}

	/**
	 * Check whether a given URL has a domain that occurs in a given set of domains
	 *
	 * The comparison is made against the host only: any "userinfo@" prefix
	 * and ":port" suffix of the URL's authority are ignored. A domain matches
	 * when the host is equal to it or is a subdomain of it; the comparison is
	 * case-sensitive. URLs without an authority never match.
	 *
	 * @param string $url
	 * @param array $domains Array of domains (strings)
	 * @return bool True if the host part of $url ends in one of the strings in $domains
	 */
	public static function matchesDomainList( string $url, array $domains ): bool {
		$bits = self::parseUrl( $url );
		if ( !isset( $bits['authority'] ) ) {
			return false;
		}

		// The leading dots anchor the match on a label boundary, so that
		// "notexample.com" is not taken for a subdomain of "example.com".
		$host = '.' . self::hostFromAuthority( $bits['authority'] );
		foreach ( $domains as $domain ) {
			if ( str_ends_with( $host, '.' . $domain ) ) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Extract the host from the authority component of a URL.
	 *
	 * @param string $authority Authority, in the form "[userinfo@]host[:port]"
	 * @return string The host, without userinfo and port
	 */
	private static function hostFromAuthority( string $authority ): string {
		// Userinfo ends at the "@"; take the last one so that a stray
		// unencoded "@" inside the userinfo cannot disguise the real host.
		$at = strrpos( $authority, '@' );
		if ( $at !== false ) {
			$authority = substr( $authority, $at + 1 );
		}

		// The port is a (possibly empty) run of digits after the final ":".
		// An IPv6 literal such as "[::1]" ends in "]" and is left untouched.
		return preg_replace( '/:[0-9]*$/D', '', $authority ) ?? $authority;
	}
}