Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
97.55% |
159 / 163 |
|
66.67% |
8 / 12 |
CRAP | |
0.00% |
0 / 1 |
UrlUtils | |
97.55% |
159 / 163 |
|
66.67% |
8 / 12 |
90 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
14 | |||
expand | |
97.22% |
35 / 36 |
|
0.00% |
0 / 1 |
20 | |||
getServer | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
2.06 | |||
getCanonicalServer | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
assemble | |
100.00% |
21 / 21 |
|
100.00% |
1 / 1 |
11 | |||
removeDotSegments | |
100.00% |
39 / 39 |
|
100.00% |
1 / 1 |
18 | |||
validProtocols | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
validAbsoluteProtocols | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
validProtocolsInternal | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
5 | |||
parse | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
10 | |||
expandIRI | |
87.50% |
7 / 8 |
|
0.00% |
0 / 1 |
2.01 | |||
matchesDomainList | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
5 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Utils; |
4 | |
5 | use BadMethodCallException; |
6 | use InvalidArgumentException; |
7 | use MediaWiki\Debug\MWDebug; |
8 | use MediaWiki\MainConfigSchema; |
9 | |
10 | /** |
11 | * A service to expand, parse, and otherwise manipulate URLs. |
12 | * |
13 | * @since 1.39 |
14 | * @newable |
15 | */ |
16 | class UrlUtils { |
17 | public const SERVER = 'server'; |
18 | public const CANONICAL_SERVER = 'canonicalServer'; |
19 | public const INTERNAL_SERVER = 'internalServer'; |
20 | public const FALLBACK_PROTOCOL = 'fallbackProtocol'; |
21 | public const HTTPS_PORT = 'httpsPort'; |
22 | public const VALID_PROTOCOLS = 'validProtocols'; |
23 | |
24 | /** @var ?string */ |
25 | private $server = null; |
26 | |
27 | /** @var ?string */ |
28 | private $canonicalServer = null; |
29 | |
30 | /** @var ?string */ |
31 | private $internalServer = null; |
32 | /** @var string */ |
33 | private $fallbackProtocol = 'http'; |
34 | |
35 | /** @var int */ |
36 | private $httpsPort = 443; |
37 | |
38 | /** @var array */ |
39 | private $validProtocols = MainConfigSchema::UrlProtocols['default']; |
40 | |
41 | /** @var ?string */ |
42 | private $validProtocolsCache = null; |
43 | |
44 | /** @var ?string */ |
45 | private $validAbsoluteProtocolsCache = null; |
46 | |
/**
 * @stable to call
 * @param array $options All keys are optional. If SERVER is omitted, expand() throws for any
 *   URL that starts with '/', and so do getServer() and expandIRI(), which call expand().
 *   matchesDomainList() only parses its argument and does not need SERVER. Recognized keys:
 *   * self::SERVER: The protocol and server portion of the URLs to expand, with no other parts
 *     (port, path, etc.). Example: 'https://example.com'. Protocol-relative URLs are
 *     allowed.
 *   * self::CANONICAL_SERVER: If SERVER is protocol-relative, this can be set to a
 *     fully-qualified version for use when PROTO_CANONICAL is passed to expand(). Defaults
 *     to SERVER, with 'http:' prepended if SERVER is protocol-relative. null and false both
 *     mean "not set".
 *   * self::INTERNAL_SERVER: An alternative to SERVER that's used when PROTO_INTERNAL is
 *     passed to expand(). It's intended for sites that have a different server name exposed
 *     to CDNs. Defaults to SERVER. null and false both mean "not set".
 *   * self::FALLBACK_PROTOCOL: Used by expand() when no $defaultProto parameter is provided.
 *     Defaults to 'http'. The instance created by ServiceWiring sets this to 'https' if the
 *     current request is detected to be via HTTPS, and 'http' otherwise.
 *   * self::HTTPS_PORT: Defaults to 443. Used when a protocol-relative URL is expanded to
 *     https.
 *   * self::VALID_PROTOCOLS: An array of recognized URL protocols. The default can be found
 *     in MainConfigSchema::UrlProtocols['default'].
 * @throws InvalidArgumentException if $options contains an unrecognized key
 */
public function __construct( array $options = [] ) {
	// Every option name is also the name of the property that stores it.
	$recognizedOptions = [
		self::SERVER,
		self::CANONICAL_SERVER,
		self::INTERNAL_SERVER,
		self::FALLBACK_PROTOCOL,
		self::HTTPS_PORT,
		self::VALID_PROTOCOLS,
	];

	foreach ( $options as $key => $value ) {
		// Strict comparison: with a loose match, an integer key such as 0 would be treated
		// as a recognized option on PHP 7.
		if ( !in_array( $key, $recognizedOptions, true ) ) {
			throw new InvalidArgumentException( "Unrecognized option \"$key\"" );
		}
		$this->$key = $value;
	}

	// false means "not set" for these two options. Normalize it to null even when SERVER is
	// missing, so that expand() hits its null check and throws instead of silently
	// building a URL from an empty server name.
	if ( $this->canonicalServer === false ) {
		$this->canonicalServer = null;
	}
	if ( $this->internalServer === false ) {
		$this->internalServer = null;
	}

	if ( $this->server !== null ) {
		// expand() only reads $this->server for PROTO_HTTP, so it is safe to call here.
		$this->canonicalServer ??= $this->expand( $this->server, PROTO_HTTP );
		$this->internalServer ??= $this->server;
	}
}
94 | |
/**
 * Turn a possibly server-relative URL into a fully-qualified one, using the configured
 * SERVER option (or CANONICAL_SERVER / INTERNAL_SERVER, depending on $defaultProto).
 *
 * Meaning of the PROTO_* constants:
 * PROTO_HTTP: Output a URL starting with http://
 * PROTO_HTTPS: Output a URL starting with https://
 * PROTO_RELATIVE: Output a URL starting with // (protocol-relative URL)
 * PROTO_FALLBACK: Output a URL starting with the FALLBACK_PROTOCOL option
 * PROTO_CURRENT: Legacy alias for PROTO_FALLBACK
 * PROTO_CANONICAL: For URLs without a domain, like /w/index.php, use CANONICAL_SERVER. For
 *    protocol-relative URLs, use the protocol of CANONICAL_SERVER
 * PROTO_INTERNAL: Like PROTO_CANONICAL, but uses INTERNAL_SERVER instead of CANONICAL_SERVER
 *
 * A protocol that is already present, either in $url itself or (for domain-relative URLs)
 * in the server option, is not replaced by PROTO_HTTP, PROTO_HTTPS, PROTO_RELATIVE or
 * PROTO_CURRENT.
 *
 * Dot segments (/./ and /../) in the path are resolved, see ::removeDotSegments.
 *
 * @todo this won't work with current-path-relative URLs like "subdir/foo.html", etc.
 *
 * @throws BadMethodCallException if $url starts with '/' and the server option selected by
 *   $defaultProto was not passed to the constructor
 * @throws InvalidArgumentException if a non-default HTTPS_PORT has to be added to a
 *   protocol-relative server that already contains a port
 * @param string $url An URL; can be absolute (e.g. http://example.com/foo/bar),
 *    protocol-relative (//example.com/foo/bar) or domain-relative (/foo/bar).
 * @param string|int|null $defaultProto One of the PROTO_* constants, as described above.
 * @return ?string Fully-qualified URL, current-path-relative URL or null if
 *    no valid URL can be constructed
 */
public function expand( string $url, $defaultProto = PROTO_FALLBACK ): ?string {
	// Select the server option that matches the requested protocol mode.
	if ( $defaultProto === PROTO_CANONICAL ) {
		$serverUrl = $this->canonicalServer;
	} elseif ( $defaultProto === PROTO_INTERNAL ) {
		$serverUrl = $this->internalServer;
	} else {
		$serverUrl = $this->server;
		if ( $defaultProto === PROTO_FALLBACK ) {
			$defaultProto = $this->fallbackProtocol . '://';
		}
	}

	if ( str_starts_with( $url, '/' ) ) {
		if ( $serverUrl === null ) {
			throw new BadMethodCallException( 'Cannot call expand() if the appropriate ' .
				'SERVER/CANONICAL_SERVER/INTERNAL_SERVER option was not passed to the ' .
				'constructor' );
		}

		// Find out whether the server option carries its own protocol.
		$serverBits = $this->parse( $serverUrl );
		$serverProto = null;
		if ( $serverBits && $serverBits['scheme'] != '' ) {
			$serverProto = $serverBits['scheme'] . '://';
		}

		if ( $defaultProto === PROTO_CANONICAL || $defaultProto === PROTO_INTERNAL ) {
			// CanonicalServer/InternalServer should always have a protocol; use HTTP in the
			// unlikely case that it does not.
			$defaultProto = $serverProto ?? PROTO_HTTP;
		}

		// "http://" becomes "http:", and "//" becomes "".
		// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal T308355
		$protoPrefix = $defaultProto === PROTO_FALLBACK ? '' : substr( $defaultProto, 0, -2 );

		if ( str_starts_with( $url, '//' ) ) {
			// Protocol-relative URL: only the protocol is missing.
			$url = $protoPrefix . $url;
		} elseif ( $serverProto !== null ) {
			// The server option is fully qualified, use it unchanged.
			$url = $serverUrl . $url;
		} else {
			// The server option is protocol-relative. When an HTTPS URL is synthesized from
			// it, honour a non-default HTTPS port (T67184).
			$portSuffix = '';
			if ( $defaultProto === PROTO_HTTPS && $this->httpsPort != 443 ) {
				if ( isset( $serverBits['port'] ) ) {
					throw new InvalidArgumentException(
						'A protocol-relative server may not contain a port number' );
				}
				$portSuffix = ':' . $this->httpsPort;
			}
			$url = $protoPrefix . $serverUrl . $portSuffix . $url;
		}
	}

	$parsed = $this->parse( $url );

	if ( !$parsed ) {
		// Not parseable as an absolute URL: it is either a relative path, or invalid.
		return str_starts_with( $url, '/' ) ? null : self::removeDotSegments( $url );
	}

	if ( !isset( $parsed['path'] ) ) {
		// No path, so there are no dot segments to resolve.
		return $url;
	}

	$parsed['path'] = self::removeDotSegments( $parsed['path'] );
	return self::assemble( $parsed );
}
195 | |
/**
 * Return the wiki's "server" (protocol and host part of the URL), with the protocol chosen
 * by a PROTO_* constant in the same way as in expand().
 *
 * Implemented by expanding '/' and dropping the last character of the result.
 *
 * @throws BadMethodCallException if no server was passed to the constructor
 * @param string|int|null $proto One of the PROTO_* constants.
 * @return ?string The URL, or null on failure
 */
public function getServer( $proto ): ?string {
	$rootUrl = $this->expand( '/', $proto );

	return $rootUrl === null ? null : substr( $rootUrl, 0, -1 );
}
211 | |
/**
 * Get the canonical server, i.e. the canonical protocol and host part of
 * the wiki's URL.
 *
 * @throws BadMethodCallException if no canonical server is available, i.e. neither a usable
 *   SERVER nor a CANONICAL_SERVER option was passed to the constructor
 * @return string
 */
public function getCanonicalServer(): string {
	// Fail with the same exception type as expand() rather than relying on the return type
	// declaration: that would raise a TypeError for null and coerce false to ''.
	if ( !is_string( $this->canonicalServer ) ) {
		throw new BadMethodCallException( 'Cannot call getCanonicalServer() if no usable ' .
			'SERVER or CANONICAL_SERVER option was passed to the constructor' );
	}

	return $this->canonicalServer;
}
221 | |
/**
 * Put a URL that was split by parse() back together, e.g. after editing one of its parts.
 *
 * Layout of the result (brackets contain keys for $urlParts):
 * [scheme][delimiter][user]:[pass]@[host]:[port][path]?[query]#[fragment]
 *
 * The scheme is only emitted together with a delimiter; user, pass and port are only
 * emitted together with a host; pass is only emitted together with a user. An empty query
 * is dropped, whereas an empty fragment still produces a trailing '#'.
 *
 * @since 1.41
 * @param array $urlParts URL parts, as output from parse()
 * @return string URL assembled from its component parts
 */
public static function assemble( array $urlParts ): string {
	$url = '';

	if ( isset( $urlParts['delimiter'] ) ) {
		$url = ( $urlParts['scheme'] ?? '' ) . $urlParts['delimiter'];
	}

	if ( isset( $urlParts['host'] ) ) {
		$authority = '';
		if ( isset( $urlParts['user'] ) ) {
			$authority = $urlParts['user'];
			$authority .= isset( $urlParts['pass'] ) ? ':' . $urlParts['pass'] : '';
			$authority .= '@';
		}
		$authority .= $urlParts['host'];
		$authority .= isset( $urlParts['port'] ) ? ':' . $urlParts['port'] : '';

		$url .= $authority;
	}

	$url .= $urlParts['path'] ?? '';

	$query = $urlParts['query'] ?? '';
	if ( $query !== '' ) {
		$url .= '?' . $query;
	}

	if ( isset( $urlParts['fragment'] ) ) {
		$url .= '#' . $urlParts['fragment'];
	}

	return $url;
}
274 | |
/**
 * Resolve the "." and ".." segments of a URL path; '/a/./b/../c/' becomes '/a/c/'.
 * The steps referenced in the comments are those of RFC 3986 section 5.2.4.
 *
 * @since 1.41
 * @param string $urlPath URL path, potentially containing dot-segments
 * @return string URL path with all dot-segments removed
 */
public static function removeDotSegments( string $urlPath ): string {
	$result = '';
	$pos = 0;
	$length = strlen( $urlPath );

	while ( $pos < $length ) {
		// Number of input bytes not yet consumed.
		$remaining = $length - $pos;
		$dropLastSegment = false;

		if ( substr_compare( $urlPath, './', $pos, 2 ) === 0 ) {
			# Step A: skip a leading "./"
			$pos += 2;
		} elseif ( substr_compare( $urlPath, '../', $pos, 3 ) === 0 ) {
			# Step A: skip a leading "../"
			$pos += 3;
		} elseif ( $remaining === 2 && str_ends_with( $urlPath, '/.' ) ) {
			# Step B: the input is exactly "/." -> continue with "/"
			$pos += 1;
			$urlPath[$pos] = '/';
		} elseif ( substr_compare( $urlPath, '/./', $pos, 3 ) === 0 ) {
			# Step B: a leading "/./" -> continue with "/"
			$pos += 2;
		} elseif ( $remaining === 3 && str_ends_with( $urlPath, '/..' ) ) {
			# Step C: the input is exactly "/.." -> continue with "/" and
			# drop the last segment that was already written
			$pos += 2;
			$urlPath[$pos] = '/';
			$dropLastSegment = true;
		} elseif ( substr_compare( $urlPath, '/../', $pos, 4 ) === 0 ) {
			# Step C: a leading "/../" -> continue with "/" and
			# drop the last segment that was already written
			$pos += 3;
			$dropLastSegment = true;
		} elseif ( $remaining === 1 && str_ends_with( $urlPath, '.' ) ) {
			# Step D: the input is exactly "."
			$pos += 1;
		} elseif ( $remaining === 2 && str_ends_with( $urlPath, '..' ) ) {
			# Step D: the input is exactly ".."
			$pos += 2;
		} else {
			# Step E: copy the first segment (with its leading slash, if any) up to,
			# but not including, the next slash
			$searchFrom = $urlPath[$pos] === '/' ? $pos + 1 : $pos;
			$nextSlash = strpos( $urlPath, '/', $searchFrom );
			$segmentEnd = $nextSlash === false ? $length : $nextSlash;

			$result .= substr( $urlPath, $pos, $segmentEnd - $pos );
			$pos = $segmentEnd;
		}

		if ( $dropLastSegment ) {
			$lastSlash = strrpos( $result, '/' );
			$result = $lastSlash === false ? '' : substr( $result, 0, $lastSlash );
		}
	}

	return $result;
}
348 | |
/**
 * Partial regular expression matching every recognized URL protocol, including the
 * protocol-relative '//' when it is configured, e.g. "http:\/\/|https:\/\/".
 * The value is computed on first use and memoized.
 *
 * @return string
 */
public function validProtocols(): string {
	if ( $this->validProtocolsCache === null ) {
		$this->validProtocolsCache = $this->validProtocolsInternal( true );
	}

	return $this->validProtocolsCache;
}
358 | |
/**
 * Variant of validProtocols() whose protocol list leaves out '//'. Use it when the regex
 * must match every URL protocol but must not match protocol-relative URLs.
 * The value is computed on first use and memoized.
 *
 * @return string
 */
public function validAbsoluteProtocols(): string {
	if ( $this->validAbsoluteProtocolsCache === null ) {
		$this->validAbsoluteProtocolsCache = $this->validProtocolsInternal( false );
	}

	return $this->validAbsoluteProtocolsCache;
}
369 | |
/**
 * Build a partial regular expression of URL protocols, e.g. "http:\/\/|https:\/\/".
 *
 * Each configured protocol is escaped with preg_quote() for use with '/' as the regex
 * delimiter, and the alternatives are joined with '|'. A VALID_PROTOCOLS value that is not
 * an array is reported as deprecated and returned as a string without further processing.
 *
 * @param bool $includeProtocolRelative If false, remove '//' from the returned protocol list.
 * @return string
 */
private function validProtocolsInternal( bool $includeProtocolRelative ): string {
	$configured = $this->validProtocols;

	if ( !is_array( $configured ) ) {
		MWDebug::deprecated( '$wgUrlProtocols that is not an array', '1.39' );
		return (string)$configured;
	}

	if ( !$includeProtocolRelative ) {
		// Leave out the protocol-relative pseudo-protocol.
		$configured = array_filter(
			$configured,
			static fn ( $protocol ) => $protocol !== '//'
		);
	}

	$quoted = array_map(
		static fn ( $protocol ) => preg_quote( $protocol, '/' ),
		$configured
	);

	return implode( '|', $quoted );
}
392 | |
/**
 * Advanced and configurable version of parse_url().
 *
 * 1) Add a "delimiter" element to the array, which makes it possible to blindly re-assemble
 *    any URL regardless of protocol, including those that don't use `://`,
 *    such as "mailto:" and "news:".
 * 2) Reject URLs with protocols not in $wgUrlProtocols.
 * 3) Reject relative or incomplete URLs that parse_url would return a partial array for.
 *
 * If all you need is to extract parts of an HTTP or HTTPS URL (i.e. not specific to
 * site-configurable extra protocols, or user input) then `parse_url()` can be used
 * directly instead.
 *
 * @param string $url A URL to parse
 * @return ?array Bits of the URL in an associative array, or null on failure. All values
 *   are strings except 'port', which parse_url() returns as an int.
 *   Possible fields:
 *   - scheme: URI scheme (protocol), e.g. 'http', 'mailto'. Lowercase, always present, but can
 *       be an empty string for protocol-relative URLs.
 *   - delimiter: either '://', ':' or '//'. Always present.
 *   - host: domain name / IP. Always present, but could be an empty string, e.g. for file: URLs.
 *   - port: port number. Will be missing when port is not explicitly specified.
 *   - user: user name, e.g. for HTTP Basic auth URLs such as http://user:pass@example.com/
 *       Missing when there is no username.
 *   - pass: password, same as above.
 *   - path: path including the leading /. Will be missing when empty (e.g. 'http://example.com')
 *   - query: query string (as a string; see wfCgiToArray() for parsing it), can be missing.
 *   - fragment: the part after #, can be missing.
 */
public function parse( string $url ): ?array {
	// Protocol-relative URLs are handled really badly by parse_url(). It's so bad that the
	// easiest way to handle them is to just prepend 'http:' and strip the protocol out later.
	$wasRelative = str_starts_with( $url, '//' );
	if ( $wasRelative ) {
		$url = "http:$url";
	}
	$bits = parse_url( $url );
	// parse_url() returns false for seriously malformed URLs, and an array without scheme
	// for other invalid URLs, e.g.
	// parse_url("something bad://example") == [ 'path' => 'something bad://example' ]
	if ( !$bits || !isset( $bits['scheme'] ) ) {
		return null;
	}

	// parse_url() preserves the case of the scheme. Convert it to lowercase.
	$bits['scheme'] = strtolower( $bits['scheme'] );
	$bits['host'] ??= '';

	// Most protocols are followed by '://', but mailto: and sometimes news: are not, so
	// check for both forms. The comparison is strict so that a stray non-string entry in
	// the protocol list (e.g. true) cannot match every scheme.
	// NOTE(review): unlike validProtocolsInternal(), this does not handle the deprecated
	// non-array form of VALID_PROTOCOLS.
	if ( in_array( $bits['scheme'] . '://', $this->validProtocols, true ) ) {
		$bits['delimiter'] = '://';
	} elseif ( in_array( $bits['scheme'] . ':', $this->validProtocols, true ) ) {
		$bits['delimiter'] = ':';
	} else {
		return null;
	}

	/* parse_url loses the third / for file:///c:/ urls */
	if ( $bits['scheme'] === 'file' && isset( $bits['path'] ) && !str_starts_with( $bits['path'], '/' ) ) {
		$bits['path'] = '/' . $bits['path'];
	}

	// If the URL was protocol-relative, undo the 'http:' that was prepended above
	if ( $wasRelative ) {
		$bits['scheme'] = '';
		$bits['delimiter'] = '//';
	}
	return $bits;
}
460 | |
/**
 * Expand a URL with expand() and then turn percent-encoded bytes in the range %80-%FF back
 * into raw bytes, so that non-ASCII characters are shown in their original form, which is
 * more compact and more legible for local audiences. Percent-encoded ASCII is left alone.
 *
 * @todo handle punycode domains too
 *
 * @throws BadMethodCallException if no server was passed to the constructor
 * @param string $url
 * @return ?string
 */
public function expandIRI( string $url ): ?string {
	$absolute = $this->expand( $url );

	if ( $absolute !== null ) {
		// Decode each run of consecutive high-byte escapes in one go.
		return preg_replace_callback(
			'/(?:%[89A-F][0-9A-F])+/i',
			static function ( $match ) {
				return urldecode( $match[0] );
			},
			$absolute
		);
	}

	return null;
}
483 | |
/**
 * Check whether a given URL has a domain that occurs in a given set of domains.
 *
 * A domain matches if the host is equal to it or is a subdomain of it: both sides get a
 * leading '.', so 'example.com' matches 'example.com' and 'www.example.com' but not
 * 'badexample.com'. Host names are case-insensitive, so the comparison ignores case.
 *
 * The URL is only parsed, not expanded, so this works without the SERVER option. A URL
 * that parse() rejects (e.g. a relative URL) never matches.
 *
 * @param string $url
 * @param array $domains Array of domains (strings)
 * @return bool True if the host part of $url ends in one of the strings in $domains
 */
public function matchesDomainList( string $url, array $domains ): bool {
	$bits = $this->parse( $url );
	if ( !is_array( $bits ) || !isset( $bits['host'] ) ) {
		return false;
	}

	// parse() lowercases the scheme but leaves the host as written, so normalize it here.
	$host = '.' . strtolower( $bits['host'] );
	foreach ( $domains as $domain ) {
		if ( str_ends_with( $host, '.' . strtolower( (string)$domain ) ) ) {
			return true;
		}
	}

	return false;
}
505 | } |