MediaWiki  master
UrlUtils.php
Go to the documentation of this file.
1 <?php
2 
3 namespace MediaWiki\Utils;
4 
use BadMethodCallException;
use Exception;
use InvalidArgumentException;
use MediaWiki\MainConfigSchema;
use MWDebug;
10 
/**
 * A service to expand, parse, and otherwise manipulate URLs.
 *
 * All configuration is passed to the constructor as an options array keyed by the
 * public constants of this class; the service reads no global state on its own.
 */
class UrlUtils {
	/** Option key: the wiki's server URL; may be protocol-relative ("//host"). */
	public const SERVER = 'server';
	/** Option key: server URL used when expanding with PROTO_CANONICAL. */
	public const CANONICAL_SERVER = 'canonicalServer';
	/** Option key: server URL used when expanding with PROTO_INTERNAL. */
	public const INTERNAL_SERVER = 'internalServer';
	/** Option key: scheme (without "://") used when expanding with PROTO_FALLBACK. */
	public const FALLBACK_PROTOCOL = 'fallbackProtocol';
	/** Option key: port appended to HTTPS URLs synthesized from a protocol-relative server. */
	public const HTTPS_PORT = 'httpsPort';
	/** Option key: list of recognized protocols, e.g. "http://", "mailto:", "//". */
	public const VALID_PROTOCOLS = 'validProtocols';

	/**
	 * Every option key accepted by the constructor. The value of each key is also the
	 * name of the private property that stores the option.
	 */
	private const OPTION_KEYS = [
		self::SERVER,
		self::CANONICAL_SERVER,
		self::INTERNAL_SERVER,
		self::FALLBACK_PROTOCOL,
		self::HTTPS_PORT,
		self::VALID_PROTOCOLS,
	];

	/** @var ?string */
	private $server = null;

	/** @var ?string */
	private $canonicalServer = null;

	/** @var ?string */
	private $internalServer = null;

	/** @var string */
	private $fallbackProtocol = 'http';

	/** @var int */
	private $httpsPort = 443;

	/** @var array|string Already-formatted regex fragment if a string (deprecated), else a list */
	private $validProtocols = MainConfigSchema::UrlProtocols['default'];

	/** @var ?string Memoized result of validProtocols() */
	private $validProtocolsCache = null;

	/** @var ?string Memoized result of validAbsoluteProtocols() */
	private $validAbsoluteProtocolsCache = null;

	/**
	 * @param array $options Map of option key (one of the public constants of this
	 *   class) to value. Every key is optional:
	 *    - self::SERVER: no default; expand() throws for URLs starting with "/" if unset.
	 *    - self::CANONICAL_SERVER: defaults to SERVER expanded with PROTO_HTTP.
	 *    - self::INTERNAL_SERVER: defaults to SERVER.
	 *    - self::FALLBACK_PROTOCOL: defaults to 'http'.
	 *    - self::HTTPS_PORT: defaults to 443.
	 *    - self::VALID_PROTOCOLS: defaults to MainConfigSchema::UrlProtocols['default'].
	 * @throws InvalidArgumentException If an unknown option key is passed
	 */
	public function __construct( array $options = [] ) {
		foreach ( $options as $key => $value ) {
			// Strict lookup, so that e.g. an integer key can never alias a string option
			// name through loose comparison.
			if ( !in_array( $key, self::OPTION_KEYS, true ) ) {
				throw new InvalidArgumentException( "Unrecognized option \"$key\"" );
			}
			$this->$key = $value;
		}

		// Derive the secondary servers only after all options are stored, because
		// expand() depends on the configured valid protocols.
		if ( $this->server !== null ) {
			if ( $this->canonicalServer === null ) {
				$this->canonicalServer = $this->expand( $this->server, PROTO_HTTP );
			}
			if ( $this->internalServer === null ) {
				$this->internalServer = $this->server;
			}
		}
	}

	/**
	 * Expand a potentially local URL to a fully-qualified URL.
	 *
	 * URLs starting with "/" are resolved against the configured server selected by
	 * $defaultProto; URLs starting with "//" only get a protocol prepended. Dot segments
	 * in the path of the result are removed.
	 *
	 * @param string $url An absolute URL, a protocol-relative URL ("//host/path"), a
	 *   server-relative URL ("/path"), or a relative path
	 * @param string|int|null $defaultProto One of the PROTO_* constants. PROTO_CANONICAL
	 *   and PROTO_INTERNAL select the canonical/internal server; PROTO_FALLBACK uses the
	 *   configured fallback protocol; any other value is treated as a protocol string
	 *   ending in "//" and is used with the regular server.
	 * @return ?string The expanded URL, the cleaned-up relative path if $url was a
	 *   relative path, or null if the result cannot be parsed
	 * @throws BadMethodCallException If $url starts with "/" and the required server
	 *   option was not passed to the constructor
	 * @throws Exception If an HTTPS URL with a custom port has to be synthesized from a
	 *   protocol-relative server that already contains a port
	 */
	public function expand( string $url, $defaultProto = PROTO_FALLBACK ): ?string {
		if ( $defaultProto === PROTO_CANONICAL ) {
			$serverUrl = $this->canonicalServer;
		} elseif ( $defaultProto === PROTO_INTERNAL ) {
			$serverUrl = $this->internalServer;
		} else {
			$serverUrl = $this->server;
			if ( $defaultProto === PROTO_FALLBACK ) {
				$defaultProto = $this->fallbackProtocol . '://';
			}
		}

		if ( substr( $url, 0, 1 ) === '/' ) {
			if ( $serverUrl === null ) {
				throw new BadMethodCallException( 'Cannot call expand() if the appropriate ' .
					'SERVER/CANONICAL_SERVER/INTERNAL_SERVER option was not passed to the ' .
					'constructor' );
			}

			// Analyze $serverUrl to obtain its protocol
			$bits = $this->parse( $serverUrl );
			$serverProto = $bits && $bits['scheme'] !== '' ? $bits['scheme'] . '://' : null;

			if ( $defaultProto === PROTO_CANONICAL || $defaultProto === PROTO_INTERNAL ) {
				// Fall back to HTTP in the ridiculous case that CanonicalServer or InternalServer
				// doesn't have a protocol
				$defaultProto = $serverProto ?? PROTO_HTTP;
			}

			// Strip the trailing "//" so the protocol can be glued onto a "//host" string.
			// @phan-suppress-next-line PhanTypeMismatchArgumentNullableInternal T308355
			$defaultProtoWithoutSlashes = $defaultProto === PROTO_FALLBACK ? '' : substr( $defaultProto, 0, -2 );

			if ( substr( $url, 0, 2 ) === '//' ) {
				// Protocol-relative URL: only the protocol is missing
				$url = $defaultProtoWithoutSlashes . $url;
			} elseif ( $serverProto ) {
				// The server URL carries its own protocol; leave it alone
				$url = $serverUrl . $url;
			} elseif ( $defaultProto === PROTO_HTTPS && $this->httpsPort != 443 ) {
				// If an HTTPS URL is synthesized from a protocol-relative Server, allow the
				// user to override the port number (T67184). The comparison is deliberately
				// loose so that a port configured as the string "443" counts as the default.
				if ( isset( $bits['port'] ) ) {
					throw new Exception(
						'A protocol-relative server may not contain a port number' );
				}
				$url = "$defaultProtoWithoutSlashes$serverUrl:{$this->httpsPort}$url";
			} else {
				// Protocol-relative server: prepend the chosen protocol
				$url = "$defaultProtoWithoutSlashes$serverUrl$url";
			}
		}

		$bits = $this->parse( $url );

		if ( $bits && isset( $bits['path'] ) ) {
			$bits['path'] = $this->removeDotSegments( $bits['path'] );
			return $this->assemble( $bits );
		}
		if ( $bits ) {
			// No path to expand
			return $url;
		}
		if ( substr( $url, 0, 1 ) !== '/' ) {
			// URL is a relative path
			return $this->removeDotSegments( $url );
		}

		// Expanded URL is not valid.
		return null;
	}

	/**
	 * Get the wiki's "server", i.e. the result of expanding "/" with the given protocol
	 * and dropping the trailing slash.
	 *
	 * @param string|int|null $proto One of the PROTO_* constants, as accepted by expand()
	 * @return ?string The server URL, or null if expand() could not produce a valid URL
	 * @throws BadMethodCallException If the server option required for $proto was not
	 *   passed to the constructor
	 */
	public function getServer( $proto ): ?string {
		$url = $this->expand( '/', $proto );
		if ( $url === null ) {
			return null;
		}
		// Drop the "/" that was expanded
		return substr( $url, 0, -1 );
	}

	/**
	 * Reassemble a URL from an array of parts in the format returned by parse().
	 *
	 * The scheme is only emitted together with a delimiter, and user, password and port
	 * are only emitted together with a host. An empty query string is omitted.
	 *
	 * @param array $urlParts URL parts: 'scheme', 'delimiter', 'user', 'pass', 'host',
	 *   'port', 'path', 'query', 'fragment'; all optional
	 * @return string The assembled URL
	 */
	public function assemble( array $urlParts ): string {
		$result = '';

		if ( isset( $urlParts['delimiter'] ) ) {
			if ( isset( $urlParts['scheme'] ) ) {
				$result .= $urlParts['scheme'];
			}

			$result .= $urlParts['delimiter'];
		}

		if ( isset( $urlParts['host'] ) ) {
			if ( isset( $urlParts['user'] ) ) {
				$result .= $urlParts['user'];
				if ( isset( $urlParts['pass'] ) ) {
					$result .= ':' . $urlParts['pass'];
				}
				$result .= '@';
			}

			$result .= $urlParts['host'];

			if ( isset( $urlParts['port'] ) ) {
				$result .= ':' . $urlParts['port'];
			}
		}

		if ( isset( $urlParts['path'] ) ) {
			$result .= $urlParts['path'];
		}

		if ( isset( $urlParts['query'] ) && $urlParts['query'] !== '' ) {
			$result .= '?' . $urlParts['query'];
		}

		if ( isset( $urlParts['fragment'] ) ) {
			$result .= '#' . $urlParts['fragment'];
		}

		return $result;
	}

	/**
	 * Remove all dot-segments ("." and "..") from the provided URL path.
	 *
	 * The input is consumed from left to right; the step names in the comments below
	 * follow the code's own labelling (steps A to E).
	 *
	 * @param string $urlPath URL path, potentially containing dot-segments
	 * @return string URL path with all dot-segments removed
	 */
	public function removeDotSegments( string $urlPath ): string {
		$output = '';
		$inputOffset = 0;
		$inputLength = strlen( $urlPath );

		while ( $inputOffset < $inputLength ) {
			$prefixLengthOne = substr( $urlPath, $inputOffset, 1 );
			$prefixLengthTwo = substr( $urlPath, $inputOffset, 2 );
			$prefixLengthThree = substr( $urlPath, $inputOffset, 3 );
			$prefixLengthFour = substr( $urlPath, $inputOffset, 4 );
			$trimOutput = false;

			if ( $prefixLengthTwo === './' ) {
				// Step A, remove leading "./"
				$inputOffset += 2;
			} elseif ( $prefixLengthThree === '../' ) {
				// Step A, remove leading "../"
				$inputOffset += 3;
			} elseif ( $prefixLengthTwo === '/.' && $inputOffset + 2 === $inputLength ) {
				// Step B, replace leading "/.$" with "/": skip the slash and overwrite
				// the dot, so the next iteration sees a lone "/"
				$inputOffset += 1;
				$urlPath[$inputOffset] = '/';
			} elseif ( $prefixLengthThree === '/./' ) {
				// Step B, replace leading "/./" with "/"
				$inputOffset += 2;
			} elseif ( $prefixLengthThree === '/..' && $inputOffset + 3 === $inputLength ) {
				// Step C, replace leading "/..$" with "/" and
				// remove last path component in output
				$inputOffset += 2;
				$urlPath[$inputOffset] = '/';
				$trimOutput = true;
			} elseif ( $prefixLengthFour === '/../' ) {
				// Step C, replace leading "/../" with "/" and
				// remove last path component in output
				$inputOffset += 3;
				$trimOutput = true;
			} elseif ( $prefixLengthOne === '.' && $inputOffset + 1 === $inputLength ) {
				// Step D, remove "^.$"
				$inputOffset += 1;
			} elseif ( $prefixLengthTwo === '..' && $inputOffset + 2 === $inputLength ) {
				// Step D, remove "^..$"
				$inputOffset += 2;
			} else {
				// Step E, move leading path segment to output. A leading slash belongs
				// to the segment, so search for the terminating slash after it.
				$searchFrom = $prefixLengthOne === '/' ? $inputOffset + 1 : $inputOffset;
				$slashPos = strpos( $urlPath, '/', $searchFrom );
				if ( $slashPos === false ) {
					$output .= substr( $urlPath, $inputOffset );
					$inputOffset = $inputLength;
				} else {
					$output .= substr( $urlPath, $inputOffset, $slashPos - $inputOffset );
					$inputOffset = $slashPos;
				}
			}

			if ( $trimOutput ) {
				// Drop everything from the last slash of the output onwards
				$slashPos = strrpos( $output, '/' );
				$output = $slashPos === false ? '' : substr( $output, 0, $slashPos );
			}
		}

		return $output;
	}

	/**
	 * Returns a partial regular expression of recognized URL protocols, e.g. "a:|b:|//".
	 * The result is computed once and cached for the lifetime of the object.
	 *
	 * @return string Alternation of protocols, each quoted for use with "/" as delimiter
	 */
	public function validProtocols(): string {
		if ( $this->validProtocolsCache !== null ) {
			return $this->validProtocolsCache; // @codeCoverageIgnore
		}
		$this->validProtocolsCache = $this->validProtocolsInternal( true );
		return $this->validProtocolsCache;
	}

	/**
	 * Like validProtocols(), but excludes '//' from the protocol list. Use this if you
	 * need a regex that matches all URL protocols but does not match protocol-relative
	 * URLs. The result is computed once and cached for the lifetime of the object.
	 *
	 * @return string Alternation of protocols, each quoted for use with "/" as delimiter
	 */
	public function validAbsoluteProtocols(): string {
		if ( $this->validAbsoluteProtocolsCache !== null ) {
			return $this->validAbsoluteProtocolsCache; // @codeCoverageIgnore
		}
		$this->validAbsoluteProtocolsCache = $this->validProtocolsInternal( false );
		return $this->validAbsoluteProtocolsCache;
	}

	/**
	 * Shared implementation of validProtocols() and validAbsoluteProtocols().
	 *
	 * @param bool $includeProtocolRelative Whether '//' may appear in the result
	 * @return string Alternation of protocols. If the configured value is not an array,
	 *   a deprecation warning is emitted and the value is returned as-is, cast to string.
	 */
	private function validProtocolsInternal( bool $includeProtocolRelative ): string {
		if ( !is_array( $this->validProtocols ) ) {
			MWDebug::deprecated( '$wgUrlProtocols that is not an array', '1.39' );
			return (string)$this->validProtocols;
		}

		$protocols = [];
		foreach ( $this->validProtocols as $protocol ) {
			// Filter out '//' if !$includeProtocolRelative
			if ( $includeProtocolRelative || $protocol !== '//' ) {
				$protocols[] = preg_quote( $protocol, '/' );
			}
		}

		return implode( '|', $protocols );
	}

	/**
	 * parse_url() work-alike that additionally handles protocol-relative URLs, lowercases
	 * the scheme, and only accepts schemes listed in the configured valid protocols.
	 *
	 * On top of the keys produced by parse_url(), the result always contains:
	 *  - 'scheme': lowercased, or '' for protocol-relative URLs
	 *  - 'delimiter': '://', ':' or '//' (protocol-relative), whatever separates scheme
	 *    and the rest of the URL
	 *  - 'host': possibly ''
	 *
	 * @param string $url A URL to parse
	 * @return ?array Parts of the URL, or null if the URL is malformed or its scheme is
	 *   not one of the valid protocols
	 */
	public function parse( string $url ): ?array {
		// Protocol-relative URLs are handled really badly by parse_url(). It's so bad that the
		// easiest way to handle them is to just prepend 'http:' and strip the protocol out later.
		$wasRelative = substr( $url, 0, 2 ) === '//';
		if ( $wasRelative ) {
			$url = "http:$url";
		}
		$bits = parse_url( $url );
		// parse_url() returns an array without scheme for some invalid URLs, e.g.
		// parse_url("%0Ahttp://example.com") == [ 'host' => '%0Ahttp', 'path' => 'example.com' ]
		if ( !$bits || !isset( $bits['scheme'] ) ) {
			return null;
		}

		// parse_url() incorrectly handles schemes case-sensitively. Convert it to lowercase.
		$bits['scheme'] = strtolower( $bits['scheme'] );

		// most of the protocols are followed by ://, but mailto: and sometimes news: not, check for it
		if ( in_array( $bits['scheme'] . '://', $this->validProtocols, true ) ) {
			$bits['delimiter'] = '://';
		} elseif ( in_array( $bits['scheme'] . ':', $this->validProtocols, true ) ) {
			$bits['delimiter'] = ':';
			// parse_url detects for news: and mailto: the host part of an url as path
			// We have to correct this wrong detection
			if ( isset( $bits['path'] ) ) {
				$bits['host'] = $bits['path'];
				$bits['path'] = '';
			}
		} else {
			return null;
		}

		// Provide an empty host for, e.g., file:/// urls (see T30627)
		if ( !isset( $bits['host'] ) ) {
			$bits['host'] = '';

			// See T47069
			if ( isset( $bits['path'] ) ) {
				/* parse_url loses the third / for file:///c:/ urls (but not on variants) */
				if ( substr( $bits['path'], 0, 1 ) !== '/' ) {
					$bits['path'] = '/' . $bits['path'];
				}
			} else {
				$bits['path'] = '';
			}
		}

		// If the URL was protocol-relative, fix scheme and delimiter
		if ( $wasRelative ) {
			$bits['scheme'] = '';
			$bits['delimiter'] = '//';
		}
		return $bits;
	}

	/**
	 * Take a URL, make sure it's expanded to fully qualified, and decode every run of
	 * percent-encoded bytes in the range %80-%FF, i.e. encoded non-ASCII characters.
	 * Percent-encoded ASCII bytes are left encoded.
	 *
	 * @param string $url URL to expand, as accepted by expand()
	 * @return ?string The expanded URL with non-ASCII bytes decoded, or null if expand()
	 *   could not produce a valid URL
	 */
	public function expandIRI( string $url ): ?string {
		$expanded = $this->expand( $url );
		if ( $expanded === null ) {
			return null;
		}
		return preg_replace_callback(
			'/((?:%[89A-F][0-9A-F])+)/i',
			static function ( array $matches ) {
				return urldecode( $matches[1] );
			},
			$expanded
		);
	}

	/**
	 * Check whether a given URL has a host that equals, or is a subdomain of, one of the
	 * given domains. The comparison is a case-sensitive suffix match on whole labels.
	 *
	 * @param string $url URL whose host is checked
	 * @param array $domains List of domain names, without leading dot
	 * @return bool True if the URL parses and its host matches any of the domains
	 */
	public function matchesDomainList( string $url, array $domains ): bool {
		$bits = $this->parse( $url );
		if ( is_array( $bits ) && isset( $bits['host'] ) ) {
			// Prefix both sides with a dot so that "example.org" matches
			// "en.example.org" and "example.org" but not "badexample.org"
			$host = '.' . $bits['host'];
			foreach ( $domains as $domain ) {
				$domain = '.' . $domain;
				if ( substr( $host, -strlen( $domain ) ) === $domain ) {
					return true;
				}
			}
		}
		return false;
	}
}
const PROTO_CANONICAL
Definition: Defines.php:199
const PROTO_HTTPS
Definition: Defines.php:194
const PROTO_INTERNAL
Definition: Defines.php:200
const PROTO_FALLBACK
Definition: Defines.php:196
const PROTO_HTTP
Definition: Defines.php:193
$matches
if(!defined('MW_SETUP_CALLBACK'))
The persistent session ID (if any) loaded at startup.
Definition: WebStart.php:82
New debugger system that outputs a toolbar on page view.
Definition: MWDebug.php:36
static deprecated( $function, $version=false, $component=false, $callerOffset=2)
Show a warning that $function is deprecated.
Definition: MWDebug.php:224
This class contains schema declarations for all configuration variables known to MediaWiki core.
A service to expand, parse, and otherwise manipulate URLs.
Definition: UrlUtils.php:17
parse(string $url)
parse_url() work-alike, but non-broken.
Definition: UrlUtils.php:415
getServer( $proto)
Get the wiki's "server", i.e. the protocol and host part of the URL, with the protocol selected by $proto.
Definition: UrlUtils.php:198
expand(string $url, $defaultProto=PROTO_FALLBACK)
Expand a potentially local URL to a fully-qualified URL.
Definition: UrlUtils.php:118
expandIRI(string $url)
Take a URL, make sure it's expanded to fully qualified, and replace any encoded non-ASCII Unicode characters with their decoded UTF-8 form.
Definition: UrlUtils.php:481
validProtocols()
Returns a regular expression of recognized URL protocols.
Definition: UrlUtils.php:345
removeDotSegments(string $urlPath)
Remove all dot-segments in the provided URL path.
Definition: UrlUtils.php:269
matchesDomainList(string $url, array $domains)
Check whether a given URL has a domain that occurs in a given set of domains.
Definition: UrlUtils.php:503
__construct(array $options=[])
Definition: UrlUtils.php:69
assemble(array $urlParts)
This function will reassemble a URL parsed with parse().
Definition: UrlUtils.php:218
validAbsoluteProtocols()
Like validProtocols(), but excludes '//' from the protocol list.
Definition: UrlUtils.php:359