Code Coverage for /src/src/Utils/UrlUtils.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	93.28% covered (success)	93.28%	111 / 119	80.00% covered (warning)	80.00%	4 / 5	CRAP	0.00% covered (danger)	0.00%	0 / 1
UrlUtils	93.28% covered (success)	93.28%	111 / 119	80.00% covered (warning)	80.00%	4 / 5	42.54	0.00% covered (danger)	0.00%	0 / 1
parseUrl	100.00% covered (success)	100.00%	27 / 27	100.00% covered (success)	100.00%	1 / 1	6
assembleUrl	100.00% covered (success)	100.00%	12 / 12	100.00% covered (success)	100.00%	1 / 1	6
removeDotSegments	100.00% covered (success)	100.00%	43 / 43	100.00% covered (success)	100.00%	1 / 1	18
expandUrl	100.00% covered (success)	100.00%	29 / 29	100.00% covered (success)	100.00%	1 / 1	8
matchesDomainList	0.00% covered (danger)	0.00%	0 / 8	0.00% covered (danger)	0.00%	0 / 1	20

1	<?php
2
3	declare( strict_types = 1 );
4
5	namespace Wikimedia\Parsoid\Utils;
6
/**
 * Utilities for manipulating URLs
 * @see https://tools.ietf.org/html/rfc3986
 */
class UrlUtils {

	/**
	 * Parse a possibly-relative URL into components
	 *
	 * Note no percent-decoding is performed, and only minimal syntax validation.
	 *
	 * @param string $url
	 * @return (string|null)[]
	 *  - 'scheme': Scheme of the url, if any.
	 *  - 'authority': Authority part of the url, if any.
	 *    This is the part in between the "//" and the path. For http, this is the "user@host:port".
	 *  - 'path': Path part of the URL. Never null, but may be the empty string.
	 *  - 'query': Query part of the URL, if any.
	 *  - 'fragment': Fragment part of the URL, if any.
	 */
	public static function parseUrl( string $url ): array {
		$ret = [
			'scheme' => null,
			'authority' => null,
			'path' => '',
			'query' => null,
			'fragment' => null,
		];

		// Scheme? (ALPHA followed by ALPHA / DIGIT / "+" / "-" / ".", then ":")
		if ( preg_match( '!^([a-z][a-z0-9+.-]*):!i', $url, $m ) ) {
			$ret['scheme'] = $m[1];
			$url = substr( $url, strlen( $m[0] ) );
		}

		// Fragment? Must be split off before the query, since a "?" after
		// the first "#" belongs to the fragment.
		$i = strpos( $url, '#' );
		if ( $i !== false ) {
			$ret['fragment'] = substr( $url, $i + 1 );
			$url = substr( $url, 0, $i );
		}

		// Query?
		$i = strpos( $url, '?' );
		if ( $i !== false ) {
			$ret['query'] = substr( $url, $i + 1 );
			$url = substr( $url, 0, $i );
		}

		// Split authority and path: an authority is present only when the
		// remainder starts with "//", and it runs up to the next "/".
		if ( substr( $url, 0, 2 ) === '//' ) {
			$i = strpos( $url, '/', 2 );
			if ( $i === false ) {
				$ret['authority'] = substr( $url, 2 );
				$ret['path'] = '';
			} else {
				$ret['authority'] = substr( $url, 2, $i - 2 );
				$ret['path'] = substr( $url, $i );
			}
		} else {
			$ret['path'] = $url;
		}

		return $ret;
	}

	/**
	 * This function will reassemble a URL parsed with self::parseUrl().
	 *
	 * Note no percent-encoding or syntax validation is performed.
	 * Components that are missing or null are omitted from the output.
	 *
	 * @param array $urlParts URL parts, as output from self::parseUrl
	 * @return string URL assembled from its component parts
	 */
	public static function assembleUrl( array $urlParts ): string {
		$ret = '';

		if ( isset( $urlParts['scheme'] ) ) {
			$ret .= $urlParts['scheme'] . ':';
		}

		if ( isset( $urlParts['authority'] ) ) {
			$ret .= '//' . $urlParts['authority'];
		}

		if ( isset( $urlParts['path'] ) ) {
			$ret .= $urlParts['path'];
		}

		if ( isset( $urlParts['query'] ) ) {
			$ret .= '?' . $urlParts['query'];
		}

		if ( isset( $urlParts['fragment'] ) ) {
			$ret .= '#' . $urlParts['fragment'];
		}

		return $ret;
	}

	/**
	 * Remove all dot-segments in the provided URL path. For example,
	 * '/a/./b/../c/' becomes '/a/c/'.
	 *
	 * @see https://tools.ietf.org/html/rfc3986#section-5.2.4
	 * @note Copied from MediaWiki's UrlUtils::removeDotSegments()
	 * @param string $urlPath URL path, potentially containing dot-segments
	 * @return string URL path with all dot-segments removed
	 */
	public static function removeDotSegments( string $urlPath ): string {
		$output = '';
		$inputOffset = 0;
		$inputLength = strlen( $urlPath );

		while ( $inputOffset < $inputLength ) {
			$prefixLengthOne = substr( $urlPath, $inputOffset, 1 );
			$prefixLengthTwo = substr( $urlPath, $inputOffset, 2 );
			$prefixLengthThree = substr( $urlPath, $inputOffset, 3 );
			$prefixLengthFour = substr( $urlPath, $inputOffset, 4 );
			$trimOutput = false;

			if ( $prefixLengthTwo === './' ) {
				# Step A, remove leading "./"
				$inputOffset += 2;
			} elseif ( $prefixLengthThree === '../' ) {
				# Step A, remove leading "../"
				$inputOffset += 3;
			} elseif ( $prefixLengthTwo === '/.' && $inputOffset + 2 === $inputLength ) {
				# Step B, replace leading "/.$" with "/"
				# (the "." is overwritten in place so the next pass sees "/")
				$inputOffset += 1;
				$urlPath[$inputOffset] = '/';
			} elseif ( $prefixLengthThree === '/./' ) {
				# Step B, replace leading "/./" with "/"
				$inputOffset += 2;
			} elseif ( $prefixLengthThree === '/..' && $inputOffset + 3 === $inputLength ) {
				# Step C, replace leading "/..$" with "/" and
				# remove last path component in output
				$inputOffset += 2;
				$urlPath[$inputOffset] = '/';
				$trimOutput = true;
			} elseif ( $prefixLengthFour === '/../' ) {
				# Step C, replace leading "/../" with "/" and
				# remove last path component in output
				$inputOffset += 3;
				$trimOutput = true;
			} elseif ( $prefixLengthOne === '.' && $inputOffset + 1 === $inputLength ) {
				# Step D, remove "^.$"
				$inputOffset += 1;
			} elseif ( $prefixLengthTwo === '..' && $inputOffset + 2 === $inputLength ) {
				# Step D, remove "^..$"
				$inputOffset += 2;
			} else {
				# Step E, move leading path segment to output
				# (the segment includes its leading "/", if any, but not the next one)
				if ( $prefixLengthOne === '/' ) {
					$slashPos = strpos( $urlPath, '/', $inputOffset + 1 );
				} else {
					$slashPos = strpos( $urlPath, '/', $inputOffset );
				}
				if ( $slashPos === false ) {
					$output .= substr( $urlPath, $inputOffset );
					$inputOffset = $inputLength;
				} else {
					$output .= substr( $urlPath, $inputOffset, $slashPos - $inputOffset );
					$inputOffset = $slashPos;
				}
			}

			if ( $trimOutput ) {
				# Drop the last segment of the output together with its leading "/"
				$slashPos = strrpos( $output, '/' );
				if ( $slashPos === false ) {
					$output = '';
				} else {
					$output = substr( $output, 0, $slashPos );
				}
			}
		}

		return $output;
	}

	/**
	 * Expand a relative URL using a base URL
	 *
	 * Follows the reference-resolution algorithm of RFC 3986 §5.2.2: the
	 * most specific component present in $url decides how much is inherited
	 * from $base. The fragment always comes from $url.
	 *
	 * @see https://tools.ietf.org/html/rfc3986#section-5.2.2
	 * @param string $url Relative URL to expand
	 * @param string $base Base URL to expand relative to
	 * @return string Expanded URL
	 */
	public static function expandUrl( string $url, string $base ): string {
		$b = self::parseUrl( $base );
		$r = self::parseUrl( $url );

		$t = [];
		if ( isset( $r['scheme'] ) ) {
			// Already absolute: nothing is taken from the base
			$t['scheme'] = $r['scheme'];
			$t['authority'] = $r['authority'];
			$t['path'] = self::removeDotSegments( $r['path'] );
			$t['query'] = $r['query'];
		} elseif ( isset( $r['authority'] ) ) {
			// Protocol-relative: only the scheme is inherited
			$t['scheme'] = $b['scheme'];
			$t['authority'] = $r['authority'];
			$t['path'] = self::removeDotSegments( $r['path'] );
			$t['query'] = $r['query'];
		} elseif ( $r['path'] === '' ) {
			// Query- or fragment-only reference: keep the base path verbatim,
			// and the base query unless the reference supplies its own
			$t['scheme'] = $b['scheme'];
			$t['authority'] = $b['authority'];
			$t['path'] = $b['path'];
			$t['query'] = $r['query'] ?? $b['query'];
		} else {
			$t['scheme'] = $b['scheme'];
			$t['authority'] = $b['authority'];
			if ( $r['path'][0] === '/' ) {
				$path = $r['path'];
			} elseif ( isset( $b['authority'] ) && $b['path'] === '' ) {
				// merge(), see RFC 3986 §5.2.3: base has an authority but no path
				$path = '/' . $r['path'];
			} else {
				// merge(), see RFC 3986 §5.2.3: replace everything after the
				// last "/" of the base path (or the whole base path if none)
				$i = strrpos( $b['path'], '/' );
				$path = $i === false
					? $r['path']
					: substr( $b['path'], 0, $i + 1 ) . $r['path'];
			}
			$t['path'] = self::removeDotSegments( $path );
			$t['query'] = $r['query'];
		}
		$t['fragment'] = $r['fragment'];

		return self::assembleUrl( $t );
	}

	/**
	 * Check whether a given URL has a domain that occurs in a given set of domains
	 *
	 * Matching is done on whole domain labels: "wikipedia.org" matches
	 * "wikipedia.org" and "en.wikipedia.org", but not "notwikipedia.org".
	 * Any userinfo ("user:pass@") and port (":8080") in the URL are ignored.
	 * The comparison is case-sensitive. URLs without an authority never match.
	 *
	 * @param string $url
	 * @param array $domains Array of domains (strings)
	 * @return bool True if the host part of $url ends in one of the strings in $domains
	 */
	public static function matchesDomainList( string $url, array $domains ): bool {
		$bits = self::parseUrl( $url );
		if ( !isset( $bits['authority'] ) ) {
			return false;
		}

		// Prefix both sides with "." so that only whole labels can match
		$host = '.' . self::hostFromAuthority( $bits['authority'] );
		foreach ( $domains as $domain ) {
			$domain = '.' . $domain;
			if ( substr( $host, -strlen( $domain ) ) === $domain ) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Extract the host from an authority component ("userinfo@host:port")
	 *
	 * No validation is performed; a bracketed IP literal such as "[::1]" is
	 * returned with its brackets.
	 *
	 * @see https://tools.ietf.org/html/rfc3986#section-3.2
	 * @param string $authority Authority, as output from self::parseUrl
	 * @return string Host part of the authority
	 */
	private static function hostFromAuthority( string $authority ): string {
		// Userinfo, if any, ends at the last "@"
		$at = strrpos( $authority, '@' );
		$hostPort = $at === false ? $authority : substr( $authority, $at + 1 );

		// IP literals contain ":" themselves, so take everything up to "]"
		if ( $hostPort !== '' && $hostPort[0] === '[' ) {
			$end = strpos( $hostPort, ']' );
			if ( $end !== false ) {
				return substr( $hostPort, 0, $end + 1 );
			}
		}

		// Otherwise the port, if any, starts at the last ":"
		$colon = strrpos( $hostPort, ':' );
		return $colon === false ? $hostPort : substr( $hostPort, 0, $colon );
	}
}