Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
93.28% |
111 / 119 |
|
80.00% |
4 / 5 |
CRAP | |
0.00% |
0 / 1 |
UrlUtils | |
93.28% |
111 / 119 |
|
80.00% |
4 / 5 |
42.54 | |
0.00% |
0 / 1 |
parseUrl | |
100.00% |
27 / 27 |
|
100.00% |
1 / 1 |
6 | |||
assembleUrl | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
6 | |||
removeDotSegments | |
100.00% |
43 / 43 |
|
100.00% |
1 / 1 |
18 | |||
expandUrl | |
100.00% |
29 / 29 |
|
100.00% |
1 / 1 |
8 | |||
matchesDomainList | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
20 |
1 | <?php |
2 | |
3 | declare( strict_types = 1 ); |
4 | |
5 | namespace Wikimedia\Parsoid\Utils; |
6 | |
/**
 * Utilities for manipulating URLs
 *
 * All methods are static and operate purely on strings; none of them perform
 * percent-encoding or percent-decoding.
 *
 * @see https://tools.ietf.org/html/rfc3986
 */
class UrlUtils {

	/**
	 * Parse a possibly-relative URL into components
	 *
	 * Note no percent-decoding is performed, and only minimal syntax validation.
	 *
	 * @param string $url
	 * @return (string|null)[]
	 *  - 'scheme': Scheme of the url, if any.
	 *  - 'authority': Authority part of the url, if any.
	 *    This is the part in between the "//" and the path. For http, this is the "user@host:port".
	 *  - 'path': Path part of the URL. Never null, but may be the empty string.
	 *  - 'query': Query part of the URL, if any.
	 *  - 'fragment': Fragment part of the URL, if any.
	 */
	public static function parseUrl( string $url ): array {
		$scheme = null;
		$authority = null;

		// Scheme: a letter followed by letters, digits, "+", "." or "-", then ":".
		if ( preg_match( '!^([a-z][a-z0-9+.-]*):!i', $url, $m ) ) {
			$scheme = $m[1];
			$url = substr( $url, strlen( $m[0] ) );
		}

		// Fragment: everything after the first "#". It must be split off
		// before the query, since a "?" inside a fragment is not a query delimiter.
		$pieces = explode( '#', $url, 2 );
		$url = $pieces[0];
		$fragment = $pieces[1] ?? null;

		// Query: everything after the first "?" in what remains.
		$pieces = explode( '?', $url, 2 );
		$url = $pieces[0];
		$query = $pieces[1] ?? null;

		// What is left is "//authority/path" or just a path.
		$path = $url;
		if ( strncmp( $url, '//', 2 ) === 0 ) {
			$slash = strpos( $url, '/', 2 );
			if ( $slash === false ) {
				$authority = substr( $url, 2 );
				$path = '';
			} else {
				$authority = substr( $url, 2, $slash - 2 );
				$path = substr( $url, $slash );
			}
		}

		return [
			'scheme' => $scheme,
			'authority' => $authority,
			'path' => $path,
			'query' => $query,
			'fragment' => $fragment,
		];
	}

	/**
	 * This function will reassemble a URL parsed with self::parseUrl().
	 *
	 * Note no percent-encoding or syntax validation is performed. Components
	 * that are missing or null are omitted together with their delimiter.
	 *
	 * @param array $urlParts URL parts, as output from self::parseUrl
	 * @return string URL assembled from its component parts
	 */
	public static function assembleUrl( array $urlParts ): string {
		$scheme = isset( $urlParts['scheme'] ) ? $urlParts['scheme'] . ':' : '';
		$authority = isset( $urlParts['authority'] ) ? '//' . $urlParts['authority'] : '';
		$path = $urlParts['path'] ?? '';
		$query = isset( $urlParts['query'] ) ? '?' . $urlParts['query'] : '';
		$fragment = isset( $urlParts['fragment'] ) ? '#' . $urlParts['fragment'] : '';

		return $scheme . $authority . $path . $query . $fragment;
	}

	/**
	 * Remove all dot-segments in the provided URL path. For example,
	 * '/a/./b/../c/' becomes '/a/c/'.
	 *
	 * The steps labelled A to E below correspond to steps 2A to 2E of the
	 * algorithm in RFC 3986 section 5.2.4. Rather than repeatedly shortening
	 * an input buffer, the input is scanned with a moving offset.
	 *
	 * @see https://tools.ietf.org/html/rfc3986#section-5.2.4
	 * @note Copied from MediaWiki's UrlUtils::removeDotSegments()
	 * @param string $urlPath URL path, potentially containing dot-segments
	 * @return string URL path with all dot-segments removed
	 */
	public static function removeDotSegments( string $urlPath ): string {
		$output = '';
		$inputOffset = 0;
		$inputLength = strlen( $urlPath );

		while ( $inputOffset < $inputLength ) {
			$prefixLengthOne = substr( $urlPath, $inputOffset, 1 );
			$prefixLengthTwo = substr( $urlPath, $inputOffset, 2 );
			$prefixLengthThree = substr( $urlPath, $inputOffset, 3 );
			$prefixLengthFour = substr( $urlPath, $inputOffset, 4 );
			$remaining = $inputLength - $inputOffset;
			$trimOutput = false;

			if ( $prefixLengthTwo === './' ) {
				// Step A, remove leading "./"
				$inputOffset += 2;
			} elseif ( $prefixLengthThree === '../' ) {
				// Step A, remove leading "../"
				$inputOffset += 3;
			} elseif ( $prefixLengthTwo === '/.' && $remaining === 2 ) {
				// Step B, replace leading "/.$" with "/": overwrite the "."
				// with "/" so the next iteration copies it to the output
				$inputOffset += 1;
				$urlPath[$inputOffset] = '/';
			} elseif ( $prefixLengthThree === '/./' ) {
				// Step B, replace leading "/./" with "/"
				$inputOffset += 2;
			} elseif ( $prefixLengthThree === '/..' && $remaining === 3 ) {
				// Step C, replace leading "/..$" with "/" and
				// remove last path component in output
				$inputOffset += 2;
				$urlPath[$inputOffset] = '/';
				$trimOutput = true;
			} elseif ( $prefixLengthFour === '/../' ) {
				// Step C, replace leading "/../" with "/" and
				// remove last path component in output
				$inputOffset += 3;
				$trimOutput = true;
			} elseif ( $prefixLengthOne === '.' && $remaining === 1 ) {
				// Step D, remove "^.$"
				$inputOffset += 1;
			} elseif ( $prefixLengthTwo === '..' && $remaining === 2 ) {
				// Step D, remove "^..$"
				$inputOffset += 2;
			} else {
				// Step E, move leading path segment (including its initial
				// "/", if any, up to but excluding the next "/") to output
				$searchFrom = $prefixLengthOne === '/' ? $inputOffset + 1 : $inputOffset;
				$slashPos = strpos( $urlPath, '/', $searchFrom );
				if ( $slashPos === false ) {
					$output .= substr( $urlPath, $inputOffset );
					$inputOffset = $inputLength;
				} else {
					$output .= substr( $urlPath, $inputOffset, $slashPos - $inputOffset );
					$inputOffset = $slashPos;
				}
			}

			if ( $trimOutput ) {
				// Drop the last segment of the output, including its leading "/"
				$slashPos = strrpos( $output, '/' );
				$output = $slashPos === false ? '' : substr( $output, 0, $slashPos );
			}
		}

		return $output;
	}

	/**
	 * Expand a relative URL using a base URL
	 *
	 * Implements the "Transform References" algorithm of RFC 3986: the most
	 * significant component present in $url decides how much is inherited
	 * from $base. The fragment always comes from $url.
	 *
	 * @see https://tools.ietf.org/html/rfc3986#section-5.2.2
	 * @param string $url Relative URL to expand
	 * @param string $base Base URL to expand relative to
	 * @return string Expanded URL
	 */
	public static function expandUrl( string $url, string $base ): string {
		$b = self::parseUrl( $base );
		$r = self::parseUrl( $url );

		if ( $r['scheme'] !== null ) {
			// $url is absolute: nothing is taken from the base
			$t = [
				'scheme' => $r['scheme'],
				'authority' => $r['authority'],
				'path' => self::removeDotSegments( $r['path'] ),
				'query' => $r['query'],
			];
		} elseif ( $r['authority'] !== null ) {
			// Protocol-relative ("//host/path"): only the scheme is inherited
			$t = [
				'scheme' => $b['scheme'],
				'authority' => $r['authority'],
				'path' => self::removeDotSegments( $r['path'] ),
				'query' => $r['query'],
			];
		} elseif ( $r['path'] === '' ) {
			// Query-only, fragment-only or empty reference: keep the base
			// path as is, and the base query unless $url supplies its own
			$t = [
				'scheme' => $b['scheme'],
				'authority' => $b['authority'],
				'path' => $b['path'],
				'query' => $r['query'] ?? $b['query'],
			];
		} else {
			// Absolute-path references replace the base path; anything else
			// is resolved against the directory of the base path
			$path = $r['path'][0] === '/' ? $r['path'] : self::mergePaths( $b, $r['path'] );
			$t = [
				'scheme' => $b['scheme'],
				'authority' => $b['authority'],
				'path' => self::removeDotSegments( $path ),
				'query' => $r['query'],
			];
		}
		$t['fragment'] = $r['fragment'];

		return self::assembleUrl( $t );
	}

	/**
	 * Merge a relative-path reference with the path of a parsed base URL.
	 *
	 * @see https://tools.ietf.org/html/rfc3986#section-5.2.3
	 * @param array $base Base URL parts, as output from self::parseUrl
	 * @param string $relPath Non-empty path that does not start with "/"
	 * @return string Merged path; dot-segments are not removed here
	 */
	private static function mergePaths( array $base, string $relPath ): string {
		if ( isset( $base['authority'] ) && $base['path'] === '' ) {
			return '/' . $relPath;
		}
		$lastSlash = strrpos( $base['path'], '/' );
		if ( $lastSlash === false ) {
			return $relPath;
		}
		// Keep the base path up to and including its last "/"
		return substr( $base['path'], 0, $lastSlash + 1 ) . $relPath;
	}

	/**
	 * Check whether a given URL has a domain that occurs in a given set of domains
	 *
	 * Only the host is compared: userinfo ("user:password@") and a trailing
	 * port (":8080") in the URL's authority are ignored. The comparison is
	 * case-insensitive and only matches on label boundaries, so
	 * "www.example.com" matches "example.com" but "notexample.com" does not.
	 * URLs without an authority part never match.
	 *
	 * @param string $url
	 * @param array $domains Array of domains (strings)
	 * @return bool True if the host part of $url ends in one of the strings in $domains
	 */
	public static function matchesDomainList( string $url, array $domains ): bool {
		$bits = self::parseUrl( $url );
		if ( !isset( $bits['authority'] ) ) {
			return false;
		}

		// The leading "." on both sides anchors the suffix match to a label
		// boundary and lets an exact host match succeed too
		$host = '.' . self::hostFromAuthority( $bits['authority'] );
		foreach ( $domains as $domain ) {
			$suffix = strtolower( '.' . $domain );
			if ( substr( $host, -strlen( $suffix ) ) === $suffix ) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Extract the lower-cased host from an authority component.
	 *
	 * @see https://tools.ietf.org/html/rfc3986#section-3.2
	 * @param string $authority Authority, e.g. "user@Host.example:8080"
	 * @return string Host without userinfo and port, e.g. "host.example"
	 */
	private static function hostFromAuthority( string $authority ): string {
		// Userinfo ends at the last "@"
		$at = strrpos( $authority, '@' );
		$host = $at === false ? $authority : substr( $authority, $at + 1 );

		// A port is a trailing ":" followed by digits only. Bracketed IPv6
		// literals end in "]", so the colons inside them are left alone.
		$host = preg_replace( '/:[0-9]*\z/', '', $host ) ?? $host;

		return strtolower( $host );
	}
}