MediaWiki master
LinkFilter.php
Go to the documentation of this file.
1<?php
8
13use Wikimedia\IPUtils;
18
/**
 * Check whether the text of a content object contains a link matching a filter entry.
 *
 * @param Content $content Content to search; anything that is not a TextContent never matches
 * @param string $filterEntry Domain part of the filter, optionally starting with "*."
 * @param string $protocol Protocol prefix the link must start with
 * @return int|false Result of preg_match() on the content text; always 0 for non-text content
 */
public static function matchEntry( Content $content, $filterEntry, $protocol = 'http://' ) {
	if ( $content instanceof TextContent ) {
		$haystack = $content->getText();
		$pattern = self::makeRegex( $filterEntry, $protocol );

		return preg_match( $pattern, $haystack );
	}

	// TODO: handle other types of content too.
	// Maybe create ContentHandler::matchFilter( LinkFilter ).
	// Think about a common base class for LinkFilter and MagicWord.
	return 0;
}
52
/**
 * Build a case-insensitive regular expression that matches the given filter entry.
 *
 * The pattern uses "!" as its delimiter. A leading "*." on the filter entry is
 * turned into an optional run of sub-domain labels; everything else is matched
 * literally.
 *
 * @param string $filterEntry Domain part of the filter, optionally starting with "*."
 * @param string $protocol Protocol prefix, matched literally
 * @return string Regular expression, including delimiters and modifiers
 */
private static function makeRegex( $filterEntry, $protocol ) {
	$subdomainPattern = '';
	if ( str_starts_with( $filterEntry, '*.' ) ) {
		// Zero or more sub-domain labels in front of the literal domain.
		$subdomainPattern = '(?:[A-Za-z0-9.-]+\.|)';
		$filterEntry = substr( $filterEntry, 2 );
	}

	return '!' . preg_quote( $protocol, '!' )
		. $subdomainPattern
		. preg_quote( $filterEntry, '!' )
		. '!Si';
}
71
/**
 * Canonicalize a hostname or IP literal and convert it to its index form.
 *
 * With $reverse enabled, IPv6 literals become "V6.<groups joined by dots>.",
 * IPv4 addresses become "V4.<octets as integers>." and host names get their
 * labels reversed with a trailing dot. With $reverse disabled, the
 * canonicalized host (or sanitized bracketed IPv6 literal) is returned as-is.
 * A trailing "*" component is kept as a wildcard marker in the reversed forms.
 *
 * @param string $host Hostname, IPv4 address or bracketed IPv6 literal, possibly percent-encoded
 * @param bool $reverse Whether to produce the reversed/prefixed index form
 * @return string
 */
private static function indexifyHost( $host, $reverse = true ) {
	// Canonicalize: undo percent-encoding, then let idn_to_utf8() convert the
	// host when it can; keep the decoded host if the conversion fails.
	$host = rawurldecode( $host );
	if ( $host !== '' ) {
		$converted = idn_to_utf8( $host );
		$host = $converted !== false ? $converted : $host;
	}

	// Percent-encode every run of characters outside the allowed set.
	$allowed = 'a-zA-Z0-9\\-._~!$&\'()*+,;=';
	if ( StringUtils::isUtf8( $host ) ) {
		// Save a little space by not percent-encoding valid UTF-8 bytes
		$allowed .= '\x80-\xf4';
	}
	$host = preg_replace_callback(
		'<[^' . $allowed . ']+>',
		static function ( $run ) {
			return rawurlencode( $run[0] );
		},
		strtolower( $host )
	);

	// IPv6? RFC 3986 syntax.
	if ( preg_match( '/^\[([0-9a-f:*]+)\]$/', rawurldecode( $host ), $ipv6Match ) ) {
		$ip = $ipv6Match[1];

		if ( IPUtils::isValid( $ip ) ) {
			$clean = IPUtils::sanitizeIP( $ip );
			return $reverse
				? 'V6.' . str_replace( ':', '.', $clean ) . '.'
				: '[' . $clean . ']';
		}

		if ( str_ends_with( $ip, ':*' ) ) {
			$prefix = substr( $ip, 0, -2 );

			if ( IPUtils::isValid( "{$prefix}::" ) ) {
				// Wildcard IP doesn't contain "::", so multiple parts can be wild
				$clean = IPUtils::sanitizeIP( "{$prefix}::" );
				if ( !$reverse ) {
					return '[' . $clean . ']';
				}
				// Keep only as many groups as the wildcard pattern has colons.
				$fixedGroups = array_slice( explode( ':', $clean ), 0, substr_count( $ip, ':' ) );
				return 'V6.' . implode( '.', $fixedGroups ) . '.*.';
			}

			if ( IPUtils::isValid( "{$prefix}:1" ) ) {
				// Wildcard IP does contain "::", so only the last part is wild
				$clean = IPUtils::sanitizeIP( "{$prefix}:1" );
				if ( !$reverse ) {
					return '[' . $clean . ']';
				}
				// Drop the last character of the dotted form and put the wildcard in its place.
				return 'V6.' . substr( str_replace( ':', '.', $clean ), 0, -1 ) . '*.';
			}
		}
	}

	// Regularize explicit specification of the DNS root.
	// Browsers seem to do this for IPv4 literals too.
	if ( str_ends_with( $host, '.' ) ) {
		$host = substr( $host, 0, -1 );
	}

	// IPv4? Either four octets, or one to three octets followed by a "*" wildcard.
	$octet = '(?:0*25[0-5]|0*2[0-4][0-9]|0*1[0-9][0-9]|0*[0-9]?[0-9])';
	if ( preg_match( "/^(?:{$octet}\.){3}{$octet}$|^(?:{$octet}\.){1,3}\*$/", $host ) ) {
		if ( !$reverse ) {
			return $host;
		}
		// The integer cast strips leading zeros from each octet; the wildcard is kept.
		$parts = array_map(
			static fn ( $part ) => $part === '*' ? $part : (int)$part,
			explode( '.', $host )
		);
		return 'V4.' . implode( '.', $parts ) . '.';
	}

	// Must be a host name.
	return $reverse
		? implode( '.', array_reverse( explode( '.', $host ) ) ) . '.'
		: $host;
}
156
/**
 * Convert a URL into its index representation: a domain index and a path index.
 *
 * @param string $url URL to index
 * @param bool $reverseDomain Passed on to the host indexing; also controls whether
 *  mailto addresses are emitted as "domain@localpart" or "localpart@domain"
 * @return array[] Empty if the URL cannot be parsed, otherwise a list holding one
 *  [ domain index, path index ] pair
 */
public static function makeIndexes( $url, $reverseDomain = true ) {
	// NOTE: refreshExternallinksIndex.php assumes that only protocol-relative URLs return more
	// than one index, and that the indexes for protocol-relative URLs only vary in the "http://"
	// versus "https://" prefix. If you change that, you'll likely need to update
	// refreshExternallinksIndex.php accordingly.

	$bits = MediaWikiServices::getInstance()->getUrlUtils()->parse( $url );
	if ( !$bits ) {
		return [];
	}
	$scheme = $bits['scheme'];

	// URI RFC identifies the email/server part of mailto or news protocol as 'path',
	// while we want to match the email's domain or news server the same way we are
	// matching hosts for other URLs.
	if ( in_array( $scheme, [ 'mailto', 'news' ] ) ) {
		// (T347574) Copy the path into the host only when a path key is present.
		if ( array_key_exists( 'path', $bits ) ) {
			$bits['host'] = $bits['path'];
		}
		$bits['path'] = '';
	}

	// Reverse the labels in the hostname, convert to lower case, unless it's an IP.
	// For emails turn it into "domain.reversed@localpart"
	if ( $scheme == 'mailto' ) {
		$mailparts = explode( '@', $bits['host'], 2 );
		// No @ means a local part with no domain, hence an empty domain part.
		$domainpart = count( $mailparts ) === 2
			? self::indexifyHost( $mailparts[1], $reverseDomain )
			: '';
		$host = $reverseDomain
			? $domainpart . '@' . $mailparts[0]
			: $mailparts[0] . '@' . $domainpart;
	} else {
		$host = self::indexifyHost( $bits['host'], $reverseDomain );
	}

	// Reconstruct the pseudo-URL.
	// Leave out user and password. Add the port, path, query and fragment.
	$domainIndex = $scheme . $bits['delimiter'] . $host;
	if ( isset( $bits['port'] ) ) {
		$domainIndex .= ':' . $bits['port'];
	}

	$pathIndex = $bits['path'] ?? '/';
	if ( isset( $bits['query'] ) ) {
		$pathIndex .= '?' . $bits['query'];
	}
	if ( isset( $bits['fragment'] ) ) {
		$pathIndex .= '#' . $bits['fragment'];
	}

	// URLs with an empty scheme are indexed under "https:".
	if ( $scheme == '' ) {
		return [ [ "https:$domainIndex", $pathIndex ] ];
	}

	return [ [ $domainIndex, $pathIndex ] ];
}
227
/**
 * Convert a list of URLs to their non-reversed index form.
 *
 * Each URL is indexed with makeIndexes() with domain reversal disabled, and the
 * domain and path parts of every resulting index are concatenated. URLs that
 * yield no index contribute nothing to the result.
 *
 * @param string[] $urls
 * @return string[]
 */
public static function getIndexedUrlsNonReversed( $urls ) {
	$newLinks = [];
	foreach ( $urls as $url ) {
		foreach ( self::makeIndexes( $url, false ) as [ $domainIndex, $pathIndex ] ) {
			$newLinks[] = $domainIndex . $pathIndex;
		}
	}

	return $newLinks;
}
247
/**
 * Turn a domain index back into a readable "scheme + delimiter + host[:port]" string.
 *
 * @param string $domainIndex Domain index to convert
 * @return string Empty string if the index cannot be parsed
 */
public static function reverseIndexes( string $domainIndex ): string {
	$bits = MediaWikiServices::getInstance()->getUrlUtils()->parse( $domainIndex );
	if ( !$bits ) {
		return '';
	}

	if ( $bits['scheme'] == 'mailto' ) {
		// The index stores emails as "domain.reversed@localpart"; swap them back.
		$mailparts = explode( '@', $bits['path'], 2 );
		if ( count( $mailparts ) === 2 ) {
			$host = $mailparts[1] . '@' . rtrim( self::reverseDomain( $mailparts[0] ), '.' );
		} else {
			// No @, assume it's a local part with no domain
			$host = $mailparts[0];
		}
	} else {
		$host = rtrim( self::reverseDomain( $bits['host'] ), '.' );
	}

	$readable = $bits['scheme'] . $bits['delimiter'] . $host;
	// Only append a port that is set and non-empty.
	if ( !empty( $bits['port'] ) ) {
		$readable .= ':' . $bits['port'];
	}

	return $readable;
}
275
/**
 * Convert the host part of a domain index back to its normal form.
 *
 * "V6."- and "V4."-prefixed values are turned back into a bracketed IPv6
 * literal or a dotted IPv4 address when the result is a valid IP. Anything
 * else is passed through indexifyHost().
 *
 * @param string $domain Host part of a domain index
 * @return string
 */
private static function reverseDomain( string $domain ): string {
	$tag = substr( $domain, 0, 3 );

	if ( $tag === 'V6.' || $tag === 'V4.' ) {
		$isV6 = $tag === 'V6.';
		// Drop the prefix and any surrounding dots.
		$address = trim( substr( $domain, 3 ), '.' );
		if ( $isV6 ) {
			// IPv6 groups are stored dot-separated in the index.
			$address = str_replace( '.', ':', $address );
		}
		if ( IPUtils::isValid( $address ) ) {
			return $isV6 ? '[' . $address . ']' : $address;
		}
	}

	return self::indexifyHost( $domain );
}
290
/**
 * Build query conditions matching externallinks rows for a filter entry.
 *
 * @param string $filterEntry Filter entry, as accepted by makeLikeArray()
 * @param array $options Supported keys:
 *  - protocol: string, array of strings, or null; defaults to both 'http://' and 'https://'
 *  - oneWildcard: bool; when true and the first path component is not '/', the
 *    domain index is compared with '=' instead of a prefix LIKE
 *  - db: database handle used to build the expressions; defaults to a replica connection
 * @return array|false False if no protocol yields a usable LIKE array, otherwise
 *  [ domain index condition, path condition ]
 */
public static function getQueryConditions( $filterEntry, array $options = [] ) {
	$options += [
		'protocol' => [ 'http://', 'https://' ],
		'oneWildcard' => false,
		'db' => null,
	];
	$domainGaps = MediaWikiServices::getInstance()->getMainConfig()->get(
		MainConfigNames::ExternalLinksDomainGaps
	);

	$protocols = $options['protocol'];
	if ( is_string( $protocols ) ) {
		$protocols = [ $protocols ];
	} elseif ( $protocols === null ) {
		$protocols = [ 'http://', 'https://' ];
	}

	// Reduce a LIKE array to the literal text in front of its first wildcard.
	$literalPrefix = static function ( $likeParts ) {
		$kept = self::keepOneWildcard( $likeParts );
		if ( $kept[count( $kept ) - 1] instanceof LikeMatch ) {
			array_pop( $kept );
		}
		return implode( '', $kept );
	};

	$domainConditions = [];
	$db = $options['db'] ?: MediaWikiServices::getInstance()->getConnectionProvider()->getReplicaDatabase();
	foreach ( $protocols as $protocol ) {
		$like = self::makeLikeArray( $filterEntry, $protocol );
		if ( $like === false ) {
			continue;
		}
		[ $likeDomain, $likePath ] = $like;
		$index1 = $literalPrefix( $likeDomain );

		$exactDomain = $options['oneWildcard'] && $likePath[0] != '/';
		$thisDomainExpr = $exactDomain
			? $db->expr( 'el_to_domain_index', '=', $index1 )
			: $db->expr(
				'el_to_domain_index',
				IExpression::LIKE,
				new LikeValue( $index1, $db->anyString() )
			);

		// Exclude the configured el_id ranges for this domain index.
		foreach ( $domainGaps[$index1] ?? [] as $from => $to ) {
			$outsideGap = $db->expr( 'el_id', '<', $from )->or( 'el_id', '>', $to );
			$thisDomainExpr = $thisDomainExpr->andExpr( $outsideGap );
		}
		$domainConditions[] = $thisDomainExpr;
	}

	if ( !$domainConditions ) {
		return false;
	}

	// The path condition is built from the path of the last protocol that produced a LIKE array.
	// @phan-suppress-next-line PhanPossiblyUndeclaredVariable
	$index2 = $literalPrefix( $likePath );

	return [
		$db->orExpr( $domainConditions ),
		$db->expr( 'el_to_path', IExpression::LIKE, new LikeValue( $index2, $db->anyString() ) ),
	];
}
376
/**
 * Expand a protocol name to the first configured URL protocol that starts with it.
 *
 * @param string|null $protocol Protocol to look up
 * @return string|null Null if $protocol is empty or is itself one of the configured
 *  URL protocols; otherwise the first configured protocol beginning with $protocol,
 *  or $protocol unchanged when none does
 */
public static function getProtocolPrefix( ?string $protocol ): ?string {
	$urlProtocols = MediaWikiServices::getInstance()->getMainConfig()
		->get( MainConfigNames::UrlProtocols );

	if ( !$protocol || in_array( $protocol, $urlProtocols ) ) {
		return null;
	}

	// Find the right prefix
	foreach ( $urlProtocols as $candidate ) {
		if ( str_starts_with( $candidate, $protocol ) ) {
			return $candidate;
		}
	}

	return $protocol;
}
394
/**
 * List the configured URL protocols without their delimiters.
 *
 * The list starts with an empty string. The '//' entry is skipped, and every
 * other configured protocol is cut off before its first colon.
 *
 * @return string[]
 */
public static function prepareProtocols(): array {
	$configured = MediaWikiServices::getInstance()->getMainConfig()
		->get( MainConfigNames::UrlProtocols );

	$protocols = [ '' ];
	foreach ( $configured as $entry ) {
		if ( $entry === '//' ) {
			continue;
		}
		$colonPos = strpos( $entry, ':' );
		$protocols[] = substr( $entry, 0, $colonPos );
	}

	return $protocols;
}
407
/**
 * Build LIKE-pattern arrays for the domain index and the path of a filter entry.
 *
 * @param string $filterEntry Filter entry; the host may start with a wildcard
 * @param string $protocol Protocol prefix prepended to the filter entry before parsing
 * @return array|false False if the resulting URL cannot be parsed or contains a stray
 *  asterisk; otherwise [ domain LIKE array, path LIKE array ], where each array mixes
 *  literal strings with the database's any-string wildcard
 */
public static function makeLikeArray( $filterEntry, $protocol = 'http://' ) {
	$services = MediaWikiServices::getInstance();
	$db = $services->getConnectionProvider()->getReplicaDatabase();

	$bits = $services->getUrlUtils()->parse( $protocol . $filterEntry );
	if ( !$bits ) {
		return false;
	}

	// URI RFC identifies the email/server part of mailto or news protocol as 'path',
	// while we want to match the email's domain or news server the same way we are
	// matching hosts for other URLs.
	if ( in_array( $bits['scheme'], [ 'mailto', 'news' ] ) ) {
		// (T364743) Copy the path into the host only when a path key is present.
		if ( array_key_exists( 'path', $bits ) ) {
			$bits['host'] = $bits['path'];
		}
		$bits['path'] = '';
	}

	$subdomains = false;
	// strpos() is used for truthiness on purpose: an "@" at position 0 (empty local
	// part) or no "@" at all both fall through to the plain-host branch.
	if ( $bits['scheme'] === 'mailto' && strpos( $bits['host'], '@' ) ) {
		// Email address with domain and non-empty local part
		[ $localPart, $mailDomain ] = explode( '@', $bits['host'], 2 );
		$host = self::indexifyHost( $mailDomain ) . '@';
		if ( $localPart === '*' ) {
			// Wildcard local part: match anything after the "@".
			$subdomains = true;
		} else {
			$host .= $localPart;
		}
	} else {
		// Non-email, or email with only a domain part.
		$host = self::indexifyHost( $bits['host'] );
		if ( substr( $host, -3 ) === '.*.' ) {
			// Strip the "*." marker, keeping the dot that precedes it.
			$subdomains = true;
			$host = substr( $host, 0, -2 );
		}
	}

	$likeDomain = [ $bits['scheme'] . $bits['delimiter'] . $host ];
	if ( $subdomains ) {
		$likeDomain[] = $db->anyString();
	}
	if ( isset( $bits['port'] ) ) {
		$likeDomain[] = ':' . $bits['port'];
	}

	$likePath = [ $bits['path'] ?? '/' ];
	if ( isset( $bits['query'] ) ) {
		$likePath[] = '?' . $bits['query'];
	}
	if ( isset( $bits['fragment'] ) ) {
		$likePath[] = '#' . $bits['fragment'];
	}
	$likePath[] = $db->anyString();

	// Check for stray asterisks: asterisk only allowed at the start of the domain
	foreach ( array_merge( $likeDomain, $likePath ) as $likepart ) {
		if ( $likepart instanceof LikeMatch ) {
			continue;
		}
		if ( str_contains( $likepart, '*' ) ) {
			return false;
		}
	}

	return [ $likeDomain, $likePath ];
}
494
/**
 * Truncate a LIKE array right after its first wildcard.
 *
 * @param mixed $arr LIKE array; any non-array value is returned unchanged
 * @return mixed The elements up to and including the first LikeMatch, or the
 *  input itself when it holds no LikeMatch
 */
public static function keepOneWildcard( $arr ) {
	if ( is_array( $arr ) ) {
		foreach ( $arr as $position => $part ) {
			if ( $part instanceof LikeMatch ) {
				return array_slice( $arr, 0, $position + 1 );
			}
		}
	}

	return $arr;
}
516}
if(!defined('MW_SETUP_CALLBACK'))
Definition WebStart.php:68
Content object implementation for representing flat text.
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
Used by Database::buildLike() to represent characters that have special meaning in SQL LIKE clauses a...
Definition LikeMatch.php:10
Content of like value.
Definition LikeValue.php:14
A collection of static methods to play with strings.
Content objects represent page content, e.g.
Definition Content.php:28