MediaWiki master
LinkFilter.php
Go to the documentation of this file.
1<?php
22
use Content;
use MediaWiki\MainConfigNames;
use MediaWiki\MediaWikiServices;
use StringUtils;
use TextContent;
use Wikimedia\IPUtils;
use Wikimedia\Rdbms\IExpression;
use Wikimedia\Rdbms\LikeMatch;
use Wikimedia\Rdbms\LikeValue;
use Wikimedia\Rdbms\OrExpressionGroup;
33
/**
 * Check whether the text of a content object matches a link filter entry.
 *
 * Only TextContent is inspected; every other content type reports "no match".
 *
 * @param Content $content Content whose text is searched
 * @param string $filterEntry Filter entry; a leading "*." also matches subdomains
 * @param string $protocol Protocol prefix the link must start with
 * @return int|false Result of preg_match() on the text, or 0 for non-text content
 */
public static function matchEntry( Content $content, $filterEntry, $protocol = 'http://' ) {
	if ( $content instanceof TextContent ) {
		$haystack = $content->getText();
		$pattern = self::makeRegex( $filterEntry, $protocol );

		return preg_match( $pattern, $haystack );
	}

	// TODO: handle other types of content too.
	// Maybe create ContentHandler::matchFilter( LinkFilter ).
	// Think about a common base class for LinkFilter and MagicWord.
	return 0;
}
67
/**
 * Build the regular expression used by matchEntry().
 *
 * The pattern is delimited by "!" and carries the "S" and "i" modifiers.
 * Both the protocol and the filter entry are matched literally; a leading
 * "*." in the entry is replaced by an optional run of subdomain labels.
 *
 * @param string $filterEntry Filter entry, optionally starting with "*."
 * @param string $protocol Protocol prefix, e.g. 'http://'
 * @return string Complete PCRE pattern including delimiters and modifiers
 */
private static function makeRegex( $filterEntry, $protocol ) {
	$subdomains = '';
	$hostAndPath = $filterEntry;

	if ( str_starts_with( $filterEntry, '*.' ) ) {
		// Any number of labels (or none at all) in front of the listed domain.
		$subdomains = '(?:[A-Za-z0-9.-]+\.|)';
		$hostAndPath = substr( $filterEntry, 2 );
	}

	return '!' . preg_quote( $protocol, '!' )
		. $subdomains
		. preg_quote( $hostAndPath, '!' )
		. '!Si';
}
86
/**
 * Canonicalize a host name or IP literal into the form used in link indexes.
 *
 * The host is percent-decoded, converted with idn_to_utf8() when that succeeds,
 * lower-cased, and has every byte outside the allowed set percent-encoded.
 * After that:
 *  - a bracketed IPv6 literal (optionally ending in ":*") becomes
 *    "V6.<groups separated by dots>." when $reverse is true, or the sanitized
 *    bracketed literal when it is false;
 *  - an IPv4 literal (optionally with trailing "*" parts) becomes
 *    "V4.<parts as integers>." when $reverse is true, or is returned as-is;
 *  - anything else is treated as a host name whose labels are reversed and
 *    terminated by "." when $reverse is true, or returned as-is otherwise.
 *
 * @param string $host Host name, possibly percent-encoded
 * @param bool $reverse Whether to produce the reversed/index form
 * @return string
 */
private static function indexifyHost( $host, $reverse = true ) {
	// Canonicalize.
	$host = rawurldecode( $host );
	if ( $host !== '' ) {
		$unicodeHost = idn_to_utf8( $host );
		if ( $unicodeHost !== false ) {
			$host = $unicodeHost;
		}
	}

	$allowed = 'a-zA-Z0-9\\-._~!$&\'()*+,;=';
	if ( StringUtils::isUtf8( $host ) ) {
		// Save a little space by not percent-encoding valid UTF-8 bytes
		$allowed .= '\x80-\xf4';
	}
	$host = preg_replace_callback(
		'<[^' . $allowed . ']+>',
		static function ( $match ) {
			return rawurlencode( $match[0] );
		},
		strtolower( $host )
	);

	// IPv6? RFC 3986 syntax. The brackets are not in the allowed set, so they
	// were percent-encoded just above; decode again before testing.
	if ( preg_match( '/^\[([0-9a-f:*]+)\]$/', rawurldecode( $host ), $m ) ) {
		$ip = $m[1];

		if ( IPUtils::isValid( $ip ) ) {
			$clean = IPUtils::sanitizeIP( $ip );

			return $reverse
				? 'V6.' . implode( '.', explode( ':', $clean ) ) . '.'
				: '[' . $clean . ']';
		}

		if ( str_ends_with( $ip, ':*' ) ) {
			$prefix = substr( $ip, 0, -2 );

			$withDoubleColon = "{$prefix}::";
			if ( IPUtils::isValid( $withDoubleColon ) ) {
				// Wildcard IP doesn't contain "::", so multiple parts can be wild
				$fixedGroups = count( explode( ':', $ip ) ) - 1;
				$clean = IPUtils::sanitizeIP( $withDoubleColon );

				if ( !$reverse ) {
					return '[' . $clean . ']';
				}
				$groups = array_slice( explode( ':', $clean ), 0, $fixedGroups );

				return 'V6.' . implode( '.', $groups ) . '.*.';
			}

			$withLastGroup = "{$prefix}:1";
			if ( IPUtils::isValid( $withLastGroup ) ) {
				// Wildcard IP does contain "::", so only the last part is wild
				$clean = IPUtils::sanitizeIP( $withLastGroup );

				if ( !$reverse ) {
					return '[' . $clean . ']';
				}
				// Drop the final character of the dotted form and put "*" in its place.
				$dotted = implode( '.', explode( ':', $clean ) );

				return 'V6.' . substr( $dotted, 0, -1 ) . '*.';
			}
		}
	}

	// Regularize explicit specification of the DNS root.
	// Browsers seem to do this for IPv4 literals too.
	if ( str_ends_with( $host, '.' ) ) {
		$host = substr( $host, 0, -1 );
	}

	// IPv4?
	$octet = '(?:0*25[0-5]|0*2[0-4][0-9]|0*1[0-9][0-9]|0*[0-9]?[0-9])';
	if ( preg_match( "/^(?:{$octet}\.){3}{$octet}$|^(?:{$octet}\.){1,3}\*$/", $host ) ) {
		if ( !$reverse ) {
			return $host;
		}
		// Casting to int strips leading zeros; the wildcard part is kept verbatim.
		$parts = array_map(
			static fn ( $part ) => $part === '*' ? $part : (int)$part,
			explode( '.', $host )
		);

		return 'V4.' . implode( '.', $parts ) . '.';
	}

	// Must be a host name.
	return $reverse
		? implode( '.', array_reverse( explode( '.', $host ) ) ) . '.'
		: $host;
}
171
/**
 * Split a URL into the pair of index strings (domain index, path index).
 *
 * The domain index is scheme + delimiter + canonicalized host (+ ":port");
 * the path index is path (+ "?query") (+ "#fragment"). User and password are
 * left out. For mailto: the host part is built from the address, as
 * "domain@localpart" when $reverseDomain is true and "localpart@domain" otherwise.
 * A URL with an empty scheme gets "https:" prepended to its domain index.
 *
 * @param string $url
 * @param bool $reverseDomain Passed through to indexifyHost() as its $reverse flag
 * @return string[][] Empty if the URL cannot be parsed, otherwise a list holding
 *  one [ domainIndex, pathIndex ] pair
 */
public static function makeIndexes( $url, $reverseDomain = true ) {
	// NOTE: refreshExternallinksIndex.php assumes that only protocol-relative URLs return more
	// than one index, and that the indexes for protocol-relative URLs only vary in the "http://"
	// versus "https://" prefix. If you change that, you'll likely need to update
	// refreshExternallinksIndex.php accordingly.

	$bits = MediaWikiServices::getInstance()->getUrlUtils()->parse( $url );
	if ( !$bits ) {
		return [];
	}

	// URI RFC identifies the email/server part of mailto or news protocol as 'path',
	// while we want to match the email's domain or news server the same way we are
	// matching hosts for other URLs.
	if ( in_array( $bits['scheme'], [ 'mailto', 'news' ] ) ) {
		// (T347574) Only set host if it's not already set (if // is used)
		if ( array_key_exists( 'path', $bits ) ) {
			$bits['host'] = $bits['path'];
		}
		$bits['path'] = '';
	}

	// Reverse the labels in the hostname, convert to lower case, unless it's an IP.
	// For emails turn it into "domain.reversed@localpart"
	if ( $bits['scheme'] == 'mailto' ) {
		$mailparts = explode( '@', $bits['host'], 2 );
		$localPart = $mailparts[0];
		// No @ means a local part with no domain.
		$domainPart = count( $mailparts ) === 2
			? self::indexifyHost( $mailparts[1], $reverseDomain )
			: '';
		$bits['host'] = $reverseDomain
			? $domainPart . '@' . $localPart
			: $localPart . '@' . $domainPart;
	} else {
		$bits['host'] = self::indexifyHost( $bits['host'], $reverseDomain );
	}

	// Reconstruct the pseudo-URL, leaving out user and password.
	$domainIndex = $bits['scheme'] . $bits['delimiter'] . $bits['host'];
	if ( isset( $bits['port'] ) ) {
		$domainIndex .= ':' . $bits['port'];
	}

	$pathIndex = $bits['path'] ?? '/';
	foreach ( [ 'query' => '?', 'fragment' => '#' ] as $component => $separator ) {
		if ( isset( $bits[$component] ) ) {
			$pathIndex .= $separator . $bits[$component];
		}
	}

	// An empty scheme is indexed under https.
	if ( $bits['scheme'] == '' ) {
		$domainIndex = "https:$domainIndex";
	}

	return [ [ $domainIndex, $pathIndex ] ];
}
242
/**
 * Turn a list of URLs into their non-reversed index strings.
 *
 * Each URL is run through makeIndexes() with domain reversal disabled, and the
 * domain and path halves of every resulting pair are concatenated. URLs for
 * which makeIndexes() returns nothing contribute no entry.
 *
 * @param string[] $urls
 * @return string[]
 */
public static function getIndexedUrlsNonReversed( $urls ) {
	$flattened = [];
	foreach ( $urls as $url ) {
		foreach ( self::makeIndexes( $url, false ) as [ $domainIndex, $pathIndex ] ) {
			$flattened[] = $domainIndex . $pathIndex;
		}
	}

	return $flattened;
}
262
/**
 * Convert a (reversed) domain index back to its non-reversed form.
 *
 * For mailto: the index path is expected to look like "domain.reversed@localpart"
 * and is turned back into "localpart@domain". For everything else the host is
 * passed through reverseDomain() and the trailing dot removed.
 *
 * @param string $domainIndex Domain index, e.g. as produced by makeIndexes()
 * @return string Scheme + delimiter + host (+ ":port" when a non-empty port is
 *  present), or '' if the index cannot be parsed
 */
public static function reverseIndexes( $domainIndex ) {
	$bits = MediaWikiServices::getInstance()->getUrlUtils()->parse( $domainIndex );
	if ( !$bits ) {
		return '';
	}

	// Reverse the labels in the hostname, convert to lower case, unless it's an IP.
	// For emails turn "domain.reversed@localpart" back into "localpart@domain".
	if ( $bits['scheme'] == 'mailto' ) {
		$mailparts = explode( '@', $bits['path'] ?? '', 2 );
		if ( count( $mailparts ) === 2 ) {
			[ $reversedDomain, $localpart ] = $mailparts;
			$domainpart = rtrim( self::reverseDomain( $reversedDomain ), '.' );
		} else {
			// No @, assume it's a local part with no domain. Only index 0 exists
			// here, so it must not be read from index 1.
			$localpart = $mailparts[0];
			$domainpart = '';
		}
		$bits['host'] = $localpart . '@' . $domainpart;
	} else {
		$bits['host'] = rtrim( self::reverseDomain( $bits['host'] ), '.' );
	}

	$index = $bits['scheme'] . $bits['delimiter'] . $bits['host'];
	if ( isset( $bits['port'] ) && $bits['port'] ) {
		$index .= ':' . $bits['port'];
	}

	return $index;
}
290
/**
 * Undo the host transformation applied when building a domain index.
 *
 * "V6."-prefixed values are turned back into a bracketed IPv6 literal and
 * "V4."-prefixed values into a plain IPv4 literal, provided the result is a
 * valid IP. Everything else (including prefixed values that do not validate)
 * is handed to indexifyHost().
 *
 * @param string $domain Host part of a domain index
 * @return string
 */
private static function reverseDomain( $domain ) {
	$marker = substr( $domain, 0, 3 );
	// Text after the three-character marker, without surrounding dots.
	$literal = trim( substr( $domain, 3 ), '.' );

	if ( $marker === 'V6.' ) {
		$literal = str_replace( '.', ':', $literal );
		if ( IPUtils::isValid( $literal ) ) {
			return '[' . $literal . ']';
		}
	} elseif ( $marker === 'V4.' && IPUtils::isValid( $literal ) ) {
		return $literal;
	}

	return self::indexifyHost( $domain );
}
305
/**
 * Build query conditions on el_to_domain_index / el_to_path matching a filter entry.
 *
 * One domain condition is built per requested protocol (protocols for which
 * makeLikeArray() fails are skipped) and the domain conditions are OR-ed
 * together. The path condition is a prefix match built from the like-array of
 * the last protocol that succeeded.
 *
 * @param string $filterEntry Filter entry, as accepted by makeLikeArray()
 * @param array $options Options are:
 *   - protocol: (string|string[]|null) Protocol(s) to query; null or absent
 *     means [ 'http://', 'https://' ]
 *   - oneWildcard: (bool) When true and the path part does not start with "/",
 *     the domain index is compared with "=" instead of a LIKE prefix match
 *     (default: false)
 *   - db: Database object used to build the expressions; defaults to a replica
 *     connection from the connection provider
 * @return array|false [ domain expression group, path expression ], or false
 *  if no protocol produced a usable condition
 */
public static function getQueryConditions( $filterEntry, array $options = [] ) {
	$options += [
		'protocol' => [ 'http://', 'https://' ],
		'oneWildcard' => false,
		'db' => null,
	];
	$domainGaps = MediaWikiServices::getInstance()->getMainConfig()->get(
		MainConfigNames::ExternalLinksDomainGaps
	);

	if ( is_string( $options['protocol'] ) ) {
		$options['protocol'] = [ $options['protocol'] ];
	} elseif ( $options['protocol'] === null ) {
		$options['protocol'] = [ 'http://', 'https://' ];
	}

	$domainConditions = [];
	$db = $options['db'] ?: MediaWikiServices::getInstance()->getConnectionProvider()->getReplicaDatabase();
	foreach ( $options['protocol'] as $protocol ) {
		$like = self::makeLikeArray( $filterEntry, $protocol );
		if ( $like === false ) {
			continue;
		}
		[ $likeDomain, $likePath ] = $like;
		// Keep everything up to the first wildcard, then drop that wildcard so
		// only the literal prefix remains.
		$trimmedlikeDomain = self::keepOneWildcard( $likeDomain );
		if ( $trimmedlikeDomain[count( $trimmedlikeDomain ) - 1] instanceof LikeMatch ) {
			array_pop( $trimmedlikeDomain );
		}
		$index1 = implode( '', $trimmedlikeDomain );
		if ( $options['oneWildcard'] && $likePath[0] != '/' ) {
			$thisDomainExpr = $db->expr( 'el_to_domain_index', '=', $index1 );
		} else {
			$thisDomainExpr = $db->expr(
				'el_to_domain_index',
				IExpression::LIKE,
				new LikeValue( $index1, $db->anyString() )
			);
		}
		// Exclude every configured [from, to] range of el_id for this domain index.
		foreach ( $domainGaps[$index1] ?? [] as $from => $to ) {
			$thisDomainExpr = $thisDomainExpr->andExpr( $db->expr( 'el_id', '<', $from )->or( 'el_id', '>', $to ) );
		}
		$domainConditions[] = $thisDomainExpr;
	}
	if ( !$domainConditions ) {
		return false;
	}
	// $likePath is necessarily set here: a non-empty $domainConditions means at
	// least one loop iteration got past the makeLikeArray() check.
	// @phan-suppress-next-line PhanPossiblyUndeclaredVariable
	$trimmedlikePath = self::keepOneWildcard( $likePath );
	if ( $trimmedlikePath[count( $trimmedlikePath ) - 1] instanceof LikeMatch ) {
		array_pop( $trimmedlikePath );
	}
	$index2 = implode( '', $trimmedlikePath );

	return [
		new OrExpressionGroup( ...$domainConditions ),
		$db->expr( 'el_to_path', IExpression::LIKE, new LikeValue( $index2, $db->anyString() ) ),
	];
}
391
/**
 * Expand a protocol name to the matching entry of the configured URL protocols.
 *
 * @param string|null $protocol Protocol (or protocol prefix) to look up
 * @return string|null Null when $protocol is empty or is already exactly one of
 *  the configured protocols. Otherwise the first configured protocol that
 *  starts with $protocol, or $protocol unchanged if none does.
 */
public static function getProtocolPrefix( $protocol ) {
	// Find the right prefix
	$urlProtocols = MediaWikiServices::getInstance()->getMainConfig()
		->get( MainConfigNames::UrlProtocols );
	if ( $protocol && !in_array( $protocol, $urlProtocols ) ) {
		foreach ( $urlProtocols as $p ) {
			if ( str_starts_with( $p, $protocol ) ) {
				$protocol = $p;
				break;
			}
		}

		return $protocol;
	} else {
		return null;
	}
}
409
/**
 * List the configured URL protocols reduced to their bare scheme names.
 *
 * @return string[] An empty string, followed by the part before the first ":"
 *  of every configured protocol other than "//"
 */
public static function prepareProtocols() {
	$urlProtocols = MediaWikiServices::getInstance()->getMainConfig()
		->get( MainConfigNames::UrlProtocols );
	$protocols = [ '' ];
	foreach ( $urlProtocols as $p ) {
		// The protocol-relative marker has no scheme name to extract.
		if ( $p !== '//' ) {
			$protocols[] = substr( $p, 0, strpos( $p, ':' ) );
		}
	}

	return $protocols;
}
422
/**
 * Build the LIKE pattern pieces matching a filter entry for one protocol.
 *
 * The entry is parsed as a URL after prepending $protocol, and its host is
 * canonicalized the same way makeIndexes() does it. A wildcard is only
 * accepted at the start of the domain ("*.example.com") or, for mailto:, as
 * the whole local part ("*@example.com"); an asterisk left anywhere else
 * makes the entry invalid.
 *
 * @param string $filterEntry Domain part of the filter, optionally with path,
 *  query and fragment
 * @param string $protocol Protocol prefix (default: 'http://')
 * @return array|false [ $likeDomain, $likePath ], each a list of literal strings
 *  and any-string wildcard tokens obtained from the database object, or false
 *  if the entry cannot be parsed or contains a stray asterisk
 */
public static function makeLikeArray( $filterEntry, $protocol = 'http://' ) {
	$services = MediaWikiServices::getInstance();
	$db = $services->getConnectionProvider()->getReplicaDatabase();
	$likeDomain = [];
	$likePath = [];

	$target = $protocol . $filterEntry;
	$bits = $services->getUrlUtils()->parse( $target );
	if ( !$bits ) {
		return false;
	}

	// URI RFC identifies the email/server part of mailto or news protocol as 'path',
	// while we want to match the email's domain or news server the same way we are
	// matching hosts for other URLs.
	if ( in_array( $bits['scheme'], [ 'mailto', 'news' ] ) ) {
		// Same guard as in makeIndexes() (T347574): only take the host from the
		// path when a path was actually parsed, so an already-set host is kept
		// instead of being overwritten by an undefined array key.
		if ( array_key_exists( 'path', $bits ) ) {
			$bits['host'] = $bits['path'];
		}
		$bits['path'] = '';
	}

	$subdomains = false;
	// strpos() is deliberately tested for truthiness: position 0 (empty local
	// part) and false (no "@") both fall through to the else branch.
	if ( $bits['scheme'] === 'mailto' && strpos( $bits['host'], '@' ) ) {
		// Email address with domain and non-empty local part
		$mailparts = explode( '@', $bits['host'], 2 );
		$domainpart = self::indexifyHost( $mailparts[1] );
		if ( $mailparts[0] === '*' ) {
			$subdomains = true;
			$bits['host'] = $domainpart . '@';
		} else {
			$bits['host'] = $domainpart . '@' . $mailparts[0];
		}
	} else {
		// Non-email, or email with only a domain part.
		$bits['host'] = self::indexifyHost( $bits['host'] );
		if ( substr( $bits['host'], -3 ) === '.*.' ) {
			$subdomains = true;
			// Strip the "*." so the literal part ends right after the last fixed label.
			$bits['host'] = substr( $bits['host'], 0, -2 );
		}
	}

	$likeDomain[] = $bits['scheme'] . $bits['delimiter'] . $bits['host'];

	if ( $subdomains ) {
		$likeDomain[] = $db->anyString();
	}

	if ( isset( $bits['port'] ) ) {
		$likeDomain[] = ':' . $bits['port'];
	}
	if ( isset( $bits['path'] ) ) {
		$likePath[] = $bits['path'];
	} else {
		$likePath[] = '/';
	}
	if ( isset( $bits['query'] ) ) {
		$likePath[] = '?' . $bits['query'];
	}
	if ( isset( $bits['fragment'] ) ) {
		$likePath[] = '#' . $bits['fragment'];
	}
	$likePath[] = $db->anyString();

	// Check for stray asterisks: asterisk only allowed at the start of the domain
	foreach ( array_merge( $likeDomain, $likePath ) as $likepart ) {
		if ( !( $likepart instanceof LikeMatch ) && str_contains( $likepart, '*' ) ) {
			return false;
		}
	}

	return [ $likeDomain, $likePath ];
}
506
/**
 * Cut a like-array off right after its first wildcard token.
 *
 * @param array|mixed $arr List of strings and LikeMatch objects; anything that
 *  is not an array is returned untouched
 * @return array|mixed The input up to and including the first LikeMatch
 *  element, or the whole input if it contains none
 */
public static function keepOneWildcard( $arr ) {
	if ( is_array( $arr ) ) {
		foreach ( $arr as $position => $part ) {
			if ( $part instanceof LikeMatch ) {
				// Keep the wildcard itself, discard whatever follows it.
				$arr = array_slice( $arr, 0, $position + 1 );
				break;
			}
		}
	}

	return $arr;
}
528}
529
// Make the class reachable under the global name 'LinkFilter' as well.
// NOTE(review): presumably kept for callers that predate the class's namespace — confirm.
class_alias( LinkFilter::class, 'LinkFilter' );
A class containing constants representing the names of configuration variables.
const ExternalLinksDomainGaps
Name constant for the ExternalLinksDomainGaps setting, for use with Config::get()
const UrlProtocols
Name constant for the UrlProtocols setting, for use with Config::get()
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
A collection of static methods to play with strings.
Content object implementation for representing flat text.
Used by Database::buildLike() to represent characters that have special meaning in SQL LIKE clauses a...
Definition LikeMatch.php:10
Content of like value.
Definition LikeValue.php:14
Representing a group of expressions chained via OR.
Base interface for representing page content.
Definition Content.php:37