MediaWiki master
LinkFilter.php
Go to the documentation of this file.
1<?php
22
23use Content;
26use StringUtils;
27use TextContent;
28use Wikimedia\IPUtils;
34
/**
 * Check whether the text of a content object contains a URL matching a filter entry.
 *
 * Only TextContent is inspected; every other Content implementation yields 0.
 *
 * @param Content $content Content to check
 * @param string $filterEntry Filter entry; a leading "*." also covers subdomains
 * @param string $protocol Literal protocol prefix the URL must start with
 * @return int|false Whatever preg_match() returns for the generated pattern
 */
public static function matchEntry( Content $content, $filterEntry, $protocol = 'http://' ) {
	if ( $content instanceof TextContent ) {
		$haystack = $content->getText();
		$pattern = self::makeRegex( $filterEntry, $protocol );

		return preg_match( $pattern, $haystack );
	}

	// TODO: handle other types of content too.
	// Maybe create ContentHandler::matchFilter( LinkFilter ).
	// Think about a common base class for LinkFilter and MagicWord.
	return 0;
}
68
/**
 * Build a regular expression that finds URLs matching a filter entry.
 *
 * @param string $filterEntry Filter entry; a leading "*." is turned into an
 *  optional run of subdomain labels, the rest is matched literally
 * @param string $protocol Protocol prefix, matched literally
 * @return string Pattern delimited by "!" and carrying the "S" and "i" modifiers
 */
private static function makeRegex( $filterEntry, $protocol ) {
	$subdomainPattern = '';
	if ( str_starts_with( $filterEntry, '*.' ) ) {
		// Wildcard entry: allow any number of leading labels, or none at all.
		$subdomainPattern = '(?:[A-Za-z0-9.-]+\.|)';
		$filterEntry = substr( $filterEntry, 2 );
	}

	return implode( '', [
		'!',
		preg_quote( $protocol, '!' ),
		$subdomainPattern,
		preg_quote( $filterEntry, '!' ),
		'!Si',
	] );
}
87
/**
 * Canonicalize a host and, optionally, turn it into its index form.
 *
 * The host is percent-decoded, converted with idn_to_utf8() when that succeeds,
 * lower-cased, and every character outside the allowed set is percent-encoded.
 * With $reverse enabled the result is:
 *  - "V6.<groups separated by dots>." for a bracketed IPv6 literal (a trailing
 *    ":*" wildcard produces a trailing "*." component),
 *  - "V4.<octets as integers>." for an IPv4 address (a trailing "*" is kept),
 *  - the dot-separated labels in reverse order plus a trailing dot otherwise.
 * With $reverse disabled, IPv6 literals are returned sanitized inside brackets and
 * everything else is returned as the canonicalized host.
 *
 * @param string $host Host to convert
 * @param bool $reverse Whether to produce the reversed index form
 * @return string
 */
private static function indexifyHost( $host, $reverse = true ) {
	// Canonicalize.
	$host = rawurldecode( $host );
	if ( $host !== '' ) {
		$unicodeHost = idn_to_utf8( $host );
		$host = $unicodeHost === false ? $host : $unicodeHost;
	}

	$okChars = 'a-zA-Z0-9\\-._~!$&\'()*+,;=';
	if ( StringUtils::isUtf8( $host ) ) {
		// Save a little space by not percent-encoding valid UTF-8 bytes
		$okChars .= '\x80-\xf4';
	}
	$host = preg_replace_callback(
		'<[^' . $okChars . ']+>',
		static function ( $match ) {
			return rawurlencode( $match[0] );
		},
		strtolower( $host )
	);

	// IPv6? RFC 3986 syntax. The brackets were percent-encoded above, so decode again to test.
	if ( preg_match( '/^\[([0-9a-f:*]+)\]$/', rawurldecode( $host ), $ipMatch ) ) {
		$ip = $ipMatch[1];

		if ( IPUtils::isValid( $ip ) ) {
			$sanitized = IPUtils::sanitizeIP( $ip );

			return $reverse
				? 'V6.' . str_replace( ':', '.', $sanitized ) . '.'
				: '[' . $sanitized . ']';
		}

		if ( str_ends_with( $ip, ':*' ) ) {
			$cutIp = substr( $ip, 0, -2 );

			if ( IPUtils::isValid( "{$cutIp}::" ) ) {
				// Wildcard IP doesn't contain "::", so multiple parts can be wild
				$sanitized = IPUtils::sanitizeIP( "{$cutIp}::" );
				if ( !$reverse ) {
					return '[' . $sanitized . ']';
				}
				// Number of groups written before the "*" (one per colon in the input).
				$fixedGroups = substr_count( $ip, ':' );
				$kept = array_slice( explode( ':', $sanitized ), 0, $fixedGroups );

				return 'V6.' . implode( '.', $kept ) . '.*.';
			}

			if ( IPUtils::isValid( "{$cutIp}:1" ) ) {
				// Wildcard IP does contain "::", so only the last part is wild
				$sanitized = IPUtils::sanitizeIP( "{$cutIp}:1" );
				if ( !$reverse ) {
					return '[' . $sanitized . ']';
				}

				// Drop the final character (from the placeholder "1") and put the wildcard there.
				return 'V6.' . substr( str_replace( ':', '.', $sanitized ), 0, -1 ) . '*.';
			}
		}
	}

	// Regularize explicit specification of the DNS root.
	// Browsers seem to do this for IPv4 literals too.
	if ( str_ends_with( $host, '.' ) ) {
		$host = substr( $host, 0, -1 );
	}

	// IPv4?
	$octet = '(?:0*25[0-5]|0*2[0-4][0-9]|0*1[0-9][0-9]|0*[0-9]?[0-9])';
	if ( preg_match( "/^(?:{$octet}\.){3}{$octet}$|^(?:{$octet}\.){1,3}\*$/", $host ) ) {
		if ( !$reverse ) {
			return $host;
		}
		// Casting to int strips leading zeroes; the wildcard is passed through.
		$parts = array_map(
			static fn ( $part ) => $part === '*' ? $part : (int)$part,
			explode( '.', $host )
		);

		return 'V4.' . implode( '.', $parts ) . '.';
	}

	// Must be a host name.
	if ( !$reverse ) {
		return $host;
	}

	return implode( '.', array_reverse( explode( '.', $host ) ) ) . '.';
}
172
/**
 * Split a URL into the pair of strings used for indexing it.
 *
 * The first string is scheme + delimiter + converted host (+ ":port"); user and
 * password are left out. The second is path (+ "?query") (+ "#fragment").
 * A URL with an empty scheme gets "https:" prepended to the first string.
 *
 * @param string $url URL to index
 * @param bool $reverseDomain Whether the host is converted to its reversed index form
 * @return string[][] Empty when the URL cannot be parsed, otherwise a list
 *  holding one [ domain index, path index ] pair
 */
public static function makeIndexes( $url, $reverseDomain = true ) {
	// NOTE: refreshExternallinksIndex.php assumes that only protocol-relative URLs return more
	// than one index, and that the indexes for protocol-relative URLs only vary in the "http://"
	// versus "https://" prefix. If you change that, you'll likely need to update
	// refreshExternallinksIndex.php accordingly.

	$parsed = wfParseUrl( $url );
	if ( !$parsed ) {
		return [];
	}

	// URI RFC identifies the email/server part of mailto or news protocol as 'path',
	// while we want to match the email's domain or news server the same way we are
	// matching hosts for other URLs.
	if ( in_array( $parsed['scheme'], [ 'mailto', 'news' ] ) ) {
		// (T347574) Only set host if it's not already set (if // is used)
		if ( array_key_exists( 'path', $parsed ) ) {
			$parsed['host'] = $parsed['path'];
		}
		$parsed['path'] = '';
	}

	// Reverse the labels in the hostname, convert to lower case, unless it's an IP.
	// For emails turn it into "domain.reversed@localpart"
	if ( $parsed['scheme'] == 'mailto' ) {
		$mailParts = explode( '@', $parsed['host'], 2 );
		// No @ means a local part with no domain.
		$domain = count( $mailParts ) === 2
			? self::indexifyHost( $mailParts[1], $reverseDomain )
			: '';
		$parsed['host'] = $reverseDomain
			? $domain . '@' . $mailParts[0]
			: $mailParts[0] . '@' . $domain;
	} else {
		$parsed['host'] = self::indexifyHost( $parsed['host'], $reverseDomain );
	}

	// Reconstruct the pseudo-URL, leaving out user and password.
	$domainIndex = $parsed['scheme'] . $parsed['delimiter'] . $parsed['host'];
	if ( isset( $parsed['port'] ) ) {
		$domainIndex .= ':' . $parsed['port'];
	}

	$pathIndex = $parsed['path'] ?? '/';
	if ( isset( $parsed['query'] ) ) {
		$pathIndex .= '?' . $parsed['query'];
	}
	if ( isset( $parsed['fragment'] ) ) {
		$pathIndex .= '#' . $parsed['fragment'];
	}

	if ( $parsed['scheme'] == '' ) {
		$domainIndex = "https:$domainIndex";
	}

	return [ [ $domainIndex, $pathIndex ] ];
}
243
/**
 * Convert a list of URLs to their non-reversed indexed form.
 *
 * Each URL is passed through makeIndexes() with domain reversal disabled and the
 * two parts of every resulting pair are concatenated. URLs that yield no index
 * contribute nothing.
 *
 * @param string[] $urls
 * @return string[]
 */
public static function getIndexedUrlsNonReversed( $urls ) {
	$flattened = [];
	foreach ( $urls as $url ) {
		$pairs = self::makeIndexes( $url, false );
		if ( $pairs ) {
			foreach ( $pairs as [ $domainPart, $pathPart ] ) {
				$flattened[] = $domainPart . $pathPart;
			}
		}
	}

	return $flattened;
}
263
/**
 * Convert a domain index back into scheme + delimiter + host (+ ":port").
 *
 * For mailto indexes of the form "domain.reversed@localpart" the result is
 * "localpart@domain". If the mailto path holds no "@", the whole path is taken
 * as the local part and the domain is empty.
 *
 * @param string $domainIndex Domain index as produced with reversed domains
 * @return string Empty string if $domainIndex cannot be parsed
 */
public static function reverseIndexes( $domainIndex ) {
	$bits = wfParseUrl( $domainIndex );
	if ( !$bits ) {
		return '';
	}

	// Reverse the labels in the hostname, convert to lower case, unless it's an IP.
	// For emails turn "domain.reversed@localpart" back into "localpart@domain".
	if ( $bits['scheme'] == 'mailto' ) {
		$mailparts = explode( '@', $bits['path'], 2 );
		if ( count( $mailparts ) === 2 ) {
			[ $reversedDomain, $localpart ] = $mailparts;
			$domainpart = rtrim( self::reverseDomain( $reversedDomain ), '.' );
		} else {
			// No @, assume it's a local part with no domain.
			// explode() returned a single element, so index 1 must not be read here.
			$localpart = $mailparts[0];
			$domainpart = '';
		}
		$bits['host'] = $localpart . '@' . $domainpart;
	} else {
		$bits['host'] = rtrim( self::reverseDomain( $bits['host'] ), '.' );
	}

	$index = $bits['scheme'] . $bits['delimiter'] . $bits['host'];
	// Append the port only when one is present and non-empty.
	if ( isset( $bits['port'] ) && $bits['port'] ) {
		$index .= ':' . $bits['port'];
	}

	return $index;
}
291
/**
 * Turn the host part of a domain index back into a host.
 *
 * "V6."-prefixed values become a bracketed IPv6 address and "V4."-prefixed values
 * an IPv4 address, provided the embedded address is valid. Anything else is handed
 * to indexifyHost(), which reverses the labels again.
 *
 * @param string $domain Host part of a domain index
 * @return string
 */
private static function reverseDomain( $domain ) {
	$prefix = substr( $domain, 0, 3 );

	if ( $prefix === 'V6.' || $prefix === 'V4.' ) {
		$address = trim( substr( $domain, 3 ), '.' );
		if ( $prefix === 'V6.' ) {
			// Index form separates IPv6 groups with dots.
			$address = str_replace( '.', ':', $address );
		}
		if ( IPUtils::isValid( $address ) ) {
			return $prefix === 'V6.' ? '[' . $address . ']' : $address;
		}
	}

	return self::indexifyHost( $domain );
}
306
/**
 * Build query conditions matching external links against a filter entry.
 *
 * @param string $filterEntry Filter entry, in the syntax accepted by makeLikeArray()
 * @param array $options Recognised keys:
 *  - protocol: (string|string[]|null) Protocol(s) to match; null or absent means
 *    [ 'http://', 'https://' ]
 *  - oneWildcard: (bool) When set and the first path component is not "/", the domain
 *    index is compared with "=" instead of a prefix LIKE
 *  - db: Database object used to build the expressions; when empty, a replica
 *    connection is obtained from the load balancer factory
 * @return array|false Two conditions: an OR group with one AND group per usable
 *  protocol (on el_to_domain_index, plus el_id exclusions taken from the
 *  ExternalLinksDomainGaps setting), and a prefix LIKE on el_to_path. False when the
 *  filter entry is unusable for every protocol.
 */
public static function getQueryConditions( $filterEntry, array $options = [] ) {
	$options += [
		'protocol' => [ 'http://', 'https://' ],
		'oneWildcard' => false,
		'db' => null,
	];
	// Fully qualified so this does not depend on a matching "use" line.
	$domainGaps = MediaWikiServices::getInstance()->getMainConfig()->get(
		\MediaWiki\MainConfigNames::ExternalLinksDomainGaps
	);

	if ( is_string( $options['protocol'] ) ) {
		$options['protocol'] = [ $options['protocol'] ];
	} elseif ( $options['protocol'] === null ) {
		$options['protocol'] = [ 'http://', 'https://' ];
	}

	$domainConditions = [];
	$db = $options['db'] ?: MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->getReplicaDatabase();
	foreach ( $options['protocol'] as $protocol ) {
		$like = self::makeLikeArray( $filterEntry, $protocol );
		if ( $like === false ) {
			continue;
		}
		[ $likeDomain, $likePath ] = $like;
		// Keep everything up to the first wildcard, then drop that wildcard itself.
		$trimmedlikeDomain = self::keepOneWildcard( $likeDomain );
		if ( $trimmedlikeDomain[count( $trimmedlikeDomain ) - 1] instanceof LikeMatch ) {
			array_pop( $trimmedlikeDomain );
		}
		$index1 = implode( '', $trimmedlikeDomain );
		$thisDomainConditions = [];
		if ( $options['oneWildcard'] && $likePath[0] != '/' ) {
			$thisDomainConditions[] = $db->expr( 'el_to_domain_index', '=', $index1 );
		} else {
			$thisDomainConditions[] = $db->expr(
				'el_to_domain_index',
				IExpression::LIKE,
				new LikeValue( $index1, $db->anyString() )
			);
		}
		// Exclude the el_id ranges configured for this exact domain index.
		foreach ( $domainGaps[$index1] ?? [] as $from => $to ) {
			$thisDomainConditions[] = $db->expr( 'el_id', '<', $from )->or( 'el_id', '>', $to );
		}
		$domainConditions[] = new AndExpressionGroup( ...$thisDomainConditions );
	}
	if ( !$domainConditions ) {
		return false;
	}
	// $likePath is set whenever at least one protocol produced a condition.
	// @phan-suppress-next-line PhanPossiblyUndeclaredVariable
	$trimmedlikePath = self::keepOneWildcard( $likePath );
	if ( $trimmedlikePath[count( $trimmedlikePath ) - 1] instanceof LikeMatch ) {
		array_pop( $trimmedlikePath );
	}
	$index2 = implode( '', $trimmedlikePath );

	return [
		new OrExpressionGroup( ...$domainConditions ),
		$db->expr( 'el_to_path', IExpression::LIKE, new LikeValue( $index2, $db->anyString() ) ),
	];
}
394
/**
 * Expand a protocol to the first configured URL protocol it is a prefix of.
 *
 * @param string|null $protocol Protocol, possibly abbreviated
 * @return string|null Null when $protocol is empty or is exactly one of the
 *  configured UrlProtocols entries. Otherwise the first configured protocol that
 *  starts with $protocol, or $protocol unchanged when none does.
 */
public static function getProtocolPrefix( $protocol ) {
	// Find the right prefix.
	// The setting name is fully qualified so this does not depend on a matching "use" line.
	$urlProtocols = MediaWikiServices::getInstance()->getMainConfig()
		->get( \MediaWiki\MainConfigNames::UrlProtocols );
	if ( $protocol && !in_array( $protocol, $urlProtocols ) ) {
		foreach ( $urlProtocols as $p ) {
			if ( str_starts_with( $p, $protocol ) ) {
				$protocol = $p;
				break;
			}
		}

		return $protocol;
	} else {
		return null;
	}
}
412
/**
 * List the configured URL protocols without their delimiter.
 *
 * @return string[] An empty string, followed by each UrlProtocols entry other than
 *  "//" truncated before its first ":"
 */
public static function prepareProtocols() {
	// The setting name is fully qualified so this does not depend on a matching "use" line.
	$urlProtocols = MediaWikiServices::getInstance()->getMainConfig()
		->get( \MediaWiki\MainConfigNames::UrlProtocols );
	$protocols = [ '' ];
	foreach ( $urlProtocols as $p ) {
		if ( $p !== '//' ) {
			$protocols[] = substr( $p, 0, strpos( $p, ':' ) );
		}
	}

	return $protocols;
}
425
/**
 * Build the LIKE pattern parts for a filter entry.
 *
 * @param string $filterEntry Filter entry; "*" is only accepted as a leading domain
 *  wildcard, or as the whole local part of a mailto address
 * @param string $protocol Protocol prepended to the entry before parsing
 * @return array|false [ $likeDomain, $likePath ], each a list of literal strings and
 *  $db->anyString() wildcards; false if the entry cannot be parsed or contains a
 *  stray asterisk
 */
public static function makeLikeArray( $filterEntry, $protocol = 'http://' ) {
	$db = MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->getReplicaDatabase();
	$likeDomain = [];
	$likePath = [];

	$target = $protocol . $filterEntry;
	$bits = wfParseUrl( $target );
	if ( !$bits ) {
		return false;
	}

	// URI RFC identifies the email/server part of mailto or news protocol as 'path',
	// while we want to match the email's domain or news server the same way we are
	// matching hosts for other URLs.
	if ( in_array( $bits['scheme'], [ 'mailto', 'news' ] ) ) {
		// Same guard as makeIndexes() (T347574): only take the host from the path
		// when a path was actually parsed, so a host that is already set survives.
		if ( array_key_exists( 'path', $bits ) ) {
			$bits['host'] = $bits['path'];
		}
		$bits['path'] = '';
	}

	$subdomains = false;
	// strpos() is used for its truthiness on purpose: an "@" at offset 0 means an
	// empty local part and is handled by the other branch.
	if ( $bits['scheme'] === 'mailto' && strpos( $bits['host'], '@' ) ) {
		// Email address with domain and non-empty local part
		$mailparts = explode( '@', $bits['host'], 2 );
		$domainpart = self::indexifyHost( $mailparts[1] );
		if ( $mailparts[0] === '*' ) {
			$subdomains = true;
			$bits['host'] = $domainpart . '@';
		} else {
			$bits['host'] = $domainpart . '@' . $mailparts[0];
		}
	} else {
		// Non-email, or email with only a domain part.
		$bits['host'] = self::indexifyHost( $bits['host'] );
		if ( substr( $bits['host'], -3 ) === '.*.' ) {
			// Strip the "*." so the pattern ends right before the wildcard labels.
			$subdomains = true;
			$bits['host'] = substr( $bits['host'], 0, -2 );
		}
	}

	$likeDomain[] = $bits['scheme'] . $bits['delimiter'] . $bits['host'];

	if ( $subdomains ) {
		$likeDomain[] = $db->anyString();
	}

	if ( isset( $bits['port'] ) ) {
		$likeDomain[] = ':' . $bits['port'];
	}
	$likePath[] = $bits['path'] ?? '/';
	if ( isset( $bits['query'] ) ) {
		$likePath[] = '?' . $bits['query'];
	}
	if ( isset( $bits['fragment'] ) ) {
		$likePath[] = '#' . $bits['fragment'];
	}
	$likePath[] = $db->anyString();

	// Check for stray asterisks: asterisk only allowed at the start of the domain
	foreach ( array_merge( $likeDomain, $likePath ) as $likepart ) {
		if ( !( $likepart instanceof LikeMatch ) && str_contains( $likepart, '*' ) ) {
			return false;
		}
	}

	return [ $likeDomain, $likePath ];
}
508
/**
 * Cut a LIKE pattern array off after its first wildcard.
 *
 * @param array|mixed $arr Pattern parts; a non-array value is returned untouched
 * @return array|mixed The parts up to and including the first LikeMatch element,
 *  or the input unchanged when it holds no LikeMatch
 */
public static function keepOneWildcard( $arr ) {
	if ( is_array( $arr ) ) {
		foreach ( $arr as $position => $part ) {
			if ( !( $part instanceof LikeMatch ) ) {
				continue;
			}
			// Everything after the first wildcard is discarded.
			$arr = array_slice( $arr, 0, $position + 1 );
			break;
		}
	}

	return $arr;
}
530}
531
// Make the class also available under the name 'LinkFilter'.
class_alias( LinkFilter::class, 'LinkFilter' );
wfParseUrl( $url)
parse_url() work-alike, but non-broken.
A class containing constants representing the names of configuration variables.
const ExternalLinksDomainGaps
Name constant for the ExternalLinksDomainGaps setting, for use with Config::get()
const UrlProtocols
Name constant for the UrlProtocols setting, for use with Config::get()
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
A collection of static methods to play with strings.
Content object implementation for representing flat text.
Representing a group of expressions chained via AND.
Used by Database::buildLike() to represent characters that have special meaning in SQL LIKE clauses and thus need no escaping.
Definition LikeMatch.php:10
Content of like value.
Definition LikeValue.php:14
Representing a group of expressions chained via OR.
Base interface for representing page content.
Definition Content.php:39
$content
Definition router.php:76