MediaWiki  master
LinkFilter.php
Go to the documentation of this file.
1 <?php
23 namespace MediaWiki\ExternalLinks;
24 
use Content;
use MediaWiki\MainConfigNames;
use MediaWiki\MediaWikiServices;
use StringUtils;
use TextContent;
use Wikimedia\IPUtils;
use Wikimedia\Rdbms\LikeMatch;
32 
43 class LinkFilter {
48  public const VERSION = 1;
49 
/**
 * Check whether the text of $content contains a URL matching a filter entry.
 *
 * Only TextContent is inspected; every other content type reports "no match".
 *
 * @param Content $content Content to search
 * @param string $filterEntry Domain pattern, optionally starting with "*." (see makeRegex())
 * @param string $protocol Protocol prefix the match must start with
 * @return int|false Whatever preg_match() returns for the generated pattern, or 0 for
 *  content that is not TextContent
 */
public static function matchEntry( Content $content, $filterEntry, $protocol = 'http://' ) {
	if ( $content instanceof TextContent ) {
		$pattern = self::makeRegex( $filterEntry, $protocol );

		return preg_match( $pattern, $content->getText() );
	}

	// TODO: handle other types of content too.
	//       Maybe create ContentHandler::matchFilter( LinkFilter ).
	//       Think about a common base class for LinkFilter and MagicWord.
	return 0;
}
71 
/**
 * Build a case-insensitive regex that matches $protocol followed by the filter entry.
 *
 * A leading "*." in the entry is turned into an optional run of subdomain labels;
 * everything else is matched literally.
 *
 * @param string $filterEntry Domain pattern, e.g. "*.example.com"
 * @param string $protocol Protocol prefix, e.g. "http://"
 * @return string Regex delimited by "!" with the "S" and "i" modifiers
 */
private static function makeRegex( $filterEntry, $protocol ) {
	$hasWildcard = str_starts_with( $filterEntry, '*.' );
	// Strip the "*." marker so that only the literal remainder gets quoted.
	$literalPart = $hasWildcard ? substr( $filterEntry, 2 ) : $filterEntry;

	return '!' . preg_quote( $protocol, '!' )
		. ( $hasWildcard ? '(?:[A-Za-z0-9.-]+\.|)' : '' )
		. preg_quote( $literalPart, '!' )
		. '!Si';
}
91 
/**
 * Canonicalize a host (name, IPv4 literal or bracketed IPv6 literal) for indexing.
 *
 * With $reverse set, host names have their labels reversed and get a trailing dot,
 * IPv4 literals become "V4.a.b.c.d." and IPv6 literals become "V6.x.x.….". A trailing
 * "*" wildcard is carried through for both IP forms. Without $reverse the canonical
 * host is returned in its normal order.
 *
 * @param string $host
 * @param bool $reverse Whether to produce the reversed index form
 * @return string
 */
private static function indexifyHost( $host, $reverse = true ) {
	// NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!

	// Canonicalize: undo percent-encoding and, when idn_to_utf8() succeeds, use its result.
	$host = rawurldecode( $host );
	if ( $host !== '' ) {
		$idn = idn_to_utf8( $host );
		if ( $idn !== false ) {
			$host = $idn;
		}
	}

	// Characters that are left as-is; anything else is percent-encoded again below.
	$okChars = 'a-zA-Z0-9\\-._~!$&\'()*+,;=';
	if ( StringUtils::isUtf8( $host ) ) {
		// Save a little space by not percent-encoding valid UTF-8 bytes
		$okChars .= '\x80-\xf4';
	}
	$host = preg_replace_callback(
		'<[^' . $okChars . ']>',
		static fn ( $m ) => rawurlencode( $m[0] ),
		strtolower( $host )
	);

	// IPv6? RFC 3986 syntax.
	if ( preg_match( '/^\[([0-9a-f:*]+)\]$/', rawurldecode( $host ), $m ) ) {
		$ip = $m[1];

		if ( IPUtils::isValid( $ip ) ) {
			$clean = IPUtils::sanitizeIP( $ip );
			if ( !$reverse ) {
				return '[' . $clean . ']';
			}
			return 'V6.' . str_replace( ':', '.', $clean ) . '.';
		}

		if ( str_ends_with( $ip, ':*' ) ) {
			$cutIp = substr( $ip, 0, -2 );

			$zeroFilled = "{$cutIp}::";
			if ( IPUtils::isValid( $zeroFilled ) ) {
				// Wildcard IP doesn't contain "::", so multiple parts can be wild
				$clean = IPUtils::sanitizeIP( $zeroFilled );
				if ( !$reverse ) {
					return '[' . $clean . ']';
				}
				// Number of groups given explicitly equals the number of colons in $ip.
				$fixedGroups = array_slice( explode( ':', $clean ), 0, substr_count( $ip, ':' ) );
				return 'V6.' . implode( '.', $fixedGroups ) . '.*.';
			}

			$oneFilled = "{$cutIp}:1";
			if ( IPUtils::isValid( $oneFilled ) ) {
				// Wildcard IP does contain "::", so only the last part is wild
				$clean = IPUtils::sanitizeIP( $oneFilled );
				if ( !$reverse ) {
					return '[' . $clean . ']';
				}
				// Drop the final character (the placeholder "1") and put the wildcard there.
				return 'V6.' . substr( str_replace( ':', '.', $clean ), 0, -1 ) . '*.';
			}
		}
	}

	// Regularize explicit specification of the DNS root.
	// Browsers seem to do this for IPv4 literals too.
	if ( str_ends_with( $host, '.' ) ) {
		$host = substr( $host, 0, -1 );
	}

	// IPv4?
	$b = '(?:0*25[0-5]|0*2[0-4][0-9]|0*1[0-9][0-9]|0*[0-9]?[0-9])';
	if ( preg_match( "/^(?:{$b}\.){3}{$b}$|^(?:{$b}\.){1,3}\*$/", $host ) ) {
		if ( !$reverse ) {
			return $host;
		}
		// Casting to int strips leading zeros from each octet; the wildcard is kept verbatim.
		$octets = array_map(
			static fn ( $v ) => $v === '*' ? $v : (int)$v,
			explode( '.', $host )
		);
		return 'V4.' . implode( '.', $octets ) . '.';
	}

	// Must be a host name.
	if ( !$reverse ) {
		return $host;
	}
	return implode( '.', array_reverse( explode( '.', $host ) ) ) . '.';
}
179 
/**
 * Split a URL into a domain index and a path index.
 *
 * The domain index is scheme + delimiter + canonicalized host (+ ":port"); user and
 * password are left out. The path index is path (default "/") + "?query" + "#fragment".
 *
 * @param string $url
 * @param bool $reverseDomain Whether the host part should be stored in reversed form
 * @return string[][] Empty if wfParseUrl() rejects the URL, otherwise a single
 *  [ domainIndex, pathIndex ] pair
 */
public static function makeIndexes( $url, $reverseDomain = true ) {
	// NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!

	// NOTE(review): an earlier note here said refreshExternallinksIndex.php assumes that
	// protocol-relative URLs return more than one index (http:// and https://). This method
	// returns exactly one entry, using "https:" for protocol-relative URLs; confirm that
	// script agrees before changing the output.

	$bits = wfParseUrl( $url );
	if ( !$bits ) {
		return [];
	}

	// Reverse the labels in the hostname, convert to lower case, unless it's an IP.
	// For emails turn it into "domain.reversed@localpart"
	if ( $bits['scheme'] == 'mailto' ) {
		$mailparts = explode( '@', $bits['host'], 2 );
		// No @ means a local part with no domain.
		$domainpart = count( $mailparts ) === 2
			? self::indexifyHost( $mailparts[1], $reverseDomain )
			: '';
		$bits['host'] = $reverseDomain
			? $domainpart . '@' . $mailparts[0]
			: $mailparts[0] . '@' . $domainpart;
	} else {
		$bits['host'] = self::indexifyHost( $bits['host'], $reverseDomain );
	}

	// Reconstruct the pseudo-URL, leaving out user and password.
	$domainIndex = $bits['scheme'] . $bits['delimiter'] . $bits['host'];
	if ( isset( $bits['port'] ) ) {
		$domainIndex .= ':' . $bits['port'];
	}

	$pathIndex = ( $bits['path'] ?? '/' )
		. ( isset( $bits['query'] ) ? '?' . $bits['query'] : '' )
		. ( isset( $bits['fragment'] ) ? '#' . $bits['fragment'] : '' );

	// Protocol-relative URLs have an empty scheme and are indexed as https.
	if ( $bits['scheme'] == '' ) {
		$domainIndex = "https:$domainIndex";
	}

	return [ [ $domainIndex, $pathIndex ] ];
}
240 
/**
 * Convert a list of URLs into their canonical, non-reversed indexed form.
 *
 * While the externallinks schema migration stage still includes SCHEMA_COMPAT_READ_OLD,
 * the input is returned untouched. Otherwise each URL is replaced by the concatenation
 * of its domain index and path index from makeIndexes(); URLs that makeIndexes()
 * cannot parse are dropped.
 *
 * @param string[] $urls
 * @return string[]
 */
public static function getIndexedUrlsNonReversed( $urls ) {
	// The config key was missing from the get() call; restored here.
	$migrationStage = MediaWikiServices::getInstance()->getMainConfig()->get(
		MainConfigNames::ExternalLinksSchemaMigrationStage
	);
	if ( $migrationStage & SCHEMA_COMPAT_READ_OLD ) {
		return $urls;
	}

	$newLinks = [];
	foreach ( $urls as $url ) {
		// makeIndexes() returns an empty array for unparsable URLs, so nothing is added for them.
		foreach ( self::makeIndexes( $url, false ) as [ $domainIndex, $pathIndex ] ) {
			$newLinks[] = $domainIndex . $pathIndex;
		}
	}
	return $newLinks;
}
266 
/**
 * Turn a reversed domain index back into scheme + delimiter + host in normal order.
 *
 * Only the scheme, delimiter and host are returned; any port, path, query or
 * fragment in the input is not included in the result.
 *
 * @param string $domainIndex Domain index in reversed form, e.g. as built by
 *  makeIndexes() with $reverseDomain = true
 * @return string Empty string if wfParseUrl() cannot parse the input
 */
public static function reverseIndexe( $domainIndex ) {
	$bits = wfParseUrl( $domainIndex );
	if ( !$bits ) {
		return '';
	}

	if ( $bits['scheme'] == 'mailto' ) {
		// Reversed mail indexes look like "domain.reversed@localpart".
		$mailparts = explode( '@', $bits['host'], 2 );
		if ( count( $mailparts ) === 2 ) {
			$localpart = $mailparts[1];
			$domainpart = rtrim( self::reverseDomain( $mailparts[0] ), '.' );
		} else {
			// No @, assume it's a local part with no domain. Reading $mailparts[1] here
			// would hit an undefined array key and lose the local part, so use the
			// only element there is.
			$localpart = $mailparts[0];
			$domainpart = '';
		}
		$bits['host'] = $localpart . '@' . $domainpart;
	} else {
		// reverseDomain() yields a trailing dot for host names; strip it.
		$bits['host'] = rtrim( self::reverseDomain( $bits['host'] ), '.' );
	}

	return $bits['scheme'] . $bits['delimiter'] . $bits['host'];
}
290 
/**
 * Undo the host transformation applied by indexifyHost() in reverse mode.
 *
 * "V6."-prefixed input is turned back into a bracketed IPv6 literal and "V4."-prefixed
 * input into a dotted IPv4 literal, provided the result validates as an IP. Anything
 * else is passed through indexifyHost() again, which flips the label order back.
 *
 * @param string $domain
 * @return string
 */
private static function reverseDomain( $domain ) {
	$marker = substr( $domain, 0, 3 );
	$rest = trim( substr( $domain, 3 ), '.' );

	if ( $marker === 'V6.' ) {
		// Groups were joined with dots in the index; restore the colons.
		$ipv6 = str_replace( '.', ':', $rest );
		if ( IPUtils::isValid( $ipv6 ) ) {
			return '[' . $ipv6 . ']';
		}
	} elseif ( $marker === 'V4.' ) {
		if ( IPUtils::isValid( $rest ) ) {
			return $rest;
		}
	}

	return self::indexifyHost( $domain );
}
305 
/**
 * Build SQL conditions selecting externallinks rows that match a filter entry.
 *
 * While the schema migration stage still includes SCHEMA_COMPAT_READ_OLD this
 * delegates to getQueryConditionsOld(). Otherwise it returns two prefix LIKE
 * conditions, one on el_to_domain_index and one on el_to_path, each built from the
 * constant part of the pattern before its first wildcard.
 *
 * @param string $filterEntry Filter entry, see makeLikeArray()
 * @param array $options Options are:
 *   - protocol: (string) Protocol to query (default 'http://')
 *   - oneWildcard: (bool) Only consulted by the legacy query path (default false)
 *   - db: Database to use for building the conditions (default: replica from wfGetDB())
 * @return array|false Query conditions, or false if makeLikeArray() rejects the entry
 */
public static function getQueryConditions( $filterEntry, array $options = [] ) {
	// The config key was missing from the get() call; restored here.
	$migrationStage = MediaWikiServices::getInstance()->getMainConfig()->get(
		MainConfigNames::ExternalLinksSchemaMigrationStage
	);
	if ( $migrationStage & SCHEMA_COMPAT_READ_OLD ) {
		return self::getQueryConditionsOld( $filterEntry, $options );
	}
	$options += [
		'protocol' => 'http://',
		'oneWildcard' => false,
		'db' => null,
	];

	// First, get the like array
	$like = self::makeLikeArray( $filterEntry, $options['protocol'] );
	if ( $like === false ) {
		return $like;
	}
	[ $likeDomain, $likePath ] = $like;

	// Get the constant prefix (i.e. everything up to the first wildcard)
	$trimmedlikeDomain = self::keepOneWildcard( $likeDomain );
	$trimmedlikePath = self::keepOneWildcard( $likePath );
	// Drop the wildcard itself; buildLike() below appends its own anyString().
	if ( $trimmedlikeDomain[count( $trimmedlikeDomain ) - 1] instanceof LikeMatch ) {
		array_pop( $trimmedlikeDomain );
	}
	if ( $trimmedlikePath[count( $trimmedlikePath ) - 1] instanceof LikeMatch ) {
		array_pop( $trimmedlikePath );
	}
	$db = $options['db'] ?: wfGetDB( DB_REPLICA );
	$index1 = implode( '', $trimmedlikeDomain );
	$index2 = implode( '', $trimmedlikePath );

	return [
		"el_to_domain_index" . $db->buildLike( $index1, $db->anyString() ),
		"el_to_path" . $db->buildLike( $index2, $db->anyString() ),
	];
}
372 
/**
 * Legacy variant of getQueryConditions() that queries el_index / el_index_60.
 *
 * @param string $filterEntry Filter entry, see makeLikeArray()
 * @param array $options Options are:
 *   - protocol: (string) Protocol to query (default 'http://')
 *   - oneWildcard: (bool) Cut the el_index pattern after its first wildcard (default false)
 *   - db: Database to use for building the conditions (default: replica from wfGetDB())
 * @return array|false Query conditions, or false if makeLikeArray() rejects the entry
 */
private static function getQueryConditionsOld( $filterEntry, array $options = [] ) {
	$options += [
		'protocol' => 'http://',
		'oneWildcard' => false,
		'db' => null,
	];

	// First, get the like array
	$parts = self::makeLikeArray( $filterEntry, $options['protocol'] );
	if ( $parts === false ) {
		return false;
	}

	$like = array_merge( $parts[0], $parts[1] );
	// Fix very specific case of domain having a wild card and path being empty
	// leading to LIKE com.example.%/%
	$last = count( $like ) - 1;
	if (
		$like[$last] instanceof LikeMatch &&
		$like[$last - 2] instanceof LikeMatch &&
		$like[$last - 1] == '/'
	) {
		// Remove the trailing "/" and wildcard, keeping the domain wildcard.
		$like = array_slice( $like, 0, -2 );
	}

	// Get the constant prefix (i.e. everything up to the first wildcard)
	$prefixParts = self::keepOneWildcard( $like );
	if ( $options['oneWildcard'] ) {
		$like = $prefixParts;
	}
	if ( end( $prefixParts ) instanceof LikeMatch ) {
		array_pop( $prefixParts );
	}
	$index = implode( '', $prefixParts );
	$db = $options['db'] ?: wfGetDB( DB_REPLICA );

	// Build the query
	if ( strlen( $index ) < 60 ) {
		// The constant prefix is smaller than el_index_60, so we use a LIKE
		// for a prefix search.
		return [
			"el_index_60" . $db->buildLike( $index, $db->anyString() ),
			"el_index" . $db->buildLike( $like ),
		];
	}

	// The constant prefix is larger than el_index_60, so we can use a
	// constant comparison.
	return [
		"el_index_60" => substr( $index, 0, 60 ),
		"el_index" . $db->buildLike( $like ),
	];
}
428 
/**
 * Expand a protocol name into the matching entry of the UrlProtocols setting.
 *
 * @param string $protocol Protocol, possibly without its delimiter
 * @return string|null Null if $protocol is empty or is itself one of the configured
 *  entries; otherwise the first configured entry that starts with $protocol, or
 *  $protocol unchanged when no entry does
 */
public static function getProtocolPrefix( $protocol ) {
	// Find the right prefix.
	// The ->get( ... ) half of this statement was missing; restored here.
	$urlProtocols = MediaWikiServices::getInstance()->getMainConfig()
		->get( MainConfigNames::UrlProtocols );
	if ( $protocol && !in_array( $protocol, $urlProtocols ) ) {
		foreach ( $urlProtocols as $p ) {
			if ( str_starts_with( $p, $protocol ) ) {
				$protocol = $p;
				break;
			}
		}

		return $protocol;
	} else {
		return null;
	}
}
446 
/**
 * List the configured URL protocols without their delimiters.
 *
 * @return string[] An empty string first, then each entry of the UrlProtocols setting
 *  except '//', cut off before its first ':'
 */
public static function prepareProtocols() {
	// The ->get( ... ) half of this statement was missing; restored here.
	$urlProtocols = MediaWikiServices::getInstance()->getMainConfig()
		->get( MainConfigNames::UrlProtocols );
	$protocols = [ '' ];
	foreach ( $urlProtocols as $p ) {
		if ( $p !== '//' ) {
			// Keep only the part before the first colon, e.g. "http://" => "http".
			$protocols[] = substr( $p, 0, strpos( $p, ':' ) );
		}
	}

	return $protocols;
}
459 
/**
 * Translate a filter entry into LIKE pattern parts for the domain and the path.
 *
 * An asterisk is accepted as the whole local part of a mailto address, or where
 * indexifyHost() turns it into a trailing ".*." on the host; in those cases a database
 * wildcard is appended to the domain parts. An asterisk left anywhere else makes the
 * entry invalid.
 *
 * @param string $filterEntry Filter entry, e.g. "*.example.com/path"
 * @param string $protocol Protocol prefix prepended before parsing (default 'http://')
 * @return array|false [ $likeDomain, $likePath ], each a list of strings and LikeMatch
 *  objects suitable for buildLike(); false if the entry cannot be parsed or contains
 *  a stray asterisk
 */
public static function makeLikeArray( $filterEntry, $protocol = 'http://' ) {
	$db = wfGetDB( DB_REPLICA );

	$bits = wfParseUrl( $protocol . $filterEntry );
	if ( !$bits ) {
		return false;
	}

	$subdomains = false;
	if ( $bits['scheme'] === 'mailto' && strpos( $bits['host'], '@' ) ) {
		// Email address with domain and non-empty local part
		[ $localpart, $maildomain ] = explode( '@', $bits['host'], 2 );
		$domainpart = self::indexifyHost( $maildomain );
		if ( $localpart === '*' ) {
			$subdomains = true;
			$bits['host'] = $domainpart . '@';
		} else {
			$bits['host'] = $domainpart . '@' . $localpart;
		}
	} else {
		// Non-email, or email with only a domain part.
		$bits['host'] = self::indexifyHost( $bits['host'] );
		if ( substr( $bits['host'], -3 ) === '.*.' ) {
			$subdomains = true;
			// Cut the "*." so the host ends on the dot before the wildcard.
			$bits['host'] = substr( $bits['host'], 0, -2 );
		}
	}

	$likeDomain = [ $bits['scheme'] . $bits['delimiter'] . $bits['host'] ];
	if ( $subdomains ) {
		$likeDomain[] = $db->anyString();
	}
	if ( isset( $bits['port'] ) ) {
		$likeDomain[] = ':' . $bits['port'];
	}

	$likePath = [ $bits['path'] ?? '/' ];
	if ( isset( $bits['query'] ) ) {
		$likePath[] = '?' . $bits['query'];
	}
	if ( isset( $bits['fragment'] ) ) {
		$likePath[] = '#' . $bits['fragment'];
	}
	$likePath[] = $db->anyString();

	// Check for stray asterisks: asterisk only allowed at the start of the domain
	foreach ( array_merge( $likeDomain, $likePath ) as $likepart ) {
		if ( $likepart instanceof LikeMatch ) {
			continue;
		}
		if ( str_contains( $likepart, '*' ) ) {
			return false;
		}
	}

	return [ $likeDomain, $likePath ];
}
534 
/**
 * Cut a LIKE pattern array off right after its first LikeMatch element.
 *
 * @param array|mixed $arr Pattern parts (strings and LikeMatch objects)
 * @return array|mixed The parts up to and including the first LikeMatch; the input
 *  unchanged if it holds no LikeMatch or is not an array
 */
public static function keepOneWildcard( $arr ) {
	if ( is_array( $arr ) ) {
		foreach ( $arr as $pos => $part ) {
			if ( $part instanceof LikeMatch ) {
				return array_slice( $arr, 0, $pos + 1 );
			}
		}
	}

	return $arr;
}
556 }
557 
558 class_alias( LinkFilter::class, 'LinkFilter' );
const SCHEMA_COMPAT_READ_OLD
Definition: Defines.php:266
wfParseUrl( $url)
parse_url() work-alike, but non-broken.
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
A class containing constants representing the names of configuration variables.
const UrlProtocols
Name constant for the UrlProtocols setting, for use with Config::get()
const ExternalLinksSchemaMigrationStage
Name constant for the ExternalLinksSchemaMigrationStage setting, for use with Config::get()
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
A collection of static methods to play with strings.
Definition: StringUtils.php:29
static isUtf8( $value)
Test whether a string is valid UTF-8.
Definition: StringUtils.php:44
Content object implementation for representing flat text.
Definition: TextContent.php:40
Used by Database::buildLike() to represent characters that have special meaning in SQL LIKE clauses a...
Definition: LikeMatch.php:10
Base interface for representing page content.
Definition: Content.php:37
const DB_REPLICA
Definition: defines.php:26
$content
Definition: router.php:76