MediaWiki  master
LinkFilter.php
Go to the documentation of this file.
1 <?php
23 use Wikimedia\IPUtils;
25 
36 class LinkFilter {
41  public const VERSION = 1;
42 
/**
 * Check whether $content contains a link to $filterEntry.
 *
 * Only TextContent objects are inspected; every other content type
 * currently yields 0.
 *
 * @param Content $content Content to check
 * @param string $filterEntry Filter entry, optionally starting with "*."; see makeRegex()
 * @param string $protocol Protocol prefix the link must start with
 * @return int|false Result of preg_match(): 1 if a link was found, 0 if not,
 *   false if the regex failed
 */
public static function matchEntry( Content $content, $filterEntry, $protocol = 'http://' ) {
	if ( $content instanceof TextContent ) {
		$haystack = $content->getText();

		return preg_match( self::makeRegex( $filterEntry, $protocol ), $haystack );
	}

	// TODO: handle other types of content too.
	// Maybe create ContentHandler::matchFilter( LinkFilter ).
	// Think about a common base class for LinkFilter and MagicWord.
	return 0;
}
64 
/**
 * Builds a regex pattern for $filterEntry.
 *
 * The protocol and the filter entry are matched literally. A leading "*."
 * in the filter entry is replaced by an optional run of subdomain labels.
 * The pattern uses "!" as delimiter and is case-insensitive.
 *
 * @param string $filterEntry Filter entry, optionally starting with "*."
 * @param string $protocol Protocol prefix, e.g. "http://"
 * @return string Regex pattern, for preg_match()
 */
private static function makeRegex( $filterEntry, $protocol ) {
	$subdomainPart = '';
	if ( substr( $filterEntry, 0, 2 ) === '*.' ) {
		// Wildcard entry: accept any (or no) subdomain prefix.
		$subdomainPart = '(?:[A-Za-z0-9.-]+\.|)';
		$filterEntry = substr( $filterEntry, 2 );
	}

	return '!' . preg_quote( $protocol, '!' ) . $subdomainPart . preg_quote( $filterEntry, '!' ) . '!Si';
}
84 
/**
 * Indicate whether LinkFilter IDN support is available.
 *
 * Support requires both a callable idn_to_utf8() and the
 * INTL_IDNA_VARIANT_UTS46 constant.
 *
 * @return bool
 */
public static function supportsIDN() {
	if ( !is_callable( 'idn_to_utf8' ) ) {
		return false;
	}

	return defined( 'INTL_IDNA_VARIANT_UTS46' );
}
93 
/**
 * Canonicalize a hostname for el_index.
 *
 * The result is one of:
 *  - "V6.<groups joined by dots>." for a bracketed IPv6 literal (optionally
 *    ending in a "*" wildcard),
 *  - "V4.<octets joined by dots>." for an IPv4 literal (optionally ending in
 *    a "*" wildcard), with leading zeros removed from each octet,
 *  - the dot-separated labels in reverse order, followed by ".", for
 *    anything else.
 *
 * @param string $host Host name or IP literal, possibly percent-encoded
 * @return string Canonicalized host
 */
private static function indexifyHost( $host ) {
	// NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!

	// Canonicalize.
	$host = rawurldecode( $host );
	if ( $host !== '' && self::supportsIDN() ) {
		// @todo Add a PHP fallback
		$unicodeHost = idn_to_utf8( $host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46 );
		// Keep the undecoded host when the IDN conversion fails.
		$host = $unicodeHost === false ? $host : $unicodeHost;
	}

	// Save a little space by not percent-encoding valid UTF-8 bytes
	$utf8Bytes = StringUtils::isUtf8( $host ) ? '\x80-\xf4' : '';
	$okChars = 'a-zA-Z0-9\\-._~!$&\'()*+,;=' . $utf8Bytes;

	// Lower-case the host and percent-encode every byte outside the allowed set.
	$host = preg_replace_callback(
		'<[^' . $okChars . ']>',
		static function ( $match ) {
			return rawurlencode( $match[0] );
		},
		strtolower( $host )
	);

	// IPv6? RFC 3986 syntax.
	// NOTE(review): "$" without the D modifier also matches just before a
	// trailing newline in the decoded host - confirm whether that matters.
	$bracketed = [];
	if ( preg_match( '/^\[([0-9a-f:*]+)\]$/', rawurldecode( $host ), $bracketed ) ) {
		// Turns "a:b:c" into "a.b.c".
		$colonsToDots = static function ( $address ) {
			return implode( '.', explode( ':', $address ) );
		};

		$ip = $bracketed[1];
		if ( IPUtils::isValid( $ip ) ) {
			return 'V6.' . $colonsToDots( IPUtils::sanitizeIP( $ip ) ) . '.';
		}

		if ( substr( $ip, -2 ) === ':*' ) {
			$cutIp = substr( $ip, 0, -2 );

			if ( IPUtils::isValid( "{$cutIp}::" ) ) {
				// Wildcard IP doesn't contain "::", so multiple parts can be wild
				$explicitGroups = count( explode( ':', $ip ) ) - 1;
				$groups = explode( ':', IPUtils::sanitizeIP( "{$cutIp}::" ) );

				return 'V6.' . implode( '.', array_slice( $groups, 0, $explicitGroups ) ) . '.*.';
			}

			if ( IPUtils::isValid( "{$cutIp}:1" ) ) {
				// Wildcard IP does contain "::", so only the last part is wild
				$dotted = $colonsToDots( IPUtils::sanitizeIP( "{$cutIp}:1" ) );

				// Drop the final character of the sanitized address and put the wildcard in its place.
				return 'V6.' . substr( $dotted, 0, -1 ) . '*.';
			}
		}
	}

	// Regularize explicit specification of the DNS root.
	// Browsers seem to do this for IPv4 literals too.
	if ( substr( $host, -1 ) === '.' ) {
		$host = substr( $host, 0, -1 );
	}

	// IPv4?
	$octet = '(?:0*25[0-5]|0*2[0-4][0-9]|0*1[0-9][0-9]|0*[0-9]?[0-9])';
	if ( preg_match( "/^(?:{$octet}\.){3}{$octet}$|^(?:{$octet}\.){1,3}\*$/", $host ) ) {
		$normalized = [];
		foreach ( explode( '.', $host ) as $piece ) {
			// The integer cast strips leading zeros; the wildcard is kept as-is.
			$normalized[] = $piece === '*' ? $piece : (int)$piece;
		}

		return 'V4.' . implode( '.', $normalized ) . '.';
	}

	// Must be a host name.
	$labels = explode( '.', $host );

	return implode( '.', array_reverse( $labels ) ) . '.';
}
165 
/**
 * Converts a URL into a format for el_index.
 *
 * The host is canonicalized with indexifyHost(); for mailto: URLs the
 * result has the form "domain.reversed@localpart". User and password are
 * left out of the index.
 *
 * @param string $url
 * @return string[] Usually one entry. Protocol-relative URLs yield an
 *   "http:" and an "https:" entry; an empty array is returned when the URL
 *   cannot be parsed.
 */
public static function makeIndexes( $url ) {
	// NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!

	// NOTE: refreshExternallinksIndex.php assumes that only protocol-relative URLs return more
	// than one index, and that the indexes for protocol-relative URLs only vary in the "http://"
	// versus "https://" prefix. If you change that, you'll likely need to update
	// refreshExternallinksIndex.php accordingly.

	$bits = wfParseUrl( $url );
	if ( !$bits ) {
		return [];
	}

	// Reverse the labels in the hostname, convert to lower case, unless it's an IP.
	// For emails turn it into "domain.reversed@localpart"
	if ( $bits['scheme'] == 'mailto' ) {
		$mailparts = explode( '@', $bits['host'], 2 );
		// No @ means it's a local part with no domain
		$domainpart = count( $mailparts ) === 2 ? self::indexifyHost( $mailparts[1] ) : '';
		$indexHost = $domainpart . '@' . $mailparts[0];
	} else {
		$indexHost = self::indexifyHost( $bits['host'] );
	}

	// Reconstruct the pseudo-URL.
	// Leave out user and password. Add the port, path, query and fragment
	$index = $bits['scheme'] . $bits['delimiter'] . $indexHost;
	if ( isset( $bits['port'] ) ) {
		$index .= ':' . $bits['port'];
	}
	$index .= $bits['path'] ?? '/';
	if ( isset( $bits['query'] ) ) {
		$index .= '?' . $bits['query'];
	}
	if ( isset( $bits['fragment'] ) ) {
		$index .= '#' . $bits['fragment'];
	}

	// An empty scheme means the URL was protocol-relative: index both protocols.
	return $bits['scheme'] == ''
		? [ "http:$index", "https:$index" ]
		: [ $index ];
}
225 
/**
 * Return query conditions which will match the specified string.
 *
 * Two conditions are produced: one on "{prefix}_index_60" (an equality when
 * the constant prefix of the pattern is at least 60 bytes long, a LIKE
 * prefix search otherwise) and a LIKE on "{prefix}_index".
 *
 * @param string $filterEntry Filter entry, see makeLikeArray()
 * @param array $options Options are:
 *   - protocol: (string) Protocol to query (default 'http://')
 *   - oneWildcard: (bool) Stop the LIKE pattern at the first wildcard (default false)
 *   - prefix: (string) Field prefix (default 'el')
 *   - db: Database to use for building the conditions; when empty,
 *     wfGetDB( DB_REPLICA ) is used
 * @return array|bool Conditions to be used for the query, or false when
 *   makeLikeArray() rejects the filter entry
 */
public static function getQueryConditions( $filterEntry, array $options = [] ) {
	$options += [
		'protocol' => 'http://',
		'oneWildcard' => false,
		'prefix' => 'el',
		'db' => null,
	];

	// First, get the like array
	$like = self::makeLikeArray( $filterEntry, $options['protocol'] );
	if ( $like === false ) {
		return false;
	}

	// Get the constant prefix (i.e. everything up to the first wildcard)
	$constantParts = self::keepOneWildcard( $like );
	if ( $options['oneWildcard'] ) {
		$like = $constantParts;
	}
	$lastPos = count( $constantParts ) - 1;
	if ( $constantParts[$lastPos] instanceof LikeMatch ) {
		// Drop the wildcard itself so only literal text remains.
		array_pop( $constantParts );
	}
	$index = implode( '', $constantParts );

	$p = $options['prefix'];
	$db = $options['db'] ?: wfGetDB( DB_REPLICA );

	// Build the query
	if ( strlen( $index ) >= 60 ) {
		// The constant prefix is larger than el_index_60, so we can use a
		// constant comparison.
		$conds = [ "{$p}_index_60" => substr( $index, 0, 60 ) ];
	} else {
		// The constant prefix is smaller than el_index_60, so we use a LIKE
		// for a prefix search.
		$conds = [ "{$p}_index_60" . $db->buildLike( $index, $db->anyString() ) ];
	}
	$conds[] = "{$p}_index" . $db->buildLike( $like );

	return $conds;
}
301 
/**
 * Make an array to be used for calls to Database::buildLike(), which
 * will match the specified string.
 *
 * A wildcard is accepted in two places: as the first label of the domain
 * ("*.example.com") or as the whole local part of a mailto: address
 * ("*@example.com"). An asterisk anywhere else in the result makes the
 * entry invalid.
 *
 * @param string $filterEntry Filter entry
 * @param string $protocol Protocol prefix (default 'http://')
 * @return array|bool Array to be passed to Database::buildLike(), or
 *   false if the entry cannot be parsed or has a stray asterisk
 */
public static function makeLikeArray( $filterEntry, $protocol = 'http://' ) {
	$db = wfGetDB( DB_REPLICA );

	$bits = wfParseUrl( $protocol . $filterEntry );
	if ( !$bits ) {
		return false;
	}

	$subdomains = false;
	// The strpos() result is deliberately tested for truthiness: an "@" at
	// offset 0 (empty local part) is handled like a missing "@".
	if ( $bits['scheme'] === 'mailto' && strpos( $bits['host'], '@' ) ) {
		// Email address with domain and non-empty local part
		[ $localPart, $domain ] = explode( '@', $bits['host'], 2 );
		$host = self::indexifyHost( $domain ) . '@';
		if ( $localPart === '*' ) {
			$subdomains = true;
		} else {
			$host .= $localPart;
		}
	} else {
		// Non-email, or email with only a domain part.
		$host = self::indexifyHost( $bits['host'] );
		if ( substr( $host, -3 ) === '.*.' ) {
			$subdomains = true;
			// Strip the trailing "*." so that the literal part ends with the dot.
			$host = substr( $host, 0, -2 );
		}
	}

	$like = [ $bits['scheme'] . $bits['delimiter'] . $host ];

	if ( $subdomains ) {
		$like[] = $db->anyString();
	}

	if ( isset( $bits['port'] ) ) {
		$like[] = ':' . $bits['port'];
	}
	if ( isset( $bits['path'] ) ) {
		$like[] = $bits['path'];
	} elseif ( !$subdomains ) {
		$like[] = '/';
	}
	if ( isset( $bits['query'] ) ) {
		$like[] = '?' . $bits['query'];
	}
	if ( isset( $bits['fragment'] ) ) {
		$like[] = '#' . $bits['fragment'];
	}

	// Check for stray asterisks: asterisk only allowed at the start of the domain
	foreach ( $like as $piece ) {
		if ( $piece instanceof LikeMatch ) {
			continue;
		}
		if ( strpos( $piece, '*' ) !== false ) {
			return false;
		}
	}

	// Add wildcard at the end if there isn't one already
	$lastPiece = $like[count( $like ) - 1];
	if ( !( $lastPiece instanceof LikeMatch ) ) {
		$like[] = $db->anyString();
	}

	return $like;
}
379 
/**
 * Filters an array returned by makeLikeArray(), removing everything past
 * the first pattern placeholder.
 *
 * The cut is made by element position rather than by array key, so the
 * result is also correct for arrays whose keys are not 0..n-1.
 *
 * @param array $arr Array to filter; any non-array value is returned unchanged
 * @return array Elements up to and including the first LikeMatch, or the
 *   whole array if it contains no LikeMatch
 */
public static function keepOneWildcard( $arr ) {
	if ( !is_array( $arr ) ) {
		return $arr;
	}

	$position = 0;
	foreach ( $arr as $value ) {
		$position++;
		if ( $value instanceof LikeMatch ) {
			// array_slice() takes a length, not a key: use the 1-based position.
			return array_slice( $arr, 0, $position );
		}
	}

	return $arr;
}
401 }
StringUtils\isUtf8
static isUtf8( $value)
Test whether a string is valid UTF-8.
Definition: StringUtils.php:44
LinkFilter\supportsIDN
static supportsIDN()
Indicate whether LinkFilter IDN support is available.
Definition: LinkFilter.php:90
LinkFilter\indexifyHost
static indexifyHost( $host)
Canonicalize a hostname for el_index.
Definition: LinkFilter.php:99
LinkFilter\getQueryConditions
static getQueryConditions( $filterEntry, array $options=[])
Return query conditions which will match the specified string.
Definition: LinkFilter.php:256
LinkFilter\matchEntry
static matchEntry(Content $content, $filterEntry, $protocol='http://')
Check whether $content contains a link to $filterEntry.
Definition: LinkFilter.php:51
LinkFilter\keepOneWildcard
static keepOneWildcard( $arr)
Filters an array returned by makeLikeArray(), removing everything past the first pattern placeholder.
Definition: LinkFilter.php:388
LinkFilter\makeIndexes
static makeIndexes( $url)
Converts a URL into a format for el_index.
Definition: LinkFilter.php:173
LinkFilter\VERSION
const VERSION
Increment this when makeIndexes output changes.
Definition: LinkFilter.php:41
wfParseUrl
wfParseUrl( $url)
parse_url() work-alike, but non-broken.
Definition: GlobalFunctions.php:791
wfGetDB
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
Definition: GlobalFunctions.php:2467
LinkFilter\makeLikeArray
static makeLikeArray( $filterEntry, $protocol='http://')
Make an array to be used for calls to Database::buildLike(), which will match the specified string.
Definition: LinkFilter.php:314
DB_REPLICA
const DB_REPLICA
Definition: defines.php:25
LinkFilter\makeRegex
static makeRegex( $filterEntry, $protocol)
Builds a regex pattern for $filterEntry.
Definition: LinkFilter.php:75
$content
$content
Definition: router.php:76
LinkFilter
Some functions to help implement an external link filter for spam control.
Definition: LinkFilter.php:36
Wikimedia\Rdbms\LikeMatch
Used by Database::buildLike() to represent characters that have special meaning in SQL LIKE clauses a...
Definition: LikeMatch.php:10
TextContent
Content object implementation for representing flat text.
Definition: TextContent.php:39
Content
Base interface for content objects.
Definition: Content.php:35