MediaWiki  master
LinkFilter.php
Go to the documentation of this file.
1 <?php
23 use Wikimedia\IPUtils;
25 
/**
 * Some functions to help implement an external link filter for spam control.
 *
 * The class turns URLs and filter entries into "index" strings in which the
 * host labels are reversed (e.g. "com.example."), so that a filter on a domain
 * and all of its subdomains becomes a plain prefix match.
 */
class LinkFilter {
	/**
	 * Increment this when makeIndexes output changes.
	 */
	public const VERSION = 1;

	/**
	 * Check whether $content contains a link to $filterEntry.
	 *
	 * @param Content $content Content to check. Only TextContent is inspected;
	 *  every other content type is reported as "no match".
	 * @param string $filterEntry Domain part of the filter. A leading "*." also
	 *  matches any subdomain.
	 * @param string $protocol Protocol prefix the link has to start with.
	 * @return int 1 if at least one match was found, 0 otherwise.
	 */
	public static function matchEntry( Content $content, $filterEntry, $protocol = 'http://' ) {
		if ( !( $content instanceof TextContent ) ) {
			// TODO: handle other types of content too.
			// Maybe create ContentHandler::matchFilter( LinkFilter ).
			// Think about a common base class for LinkFilter and MagicWord.
			return 0;
		}

		$text = $content->getText();
		$regex = self::makeRegex( $filterEntry, $protocol );

		// preg_match() yields false on a PCRE error; normalise that to 0 so
		// callers always receive the documented int.
		return (int)preg_match( $regex, $text );
	}

	/**
	 * Build the regular expression used by matchEntry().
	 *
	 * @param string $filterEntry Filter entry; a leading "*." is translated into
	 *  an optional run of subdomain labels.
	 * @param string $protocol Literal protocol prefix.
	 * @return string Regex delimited by "!" with the "S" and "i" modifiers.
	 */
	private static function makeRegex( $filterEntry, $protocol ) {
		$regex = '!' . preg_quote( $protocol, '!' );
		if ( substr( $filterEntry, 0, 2 ) === '*.' ) {
			// Either some subdomain labels followed by a dot, or nothing at all.
			$regex .= '(?:[A-Za-z0-9.-]+\.|)';
			$filterEntry = substr( $filterEntry, 2 );
		}
		$regex .= preg_quote( $filterEntry, '!' ) . '!Si';
		return $regex;
	}

	/**
	 * Indicate whether LinkFilter IDN support is available.
	 *
	 * @return bool True when idn_to_utf8() is callable and the UTS #46 variant
	 *  constant is defined.
	 */
	public static function supportsIDN() {
		return is_callable( 'idn_to_utf8' ) && defined( 'INTL_IDNA_VARIANT_UTS46' );
	}

	/**
	 * Canonicalize a host name and convert it to its index form.
	 *
	 * @param string $host Host name, IPv4 literal or bracketed IPv6 literal. A
	 *  trailing "*" component is accepted for IP literals.
	 * @return string "V6.<parts>." or "V4.<parts>." for IP literals, otherwise the
	 *  host labels in reverse order followed by a dot.
	 */
	private static function indexifyHost( $host ) {
		// NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!

		// Canonicalize.
		$host = rawurldecode( $host );
		if ( $host !== '' && self::supportsIDN() ) {
			// @todo Add a PHP fallback
			$tmp = idn_to_utf8( $host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46 );
			if ( $tmp !== false ) {
				$host = $tmp;
			}
		}
		$okChars = 'a-zA-Z0-9\\-._~!$&\'()*+,;=';
		if ( StringUtils::isUtf8( $host ) ) {
			// Save a little space by not percent-encoding valid UTF-8 bytes
			$okChars .= '\x80-\xf4';
		}
		// Lower-case the host, then percent-encode every byte outside the allowed set.
		$host = preg_replace_callback(
			'<[^' . $okChars . ']>',
			static function ( $m ) {
				return rawurlencode( $m[0] );
			},
			strtolower( $host )
		);

		// IPv6? RFC 3986 syntax.
		// The brackets and colons were percent-encoded above, so match against a decoded copy.
		if ( preg_match( '/^\[([0-9a-f:*]+)\]$/', rawurldecode( $host ), $m ) ) {
			$ip = $m[1];
			if ( IPUtils::isValid( $ip ) ) {
				return 'V6.' . implode( '.', explode( ':', IPUtils::sanitizeIP( $ip ) ) ) . '.';
			}
			if ( substr( $ip, -2 ) === ':*' ) {
				$cutIp = substr( $ip, 0, -2 );
				if ( IPUtils::isValid( "{$cutIp}::" ) ) {
					// Wildcard IP doesn't contain "::", so multiple parts can be wild
					$ct = count( explode( ':', $ip ) ) - 1;
					return 'V6.' .
						implode( '.', array_slice( explode( ':', IPUtils::sanitizeIP( "{$cutIp}::" ) ), 0, $ct ) ) .
						'.*.';
				}
				if ( IPUtils::isValid( "{$cutIp}:1" ) ) {
					// Wildcard IP does contain "::", so only the last part is wild
					return 'V6.' .
						substr( implode( '.', explode( ':', IPUtils::sanitizeIP( "{$cutIp}:1" ) ) ), 0, -1 ) .
						'*.';
				}
			}
		}

		// Regularize explicit specification of the DNS root.
		// Browsers seem to do this for IPv4 literals too.
		if ( substr( $host, -1 ) === '.' ) {
			$host = substr( $host, 0, -1 );
		}

		// IPv4?
		$b = '(?:0*25[0-5]|0*2[0-4][0-9]|0*1[0-9][0-9]|0*[0-9]?[0-9])';
		if ( preg_match( "/^(?:{$b}\.){3}{$b}$|^(?:{$b}\.){1,3}\*$/", $host ) ) {
			// The int cast drops leading zeros; a "*" component is kept verbatim.
			return 'V4.' . implode( '.', array_map( static function ( $v ) {
				return $v === '*' ? $v : (int)$v;
			}, explode( '.', $host ) ) ) . '.';
		}

		// Must be a host name.
		return implode( '.', array_reverse( explode( '.', $host ) ) ) . '.';
	}

	/**
	 * Converts a URL into a format for el_index.
	 *
	 * @param string $url URL to convert.
	 * @return string[] Index strings. Empty when the URL cannot be parsed, two
	 *  entries ("http:" and "https:" variants) when the parsed scheme is empty,
	 *  otherwise exactly one entry.
	 */
	public static function makeIndexes( $url ) {
		// NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!

		// NOTE: refreshExternallinksIndex.php assumes that only protocol-relative URLs return more
		// than one index, and that the indexes for protocol-relative URLs only vary in the "http://"
		// versus "https://" prefix. If you change that, you'll likely need to update
		// refreshExternallinksIndex.php accordingly.

		$bits = wfParseUrl( $url );
		if ( !$bits ) {
			return [];
		}

		// Reverse the labels in the hostname, convert to lower case, unless it's an IP.
		// For emails turn it into "domain.reversed@localpart"
		if ( $bits['scheme'] === 'mailto' ) {
			$mailparts = explode( '@', $bits['host'], 2 );
			if ( count( $mailparts ) === 2 ) {
				$domainpart = self::indexifyHost( $mailparts[1] );
			} else {
				// No @, assume it's a local part with no domain
				$domainpart = '';
			}
			$bits['host'] = $domainpart . '@' . $mailparts[0];
		} else {
			$bits['host'] = self::indexifyHost( $bits['host'] );
		}

		// Reconstruct the pseudo-URL
		$index = $bits['scheme'] . $bits['delimiter'] . $bits['host'];
		// Leave out user and password. Add the port, path, query and fragment
		if ( isset( $bits['port'] ) ) {
			$index .= ':' . $bits['port'];
		}
		$index .= $bits['path'] ?? '/';
		if ( isset( $bits['query'] ) ) {
			$index .= '?' . $bits['query'];
		}
		if ( isset( $bits['fragment'] ) ) {
			$index .= '#' . $bits['fragment'];
		}

		if ( $bits['scheme'] == '' ) {
			return [ "http:$index", "https:$index" ];
		} else {
			return [ $index ];
		}
	}

	/**
	 * Return query conditions which will match the specified string.
	 *
	 * @param string $filterEntry Filter entry, see makeLikeArray().
	 * @param array $options Recognised keys:
	 *  - protocol: (string) protocol prefix, default 'http://'
	 *  - oneWildcard: (bool) cut the LIKE pattern after its first wildcard,
	 *    default false
	 *  - prefix: (string) column name prefix, default 'el'
	 *  - db: database object used for buildLike()/anyString(); when empty,
	 *    wfGetDB( DB_REPLICA ) is used
	 * @return array|false Conditions on "<prefix>_index_60" and "<prefix>_index",
	 *  or false when the filter entry is invalid.
	 */
	public static function getQueryConditions( $filterEntry, array $options = [] ) {
		$options += [
			'protocol' => 'http://',
			'oneWildcard' => false,
			'prefix' => 'el',
			'db' => null,
		];

		$p = $options['prefix'];
		$db = $options['db'] ?: wfGetDB( DB_REPLICA );

		// First, get the like array, built with the same connection that will
		// render it so that a caller-supplied 'db' is honoured throughout.
		$like = self::buildLikeArray( $filterEntry, $options['protocol'], $db );
		if ( $like === false ) {
			return $like;
		}

		// Get the constant prefix (i.e. everything up to the first wildcard)
		$trimmedLike = self::keepOneWildcard( $like );
		if ( $options['oneWildcard'] ) {
			$like = $trimmedLike;
		}
		if ( $trimmedLike[count( $trimmedLike ) - 1] instanceof LikeMatch ) {
			array_pop( $trimmedLike );
		}
		$index = implode( '', $trimmedLike );

		// Build the query
		$l = strlen( $index );
		if ( $l >= 60 ) {
			// The constant prefix is larger than el_index_60, so we can use a
			// constant comparison.
			return [
				"{$p}_index_60" => substr( $index, 0, 60 ),
				"{$p}_index" . $db->buildLike( $like ),
			];
		}

		// The constant prefix is smaller than el_index_60, so we use a LIKE
		// for a prefix search.
		return [
			"{$p}_index_60" . $db->buildLike( $index, $db->anyString() ),
			"{$p}_index" . $db->buildLike( $like ),
		];
	}

	/**
	 * Make an array to be used for calls to Database::buildLike(), which
	 * will match the specified string.
	 *
	 * @param string $filterEntry Domain part of the filter, optionally followed
	 *  by port, path, query and fragment. An asterisk is only accepted as the
	 *  leading host component (or as the whole local part of a mailto address).
	 * @param string $protocol Protocol prefix prepended to $filterEntry.
	 * @return array|false Strings and LikeMatch objects, always ending in a
	 *  wildcard; false when the entry cannot be parsed or has a stray asterisk.
	 */
	public static function makeLikeArray( $filterEntry, $protocol = 'http://' ) {
		return self::buildLikeArray( $filterEntry, $protocol, wfGetDB( DB_REPLICA ) );
	}

	/**
	 * Shared implementation of makeLikeArray() with an explicit connection.
	 *
	 * @param string $filterEntry See makeLikeArray().
	 * @param string $protocol See makeLikeArray().
	 * @param object $db Database object providing anyString().
	 * @return array|false See makeLikeArray().
	 */
	private static function buildLikeArray( $filterEntry, $protocol, $db ) {
		$like = [];

		$target = $protocol . $filterEntry;
		$bits = wfParseUrl( $target );
		if ( !$bits ) {
			return false;
		}

		$subdomains = false;
		// strpos() is used for its truthiness on purpose: both "no @" (false)
		// and "@ at position 0" (empty local part) take the else branch.
		if ( $bits['scheme'] === 'mailto' && strpos( $bits['host'], '@' ) ) {
			// Email address with domain and non-empty local part
			$mailparts = explode( '@', $bits['host'], 2 );
			$domainpart = self::indexifyHost( $mailparts[1] );
			if ( $mailparts[0] === '*' ) {
				$subdomains = true;
				$bits['host'] = $domainpart . '@';
			} else {
				$bits['host'] = $domainpart . '@' . $mailparts[0];
			}
		} else {
			// Non-email, or email with only a domain part.
			$bits['host'] = self::indexifyHost( $bits['host'] );
			if ( substr( $bits['host'], -3 ) === '.*.' ) {
				$subdomains = true;
				// Drop the trailing "*." and keep the dot in front of it.
				$bits['host'] = substr( $bits['host'], 0, -2 );
			}
		}

		$like[] = $bits['scheme'] . $bits['delimiter'] . $bits['host'];

		if ( $subdomains ) {
			$like[] = $db->anyString();
		}

		if ( isset( $bits['port'] ) ) {
			$like[] = ':' . $bits['port'];
		}
		if ( isset( $bits['path'] ) ) {
			$like[] = $bits['path'];
		} elseif ( !$subdomains ) {
			$like[] = '/';
		}
		if ( isset( $bits['query'] ) ) {
			$like[] = '?' . $bits['query'];
		}
		if ( isset( $bits['fragment'] ) ) {
			$like[] = '#' . $bits['fragment'];
		}

		// Check for stray asterisks: asterisk only allowed at the start of the domain
		foreach ( $like as $likepart ) {
			if ( !( $likepart instanceof LikeMatch ) && strpos( $likepart, '*' ) !== false ) {
				return false;
			}
		}

		if ( !( $like[count( $like ) - 1] instanceof LikeMatch ) ) {
			// Add wildcard at the end if there isn't one already
			$like[] = $db->anyString();
		}

		return $like;
	}

	/**
	 * Filters an array returned by makeLikeArray(), removing everything past
	 * first pattern placeholder.
	 *
	 * @param array|mixed $arr Array of strings and LikeMatch objects. Anything
	 *  that is not an array is returned untouched.
	 * @return array|mixed The leading elements up to and including the first
	 *  LikeMatch, or the input itself when it contains none.
	 */
	public static function keepOneWildcard( $arr ) {
		if ( !is_array( $arr ) ) {
			return $arr;
		}

		// Slice by position rather than by key so that the result is also
		// correct for arrays whose keys are not a 0-based list.
		$length = 0;
		foreach ( $arr as $value ) {
			$length++;
			if ( $value instanceof LikeMatch ) {
				return array_slice( $arr, 0, $length );
			}
		}

		return $arr;
	}
}
wfParseUrl( $url)
parse_url() work-alike, but non-broken.
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
Some functions to help implement an external link filter for spam control.
Definition: LinkFilter.php:36
static makeIndexes( $url)
Converts a URL into a format for el_index.
Definition: LinkFilter.php:173
static makeLikeArray( $filterEntry, $protocol='http://')
Make an array to be used for calls to Database::buildLike(), which will match the specified string.
Definition: LinkFilter.php:310
static getQueryConditions( $filterEntry, array $options=[])
Return query conditions which will match the specified string.
Definition: LinkFilter.php:252
static supportsIDN()
Indicate whether LinkFilter IDN support is available.
Definition: LinkFilter.php:90
const VERSION
Increment this when makeIndexes output changes.
Definition: LinkFilter.php:41
static matchEntry(Content $content, $filterEntry, $protocol='http://')
Check whether $content contains a link to $filterEntry.
Definition: LinkFilter.php:51
static keepOneWildcard( $arr)
Filters an array returned by makeLikeArray(), removing everything past first pattern placeholder.
Definition: LinkFilter.php:384
static isUtf8( $value)
Test whether a string is valid UTF-8.
Definition: StringUtils.php:44
Content object implementation for representing flat text.
Definition: TextContent.php:40
Used by Database::buildLike() to represent characters that have special meaning in SQL LIKE clauses and thus need no escaping.
Definition: LikeMatch.php:10
Base interface for content objects.
Definition: Content.php:35
const DB_REPLICA
Definition: defines.php:26
$content
Definition: router.php:76