MediaWiki  1.34.0
LinkFilter.php
Go to the documentation of this file.
1 <?php
23 
/**
 * Some functions to help implement an external link filter for spam control.
 *
 * URLs are converted into an "index" form (see makeIndexes()) in which the
 * labels of the host name are reversed, so that a filter for a domain and all
 * of its subdomains becomes a plain prefix search on the el_index column.
 */
class LinkFilter {
	/**
	 * Increment this when makeIndexes output changes.
	 */
	const VERSION = 1;

	/**
	 * Check whether $content contains a link to $filterEntry.
	 *
	 * @param Content $content Content to check; only TextContent is supported
	 * @param string $filterEntry Domain part of the filter, optionally
	 *  starting with "*." to also match subdomains
	 * @param string $protocol Protocol prefix that must precede the domain
	 * @return int|false 0 if $content is not a TextContent or nothing matched,
	 *  otherwise whatever preg_match() returns for the generated pattern
	 */
	public static function matchEntry( Content $content, $filterEntry, $protocol = 'http://' ) {
		if ( !( $content instanceof TextContent ) ) {
			// TODO: handle other types of content too.
			// Maybe create ContentHandler::matchFilter( LinkFilter ).
			// Think about a common base class for LinkFilter and MagicWord.
			return 0;
		}

		$text = $content->getText();
		$pattern = self::makeRegex( $filterEntry, $protocol );

		return preg_match( $pattern, $text );
	}

	/**
	 * Builds a regex pattern for $filterEntry.
	 *
	 * The pattern is delimited by "!" and carries the "S" and "i" modifiers.
	 * A leading "*." in $filterEntry is turned into an optional run of
	 * subdomain labels; everything else is matched literally.
	 *
	 * @param string $filterEntry Filter entry, optionally starting with "*."
	 * @param string $protocol Protocol prefix, matched literally
	 * @return string Regular expression
	 */
	private static function makeRegex( $filterEntry, $protocol ) {
		$pattern = '!' . preg_quote( $protocol, '!' );

		if ( substr( $filterEntry, 0, 2 ) === '*.' ) {
			// Zero or more subdomain labels, each terminated by a dot
			$pattern .= '(?:[A-Za-z0-9.-]+\.|)';
			$filterEntry = substr( $filterEntry, 2 );
		}

		return $pattern . preg_quote( $filterEntry, '!' ) . '!Si';
	}

	/**
	 * Indicate whether LinkFilter IDN support is available.
	 *
	 * @return bool True when idn_to_utf8() is callable and the
	 *  INTL_IDNA_VARIANT_UTS46 constant is defined
	 */
	public static function supportsIDN() {
		return is_callable( 'idn_to_utf8' ) && defined( 'INTL_IDNA_VARIANT_UTS46' );
	}

	/**
	 * Canonicalize a hostname for el_index.
	 *
	 * The result is one of:
	 *  - "V6." followed by the dot-separated groups of an IPv6 literal,
	 *  - "V4." followed by the dot-separated octets of an IPv4 address,
	 *  - the labels of a host name in reverse order,
	 * always terminated by a dot. A trailing "*" component is preserved so
	 * that callers can recognise wildcard filters.
	 *
	 * @param string $host Host name or IP literal, possibly percent-encoded
	 * @return string Canonical form for use in the index
	 */
	private static function indexifyHost( $host ) {
		// NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!

		// Canonicalize.
		$host = rawurldecode( $host );
		if ( $host !== '' && self::supportsIDN() ) {
			// @todo Add a PHP fallback
			$unicodeHost = idn_to_utf8( $host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46 );
			if ( $unicodeHost !== false ) {
				$host = $unicodeHost;
			}
		}

		// Characters that are left alone; everything else gets percent-encoded.
		$okChars = 'a-zA-Z0-9\\-._~!$&\'()*+,;=';
		if ( StringUtils::isUtf8( $host ) ) {
			// Save a little space by not percent-encoding valid UTF-8 bytes
			$okChars .= '\x80-\xf4';
		}
		$host = preg_replace_callback(
			'<[^' . $okChars . ']>',
			function ( $m ) {
				return rawurlencode( $m[0] );
			},
			strtolower( $host )
		);

		// IPv6? RFC 3986 syntax.
		// The brackets were percent-encoded above, hence the decode before matching.
		if ( preg_match( '/^\[([0-9a-f:*]+)\]$/', rawurldecode( $host ), $m ) ) {
			$v6Index = self::indexifyIPv6( $m[1] );
			if ( $v6Index !== null ) {
				return $v6Index;
			}
			// Not a usable IPv6 literal: fall through and treat it like a host name.
		}

		// Regularize explicit specification of the DNS root.
		// Browsers seem to do this for IPv4 literals too.
		if ( substr( $host, -1 ) === '.' ) {
			$host = substr( $host, 0, -1 );
		}

		// IPv4?
		// Either four octets, or one to three octets followed by a "*" wildcard.
		$b = '(?:0*25[0-5]|0*2[0-4][0-9]|0*1[0-9][0-9]|0*[0-9]?[0-9])';
		if ( preg_match( "/^(?:{$b}\.){3}{$b}$|^(?:{$b}\.){1,3}\*$/", $host ) ) {
			$octets = array_map( function ( $v ) {
				// The integer cast strips leading zeros
				return $v === '*' ? $v : (int)$v;
			}, explode( '.', $host ) );

			return 'V4.' . implode( '.', $octets ) . '.';
		}

		// Must be a host name.
		return implode( '.', array_reverse( explode( '.', $host ) ) ) . '.';
	}

	/**
	 * Canonicalize the inside of a bracketed IPv6 literal for el_index.
	 *
	 * Helper for indexifyHost(). Handles full addresses as well as addresses
	 * whose last component is the "*" wildcard.
	 *
	 * @param string $ip Lower-case text between "[" and "]"
	 * @return string|null Index form, or null if $ip is neither a valid IPv6
	 *  address nor a supported wildcard form
	 */
	private static function indexifyIPv6( $ip ) {
		if ( IP::isValid( $ip ) ) {
			return 'V6.' . str_replace( ':', '.', IP::sanitizeIP( $ip ) ) . '.';
		}

		if ( substr( $ip, -2 ) !== ':*' ) {
			return null;
		}

		$cutIp = substr( $ip, 0, -2 );

		if ( IP::isValid( "{$cutIp}::" ) ) {
			// Wildcard IP doesn't contain "::", so multiple parts can be wild
			$ct = count( explode( ':', $ip ) ) - 1;
			$groups = array_slice( explode( ':', IP::sanitizeIP( "{$cutIp}::" ) ), 0, $ct );

			return 'V6.' . implode( '.', $groups ) . '.*.';
		}

		if ( IP::isValid( "{$cutIp}:1" ) ) {
			// Wildcard IP does contain "::", so only the last part is wild.
			// The placeholder "1" group is chopped off again and replaced by "*".
			$dotted = str_replace( ':', '.', IP::sanitizeIP( "{$cutIp}:1" ) );

			return 'V6.' . substr( $dotted, 0, -1 ) . '*.';
		}

		return null;
	}

	/**
	 * Converts a URL into a format for el_index.
	 *
	 * @param string $url
	 * @return string[] Index strings for the URL: empty if the URL cannot be
	 *  parsed, two entries ("http:" and "https:") for protocol-relative URLs,
	 *  one entry otherwise
	 */
	public static function makeIndexes( $url ) {
		// NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!

		// NOTE: refreshExternallinksIndex.php assumes that only protocol-relative URLs return more
		// than one index, and that the indexes for protocol-relative URLs only vary in the "http://"
		// versus "https://" prefix. If you change that, you'll likely need to update
		// refreshExternallinksIndex.php accordingly.

		$bits = wfParseUrl( $url );
		if ( !$bits ) {
			return [];
		}

		// Reverse the labels in the hostname, convert to lower case, unless it's an IP.
		// For emails turn it into "domain.reversed@localpart"
		if ( $bits['scheme'] === 'mailto' ) {
			$mailparts = explode( '@', $bits['host'], 2 );
			// No @ means a local part with no domain
			$domainpart = count( $mailparts ) === 2 ? self::indexifyHost( $mailparts[1] ) : '';
			$host = $domainpart . '@' . $mailparts[0];
		} else {
			$host = self::indexifyHost( $bits['host'] );
		}

		// Reconstruct the pseudo-URL.
		// Leave out user and password. Add the port, path, query and fragment
		$index = $bits['scheme'] . $bits['delimiter'] . $host;
		if ( isset( $bits['port'] ) ) {
			$index .= ':' . $bits['port'];
		}
		$index .= isset( $bits['path'] ) ? $bits['path'] : '/';
		if ( isset( $bits['query'] ) ) {
			$index .= '?' . $bits['query'];
		}
		if ( isset( $bits['fragment'] ) ) {
			$index .= '#' . $bits['fragment'];
		}

		if ( $bits['scheme'] === '' ) {
			// Protocol-relative URL: index it under both protocols
			return [ "http:$index", "https:$index" ];
		}

		return [ $index ];
	}

	/**
	 * Return query conditions which will match the specified string.
	 *
	 * @param string $filterEntry Filter entry, see makeLikeArray()
	 * @param array $options Options are:
	 *  - protocol: (string) Protocol to query (default 'http://')
	 *  - oneWildcard: (bool) Stop at the first wildcard (default false)
	 *  - prefix: (string) Field prefix (default 'el'); the conditions are
	 *    built on "{prefix}_index" and "{prefix}_index_60"
	 *  - db: Database to use for building the LIKE clauses; defaults to
	 *    wfGetDB( DB_REPLICA )
	 * @return array|false Query conditions, or false if makeLikeArray()
	 *  rejected the filter entry
	 */
	public static function getQueryConditions( $filterEntry, array $options = [] ) {
		$options += [
			'protocol' => 'http://',
			'oneWildcard' => false,
			'prefix' => 'el',
			'db' => null,
		];

		// First, get the like array
		$like = self::makeLikeArray( $filterEntry, $options['protocol'] );
		if ( $like === false ) {
			return false;
		}

		// Get the constant prefix (i.e. everything up to the first wildcard)
		$trimmedLike = self::keepOneWildcard( $like );
		if ( $options['oneWildcard'] ) {
			$like = $trimmedLike;
		}
		if ( end( $trimmedLike ) instanceof LikeMatch ) {
			array_pop( $trimmedLike );
		}
		$index = implode( '', $trimmedLike );

		$p = $options['prefix'];
		$db = $options['db'] ?: wfGetDB( DB_REPLICA );
		$fullCondition = "{$p}_index" . $db->buildLike( $like );

		// Build the query
		if ( strlen( $index ) >= 60 ) {
			// The constant prefix is larger than el_index_60, so we can use a
			// constant comparison.
			return [
				"{$p}_index_60" => substr( $index, 0, 60 ),
				$fullCondition,
			];
		}

		// The constant prefix is smaller than el_index_60, so we use a LIKE
		// for a prefix search.
		return [
			"{$p}_index_60" . $db->buildLike( $index, $db->anyString() ),
			$fullCondition,
		];
	}

	/**
	 * Make an array to be used for calls to Database::buildLike(), which
	 * will match the specified string.
	 *
	 * An asterisk is only accepted as a wildcard at the start of the domain
	 * (or as the whole local part of a mailto address); an asterisk anywhere
	 * else makes the filter entry invalid.
	 *
	 * @param string $filterEntry Domain part of the filter, optionally with a
	 *  port, path, query and fragment
	 * @param string $protocol Protocol, prepended to $filterEntry before parsing
	 * @return array|false Array of strings and LikeMatch objects always ending
	 *  in a wildcard, or false if the URL cannot be parsed or contains a
	 *  stray asterisk
	 */
	public static function makeLikeArray( $filterEntry, $protocol = 'http://' ) {
		$db = wfGetDB( DB_REPLICA );

		$bits = wfParseUrl( $protocol . $filterEntry );
		if ( !$bits ) {
			return false;
		}

		$subdomains = false;
		$atPos = $bits['scheme'] === 'mailto' ? strpos( $bits['host'], '@' ) : false;
		if ( $atPos !== false && $atPos > 0 ) {
			// Email address with domain and non-empty local part
			$mailparts = explode( '@', $bits['host'], 2 );
			$domainpart = self::indexifyHost( $mailparts[1] );
			if ( $mailparts[0] === '*' ) {
				// Any local part at this domain
				$subdomains = true;
				$host = $domainpart . '@';
			} else {
				$host = $domainpart . '@' . $mailparts[0];
			}
		} else {
			// Non-email, or email with only a domain part.
			$host = self::indexifyHost( $bits['host'] );
			if ( substr( $host, -3 ) === '.*.' ) {
				// Drop the "*." but keep the dot in front of it
				$subdomains = true;
				$host = substr( $host, 0, -2 );
			}
		}

		$like = [ $bits['scheme'] . $bits['delimiter'] . $host ];

		if ( $subdomains ) {
			$like[] = $db->anyString();
		}

		if ( isset( $bits['port'] ) ) {
			$like[] = ':' . $bits['port'];
		}
		if ( isset( $bits['path'] ) ) {
			$like[] = $bits['path'];
		} elseif ( !$subdomains ) {
			$like[] = '/';
		}
		if ( isset( $bits['query'] ) ) {
			$like[] = '?' . $bits['query'];
		}
		if ( isset( $bits['fragment'] ) ) {
			$like[] = '#' . $bits['fragment'];
		}

		// Check for stray asterisks: asterisk only allowed at the start of the domain
		foreach ( $like as $likepart ) {
			if ( !( $likepart instanceof LikeMatch ) && strpos( $likepart, '*' ) !== false ) {
				return false;
			}
		}

		if ( !( end( $like ) instanceof LikeMatch ) ) {
			// Add wildcard at the end if there isn't one already
			$like[] = $db->anyString();
		}

		return $like;
	}

	/**
	 * Filters an array returned by makeLikeArray(), removing everything past
	 * first pattern placeholder.
	 *
	 * The cut-off is determined by the element's position rather than its
	 * key, so arrays with non-sequential keys are truncated correctly too.
	 *
	 * @param array|mixed $arr Array to filter; anything that is not an array
	 *  is returned unchanged
	 * @return array|mixed Elements up to and including the first LikeMatch,
	 *  or $arr itself if it contains no LikeMatch
	 */
	public static function keepOneWildcard( $arr ) {
		if ( !is_array( $arr ) ) {
			return $arr;
		}

		$length = 0;
		foreach ( $arr as $value ) {
			$length++;
			if ( $value instanceof LikeMatch ) {
				return array_slice( $arr, 0, $length );
			}
		}

		return $arr;
	}
}
StringUtils\isUtf8
static isUtf8( $value)
Test whether a string is valid UTF-8.
Definition: StringUtils.php:44
LinkFilter\supportsIDN
static supportsIDN()
Indicate whether LinkFilter IDN support is available.
Definition: LinkFilter.php:88
LinkFilter\indexifyHost
static indexifyHost( $host)
Canonicalize a hostname for el_index.
Definition: LinkFilter.php:97
LinkFilter\getQueryConditions
static getQueryConditions( $filterEntry, array $options=[])
Return query conditions which will match the specified string.
Definition: LinkFilter.php:254
LinkFilter\matchEntry
static matchEntry(Content $content, $filterEntry, $protocol='http://')
Check whether $content contains a link to $filterEntry.
Definition: LinkFilter.php:49
LinkFilter\keepOneWildcard
static keepOneWildcard( $arr)
Filters an array returned by makeLikeArray(), removing everything past first pattern placeholder.
Definition: LinkFilter.php:386
LinkFilter\makeIndexes
static makeIndexes( $url)
Converts a URL into a format for el_index.
Definition: LinkFilter.php:171
LinkFilter\VERSION
const VERSION
Increment this when makeIndexes output changes.
Definition: LinkFilter.php:39
wfParseUrl
wfParseUrl( $url)
parse_url() work-alike, but non-broken.
Definition: GlobalFunctions.php:793
wfGetDB
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
Definition: GlobalFunctions.php:2575
LinkFilter\makeLikeArray
static makeLikeArray( $filterEntry, $protocol='http://')
Make an array to be used for calls to Database::buildLike(), which will match the specified string.
Definition: LinkFilter.php:312
DB_REPLICA
const DB_REPLICA
Definition: defines.php:25
LinkFilter\makeRegex
static makeRegex( $filterEntry, $protocol)
Builds a regex pattern for $filterEntry.
Definition: LinkFilter.php:73
$content
$content
Definition: router.php:78
LinkFilter
Some functions to help implement an external link filter for spam control.
Definition: LinkFilter.php:34
Wikimedia\Rdbms\LikeMatch
Used by Database::buildLike() to represent characters that have special meaning in SQL LIKE clauses and thus need no escaping.
Definition: LikeMatch.php:10
IP\isValid
static isValid( $ip)
Validate an IP address.
Definition: IP.php:111
TextContent
Content object implementation for representing flat text.
Definition: TextContent.php:37
Content
Base interface for content objects.
Definition: Content.php:34
IP\sanitizeIP
static sanitizeIP( $ip)
Convert an IP into a verbose, uppercase, normalized form.
Definition: IP.php:139