MediaWiki REL1_39
LinkFilter.php
Go to the documentation of this file.
1<?php
23use Wikimedia\IPUtils;
25
/**
 * Increment this when makeIndexes output changes.
 */
41 public const VERSION = 1;
42
/**
 * Check whether $content contains a link to $filterEntry.
 *
 * Only TextContent is inspected; every other content type is reported as
 * "no match".
 *
 * @param Content $content Content to check
 * @param string $filterEntry Domain pattern, optionally starting with "*."
 * @param string $protocol Protocol prefix, e.g. 'http://' or 'https://'
 * @return int|false Result of preg_match() on the content text (0 for non-text content)
 */
public static function matchEntry( Content $content, $filterEntry, $protocol = 'http://' ) {
	if ( $content instanceof TextContent ) {
		$haystack = $content->getText();
		$pattern = self::makeRegex( $filterEntry, $protocol );

		return preg_match( $pattern, $haystack );
	}

	// TODO: handle other types of content too.
	// Maybe create ContentHandler::matchFilter( LinkFilter ).
	// Think about a common base class for LinkFilter and MagicWord.
	return 0;
}
64
/**
 * Build a case-insensitive regular expression matching links to $filterEntry.
 *
 * A leading "*." in the filter entry makes the expression also accept any
 * chain of subdomains (or none at all) in front of the remaining domain.
 *
 * @param string $filterEntry Domain pattern, optionally starting with "*."
 * @param string $protocol Protocol prefix, e.g. 'http://' or 'https://'
 * @return string Regular expression delimited by "!" with the "S" and "i" modifiers
 */
private static function makeRegex( $filterEntry, $protocol ) {
	$subdomainPattern = '';
	$domain = $filterEntry;

	if ( strncmp( $filterEntry, '*.', 2 ) === 0 ) {
		// Wildcard entry: allow optional subdomain labels, then match the bare domain.
		$subdomainPattern = '(?:[A-Za-z0-9.-]+\.|)';
		$domain = substr( $filterEntry, 2 );
	}

	// "!" is the delimiter, so both quoted pieces must escape it as well.
	return '!'
		. preg_quote( $protocol, '!' )
		. $subdomainPattern
		. preg_quote( $domain, '!' )
		. '!Si';
}
84
/**
 * Indicate whether LinkFilter IDN support is available.
 *
 * @return bool True when idn_to_utf8() is callable and the
 *  INTL_IDNA_VARIANT_UTS46 constant is defined
 */
public static function supportsIDN() {
	if ( !is_callable( 'idn_to_utf8' ) ) {
		return false;
	}

	return defined( 'INTL_IDNA_VARIANT_UTS46' );
}
93
/**
 * Canonicalize a host and convert it to the form used in index strings.
 *
 * The host is percent-decoded, converted with idn_to_utf8() when IDN support
 * is available, lower-cased, and every byte outside the allowed character set
 * is percent-encoded again. The result is then rendered as one of:
 *  - 'V6.' followed by the dot-separated groups of the sanitized address and a
 *    trailing '.', for a bracketed IPv6 literal (a trailing ":*" wildcard is
 *    rendered as a final '*' element);
 *  - 'V4.' followed by the dot-separated octets (leading zeros dropped) and a
 *    trailing '.', for an IPv4 literal, where the last element may be '*';
 *  - the dot-separated labels in reverse order followed by '.', otherwise.
 *
 * @param string $host Host as it appears in a URL
 * @return string Host in index form
 */
99 private static function indexifyHost( $host ) {
100 // NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!
101
102 // Canonicalize.
103 $host = rawurldecode( $host );
104 if ( $host !== '' && self::supportsIDN() ) {
105 // @todo Add a PHP fallback
106 $tmp = idn_to_utf8( $host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46 );
// Keep the decoded host unchanged if the IDN conversion fails.
107 if ( $tmp !== false ) {
108 $host = $tmp;
109 }
110 }
// Characters that are left unencoded in the canonical host.
111 $okChars = 'a-zA-Z0-9\\-._~!$&\'()*+,;=';
112 if ( StringUtils::isUtf8( $host ) ) {
113 // Save a little space by not percent-encoding valid UTF-8 bytes
114 $okChars .= '\x80-\xf4';
115 }
// Lower-case the host, then percent-encode every character outside $okChars.
116 $host = preg_replace_callback(
117 '<[^' . $okChars . ']>',
118 static function ( $m ) {
119 return rawurlencode( $m[0] );
120 },
121 strtolower( $host )
122 );
123
124 // IPv6? RFC 3986 syntax.
// "[", "]" and ":" are not in $okChars and were percent-encoded above, so the
// pattern is applied to a decoded copy of the host.
125 if ( preg_match( '/^\[([0-9a-f:*]+)\]$/', rawurldecode( $host ), $m ) ) {
126 $ip = $m[1];
127 if ( IPUtils::isValid( $ip ) ) {
128 return 'V6.' . implode( '.', explode( ':', IPUtils::sanitizeIP( $ip ) ) ) . '.';
129 }
// Not a plain address: check for a trailing ":*" wildcard.
130 if ( substr( $ip, -2 ) === ':*' ) {
131 $cutIp = substr( $ip, 0, -2 );
132 if ( IPUtils::isValid( "{$cutIp}::" ) ) {
133 // Wildcard IP doesn't contain "::", so multiple parts can be wild
134 $ct = count( explode( ':', $ip ) ) - 1;
135 return 'V6.' .
136 implode( '.', array_slice( explode( ':', IPUtils::sanitizeIP( "{$cutIp}::" ) ), 0, $ct ) ) .
137 '.*.';
138 }
139 if ( IPUtils::isValid( "{$cutIp}:1" ) ) {
140 // Wildcard IP does contain "::", so only the last part is wild
141 return 'V6.' .
142 substr( implode( '.', explode( ':', IPUtils::sanitizeIP( "{$cutIp}:1" ) ) ), 0, -1 ) .
143 '*.';
144 }
145 }
146 }
147
148 // Regularize explicit specification of the DNS root.
149 // Browsers seem to do this for IPv4 literals too.
150 if ( substr( $host, -1 ) === '.' ) {
151 $host = substr( $host, 0, -1 );
152 }
153
154 // IPv4?
// $b matches a single octet in the range 0-255, allowing leading zeros.
155 $b = '(?:0*25[0-5]|0*2[0-4][0-9]|0*1[0-9][0-9]|0*[0-9]?[0-9])';
// Accept either four octets, or one to three octets followed by a "*" wildcard.
156 if ( preg_match( "/^(?:{$b}\.){3}{$b}$|^(?:{$b}\.){1,3}\*$/", $host ) ) {
// The integer cast drops leading zeros; the wildcard is kept as is.
157 return 'V4.' . implode( '.', array_map( static function ( $v ) {
158 return $v === '*' ? $v : (int)$v;
159 }, explode( '.', $host ) ) ) . '.';
160 }
161
162 // Must be a host name.
// Reverse the order of the labels and terminate the result with a dot.
163 return implode( '.', array_reverse( explode( '.', $host ) ) ) . '.';
164 }
165
/**
 * Converts a URL into a format for el_index.
 *
 * The host is replaced by its indexified form (for mailto: links this becomes
 * "domain.reversed@localpart"), user and password are left out, and a
 * missing path is replaced by "/".
 *
 * @param string $url
 * @return string[] Empty if the URL cannot be parsed; two entries ("http:" and
 *  "https:" variants) if the URL has an empty scheme; one entry otherwise
 */
public static function makeIndexes( $url ) {
	// NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!

	// NOTE: refreshExternallinksIndex.php assumes that only protocol-relative URLs return more
	// than one index, and that the indexes for protocol-relative URLs only vary in the "http://"
	// versus "https://" prefix. If you change that, you'll likely need to update
	// refreshExternallinksIndex.php accordingly.

	$parsed = wfParseUrl( $url );
	if ( !$parsed ) {
		return [];
	}

	$scheme = $parsed['scheme'];

	if ( $scheme == 'mailto' ) {
		// The parsed "host" holds the address; split it into local part and domain.
		$addressParts = explode( '@', $parsed['host'], 2 );
		// Without an "@", treat the whole thing as a local part with no domain.
		$domainIndex = isset( $addressParts[1] )
			? self::indexifyHost( $addressParts[1] )
			: '';
		$hostIndex = $domainIndex . '@' . $addressParts[0];
	} else {
		$hostIndex = self::indexifyHost( $parsed['host'] );
	}

	// Reassemble the pseudo-URL: user and password are dropped on purpose, while
	// port, path, query and fragment are carried over.
	$index = $scheme . $parsed['delimiter'] . $hostIndex
		. ( isset( $parsed['port'] ) ? ':' . $parsed['port'] : '' )
		. ( $parsed['path'] ?? '/' )
		. ( isset( $parsed['query'] ) ? '?' . $parsed['query'] : '' )
		. ( isset( $parsed['fragment'] ) ? '#' . $parsed['fragment'] : '' );

	if ( $scheme == '' ) {
		// Empty scheme: index the URL under both protocols.
		return [ "http:$index", "https:$index" ];
	}

	return [ $index ];
}
221
/**
 * Return query conditions which will match the specified string.
 *
 * Two conditions are produced: one on the "{prefix}_index_60" column (an
 * equality test when the constant prefix is at least 60 bytes long, a LIKE
 * prefix search otherwise) and a LIKE condition on the "{prefix}_index" column.
 *
 * @param string $filterEntry Filter entry, passed on to makeLikeArray()
 * @param array $options Recognized keys:
 *  - 'protocol': protocol prefix handed to makeLikeArray(), default 'http://'
 *  - 'oneWildcard': if true, cut the LIKE pattern after its first wildcard, default false
 *  - 'prefix': column name prefix, default 'el'
 *  - 'db': database object used to build the LIKE clauses; when not given,
 *    wfGetDB( DB_REPLICA ) is used
 * @return array|false Query conditions, or false if makeLikeArray() rejects the entry
 */
public static function getQueryConditions( $filterEntry, array $options = [] ) {
	$defaults = [
		'protocol' => 'http://',
		'oneWildcard' => false,
		'prefix' => 'el',
		'db' => null,
	];
	$options = $options + $defaults;

	// Full LIKE pattern for the entry.
	$likePattern = self::makeLikeArray( $filterEntry, $options['protocol'] );
	if ( $likePattern === false ) {
		return false;
	}

	// Everything up to and including the first wildcard.
	$leading = self::keepOneWildcard( $likePattern );
	if ( $options['oneWildcard'] ) {
		$likePattern = $leading;
	}

	// Strip the wildcard itself to obtain the constant prefix string.
	$lastPos = count( $leading ) - 1;
	if ( $leading[$lastPos] instanceof LikeMatch ) {
		array_pop( $leading );
	}
	$constantPrefix = implode( '', $leading );

	$indexColumn = $options['prefix'] . '_index';
	$index60Column = $options['prefix'] . '_index_60';
	$dbr = $options['db'] ?: wfGetDB( DB_REPLICA );

	if ( strlen( $constantPrefix ) < 60 ) {
		// The constant prefix is shorter than the 60-byte column, so do a
		// LIKE prefix search on it.
		return [
			$index60Column . $dbr->buildLike( $constantPrefix, $dbr->anyString() ),
			$indexColumn . $dbr->buildLike( $likePattern ),
		];
	}

	// The constant prefix fills the 60-byte column, so a constant comparison
	// can be used.
	return [
		$index60Column => substr( $constantPrefix, 0, 60 ),
		$indexColumn . $dbr->buildLike( $likePattern ),
	];
}
297
/**
 * Make an array to be used for calls to Database::buildLike(), which will
 * match the specified string.
 *
 * An asterisk is only accepted as a wildcard at the start of the domain (or
 * as the whole local part of a mailto: address); any other asterisk makes the
 * entry invalid. The returned array always ends with a wildcard.
 *
 * @param string $filterEntry Filter entry: host pattern optionally followed by
 *  port, path, query and fragment
 * @param string $protocol Protocol prefix prepended to the entry before parsing
 * @return array|false Array of strings and LikeMatch objects, or false if the
 *  entry cannot be parsed or contains a stray asterisk
 */
public static function makeLikeArray( $filterEntry, $protocol = 'http://' ) {
	$dbr = wfGetDB( DB_REPLICA );

	$parsed = wfParseUrl( $protocol . $filterEntry );
	if ( !$parsed ) {
		return false;
	}

	$wildcardAfterHost = false;
	if ( $parsed['scheme'] === 'mailto' && strpos( $parsed['host'], '@' ) ) {
		// Email address with domain and non-empty local part
		[ $localPart, $domain ] = explode( '@', $parsed['host'], 2 );
		$hostIndex = self::indexifyHost( $domain ) . '@';
		if ( $localPart === '*' ) {
			// Any local part at this domain.
			$wildcardAfterHost = true;
		} else {
			$hostIndex .= $localPart;
		}
	} else {
		// Non-email, or email with only a domain part.
		$hostIndex = self::indexifyHost( $parsed['host'] );
		if ( substr( $hostIndex, -3 ) === '.*.' ) {
			// Drop the trailing "*." and match it with a LIKE wildcard instead.
			$wildcardAfterHost = true;
			$hostIndex = substr( $hostIndex, 0, -2 );
		}
	}

	$pattern = [ $parsed['scheme'] . $parsed['delimiter'] . $hostIndex ];
	if ( $wildcardAfterHost ) {
		$pattern[] = $dbr->anyString();
	}

	if ( isset( $parsed['port'] ) ) {
		$pattern[] = ':' . $parsed['port'];
	}

	if ( isset( $parsed['path'] ) ) {
		$pattern[] = $parsed['path'];
	} elseif ( !$wildcardAfterHost ) {
		$pattern[] = '/';
	}

	if ( isset( $parsed['query'] ) ) {
		$pattern[] = '?' . $parsed['query'];
	}

	if ( isset( $parsed['fragment'] ) ) {
		$pattern[] = '#' . $parsed['fragment'];
	}

	// Check for stray asterisks: asterisk only allowed at the start of the domain
	foreach ( $pattern as $piece ) {
		if ( $piece instanceof LikeMatch ) {
			continue;
		}
		if ( strpos( $piece, '*' ) !== false ) {
			return false;
		}
	}

	// Add wildcard at the end if there isn't one already
	$tail = $pattern[count( $pattern ) - 1];
	if ( !( $tail instanceof LikeMatch ) ) {
		$pattern[] = $dbr->anyString();
	}

	return $pattern;
}
375
/**
 * Filters an array returned by makeLikeArray(), removing everything past
 * first pattern placeholder.
 *
 * @param array|mixed $arr Array of strings and LikeMatch objects; any
 *  non-array value is handed back untouched
 * @return array|mixed The input cut off right after its first LikeMatch
 *  element, or the unchanged input if it has none
 */
public static function keepOneWildcard( $arr ) {
	if ( is_array( $arr ) ) {
		foreach ( $arr as $position => $element ) {
			if ( $element instanceof LikeMatch ) {
				// Keep everything up to and including this wildcard.
				return array_slice( $arr, 0, $position + 1 );
			}
		}
	}

	return $arr;
}
397}
wfParseUrl( $url)
parse_url() work-alike, but non-broken.
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
Some functions to help implement an external link filter for spam control.
static makeIndexes( $url)
Converts a URL into a format for el_index.
static makeLikeArray( $filterEntry, $protocol='http://')
Make an array to be used for calls to Database::buildLike(), which will match the specified string.
static getQueryConditions( $filterEntry, array $options=[])
Return query conditions which will match the specified string.
static supportsIDN()
Indicate whether LinkFilter IDN support is available.
const VERSION
Increment this when makeIndexes output changes.
static matchEntry(Content $content, $filterEntry, $protocol='http://')
Check whether $content contains a link to $filterEntry.
static keepOneWildcard( $arr)
Filters an array returned by makeLikeArray(), removing everything past first pattern placeholder.
static isUtf8( $value)
Test whether a string is valid UTF-8.
Content object implementation for representing flat text.
Used by Database::buildLike() to represent characters that have special meaning in SQL LIKE clauses and thus need no escaping.
Definition LikeMatch.php:10
Base interface for content objects.
Definition Content.php:35
const DB_REPLICA
Definition defines.php:26
$content
Definition router.php:76