MediaWiki 1.40.4
LinkFilter.php
Go to the documentation of this file.
1<?php
24
25use Content;
28use StringUtils;
29use TextContent;
30use Wikimedia\IPUtils;
32
/**
 * Version of the index format produced by this class. The notes in
 * indexifyHost() and makeIndexes() ask for this to be incremented whenever
 * the output of those methods changes.
 */
48 public const VERSION = 1;
49
/**
 * Check whether the text of $content contains a URL matching the filter entry.
 *
 * Only TextContent is inspected; any other content type reports no match.
 *
 * @param Content $content Content to check
 * @param string $filterEntry Domain part of the filter, optionally starting with "*."
 * @param string $protocol Protocol prefix the link must start with
 * @return int|false Result of preg_match(): 1 on match, 0 on no match
 *  (or non-text content), false on regex failure
 */
public static function matchEntry( Content $content, $filterEntry, $protocol = 'http://' ) {
	if ( $content instanceof TextContent ) {
		$haystack = $content->getText();
		$pattern = self::makeRegex( $filterEntry, $protocol );

		return preg_match( $pattern, $haystack );
	}

	// TODO: handle other types of content too.
	// Maybe create ContentHandler::matchFilter( LinkFilter ).
	// Think about a common base class for LinkFilter and MagicWord.
	return 0;
}
71
/**
 * Build a "!"-delimited PCRE pattern matching $protocol followed by $filterEntry.
 *
 * A leading "*." in the filter entry becomes an optional group that matches
 * any number of subdomain labels. The pattern carries the "S" and "i" flags.
 *
 * @param string $filterEntry Domain part of the filter, optionally starting with "*."
 * @param string $protocol Protocol prefix, quoted literally into the pattern
 * @return string Regular expression including delimiters and flags
 */
private static function makeRegex( $filterEntry, $protocol ) {
	$matchSubdomains = str_starts_with( $filterEntry, '*.' );

	// Optional run of "label." prefixes; the empty alternative also allows the bare domain.
	$subdomainPattern = $matchSubdomains ? '(?:[A-Za-z0-9.-]+\.|)' : '';
	$literalPart = $matchSubdomains ? substr( $filterEntry, 2 ) : $filterEntry;

	return '!'
		. preg_quote( $protocol, '!' )
		. $subdomainPattern
		. preg_quote( $literalPart, '!' )
		. '!Si';
}
91
/**
 * Canonicalize a hostname and convert it to its index form.
 *
 * - IPv6 literals in RFC 3986 brackets become "V6.<groups joined by dots>.",
 *   with a trailing ":*" wildcard turned into a "*." suffix.
 * - IPv4 literals (optionally ending in a "*" wildcard octet) become
 *   "V4.<octets>." with leading zeros removed from each octet.
 * - Anything else is treated as a host name: its labels are reversed and a
 *   trailing "." is appended.
 *
 * @param string $host Host part of a URL
 * @return string Host in index form
 */
private static function indexifyHost( $host ) {
	// NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!

	// Canonicalize: undo percent-encoding, then convert IDN to UTF-8 where that succeeds.
	$host = rawurldecode( $host );
	if ( $host !== '' ) {
		$unicodeHost = idn_to_utf8( $host );
		if ( $unicodeHost !== false ) {
			$host = $unicodeHost;
		}
	}

	// Characters left untouched; everything else is percent-encoded below.
	$safeChars = 'a-zA-Z0-9\\-._~!$&\'()*+,;=';
	if ( StringUtils::isUtf8( $host ) ) {
		// Save a little space by not percent-encoding valid UTF-8 bytes
		$safeChars .= '\x80-\xf4';
	}
	$host = preg_replace_callback(
		'<[^' . $safeChars . ']>',
		static function ( $match ) {
			return rawurlencode( $match[0] );
		},
		strtolower( $host )
	);

	// IPv6? RFC 3986 syntax.
	// The brackets were percent-encoded above, so test against the decoded form.
	if ( preg_match( '/^\[([0-9a-f:*]+)\]$/', rawurldecode( $host ), $ipv6Match ) ) {
		$ip = $ipv6Match[1];
		if ( IPUtils::isValid( $ip ) ) {
			return 'V6.' . str_replace( ':', '.', IPUtils::sanitizeIP( $ip ) ) . '.';
		}
		if ( substr( $ip, -2 ) === ':*' ) {
			$prefix = substr( $ip, 0, -2 );
			if ( IPUtils::isValid( "{$prefix}::" ) ) {
				// Wildcard IP doesn't contain "::", so multiple parts can be wild
				$fixedGroups = substr_count( $ip, ':' );
				$groups = explode( ':', IPUtils::sanitizeIP( "{$prefix}::" ) );
				return 'V6.' . implode( '.', array_slice( $groups, 0, $fixedGroups ) ) . '.*.';
			}
			if ( IPUtils::isValid( "{$prefix}:1" ) ) {
				// Wildcard IP does contain "::", so only the last part is wild
				$dotted = str_replace( ':', '.', IPUtils::sanitizeIP( "{$prefix}:1" ) );
				// Drop the placeholder "1" and put the wildcard in its place.
				return 'V6.' . substr( $dotted, 0, -1 ) . '*.';
			}
		}
	}

	// Regularize explicit specification of the DNS root.
	// Browsers seem to do this for IPv4 literals too.
	if ( substr( $host, -1 ) === '.' ) {
		$host = substr( $host, 0, -1 );
	}

	// IPv4?
	$octet = '(?:0*25[0-5]|0*2[0-4][0-9]|0*1[0-9][0-9]|0*[0-9]?[0-9])';
	$ipv4Pattern = "/^(?:{$octet}\.){3}{$octet}$|^(?:{$octet}\.){1,3}\*$/";
	if ( preg_match( $ipv4Pattern, $host ) ) {
		$octets = [];
		foreach ( explode( '.', $host ) as $part ) {
			// The integer cast strips leading zeros; the wildcard is kept verbatim.
			$octets[] = $part === '*' ? $part : (int)$part;
		}
		return 'V4.' . implode( '.', $octets ) . '.';
	}

	// Must be a host name.
	$labels = array_reverse( explode( '.', $host ) );
	return implode( '.', $labels ) . '.';
}
162
/**
 * Convert a URL to the index form(s) used for lookups.
 *
 * The host is passed through indexifyHost(); for mailto: links it becomes
 * "domain.reversed@localpart". User and password are left out.
 *
 * @param string $url
 * @return string[][] List of [ scheme + host (+ port), path (+ query, fragment) ] pairs.
 *  Empty if the URL cannot be parsed; two pairs (http: and https:) if the
 *  parsed scheme is empty.
 */
public static function makeIndexes( $url ) {
	// NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!

	// NOTE: refreshExternallinksIndex.php assumes that only protocol-relative URLs return more
	// than one index, and that the indexes for protocol-relative URLs only vary in the "http://"
	// versus "https://" prefix. If you change that, you'll likely need to update
	// refreshExternallinksIndex.php accordingly.

	$bits = wfParseUrl( $url );
	if ( !$bits ) {
		return [];
	}

	// Reverse the labels in the hostname, convert to lower case, unless it's an IP.
	// For emails turn it into "domain.reversed@localpart"
	if ( $bits['scheme'] == 'mailto' ) {
		$mailparts = explode( '@', $bits['host'], 2 );
		// No @ means it's a local part with no domain
		$domainpart = count( $mailparts ) === 2 ? self::indexifyHost( $mailparts[1] ) : '';
		$host = $domainpart . '@' . $mailparts[0];
	} else {
		$host = self::indexifyHost( $bits['host'] );
	}

	// Reconstruct the pseudo-URL, leaving out user and password.
	$index = $bits['scheme'] . $bits['delimiter'] . $host;
	if ( isset( $bits['port'] ) ) {
		$index .= ':' . $bits['port'];
	}

	// Path, then query and fragment with their separators when present.
	$index2 = $bits['path'] ?? '/';
	foreach ( [ 'query' => '?', 'fragment' => '#' ] as $component => $separator ) {
		if ( isset( $bits[$component] ) ) {
			$index2 .= $separator . $bits[$component];
		}
	}

	if ( $bits['scheme'] == '' ) {
		// Protocol-relative: index under both http: and https:
		return [ [ "http:$index", $index2 ], [ "https:$index", $index2 ] ];
	}

	return [ [ $index, $index2 ] ];
}
219
/**
 * Build query conditions on el_index_60 / el_index for a filter entry.
 *
 * @param string $filterEntry Filter entry, passed to makeLikeArray()
 * @param array $options Recognized keys:
 *  - protocol: protocol prefix to query (default 'http://')
 *  - oneWildcard: if true, cut the LIKE pattern after its first wildcard (default false)
 *  - db: database object to build the LIKE with; a replica connection from
 *    wfGetDB() is used if this is empty
 * @return array|false Query conditions, or false if makeLikeArray() rejected the entry
 */
public static function getQueryConditions( $filterEntry, array $options = [] ) {
	$options += [
		'protocol' => 'http://',
		'oneWildcard' => false,
		'db' => null,
	];

	// First, get the like array
	$like = self::makeLikeArray( $filterEntry, $options['protocol'] );
	if ( $like === false ) {
		return false;
	}

	// Get the constant prefix (i.e. everything up to the first wildcard)
	$prefixParts = self::keepOneWildcard( $like );
	if ( $options['oneWildcard'] ) {
		$like = $prefixParts;
	}
	$lastKey = count( $prefixParts ) - 1;
	if ( $prefixParts[$lastKey] instanceof LikeMatch ) {
		array_pop( $prefixParts );
	}
	$index = implode( '', $prefixParts );
	$db = $options['db'] ?: wfGetDB( DB_REPLICA );

	// Build the query
	if ( strlen( $index ) < 60 ) {
		// The constant prefix is smaller than el_index_60, so we use a LIKE
		// for a prefix search.
		return [
			'el_index_60' . $db->buildLike( $index, $db->anyString() ),
			'el_index' . $db->buildLike( $like ),
		];
	}

	// The constant prefix is larger than el_index_60, so we can use a
	// constant comparison.
	return [
		'el_index_60' => substr( $index, 0, 60 ),
		'el_index' . $db->buildLike( $like ),
	];
}
290
/**
 * Expand a protocol (prefix) to the first entry of the UrlProtocols setting
 * that starts with it.
 *
 * @param string|null $protocol Protocol or protocol prefix
 * @return string|null The first configured protocol starting with $protocol;
 *  $protocol unchanged if no configured protocol starts with it; null if
 *  $protocol is empty or is itself exactly one of the configured entries.
 *  NOTE(review): returning null for an exact match is the existing behaviour
 *  and is kept as-is; confirm against callers before changing it.
 */
public static function getProtocolPrefix( $protocol ) {
	// Find the right prefix.
	// The constant is fully qualified so this method does not rely on a `use` line.
	$urlProtocols = MediaWikiServices::getInstance()->getMainConfig()
		->get( \MediaWiki\MainConfigNames::UrlProtocols );

	if ( !$protocol || in_array( $protocol, $urlProtocols, true ) ) {
		return null;
	}

	foreach ( $urlProtocols as $p ) {
		if ( str_starts_with( $p, $protocol ) ) {
			return $p;
		}
	}

	return $protocol;
}
308
/**
 * List the protocol names derived from the UrlProtocols setting.
 *
 * Each configured entry except "//" is cut off before its first ":". The
 * list always starts with an empty string.
 *
 * @return string[]
 */
public static function prepareProtocols() {
	// The constant is fully qualified so this method does not rely on a `use` line.
	$urlProtocols = MediaWikiServices::getInstance()->getMainConfig()
		->get( \MediaWiki\MainConfigNames::UrlProtocols );

	$protocols = [ '' ];
	foreach ( $urlProtocols as $p ) {
		if ( $p !== '//' ) {
			// Keep only the part in front of the colon, e.g. "http://" => "http"
			$protocols[] = substr( $p, 0, strpos( $p, ':' ) );
		}
	}

	return $protocols;
}
321
/**
 * Turn a filter entry into a list of parts suitable for Database::buildLike().
 *
 * A wildcard is accepted only where indexifyHost() leaves it as the last host
 * component (or as the whole local part of a mailto: address); an asterisk
 * anywhere else makes the entry invalid. The result always ends with a wildcard.
 *
 * @param string $filterEntry Filter entry: domain, optionally followed by port, path, etc.
 * @param string $protocol Protocol prefix prepended to the entry before parsing
 * @return array|false List of strings and LikeMatch objects, or false if the
 *  entry cannot be parsed or contains a stray asterisk
 */
public static function makeLikeArray( $filterEntry, $protocol = 'http://' ) {
	$db = wfGetDB( DB_REPLICA );

	$bits = wfParseUrl( $protocol . $filterEntry );
	if ( !$bits ) {
		return false;
	}

	$subdomains = false;
	if ( $bits['scheme'] === 'mailto' && strpos( $bits['host'], '@' ) ) {
		// Email address with domain and non-empty local part
		[ $localPart, $domain ] = explode( '@', $bits['host'], 2 );
		$host = self::indexifyHost( $domain ) . '@';
		if ( $localPart === '*' ) {
			$subdomains = true;
		} else {
			$host .= $localPart;
		}
	} else {
		// Non-email, or email with only a domain part.
		$host = self::indexifyHost( $bits['host'] );
		if ( substr( $host, -3 ) === '.*.' ) {
			$subdomains = true;
			// Strip the "*." so the wildcard is expressed as a LikeMatch instead
			$host = substr( $host, 0, -2 );
		}
	}

	$like = [ $bits['scheme'] . $bits['delimiter'] . $host ];
	if ( $subdomains ) {
		$like[] = $db->anyString();
	}

	if ( isset( $bits['port'] ) ) {
		$like[] = ':' . $bits['port'];
	}
	if ( isset( $bits['path'] ) ) {
		$like[] = $bits['path'];
	} elseif ( !$subdomains ) {
		$like[] = '/';
	}
	if ( isset( $bits['query'] ) ) {
		$like[] = '?' . $bits['query'];
	}
	if ( isset( $bits['fragment'] ) ) {
		$like[] = '#' . $bits['fragment'];
	}

	// Check for stray asterisks: asterisk only allowed at the start of the domain
	foreach ( $like as $likepart ) {
		if ( $likepart instanceof LikeMatch ) {
			continue;
		}
		if ( strpos( $likepart, '*' ) !== false ) {
			return false;
		}
	}

	// Add wildcard at the end if there isn't one already
	$lastPart = $like[count( $like ) - 1];
	if ( !( $lastPart instanceof LikeMatch ) ) {
		$like[] = $db->anyString();
	}

	return $like;
}
399
/**
 * Cut a LIKE array after its first wildcard.
 *
 * @param array|mixed $arr Array of strings and LikeMatch objects, as returned by makeLikeArray()
 * @return array|mixed The leading elements of $arr up to and including the
 *  first LikeMatch; $arr itself if it holds no LikeMatch or is not an array
 */
public static function keepOneWildcard( $arr ) {
	if ( is_array( $arr ) ) {
		foreach ( $arr as $position => $part ) {
			if ( $part instanceof LikeMatch ) {
				// Keep everything up to and including the first wildcard.
				return array_slice( $arr, 0, $position + 1 );
			}
		}
	}

	return $arr;
}
421}
422
// Also make the class available under the alias name 'LinkFilter'.
423class_alias( LinkFilter::class, 'LinkFilter' );
wfParseUrl( $url)
parse_url() work-alike, but non-broken.
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
A class containing constants representing the names of configuration variables.
const UrlProtocols
Name constant for the UrlProtocols setting, for use with Config::get()
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
A collection of static methods to play with strings.
Content object implementation for representing flat text.
Used by Database::buildLike() to represent characters that have special meaning in SQL LIKE clauses and thus need no escaping.
Definition LikeMatch.php:10
Base interface for representing page content.
Definition Content.php:37
const DB_REPLICA
Definition defines.php:26
$content
Definition router.php:76