Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
4.26% |
6 / 141 |
|
5.00% |
1 / 20 |
CRAP | |
0.00% |
0 / 1 |
BaseBlacklist | |
4.26% |
6 / 141 |
|
5.00% |
1 / 20 |
2244.24 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
filter | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
addBlacklistType | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getBlacklistTypes | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getSpamBlacklist | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getEmailBlacklist | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getInstance | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
20 | |||
clearInstanceCache | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getBlacklistType | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
isLocalSource | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
132 | |||
getTypeFromTitle | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
getBlacklists | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
getLocalBlacklists | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
getWhitelists | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
getSharedBlacklists | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
20 | |||
clearCache | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
buildSharedBlacklists | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
30 | |||
getHttpText | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
30 | |||
getArticleText | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
12 | |||
getRegexStart | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getRegexEnd | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
warmCachesForFilter | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\SpamBlacklist; |
4 | |
5 | use InvalidArgumentException; |
6 | use MediaWiki\Content\TextContent; |
7 | use MediaWiki\MediaWikiServices; |
8 | use MediaWiki\Revision\SlotRecord; |
9 | use MediaWiki\Title\Title; |
10 | use MediaWiki\User\User; |
11 | |
/**
 * Base class for different kinds of blacklists.
 *
 * Keeps a static registry of blacklist types (type code => class name), hands
 * out one shared instance per type, and implements loading and caching of the
 * regex lists that concrete subclasses apply in filter().
 */
abstract class BaseBlacklist {
	/**
	 * Pattern of a "DB: <wiki> <page title>" entry in the list of sources.
	 * Shared by isLocalSource() and buildSharedBlacklists() so that every
	 * source that gets loaded is also recognised as a source.
	 */
	private const DB_SOURCE_REGEX = '/^DB: ([\w-]*) (.*)$/';

	/**
	 * Array of blacklist sources
	 *
	 * Each entry is either "DB: <wiki> <page title>", an http(s) or
	 * protocol-relative URL, or a path readable with file_get_contents().
	 *
	 * @var string[]
	 */
	public $files = [];

	/**
	 * Array containing regexes to test against, or false while not loaded yet
	 *
	 * @var string[]|false
	 */
	protected $regexes = false;

	/**
	 * Controls how often a request refreshes a cached remote list once the
	 * warning key has expired: the refresh happens when
	 * mt_rand( 0, $warningChance ) yields 0.
	 *
	 * @var int
	 */
	public $warningChance = 100;

	/**
	 * Expiry passed to the cache for the warning key of a remote list
	 *
	 * @var int
	 */
	public $warningTime = 600;

	/**
	 * Expiry passed to the cache for fetched lists and compiled regexes
	 *
	 * @var int
	 */
	public $expiryTime = 900;

	/**
	 * Array containing blacklists that extend BaseBlacklist
	 * (type code => class name)
	 *
	 * @var string[]
	 */
	private static $blacklistTypes = [
		'spam' => SpamBlacklist::class,
		'email' => EmailBlacklist::class,
	];

	/**
	 * Array of blacklist instances, keyed by type code
	 *
	 * @var self[]
	 */
	private static $instances = [];

	/**
	 * Copies every entry of $settings onto the property of the same name.
	 *
	 * @param array $settings Property name => value
	 */
	public function __construct( $settings = [] ) {
		foreach ( $settings as $name => $value ) {
			$this->$name = $value;
		}
	}

	/**
	 * @param array $links
	 * @param ?Title $title
	 * @param User $user
	 * @param bool $preventLog
	 * @return mixed
	 */
	abstract public function filter(
		array $links,
		?Title $title,
		User $user,
		$preventLog = false
	);

	/**
	 * Adds a blacklist class to the registry, replacing any class that was
	 * registered for the same type before.
	 *
	 * @param string $type
	 * @param string $class
	 */
	public static function addBlacklistType( $type, $class ) {
		self::$blacklistTypes[$type] = $class;
	}

	/**
	 * Return the array of blacklist types currently defined
	 *
	 * @return string[] Type code => class name
	 */
	public static function getBlacklistTypes() {
		return self::$blacklistTypes;
	}

	/**
	 * @return SpamBlacklist
	 */
	public static function getSpamBlacklist() {
		// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
		return self::getInstance( 'spam' );
	}

	/**
	 * @return EmailBlacklist
	 */
	public static function getEmailBlacklist() {
		// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
		return self::getInstance( 'email' );
	}

	/**
	 * Returns an instance of the given blacklist
	 *
	 * The instance is created on first use from $wgBlacklistSettings[$type]
	 * and then reused until clearInstanceCache() is called.
	 *
	 * @deprecated Use getSpamBlacklist() or getEmailBlacklist() instead
	 * @param string $type Code for the blacklist
	 * @return BaseBlacklist
	 * @throws InvalidArgumentException If $type is not a registered type
	 */
	public static function getInstance( $type ) {
		if ( !isset( self::$blacklistTypes[$type] ) ) {
			throw new InvalidArgumentException( "Invalid blacklist type '$type' passed to " . __METHOD__ );
		}

		if ( !isset( self::$instances[$type] ) ) {
			global $wgBlacklistSettings;

			// Prevent notices
			if ( !isset( $wgBlacklistSettings[$type] ) ) {
				$wgBlacklistSettings[$type] = [];
			}

			$class = self::$blacklistTypes[$type];
			self::$instances[$type] = new $class( $wgBlacklistSettings[$type] );
		}

		return self::$instances[$type];
	}

	/**
	 * Clear instance cache. For use during testing.
	 */
	public static function clearInstanceCache() {
		self::$instances = [];
	}

	/**
	 * Returns the code for the blacklist implementation
	 *
	 * @return string
	 */
	abstract protected function getBlacklistType();

	/**
	 * Check if the given local page title is a spam regex source.
	 *
	 * A title counts as a source when it is one of the
	 * "<Type>-blacklist" / "<Type>-whitelist" pages in the MediaWiki
	 * namespace, or when one of the configured 'files' entries of any
	 * registered type refers to it, either as a "DB:" entry for this wiki
	 * or as the URL of its raw view.
	 *
	 * @param Title $title
	 * @return bool
	 */
	public static function isLocalSource( Title $title ) {
		global $wgDBname, $wgBlacklistSettings;

		if ( $title->inNamespace( NS_MEDIAWIKI ) ) {
			$sources = [];
			foreach ( self::$blacklistTypes as $type => $class ) {
				// For the built in types, this results in the use of:
				// Spam-blacklist, Spam-whitelist
				// Email-blacklist, Email-whitelist
				$type = ucfirst( $type );
				$sources[] = "$type-blacklist";
				$sources[] = "$type-whitelist";
			}

			if ( in_array( $title->getDBkey(), $sources, true ) ) {
				return true;
			}
		}

		// NOTE(review): wfExpandUrl() is believed to be deprecated in recent
		// MediaWiki releases; confirm against the supported core version.
		$thisHttp = wfExpandUrl( $title->getFullUrl( 'action=raw' ), PROTO_HTTP );
		$thisHttpRegex = '/^' . preg_quote( $thisHttp, '/' ) . '(?:&.*)?$/';

		// Collect the sources of every type. The lists must be appended:
		// the array union operator keeps the first value for each key, so
		// it would drop the entries of later types whose index is already
		// taken by an earlier type.
		$files = [];
		foreach ( self::$blacklistTypes as $type => $class ) {
			if ( isset( $wgBlacklistSettings[$type]['files'] ) ) {
				$files = array_merge( $files, array_values( $wgBlacklistSettings[$type]['files'] ) );
			}
		}

		foreach ( $files as $fileName ) {
			$matches = [];
			if ( preg_match( self::DB_SOURCE_REGEX, $fileName, $matches ) ) {
				if ( $wgDBname === $matches[1] && $matches[2] === $title->getPrefixedDBkey() ) {
					// Local DB fetch of this page...
					return true;
				}
			} elseif ( preg_match( $thisHttpRegex, $fileName ) ) {
				// Raw view of this page
				return true;
			}
		}

		return false;
	}

	/**
	 * Returns the type of blacklist from the given title
	 *
	 * The DB key is searched for "<Type>-blacklist" or "<Type>-whitelist",
	 * where <Type> is a registered type code with its first letter
	 * upper-cased by the content language. The matched type is returned in
	 * lower case.
	 *
	 * @todo building a regex for this is pretty overkill
	 * @param Title $title
	 * @return bool|string Type code, or false if the title matches no type
	 */
	public static function getTypeFromTitle( Title $title ) {
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();

		$alternatives = [];
		foreach ( array_keys( self::$blacklistTypes ) as $type ) {
			// Type codes can be registered by other code, so make sure they
			// are matched literally.
			$alternatives[] = preg_quote( $contLang->ucfirst( $type ), '/' );
		}
		$regex = '/(' . implode( '|', $alternatives ) . ')-(?:blacklist|whitelist)/';

		if ( preg_match( $regex, $title->getDBkey(), $m ) ) {
			return strtolower( $m[1] );
		}

		return false;
	}

	/**
	 * Fetch local and (possibly cached) remote blacklists.
	 * Will be cached locally across multiple invocations.
	 * @return string[] set of regular expressions, potentially empty.
	 */
	public function getBlacklists() {
		if ( $this->regexes === false ) {
			$this->regexes = array_merge(
				$this->getLocalBlacklists(),
				$this->getSharedBlacklists()
			);
		}
		return $this->regexes;
	}

	/**
	 * Returns the local blacklist
	 *
	 * @return string[] Regular expressions
	 */
	public function getLocalBlacklists() {
		return $this->getRegexesFromMessage( 'blacklist' );
	}

	/**
	 * Returns the (local) whitelist
	 *
	 * @return string[] Regular expressions
	 */
	public function getWhitelists() {
		return $this->getRegexesFromMessage( 'whitelist' );
	}

	/**
	 * Returns the regexes built from the "<type>-<listName>" message, cached
	 * in the main WAN object cache under the "<listName>-regex" key of this
	 * blacklist type.
	 *
	 * @param string $listName Either 'blacklist' or 'whitelist'
	 * @return string[] Regular expressions
	 */
	private function getRegexesFromMessage( $listName ) {
		$type = $this->getBlacklistType();
		$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();

		return $cache->getWithSetCallback(
			$cache->makeKey( 'spamblacklist', $type, "{$listName}-regex" ),
			$this->expiryTime,
			function () use ( $type, $listName ) {
				return SpamRegexBatch::regexesFromMessage( "{$type}-{$listName}", $this );
			}
		);
	}

	/**
	 * Fetch (possibly cached) remote blacklists.
	 *
	 * Returns an empty array when no sources are configured and while
	 * MW_PHPUNIT_TEST is defined.
	 *
	 * @return array
	 */
	private function getSharedBlacklists() {
		$listType = $this->getBlacklistType();

		wfDebugLog( 'SpamBlacklist', "Loading $listType regex..." );

		if ( !$this->files ) {
			# No lists
			wfDebugLog( 'SpamBlacklist', "no files specified\n" );
			return [];
		}

		if ( defined( 'MW_PHPUNIT_TEST' ) ) {
			wfDebugLog( 'SpamBlacklist', 'remote loading disabled during PHPUnit test' );
			return [];
		}

		// Set by the callback, which only runs when the cache has no value.
		$miss = false;
		$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
		$regexes = $cache->getWithSetCallback(
			// This used to be cached per-site, but that could be bad on a shared
			// server where not all wikis have the same configuration.
			$cache->makeKey( 'spamblacklist', $listType, 'shared-blacklist-regex' ),
			$this->expiryTime,
			function () use ( &$miss ) {
				$miss = true;
				return $this->buildSharedBlacklists();
			}
		);

		if ( !$miss ) {
			wfDebugLog( 'SpamBlacklist', "Got shared spam regexes from cache\n" );
		}

		return $regexes;
	}

	/**
	 * Clear all primary blacklist cache keys
	 */
	public function clearCache() {
		$listType = $this->getBlacklistType();

		$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
		foreach ( [ 'shared-blacklist-regex', 'blacklist-regex', 'whitelist-regex' ] as $component ) {
			$cache->delete( $cache->makeKey( 'spamblacklist', $listType, $component ) );
		}

		wfDebugLog( 'SpamBlacklist', "$listType blacklist local cache cleared.\n" );
	}

	/**
	 * Load every configured source and compile its text into regexes.
	 *
	 * Sources whose text could not be loaded, or is empty, are skipped.
	 *
	 * @return array Regular expressions of all sources, in source order
	 */
	private function buildSharedBlacklists() {
		$regexes = [];
		$listType = $this->getBlacklistType();
		# Load lists
		wfDebugLog( 'SpamBlacklist', "Constructing $listType blacklist\n" );
		foreach ( $this->files as $fileName ) {
			$matches = [];
			if ( preg_match( self::DB_SOURCE_REGEX, $fileName, $matches ) ) {
				$text = $this->getArticleText( $matches[1], $matches[2] );
			} elseif ( preg_match( '/^(https?:)?\/\//', $fileName ) ) {
				$text = $this->getHttpText( $fileName );
			} else {
				$text = file_get_contents( $fileName );
				if ( $text === false ) {
					wfDebugLog( 'SpamBlacklist', "Error loading $listType blacklist from file $fileName\n" );
				} else {
					wfDebugLog( 'SpamBlacklist', "got from file $fileName\n" );
				}
			}

			if ( $text ) {
				// Build a separate batch of regexes from each source.
				// While in theory we could squeeze a little efficiency
				// out of combining multiple sources in one regex, if
				// there's a bad line in one of them we'll gain more
				// from only having to break that set into smaller pieces.
				$regexes = array_merge(
					$regexes,
					SpamRegexBatch::regexesFromText( $text, $this, $fileName )
				);
			}
		}

		return $regexes;
	}

	/**
	 * Fetch a remote list over HTTP, going through a cache.
	 *
	 * @param string $fileName URL of the list
	 * @return string|null|false Text of the list; a non-string value means
	 *  that the list could not be loaded and no cached copy was available
	 */
	private function getHttpText( $fileName ) {
		global $wgMessageCacheType;
		// FIXME: This is a hack to use Memcached where possible (incl. WMF),
		// but have CACHE_DB as fallback (instead of no cache).
		// This might be a good candidate for T248005.
		$services = MediaWikiServices::getInstance()->getObjectCacheFactory();
		$cache = $services->getInstance( $wgMessageCacheType );

		$listType = $this->getBlacklistType();
		// There are two keys, when the warning key expires, a random thread will refresh
		// the real key. This reduces the chance of multiple requests under high traffic
		// conditions.
		$key = $cache->makeGlobalKey( "blacklist_file_{$listType}", $fileName );
		$warningKey = $cache->makeKey( "filewarning_{$listType}", $fileName );
		$cachedText = $cache->get( $key );
		$warning = $cache->get( $warningKey );

		if ( is_string( $cachedText ) && ( $warning || mt_rand( 0, $this->warningChance ) ) ) {
			wfDebugLog( 'SpamBlacklist', "Got $listType blacklist from HTTP cache for $fileName\n" );
			return $cachedText;
		}

		wfDebugLog( 'SpamBlacklist', "Loading $listType blacklist from $fileName\n" );
		$httpText = MediaWikiServices::getInstance()->getHttpRequestFactory()
			->get( $fileName, [], __METHOD__ );
		// Set the warning key whether or not the fetch worked, so that a
		// failing remote is not retried by every request.
		$cache->set( $warningKey, 1, $this->warningTime );

		// The request factory signals failure with a non-string value (null).
		if ( !is_string( $httpText ) ) {
			wfDebugLog( 'SpamBlacklist', "Error loading $listType blacklist from $fileName\n" );
			if ( is_string( $cachedText ) ) {
				// Keep the copy that is still cached instead of replacing it
				// with the failure, which would switch this list off.
				return $cachedText;
			}
			return $httpText;
		}

		$cache->set( $key, $httpText, $this->expiryTime );
		return $httpText;
	}

	/**
	 * Fetch an article from this or another local MediaWiki database.
	 *
	 * @param string $wiki
	 * @param string $pagename
	 * @return bool|string|null Text of the main slot, or false if the page
	 *  has no revision or its content is not text
	 */
	private function getArticleText( $wiki, $pagename ) {
		wfDebugLog( 'SpamBlacklist',
			"Fetching {$this->getBlacklistType()} blacklist from '$pagename' on '$wiki'...\n" );

		$services = MediaWikiServices::getInstance();

		// XXX: We do not know about custom namespaces on the target wiki here!
		$title = $services->getTitleParser()->parseTitle( $pagename );
		$store = $services->getRevisionStoreFactory()->getRevisionStore( $wiki );
		$rev = $store->getRevisionByTitle( $title );

		$content = $rev ? $rev->getContent( SlotRecord::MAIN ) : null;

		if ( !( $content instanceof TextContent ) ) {
			return false;
		}

		return $content->getText();
	}

	/**
	 * Returns the start of the regex for matches
	 *
	 * @return string
	 */
	public function getRegexStart() {
		return '/[a-z0-9_\-.]*';
	}

	/**
	 * Returns the end of the regex for matches
	 *
	 * @param int $batchSize The S (study) modifier is only added when this
	 *  is greater than zero
	 * @return string
	 */
	public function getRegexEnd( $batchSize ) {
		return ( $batchSize > 0 ) ? '/Sim' : '/im';
	}

	/**
	 * Does nothing in the base class.
	 *
	 * @param Title $title
	 * @param string[] $entries
	 * @param User $user
	 */
	public function warmCachesForFilter( Title $title, array $entries, User $user ) {
		// subclass this
	}
}