Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
4.29% |
6 / 140 |
|
5.00% |
1 / 20 |
CRAP | |
0.00% |
0 / 1 |
BaseBlacklist | |
4.29% |
6 / 140 |
|
5.00% |
1 / 20 |
2242.15 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
filter | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
addBlacklistType | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getBlacklistTypes | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getSpamBlacklist | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getEmailBlacklist | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getInstance | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
20 | |||
clearInstanceCache | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getBlacklistType | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
isLocalSource | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
132 | |||
getTypeFromTitle | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
getBlacklists | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
getLocalBlacklists | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
getWhitelists | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
getSharedBlacklists | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
20 | |||
clearCache | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
buildSharedBlacklists | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
30 | |||
getHttpText | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
30 | |||
getArticleText | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
12 | |||
getRegexStart | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getRegexEnd | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
warmCachesForFilter | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\SpamBlacklist; |
4 | |
5 | use InvalidArgumentException; |
6 | use MediaWiki\MediaWikiServices; |
7 | use MediaWiki\Revision\SlotRecord; |
8 | use MediaWiki\Title\Title; |
9 | use MediaWiki\User\User; |
10 | use ObjectCache; |
11 | use TextContent; |
12 | |
13 | /** |
14 | * Base class for different kinds of blacklists |
15 | */ |
16 | abstract class BaseBlacklist { |
/**
 * Array of blacklist sources
 *
 * Each entry is interpreted by buildSharedBlacklists(): a string of the
 * form "DB: <wiki> <page>" is read from a wiki database, an http(s) or
 * protocol-relative URL is fetched over HTTP, and anything else is read
 * as a local file path.
 *
 * @var string[]
 */
public $files = [];

/**
 * Array containing regexes to test against
 *
 * Stays false until getBlacklists() has populated it.
 *
 * @var string[]|false
 */
protected $regexes = false;

/**
 * Chance of receiving a warning when the filter is hit
 *
 * Used as the upper bound of mt_rand( 0, $warningChance ) in getHttpText():
 * once the warning key is gone, a request only refreshes the remote file
 * when that draw comes out as 0.
 *
 * @var int
 */
public $warningChance = 100;

/**
 * Lifetime passed to the cache when getHttpText() sets the warning key
 * (presumably seconds -- confirm against the cache API).
 *
 * @var int
 */
public $warningTime = 600;

/**
 * Lifetime passed to the cache for the compiled regex lists and for the
 * fetched remote file text (presumably seconds -- confirm against the cache API).
 *
 * @var int
 */
public $expiryTime = 900;

/**
 * Array containing blacklists that extend BaseBlacklist
 *
 * Maps the type code (e.g. 'spam') to the class name that getInstance()
 * instantiates for it.
 *
 * @var string[]
 */
private static $blacklistTypes = [
	'spam' => SpamBlacklist::class,
	'email' => EmailBlacklist::class,
];

/**
 * Array of blacklist instances
 *
 * Keyed by type code; filled lazily by getInstance() and emptied by
 * clearInstanceCache().
 *
 * @var self[]
 */
private static $instances = [];
64 | |
/**
 * Copy every entry of the settings array onto the instance, using the
 * array key as the property name.
 *
 * @param array $settings Map of property name => value
 */
public function __construct( $settings = [] ) {
	foreach ( $settings as $property => $setting ) {
		$this->{$property} = $setting;
	}
}
73 | |
/**
 * Run this blacklist against a set of entries. Implemented by each
 * concrete blacklist type; the base class only fixes the signature.
 *
 * @param array $links Entries to check (NOTE(review): exact contents depend on the subclass)
 * @param ?Title $title
 * @param User $user
 * @param bool $preventLog
 * @return mixed
 */
abstract public function filter(
	array $links,
	?Title $title,
	User $user,
	$preventLog = false
);
87 | |
/**
 * Adds a blacklist class to the registry
 *
 * An existing entry with the same type code is overwritten.
 *
 * @param string $type Type code used as the registry key
 * @param string $class Class name that getInstance() will instantiate for this type
 */
public static function addBlacklistType( $type, $class ) {
	self::$blacklistTypes[$type] = $class;
}
97 | |
/**
 * Get the registry of blacklist types that are currently defined.
 *
 * @return string[] Map of type code => class name
 */
public static function getBlacklistTypes() {
	$registered = self::$blacklistTypes;

	return $registered;
}
106 | |
/**
 * Convenience accessor for the shared instance registered under 'spam'.
 *
 * @return SpamBlacklist
 */
public static function getSpamBlacklist() {
	$blacklist = self::getInstance( 'spam' );

	// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
	return $blacklist;
}
114 | |
/**
 * Convenience accessor for the shared instance registered under 'email'.
 *
 * @return EmailBlacklist
 */
public static function getEmailBlacklist() {
	$blacklist = self::getInstance( 'email' );

	// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
	return $blacklist;
}
122 | |
/**
 * Get the shared instance for a blacklist type, creating it on first use.
 *
 * The instance is built from the class registered for the type and is
 * handed $wgBlacklistSettings[$type] as its settings. If that global entry
 * is missing it is initialised to an empty array first.
 *
 * @deprecated Use getSpamBlacklist() or getEmailBlacklist() instead
 * @param string $type Code for the blacklist
 * @return BaseBlacklist
 * @throws InvalidArgumentException If $type is not a registered blacklist type
 */
public static function getInstance( $type ) {
	$isRegistered = isset( self::$blacklistTypes[$type] );
	if ( !$isRegistered ) {
		throw new InvalidArgumentException( "Invalid blacklist type '$type' passed to " . __METHOD__ );
	}

	// Already built during this request: reuse it.
	if ( isset( self::$instances[$type] ) ) {
		return self::$instances[$type];
	}

	global $wgBlacklistSettings;

	// Make sure the settings entry exists so reading it raises no notice
	if ( !isset( $wgBlacklistSettings[$type] ) ) {
		$wgBlacklistSettings[$type] = [];
	}

	$implementation = self::$blacklistTypes[$type];
	$created = new $implementation( $wgBlacklistSettings[$type] );
	self::$instances[$type] = $created;

	return $created;
}
149 | |
/**
 * Clear instance cache. For use during testing.
 *
 * After this call, getInstance() constructs fresh objects on next access.
 */
public static function clearInstanceCache() {
	self::$instances = [];
}
156 | |
/**
 * Returns the code for the blacklist implementation
 *
 * Within this class the code is used as part of the cache keys, to build
 * the "<type>-blacklist" / "<type>-whitelist" message names, and in debug
 * log output.
 *
 * @return string
 */
abstract protected function getBlacklistType();
163 | |
/**
 * Check if the given local page title is a spam regex source.
 *
 * A title counts as a source when any of these holds:
 *  - it is one of the "<Type>-blacklist" / "<Type>-whitelist" pages in the
 *    MediaWiki namespace, for any registered blacklist type;
 *  - a configured file entry of the form "DB: <wiki> <page>" names this
 *    wiki's database and this page;
 *  - a configured file entry is the action=raw URL of this page.
 *
 * @param Title $title
 * @return bool
 */
public static function isLocalSource( Title $title ) {
	global $wgDBname, $wgBlacklistSettings;

	if ( $title->inNamespace( NS_MEDIAWIKI ) ) {
		$sources = [];
		foreach ( self::$blacklistTypes as $type => $class ) {
			// For the built in types, this results in the use of:
			// Spam-blacklist, Spam-whitelist
			// Email-blacklist, Email-whitelist
			$type = ucfirst( $type );
			$sources[] = "$type-blacklist";
			$sources[] = "$type-whitelist";
		}

		if ( in_array( $title->getDBkey(), $sources, true ) ) {
			return true;
		}
	}

	$thisHttp = wfExpandUrl( $title->getFullUrl( 'action=raw' ), PROTO_HTTP );
	$thisHttpRegex = '/^' . preg_quote( $thisHttp, '/' ) . '(?:&.*)?$/';

	$files = [];
	foreach ( self::$blacklistTypes as $type => $class ) {
		if ( isset( $wgBlacklistSettings[$type]['files'] ) ) {
			// The file lists are numerically indexed, so the array union
			// operator would silently drop every entry whose index is already
			// taken by an earlier type. Append instead.
			$files = array_merge( $files, $wgBlacklistSettings[$type]['files'] );
		}
	}

	foreach ( $files as $fileName ) {
		$matches = [];
		// Accept the same wiki ID characters as buildSharedBlacklists(), so that
		// every "DB:" source that gets loaded is also recognised here.
		if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
			if ( $wgDBname === $matches[1] && $matches[2] === $title->getPrefixedDbKey() ) {
				// Local DB fetch of this page...
				return true;
			}
		} elseif ( preg_match( $thisHttpRegex, $fileName ) ) {
			// Raw view of this page
			return true;
		}
	}

	return false;
}
214 | |
/**
 * Returns the type of blacklist from the given title
 *
 * The title's DB key is searched for "<Type>-blacklist" or
 * "<Type>-whitelist", where <Type> is any registered type code with its
 * first letter upper-cased by the content language. The matched type is
 * returned lower-cased.
 *
 * @todo building a regex for this is pretty overkill
 * @param Title $title
 * @return bool|string Lower-cased type code, or false if the title matches no type
 */
public static function getTypeFromTitle( Title $title ) {
	$contLang = MediaWikiServices::getInstance()->getContentLanguage();

	$alternatives = [];
	foreach ( array_keys( self::$blacklistTypes ) as $type ) {
		// Types can be registered by other code through addBlacklistType(), so
		// escape them: an unquoted metacharacter or "/" would otherwise change
		// or break the pattern.
		$alternatives[] = preg_quote( $contLang->ucfirst( $type ), '/' );
	}
	$regex = '/(' . implode( '|', $alternatives ) . ')-(?:blacklist|whitelist)/';

	if ( preg_match( $regex, $title->getDBkey(), $m ) ) {
		return strtolower( $m[1] );
	}

	return false;
}
234 | |
/**
 * Get the combined local and shared blacklist regexes.
 *
 * The merged list is computed on the first call and kept on the instance,
 * so later calls on the same object return it without recomputing.
 *
 * @return string[] set of regular expressions, potentially empty.
 */
public function getBlacklists() {
	if ( $this->regexes !== false ) {
		return $this->regexes;
	}

	$local = $this->getLocalBlacklists();
	$shared = $this->getSharedBlacklists();
	$this->regexes = array_merge( $local, $shared );

	return $this->regexes;
}
249 | |
/**
 * Get the regexes built from the local "<type>-blacklist" message.
 *
 * The result is read through the main WAN object cache with a lifetime of
 * $expiryTime; the message is only converted when the cache asks for a
 * fresh value.
 *
 * @return string[] Regular expressions
 */
public function getLocalBlacklists() {
	$listType = $this->getBlacklistType();
	$wanCache = MediaWikiServices::getInstance()->getMainWANObjectCache();
	$cacheKey = $wanCache->makeKey( 'spamblacklist', $listType, 'blacklist-regex' );
	$messageName = "{$listType}-blacklist";

	$loadFromMessage = function () use ( $messageName ) {
		return SpamRegexBatch::regexesFromMessage( $messageName, $this );
	};

	return $wanCache->getWithSetCallback( $cacheKey, $this->expiryTime, $loadFromMessage );
}
267 | |
/**
 * Get the regexes built from the local "<type>-whitelist" message.
 *
 * The result is read through the main WAN object cache with a lifetime of
 * $expiryTime; the message is only converted when the cache asks for a
 * fresh value.
 *
 * @return string[] Regular expressions
 */
public function getWhitelists() {
	$listType = $this->getBlacklistType();
	$wanCache = MediaWikiServices::getInstance()->getMainWANObjectCache();
	$cacheKey = $wanCache->makeKey( 'spamblacklist', $listType, 'whitelist-regex' );
	$messageName = "{$listType}-whitelist";

	$loadFromMessage = function () use ( $messageName ) {
		return SpamRegexBatch::regexesFromMessage( $messageName, $this );
	};

	return $wanCache->getWithSetCallback( $cacheKey, $this->expiryTime, $loadFromMessage );
}
285 | |
/**
 * Get the regexes built from the configured shared sources ($files).
 *
 * Returns an empty list without touching the cache when no sources are
 * configured or when running under PHPUnit. Otherwise the list is read
 * through the main WAN object cache and rebuilt with
 * buildSharedBlacklists() when the cache asks for a fresh value.
 *
 * @return array
 */
private function getSharedBlacklists() {
	$type = $this->getBlacklistType();

	wfDebugLog( 'SpamBlacklist', "Loading $type regex..." );

	if ( !$this->files ) {
		// Nothing configured, so there is nothing to load
		wfDebugLog( 'SpamBlacklist', "no files specified\n" );
		return [];
	}

	if ( defined( 'MW_PHPUNIT_TEST' ) ) {
		wfDebugLog( 'SpamBlacklist', 'remote loading disabled during PHPUnit test' );
		return [];
	}

	$wanCache = MediaWikiServices::getInstance()->getMainWANObjectCache();
	// This used to be cached per-site, but that could be bad on a shared
	// server where not all wikis have the same configuration.
	$cacheKey = $wanCache->makeKey( 'spamblacklist', $type, 'shared-blacklist-regex' );

	// Flipped by the callback so we can tell a rebuild from a cache hit
	$rebuilt = false;
	$rebuild = function () use ( &$rebuilt ) {
		$rebuilt = true;
		return $this->buildSharedBlacklists();
	};

	$shared = $wanCache->getWithSetCallback( $cacheKey, $this->expiryTime, $rebuild );

	if ( !$rebuilt ) {
		wfDebugLog( 'SpamBlacklist', "Got shared spam regexes from cache\n" );
	}

	return $shared;
}
325 | |
/**
 * Delete this blacklist type's three regex cache entries (shared
 * blacklist, local blacklist, local whitelist) from the main WAN object
 * cache.
 */
public function clearCache() {
	$type = $this->getBlacklistType();
	$wanCache = MediaWikiServices::getInstance()->getMainWANObjectCache();

	$suffixes = [ 'shared-blacklist-regex', 'blacklist-regex', 'whitelist-regex' ];
	foreach ( $suffixes as $suffix ) {
		$wanCache->delete( $wanCache->makeKey( 'spamblacklist', $type, $suffix ) );
	}

	wfDebugLog( 'SpamBlacklist', "$type blacklist local cache cleared.\n" );
}
339 | |
/**
 * Load every configured source in $files and convert its text to regexes.
 *
 * A source is fetched according to its form:
 *  - "DB: <wiki> <page>": page text read via getArticleText();
 *  - an http(s) or protocol-relative URL: fetched via getHttpText();
 *  - anything else: read as a local file.
 * Sources that yield no text are skipped.
 *
 * @return array Regexes from all sources that yielded text, in source order
 */
private function buildSharedBlacklists() {
	$regexes = [];
	$listType = $this->getBlacklistType();
	# Load lists
	wfDebugLog( 'SpamBlacklist', "Constructing $listType blacklist\n" );
	foreach ( $this->files as $fileName ) {
		$matches = [];
		if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
			$text = $this->getArticleText( $matches[1], $matches[2] );
		} elseif ( preg_match( '/^(https?:)?\/\//', $fileName ) ) {
			$text = $this->getHttpText( $fileName );
		} else {
			$text = file_get_contents( $fileName );
			// Only claim success when the read actually worked; a failed read
			// is reported the same way getHttpText() reports a failed fetch.
			if ( $text === false ) {
				wfDebugLog( 'SpamBlacklist', "Error loading $listType blacklist from $fileName\n" );
			} else {
				wfDebugLog( 'SpamBlacklist', "got from file $fileName\n" );
			}
		}

		if ( $text ) {
			// Build a separate batch of regexes from each source.
			// While in theory we could squeeze a little efficiency
			// out of combining multiple sources in one regex, if
			// there's a bad line in one of them we'll gain more
			// from only having to break that set into smaller pieces.
			$regexes = array_merge(
				$regexes,
				SpamRegexBatch::regexesFromText( $text, $this, $fileName )
			);
		}
	}

	return $regexes;
}
371 | |
/**
 * Fetch the text of a remote blacklist over HTTP, using a cached copy
 * where possible.
 *
 * The text is refetched when no string is cached, or when the warning key
 * is absent and a random draw against $warningChance comes out as 0.
 * After a fetch attempt both the warning key and the text key are
 * rewritten, whether or not the fetch succeeded.
 *
 * @param string $fileName URL of the remote blacklist
 * @return string|null|false The blacklist text; a non-string value when the
 *  fetch failed
 */
private function getHttpText( $fileName ) {
	global $wgMessageCacheType;
	// FIXME: This is a hack to use Memcached where possible (incl. WMF),
	// but have CACHE_DB as fallback (instead of no cache).
	// This might be a good candidate for T248005.
	$cache = ObjectCache::getInstance( $wgMessageCacheType );

	$listType = $this->getBlacklistType();
	// There are two keys, when the warning key expires, a random thread will refresh
	// the real key. This reduces the chance of multiple requests under high traffic
	// conditions.
	$key = $cache->makeGlobalKey( "blacklist_file_{$listType}", $fileName );
	$warningKey = $cache->makeKey( "filewarning_{$listType}", $fileName );
	$httpText = $cache->get( $key );
	$warning = $cache->get( $warningKey );

	if ( !is_string( $httpText ) || ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) {
		wfDebugLog( 'SpamBlacklist', "Loading $listType blacklist from $fileName\n" );
		$httpText = MediaWikiServices::getInstance()->getHttpRequestFactory()
			->get( $fileName, [], __METHOD__ );
		// The request factory signals failure with null rather than false, so
		// treat any non-string result as an error to make sure it is logged.
		if ( !is_string( $httpText ) ) {
			wfDebugLog( 'SpamBlacklist', "Error loading $listType blacklist from $fileName\n" );
		}
		$cache->set( $warningKey, 1, $this->warningTime );
		$cache->set( $key, $httpText, $this->expiryTime );
	} else {
		wfDebugLog( 'SpamBlacklist', "Got $listType blacklist from HTTP cache for $fileName\n" );
	}
	return $httpText;
}
402 | |
/**
 * Read the main-slot text of a page from this or another local MediaWiki
 * database.
 *
 * @param string $wiki Wiki ID handed to the revision store factory
 * @param string $pagename Page name, parsed with this wiki's title parser
 * @return string|false The page text, or false when there is no revision
 *  for the title or its main slot content is not TextContent
 */
private function getArticleText( $wiki, $pagename ) {
	$listType = $this->getBlacklistType();
	wfDebugLog(
		'SpamBlacklist',
		"Fetching {$listType} blacklist from '$pagename' on '$wiki'...\n"
	);

	$services = MediaWikiServices::getInstance();

	// XXX: We do not know about custom namespaces on the target wiki here!
	$target = $services->getTitleParser()->parseTitle( $pagename );
	$revisionStore = $services->getRevisionStoreFactory()->getRevisionStore( $wiki );
	$revision = $revisionStore->getRevisionByTitle( $target );

	if ( !$revision ) {
		return false;
	}

	$mainContent = $revision->getContent( SlotRecord::MAIN );
	if ( $mainContent instanceof TextContent ) {
		return $mainContent->getText();
	}

	return false;
}
429 | |
/**
 * Get the opening part of the match regex: the "/" delimiter followed by
 * an optional run of letters, digits, underscore, hyphen and dot.
 *
 * @return string
 */
public function getRegexStart() {
	$opening = '/[a-z0-9_\-.]*';

	return $opening;
}
438 | |
/**
 * Get the closing part of the match regex: the "/" delimiter followed by
 * the pattern modifiers. The "S" modifier is only included for a positive
 * batch size.
 *
 * @param int $batchSize
 * @return string
 */
public function getRegexEnd( $batchSize ) {
	if ( $batchSize > 0 ) {
		return '/Sim';
	}

	return '/im';
}
448 | |
/**
 * Hook point for subclasses; the base implementation does nothing.
 *
 * @param Title $title
 * @param string[] $entries
 * @param User $user
 */
public function warmCachesForFilter( Title $title, array $entries, User $user ) {
	// subclass this
}
457 | } |