Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
4.26% covered (danger)
4.26%
6 / 141
5.00% covered (danger)
5.00%
1 / 20
CRAP
0.00% covered (danger)
0.00%
0 / 1
BaseBlacklist
4.26% covered (danger)
4.26%
6 / 141
5.00% covered (danger)
5.00%
1 / 20
2244.24
0.00% covered (danger)
0.00%
0 / 1
 __construct
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
6
 filter
n/a
0 / 0
n/a
0 / 0
0
 addBlacklistType
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getBlacklistTypes
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getSpamBlacklist
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getEmailBlacklist
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getInstance
0.00% covered (danger)
0.00%
0 / 8
0.00% covered (danger)
0.00%
0 / 1
20
 clearInstanceCache
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getBlacklistType
n/a
0 / 0
n/a
0 / 0
0
 isLocalSource
0.00% covered (danger)
0.00%
0 / 22
0.00% covered (danger)
0.00%
0 / 1
132
 getTypeFromTitle
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
 getBlacklists
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
6
 getLocalBlacklists
0.00% covered (danger)
0.00%
0 / 9
0.00% covered (danger)
0.00%
0 / 1
2
 getWhitelists
0.00% covered (danger)
0.00%
0 / 9
0.00% covered (danger)
0.00%
0 / 1
2
 getSharedBlacklists
0.00% covered (danger)
0.00%
0 / 21
0.00% covered (danger)
0.00%
0 / 1
20
 clearCache
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
2
 buildSharedBlacklists
0.00% covered (danger)
0.00%
0 / 17
0.00% covered (danger)
0.00%
0 / 1
30
 getHttpText
0.00% covered (danger)
0.00%
0 / 17
0.00% covered (danger)
0.00%
0 / 1
30
 getArticleText
0.00% covered (danger)
0.00%
0 / 10
0.00% covered (danger)
0.00%
0 / 1
12
 getRegexStart
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getRegexEnd
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
6
 warmCachesForFilter
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace MediaWiki\Extension\SpamBlacklist;
4
5use InvalidArgumentException;
6use MediaWiki\Content\TextContent;
7use MediaWiki\MediaWikiServices;
8use MediaWiki\Revision\SlotRecord;
9use MediaWiki\Title\Title;
10use MediaWiki\User\User;
11
/**
 * Base class for different kinds of blacklists.
 *
 * Concrete subclasses implement filter() and getBlacklistType(). This class
 * keeps a static registry of type code => class name, a per-type instance
 * cache, and the shared logic for loading and caching the regex lists from
 * local messages, other wiki databases, HTTP URLs and plain files.
 */
abstract class BaseBlacklist {
    /**
     * Array of blacklist sources.
     *
     * Each entry is one of: "DB: <wiki> <page title>", an http(s) or
     * protocol-relative URL, or anything else, which is read with
     * file_get_contents() (see buildSharedBlacklists()).
     *
     * @var string[]
     */
    public $files = [];

    /**
     * Array containing regexes to test against, or false while they have
     * not been loaded yet by getBlacklists().
     *
     * @var string[]|false
     */
    protected $regexes = false;

    /**
     * Upper bound handed to mt_rand( 0, $warningChance ) in getHttpText().
     * When the warning key is missing and the draw is 0, the cached remote
     * text is refreshed.
     *
     * @var int
     */
    public $warningChance = 100;

    /**
     * TTL used for the "filewarning" cache key in getHttpText()
     * (presumably seconds -- confirm against the cache backend).
     *
     * @var int
     */
    public $warningTime = 600;

    /**
     * TTL used for the cached regex lists and the cached remote file text
     * (presumably seconds -- confirm against the cache backend).
     *
     * @var int
     */
    public $expiryTime = 900;

    /**
     * Registry of blacklist implementations, keyed by type code.
     * Values are names of classes that extend BaseBlacklist.
     *
     * @var string[]
     */
    private static $blacklistTypes = [
        'spam' => SpamBlacklist::class,
        'email' => EmailBlacklist::class,
    ];

    /**
     * Blacklist instances already created by getInstance(), keyed by type code.
     *
     * @var self[]
     */
    private static $instances = [];

    /**
     * Copies every entry of $settings onto the instance as a property.
     *
     * @param array $settings Map of property name => value
     */
    public function __construct( $settings = [] ) {
        foreach ( $settings as $property => $setting ) {
            $this->$property = $setting;
        }
    }

    /**
     * @param array $links
     * @param ?Title $title
     * @param User $user
     * @param bool $preventLog
     * @return mixed
     */
    abstract public function filter(
        array $links,
        ?Title $title,
        User $user,
        $preventLog = false
    );

    /**
     * Adds a blacklist class to the registry, replacing any class that was
     * already registered under the same type code.
     *
     * @param string $type
     * @param string $class
     */
    public static function addBlacklistType( $type, $class ) {
        self::$blacklistTypes[$type] = $class;
    }

    /**
     * Return the array of blacklist types currently defined
     *
     * @return string[] Map of type code => class name
     */
    public static function getBlacklistTypes() {
        return self::$blacklistTypes;
    }

    /**
     * @return SpamBlacklist
     */
    public static function getSpamBlacklist() {
        // @phan-suppress-next-line PhanTypeMismatchReturnSuperType
        return self::getInstance( 'spam' );
    }

    /**
     * @return EmailBlacklist
     */
    public static function getEmailBlacklist() {
        // @phan-suppress-next-line PhanTypeMismatchReturnSuperType
        return self::getInstance( 'email' );
    }

    /**
     * Returns an instance of the given blacklist, creating and caching it on
     * first use. The instance is constructed with $wgBlacklistSettings[$type].
     *
     * @deprecated Use getSpamBlacklist() or getEmailBlacklist() instead
     * @param string $type Code for the blacklist
     * @return BaseBlacklist
     * @throws InvalidArgumentException If $type is not a registered type
     */
    public static function getInstance( $type ) {
        if ( !isset( self::$blacklistTypes[$type] ) ) {
            throw new InvalidArgumentException( "Invalid blacklist type '$type' passed to " . __METHOD__ );
        }

        if ( !isset( self::$instances[$type] ) ) {
            global $wgBlacklistSettings;

            // Prevent notices: make sure the settings entry exists before reading it.
            if ( !isset( $wgBlacklistSettings[$type] ) ) {
                $wgBlacklistSettings[$type] = [];
            }

            $class = self::$blacklistTypes[$type];
            self::$instances[$type] = new $class( $wgBlacklistSettings[$type] );
        }

        return self::$instances[$type];
    }

    /**
     * Clear instance cache. For use during testing.
     */
    public static function clearInstanceCache() {
        self::$instances = [];
    }

    /**
     * Returns the code for the blacklist implementation
     *
     * @return string
     */
    abstract protected function getBlacklistType();

    /**
     * Check if the given local page title is a spam regex source.
     *
     * A title counts as a source when it is one of the
     * "<Type>-blacklist" / "<Type>-whitelist" pages in the MediaWiki
     * namespace, or when any configured 'files' entry of any registered type
     * points at it, either as "DB: <this wiki> <prefixed DB key>" or as the
     * raw-view URL of the page.
     *
     * @param Title $title
     * @return bool
     */
    public static function isLocalSource( Title $title ) {
        global $wgDBname, $wgBlacklistSettings;

        if ( $title->inNamespace( NS_MEDIAWIKI ) ) {
            $sources = [];
            foreach ( self::$blacklistTypes as $type => $class ) {
                // For the built in types, this results in the use of:
                // Spam-blacklist, Spam-whitelist
                // Email-blacklist, Email-whitelist
                $type = ucfirst( $type );
                $sources[] = "$type-blacklist";
                $sources[] = "$type-whitelist";
            }

            if ( in_array( $title->getDBkey(), $sources, true ) ) {
                return true;
            }
        }

        // Only build the raw-URL pattern when the URL could be expanded;
        // a pattern built from a failed expansion would match an empty
        // file name.
        $thisHttp = wfExpandUrl( $title->getFullUrl( 'action=raw' ), PROTO_HTTP );
        $thisHttpRegex = ( is_string( $thisHttp ) && $thisHttp !== '' )
            ? '/^' . preg_quote( $thisHttp, '/' ) . '(?:&.*)?$/'
            : null;

        // Collect the sources of every registered type. array_merge() is
        // required here: the "+" union operator keeps the left-hand value on
        // numeric key collisions, which would silently drop the entries of
        // every type after the first one.
        $files = [];
        foreach ( self::$blacklistTypes as $type => $class ) {
            if ( isset( $wgBlacklistSettings[$type]['files'] ) ) {
                $files = array_merge(
                    $files,
                    array_values( (array)$wgBlacklistSettings[$type]['files'] )
                );
            }
        }

        foreach ( $files as $fileName ) {
            $matches = [];
            // Same "DB:" syntax as accepted by buildSharedBlacklists(),
            // including wiki IDs that contain a hyphen.
            if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
                if ( $wgDBname === $matches[1] && $matches[2] === $title->getPrefixedDBkey() ) {
                    // Local DB fetch of this page...
                    return true;
                }
            } elseif ( $thisHttpRegex !== null && preg_match( $thisHttpRegex, $fileName ) ) {
                // Raw view of this page
                return true;
            }
        }

        return false;
    }

    /**
     * Returns the type of blacklist from the given title
     *
     * The DB key is searched for "<Type>-blacklist" or "<Type>-whitelist",
     * where <Type> is a registered type code with its first letter
     * upper-cased by the content language.
     *
     * @todo building a regex for this is pretty overkill
     * @param Title $title
     * @return bool|string Lower-cased type code, or false if nothing matched
     */
    public static function getTypeFromTitle( Title $title ) {
        $contLang = MediaWikiServices::getInstance()->getContentLanguage();

        // Quote each code so a custom type containing regex metacharacters
        // cannot break or alter the pattern.
        $types = array_map(
            static function ( $type ) use ( $contLang ) {
                return preg_quote( $contLang->ucfirst( (string)$type ), '/' );
            },
            array_keys( self::$blacklistTypes )
        );
        $regex = '/(' . implode( '|', $types ) . ')-(?:blacklist|whitelist)/';

        if ( preg_match( $regex, $title->getDBkey(), $m ) ) {
            return strtolower( $m[1] );
        }

        return false;
    }

    /**
     * Fetch local and (possibly cached) remote blacklists.
     * Will be cached locally across multiple invocations.
     * @return string[] set of regular expressions, potentially empty.
     */
    public function getBlacklists() {
        if ( $this->regexes === false ) {
            $this->regexes = array_merge(
                $this->getLocalBlacklists(),
                $this->getSharedBlacklists()
            );
        }
        return $this->regexes;
    }

    /**
     * Returns the local blacklist, built from the "<type>-blacklist" message
     * and cached in the main WAN object cache for $expiryTime.
     *
     * @return string[] Regular expressions
     */
    public function getLocalBlacklists() {
        $type = $this->getBlacklistType();
        $cache = MediaWikiServices::getInstance()->getMainWANObjectCache();

        return $cache->getWithSetCallback(
            $cache->makeKey( 'spamblacklist', $type, 'blacklist-regex' ),
            $this->expiryTime,
            function () use ( $type ) {
                return SpamRegexBatch::regexesFromMessage( "{$type}-blacklist", $this );
            }
        );
    }

    /**
     * Returns the (local) whitelist, built from the "<type>-whitelist"
     * message and cached in the main WAN object cache for $expiryTime.
     *
     * @return string[] Regular expressions
     */
    public function getWhitelists() {
        $type = $this->getBlacklistType();
        $cache = MediaWikiServices::getInstance()->getMainWANObjectCache();

        return $cache->getWithSetCallback(
            $cache->makeKey( 'spamblacklist', $type, 'whitelist-regex' ),
            $this->expiryTime,
            function () use ( $type ) {
                return SpamRegexBatch::regexesFromMessage( "{$type}-whitelist", $this );
            }
        );
    }

    /**
     * Fetch (possibly cached) remote blacklists.
     *
     * Returns an empty list when no files are configured, and also while
     * MW_PHPUNIT_TEST is defined, so tests never load remote sources.
     *
     * @return array
     */
    private function getSharedBlacklists() {
        $listType = $this->getBlacklistType();

        wfDebugLog( 'SpamBlacklist', "Loading $listType regex..." );

        if ( !$this->files ) {
            # No lists
            wfDebugLog( 'SpamBlacklist', "no files specified\n" );
            return [];
        }

        if ( defined( 'MW_PHPUNIT_TEST' ) ) {
            wfDebugLog( 'SpamBlacklist', 'remote loading disabled during PHPUnit test' );
            return [];
        }

        // Set by the callback below, so a cache hit can be told apart from a rebuild.
        $miss = false;
        $cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
        $regexes = $cache->getWithSetCallback(
            // This used to be cached per-site, but that could be bad on a shared
            // server where not all wikis have the same configuration.
            $cache->makeKey( 'spamblacklist', $listType, 'shared-blacklist-regex' ),
            $this->expiryTime,
            function () use ( &$miss ) {
                $miss = true;
                return $this->buildSharedBlacklists();
            }
        );

        if ( !$miss ) {
            wfDebugLog( 'SpamBlacklist', "Got shared spam regexes from cache\n" );
        }

        return $regexes;
    }

    /**
     * Clear all primary blacklist cache keys
     * (shared blacklist, local blacklist and whitelist regexes of this type).
     */
    public function clearCache() {
        $listType = $this->getBlacklistType();

        $cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
        foreach ( [ 'shared-blacklist-regex', 'blacklist-regex', 'whitelist-regex' ] as $component ) {
            $cache->delete( $cache->makeKey( 'spamblacklist', $listType, $component ) );
        }

        wfDebugLog( 'SpamBlacklist', "$listType blacklist local cache cleared.\n" );
    }

    /**
     * Load every configured source in $files and turn its text into regexes.
     *
     * @return string[] Regular expressions from all sources that yielded text
     */
    private function buildSharedBlacklists() {
        $regexes = [];
        $listType = $this->getBlacklistType();
        # Load lists
        wfDebugLog( 'SpamBlacklist', "Constructing $listType blacklist\n" );
        foreach ( $this->files as $fileName ) {
            $matches = [];
            if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
                $text = $this->getArticleText( $matches[1], $matches[2] );
            } elseif ( preg_match( '/^(https?:)?\/\//', $fileName ) ) {
                $text = $this->getHttpText( $fileName );
            } else {
                $text = file_get_contents( $fileName );
                if ( $text === false ) {
                    // Do not claim success in the log when the read failed.
                    wfDebugLog( 'SpamBlacklist', "Error loading $listType blacklist from file $fileName\n" );
                } else {
                    wfDebugLog( 'SpamBlacklist', "got from file $fileName\n" );
                }
            }

            if ( $text ) {
                // Build a separate batch of regexes from each source.
                // While in theory we could squeeze a little efficiency
                // out of combining multiple sources in one regex, if
                // there's a bad line in one of them we'll gain more
                // from only having to break that set into smaller pieces.
                $regexes = array_merge(
                    $regexes,
                    SpamRegexBatch::regexesFromText( $text, $this, $fileName )
                );
            }
        }

        return $regexes;
    }

    /**
     * Fetch the text of a remote blacklist over HTTP, going through the
     * object cache selected by $wgMessageCacheType.
     *
     * @param string $fileName URL of the remote list
     * @return string|null|false The text, or a non-string value if loading failed
     */
    private function getHttpText( $fileName ) {
        global $wgMessageCacheType;
        // FIXME: This is a hack to use Memcached where possible (incl. WMF),
        // but have CACHE_DB as fallback (instead of no cache).
        // This might be a good candidate for T248005.
        $cache = MediaWikiServices::getInstance()->getObjectCacheFactory()
            ->getInstance( $wgMessageCacheType );

        $listType = $this->getBlacklistType();
        // There are two keys, when the warning key expires, a random thread will refresh
        // the real key. This reduces the chance of multiple requests under high traffic
        // conditions.
        $key = $cache->makeGlobalKey( "blacklist_file_{$listType}", $fileName );
        $warningKey = $cache->makeKey( "filewarning_{$listType}", $fileName );
        $httpText = $cache->get( $key );
        $warning = $cache->get( $warningKey );

        // Refetch when nothing usable is cached, or when the warning key is
        // gone and the random draw is 0.
        $needsFetch = !is_string( $httpText )
            || ( !$warning && !mt_rand( 0, $this->warningChance ) );

        if ( !$needsFetch ) {
            wfDebugLog( 'SpamBlacklist', "Got $listType blacklist from HTTP cache for $fileName\n" );
            return $httpText;
        }

        wfDebugLog( 'SpamBlacklist', "Loading $listType blacklist from $fileName\n" );
        $httpText = MediaWikiServices::getInstance()->getHttpRequestFactory()
            ->get( $fileName, [], __METHOD__ );
        // Any non-string result (false as well as null) is a failed request.
        if ( !is_string( $httpText ) ) {
            wfDebugLog( 'SpamBlacklist', "Error loading $listType blacklist from $fileName\n" );
        }
        $cache->set( $warningKey, 1, $this->warningTime );
        $cache->set( $key, $httpText, $this->expiryTime );

        return $httpText;
    }

    /**
     * Fetch an article from this or another local MediaWiki database.
     *
     * @param string $wiki
     * @param string $pagename
     * @return bool|string|null The main-slot text, or false if there is no
     *  revision or its main-slot content is not TextContent
     */
    private function getArticleText( $wiki, $pagename ) {
        wfDebugLog( 'SpamBlacklist',
            "Fetching {$this->getBlacklistType()} blacklist from '$pagename' on '$wiki'...\n" );

        $services = MediaWikiServices::getInstance();

        // XXX: We do not know about custom namespaces on the target wiki here!
        $title = $services->getTitleParser()->parseTitle( $pagename );
        $store = $services->getRevisionStoreFactory()->getRevisionStore( $wiki );
        $rev = $store->getRevisionByTitle( $title );

        $content = $rev ? $rev->getContent( SlotRecord::MAIN ) : null;

        if ( !( $content instanceof TextContent ) ) {
            return false;
        }

        return $content->getText();
    }

    /**
     * Returns the start of the regex for matches
     *
     * @return string
     */
    public function getRegexStart() {
        return '/[a-z0-9_\-.]*';
    }

    /**
     * Returns the end of the regex for matches
     *
     * @param int $batchSize
     * @return string '/Sim' when $batchSize is positive, '/im' otherwise
     */
    public function getRegexEnd( $batchSize ) {
        return ( $batchSize > 0 ) ? '/Sim' : '/im';
    }

    /**
     * Hook for subclasses; the base implementation does nothing.
     *
     * @param Title $title
     * @param string[] $entries
     * @param User $user
     */
    public function warmCachesForFilter( Title $title, array $entries, User $user ) {
        // subclass this
    }
}