Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
4.29% covered (danger)
4.29%
6 / 140
5.00% covered (danger)
5.00%
1 / 20
CRAP
0.00% covered (danger)
0.00%
0 / 1
BaseBlacklist
4.29% covered (danger)
4.29%
6 / 140
5.00% covered (danger)
5.00%
1 / 20
2242.15
0.00% covered (danger)
0.00%
0 / 1
 __construct
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
6
 filter
n/a
0 / 0
n/a
0 / 0
0
 addBlacklistType
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getBlacklistTypes
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getSpamBlacklist
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getEmailBlacklist
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getInstance
0.00% covered (danger)
0.00%
0 / 8
0.00% covered (danger)
0.00%
0 / 1
20
 clearInstanceCache
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getBlacklistType
n/a
0 / 0
n/a
0 / 0
0
 isLocalSource
0.00% covered (danger)
0.00%
0 / 22
0.00% covered (danger)
0.00%
0 / 1
132
 getTypeFromTitle
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
 getBlacklists
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
6
 getLocalBlacklists
0.00% covered (danger)
0.00%
0 / 9
0.00% covered (danger)
0.00%
0 / 1
2
 getWhitelists
0.00% covered (danger)
0.00%
0 / 9
0.00% covered (danger)
0.00%
0 / 1
2
 getSharedBlacklists
0.00% covered (danger)
0.00%
0 / 21
0.00% covered (danger)
0.00%
0 / 1
20
 clearCache
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
2
 buildSharedBlacklists
0.00% covered (danger)
0.00%
0 / 17
0.00% covered (danger)
0.00%
0 / 1
30
 getHttpText
0.00% covered (danger)
0.00%
0 / 16
0.00% covered (danger)
0.00%
0 / 1
30
 getArticleText
0.00% covered (danger)
0.00%
0 / 10
0.00% covered (danger)
0.00%
0 / 1
12
 getRegexStart
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getRegexEnd
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
6
 warmCachesForFilter
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace MediaWiki\Extension\SpamBlacklist;
4
5use InvalidArgumentException;
6use MediaWiki\MediaWikiServices;
7use MediaWiki\Revision\SlotRecord;
8use MediaWiki\Title\Title;
9use MediaWiki\User\User;
10use ObjectCache;
11use TextContent;
12
/**
 * Base class for different kinds of blacklists.
 *
 * Besides the per-instance loading and caching of regexes, this class keeps
 * a static registry that maps a blacklist type code (e.g. 'spam') to the
 * class implementing it, and a static cache of one instance per type.
 */
abstract class BaseBlacklist {
    /**
     * Array of blacklist sources.
     *
     * Each entry is interpreted by buildSharedBlacklists() as one of:
     *  - "DB: <wiki> <page title>" (page fetched from a wiki database),
     *  - an http://, https:// or protocol-relative URL,
     *  - anything else: a path handed to file_get_contents().
     *
     * @var string[]
     */
    public $files = [];

    /**
     * Array containing regexes to test against, or false while they have
     * not been loaded yet (see getBlacklists()).
     *
     * @var string[]|false
     */
    protected $regexes = false;

    /**
     * Upper bound passed to mt_rand( 0, ... ) in getHttpText(): once the
     * warning key of a remote list has expired, a request refreshes the list
     * only when the draw is 0.
     *
     * @var int
     */
    public $warningChance = 100;

    /**
     * TTL (as passed to the cache) of the warning key set in getHttpText().
     *
     * @var int
     */
    public $warningTime = 600;

    /**
     * TTL (as passed to the cache) of cached blacklist/whitelist data.
     *
     * @var int
     */
    public $expiryTime = 900;

    /**
     * Registry of blacklist implementations: type code => class name.
     * The classes are expected to extend BaseBlacklist.
     *
     * @var string[]
     */
    private static $blacklistTypes = [
        'spam' => SpamBlacklist::class,
        'email' => EmailBlacklist::class,
    ];

    /**
     * Array of blacklist instances, keyed by type code
     *
     * @var self[]
     */
    private static $instances = [];

    /**
     * Copies every entry of $settings onto the property of the same name.
     *
     * @param array $settings Map of property name => value
     */
    public function __construct( $settings = [] ) {
        foreach ( $settings as $name => $value ) {
            $this->$name = $value;
        }
    }

    /**
     * @param array $links
     * @param ?Title $title
     * @param User $user
     * @param bool $preventLog
     * @return mixed
     */
    abstract public function filter(
        array $links,
        ?Title $title,
        User $user,
        $preventLog = false
    );

    /**
     * Adds a blacklist class to the registry, replacing any class that was
     * previously registered for the same type code.
     *
     * @param string $type
     * @param string $class
     */
    public static function addBlacklistType( $type, $class ) {
        self::$blacklistTypes[$type] = $class;
    }

    /**
     * Return the array of blacklist types currently defined
     *
     * @return string[] Map of type code => class name
     */
    public static function getBlacklistTypes() {
        return self::$blacklistTypes;
    }

    /**
     * @return SpamBlacklist
     */
    public static function getSpamBlacklist() {
        // @phan-suppress-next-line PhanTypeMismatchReturnSuperType
        return self::getInstance( 'spam' );
    }

    /**
     * @return EmailBlacklist
     */
    public static function getEmailBlacklist() {
        // @phan-suppress-next-line PhanTypeMismatchReturnSuperType
        return self::getInstance( 'email' );
    }

    /**
     * Returns an instance of the given blacklist.
     *
     * The instance is created on first use from $wgBlacklistSettings[$type]
     * and then reused until clearInstanceCache() is called.
     *
     * @deprecated Use getSpamBlacklist() or getEmailBlacklist() instead
     * @param string $type Code for the blacklist
     * @return BaseBlacklist
     * @throws InvalidArgumentException If $type is not a registered type
     */
    public static function getInstance( $type ) {
        if ( !isset( self::$blacklistTypes[$type] ) ) {
            throw new InvalidArgumentException( "Invalid blacklist type '$type' passed to " . __METHOD__ );
        }

        if ( !isset( self::$instances[$type] ) ) {
            global $wgBlacklistSettings;

            // Prevent notices
            if ( !isset( $wgBlacklistSettings[$type] ) ) {
                $wgBlacklistSettings[$type] = [];
            }

            $class = self::$blacklistTypes[$type];
            self::$instances[$type] = new $class( $wgBlacklistSettings[$type] );
        }

        return self::$instances[$type];
    }

    /**
     * Clear instance cache. For use during testing.
     */
    public static function clearInstanceCache() {
        self::$instances = [];
    }

    /**
     * Returns the code for the blacklist implementation
     *
     * @return string
     */
    abstract protected function getBlacklistType();

    /**
     * Check if the given local page title is a spam regex source.
     *
     * A title counts as a source when it is one of the
     * "<Type>-blacklist" / "<Type>-whitelist" pages in the MediaWiki
     * namespace, or when one of the configured 'files' entries of any
     * registered type points at it, either as "DB: <this wiki> <title>" or
     * as the action=raw URL of the page.
     *
     * @param Title $title
     * @return bool
     */
    public static function isLocalSource( Title $title ) {
        global $wgDBname, $wgBlacklistSettings;

        if ( $title->inNamespace( NS_MEDIAWIKI ) ) {
            $sources = [];
            foreach ( self::$blacklistTypes as $type => $class ) {
                // For the built in types, this results in the use of:
                // Spam-blacklist, Spam-whitelist
                // Email-blacklist, Email-whitelist
                // (the first letter is upper-cased before comparing with the DB key)
                $type = ucfirst( $type );
                $sources[] = "$type-blacklist";
                $sources[] = "$type-whitelist";
            }

            if ( in_array( $title->getDBkey(), $sources, true ) ) {
                return true;
            }
        }

        $thisHttp = wfExpandUrl( $title->getFullUrl( 'action=raw' ), PROTO_HTTP );
        $thisHttpRegex = '/^' . preg_quote( $thisHttp, '/' ) . '(?:&.*)?$/';

        // Collect the sources of every type. Entries are appended one by one
        // rather than combined with the array union operator: "+" keeps only
        // the first value for each key, so with list-style arrays the entries
        // of every type but the first would be lost.
        $files = [];
        foreach ( self::$blacklistTypes as $type => $class ) {
            if ( isset( $wgBlacklistSettings[$type]['files'] ) ) {
                foreach ( $wgBlacklistSettings[$type]['files'] as $fileName ) {
                    $files[] = $fileName;
                }
            }
        }

        foreach ( $files as $fileName ) {
            $matches = [];
            // Same "DB:" syntax as accepted by buildSharedBlacklists(),
            // including hyphens in the wiki id.
            if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
                if ( $wgDBname === $matches[1] && $matches[2] === $title->getPrefixedDbKey() ) {
                    // Local DB fetch of this page...
                    return true;
                }
            } elseif ( preg_match( $thisHttpRegex, $fileName ) ) {
                // Raw view of this page
                return true;
            }
        }

        return false;
    }

    /**
     * Returns the type of blacklist from the given title
     *
     * @todo building a regex for this is pretty overkill
     * @param Title $title
     * @return bool|string Lower-cased type code, or false when the DB key of
     *  the title does not contain "<Type>-blacklist" or "<Type>-whitelist"
     */
    public static function getTypeFromTitle( Title $title ) {
        $contLang = MediaWikiServices::getInstance()->getContentLanguage();

        $types = [];
        foreach ( array_keys( self::$blacklistTypes ) as $type ) {
            // Types can be registered at runtime via addBlacklistType(), so
            // quote them before embedding them in the pattern.
            $types[] = preg_quote( $contLang->ucfirst( $type ), '/' );
        }
        $regex = '/(' . implode( '|', $types ) . ')-(?:blacklist|whitelist)/';

        if ( preg_match( $regex, $title->getDBkey(), $m ) ) {
            return strtolower( $m[1] );
        }

        return false;
    }

    /**
     * Fetch local and (possibly cached) remote blacklists.
     * Will be cached locally across multiple invocations.
     * @return string[] set of regular expressions, potentially empty.
     */
    public function getBlacklists() {
        if ( $this->regexes === false ) {
            $this->regexes = array_merge(
                $this->getLocalBlacklists(),
                $this->getSharedBlacklists()
            );
        }
        return $this->regexes;
    }

    /**
     * Returns the local blacklist
     *
     * @return string[] Regular expressions
     */
    public function getLocalBlacklists() {
        return $this->getRegexesFromLocalMessage( 'blacklist' );
    }

    /**
     * Returns the (local) whitelist
     *
     * @return string[] Regular expressions
     */
    public function getWhitelists() {
        return $this->getRegexesFromLocalMessage( 'whitelist' );
    }

    /**
     * Shared implementation of getLocalBlacklists() and getWhitelists():
     * returns the regexes built from the "<type>-<suffix>" message, going
     * through the main WAN cache under the key
     * ( 'spamblacklist', <type>, '<suffix>-regex' ) with $expiryTime as TTL.
     *
     * @param string $suffix Either 'blacklist' or 'whitelist'
     * @return string[] Regular expressions
     */
    private function getRegexesFromLocalMessage( string $suffix ) {
        $type = $this->getBlacklistType();
        $cache = MediaWikiServices::getInstance()->getMainWANObjectCache();

        return $cache->getWithSetCallback(
            $cache->makeKey( 'spamblacklist', $type, "{$suffix}-regex" ),
            $this->expiryTime,
            function () use ( $type, $suffix ) {
                return SpamRegexBatch::regexesFromMessage( "{$type}-{$suffix}", $this );
            }
        );
    }

    /**
     * Fetch (possibly cached) remote blacklists.
     *
     * Returns an empty array without touching the cache when no sources are
     * configured or when MW_PHPUNIT_TEST is defined.
     *
     * @return array
     */
    private function getSharedBlacklists() {
        $listType = $this->getBlacklistType();

        wfDebugLog( 'SpamBlacklist', "Loading $listType regex..." );

        if ( !$this->files ) {
            # No lists
            wfDebugLog( 'SpamBlacklist', "no files specified\n" );
            return [];
        }

        if ( defined( 'MW_PHPUNIT_TEST' ) ) {
            wfDebugLog( 'SpamBlacklist', 'remote loading disabled during PHPUnit test' );
            return [];
        }

        // Flipped by the callback below, which only runs when the value has
        // to be (re)built; used purely to decide what to log.
        $miss = false;
        $cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
        $regexes = $cache->getWithSetCallback(
            // This used to be cached per-site, but that could be bad on a shared
            // server where not all wikis have the same configuration.
            $cache->makeKey( 'spamblacklist', $listType, 'shared-blacklist-regex' ),
            $this->expiryTime,
            function () use ( &$miss ) {
                $miss = true;
                return $this->buildSharedBlacklists();
            }
        );

        if ( !$miss ) {
            wfDebugLog( 'SpamBlacklist', "Got shared spam regexes from cache\n" );
        }

        return $regexes;
    }

    /**
     * Clear all primary blacklist cache keys
     */
    public function clearCache() {
        $listType = $this->getBlacklistType();

        $cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
        $cache->delete( $cache->makeKey( 'spamblacklist', $listType, 'shared-blacklist-regex' ) );
        $cache->delete( $cache->makeKey( 'spamblacklist', $listType, 'blacklist-regex' ) );
        $cache->delete( $cache->makeKey( 'spamblacklist', $listType, 'whitelist-regex' ) );

        wfDebugLog( 'SpamBlacklist', "$listType blacklist local cache cleared.\n" );
    }

    /**
     * Load every configured source in $files and turn its text into regexes.
     *
     * Sources that yield no text (empty or failed to load) are skipped.
     *
     * @return string[] Regular expressions from all sources, in source order
     */
    private function buildSharedBlacklists() {
        $regexes = [];
        $listType = $this->getBlacklistType();
        # Load lists
        wfDebugLog( 'SpamBlacklist', "Constructing $listType blacklist\n" );
        foreach ( $this->files as $fileName ) {
            $matches = [];
            if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
                $text = $this->getArticleText( $matches[1], $matches[2] );
            } elseif ( preg_match( '/^(https?:)?\/\//', $fileName ) ) {
                $text = $this->getHttpText( $fileName );
            } else {
                $text = file_get_contents( $fileName );
                // file_get_contents() returns false on failure; only claim
                // success in the log when something was actually read.
                if ( $text === false ) {
                    wfDebugLog( 'SpamBlacklist', "Error loading $listType blacklist from file $fileName\n" );
                } else {
                    wfDebugLog( 'SpamBlacklist', "got from file $fileName\n" );
                }
            }

            if ( $text ) {
                // Build a separate batch of regexes from each source.
                // While in theory we could squeeze a little efficiency
                // out of combining multiple sources in one regex, if
                // there's a bad line in one of them we'll gain more
                // from only having to break that set into smaller pieces.
                $regexes = array_merge(
                    $regexes,
                    SpamRegexBatch::regexesFromText( $text, $this, $fileName )
                );
            }
        }

        return $regexes;
    }

    /**
     * Fetch the text of a remote (HTTP) blacklist source, using a cached
     * copy when one is available.
     *
     * @param string $fileName URL of the source
     * @return string|false Text of the source, or false if it could not be
     *  loaded
     */
    private function getHttpText( $fileName ) {
        global $wgMessageCacheType;
        // FIXME: This is a hack to use Memcached where possible (incl. WMF),
        // but have CACHE_DB as fallback (instead of no cache).
        // This might be a good candidate for T248005.
        $cache = ObjectCache::getInstance( $wgMessageCacheType );

        $listType = $this->getBlacklistType();
        // There are two keys, when the warning key expires, a random thread will refresh
        // the real key. This reduces the chance of multiple requests under high traffic
        // conditions.
        $key = $cache->makeGlobalKey( "blacklist_file_{$listType}", $fileName );
        $warningKey = $cache->makeKey( "filewarning_{$listType}", $fileName );
        $httpText = $cache->get( $key );
        $warning = $cache->get( $warningKey );

        if ( !is_string( $httpText ) || ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) {
            wfDebugLog( 'SpamBlacklist', "Loading $listType blacklist from $fileName\n" );
            $httpText = MediaWikiServices::getInstance()->getHttpRequestFactory()
                ->get( $fileName, [], __METHOD__ );
            // HttpRequestFactory::get() signals failure with null rather than
            // false, so treat anything that is not a string as a failure.
            if ( !is_string( $httpText ) ) {
                wfDebugLog( 'SpamBlacklist', "Error loading $listType blacklist from $fileName\n" );
                $httpText = false;
            }
            $cache->set( $warningKey, 1, $this->warningTime );
            $cache->set( $key, $httpText, $this->expiryTime );
        } else {
            wfDebugLog( 'SpamBlacklist', "Got $listType blacklist from HTTP cache for $fileName\n" );
        }
        return $httpText;
    }

    /**
     * Fetch an article from this or another local MediaWiki database.
     *
     * @param string $wiki
     * @param string $pagename
     * @return string|false Text of the main slot, or false when there is no
     *  revision for the title or its main slot content is not a TextContent
     */
    private function getArticleText( $wiki, $pagename ) {
        wfDebugLog( 'SpamBlacklist',
            "Fetching {$this->getBlacklistType()} blacklist from '$pagename' on '$wiki'...\n" );

        $services = MediaWikiServices::getInstance();

        // XXX: We do not know about custom namespaces on the target wiki here!
        $title = $services->getTitleParser()->parseTitle( $pagename );
        $store = $services->getRevisionStoreFactory()->getRevisionStore( $wiki );
        $rev = $store->getRevisionByTitle( $title );

        $content = $rev ? $rev->getContent( SlotRecord::MAIN ) : null;

        if ( !( $content instanceof TextContent ) ) {
            return false;
        }

        return $content->getText();
    }

    /**
     * Returns the start of the regex for matches
     *
     * @return string Opening delimiter followed by the common pattern prefix
     */
    public function getRegexStart() {
        return '/[a-z0-9_\-.]*';
    }

    /**
     * Returns the end of the regex for matches
     *
     * @param int $batchSize
     * @return string Closing delimiter plus modifiers; the S modifier is
     *  only added when $batchSize is greater than zero
     */
    public function getRegexEnd( $batchSize ) {
        return ( $batchSize > 0 ) ? '/Sim' : '/im';
    }

    /**
     * Hook for subclasses; the base implementation does nothing.
     *
     * @param Title $title
     * @param string[] $entries
     * @param User $user
     */
    public function warmCachesForFilter( Title $title, array $entries, User $user ) {
        // subclass this
    }
}