MediaWiki REL1_34
BaseBlacklist.php
Go to the documentation of this file.
1<?php
2
5
/**
 * Base class for different kinds of blacklists.
 *
 * Concrete blacklists are registered by type name (see addBlacklistType())
 * and are instantiated lazily, once per type, through getInstance().
 */
abstract class BaseBlacklist {
	/**
	 * Array of blacklist sources.
	 *
	 * Each entry is one of (see buildSharedBlacklists()):
	 *  - "DB: <wiki> <page name>" for a page in a local MediaWiki database,
	 *  - an http://, https:// or protocol-relative (//) URL,
	 *  - anything else, which is read as a file path.
	 *
	 * @var string[]
	 */
	public $files = [];

	/**
	 * Array containing regexes to test against; false until getBlacklists()
	 * has populated it.
	 *
	 * @var bool|array
	 */
	protected $regexes = false;

	/**
	 * Upper bound handed to mt_rand( 0, $warningChance ) in getHttpText():
	 * once the warning key has expired, a request refreshes the HTTP cache
	 * only when that draw comes out as 0.
	 *
	 * @var int
	 */
	public $warningChance = 100;

	/**
	 * Expiry passed to the cache when setting the HTTP "warning" key.
	 *
	 * @var int
	 */
	public $warningTime = 600;

	/**
	 * Expiry passed to the cache for fetched HTTP text and built regexes.
	 *
	 * @var int
	 */
	public $expiryTime = 900;

	/**
	 * Array containing blacklists that extend BaseBlacklist,
	 * as a map of type name => class name.
	 *
	 * @var array
	 */
	private static $blacklistTypes = [
		'spam' => 'SpamBlacklist',
		'email' => 'EmailBlacklist',
	];

	/**
	 * Array of blacklist instances, keyed by type name.
	 *
	 * @var array
	 */
	private static $instances = [];

	/**
	 * Constructor.
	 *
	 * @param array $settings Map of property name => value; every entry is
	 *  assigned onto the instance as-is.
	 */
	public function __construct( $settings = [] ) {
		foreach ( $settings as $name => $value ) {
			$this->$name = $value;
		}
	}

	/**
	 * Run the blacklist against the given links; implemented by subclasses.
	 *
	 * @param array $links
	 * @param Title $title
	 * @param bool $preventLog
	 * @return mixed
	 */
	abstract public function filter( array $links, Title $title, $preventLog = false );

	/**
	 * Adds a blacklist class to the registry.
	 *
	 * @param string $type Type name, used as the registry key
	 * @param string $class Name of the class implementing that type
	 */
	public static function addBlacklistType( $type, $class ) {
		self::$blacklistTypes[$type] = $class;
	}

	/**
	 * Return the array of blacklist types currently defined.
	 *
	 * @return array Map of type name => class name
	 */
	public static function getBlacklistTypes() {
		return self::$blacklistTypes;
	}

	/**
	 * Shortcut for getInstance( 'spam' ).
	 *
	 * @return BaseBlacklist
	 */
	public static function getSpamBlacklist() {
		return self::getInstance( 'spam' );
	}

	/**
	 * Shortcut for getInstance( 'email' ).
	 *
	 * @return BaseBlacklist
	 */
	public static function getEmailBlacklist() {
		return self::getInstance( 'email' );
	}

	/**
	 * Returns an instance of the given blacklist.
	 *
	 * The instance is created on first use from $wgBlacklistSettings[$type]
	 * and then reused for subsequent calls.
	 *
	 * @param string $type Code for the blacklist
	 * @return BaseBlacklist
	 * @throws Exception If $type has not been registered
	 */
	public static function getInstance( $type ) {
		if ( !isset( self::$blacklistTypes[$type] ) ) {
			throw new Exception( "Invalid blacklist type '$type' passed to " . __METHOD__ );
		}

		if ( !isset( self::$instances[$type] ) ) {
			global $wgBlacklistSettings;

			// Prevent notices
			if ( !isset( $wgBlacklistSettings[$type] ) ) {
				$wgBlacklistSettings[$type] = [];
			}

			$class = self::$blacklistTypes[$type];
			self::$instances[$type] = new $class( $wgBlacklistSettings[$type] );
		}

		return self::$instances[$type];
	}

	/**
	 * Returns the code for the blacklist implementation.
	 *
	 * @return string
	 */
	abstract protected function getBlacklistType();

	/**
	 * Check if the given local page title is a spam regex source.
	 *
	 * A title counts as a source when it is one of the "<Type>-blacklist" /
	 * "<Type>-whitelist" pages in the MediaWiki namespace for any registered
	 * type, or when one of the configured 'files' entries of any registered
	 * type refers to it, either as "DB: <this wiki> <page>" or as the raw-view
	 * URL of the page.
	 *
	 * @param Title $title
	 * @return bool
	 */
	public static function isLocalSource( Title $title ) {
		global $wgDBname, $wgBlacklistSettings;

		if ( $title->inNamespace( NS_MEDIAWIKI ) ) {
			$sources = [];
			foreach ( array_keys( self::$blacklistTypes ) as $type ) {
				// Append rather than union (+=): both operands are lists, so a
				// union would silently drop every type after the first one.
				$type = ucfirst( $type );
				$sources[] = "$type-blacklist";
				$sources[] = "$type-whitelist";
			}

			if ( in_array( $title->getDBkey(), $sources, true ) ) {
				return true;
			}
		}

		$thisHttp = wfExpandUrl( $title->getFullUrl( 'action=raw' ), PROTO_HTTP );
		$thisHttpRegex = '/^' . preg_quote( $thisHttp, '/' ) . '(?:&.*)?$/';

		$files = [];
		foreach ( array_keys( self::$blacklistTypes ) as $type ) {
			if ( isset( $wgBlacklistSettings[$type]['files'] ) ) {
				// array_merge instead of +=, so that lists from different
				// types do not overwrite each other on colliding indexes.
				$files = array_merge( $files, $wgBlacklistSettings[$type]['files'] );
			}
		}

		foreach ( $files as $fileName ) {
			$matches = [];
			// Same pattern as buildSharedBlacklists(): wiki names may contain hyphens.
			if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
				if ( $wgDBname === $matches[1] && $matches[2] === $title->getPrefixedDbKey() ) {
					// Local DB fetch of this page...
					return true;
				}
			} elseif ( preg_match( $thisHttpRegex, $fileName ) ) {
				// Raw view of this page
				return true;
			}
		}

		return false;
	}

	/**
	 * Returns the type of blacklist from the given title.
	 *
	 * @param Title $title
	 * @return bool|string Lower-cased type name, or false if the title's DB
	 *  key does not contain "<Type>-blacklist" or "<Type>-whitelist" for any
	 *  registered type
	 */
	public static function getTypeFromTitle( Title $title ) {
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();

		$alternatives = [];
		foreach ( array_keys( self::$blacklistTypes ) as $type ) {
			// Type names come from addBlacklistType() and may hold regex metacharacters.
			$alternatives[] = preg_quote( $contLang->ucfirst( $type ), '/' );
		}
		$regex = '/(' . implode( '|', $alternatives ) . ')-(?:blacklist|whitelist)/';

		if ( preg_match( $regex, $title->getDBkey(), $m ) ) {
			return strtolower( $m[1] );
		}

		return false;
	}

	/**
	 * Fetch local and (possibly cached) remote blacklists.
	 *
	 * The merged result is kept on the instance, so the lookup happens at
	 * most once per object.
	 *
	 * @return array Regexes
	 */
	public function getBlacklists() {
		if ( $this->regexes === false ) {
			$this->regexes = array_merge(
				$this->getLocalBlacklists(),
				$this->getSharedBlacklists()
			);
		}
		return $this->regexes;
	}

	/**
	 * Returns the local blacklist.
	 *
	 * Built from the "<type>-blacklist" message and cached in the main WAN
	 * object cache for $expiryTime.
	 *
	 * @return array Regexes
	 */
	public function getLocalBlacklists() {
		$type = $this->getBlacklistType();
		$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();

		return $cache->getWithSetCallback(
			$cache->makeKey( 'spamblacklist', $type, 'blacklist-regex' ),
			$this->expiryTime,
			function () use ( $type ) {
				return SpamRegexBatch::regexesFromMessage( "{$type}-blacklist", $this );
			}
		);
	}

	/**
	 * Returns the (local) whitelist.
	 *
	 * Built from the "<type>-whitelist" message and cached in the main WAN
	 * object cache for $expiryTime.
	 *
	 * @return array Regexes
	 */
	public function getWhitelists() {
		$type = $this->getBlacklistType();
		$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();

		return $cache->getWithSetCallback(
			$cache->makeKey( 'spamblacklist', $type, 'whitelist-regex' ),
			$this->expiryTime,
			function () use ( $type ) {
				return SpamRegexBatch::regexesFromMessage( "{$type}-whitelist", $this );
			}
		);
	}

	/**
	 * Fetch (possibly cached) remote blacklists.
	 *
	 * @return array Regexes; empty when no source files are configured
	 */
	private function getSharedBlacklists() {
		$listType = $this->getBlacklistType();

		wfDebugLog( 'SpamBlacklist', "Loading $listType regex..." );

		if ( !$this->files ) {
			# No lists
			wfDebugLog( 'SpamBlacklist', "no files specified\n" );
			return [];
		}

		// Flipped by the callback below, which only runs on a cache miss.
		$miss = false;

		$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
		$regexes = $cache->getWithSetCallback(
			// This used to be cached per-site, but that could be bad on a shared
			// server where not all wikis have the same configuration.
			$cache->makeKey( 'spamblacklist', $listType, 'shared-blacklist-regex' ),
			$this->expiryTime,
			function () use ( &$miss ) {
				$miss = true;
				return $this->buildSharedBlacklists();
			}
		);

		if ( !$miss ) {
			wfDebugLog( 'SpamBlacklist', "Got shared spam regexes from cache\n" );
		}

		return $regexes;
	}

	/**
	 * Clear all primary blacklist cache keys.
	 *
	 * Deletes the shared-blacklist, local-blacklist and whitelist regex keys
	 * of this blacklist type from the main WAN object cache.
	 */
	public function clearCache() {
		$listType = $this->getBlacklistType();

		$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
		$cache->delete( $cache->makeKey( 'spamblacklist', $listType, 'shared-blacklist-regex' ) );
		$cache->delete( $cache->makeKey( 'spamblacklist', $listType, 'blacklist-regex' ) );
		$cache->delete( $cache->makeKey( 'spamblacklist', $listType, 'whitelist-regex' ) );

		wfDebugLog( 'SpamBlacklist', "$listType blacklist local cache cleared.\n" );
	}

	/**
	 * Load every configured source in $files and turn each into regexes.
	 *
	 * @return array Regexes from all sources, in source order
	 */
	private function buildSharedBlacklists() {
		$regexes = [];
		$listType = $this->getBlacklistType();
		# Load lists
		wfDebugLog( 'SpamBlacklist', "Constructing $listType blacklist\n" );
		foreach ( $this->files as $fileName ) {
			$matches = [];
			if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
				$text = $this->getArticleText( $matches[1], $matches[2] );
			} elseif ( preg_match( '/^(https?:)?\/\//', $fileName ) ) {
				$text = $this->getHttpText( $fileName );
			} else {
				$text = file_get_contents( $fileName );
				wfDebugLog( 'SpamBlacklist', "got from file $fileName\n" );
			}

			if ( $text === false ) {
				// Every loader reports failure as false; there is nothing to parse.
				// NOTE(review): assumes regexesFromText() yields no regexes for
				// empty input, so skipping changes nothing - confirm.
				wfDebugLog( 'SpamBlacklist', "Could not load $listType blacklist source $fileName\n" );
				continue;
			}

			// Build a separate batch of regexes from each source.
			// While in theory we could squeeze a little efficiency
			// out of combining multiple sources in one regex, if
			// there's a bad line in one of them we'll gain more
			// from only having to break that set into smaller pieces.
			$regexes = array_merge(
				$regexes,
				SpamRegexBatch::regexesFromText( $text, $this, $fileName )
			);
		}

		return $regexes;
	}

	/**
	 * Fetch a blacklist over HTTP, going through $messageMemc first.
	 *
	 * @param string $fileName URL of the blacklist
	 * @return string|bool The text, or false if the request failed
	 */
	private function getHttpText( $fileName ) {
		global $wgDBname, $messageMemc;
		$listType = $this->getBlacklistType();

		# HTTP request
		# To keep requests to a minimum, we save results into $messageMemc, which is
		# similar to $wgMemc except almost certain to exist. By default, it is stored
		# in the database
		# There are two keys, when the warning key expires, a random thread will refresh
		# the real key. This reduces the chance of multiple requests under high traffic
		# conditions.
		$key = "{$listType}_blacklist_file:$fileName";
		$warningKey = "$wgDBname:{$listType}filewarning:$fileName";
		$httpText = $messageMemc->get( $key );
		$warning = $messageMemc->get( $warningKey );

		// Refetch when nothing usable is cached, or - once the warning key is
		// gone - when the random draw picks this request.
		if ( !is_string( $httpText ) || ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) {
			wfDebugLog( 'SpamBlacklist', "Loading $listType blacklist from $fileName\n" );
			$httpText = Http::get( $fileName );
			if ( $httpText === false ) {
				wfDebugLog( 'SpamBlacklist', "Error loading $listType blacklist from $fileName\n" );
			}
			$messageMemc->set( $warningKey, 1, $this->warningTime );
			$messageMemc->set( $key, $httpText, $this->expiryTime );
		} else {
			wfDebugLog( 'SpamBlacklist', "Got $listType blacklist from HTTP cache for $fileName\n" );
		}
		return $httpText;
	}

	/**
	 * Fetch an article from this or another local MediaWiki database.
	 *
	 * @param string $wiki Wiki identifier handed to the revision store factory
	 * @param string $pagename Page name, parsed with the local title parser
	 * @return string|bool The page text, or false when there is no revision
	 *  or its main slot content is not a TextContent
	 */
	private function getArticleText( $wiki, $pagename ) {
		wfDebugLog( 'SpamBlacklist',
			"Fetching {$this->getBlacklistType()} blacklist from '$pagename' on '$wiki'...\n" );

		$services = MediaWikiServices::getInstance();

		// XXX: We do not know about custom namespaces on the target wiki here!
		$title = $services->getTitleParser()->parseTitle( $pagename );
		$store = $services->getRevisionStoreFactory()->getRevisionStore( $wiki );
		$rev = $store->getRevisionByTitle( $title );

		$content = $rev ? $rev->getContent( SlotRecord::MAIN ) : null;

		if ( !( $content instanceof TextContent ) ) {
			return false;
		}

		return $content->getText();
	}

	/**
	 * Returns the start of the regex for matches.
	 *
	 * @return string
	 */
	public function getRegexStart() {
		return '/[a-z0-9_\-.]*';
	}

	/**
	 * Returns the end of the regex for matches.
	 *
	 * @param int $batchSize A positive value adds the S modifier to the flags
	 * @return string
	 */
	public function getRegexEnd( $batchSize ) {
		return ( $batchSize > 0 ) ? '/Sim' : '/im';
	}

	/**
	 * Hook for subclasses to pre-load whatever filter() needs; does nothing
	 * in the base class.
	 *
	 * @param Title $title
	 * @param string[] $entries
	 */
	public function warmCachesForFilter( Title $title, array $entries ) {
		// subclass this
	}
}
$wgDBname
Current wiki database name.
wfExpandUrl( $url, $defaultProto=PROTO_CURRENT)
Expand a potentially local URL to a fully-qualified URL.
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
$messageMemc
Definition Setup.php:791
Base class for different kinds of blacklists.
static array $blacklistTypes
Array containing blacklists that extend BaseBlacklist.
getLocalBlacklists()
Returns the local blacklist.
static getBlacklistTypes()
Return the array of blacklist types currently defined.
array $files
Array of blacklist sources.
static getEmailBlacklist()
__construct( $settings=[])
Constructor.
static getSpamBlacklist()
getBlacklists()
Fetch local and (possibly cached) remote blacklists.
clearCache()
Clear all primary blacklist cache keys.
filter(array $links, Title $title, $preventLog=false)
getWhitelists()
Returns the (local) whitelist.
getSharedBlacklists()
Fetch (possibly cached) remote blacklists.
getBlacklistType()
Returns the code for the blacklist implementation.
getRegexStart()
Returns the start of the regex for matches.
getArticleText( $wiki, $pagename)
Fetch an article from this or another local MediaWiki database.
static getInstance( $type)
Returns an instance of the given blacklist.
static getTypeFromTitle(Title $title)
Returns the type of blacklist from the given title.
bool|array $regexes
Array containing regexes to test against.
static isLocalSource(Title $title)
Check if the given local page title is a spam regex source.
static array $instances
Array of blacklist instances.
getHttpText( $fileName)
static addBlacklistType( $type, $class)
Adds a blacklist class to the registry.
int $warningChance
Chance of receiving a warning when the filter is hit.
getRegexEnd( $batchSize)
Returns the end of the regex for matches.
warmCachesForFilter(Title $title, array $entries)
MediaWikiServices is the service locator for the application scope of MediaWiki.
Value object representing a content slot associated with a page revision.
static regexesFromMessage( $message, BaseBlacklist $blacklist)
Build a set of regular expressions from a MediaWiki message.
static regexesFromText( $source, BaseBlacklist $blacklist, $fileName=false)
Build a set of regular expressions from the given multiline input text, with empty lines and comments...
Content object implementation for representing flat text.
Represents a title within MediaWiki.
Definition Title.php:42
const NS_MEDIAWIKI
Definition Defines.php:77
const PROTO_HTTP
Definition Defines.php:208
$cache
Definition mcc.php:33
$content
Definition router.php:78