MediaWiki REL1_31
BaseBlacklist.php
Go to the documentation of this file.
<?php

/**
 * Base class for different kinds of blacklists.
 *
 * Keeps a registry that maps a type code (e.g. 'spam', 'email') to the name
 * of the implementing class, lazily creates one shared instance per type, and
 * implements loading and caching of the blacklist / whitelist regexes.
 */
abstract class BaseBlacklist {
	/**
	 * Array of blacklist sources.
	 *
	 * Each entry is one of (see buildSharedBlacklists()):
	 *  - "DB: <wiki> <page title>" to read a page from a local database,
	 *  - an http://, https:// or protocol-relative URL,
	 *  - anything else, which is read as a local file path.
	 *
	 * @var array
	 */
	public $files = [];

	/**
	 * Array containing regexes to test against, or false while not loaded yet.
	 *
	 * @var bool|array
	 */
	protected $regexes = false;

	/**
	 * Upper bound handed to mt_rand() in getHttpText(): once the warning key
	 * has expired, a request refreshes the HTTP source only when
	 * mt_rand( 0, $warningChance ) yields 0.
	 *
	 * @var int
	 */
	public $warningChance = 100;

	/**
	 * Lifetime passed to the cache when storing the warning key
	 * in getHttpText().
	 *
	 * @var int
	 */
	public $warningTime = 600;

	/**
	 * Lifetime passed to the cache when storing regexes and fetched HTTP text.
	 *
	 * @var int
	 */
	public $expiryTime = 900;

	/**
	 * Array containing blacklists that extend BaseBlacklist
	 * (type code => class name).
	 *
	 * @var array
	 */
	private static $blacklistTypes = [
		'spam' => 'SpamBlacklist',
		'email' => 'EmailBlacklist',
	];

	/**
	 * Array of blacklist instances, keyed by type code.
	 *
	 * @var array
	 */
	private static $instances = [];

	/**
	 * Constructor.
	 *
	 * Every key of $settings is assigned to the property of the same name.
	 *
	 * @param array $settings Property name => value
	 */
	public function __construct( $settings = [] ) {
		foreach ( $settings as $name => $value ) {
			$this->$name = $value;
		}
	}

	/**
	 * Filter entry point, implemented by each concrete blacklist.
	 *
	 * NOTE(review): the meaning of the parameters and of the return value is
	 * defined by the subclasses and is not visible in this file.
	 *
	 * @param array $links
	 * @param Title $title
	 * @param bool $preventLog
	 * @return mixed
	 */
	abstract public function filter( array $links, Title $title, $preventLog = false );

	/**
	 * Adds a blacklist class to the registry.
	 *
	 * @param string $type Type code, used as the registry key
	 * @param string $class Name of the class to instantiate for that type
	 */
	public static function addBlacklistType( $type, $class ) {
		self::$blacklistTypes[$type] = $class;
	}

	/**
	 * Return the array of blacklist types currently defined.
	 *
	 * @return array Type code => class name
	 */
	public static function getBlacklistTypes() {
		return self::$blacklistTypes;
	}

	/**
	 * Shortcut for getInstance( 'spam' ).
	 *
	 * @return BaseBlacklist
	 */
	public static function getSpamBlacklist() {
		return self::getInstance( 'spam' );
	}

	/**
	 * Shortcut for getInstance( 'email' ).
	 *
	 * @return BaseBlacklist
	 */
	public static function getEmailBlacklist() {
		return self::getInstance( 'email' );
	}

	/**
	 * Returns an instance of the given blacklist.
	 *
	 * The instance is created on first use with $wgBlacklistSettings[$type]
	 * as constructor settings and is reused on later calls.
	 *
	 * @param string $type Code for the blacklist
	 * @return BaseBlacklist
	 * @throws Exception When $type has not been registered
	 */
	public static function getInstance( $type ) {
		if ( !isset( self::$blacklistTypes[$type] ) ) {
			throw new Exception( "Invalid blacklist type '$type' passed to " . __METHOD__ );
		}

		if ( !isset( self::$instances[$type] ) ) {
			global $wgBlacklistSettings;

			// Prevent notices
			if ( !isset( $wgBlacklistSettings[$type] ) ) {
				$wgBlacklistSettings[$type] = [];
			}

			$class = self::$blacklistTypes[$type];
			self::$instances[$type] = new $class( $wgBlacklistSettings[$type] );
		}

		return self::$instances[$type];
	}

	/**
	 * Returns the code for the blacklist implementation.
	 *
	 * @return string
	 */
	abstract protected function getBlacklistType();

	/**
	 * Check if the given local page title is a spam regex source.
	 *
	 * A title counts as a source when it is the "<Type>-blacklist" or
	 * "<Type>-whitelist" page in the MediaWiki namespace for any registered
	 * type, or when one of the configured 'files' entries of any type points
	 * at it (as a "DB:" entry for this wiki or as its action=raw URL).
	 *
	 * @param Title $title
	 * @return bool
	 */
	public static function isLocalSource( Title $title ) {
		global $wgDBname, $wgBlacklistSettings;

		if ( $title->getNamespace() == NS_MEDIAWIKI ) {
			$sources = [];
			foreach ( self::$blacklistTypes as $type => $class ) {
				$type = ucfirst( $type );
				// Append rather than use array union: "+" keeps existing
				// integer keys, which would drop every type after the first.
				$sources[] = "$type-blacklist";
				$sources[] = "$type-whitelist";
			}

			if ( in_array( $title->getDBkey(), $sources, true ) ) {
				return true;
			}
		}

		$thisHttp = wfExpandUrl( $title->getFullUrl( 'action=raw' ), PROTO_HTTP );
		$thisHttpRegex = '/^' . preg_quote( $thisHttp, '/' ) . '(?:&.*)?$/';

		$files = [];
		foreach ( self::$blacklistTypes as $type => $class ) {
			if ( isset( $wgBlacklistSettings[$type]['files'] ) ) {
				// Lists must be concatenated, not united by key (see above).
				$files = array_merge( $files, (array)$wgBlacklistSettings[$type]['files'] );
			}
		}

		foreach ( $files as $fileName ) {
			$matches = [];
			// Same "DB:" syntax as accepted by buildSharedBlacklists(),
			// including hyphens in the database name.
			if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
				if ( $wgDBname === $matches[1]
					&& $matches[2] === $title->getPrefixedDBkey()
				) {
					// Local DB fetch of this page...
					return true;
				}
			} elseif ( preg_match( $thisHttpRegex, $fileName ) ) {
				// Raw view of this page
				return true;
			}
		}

		return false;
	}

	/**
	 * Returns the type of blacklist from the given title.
	 *
	 * @param Title $title
	 * @return bool|string Lower-cased type code, or false if the title's DB
	 *  key contains no "<Type>-blacklist" / "<Type>-whitelist"
	 */
	public static function getTypeFromTitle( Title $title ) {
		global $wgContLang;

		$types = array_map( [ $wgContLang, 'ucfirst' ], array_keys( self::$blacklistTypes ) );
		// Type codes can be registered at runtime; keep them literal in the regex.
		$quoted = [];
		foreach ( $types as $type ) {
			$quoted[] = preg_quote( $type, '/' );
		}
		$regex = '/(' . implode( '|', $quoted ) . ')-(?:blacklist|whitelist)/';

		if ( preg_match( $regex, $title->getDBkey(), $m ) ) {
			return strtolower( $m[1] );
		}

		return false;
	}

	/**
	 * Fetch local and (possibly cached) remote blacklists.
	 *
	 * The merged result is kept on the instance, so the sources are consulted
	 * at most once per object.
	 *
	 * @return array
	 */
	public function getBlacklists() {
		if ( $this->regexes === false ) {
			$this->regexes = array_merge(
				$this->getLocalBlacklists(),
				$this->getSharedBlacklists()
			);
		}

		return $this->regexes;
	}

	/**
	 * Returns the local blacklist, built from the "<type>-blacklist" message
	 * and cached for $expiryTime.
	 *
	 * @return array Regular expressions
	 */
	public function getLocalBlacklists() {
		$type = $this->getBlacklistType();

		return ObjectCache::getMainWANInstance()->getWithSetCallback(
			wfMemcKey( 'spamblacklist', $type, 'blacklist-regex' ),
			$this->expiryTime,
			function () use ( $type ) {
				return SpamRegexBatch::regexesFromMessage( "{$type}-blacklist", $this );
			}
		);
	}

	/**
	 * Returns the (local) whitelist, built from the "<type>-whitelist"
	 * message and cached for $expiryTime.
	 *
	 * @return array Regular expressions
	 */
	public function getWhitelists() {
		$type = $this->getBlacklistType();

		return ObjectCache::getMainWANInstance()->getWithSetCallback(
			wfMemcKey( 'spamblacklist', $type, 'whitelist-regex' ),
			$this->expiryTime,
			function () use ( $type ) {
				return SpamRegexBatch::regexesFromMessage( "{$type}-whitelist", $this );
			}
		);
	}

	/**
	 * Fetch (possibly cached) remote blacklists.
	 *
	 * @return array Empty when no sources are configured in $files
	 */
	public function getSharedBlacklists() {
		$listType = $this->getBlacklistType();

		wfDebugLog( 'SpamBlacklist', "Loading $listType regex..." );

		if ( count( $this->files ) == 0 ) {
			# No lists
			wfDebugLog( 'SpamBlacklist', "no files specified\n" );
			return [];
		}

		// Set by the callback, which the cache invokes only when it has to
		// regenerate the value.
		$miss = false;

		$regexes = ObjectCache::getMainWANInstance()->getWithSetCallback(
			// This used to be cached per-site, but that could be bad on a shared
			// server where not all wikis have the same configuration.
			wfMemcKey( 'spamblacklist', $listType, 'shared-blacklist-regex' ),
			$this->expiryTime,
			function () use ( &$miss ) {
				$miss = true;
				return $this->buildSharedBlacklists();
			}
		);

		if ( !$miss ) {
			wfDebugLog( 'SpamBlacklist', "Got shared spam regexes from cache\n" );
		}

		return $regexes;
	}

	/**
	 * Clear all primary blacklist cache keys.
	 *
	 * Deletes the shared-blacklist, local-blacklist and whitelist regex keys
	 * for this blacklist type.
	 */
	public function clearCache() {
		$listType = $this->getBlacklistType();

		$cache = ObjectCache::getMainWANInstance();
		$cache->delete( wfMemcKey( 'spamblacklist', $listType, 'shared-blacklist-regex' ) );
		$cache->delete( wfMemcKey( 'spamblacklist', $listType, 'blacklist-regex' ) );
		$cache->delete( wfMemcKey( 'spamblacklist', $listType, 'whitelist-regex' ) );

		wfDebugLog( 'SpamBlacklist', "$listType blacklist local cache cleared.\n" );
	}

	/**
	 * Build the regexes for every configured source in $files, without
	 * consulting the regex cache.
	 *
	 * Public because it is invoked from the cache callback set up in
	 * getSharedBlacklists().
	 *
	 * @return array Regular expressions
	 */
	public function buildSharedBlacklists() {
		$regexes = [];
		$listType = $this->getBlacklistType();
		# Load lists
		wfDebugLog( 'SpamBlacklist', "Constructing $listType blacklist\n" );
		foreach ( $this->files as $fileName ) {
			$matches = [];
			if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
				$text = $this->getArticleText( $matches[1], $matches[2] );
			} elseif ( preg_match( '/^(https?:)?\/\//', $fileName ) ) {
				$text = $this->getHttpText( $fileName );
			} else {
				$text = file_get_contents( $fileName );
				wfDebugLog( 'SpamBlacklist', "got from file $fileName\n" );
			}

			// Build a separate batch of regexes from each source.
			// While in theory we could squeeze a little efficiency
			// out of combining multiple sources in one regex, if
			// there's a bad line in one of them we'll gain more
			// from only having to break that set into smaller pieces.
			$regexes = array_merge(
				$regexes,
				SpamRegexBatch::regexesFromText( $text, $this, $fileName )
			);
		}

		return $regexes;
	}

	/**
	 * Fetch a blacklist source over HTTP, going through $messageMemc.
	 *
	 * @param string $fileName URL of the source
	 * @return string|bool Text of the source; whatever Http::get() returned
	 *  (false on failure, per the check below) is both cached and returned
	 */
	public function getHttpText( $fileName ) {
		global $wgDBname, $messageMemc;
		$listType = $this->getBlacklistType();

		# HTTP request
		# To keep requests to a minimum, we save results into $messageMemc, which is
		# similar to $wgMemc except almost certain to exist. By default, it is stored
		# in the database
		# There are two keys, when the warning key expires, a random thread will refresh
		# the real key. This reduces the chance of multiple requests under high traffic
		# conditions.
		$key = "{$listType}_blacklist_file:$fileName";
		$warningKey = "$wgDBname:{$listType}filewarning:$fileName";
		$httpText = $messageMemc->get( $key );
		$warning = $messageMemc->get( $warningKey );

		if ( !is_string( $httpText ) || ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) {
			wfDebugLog( 'SpamBlacklist', "Loading $listType blacklist from $fileName\n" );
			$httpText = Http::get( $fileName );
			if ( $httpText === false ) {
				wfDebugLog( 'SpamBlacklist', "Error loading $listType blacklist from $fileName\n" );
			}
			$messageMemc->set( $warningKey, 1, $this->warningTime );
			$messageMemc->set( $key, $httpText, $this->expiryTime );
		} else {
			wfDebugLog( 'SpamBlacklist', "Got $listType blacklist from HTTP cache for $fileName\n" );
		}

		return $httpText;
	}

	/**
	 * Fetch an article from this or another local MediaWiki database.
	 * This is probably *very* fragile, and shouldn't be used perhaps.
	 *
	 * @param string $wiki Database name handed to wfGetDB()
	 * @param string $article Page title, parsed with Title::newFromText()
	 * @return string|bool Text of the latest revision, or false when the
	 *  title cannot be parsed or no matching row exists
	 */
	public function getArticleText( $wiki, $article ) {
		wfDebugLog( 'SpamBlacklist',
			"Fetching {$this->getBlacklistType()} blacklist from '$article' on '$wiki'...\n" );

		$title = Title::newFromText( $article );
		if ( !$title ) {
			// Unparseable title in the configuration: treat it like a
			// missing page instead of failing on a null object below.
			return false;
		}

		// Load all the relevant tables from the correct DB.
		// This assumes that old_text is the actual text or
		// that the external store system is at least unified.
		if ( is_callable( [ Revision::class, 'getQueryInfo' ] ) ) {
			$revQuery = Revision::getQueryInfo( [ 'page', 'text' ] );
		} else {
			// Fallback for cores where Revision::getQueryInfo() is unavailable
			$revQuery = [
				'tables' => [ 'revision', 'page', 'text' ],
				'fields' => array_merge(
					Revision::selectFields(),
					Revision::selectPageFields(),
					Revision::selectTextFields()
				),
				'joins' => [
					'text' => [ 'JOIN', 'old_id=rev_text_id' ]
				],
			];
		}
		$row = wfGetDB( DB_REPLICA, [], $wiki )->selectRow(
			$revQuery['tables'],
			$revQuery['fields'],
			[
				'page_namespace' => $title->getNamespace(), // assume NS IDs match
				'page_title' => $title->getDBkey(), // assume same case rules
			],
			__METHOD__,
			[],
			[ 'page' => [ 'JOIN', 'rev_id=page_latest' ] ] + $revQuery['joins']
		);

		return $row
			? ContentHandler::getContentText( Revision::newFromRow( $row )->getContent() )
			: false;
	}

	/**
	 * Returns the start of the regex for matches.
	 *
	 * @return string
	 */
	public function getRegexStart() {
		return '/[a-z0-9_\-.]*';
	}

	/**
	 * Returns the end of the regex for matches.
	 *
	 * The PCRE "S" modifier is added only when $batchSize is greater than 0.
	 *
	 * @param int $batchSize
	 * @return string
	 */
	public function getRegexEnd( $batchSize ) {
		return ( $batchSize > 0 ) ? '/Sim' : '/im';
	}

	/**
	 * Hook for subclasses; the base implementation does nothing.
	 *
	 * @param Title $title
	 * @param string[] $entries
	 */
	public function warmCachesForFilter( Title $title, array $entries ) {
		// subclass this
	}
}
c Accompany it with the information you received as to the offer to distribute corresponding source complete source code means all the source code for all modules it plus any associated interface definition files
Definition COPYING.txt:158
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
wfExpandUrl( $url, $defaultProto=PROTO_CURRENT)
Expand a potentially local URL to a fully-qualified URL.
wfMemcKey()
Make a cache key for the local wiki.
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
Base class for different kinds of blacklists.
static array $blacklistTypes
Array containing blacklists that extend BaseBlacklist.
getLocalBlacklists()
Returns the local blacklist.
static getBlacklistTypes()
Return the array of blacklist types currently defined.
array $files
Array of blacklist sources.
static getEmailBlacklist()
__construct( $settings=[])
Constructor.
static getSpamBlacklist()
getBlacklists()
Fetch local and (possibly cached) remote blacklists.
clearCache()
Clear all primary blacklist cache keys.
filter(array $links, Title $title, $preventLog=false)
getWhitelists()
Returns the (local) whitelist.
getSharedBlacklists()
Fetch (possibly cached) remote blacklists.
getBlacklistType()
Returns the code for the blacklist implementation.
getRegexStart()
Returns the start of the regex for matches.
static getInstance( $type)
Returns an instance of the given blacklist.
getArticleText( $wiki, $article)
Fetch an article from this or another local MediaWiki database.
static getTypeFromTitle(Title $title)
Returns the type of blacklist from the given title.
bool|array $regexes
Array containing regexes to test against.
static isLocalSource(Title $title)
Check if the given local page title is a spam regex source.
static array $instances
Array of blacklist instances.
getHttpText( $fileName)
static addBlacklistType( $type, $class)
Adds a blacklist class to the registry.
int $warningChance
Chance of receiving a warning when the filter is hit.
getRegexEnd( $batchSize)
Returns the end of the regex for matches.
warmCachesForFilter(Title $title, array $entries)
static get( $url, $options=[], $caller=__METHOD__)
Simple wrapper for Http::request( 'GET' )
Definition Http.php:98
static regexesFromMessage( $message, BaseBlacklist $blacklist)
Build a set of regular expressions from a MediaWiki message.
static regexesFromText( $source, BaseBlacklist $blacklist, $fileName=false)
Build a set of regular expressions from the given multiline input text, with empty lines and comments...
Represents a title within MediaWiki.
Definition Title.php:39
this class mediates it Skin Encapsulates a look and feel for the wiki All of the functions that render HTML and make choices about how to render it are here and are called from various other places when and is meant to be subclassed with other skins that may override some of its functions The User object contains a reference to a and so rather than having a global skin object we just rely on the global User and get the skin with $wgUser and also has some character encoding functions and other locale stuff The current user interface language is instantiated as and the local content language as $wgContLang
Definition design.txt:57
globals will be eliminated from MediaWiki replaced by an application object which would be passed to constructors Whether that would be an convenient solution remains to be but certainly PHP makes such object oriented programming models easier than they were in previous versions For the time being MediaWiki programmers will have to work in an environment with some global context At the time of globals were initialised on startup by MediaWiki of these were configuration which are documented in DefaultSettings php There is no comprehensive documentation for the remaining however some of the most important ones are listed below They are typically initialised either in index php or in Setup php For a description of the see design txt $wgTitle Title object created from the request URL $wgOut OutputPage object for HTTP response $wgUser User object for the user associated with the current request $wgLang Language object selected by user preferences $wgContLang Language object associated with the wiki being viewed $wgParser Parser object Parser extensions register their hooks here $wgRequest WebRequest to get request data $messageMemc
Definition globals.txt:66
const PROTO_HTTP
Definition Defines.php:229
$cache
Definition mcc.php:33
controlled by $wgMainCacheType controlled by $wgParserCacheType controlled by $wgMessageCacheType If you set CACHE_NONE to one of the three control default value for MediaWiki still create a but requests to it are no ops and we always fall through to the database If the cache daemon can t be it should also disable itself fairly smoothly By $wgMemc is used but when it is $parserMemc or $messageMemc this is mentioned $wgDBname
const DB_REPLICA
Definition defines.php:25