MediaWiki  1.34.0
BaseBlacklist.php
Go to the documentation of this file.
1 <?php
2 
use MediaWiki\MediaWikiServices;
use MediaWiki\Storage\SlotRecord;
5 
/**
 * Base class for different kinds of blacklists.
 *
 * Keeps a registry of blacklist types (type code => implementing class),
 * hands out one shared instance per type, and loads the regex batches for a
 * type from two places: the local "<type>-blacklist" / "<type>-whitelist"
 * messages and the shared sources listed in $files.
 */
abstract class BaseBlacklist {
	/**
	 * Array of blacklist sources.
	 *
	 * Each entry is either "DB: <wiki> <page title>", an http(s) or
	 * protocol-relative URL, or anything else, which is read as a local file
	 * path (see buildSharedBlacklists()).
	 *
	 * @var array
	 */
	public $files = [];

	/**
	 * Array containing regexes to test against.
	 *
	 * Stays false until getBlacklists() has loaded the lists once.
	 *
	 * @var bool|array
	 */
	protected $regexes = false;

	/**
	 * Upper bound of the random draw made in getHttpText().
	 *
	 * Once the warning key has expired, a request refetches an HTTP source
	 * only when mt_rand( 0, $warningChance ) yields 0.
	 *
	 * @var int
	 */
	public $warningChance = 100;

	/**
	 * Lifetime, in seconds, of the warning key written by getHttpText().
	 *
	 * @var int
	 */
	public $warningTime = 600;

	/**
	 * Lifetime, in seconds, of cached blacklist data.
	 *
	 * @var int
	 */
	public $expiryTime = 900;

	/**
	 * Array containing blacklists that extend BaseBlacklist,
	 * as type code => class name.
	 *
	 * @var array
	 */
	private static $blacklistTypes = [
		'spam' => 'SpamBlacklist',
		'email' => 'EmailBlacklist',
	];

	/**
	 * Array of blacklist instances, keyed by type code.
	 *
	 * @var array
	 */
	private static $instances = [];

	/**
	 * Constructor.
	 *
	 * @param array $settings Property name => value pairs; every pair is
	 *  assigned to the property of the same name on this object.
	 */
	public function __construct( $settings = [] ) {
		foreach ( $settings as $name => $value ) {
			$this->$name = $value;
		}
	}

	/**
	 * Run the blacklist check implemented by the subclass.
	 *
	 * NOTE(review): the return contract is defined by the subclasses, which
	 * are not visible from this file.
	 *
	 * @param array $links
	 * @param Title $title
	 * @param bool $preventLog
	 */
	abstract public function filter( array $links, Title $title, $preventLog = false );

	/**
	 * Adds a blacklist class to the registry.
	 *
	 * An existing entry for the same type is replaced.
	 *
	 * @param string $type Type code
	 * @param string $class Name of the implementing class
	 */
	public static function addBlacklistType( $type, $class ) {
		self::$blacklistTypes[$type] = $class;
	}

	/**
	 * Return the array of blacklist types currently defined.
	 *
	 * @return array Type code => class name
	 */
	public static function getBlacklistTypes() {
		return self::$blacklistTypes;
	}

	/**
	 * Shortcut for getInstance( 'spam' ).
	 *
	 * @return BaseBlacklist
	 */
	public static function getSpamBlacklist() {
		return self::getInstance( 'spam' );
	}

	/**
	 * Shortcut for getInstance( 'email' ).
	 *
	 * @return BaseBlacklist
	 */
	public static function getEmailBlacklist() {
		return self::getInstance( 'email' );
	}

	/**
	 * Returns an instance of the given blacklist.
	 *
	 * The instance is created on first use from $wgBlacklistSettings[$type]
	 * and then reused for every later call with the same type.
	 *
	 * @param string $type Code for the blacklist
	 * @return BaseBlacklist
	 * @throws Exception If the type is not registered
	 */
	public static function getInstance( $type ) {
		if ( !isset( self::$blacklistTypes[$type] ) ) {
			throw new Exception( "Invalid blacklist type '$type' passed to " . __METHOD__ );
		}

		if ( !isset( self::$instances[$type] ) ) {
			global $wgBlacklistSettings;

			// Fall back to an empty settings array without writing to the
			// global configuration.
			$settings = isset( $wgBlacklistSettings[$type] ) ? $wgBlacklistSettings[$type] : [];

			$class = self::$blacklistTypes[$type];
			self::$instances[$type] = new $class( $settings );
		}

		return self::$instances[$type];
	}

	/**
	 * Returns the code for the blacklist implementation.
	 *
	 * @return string
	 */
	abstract protected function getBlacklistType();

	/**
	 * Check if the given local page title is a spam regex source.
	 *
	 * A title qualifies when it is one of the "<Type>-blacklist" or
	 * "<Type>-whitelist" pages in the MediaWiki namespace, or when it is
	 * listed in the configured 'files' of any registered type, either as a
	 * "DB:" entry for this wiki or as the raw-view URL of the page.
	 *
	 * @param Title $title
	 * @return bool
	 */
	public static function isLocalSource( Title $title ) {
		global $wgDBname, $wgBlacklistSettings;

		if ( $title->inNamespace( NS_MEDIAWIKI ) ) {
			$sources = [];
			foreach ( self::$blacklistTypes as $type => $class ) {
				$type = ucfirst( $type );
				// Append instead of using the array union operator: a union
				// of two lists keeps only the left-hand values for keys 0
				// and 1, which dropped every type after the first one.
				$sources[] = "$type-blacklist";
				$sources[] = "$type-whitelist";
			}

			if ( in_array( $title->getDBkey(), $sources, true ) ) {
				return true;
			}
		}

		$thisHttp = wfExpandUrl( $title->getFullUrl( 'action=raw' ), PROTO_HTTP );
		$thisHttpRegex = '/^' . preg_quote( $thisHttp, '/' ) . '(?:&.*)?$/';

		// Collect the values of every type's file list; merging (not union)
		// so that entries sharing a numeric index are all kept.
		$files = [];
		foreach ( self::$blacklistTypes as $type => $class ) {
			if ( isset( $wgBlacklistSettings[$type]['files'] ) ) {
				$files = array_merge(
					$files,
					array_values( (array)$wgBlacklistSettings[$type]['files'] )
				);
			}
		}

		foreach ( $files as $fileName ) {
			$matches = [];
			// Same "DB:" syntax as buildSharedBlacklists(), including
			// hyphens in the wiki name.
			if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
				if ( $wgDBname === $matches[1] && $matches[2] === $title->getPrefixedDBkey() ) {
					// Local DB fetch of this page...
					return true;
				}
			} elseif ( preg_match( $thisHttpRegex, $fileName ) ) {
				// Raw view of this page
				return true;
			}
		}

		return false;
	}

	/**
	 * Returns the type of blacklist from the given title.
	 *
	 * The DB key is searched for "<Type>-blacklist" or "<Type>-whitelist",
	 * where <Type> is a registered type code with its first letter
	 * upper-cased by the content language. The search is not anchored.
	 *
	 * @param Title $title
	 * @return bool|string Lower-cased type name, or false if nothing matched
	 */
	public static function getTypeFromTitle( Title $title ) {
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();

		$types = array_map( [ $contLang, 'ucfirst' ], array_keys( self::$blacklistTypes ) );
		// Type codes can be registered by third parties, so they must not be
		// interpreted as regex syntax.
		$quoted = array_map(
			static function ( $name ) {
				return preg_quote( $name, '/' );
			},
			$types
		);
		$regex = '/(' . implode( '|', $quoted ) . ')-(?:blacklist|whitelist)/';

		if ( preg_match( $regex, $title->getDBkey(), $m ) ) {
			return strtolower( $m[1] );
		}

		return false;
	}

	/**
	 * Fetch local and (possibly cached) remote blacklists.
	 *
	 * The merged result is kept in $regexes, so the lists are loaded at most
	 * once per instance.
	 *
	 * @return array Regexes
	 */
	public function getBlacklists() {
		if ( $this->regexes === false ) {
			$this->regexes = array_merge(
				$this->getLocalBlacklists(),
				$this->getSharedBlacklists()
			);
		}

		return $this->regexes;
	}

	/**
	 * Returns the local blacklist.
	 *
	 * @return array Regexes built from the "<type>-blacklist" message
	 */
	public function getLocalBlacklists() {
		return $this->getCachedMessageRegexes( 'blacklist' );
	}

	/**
	 * Returns the (local) whitelist.
	 *
	 * @return array Regexes built from the "<type>-whitelist" message
	 */
	public function getWhitelists() {
		return $this->getCachedMessageRegexes( 'whitelist' );
	}

	/**
	 * Build the regexes for the "<type>-<kind>" message, going through the
	 * main WAN object cache under the "<kind>-regex" key.
	 *
	 * @param string $kind Either 'blacklist' or 'whitelist'
	 * @return array Regexes
	 */
	private function getCachedMessageRegexes( $kind ) {
		$type = $this->getBlacklistType();
		$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();

		return $cache->getWithSetCallback(
			$cache->makeKey( 'spamblacklist', $type, "{$kind}-regex" ),
			// The TTL is a required argument and sits between the key and
			// the regeneration callback.
			$this->expiryTime,
			function () use ( $type, $kind ) {
				return SpamRegexBatch::regexesFromMessage( "{$type}-{$kind}", $this );
			}
		);
	}

	/**
	 * Fetch (possibly cached) remote blacklists.
	 *
	 * @return array Regexes; empty when no sources are configured in $files
	 */
	private function getSharedBlacklists() {
		$listType = $this->getBlacklistType();

		wfDebugLog( 'SpamBlacklist', "Loading $listType regex..." );

		if ( !$this->files ) {
			# No lists
			wfDebugLog( 'SpamBlacklist', "no files specified\n" );
			return [];
		}

		// Set by the callback, so it tells afterwards whether the value had
		// to be rebuilt or came from the cache.
		$miss = false;

		$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
		$regexes = $cache->getWithSetCallback(
			// This used to be cached per-site, but that could be bad on a shared
			// server where not all wikis have the same configuration.
			$cache->makeKey( 'spamblacklist', $listType, 'shared-blacklist-regex' ),
			$this->expiryTime,
			function () use ( &$miss ) {
				$miss = true;
				return $this->buildSharedBlacklists();
			}
		);

		if ( !$miss ) {
			wfDebugLog( 'SpamBlacklist', "Got shared spam regexes from cache\n" );
		}

		return $regexes;
	}

	/**
	 * Clear all primary blacklist cache keys.
	 *
	 * Deletes the shared-blacklist, local blacklist and whitelist regex keys
	 * of this blacklist type from the main WAN object cache.
	 */
	public function clearCache() {
		$listType = $this->getBlacklistType();

		$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
		foreach ( [ 'shared-blacklist-regex', 'blacklist-regex', 'whitelist-regex' ] as $suffix ) {
			$cache->delete( $cache->makeKey( 'spamblacklist', $listType, $suffix ) );
		}

		wfDebugLog( 'SpamBlacklist', "$listType blacklist local cache cleared.\n" );
	}

	/**
	 * Load every source in $files and turn its text into regexes.
	 *
	 * Sources that could not be loaded are logged and skipped.
	 *
	 * @return array Regexes from all sources, in the order of $files
	 */
	private function buildSharedBlacklists() {
		$regexes = [];
		$listType = $this->getBlacklistType();
		# Load lists
		wfDebugLog( 'SpamBlacklist', "Constructing $listType blacklist\n" );
		foreach ( $this->files as $fileName ) {
			$matches = [];
			if ( preg_match( '/^DB: ([\w-]*) (.*)$/', $fileName, $matches ) ) {
				$text = $this->getArticleText( $matches[1], $matches[2] );
			} elseif ( preg_match( '/^(https?:)?\/\//', $fileName ) ) {
				$text = $this->getHttpText( $fileName );
			} else {
				$text = file_get_contents( $fileName );
				if ( $text !== false ) {
					wfDebugLog( 'SpamBlacklist', "got from file $fileName\n" );
				}
			}

			if ( !is_string( $text ) ) {
				// All three loaders report failure with a non-string value;
				// there is nothing to parse in that case.
				wfDebugLog( 'SpamBlacklist', "Could not load $listType blacklist source $fileName\n" );
				continue;
			}

			// Build a separate batch of regexes from each source.
			// While in theory we could squeeze a little efficiency
			// out of combining multiple sources in one regex, if
			// there's a bad line in one of them we'll gain more
			// from only having to break that set into smaller pieces.
			$regexes = array_merge(
				$regexes,
				SpamRegexBatch::regexesFromText( $text, $this, $fileName )
			);
		}

		return $regexes;
	}

	/**
	 * Fetch a blacklist source over HTTP, going through $messageMemc.
	 *
	 * @param string $fileName URL of the source
	 * @return string|bool Text of the source, or false if the request failed
	 */
	private function getHttpText( $fileName ) {
		global $wgDBname, $messageMemc;
		$listType = $this->getBlacklistType();

		# HTTP request
		# To keep requests to a minimum, we save results into $messageMemc, which is
		# similar to $wgMemc except almost certain to exist. By default, it is stored
		# in the database
		# There are two keys, when the warning key expires, a random thread will refresh
		# the real key. This reduces the chance of multiple requests under high traffic
		# conditions.
		$key = "{$listType}_blacklist_file:$fileName";
		$warningKey = "$wgDBname:{$listType}filewarning:$fileName";
		$httpText = $messageMemc->get( $key );
		$warning = $messageMemc->get( $warningKey );

		// Refetch when nothing usable is cached, or, once the warning key is
		// gone, when the random draw comes up 0.
		if ( !is_string( $httpText ) || ( !$warning && !mt_rand( 0, $this->warningChance ) ) ) {
			wfDebugLog( 'SpamBlacklist', "Loading $listType blacklist from $fileName\n" );
			$httpText = Http::get( $fileName );
			if ( $httpText === false ) {
				wfDebugLog( 'SpamBlacklist', "Error loading $listType blacklist from $fileName\n" );
			}
			$messageMemc->set( $warningKey, 1, $this->warningTime );
			$messageMemc->set( $key, $httpText, $this->expiryTime );
		} else {
			wfDebugLog( 'SpamBlacklist', "Got $listType blacklist from HTTP cache for $fileName\n" );
		}

		return $httpText;
	}

	/**
	 * Fetch an article from this or another local MediaWiki database.
	 *
	 * @param string $wiki Wiki ID handed to the revision store factory
	 * @param string $pagename Page title text
	 * @return bool|string Page text, or false when there is no revision or
	 *  its main slot is not TextContent
	 */
	private function getArticleText( $wiki, $pagename ) {
		wfDebugLog( 'SpamBlacklist',
			"Fetching {$this->getBlacklistType()} blacklist from '$pagename' on '$wiki'...\n" );

		$services = MediaWikiServices::getInstance();

		// XXX: We do not know about custom namespaces on the target wiki here!
		$title = $services->getTitleParser()->parseTitle( $pagename );
		$store = $services->getRevisionStoreFactory()->getRevisionStore( $wiki );
		$rev = $store->getRevisionByTitle( $title );

		$content = $rev ? $rev->getContent( SlotRecord::MAIN ) : null;

		if ( !( $content instanceof TextContent ) ) {
			return false;
		}

		return $content->getText();
	}

	/**
	 * Returns the start of the regex for matches.
	 *
	 * @return string
	 */
	public function getRegexStart() {
		return '/[a-z0-9_\-.]*';
	}

	/**
	 * Returns the end of the regex for matches.
	 *
	 * @param int $batchSize The S modifier is only added when this is
	 *  greater than 0
	 * @return string
	 */
	public function getRegexEnd( $batchSize ) {
		return ( $batchSize > 0 ) ? '/Sim' : '/im';
	}

	/**
	 * Hook for subclasses that want to warm caches before filter() runs.
	 * The base implementation does nothing.
	 *
	 * @param Title $title
	 * @param array $entries
	 */
	public function warmCachesForFilter( Title $title, array $entries ) {
		// subclass this
	}
}
BaseBlacklist\getSpamBlacklist
static getSpamBlacklist()
Definition: BaseBlacklist.php:99
BaseBlacklist\getBlacklistTypes
static getBlacklistTypes()
Return the array of blacklist types currently defined.
Definition: BaseBlacklist.php:92
BaseBlacklist\getInstance
static getInstance( $type)
Returns an instance of the given blacklist.
Definition: BaseBlacklist.php:118
BaseBlacklist\getRegexEnd
getRegexEnd( $batchSize)
Returns the end of the regex for matches.
Definition: BaseBlacklist.php:424
$wgDBname
$wgDBname
Current wiki database name.
Definition: DefaultSettings.php:1893
BaseBlacklist\getBlacklists
getBlacklists()
Fetch local and (possibly cached) remote blacklists.
Definition: BaseBlacklist.php:223
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:117
BaseBlacklist\getHttpText
getHttpText( $fileName)
Definition: BaseBlacklist.php:352
BaseBlacklist\getBlacklistType
getBlacklistType()
Returns the code for the blacklist implementation.
BaseBlacklist\$files
array $files
Array of blacklist sources.
Definition: BaseBlacklist.php:15
BaseBlacklist\isLocalSource
static isLocalSource(Title $title)
Check if the given local page title is a spam regex source.
Definition: BaseBlacklist.php:151
BaseBlacklist\addBlacklistType
static addBlacklistType( $type, $class)
Adds a blacklist class to the registry.
Definition: BaseBlacklist.php:83
BaseBlacklist\getTypeFromTitle
static getTypeFromTitle(Title $title)
Returns the type of blacklist from the given title.
Definition: BaseBlacklist.php:205
SpamRegexBatch\regexesFromText
static regexesFromText( $source, BaseBlacklist $blacklist, $fileName=false)
Build a set of regular expressions from the given multiline input text, with empty lines and comments...
Definition: SpamRegexBatch.php:155
wfDebugLog
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
Definition: GlobalFunctions.php:1007
BaseBlacklist\$blacklistTypes
static array $blacklistTypes
Array containing blacklists that extend BaseBlacklist.
Definition: BaseBlacklist.php:46
$matches
$matches
Definition: NoLocalSettings.php:24
Http\get
static get( $url, array $options=[], $caller=__METHOD__)
Simple wrapper for Http::request( 'GET' )
Definition: Http.php:64
BaseBlacklist\getLocalBlacklists
getLocalBlacklists()
Returns the local blacklist.
Definition: BaseBlacklist.php:238
BaseBlacklist
Base class for different kinds of blacklists.
Definition: BaseBlacklist.php:9
BaseBlacklist\clearCache
clearCache()
Clear all primary blacklist cache keys.
Definition: BaseBlacklist.php:311
$title
$title
Definition: testCompression.php:34
SpamRegexBatch\regexesFromMessage
static regexesFromMessage( $message, BaseBlacklist $blacklist)
Build a set of regular expressions from a MediaWiki message.
Definition: SpamRegexBatch.php:168
BaseBlacklist\getSharedBlacklists
getSharedBlacklists()
Fetch (possibly cached) remote blacklists.
Definition: BaseBlacklist.php:275
BaseBlacklist\$regexes
bool|array $regexes
Array containing regexes to test against.
Definition: BaseBlacklist.php:22
$content
$content
Definition: router.php:78
$messageMemc
$messageMemc
Definition: Setup.php:792
BaseBlacklist\warmCachesForFilter
warmCachesForFilter(Title $title, array $entries)
Definition: BaseBlacklist.php:432
BaseBlacklist\filter
filter(array $links, Title $title, $preventLog=false)
BaseBlacklist\$instances
static array $instances
Array of blacklist instances.
Definition: BaseBlacklist.php:56
PROTO_HTTP
const PROTO_HTTP
Definition: Defines.php:199
TextContent
Content object implementation for representing flat text.
Definition: TextContent.php:37
Title
Represents a title within MediaWiki.
Definition: Title.php:42
BaseBlacklist\buildSharedBlacklists
buildSharedBlacklists()
Definition: BaseBlacklist.php:322
$cache
$cache
Definition: mcc.php:33
BaseBlacklist\$warningChance
int $warningChance
Chance of receiving a warning when the filter is hit.
Definition: BaseBlacklist.php:29
BaseBlacklist\getEmailBlacklist
static getEmailBlacklist()
Definition: BaseBlacklist.php:106
BaseBlacklist\__construct
__construct( $settings=[])
Constructor.
Definition: BaseBlacklist.php:63
NS_MEDIAWIKI
const NS_MEDIAWIKI
Definition: Defines.php:68
BaseBlacklist\$warningTime
int $warningTime
Definition: BaseBlacklist.php:34
BaseBlacklist\getWhitelists
getWhitelists()
Returns the (local) whitelist.
Definition: BaseBlacklist.php:257
wfExpandUrl
wfExpandUrl( $url, $defaultProto=PROTO_CURRENT)
Expand a potentially local URL to a fully-qualified URL.
Definition: GlobalFunctions.php:491
BaseBlacklist\getRegexStart
getRegexStart()
Returns the start of the regex for matches.
Definition: BaseBlacklist.php:414
$type
$type
Definition: testCompression.php:48
BaseBlacklist\getArticleText
getArticleText( $wiki, $pagename)
Fetch an article from this or another local MediaWiki database.
Definition: BaseBlacklist.php:389
BaseBlacklist\$expiryTime
int $expiryTime
Definition: BaseBlacklist.php:39