MediaWiki  1.33.0
SpamBlacklist.php
Go to the documentation of this file.
1 <?php
2 
// Prevent direct web access: this file is only valid when loaded by MediaWiki.
if ( !defined( 'MEDIAWIKI' ) ) {
	exit;
}
6 
/**
 * Blacklist implementation for external-link (spam) filtering.
 *
 * NOTE(review): the class declaration was lost in extraction; restored from
 * the generated index, which records "SpamBlacklist — Definition:
 * SpamBlacklist.php:10" extending BaseBlacklist.
 */
class SpamBlacklist extends BaseBlacklist {
	// TTL (seconds) for cached "no blacklist match" results written by filter().
	const STASH_TTL = 180;
	// If a stashed non-match is older than this (seconds), 'stash' mode
	// re-checks instead of relying on the nearly-expired cache entry.
	const STASH_AGE_DYING = 150;

	/**
	 * Queued external-link changes awaiting logging; appended by
	 * logUrlChange() and drained by doLogging().
	 * @var array[]
	 */
	private $urlChangeLog = [];
19 
25  protected function getBlacklistType() {
26  return 'spam';
27  }
28 
37  protected function antiSpoof( $text ) {
38  $text = str_replace( '.', '.', $text );
39  return $text;
40  }
41 
55  public function filter( array $links, Title $title = null, $preventLog = false, $mode = 'check' ) {
56  $statsd = MediaWikiServices::getInstance()->getStatsdDataFactory();
58 
59  // If there are no new links, and we are logging,
60  // mark all of the current links as being removed.
61  if ( !$links && $this->isLoggingEnabled() ) {
62  $this->logUrlChanges( $this->getCurrentLinks( $title ), [], [] );
63  }
64 
65  if ( !$links ) {
66  return false;
67  }
68 
69  sort( $links );
70  $key = $cache->makeKey(
71  'blacklist',
72  $this->getBlacklistType(),
73  'pass',
74  sha1( implode( "\n", $links ) ),
75  md5( (string)$title )
76  );
77  // Skip blacklist checks if nothing matched during edit stashing...
78  $knownNonMatchAsOf = $cache->get( $key );
79  if ( $mode === 'check' ) {
80  if ( $knownNonMatchAsOf ) {
81  $statsd->increment( 'spamblacklist.check-stash.hit' );
82 
83  return false;
84  } else {
85  $statsd->increment( 'spamblacklist.check-stash.miss' );
86  }
87  } elseif ( $mode === 'stash' ) {
88  if ( $knownNonMatchAsOf && ( time() - $knownNonMatchAsOf ) < self::STASH_AGE_DYING ) {
89  return false; // OK; not about to expire soon
90  }
91  }
92 
93  $blacklists = $this->getBlacklists();
94  $whitelists = $this->getWhitelists();
95 
96  if ( count( $blacklists ) ) {
97  // poor man's anti-spoof, see bug 12896
98  $newLinks = array_map( [ $this, 'antiSpoof' ], $links );
99 
100  $oldLinks = [];
101  if ( $title !== null ) {
102  $oldLinks = $this->getCurrentLinks( $title );
103  $addedLinks = array_diff( $newLinks, $oldLinks );
104  } else {
105  // can't load old links, so treat all links as added.
106  $addedLinks = $newLinks;
107  }
108 
109  wfDebugLog( 'SpamBlacklist', "Old URLs: " . implode( ', ', $oldLinks ) );
110  wfDebugLog( 'SpamBlacklist', "New URLs: " . implode( ', ', $newLinks ) );
111  wfDebugLog( 'SpamBlacklist', "Added URLs: " . implode( ', ', $addedLinks ) );
112 
113  if ( !$preventLog ) {
114  $this->logUrlChanges( $oldLinks, $newLinks, $addedLinks );
115  }
116 
117  $links = implode( "\n", $addedLinks );
118 
119  # Strip whitelisted URLs from the match
120  if ( is_array( $whitelists ) ) {
121  wfDebugLog( 'SpamBlacklist', "Excluding whitelisted URLs from " . count( $whitelists ) .
122  " regexes: " . implode( ', ', $whitelists ) . "\n" );
123  foreach ( $whitelists as $regex ) {
124  Wikimedia\suppressWarnings();
125  $newLinks = preg_replace( $regex, '', $links );
126  Wikimedia\restoreWarnings();
127  if ( is_string( $newLinks ) ) {
128  // If there wasn't a regex error, strip the matching URLs
129  $links = $newLinks;
130  }
131  }
132  }
133 
134  # Do the match
135  wfDebugLog( 'SpamBlacklist', "Checking text against " . count( $blacklists ) .
136  " regexes: " . implode( ', ', $blacklists ) . "\n" );
137  $retVal = false;
138  foreach ( $blacklists as $regex ) {
139  Wikimedia\suppressWarnings();
140  $matches = [];
141  $check = ( preg_match_all( $regex, $links, $matches ) > 0 );
142  Wikimedia\restoreWarnings();
143  if ( $check ) {
144  wfDebugLog( 'SpamBlacklist', "Match!\n" );
145  global $wgRequest;
146  $ip = $wgRequest->getIP();
147  $fullUrls = [];
148  $fullLineRegex = substr( $regex, 0, strrpos( $regex, '/' ) ) . '.*/Sim';
149  preg_match_all( $fullLineRegex, $links, $fullUrls );
150  $imploded = implode( ' ', $fullUrls[0] );
151  wfDebugLog( 'SpamBlacklistHit', "$ip caught submitting spam: $imploded\n" );
152  if ( !$preventLog ) {
153  $this->logFilterHit( $title, $imploded ); // Log it
154  }
155  if ( $retVal === false ) {
156  $retVal = [];
157  }
158  $retVal = array_merge( $retVal, $fullUrls[1] );
159  }
160  }
161  if ( is_array( $retVal ) ) {
162  $retVal = array_unique( $retVal );
163  }
164  } else {
165  $retVal = false;
166  }
167 
168  if ( $retVal === false ) {
169  // Cache the typical negative results
170  $cache->set( $key, time(), self::STASH_TTL );
171  if ( $mode === 'stash' ) {
172  $statsd->increment( 'spamblacklist.check-stash.store' );
173  }
174  }
175 
176  return $retVal;
177  }
178 
179  public function isLoggingEnabled() {
180  global $wgSpamBlacklistEventLogging;
181  return $wgSpamBlacklistEventLogging &&
182  ExtensionRegistry::getInstance()->isLoaded( 'EventLogging' );
183  }
184 
192  public function logUrlChanges( $oldLinks, $newLinks, $addedLinks ) {
193  if ( !$this->isLoggingEnabled() ) {
194  return;
195  }
196 
197  $removedLinks = array_diff( $oldLinks, $newLinks );
198  foreach ( $addedLinks as $url ) {
199  $this->logUrlChange( $url, 'insert' );
200  }
201 
202  foreach ( $removedLinks as $url ) {
203  $this->logUrlChange( $url, 'remove' );
204  }
205  }
206 
214  public function doLogging( User $user, Title $title, $revId ) {
215  if ( !$this->isLoggingEnabled() ) {
216  return;
217  }
218 
219  $baseInfo = [
220  'revId' => $revId,
221  'pageId' => $title->getArticleID(),
222  'pageNamespace' => $title->getNamespace(),
223  'userId' => $user->getId(),
224  'userText' => $user->getName(),
225  ];
226  $changes = $this->urlChangeLog;
227  // Empty the changes queue in case this function gets called more than once
228  $this->urlChangeLog = [];
229 
230  DeferredUpdates::addCallableUpdate( function () use ( $changes, $baseInfo ) {
231  foreach ( $changes as $change ) {
232  EventLogging::logEvent(
233  'ExternalLinksChange',
234  15716074,
235  $baseInfo + $change
236  );
237  }
238  } );
239  }
240 
247  private function logUrlChange( $url, $action ) {
248  $parsed = wfParseUrl( $url );
249  if ( !isset( $parsed['host'] ) ) {
250  wfDebugLog( 'SpamBlacklist', "Unable to parse $url" );
251  return;
252  }
253  $info = [
254  'action' => $action,
255  'protocol' => $parsed['scheme'],
256  'domain' => $parsed['host'],
257  'path' => $parsed['path'] ?? '',
258  'query' => $parsed['query'] ?? '',
259  'fragment' => $parsed['fragment'] ?? '',
260  ];
261 
262  $this->urlChangeLog[] = $info;
263  }
264 
273  public function getCurrentLinks( Title $title ) {
275  $fname = __METHOD__;
276  return $cache->getWithSetCallback(
277  // Key is warmed via warmCachesForFilter() from ApiStashEdit
278  $cache->makeKey( 'external-link-list', $title->getLatestRevID() ),
279  $cache::TTL_MINUTE,
280  function ( $oldValue, &$ttl, array &$setOpts ) use ( $title, $fname ) {
281  $dbr = wfGetDB( DB_REPLICA );
282  $setOpts += Database::getCacheSetOptions( $dbr );
283 
284  return $dbr->selectFieldValues(
285  'externallinks',
286  'el_to',
287  [ 'el_from' => $title->getArticleID() ], // should be zero queries
288  $fname
289  );
290  }
291  );
292  }
293 
294  public function warmCachesForFilter( Title $title, array $entries ) {
295  $this->filter( $entries, $title, true /* no logging */, 'stash' );
296  }
297 
303  public function getRegexStart() {
304  return '/(?:https?:)?\/\/+[a-z0-9_\-.]*(';
305  }
306 
313  public function getRegexEnd( $batchSize ) {
314  return ')' . parent::getRegexEnd( $batchSize );
315  }
323  public function logFilterHit( $title, $url ) {
324  global $wgUser, $wgLogSpamBlacklistHits;
325  if ( $wgLogSpamBlacklistHits ) {
326  $logEntry = new ManualLogEntry( 'spamblacklist', 'hit' );
327  $logEntry->setPerformer( $wgUser );
328  $logEntry->setTarget( $title );
329  $logEntry->setParameters( [
330  '4::url' => $url,
331  ] );
332  $logid = $logEntry->insert();
333  $log = new LogPage( 'spamblacklist' );
334  if ( $log->isRestricted() ) {
335  // Make sure checkusers can see this action if the log is restricted
336  // (which is the default)
337  if ( ExtensionRegistry::getInstance()->isLoaded( 'CheckUser' )
338  && class_exists( CheckUserHooks::class )
339  ) {
340  $rc = $logEntry->getRecentChange( $logid );
341  CheckUserHooks::updateCheckUserData( $rc );
342  }
343  } else {
344  // If the log is unrestricted, publish normally to RC,
345  // which will also update checkuser
346  $logEntry->publish( $logid, "rc" );
347  }
348  }
349  }
350 }
Wikimedia\Rdbms\Database
Relational database abstraction object.
Definition: Database.php:48
$user
return true to allow those checks to occur, and false if checking is done & $user
Definition: hooks.txt:1476
SpamBlacklist\logUrlChange
logUrlChange( $url, $action)
Queue log data about change for a url addition or removal.
Definition: SpamBlacklist.php:247
SpamBlacklist\getRegexStart
getRegexStart()
Returns the start of the regex for matches.
Definition: SpamBlacklist.php:303
ObjectCache\getLocalClusterInstance
static getLocalClusterInstance()
Get the main cluster-local cache object.
Definition: ObjectCache.php:356
BaseBlacklist\getBlacklists
getBlacklists()
Fetch local and (possibly cached) remote blacklists.
Definition: BaseBlacklist.php:220
SpamBlacklist
Definition: SpamBlacklist.php:10
captcha-old.count
count
Definition: captcha-old.py:249
SpamBlacklist\getCurrentLinks
getCurrentLinks(Title $title)
Look up the links currently in the article, so we can ignore them on a second run.
Definition: SpamBlacklist.php:273
wfDebugLog
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
Definition: GlobalFunctions.php:1043
php
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
$dbr
$dbr
Definition: testCompression.php:50
SpamBlacklist\getRegexEnd
getRegexEnd( $batchSize)
Returns the end of the regex for matches.
Definition: SpamBlacklist.php:313
ExtensionRegistry\getInstance
static getInstance()
Definition: ExtensionRegistry.php:98
wfParseUrl
wfParseUrl( $url)
parse_url() work-alike, but non-broken.
Definition: GlobalFunctions.php:817
SpamBlacklist\logFilterHit
logFilterHit( $title, $url)
Logs the filter hit to Special:Log if $wgLogSpamBlacklistHits is enabled.
Definition: SpamBlacklist.php:323
$title
namespace and then decline to actually register it file or subcat img or subcat $title
Definition: hooks.txt:925
wfGetDB
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
Definition: GlobalFunctions.php:2636
$matches
$matches
Definition: NoLocalSettings.php:24
BaseBlacklist
Base class for different kinds of blacklists.
Definition: BaseBlacklist.php:6
LogPage
Class to simplify the use of log pages.
Definition: LogPage.php:33
SpamBlacklist\warmCachesForFilter
warmCachesForFilter(Title $title, array $entries)
Definition: SpamBlacklist.php:294
use
as see the revision history and available at free of to any person obtaining a copy of this software and associated documentation to deal in the Software without including without limitation the rights to use
Definition: MIT-LICENSE.txt:10
SpamBlacklist\STASH_AGE_DYING
const STASH_AGE_DYING
Definition: SpamBlacklist.php:12
DB_REPLICA
const DB_REPLICA
Definition: defines.php:25
SpamBlacklist\$urlChangeLog
array[] $urlChangeLog
Changes to external links, for logging purposes.
Definition: SpamBlacklist.php:18
array
The wiki should then use memcached to cache various data To use multiple just add more items to the array To increase the weight of a make its entry a array("192.168.0.1:11211", 2))
$fname
if(defined( 'MW_SETUP_CALLBACK')) $fname
Customization point after all loading (constants, functions, classes, DefaultSettings,...
Definition: Setup.php:123
SpamBlacklist\logUrlChanges
logUrlChanges( $oldLinks, $newLinks, $addedLinks)
Diff added/removed urls and generate events for them.
Definition: SpamBlacklist.php:192
SpamBlacklist\filter
filter(array $links, Title $title=null, $preventLog=false, $mode='check')
Definition: SpamBlacklist.php:55
SpamBlacklist\isLoggingEnabled
isLoggingEnabled()
Definition: SpamBlacklist.php:179
Title
Represents a title within MediaWiki.
Definition: Title.php:40
$cache
$cache
Definition: mcc.php:33
ObjectCache\getMainWANInstance
static getMainWANInstance()
Get the main WAN cache object.
Definition: ObjectCache.php:369
SpamBlacklist\getBlacklistType
getBlacklistType()
Returns the code for the blacklist implementation.
Definition: SpamBlacklist.php:25
as
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
ManualLogEntry
Class for creating new log entries and inserting them into the database.
Definition: LogEntry.php:441
SpamBlacklist\STASH_TTL
const STASH_TTL
Definition: SpamBlacklist.php:11
class
you have access to all of the normal MediaWiki so you can get a DB use the etc For full docs on the Maintenance class
Definition: maintenance.txt:52
$wgRequest
if(! $wgDBerrorLogTZ) $wgRequest
Definition: Setup.php:728
MediaWikiServices
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency MediaWikiServices
Definition: injection.txt:23
User
The User object encapsulates all of the user-specific settings (user_id, name, rights,...
Definition: User.php:48
DeferredUpdates\addCallableUpdate
static addCallableUpdate( $callable, $stage=self::POSTSEND, $dbw=null)
Add a callable update.
Definition: DeferredUpdates.php:118
BaseBlacklist\getWhitelists
getWhitelists()
Returns the (local) whitelist.
Definition: BaseBlacklist.php:253
SpamBlacklist\antiSpoof
antiSpoof( $text)
Apply some basic anti-spoofing to the links before they get filtered, see.
Definition: SpamBlacklist.php:37
SpamBlacklist\doLogging
doLogging(User $user, Title $title, $revId)
Actually push the url change events post-save.
Definition: SpamBlacklist.php:214