MediaWiki  1.31.0
SpamBlacklist.php
Go to the documentation of this file.
<?php

// Not a valid entry point: stop right away unless the MEDIAWIKI constant
// has been defined by the code that loads this file.
if ( !defined( 'MEDIAWIKI' ) ) {
	exit;
}
6 
9 
use MediaWiki\MediaWikiServices;
use Wikimedia\Rdbms\Database;

/**
 * Blacklist implementation (type 'spam') that checks external links against
 * the blacklist regexes provided by BaseBlacklist.
 */
class SpamBlacklist extends BaseBlacklist {
	/** TTL handed to the cache when a "nothing matched" result is stored by filter() */
	const STASH_TTL = 180;
	/** Age in seconds after which a cached negative result is re-checked in 'stash' mode */
	const STASH_AGE_DYING = 150;

	/**
	 * Changes to external links, for logging purposes
	 * @var array[]
	 */
	private $urlChangeLog = [];
19 
/**
 * Returns the code for the blacklist implementation.
 *
 * @return string Always 'spam'
 */
protected function getBlacklistType() {
	return 'spam';
}
28 
/**
 * Apply some basic anti-spoofing to the links before they get filtered.
 *
 * Replaces the fullwidth full stop (U+FF0E) with an ASCII '.', so that a
 * host name written with the look-alike character is matched like the
 * ordinary spelling.
 *
 * @param string $text
 * @return string
 */
protected function antiSpoof( $text ) {
	// "\xEF\xBC\x8E" is the UTF-8 encoding of U+FF0E. It is spelled as a byte
	// escape so the character cannot be silently flattened to '.', which
	// would turn this replacement into a no-op.
	return str_replace( "\xEF\xBC\x8E", '.', $text );
}
41 
/**
 * Check a list of external links against the spam blacklist.
 *
 * A "nothing matched" result is cached for STASH_TTL under a key derived from
 * the sorted links and the title, so a run in 'stash' mode can spare the
 * later 'check' run the regex work.
 *
 * @param string[] $links Links to check
 * @param Title|null $title Page the links belong to. When null, the links
 *  already on the page cannot be loaded and every link is treated as added.
 * @param bool $preventLog When true, neither URL changes nor filter hits are logged
 * @param string $mode 'check' (use a cached negative result if present) or
 *  'stash' (refresh the cached negative result when it is about to expire)
 * @return string[]|bool Unique first-capture-group matches of the blacklist
 *  regexes, or false when nothing matched
 */
public function filter( array $links, Title $title = null, $preventLog = false, $mode = 'check' ) {
	global $wgRequest;

	$statsd = MediaWikiServices::getInstance()->getStatsdDataFactory();
	$cache = ObjectCache::getLocalClusterInstance();

	if ( !$links ) {
		// If there are no new links, and we are logging,
		// mark all of the current links as being removed.
		// The current links can only be looked up for a known title.
		if ( $title !== null && $this->isLoggingEnabled() ) {
			$this->logUrlChanges( $this->getCurrentLinks( $title ), [], [] );
		}

		return false;
	}

	sort( $links );
	$key = $cache->makeKey(
		'blacklist',
		$this->getBlacklistType(),
		'pass',
		sha1( implode( "\n", $links ) ),
		(string)$title
	);
	// Skip blacklist checks if nothing matched during edit stashing...
	$knownNonMatchAsOf = $cache->get( $key );
	if ( $mode === 'check' ) {
		if ( $knownNonMatchAsOf ) {
			$statsd->increment( 'spamblacklist.check-stash.hit' );

			return false;
		}
		$statsd->increment( 'spamblacklist.check-stash.miss' );
	} elseif ( $mode === 'stash' ) {
		if ( $knownNonMatchAsOf && ( time() - $knownNonMatchAsOf ) < self::STASH_AGE_DYING ) {
			return false; // OK; not about to expire soon
		}
	}

	$blacklists = $this->getBlacklists();
	$whitelists = $this->getWhitelists();

	$retVal = false;
	if ( count( $blacklists ) ) {
		// poor man's anti-spoof, see bug 12896
		$newLinks = array_map( [ $this, 'antiSpoof' ], $links );

		$oldLinks = [];
		if ( $title !== null ) {
			$oldLinks = $this->getCurrentLinks( $title );
			$addedLinks = array_diff( $newLinks, $oldLinks );
		} else {
			// can't load old links, so treat all links as added.
			$addedLinks = $newLinks;
		}

		wfDebugLog( 'SpamBlacklist', "Old URLs: " . implode( ', ', $oldLinks ) );
		wfDebugLog( 'SpamBlacklist', "New URLs: " . implode( ', ', $newLinks ) );
		wfDebugLog( 'SpamBlacklist', "Added URLs: " . implode( ', ', $addedLinks ) );

		if ( !$preventLog ) {
			$this->logUrlChanges( $oldLinks, $newLinks, $addedLinks );
		}

		// Only the added links are matched, one per line.
		$text = implode( "\n", $addedLinks );

		# Strip whitelisted URLs from the match
		if ( is_array( $whitelists ) ) {
			wfDebugLog( 'SpamBlacklist', "Excluding whitelisted URLs from " . count( $whitelists ) .
				" regexes: " . implode( ', ', $whitelists ) . "\n" );
			foreach ( $whitelists as $regex ) {
				// A broken whitelist regex must not emit warnings
				wfSuppressWarnings();
				$stripped = preg_replace( $regex, '', $text );
				wfRestoreWarnings();
				if ( is_string( $stripped ) ) {
					// If there wasn't a regex error, strip the matching URLs
					$text = $stripped;
				}
			}
		}

		# Do the match
		wfDebugLog( 'SpamBlacklist', "Checking text against " . count( $blacklists ) .
			" regexes: " . implode( ', ', $blacklists ) . "\n" );
		foreach ( $blacklists as $regex ) {
			// A broken blacklist regex must not emit warnings
			wfSuppressWarnings();
			$matches = [];
			$check = ( preg_match_all( $regex, $text, $matches ) > 0 );
			wfRestoreWarnings();
			if ( $check ) {
				wfDebugLog( 'SpamBlacklist', "Match!\n" );
				$ip = $wgRequest->getIP();
				$fullUrls = [];
				// Same pattern, extended to the end of the line, to capture whole URLs
				$fullLineRegex = substr( $regex, 0, strrpos( $regex, '/' ) ) . '.*/Sim';
				preg_match_all( $fullLineRegex, $text, $fullUrls );
				$imploded = implode( ' ', $fullUrls[0] );
				wfDebugLog( 'SpamBlacklistHit', "$ip caught submitting spam: $imploded\n" );
				if ( !$preventLog ) {
					$this->logFilterHit( $title, $imploded ); // Log it
				}
				if ( $retVal === false ) {
					$retVal = [];
				}
				$retVal = array_merge( $retVal, $fullUrls[1] );
			}
		}
		if ( is_array( $retVal ) ) {
			$retVal = array_unique( $retVal );
		}
	}

	if ( $retVal === false ) {
		// Cache the typical negative results
		$cache->set( $key, time(), self::STASH_TTL );
		if ( $mode === 'stash' ) {
			$statsd->increment( 'spamblacklist.check-stash.store' );
		}
	}

	return $retVal;
}
178 
/**
 * Whether URL-change logging is active: $wgSpamBlacklistEventLogging must
 * be truthy and the EventLogging class must exist.
 *
 * @return bool
 */
public function isLoggingEnabled() {
	global $wgSpamBlacklistEventLogging;

	return $wgSpamBlacklistEventLogging && class_exists( 'EventLogging' );
}
183 
/**
 * Diff added/removed urls and generate events for them.
 *
 * Does nothing when logging is disabled. Removed links are the entries of
 * $oldLinks that are absent from $newLinks.
 *
 * @param string[] $oldLinks
 * @param string[] $newLinks
 * @param string[] $addedLinks
 */
public function logUrlChanges( $oldLinks, $newLinks, $addedLinks ) {
	if ( !$this->isLoggingEnabled() ) {
		return;
	}

	foreach ( $addedLinks as $url ) {
		$this->logUrlChange( $url, 'insert' );
	}

	$removedLinks = array_diff( $oldLinks, $newLinks );
	foreach ( $removedLinks as $url ) {
		$this->logUrlChange( $url, 'remove' );
	}
}
205 
/**
 * Actually push the url change events post-save.
 *
 * Takes the queued changes, empties the queue, and schedules a deferred
 * update that sends one 'ExternalLinksChange' event per change.
 *
 * @param User $user
 * @param Title $title
 * @param int $revId
 */
public function doLogging( User $user, Title $title, $revId ) {
	if ( !$this->isLoggingEnabled() ) {
		return;
	}

	// Fields shared by every event sent for this save
	$baseInfo = [
		'revId' => $revId,
		'pageId' => $title->getArticleID(),
		'pageNamespace' => $title->getNamespace(),
		'userId' => $user->getId(),
		'userText' => $user->getName(),
	];
	$changes = $this->urlChangeLog;
	// Empty the changes queue in case this function gets called more than once
	$this->urlChangeLog = [];

	DeferredUpdates::addCallableUpdate( function () use ( $changes, $baseInfo ) {
		foreach ( $changes as $change ) {
			EventLogging::logEvent(
				'ExternalLinksChange',
				15716074,
				$baseInfo + $change
			);
		}
	} );
}
239 
/**
 * Queue log data about change for a url addition or removal.
 *
 * URLs for which wfParseUrl() yields no host are written to the debug log
 * and skipped.
 *
 * @param string $url
 * @param string $action 'insert' or 'remove'
 */
private function logUrlChange( $url, $action ) {
	$parsed = wfParseUrl( $url );
	if ( !isset( $parsed['host'] ) ) {
		wfDebugLog( 'SpamBlacklist', "Unable to parse $url" );
		return;
	}

	// Optional URL components default to an empty string
	$this->urlChangeLog[] = [
		'action' => $action,
		'protocol' => $parsed['scheme'],
		'domain' => $parsed['host'],
		'path' => isset( $parsed['path'] ) ? $parsed['path'] : '',
		'query' => isset( $parsed['query'] ) ? $parsed['query'] : '',
		'fragment' => isset( $parsed['fragment'] ) ? $parsed['fragment'] : '',
	];
}
263 
/**
 * Look up the links currently in the article, so we can ignore them on a
 * second run.
 *
 * The result is cached for one minute under a key built from the title's
 * latest revision ID.
 *
 * @param Title $title
 * @return array el_to values of the page's rows in the externallinks table
 */
public function getCurrentLinks( Title $title ) {
	$cache = ObjectCache::getMainWANInstance();

	return $cache->getWithSetCallback(
		// Key is warmed via warmCachesForFilter() from ApiStashEdit
		$cache->makeKey( 'external-link-list', $title->getLatestRevID() ),
		$cache::TTL_MINUTE,
		function ( $oldValue, &$ttl, array &$setOpts ) use ( $title ) {
			$dbr = wfGetDB( DB_REPLICA );
			$setOpts += Database::getCacheSetOptions( $dbr );

			return $dbr->selectFieldValues(
				'externallinks',
				'el_to',
				[ 'el_from' => $title->getArticleID() ], // should be zero queries
				__METHOD__
			);
		}
	);
}
291 
/**
 * Run filter() in 'stash' mode with logging disabled, so that its cached
 * results are in place before the real check.
 *
 * @param Title $title
 * @param string[] $entries Links to check
 */
public function warmCachesForFilter( Title $title, array $entries ) {
	$this->filter( $entries, $title, true /* no logging */, 'stash' );
}
295 
/**
 * Returns the start of the regex for matches.
 *
 * The fragment ends by opening a capture group, which getRegexEnd() closes.
 *
 * @return string
 */
public function getRegexStart() {
	return '/(?:https?:)?\/\/+[a-z0-9_\-.]*(';
}
304 
/**
 * Returns the end of the regex for matches.
 *
 * Closes the capture group opened by getRegexStart(), then appends the
 * parent implementation's ending.
 *
 * @param int $batchSize Passed through to the parent implementation
 * @return string
 */
public function getRegexEnd( $batchSize ) {
	return ')' . parent::getRegexEnd( $batchSize );
}
/**
 * Logs the filter hit to Special:Log if $wgLogSpamBlacklistHits is enabled.
 *
 * @param Title $title
 * @param string $url URL that the user attempted to add
 */
public function logFilterHit( $title, $url ) {
	global $wgUser, $wgLogSpamBlacklistHits;

	if ( !$wgLogSpamBlacklistHits ) {
		return;
	}

	$logEntry = new ManualLogEntry( 'spamblacklist', 'hit' );
	$logEntry->setPerformer( $wgUser );
	$logEntry->setTarget( $title );
	$logEntry->setParameters( [
		'4::url' => $url,
	] );
	$logid = $logEntry->insert();

	$log = new LogPage( 'spamblacklist' );
	if ( !$log->isRestricted() ) {
		// If the log is unrestricted, publish normally to RC,
		// which will also update checkuser
		$logEntry->publish( $logid, "rc" );
		return;
	}

	// Make sure checkusers can see this action if the log is restricted
	// (which is the default)
	if ( ExtensionRegistry::getInstance()->isLoaded( 'CheckUser' )
		&& class_exists( 'CheckUserHooks' )
	) {
		$rc = $logEntry->getRecentChange( $logid );
		CheckUserHooks::updateCheckUserData( $rc );
	}
}
348 }
Wikimedia\Rdbms\Database
Relational database abstraction object.
Definition: Database.php:48
$user
please add to it if you re going to add events to the MediaWiki code where normally authentication against an external auth plugin would be creating a account $user
Definition: hooks.txt:244
$wgUser
$wgUser
Definition: Setup.php:894
SpamBlacklist\logUrlChange
logUrlChange( $url, $action)
Queue log data about change for a url addition or removal.
Definition: SpamBlacklist.php:246
SpamBlacklist\getRegexStart
getRegexStart()
Returns the start of the regex for matches.
Definition: SpamBlacklist.php:301
ObjectCache\getLocalClusterInstance
static getLocalClusterInstance()
Get the main cluster-local cache object.
Definition: ObjectCache.php:367
BaseBlacklist\getBlacklists
getBlacklists()
Fetch local and (possibly cached) remote blacklists.
Definition: BaseBlacklist.php:219
SpamBlacklist
Definition: SpamBlacklist.php:10
captcha-old.count
count
Definition: captcha-old.py:249
use
as see the revision history and available at free of to any person obtaining a copy of this software and associated documentation to deal in the Software without including without limitation the rights to use
Definition: MIT-LICENSE.txt:10
wfSuppressWarnings
wfSuppressWarnings( $end=false)
Reference-counted warning suppression.
Definition: GlobalFunctions.php:1948
SpamBlacklist\getCurrentLinks
getCurrentLinks(Title $title)
Look up the links currently in the article, so we can ignore them on a second run.
Definition: SpamBlacklist.php:272
wfDebugLog
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
Definition: GlobalFunctions.php:1075
php
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
$dbr
$dbr
Definition: testCompression.php:50
ExtensionRegistry\getInstance
static getInstance()
Definition: ExtensionRegistry.php:88
SpamBlacklist\getRegexEnd
getRegexEnd( $batchSize)
Returns the end of the regex for matches.
Definition: SpamBlacklist.php:311
wfParseUrl
wfParseUrl( $url)
parse_url() work-alike, but non-broken.
Definition: GlobalFunctions.php:801
SpamBlacklist\logFilterHit
logFilterHit( $title, $url)
Logs the filter hit to Special:Log if $wgLogSpamBlacklistHits is enabled.
Definition: SpamBlacklist.php:321
$title
namespace and then decline to actually register it file or subcat img or subcat $title
Definition: hooks.txt:934
wfRestoreWarnings
wfRestoreWarnings()
Definition: GlobalFunctions.php:1956
wfGetDB
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
Definition: GlobalFunctions.php:2800
$matches
$matches
Definition: NoLocalSettings.php:24
BaseBlacklist
Base class for different kinds of blacklists.
Definition: BaseBlacklist.php:6
LogPage
Class to simplify the use of log pages.
Definition: LogPage.php:31
SpamBlacklist\warmCachesForFilter
warmCachesForFilter(Title $title, array $entries)
Definition: SpamBlacklist.php:292
SpamBlacklist\STASH_AGE_DYING
const STASH_AGE_DYING
Definition: SpamBlacklist.php:12
global
when a variable name is used in a it is silently declared as a new masking the global
Definition: design.txt:93
DB_REPLICA
const DB_REPLICA
Definition: defines.php:25
SpamBlacklist\$urlChangeLog
array[] $urlChangeLog
Changes to external links, for logging purposes.
Definition: SpamBlacklist.php:18
SpamBlacklist\logUrlChanges
logUrlChanges( $oldLinks, $newLinks, $addedLinks)
Diff added/removed urls and generate events for them.
Definition: SpamBlacklist.php:191
SpamBlacklist\filter
filter(array $links, Title $title=null, $preventLog=false, $mode='check')
Definition: SpamBlacklist.php:55
SpamBlacklist\isLoggingEnabled
isLoggingEnabled()
Definition: SpamBlacklist.php:179
Title
Represents a title within MediaWiki.
Definition: Title.php:39
$cache
$cache
Definition: mcc.php:33
ObjectCache\getMainWANInstance
static getMainWANInstance()
Get the main WAN cache object.
Definition: ObjectCache.php:380
SpamBlacklist\getBlacklistType
getBlacklistType()
Returns the code for the blacklist implementation.
Definition: SpamBlacklist.php:25
as
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
ManualLogEntry
Class for creating log entries manually, to inject them into the database.
Definition: LogEntry.php:432
SpamBlacklist\STASH_TTL
const STASH_TTL
Definition: SpamBlacklist.php:11
$wgRequest
if(! $wgDBerrorLogTZ) $wgRequest
Definition: Setup.php:737
MediaWikiServices
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency MediaWikiServices
Definition: injection.txt:23
User
The User object encapsulates all of the user-specific settings (user_id, name, rights,...
Definition: User.php:53
DeferredUpdates\addCallableUpdate
static addCallableUpdate( $callable, $stage=self::POSTSEND, $dbw=null)
Add a callable update.
Definition: DeferredUpdates.php:111
BaseBlacklist\getWhitelists
getWhitelists()
Returns the (local) whitelist.
Definition: BaseBlacklist.php:251
SpamBlacklist\antiSpoof
antiSpoof( $text)
Apply some basic anti-spoofing to the links before they get filtered; see bug 12896.
Definition: SpamBlacklist.php:37
SpamBlacklist\doLogging
doLogging(User $user, Title $title, $revId)
Actually push the url change events post-save.
Definition: SpamBlacklist.php:213
array
the array() calling protocol came about after MediaWiki 1.4rc1.