MediaWiki  1.30.0
SpamBlacklist_body.php
Go to the documentation of this file.
<?php

// Not a valid standalone entry point: stop immediately unless the
// MEDIAWIKI constant has been defined by the including application.
if ( !defined( 'MEDIAWIKI' ) ) {
	exit;
}

use MediaWiki\MediaWikiServices;

/**
 * Blacklist implementation that checks external links against the
 * configured spam blacklist/whitelist regexes and optionally records
 * link additions/removals for event logging.
 */
class SpamBlacklist extends BaseBlacklist {
	/** Lifetime, in seconds, of a cached "nothing matched" result */
	const STASH_TTL = 180;
	/** Age, in seconds, after which a cached result is re-checked while stashing */
	const STASH_AGE_DYING = 150;

	/**
	 * Changes to external links, for logging purposes
	 * @var array[]
	 */
	private $urlChangeLog = [];

	/**
	 * Returns the code for the blacklist implementation
	 *
	 * @return string
	 */
	protected function getBlacklistType() {
		return 'spam';
	}

	/**
	 * Apply some basic anti-spoofing to the links before they get filtered,
	 * see bug 12896
	 *
	 * @param string $text
	 *
	 * @return string
	 */
	protected function antiSpoof( $text ) {
		// Fold U+FF0E FULLWIDTH FULL STOP into an ASCII dot. The needle is
		// spelled as raw UTF-8 bytes so that Unicode normalisation of this
		// file cannot silently turn the replacement into a no-op.
		$text = str_replace( "\xEF\xBC\x8E", '.', $text );
		return $text;
	}

	/**
	 * Check a set of links against the blacklist.
	 *
	 * @param string[] $links Links to check
	 * @param Title|null $title Page the links belong to. When given, links
	 *   already present on the page are not treated as added; when null,
	 *   every link in $links is treated as added.
	 * @param bool $preventLog When true, neither URL changes nor filter hits are logged
	 * @param string $mode 'check' or 'stash'. In 'check' mode a cached negative
	 *   result short-circuits the check; in 'stash' mode the check is skipped
	 *   only while the cached result is younger than STASH_AGE_DYING.
	 *
	 * @return string[]|bool Matched text(s) if anything hit the blacklist; false otherwise
	 */
	public function filter( array $links, Title $title = null, $preventLog = false, $mode = 'check' ) {
		global $wgRequest;

		$statsd = MediaWikiServices::getInstance()->getStatsdDataFactory();
		$cache = ObjectCache::getLocalClusterInstance();

		// If there are no new links, and we are logging,
		// mark all of the current links as being removed.
		// Without a title there are no current links to look up.
		if ( !$links && $title !== null && $this->isLoggingEnabled() ) {
			$this->logUrlChanges( $this->getCurrentLinks( $title ), [], [] );
		}

		if ( !$links ) {
			return false;
		}

		sort( $links );
		$key = $cache->makeKey(
			'blacklist',
			$this->getBlacklistType(),
			'pass',
			sha1( implode( "\n", $links ) ),
			(string)$title
		);
		// Skip blacklist checks if nothing matched during edit stashing...
		$knownNonMatchAsOf = $cache->get( $key );
		if ( $mode === 'check' ) {
			if ( $knownNonMatchAsOf ) {
				$statsd->increment( 'spamblacklist.check-stash.hit' );

				return false;
			} else {
				$statsd->increment( 'spamblacklist.check-stash.miss' );
			}
		} elseif ( $mode === 'stash' ) {
			if ( $knownNonMatchAsOf && ( time() - $knownNonMatchAsOf ) < self::STASH_AGE_DYING ) {
				return false; // OK; not about to expire soon
			}
		}

		$blacklists = $this->getBlacklists();
		$whitelists = $this->getWhitelists();

		if ( count( $blacklists ) ) {
			// poor man's anti-spoof, see bug 12896
			$newLinks = array_map( [ $this, 'antiSpoof' ], $links );

			$oldLinks = [];
			if ( $title !== null ) {
				$oldLinks = $this->getCurrentLinks( $title );
				$addedLinks = array_diff( $newLinks, $oldLinks );
			} else {
				// can't load old links, so treat all links as added.
				$addedLinks = $newLinks;
			}

			wfDebugLog( 'SpamBlacklist', "Old URLs: " . implode( ', ', $oldLinks ) );
			wfDebugLog( 'SpamBlacklist', "New URLs: " . implode( ', ', $newLinks ) );
			wfDebugLog( 'SpamBlacklist', "Added URLs: " . implode( ', ', $addedLinks ) );

			if ( !$preventLog ) {
				$this->logUrlChanges( $oldLinks, $newLinks, $addedLinks );
			}

			// One added URL per line; this is the text the regexes run against.
			$linkText = implode( "\n", $addedLinks );

			# Strip whitelisted URLs from the match
			if ( is_array( $whitelists ) ) {
				wfDebugLog( 'SpamBlacklist', "Excluding whitelisted URLs from " . count( $whitelists ) .
					" regexes: " . implode( ', ', $whitelists ) . "\n" );
				foreach ( $whitelists as $regex ) {
					// Regexes are site-configured; a broken one must not emit warnings.
					wfSuppressWarnings();
					$stripped = preg_replace( $regex, '', $linkText );
					wfRestoreWarnings();
					if ( is_string( $stripped ) ) {
						// If there wasn't a regex error, strip the matching URLs
						$linkText = $stripped;
					}
				}
			}

			# Do the match
			wfDebugLog( 'SpamBlacklist', "Checking text against " . count( $blacklists ) .
				" regexes: " . implode( ', ', $blacklists ) . "\n" );
			$retVal = false;
			foreach ( $blacklists as $regex ) {
				wfSuppressWarnings();
				$matches = [];
				$check = ( preg_match_all( $regex, $linkText, $matches ) > 0 );
				wfRestoreWarnings();
				if ( $check ) {
					wfDebugLog( 'SpamBlacklist', "Match!\n" );
					$ip = $wgRequest->getIP();
					$fullUrls = [];
					// Re-run the pattern extended to end of line to capture whole URLs.
					$fullLineRegex = substr( $regex, 0, strrpos( $regex, '/' ) ) . '.*/Sim';
					preg_match_all( $fullLineRegex, $linkText, $fullUrls );
					$imploded = implode( ' ', $fullUrls[0] );
					wfDebugLog( 'SpamBlacklistHit', "$ip caught submitting spam: $imploded\n" );
					// A log entry needs a target page, so skip it when no title is known.
					if ( !$preventLog && $title !== null ) {
						$this->logFilterHit( $title, $imploded ); // Log it
					}
					if ( $retVal === false ) {
						$retVal = [];
					}
					$retVal = array_merge( $retVal, $fullUrls[1] );
				}
			}
			if ( is_array( $retVal ) ) {
				$retVal = array_unique( $retVal );
			}
		} else {
			$retVal = false;
		}

		if ( $retVal === false ) {
			// Cache the typical negative results
			$cache->set( $key, time(), self::STASH_TTL );
			if ( $mode === 'stash' ) {
				$statsd->increment( 'spamblacklist.check-stash.store' );
			}
		}

		return $retVal;
	}

	/**
	 * Whether URL change events should be recorded: requires
	 * $wgSpamBlacklistEventLogging to be truthy and the EventLogging class to exist.
	 *
	 * @return bool
	 */
	public function isLoggingEnabled() {
		global $wgSpamBlacklistEventLogging;
		return $wgSpamBlacklistEventLogging && class_exists( 'EventLogging' );
	}

	/**
	 * Diff added/removed urls and generate events for them
	 *
	 * @param string[] $oldLinks
	 * @param string[] $newLinks
	 * @param string[] $addedLinks
	 */
	public function logUrlChanges( $oldLinks, $newLinks, $addedLinks ) {
		if ( !$this->isLoggingEnabled() ) {
			return;
		}

		$removedLinks = array_diff( $oldLinks, $newLinks );
		foreach ( $addedLinks as $url ) {
			$this->logUrlChange( $url, 'insert' );
		}

		foreach ( $removedLinks as $url ) {
			$this->logUrlChange( $url, 'remove' );
		}
	}

	/**
	 * Actually push the url change events post-save
	 *
	 * @param User $user
	 * @param Title $title
	 * @param int $revId
	 */
	public function doLogging( User $user, Title $title, $revId ) {
		if ( !$this->isLoggingEnabled() ) {
			return;
		}

		$baseInfo = [
			'revId' => $revId,
			'pageId' => $title->getArticleID(),
			'pageNamespace' => $title->getNamespace(),
			'userId' => $user->getId(),
			'userText' => $user->getName(),
		];
		$changes = $this->urlChangeLog;
		// Empty the changes queue in case this function gets called more than once
		$this->urlChangeLog = [];

		DeferredUpdates::addCallableUpdate( function () use ( $changes, $baseInfo ) {
			foreach ( $changes as $change ) {
				EventLogging::logEvent(
					'ExternalLinksChange',
					15716074,
					$baseInfo + $change
				);
			}
		} );
	}

	/**
	 * Queue log data about change for a url addition or removal
	 *
	 * URLs that wfParseUrl() cannot resolve to a host are skipped.
	 *
	 * @param string $url
	 * @param string $action 'insert' or 'remove'
	 */
	private function logUrlChange( $url, $action ) {
		$parsed = wfParseUrl( $url );
		if ( !isset( $parsed['host'] ) ) {
			wfDebugLog( 'SpamBlacklist', "Unable to parse $url" );
			return;
		}
		$info = [
			'action' => $action,
			'protocol' => $parsed['scheme'],
			'domain' => $parsed['host'],
			'path' => isset( $parsed['path'] ) ? $parsed['path'] : '',
			'query' => isset( $parsed['query'] ) ? $parsed['query'] : '',
			'fragment' => isset( $parsed['fragment'] ) ? $parsed['fragment'] : '',
		];

		$this->urlChangeLog[] = $info;
	}

	/**
	 * Look up the links currently in the article, so we can ignore them
	 * on a second run.
	 *
	 * The result is cached for one minute, keyed by the page's latest revision ID.
	 *
	 * @param Title $title
	 * @return array Values of externallinks.el_to for the page
	 */
	public function getCurrentLinks( Title $title ) {
		$cache = ObjectCache::getMainWANInstance();
		return $cache->getWithSetCallback(
			// Key is warmed via warmCachesForFilter() from ApiStashEdit
			$cache->makeKey( 'external-link-list', $title->getLatestRevID() ),
			$cache::TTL_MINUTE,
			function ( $oldValue, &$ttl, array &$setOpts ) use ( $title ) {
				$dbr = wfGetDB( DB_SLAVE );
				$setOpts += Database::getCacheSetOptions( $dbr );

				return $dbr->selectFieldValues(
					'externallinks',
					'el_to',
					[ 'el_from' => $title->getArticleID() ], // should be zero queries
					__METHOD__
				);
			}
		);
	}

	/**
	 * Run the filter in 'stash' mode with logging disabled, so that a later
	 * 'check' for the same links and title can reuse the cached result.
	 *
	 * @param Title $title
	 * @param string[] $entries Links to pre-check
	 */
	public function warmCachesForFilter( Title $title, array $entries ) {
		$this->filter( $entries, $title, true /* no logging */, 'stash' );
	}

	/**
	 * Returns the start of the regex for matches
	 *
	 * @return string
	 */
	public function getRegexStart() {
		return '/(?:https?:)?\/\/+[a-z0-9_\-.]*(';
	}

	/**
	 * Returns the end of the regex for matches
	 *
	 * @param mixed $batchSize Forwarded unchanged to BaseBlacklist::getRegexEnd()
	 * @return string Closing parenthesis followed by the parent's regex end
	 */
	public function getRegexEnd( $batchSize ) {
		return ')' . parent::getRegexEnd( $batchSize );
	}

	/**
	 * Logs the filter hit to Special:Log if
	 * $wgLogSpamBlacklistHits is enabled.
	 *
	 * @param Title $title
	 * @param string $url URL(s) that tripped the filter
	 */
	public function logFilterHit( $title, $url ) {
		global $wgUser, $wgLogSpamBlacklistHits;
		if ( $wgLogSpamBlacklistHits ) {
			$logEntry = new ManualLogEntry( 'spamblacklist', 'hit' );
			$logEntry->setPerformer( $wgUser );
			$logEntry->setTarget( $title );
			$logEntry->setParameters( [
				'4::url' => $url,
			] );
			$logid = $logEntry->insert();
			$log = new LogPage( 'spamblacklist' );
			if ( $log->isRestricted() ) {
				// Make sure checkusers can see this action if the log is restricted
				// (which is the default)
				if ( ExtensionRegistry::getInstance()->isLoaded( 'CheckUser' )
					&& class_exists( 'CheckUserHooks' )
				) {
					$rc = $logEntry->getRecentChange( $logid );
					CheckUserHooks::updateCheckUserData( $rc );
				}
			} else {
				// If the log is unrestricted, publish normally to RC,
				// which will also update checkuser
				$logEntry->publish( $logid, "rc" );
			}
		}
	}
}
$user
please add to it if you're going to add events to the MediaWiki code where normally authentication against an external auth plugin would be creating an account $user
Definition: hooks.txt:244
$wgUser
$wgUser
Definition: Setup.php:809
SpamBlacklist\logUrlChange
logUrlChange( $url, $action)
Queue log data about change for a url addition or removal.
Definition: SpamBlacklist_body.php:245
SpamBlacklist\getRegexStart
getRegexStart()
Returns the start of the regex for matches.
Definition: SpamBlacklist_body.php:300
ObjectCache\getLocalClusterInstance
static getLocalClusterInstance()
Get the main cluster-local cache object.
Definition: ObjectCache.php:357
BaseBlacklist\getBlacklists
getBlacklists()
Fetch local and (possibly cached) remote blacklists.
Definition: BaseBlacklist.php:204
SpamBlacklist
Definition: SpamBlacklist_body.php:9
captcha-old.count
count
Definition: captcha-old.py:249
use
as see the revision history and available at free of to any person obtaining a copy of this software and associated documentation to deal in the Software without including without limitation the rights to use
Definition: MIT-LICENSE.txt:10
wfSuppressWarnings
wfSuppressWarnings( $end=false)
Reference-counted warning suppression.
Definition: GlobalFunctions.php:2020
DB_SLAVE
const DB_SLAVE
Definition: Defines.php:37
SpamBlacklist\getCurrentLinks
getCurrentLinks(Title $title)
Look up the links currently in the article, so we can ignore them on a second run.
Definition: SpamBlacklist_body.php:271
wfDebugLog
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
Definition: GlobalFunctions.php:1140
php
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
ExtensionRegistry\getInstance
static getInstance()
Definition: ExtensionRegistry.php:80
SpamBlacklist\getRegexEnd
getRegexEnd( $batchSize)
Returns the end of the regex for matches.
Definition: SpamBlacklist_body.php:310
DeferredUpdates\addCallableUpdate
static addCallableUpdate( $callable, $stage=self::POSTSEND, IDatabase $dbw=null)
Add a callable update.
Definition: DeferredUpdates.php:111
wfParseUrl
wfParseUrl( $url)
parse_url() work-alike, but non-broken.
Definition: GlobalFunctions.php:866
SpamBlacklist\logFilterHit
logFilterHit( $title, $url)
Logs the filter hit to Special:Log if $wgLogSpamBlacklistHits is enabled.
Definition: SpamBlacklist_body.php:320
$title
namespace and then decline to actually register it file or subcat img or subcat $title
Definition: hooks.txt:932
wfRestoreWarnings
wfRestoreWarnings()
Definition: GlobalFunctions.php:2028
wfGetDB
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
Definition: GlobalFunctions.php:2856
$matches
$matches
Definition: NoLocalSettings.php:24
BaseBlacklist
Base class for different kinds of blacklists.
Definition: BaseBlacklist.php:6
LogPage
Class to simplify the use of log pages.
Definition: LogPage.php:31
SpamBlacklist\warmCachesForFilter
warmCachesForFilter(Title $title, array $entries)
Definition: SpamBlacklist_body.php:291
SpamBlacklist\STASH_AGE_DYING
const STASH_AGE_DYING
Definition: SpamBlacklist_body.php:11
global
when a variable name is used in a it is silently declared as a new masking the global
Definition: design.txt:93
SpamBlacklist\$urlChangeLog
array[] $urlChangeLog
Changes to external links, for logging purposes.
Definition: SpamBlacklist_body.php:17
SpamBlacklist\logUrlChanges
logUrlChanges( $oldLinks, $newLinks, $addedLinks)
Diff added/removed urls and generate events for them.
Definition: SpamBlacklist_body.php:190
SpamBlacklist\filter
filter(array $links, Title $title=null, $preventLog=false, $mode='check')
Definition: SpamBlacklist_body.php:54
SpamBlacklist\isLoggingEnabled
isLoggingEnabled()
Definition: SpamBlacklist_body.php:178
Title
Represents a title within MediaWiki.
Definition: Title.php:39
$dbr
if(! $regexes) $dbr
Definition: cleanup.php:94
$cache
$cache
Definition: mcc.php:33
ObjectCache\getMainWANInstance
static getMainWANInstance()
Get the main WAN cache object.
Definition: ObjectCache.php:370
SpamBlacklist\getBlacklistType
getBlacklistType()
Returns the code for the blacklist implementation.
Definition: SpamBlacklist_body.php:24
as
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users. It's targeted particularly at maintainers for Linux, since it's been observed that distribution packages of MediaWiki often break. We've consistently had to recommend that users seeking support use official tarballs instead of their distribution's, and this often solves whatever problem the user is having. It would be nice if this could such as
Definition: distributors.txt:9
ManualLogEntry
Class for creating log entries manually, to inject them into the database.
Definition: LogEntry.php:400
SpamBlacklist\STASH_TTL
const STASH_TTL
Definition: SpamBlacklist_body.php:10
$wgRequest
if(! $wgDBerrorLogTZ) $wgRequest
Definition: Setup.php:662
MediaWikiServices
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency MediaWikiServices
Definition: injection.txt:23
User
The User object encapsulates all of the user-specific settings (user_id, name, rights,...
Definition: User.php:51
BaseBlacklist\getWhitelists
getWhitelists()
Returns the (local) whitelist.
Definition: BaseBlacklist.php:236
SpamBlacklist\antiSpoof
antiSpoof( $text)
Apply some basic anti-spoofing to the links before they get filtered, see bug 12896.
Definition: SpamBlacklist_body.php:36
SpamBlacklist\doLogging
doLogging(User $user, Title $title, $revId)
Actually push the url change events post-save.
Definition: SpamBlacklist_body.php:212
array
the array() calling protocol came about after MediaWiki 1.4rc1.