MediaWiki  1.32.0
SpamBlacklist.php
Go to the documentation of this file.
1 <?php
2 
3 if ( !defined( 'MEDIAWIKI' ) ) {
4  exit;
5 }
6 
9 
	/** Seconds that a known-negative ("no match") stash result stays cached. */
	const STASH_TTL = 180;
	/**
	 * Age in seconds at which a stashed negative result is considered close to
	 * expiry; filter() re-stashes results older than this (see the 'stash' mode).
	 */
	const STASH_AGE_DYING = 150;

	/** @var array[] Queued change records about external links, for logging purposes. */
	private $urlChangeLog = [];
19 
25  protected function getBlacklistType() {
26  return 'spam';
27  }
28 
37  protected function antiSpoof( $text ) {
38  $text = str_replace( '.', '.', $text );
39  return $text;
40  }
41 
	/**
	 * Check a batch of external links against the spam blacklist.
	 *
	 * @param string[] $links URLs to check
	 * @param Title|null $title Page the links are being added to, if known;
	 *        when null, every link is treated as newly added.
	 * @param bool $preventLog Suppress Special:Log / EventLogging output
	 *        (used when only testing, e.g. from warmCachesForFilter()).
	 * @param string $mode 'check' (normal edit) or 'stash' (edit stashing)
	 * @return string[]|bool Array of blacklist-matched fragments if the edit
	 *         should be blocked, false if all links pass.
	 */
	function filter( array $links, Title $title = null, $preventLog = false, $mode = 'check' ) {
		$statsd = MediaWikiServices::getInstance()->getStatsdDataFactory();
		// NOTE(review): the assignment of $cache (used below) is elided in this
		// rendering — presumably a WAN object cache handle obtained here;
		// confirm against the full source.

		// If there are no new links, and we are logging,
		// mark all of the current links as being removed.
		if ( !$links && $this->isLoggingEnabled() ) {
			$this->logUrlChanges( $this->getCurrentLinks( $title ), [], [] );
		}

		if ( !$links ) {
			return false;
		}

		// Sort so the cache key below is stable for the same set of links.
		sort( $links );
		$key = $cache->makeKey(
			'blacklist',
			$this->getBlacklistType(),
			'pass',
			sha1( implode( "\n", $links ) ),
			md5( (string)$title )
		);
		// Skip blacklist checks if nothing matched during edit stashing...
		$knownNonMatchAsOf = $cache->get( $key );
		if ( $mode === 'check' ) {
			if ( $knownNonMatchAsOf ) {
				$statsd->increment( 'spamblacklist.check-stash.hit' );

				return false;
			} else {
				$statsd->increment( 'spamblacklist.check-stash.miss' );
			}
		} elseif ( $mode === 'stash' ) {
			// Stashed value is a timestamp; only re-check if it is about to expire.
			if ( $knownNonMatchAsOf && ( time() - $knownNonMatchAsOf ) < self::STASH_AGE_DYING ) {
				return false; // OK; not about to expire soon
			}
		}

		$blacklists = $this->getBlacklists();
		$whitelists = $this->getWhitelists();

		if ( count( $blacklists ) ) {
			// poor man's anti-spoof, see bug 12896
			$newLinks = array_map( [ $this, 'antiSpoof' ], $links );

			$oldLinks = [];
			if ( $title !== null ) {
				$oldLinks = $this->getCurrentLinks( $title );
				// Only links not already on the page need to be checked.
				$addedLinks = array_diff( $newLinks, $oldLinks );
			} else {
				// can't load old links, so treat all links as added.
				$addedLinks = $newLinks;
			}

			wfDebugLog( 'SpamBlacklist', "Old URLs: " . implode( ', ', $oldLinks ) );
			wfDebugLog( 'SpamBlacklist', "New URLs: " . implode( ', ', $newLinks ) );
			wfDebugLog( 'SpamBlacklist', "Added URLs: " . implode( ', ', $addedLinks ) );

			if ( !$preventLog ) {
				$this->logUrlChanges( $oldLinks, $newLinks, $addedLinks );
			}

			// Join added links into one newline-separated haystack for the regexes.
			$links = implode( "\n", $addedLinks );

			# Strip whitelisted URLs from the match
			if ( is_array( $whitelists ) ) {
				wfDebugLog( 'SpamBlacklist', "Excluding whitelisted URLs from " . count( $whitelists ) .
					" regexes: " . implode( ', ', $whitelists ) . "\n" );
				foreach ( $whitelists as $regex ) {
					// NOTE(review): warning suppression around this preg_replace
					// (wfSuppressWarnings / wfRestoreWarnings) appears elided in
					// this rendering — confirm against the full source.
					$newLinks = preg_replace( $regex, '', $links );
					if ( is_string( $newLinks ) ) {
						// If there wasn't a regex error, strip the matching URLs
						$links = $newLinks;
					}
				}
			}

			# Do the match
			wfDebugLog( 'SpamBlacklist', "Checking text against " . count( $blacklists ) .
				" regexes: " . implode( ', ', $blacklists ) . "\n" );
			$retVal = false;
			foreach ( $blacklists as $regex ) {
				$matches = [];
				$check = ( preg_match_all( $regex, $links, $matches ) > 0 );
				if ( $check ) {
					wfDebugLog( 'SpamBlacklist', "Match!\n" );
					global $wgRequest;
					$ip = $wgRequest->getIP();
					$fullUrls = [];
					// Rebuild the regex so it captures the full line past the match,
					// for logging complete offending URLs.
					$fullLineRegex = substr( $regex, 0, strrpos( $regex, '/' ) ) . '.*/Sim';
					preg_match_all( $fullLineRegex, $links, $fullUrls );
					$imploded = implode( ' ', $fullUrls[0] );
					wfDebugLog( 'SpamBlacklistHit', "$ip caught submitting spam: $imploded\n" );
					if ( !$preventLog ) {
						$this->logFilterHit( $title, $imploded ); // Log it
					}
					if ( $retVal === false ) {
						$retVal = [];
					}
					$retVal = array_merge( $retVal, $fullUrls[1] );
				}
			}
			if ( is_array( $retVal ) ) {
				// Multiple regex batches can match the same URL; de-duplicate.
				$retVal = array_unique( $retVal );
			}
		} else {
			$retVal = false;
		}

		if ( $retVal === false ) {
			// Cache the typical negative results
			$cache->set( $key, time(), self::STASH_TTL );
			if ( $mode === 'stash' ) {
				$statsd->increment( 'spamblacklist.check-stash.store' );
			}
		}

		return $retVal;
	}
178 
179  public function isLoggingEnabled() {
180  global $wgSpamBlacklistEventLogging;
181  return $wgSpamBlacklistEventLogging && class_exists( 'EventLogging' );
182  }
183 
191  public function logUrlChanges( $oldLinks, $newLinks, $addedLinks ) {
192  if ( !$this->isLoggingEnabled() ) {
193  return;
194  }
195 
196  $removedLinks = array_diff( $oldLinks, $newLinks );
197  foreach ( $addedLinks as $url ) {
198  $this->logUrlChange( $url, 'insert' );
199  }
200 
201  foreach ( $removedLinks as $url ) {
202  $this->logUrlChange( $url, 'remove' );
203  }
204  }
205 
213  public function doLogging( User $user, Title $title, $revId ) {
214  if ( !$this->isLoggingEnabled() ) {
215  return;
216  }
217 
218  $baseInfo = [
219  'revId' => $revId,
220  'pageId' => $title->getArticleID(),
221  'pageNamespace' => $title->getNamespace(),
222  'userId' => $user->getId(),
223  'userText' => $user->getName(),
224  ];
225  $changes = $this->urlChangeLog;
226  // Empty the changes queue in case this function gets called more than once
227  $this->urlChangeLog = [];
228 
229  DeferredUpdates::addCallableUpdate( function () use ( $changes, $baseInfo ) {
230  foreach ( $changes as $change ) {
231  EventLogging::logEvent(
232  'ExternalLinksChange',
233  15716074,
234  $baseInfo + $change
235  );
236  }
237  } );
238  }
239 
246  private function logUrlChange( $url, $action ) {
247  $parsed = wfParseUrl( $url );
248  if ( !isset( $parsed['host'] ) ) {
249  wfDebugLog( 'SpamBlacklist', "Unable to parse $url" );
250  return;
251  }
252  $info = [
253  'action' => $action,
254  'protocol' => $parsed['scheme'],
255  'domain' => $parsed['host'],
256  'path' => isset( $parsed['path'] ) ? $parsed['path'] : '',
257  'query' => isset( $parsed['query'] ) ? $parsed['query'] : '',
258  'fragment' => isset( $parsed['fragment'] ) ? $parsed['fragment'] : '',
259  ];
260 
261  $this->urlChangeLog[] = $info;
262  }
263 
		// NOTE(review): the enclosing function signature — getCurrentLinks( Title $title )
		// per the index below — and the initialization of $cache are elided from this
		// rendering; confirm against the full source before editing.
		$fname = __METHOD__;
		return $cache->getWithSetCallback(
			// Key is warmed via warmCachesForFilter() from ApiStashEdit
			$cache->makeKey( 'external-link-list', $title->getLatestRevID() ),
			$cache::TTL_MINUTE,
			function ( $oldValue, &$ttl, array &$setOpts ) use ( $title, $fname ) {
				$dbr = wfGetDB( DB_REPLICA );
				// Propagate replica-lag information into the cache set options.
				$setOpts += Database::getCacheSetOptions( $dbr );

				// Fetch all external link targets currently recorded for this page.
				return $dbr->selectFieldValues(
					'externallinks',
					'el_to',
					[ 'el_from' => $title->getArticleID() ], // should be zero queries
					$fname
				);
			}
		);
	}
292 
293  public function warmCachesForFilter( Title $title, array $entries ) {
294  $this->filter( $entries, $title, true /* no logging */, 'stash' );
295  }
296 
302  public function getRegexStart() {
303  return '/(?:https?:)?\/\/+[a-z0-9_\-.]*(';
304  }
305 
312  public function getRegexEnd( $batchSize ) {
313  return ')' . parent::getRegexEnd( $batchSize );
314  }
322  public function logFilterHit( $title, $url ) {
323  global $wgUser, $wgLogSpamBlacklistHits;
324  if ( $wgLogSpamBlacklistHits ) {
325  $logEntry = new ManualLogEntry( 'spamblacklist', 'hit' );
326  $logEntry->setPerformer( $wgUser );
327  $logEntry->setTarget( $title );
328  $logEntry->setParameters( [
329  '4::url' => $url,
330  ] );
331  $logid = $logEntry->insert();
332  $log = new LogPage( 'spamblacklist' );
333  if ( $log->isRestricted() ) {
334  // Make sure checkusers can see this action if the log is restricted
335  // (which is the default)
336  if ( ExtensionRegistry::getInstance()->isLoaded( 'CheckUser' )
337  && class_exists( 'CheckUserHooks' )
338  ) {
339  $rc = $logEntry->getRecentChange( $logid );
340  CheckUserHooks::updateCheckUserData( $rc );
341  }
342  } else {
343  // If the log is unrestricted, publish normally to RC,
344  // which will also update checkuser
345  $logEntry->publish( $logid, "rc" );
346  }
347  }
348  }
349 }
Wikimedia\Rdbms\Database
Relational database abstraction object.
Definition: Database.php:48
$user
please add to it if you're going to add events to the MediaWiki code where normally authentication against an external auth plugin would be creating an account $user
Definition: hooks.txt:244
SpamBlacklist\logUrlChange
logUrlChange( $url, $action)
Queue log data about change for a url addition or removal.
Definition: SpamBlacklist.php:246
SpamBlacklist\getRegexStart
getRegexStart()
Returns the start of the regex for matches.
Definition: SpamBlacklist.php:302
ObjectCache\getLocalClusterInstance
static getLocalClusterInstance()
Get the main cluster-local cache object.
Definition: ObjectCache.php:365
BaseBlacklist\getBlacklists
getBlacklists()
Fetch local and (possibly cached) remote blacklists.
Definition: BaseBlacklist.php:219
SpamBlacklist
Definition: SpamBlacklist.php:10
captcha-old.count
count
Definition: captcha-old.py:249
wfSuppressWarnings
wfSuppressWarnings( $end=false)
Reference-counted warning suppression.
Definition: GlobalFunctions.php:1934
SpamBlacklist\getCurrentLinks
getCurrentLinks(Title $title)
Look up the links currently in the article, so we can ignore them on a second run.
Definition: SpamBlacklist.php:272
wfDebugLog
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
Definition: GlobalFunctions.php:1082
php
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
$dbr
$dbr
Definition: testCompression.php:50
ExtensionRegistry\getInstance
static getInstance()
Definition: ExtensionRegistry.php:88
SpamBlacklist\getRegexEnd
getRegexEnd( $batchSize)
Returns the end of the regex for matches.
Definition: SpamBlacklist.php:312
wfParseUrl
wfParseUrl( $url)
parse_url() work-alike, but non-broken.
Definition: GlobalFunctions.php:814
SpamBlacklist\logFilterHit
logFilterHit( $title, $url)
Logs the filter hit to Special:Log if $wgLogSpamBlacklistHits is enabled.
Definition: SpamBlacklist.php:322
$title
namespace and then decline to actually register it file or subcat img or subcat $title
Definition: hooks.txt:964
wfRestoreWarnings
wfRestoreWarnings()
Definition: GlobalFunctions.php:1942
wfGetDB
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
Definition: GlobalFunctions.php:2693
$matches
$matches
Definition: NoLocalSettings.php:24
BaseBlacklist
Base class for different kinds of blacklists.
Definition: BaseBlacklist.php:6
LogPage
Class to simplify the use of log pages.
Definition: LogPage.php:33
SpamBlacklist\warmCachesForFilter
warmCachesForFilter(Title $title, array $entries)
Definition: SpamBlacklist.php:293
use
as see the revision history and available at free of to any person obtaining a copy of this software and associated documentation to deal in the Software without including without limitation the rights to use
Definition: MIT-LICENSE.txt:10
SpamBlacklist\STASH_AGE_DYING
const STASH_AGE_DYING
Definition: SpamBlacklist.php:12
DB_REPLICA
const DB_REPLICA
Definition: defines.php:25
SpamBlacklist\$urlChangeLog
array[] $urlChangeLog
Changes to external links, for logging purposes.
Definition: SpamBlacklist.php:18
array
The wiki should then use memcached to cache various data To use multiple just add more items to the array To increase the weight of a make its entry a array("192.168.0.1:11211", 2))
$fname
if(defined( 'MW_SETUP_CALLBACK')) $fname
Customization point after all loading (constants, functions, classes, DefaultSettings,...
Definition: Setup.php:121
SpamBlacklist\logUrlChanges
logUrlChanges( $oldLinks, $newLinks, $addedLinks)
Diff added/removed urls and generate events for them.
Definition: SpamBlacklist.php:191
SpamBlacklist\filter
filter(array $links, Title $title=null, $preventLog=false, $mode='check')
Definition: SpamBlacklist.php:55
SpamBlacklist\isLoggingEnabled
isLoggingEnabled()
Definition: SpamBlacklist.php:179
Title
Represents a title within MediaWiki.
Definition: Title.php:39
$cache
$cache
Definition: mcc.php:33
ObjectCache\getMainWANInstance
static getMainWANInstance()
Get the main WAN cache object.
Definition: ObjectCache.php:378
SpamBlacklist\getBlacklistType
getBlacklistType()
Returns the code for the blacklist implementation.
Definition: SpamBlacklist.php:25
as
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users. It's targeted particularly at maintainers for Linux, since it's been observed that distribution packages of MediaWiki often break. We've consistently had to recommend that users seeking support use official tarballs instead of their distribution's, and this often solves whatever problem the user is having. It would be nice if this could such as
Definition: distributors.txt:9
ManualLogEntry
Class for creating new log entries and inserting them into the database.
Definition: LogEntry.php:437
SpamBlacklist\STASH_TTL
const STASH_TTL
Definition: SpamBlacklist.php:11
$wgRequest
if(! $wgDBerrorLogTZ) $wgRequest
Definition: Setup.php:747
MediaWikiServices
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency MediaWikiServices
Definition: injection.txt:23
User
The User object encapsulates all of the user-specific settings (user_id, name, rights,...
Definition: User.php:47
DeferredUpdates\addCallableUpdate
static addCallableUpdate( $callable, $stage=self::POSTSEND, $dbw=null)
Add a callable update.
Definition: DeferredUpdates.php:118
BaseBlacklist\getWhitelists
getWhitelists()
Returns the (local) whitelist.
Definition: BaseBlacklist.php:252
SpamBlacklist\antiSpoof
antiSpoof( $text)
Apply some basic anti-spoofing to the links before they get filtered, see.
Definition: SpamBlacklist.php:37
SpamBlacklist\doLogging
doLogging(User $user, Title $title, $revId)
Actually push the url change events post-save.
Definition: SpamBlacklist.php:213