MediaWiki  1.34.0
SpamBlacklist.php
Go to the documentation of this file.
1 <?php
2 
3 use \MediaWiki\MediaWikiServices;
5 
/**
 * Blacklist implementation that checks the external links of an edit
 * against the configured spam blacklist regexes.
 */
class SpamBlacklist extends BaseBlacklist {
	/** Expiry passed to the cache when storing a "nothing matched" result (presumably seconds). */
	const STASH_TTL = 180;
	/** In 'stash' mode, a cached result at least this many seconds old is re-checked. */
	const STASH_AGE_DYING = 150;

	/**
	 * Returns the code for the blacklist implementation.
	 *
	 * @return string Always 'spam'
	 */
	protected function getBlacklistType() {
		return 'spam';
	}

	/**
	 * Apply some basic anti-spoofing to the links before they get filtered
	 * (see bug 12896).
	 *
	 * Currently this only folds the fullwidth full stop (U+FF0E) into an
	 * ASCII '.', so that host names written with it are matched like their
	 * plain spelling.
	 *
	 * @param string $text Link text to normalise
	 * @return string Normalised link text
	 */
	protected function antiSpoof( $text ) {
		// "\xEF\xBC\x8E" is the UTF-8 encoding of U+FF0E FULLWIDTH FULL STOP.
		// Written as a byte escape so the character cannot be silently
		// normalised to a plain '.' (which would turn this into a no-op).
		return str_replace( "\xEF\xBC\x8E", '.', $text );
	}

	/**
	 * Check a list of links against the blacklist regexes.
	 *
	 * A "nothing matched" outcome is cached per (links, title) pair. In
	 * 'check' mode a cached outcome short-circuits the check; in 'stash'
	 * mode it does so only while it is younger than STASH_AGE_DYING,
	 * otherwise the check is re-run and the cache entry refreshed.
	 *
	 * @param string[] $links Links to check
	 * @param Title|null $title Page the links belong to. When given, links
	 *   returned by getCurrentLinks() for it are ignored; when null, every
	 *   link is treated as newly added.
	 * @param bool $preventLog When true, matches are not passed to logFilterHit()
	 * @param string $mode 'check' or 'stash'
	 * @return string[]|bool The unique first-capture-group matches of every
	 *   blacklist regex that hit, or false when nothing matched
	 */
	public function filter( array $links, Title $title = null, $preventLog = false, $mode = 'check' ) {
		$statsd = MediaWikiServices::getInstance()->getStatsdDataFactory();
		// The cache handle must exist before any key is built below.
		$cache = ObjectCache::getLocalClusterInstance();

		if ( !$links ) {
			return false;
		}

		// Sort so the cache key does not depend on the order of the links.
		sort( $links );
		$key = $cache->makeKey(
			'blacklist',
			$this->getBlacklistType(),
			'pass',
			sha1( implode( "\n", $links ) ),
			md5( (string)$title )
		);
		// Skip blacklist checks if nothing matched during edit stashing...
		$knownNonMatchAsOf = $cache->get( $key );
		if ( $mode === 'check' ) {
			if ( $knownNonMatchAsOf ) {
				$statsd->increment( 'spamblacklist.check-stash.hit' );

				return false;
			}
			$statsd->increment( 'spamblacklist.check-stash.miss' );
		} elseif ( $mode === 'stash'
			&& $knownNonMatchAsOf
			&& ( time() - $knownNonMatchAsOf ) < self::STASH_AGE_DYING
		) {
			return false; // OK; not about to expire soon
		}

		$blacklists = $this->getBlacklists();
		$whitelists = $this->getWhitelists();

		$retVal = false;
		if ( count( $blacklists ) ) {
			// poor man's anti-spoof, see bug 12896
			$newLinks = array_map( [ $this, 'antiSpoof' ], $links );

			$oldLinks = [];
			if ( $title !== null ) {
				$oldLinks = $this->getCurrentLinks( $title );
				$addedLinks = array_diff( $newLinks, $oldLinks );
			} else {
				// can't load old links, so treat all links as added.
				$addedLinks = $newLinks;
			}

			wfDebugLog( 'SpamBlacklist', "Old URLs: " . implode( ', ', $oldLinks ) );
			wfDebugLog( 'SpamBlacklist', "New URLs: " . implode( ', ', $newLinks ) );
			wfDebugLog( 'SpamBlacklist', "Added URLs: " . implode( ', ', $addedLinks ) );

			// One link per line; the regexes below run against this text.
			$linkText = implode( "\n", $addedLinks );

			# Strip whitelisted URLs from the match
			if ( is_array( $whitelists ) ) {
				wfDebugLog( 'SpamBlacklist', "Excluding whitelisted URLs from " . count( $whitelists ) .
					" regexes: " . implode( ', ', $whitelists ) . "\n" );
				foreach ( $whitelists as $regex ) {
					Wikimedia\suppressWarnings();
					$stripped = preg_replace( $regex, '', $linkText );
					Wikimedia\restoreWarnings();
					if ( is_string( $stripped ) ) {
						// If there wasn't a regex error, strip the matching URLs
						$linkText = $stripped;
					}
				}
			}

			# Do the match
			wfDebugLog( 'SpamBlacklist', "Checking text against " . count( $blacklists ) .
				" regexes: " . implode( ', ', $blacklists ) . "\n" );
			foreach ( $blacklists as $regex ) {
				Wikimedia\suppressWarnings();
				$check = ( preg_match_all( $regex, $linkText ) > 0 );
				Wikimedia\restoreWarnings();
				if ( !$check ) {
					continue;
				}

				wfDebugLog( 'SpamBlacklist', "Match!\n" );
				global $wgRequest;
				$ip = $wgRequest->getIP();
				$fullUrls = [];
				// Re-run the regex extended with '.*' so that the whole
				// offending line is captured, not just the blacklisted part.
				$fullLineRegex = substr( $regex, 0, strrpos( $regex, '/' ) ) . '.*/Sim';
				preg_match_all( $fullLineRegex, $linkText, $fullUrls );
				$imploded = implode( ' ', $fullUrls[0] );
				wfDebugLog( 'SpamBlacklistHit', "$ip caught submitting spam: $imploded\n" );
				if ( !$preventLog ) {
					$this->logFilterHit( $title, $imploded ); // Log it
				}
				if ( $retVal === false ) {
					$retVal = [];
				}
				$retVal = array_merge( $retVal, $fullUrls[1] );
			}
			if ( is_array( $retVal ) ) {
				$retVal = array_unique( $retVal );
			}
		}

		if ( $retVal === false ) {
			// Cache the typical negative results
			$cache->set( $key, time(), self::STASH_TTL );
			if ( $mode === 'stash' ) {
				$statsd->increment( 'spamblacklist.check-stash.store' );
			}
		}

		return $retVal;
	}

	/**
	 * Look up the links currently in the article, so we can ignore them
	 * on a second run.
	 *
	 * The list is read from the 'externallinks' table and cached, keyed by
	 * the latest revision ID of the title.
	 *
	 * @param Title $title Page whose stored external links are wanted
	 * @return array Values of the el_to column for the page
	 */
	public function getCurrentLinks( Title $title ) {
		$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
		$fname = __METHOD__;
		return $cache->getWithSetCallback(
			// Key is warmed via warmCachesForFilter() from ApiStashEdit
			$cache->makeKey( 'external-link-list', $title->getLatestRevID() ),
			$cache::TTL_MINUTE,
			function ( $oldValue, &$ttl, array &$setOpts ) use ( $title, $fname ) {
				$dbr = wfGetDB( DB_REPLICA );
				// Fully qualified so the call does not depend on a
				// file-level `use` statement being present.
				$setOpts += \Wikimedia\Rdbms\Database::getCacheSetOptions( $dbr );

				return $dbr->selectFieldValues(
					'externallinks',
					'el_to',
					[ 'el_from' => $title->getArticleID() ], // should be zero queries
					$fname
				);
			}
		);
	}

	/**
	 * Run filter() in 'stash' mode with logging disabled, so that its
	 * caches are populated ahead of the real check.
	 *
	 * @param Title $title Page the links belong to
	 * @param string[] $entries Links to pre-check
	 */
	public function warmCachesForFilter( Title $title, array $entries ) {
		$this->filter( $entries, $title, true /* no logging */, 'stash' );
	}

	/**
	 * Returns the start of the regex for matches.
	 *
	 * The returned fragment ends by opening a capture group, which
	 * getRegexEnd() closes.
	 *
	 * @return string
	 */
	public function getRegexStart() {
		return '/(?:https?:)?\/\/+[a-z0-9_\-.]*(';
	}

	/**
	 * Returns the end of the regex for matches.
	 *
	 * Closes the capture group opened by getRegexStart() and appends the
	 * parent implementation's regex ending.
	 *
	 * @param int $batchSize Forwarded unchanged to the parent implementation
	 * @return string
	 */
	public function getRegexEnd( $batchSize ) {
		return ')' . parent::getRegexEnd( $batchSize );
	}

	/**
	 * Logs the filter hit to Special:Log if $wgLogSpamBlacklistHits is enabled.
	 *
	 * NOTE(review): filter() accepts a null title and forwards it here;
	 * confirm that ManualLogEntry::setTarget() tolerates null.
	 *
	 * @param Title $title Page the hit occurred on; used as the log target
	 * @param string $url Matched URL text, stored as the '4::url' log parameter
	 */
	public function logFilterHit( $title, $url ) {
		global $wgUser, $wgLogSpamBlacklistHits;
		if ( !$wgLogSpamBlacklistHits ) {
			return;
		}

		$logEntry = new ManualLogEntry( 'spamblacklist', 'hit' );
		$logEntry->setPerformer( $wgUser );
		$logEntry->setTarget( $title );
		$logEntry->setParameters( [
			'4::url' => $url,
		] );
		$logid = $logEntry->insert();

		$log = new LogPage( 'spamblacklist' );
		if ( !$log->isRestricted() ) {
			// If the log is unrestricted, publish normally to RC,
			// which will also update checkuser
			$logEntry->publish( $logid, "rc" );
			return;
		}

		// Make sure checkusers can see this action if the log is restricted
		// (which is the default)
		if ( ExtensionRegistry::getInstance()->isLoaded( 'CheckUser' )
			&& class_exists( CheckUserHooks::class )
		) {
			$rc = $logEntry->getRecentChange( $logid );
			CheckUserHooks::updateCheckUserData( $rc );
		}
	}
}
Wikimedia\Rdbms\Database
Relational database abstraction object.
Definition: Database.php:49
SpamBlacklist\getRegexStart
getRegexStart()
Returns the start of the regex for matches.
Definition: SpamBlacklist.php:197
ObjectCache\getLocalClusterInstance
static getLocalClusterInstance()
Get the main cluster-local cache object.
Definition: ObjectCache.php:342
BaseBlacklist\getBlacklists
getBlacklists()
Fetch local and (possibly cached) remote blacklists.
Definition: BaseBlacklist.php:223
SpamBlacklist
Definition: SpamBlacklist.php:6
SpamBlacklist\getCurrentLinks
getCurrentLinks(Title $title)
Look up the links currently in the article, so we can ignore them on a second run.
Definition: SpamBlacklist.php:167
wfDebugLog
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
Definition: GlobalFunctions.php:1007
$dbr
$dbr
Definition: testCompression.php:50
ExtensionRegistry\getInstance
static getInstance()
Definition: ExtensionRegistry.php:106
SpamBlacklist\getRegexEnd
getRegexEnd( $batchSize)
Returns the end of the regex for matches.
Definition: SpamBlacklist.php:207
SpamBlacklist\logFilterHit
logFilterHit( $title, $url)
Logs the filter hit to Special:Log if $wgLogSpamBlacklistHits is enabled.
Definition: SpamBlacklist.php:218
wfGetDB
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
Definition: GlobalFunctions.php:2575
$matches
$matches
Definition: NoLocalSettings.php:24
BaseBlacklist
Base class for different kinds of blacklists.
Definition: BaseBlacklist.php:9
LogPage
Class to simplify the use of log pages.
Definition: LogPage.php:33
SpamBlacklist\warmCachesForFilter
warmCachesForFilter(Title $title, array $entries)
Definition: SpamBlacklist.php:188
SpamBlacklist\STASH_AGE_DYING
const STASH_AGE_DYING
Definition: SpamBlacklist.php:8
$title
$title
Definition: testCompression.php:34
DB_REPLICA
const DB_REPLICA
Definition: defines.php:25
SpamBlacklist\filter
filter(array $links, Title $title=null, $preventLog=false, $mode='check')
Definition: SpamBlacklist.php:45
Title
Represents a title within MediaWiki.
Definition: Title.php:42
$cache
$cache
Definition: mcc.php:33
SpamBlacklist\getBlacklistType
getBlacklistType()
Returns the code for the blacklist implementation.
Definition: SpamBlacklist.php:15
ManualLogEntry
Class for creating new log entries and inserting them into the database.
Definition: ManualLogEntry.php:37
SpamBlacklist\STASH_TTL
const STASH_TTL
Definition: SpamBlacklist.php:7
$wgRequest
if(! $wgDBerrorLogTZ) $wgRequest
Definition: Setup.php:752
BaseBlacklist\getWhitelists
getWhitelists()
Returns the (local) whitelist.
Definition: BaseBlacklist.php:257
SpamBlacklist\antiSpoof
antiSpoof( $text)
Apply some basic anti-spoofing to the links before they get filtered, see bug 12896.
Definition: SpamBlacklist.php:27