MediaWiki REL1_34
SpamBlacklist.php
Go to the documentation of this file.
1<?php
2
3use \MediaWiki\MediaWikiServices;
5
/**
 * Expiry passed to the cache when filter() stores a "nothing matched" result.
 * Presumably in seconds — TODO confirm against the cache backend's set() contract.
 */
7 const STASH_TTL = 180;
/**
 * Age threshold, in seconds, used by filter() in 'stash' mode: a cached pass
 * younger than this is trusted as-is, an older one is re-checked so that the
 * cache entry gets refreshed.
 */
8 const STASH_AGE_DYING = 150;
9
/**
 * Report the code identifying this blacklist implementation.
 *
 * filter() uses this value as one component of its cache key.
 *
 * @return string Always 'spam'
 */
protected function getBlacklistType() {
	$type = 'spam';

	return $type;
}
18
/**
 * Apply some basic anti-spoofing to a link before it is filtered
 * (see bug 12896).
 *
 * Currently this folds the fullwidth full stop (U+FF0E) into an ASCII dot,
 * so that a host name written with the lookalike character is still seen
 * by the blacklist regexes.
 *
 * @param string $text Link text to normalise
 * @return string The normalised link text
 */
protected function antiSpoof( $text ) {
	// UTF-8 encoding of U+FF0E FULLWIDTH FULL STOP. Spelled as a byte escape
	// so the search string cannot be silently normalised to a plain "."
	// (which would turn this replacement into a no-op).
	$fullwidthDot = "\xEF\xBC\x8E";

	return str_replace( $fullwidthDot, '.', $text );
}
31
/**
 * Check a set of links against the blacklist regexes.
 *
 * A "nothing matched" outcome is remembered in the local cluster cache under
 * a key derived from the sorted links and the title, so that a later call
 * with the same input can return early.
 *
 * @param string[] $links Links to check
 * @param Title|null $title Page the links belong to; when given, links already
 *   returned by getCurrentLinks() are ignored. When null, every link counts as added.
 * @param bool $preventLog When true, matches are not passed to logFilterHit()
 * @param string $mode 'check' returns early on any cached pass; 'stash' returns
 *   early only while the cached pass is younger than STASH_AGE_DYING
 * @return string[]|bool False if nothing matched (or the input was empty),
 *   otherwise the de-duplicated first-capture-group matches
 */
public function filter( array $links, Title $title = null, $preventLog = false, $mode = 'check' ) {
	global $wgRequest;

	$stats = MediaWikiServices::getInstance()->getStatsdDataFactory();
	$stash = ObjectCache::getLocalClusterInstance();

	if ( !$links ) {
		return false;
	}

	sort( $links );
	$linksHash = sha1( implode( "\n", $links ) );
	$titleHash = md5( (string)$title );
	$stashKey = $stash->makeKey( 'blacklist', $this->getBlacklistType(), 'pass', $linksHash, $titleHash );

	// Skip blacklist checks if nothing matched during edit stashing...
	$passedAt = $stash->get( $stashKey );
	if ( $mode === 'check' ) {
		if ( !$passedAt ) {
			$stats->increment( 'spamblacklist.check-stash.miss' );
		} else {
			$stats->increment( 'spamblacklist.check-stash.hit' );

			return false;
		}
	} elseif ( $mode === 'stash' && $passedAt && ( time() - $passedAt ) < self::STASH_AGE_DYING ) {
		// OK; not about to expire soon
		return false;
	}

	$blacklists = $this->getBlacklists();
	$whitelists = $this->getWhitelists();

	$retVal = false;
	if ( count( $blacklists ) ) {
		// poor man's anti-spoof, see bug 12896
		$cleanLinks = array_map( [ $this, 'antiSpoof' ], $links );

		if ( $title === null ) {
			// can't load old links, so treat all links as added.
			$oldLinks = [];
			$addedLinks = $cleanLinks;
		} else {
			$oldLinks = $this->getCurrentLinks( $title );
			$addedLinks = array_diff( $cleanLinks, $oldLinks );
		}

		wfDebugLog( 'SpamBlacklist', "Old URLs: " . implode( ', ', $oldLinks ) );
		wfDebugLog( 'SpamBlacklist', "New URLs: " . implode( ', ', $cleanLinks ) );
		wfDebugLog( 'SpamBlacklist', "Added URLs: " . implode( ', ', $addedLinks ) );

		// From here on the candidate links are one newline-separated string
		$haystack = implode( "\n", $addedLinks );

		# Strip whitelisted URLs from the match
		if ( is_array( $whitelists ) ) {
			wfDebugLog( 'SpamBlacklist', "Excluding whitelisted URLs from " . count( $whitelists ) .
				" regexes: " . implode( ', ', $whitelists ) . "\n" );
			foreach ( $whitelists as $whiteRegex ) {
				Wikimedia\suppressWarnings();
				$stripped = preg_replace( $whiteRegex, '', $haystack );
				Wikimedia\restoreWarnings();
				// Only accept the result if there wasn't a regex error
				if ( is_string( $stripped ) ) {
					$haystack = $stripped;
				}
			}
		}

		# Do the match
		wfDebugLog( 'SpamBlacklist', "Checking text against " . count( $blacklists ) .
			" regexes: " . implode( ', ', $blacklists ) . "\n" );
		foreach ( $blacklists as $blackRegex ) {
			Wikimedia\suppressWarnings();
			$matches = [];
			$matchCount = preg_match_all( $blackRegex, $haystack, $matches );
			Wikimedia\restoreWarnings();
			if ( !( $matchCount > 0 ) ) {
				continue;
			}

			wfDebugLog( 'SpamBlacklist', "Match!\n" );
			$ip = $wgRequest->getIP();

			// Re-run the regex with ".*" appended so the whole offending line is captured
			$fullUrls = [];
			$regexHead = substr( $blackRegex, 0, strrpos( $blackRegex, '/' ) );
			$fullLineRegex = $regexHead . '.*/Sim';
			preg_match_all( $fullLineRegex, $haystack, $fullUrls );
			$imploded = implode( ' ', $fullUrls[0] );
			wfDebugLog( 'SpamBlacklistHit', "$ip caught submitting spam: $imploded\n" );
			if ( !$preventLog ) {
				// Log it
				$this->logFilterHit( $title, $imploded );
			}

			if ( $retVal === false ) {
				$retVal = [];
			}
			$retVal = array_merge( $retVal, $fullUrls[1] );
		}

		if ( is_array( $retVal ) ) {
			$retVal = array_unique( $retVal );
		}
	}

	if ( $retVal === false ) {
		// Cache the typical negative results
		$stash->set( $stashKey, time(), self::STASH_TTL );
		if ( $mode === 'stash' ) {
			$stats->increment( 'spamblacklist.check-stash.store' );
		}
	}

	return $retVal;
}
158
/**
 * Look up the links currently in the article, so we can ignore them on a second run.
 *
 * The result is served from the main WAN object cache, keyed on the title's
 * latest revision ID and kept for one minute; on a cache miss the
 * externallinks table is read from a replica database.
 *
 * @param Title $title Page whose stored external links are wanted
 * @return array The el_to values stored for the page (presumably URL strings —
 *   TODO confirm against the externallinks schema)
 */
public function getCurrentLinks( Title $title ) {
	$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
	// Captured here because __METHOD__ inside the closure would name the closure
	$fname = __METHOD__;
	return $cache->getWithSetCallback(
		// Key is warmed via warmCachesForFilter() from ApiStashEdit
		$cache->makeKey( 'external-link-list', $title->getLatestRevID() ),
		$cache::TTL_MINUTE,
		function ( $oldValue, &$ttl, array &$setOpts ) use ( $title, $fname ) {
			// The replica handle must be obtained here: it is needed both for
			// the cache set options and for the query itself.
			$dbr = wfGetDB( DB_REPLICA );
			$setOpts += Database::getCacheSetOptions( $dbr );

			return $dbr->selectFieldValues(
				'externallinks',
				'el_to',
				[ 'el_from' => $title->getArticleID() ], // should be zero queries
				$fname
			);
		}
	);
}
187
/**
 * Run filter() in 'stash' mode so its cached results are populated ahead of
 * the real check. Hits found during this run are not logged, and the return
 * value of filter() is discarded.
 *
 * @param Title $title Page the links belong to
 * @param string[] $entries Links to pre-check
 */
public function warmCachesForFilter( Title $title, array $entries ) {
	// no logging
	$preventLog = true;
	$mode = 'stash';

	$this->filter( $entries, $title, $preventLog, $mode );
}
191
/**
 * Returns the start of the regex for matches.
 *
 * The fragment opens the pattern with a "/" delimiter, accepts an optional
 * http: or https: scheme, two or more slashes and any run of host-name
 * characters, then opens a capture group that getRegexEnd() closes.
 *
 * @return string
 */
public function getRegexStart() {
	$prefix = '/(?:https?:)?\/\/+[a-z0-9_\-.]*(';

	return $prefix;
}
200
/**
 * Returns the end of the regex for matches.
 *
 * Closes the capture group opened by getRegexStart() and then appends
 * whatever ending the parent class supplies for the given batch size.
 *
 * @param int $batchSize Passed through unchanged to the parent implementation
 * @return string
 */
public function getRegexEnd( $batchSize ) {
	$inheritedEnd = parent::getRegexEnd( $batchSize );

	return ')' . $inheritedEnd;
}
210
/**
 * Logs the filter hit to Special:Log if $wgLogSpamBlacklistHits is enabled.
 *
 * The entry is always inserted into the 'spamblacklist' log. If that log is
 * unrestricted it is also published to recent changes; if it is restricted,
 * the hit is instead handed to CheckUser directly, provided that extension
 * is loaded.
 *
 * @param Title $title Target of the log entry
 * @param string $url Stored as the entry's '4::url' parameter
 */
public function logFilterHit( $title, $url ) {
	global $wgUser, $wgLogSpamBlacklistHits;

	if ( !$wgLogSpamBlacklistHits ) {
		return;
	}

	$entry = new ManualLogEntry( 'spamblacklist', 'hit' );
	$entry->setPerformer( $wgUser );
	$entry->setTarget( $title );
	$entry->setParameters( [ '4::url' => $url ] );
	$entryId = $entry->insert();

	$logPage = new LogPage( 'spamblacklist' );
	if ( !$logPage->isRestricted() ) {
		// If the log is unrestricted, publish normally to RC,
		// which will also update checkuser
		$entry->publish( $entryId, "rc" );

		return;
	}

	// Make sure checkusers can see this action if the log is restricted
	// (which is the default)
	$checkUserAvailable = ExtensionRegistry::getInstance()->isLoaded( 'CheckUser' )
		&& class_exists( CheckUserHooks::class );
	if ( $checkUserAvailable ) {
		$rc = $entry->getRecentChange( $entryId );
		CheckUserHooks::updateCheckUserData( $rc );
	}
}
245}
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
if(! $wgDBerrorLogTZ) $wgRequest
Definition Setup.php:751
Base class for different kinds of blacklists.
getBlacklists()
Fetch local and (possibly cached) remote blacklists.
getWhitelists()
Returns the (local) whitelist.
Class to simplify the use of log pages.
Definition LogPage.php:33
Class for creating new log entries and inserting them into the database.
getCurrentLinks(Title $title)
Look up the links currently in the article, so we can ignore them on a second run.
filter(array $links, Title $title=null, $preventLog=false, $mode='check')
const STASH_AGE_DYING
getRegexStart()
Returns the start of the regex for matches.
warmCachesForFilter(Title $title, array $entries)
logFilterHit( $title, $url)
Logs the filter hit to Special:Log if $wgLogSpamBlacklistHits is enabled.
getBlacklistType()
Returns the code for the blacklist implementation.
antiSpoof( $text)
Apply some basic anti-spoofing to the links before they get filtered, see bug 12896.
getRegexEnd( $batchSize)
Returns the end of the regex for matches.
Represents a title within MediaWiki.
Definition Title.php:42
Relational database abstraction object.
Definition Database.php:49
$cache
Definition mcc.php:33
const DB_REPLICA
Definition defines.php:25