MediaWiki REL1_30
SpamBlacklist_body.php
Go to the documentation of this file.
1<?php
2
3if ( !defined( 'MEDIAWIKI' ) ) {
4 exit;
5}
6
7use \MediaWiki\MediaWikiServices;
8
// Expiry value passed to $cache->set() when filter() caches a "nothing matched" result.
10 const STASH_TTL = 180;
// In 'stash' mode, filter() only trusts a cached "nothing matched" result whose age
// ( time() minus the stored timestamp, i.e. seconds ) is below this value.
11 const STASH_AGE_DYING = 150;
12
// Changes to external links, for logging purposes: logUrlChange() appends one record
// per URL and doLogging() drains the queue.
17 private $urlChangeLog = [];
18
/**
 * Returns the code for the blacklist implementation.
 *
 * @return string Always the literal 'spam'
 */
protected function getBlacklistType() {
	$type = 'spam';

	return $type;
}
27
/**
 * Apply some basic anti-spoofing to a link before it gets filtered (see bug 12896).
 *
 * The only normalisation performed is folding U+FF0E FULLWIDTH FULL STOP into an
 * ASCII '.', so that a host name written with the look-alike character is seen by
 * the (ASCII-only) blacklist regexes in its plain form.
 *
 * @param string $text The link text to normalise
 * @return string The normalised text
 */
protected function antiSpoof( $text ) {
	// "\xEF\xBC\x8E" is the UTF-8 encoding of U+FF0E. It is spelled as byte escapes on
	// purpose: written literally, the character is easily normalised to a plain '.'
	// by editors or tooling, which silently turns this call into a no-op.
	return str_replace( "\xEF\xBC\x8E", '.', $text );
}
40
/**
 * Check a set of external links against the blacklist.
 *
 * Only links that are not already on the page (per getCurrentLinks()) are matched.
 * Whitelisted URLs are stripped before matching. A negative result is cached so a
 * later call with the same links and title can skip the regex work.
 *
 * @param string[] $links Links to check
 * @param Title|null $title Page the links belong to. When null, the current links
 *   cannot be loaded and every link is treated as newly added.
 * @param bool $preventLog When true, neither URL-change events nor filter hits are
 *   logged for the blacklist check itself
 * @param string $mode 'check' to trust any cached negative result, or 'stash' to
 *   refresh a cached negative result that is about to expire
 * @return string[]|bool The matched texts (first capture group of each hit,
 *   de-duplicated) if something matched; false otherwise
 */
function filter( array $links, Title $title = null, $preventLog = false, $mode = 'check' ) {
	$statsd = MediaWikiServices::getInstance()->getStatsdDataFactory();
	$cache = ObjectCache::getLocalClusterInstance();

	if ( !$links ) {
		// If there are no new links, and we are logging,
		// mark all of the current links as being removed.
		// Without a title there is no page to load current links from, and
		// getCurrentLinks() requires a Title, so there is nothing to log.
		if ( $title !== null && $this->isLoggingEnabled() ) {
			$this->logUrlChanges( $this->getCurrentLinks( $title ), [], [] );
		}

		return false;
	}

	// Sort so that the cache key does not depend on the order of the links
	sort( $links );
	$key = $cache->makeKey(
		'blacklist',
		$this->getBlacklistType(),
		'pass',
		sha1( implode( "\n", $links ) ),
		(string)$title
	);
	// Skip blacklist checks if nothing matched during edit stashing...
	$knownNonMatchAsOf = $cache->get( $key );
	if ( $mode === 'check' ) {
		if ( $knownNonMatchAsOf ) {
			$statsd->increment( 'spamblacklist.check-stash.hit' );

			return false;
		} else {
			$statsd->increment( 'spamblacklist.check-stash.miss' );
		}
	} elseif ( $mode === 'stash' ) {
		if ( $knownNonMatchAsOf && ( time() - $knownNonMatchAsOf ) < self::STASH_AGE_DYING ) {
			return false; // OK; not about to expire soon
		}
	}

	$blacklists = $this->getBlacklists();
	$whitelists = $this->getWhitelists();

	if ( count( $blacklists ) ) {
		// poor man's anti-spoof, see bug 12896
		$newLinks = array_map( [ $this, 'antiSpoof' ], $links );

		$oldLinks = [];
		if ( $title !== null ) {
			$oldLinks = $this->getCurrentLinks( $title );
			$addedLinks = array_diff( $newLinks, $oldLinks );
		} else {
			// can't load old links, so treat all links as added.
			$addedLinks = $newLinks;
		}

		wfDebugLog( 'SpamBlacklist', "Old URLs: " . implode( ', ', $oldLinks ) );
		wfDebugLog( 'SpamBlacklist', "New URLs: " . implode( ', ', $newLinks ) );
		wfDebugLog( 'SpamBlacklist', "Added URLs: " . implode( ', ', $addedLinks ) );

		if ( !$preventLog ) {
			$this->logUrlChanges( $oldLinks, $newLinks, $addedLinks );
		}

		// One added link per line; this is the text the regexes run against
		$linkText = implode( "\n", $addedLinks );

		# Strip whitelisted URLs from the match
		if ( is_array( $whitelists ) ) {
			wfDebugLog( 'SpamBlacklist', "Excluding whitelisted URLs from " . count( $whitelists ) .
				" regexes: " . implode( ', ', $whitelists ) . "\n" );
			foreach ( $whitelists as $regex ) {
				// A broken regex makes preg_replace() emit a warning and return
				// a non-string; silence the warning and skip that regex.
				wfSuppressWarnings();
				$stripped = preg_replace( $regex, '', $linkText );
				wfRestoreWarnings();
				if ( is_string( $stripped ) ) {
					// If there wasn't a regex error, strip the matching URLs
					$linkText = $stripped;
				}
			}
		}

		# Do the match
		wfDebugLog( 'SpamBlacklist', "Checking text against " . count( $blacklists ) .
			" regexes: " . implode( ', ', $blacklists ) . "\n" );
		$retVal = false;
		foreach ( $blacklists as $regex ) {
			$matches = [];
			// As above: a broken regex must not surface as a PHP warning.
			// preg_match_all() then returns false, which counts as "no match".
			wfSuppressWarnings();
			$check = ( preg_match_all( $regex, $linkText, $matches ) > 0 );
			wfRestoreWarnings();
			if ( $check ) {
				wfDebugLog( 'SpamBlacklist', "Match!\n" );
				global $wgRequest;
				$ip = $wgRequest->getIP();
				$fullUrls = [];
				// Swap the regex's trailing delimiter and modifiers for '.*/Sim' so
				// the match extends to the end of the line the hit was found on
				$fullLineRegex = substr( $regex, 0, strrpos( $regex, '/' ) ) . '.*/Sim';
				preg_match_all( $fullLineRegex, $linkText, $fullUrls );
				$imploded = implode( ' ', $fullUrls[0] );
				wfDebugLog( 'SpamBlacklistHit', "$ip caught submitting spam: $imploded\n" );
				if ( !$preventLog ) {
					$this->logFilterHit( $title, $imploded ); // Log it
				}
				if ( $retVal === false ) {
					$retVal = [];
				}
				$retVal = array_merge( $retVal, $fullUrls[1] );
			}
		}
		if ( is_array( $retVal ) ) {
			$retVal = array_unique( $retVal );
		}
	} else {
		$retVal = false;
	}

	if ( $retVal === false ) {
		// Cache the typical negative results
		$cache->set( $key, time(), self::STASH_TTL );
		if ( $mode === 'stash' ) {
			$statsd->increment( 'spamblacklist.check-stash.store' );
		}
	}

	return $retVal;
}
177
/**
 * Tell whether URL-change event logging is active.
 *
 * @return bool True only when $wgSpamBlacklistEventLogging is truthy and a class
 *   named EventLogging exists
 */
public function isLoggingEnabled() {
	global $wgSpamBlacklistEventLogging;

	if ( !$wgSpamBlacklistEventLogging ) {
		return false;
	}

	return class_exists( 'EventLogging' );
}
182
/**
 * Diff added/removed urls and generate events for them.
 *
 * Does nothing when logging is disabled. Otherwise every URL in $addedLinks is
 * queued as an 'insert', then every URL in $oldLinks that is absent from
 * $newLinks is queued as a 'remove'.
 *
 * @param string[] $oldLinks Links on the page before the change
 * @param string[] $newLinks Links on the page after the change
 * @param string[] $addedLinks Links the change introduces
 */
public function logUrlChanges( $oldLinks, $newLinks, $addedLinks ) {
	if ( $this->isLoggingEnabled() ) {
		$urlsByAction = [
			'insert' => $addedLinks,
			'remove' => array_diff( $oldLinks, $newLinks ),
		];

		foreach ( $urlsByAction as $action => $urls ) {
			foreach ( $urls as $url ) {
				$this->logUrlChange( $url, $action );
			}
		}
	}
}
204
/**
 * Actually push the url change events post-save.
 *
 * The queued changes are handed to a deferred update and the queue is emptied
 * right away, so calling this again does not send the same events twice.
 *
 * @param User $user Supplies the userId and userText fields
 * @param Title $title Supplies the pageId and pageNamespace fields
 * @param int $revId Supplies the revId field
 */
public function doLogging( User $user, Title $title, $revId ) {
	if ( $this->isLoggingEnabled() ) {
		// Fields shared by every event emitted for this save
		$common = [
			'revId' => $revId,
			'pageId' => $title->getArticleID(),
			'pageNamespace' => $title->getNamespace(),
			'userId' => $user->getId(),
			'userText' => $user->getName(),
		];

		// Take a snapshot of the queue, then empty it in case this function
		// gets called more than once
		$queued = $this->urlChangeLog;
		$this->urlChangeLog = [];

		$sendEvents = function () use ( $queued, $common ) {
			foreach ( $queued as $entry ) {
				EventLogging::logEvent( 'ExternalLinksChange', 15716074, $common + $entry );
			}
		};
		DeferredUpdates::addCallableUpdate( $sendEvents );
	}
}
238
/**
 * Queue log data about change for a url addition or removal.
 *
 * URLs for which wfParseUrl() yields no host are only noted in the debug log.
 *
 * @param string $url The URL that was added or removed
 * @param string $action The action to record; callers in this file pass 'insert' or 'remove'
 */
private function logUrlChange( $url, $action ) {
	$bits = wfParseUrl( $url );
	if ( !isset( $bits['host'] ) ) {
		wfDebugLog( 'SpamBlacklist', "Unable to parse $url" );
		return;
	}

	$record = [
		'action' => $action,
		'protocol' => $bits['scheme'],
		'domain' => $bits['host'],
	];
	// Optional URL components are recorded as empty strings when absent
	foreach ( [ 'path', 'query', 'fragment' ] as $part ) {
		$record[$part] = isset( $bits[$part] ) ? $bits[$part] : '';
	}

	$this->urlChangeLog[] = $record;
}
262
/**
 * Look up the links currently in the article, so we can ignore them on a second run.
 *
 * The result is served from the main WAN cache, keyed on the title's latest
 * revision ID, and is loaded from the externallinks table on a cache miss.
 *
 * @param Title $title The page whose external links are wanted
 * @return array The el_to values stored for the page
 */
function getCurrentLinks( Title $title ) {
	$wanCache = ObjectCache::getMainWANInstance();

	// Key is warmed via warmCachesForFilter() from ApiStashEdit
	$cacheKey = $wanCache->makeKey( 'external-link-list', $title->getLatestRevID() );

	$loadFromDb = function ( $oldValue, &$ttl, array &$setOpts ) use ( $title ) {
		$db = wfGetDB( DB_SLAVE );
		$setOpts += Database::getCacheSetOptions( $db );

		// getArticleID() should be zero queries
		$conds = [ 'el_from' => $title->getArticleID() ];

		return $db->selectFieldValues( 'externallinks', 'el_to', $conds, __METHOD__ );
	};

	return $wanCache->getWithSetCallback( $cacheKey, $wanCache::TTL_MINUTE, $loadFromDb );
}
290
/**
 * Run filter() in 'stash' mode, with logging suppressed, for the given links.
 * The return value of filter() is discarded.
 *
 * @param Title $title Page the links belong to
 * @param string[] $entries Links to run through the filter
 */
public function warmCachesForFilter( Title $title, array $entries ) {
	$preventLog = true; // no logging
	$mode = 'stash';

	$this->filter( $entries, $title, $preventLog, $mode );
}
294
/**
 * Returns the start of the regex for matches.
 *
 * The fragment opens the '/' delimiter, accepts an optional "http:" or "https:"
 * scheme, two or more slashes and any run of the characters a-z, 0-9, '_', '-'
 * and '.', then opens a capture group that getRegexEnd() closes.
 *
 * @return string
 */
public function getRegexStart() {
	$start = '/(?:https?:)?\/\/+[a-z0-9_\-.]*(';

	return $start;
}
303
/**
 * Returns the end of the regex for matches.
 *
 * Closes the capture group opened by getRegexStart() and appends whatever the
 * parent class supplies as the regex end.
 *
 * @param int $batchSize Passed through unchanged to the parent implementation
 * @return string
 */
public function getRegexEnd( $batchSize ) {
	$parentEnd = parent::getRegexEnd( $batchSize );

	return ')' . $parentEnd;
}
/**
 * Logs the filter hit to Special:Log if $wgLogSpamBlacklistHits is enabled.
 *
 * A log entry needs a target page. filter() accepts a null title and passes it
 * straight through to this method, so a missing title is reported to the debug
 * log and otherwise ignored rather than handed to ManualLogEntry::setTarget().
 *
 * @param Title|null $title Page the spam was submitted to
 * @param string $url The matched URL text to store as the log parameter
 */
public function logFilterHit( $title, $url ) {
	global $wgUser, $wgLogSpamBlacklistHits;

	if ( !$wgLogSpamBlacklistHits ) {
		return;
	}

	if ( !( $title instanceof Title ) ) {
		wfDebugLog( 'SpamBlacklist', "Not logging filter hit, no target title: $url" );
		return;
	}

	$logEntry = new ManualLogEntry( 'spamblacklist', 'hit' );
	$logEntry->setPerformer( $wgUser );
	$logEntry->setTarget( $title );
	$logEntry->setParameters( [
		'4::url' => $url,
	] );
	$logid = $logEntry->insert();
	$log = new LogPage( 'spamblacklist' );
	if ( $log->isRestricted() ) {
		// Make sure checkusers can see this action if the log is restricted
		// (which is the default)
		if ( ExtensionRegistry::getInstance()->isLoaded( 'CheckUser' )
			&& class_exists( 'CheckUserHooks' )
		) {
			$rc = $logEntry->getRecentChange( $logid );
			CheckUserHooks::updateCheckUserData( $rc );
		}
	} else {
		// If the log is unrestricted, publish normally to RC,
		// which will also update checkuser
		$logEntry->publish( $logid, "rc" );
	}
}
347}
const DB_SLAVE
Definition Defines.php:37
wfParseUrl( $url)
parse_url() work-alike, but non-broken.
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
wfRestoreWarnings()
wfSuppressWarnings( $end=false)
Reference-counted warning suppression.
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
$wgUser
Definition Setup.php:817
if(! $wgDBerrorLogTZ) $wgRequest
Definition Setup.php:662
Base class for different kinds of blacklists.
getBlacklists()
Fetch local and (possibly cached) remote blacklists.
getWhitelists()
Returns the (local) whitelist.
Class to simplify the use of log pages.
Definition LogPage.php:31
Class for creating log entries manually, to inject them into the database.
Definition LogEntry.php:400
logUrlChange( $url, $action)
Queue log data about change for a url addition or removal.
getCurrentLinks(Title $title)
Look up the links currently in the article, so we can ignore them on a second run.
filter(array $links, Title $title=null, $preventLog=false, $mode='check')
doLogging(User $user, Title $title, $revId)
Actually push the url change events post-save.
getRegexStart()
Returns the start of the regex for matches.
warmCachesForFilter(Title $title, array $entries)
logFilterHit( $title, $url)
Logs the filter hit to Special:Log if $wgLogSpamBlacklistHits is enabled.
logUrlChanges( $oldLinks, $newLinks, $addedLinks)
Diff added/removed urls and generate events for them.
getBlacklistType()
Returns the code for the blacklist implementation.
antiSpoof( $text)
Apply some basic anti-spoofing to the links before they get filtered, see bug 12896.
array[] $urlChangeLog
Changes to external links, for logging purposes.
getRegexEnd( $batchSize)
Returns the end of the regex for matches.
Represents a title within MediaWiki.
Definition Title.php:39
The User object encapsulates all of the user-specific settings (user_id, name, rights,...
Definition User.php:51
if(! $regexes) $dbr
Definition cleanup.php:94
when a variable name is used in a function
Definition design.txt:94
$cache
Definition mcc.php:33