3if ( !defined(
'MEDIAWIKI' ) ) {
7use \MediaWiki\MediaWikiServices;
37 $text = str_replace(
'.',
'.', $text );
54 function filter( array $links,
Title $title =
null, $preventLog =
false, $mode =
'check' ) {
55 $statsd = MediaWikiServices::getInstance()->getStatsdDataFactory();
56 $cache = ObjectCache::getLocalClusterInstance();
73 sha1( implode(
"\n", $links ) ),
77 $knownNonMatchAsOf =
$cache->get( $key );
78 if ( $mode ===
'check' ) {
79 if ( $knownNonMatchAsOf ) {
80 $statsd->increment(
'spamblacklist.check-stash.hit' );
84 $statsd->increment(
'spamblacklist.check-stash.miss' );
86 } elseif ( $mode ===
'stash' ) {
87 if ( $knownNonMatchAsOf && ( time() - $knownNonMatchAsOf ) < self::STASH_AGE_DYING ) {
95 if ( count( $blacklists ) ) {
97 $newLinks = array_map( [ $this,
'antiSpoof' ], $links );
100 if ( $title !==
null ) {
102 $addedLinks = array_diff( $newLinks, $oldLinks );
105 $addedLinks = $newLinks;
108 wfDebugLog(
'SpamBlacklist',
"Old URLs: " . implode(
', ', $oldLinks ) );
109 wfDebugLog(
'SpamBlacklist',
"New URLs: " . implode(
', ', $newLinks ) );
110 wfDebugLog(
'SpamBlacklist',
"Added URLs: " . implode(
', ', $addedLinks ) );
112 if ( !$preventLog ) {
116 $links = implode(
"\n", $addedLinks );
118 # Strip whitelisted URLs from the match
119 if ( is_array( $whitelists ) ) {
120 wfDebugLog(
'SpamBlacklist',
"Excluding whitelisted URLs from " . count( $whitelists ) .
121 " regexes: " . implode(
', ', $whitelists ) .
"\n" );
122 foreach ( $whitelists as $regex ) {
124 $newLinks = preg_replace( $regex,
'', $links );
126 if ( is_string( $newLinks ) ) {
134 wfDebugLog(
'SpamBlacklist',
"Checking text against " . count( $blacklists ) .
135 " regexes: " . implode(
', ', $blacklists ) .
"\n" );
137 foreach ( $blacklists as $regex ) {
140 $check = ( preg_match_all( $regex, $links,
$matches ) > 0 );
147 $fullLineRegex = substr( $regex, 0, strrpos( $regex,
'/' ) ) .
'.*/Sim';
148 preg_match_all( $fullLineRegex, $links, $fullUrls );
149 $imploded = implode(
' ', $fullUrls[0] );
150 wfDebugLog(
'SpamBlacklistHit',
"$ip caught submitting spam: $imploded\n" );
151 if ( !$preventLog ) {
154 if ( $retVal ===
false ) {
157 $retVal = array_merge( $retVal, $fullUrls[1] );
160 if ( is_array( $retVal ) ) {
161 $retVal = array_unique( $retVal );
167 if ( $retVal ===
false ) {
169 $cache->set( $key, time(), self::STASH_TTL );
170 if ( $mode ===
'stash' ) {
171 $statsd->increment(
'spamblacklist.check-stash.store' );
179 global $wgSpamBlacklistEventLogging;
180 return $wgSpamBlacklistEventLogging && class_exists(
'EventLogging' );
195 $removedLinks = array_diff( $oldLinks, $newLinks );
196 foreach ( $addedLinks as $url ) {
200 foreach ( $removedLinks as $url ) {
219 'pageId' => $title->getArticleID(),
220 'pageNamespace' => $title->getNamespace(),
221 'userId' => $user->getId(),
222 'userText' => $user->getName(),
226 $this->urlChangeLog = [];
228 DeferredUpdates::addCallableUpdate(
function () use ( $changes, $baseInfo ) {
229 foreach ( $changes as $change ) {
230 EventLogging::logEvent(
231 'ExternalLinksChange',
247 if ( !isset( $parsed[
'host'] ) ) {
248 wfDebugLog(
'SpamBlacklist',
"Unable to parse $url" );
253 'protocol' => $parsed[
'scheme'],
254 'domain' => $parsed[
'host'],
255 'path' => isset( $parsed[
'path'] ) ? $parsed[
'path'] :
'',
256 'query' => isset( $parsed[
'query'] ) ? $parsed[
'query'] :
'',
257 'fragment' => isset( $parsed[
'fragment'] ) ? $parsed[
'fragment'] :
'',
260 $this->urlChangeLog[] = $info;
272 $cache = ObjectCache::getMainWANInstance();
273 return $cache->getWithSetCallback(
275 $cache->makeKey(
'external-link-list', $title->getLatestRevID() ),
277 function ( $oldValue, &$ttl, array &$setOpts ) use ( $title ) {
279 $setOpts += Database::getCacheSetOptions(
$dbr );
281 return $dbr->selectFieldValues(
284 [
'el_from' => $title->getArticleID() ],
292 $this->
filter( $entries, $title,
true ,
'stash' );
301 return '/(?:https?:)?\/\/+[a-z0-9_\-.]*(';
311 return ')' . parent::getRegexEnd( $batchSize );
321 global
$wgUser, $wgLogSpamBlacklistHits;
322 if ( $wgLogSpamBlacklistHits ) {
324 $logEntry->setPerformer(
$wgUser );
325 $logEntry->setTarget( $title );
326 $logEntry->setParameters( [
329 $logid = $logEntry->insert();
330 $log =
new LogPage(
'spamblacklist' );
331 if ( $log->isRestricted() ) {
335 && class_exists(
'CheckUserHooks' )
337 $rc = $logEntry->getRecentChange( $logid );
338 CheckUserHooks::updateCheckUserData( $rc );
343 $logEntry->publish( $logid,
"rc" );
wfParseUrl( $url)
parse_url() work-alike, but non-broken.
wfGetDB( $db, $groups=[], $wiki=false)
Get a Database object.
wfSuppressWarnings( $end=false)
Reference-counted warning suppression.
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
if(! $wgDBerrorLogTZ) $wgRequest
Base class for different kinds of blacklists.
getBlacklists()
Fetch local and (possibly cached) remote blacklists.
getWhitelists()
Returns the (local) whitelist.
Class to simplify the use of log pages.
Class for creating log entries manually, to inject them into the database.
logUrlChange( $url, $action)
Queue log data about change for a url addition or removal.
getCurrentLinks(Title $title)
Look up the links currently in the article, so we can ignore them on a second run.
filter(array $links, Title $title=null, $preventLog=false, $mode='check')
doLogging(User $user, Title $title, $revId)
Actually push the url change events post-save.
getRegexStart()
Returns the start of the regex for matches.
warmCachesForFilter(Title $title, array $entries)
logFilterHit( $title, $url)
Logs the filter hit to Special:Log if $wgLogSpamBlacklistHits is enabled.
logUrlChanges( $oldLinks, $newLinks, $addedLinks)
Diff added/removed urls and generate events for them.
getBlacklistType()
Returns the code for the blacklist implementation.
antiSpoof( $text)
Apply some basic anti-spoofing to the links before they get filtered.
array[] $urlChangeLog
Changes to external links, for logging purposes.
getRegexEnd( $batchSize)
Returns the end of the regex for matches.
Represents a title within MediaWiki.
The User object encapsulates all of the user-specific settings (user_id, name, rights, etc.).
when a variable name is used in a function