Code Coverage

|                     | Lines            | Functions and Methods | CRAP  | Classes and Traits |
|---------------------|------------------|-----------------------|-------|--------------------|
| Total               | 74.56% (85 / 114) | 62.50% (5 / 8)       |       | 0.00% (0 / 1)      |
| SpamBlacklist       | 74.56% (85 / 114) | 62.50% (5 / 8)       | 44.82 | 0.00% (0 / 1)      |
| getBlacklistType    | 100.00% (1 / 1)   | 100.00% (1 / 1)      | 1     |                    |
| antiSpoof           | 100.00% (2 / 2)   | 100.00% (1 / 1)      | 1     |                    |
| filter              | 87.67% (64 / 73)  | 0.00% (0 / 1)        | 20.75 |                    |
| getCurrentLinks     | 100.00% (15 / 15) | 100.00% (1 / 1)      | 1     |                    |
| warmCachesForFilter | 0.00% (0 / 7)     | 0.00% (0 / 1)        | 2     |                    |
| getRegexStart       | 100.00% (1 / 1)   | 100.00% (1 / 1)      | 1     |                    |
| getRegexEnd         | 100.00% (1 / 1)   | 100.00% (1 / 1)      | 1     |                    |
| logFilterHit        | 7.14% (1 / 14)    | 0.00% (0 / 1)        | 16.81 |                    |
<?php

namespace MediaWiki\Extension\SpamBlacklist;

use LogPage;
use ManualLogEntry;
use MediaWiki\CheckUser\Hooks as CUHooks;
use MediaWiki\Context\RequestContext;
use MediaWiki\ExternalLinks\ExternalLinksLookup;
use MediaWiki\MediaWikiServices;
use MediaWiki\Registration\ExtensionRegistry;
use MediaWiki\Title\Title;
use MediaWiki\User\User;
use Wikimedia\AtEase\AtEase;
use Wikimedia\Rdbms\Database;

class SpamBlacklist extends BaseBlacklist {
	/** Seconds for which a negative ("no match") result from edit stashing is kept in cache */
	private const STASH_TTL = 180;

	/** Age (in seconds) past which a stashed negative result is treated as about to expire */
	private const STASH_AGE_DYING = 150;

	/**
	 * Returns the code for the blacklist implementation
	 *
	 * @return string
	 */
	protected function getBlacklistType() {
		return 'spam';
	}

	/**
	 * Apply some basic anti-spoofing to the links before they get filtered,
	 * see @bug 12896
	 *
	 * @param string $text
	 *
	 * @return string
	 */
	protected function antiSpoof( $text ) {
		// Replace fullwidth dots (U+FF0E) with plain "." so lookalike characters
		// cannot be used to dodge the blacklist regexes.
		$text = str_replace( '．', '.', $text );
		return $text;
	}

	/**
	 * @param string[] $links An array of links to check against the blacklist
	 * @param ?Title $title The title of the page to which the filter shall be applied.
	 *               This is used to load the old links already on the page, so
	 *               the filter is only applied to links that got added. If not given,
	 *               the filter is applied to all $links.
	 * @param User $user Relevant user
	 * @param bool $preventLog Whether to prevent logging of hits. Set to true when
	 *               the action is testing the links rather than attempting to save them
	 *               (e.g. the API spamblacklist action)
	 * @param string $mode Either 'check' or 'stash'
	 *
	 * @return string[]|bool Matched text(s) if the edit should not be allowed; false otherwise
	 */
	public function filter(
		array $links,
		?Title $title,
		User $user,
		$preventLog = false,
		$mode = 'check'
	) {
		$services = MediaWikiServices::getInstance();
		$statsd = $services->getStatsdDataFactory();
		$cache = $services->getObjectCacheFactory()->getLocalClusterInstance();

		if ( !$links ) {
			return false;
		}

		sort( $links );
		$key = $cache->makeKey(
			'blacklist',
			$this->getBlacklistType(),
			'pass',
			sha1( implode( "\n", $links ) ),
			md5( (string)$title )
		);
		// Skip blacklist checks if nothing matched during edit stashing...
		$knownNonMatchAsOf = $cache->get( $key );
		if ( $mode === 'check' ) {
			if ( $knownNonMatchAsOf ) {
				$statsd->increment( 'spamblacklist.check-stash.hit' );

				return false;
			} else {
				$statsd->increment( 'spamblacklist.check-stash.miss' );
			}
		} elseif ( $mode === 'stash' ) {
			if ( $knownNonMatchAsOf && ( time() - $knownNonMatchAsOf ) < self::STASH_AGE_DYING ) {
				// OK; not about to expire soon
				return false;
			}
		}

		$blacklists = $this->getBlacklists();
		$whitelists = $this->getWhitelists();

		if ( count( $blacklists ) ) {
			// poor man's anti-spoof, see bug 12896
			$newLinks = array_map( [ $this, 'antiSpoof' ], $links );

			$oldLinks = [];
			if ( $title !== null ) {
				$oldLinks = $this->getCurrentLinks( $title );
				$addedLinks = array_diff( $newLinks, $oldLinks );
			} else {
				// can't load old links, so treat all links as added.
				$addedLinks = $newLinks;
			}

			wfDebugLog( 'SpamBlacklist', "Old URLs: " . implode( ', ', $oldLinks ) );
			wfDebugLog( 'SpamBlacklist', "New URLs: " . implode( ', ', $newLinks ) );
			wfDebugLog( 'SpamBlacklist', "Added URLs: " . implode( ', ', $addedLinks ) );

			$links = implode( "\n", $addedLinks );

			# Strip whitelisted URLs from the match
			if ( is_array( $whitelists ) ) {
				wfDebugLog( 'SpamBlacklist', "Excluding whitelisted URLs from " . count( $whitelists ) .
					" regexes: " . implode( ', ', $whitelists ) . "\n" );
				foreach ( $whitelists as $regex ) {
					AtEase::suppressWarnings();
					$newLinks = preg_replace( $regex, '', $links );
					AtEase::restoreWarnings();
					if ( is_string( $newLinks ) ) {
						// If there wasn't a regex error, strip the matching URLs
						$links = $newLinks;
					}
				}
			}

			# Do the match
			wfDebugLog( 'SpamBlacklist', "Checking text against " . count( $blacklists ) .
				" regexes: " . implode( ', ', $blacklists ) . "\n" );
			$retVal = false;
			foreach ( $blacklists as $regex ) {
				AtEase::suppressWarnings();
				$matches = [];
				$check = ( preg_match_all( $regex, $links, $matches ) > 0 );
				AtEase::restoreWarnings();
				if ( $check ) {
					wfDebugLog( 'SpamBlacklist', "Match!\n" );
					$ip = RequestContext::getMain()->getRequest()->getIP();
					$fullUrls = [];
					// Re-build the regex so it also captures the rest of the matched
					// URL (up to end of line), for more useful debug/log output.
					$fullLineRegex = substr( $regex, 0, strrpos( $regex, '/' ) ) . '.*/Sim';
					preg_match_all( $fullLineRegex, $links, $fullUrls );
					$imploded = implode( ' ', $fullUrls[0] );
					wfDebugLog( 'SpamBlacklistHit', "$ip caught submitting spam: $imploded\n" );
					if ( !$preventLog && $title ) {
						$this->logFilterHit( $user, $title, $imploded );
					}
					if ( $retVal === false ) {
						$retVal = [];
					}
					$retVal = array_merge( $retVal, $fullUrls[1] );
				}
			}
			if ( is_array( $retVal ) ) {
				$retVal = array_unique( $retVal );
			}
		} else {
			$retVal = false;
		}

		if ( $retVal === false ) {
			// Cache the typical negative results
			$cache->set( $key, time(), self::STASH_TTL );
			if ( $mode === 'stash' ) {
				$statsd->increment( 'spamblacklist.check-stash.store' );
			}
		}

		return $retVal;
	}

	/**
	 * Look up the links currently in the article, so we can
	 * ignore them on a second run.
	 *
	 * WARNING: adding more copies of a link that is already on the page is
	 * not detected here.
	 * @param Title $title
	 * @return array
	 */
	public function getCurrentLinks( Title $title ) {
		$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
		$fname = __METHOD__;
		return $cache->getWithSetCallback(
			// Key is warmed via warmCachesForFilter() from ApiStashEdit
			$cache->makeKey( 'external-link-list', $title->getLatestRevID() ),
			$cache::TTL_MINUTE,
			static function ( $oldValue, &$ttl, array &$setOpts ) use ( $title, $fname ) {
				$dbr = MediaWikiServices::getInstance()->getConnectionProvider()->getReplicaDatabase();
				$setOpts += Database::getCacheSetOptions( $dbr );
				return ExternalLinksLookup::getExternalLinksForPage(
					$title->getArticleID(),
					$dbr,
					$fname
				);
			}
		);
	}

	/**
	 * Pre-run the filter in 'stash' mode (without logging) so that a later
	 * 'check' call for the same links and title can skip the regex work.
	 *
	 * @param Title $title
	 * @param array $entries
	 * @param User $user
	 */
	public function warmCachesForFilter( Title $title, array $entries, User $user ) {
		$this->filter(
			$entries,
			$title,
			$user,
			// no logging
			true,
			'stash'
		);
	}

	/**
	 * Returns the start of the regex for matches
	 *
	 * @return string
	 */
	public function getRegexStart() {
		return '/(?:https?:)?\/\/+[a-z0-9_\-.]*(';
	}

	/**
	 * Returns the end of the regex for matches
	 *
	 * @param int $batchSize
	 * @return string
	 */
	public function getRegexEnd( $batchSize ) {
		return ')' . parent::getRegexEnd( $batchSize );
	}

	/**
	 * Logs the filter hit to Special:Log if
	 * $wgLogSpamBlacklistHits is enabled.
	 *
	 * @param User $user
	 * @param Title $title
	 * @param string $url URL that the user attempted to add
	 */
	public function logFilterHit( User $user, $title, $url ) {
		global $wgLogSpamBlacklistHits;
		if ( $wgLogSpamBlacklistHits ) {
			$logEntry = new ManualLogEntry( 'spamblacklist', 'hit' );
			$logEntry->setPerformer( $user );
			$logEntry->setTarget( $title );
			$logEntry->setParameters( [
				'4::url' => $url,
			] );
			$logid = $logEntry->insert();
			$log = new LogPage( 'spamblacklist' );
			if ( $log->isRestricted() ) {
				// Make sure checkusers can see this action if the log is restricted
				// (which is the default)
				if ( ExtensionRegistry::getInstance()->isLoaded( 'CheckUser' ) ) {
					$rc = $logEntry->getRecentChange( $logid );
					CUHooks::updateCheckUserData( $rc );
				}
			} else {
				// If the log is unrestricted, publish normally to RC,
				// which will also update checkuser
				$logEntry->publish( $logid, "rc" );
			}
		}
	}
}
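To make the URL-prefix pattern returned by getRegexStart() concrete, here is a standalone sketch that runs outside MediaWiki. The closing delimiter and flags ('/Si') are an assumption, since they come from BaseBlacklist::getRegexEnd(), which is not part of this file, and 'example\.com\/buy' stands in for a blacklist entry.

<?php
// Standalone illustration of how a blacklist entry matches once it has been
// wrapped in getRegexStart() ... getRegexEnd(). The trailing '/Si' is assumed.
$regex = '/(?:https?:)?\/\/+[a-z0-9_\-.]*(example\.com\/buy)/Si';

// filter() joins the added links with newlines before matching.
$links = implode( "\n", [
	'https://example.com/buy/pills',   // matches
	'//cdn.example.com/buy/pills',     // protocol-relative URL, still matches
	'https://example.org/harmless',    // no match
] );

preg_match_all( $regex, $links, $matches );
// $matches[1] holds the captured fragments, the same capture group that
// filter() merges into its return value on a hit.
var_dump( $matches[1] ); // [ 'example.com/buy', 'example.com/buy' ]

The '(?:https?:)?\/\/+' prefix is what lets the filter catch protocol-relative '//host/...' links as well as plain http(s) URLs.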