Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 60 |
|
0.00% |
0 / 7 |
CRAP | |
0.00% |
0 / 1 |
SpamRegexBatch | |
0.00% |
0 / 60 |
|
0.00% |
0 / 7 |
506 | |
0.00% |
0 / 1 |
buildRegexes | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
42 | |||
validateRegexes | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
12 | |||
stripLines | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
buildSafeRegexes | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
12 | |||
getBadLines | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
42 | |||
regexesFromText | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
regexesFromMessage | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\SpamBlacklist; |
4 | |
5 | use Wikimedia\AtEase\AtEase; |
6 | |
7 | /** |
8 | * Utility class for working with blacklists |
9 | */ |
class SpamRegexBatch {
	/**
	 * Build a set of regular expressions matching URLs with the list of regex fragments.
	 * Returns an empty list if the input list is empty.
	 *
	 * Fragments are joined with "|" and wrapped between the blacklist's regex
	 * start and end. A new batch is started whenever appending the next
	 * fragment would push the combined byte length past $batchSize.
	 *
	 * @param string[] $lines list of fragments which will match in URLs
	 * @param BaseBlacklist $blacklist supplies the regex prefix and suffix
	 * @param int $batchSize largest allowed batch regex;
	 *                       if 0, will produce one regex per line
	 * @return string[]
	 */
	private static function buildRegexes( array $lines, BaseBlacklist $blacklist, $batchSize = 4096 ) {
		$regexStart = $blacklist->getRegexStart();
		$regexEnd = $blacklist->getRegexEnd( $batchSize );

		$regexes = [];
		// Fragments collected for the batch currently being built; null while empty.
		$pending = null;
		foreach ( $lines as $line ) {
			if ( self::endsWithBackslash( $line ) ) {
				// Final \ will break silently on the batched regexes.
				// Skip it here to avoid breaking the next line;
				// warnings from getBadLines() will still trigger on
				// edit to keep new ones from floating in.
				continue;
			}
			// FIXME: not very robust size check (the "|" about to be added and
			// the start/end wrappers are not counted), but should work.
			if ( $pending === null ) {
				$pending = $line;
			} elseif ( strlen( $pending ) + strlen( $line ) > $batchSize ) {
				$regexes[] = self::assembleRegex( $pending, $regexStart, $regexEnd );
				$pending = $line;
			} else {
				$pending .= '|' . $line;
			}
		}
		if ( $pending !== null ) {
			$regexes[] = self::assembleRegex( $pending, $regexStart, $regexEnd );
		}
		return $regexes;
	}

	/**
	 * Tell whether a fragment ends with a backslash.
	 *
	 * @param string $line
	 * @return bool
	 */
	private static function endsWithBackslash( $line ) {
		return substr( $line, -1, 1 ) === "\\";
	}

	/**
	 * Wrap one batch of "|"-joined fragments into a complete regex.
	 *
	 * Every slash, together with any backslashes directly in front of it, is
	 * rewritten to exactly "\/" so that it cannot collide with a "/" delimiter.
	 *
	 * @param string $body "|"-joined fragments
	 * @param string $regexStart text placed before the fragments
	 * @param string $regexEnd text placed after the fragments
	 * @return string
	 */
	private static function assembleRegex( $body, $regexStart, $regexEnd ) {
		$normalized = preg_replace( '|\\\*/|u', '/', $body );
		if ( $normalized === null ) {
			// In "u" mode preg_replace() returns null when the subject is not
			// valid UTF-8. Passing that null on would silently drop the whole
			// batch and leave a regex with no fragments in it. The pattern is
			// pure ASCII, so a byte-wise retry yields the same result on valid
			// input and still works on invalid input.
			$normalized = preg_replace( '|\\\*/|', '/', $body );
		}
		if ( $normalized === null ) {
			// Last resort: keep the fragments rather than discarding them.
			$normalized = $body;
		}
		return $regexStart . str_replace( '/', '\/', $normalized ) . $regexEnd;
	}

	/**
	 * Confirm that a set of regexes is either empty or valid.
	 *
	 * Each regex is run against an empty subject; preg_match() returning
	 * false marks it as invalid. Warnings are suppressed while checking.
	 *
	 * @param string[] $regexes set of regexes
	 * @return bool true if ok, false if contains invalid lines
	 */
	private static function validateRegexes( $regexes ) {
		AtEase::suppressWarnings();
		try {
			foreach ( $regexes as $regex ) {
				// @phan-suppress-next-line PhanParamSuspiciousOrder False positive
				if ( preg_match( $regex, '' ) === false ) {
					return false;
				}
			}
			return true;
		} finally {
			// Runs on every exit path, so suppression is always undone.
			AtEase::restoreWarnings();
		}
	}

	/**
	 * Strip comments and whitespace, then remove blanks
	 *
	 * Everything from a "#" to the end of the line is dropped, the rest is
	 * trimmed, and empty results are removed. Array keys are preserved.
	 *
	 * @param string[] $lines
	 * @return string[]
	 */
	private static function stripLines( array $lines ) {
		$withoutComments = preg_replace( '/#.*$/', '', $lines );
		// NOTE: array_filter() without a callback removes every falsy value,
		// so a line consisting solely of "0" is discarded as well.
		return array_filter( array_map( 'trim', $withoutComments ) );
	}

	/**
	 * Do a sanity check on the batch regex.
	 *
	 * If the batched regexes do not validate, the regexes are rebuilt with
	 * one regex per line and returned without further validation.
	 *
	 * @param string[] $lines unsanitized input lines
	 * @param BaseBlacklist $blacklist
	 * @param bool|string $fileName optional for debug reporting
	 * @return string[] of regexes
	 */
	private static function buildSafeRegexes( array $lines, BaseBlacklist $blacklist, $fileName = false ) {
		$lines = self::stripLines( $lines );
		$regexes = self::buildRegexes( $lines, $blacklist );
		if ( self::validateRegexes( $regexes ) ) {
			return $regexes;
		}

		// _Something_ broke... rebuild line-by-line; it'll be
		// slower if there's a lot of blacklist lines, but one
		// broken line won't take out hundreds of its brothers.
		if ( $fileName ) {
			wfDebugLog( 'SpamBlacklist', "Spam blacklist warning: bogus line in $fileName\n" );
		}
		return self::buildRegexes( $lines, $blacklist, 0 );
	}

	/**
	 * Returns an array of invalid lines
	 *
	 * Lines ending in a backslash are always reported. The remaining lines
	 * are only checked one by one when the batched regexes fail to validate.
	 *
	 * @param string[] $lines
	 * @param BaseBlacklist $blacklist
	 * @return string[] of input lines which produce invalid input, or empty array if no problems
	 */
	public static function getBadLines( $lines, BaseBlacklist $blacklist ) {
		$lines = self::stripLines( $lines );

		$badLines = [];
		foreach ( $lines as $line ) {
			if ( self::endsWithBackslash( $line ) ) {
				// Final \ will break silently on the batched regexes.
				$badLines[] = $line;
			}
		}

		if ( self::validateRegexes( self::buildRegexes( $lines, $blacklist ) ) ) {
			// No other problems!
			return $badLines;
		}

		// Something failed in the batch, so check them one by one.
		// Lines ending in a backslash yield no regex here, so they are not
		// reported a second time.
		foreach ( $lines as $line ) {
			if ( !self::validateRegexes( self::buildRegexes( [ $line ], $blacklist ) ) ) {
				$badLines[] = $line;
			}
		}
		return $badLines;
	}

	/**
	 * Build a set of regular expressions from the given multiline input text,
	 * with empty lines and comments stripped.
	 *
	 * @param string $source text with one fragment per "\n"-separated line
	 * @param BaseBlacklist $blacklist
	 * @param bool|string $fileName optional, for reporting of bad files
	 * @return string[] of regular expressions, potentially empty
	 */
	public static function regexesFromText( $source, BaseBlacklist $blacklist, $fileName = false ) {
		return self::buildSafeRegexes( explode( "\n", $source ), $blacklist, $fileName );
	}

	/**
	 * Build a set of regular expressions from a MediaWiki message.
	 * Will be correctly empty if the message isn't present.
	 *
	 * The message is read in the content language and used as plain text;
	 * a disabled message yields an empty list.
	 *
	 * @param string $message message key
	 * @param BaseBlacklist $blacklist
	 * @return string[] of regular expressions, potentially empty
	 */
	public static function regexesFromMessage( $message, BaseBlacklist $blacklist ) {
		$source = wfMessage( $message )->inContentLanguage();
		if ( $source->isDisabled() ) {
			return [];
		}
		return self::regexesFromText( $source->plain(), $blacklist );
	}
}