MediaWiki REL1_34
SpamRegexBatch.php
Go to the documentation of this file.
1<?php
2
/**
 * Combine regex fragments into one or more complete regular expressions.
 *
 * Fragments are joined with '|' into batches; each finished batch is wrapped
 * between the blacklist's regex start and regex end strings.
 *
 * @param string[] $lines Regex fragments, one per entry
 * @param BaseBlacklist $blacklist Supplies the regex prefix and suffix
 * @param int $batchSize Soft limit on the combined byte length of the
 *  fragments placed in a single batch
 * @return string[] Complete regular expressions
 */
private static function buildRegexes( $lines, BaseBlacklist $blacklist, $batchSize = 4096 ) {
	# Note kept from the original author: it's faster using the S modifier
	# even though the regex will usually only be run once.
	$batches = [];
	$prefix = $blacklist->getRegexStart();
	$suffix = $blacklist->getRegexEnd( $batchSize );

	// Turns one '|'-joined batch of fragments into a full regex: any run of
	// backslashes directly in front of a slash is dropped, then every slash
	// is escaped with exactly one backslash.
	$wrap = static function ( $fragments ) use ( $prefix, $suffix ) {
		$normalised = preg_replace( '|\\\*/|u', '/', $fragments );
		return $prefix . str_replace( '/', '\/', $normalised ) . $suffix;
	};

	// false means "no batch started yet".
	$pending = false;
	foreach ( $lines as $line ) {
		if ( substr( $line, -1, 1 ) == "\\" ) {
			// A trailing backslash would silently break the batched regex by
			// escaping whatever follows it, so the line is left out here.
			// getBadLines() still reports such lines.
			continue;
		}

		if ( $pending === false ) {
			$pending = $line;
			continue;
		}

		// FIXME (original): not a very robust size check, but should work.
		if ( strlen( $pending ) + strlen( $line ) > $batchSize ) {
			// Batch is full: emit it and start the next one with this line.
			$batches[] = $wrap( $pending );
			$pending = $line;
			continue;
		}

		$pending .= '|' . $line;
	}

	// Emit whatever is left in the final, partially filled batch.
	if ( $pending !== false ) {
		$batches[] = $wrap( $pending );
	}

	return $batches;
}
54
/**
 * Check that every regex in the set compiles.
 *
 * An empty set is considered valid.
 *
 * @param string[] $regexes Complete regular expressions
 * @return bool False as soon as one regex fails, true otherwise
 */
private static function validateRegexes( $regexes ) {
	foreach ( $regexes as $pattern ) {
		// Run the pattern against an empty subject with warnings silenced;
		// preg_match() returns false when an error occurred.
		Wikimedia\suppressWarnings();
		// @phan-suppress-next-line PhanParamSuspiciousOrder False positive
		$result = preg_match( $pattern, '' );
		Wikimedia\restoreWarnings();

		if ( $result === false ) {
			return false;
		}
	}

	return true;
}
74
/**
 * Strip comments and surrounding whitespace, then drop empty entries.
 *
 * Original array keys are preserved for the entries that remain.
 *
 * @param string[] $lines Raw lines
 * @return string[] Cleaned lines
 */
private static function stripLines( $lines ) {
	// Everything from the first '#' to the end of the line is a comment.
	$withoutComments = preg_replace( '/#.*$/', '', $lines );
	$trimmed = array_map( 'trim', $withoutComments );

	// With no callback, array_filter() removes every falsy entry.
	return array_filter( $trimmed );
}
87
/**
 * Build batched regexes from raw blacklist lines, falling back to one regex
 * per line when a batch does not compile.
 *
 * Comments, surrounding whitespace and blank lines are removed first, so
 * they never end up inside a regex.
 *
 * @param string[] $lines Raw blacklist lines
 * @param BaseBlacklist $blacklist Supplies the regex prefix and suffix
 * @param string|bool $fileName Source name used in the debug log message;
 *  nothing is logged when this is falsy
 * @return string[] Complete regular expressions
 */
private static function buildSafeRegexes( $lines, BaseBlacklist $blacklist, $fileName = false ) {
	// Without this, '#' comments would be compiled into the regex and a
	// blank line would add an empty alternative to the batch.
	$lines = self::stripLines( $lines );

	$regexes = self::buildRegexes( $lines, $blacklist );
	if ( self::validateRegexes( $regexes ) ) {
		return $regexes;
	}

	// _Something_ broke... rebuild line-by-line; it'll be slower if there
	// are a lot of blacklist lines, but one broken line won't take out
	// hundreds of its neighbours.
	if ( $fileName ) {
		wfDebugLog( 'SpamBlacklist', "Spam blacklist warning: bogus line in $fileName\n" );
	}

	// A batch size of 0 makes buildRegexes() close the batch after every line.
	return self::buildRegexes( $lines, $blacklist, 0 );
}
111
/**
 * Return the blacklist lines that cannot be used as regex fragments.
 *
 * Comments, surrounding whitespace and blank lines are removed before
 * checking, so only real fragments are validated and reported.
 *
 * @param string[] $lines Raw blacklist lines
 * @param BaseBlacklist $blacklist Supplies the regex prefix and suffix
 * @return string[] Cleaned lines that end in a backslash or fail to compile
 */
public static function getBadLines( $lines, BaseBlacklist $blacklist ) {
	// Check the same cleaned-up fragments that would be compiled; this also
	// lets the trailing-backslash test see past trailing whitespace and
	// comments.
	$lines = self::stripLines( $lines );

	$badLines = [];
	foreach ( $lines as $line ) {
		if ( substr( $line, -1, 1 ) === "\\" ) {
			// Final \ will break silently on the batched regexes.
			$badLines[] = $line;
		}
	}

	$regexes = self::buildRegexes( $lines, $blacklist );
	if ( self::validateRegexes( $regexes ) ) {
		// No other problems!
		return $badLines;
	}

	// Something failed in the batch, so check the lines one by one.
	// buildRegexes() skips lines ending in a backslash, so those yield an
	// empty (valid) set here and are not reported a second time.
	foreach ( $lines as $line ) {
		$regexes = self::buildRegexes( [ $line ], $blacklist );
		if ( !self::validateRegexes( $regexes ) ) {
			$badLines[] = $line;
		}
	}

	return $badLines;
}
145
/**
 * Build a set of regular expressions from multiline input text.
 *
 * The text is split on "\n" and the resulting lines are handed to
 * buildSafeRegexes().
 *
 * @param string $source Blacklist text, one regex fragment per line
 * @param BaseBlacklist $blacklist Supplies the regex prefix and suffix
 * @param string|bool $fileName Source name passed on for debug logging
 * @return string[] Complete regular expressions
 */
public static function regexesFromText( $source, BaseBlacklist $blacklist, $fileName = false ) {
	return self::buildSafeRegexes( explode( "\n", $source ), $blacklist, $fileName );
}
159
/**
 * Build a set of regular expressions from a MediaWiki message.
 *
 * The message is looked up in the content language; a disabled message
 * yields an empty set.
 *
 * @param string $message Message key
 * @param BaseBlacklist $blacklist Supplies the regex prefix and suffix
 * @return string[] Complete regular expressions, or [] when the message is disabled
 */
public static function regexesFromMessage( $message, BaseBlacklist $blacklist ) {
	$msg = wfMessage( $message )->inContentLanguage();
	if ( $msg->isDisabled() ) {
		return [];
	}

	return self::regexesFromText( $msg->plain(), $blacklist );
}
176}
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
$line
Definition cdb.php:59
Base class for different kinds of blacklists.
getRegexStart()
Returns the start of the regex for matches.
getRegexEnd( $batchSize)
Returns the end of the regex for matches.
Utility class for working with blacklists.
static buildSafeRegexes( $lines, BaseBlacklist $blacklist, $fileName=false)
Do a sanity check on the batch regex.
static buildRegexes( $lines, BaseBlacklist $blacklist, $batchSize=4096)
Build a set of regular expressions matching URLs with the list of regex fragments.
static regexesFromMessage( $message, BaseBlacklist $blacklist)
Build a set of regular expressions from a MediaWiki message.
static regexesFromText( $source, BaseBlacklist $blacklist, $fileName=false)
Build a set of regular expressions from the given multiline input text, with empty lines and comments...
static stripLines( $lines)
Strip comments and whitespace, then remove blanks.
static validateRegexes( $regexes)
Confirm that a set of regexes is either empty or valid.
static getBadLines( $lines, BaseBlacklist $blacklist)
Returns an array of invalid lines.
$source
$lines
Definition router.php:61