MediaWiki  1.34.0
SpamRegexBatch.php
Go to the documentation of this file.
1 <?php
2 
/**
 * Build a set of regular expressions from a list of regex fragments.
 *
 * Fragments are joined with "|" into batches, and each batch is wrapped
 * between the blacklist's regex start and end strings. Lines ending in a
 * backslash are skipped.
 *
 * @param string[] $lines Regex fragments, one per entry
 * @param BaseBlacklist $blacklist Supplies the regex start and end strings
 * @param int $batchSize Soft limit, in bytes, on the combined fragment
 *  length per batch (the "|" separators are not counted). With 0, every
 *  non-empty fragment ends up in a regex of its own.
 * @return string[] Batched regexes; empty if no usable line was given
 */
private static function buildRegexes( $lines, BaseBlacklist $blacklist, $batchSize = 4096 ) {
	$regexStart = $blacklist->getRegexStart();
	$regexEnd = $blacklist->getRegexEnd( $batchSize );

	// Turns one batch of "|"-joined fragments into a complete regex.
	$wrap = static function ( $fragments ) use ( $regexStart, $regexEnd ) {
		// Drop any backslashes in front of a slash, so that every slash
		// ends up escaped exactly once by the str_replace() below.
		$normalized = preg_replace( '|\\\*/|u', '/', $fragments );
		if ( $normalized === null ) {
			// PCRE error, e.g. the batch is not valid UTF-8. Passing null on
			// would silently produce a regex with an empty body. The pattern
			// is pure ASCII, so retry without the u modifier.
			$normalized = preg_replace( '|\\\*/|', '/', $fragments );
		}
		if ( $normalized === null ) {
			// Still failing: keep the fragments rather than emptying the batch.
			$normalized = $fragments;
		}
		return $regexStart . str_replace( '/', '\/', $normalized ) . $regexEnd;
	};

	$regexes = [];
	$build = false;
	foreach ( $lines as $line ) {
		if ( substr( $line, -1, 1 ) === "\\" ) {
			// Final \ will break silently on the batched regexes.
			// Skip it here to avoid breaking the next line;
			// warnings from getBadLines() will still trigger on
			// edit to keep new ones from floating in.
			continue;
		}
		// FIXME: not very robust size check, but should work. :)
		if ( $build === false ) {
			$build = $line;
		} elseif ( strlen( $build ) + strlen( $line ) > $batchSize ) {
			// Current batch is full: emit it and start a new one.
			$regexes[] = $wrap( $build );
			$build = $line;
		} else {
			$build .= '|' . $line;
		}
	}
	if ( $build !== false ) {
		// Emit the last, partially filled batch.
		$regexes[] = $wrap( $build );
	}
	return $regexes;
}
54 
/**
 * Confirm that a set of regexes is either empty or valid.
 *
 * @param string[] $regexes Complete patterns, delimiters included
 * @return bool False as soon as preg_match() reports a failure for one of
 *  the patterns, true otherwise
 */
private static function validateRegexes( $regexes ) {
	$allValid = true;
	foreach ( $regexes as $pattern ) {
		// Try the pattern against an empty subject with warnings silenced;
		// only the false return value is of interest here.
		Wikimedia\suppressWarnings();
		// @phan-suppress-next-line PhanParamSuspiciousOrder False positive
		$result = preg_match( $pattern, '' );
		Wikimedia\restoreWarnings();

		if ( $result === false ) {
			$allValid = false;
			break;
		}
	}
	return $allValid;
}
74 
/**
 * Strip comments and whitespace, then remove blanks.
 *
 * Everything from the first "#" to the end of a line is treated as a
 * comment and removed before trimming.
 *
 * @param string[] $lines
 * @return string[] The non-empty remainders, with the original array keys
 *  preserved
 */
private static function stripLines( $lines ) {
	$stripped = array_map( 'trim', preg_replace( '/#.*$/', '', $lines ) );
	// Compare against '' explicitly: array_filter() without a callback
	// would also throw away the line "0", which is not blank.
	return array_filter( $stripped, static function ( $line ) {
		return $line !== '';
	} );
}
87 
/**
 * Do a sanity check on the batch regex.
 *
 * The lines are stripped of comments and blanks, compiled in batches and
 * validated. If validation fails, they are rebuilt with a batch size of 0.
 *
 * @param string[] $lines Raw blacklist lines
 * @param BaseBlacklist $blacklist
 * @param string|bool $fileName Source name used in the debug log message;
 *  false (the default) to skip logging
 * @return string[] Regexes
 */
private static function buildSafeRegexes( $lines, BaseBlacklist $blacklist, $fileName = false ) {
	// Remove comments and blank lines before building anything: buildRegexes()
	// joins its input with "|", so a blank line would add an empty alternative
	// and comment text would be compiled as part of the pattern.
	$lines = self::stripLines( $lines );

	$regexes = self::buildRegexes( $lines, $blacklist );
	if ( self::validateRegexes( $regexes ) ) {
		return $regexes;
	}

	// _Something_ broke... rebuild line-by-line; it'll be
	// slower if there's a lot of blacklist lines, but one
	// broken line won't take out hundreds of its brothers.
	if ( $fileName ) {
		wfDebugLog( 'SpamBlacklist', "Spam blacklist warning: bogus line in $fileName\n" );
	}
	return self::buildRegexes( $lines, $blacklist, 0 );
}
111 
/**
 * Returns an array of invalid lines.
 *
 * @param string[] $lines Raw blacklist lines
 * @param BaseBlacklist $blacklist
 * @return string[] Lines that end in a backslash, followed by lines whose
 *  regex fails validation; empty if nothing is wrong
 */
public static function getBadLines( $lines, BaseBlacklist $blacklist ) {
	// Only check what would actually be compiled: without this, comments
	// and blank lines would be validated as if they were regex fragments.
	$lines = self::stripLines( $lines );

	$badLines = [];
	foreach ( $lines as $line ) {
		if ( substr( $line, -1, 1 ) === "\\" ) {
			// Final \ will break silently on the batched regexes.
			// buildRegexes() skips these lines, so report them here.
			$badLines[] = $line;
		}
	}

	$regexes = self::buildRegexes( $lines, $blacklist );
	if ( self::validateRegexes( $regexes ) ) {
		// No other problems!
		return $badLines;
	}

	// Something failed in the batch, so check them one by one.
	foreach ( $lines as $line ) {
		$regexes = self::buildRegexes( [ $line ], $blacklist );
		if ( !self::validateRegexes( $regexes ) ) {
			$badLines[] = $line;
		}
	}
	return $badLines;
}
145 
/**
 * Build a set of regular expressions from multiline input text.
 *
 * The text is split on "\n" and the resulting lines are handed to
 * buildSafeRegexes().
 *
 * @param string $source Blacklist text, one entry per line
 * @param BaseBlacklist $blacklist
 * @param string|bool $fileName Passed through to buildSafeRegexes()
 * @return string[] Regexes
 */
public static function regexesFromText( $source, BaseBlacklist $blacklist, $fileName = false ) {
	return self::buildSafeRegexes( explode( "\n", $source ), $blacklist, $fileName );
}
159 
/**
 * Build a set of regular expressions from a MediaWiki message.
 *
 * @param string $message Message key, looked up in the content language
 * @param BaseBlacklist $blacklist
 * @return string[] Regexes; an empty array if the message is disabled
 */
public static function regexesFromMessage( $message, BaseBlacklist $blacklist ) {
	$msg = wfMessage( $message )->inContentLanguage();
	if ( $msg->isDisabled() ) {
		// Nothing to build from.
		return [];
	}
	return self::regexesFromText( $msg->plain(), $blacklist );
}
176 }
BaseBlacklist\getRegexEnd
getRegexEnd( $batchSize)
Returns the end of the regex for matches.
Definition: BaseBlacklist.php:424
SpamRegexBatch\stripLines
static stripLines( $lines)
Strip comments and whitespace, then remove blanks.
Definition: SpamRegexBatch.php:81
wfMessage
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Definition: GlobalFunctions.php:1264
SpamRegexBatch\getBadLines
static getBadLines( $lines, BaseBlacklist $blacklist)
Returns an array of invalid lines.
Definition: SpamRegexBatch.php:119
SpamRegexBatch\regexesFromText
static regexesFromText( $source, BaseBlacklist $blacklist, $fileName=false)
Build a set of regular expressions from the given multiline input text, with empty lines and comments stripped.
Definition: SpamRegexBatch.php:155
wfDebugLog
wfDebugLog( $logGroup, $text, $dest='all', array $context=[])
Send a line to a supplementary debug log file, if configured, or main debug log if not.
Definition: GlobalFunctions.php:1007
SpamRegexBatch\validateRegexes
static validateRegexes( $regexes)
Confirm that a set of regexes is either empty or valid.
Definition: SpamRegexBatch.php:61
BaseBlacklist
Base class for different kinds of blacklists.
Definition: BaseBlacklist.php:9
$lines
$lines
Definition: router.php:61
SpamRegexBatch\regexesFromMessage
static regexesFromMessage( $message, BaseBlacklist $blacklist)
Build a set of regular expressions from a MediaWiki message.
Definition: SpamRegexBatch.php:168
SpamRegexBatch
Utility class for working with blacklists.
Definition: SpamRegexBatch.php:6
$line
$line
Definition: cdb.php:59
SpamRegexBatch\buildSafeRegexes
static buildSafeRegexes( $lines, BaseBlacklist $blacklist, $fileName=false)
Do a sanity check on the batch regex.
Definition: SpamRegexBatch.php:96
$source
$source
Definition: mwdoc-filter.php:34
SpamRegexBatch\buildRegexes
static buildRegexes( $lines, BaseBlacklist $blacklist, $batchSize=4096)
Build a set of regular expressions matching URLs with the list of regex fragments.
Definition: SpamRegexBatch.php:17
BaseBlacklist\getRegexStart
getRegexStart()
Returns the start of the regex for matches.
Definition: BaseBlacklist.php:414