View Javadoc
1   package org.wikimedia.search.extra.regex;
2   
3   import java.io.IOException;
4   import java.util.Locale;
5   import java.util.Objects;
6   import java.util.stream.StreamSupport;
7   
8   import javax.annotation.Nullable;
9   
10  import org.apache.lucene.analysis.Analyzer;
11  import org.apache.lucene.index.IndexReader;
12  import org.apache.lucene.search.Query;
13  import org.apache.lucene.search.TwoPhaseIterator;
14  import org.apache.lucene.util.automaton.Automaton;
15  import org.apache.lucene.util.automaton.CharacterRunAutomaton;
16  import org.apache.lucene.util.automaton.RegExp;
17  import org.elasticsearch.common.lucene.search.Queries;
18  import org.wikimedia.search.extra.regex.SourceRegexQueryBuilder.Settings;
19  import org.wikimedia.search.extra.regex.expression.Expression;
20  import org.wikimedia.search.extra.regex.expression.ExpressionRewriter;
21  import org.wikimedia.search.extra.regex.ngram.AutomatonTooComplexException;
22  import org.wikimedia.search.extra.regex.ngram.NGramExtractor;
23  import org.wikimedia.search.extra.util.FieldValues;
24  
25  import com.google.common.annotations.VisibleForTesting;
26  
27  import lombok.AccessLevel;
28  import lombok.EqualsAndHashCode;
29  import lombok.Getter;
30  
31  @EqualsAndHashCode(callSuper = false)
32  @VisibleForTesting
33  @Getter(AccessLevel.PACKAGE)
34  public class SourceRegexQuery extends Query {
35      private final String fieldPath;
36      @Nullable private final String ngramFieldPath;
37      private final String regex;
38      private final FieldValues.Loader loader;
39      private final Settings settings;
40      private final int gramSize;
41      private final Rechecker rechecker;
42      @Nullable private final Analyzer ngramAnalyzer;
43  
44      public SourceRegexQuery(String fieldPath, @Nullable String ngramFieldPath, String regex,
45                              FieldValues.Loader loader, Settings settings, int gramSize,
46                              @Nullable Analyzer ngramAnalyzer) {
47          this.fieldPath = fieldPath;
48          this.ngramFieldPath = ngramFieldPath;
49          this.regex = Objects.requireNonNull(regex);
50          if (regex.isEmpty()) {
51              throw new IllegalArgumentException("regex must be set");
52          }
53          this.loader = loader;
54          this.settings = settings;
55          this.gramSize = gramSize;
56          if (!settings.caseSensitive()
57                  && !settings.locale().getLanguage().equals("ga")
58                  && !settings.locale().getLanguage().equals("tr")) {
59              rechecker = new NonBacktrackingOnTheFlyCaseConvertingRechecker(regex, settings);
60          } else {
61              rechecker = new NonBacktrackingRechecker(regex, settings);
62          }
63          this.ngramAnalyzer = ngramAnalyzer;
64      }
65  
66      @Override
67      @SuppressWarnings("CyclomaticComplexity")
68      public Query rewrite(IndexReader reader) throws IOException {
69          // TODO: investigate moving this logic inside the Builder
70          // Rewrite the query as an AcceleratedSourceRegexQuery or UnacceleratedSourceRegexQuery
71          if (ngramFieldPath == null) {
72              assert ngramAnalyzer == null;
73              // Don't bother expanding the regex if there isn't a field to check
74              // it against. Its unlikely to resolve to all false anyway.
75              if (settings.rejectUnaccelerated()) {
76                  throw new UnableToAccelerateRegexException(regex, gramSize, null);
77              }
78              return new UnacceleratedSourceRegexQuery(rechecker, fieldPath, loader, settings);
79          }
80          assert ngramAnalyzer != null;
81          try {
82              // The accelerating filter is always assumed to be case
83              // insensitive/always lowercased
84              Automaton automaton = regexToAutomaton(
85                      new RegExp(regex.toLowerCase(settings.locale()), RegExp.ALL ^ RegExp.AUTOMATON),
86                      settings.maxDeterminizedStates());
87              Expression<String> expression = new NGramExtractor(gramSize, settings.maxExpand(), settings.maxStatesTraced(),
88                      settings.maxNgramsExtracted(), ngramAnalyzer).extract(automaton).simplify();
89              if (expression.alwaysTrue()) {
90                  if (settings.rejectUnaccelerated()) {
91                      throw new UnableToAccelerateRegexException(regex, gramSize, ngramFieldPath);
92                  }
93                  return new UnacceleratedSourceRegexQuery(rechecker, fieldPath, loader, settings).rewrite(reader);
94              } else if (expression.alwaysFalse()) {
95                  return Queries.newMatchNoDocsQuery("Expression is always false").rewrite(reader);
96              } else {
97                  if (expression.countClauses() > settings.maxNgramClauses()) {
98                      // The expression is too large we will try to use a degraded disjunction
99                      // Even if we limit the number of trigram generated (number of transition)
100                     // Some loops may generate huge boolean expression. If it's the case
101                     // The time required to build and scan all the clauses may be counter productive
102                     // since we are trying to optimize not to slowdown.
103                     //
104                     // It's not clear if the the degraded disjunction will be actually optimize the
105                     // regex, if one of the ngram is very common we will certainly scan nearly all
106                     // the docs in the index resulting in a UnacceleratedSourceRegexQuery.
107 
108                     expression = new ExpressionRewriter<>(expression).degradeAsDisjunction(settings.maxNgramClauses());
109                     if (expression.countClauses() > settings.maxNgramClauses() || expression.alwaysTrue()) {
110                         // Still too large, it's likely a bug or improper settings:
111                         // maxTrigramClauses very low and a large max_ngrams_extracted
112                         if (settings.rejectUnaccelerated()) {
113                             throw new UnableToAccelerateRegexException(regex, gramSize, ngramFieldPath);
114                         }
115                         return new UnacceleratedSourceRegexQuery(rechecker, fieldPath, loader, settings).rewrite(reader);
116                     }
117                     assert !expression.alwaysFalse();
118                 }
119                 return new AcceleratedSourceRegexQuery(rechecker, fieldPath, loader, settings,
120                         expression.transform(new ExpressionToQueryTransformer(ngramFieldPath))).rewrite(reader);
121             }
122         } catch (AutomatonTooComplexException e) {
123             throw new InvalidRegexException(String.format(Locale.ROOT,
124                     "Regex /%s/ too complex for maxStatesTraced setting [%s].  Use a simpler regex or raise maxStatesTraced.", regex,
125                     settings.maxStatesTraced()), e);
126         } catch (IllegalArgumentException e) {
127             throw new InvalidRegexException(e.getMessage(), e);
128         }
129     }
130 
131     private static Automaton regexToAutomaton(RegExp regex, int maxDeterminizedStates) {
132         return regex.toAutomaton(maxDeterminizedStates);
133     }
134 
135     /**
136      * Wraps all recheck operations for a single execution. Package private for
137      * testing.
138      */
139     interface Rechecker {
140         /**
141          * Recheck the values in a candidate document to see if they actually
142          * contain a match to the regex.
143          */
144         boolean recheck(Iterable<String> values);
145 
146         /**
147          * Determine the cost of the recheck phase.
148          * (Used by {@link TwoPhaseIterator})
149          * @return the cost
150          */
151         float getCost();
152     }
153 
154     /**
155      * Faster for case insensitive queries than the NonBacktrackingRechecker but
156      * wrong for Irish and Turkish.
157      */
158     @EqualsAndHashCode(exclude = "charRun")
159     static class NonBacktrackingOnTheFlyCaseConvertingRechecker implements Rechecker {
160         private final String regex;
161         private final Settings settings;
162 
163         @Nullable private ContainsCharacterRunAutomaton charRun;
164 
165         NonBacktrackingOnTheFlyCaseConvertingRechecker(String regex, Settings settings) {
166             this.regex = regex;
167             this.settings = settings;
168         }
169 
170         @Override
171         public boolean recheck(Iterable<String> values) {
172             for (String value : values) {
173                 if (getCharRun().contains(value)) {
174                     return true;
175                 }
176             }
177             return false;
178         }
179 
180         private ContainsCharacterRunAutomaton getCharRun() {
181             if (charRun == null) {
182                 String regexString = regex;
183                 if (!settings.caseSensitive()) {
184                     regexString = regexString.toLowerCase(settings.locale());
185                 }
186                 Automaton automaton = regexToAutomaton(new RegExp(".*(" + regexString + ")", RegExp.ALL ^ RegExp.AUTOMATON),
187                         settings.maxDeterminizedStates());
188                 if (settings.locale().getLanguage().equals("el")) {
189                     charRun = new ContainsCharacterRunAutomaton.GreekLowerCasing(automaton);
190                 } else {
191                     charRun = new ContainsCharacterRunAutomaton.LowerCasing(automaton);
192                 }
193             }
194             return charRun;
195         }
196 
197         @Override
198         public float getCost() {
199             return getCharRun().getSize();
200         }
201 
202     }
203 
204     /**
205      * Much much faster than SlowRechecker.
206      */
207     @EqualsAndHashCode(exclude = "charRun")
208     static class NonBacktrackingRechecker implements Rechecker {
209         private final String regex;
210         private final Settings settings;
211 
212         @Nullable private ContainsCharacterRunAutomaton charRun;
213 
214         NonBacktrackingRechecker(String regex, Settings settings) {
215             this.regex = regex;
216             this.settings = settings;
217         }
218 
219         @Override
220         public boolean recheck(Iterable<String> values) {
221             return StreamSupport.stream(values.spliterator(), false)
222                 .map(s -> settings.caseSensitive() ? s : s.toLowerCase(settings.locale()))
223                 .anyMatch(s -> getCharRun().contains(s));
224         }
225 
226         private ContainsCharacterRunAutomaton getCharRun() {
227             if (charRun == null) {
228                 String regexString = regex;
229                 if (!settings.caseSensitive()) {
230                     regexString = regexString.toLowerCase(settings.locale());
231                 }
232                 Automaton automaton = regexToAutomaton(new RegExp(".*(" + regexString + ")", RegExp.ALL ^ RegExp.AUTOMATON),
233                         settings.maxDeterminizedStates());
234                 charRun = new ContainsCharacterRunAutomaton(automaton);
235             }
236             return charRun;
237         }
238 
239         @Override
240         public float getCost() {
241             return getCharRun().getSize();
242         }
243 
244     }
245 
246     /**
247      * Simplistic recheck implemetation which is more obviously correct.
248      */
249     @EqualsAndHashCode(exclude = "charRun")
250     static class SlowRechecker implements Rechecker {
251         private final String regex;
252         private final Settings settings;
253 
254         @Nullable private CharacterRunAutomaton charRun;
255 
256         SlowRechecker(String regex, Settings settings) {
257             this.regex = regex;
258             this.settings = settings;
259         }
260 
261         /**
262          * Recheck the values in a candidate document to see if they actually
263          * contain a match to the regex.
264          */
265         @Override
266         public boolean recheck(Iterable<String> values) {
267             return StreamSupport.stream(values.spliterator(), false)
268                     .map(s -> settings.caseSensitive() ? s : s.toLowerCase(settings.locale()))
269                     .anyMatch(s -> getCharRun().run(s));
270         }
271 
272         private CharacterRunAutomaton getCharRun() {
273             if (charRun == null) {
274                 String regexString = regex;
275                 if (!settings.caseSensitive()) {
276                     regexString = regexString.toLowerCase(settings.locale());
277                 }
278                 Automaton automaton = regexToAutomaton(new RegExp(".*(" + regexString + ").*", RegExp.ALL ^ RegExp.AUTOMATON),
279                         settings.maxDeterminizedStates());
280                 charRun = new CharacterRunAutomaton(automaton);
281             }
282             return charRun;
283         }
284 
285         @Override
286         public float getCost() {
287             return getCharRun().getSize();
288         }
289 
290     }
291 
292     @Override
293     public String toString(String field) {
294         StringBuilder b = new StringBuilder();
295         b.append(fieldPath).append(":/").append(regex).append('/');
296         if (ngramFieldPath != null) {
297             b.append('~').append(ngramFieldPath);
298         }
299         return b.toString();
300     }
301 }