1 package org.wikimedia.search.extra.regex;
2
3 import java.io.IOException;
4 import java.util.Locale;
5 import java.util.Objects;
6 import java.util.stream.StreamSupport;
7
8 import javax.annotation.Nullable;
9
10 import org.apache.lucene.analysis.Analyzer;
11 import org.apache.lucene.index.IndexReader;
12 import org.apache.lucene.search.Query;
13 import org.apache.lucene.search.TwoPhaseIterator;
14 import org.apache.lucene.util.automaton.Automaton;
15 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
16 import org.apache.lucene.util.automaton.RegExp;
17 import org.elasticsearch.common.lucene.search.Queries;
18 import org.wikimedia.search.extra.regex.SourceRegexQueryBuilder.Settings;
19 import org.wikimedia.search.extra.regex.expression.Expression;
20 import org.wikimedia.search.extra.regex.expression.ExpressionRewriter;
21 import org.wikimedia.search.extra.regex.ngram.AutomatonTooComplexException;
22 import org.wikimedia.search.extra.regex.ngram.NGramExtractor;
23 import org.wikimedia.search.extra.util.FieldValues;
24
25 import com.google.common.annotations.VisibleForTesting;
26
27 import lombok.AccessLevel;
28 import lombok.EqualsAndHashCode;
29 import lombok.Getter;
30
31 @EqualsAndHashCode(callSuper = false)
32 @VisibleForTesting
33 @Getter(AccessLevel.PACKAGE)
34 public class SourceRegexQuery extends Query {
35 private final String fieldPath;
36 @Nullable private final String ngramFieldPath;
37 private final String regex;
38 private final FieldValues.Loader loader;
39 private final Settings settings;
40 private final int gramSize;
41 private final Rechecker rechecker;
42 @Nullable private final Analyzer ngramAnalyzer;
43
44 public SourceRegexQuery(String fieldPath, @Nullable String ngramFieldPath, String regex,
45 FieldValues.Loader loader, Settings settings, int gramSize,
46 @Nullable Analyzer ngramAnalyzer) {
47 this.fieldPath = fieldPath;
48 this.ngramFieldPath = ngramFieldPath;
49 this.regex = Objects.requireNonNull(regex);
50 if (regex.isEmpty()) {
51 throw new IllegalArgumentException("regex must be set");
52 }
53 this.loader = loader;
54 this.settings = settings;
55 this.gramSize = gramSize;
56 if (!settings.caseSensitive()
57 && !settings.locale().getLanguage().equals("ga")
58 && !settings.locale().getLanguage().equals("tr")) {
59 rechecker = new NonBacktrackingOnTheFlyCaseConvertingRechecker(regex, settings);
60 } else {
61 rechecker = new NonBacktrackingRechecker(regex, settings);
62 }
63 this.ngramAnalyzer = ngramAnalyzer;
64 }
65
66 @Override
67 @SuppressWarnings("CyclomaticComplexity")
68 public Query rewrite(IndexReader reader) throws IOException {
69
70
71 if (ngramFieldPath == null) {
72 assert ngramAnalyzer == null;
73
74
75 if (settings.rejectUnaccelerated()) {
76 throw new UnableToAccelerateRegexException(regex, gramSize, null);
77 }
78 return new UnacceleratedSourceRegexQuery(rechecker, fieldPath, loader, settings);
79 }
80 assert ngramAnalyzer != null;
81 try {
82
83
84 Automaton automaton = regexToAutomaton(
85 new RegExp(regex.toLowerCase(settings.locale()), RegExp.ALL ^ RegExp.AUTOMATON),
86 settings.maxDeterminizedStates());
87 Expression<String> expression = new NGramExtractor(gramSize, settings.maxExpand(), settings.maxStatesTraced(),
88 settings.maxNgramsExtracted(), ngramAnalyzer).extract(automaton).simplify();
89 if (expression.alwaysTrue()) {
90 if (settings.rejectUnaccelerated()) {
91 throw new UnableToAccelerateRegexException(regex, gramSize, ngramFieldPath);
92 }
93 return new UnacceleratedSourceRegexQuery(rechecker, fieldPath, loader, settings).rewrite(reader);
94 } else if (expression.alwaysFalse()) {
95 return Queries.newMatchNoDocsQuery("Expression is always false").rewrite(reader);
96 } else {
97 if (expression.countClauses() > settings.maxNgramClauses()) {
98
99
100
101
102
103
104
105
106
107
108 expression = new ExpressionRewriter<>(expression).degradeAsDisjunction(settings.maxNgramClauses());
109 if (expression.countClauses() > settings.maxNgramClauses() || expression.alwaysTrue()) {
110
111
112 if (settings.rejectUnaccelerated()) {
113 throw new UnableToAccelerateRegexException(regex, gramSize, ngramFieldPath);
114 }
115 return new UnacceleratedSourceRegexQuery(rechecker, fieldPath, loader, settings).rewrite(reader);
116 }
117 assert !expression.alwaysFalse();
118 }
119 return new AcceleratedSourceRegexQuery(rechecker, fieldPath, loader, settings,
120 expression.transform(new ExpressionToQueryTransformer(ngramFieldPath))).rewrite(reader);
121 }
122 } catch (AutomatonTooComplexException e) {
123 throw new InvalidRegexException(String.format(Locale.ROOT,
124 "Regex /%s/ too complex for maxStatesTraced setting [%s]. Use a simpler regex or raise maxStatesTraced.", regex,
125 settings.maxStatesTraced()), e);
126 } catch (IllegalArgumentException e) {
127 throw new InvalidRegexException(e.getMessage(), e);
128 }
129 }
130
131 private static Automaton regexToAutomaton(RegExp regex, int maxDeterminizedStates) {
132 return regex.toAutomaton(maxDeterminizedStates);
133 }
134
135
136
137
138
139 interface Rechecker {
140
141
142
143
144 boolean recheck(Iterable<String> values);
145
146
147
148
149
150
151 float getCost();
152 }
153
154
155
156
157
158 @EqualsAndHashCode(exclude = "charRun")
159 static class NonBacktrackingOnTheFlyCaseConvertingRechecker implements Rechecker {
160 private final String regex;
161 private final Settings settings;
162
163 @Nullable private ContainsCharacterRunAutomaton charRun;
164
165 NonBacktrackingOnTheFlyCaseConvertingRechecker(String regex, Settings settings) {
166 this.regex = regex;
167 this.settings = settings;
168 }
169
170 @Override
171 public boolean recheck(Iterable<String> values) {
172 for (String value : values) {
173 if (getCharRun().contains(value)) {
174 return true;
175 }
176 }
177 return false;
178 }
179
180 private ContainsCharacterRunAutomaton getCharRun() {
181 if (charRun == null) {
182 String regexString = regex;
183 if (!settings.caseSensitive()) {
184 regexString = regexString.toLowerCase(settings.locale());
185 }
186 Automaton automaton = regexToAutomaton(new RegExp(".*(" + regexString + ")", RegExp.ALL ^ RegExp.AUTOMATON),
187 settings.maxDeterminizedStates());
188 if (settings.locale().getLanguage().equals("el")) {
189 charRun = new ContainsCharacterRunAutomaton.GreekLowerCasing(automaton);
190 } else {
191 charRun = new ContainsCharacterRunAutomaton.LowerCasing(automaton);
192 }
193 }
194 return charRun;
195 }
196
197 @Override
198 public float getCost() {
199 return getCharRun().getSize();
200 }
201
202 }
203
204
205
206
207 @EqualsAndHashCode(exclude = "charRun")
208 static class NonBacktrackingRechecker implements Rechecker {
209 private final String regex;
210 private final Settings settings;
211
212 @Nullable private ContainsCharacterRunAutomaton charRun;
213
214 NonBacktrackingRechecker(String regex, Settings settings) {
215 this.regex = regex;
216 this.settings = settings;
217 }
218
219 @Override
220 public boolean recheck(Iterable<String> values) {
221 return StreamSupport.stream(values.spliterator(), false)
222 .map(s -> settings.caseSensitive() ? s : s.toLowerCase(settings.locale()))
223 .anyMatch(s -> getCharRun().contains(s));
224 }
225
226 private ContainsCharacterRunAutomaton getCharRun() {
227 if (charRun == null) {
228 String regexString = regex;
229 if (!settings.caseSensitive()) {
230 regexString = regexString.toLowerCase(settings.locale());
231 }
232 Automaton automaton = regexToAutomaton(new RegExp(".*(" + regexString + ")", RegExp.ALL ^ RegExp.AUTOMATON),
233 settings.maxDeterminizedStates());
234 charRun = new ContainsCharacterRunAutomaton(automaton);
235 }
236 return charRun;
237 }
238
239 @Override
240 public float getCost() {
241 return getCharRun().getSize();
242 }
243
244 }
245
246
247
248
249 @EqualsAndHashCode(exclude = "charRun")
250 static class SlowRechecker implements Rechecker {
251 private final String regex;
252 private final Settings settings;
253
254 @Nullable private CharacterRunAutomaton charRun;
255
256 SlowRechecker(String regex, Settings settings) {
257 this.regex = regex;
258 this.settings = settings;
259 }
260
261
262
263
264
265 @Override
266 public boolean recheck(Iterable<String> values) {
267 return StreamSupport.stream(values.spliterator(), false)
268 .map(s -> settings.caseSensitive() ? s : s.toLowerCase(settings.locale()))
269 .anyMatch(s -> getCharRun().run(s));
270 }
271
272 private CharacterRunAutomaton getCharRun() {
273 if (charRun == null) {
274 String regexString = regex;
275 if (!settings.caseSensitive()) {
276 regexString = regexString.toLowerCase(settings.locale());
277 }
278 Automaton automaton = regexToAutomaton(new RegExp(".*(" + regexString + ").*", RegExp.ALL ^ RegExp.AUTOMATON),
279 settings.maxDeterminizedStates());
280 charRun = new CharacterRunAutomaton(automaton);
281 }
282 return charRun;
283 }
284
285 @Override
286 public float getCost() {
287 return getCharRun().getSize();
288 }
289
290 }
291
292 @Override
293 public String toString(String field) {
294 StringBuilder b = new StringBuilder();
295 b.append(fieldPath).append(":/").append(regex).append('/');
296 if (ngramFieldPath != null) {
297 b.append('~').append(ngramFieldPath);
298 }
299 return b.toString();
300 }
301 }