1 package org.wikimedia.search.extra.regex;
2
3 import static org.elasticsearch.common.xcontent.ConstructingObjectParser.constructorArg;
4
5 import java.io.IOException;
6 import java.util.Locale;
7 import java.util.Objects;
8
9 import javax.annotation.Nullable;
10
11 import org.apache.lucene.analysis.Analyzer;
12 import org.apache.lucene.search.Query;
13 import org.elasticsearch.Version;
14 import org.elasticsearch.common.ParseField;
15 import org.elasticsearch.common.ParsingException;
16 import org.elasticsearch.common.io.stream.StreamInput;
17 import org.elasticsearch.common.io.stream.StreamOutput;
18 import org.elasticsearch.common.util.LocaleUtils;
19 import org.elasticsearch.common.xcontent.ConstructingObjectParser;
20 import org.elasticsearch.common.xcontent.XContentBuilder;
21 import org.elasticsearch.common.xcontent.XContentParser;
22 import org.elasticsearch.index.mapper.MappedFieldType;
23 import org.elasticsearch.index.query.AbstractQueryBuilder;
24 import org.elasticsearch.index.query.QueryShardContext;
25 import org.wikimedia.search.extra.regex.expression.ExpressionRewriter;
26 import org.wikimedia.search.extra.util.FieldValues;
27
28 import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
29 import lombok.AccessLevel;
30 import lombok.EqualsAndHashCode;
31 import lombok.Getter;
32 import lombok.NonNull;
33 import lombok.Setter;
34 import lombok.experimental.Accessors;
35
36
37
38
39 @Accessors(chain = true, fluent = true)
40 @Getter
41 @Setter
42 @SuppressFBWarnings("CLI_CONSTANT_LIST_INDEX")
43 public class SourceRegexQueryBuilder extends AbstractQueryBuilder<SourceRegexQueryBuilder> {
44 public static final ParseField NAME = new ParseField("source_regex", "sourceRegex", "source-regex");
45
46 public static final ParseField FIELD = new ParseField("field");
47 public static final ParseField REGEX = new ParseField("regex");
48 public static final ParseField LOAD_FROM_SOURCE = new ParseField("load_from_source");
49 public static final ParseField NGRAM_FIELD = new ParseField("ngram_field");
50 public static final ParseField GRAM_SIZE = new ParseField("gram_size");
51
52 public static final boolean DEFAULT_LOAD_FROM_SOURCE = true;
53 public static final int DEFAULT_GRAM_SIZE = 3;
54
55 private static final ConstructingObjectParser<SourceRegexQueryBuilder, Void> PARSER = constructParser();
56
57 private static ConstructingObjectParser<SourceRegexQueryBuilder, Void> constructParser() {
58 ConstructingObjectParser<SourceRegexQueryBuilder, Void> parser =
59 new ConstructingObjectParser<>(NAME.getPreferredName(),
60 o -> new SourceRegexQueryBuilder((String) o[0], (String) o[1]));
61 parser.declareString(constructorArg(), FIELD);
62 parser.declareString(constructorArg(), REGEX);
63 parser.declareBoolean(SourceRegexQueryBuilder::loadFromSource, LOAD_FROM_SOURCE);
64 parser.declareString(SourceRegexQueryBuilder::ngramField, NGRAM_FIELD);
65 parser.declareInt(SourceRegexQueryBuilder::gramSize, GRAM_SIZE);
66 parser.declareInt((x, i) -> x.settings().maxExpand(i), Settings.MAX_EXPAND);
67 parser.declareInt((x, i) -> x.settings().maxStatesTraced(i), Settings.MAX_STATES_TRACED);
68 parser.declareInt((x, i) -> x.settings().maxDeterminizedStates(i), Settings.MAX_DETERMINIZED_STATES);
69 parser.declareInt((x, i) -> x.settings().maxNgramsExtracted(i), Settings.MAX_NGRAMS_EXTRACTED);
70 parser.declareBoolean((x, b) -> x.settings().caseSensitive(b), Settings.CASE_SENSITIVE);
71 parser.declareString((x, s) -> x.settings().locale(LocaleUtils.parse(s)), Settings.LOCALE);
72 parser.declareBoolean((x, b) -> x.settings().rejectUnaccelerated(b), Settings.REJECT_UNACCELERATED);
73 parser.declareInt((x, i) -> x.settings().maxNgramClauses(i), Settings.MAX_NGRAM_CLAUSES);
74 declareStandardFields(parser);
75 return parser;
76 }
77
78 private final String field;
79 private final String regex;
80
81
82
83
84
85 private boolean loadFromSource = DEFAULT_LOAD_FROM_SOURCE;
86
87
88
89
90
91 @Nullable private String ngramField;
92
93
94
95
96
97 private int gramSize = DEFAULT_GRAM_SIZE;
98
99 @Setter(AccessLevel.NONE)
100 private final Settings settings;
101
102
103
104
105
106
107
108 public SourceRegexQueryBuilder(String field, String regex) {
109 this(field, regex, new Settings());
110 }
111
112
113
114
115
116
117
118
119 SourceRegexQueryBuilder(String field, String regex, Settings settings) {
120 this.field = Objects.requireNonNull(field);
121 this.regex = Objects.requireNonNull(regex);
122 this.settings = settings;
123 }
124
125 public SourceRegexQueryBuilder(StreamInput in) throws IOException {
126 super(in);
127 field = in.readString();
128 regex = in.readString();
129 loadFromSource = in.readBoolean();
130 ngramField = in.readOptionalString();
131 gramSize = in.readVInt();
132 settings = new Settings(in);
133 }
134
135 @Override
136 protected void doWriteTo(StreamOutput out) throws IOException {
137 out.writeString(field);
138 out.writeString(regex);
139 out.writeBoolean(loadFromSource);
140 out.writeOptionalString(ngramField);
141 out.writeVInt(gramSize);
142 settings.writeTo(out);
143 }
144
145 @Override
146 public String getWriteableName() {
147 return NAME.getPreferredName();
148 }
149
150 @Override
151 public int doHashCode() {
152 return Objects.hash(field, gramSize, loadFromSource, ngramField, regex, settings);
153 }
154
155 @Override
156 public boolean doEquals(SourceRegexQueryBuilder o) {
157 return Objects.equals(field, o.field) &&
158 Objects.equals(gramSize, o.gramSize) &&
159 Objects.equals(ngramField, o.ngramField) &&
160 Objects.equals(loadFromSource, o.loadFromSource) &&
161 Objects.equals(regex, o.regex) &&
162 Objects.equals(settings, o.settings);
163 }
164
165 public SourceRegexQueryBuilder maxStatesTraced(int i) {
166 settings.maxStatesTraced = i;
167 return this;
168 }
169
170 public SourceRegexQueryBuilder maxDeterminizedStates(int i) {
171 settings.maxDeterminizedStates = i;
172 return this;
173 }
174
175 public SourceRegexQueryBuilder rejectUnaccelerated(boolean b) {
176 settings.rejectUnaccelerated = b;
177 return this;
178 }
179
180 public SourceRegexQueryBuilder caseSensitive(boolean b) {
181 settings.caseSensitive = b;
182 return this;
183 }
184
185 public SourceRegexQueryBuilder locale(Locale el) {
186 settings.locale = Objects.requireNonNull(el);
187 return this;
188 }
189
190 @Override
191 protected Query doToQuery(QueryShardContext context) throws IOException {
192 final Analyzer ngramAnalyzer;
193 if (ngramField != null) {
194 MappedFieldType mapper = context.fieldMapper(ngramField);
195 if (mapper == null) {
196 throw new IllegalArgumentException("ngramField [" + ngramField + "] is unknown.");
197 }
198 ngramAnalyzer = context.getSearchAnalyzer(mapper);
199 if (ngramAnalyzer == null) {
200 throw new IllegalArgumentException("Cannot find an analyzer for ngramField [" + ngramField + "], is this field indexed?");
201 }
202 } else {
203 ngramAnalyzer = null;
204 }
205 return new SourceRegexQuery(
206 field, ngramField, regex,
207 loadFromSource ? FieldValues.loadFromSource() : FieldValues.loadFromStoredField(),
208 settings, gramSize, ngramAnalyzer);
209 }
210
211
212
213
214 @Accessors(chain = true, fluent = true)
215 @Setter
216 @Getter
217 @EqualsAndHashCode
218 static class Settings {
219 static final ParseField MAX_EXPAND = new ParseField("max_expand");
220 static final ParseField MAX_STATES_TRACED = new ParseField("max_states_traced");
221 static final ParseField MAX_DETERMINIZED_STATES = new ParseField("max_determinized_states");
222 static final ParseField MAX_NGRAMS_EXTRACTED = new ParseField("max_ngrams_extracted");
223 static final ParseField CASE_SENSITIVE = new ParseField("case_sensitive");
224 static final ParseField LOCALE = new ParseField("locale");
225 static final ParseField REJECT_UNACCELERATED = new ParseField("reject_unaccelerated");
226 static final ParseField MAX_NGRAM_CLAUSES = new ParseField("max_ngram_clauses");
227
228 private static final int DEFAULT_MAX_EXPAND = 4;
229 private static final int DEFAULT_MAX_STATES_TRACED = 10000;
230 private static final int DEFAULT_MAX_DETERMINIZED_STATES = 20000;
231 private static final int DEFAULT_MAX_NGRAMS_EXTRACTED = 100;
232 private static final boolean DEFAULT_CASE_SENSITIVE = false;
233 private static final Locale DEFAULT_LOCALE = Locale.ROOT;
234 private static final boolean DEFAULT_REJECT_UNACCELERATED = false;
235 private static final int DEFAULT_MAX_BOOLEAN_CLAUSES = ExpressionRewriter.MAX_BOOLEAN_CLAUSES;
236 private static final int DEFAULT_TIMEOUT = 0;
237
238
239
240
241
242
243
244
245
246 private int maxExpand = DEFAULT_MAX_EXPAND;
247
248
249
250
251
252
253
254
255
256
257 private int maxStatesTraced = DEFAULT_MAX_STATES_TRACED;
258
259
260
261
262
263
264
265
266 private int maxDeterminizedStates = DEFAULT_MAX_DETERMINIZED_STATES;
267
268
269
270
271
272
273
274
275
276
277 private int maxNgramsExtracted = DEFAULT_MAX_NGRAMS_EXTRACTED;
278 private boolean caseSensitive = DEFAULT_CASE_SENSITIVE;
279 @NonNull
280 private Locale locale = DEFAULT_LOCALE;
281
282
283
284
285
286 private boolean rejectUnaccelerated = DEFAULT_REJECT_UNACCELERATED;
287 private int maxNgramClauses = DEFAULT_MAX_BOOLEAN_CLAUSES;
288
289 Settings() {
290 }
291
292 private Settings(StreamInput in) throws IOException {
293 maxExpand = in.readVInt();
294 maxStatesTraced = in.readVInt();
295 maxDeterminizedStates = in.readVInt();
296 maxNgramsExtracted = in.readVInt();
297 caseSensitive = in.readBoolean();
298 locale = LocaleUtils.parse(in.readString());
299 rejectUnaccelerated = in.readBoolean();
300 maxNgramClauses = in.readVInt();
301 if (in.getVersion().before(Version.V_6_5_4)) {
302 in.readVLong();
303 }
304 }
305
306 public void writeTo(StreamOutput out) throws IOException {
307 out.writeVInt(maxExpand);
308 out.writeVInt(maxStatesTraced);
309 out.writeVInt(maxDeterminizedStates);
310 out.writeVInt(maxNgramsExtracted);
311 out.writeBoolean(caseSensitive);
312 out.writeString(locale.toString());
313 out.writeBoolean(rejectUnaccelerated);
314 out.writeVInt(maxNgramClauses);
315 if (out.getVersion().before(Version.V_6_5_4)) {
316 out.writeVLong(0);
317 }
318 }
319
320 @SuppressWarnings({"NPathComplexity", "CyclomaticComplexity"})
321 public XContentBuilder innerXContent(XContentBuilder builder, Params params) throws IOException {
322 if (maxExpand != DEFAULT_MAX_EXPAND) {
323 builder.field(MAX_EXPAND.getPreferredName(), maxExpand);
324 }
325 if (maxStatesTraced != DEFAULT_MAX_STATES_TRACED) {
326 builder.field(MAX_STATES_TRACED.getPreferredName(), maxStatesTraced);
327 }
328 if (maxDeterminizedStates != DEFAULT_MAX_DETERMINIZED_STATES) {
329 builder.field(MAX_DETERMINIZED_STATES.getPreferredName(), maxDeterminizedStates);
330 }
331 if (maxNgramsExtracted != DEFAULT_MAX_NGRAMS_EXTRACTED) {
332 builder.field(MAX_NGRAMS_EXTRACTED.getPreferredName(), maxNgramsExtracted);
333 }
334 if (caseSensitive != DEFAULT_CASE_SENSITIVE) {
335 builder.field(CASE_SENSITIVE.getPreferredName(), caseSensitive);
336 }
337 if (locale != DEFAULT_LOCALE) {
338 builder.field(LOCALE.getPreferredName(), locale);
339 }
340 if (rejectUnaccelerated != DEFAULT_REJECT_UNACCELERATED) {
341 builder.field(REJECT_UNACCELERATED.getPreferredName(), rejectUnaccelerated);
342 }
343 if (maxNgramClauses != DEFAULT_MAX_BOOLEAN_CLAUSES) {
344 builder.field(MAX_NGRAM_CLAUSES.getPreferredName(), maxNgramClauses);
345 }
346 return builder;
347 }
348 }
349
350 @Override
351 protected void doXContent(XContentBuilder builder, Params params) throws IOException {
352 builder.startObject(NAME.getPreferredName());
353 builder.field(FIELD.getPreferredName(), field);
354 builder.field(REGEX.getPreferredName(), regex);
355
356 if (loadFromSource != DEFAULT_LOAD_FROM_SOURCE) {
357 builder.field(LOAD_FROM_SOURCE.getPreferredName(), loadFromSource);
358 }
359 if (ngramField != null) {
360 builder.field(NGRAM_FIELD.getPreferredName(), ngramField);
361 }
362 if (gramSize != DEFAULT_GRAM_SIZE) {
363 builder.field(GRAM_SIZE.getPreferredName(), gramSize);
364 }
365 settings.innerXContent(builder, params);
366 printBoostAndQueryName(builder);
367 builder.endObject();
368 }
369
370 public static SourceRegexQueryBuilder fromXContent(XContentParser parser) throws IOException {
371 try {
372 return PARSER.parse(parser, null);
373 } catch (IllegalArgumentException iae) {
374 throw new ParsingException(parser.getTokenLocation(), iae.getMessage(), iae);
375 }
376 }
377 }