View Javadoc
1   package org.wikimedia.search.extra.regex;
2   
3   import static org.elasticsearch.common.xcontent.ConstructingObjectParser.constructorArg;
4   
5   import java.io.IOException;
6   import java.util.Locale;
7   import java.util.Objects;
8   
9   import javax.annotation.Nullable;
10  
11  import org.apache.lucene.analysis.Analyzer;
12  import org.apache.lucene.search.Query;
13  import org.elasticsearch.Version;
14  import org.elasticsearch.common.ParseField;
15  import org.elasticsearch.common.ParsingException;
16  import org.elasticsearch.common.io.stream.StreamInput;
17  import org.elasticsearch.common.io.stream.StreamOutput;
18  import org.elasticsearch.common.util.LocaleUtils;
19  import org.elasticsearch.common.xcontent.ConstructingObjectParser;
20  import org.elasticsearch.common.xcontent.XContentBuilder;
21  import org.elasticsearch.common.xcontent.XContentParser;
22  import org.elasticsearch.index.mapper.MappedFieldType;
23  import org.elasticsearch.index.query.AbstractQueryBuilder;
24  import org.elasticsearch.index.query.QueryShardContext;
25  import org.wikimedia.search.extra.regex.expression.ExpressionRewriter;
26  import org.wikimedia.search.extra.util.FieldValues;
27  
28  import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
29  import lombok.AccessLevel;
30  import lombok.EqualsAndHashCode;
31  import lombok.Getter;
32  import lombok.NonNull;
33  import lombok.Setter;
34  import lombok.experimental.Accessors;
35  
36  /**
37   * Builds source_regex filters.
38   */
39  @Accessors(chain = true, fluent = true)
40  @Getter
41  @Setter
42  @SuppressFBWarnings("CLI_CONSTANT_LIST_INDEX")
43  public class SourceRegexQueryBuilder extends AbstractQueryBuilder<SourceRegexQueryBuilder> {
44      public static final ParseField NAME = new ParseField("source_regex", "sourceRegex", "source-regex");
45  
46      public static final ParseField FIELD = new ParseField("field");
47      public static final ParseField REGEX = new ParseField("regex");
48      public static final ParseField LOAD_FROM_SOURCE = new ParseField("load_from_source");
49      public static final ParseField NGRAM_FIELD = new ParseField("ngram_field");
50      public static final ParseField GRAM_SIZE = new ParseField("gram_size");
51  
52      public static final boolean DEFAULT_LOAD_FROM_SOURCE = true;
53      public static final int DEFAULT_GRAM_SIZE = 3;
54  
55      private static final ConstructingObjectParser<SourceRegexQueryBuilder, Void> PARSER = constructParser();
56  
57      private static ConstructingObjectParser<SourceRegexQueryBuilder, Void> constructParser() {
58          ConstructingObjectParser<SourceRegexQueryBuilder, Void> parser =
59                  new ConstructingObjectParser<>(NAME.getPreferredName(),
60                          o -> new SourceRegexQueryBuilder((String) o[0], (String) o[1]));
61          parser.declareString(constructorArg(), FIELD);
62          parser.declareString(constructorArg(), REGEX);
63          parser.declareBoolean(SourceRegexQueryBuilder::loadFromSource, LOAD_FROM_SOURCE);
64          parser.declareString(SourceRegexQueryBuilder::ngramField, NGRAM_FIELD);
65          parser.declareInt(SourceRegexQueryBuilder::gramSize, GRAM_SIZE);
66          parser.declareInt((x, i) -> x.settings().maxExpand(i), Settings.MAX_EXPAND);
67          parser.declareInt((x, i) -> x.settings().maxStatesTraced(i), Settings.MAX_STATES_TRACED);
68          parser.declareInt((x, i) -> x.settings().maxDeterminizedStates(i), Settings.MAX_DETERMINIZED_STATES);
69          parser.declareInt((x, i) -> x.settings().maxNgramsExtracted(i), Settings.MAX_NGRAMS_EXTRACTED);
70          parser.declareBoolean((x, b) -> x.settings().caseSensitive(b), Settings.CASE_SENSITIVE);
71          parser.declareString((x, s) -> x.settings().locale(LocaleUtils.parse(s)), Settings.LOCALE);
72          parser.declareBoolean((x, b) -> x.settings().rejectUnaccelerated(b), Settings.REJECT_UNACCELERATED);
73          parser.declareInt((x, i) -> x.settings().maxNgramClauses(i), Settings.MAX_NGRAM_CLAUSES);
74          declareStandardFields(parser);
75          return parser;
76      }
77  
78      private final String field;
79      private final String regex;
80  
81      /**
82       * Should field be loaded from source (true) or from a
83       * stored field (false)?
84       */
85      private boolean loadFromSource = DEFAULT_LOAD_FROM_SOURCE;
86  
87      /**
88       * Field containing ngrams used to prefilter checked documents.
89       * If not set then no ngram acceleration is performed.
90       */
91      @Nullable private String ngramField;
92  
93      /**
94       * Size of the gram. Defaults to 3 because everyone loves
95       * trigrams.
96       */
97      private int gramSize = DEFAULT_GRAM_SIZE;
98  
99      @Setter(AccessLevel.NONE)
100     private final Settings settings;
101 
102     /**
103      * Start building.
104      *
105      * @param field the field to load and run the regex against
106      * @param regex the regex to run
107      */
108     public SourceRegexQueryBuilder(String field, String regex) {
109         this(field, regex, new Settings());
110     }
111 
112     /**
113      * Start building.
114      *
115      * @param field    the field to load and run the regex against
116      * @param regex    the regex to run
117      * @param settings additional settings
118      */
119     SourceRegexQueryBuilder(String field, String regex, Settings settings) {
120         this.field = Objects.requireNonNull(field);
121         this.regex = Objects.requireNonNull(regex);
122         this.settings = settings;
123     }
124 
125     public SourceRegexQueryBuilder(StreamInput in) throws IOException {
126         super(in);
127         field = in.readString();
128         regex = in.readString();
129         loadFromSource = in.readBoolean();
130         ngramField = in.readOptionalString();
131         gramSize = in.readVInt();
132         settings = new Settings(in);
133     }
134 
135     @Override
136     protected void doWriteTo(StreamOutput out) throws IOException {
137         out.writeString(field);
138         out.writeString(regex);
139         out.writeBoolean(loadFromSource);
140         out.writeOptionalString(ngramField);
141         out.writeVInt(gramSize);
142         settings.writeTo(out);
143     }
144 
145     @Override
146     public String getWriteableName() {
147         return NAME.getPreferredName();
148     }
149 
150     @Override
151     public int doHashCode() {
152         return Objects.hash(field, gramSize, loadFromSource, ngramField, regex, settings);
153     }
154 
155     @Override
156     public boolean doEquals(SourceRegexQueryBuilder o) {
157         return Objects.equals(field, o.field) &&
158                 Objects.equals(gramSize, o.gramSize) &&
159                 Objects.equals(ngramField, o.ngramField) &&
160                 Objects.equals(loadFromSource, o.loadFromSource) &&
161                 Objects.equals(regex, o.regex) &&
162                 Objects.equals(settings, o.settings);
163     }
164 
165     public SourceRegexQueryBuilder maxStatesTraced(int i) {
166         settings.maxStatesTraced = i;
167         return this;
168     }
169 
170     public SourceRegexQueryBuilder maxDeterminizedStates(int i) {
171         settings.maxDeterminizedStates = i;
172         return this;
173     }
174 
175     public SourceRegexQueryBuilder rejectUnaccelerated(boolean b) {
176         settings.rejectUnaccelerated = b;
177         return this;
178     }
179 
180     public SourceRegexQueryBuilder caseSensitive(boolean b) {
181         settings.caseSensitive = b;
182         return this;
183     }
184 
185     public SourceRegexQueryBuilder locale(Locale el) {
186         settings.locale = Objects.requireNonNull(el);
187         return this;
188     }
189 
190     @Override
191     protected Query doToQuery(QueryShardContext context) throws IOException {
192         final Analyzer ngramAnalyzer;
193         if (ngramField != null) {
194             MappedFieldType mapper = context.fieldMapper(ngramField);
195             if (mapper == null) {
196                 throw new IllegalArgumentException("ngramField [" + ngramField + "] is unknown.");
197             }
198             ngramAnalyzer = context.getSearchAnalyzer(mapper);
199             if (ngramAnalyzer == null) {
200                 throw new IllegalArgumentException("Cannot find an analyzer for ngramField [" + ngramField + "], is this field indexed?");
201             }
202         } else {
203             ngramAnalyzer = null;
204         }
205         return new SourceRegexQuery(
206                 field, ngramField, regex,
207                 loadFromSource ? FieldValues.loadFromSource() : FieldValues.loadFromStoredField(),
208                 settings, gramSize, ngramAnalyzer);
209     }
210 
211     /**
212      * Field independent settings for the SourceRegexFilter.
213      */
214     @Accessors(chain = true, fluent = true)
215     @Setter
216     @Getter
217     @EqualsAndHashCode
218     static class Settings {
219         static final ParseField MAX_EXPAND = new ParseField("max_expand");
220         static final ParseField MAX_STATES_TRACED = new ParseField("max_states_traced");
221         static final ParseField MAX_DETERMINIZED_STATES = new ParseField("max_determinized_states");
222         static final ParseField MAX_NGRAMS_EXTRACTED = new ParseField("max_ngrams_extracted");
223         static final ParseField CASE_SENSITIVE = new ParseField("case_sensitive");
224         static final ParseField LOCALE = new ParseField("locale");
225         static final ParseField REJECT_UNACCELERATED = new ParseField("reject_unaccelerated");
226         static final ParseField MAX_NGRAM_CLAUSES = new ParseField("max_ngram_clauses");
227 
228         private static final int DEFAULT_MAX_EXPAND = 4;
229         private static final int DEFAULT_MAX_STATES_TRACED = 10000;
230         private static final int DEFAULT_MAX_DETERMINIZED_STATES = 20000;
231         private static final int DEFAULT_MAX_NGRAMS_EXTRACTED = 100;
232         private static final boolean DEFAULT_CASE_SENSITIVE = false;
233         private static final Locale DEFAULT_LOCALE = Locale.ROOT;
234         private static final boolean DEFAULT_REJECT_UNACCELERATED = false;
235         private static final int DEFAULT_MAX_BOOLEAN_CLAUSES = ExpressionRewriter.MAX_BOOLEAN_CLAUSES;
236         private static final int DEFAULT_TIMEOUT = 0;
237 
238         /**
239          * Maximum size of range transitions to expand into
240          * single transitions when turning the automaton from the
241          * regex into an acceleration automaton. Its roughly
242          * analogous to the number of characters in a character class
243          * before it is considered a wildcard for optimization
244          * purposes.
245          */
246         private int maxExpand = DEFAULT_MAX_EXPAND;
247 
248         /**
249          * the maximum number of automaton states processed
250          * by the regex parsing algorithm. Higher numbers allow more
251          * complex regexes to be processed. Defaults to 10000 which
252          * allows reasonably complex regexes but still limits the regex
253          * processing time to under a second on modern hardware. 0
254          * effectively disabled regexes more complex than exact sequences
255          * of characters
256          */
257         private int maxStatesTraced = DEFAULT_MAX_STATES_TRACED;
258 
259         /**
260          * the maximum number of automaton states that
261          * Lucene will create at a time when compiling the regex to a
262          * DFA. Higher numbers allow the regex compilation phase to run
263          * for longer and use more memory needed to compile more complex
264          * regexes.
265          */
266         private int maxDeterminizedStates = DEFAULT_MAX_DETERMINIZED_STATES;
267 
268         /**
269          * the maximum number of ngrams extracted from the
270          * regex. This is pretty much the maximum number of term queries
271          * that are exectued per regex. If any more are required to
272          * accurately limit the regex to some document set they are all
273          * assumed to match all documents that match so far. Its crude,
274          * but it limits the number of term queries while degrading
275          * reasonably well.
276          */
277         private int maxNgramsExtracted = DEFAULT_MAX_NGRAMS_EXTRACTED;
278         private boolean caseSensitive = DEFAULT_CASE_SENSITIVE;
279         @NonNull
280         private Locale locale = DEFAULT_LOCALE;
281 
282         /**
283          * should the filter reject regexes it cannot
284          * accelerate?
285          */
286         private boolean rejectUnaccelerated = DEFAULT_REJECT_UNACCELERATED;
287         private int maxNgramClauses = DEFAULT_MAX_BOOLEAN_CLAUSES;
288 
289         Settings() {
290         }
291 
292         private Settings(StreamInput in) throws IOException {
293             maxExpand = in.readVInt();
294             maxStatesTraced = in.readVInt();
295             maxDeterminizedStates = in.readVInt();
296             maxNgramsExtracted = in.readVInt();
297             caseSensitive = in.readBoolean();
298             locale = LocaleUtils.parse(in.readString());
299             rejectUnaccelerated = in.readBoolean();
300             maxNgramClauses = in.readVInt();
301             if (in.getVersion().before(Version.V_6_5_4)) {
302                 in.readVLong();
303             }
304         }
305 
306         public void writeTo(StreamOutput out) throws IOException {
307             out.writeVInt(maxExpand);
308             out.writeVInt(maxStatesTraced);
309             out.writeVInt(maxDeterminizedStates);
310             out.writeVInt(maxNgramsExtracted);
311             out.writeBoolean(caseSensitive);
312             out.writeString(locale.toString());
313             out.writeBoolean(rejectUnaccelerated);
314             out.writeVInt(maxNgramClauses);
315             if (out.getVersion().before(Version.V_6_5_4)) {
316                 out.writeVLong(0);
317             }
318         }
319 
320         @SuppressWarnings({"NPathComplexity", "CyclomaticComplexity"})
321         public XContentBuilder innerXContent(XContentBuilder builder, Params params) throws IOException {
322             if (maxExpand != DEFAULT_MAX_EXPAND) {
323                 builder.field(MAX_EXPAND.getPreferredName(), maxExpand);
324             }
325             if (maxStatesTraced != DEFAULT_MAX_STATES_TRACED) {
326                 builder.field(MAX_STATES_TRACED.getPreferredName(), maxStatesTraced);
327             }
328             if (maxDeterminizedStates != DEFAULT_MAX_DETERMINIZED_STATES) {
329                 builder.field(MAX_DETERMINIZED_STATES.getPreferredName(), maxDeterminizedStates);
330             }
331             if (maxNgramsExtracted != DEFAULT_MAX_NGRAMS_EXTRACTED) {
332                 builder.field(MAX_NGRAMS_EXTRACTED.getPreferredName(), maxNgramsExtracted);
333             }
334             if (caseSensitive != DEFAULT_CASE_SENSITIVE) {
335                 builder.field(CASE_SENSITIVE.getPreferredName(), caseSensitive);
336             }
337             if (locale != DEFAULT_LOCALE) {
338                 builder.field(LOCALE.getPreferredName(), locale);
339             }
340             if (rejectUnaccelerated != DEFAULT_REJECT_UNACCELERATED) {
341                 builder.field(REJECT_UNACCELERATED.getPreferredName(), rejectUnaccelerated);
342             }
343             if (maxNgramClauses != DEFAULT_MAX_BOOLEAN_CLAUSES) {
344                 builder.field(MAX_NGRAM_CLAUSES.getPreferredName(), maxNgramClauses);
345             }
346             return builder;
347         }
348     }
349 
350     @Override
351     protected void doXContent(XContentBuilder builder, Params params) throws IOException {
352         builder.startObject(NAME.getPreferredName());
353         builder.field(FIELD.getPreferredName(), field);
354         builder.field(REGEX.getPreferredName(), regex);
355 
356         if (loadFromSource != DEFAULT_LOAD_FROM_SOURCE) {
357             builder.field(LOAD_FROM_SOURCE.getPreferredName(), loadFromSource);
358         }
359         if (ngramField != null) {
360             builder.field(NGRAM_FIELD.getPreferredName(), ngramField);
361         }
362         if (gramSize != DEFAULT_GRAM_SIZE) {
363             builder.field(GRAM_SIZE.getPreferredName(), gramSize);
364         }
365         settings.innerXContent(builder, params);
366         printBoostAndQueryName(builder);
367         builder.endObject();
368     }
369 
370     public static SourceRegexQueryBuilder fromXContent(XContentParser parser) throws IOException {
371         try {
372             return PARSER.parse(parser, null);
373         } catch (IllegalArgumentException iae) {
374             throw new ParsingException(parser.getTokenLocation(), iae.getMessage(), iae);
375         }
376     }
377 }