View Javadoc
1   package org.wikimedia.search.extra.regex;
2   
3   import java.io.IOException;
4   import java.util.List;
5   
6   import org.apache.lucene.index.LeafReaderContext;
7   import org.apache.lucene.search.ConstantScoreScorer;
8   import org.apache.lucene.search.ConstantScoreWeight;
9   import org.apache.lucene.search.DocIdSetIterator;
10  import org.apache.lucene.search.IndexSearcher;
11  import org.apache.lucene.search.Query;
12  import org.apache.lucene.search.ScoreMode;
13  import org.apache.lucene.search.Scorer;
14  import org.apache.lucene.search.TwoPhaseIterator;
15  import org.apache.lucene.search.Weight;
16  import org.wikimedia.search.extra.regex.SourceRegexQuery.Rechecker;
17  import org.wikimedia.search.extra.regex.SourceRegexQueryBuilder.Settings;
18  import org.wikimedia.search.extra.util.FieldValues;
19  import org.wikimedia.search.extra.util.FieldValues.Loader;
20  
21  import lombok.EqualsAndHashCode;
22  
23  /**
24   * Unaccelerated source_regex query.
25   * It will scan all the docs in the index.
26   */
27  @EqualsAndHashCode(callSuper = false)
28  class UnacceleratedSourceRegexQuery extends Query {
29      protected final Rechecker rechecker;
30      protected final String fieldPath;
31      protected final FieldValues.Loader loader;
32      protected final Settings settings;
33  
34      /**
35       * A new accelerated regex query.
36       *
37       * @param rechecker the rechecker used to perform the costly regex on doc content
38       * @param fieldPath the path to the field where the doc content is stored
39       * @param loader the loader used to load the field content
40       * @param settings the regex settings
41       */
42      UnacceleratedSourceRegexQuery(Rechecker rechecker, String fieldPath, Loader loader, Settings settings) {
43          super();
44          this.rechecker = rechecker;
45          this.fieldPath = fieldPath;
46          this.loader = loader;
47          this.settings = settings;
48      }
49  
50      @Override
51      public String toString(String field) {
52          return "source_regex(unaccelerated):" + field;
53      }
54  
55      @Override
56      public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
57          return new ConstantScoreWeight(this, 1F) {
58              @Override
59              public boolean isCacheable(LeafReaderContext leafReaderContext) {
60                  return false;
61              }
62  
63              @Override
64              public Scorer scorer(final LeafReaderContext context) throws IOException {
65                  final DocIdSetIterator approximation = DocIdSetIterator.all(context.reader().maxDoc());
66                  return new ConstantScoreScorer(this, 1f, scoreMode, new RegexTwoPhaseIterator(approximation, context));
67              }
68          };
69      }
70  
71      protected class RegexTwoPhaseIterator extends TwoPhaseIterator {
72          private final LeafReaderContext context;
73  
74          protected RegexTwoPhaseIterator(DocIdSetIterator approximation, LeafReaderContext context) {
75              super(approximation);
76              this.context = context;
77          }
78  
79          @Override
80          public boolean matches() throws IOException {
81              List<String> values = loader.load(fieldPath, context.reader(), approximation.docID());
82              return rechecker.recheck(values);
83          }
84  
85          @Override
86          public float matchCost() {
87              /*
88               * the recheck phase is costly and depends mostly on doc size. We
89               * set a very large base cost to reflect the fact that we will load
90               * the field data (I/O and mem) then we add a rechecker specific
91               * cost that depends on the number of states.
92               */
93              return 10000f + rechecker.getCost();
94          }
95      }
96  }