View Javadoc
1   package org.wikimedia.search.extra.regex.ngram;
2   
3   import org.apache.lucene.analysis.Analyzer;
4   import org.apache.lucene.util.automaton.Automaton;
5   import org.wikimedia.search.extra.regex.expression.Expression;
6   import org.wikimedia.search.extra.regex.expression.True;
7   
8   /**
9    * Extracts ngrams from automatons.
10   */
11  public class NGramExtractor {
12      private final int gramSize;
13      private final int maxExpand;
14      private final int maxStatesTraced;
15      private final int maxNgrams;
16      private final Analyzer ngramAnalyzer;
17  
18      /**
19       * Build it.
20       *
21       * @param gramSize size of the ngram. The "n" in ngram.
22       * @param maxExpand Maximum size of range transitions to expand into single
23       *            transitions. Its roughly analogous to the number of characters
24       *            in a character class before it is considered a wildcard for
25       *            optimization purposes.
26       * @param maxStatesTraced maximum number of states traced during automaton
27       *            functions. Higher number allow more complex automata to be
28       *            converted to ngram expressions at the cost of more time.
29       * @param maxNgrams the maximum number of ngrams extracted from the regex.
30       *            If more could be exracted from the regex they are ignored.
31       * @param ngramAnalyzer the analyzer used to generate indexed ngrams
32       */
33      public NGramExtractor(int gramSize, int maxExpand, int maxStatesTraced, int maxNgrams, Analyzer ngramAnalyzer) {
34          this.gramSize = gramSize;
35          this.maxExpand = maxExpand;
36          this.maxStatesTraced = maxStatesTraced;
37          this.maxNgrams = maxNgrams;
38          this.ngramAnalyzer = ngramAnalyzer;
39      }
40  
41      /**
42       * Extract an Expression containing ngrams from an automaton.
43       */
44      public Expression<String> extract(Automaton automaton) {
45          if (automaton.isAccept(0)) {
46              return True.instance();
47          }
48          return new NGramAutomaton(automaton, gramSize, maxExpand, maxStatesTraced, maxNgrams, ngramAnalyzer).expression().simplify();
49      }
50  }