View Javadoc
1   package org.wikimedia.search.extra.regex.ngram;
2   
3   import static org.junit.Assert.assertEquals;
4   import static org.wikimedia.search.extra.regex.expression.Leaf.leaves;
5   
6   import org.apache.lucene.analysis.core.KeywordAnalyzer;
7   import org.apache.lucene.util.automaton.Automaton;
8   import org.apache.lucene.util.automaton.RegExp;
9   import org.junit.Test;
10  import org.wikimedia.search.extra.regex.expression.And;
11  import org.wikimedia.search.extra.regex.expression.Leaf;
12  import org.wikimedia.search.extra.regex.expression.True;
13  
14  public class NGramExtractorTest {
15      @Test
16      public void simple() {
17          NGramExtractor gram = new NGramExtractor(3, 4, 10000, 100, new KeywordAnalyzer());
18          Automaton automaton = new RegExp("hero of legend").toAutomaton();
19          assertEquals(
20                  new And<>(leaves("her", "ero", "ro ", "o o", " of",
21                          "of ", "f l", " le", "leg", "ege", "gen", "end")),
22                  gram.extract(automaton));
23          automaton = new RegExp("").toAutomaton();
24          assertEquals(True.<String> instance(), gram.extract(automaton));
25          automaton = new RegExp(".*").toAutomaton();
26          assertEquals(True.<String> instance(), gram.extract(automaton));
27          automaton = new RegExp("he").toAutomaton();
28          assertEquals(True.<String> instance(), gram.extract(automaton));
29          automaton = new RegExp("her").toAutomaton();
30          assertEquals(new Leaf<>("her"), gram.extract(automaton));
31      }
32  
33      @Test
34      public void maxNgrams() {
35          NGramExtractor gram = new NGramExtractor(3, 4, 10000, 3, new KeywordAnalyzer());
36          Automaton automaton = new RegExp("hero of legend").toAutomaton();
37          assertEquals(
38                  new And<>(leaves("her", "ero", "ro ")),
39                  gram.extract(automaton));
40      }
41  }