1 package org.wikimedia.search.extra.regex.ngram; 2 3 import static org.junit.Assert.assertEquals; 4 import static org.wikimedia.search.extra.regex.expression.Leaf.leaves; 5 6 import org.apache.lucene.analysis.core.KeywordAnalyzer; 7 import org.apache.lucene.util.automaton.Automaton; 8 import org.apache.lucene.util.automaton.RegExp; 9 import org.junit.Test; 10 import org.wikimedia.search.extra.regex.expression.And; 11 import org.wikimedia.search.extra.regex.expression.Leaf; 12 import org.wikimedia.search.extra.regex.expression.True; 13 14 public class NGramExtractorTest { 15 @Test 16 public void simple() { 17 NGramExtractor gram = new NGramExtractor(3, 4, 10000, 100, new KeywordAnalyzer()); 18 Automaton automaton = new RegExp("hero of legend").toAutomaton(); 19 assertEquals( 20 new And<>(leaves("her", "ero", "ro ", "o o", " of", 21 "of ", "f l", " le", "leg", "ege", "gen", "end")), 22 gram.extract(automaton)); 23 automaton = new RegExp("").toAutomaton(); 24 assertEquals(True.<String> instance(), gram.extract(automaton)); 25 automaton = new RegExp(".*").toAutomaton(); 26 assertEquals(True.<String> instance(), gram.extract(automaton)); 27 automaton = new RegExp("he").toAutomaton(); 28 assertEquals(True.<String> instance(), gram.extract(automaton)); 29 automaton = new RegExp("her").toAutomaton(); 30 assertEquals(new Leaf<>("her"), gram.extract(automaton)); 31 } 32 33 @Test 34 public void maxNgrams() { 35 NGramExtractor gram = new NGramExtractor(3, 4, 10000, 3, new KeywordAnalyzer()); 36 Automaton automaton = new RegExp("hero of legend").toAutomaton(); 37 assertEquals( 38 new And<>(leaves("her", "ero", "ro ")), 39 gram.extract(automaton)); 40 } 41 }