View Javadoc
1   package org.wikimedia.search.extra.regex.ngram;
2   
3   import static org.wikimedia.search.extra.regex.expression.Leaf.leaves;
4   
5   import org.apache.lucene.analysis.core.KeywordAnalyzer;
6   import org.apache.lucene.util.automaton.Automaton;
7   import org.apache.lucene.util.automaton.AutomatonTestUtil;
8   import org.apache.lucene.util.automaton.RegExp;
9   import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
10  import org.junit.Assert;
11  import org.junit.Test;
12  import org.junit.runner.RunWith;
13  import org.wikimedia.search.extra.regex.expression.And;
14  import org.wikimedia.search.extra.regex.expression.Expression;
15  import org.wikimedia.search.extra.regex.expression.Leaf;
16  import org.wikimedia.search.extra.regex.expression.Or;
17  import org.wikimedia.search.extra.regex.expression.True;
18  
19  import com.carrotsearch.randomizedtesting.RandomizedRunner;
20  import com.carrotsearch.randomizedtesting.RandomizedTest;
21  import com.carrotsearch.randomizedtesting.annotations.Repeat;
22  
23  @RunWith(RandomizedRunner.class)
24  public class NGramAutomatonTest extends RandomizedTest {
25      @Test
26      public void simple() {
27          assertTrigramExpression("cat", new Leaf<>("cat"));
28      }
29  
30      @Test
31      public void options() {
32          assertTrigramExpression("(cat)|(dog)|(cow)", new Or<>(leaves("cat", "dog", "cow")));
33      }
34  
35      @Test
36      public void leadingWildcard() {
37          assertTrigramExpression(".*cat", new Leaf<>("cat"));
38      }
39  
40      @Test
41      public void followingWildcard() {
42          assertTrigramExpression("cat.*", new Leaf<>("cat"));
43      }
44  
45      @Test
46      public void initialCharClassExpanded() {
47          assertTrigramExpression("[abcd]oop", new And<>(new Or<>(leaves("aoo", "boo", "coo", "doo")), new Leaf<>("oop")));
48      }
49  
50      @Test
51      public void initialCharClassSkipped() {
52          assertTrigramExpression("[abcde]oop", new Leaf<>("oop"));
53      }
54  
55      @Test
56      public void followingCharClassExpanded() {
57          assertTrigramExpression("oop[abcd]", new And<>(
58                  new Leaf<>("oop"),
59                  new Or<>(leaves("opa", "opb", "opc", "opd"))));
60      }
61  
62      @Test
63      public void followingCharClassSkipped() {
64          assertTrigramExpression("oop[abcde]", new Leaf<>("oop"));
65      }
66  
67      @Test
68      public void shortCircuit() {
69          assertTrigramExpression("a|(lopi)", True.<String> instance());
70      }
71  
72      @Test
73      public void optional() {
74          assertTrigramExpression("(a|[j-t])lopi", new And<>(leaves("lop", "opi")));
75      }
76  
77      @Test
78      public void loop() {
79          assertTrigramExpression("ab(cdef)*gh", new Or<>(
80                  new And<>(leaves("abc", "bcd", "cde", "def", "efg", "fgh")),
81                  new And<>(leaves("abg", "bgh"))));
82      }
83  
84      @Test
85      public void converge() {
86          assertTrigramExpression("(ajdef)|(cdef)", new And<>(
87                  new Or<>(
88                          new And<>(leaves("ajd", "jde")),
89                          new Leaf<>("cde")),
90                  new Leaf<>("def")));
91      }
92  
93      @Test
94      public void complex() {
95          assertTrigramExpression("h[efas] te.*me", new And<>(
96                  new Or<>(
97                          new And<>(leaves("ha ", "a t")),
98                          new And<>(leaves("he ", "e t")),
99                          new And<>(leaves("hf ", "f t")),
100                         new And<>(leaves("hs ", "s t"))),
101                 new Leaf<>(" te")));
102     }
103 
104     // The pgTrgmExample methods below test examples from the slides at
105     // http://www.pgcon.org/2012/schedule/attachments/248_Alexander%20Korotkov%20-%20Index%20support%20for%20regular%20expression%20search.pdf
106     @Test
107     public void pgTrgmExample1() {
108         assertTrigramExpression("a(b+|c+)d", new Or<>(
109                 new Leaf<>("abd"),
110                 new And<>(leaves("abb", "bbd")),
111                 new Leaf<>("acd"),
112                 new And<>(leaves("acc", "ccd"))));
113     }
114 
115     @Test
116     public void pgTrgmExample2() {
117         assertTrigramExpression(
118             "(abc|cba)def",
119             new And<>(
120                 new Leaf<>("def"), new Or<>(
121                     new And<>(leaves("abc", "bcd", "cde")),
122                     new And<>(leaves("cba", "bad", "ade"))
123                 )
124             )
125         );
126     }
127 
128     @Test
129     public void pgTrgmExample3() {
130         assertTrigramExpression("abc+de", new And<>(
131                 new Leaf<>("abc"),
132                 new Leaf<>("cde"),
133                 new Or<>(
134                         new Leaf<>("bcd"),
135                         new And<>(leaves("bcc", "ccd")))));
136     }
137 
138     @Test
139     public void pgTrgmExample4() {
140         assertTrigramExpression("(abc*)+de", new Or<>(
141                 new And<>(leaves("abd", "bde")),
142                 new And<>(
143                         new Leaf<>("abc"),
144                         new Leaf<>("cde"),
145                         new Or<>(
146                                 new Leaf<>("bcd"),
147                                 new And<>(leaves("bcc", "ccd"))))));
148     }
149 
150     @Test
151     public void pgTrgmExample5() {
152         assertTrigramExpression("ab(cd)*ef", new Or<>(
153                 new And<>(leaves("abe", "bef")),
154                 new And<>(leaves("abc", "bcd", "cde", "def"))));
155     }
156 
157     /**
158      * Automatons that would take too long to process are aborted.
159      */
160 //    @Test(expected=AutomatonTooComplexException.class)
161     public void tooManyStates() {
162         // TODO I'm not sure how to reliably trigger this without really high maxTransitions.  Maybe its not possible?
163         StringBuilder b = new StringBuilder();
164         for (int i = 0; i < 1000; i++) {
165             b.append("[efas]+");
166         }
167         assertTrigramExpression(b.toString(), null /*ignored*/);
168     }
169 
170     /**
171      * This would periodically fail when we were removing cycles rather
172      * preventing them from being added to the expression.
173      */
174     @Test
175     public void bigramFailsSometimes() {
176         assertExpression("te.*me", 2, new And<>(leaves("te", "me")));
177     }
178 
179     @Test(expected = TooComplexToDeterminizeException.class)
180     public void tooBig() {
181         assertTrigramExpression("\\[\\[(Datei|File|Bild|Image):[^]]*alt=[^]|}]{50,200}",
182                 null /* ignored */);
183     }
184 
185     @Test(expected = TooComplexToDeterminizeException.class)
186     public void tooBigToo() {
187         assertTrigramExpression("[^]]*alt=[^]\\|}]{80,}",
188                 null /* ignored */);
189     }
190 
191     @Test
192     public void bigButNotTooBig() {
193         // I'd like to verify that this _doesn't_ take 26 seconds to run and
194         // instead takes .26 seconds but such assertions are foolhardy in unit
195         // tests.
196         assertTrigramExpression("[^]]*alt=[^]\\|}]{10,20}", new And<>(leaves("alt", "lt=")));
197     }
198 
199     @Test
200     public void huge() {
201         assertTrigramExpression("[ac]*a[de]{50,80}", null);
202     }
203 
204     @Test
205     @Repeat(iterations = 100)
206     @SuppressWarnings("IllegalCatch")
207     public void randomRegexp() {
208         // Some of the regex strings don't actually compile so just retry until we get a good one.
209         String str;
210         while (true) {
211             try {
212                 str = AutomatonTestUtil.randomRegexp(getRandom());
213                 new RegExp(str);
214                 break;
215             } catch (Exception e) {
216                 // retry now
217             }
218         }
219         assertTrigramExpression(str, null);
220     }
221 
222     /**
223      * Tests that building the automaton doesn't blow up in unexpected ways.
224      */
225     @Test
226     @Repeat(iterations = 100)
227     public void randomAutomaton() {
228         Automaton automaton = AutomatonTestUtil.randomAutomaton(getRandom());
229         NGramAutomaton ngramAutomaton;
230         try {
231             ngramAutomaton = new NGramAutomaton(automaton, between(2, 7), 4, 10000, 500, new KeywordAnalyzer());
232         } catch (AutomatonTooComplexException e) {
233             // This is fine - some automata are genuinely too complex to ngramify.
234             return;
235         }
236         Expression<String> expression = ngramAutomaton.expression();
237         expression = expression.simplify();
238     }
239 
240     /**
241      * Asserts that the provided regex extracts the expected expression when
242      * configured to extract trigrams. Uses 4 as maxExpand just because I had to
243      * pick something and 4 seemed pretty good.
244      */
245     private void assertTrigramExpression(String regex, Expression<String> expected) {
246         assertExpression(regex, 3, expected);
247     }
248 
249     /**
250      * Asserts that the provided regex extracts the expected expression when
251      * configured to extract ngrams. Uses 4 as maxExpand just because I had to
252      * pick something and 4 seemed pretty good.
253      */
254     private void assertExpression(String regex, int gramSize, Expression<String> expected) {
255 //         System.err.println(regex);
256         Automaton automaton = new RegExp(regex).toAutomaton(20000);
257 //         System.err.println(automaton.toDot());
258         NGramAutomaton ngramAutomaton = new NGramAutomaton(automaton, gramSize, 4, 10000, 500, new KeywordAnalyzer());
259 //         System.err.println(ngramAutomaton.toDot());
260         Expression<String> expression = ngramAutomaton.expression();
261 //         System.err.println(expression);
262         expression = expression.simplify();
263 //         System.err.println(expression);
264         if (expected != null) {
265             // Null means skip the test here.
266             Assert.assertEquals(expected, expression);
267         }
268     }
269 }