1 package org.wikimedia.search.extra.regex.ngram;
2
3 import static org.wikimedia.search.extra.regex.expression.Leaf.leaves;
4
5 import org.apache.lucene.analysis.core.KeywordAnalyzer;
6 import org.apache.lucene.util.automaton.Automaton;
7 import org.apache.lucene.util.automaton.AutomatonTestUtil;
8 import org.apache.lucene.util.automaton.RegExp;
9 import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
10 import org.junit.Assert;
11 import org.junit.Test;
12 import org.junit.runner.RunWith;
13 import org.wikimedia.search.extra.regex.expression.And;
14 import org.wikimedia.search.extra.regex.expression.Expression;
15 import org.wikimedia.search.extra.regex.expression.Leaf;
16 import org.wikimedia.search.extra.regex.expression.Or;
17 import org.wikimedia.search.extra.regex.expression.True;
18
19 import com.carrotsearch.randomizedtesting.RandomizedRunner;
20 import com.carrotsearch.randomizedtesting.RandomizedTest;
21 import com.carrotsearch.randomizedtesting.annotations.Repeat;
22
23 @RunWith(RandomizedRunner.class)
24 public class NGramAutomatonTest extends RandomizedTest {
25 @Test
26 public void simple() {
27 assertTrigramExpression("cat", new Leaf<>("cat"));
28 }
29
30 @Test
31 public void options() {
32 assertTrigramExpression("(cat)|(dog)|(cow)", new Or<>(leaves("cat", "dog", "cow")));
33 }
34
35 @Test
36 public void leadingWildcard() {
37 assertTrigramExpression(".*cat", new Leaf<>("cat"));
38 }
39
40 @Test
41 public void followingWildcard() {
42 assertTrigramExpression("cat.*", new Leaf<>("cat"));
43 }
44
45 @Test
46 public void initialCharClassExpanded() {
47 assertTrigramExpression("[abcd]oop", new And<>(new Or<>(leaves("aoo", "boo", "coo", "doo")), new Leaf<>("oop")));
48 }
49
50 @Test
51 public void initialCharClassSkipped() {
52 assertTrigramExpression("[abcde]oop", new Leaf<>("oop"));
53 }
54
55 @Test
56 public void followingCharClassExpanded() {
57 assertTrigramExpression("oop[abcd]", new And<>(
58 new Leaf<>("oop"),
59 new Or<>(leaves("opa", "opb", "opc", "opd"))));
60 }
61
62 @Test
63 public void followingCharClassSkipped() {
64 assertTrigramExpression("oop[abcde]", new Leaf<>("oop"));
65 }
66
67 @Test
68 public void shortCircuit() {
69 assertTrigramExpression("a|(lopi)", True.<String> instance());
70 }
71
72 @Test
73 public void optional() {
74 assertTrigramExpression("(a|[j-t])lopi", new And<>(leaves("lop", "opi")));
75 }
76
77 @Test
78 public void loop() {
79 assertTrigramExpression("ab(cdef)*gh", new Or<>(
80 new And<>(leaves("abc", "bcd", "cde", "def", "efg", "fgh")),
81 new And<>(leaves("abg", "bgh"))));
82 }
83
84 @Test
85 public void converge() {
86 assertTrigramExpression("(ajdef)|(cdef)", new And<>(
87 new Or<>(
88 new And<>(leaves("ajd", "jde")),
89 new Leaf<>("cde")),
90 new Leaf<>("def")));
91 }
92
93 @Test
94 public void complex() {
95 assertTrigramExpression("h[efas] te.*me", new And<>(
96 new Or<>(
97 new And<>(leaves("ha ", "a t")),
98 new And<>(leaves("he ", "e t")),
99 new And<>(leaves("hf ", "f t")),
100 new And<>(leaves("hs ", "s t"))),
101 new Leaf<>(" te")));
102 }
103
104
105
106 @Test
107 public void pgTrgmExample1() {
108 assertTrigramExpression("a(b+|c+)d", new Or<>(
109 new Leaf<>("abd"),
110 new And<>(leaves("abb", "bbd")),
111 new Leaf<>("acd"),
112 new And<>(leaves("acc", "ccd"))));
113 }
114
115 @Test
116 public void pgTrgmExample2() {
117 assertTrigramExpression(
118 "(abc|cba)def",
119 new And<>(
120 new Leaf<>("def"), new Or<>(
121 new And<>(leaves("abc", "bcd", "cde")),
122 new And<>(leaves("cba", "bad", "ade"))
123 )
124 )
125 );
126 }
127
128 @Test
129 public void pgTrgmExample3() {
130 assertTrigramExpression("abc+de", new And<>(
131 new Leaf<>("abc"),
132 new Leaf<>("cde"),
133 new Or<>(
134 new Leaf<>("bcd"),
135 new And<>(leaves("bcc", "ccd")))));
136 }
137
138 @Test
139 public void pgTrgmExample4() {
140 assertTrigramExpression("(abc*)+de", new Or<>(
141 new And<>(leaves("abd", "bde")),
142 new And<>(
143 new Leaf<>("abc"),
144 new Leaf<>("cde"),
145 new Or<>(
146 new Leaf<>("bcd"),
147 new And<>(leaves("bcc", "ccd"))))));
148 }
149
150 @Test
151 public void pgTrgmExample5() {
152 assertTrigramExpression("ab(cd)*ef", new Or<>(
153 new And<>(leaves("abe", "bef")),
154 new And<>(leaves("abc", "bcd", "cde", "def"))));
155 }
156
157
158
159
160
161 public void tooManyStates() {
162
163 StringBuilder b = new StringBuilder();
164 for (int i = 0; i < 1000; i++) {
165 b.append("[efas]+");
166 }
167 assertTrigramExpression(b.toString(), null );
168 }
169
170
171
172
173
174 @Test
175 public void bigramFailsSometimes() {
176 assertExpression("te.*me", 2, new And<>(leaves("te", "me")));
177 }
178
179 @Test(expected = TooComplexToDeterminizeException.class)
180 public void tooBig() {
181 assertTrigramExpression("\\[\\[(Datei|File|Bild|Image):[^]]*alt=[^]|}]{50,200}",
182 null );
183 }
184
185 @Test(expected = TooComplexToDeterminizeException.class)
186 public void tooBigToo() {
187 assertTrigramExpression("[^]]*alt=[^]\\|}]{80,}",
188 null );
189 }
190
191 @Test
192 public void bigButNotTooBig() {
193
194
195
196 assertTrigramExpression("[^]]*alt=[^]\\|}]{10,20}", new And<>(leaves("alt", "lt=")));
197 }
198
199 @Test
200 public void huge() {
201 assertTrigramExpression("[ac]*a[de]{50,80}", null);
202 }
203
204 @Test
205 @Repeat(iterations = 100)
206 @SuppressWarnings("IllegalCatch")
207 public void randomRegexp() {
208
209 String str;
210 while (true) {
211 try {
212 str = AutomatonTestUtil.randomRegexp(getRandom());
213 new RegExp(str);
214 break;
215 } catch (Exception e) {
216
217 }
218 }
219 assertTrigramExpression(str, null);
220 }
221
222
223
224
225 @Test
226 @Repeat(iterations = 100)
227 public void randomAutomaton() {
228 Automaton automaton = AutomatonTestUtil.randomAutomaton(getRandom());
229 NGramAutomaton ngramAutomaton;
230 try {
231 ngramAutomaton = new NGramAutomaton(automaton, between(2, 7), 4, 10000, 500, new KeywordAnalyzer());
232 } catch (AutomatonTooComplexException e) {
233
234 return;
235 }
236 Expression<String> expression = ngramAutomaton.expression();
237 expression = expression.simplify();
238 }
239
240
241
242
243
244
245 private void assertTrigramExpression(String regex, Expression<String> expected) {
246 assertExpression(regex, 3, expected);
247 }
248
249
250
251
252
253
254 private void assertExpression(String regex, int gramSize, Expression<String> expected) {
255
256 Automaton automaton = new RegExp(regex).toAutomaton(20000);
257
258 NGramAutomaton ngramAutomaton = new NGramAutomaton(automaton, gramSize, 4, 10000, 500, new KeywordAnalyzer());
259
260 Expression<String> expression = ngramAutomaton.expression();
261
262 expression = expression.simplify();
263
264 if (expected != null) {
265
266 Assert.assertEquals(expected, expression);
267 }
268 }
269 }