1 package org.wikimedia.search.extra.regex.expression;
2
3 import static org.junit.Assert.assertEquals;
4 import static org.junit.Assert.assertFalse;
5 import static org.junit.Assert.assertNotEquals;
6 import static org.junit.Assert.assertTrue;
7
8 import java.util.Locale;
9
10 import org.apache.lucene.analysis.core.KeywordAnalyzer;
11 import org.apache.lucene.util.automaton.Automaton;
12 import org.apache.lucene.util.automaton.RegExp;
13 import org.junit.Test;
14 import org.wikimedia.search.extra.regex.ngram.NGramExtractor;
15
16 @SuppressWarnings("unchecked")
17 public class ExpressionTest {
18 private final Leaf<String> foo = new Leaf<>("foo");
19 private final Leaf<String> bar = new Leaf<>("bar");
20 private final Leaf<String> baz = new Leaf<>("baz");
21
22 @Test
23 public void simple() {
24 assertEquals(True.instance(), True.instance());
25 assertEquals(True.instance().hashCode(), True.instance().hashCode());
26 assertEquals(False.instance(), False.instance());
27 assertEquals(False.instance().hashCode(), False.instance().hashCode());
28 assertNotEquals(True.instance(), False.instance());
29 assertNotEquals(False.instance(), True.instance());
30
31 Leaf<String> leaf = new Leaf<>("foo");
32 assertEquals(leaf, leaf);
33 assertNotEquals(True.instance(), leaf);
34 assertNotEquals(False.instance(), leaf);
35 }
36
37 @Test
38 public void extract() {
39 assertEquals(new And<>(foo, new Or<>(bar, baz)),
40 new Or<>(
41 new And<>(foo, bar),
42 new And<>(foo, baz)
43 ).simplify());
44 }
45
46 @Test
47 public void extractToEmpty() {
48 assertEquals(new And<>(foo, bar),
49 new Or<>(
50 new And<>(foo, bar),
51 new And<>(foo, bar, baz)
52 ).simplify());
53 }
54
55 @Test
56 public void extractSingle() {
57 assertEquals(foo,
58 new Or<>(
59 new And<>(foo, bar),
60 foo
61 ).simplify());
62 }
63
64 @Test
65 public void extractDuplicates() {
66 assertEquals(foo, new Or<>(foo, new And<>(foo)).simplify());
67 }
68
69 @Test
70 public void testDegradedDisjunction() {
71 String regex = "[ab]*a[cd]{50,80}";
72 int maxExpand = 4;
73 int maxStatesTraced = 10000;
74 int maxDeterminizedStates = 20000;
75 int maxNgramsExtracted = 100;
76
77 Automaton automaton = new RegExp(regex.toLowerCase(Locale.ENGLISH), RegExp.ALL ^ RegExp.AUTOMATON).toAutomaton(maxDeterminizedStates);
78 NGramExtractor extractor = new NGramExtractor(3, maxExpand, maxStatesTraced, maxNgramsExtracted, new KeywordAnalyzer());
79
80 Expression<String> expression = extractor.extract(automaton);
81 assertTrue(expression.countClauses() > 1024);
82 expression = new ExpressionRewriter<>(expression).degradeAsDisjunction();
83 assertTrue(expression.getClass() == Or.class);
84 assertFalse(expression.alwaysFalse());
85 assertFalse(expression.alwaysTrue());
86 assertTrue(expression.countClauses() <= maxNgramsExtracted);
87 }
88
89 @Test(timeout = 1000)
90 public void testBooleanExplosion() {
91 String regex = "[0-9]*a[0-9]{50,80}";
92 int maxExpand = 10;
93 int maxStatesTraced = 10000;
94 int maxDeterminizedStates = 20000;
95 int maxNgramsExtracted = 10000;
96
97 Automaton automaton = new RegExp(regex.toLowerCase(Locale.ENGLISH), RegExp.ALL ^ RegExp.AUTOMATON).toAutomaton(maxDeterminizedStates);
98 NGramExtractor extractor = new NGramExtractor(3, maxExpand, maxStatesTraced, maxNgramsExtracted, new KeywordAnalyzer());
99
100 Expression<String> expression = extractor.extract(automaton);
101
102 assertTrue(expression.countClauses() == Integer.MAX_VALUE);
103
104
105 expression = new ExpressionRewriter<>(expression).degradeAsDisjunction(maxNgramsExtracted);
106 assertTrue(expression.getClass() == Or.class);
107 assertFalse(expression.alwaysFalse());
108 assertFalse(expression.alwaysTrue());
109 assertTrue(expression.countClauses() <= maxNgramsExtracted);
110 }
111 }