View Javadoc
1   package org.wikimedia.search.extra.regex.expression;
2   
3   import static org.junit.Assert.assertEquals;
4   import static org.junit.Assert.assertFalse;
5   import static org.junit.Assert.assertNotEquals;
6   import static org.junit.Assert.assertTrue;
7   
8   import java.util.Locale;
9   
10  import org.apache.lucene.analysis.core.KeywordAnalyzer;
11  import org.apache.lucene.util.automaton.Automaton;
12  import org.apache.lucene.util.automaton.RegExp;
13  import org.junit.Test;
14  import org.wikimedia.search.extra.regex.ngram.NGramExtractor;
15  
16  @SuppressWarnings("unchecked")
17  public class ExpressionTest {
18      private final Leaf<String> foo = new Leaf<>("foo");
19      private final Leaf<String> bar = new Leaf<>("bar");
20      private final Leaf<String> baz = new Leaf<>("baz");
21  
22      @Test
23      public void simple() {
24          assertEquals(True.instance(), True.instance());
25          assertEquals(True.instance().hashCode(), True.instance().hashCode());
26          assertEquals(False.instance(), False.instance());
27          assertEquals(False.instance().hashCode(), False.instance().hashCode());
28          assertNotEquals(True.instance(), False.instance());
29          assertNotEquals(False.instance(), True.instance());
30  
31          Leaf<String> leaf = new Leaf<>("foo");
32          assertEquals(leaf, leaf);
33          assertNotEquals(True.instance(), leaf);
34          assertNotEquals(False.instance(), leaf);
35      }
36  
37      @Test
38      public void extract() {
39          assertEquals(new And<>(foo, new Or<>(bar, baz)),
40                  new Or<>(
41                          new And<>(foo, bar),
42                          new And<>(foo, baz)
43                  ).simplify());
44      }
45  
46      @Test
47      public void extractToEmpty() {
48          assertEquals(new And<>(foo, bar),
49                  new Or<>(
50                          new And<>(foo, bar),
51                          new And<>(foo, bar, baz)
52                  ).simplify());
53      }
54  
55      @Test
56      public void extractSingle() {
57          assertEquals(foo,
58                  new Or<>(
59                          new And<>(foo, bar),
60                          foo
61                  ).simplify());
62      }
63  
64      @Test
65      public void extractDuplicates() {
66          assertEquals(foo, new Or<>(foo, new And<>(foo)).simplify());
67      }
68  
69      @Test
70      public void testDegradedDisjunction() {
71          String regex = "[ab]*a[cd]{50,80}";
72          int maxExpand = 4;
73          int maxStatesTraced = 10000;
74          int maxDeterminizedStates = 20000;
75          int maxNgramsExtracted = 100;
76  
77          Automaton automaton = new RegExp(regex.toLowerCase(Locale.ENGLISH), RegExp.ALL ^ RegExp.AUTOMATON).toAutomaton(maxDeterminizedStates);
78          NGramExtractor extractor = new NGramExtractor(3, maxExpand, maxStatesTraced, maxNgramsExtracted, new KeywordAnalyzer());
79  
80          Expression<String> expression = extractor.extract(automaton);
81          assertTrue(expression.countClauses() > 1024);
82          expression = new ExpressionRewriter<>(expression).degradeAsDisjunction();
83          assertTrue(expression.getClass() == Or.class);
84          assertFalse(expression.alwaysFalse());
85          assertFalse(expression.alwaysTrue());
86          assertTrue(expression.countClauses() <= maxNgramsExtracted);
87      }
88  
89      @Test(timeout = 1000)
90      public void testBooleanExplosion() {
91          String regex = "[0-9]*a[0-9]{50,80}";
92          int maxExpand = 10;
93          int maxStatesTraced = 10000;
94          int maxDeterminizedStates = 20000;
95          int maxNgramsExtracted = 10000;
96  
97          Automaton automaton = new RegExp(regex.toLowerCase(Locale.ENGLISH), RegExp.ALL ^ RegExp.AUTOMATON).toAutomaton(maxDeterminizedStates);
98          NGramExtractor extractor = new NGramExtractor(3, maxExpand, maxStatesTraced, maxNgramsExtracted, new KeywordAnalyzer());
99  
100         Expression<String> expression = extractor.extract(automaton);
101         // This one is huge... but most of its branches are reused
102         assertTrue(expression.countClauses() == Integer.MAX_VALUE);
103         // Test that the rewritter is sufficiently optimized
104         // to run onthis boolean tree
105         expression = new ExpressionRewriter<>(expression).degradeAsDisjunction(maxNgramsExtracted);
106         assertTrue(expression.getClass() == Or.class);
107         assertFalse(expression.alwaysFalse());
108         assertFalse(expression.alwaysTrue());
109         assertTrue(expression.countClauses() <= maxNgramsExtracted);
110     }
111 }