View Javadoc
1   package org.wikimedia.search.extra.regex;
2   
3   import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
4   import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
5   import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertFailures;
6   import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
7   import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchHits;
8   import static org.hamcrest.Matchers.anyOf;
9   import static org.hamcrest.Matchers.containsString;
10  
11  import java.io.IOException;
12  import java.util.ArrayList;
13  import java.util.List;
14  import java.util.Locale;
15  import java.util.concurrent.ExecutionException;
16  
17  import org.elasticsearch.action.index.IndexRequestBuilder;
18  import org.elasticsearch.action.search.SearchRequestBuilder;
19  import org.elasticsearch.action.search.SearchResponse;
20  import org.elasticsearch.common.xcontent.XContentBuilder;
21  import org.elasticsearch.rest.RestStatus;
22  import org.junit.Test;
23  import org.wikimedia.search.extra.AbstractPluginIntegrationTest;
24  
25  public class SourceRegexQueryIntegrationTest extends AbstractPluginIntegrationTest {
26      @Test
27      public void basicUnacceleratedRegex() throws InterruptedException, ExecutionException, IOException {
28          setup();
29          indexRandom(false, doc("findme", "test"));
30          indexChaff(between(0, 10000));
31  
32          SearchResponse response = search(filter("t..t")).get();
33          assertSearchHits(response, "findme");
34  
35          client().prepareDelete("test", "test", "findme").get();
36          deleteChaff(20);
37          refresh();
38  
39          // Result isn't found when it is deleted
40          response = search(filter("t..t")).get();
41          assertHitCount(response, 0);
42      }
43  
44      @Test
45      public void regexMatchesWholeString() throws InterruptedException, ExecutionException, IOException {
46          setup();
47          indexRandom(true, doc("findme", "test"));
48          SearchResponse response = search(filter("test")).get();
49          assertSearchHits(response, "findme");
50      }
51  
52      @Test
53      public void regexMatchesPartOfString() throws InterruptedException, ExecutionException, IOException {
54          setup();
55          indexRandom(true, doc("findme", "I have the test in me."));
56          SearchResponse response = search(filter("test")).get();
57          assertSearchHits(response, "findme");
58      }
59  
60      @Test
61      public void regexMatchesUnicodeCharacters() throws InterruptedException, ExecutionException, IOException {
62          setup();
63          indexRandom(true, doc("findme", "solved using only λ+μ function"));
64          SearchResponse response = search(filter("only λ\\+μ")).get();
65          assertSearchHits(response, "findme");
66  
67          // It even works with ngram extraction!
68          response = search(filter("on[ly]y λ\\+μ")).get();
69          assertSearchHits(response, "findme");
70      }
71  
72      @Test
73      public void maxStatesTracedLimitsComplexityOfRegexes() throws InterruptedException, ExecutionException, IOException {
74          setup();
75          indexRandom(true, doc("findme", "test"));
76          SearchResponse response = search(filter("te[st]t").maxStatesTraced(30)).get();
77          assertHitCount(response, 1);
78          // maxStatesTraced is used when the regex is just a sequence of
79          // characters
80          assertFailures(search(filter("test").maxStatesTraced(0)), RestStatus.INTERNAL_SERVER_ERROR, containsString("complex"));
81          // And when there are more complex things in the regex
82          assertFailures(search(filter("te[st]t").maxStatesTraced(0)), RestStatus.INTERNAL_SERVER_ERROR, containsString("complex"));
83          // Its unfortunate that this comes back as an INTERNAL_SERVER_ERROR but
84          // I can't find any way from here to mark it otherwise.
85      }
86  
87      @Test
88      public void maxDeterminizedStatesLimitsComplexityOfRegexes() throws InterruptedException, ExecutionException, IOException {
89          setup();
90          indexRandom(true, doc("findme", "test"));
91          // The default is good enough to prevent craziness
92          assertFailures(search(filter("[^]]*alt=[^]\\|}]{80,}")), RestStatus.INTERNAL_SERVER_ERROR,
93                  containsString("Determinizing [^]]*alt=[^]\\|}]{80,} would result in more than"));
94          // Some regexes with explosive state growth still run because they
95          // don't explode into too many states.
96          SearchResponse response = search(filter("[^]]*s[tabcse]{1,10}")).get();
97          assertHitCount(response, 1);
98          // But you can stop them by lowering maxStatesTraced
99          assertFailures(search(filter("[^]]*s[tabcse]{1,10}").maxDeterminizedStates(100)), RestStatus.INTERNAL_SERVER_ERROR,
100                 containsString("Determinizing [^]]*s[tabcse]{1,10} would result in more than 100"));
101         // Its unfortunate that this comes back as an INTERNAL_SERVER_ERROR but
102         // I can't find any way from here to mark it otherwise.
103     }
104 
105     @Test
106     public void maxNgramsExtractedLimitsFilters() throws IOException, InterruptedException, ExecutionException {
107         setup();
108         indexRandom(true, doc("findme", "test"));
109         // Basically the assertion here is that this doesn't run _forever_
110         SearchResponse response = search(filter("[ac]*a[de]{50,200}")).get();
111         assertHitCount(response, 0);
112     }
113 
114     @Test
115     public void rejectEmptyRegex() throws InterruptedException, ExecutionException, IOException {
116         setup();
117         assertFailures(search(filter("")), RestStatus.BAD_REQUEST, anyOf(
118                 containsString("filter must specify [regex]"),
119                 containsString("regex must be set")
120         ));
121     }
122 
123     @Test
124     public void rejectUnacceleratedCausesFailuresWhenItCannotAccelerateTheRegex() throws InterruptedException, ExecutionException,
125             IOException {
126         setup();
127         indexRandom(true, doc("findme", "test"));
128         // Its unfortunate that this comes back as an INTERNAL_SERVER_ERROR but
129         // I can't find any way from here to mark it otherwise.
130 
131         assertFailures(search(filter("...").rejectUnaccelerated(true)), RestStatus.INTERNAL_SERVER_ERROR,
132                 containsString("Unable to accelerate"));
133         assertFailures(search(filter("t.p").rejectUnaccelerated(true)), RestStatus.INTERNAL_SERVER_ERROR,
134                 containsString("Unable to accelerate"));
135         assertFailures(search(filter(".+pa").rejectUnaccelerated(true)), RestStatus.INTERNAL_SERVER_ERROR,
136                 containsString("Unable to accelerate"));
137         assertFailures(search(filter("p").rejectUnaccelerated(true)), RestStatus.INTERNAL_SERVER_ERROR,
138                 containsString("Unable to accelerate"));
139     }
140 
141     @Test
142     public void caseInsensitiveMatching() throws InterruptedException, ExecutionException, IOException {
143         setup();
144         indexRandom(true, doc("findme", "I have the test in me."));
145         SearchResponse response = search(filter("i h[ai]ve")).get();
146         assertSearchHits(response, "findme");
147         response = search(filter("I h[ai]ve")).get();
148         assertSearchHits(response, "findme");
149     }
150 
151     @Test
152     public void caseSensitiveMatching() throws InterruptedException, ExecutionException, IOException {
153         setup();
154         indexRandom(true, doc("findme", "I have the test in me."));
155         SearchResponse response = search(filter("i h[ai]ve").caseSensitive(true)).get();
156         assertHitCount(response, 0);
157         response = search(filter("I h[ai]ve").caseSensitive(true)).get();
158         assertSearchHits(response, "findme");
159     }
160 
161     @Test
162     public void complex() throws InterruptedException, ExecutionException, IOException {
163         setup();
164         indexRandom(true, doc("findme", "I have the test in me."));
165         SearchResponse response = search(filter("h[efas] te.*me")).get();
166         assertSearchHits(response, "findme");
167     }
168 
169     @Test
170     public void changingGramSize() throws InterruptedException, ExecutionException, IOException {
171         setup();
172         indexRandom(true, doc("findme", "I have the test in me."));
173 
174         // You can change the gram size to allow more degenerate regexes!
175         SearchResponse response = search(filter("te.*me").gramSize(2).ngramField("test.bigram").rejectUnaccelerated(true)).get();
176         assertSearchHits(response, "findme");
177 
178         // Proof the regex would fail without the new gram size:
179         assertFailures(search(filter("te.*me").rejectUnaccelerated(true)), RestStatus.INTERNAL_SERVER_ERROR,
180                 containsString("Unable to accelerate"));
181         // Its unfortunate that this comes back as an INTERNAL_SERVER_ERROR but
182         // I can't find any way from here to mark it otherwise.
183 
184         // You can also raise the gram size
185         response = search(filter("test.*me").gramSize(4).ngramField("test.quadgram").rejectUnaccelerated(true)).get();
186         assertSearchHits(response, "findme");
187 
188         // But that limits the regexes you can accelerate to those from which
189         // appropriate grams can be extracted
190         assertFailures(search(filter("tes.*me").ngramField("test.quadgram").gramSize(4).rejectUnaccelerated(true)), RestStatus.INTERNAL_SERVER_ERROR,
191                 containsString("Unable to accelerate"));
192         // Its unfortunate that this comes back as an INTERNAL_SERVER_ERROR but
193         // I can't find any way from here to mark it otherwise.
194     }
195 
196     @Test
197     public void leadingMultibyte() throws InterruptedException, ExecutionException, IOException {
198         setup();
199         indexRandom(true, doc("findme", "I have the \u03C2test in me."));
200         SearchResponse response = search(filter("\u03C2t[aeiou]st")).get();
201         assertSearchHits(response, "findme");
202     }
203 
204     @Test
205     public void manyLowerCasing() throws Exception {
206         // With the English analyzer
207         setup("en");
208         indexLowerCaseTestCases();
209         assertSearchHits(search(filter("\u03AC")).get(), "greek");
210         assertHitCount(search(filter("\u03B1")).get(), 0);
211         assertSearchHits(search(filter("nathair")).get(), "irish");
212         assertHitCount(search(filter("n-athair")).get(), 0);
213         assertSearchHits(search(filter("it")).get(), "turkish");
214         assertHitCount(search(filter("\u0131t")).get(), 0);
215 
216         // Now in Greek
217         client().admin().indices().prepareDelete("test").execute();
218         waitNoPendingTasksOnAll();
219         setup("el");
220         indexLowerCaseTestCases();
221         assertHitCount(search(filter("\u03AC").locale(Locale.forLanguageTag("el"))).get(), 0);
222         assertSearchHits(search(filter("\u03B1").locale(Locale.forLanguageTag("el"))).get(), "greek");
223 
224         // Now in Irish
225         client().admin().indices().prepareDelete("test").execute();
226         waitNoPendingTasksOnAll();
227         setup("ga");
228         indexLowerCaseTestCases();
229         assertHitCount(search(filter("nathair").locale(Locale.forLanguageTag("ga"))).get(), 0);
230         /*
231          * Bleh. This doesn't work because the lowercasing comes after the
232          * ngraming. We'd need to put it before it. To do that you'd need a
233          * lowercasing char filter which doens't exist at this point. And it
234          * really is only trouble for Irish or with unicode normalization.
235          * Unfortunately that is outside the scope of this patch....
236          */
237         // assertSearchHits(search(filter("nAthair").locale(Locale.forLanguageTag("ga"))).get(),
238         // "irish");
239 
240         // Now in Turkish
241         client().admin().indices().prepareDelete("test").execute();
242         waitNoPendingTasksOnAll();
243         setup("tr");
244         indexLowerCaseTestCases();
245         assertHitCount(search(filter("it").locale(Locale.forLanguageTag("tr"))).get(), 0);
246         assertSearchHits(search(filter("\u0131t").locale(Locale.forLanguageTag("tr"))).get(), "turkish");
247     }
248 
249     public void indexLowerCaseTestCases() throws InterruptedException, ExecutionException {
250         indexRandom(true,
251                 /*
252                  * This is ά which lowercases to itself with a regular lowercase
253                  * regeme but in Greek it lowercases to α.
254                  */
255                 doc("greek", "\u03AC"),
256                 /*
257                  * Normal lowercases makes this nathair but in Irish its
258                  * n-athair.
259                  */
260                 doc("irish", "nAthair"),
261                 /*
262                  * This lowercases to i in English and ı in Turkish
263                  */
264                 doc("turkish", "It"));
265     }
266 
267     /**
268      * Test that we analyze extracted ngrams
269      * in this test case at index time
270      * außer will be tokenized as ["auß", "uße", "ßer"]
271      * The pattern token filter will simulate the effect of case folding by icu normalization
272      * by emmiting: ["auss", "usse", "sser"]
273      * We make sure that the trigrams extram from the regualar expression benefit
274      * from the same analysis. Otherwize we may search for inexistent trigrams like "auß"
275      * while applying the approximation query.
276      *
277      * @throws ExecutionException
278      * @throws InterruptedException
279      * @throws IOException
280      */
281     @Test
282     public void testSpecialTrigram() throws ExecutionException, InterruptedException, IOException {
283         setup();
284         indexRandom(true, doc("eszett", "außer"));
285         SearchResponse response = search(filter("außer").gramSize(3).ngramField("test.spectrigram").rejectUnaccelerated(true)).get();
286         assertSearchHits(response, "eszett");
287     }
288 
289     /**
290      * Not really a test but can be uncommented for basic performance testing.
291      * Its not reliable to make performance assertions in these tests,
292      * unfortunately. And its slow to run the test because it has to create a
293      * bunch of test data before you can see the performance gain.
294      */
295     // @Test
296     public void accel() throws InterruptedException, ExecutionException, IOException {
297         String findText = " given as a subroutine for calculating ƒ, the cycle detection problem may be trivially solved using only λ+μ function applications";
298         String regex = "subroutine";
299         setup();
300         indexRandom(false, doc("findme", findText));
301         for (int i = 0; i < 20; i++) {
302             indexChaff(10000);
303         }
304 
305         int rounds = 50;
306         long start = System.currentTimeMillis();
307         for (int i = 0; i < rounds; i++) {
308             SearchResponse response = search(new SourceRegexQueryBuilder("test", regex)).get();
309             assertSearchHits(response, "findme");
310         }
311         logger.info("Warmup:  {}", (System.currentTimeMillis() - start) / rounds);
312 
313         start = System.currentTimeMillis();
314         for (int i = 0; i < rounds; i++) {
315             SearchResponse response = search(new SourceRegexQueryBuilder("test", regex)).get();
316             assertSearchHits(response, "findme");
317         }
318         logger.info("No accel:  {}", (System.currentTimeMillis() - start) / rounds);
319 
320         start = System.currentTimeMillis();
321         for (int i = 0; i < rounds; i++) {
322             SearchResponse response = search(filter(regex)).get();
323             assertSearchHits(response, "findme");
324         }
325         logger.info("Accelerated:  {}", (System.currentTimeMillis() - start) / rounds);
326     }
327 
328     private IndexRequestBuilder doc(String id, String fieldValue) {
329         return client().prepareIndex("test", "test", id).setSource("test", fieldValue);
330     }
331 
332     private void indexChaff(int count) throws InterruptedException, ExecutionException {
333         List<IndexRequestBuilder> docs = new ArrayList<>();
334         for (int i = 0; i < count; i++) {
335             docs.add(doc(Integer.toString(i), "chaff"));
336         }
337         indexRandom(true, docs);
338     }
339 
340     private void deleteChaff(int count) throws InterruptedException, ExecutionException {
341         for (int i = 0; i < count; i++) {
342             client().prepareDelete("test", "test", Integer.toString(i)).get();
343         }
344     }
345 
346     private SourceRegexQueryBuilder filter(String regex) {
347         SourceRegexQueryBuilder builder = new SourceRegexQueryBuilder("test", regex);
348         builder.ngramField("test.trigram");
349         return builder;
350     }
351 
352     private SearchRequestBuilder search(SourceRegexQueryBuilder builder) {
353         return client().prepareSearch("test").setTypes("test").setQuery(builder);
354     }
355 
356     private void setup() throws IOException {
357         setup("root");
358     }
359 
360     private void setup(String locale) throws IOException {
361         XContentBuilder mapping = jsonBuilder().startObject();
362         mapping.startObject("test").startObject("properties");
363         mapping.startObject("test");
364         mapping.field("type", "text");
365         mapping.startObject("fields");
366         buildSubfield(mapping, "bigram");
367         buildSubfield(mapping, "trigram");
368         buildSubfield(mapping, "quadgram");
369         buildSubfield(mapping, "spectrigram");
370         mapping.endObject()
371             .endObject()
372             .endObject()
373             .endObject()
374             .endObject();
375 
376         XContentBuilder settings = jsonBuilder().startObject().startObject("index");
377         settings.field("number_of_shards", 1);
378         settings.startObject("analysis");
379         settings.startObject("analyzer");
380         buildNgramAnalyzer(settings, "bigram", locale);
381         buildNgramAnalyzer(settings, "trigram", locale);
382         buildNgramAnalyzer(settings, "quadgram", locale);
383         buildNgramAnalyzer(settings, "spectrigram", locale, new String[]{"pattern"});
384 
385         settings.endObject();
386         settings.startObject("tokenizer");
387         buildNgramTokenizer(settings, "bigram", 2);
388         buildNgramTokenizer(settings, "trigram", 3);
389         buildNgramTokenizer(settings, "spectrigram", 3);
390         buildNgramTokenizer(settings, "quadgram", 4);
391         settings.endObject();
392         settings.startObject("filter");
393         buildLowercaseFilter(settings, "greek");
394         buildLowercaseFilter(settings, "irish");
395         buildLowercaseFilter(settings, "turkish");
396         buildPatternFilter(settings);
397         settings.endObject();
398         settings.endObject().endObject().endObject();
399         // System.err.println(settings.string());
400         // System.err.println(mapping.string());
401         assertAcked(prepareCreate("test").setSettings(settings).addMapping("test", mapping));
402         ensureYellow();
403     }
404 
405     private void buildSubfield(XContentBuilder mapping, String name) throws IOException {
406         mapping.startObject(name);
407         mapping.field("type", "text");
408         mapping.field("analyzer", name);
409         mapping.endObject();
410     }
411 
412     private void buildNgramAnalyzer(XContentBuilder settings, String name, String locale) throws IOException {
413         buildNgramAnalyzer(settings, name, locale, new String[]{});
414     }
415 
416     private void buildNgramAnalyzer(XContentBuilder settings, String name, String locale, String[] extraFilters) throws IOException {
417         settings.startObject(name);
418         settings.field("type", "custom");
419         settings.field("tokenizer", name);
420         String[] filters = new String[1 + extraFilters.length];
421         filters[0] = lowercaseForLocale(locale);
422         System.arraycopy(extraFilters, 0, filters, 1, extraFilters.length);
423         settings.field("filter", filters);
424         settings.endObject();
425     }
426 
427     private String lowercaseForLocale(String locale) {
428         switch (locale) {
429             case "el":
430                 return "greek_lowercase";
431             case "ga":
432                 return "irish_lowercase";
433             case "tr":
434                 return "turkish_lowercase";
435             default:
436                 return "lowercase";
437         }
438     }
439 
440     private void buildNgramTokenizer(XContentBuilder settings, String name, int size) throws IOException {
441         settings.startObject(name);
442         settings.field("type", "nGram");
443         settings.field("min_gram", size);
444         settings.field("max_gram", size);
445         settings.endObject();
446     }
447 
448     public void buildLowercaseFilter(XContentBuilder settings, String language) throws IOException {
449         settings.startObject(language + "_lowercase");
450         settings.field("type", "lowercase");
451         settings.field("language", language);
452         settings.endObject();
453     }
454 
455     public void buildPatternFilter(XContentBuilder settings) throws IOException {
456         settings.startObject("pattern");
457         settings.field("type", "pattern_replace");
458         settings.field("pattern", "ß");
459         settings.field("replacement", "ss");
460         settings.endObject();
461     }
462 }