1 package org.wikimedia.search.extra.regex;
2
3 import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
4 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
5 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertFailures;
6 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
7 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchHits;
8 import static org.hamcrest.Matchers.anyOf;
9 import static org.hamcrest.Matchers.containsString;
10
11 import java.io.IOException;
12 import java.util.ArrayList;
13 import java.util.List;
14 import java.util.Locale;
15 import java.util.concurrent.ExecutionException;
16
17 import org.elasticsearch.action.index.IndexRequestBuilder;
18 import org.elasticsearch.action.search.SearchRequestBuilder;
19 import org.elasticsearch.action.search.SearchResponse;
20 import org.elasticsearch.common.xcontent.XContentBuilder;
21 import org.elasticsearch.rest.RestStatus;
22 import org.junit.Test;
23 import org.wikimedia.search.extra.AbstractPluginIntegrationTest;
24
25 public class SourceRegexQueryIntegrationTest extends AbstractPluginIntegrationTest {
26 @Test
27 public void basicUnacceleratedRegex() throws InterruptedException, ExecutionException, IOException {
28 setup();
29 indexRandom(false, doc("findme", "test"));
30 indexChaff(between(0, 10000));
31
32 SearchResponse response = search(filter("t..t")).get();
33 assertSearchHits(response, "findme");
34
35 client().prepareDelete("test", "test", "findme").get();
36 deleteChaff(20);
37 refresh();
38
39
40 response = search(filter("t..t")).get();
41 assertHitCount(response, 0);
42 }
43
44 @Test
45 public void regexMatchesWholeString() throws InterruptedException, ExecutionException, IOException {
46 setup();
47 indexRandom(true, doc("findme", "test"));
48 SearchResponse response = search(filter("test")).get();
49 assertSearchHits(response, "findme");
50 }
51
52 @Test
53 public void regexMatchesPartOfString() throws InterruptedException, ExecutionException, IOException {
54 setup();
55 indexRandom(true, doc("findme", "I have the test in me."));
56 SearchResponse response = search(filter("test")).get();
57 assertSearchHits(response, "findme");
58 }
59
60 @Test
61 public void regexMatchesUnicodeCharacters() throws InterruptedException, ExecutionException, IOException {
62 setup();
63 indexRandom(true, doc("findme", "solved using only λ+μ function"));
64 SearchResponse response = search(filter("only λ\\+μ")).get();
65 assertSearchHits(response, "findme");
66
67
68 response = search(filter("on[ly]y λ\\+μ")).get();
69 assertSearchHits(response, "findme");
70 }
71
72 @Test
73 public void maxStatesTracedLimitsComplexityOfRegexes() throws InterruptedException, ExecutionException, IOException {
74 setup();
75 indexRandom(true, doc("findme", "test"));
76 SearchResponse response = search(filter("te[st]t").maxStatesTraced(30)).get();
77 assertHitCount(response, 1);
78
79
80 assertFailures(search(filter("test").maxStatesTraced(0)), RestStatus.INTERNAL_SERVER_ERROR, containsString("complex"));
81
82 assertFailures(search(filter("te[st]t").maxStatesTraced(0)), RestStatus.INTERNAL_SERVER_ERROR, containsString("complex"));
83
84
85 }
86
87 @Test
88 public void maxDeterminizedStatesLimitsComplexityOfRegexes() throws InterruptedException, ExecutionException, IOException {
89 setup();
90 indexRandom(true, doc("findme", "test"));
91
92 assertFailures(search(filter("[^]]*alt=[^]\\|}]{80,}")), RestStatus.INTERNAL_SERVER_ERROR,
93 containsString("Determinizing [^]]*alt=[^]\\|}]{80,} would result in more than"));
94
95
96 SearchResponse response = search(filter("[^]]*s[tabcse]{1,10}")).get();
97 assertHitCount(response, 1);
98
99 assertFailures(search(filter("[^]]*s[tabcse]{1,10}").maxDeterminizedStates(100)), RestStatus.INTERNAL_SERVER_ERROR,
100 containsString("Determinizing [^]]*s[tabcse]{1,10} would result in more than 100"));
101
102
103 }
104
105 @Test
106 public void maxNgramsExtractedLimitsFilters() throws IOException, InterruptedException, ExecutionException {
107 setup();
108 indexRandom(true, doc("findme", "test"));
109
110 SearchResponse response = search(filter("[ac]*a[de]{50,200}")).get();
111 assertHitCount(response, 0);
112 }
113
114 @Test
115 public void rejectEmptyRegex() throws InterruptedException, ExecutionException, IOException {
116 setup();
117 assertFailures(search(filter("")), RestStatus.BAD_REQUEST, anyOf(
118 containsString("filter must specify [regex]"),
119 containsString("regex must be set")
120 ));
121 }
122
123 @Test
124 public void rejectUnacceleratedCausesFailuresWhenItCannotAccelerateTheRegex() throws InterruptedException, ExecutionException,
125 IOException {
126 setup();
127 indexRandom(true, doc("findme", "test"));
128
129
130
131 assertFailures(search(filter("...").rejectUnaccelerated(true)), RestStatus.INTERNAL_SERVER_ERROR,
132 containsString("Unable to accelerate"));
133 assertFailures(search(filter("t.p").rejectUnaccelerated(true)), RestStatus.INTERNAL_SERVER_ERROR,
134 containsString("Unable to accelerate"));
135 assertFailures(search(filter(".+pa").rejectUnaccelerated(true)), RestStatus.INTERNAL_SERVER_ERROR,
136 containsString("Unable to accelerate"));
137 assertFailures(search(filter("p").rejectUnaccelerated(true)), RestStatus.INTERNAL_SERVER_ERROR,
138 containsString("Unable to accelerate"));
139 }
140
141 @Test
142 public void caseInsensitiveMatching() throws InterruptedException, ExecutionException, IOException {
143 setup();
144 indexRandom(true, doc("findme", "I have the test in me."));
145 SearchResponse response = search(filter("i h[ai]ve")).get();
146 assertSearchHits(response, "findme");
147 response = search(filter("I h[ai]ve")).get();
148 assertSearchHits(response, "findme");
149 }
150
151 @Test
152 public void caseSensitiveMatching() throws InterruptedException, ExecutionException, IOException {
153 setup();
154 indexRandom(true, doc("findme", "I have the test in me."));
155 SearchResponse response = search(filter("i h[ai]ve").caseSensitive(true)).get();
156 assertHitCount(response, 0);
157 response = search(filter("I h[ai]ve").caseSensitive(true)).get();
158 assertSearchHits(response, "findme");
159 }
160
161 @Test
162 public void complex() throws InterruptedException, ExecutionException, IOException {
163 setup();
164 indexRandom(true, doc("findme", "I have the test in me."));
165 SearchResponse response = search(filter("h[efas] te.*me")).get();
166 assertSearchHits(response, "findme");
167 }
168
169 @Test
170 public void changingGramSize() throws InterruptedException, ExecutionException, IOException {
171 setup();
172 indexRandom(true, doc("findme", "I have the test in me."));
173
174
175 SearchResponse response = search(filter("te.*me").gramSize(2).ngramField("test.bigram").rejectUnaccelerated(true)).get();
176 assertSearchHits(response, "findme");
177
178
179 assertFailures(search(filter("te.*me").rejectUnaccelerated(true)), RestStatus.INTERNAL_SERVER_ERROR,
180 containsString("Unable to accelerate"));
181
182
183
184
185 response = search(filter("test.*me").gramSize(4).ngramField("test.quadgram").rejectUnaccelerated(true)).get();
186 assertSearchHits(response, "findme");
187
188
189
190 assertFailures(search(filter("tes.*me").ngramField("test.quadgram").gramSize(4).rejectUnaccelerated(true)), RestStatus.INTERNAL_SERVER_ERROR,
191 containsString("Unable to accelerate"));
192
193
194 }
195
196 @Test
197 public void leadingMultibyte() throws InterruptedException, ExecutionException, IOException {
198 setup();
199 indexRandom(true, doc("findme", "I have the \u03C2test in me."));
200 SearchResponse response = search(filter("\u03C2t[aeiou]st")).get();
201 assertSearchHits(response, "findme");
202 }
203
204 @Test
205 public void manyLowerCasing() throws Exception {
206
207 setup("en");
208 indexLowerCaseTestCases();
209 assertSearchHits(search(filter("\u03AC")).get(), "greek");
210 assertHitCount(search(filter("\u03B1")).get(), 0);
211 assertSearchHits(search(filter("nathair")).get(), "irish");
212 assertHitCount(search(filter("n-athair")).get(), 0);
213 assertSearchHits(search(filter("it")).get(), "turkish");
214 assertHitCount(search(filter("\u0131t")).get(), 0);
215
216
217 client().admin().indices().prepareDelete("test").execute();
218 waitNoPendingTasksOnAll();
219 setup("el");
220 indexLowerCaseTestCases();
221 assertHitCount(search(filter("\u03AC").locale(Locale.forLanguageTag("el"))).get(), 0);
222 assertSearchHits(search(filter("\u03B1").locale(Locale.forLanguageTag("el"))).get(), "greek");
223
224
225 client().admin().indices().prepareDelete("test").execute();
226 waitNoPendingTasksOnAll();
227 setup("ga");
228 indexLowerCaseTestCases();
229 assertHitCount(search(filter("nathair").locale(Locale.forLanguageTag("ga"))).get(), 0);
230
231
232
233
234
235
236
237
238
239
240
241 client().admin().indices().prepareDelete("test").execute();
242 waitNoPendingTasksOnAll();
243 setup("tr");
244 indexLowerCaseTestCases();
245 assertHitCount(search(filter("it").locale(Locale.forLanguageTag("tr"))).get(), 0);
246 assertSearchHits(search(filter("\u0131t").locale(Locale.forLanguageTag("tr"))).get(), "turkish");
247 }
248
249 public void indexLowerCaseTestCases() throws InterruptedException, ExecutionException {
250 indexRandom(true,
251
252
253
254
255 doc("greek", "\u03AC"),
256
257
258
259
260 doc("irish", "nAthair"),
261
262
263
264 doc("turkish", "It"));
265 }
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281 @Test
282 public void testSpecialTrigram() throws ExecutionException, InterruptedException, IOException {
283 setup();
284 indexRandom(true, doc("eszett", "außer"));
285 SearchResponse response = search(filter("außer").gramSize(3).ngramField("test.spectrigram").rejectUnaccelerated(true)).get();
286 assertSearchHits(response, "eszett");
287 }
288
289
290
291
292
293
294
295
296 public void accel() throws InterruptedException, ExecutionException, IOException {
297 String findText = " given as a subroutine for calculating ƒ, the cycle detection problem may be trivially solved using only λ+μ function applications";
298 String regex = "subroutine";
299 setup();
300 indexRandom(false, doc("findme", findText));
301 for (int i = 0; i < 20; i++) {
302 indexChaff(10000);
303 }
304
305 int rounds = 50;
306 long start = System.currentTimeMillis();
307 for (int i = 0; i < rounds; i++) {
308 SearchResponse response = search(new SourceRegexQueryBuilder("test", regex)).get();
309 assertSearchHits(response, "findme");
310 }
311 logger.info("Warmup: {}", (System.currentTimeMillis() - start) / rounds);
312
313 start = System.currentTimeMillis();
314 for (int i = 0; i < rounds; i++) {
315 SearchResponse response = search(new SourceRegexQueryBuilder("test", regex)).get();
316 assertSearchHits(response, "findme");
317 }
318 logger.info("No accel: {}", (System.currentTimeMillis() - start) / rounds);
319
320 start = System.currentTimeMillis();
321 for (int i = 0; i < rounds; i++) {
322 SearchResponse response = search(filter(regex)).get();
323 assertSearchHits(response, "findme");
324 }
325 logger.info("Accelerated: {}", (System.currentTimeMillis() - start) / rounds);
326 }
327
328 private IndexRequestBuilder doc(String id, String fieldValue) {
329 return client().prepareIndex("test", "test", id).setSource("test", fieldValue);
330 }
331
332 private void indexChaff(int count) throws InterruptedException, ExecutionException {
333 List<IndexRequestBuilder> docs = new ArrayList<>();
334 for (int i = 0; i < count; i++) {
335 docs.add(doc(Integer.toString(i), "chaff"));
336 }
337 indexRandom(true, docs);
338 }
339
340 private void deleteChaff(int count) throws InterruptedException, ExecutionException {
341 for (int i = 0; i < count; i++) {
342 client().prepareDelete("test", "test", Integer.toString(i)).get();
343 }
344 }
345
346 private SourceRegexQueryBuilder filter(String regex) {
347 SourceRegexQueryBuilder builder = new SourceRegexQueryBuilder("test", regex);
348 builder.ngramField("test.trigram");
349 return builder;
350 }
351
352 private SearchRequestBuilder search(SourceRegexQueryBuilder builder) {
353 return client().prepareSearch("test").setTypes("test").setQuery(builder);
354 }
355
356 private void setup() throws IOException {
357 setup("root");
358 }
359
360 private void setup(String locale) throws IOException {
361 XContentBuilder mapping = jsonBuilder().startObject();
362 mapping.startObject("test").startObject("properties");
363 mapping.startObject("test");
364 mapping.field("type", "text");
365 mapping.startObject("fields");
366 buildSubfield(mapping, "bigram");
367 buildSubfield(mapping, "trigram");
368 buildSubfield(mapping, "quadgram");
369 buildSubfield(mapping, "spectrigram");
370 mapping.endObject()
371 .endObject()
372 .endObject()
373 .endObject()
374 .endObject();
375
376 XContentBuilder settings = jsonBuilder().startObject().startObject("index");
377 settings.field("number_of_shards", 1);
378 settings.startObject("analysis");
379 settings.startObject("analyzer");
380 buildNgramAnalyzer(settings, "bigram", locale);
381 buildNgramAnalyzer(settings, "trigram", locale);
382 buildNgramAnalyzer(settings, "quadgram", locale);
383 buildNgramAnalyzer(settings, "spectrigram", locale, new String[]{"pattern"});
384
385 settings.endObject();
386 settings.startObject("tokenizer");
387 buildNgramTokenizer(settings, "bigram", 2);
388 buildNgramTokenizer(settings, "trigram", 3);
389 buildNgramTokenizer(settings, "spectrigram", 3);
390 buildNgramTokenizer(settings, "quadgram", 4);
391 settings.endObject();
392 settings.startObject("filter");
393 buildLowercaseFilter(settings, "greek");
394 buildLowercaseFilter(settings, "irish");
395 buildLowercaseFilter(settings, "turkish");
396 buildPatternFilter(settings);
397 settings.endObject();
398 settings.endObject().endObject().endObject();
399
400
401 assertAcked(prepareCreate("test").setSettings(settings).addMapping("test", mapping));
402 ensureYellow();
403 }
404
405 private void buildSubfield(XContentBuilder mapping, String name) throws IOException {
406 mapping.startObject(name);
407 mapping.field("type", "text");
408 mapping.field("analyzer", name);
409 mapping.endObject();
410 }
411
412 private void buildNgramAnalyzer(XContentBuilder settings, String name, String locale) throws IOException {
413 buildNgramAnalyzer(settings, name, locale, new String[]{});
414 }
415
416 private void buildNgramAnalyzer(XContentBuilder settings, String name, String locale, String[] extraFilters) throws IOException {
417 settings.startObject(name);
418 settings.field("type", "custom");
419 settings.field("tokenizer", name);
420 String[] filters = new String[1 + extraFilters.length];
421 filters[0] = lowercaseForLocale(locale);
422 System.arraycopy(extraFilters, 0, filters, 1, extraFilters.length);
423 settings.field("filter", filters);
424 settings.endObject();
425 }
426
427 private String lowercaseForLocale(String locale) {
428 switch (locale) {
429 case "el":
430 return "greek_lowercase";
431 case "ga":
432 return "irish_lowercase";
433 case "tr":
434 return "turkish_lowercase";
435 default:
436 return "lowercase";
437 }
438 }
439
440 private void buildNgramTokenizer(XContentBuilder settings, String name, int size) throws IOException {
441 settings.startObject(name);
442 settings.field("type", "nGram");
443 settings.field("min_gram", size);
444 settings.field("max_gram", size);
445 settings.endObject();
446 }
447
448 public void buildLowercaseFilter(XContentBuilder settings, String language) throws IOException {
449 settings.startObject(language + "_lowercase");
450 settings.field("type", "lowercase");
451 settings.field("language", language);
452 settings.endObject();
453 }
454
455 public void buildPatternFilter(XContentBuilder settings) throws IOException {
456 settings.startObject("pattern");
457 settings.field("type", "pattern_replace");
458 settings.field("pattern", "ß");
459 settings.field("replacement", "ss");
460 settings.endObject();
461 }
462 }