View Javadoc
1   package org.wikimedia.search.extra.analysis.textify;
2   
3   import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
4   import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
5   import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchHits;
6   
7   import java.io.IOException;
8   import java.util.Arrays;
9   import java.util.Collection;
10  import java.util.Collections;
11  import java.util.concurrent.ExecutionException;
12  
13  import org.elasticsearch.action.index.IndexRequestBuilder;
14  import org.elasticsearch.action.search.SearchRequestBuilder;
15  import org.elasticsearch.common.xcontent.XContentBuilder;
16  import org.elasticsearch.index.query.QueryBuilders;
17  import org.elasticsearch.plugins.Plugin;
18  import org.elasticsearch.test.ESIntegTestCase;
19  import org.elasticsearch.test.ESIntegTestCase.ClusterScope;
20  import org.junit.Before;
21  import org.junit.Test;
22  
23  @ClusterScope(scope = ESIntegTestCase.Scope.SUITE, transportClientRatio = 0.0)
24  public class ICUTokenRepairIntegrationTest extends ESIntegTestCase {
25  
26      String[] allFields = {"preconfig_field", "default_field", "merge_camel_field",
27          "merge_no_types_field", "merge_no_scripts_field", "no_num_field"};
28  
29      SearchRequestBuilder srchReqBldr;
30  
31      @Before
32      public void init() throws IOException, InterruptedException, ExecutionException {
33          XContentBuilder settings = jsonBuilder()
34                  .startObject()
35                      .field("number_of_shards", 1)
36                      .startObject("analysis")
37                          .startObject("filter")
38                              .startObject("default_icutokrep")
39                                  .field("type", "icu_token_repair")
40                              .endObject()
41                              .startObject("merge_camel_icutokrep")
42                                  .field("type", "icu_token_repair")
43                                  .field(ICUTokenRepairFilterFactory.KEEP_CAMEL_KEY, false)
44                              .endObject()
45                              .startObject("merge_no_types_icutokrep")
46                                  .field("type", "icu_token_repair")
47                                  .field(ICUTokenRepairFilterFactory.TYPE_PRESET_KEY, "none")
48                              .endObject()
49                              .startObject("merge_no_scripts_icutokrep")
50                                  .field("type", "icu_token_repair")
51                                  .field(ICUTokenRepairFilterFactory.SCRIPT_PRESET_KEY, "none")
52                              .endObject()
53                              .startObject("no_num_icutokrep")
54                                  .field("type", "icu_token_repair")
55                                  .array(ICUTokenRepairFilterFactory.DENY_TYPES_KEY, "<NUM>")
56                              .endObject()
57                          .endObject()
58                          .startObject("analyzer")
59                              .startObject("preconfig_analyzer")
60                                  .field("tokenizer", "textify_icu_tokenizer")
61                                  .array("filter", "icu_token_repair", "lowercase")
62                              .endObject()
63                              .startObject("default_analyzer")
64                                  .field("tokenizer", "textify_icu_tokenizer")
65                                  .array("filter", "default_icutokrep", "lowercase")
66                              .endObject()
67                              .startObject("merge_camel_analyzer")
68                                  .field("tokenizer", "textify_icu_tokenizer")
69                                  .array("filter", "merge_camel_icutokrep", "lowercase")
70                              .endObject()
71                              .startObject("merge_no_types_analyzer")
72                                  .field("tokenizer", "textify_icu_tokenizer")
73                                  .array("filter", "merge_no_types_icutokrep", "lowercase")
74                              .endObject()
75                              .startObject("merge_no_scripts_analyzer")
76                                  .field("tokenizer", "textify_icu_tokenizer")
77                                  .array("filter", "merge_no_scripts_icutokrep", "lowercase")
78                              .endObject()
79                              .startObject("no_num_analyzer")
80                                  .field("tokenizer", "textify_icu_tokenizer")
81                                  .array("filter", "no_num_icutokrep", "lowercase")
82                              .endObject()
83                          .endObject()
84                      .endObject()
85                  .endObject();
86  
87          XContentBuilder mapping = jsonBuilder()
88                  .startObject()
89                      .startObject("my_mapping")
90                          .startObject("properties")
91                              .startObject("preconfig_field")
92                                  .field("type", "text")
93                                  .field("analyzer", "preconfig_analyzer")
94                                  .field("similarity", "BM25")
95                              .endObject()
96                              .startObject("default_field")
97                                  .field("type", "text")
98                                  .field("analyzer", "default_analyzer")
99                                  .field("similarity", "BM25")
100                             .endObject()
101                             .startObject("merge_camel_field")
102                                 .field("type", "text")
103                                 .field("analyzer", "merge_camel_analyzer")
104                                 .field("similarity", "BM25")
105                             .endObject()
106                             .startObject("merge_no_types_field")
107                                 .field("type", "text")
108                                 .field("analyzer", "merge_no_types_analyzer")
109                                 .field("similarity", "BM25")
110                             .endObject()
111                             .startObject("merge_no_scripts_field")
112                                 .field("type", "text")
113                                 .field("analyzer", "merge_no_scripts_analyzer")
114                                 .field("similarity", "BM25")
115                             .endObject()
116                             .startObject("no_num_field")
117                                 .field("type", "text")
118                                 .field("analyzer", "no_num_analyzer")
119                                 .field("similarity", "BM25")
120                             .endObject()
121                         .endObject()
122                     .endObject()
123                 .endObject();
124 
125         assertAcked(prepareCreate("my_index").addMapping("my_mapping", mapping).setSettings(settings));
126         ensureGreen();
127         for (String f: allFields) {
128             indexRandom(true, doc(f, f + "-mixed", "ж 3x 5д"));
129             indexRandom(true, doc(f, f + "-latin", "3x"));
130             indexRandom(true, doc(f, f + "-cyrillic", "5д"));
131             indexRandom(true, doc(f, f + "-abc", "abcабгαβγ"));
132             indexRandom(true, doc(f, f + "-camel", "camelϚΛϞΣ"));
133             indexRandom(true, doc(f, f + "-notcamel", "camelϛλϟσ"));
134         }
135         refresh();
136 
137         srchReqBldr = client().prepareSearch("my_index");
138     }
139 
140     @Test
141     public void testTokenRepairDefaultConfig() {
142         for (String f: new String[] {"preconfig_field", "default_field"}) {
143             checkHits(f, "5д", f + "-mixed", f + "-cyrillic");
144             checkHits(f, "3x", f + "-mixed", f + "-latin");
145             checkHits(f, "αβγ");
146             checkHits(f, "camelϚΛϞΣ", f + "-camel");
147             checkHits(f, "camelϛλϟσ", f + "-notcamel");
148         }
149     }
150 
151     @Test
152     public void testTokenRepairCamelMerge() {
153         String f = "merge_camel_field";
154         checkHits(f, "5д", f + "-mixed", f + "-cyrillic");
155         checkHits(f, "3x", f + "-mixed", f + "-latin");
156         checkHits(f, "αβγ");
157         checkHits(f, "camelϚΛϞΣ", f + "-camel", f + "-notcamel");
158         checkHits(f, "camelϛλϟσ", f + "-camel", f + "-notcamel");
159     }
160 
161     @Test
162     public void testTokenRepairMergeNoTypes() {
163         String f = "merge_no_types_field";
164         checkHits(f, "5д", f + "-cyrillic");
165         checkHits(f, "3x", f + "-latin");
166         checkHits(f, "αβγ", f + "-abc");
167         checkHits(f, "camelϚΛϞΣ", f + "-camel", f + "-notcamel");
168         checkHits(f, "camelϛλϟσ", f + "-camel", f + "-notcamel");
169     }
170 
171     @Test
172     public void testTokenRepairMergeNoScripts() {
173         // plain <NUM> are still merged, regardless of script, unless <NUM> type is blocked
174         String f = "merge_no_scripts_field";
175         checkHits(f, "5д", f + "-cyrillic", f + "-mixed");
176         checkHits(f, "3x", f + "-latin", f + "-mixed");
177         checkHits(f, "αβγ", f + "-abc");
178         checkHits(f, "camelϚΛϞΣ", f + "-camel", f + "-notcamel");
179         checkHits(f, "camelϛλϟσ", f + "-camel", f + "-notcamel");
180     }
181 
182     @Test
183     public void testTokenRepairNoNum() {
184         String f = "no_num_field";
185         checkHits(f, "5д", f + "-cyrillic");
186         checkHits(f, "3x", f + "-latin");
187         checkHits(f, "αβγ");
188         checkHits(f, "camelϚΛϞΣ", f + "-camel");
189         checkHits(f, "camelϛλϟσ", f + "-notcamel");
190     }
191 
192     private IndexRequestBuilder doc(String field, String id, String fieldValue) {
193         return client().prepareIndex("my_index", "my_mapping", id).setSource(field, fieldValue);
194     }
195 
196     private void checkHits(String field, String query, String... resultIDs) {
197         assertSearchHits(srchReqBldr.setQuery(QueryBuilders.matchQuery(field, query)).get(),
198             resultIDs);
199     }
200 
201     @Override
202     protected Collection<Class<? extends Plugin>> nodePlugins() {
203         return Collections.<Class<? extends Plugin>>unmodifiableList(Arrays.asList(ExtraAnalysisTextifyPlugin.class));
204     }
205 
206 }