1 package org.wikimedia.search.extra.analysis.textify;
2
3 import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
4 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
5 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchHits;
6
7 import java.io.IOException;
8 import java.util.Arrays;
9 import java.util.Collection;
10 import java.util.Collections;
11 import java.util.concurrent.ExecutionException;
12
13 import org.elasticsearch.action.index.IndexRequestBuilder;
14 import org.elasticsearch.action.search.SearchRequestBuilder;
15 import org.elasticsearch.common.xcontent.XContentBuilder;
16 import org.elasticsearch.index.query.QueryBuilders;
17 import org.elasticsearch.plugins.Plugin;
18 import org.elasticsearch.test.ESIntegTestCase;
19 import org.elasticsearch.test.ESIntegTestCase.ClusterScope;
20 import org.junit.Before;
21 import org.junit.Test;
22
23 @ClusterScope(scope = ESIntegTestCase.Scope.SUITE, transportClientRatio = 0.0)
24 public class ICUTokenRepairIntegrationTest extends ESIntegTestCase {
25
26 String[] allFields = {"preconfig_field", "default_field", "merge_camel_field",
27 "merge_no_types_field", "merge_no_scripts_field", "no_num_field"};
28
29 SearchRequestBuilder srchReqBldr;
30
31 @Before
32 public void init() throws IOException, InterruptedException, ExecutionException {
33 XContentBuilder settings = jsonBuilder()
34 .startObject()
35 .field("number_of_shards", 1)
36 .startObject("analysis")
37 .startObject("filter")
38 .startObject("default_icutokrep")
39 .field("type", "icu_token_repair")
40 .endObject()
41 .startObject("merge_camel_icutokrep")
42 .field("type", "icu_token_repair")
43 .field(ICUTokenRepairFilterFactory.KEEP_CAMEL_KEY, false)
44 .endObject()
45 .startObject("merge_no_types_icutokrep")
46 .field("type", "icu_token_repair")
47 .field(ICUTokenRepairFilterFactory.TYPE_PRESET_KEY, "none")
48 .endObject()
49 .startObject("merge_no_scripts_icutokrep")
50 .field("type", "icu_token_repair")
51 .field(ICUTokenRepairFilterFactory.SCRIPT_PRESET_KEY, "none")
52 .endObject()
53 .startObject("no_num_icutokrep")
54 .field("type", "icu_token_repair")
55 .array(ICUTokenRepairFilterFactory.DENY_TYPES_KEY, "<NUM>")
56 .endObject()
57 .endObject()
58 .startObject("analyzer")
59 .startObject("preconfig_analyzer")
60 .field("tokenizer", "textify_icu_tokenizer")
61 .array("filter", "icu_token_repair", "lowercase")
62 .endObject()
63 .startObject("default_analyzer")
64 .field("tokenizer", "textify_icu_tokenizer")
65 .array("filter", "default_icutokrep", "lowercase")
66 .endObject()
67 .startObject("merge_camel_analyzer")
68 .field("tokenizer", "textify_icu_tokenizer")
69 .array("filter", "merge_camel_icutokrep", "lowercase")
70 .endObject()
71 .startObject("merge_no_types_analyzer")
72 .field("tokenizer", "textify_icu_tokenizer")
73 .array("filter", "merge_no_types_icutokrep", "lowercase")
74 .endObject()
75 .startObject("merge_no_scripts_analyzer")
76 .field("tokenizer", "textify_icu_tokenizer")
77 .array("filter", "merge_no_scripts_icutokrep", "lowercase")
78 .endObject()
79 .startObject("no_num_analyzer")
80 .field("tokenizer", "textify_icu_tokenizer")
81 .array("filter", "no_num_icutokrep", "lowercase")
82 .endObject()
83 .endObject()
84 .endObject()
85 .endObject();
86
87 XContentBuilder mapping = jsonBuilder()
88 .startObject()
89 .startObject("my_mapping")
90 .startObject("properties")
91 .startObject("preconfig_field")
92 .field("type", "text")
93 .field("analyzer", "preconfig_analyzer")
94 .field("similarity", "BM25")
95 .endObject()
96 .startObject("default_field")
97 .field("type", "text")
98 .field("analyzer", "default_analyzer")
99 .field("similarity", "BM25")
100 .endObject()
101 .startObject("merge_camel_field")
102 .field("type", "text")
103 .field("analyzer", "merge_camel_analyzer")
104 .field("similarity", "BM25")
105 .endObject()
106 .startObject("merge_no_types_field")
107 .field("type", "text")
108 .field("analyzer", "merge_no_types_analyzer")
109 .field("similarity", "BM25")
110 .endObject()
111 .startObject("merge_no_scripts_field")
112 .field("type", "text")
113 .field("analyzer", "merge_no_scripts_analyzer")
114 .field("similarity", "BM25")
115 .endObject()
116 .startObject("no_num_field")
117 .field("type", "text")
118 .field("analyzer", "no_num_analyzer")
119 .field("similarity", "BM25")
120 .endObject()
121 .endObject()
122 .endObject()
123 .endObject();
124
125 assertAcked(prepareCreate("my_index").addMapping("my_mapping", mapping).setSettings(settings));
126 ensureGreen();
127 for (String f: allFields) {
128 indexRandom(true, doc(f, f + "-mixed", "ж 3x 5д"));
129 indexRandom(true, doc(f, f + "-latin", "3x"));
130 indexRandom(true, doc(f, f + "-cyrillic", "5д"));
131 indexRandom(true, doc(f, f + "-abc", "abcабгαβγ"));
132 indexRandom(true, doc(f, f + "-camel", "camelϚΛϞΣ"));
133 indexRandom(true, doc(f, f + "-notcamel", "camelϛλϟσ"));
134 }
135 refresh();
136
137 srchReqBldr = client().prepareSearch("my_index");
138 }
139
140 @Test
141 public void testTokenRepairDefaultConfig() {
142 for (String f: new String[] {"preconfig_field", "default_field"}) {
143 checkHits(f, "5д", f + "-mixed", f + "-cyrillic");
144 checkHits(f, "3x", f + "-mixed", f + "-latin");
145 checkHits(f, "αβγ");
146 checkHits(f, "camelϚΛϞΣ", f + "-camel");
147 checkHits(f, "camelϛλϟσ", f + "-notcamel");
148 }
149 }
150
151 @Test
152 public void testTokenRepairCamelMerge() {
153 String f = "merge_camel_field";
154 checkHits(f, "5д", f + "-mixed", f + "-cyrillic");
155 checkHits(f, "3x", f + "-mixed", f + "-latin");
156 checkHits(f, "αβγ");
157 checkHits(f, "camelϚΛϞΣ", f + "-camel", f + "-notcamel");
158 checkHits(f, "camelϛλϟσ", f + "-camel", f + "-notcamel");
159 }
160
161 @Test
162 public void testTokenRepairMergeNoTypes() {
163 String f = "merge_no_types_field";
164 checkHits(f, "5д", f + "-cyrillic");
165 checkHits(f, "3x", f + "-latin");
166 checkHits(f, "αβγ", f + "-abc");
167 checkHits(f, "camelϚΛϞΣ", f + "-camel", f + "-notcamel");
168 checkHits(f, "camelϛλϟσ", f + "-camel", f + "-notcamel");
169 }
170
171 @Test
172 public void testTokenRepairMergeNoScripts() {
173
174 String f = "merge_no_scripts_field";
175 checkHits(f, "5д", f + "-cyrillic", f + "-mixed");
176 checkHits(f, "3x", f + "-latin", f + "-mixed");
177 checkHits(f, "αβγ", f + "-abc");
178 checkHits(f, "camelϚΛϞΣ", f + "-camel", f + "-notcamel");
179 checkHits(f, "camelϛλϟσ", f + "-camel", f + "-notcamel");
180 }
181
182 @Test
183 public void testTokenRepairNoNum() {
184 String f = "no_num_field";
185 checkHits(f, "5д", f + "-cyrillic");
186 checkHits(f, "3x", f + "-latin");
187 checkHits(f, "αβγ");
188 checkHits(f, "camelϚΛϞΣ", f + "-camel");
189 checkHits(f, "camelϛλϟσ", f + "-notcamel");
190 }
191
192 private IndexRequestBuilder doc(String field, String id, String fieldValue) {
193 return client().prepareIndex("my_index", "my_mapping", id).setSource(field, fieldValue);
194 }
195
196 private void checkHits(String field, String query, String... resultIDs) {
197 assertSearchHits(srchReqBldr.setQuery(QueryBuilders.matchQuery(field, query)).get(),
198 resultIDs);
199 }
200
201 @Override
202 protected Collection<Class<? extends Plugin>> nodePlugins() {
203 return Collections.<Class<? extends Plugin>>unmodifiableList(Arrays.asList(ExtraAnalysisTextifyPlugin.class));
204 }
205
206 }