1 package org.wikimedia.search.extra.analysis.textify;
2
3 import static java.util.Collections.emptySet;
4 import static java.util.Collections.singleton;
5 import static org.wikimedia.search.extra.analysis.textify.ICUTokenRepairFilterTestUtils.testICUTokenization;
6
7 import java.io.IOException;
8 import java.util.Arrays;
9 import java.util.Set;
10 import java.util.HashSet;
11
12 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
13 import org.junit.Test;
14
15 public class ICUTokenRepairFilterConfigTest extends BaseTokenStreamTestCase {
16
17 ICUTokenRepairFilterConfig cfg;
18
19 @Test
20 public void testTokenLengthOptions() throws IOException {
21
22
23
24
25 String latD40 = "dddddddddddddddddddddddddddddddddddddddd";
26 String cyrD40 = "дддддддддддддддддддддддддддддддддддддддд";
27 String grkD40 = "δδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδ";
28 String mixD120 = latD40 + cyrD40 + grkD40;
29
30 testICUTokenization(mixD120,
31 new String[]{latD40, cyrD40, grkD40},
32 new String[]{latD40 + cyrD40, grkD40},
33 new String[]{"Unknown", "Greek"},
34 new String[]{"<ALPHANUM>"},
35 new int[]{0, 80},
36 new int[]{80, 120},
37 new int[]{1, 1}
38 );
39
40
41 cfg = new ICUTokenRepairFilterConfig();
42 cfg.setMaxTokenLength(200);
43 testICUTokenization(mixD120, cfg,
44 new String[]{latD40, cyrD40, grkD40},
45 new String[]{mixD120},
46 new String[]{"Unknown"},
47 new String[]{"<ALPHANUM>"},
48 new int[]{0},
49 new int[]{120},
50 new int[]{1}
51 );
52
53
54
55
56 String[] dddArray = new String[50];
57 Arrays.fill(dddArray, "dдδ");
58 String ddd150 = String.join("", dddArray);
59 testICUTokenization(ddd150,
60 new String[]{ddd150.substring(0, 100), ddd150.substring(100)},
61 new String[]{"Unknown"},
62 new String[]{"<ALPHANUM>"}
63 );
64
65
66 testICUTokenization(ddd150, cfg,
67 new String[]{ddd150},
68 new String[]{"Unknown"},
69 new String[]{"<ALPHANUM>"}
70 );
71 }
72
73 @Test
74 public void testCamelCaseOptions() throws IOException {
75
76
77 String input = "NGiИX KoЯn camel̠ϚΛϞΣ camel̠ϛλϟε";
78
79
80 boolean keepCamelSplits = true;
81
82 cfg = new ICUTokenRepairFilterConfig();
83 cfg.setKeepCamelSplit(true);
84 testICUTokenization(input, cfg,
85 new String[]{"NGi", "ИX", "Ko", "Яn", "camel̠", "ϚΛϞΣ", "camel̠ϛλϟε"},
86 new String[]{"Latin", "Unknown", "Latin", "Unknown", "Latin", "Greek", "Unknown"},
87 new String[]{"<ALPHANUM>"}
88 );
89
90
91 cfg.setKeepCamelSplit(false);
92 testICUTokenization(input, cfg,
93 new String[]{"NGiИX", "KoЯn", "camel̠ϚΛϞΣ", "camel̠ϛλϟε"},
94 new String[]{"Unknown"},
95 new String[]{"<ALPHANUM>"}
96 );
97 }
98
99 @Test
100 public void testMergeNumOnlyOptions() throws IOException {
101
102
103
104
105
106
107 String numInput = "x 2١ 2x x١ Ж 3a Ж 3̈a Ж 3a Ж 3̈a";
108
109 cfg = new ICUTokenRepairFilterConfig();
110
111 boolean[] trueFalse = {true, false};
112 for (boolean mergeNumOnly : trueFalse) {
113 cfg.setMergeNumOnly(mergeNumOnly);
114 testICUTokenization(numInput, cfg,
115 new String[]{"x", "2", "١", "2", "x", "x", "١",
116 "Ж", "3", "a", "Ж", "3̈", "a", "Ж", "3", "a", "Ж", "3̈", "a"},
117 new String[]{"x", "2١", "2x", "x١",
118 "Ж", "3a", "Ж", "3̈a", "Ж", "3a", "Ж", "3̈a"},
119 new String[]{"Latin", "Common", "Latin", "Latin", "Cyrillic", "Latin",
120 "Cyrillic", "Latin", "Cyrillic", "Latin", "Cyrillic", "Latin"},
121 new String[]{"<ALPHANUM>", "<NUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
122 "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
123 "<ALPHANUM>", "<ALPHANUM>"}
124 );
125 }
126
127
128 String nonNumInput = "abcабгαβγ 3x";
129
130
131 cfg.setMergeNumOnly(false);
132 testICUTokenization(nonNumInput, cfg,
133 new String[]{"abc", "абг", "αβγ", "3", "x"},
134 new String[]{"abcабгαβγ", "3x"},
135 new String[]{"Unknown", "Latin"},
136 new String[]{"<ALPHANUM>"}
137 );
138
139
140 cfg.setMergeNumOnly(true);
141 testICUTokenization(nonNumInput, cfg,
142 new String[]{"abc", "абг", "αβγ", "3", "x"},
143 new String[]{"abc", "абг", "αβγ", "3x"},
144 new String[]{"Latin", "Cyrillic", "Greek", "Latin"},
145 new String[]{"<ALPHANUM>"}
146 );
147 }
148
149 @Test
150 public void testTypeLimitOptions() throws IOException {
151 boolean makeDenyList = false;
152 Set<Integer> emptyTypeSet = emptySet();
153 Set<Integer> alphaTypeOnly =
154 singleton(TextifyUtils.TOKEN_TYPE_ALPHANUM);
155
156 ICUTokenRepairFilterConfig repairAllCfg = new ICUTokenRepairFilterConfig();
157 repairAllCfg.setNoScriptLimits();
158 repairAllCfg.setNoTypeLimits();
159
160
161
162 String emoji = "Д☂D☀Δ";
163 testICUTokenization(emoji,
164 new String[]{"Д", "☂", "D", "☀", "Δ"},
165 new String[]{"Д", "☂", "D", "☀", "Δ"},
166 new String[]{"Cyrillic", "Common", "Latin", "Common", "Greek"},
167 new String[]{"<ALPHANUM>", "<EMOJI>", "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>"}
168 );
169
170
171 testICUTokenization(emoji, repairAllCfg,
172 new String[]{"Д", "☂D", "☀Δ"},
173
174 new String[]{"Cyrillic", "Latin", "Greek"},
175 new String[]{"<ALPHANUM>"}
176 );
177
178
179 cfg = new ICUTokenRepairFilterConfig();
180 cfg.setTypeLimits(makeDenyList, alphaTypeOnly);
181 testICUTokenization(emoji, cfg,
182 new String[]{"Д", "☂", "D", "☀", "Δ"},
183 new String[]{"Cyrillic", "Common", "Latin", "Common", "Greek"},
184 new String[]{"<ALPHANUM>", "<EMOJI>", "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>"}
185 );
186
187 String chinese = "6年 X 8年";
188 testICUTokenization(chinese,
189 new String[]{"6", "年", "X", "8", "年"},
190 new String[]{"6", "年", "X", "8", "年"},
191 new String[]{"Common", "Jpan", "Latin", "Common", "Jpan"},
192
193
194 new String[]{"<NUM>", "<IDEOGRAPHIC>", "<ALPHANUM>", "<NUM>", "<IDEOGRAPHIC>"}
195 );
196
197
198 testICUTokenization(chinese, repairAllCfg,
199 new String[]{"6", "年", "X", "8年"},
200
201 new String[]{"Common", "Jpan", "Latin", "Jpan"},
202 new String[]{"<NUM>", "<IDEOGRAPHIC>", "<ALPHANUM>", "<IDEOGRAPHIC>"}
203 );
204
205 String korean = "3년 X 7년";
206 testICUTokenization(korean,
207 new String[]{"3", "년", "X", "7", "년"},
208 new String[]{"3", "년", "X", "7", "년"},
209 new String[]{"Common", "Hangul", "Latin", "Common", "Hangul"},
210 new String[]{"<NUM>", "<HANGUL>", "<ALPHANUM>", "<NUM>", "<HANGUL>"}
211 );
212
213
214 testICUTokenization(korean, repairAllCfg,
215 new String[]{"3", "년", "X", "7년"},
216
217 new String[]{"Common", "Hangul", "Latin", "Hangul"},
218 new String[]{"<NUM>", "<HANGUL>", "<ALPHANUM>", "<HANGUL>"}
219 );
220
221 String mixedCjkJa = "び帆布カバン";
222 testICUTokenization(mixedCjkJa,
223 new String[]{"び", "帆布", "カバン"},
224 new String[]{"び", "帆布", "カバン"},
225 new String[]{"Jpan"},
226 new String[]{"<IDEOGRAPHIC>"}
227 );
228
229
230 testICUTokenization(mixedCjkJa, repairAllCfg,
231 new String[]{"び", "帆布", "カバン"},
232 new String[]{"Jpan"},
233 new String[]{"<IDEOGRAPHIC>"}
234 );
235
236 String mixedCjkKo = "축구常備軍";
237 testICUTokenization(mixedCjkKo,
238 new String[]{"축구", "常備軍"},
239 new String[]{"축구", "常備軍"},
240 new String[]{"Hangul", "Jpan"},
241 new String[]{"<HANGUL>", "<IDEOGRAPHIC>"}
242 );
243
244
245 testICUTokenization(mixedCjkKo, repairAllCfg,
246 new String[]{mixedCjkKo},
247 new String[]{"Unknown"},
248 new String[]{"<OTHER>"}
249 );
250 }
251
252 @Test
253 public void testMergeableTypesAllowOptions() throws IOException {
254 boolean makeAllowList = true;
255 Set<Integer> alphaTypeOnly =
256 singleton(TextifyUtils.TOKEN_TYPE_ALPHANUM);
257
258 String abc = "abcабгαβγ 3x 3χ 3ж";
259 testICUTokenization(abc,
260 new String[]{"abc", "абг", "αβγ", "3", "x", "3", "χ", "3", "ж"},
261 new String[]{"abcабгαβγ", "3x", "3χ", "3ж"},
262 new String[]{"Unknown", "Latin", "Greek", "Cyrillic"},
263 new String[]{"<ALPHANUM>"}
264 );
265
266
267 cfg = new ICUTokenRepairFilterConfig();
268 cfg.setTypeLimits(makeAllowList, alphaTypeOnly);
269 testICUTokenization(abc, cfg,
270 new String[]{"abcабгαβγ", "3", "x", "3", "χ", "3", "ж"},
271 new String[]{"Unknown", "Common", "Latin", "Common", "Greek", "Common", "Cyrillic"},
272 new String[]{"<ALPHANUM>", "<NUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>",
273 "<NUM>", "<ALPHANUM>"}
274 );
275
276 makeAllowList = true;
277 Set<Integer> alphaHangulSet =
278 new HashSet<>(Arrays.asList(TextifyUtils.TOKEN_TYPE_ALPHANUM, TextifyUtils.TOKEN_TYPE_HANGUL));
279
280 String veryMixedString = "abc한글абг한글3αβγ5x";
281 cfg = new ICUTokenRepairFilterConfig();
282 cfg.setNoScriptLimits();
283 testICUTokenization(veryMixedString, cfg,
284 new String[]{"abc", "한글", "абг", "한글", "3", "αβγ5", "x"},
285 new String[]{"abc", "한글", "абг", "한글", "3αβγ5x"},
286 new String[]{"Latin", "Hangul", "Cyrillic", "Hangul", "Unknown"},
287 new String[]{"<ALPHANUM>", "<HANGUL>", "<ALPHANUM>", "<HANGUL>", "<ALPHANUM>"}
288 );
289
290
291 cfg = new ICUTokenRepairFilterConfig();
292 cfg.setTypeLimits(makeAllowList, alphaHangulSet);
293 cfg.setNoScriptLimits();
294 testICUTokenization(veryMixedString, cfg,
295 new String[]{"abc한글абг한글", "3", "αβγ5x"},
296 new String[]{"Unknown", "Common", "Unknown"},
297 new String[]{"<ALPHANUM>", "<NUM>", "<ALPHANUM>"}
298 );
299 }
300
301 @Test
302 public void testEmptyScriptLimits() throws IOException {
303
304
305 String input = "null test";
306 cfg = new ICUTokenRepairFilterConfig();
307
308 cfg.setScriptLimits("");
309 testICUTokenization(input, cfg,
310 new String[]{"null", "test"},
311 new String[]{"Latin"},
312 new String[]{"<ALPHANUM>"}
313 );
314
315 cfg.setScriptLimits(false, null);
316 testICUTokenization(input, cfg,
317 new String[]{"null", "test"},
318 new String[]{"Latin"},
319 new String[]{"<ALPHANUM>"}
320 );
321
322 cfg.setScriptLimits(true, null);
323 testICUTokenization(input, cfg,
324 new String[]{"null", "test"},
325 new String[]{"Latin"},
326 new String[]{"<ALPHANUM>"}
327 );
328 }
329
330 @Test
331 public void testScriptLimitOptions() throws IOException {
332
333 String[] tokens = {"աբգ", "ⲁⲃⲅ", "абг", "αβγ", "abc"};
334 String input = String.join("", tokens);
335
336 testICUTokenization(input,
337 tokens,
338 new String[]{input},
339 new String[]{"Unknown"},
340 new String[]{"<ALPHANUM>"}
341 );
342
343 cfg = new ICUTokenRepairFilterConfig();
344
345
346 cfg.setScriptLimits("Armenian+Cyrillic+Greek+Latin");
347 testICUTokenization(input, cfg,
348 new String[]{"աբգ", "ⲁⲃⲅ", "абгαβγabc"},
349 new String[]{"Armenian", "Coptic", "Unknown"},
350 new String[]{"<ALPHANUM>"}
351 );
352
353
354 cfg.setScriptLimits("Armenian+Cyrillic+Latin");
355 testICUTokenization(input, cfg,
356 tokens,
357 new String[]{"Armenian", "Coptic", "Cyrillic", "Greek", "Latin"},
358 new String[]{"<ALPHANUM>"}
359 );
360
361
362 cfg.setScriptLimits("");
363 testICUTokenization(input, cfg,
364 tokens,
365 new String[]{"Armenian", "Coptic", "Cyrillic", "Greek", "Latin"},
366 new String[]{"<ALPHANUM>"}
367 );
368
369
370 cfg.setScriptLimits("Armenian+Coptic, Coptic+Cyrillic, Cyrillic+Greek, Greek+Latin");
371 testICUTokenization(input, cfg,
372 new String[]{input},
373 new String[]{"Unknown"},
374 new String[]{"<ALPHANUM>"}
375 );
376
377
378 cfg.setScriptLimits("Greek+Latin,Cyrillic+Coptic,Armenian+Coptic,Greek+Cyrillic");
379 testICUTokenization(input, cfg,
380 new String[]{input},
381 new String[]{"Unknown"},
382 new String[]{"<ALPHANUM>"}
383 );
384
385
386 cfg.setScriptLimits("Latin+Cyrillic+Armenian+Coptic+Greek");
387 testICUTokenization(input, cfg,
388 new String[]{input},
389 new String[]{"Unknown"},
390 new String[]{"<ALPHANUM>"}
391 );
392 }
393
394 @Test
395 public void testCJScriptLimitNames() throws IOException {
396
397
398
399
400
401 String[] tokens = {"あ", "갠", "ア", "갠", "饳", "갠"};
402 String input = String.join("", tokens);
403
404 testICUTokenization(input,
405 tokens,
406 tokens,
407 new String[]{"Jpan", "Hangul", "Jpan", "Hangul", "Jpan", "Hangul"},
408 new String[]{"<IDEOGRAPHIC>", "<HANGUL>", "<IDEOGRAPHIC>", "<HANGUL>", "<IDEOGRAPHIC>",
409 "<HANGUL>"}
410 );
411
412 cfg = new ICUTokenRepairFilterConfig();
413 cfg.setNoTypeLimits();
414 cfg.setScriptLimits("Jpan+Hangul");
415 testICUTokenization(input, cfg,
416 new String[]{input},
417 new String[]{"Unknown"},
418 new String[]{"<OTHER>"}
419 );
420
421 cfg.setScriptLimits("Chinese/Japanese+Hangul");
422 testICUTokenization(input, cfg,
423 new String[]{input},
424 new String[]{"Unknown"},
425 new String[]{"<OTHER>"}
426 );
427
428 cfg.setScriptLimits("Chinese+Hangul");
429 testICUTokenization(input, cfg,
430 new String[]{input},
431 new String[]{"Unknown"},
432 new String[]{"<OTHER>"}
433 );
434
435 cfg.setScriptLimits("Japanese+Hangul");
436 testICUTokenization(input, cfg,
437 new String[]{input},
438 new String[]{"Unknown"},
439 new String[]{"<OTHER>"}
440 );
441
442 }
443
444 }