View Javadoc
1   package org.wikimedia.search.extra.analysis.textify;
2   
3   import static java.util.Collections.emptySet;
4   import static java.util.Collections.singleton;
5   import static org.wikimedia.search.extra.analysis.textify.ICUTokenRepairFilterTestUtils.testICUTokenization;
6   
7   import java.io.IOException;
8   import java.util.Arrays;
9   import java.util.Set;
10  import java.util.HashSet;
11  
12  import org.apache.lucene.analysis.BaseTokenStreamTestCase;
13  import org.junit.Test;
14  
15  public class ICUTokenRepairFilterConfigTest extends BaseTokenStreamTestCase {
16  
17      ICUTokenRepairFilterConfig cfg;
18  
19      @Test
20      public void testTokenLengthOptions() throws IOException {
21          // three strings are 40 characters long, making one 120-char token
22          // ICU default splits them up
23          // ICU repaired by default only allows 100-char tokens, so only the
24          // first two get rejoined
25          String latD40 = "dddddddddddddddddddddddddddddddddddddddd";
26          String cyrD40 = "дддддддддддддддддддддддддддддддддддддддд";
27          String grkD40 = "δδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδδ";
28          String mixD120 = latD40 + cyrD40 + grkD40;
29  
30          testICUTokenization(mixD120,                 // input
31              new String[]{latD40, cyrD40, grkD40},   // default tokens
32              new String[]{latD40 + cyrD40, grkD40}, // repaired tokens
33              new String[]{"Unknown", "Greek"},     // scripts
34              new String[]{"<ALPHANUM>"},          // types - all ALPHANUM
35              new int[]{0,   80},                 // start offsets
36              new int[]{80, 120},                // end offsets
37              new int[]{1,    1}                // pos increments
38          );
39  
40          // set max token length to 200 and get back one token
41          cfg = new ICUTokenRepairFilterConfig();
42          cfg.setMaxTokenLength(200);
43          testICUTokenization(mixD120, cfg,          // input & 200-char config
44              new String[]{latD40, cyrD40, grkD40}, // default tokens
45              new String[]{mixD120},               // repaired tokens
46              new String[]{"Unknown"},            // scripts
47              new String[]{"<ALPHANUM>"},        // types
48              new int[]{0},                     // start offsets
49              new int[]{120},                  // end offsets
50              new int[]{1}                    // pos increments
51          );
52  
53          // input is dдδdдδdдδdдδ.. for 150 characters
54          // skipping ICU default since it'd be 150 distinct characters
55          // max token len is 100 by default, so we still get 2 tokens
56          String[] dddArray = new String[50];
57          Arrays.fill(dddArray, "dдδ");
58          String ddd150 = String.join("", dddArray);
59          testICUTokenization(ddd150, // input
60              new String[]{ddd150.substring(0, 100), ddd150.substring(100)}, // repaired tokens
61              new String[]{"Unknown"}, // scripts - all Unknown
62              new String[]{"<ALPHANUM>"} // types - all ALPHANUM
63          );
64  
65          // set max token length to 200 and get back one token
66          testICUTokenization(ddd150, cfg,  // input & 200-char config
67              new String[]{ddd150},        // repaired tokens
68              new String[]{"Unknown"},    // scripts
69              new String[]{"<ALPHANUM>"} // types
70          );
71      }
72  
73      @Test
74      public void testCamelCaseOptions() throws IOException {
75          // "camel" is all Latin, "ϚΛϞΣ"/"ϛλϟε" is all Greek; plus combining diacritic,
76          // soft hyphen, and LTR mark in between them
77          String input = "NGiИX KoЯn camel̠­‎ϚΛϞΣ camel̠­‎ϛλϟε";
78  
79          // keep camelCase splits
80          boolean keepCamelSplits = true;
81  
82          cfg = new ICUTokenRepairFilterConfig();
83          cfg.setKeepCamelSplit(true);
84          testICUTokenization(input, cfg, // input & config
85              new String[]{"NGi", "ИX", "Ko", "Яn", "camel̠­‎", "ϚΛϞΣ", "camel̠­‎ϛλϟε"}, // repaired tokens
86              new String[]{"Latin", "Unknown", "Latin", "Unknown", "Latin", "Greek", "Unknown"}, // scripts
87              new String[]{"<ALPHANUM>"} // types - all ALPHANUM
88          );
89  
90          // don't keep camelCase splits
91          cfg.setKeepCamelSplit(false);
92          testICUTokenization(input, cfg, // input & config
93              new String[]{"NGiИX", "KoЯn", "camel̠­‎ϚΛϞΣ", "camel̠­‎ϛλϟε"}, // repaired tokens
94              new String[]{"Unknown"},   // scripts - all Unknown
95              new String[]{"<ALPHANUM>"} // types - all ALPHANUM
96          );
97      }
98  
99      @Test
100     public void testMergeNumOnlyOptions() throws IOException {
101         // 2١ = #|# split .. 2x = #|A split .. x١ = A|# split
102         // it can be hard to see, but..
103         // - first "3a" is separated by a soft-hyphen
104         // - second "3a" is separated by a combining umlaut (it renders on the 3)
105         // - third "3a" is separated by a left-to-right mark
106         // - last "3a" has all three!
107         String numInput = "x 2١ 2x x١ Ж 3­a Ж 3̈a Ж 3‎a Ж 3̈­‎a";
108 
109         cfg = new ICUTokenRepairFilterConfig();
110         // number repairs should be the same regardless of mergeNumOnly setting
111         boolean[] trueFalse = {true, false};
112         for (boolean mergeNumOnly : trueFalse) {
113             cfg.setMergeNumOnly(mergeNumOnly);
114             testICUTokenization(numInput, cfg,
115                 new String[]{"x", "2", "١", "2", "x", "x", "١",
116                     "Ж", "3­", "a", "Ж", "3̈", "a", "Ж", "3‎", "a", "Ж", "3̈­‎", "a"}, // default tokens
117                 new String[]{"x", "2١", "2x", "x١",
118                     "Ж", "3­a", "Ж", "3̈a", "Ж", "3‎a", "Ж", "3̈­‎a"}, // repaired tokens
119                 new String[]{"Latin", "Common", "Latin", "Latin", "Cyrillic", "Latin",
120                     "Cyrillic", "Latin", "Cyrillic", "Latin", "Cyrillic", "Latin"}, // scripts
121                 new String[]{"<ALPHANUM>", "<NUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
122                     "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
123                     "<ALPHANUM>", "<ALPHANUM>"} // types
124             );
125         }
126 
127         // Latin/Cyrillic/Greek x abc + 3x
128         String nonNumInput = "abcабгαβγ 3x";
129 
130         // repair everything
131         cfg.setMergeNumOnly(false);
132         testICUTokenization(nonNumInput, cfg,
133             new String[]{"abc", "абг", "αβγ", "3", "x"}, // default tokens
134             new String[]{"abcабгαβγ", "3x"}, // repaired tokens
135             new String[]{"Unknown", "Latin"}, // scripts
136             new String[]{"<ALPHANUM>"} // types
137         );
138 
139         // only repair numbers
140         cfg.setMergeNumOnly(true);
141         testICUTokenization(nonNumInput, cfg,
142             new String[]{"abc", "абг", "αβγ", "3", "x"}, // default tokens
143             new String[]{"abc", "абг", "αβγ", "3x"}, // repaired tokens
144             new String[]{"Latin", "Cyrillic", "Greek", "Latin"}, // scripts
145             new String[]{"<ALPHANUM>"} // types
146         );
147     }
148 
149     @Test
150     public void testTypeLimitOptions() throws IOException {
151         boolean makeDenyList = false; // create deny list
152         Set<Integer> emptyTypeSet = emptySet(); // deny nothing
153         Set<Integer> alphaTypeOnly =  // only deny ALPHANUM
154             singleton(TextifyUtils.TOKEN_TYPE_ALPHANUM);
155 
156         ICUTokenRepairFilterConfig repairAllCfg = new ICUTokenRepairFilterConfig();
157         repairAllCfg.setNoScriptLimits();
158         repairAllCfg.setNoTypeLimits();
159 
160         // by default, we don't merge EMOJI, IDEOGRAPHIC, or HANGUL types, but we can
161         // correct scripts for nearby "Common" tokens
162         String emoji = "Д☂D☀Δ";
163         testICUTokenization(emoji, // input
164             new String[]{"Д", "☂", "D", "☀", "Δ"}, // default tokens
165             new String[]{"Д", "☂", "D", "☀", "Δ"}, // repaired tokens
166             new String[]{"Cyrillic", "Common", "Latin", "Common", "Greek"}, // scripts
167             new String[]{"<ALPHANUM>", "<EMOJI>", "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>"} // types
168         );
169 
170         // repair **everything** - bad idea!
171         testICUTokenization(emoji, repairAllCfg,
172             new String[]{"Д", "☂D", "☀Δ"}, // repaired tokens — following emoji have same script type—
173                                            // ☂ is "Cyrillic" and ☀ is "Latin"—so they can't rejoin
174             new String[]{"Cyrillic", "Latin", "Greek"}, // scripts
175             new String[]{"<ALPHANUM>"} // types
176         );
177 
178         // block ALPHANUM
179         cfg = new ICUTokenRepairFilterConfig();
180         cfg.setTypeLimits(makeDenyList, alphaTypeOnly);
181         testICUTokenization(emoji, cfg,
182             new String[]{"Д", "☂", "D", "☀", "Δ"}, // repaired tokens - nothing happens
183             new String[]{"Cyrillic", "Common", "Latin", "Common", "Greek"}, // scripts
184             new String[]{"<ALPHANUM>", "<EMOJI>", "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>"} // types
185         );
186 
187         String chinese = "6年 X 8年"; // CJK gets split, correctly, at numbers
188         testICUTokenization(chinese, // input
189             new String[]{"6", "年", "X", "8", "年"}, // default tokens
190             new String[]{"6", "年", "X", "8", "年"}, // repaired tokens
191             new String[]{"Common", "Jpan", "Latin", "Common", "Jpan"}, // scripts
192                 // Tokens marked as "Chinese/Japanese" in explain output are internally "Jpan"
193                 // both getName() and getShortName() return "Jpan". Just roll with it.
194             new String[]{"<NUM>", "<IDEOGRAPHIC>", "<ALPHANUM>", "<NUM>", "<IDEOGRAPHIC>"} // types
195         );
196 
197         // repair **everything** - bad idea!
198         testICUTokenization(chinese, repairAllCfg,
199             new String[]{"6", "年", "X", "8年"}, // repaired tokens — 8 follows X so it is "Latin" and
200                                                 // can rejoin. "6" gets script "Jpan" and cannot rejoin
201             new String[]{"Common", "Jpan", "Latin", "Jpan"}, // scripts
202             new String[]{"<NUM>", "<IDEOGRAPHIC>", "<ALPHANUM>", "<IDEOGRAPHIC>"} // types
203         );
204 
205         String korean = "3년 X 7년"; // CJK gets split, correctly, at numbers
206         testICUTokenization(korean, // input
207             new String[]{"3", "년", "X", "7", "년"}, // default tokens
208             new String[]{"3", "년", "X", "7", "년"}, // repaired tokens
209             new String[]{"Common", "Hangul", "Latin", "Common", "Hangul"}, // scripts
210             new String[]{"<NUM>", "<HANGUL>", "<ALPHANUM>", "<NUM>", "<HANGUL>"} // types
211         );
212 
213         // repair **everything** - bad idea!
214         testICUTokenization(korean, repairAllCfg,
215             new String[]{"3", "년", "X", "7년"}, // repaired tokens — 7 follows X so it is "Latin" and
216                                                 // can rejoin. "3" gets script "Hangul" and cannot rejoin
217             new String[]{"Common", "Hangul", "Latin", "Hangul"}, // scripts
218             new String[]{"<NUM>", "<HANGUL>", "<ALPHANUM>", "<HANGUL>"} // types
219         );
220 
221         String mixedCjkJa = "び帆布カバン"; // correctly split at script boundaries
222         testICUTokenization(mixedCjkJa, // input
223             new String[]{"び", "帆布", "カバン"}, // default tokens
224             new String[]{"び", "帆布", "カバン"}, // repaired tokens
225             new String[]{"Jpan"}, // scripts - hiragana, katakana, and hanzi are all tagged as Japanese
226             new String[]{"<IDEOGRAPHIC>"} // types
227         );
228 
229         // repair **everything** - doesn't matter too much for mixed Japanese.. but don't do it!
230         testICUTokenization(mixedCjkJa, repairAllCfg,
231             new String[]{"び", "帆布", "カバン"}, // repaired tokens — all are Japanese so cannot rejoin
232             new String[]{"Jpan"}, // scripts
233             new String[]{"<IDEOGRAPHIC>"} // types
234         );
235 
236         String mixedCjkKo = "축구常備軍"; // correctly split at script boundaries
237         testICUTokenization(mixedCjkKo, // input
238             new String[]{"축구", "常備軍"}, // default tokens
239             new String[]{"축구", "常備軍"}, // repaired tokens
240             new String[]{"Hangul", "Jpan"}, // scripts
241             new String[]{"<HANGUL>", "<IDEOGRAPHIC>"} // types
242         );
243 
244         // repair **everything** - bad idea!
245         testICUTokenization(mixedCjkKo, repairAllCfg,
246             new String[]{mixedCjkKo}, // repaired tokens — "Hangul" + "Jpan" can rejoin
247             new String[]{"Unknown"}, // scripts
248             new String[]{"<OTHER>"} // types
249         );
250     }
251 
252     @Test
253     public void testMergeableTypesAllowOptions() throws IOException {
254         boolean makeAllowList = true; // create allow list..
255         Set<Integer> alphaTypeOnly =  // ..and only allow ALPHANUM (no NUM allowed!)
256             singleton(TextifyUtils.TOKEN_TYPE_ALPHANUM);
257 
258         String abc = "abcабгαβγ 3x 3χ 3ж";
259         testICUTokenization(abc, // input
260             new String[]{"abc", "абг", "αβγ", "3", "x", "3", "χ", "3", "ж"}, // default tokens
261             new String[]{"abcабгαβγ", "3x", "3χ", "3ж"}, // repaired tokens - default config
262             new String[]{"Unknown", "Latin", "Greek", "Cyrillic"}, // scripts
263             new String[]{"<ALPHANUM>"} // types
264         );
265 
266         // repair ALPHANUM only
267         cfg = new ICUTokenRepairFilterConfig();
268         cfg.setTypeLimits(makeAllowList, alphaTypeOnly);
269         testICUTokenization(abc, cfg,
270             new String[]{"abcабгαβγ", "3", "x", "3", "χ", "3", "ж"}, // repaired tokens — 3 can't rejoin!
271             new String[]{"Unknown", "Common", "Latin", "Common", "Greek", "Common", "Cyrillic"}, // scripts
272             new String[]{"<ALPHANUM>", "<NUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>",
273                 "<NUM>", "<ALPHANUM>"} // types
274         );
275 
276         makeAllowList = true; // create allow list..
277         Set<Integer> alphaHangulSet =  // ..allow ALPHANUM and HANGUL (why would you do that?!)
278             new HashSet<>(Arrays.asList(TextifyUtils.TOKEN_TYPE_ALPHANUM, TextifyUtils.TOKEN_TYPE_HANGUL));
279 
280         String veryMixedString = "abc한글абг한글3αβγ5x";
281         cfg = new ICUTokenRepairFilterConfig();
282         cfg.setNoScriptLimits();
283         testICUTokenization(veryMixedString, cfg, // input
284             new String[]{"abc", "한글", "абг", "한글", "3", "αβγ5", "x"}, // default tokens
285             new String[]{"abc", "한글", "абг", "한글", "3αβγ5x"}, // repaired tokens
286             new String[]{"Latin", "Hangul", "Cyrillic", "Hangul", "Unknown"}, // scripts
287             new String[]{"<ALPHANUM>", "<HANGUL>", "<ALPHANUM>", "<HANGUL>", "<ALPHANUM>"} // types
288         );
289 
290         // repair ALPHANUM & HANGUL only - so weird... note that ALPHANUM + HANGUL = ALPHANUM
291         cfg = new ICUTokenRepairFilterConfig();
292         cfg.setTypeLimits(makeAllowList, alphaHangulSet);
293         cfg.setNoScriptLimits();
294         testICUTokenization(veryMixedString, cfg,
295             new String[]{"abc한글абг한글", "3", "αβγ5x"}, // repaired tokens — 3 can't rejoin!
296             new String[]{"Unknown", "Common", "Unknown"}, // scripts
297             new String[]{"<ALPHANUM>", "<NUM>", "<ALPHANUM>"} // types
298         );
299     }
300 
301     @Test
302     public void testEmptyScriptLimits() throws IOException {
303         // test null and empty config options. These don't change the parsing of this
304         // example, but they probe for null pointer exceptions.
305         String input = "null test";
306         cfg = new ICUTokenRepairFilterConfig();
307 
308         cfg.setScriptLimits("");
309         testICUTokenization(input, cfg, // input & config
310             new String[]{"null", "test"}, // repaired tokens
311             new String[]{"Latin"}, // scripts
312             new String[]{"<ALPHANUM>"} // types - all ALPHANUM
313         );
314 
315         cfg.setScriptLimits(false, null);
316         testICUTokenization(input, cfg, // input & config
317             new String[]{"null", "test"}, // repaired tokens
318             new String[]{"Latin"}, // scripts
319             new String[]{"<ALPHANUM>"} // types - all ALPHANUM
320         );
321 
322         cfg.setScriptLimits(true, null);
323         testICUTokenization(input, cfg, // input & config
324             new String[]{"null", "test"}, // repaired tokens
325             new String[]{"Latin"}, // scripts
326             new String[]{"<ALPHANUM>"} // types - all ALPHANUM
327         );
328     }
329 
330     @Test
331     public void testScriptLimitOptions() throws IOException {
332         // Armenian, Coptic, Cyrillic, Greek, Latin
333         String[] tokens = {"աբգ", "ⲁⲃⲅ", "абг", "αβγ", "abc"};
334         String input = String.join("", tokens);
335 
336         testICUTokenization(input,         // input
337             tokens,                       // default tokens
338             new String[]{input},         // repaired token == original input
339             new String[]{"Unknown"},    // script
340             new String[]{"<ALPHANUM>"} // type
341         );
342 
343         cfg = new ICUTokenRepairFilterConfig();
344 
345         // don't include Coptic
346         cfg.setScriptLimits("Armenian+Cyrillic+Greek+Latin");
347         testICUTokenization(input, cfg, // input & config
348             new String[]{"աբգ", "ⲁⲃⲅ", "абгαβγabc"}, // repaired tokens
349             new String[]{"Armenian", "Coptic", "Unknown"}, // scripts
350             new String[]{"<ALPHANUM>"} // types - all ALPHANUM
351         );
352 
353         // don't include Coptic or Greek - which removes all matches
354         cfg.setScriptLimits("Armenian+Cyrillic+Latin");
355         testICUTokenization(input, cfg, // input & config
356             tokens, // repaired tokens == original tokens
357             new String[]{"Armenian", "Coptic", "Cyrillic", "Greek", "Latin"}, // scripts
358             new String[]{"<ALPHANUM>"} // types - all ALPHANUM
359         );
360 
361         // disallow all matches
362         cfg.setScriptLimits("");
363         testICUTokenization(input, cfg, // input & config
364             tokens, // repaired tokens == original tokens
365             new String[]{"Armenian", "Coptic", "Cyrillic", "Greek", "Latin"}, // scripts
366             new String[]{"<ALPHANUM>"} // types - all ALPHANUM
367         );
368 
369         // all pairwise matches to repair full string
370         cfg.setScriptLimits("Armenian+Coptic, Coptic+Cyrillic, Cyrillic+Greek, Greek+Latin");
371         testICUTokenization(input, cfg,   // input & config
372             new String[]{input},         // repaired token == original input
373             new String[]{"Unknown"},    // script
374             new String[]{"<ALPHANUM>"} // type
375         );
376 
377         // all pairwise matches in random order, no spaces
378         cfg.setScriptLimits("Greek+Latin,Cyrillic+Coptic,Armenian+Coptic,Greek+Cyrillic");
379         testICUTokenization(input, cfg,   // input & config
380             new String[]{input},         // repaired token == original input
381             new String[]{"Unknown"},    // script
382             new String[]{"<ALPHANUM>"} // type
383         );
384 
385         // big group in random order
386         cfg.setScriptLimits("Latin+Cyrillic+Armenian+Coptic+Greek");
387         testICUTokenization(input, cfg,   // input & config
388             new String[]{input},         // repaired token == original input
389             new String[]{"Unknown"},    // script
390             new String[]{"<ALPHANUM>"} // type
391         );
392     }
393 
394     @Test
395     public void testCJScriptLimitNames() throws IOException {
396         // Tokens marked as "Chinese/Japanese" in explain output are internally "Jpan"
397         // both getName() and getShortName() return "Jpan". Allow "Chinese/Japanese",
398         // "Chinese", and "Japanese" as alternatives to "Jpan" in config.
399 
400         // Hiragana, Hangul, Katakana, Hangul, Chinese, Hangul
401         String[] tokens = {"あ", "갠", "ア", "갠", "饳", "갠"};
402         String input = String.join("", tokens);
403 
404         testICUTokenization(input, // input
405             tokens, // default tokens (all separated)
406             tokens, // default doesn't repair <IDEOGRAPHIC> tokens
407             new String[]{"Jpan", "Hangul", "Jpan", "Hangul", "Jpan", "Hangul"},    // scripts
408             new String[]{"<IDEOGRAPHIC>", "<HANGUL>", "<IDEOGRAPHIC>", "<HANGUL>", "<IDEOGRAPHIC>",
409                 "<HANGUL>"} // types
410         );
411 
412         cfg = new ICUTokenRepairFilterConfig();
413         cfg.setNoTypeLimits();
414         cfg.setScriptLimits("Jpan+Hangul");
415         testICUTokenization(input, cfg, // input & config
416             new String[]{input},       // repaired token
417             new String[]{"Unknown"},  // script
418             new String[]{"<OTHER>"}  // type
419         );
420 
421         cfg.setScriptLimits("Chinese/Japanese+Hangul");
422         testICUTokenization(input, cfg, // input & config
423             new String[]{input},       // repaired token
424             new String[]{"Unknown"},  // script
425             new String[]{"<OTHER>"}  // type
426         );
427 
428         cfg.setScriptLimits("Chinese+Hangul");
429         testICUTokenization(input, cfg, // input & config
430             new String[]{input},       // repaired token
431             new String[]{"Unknown"},  // script
432             new String[]{"<OTHER>"}  // type
433         );
434 
435         cfg.setScriptLimits("Japanese+Hangul");
436         testICUTokenization(input, cfg, // input & config
437             new String[]{input},       // repaired token
438             new String[]{"Unknown"},  // script
439             new String[]{"<OTHER>"}  // type
440         );
441 
442     }
443 
444 }