1 package org.wikimedia.search.extra.analysis.textify;
2
3 import static org.junit.Assert.assertNotNull;
4 import static org.junit.Assert.assertNull;
5 import static org.wikimedia.search.extra.analysis.textify.ICUTokenRepairFilterTestUtils.testICUTokenization;
6 import static org.wikimedia.search.extra.analysis.textify.ICUTokenRepairFilterTestUtils.makeICUTokStream;
7
8 import java.io.IOException;
9
10 import org.apache.lucene.analysis.Analyzer;
11 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
12 import org.apache.lucene.analysis.TokenStream;
13 import org.apache.lucene.analysis.Tokenizer;
14 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
15 import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
16 import org.junit.Test;
17
18 public class ICUTokenRepairFilterTest extends BaseTokenStreamTestCase {
19
20 ICUTokenRepairFilterConfig cfg;
21
22 @Test
23 public void testUAX29Example() throws IOException {
24
25
26 testICUTokenization("Д 3a Д A3",
27 new String[]{"Д", "3", "a", "Д", "A3"},
28 new String[]{"Д", "3a", "Д", "A3"},
29 new String[]{"Cyrillic", "Latin", "Cyrillic", "Latin"},
30 new String[]{"<ALPHANUM>"},
31 new int[]{0, 2, 5, 7},
32 new int[]{1, 4, 6, 9},
33 new int[]{1, 1, 1, 1}
34 );
35 }
36
37 @Test
38 public void testHomoglyphExamples() throws IOException {
39
40 testICUTokenization("chocоlate Мoscow",
41 new String[]{"choc", "о", "late", "М", "oscow"},
42 new String[]{"chocоlate", "Мoscow"},
43 new String[]{"Unknown"},
44 new String[]{"<ALPHANUM>"},
45 new int[]{0, 10},
46 new int[]{9, 16},
47 new int[]{1, 1}
48 );
49 }
50
51 @Test
52 public void testBasicICUTokenRepair() throws IOException {
53
54 testICUTokenization("abcабгαβγ SWΛNKУ lιмιтed edιтιon",
55 new String[]{"abc", "абг", "αβγ", "SW", "Λ", "NK", "У",
56 "l", "ι", "м", "ι", "т", "ed", "ed", "ι", "т", "ι", "on"},
57 new String[]{"abcабгαβγ", "SWΛNKУ", "lιмιтed", "edιтιon"},
58 new String[]{"Unknown"},
59 new String[]{"<ALPHANUM>"},
60 new int[]{0, 10, 17, 25},
61 new int[]{9, 16, 24, 32},
62 new int[]{1, 1, 1, 1}
63 );
64 }
65
66 @Test
67 public void testNumberSplits() throws IOException {
68
69 testICUTokenization("3Q 3Ω 3Ω 3Д 3Д 3Q",
70 new String[]{"3Q", "3", "Ω", "3Ω", "3", "Д", "3Д", "3", "Q"},
71 new String[]{"3Q", "3Ω", "3Ω", "3Д", "3Д", "3Q"},
72 new String[]{"Latin", "Greek", "Greek", "Cyrillic",
73 "Cyrillic", "Latin"},
74 new String[]{"<ALPHANUM>"},
75 new int[]{0, 3, 6, 9, 12, 15},
76 new int[]{2, 5, 8, 11, 14, 17},
77 new int[]{1, 1, 1, 1, 1, 1}
78 );
79
80 testICUTokenization("3Q3 3Ω3",
81 new String[]{"3Q3", "3", "Ω3"},
82 new String[]{"3Q3", "3Ω3"},
83 new String[]{"Latin", "Greek"},
84 new String[]{"<ALPHANUM>"},
85 new int[]{0, 4},
86 new int[]{3, 7},
87 new int[]{1, 1}
88 );
89
90
91 testICUTokenization("3Q 1234567890 1234567890 3Ω",
92 new String[]{"3Q", "1234567890", "1234567890", "3", "Ω"},
93 new String[]{"3Q", "1234567890", "1234567890", "3Ω"},
94 new String[]{"Latin", "Common", "Common", "Greek"},
95 new String[]{"<ALPHANUM>", "<NUM>", "<NUM>", "<ALPHANUM>"},
96 new int[]{0, 3, 14, 25},
97 new int[]{2, 13, 24, 27},
98 new int[]{1, 1, 1, 1}
99 );
100
101
102 testICUTokenization("२২੨૨᠒᥈߂᧒᭒",
103 new String[]{"२", "২", "੨", "૨", "᠒", "᥈", "߂", "᧒", "᭒"},
104 new String[]{"२২੨૨᠒᥈߂᧒᭒"},
105 new String[]{"Common"},
106 new String[]{"<NUM>"}
107 );
108
109
110 testICUTokenization("xx 123ក",
111 new String[]{"xx", "123", "ក"},
112 new String[]{"xx", "123ក"},
113 new String[]{"Latin", "Khmer"},
114 new String[]{"<ALPHANUM>"}
115 );
116
117
118 testICUTokenization("xx123ក",
119 new String[]{"xx123", "ក"},
120 new String[]{"xx123", "ក"},
121 new String[]{"Latin", "Khmer"},
122 new String[]{"<ALPHANUM>"}
123 );
124 }
125
126 @Test
127 public void testMegaMultiScriptExample() throws IOException {
128 String[] multiScriptChars = {
129
130 "d", "ϗ", "ࡃ", "ࠄ", "߄", "ޔ", "ئ", "ח",
131
132 "д", "դ", "ऄ", "অ", "ਖ", "ખ", "କ", "ழ",
133
134 "న", "ತ", "ക", "ඕ", "ณ", "ທ", "ဖ", "ⴔ",
135
136 "ቁ", "ꭳ", "ᐕ", "ᚅ", "ᚥ"
137 };
138
139 String megaToken = String.join("", multiScriptChars);
140 cfg = new ICUTokenRepairFilterConfig();
141 cfg.setNoScriptLimits();
142 testICUTokenization(megaToken, cfg,
143 multiScriptChars,
144 new String[]{megaToken},
145 new String[]{"Unknown"},
146 new String[]{"<ALPHANUM>"}
147 );
148 }
149
150 @Test
151 public void testSurrogateCamelCase() throws IOException {
152
153
154
155
156
157
158
159 String surrogateCamel = "𐐀𐐩𐐪Abc Abc𐐀𐐩𐐪 𐐨𐐩𐐪abc abc𐐨𐐩𐐪";
160
161
162 cfg = new ICUTokenRepairFilterConfig();
163 cfg.setNoScriptLimits();
164 testICUTokenization(surrogateCamel, cfg,
165 new String[]{"𐐀𐐩𐐪", "Abc", "Abc", "𐐀𐐩𐐪", "𐐨𐐩𐐪", "abc", "abc", "𐐨𐐩𐐪"},
166 new String[]{"𐐀𐐩𐐪", "Abc", "Abc", "𐐀𐐩𐐪", "𐐨𐐩𐐪abc", "abc𐐨𐐩𐐪"},
167 new String[]{"Deseret", "Latin", "Latin", "Deseret", "Unknown", "Unknown"},
168 new String[]{"<ALPHANUM>"}
169 );
170
171
172 cfg.setKeepCamelSplit(false);
173 testICUTokenization(surrogateCamel, cfg,
174 new String[]{"𐐀𐐩𐐪Abc", "Abc𐐀𐐩𐐪", "𐐨𐐩𐐪abc", "abc𐐨𐐩𐐪"},
175 new String[]{"Unknown"},
176 new String[]{"<ALPHANUM>"}
177 );
178 }
179
180 @Test
181 public void testSurrogateDigits() throws IOException {
182
183
184
185
186
187 String surrogateDigits = "a𑛃𑁩z";
188
189 cfg = new ICUTokenRepairFilterConfig();
190 cfg.setNoScriptLimits();
191
192
193 testICUTokenization(surrogateDigits, cfg,
194 new String[]{"a", "𑛃", "𑁩", "z"},
195 new String[]{surrogateDigits},
196 new String[]{"Unknown"},
197 new String[]{"<ALPHANUM>"}
198 );
199
200
201 cfg.setMergeNumOnly(true);
202 testICUTokenization(surrogateDigits, cfg,
203 new String[]{surrogateDigits},
204 new String[]{"Unknown"},
205 new String[]{"<ALPHANUM>"}
206 );
207 }
208
209
210
211
212 @Test
213 public void testMiscMonoscriptTokens() throws IOException {
214
215
216 String[] toks = {"Wikipedia", "Википедию", "Βικιπαίδεια", "Վիքիպեդիա"};
217 testICUTokenization(String.join(" ", toks),
218 toks,
219 toks,
220 new String[]{"Latin", "Cyrillic", "Greek", "Armenian"},
221 new String[]{"<ALPHANUM>"}
222 );
223
224
225 toks = new String[]{"ვიკიპედია", "विकिपीडिया", "விக்கிப்பீடியா", "উইকিপিডিয়া"};
226 testICUTokenization(String.join(" ", toks), toks, toks,
227 new String[]{"Georgian", "Devanagari", "Tamil", "Bengali"},
228 new String[]{"<ALPHANUM>"}
229 );
230 }
231
232
233
234
235 @Test
236 public void testAvoidAddingScriptAttributes() throws IOException {
237
238 String testInput = "chocоlate Мoscow SWΛNKУ lιмιтed edιтιon NGiИX KoЯn";
239
240
241 TokenStream ts = makeICUTokStream(testInput);
242 ScriptAttribute scriptAtt = ts.getAttribute(ScriptAttribute.class);
243 assertNotNull(scriptAtt);
244
245
246 Analyzer ana = new Analyzer() {
247 @Override
248 protected TokenStreamComponents createComponents(String fieldName) {
249 Tokenizer tok = new WhitespaceTokenizer();
250 TokenStream ts = new ICUTokenRepairFilter(tok);
251 return new TokenStreamComponents(tok, ts);
252 }
253 };
254 ts = ana.tokenStream("", testInput);
255 scriptAtt = ts.getAttribute(ScriptAttribute.class);
256 assertNull(scriptAtt);
257 }
258
259 }