View Javadoc
1   package org.wikimedia.search.extra.analysis.textify;
2   
3   import static org.junit.Assert.assertNotNull;
4   import static org.junit.Assert.assertNull;
5   import static org.wikimedia.search.extra.analysis.textify.ICUTokenRepairFilterTestUtils.testICUTokenization;
6   import static org.wikimedia.search.extra.analysis.textify.ICUTokenRepairFilterTestUtils.makeICUTokStream;
7   
8   import java.io.IOException;
9   
10  import org.apache.lucene.analysis.Analyzer;
11  import org.apache.lucene.analysis.BaseTokenStreamTestCase;
12  import org.apache.lucene.analysis.TokenStream;
13  import org.apache.lucene.analysis.Tokenizer;
14  import org.apache.lucene.analysis.core.WhitespaceTokenizer;
15  import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
16  import org.junit.Test;
17  
/**
 * Tests for {@code ICUTokenRepairFilter}, which rejoins tokens that the ICU
 * tokenizer splits at script boundaries — e.g., Latin/Cyrillic homoglyph words
 * ("chocоlate") or digits adjacent to letters of another script. Each test
 * feeds an input string through {@code testICUTokenization(...)}, asserting
 * the default (split) tokens, the repaired tokens, and the expected script,
 * type, offset, and position-increment attributes.
 */
public class ICUTokenRepairFilterTest extends BaseTokenStreamTestCase {

    // Repair configuration; reassigned at the start of each test that needs
    // non-default settings, so no state is shared between tests.
    ICUTokenRepairFilterConfig cfg;

    @Test
    public void testUAX29Example() throws IOException {
        // UAX #29 example: Do not break within sequences of digits, or digits adjacent
        // to letters (“3a”, or “A3”).
        testICUTokenization("Д 3a Д A3", // input
            new String[]{"Д", "3", "a", "Д", "A3"}, // default tokens
            new String[]{"Д", "3a", "Д", "A3"},    // repaired tokens
            new String[]{"Cyrillic", "Latin", "Cyrillic", "Latin"}, // scripts
            new String[]{"<ALPHANUM>"}, // types - all ALPHANUM
            new int[]{0, 2, 5, 7}, // start offsets
            new int[]{1, 4, 6, 9}, // end offsets
            new int[]{1, 1, 1, 1}  // pos increments
        );
    }

    @Test
    public void testHomoglyphExamples() throws IOException {
        // Cyrillic о in choc*о*late, Cyrillic М in Мoscow
        testICUTokenization("chocоlate Мoscow", // input
            new String[]{"choc", "о", "late", "М", "oscow"}, // default tokens
            new String[]{"chocоlate", "Мoscow"}, // repaired tokens
            new String[]{"Unknown"},     // scripts - all Unknown (mixed-script tokens)
            new String[]{"<ALPHANUM>"}, // types - all ALPHANUM
            new int[]{0, 10},          // start offsets
            new int[]{9, 16},         // end offsets
            new int[]{1,  1}         // pos increments
        );
    }

    @Test
    public void testBasicICUTokenRepair() throws IOException {
        // Latin/Cyrillic/Greek ABC; plus intentional examples from enwiki
        testICUTokenization("abcабгαβγ SWΛNKУ lιмιтed edιтιon", // input
            new String[]{"abc", "абг", "αβγ", "SW", "Λ", "NK", "У",
                "l", "ι", "м", "ι", "т", "ed", "ed", "ι", "т", "ι", "on"}, // default tokens
            new String[]{"abcабгαβγ", "SWΛNKУ", "lιмιтed", "edιтιon"},    // repaired tokens
            new String[]{"Unknown"},     // scripts - all Unknown (mixed-script tokens)
            new String[]{"<ALPHANUM>"}, // types - all ALPHANUM
            new int[]{0, 10, 17, 25},  // start offsets
            new int[]{9, 16, 24, 32}, // end offsets
            new int[]{1,  1,  1,  1} // pos increments
        );
    }

    @Test
    public void testNumberSplits() throws IOException {
        // earlier character sets cause later splits after whitespace!
        // (the ICU tokenizer's split behavior for digit+letter depends on
        // which scripts it has already seen in the stream)
        testICUTokenization("3Q 3Ω 3Ω 3Д 3Д 3Q", // input
            new String[]{"3Q", "3", "Ω", "3Ω", "3", "Д", "3Д", "3", "Q"}, // default tokens
            new String[]{"3Q", "3Ω", "3Ω", "3Д", "3Д", "3Q"}, // repaired tokens
            new String[]{"Latin", "Greek", "Greek", "Cyrillic",
                "Cyrillic", "Latin"},       // scripts
            new String[]{"<ALPHANUM>"},     // types - all ALPHANUM
            new int[]{0, 3, 6,  9, 12, 15}, // start offsets
            new int[]{2, 5, 8, 11, 14, 17}, // end offsets
            new int[]{1, 1, 1,  1,  1,  1}  // pos increments
        );

        testICUTokenization("3Q3 3Ω3",         // input
            new String[]{"3Q3", "3", "Ω3"},   // default tokens
            new String[]{"3Q3", "3Ω3"},      // repaired tokens
            new String[]{"Latin", "Greek"}, // scripts
            new String[]{"<ALPHANUM>"},    // types - all ALPHANUM
            new int[]{0, 4},              // start offsets
            new int[]{3, 7},             // end offsets
            new int[]{1, 1}             // pos increments
        );

        // longer distance example: intervening <NUM> tokens don't block repair
        testICUTokenization("3Q 1234567890 1234567890 3Ω",              // input
            new String[]{"3Q", "1234567890", "1234567890", "3", "Ω"},   // default tokens
            new String[]{"3Q", "1234567890", "1234567890", "3Ω"},       // repaired tokens
            new String[]{"Latin", "Common", "Common", "Greek"},         // scripts
            new String[]{"<ALPHANUM>", "<NUM>", "<NUM>", "<ALPHANUM>"}, // types
            new int[]{0,  3, 14, 25},                                   // start offsets
            new int[]{2, 13, 24, 27},                                   // end offsets
            new int[]{1,  1,  1,  1}                                    // pos increments
        );

        // digit 2 in many scripts — all rejoin into one Common-script <NUM> token
        testICUTokenization("२২੨૨᠒᥈߂᧒᭒",                               // input
            new String[]{"२", "২", "੨", "૨", "᠒", "᥈", "߂", "᧒", "᭒"}, // default tokens
            new String[]{"२২੨૨᠒᥈߂᧒᭒"},                                 // repaired tokens
            new String[]{"Common"},                                   // scripts
            new String[]{"<NUM>"}                                     // types
        );

        // "123" is <NUM> and should be rejoined with ក
        testICUTokenization("xx 123ក",         // input
            new String[]{"xx", "123", "ក"},   // default tokens
            new String[]{"xx", "123ក"},      // repaired tokens
            new String[]{"Latin", "Khmer"}, // scripts
            new String[]{"<ALPHANUM>"}     // types
        );

        // "xx123" is not <NUM>, should be repaired to <ALPHANUM>, and blocked from joining ក
        testICUTokenization("xx123ក",          // input
            new String[]{"xx123", "ក"},       // default tokens
            new String[]{"xx123", "ក"},      // repaired tokens
            new String[]{"Latin", "Khmer"}, // scripts
            new String[]{"<ALPHANUM>"}     // types
        );
    }

    @Test
    public void testMegaMultiScriptExample() throws IOException {
        // One character from each of 29 scripts; with script limits disabled,
        // they should all be rejoined into a single token.
        String[] multiScriptChars = {
            // Latin, Greek, Mandaic, Samaritan, N'Ko, Thaana, Arabic, Hebrew,
            "d", "ϗ", "ࡃ", "ࠄ", "߄", "ޔ", "ئ", "ח",
            // Cyrillic, Armenian, Devanagari, Bengali, Gurmukhi, Gujarati, Oriya, Tamil,
            "д", "դ", "ऄ", "অ", "ਖ", "ખ", "କ", "ழ",
            // Telugu, Kannada, Malayalam, Sinhala, Thai, Lao, Myanmar, Georgian,
            "న", "ತ", "ക", "ඕ", "ณ", "ທ", "ဖ", "ⴔ",
            // Ethiopic, Cherokee, Canadian Syllabics, Ogham, Runic
            "ቁ", "ꭳ", "ᐕ", "ᚅ", "ᚥ"
        };

        String megaToken = String.join("", multiScriptChars); // all one token
        cfg = new ICUTokenRepairFilterConfig();
        cfg.setNoScriptLimits();
        testICUTokenization(megaToken, cfg, // input & config
            multiScriptChars, // default tokens — all separated!
            new String[]{megaToken}, // repaired tokens - all together!
            new String[]{"Unknown"}, // scripts
            new String[]{"<ALPHANUM>"} // types
        );
    }

    @Test
    public void testSurrogateCamelCase() throws IOException {
        // make sure 32-bit uppercase and lowercase characters are recognized as letters,
        // so camelCase detection works across surrogate pairs

        // 𐐀𐐩𐐪 is Deseret (U+10400-U+1044F), which is the only alphabet with 32-bit
        // characters that has upper and lowercase letters recognized by our current
        // version of Java (8), and which the current ICU tokenizer (8.7) does not mark
        // as "Common" script
        String surrogateCamel = "𐐀𐐩𐐪Abc Abc𐐀𐐩𐐪 𐐨𐐩𐐪abc abc𐐨𐐩𐐪";

        // defaults - camelCase should *not* be rejoined
        cfg = new ICUTokenRepairFilterConfig();
        cfg.setNoScriptLimits();
        testICUTokenization(surrogateCamel, cfg, // input & config
            new String[]{"𐐀𐐩𐐪", "Abc", "Abc", "𐐀𐐩𐐪", "𐐨𐐩𐐪", "abc", "abc", "𐐨𐐩𐐪"}, // default tokens
            new String[]{"𐐀𐐩𐐪", "Abc", "Abc", "𐐀𐐩𐐪", "𐐨𐐩𐐪abc", "abc𐐨𐐩𐐪"}, // repaired tokens
            new String[]{"Deseret", "Latin", "Latin", "Deseret", "Unknown", "Unknown"}, // scripts
            new String[]{"<ALPHANUM>"} // types
        );

        // don't preserve camelCase splits — now all four pairs rejoin
        cfg.setKeepCamelSplit(false);
        testICUTokenization(surrogateCamel, cfg, // input & config
            new String[]{"𐐀𐐩𐐪Abc", "Abc𐐀𐐩𐐪", "𐐨𐐩𐐪abc", "abc𐐨𐐩𐐪"}, // repaired tokens
            new String[]{"Unknown"}, // scripts
            new String[]{"<ALPHANUM>"} // types
        );
    }

    @Test
    public void testSurrogateDigits() throws IOException {
        // make sure 32-bit numbers are recognized as numbers

        // Latin a + Takri 3 (U+116C3) + Brahmi 3 (U+11069) + Latin z
        // These digits are carefully chosen to be 32-bit and recognized by our current
        // version of Java (8) as having a type of DECIMAL_DIGIT_NUMBER
        String surrogateDigits = "a𑛃𑁩z";

        cfg = new ICUTokenRepairFilterConfig();
        cfg.setNoScriptLimits();

        // defaults
        testICUTokenization(surrogateDigits, cfg, // input & config
            new String[]{"a", "𑛃", "𑁩", "z"}, // default tokens
            new String[]{surrogateDigits}, // repaired token
            new String[]{"Unknown"}, // scripts
            new String[]{"<ALPHANUM>"} // types
        );

        // only allow number splits to be merged — digits still rejoin letters here
        cfg.setMergeNumOnly(true);
        testICUTokenization(surrogateDigits, cfg, // input & config
            new String[]{surrogateDigits}, // repaired tokens
            new String[]{"Unknown"}, // scripts
            new String[]{"<ALPHANUM>"} // types
        );
    }

    /* Monoscript tokens in scripts that don't need segmentation should be unchanged. Mini-test
     * of the ICU tokenizer's script identification; Baseline test of testICUTokenization().
     */
    @Test
    public void testMiscMonoscriptTokens() throws IOException {
        // take the tokens, join them together, send them off to be split apart
        // and get the same tokens back, ICU repair or not
        String[] toks = {"Wikipedia", "Википедию", "Βικιπαίδεια", "Վիքիպեդիա"};
        testICUTokenization(String.join(" ", toks), // input
            toks, // default tokens
            toks, // repaired tokens
            new String[]{"Latin", "Cyrillic", "Greek", "Armenian"}, // scripts
            new String[]{"<ALPHANUM>"} // types - all ALPHANUM
        );

        // That was fun! Do it again!
        toks = new String[]{"ვიკიპედია", "विकिपीडिया", "விக்கிப்பீடியா", "উইকিপিডিয়া"};
        testICUTokenization(String.join(" ", toks), toks, toks,
            new String[]{"Georgian", "Devanagari", "Tamil", "Bengali"},
            new String[]{"<ALPHANUM>"} // types - all ALPHANUM
        );
    }

    /* We should avoid adding script attributes (with default values) to token streams
     * that don't already have them (i.e., because a non-ICU tokenizer was used).
     */
    @Test
    public void testAvoidAddingScriptAttributes() throws IOException {
        // Cyrillic о in chocоlate, Cyrillic М in Мoscow
        String testInput = "chocоlate Мoscow SWΛNKУ lιмιтed edιтιon NGiИX KoЯn";

        // ICU tokenizer -> ScriptAttribute *is not* null
        TokenStream ts = makeICUTokStream(testInput);
        ScriptAttribute scriptAtt = ts.getAttribute(ScriptAttribute.class);
        assertNotNull(scriptAtt);

        // Non-ICU tokenizer -> ScriptAttribute *is* null, even with repair
        Analyzer ana = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tok = new WhitespaceTokenizer(); // <= doesn't need repair!
                TokenStream ts = new ICUTokenRepairFilter(tok);
                return new TokenStreamComponents(tok, ts);
            }
        };
        ts = ana.tokenStream("", testInput);
        scriptAtt = ts.getAttribute(ScriptAttribute.class);
        assertNull(scriptAtt);
    }

}
259 }