View Javadoc
1   package org.wikimedia.search.extra.analysis.khmer;
2   
3   import java.io.IOException;
4   import java.io.StringReader;
5   
6   import org.apache.lucene.analysis.BaseTokenStreamTestCase;
7   import org.apache.lucene.analysis.CharFilter;
8   import org.apache.lucene.analysis.TokenStream;
9   import org.junit.Test;
10  
11  public class KhmerCharFilterTest extends BaseTokenStreamTestCase {
12  
13      @Test
14      public void testDeprecatedCharConversions() throws IOException {
15          // Two inherent vowels, U+17B4 and U+17B5 are in parens below because
16          // they may or may not be visible, depending on your OS, fonts, etc.
17          String testString = "ឨ ឣ ឤ ឲ ៘ (឴) (឵) ៝ ៓";
18          CharFilter cs = new KhmerCharFilter(new StringReader(testString));
19          TokenStream ts = whitespaceMockTokenizer(cs);
20          assertTokenStreamContents(ts,
21              // these parens are empty (inherent vowels have been removed)
22              new String[]{"ឧក", "អ", "អា", "ឱ", "។ល។", "()", "()", "៑", "ំ"},
23              new int[]{0, 2, 4, 6, 8, 10, 14, 18, 20},  // start offsets
24              new int[]{1, 3, 5, 7, 9, 13, 17, 19, 21}); // end offsets
25      }
26  
27      @Test
28      public void testDuplicateSubscriptConsonants() throws IOException {
29          String testString = "ញ្ច្ចូ ត្ដ្ដ ន្ធិ្ធ ភ្លេ្ល";
30          CharFilter cs = new KhmerCharFilter(new StringReader(testString));
31          TokenStream ts = whitespaceMockTokenizer(cs);
32          assertTokenStreamContents(ts,
33              new String[]{"ញ្ចូ", "ត្ដ", "ន្ធិ", "ភ្លេ"},
34              new int[]{0,  7, 13, 20},  // start offsets
35              new int[]{6, 12, 19, 26}); // end offsets
36      }
37  
38      @Test
39      public void testDuplicateDiacritics() throws IOException {
40          String testString = "ខំំ តិំំំំំំំំំំំំំំ ញុំាំ ក់់់់ គ្្្នា ខ្ញុំុំ";
41          CharFilter cs = new KhmerCharFilter(new StringReader(testString));
42          TokenStream ts = whitespaceMockTokenizer(cs);
43          assertTokenStreamContents(ts,
44              new String[]{"ខំ", "តិំ", "ញុាំ", "ក់", "គ្នា", "ខ្ញុំ"},
45              new int[]{0,  4, 21, 27, 33, 40},  // start offsets
46              new int[]{3, 20, 26, 32, 39, 47}); // end offsets
47      }
48  
49      @Test
50      public void testOtherDuplicates() throws IOException {
51          String testString = "ខំេេេ កេេ សីេេ";
52          CharFilter cs = new KhmerCharFilter(new StringReader(testString));
53          TokenStream ts = whitespaceMockTokenizer(cs);
54          assertTokenStreamContents(ts,
55              new String[]{"ខេំ", "កេ", "សើ"},
56              new int[]{0, 6, 10},  // start offsets
57              new int[]{5, 9, 14}); // end offsets
58      }
59  
60      @Test
61      public void testSuppConsVSDepVowelOrder() throws IOException {
62          String testString = "នា្ទ មិ្ម មេ្ល មៃ្ភ មោ្ព លា្ង លិ្ល លែ្វ";
63          CharFilter cs = new KhmerCharFilter(new StringReader(testString));
64          TokenStream ts = whitespaceMockTokenizer(cs);
65          assertTokenStreamContents(ts,
66              new String[]{"ន្ទា", "ម្មិ", "ម្លេ", "ម្ភៃ", "ម្ពោ", "ល្ងា", "ល្លិ", "ល្វែ"},
67              new int[]{0, 5, 10, 15, 20, 25, 30, 35},  // start offsets
68              new int[]{4, 9, 14, 19, 24, 29, 34, 39}); // end offsets
69  
70          testString = "សឹ្ស សើ្ទ សើ្ម សេ្ន សែ្ត ហឺ្គ ងោ្ស";
71          cs = new KhmerCharFilter(new StringReader(testString));
72          ts = whitespaceMockTokenizer(cs);
73          assertTokenStreamContents(ts,
74              new String[]{"ស្សឹ", "ស្ទើ", "ស្មើ", "ស្នេ", "ស្តែ", "ហ្គឺ", "ង្សោ"},
75              new int[]{0, 5, 10, 15, 20, 25, 30},  // start offsets
76              new int[]{4, 9, 14, 19, 24, 29, 34, 39}); // end offsets
77      }
78  
79      @Test
80      public void testSplitVowels() throws IOException {
81          String testString = "កេី កីេ កេា ណេ្ណាះ";
82          CharFilter cs = new KhmerCharFilter(new StringReader(testString));
83          TokenStream ts = whitespaceMockTokenizer(cs);
84          assertTokenStreamContents(ts,
85              new String[]{"កើ", "កើ", "កោ", "ណ្ណោះ"},
86              new int[]{0, 4,  8, 12},  // start offsets
87              new int[]{3, 7, 11, 18}); // end offsets
88      }
89  
90      @Test
91      public void testReorderRo() throws IOException {
92          String testString = "ង្រ្កា ង្រា្ក ក្រ័្ក ហ្រ្វាំ";
93          CharFilter cs = new KhmerCharFilter(new StringReader(testString));
94          TokenStream ts = whitespaceMockTokenizer(cs);
95          assertTokenStreamContents(ts,
96              new String[]{"ង្ក្រា", "ង្ក្រា", "ក្ក្រ័", "ហ្វ្រាំ"},
97              new int[]{0,  7, 14, 21},  // start offsets
98              new int[]{6, 13, 20, 28}); // end offsets
99      }
100 
101     @Test
102     public void testSuppConsVSDiacriticOrder() throws IOException {
103         String testString = "ន់ែ យ្យ៌ រ់ា ល័ួ ល់េ";
104         CharFilter cs = new KhmerCharFilter(new StringReader(testString));
105         TokenStream ts = whitespaceMockTokenizer(cs);
106         assertTokenStreamContents(ts,
107             new String[]{"នែ់", "យ៌្យ", "រា់", "លួ័", "លេ់"},
108             new int[]{0, 4,  9, 13, 17},  // start offsets
109             new int[]{3, 8, 12, 16, 20}); // end offsets
110 
111         testString = "លំែ ល់ៃ ស់ា ហំា ហ៎្ន";
112         cs = new KhmerCharFilter(new StringReader(testString));
113         ts = whitespaceMockTokenizer(cs);
114         assertTokenStreamContents(ts,
115             new String[]{"លែំ", "លៃ់", "សា់", "ហាំ", "ហ្ន៎"},
116             new int[]{0, 4,  8, 12, 16},  // start offsets
117             new int[]{3, 7, 11, 15, 20}); // end offsets
118     }
119 
120     @Test
121     public void testStripInvisibles() throws IOException {
122         // ZWSP, ZWNJ, ZWJ, and SHY
123         String testString = "ក​​្លេ ហ្វ‌៊ី អ‍៊ី រ­ៀ";
124         CharFilter cs = new KhmerCharFilter(new StringReader(testString));
125         TokenStream ts = whitespaceMockTokenizer(cs);
126         assertTokenStreamContents(ts,
127             new String[]{"ក្លេ", "ហ៊្វី", "អ៊ី", "រៀ"},
128             new int[]{0,  7, 14, 19},  // start offsets
129             new int[]{6, 13, 18, 22}); // end offsets
130     }
131 
132     @Test
133     public void testDepVowelVSDiacriticOrder() throws IOException {
134         String testString = "ពីំា វ៉់ា រុំា គា៌";
135         CharFilter cs = new KhmerCharFilter(new StringReader(testString));
136         TokenStream ts = whitespaceMockTokenizer(cs);
137         assertTokenStreamContents(ts,
138             new String[]{"ពីាំ", "វ៉ា់", "រុាំ", "គ៌ា"},
139             new int[]{0, 5, 10, 15},  // start offsets
140             new int[]{4, 9, 14, 18}); // end offsets
141     }
142 
143     @Test
144     public void testDiacriticOrder() throws IOException {
145         String testString = "សូ៊";
146         CharFilter cs = new KhmerCharFilter(new StringReader(testString));
147         TokenStream ts = whitespaceMockTokenizer(cs);
148         assertTokenStreamContents(ts,
149             new String[]{"ស៊ូ"},
150             new int[]{0},  // start offsets
151             new int[]{3}); // end offsets
152     }
153 
154     @Test
155     public void testCrazySyllables() throws IOException {
156         // these are fairly ridiculous, but should be repaired nonetheless
157         String testString = "ស្ស្សិោេ្ហ្ហ ហ្្្គំ្្្រំាំ ហ្រ្វាំាំាំ ហ្រ៊ីី្វីី កំំា្្ឌាះ";
158         CharFilter cs = new KhmerCharFilter(new StringReader(testString));
159         TokenStream ts = whitespaceMockTokenizer(cs);
160         assertTokenStreamContents(ts,
161             new String[]{"ស្ស្ហិោេ", "ហ្គ្រាំ", "ហ្វ្រាំ", "ហ្វ្រ៊ី", "ក្ឌាំះ"});
162     }
163 
164     @Test
165     public void testRunningText() throws IOException {
166         // several syllables in this text need to be reordered
167         String testString = "សំលេងពាក់កណា្តលរហូតដល់រូបភាពមានផៃ្ទភឺ្លថ្លានៅខែឧសភាឆាំ្ម1925  ។";
168         CharFilter cs = new KhmerCharFilter(new StringReader(testString));
169         TokenStream ts = whitespaceMockTokenizer(cs);
170         assertTokenStreamContents(ts,
171             new String[]{"សំលេងពាក់កណ្តាលរហូតដល់រូបភាពមានផ្ទៃភ្លឺថ្លានៅខែឧសភាឆ្មាំ1925", "។"});
172 
173         testString = "បន្ទាប់មកទៀតមានបែ្រកមួយឈោ្មះបែ្រកកំពង់គ្រញូង";
174         cs = new KhmerCharFilter(new StringReader(testString));
175         ts = whitespaceMockTokenizer(cs);
176         assertTokenStreamContents(ts,
177             new String[]{"បន្ទាប់មកទៀតមានប្រែកមួយឈ្មោះប្រែកកំពង់គ្រញូង"});
178     }
179 
180     @Test
181     public void testNonKhmerText() throws IOException { // these should all be unchanged
182         // Latin, Cyrillic, Katakana, Hiragana, Hanzi, Hangul
183         String testString = "Wikipedia Википедию ウィキペディア うぃきぺでぃあ 维基百科 위키백과";
184         CharFilter cs = new KhmerCharFilter(new StringReader(testString));
185         TokenStream ts = whitespaceMockTokenizer(cs);
186         assertTokenStreamContents(ts,
187             new String[]{"Wikipedia", "Википедию", "ウィキペディア", "うぃきぺでぃあ", "维基百科", "위키백과"});
188 
189         // Armenian, Hebrew, Greek, Arabic, Georgian
190         testString = "Վիքիպեդիա ויקיפדיה Βικιπαίδεια ويكيبيدي ვიკიპედია";
191         cs = new KhmerCharFilter(new StringReader(testString));
192         ts = whitespaceMockTokenizer(cs);
193         assertTokenStreamContents(ts,
194             new String[]{"Վիքիպեդիա", "ויקיפדיה", "Βικιπαίδεια", "ويكيبيدي", "ვიკიპედია"});
195 
196         // Devanagari, Thai, Tamil, Bengali, IPA
197         testString = "विकिपीडिया วิกิพีเดีย விக்கிப்பீடியா উইকিপিডিয়া ˌwɪkɪˈpiːdiə";
198         cs = new KhmerCharFilter(new StringReader(testString));
199         ts = whitespaceMockTokenizer(cs);
200         assertTokenStreamContents(ts,
201             new String[]{"विकिपीडिया", "วิกิพีเดีย", "விக்கிப்பீடியா", "উইকিপিডিয়া", "ˌwɪkɪˈpiːdiə"});
202 
203         // random diacritical Latin
204         testString = "Wíkìpėdïã įš â müłtīlíñgûål òpęń-çółláboràtive õńlîñe enčÿćlopædīá";
205         cs = new KhmerCharFilter(new StringReader(testString));
206         ts = whitespaceMockTokenizer(cs);
207         assertTokenStreamContents(ts,
208             new String[]{"Wíkìpėdïã", "įš", "â", "müłtīlíñgûål", "òpęń-çółláboràtive",
209                 "õńlîñe", "enčÿćlopædīá"});
210 
211         // elements that need escaping in regexes and replacement parts
212         // should never come up because they SYLL_PAT doesn't match them
213         testString = "$1 \\2 3* ^4 5$ /6/ (?!7) 8+ {9,10} .11 12? [13-14]";
214         cs = new KhmerCharFilter(new StringReader(testString));
215         ts = whitespaceMockTokenizer(cs);
216         assertTokenStreamContents(ts,
217             new String[]{"$1", "\\2", "3*", "^4", "5$", "/6/", "(?!7)", "8+", "{9,10}",
218                 ".11", "12?", "[13-14]"});
219     }
220 
221 }