1 package org.wikimedia.search.extra.analysis.khmer;
2
3 import java.io.IOException;
4 import java.io.StringReader;
5
6 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
7 import org.apache.lucene.analysis.CharFilter;
8 import org.apache.lucene.analysis.TokenStream;
9 import org.junit.Test;
10
11 public class KhmerCharFilterTest extends BaseTokenStreamTestCase {
12
13 @Test
14 public void testDeprecatedCharConversions() throws IOException {
15
16
17 String testString = "ឨ ឣ ឤ ឲ ៘ (឴) (឵) ៝ ៓";
18 CharFilter cs = new KhmerCharFilter(new StringReader(testString));
19 TokenStream ts = whitespaceMockTokenizer(cs);
20 assertTokenStreamContents(ts,
21
22 new String[]{"ឧក", "អ", "អា", "ឱ", "។ល។", "()", "()", "៑", "ំ"},
23 new int[]{0, 2, 4, 6, 8, 10, 14, 18, 20},
24 new int[]{1, 3, 5, 7, 9, 13, 17, 19, 21});
25 }
26
27 @Test
28 public void testDuplicateSubscriptConsonants() throws IOException {
29 String testString = "ញ្ច្ចូ ត្ដ្ដ ន្ធិ្ធ ភ្លេ្ល";
30 CharFilter cs = new KhmerCharFilter(new StringReader(testString));
31 TokenStream ts = whitespaceMockTokenizer(cs);
32 assertTokenStreamContents(ts,
33 new String[]{"ញ្ចូ", "ត្ដ", "ន្ធិ", "ភ្លេ"},
34 new int[]{0, 7, 13, 20},
35 new int[]{6, 12, 19, 26});
36 }
37
38 @Test
39 public void testDuplicateDiacritics() throws IOException {
40 String testString = "ខំំ តិំំំំំំំំំំំំំំ ញុំាំ ក់់់់ គ្្្នា ខ្ញុំុំ";
41 CharFilter cs = new KhmerCharFilter(new StringReader(testString));
42 TokenStream ts = whitespaceMockTokenizer(cs);
43 assertTokenStreamContents(ts,
44 new String[]{"ខំ", "តិំ", "ញុាំ", "ក់", "គ្នា", "ខ្ញុំ"},
45 new int[]{0, 4, 21, 27, 33, 40},
46 new int[]{3, 20, 26, 32, 39, 47});
47 }
48
49 @Test
50 public void testOtherDuplicates() throws IOException {
51 String testString = "ខំេេេ កេេ សីេេ";
52 CharFilter cs = new KhmerCharFilter(new StringReader(testString));
53 TokenStream ts = whitespaceMockTokenizer(cs);
54 assertTokenStreamContents(ts,
55 new String[]{"ខេំ", "កេ", "សើ"},
56 new int[]{0, 6, 10},
57 new int[]{5, 9, 14});
58 }
59
60 @Test
61 public void testSuppConsVSDepVowelOrder() throws IOException {
62 String testString = "នា្ទ មិ្ម មេ្ល មៃ្ភ មោ្ព លា្ង លិ្ល លែ្វ";
63 CharFilter cs = new KhmerCharFilter(new StringReader(testString));
64 TokenStream ts = whitespaceMockTokenizer(cs);
65 assertTokenStreamContents(ts,
66 new String[]{"ន្ទា", "ម្មិ", "ម្លេ", "ម្ភៃ", "ម្ពោ", "ល្ងា", "ល្លិ", "ល្វែ"},
67 new int[]{0, 5, 10, 15, 20, 25, 30, 35},
68 new int[]{4, 9, 14, 19, 24, 29, 34, 39});
69
70 testString = "សឹ្ស សើ្ទ សើ្ម សេ្ន សែ្ត ហឺ្គ ងោ្ស";
71 cs = new KhmerCharFilter(new StringReader(testString));
72 ts = whitespaceMockTokenizer(cs);
73 assertTokenStreamContents(ts,
74 new String[]{"ស្សឹ", "ស្ទើ", "ស្មើ", "ស្នេ", "ស្តែ", "ហ្គឺ", "ង្សោ"},
75 new int[]{0, 5, 10, 15, 20, 25, 30},
76 new int[]{4, 9, 14, 19, 24, 29, 34, 39});
77 }
78
79 @Test
80 public void testSplitVowels() throws IOException {
81 String testString = "កេី កីេ កេា ណេ្ណាះ";
82 CharFilter cs = new KhmerCharFilter(new StringReader(testString));
83 TokenStream ts = whitespaceMockTokenizer(cs);
84 assertTokenStreamContents(ts,
85 new String[]{"កើ", "កើ", "កោ", "ណ្ណោះ"},
86 new int[]{0, 4, 8, 12},
87 new int[]{3, 7, 11, 18});
88 }
89
90 @Test
91 public void testReorderRo() throws IOException {
92 String testString = "ង្រ្កា ង្រា្ក ក្រ័្ក ហ្រ្វាំ";
93 CharFilter cs = new KhmerCharFilter(new StringReader(testString));
94 TokenStream ts = whitespaceMockTokenizer(cs);
95 assertTokenStreamContents(ts,
96 new String[]{"ង្ក្រា", "ង្ក្រា", "ក្ក្រ័", "ហ្វ្រាំ"},
97 new int[]{0, 7, 14, 21},
98 new int[]{6, 13, 20, 28});
99 }
100
101 @Test
102 public void testSuppConsVSDiacriticOrder() throws IOException {
103 String testString = "ន់ែ យ្យ៌ រ់ា ល័ួ ល់េ";
104 CharFilter cs = new KhmerCharFilter(new StringReader(testString));
105 TokenStream ts = whitespaceMockTokenizer(cs);
106 assertTokenStreamContents(ts,
107 new String[]{"នែ់", "យ៌្យ", "រា់", "លួ័", "លេ់"},
108 new int[]{0, 4, 9, 13, 17},
109 new int[]{3, 8, 12, 16, 20});
110
111 testString = "លំែ ល់ៃ ស់ា ហំា ហ៎្ន";
112 cs = new KhmerCharFilter(new StringReader(testString));
113 ts = whitespaceMockTokenizer(cs);
114 assertTokenStreamContents(ts,
115 new String[]{"លែំ", "លៃ់", "សា់", "ហាំ", "ហ្ន៎"},
116 new int[]{0, 4, 8, 12, 16},
117 new int[]{3, 7, 11, 15, 20});
118 }
119
120 @Test
121 public void testStripInvisibles() throws IOException {
122
123 String testString = "ក្លេ ហ្វ៊ី អ៊ី រៀ";
124 CharFilter cs = new KhmerCharFilter(new StringReader(testString));
125 TokenStream ts = whitespaceMockTokenizer(cs);
126 assertTokenStreamContents(ts,
127 new String[]{"ក្លេ", "ហ៊្វី", "អ៊ី", "រៀ"},
128 new int[]{0, 7, 14, 19},
129 new int[]{6, 13, 18, 22});
130 }
131
132 @Test
133 public void testDepVowelVSDiacriticOrder() throws IOException {
134 String testString = "ពីំា វ៉់ា រុំា គា៌";
135 CharFilter cs = new KhmerCharFilter(new StringReader(testString));
136 TokenStream ts = whitespaceMockTokenizer(cs);
137 assertTokenStreamContents(ts,
138 new String[]{"ពីាំ", "វ៉ា់", "រុាំ", "គ៌ា"},
139 new int[]{0, 5, 10, 15},
140 new int[]{4, 9, 14, 18});
141 }
142
143 @Test
144 public void testDiacriticOrder() throws IOException {
145 String testString = "សូ៊";
146 CharFilter cs = new KhmerCharFilter(new StringReader(testString));
147 TokenStream ts = whitespaceMockTokenizer(cs);
148 assertTokenStreamContents(ts,
149 new String[]{"ស៊ូ"},
150 new int[]{0},
151 new int[]{3});
152 }
153
154 @Test
155 public void testCrazySyllables() throws IOException {
156
157 String testString = "ស្ស្សិោេ្ហ្ហ ហ្្្គំ្្្រំាំ ហ្រ្វាំាំាំ ហ្រ៊ីី្វីី កំំា្្ឌាះ";
158 CharFilter cs = new KhmerCharFilter(new StringReader(testString));
159 TokenStream ts = whitespaceMockTokenizer(cs);
160 assertTokenStreamContents(ts,
161 new String[]{"ស្ស្ហិោេ", "ហ្គ្រាំ", "ហ្វ្រាំ", "ហ្វ្រ៊ី", "ក្ឌាំះ"});
162 }
163
164 @Test
165 public void testRunningText() throws IOException {
166
167 String testString = "សំលេងពាក់កណា្តលរហូតដល់រូបភាពមានផៃ្ទភឺ្លថ្លានៅខែឧសភាឆាំ្ម1925 ។";
168 CharFilter cs = new KhmerCharFilter(new StringReader(testString));
169 TokenStream ts = whitespaceMockTokenizer(cs);
170 assertTokenStreamContents(ts,
171 new String[]{"សំលេងពាក់កណ្តាលរហូតដល់រូបភាពមានផ្ទៃភ្លឺថ្លានៅខែឧសភាឆ្មាំ1925", "។"});
172
173 testString = "បន្ទាប់មកទៀតមានបែ្រកមួយឈោ្មះបែ្រកកំពង់គ្រញូង";
174 cs = new KhmerCharFilter(new StringReader(testString));
175 ts = whitespaceMockTokenizer(cs);
176 assertTokenStreamContents(ts,
177 new String[]{"បន្ទាប់មកទៀតមានប្រែកមួយឈ្មោះប្រែកកំពង់គ្រញូង"});
178 }
179
180 @Test
181 public void testNonKhmerText() throws IOException {
182
183 String testString = "Wikipedia Википедию ウィキペディア うぃきぺでぃあ 维基百科 위키백과";
184 CharFilter cs = new KhmerCharFilter(new StringReader(testString));
185 TokenStream ts = whitespaceMockTokenizer(cs);
186 assertTokenStreamContents(ts,
187 new String[]{"Wikipedia", "Википедию", "ウィキペディア", "うぃきぺでぃあ", "维基百科", "위키백과"});
188
189
190 testString = "Վիքիպեդիա ויקיפדיה Βικιπαίδεια ويكيبيدي ვიკიპედია";
191 cs = new KhmerCharFilter(new StringReader(testString));
192 ts = whitespaceMockTokenizer(cs);
193 assertTokenStreamContents(ts,
194 new String[]{"Վիքիպեդիա", "ויקיפדיה", "Βικιπαίδεια", "ويكيبيدي", "ვიკიპედია"});
195
196
197 testString = "विकिपीडिया วิกิพีเดีย விக்கிப்பீடியா উইকিপিডিয়া ˌwɪkɪˈpiːdiə";
198 cs = new KhmerCharFilter(new StringReader(testString));
199 ts = whitespaceMockTokenizer(cs);
200 assertTokenStreamContents(ts,
201 new String[]{"विकिपीडिया", "วิกิพีเดีย", "விக்கிப்பீடியா", "উইকিপিডিয়া", "ˌwɪkɪˈpiːdiə"});
202
203
204 testString = "Wíkìpėdïã įš â müłtīlíñgûål òpęń-çółláboràtive õńlîñe enčÿćlopædīá";
205 cs = new KhmerCharFilter(new StringReader(testString));
206 ts = whitespaceMockTokenizer(cs);
207 assertTokenStreamContents(ts,
208 new String[]{"Wíkìpėdïã", "įš", "â", "müłtīlíñgûål", "òpęń-çółláboràtive",
209 "õńlîñe", "enčÿćlopædīá"});
210
211
212
213 testString = "$1 \\2 3* ^4 5$ /6/ (?!7) 8+ {9,10} .11 12? [13-14]";
214 cs = new KhmerCharFilter(new StringReader(testString));
215 ts = whitespaceMockTokenizer(cs);
216 assertTokenStreamContents(ts,
217 new String[]{"$1", "\\2", "3*", "^4", "5$", "/6/", "(?!7)", "8+", "{9,10}",
218 ".11", "12?", "[13-14]"});
219 }
220
221 }