View Javadoc
1   package org.wikimedia.search.extra.analysis.turkish;
2   
3   import static org.junit.Assert.assertEquals;
4   
5   import java.util.Arrays;
6   import java.util.Collection;
7   
8   import org.junit.Test;
9   import org.junit.runner.RunWith;
10  import org.junit.runners.Parameterized;
11  
12  @RunWith(Parameterized.class)
13  public class BetterApostropheTest {
14      private String input;
15      private String expected;
16      private BetterApostrophe apostrophe = new BetterApostrophe();
17  
18      public BetterApostropheTest(String input, String expected) {
19          this.input = input;
20          this.expected = expected;
21      }
22  
23      @Parameterized.Parameters
24      public static Collection<Object[]> apostropheTestCases() {
25          return Arrays.asList(new Object[][]{
26              // simple single and mult-apostrophe examples
27              {"türkiye'den", "türkiye"},
28              {"xi'an'in", "xian"},
29              {"yefâ’î’nin", "yefâî"},
30              {"vak‘ası’nın", "vakası"},
31              {"k'at'aph'ia", "kataph"},
32              {"hawai'i'o'o", "hawaiio"},
33              {"k'oyitl'ots'ina", "koyitlots"},
34              {"isnâ‘aşer’îyye'nin", "isnâaşerîyye"},
35  
36              // apostrophe-like characters
37              // note that other tests also include apostrophe-like characters other than '
38              {"türkiye'den", "türkiye"}, // apostrophe
39              {"türkiyeʼden", "türkiye"}, // modifier apostrophe
40              {"türkiye'den", "türkiye"}, // fullwidth apostrophe
41              {"türkiye‘den", "türkiye"}, // left curly quote
42              {"türkiye’den", "türkiye"}, // right curly quote
43              {"türkiye`den", "türkiye"}, // grave accent
44              {"türkiye´den", "türkiye"}, // acute accent
45              {"türkiyeˋden", "türkiye"}, // modifier grave accent
46              {"türkiyeˊden", "türkiye"}, // modifier acute accent
47  
48              // testWholeWordExceptions
49              // whole word exceptions: things that are easier to process as whole words
50              {"qu'il", "il"},
51              {"s'il", "il"},
52              {"d'un", "un"},
53              {"l'un", "un"},
54              {"qu'un", "un"},
55  
56              // special case—kuran/quran/etc.
57              {"kur'ân", "kurân"},
58              {"kur'an'daki", "kuran"},
59              {"kurʼân'da", "kurân"},
60              {"kur'an'dır", "kuran"},
61              {"qur'anic", "quranic"},
62              {"qur’ān", "qurān"},
63  
64              // special case—English -n't
65              {"needn't", "neednt"},
66              {"shan't", "shant"},
67              {"shouldn’t", "shouldnt"},
68              {"wasn't", "wasnt"},
69  
70              // special case—English -'n'-
71              {"drum'n'bass", "drumnbass"},
72              {"nice’n’easy", "niceneasy"},
73              {"r'n'b", "rnb"},
74              {"rock'n'roll", "rocknroll"},
75  
76              // special case—English -'s + ' (+ tr suffix)
77              {"dalek's'de", "dalek"},
78              {"mcdonald's'ında", "mcdonald"},
79              {"mcvitie's'nin", "mcvitie"},
80              {"scott's’da", "scott"},
81  
82              // very French/Italian elision
83              {"j't'aime", "aime"},
84              {"j'n'attends", "attends"},
85              {"j'étais", "étais"},
86              {"l'écologie", "écologie"},
87              {"qu’aucun", "aucun"},
88              {"sull'uscio", "uscio"},
89  
90              // looks like elision and Turkish suffixation at the same time--suffixation wins!
91              {"j'den", "j"},
92              {"d'deki", "d"},
93              {"un'un", "un"},
94              {"all'daki", "all"},
95              {"l'ı", "l"},
96  
97              // strip multiple common Turkish suffixes used at once
98              {"alp'lerindeki", "alp"},
99              {"ceo'luklarını", "ceo"},
100             {"lacan'ınkilerden", "lacan"},
101             {"ankara'dakilerin", "ankara"},
102             {"amerika'dakileri", "amerika"},
103             {"plüton'unkindense", "plüton"},
104             {"profesör'lerindendir", "profesör"},
105             {"archaeopteryx'inkilere", "archaeopteryx"},
106 
107             // Turkish suffix stripping should work on non-Latin words, too
108             {"πάπυρος'tan", "πάπυρος"},
109             {"ребро'dan", "ребро"},
110             {"Աշտարակ'in", "Աշտարակ"},
111             {"قاعدة‎'nin", "قاعدة‎"},
112 
113             // generally disfavor one-letter "stems"
114             {"a'oulhalak", "aoulhalak"},
115             {"b'day", "bday"},
116             {"c'hwennenn", "chwennenn"},
117             {"e'cole", "ecole"},
118             {"g'day", "gday"},
119             {"g‘azalkent", "gazalkent"},
120             {"y'all", "yall"},
121 
122             // non-Turkish letters can't be in Turkish suffixes, so don't treat as suffixes
123             // numbers
124             {"00'19", "0019"},
125 
126             // non-letters
127             {"albʊ'raːq", "albʊraːq"},
128             {"a·lü·mi'n·yum", "a·lü·min·yum"},
129 
130             // non-Turkish Latin
131             {"awa’uq", "awauq"},
132             {"arc’teryx", "arcteryx"},
133 
134             // non-Turkish diacritics
135             {"ba'aṯ", "baaṯ"},
136             {"bābā’ī", "bābāī"},
137             {"abdülkerim'ê", "abdülkerimê"},
138 
139             // non-Latin
140             {"В’в", "Вв"},
141             {"Х’агәы́шь", "Хагәы́шь"},
142             {"прем'єра", "премєра"},
143             {"ג'אלה", "גאלה"},
144             {"여성들'에", "여성들에"},
145             {"կ’ընթրենք", "կընթրենք"},
146             {"επ'ευκαιρία", "επευκαιρία"},
147 
148             // a few two-letter prefixes that are almost never stems
149             {"ch'alla", "challa"},
150             {"ch'ang", "chang"},
151             {"ch'ing", "ching"},
152             {"ma'arif", "maarif"},
153             {"ma'rifette", "marifette"},
154             {"ma'ruflardır", "maruflardır"},
155             {"ta'izz", "taizz"},
156             {"ta'rikh", "tarikh"},
157             {"ta'us", "taus"},
158             {"te'lif", "telif"},
159             {"te'vîl", "tevîl"},
160             {"te'mine", "temine"},
161 
162             // various tests of rule ordering
163 
164             // remove common Fr/It elision before removing words that look like multiple
165             // (admittedly possibly nonsensical) Turkish suffixes
166             {"d'adieu", "adieu"}, // a + di + e + u
167             {"l'ındiana", "ındiana"}, // ın + di + a + na
168             {"dell'ıtalia", "ıtalia"}, // ı + ta + li + a
169 
170             // remove common Fr/It elision before removing multiple apostrophes
171             {"dell'arte'nin", "arte"}, // not dellarte
172             {"nell'emilia'da", "emilia"}, // not nellemilia
173             {"d'artagnan'ın", "artagnan"}, // not dartagnan
174 
175             // remove common endings before removing one letter before apostrophe
176             {"b'dekilere", "b"},
177             {"n'ın", "n"},
178             {"s'inkilere", "s"},
179             {"x'dedir", "x"},
180             {"β'ların", "β"},
181             {"ϖ'yi", "ϖ"},
182             {"ж’den", "ж"},
183             {"й'dir", "й"},
184 
185             // non-word prefixes + clear Turkish suffix -> strip suffix
186             {"ch'den", "ch"},
187             {"ma'ın", "ma"},
188             {"te'de", "te"},
189             {"ta'lik", "ta"},
190 
191             // interaction of non-word stems and other endings
192             {"ch'orti's", "chorti"},
193             {"ch'ing'in", "ching"},
194             {"ma'ali'yi", "maali"},
195             {"ma'arretü'n", "maarretü"},
196             {"ta'us'un", "taus"},
197             {"ta'rifâti'l", "tarifâti"},
198             {"te'vilâti'l", "tevilâti"},
199             {"te’lifi’l", "telifi"},
200 
201             // words with non-Turkish Latin or non-Latin characters and no apostrophes should
202             // be unchanged
203             {"año", "año"}, // Spanish
204             {"вищій", "вищій"}, // Ukrainian
205             {"위키백과", "위키백과"}, // Korean
206             {"əliağa", "əliağa"}, // Azerbaijani
207             {"ውክፔዲያ", "ውክፔዲያ"}, // Amharic
208             {"ᐅᐃᑭᐱᑎᐊ", "ᐅᐃᑭᐱᑎᐊ"}, // Inuktitut
209             {"ვიკიპედია", "ვიკიპედია"}, // Georgian
210             {"βικιπαίδεια", "βικιπαίδεια"}, // Greek
211             {"аблютомания", "аблютомания"}, // Russian
212 
213             // Exhaustive test of "common Turkish suffixes", part I. They only get stripped as
214             // suffixes after elision prefixes, one-letter stems, or "non-word" stems, so not
215             // all examples are easily found. This first batch all come from Turkish Wikipedia.
216             {"g’a", "g"},
217             {"ş'da", "ş"},
218             {"all'daki", "all"},
219             {"ı'dan", "ı"},
220             {"dell'de", "dell"},
221             {"m'deki", "m"},
222             {"p’den", "p"},
223             {"nell'dir", "nell"},
224             {"c'dur", "c"},
225             {"w'dı", "w"},
226             {"k'dır", "k"},
227             {"ı’e", "ı"},
228             {"ç'i", "ç"},
229             {"b’il", "b"},
230             {"v'in", "v"},
231             {"v'la", "v"},
232             {"a’lar", "a"},
233             {"f'le", "f"},
234             {"x'ler", "x"},
235             {"h'li", "h"},
236             {"m'lik", "m"},
237             {"c’lu", "c"},
238             {"n'luk", "n"},
239             {"k’lı", "k"},
240             {"w’lık", "w"},
241             {"o'na", "o"},
242             {"o'ndaki", "o"},
243             {"o’ndan", "o"},
244             {"π'nin", "π"},
245             {"o’nu", "o"},
246             {"u'nun", "u"},
247             {"μ'nün", "μ"},
248             {"å'nın", "å"},
249             {"f'si", "f"},
250             {"o'su", "o"},
251             {"h’sı", "h"},
252             {"w'ta", "w"},
253             {"s'tan", "s"},
254             {"v'te", "v"},
255             {"x'teki", "x"},
256             {"h'ten", "h"},
257             {"v’ti", "v"},
258             {"v'tir", "v"},
259             {"v’tur", "v"},
260             {"w'tır", "w"},
261             {"k'u", "k"},
262             {"all'un", "all"},
263             {"δ'ya", "δ"},
264             {"b'ydi", "b"},
265             {"ê’ye", "ê"},
266             {"φ’yi", "φ"},
267             {"b'yken", "b"},
268             {"u'yla", "u"},
269             {"j'yle", "j"},
270             {"u'yu", "u"},
271             {"ü'yü", "ü"},
272             {"γ'yı", "γ"},
273             {"l'ü", "l"},
274             {"n'ı", "n"},
275             {"ψ'ın", "ψ"},
276 
277             // Exhaustive test of "common Turkish suffixes", part II. These are "synthetic"
278             // examples, x, plus an apostrophe-like character, plus a suffix. The suffixes all
279             // occur in Turkish Wikipedia with apostrophe-like characters, but not necessarily
280             // after one-letter stems.
281             {"x'di", "x"},
282             {"xʼdu", "x"},
283             {"x’dü", "x"},
284             {"x'dür", "x"},
285             {"x’ken", "x"},
286             {"x'ki", "x"},
287             {"x‘lü", "x"},
288             {"x'lük", "x"},
289             {"xʼnda", "x"},
290             {"x'nde", "x"},
291             {"xʼndeki", "x"},
292             {"x'nden", "x"},
293             {"x'ne", "x"},
294             {"x'ni", "x"},
295             {"x’nü", "x"},
296             {"x'nı", "x"},
297             {"x'sa", "x"},
298             {"x’se", "x"},
299             {"xʼsü", "x"},
300             {"x'taki", "x"},
301             {"x'tu", "x"},
302             {"x'tü", "x"},
303             {"x'tür", "x"},
304             {"x'tı", "x"},
305             {"x’ul", "x"},
306             {"x’ydu", "x"},
307             {"x'ydü", "x"},
308             {"x'ydı", "x"},
309             {"x‘ül", "x"},
310             {"x'ün", "x"},
311 
312         });
313     }
314 
315     @Test
316     public void apostropheTester() throws Exception {
317         assertEquals(expected, apostrophe.apos(input));
318     }
319 
320 }