1
2
3
4
5
6
7
8
9
10
11
12
13
14 package org.wikimedia.search.extra.analysis.turkish;
15
16 import static java.util.Collections.unmodifiableMap;
17
18 import java.util.regex.Matcher;
19 import java.util.regex.Pattern;
20 import java.util.HashMap;
21 import java.util.Map;
22
/**
 * Decides what to keep of a single word that contains an apostrophe.
 *
 * <p>The word is run through an ordered list of rules (see {@link #apos(CharSequence)});
 * the first rule that produces a final answer wins. The constant prefixes (FR, IT, TR, EN)
 * presumably stand for French, Italian, Turkish and English.
 *
 * <p>NOTE(review): every pattern is written with lower-case letters only, so callers
 * presumably lower-case the word first; confirm against the caller.
 */
public class BetterApostrophe {

    /** Characters that are treated as apostrophes and normalized to {@code '}. */
    private static final Pattern APOS_LIKE = Pattern.compile("[''ʼ‘’`´ˋˊ]");

    /** Whole words (after apostrophe normalization) with a fixed replacement. */
    private static final Map<String, String> WHOLE_WORD_MAP = unmodifiableMap(initWholeWordMap());

    /** Capturing alternation of the elision prefixes recognized before an apostrophe. */
    private static final String FR_IT_ELISION_LIST = "([ljd]|dall|dell|all|nell|qu|un|sull)";

    /** Non-capturing alternation of suffix shapes; must be initialized before the patterns using it. */
    private static final String TR_COMMON_SUFF_LIST = initTurkishSuffixes();

    /** A leading {@code j'n'} or {@code j't'}. */
    private static final Pattern FR_DOUBLE_ELISION = Pattern.compile("^j'[nt]'");

    /** A leading elision prefix followed by an apostrophe. */
    private static final Pattern FR_IT_ELISION = Pattern.compile("^" + FR_IT_ELISION_LIST + "'");

    /** An entire word made of elision prefix, apostrophe and exactly one listed suffix. */
    private static final Pattern ELISION_WITH_SUFF =
        Pattern.compile("^" + FR_IT_ELISION_LIST + "'" + TR_COMMON_SUFF_LIST + "$");

    /** An apostrophe followed by one to five listed suffixes running to the end of the word. */
    private static final Pattern TR_MULTI_SUFF = Pattern.compile("'" + TR_COMMON_SUFF_LIST + "{1,5}$");

    /** A leading {@code kur'an}-style spelling ({@code k}/{@code q}, several forms of {@code a}). */
    private static final Pattern TR_KURAN = Pattern.compile("^([kq]ur)'([aâā]n)");

    /** A trailing {@code n't}. */
    private static final Pattern EN_AINT = Pattern.compile("n't$");

    /** Any apostrophe that still has another apostrophe somewhere after it. */
    private static final Pattern MULTI_APOS = Pattern.compile("'(?=.*')");

    /** An apostrophe directly after the first character of the word. */
    private static final Pattern SINGLE_FIRST_LET = Pattern.compile("^(.)'");

    /** An apostrophe followed by text containing a character outside the listed letter set. */
    private static final Pattern NON_TURKISH =
        Pattern.compile("'(.*[^abcçdefgğhıijklmnoöprsştuüvyzâîû])");

    /** A leading {@code ch'}, {@code ma'}, {@code ta'} or {@code te'}. */
    private static final Pattern NON_WORDS = Pattern.compile("^(ch|ma|ta|te)'");

    /**
     * Rules that, when they match, delete only the matched apostrophe (replacement
     * {@code $1}). They are tried in this order and the first match is final.
     */
    private static final Pattern[] APOS_ONLY_RULES = {SINGLE_FIRST_LET, NON_TURKISH, NON_WORDS};

    /**
     * Applies the apostrophe rules to one word.
     *
     * <ol>
     *   <li>No apostrophe-like character: the argument itself is returned.</li>
     *   <li>Apostrophe-like characters are normalized to {@code '}.</li>
     *   <li>A whole-word exception, if any, is returned.</li>
     *   <li>A leading {@code j'n'} / {@code j't'} is removed.</li>
     *   <li>Elision prefix + apostrophe + single suffix: the prefix is returned.</li>
     *   <li>{@code kur'an}, trailing {@code n't}, {@code 'n'} and {@code 's'} are rewritten.</li>
     *   <li>A leading elision prefix with its apostrophe is removed.</li>
     *   <li>Every apostrophe except the last one is removed.</li>
     *   <li>Apostrophe + one to five suffixes at the end: that tail is cut off.</li>
     *   <li>Apostrophe after the first character, before text with a character outside the
     *       listed letters, or after {@code ch}/{@code ma}/{@code ta}/{@code te}: only the
     *       apostrophe is removed.</li>
     *   <li>Otherwise everything from the last apostrophe onward is cut off.</li>
     * </ol>
     *
     * @param wordCS the word to process; must not be {@code null}
     * @return the processed word, or {@code wordCS} itself when it has no apostrophe-like character
     */
    @SuppressWarnings({"NPathComplexity"})
    public CharSequence apos(CharSequence wordCS) {
        Matcher aposLike = APOS_LIKE.matcher(wordCS);
        if (!aposLike.find()) {
            return wordCS;
        }
        String token = aposLike.replaceAll("'");

        String wholeWord = WHOLE_WORD_MAP.get(token);
        if (wholeWord != null) {
            return wholeWord;
        }

        token = FR_DOUBLE_ELISION.matcher(token).replaceFirst("");

        // Ambiguous shape: keep what precedes the apostrophe.
        Matcher elisionWithSuffix = ELISION_WITH_SUFF.matcher(token);
        if (elisionWithSuffix.find()) {
            return elisionWithSuffix.group(1);
        }

        // Fixed rewrites; their order matters because each one sees the previous result.
        token = TR_KURAN.matcher(token).replaceFirst("$1$2");
        token = EN_AINT.matcher(token).replaceFirst("nt");
        token = token.replace("'n'", "n").replace("'s'", "'");
        token = FR_IT_ELISION.matcher(token).replaceFirst("");

        // From here on at most one apostrophe (the last one) remains.
        token = MULTI_APOS.matcher(token).replaceAll("");

        Matcher suffixTail = TR_MULTI_SUFF.matcher(token);
        if (suffixTail.find()) {
            return suffixTail.replaceFirst("");
        }

        for (Pattern rule : APOS_ONLY_RULES) {
            Matcher hit = rule.matcher(token);
            if (hit.find()) {
                return hit.replaceFirst("$1");
            }
        }

        int cut = token.lastIndexOf('\'');
        return cut < 0 ? token : token.substring(0, cut);
    }

    /**
     * Builds the table of whole-word exceptions.
     *
     * @return a fresh mutable map from the exact word to its replacement
     */
    private static Map<String, String> initWholeWordMap() {
        Map<String, String> exceptions = new HashMap<>();

        // ...'un -> un
        exceptions.put("l'un", "un");
        exceptions.put("d'un", "un");
        exceptions.put("qu'un", "un");

        // ...'il -> il
        exceptions.put("s'il", "il");
        exceptions.put("qu'il", "il");

        return exceptions;
    }

    /**
     * Builds the regex source for the list of suffix shapes.
     *
     * @return a non-capturing group {@code (?:a|b|...)} of all alternatives
     */
    private static String initTurkishSuffixes() {
        String[] alternatives = {
            "[aeiıuü]", "d[aeiıuü]", "l[aeiıuü]", "n[aeiıuü]", "s[aeiıuü]", "t[aeiıuü]",
            "y[aeiıuü]", "[iuü]l", "[iıuü]n", "n[iıuü]n", "nd[ae]", "d[ae]n", "nd[ae]n",
            "t[ae]n", "d[ae]ki", "nd[ae]ki", "t[ae]ki", "d[iıuü]r", "t[iıuü]r", "ken", "yken",
            "l[ae]r", "l[iıuü]k", "yd[iıuü]", "yl[ae]", "ki"
        };
        String joined = String.join("|", alternatives);
        return "(?:" + joined + ")";
    }
}