View Javadoc
1   /*
2    * The WMF licenses this file to you under the Apache License, Version
3    * 2.0 (the "License"); you may not use this file except in compliance
4    * with the License. You may obtain a copy of the License at
5    *
6    *      http://www.apache.org/licenses/LICENSE-2.0
7    *
8    * Unless required by applicable law or agreed to in writing, software
9    * distributed under the License is distributed on an "AS IS" BASIS,
10   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11   * See the License for the specific language governing permissions and
12   * limitations under the License.
13   */
14  package org.wikimedia.search.extra.analysis.turkish;
15  
16  import static java.util.Collections.unmodifiableMap;
17  
18  import java.util.regex.Matcher;
19  import java.util.regex.Pattern;
20  import java.util.HashMap;
21  import java.util.Map;
22  
/**
 * Heuristic "stemmer" for apostrophes in Turkish-language text.
 *
 * <p>In Turkish, an apostrophe usually separates a proper noun from its suffixes
 * (e.g. {@code ankara'da}), but real text also contains French/Italian elision
 * ({@code l'ankara}), English contractions ({@code don't}) and other uses. This class
 * applies an ordered list of regular-expression rules to pick the most useful form.</p>
 *
 * <p>The class holds no instance state; all rules are precompiled static patterns.</p>
 */
public class BetterApostrophe {
    // Apostrophe-like characters, INCLUDING the plain apostrophe. The non-ASCII ones are
    // written as Unicode escapes because several are visually indistinguishable and are
    // easily folded into a plain apostrophe by editors or text normalization:
    //   '       U+0027 apostrophe
    //   U+FF07  fullwidth apostrophe
    //   U+02BC  modifier letter apostrophe
    //   U+2018  left single (curly) quotation mark
    //   U+2019  right single (curly) quotation mark
    //   `       U+0060 grave accent
    //   U+00B4  acute accent
    //   U+02CB  modifier letter grave accent
    //   U+02CA  modifier letter acute accent
    private static final Pattern APOS_LIKE =
        Pattern.compile("['\uFF07\u02BC\u2018\u2019`\u00B4\u02CB\u02CA]");

    // The Patterns below assume that everything APOS_LIKE has been converted to '
    private static final Map<String, String> WHOLE_WORD_MAP = unmodifiableMap(initWholeWordMap());
    private static final Pattern FR_DOUBLE_ELISION = Pattern.compile("^j'[nt]'");
    private static final String  FR_IT_ELISION_LIST = "([ljd]|dall|dell|all|nell|qu|un|sull)";
    private static final String  TR_COMMON_SUFF_LIST = initTurkishSuffixes();
    private static final Pattern FR_IT_ELISION = Pattern.compile("^" + FR_IT_ELISION_LIST + "'");
    // an apostrophe followed by one to five common suffixes, running to the end of the word
    private static final Pattern TR_MULTI_SUFF = Pattern.compile("'" + TR_COMMON_SUFF_LIST + "{1,5}$");
    // the whole word is exactly: elision prefix + apostrophe + ONE common suffix
    private static final Pattern ELISION_WITH_SUFF =
        Pattern.compile("^" + FR_IT_ELISION_LIST + "'" + TR_COMMON_SUFF_LIST + "$");
    private static final Pattern TR_KURAN = Pattern.compile("^([kq]ur)'([aâā]n)");
    private static final Pattern EN_AINT = Pattern.compile("n't$");
    // any apostrophe that has another apostrophe somewhere after it
    private static final Pattern MULTI_APOS = Pattern.compile("'(?=.*')");
    private static final Pattern SINGLE_FIRST_LET = Pattern.compile("^(.)'");
    // an apostrophe followed (anywhere later) by a character outside the listed letters
    private static final Pattern NON_TURKISH =
        Pattern.compile("'(.*[^abcçdefgğhıijklmnoöprsştuüvyzâîû])");
    private static final Pattern NON_WORDS = Pattern.compile("^(ch|ma|ta|te)'");

    /**
     * "Stem" words with apostrophes in Turkish.
     *
     * <p><b>NOTE</b>: Input is expected to be in lowercase, but with diacritical
     * marks.</p>
     *
     * <p>The rules are applied in a fixed order and the first "terminal" rule that
     * matches decides the result; the order is significant.</p>
     *
     * @param wordCS input token; must not be {@code null}
     * @return the normalized word; the very same {@code wordCS} instance is returned
     *         when it contains no apostrophe-like character
     */
    @SuppressWarnings({"NPathComplexity"})
    public CharSequence apos(CharSequence wordCS) {
        // normalize everything that's "like" an apostrophe to an apostrophe so we can
        // just use a plain apostrophe everywhere else; if no apostrophes, return early
        Matcher m = APOS_LIKE.matcher(wordCS);
        if (!m.find()) {
            return wordCS;
        }

        // replaceAll() resets the matcher first, so the find() above loses nothing
        String word = m.replaceAll("'");

        // whole word exceptions like d'un and s'il
        String wholeWordResult = WHOLE_WORD_MAP.get(word);
        if (wholeWordResult != null) { // we mapped to the "final" answer, so return
            return wholeWordResult;
        }

        // strip French j'n'- and j't'-
        word = FR_DOUBLE_ELISION.matcher(word).replaceFirst("");

        // Fr/It elision prefix + common Turkish suffix => strip suffix, keep the prefix
        m = ELISION_WITH_SUFF.matcher(word);
        if (m.find()) { // whole word match, so no apostrophes left
            return m.group(1);
        }

        // remove apostrophes in special cases. Partial word matches, so keep going.
        word = TR_KURAN.matcher(word).replaceFirst("$1$2"); // kur'an, etc.
        word = EN_AINT.matcher(word).replaceFirst("nt");    // English -n't
        word = word.replace("'n'", "n");                    // English -'n'-
        word = word.replace("'s'", "'");                    // English -'s + ' (+ tr suffix)

        // strip French/Italian elision prefixes
        word = FR_IT_ELISION.matcher(word).replaceFirst("");

        // remove all but the last apostrophe in a word
        word = MULTI_APOS.matcher(word).replaceAll("");

        // strip top ~50 Turkish endings off after apostrophe
        m = TR_MULTI_SUFF.matcher(word);
        if (m.find()) { // remove final apos + suffixes & return
            return m.replaceFirst("");
        }

        // remove any remaining apostrophe following a single letter at the beginning
        m = SINGLE_FIRST_LET.matcher(word);
        if (m.find()) { // drop the apostrophe, keep the letter & the rest, then return
            return m.replaceFirst("$1");
        }

        // remove apostrophes not followed by only Turkish letters to the end of the
        // word—and QXW don't count!
        m = NON_TURKISH.matcher(word);
        if (m.find()) { // drop the apostrophe, keep what follows it, then return
            return m.replaceFirst("$1");
        }

        // remove apostrophes after non-words, like ch'
        m = NON_WORDS.matcher(word);
        if (m.find()) { // drop the apostrophe, keep the prefix & the rest, then return
            return m.replaceFirst("$1");
        }

        // fallback: drop the last remaining apostrophe and everything after it
        int lastApos = word.lastIndexOf('\'');
        if (lastApos != -1) {
            return word.substring(0, lastApos);
        }

        return word;
    }


    /**
     * Initialize whole word exceptions map.
     *
     * <p>Convert French l'un, d'un, or qu'un to plain un; and qu'il or s'il to plain il.</p>
     *
     * @return a new mutable map from the exact (apostrophe-normalized) word to its result
     */
    private static Map<String, String> initWholeWordMap() {
        Map<String, String> wwm = new HashMap<>();
        wwm.put("l'un",  "un");
        wwm.put("d'un",  "un");
        wwm.put("qu'un", "un");
        wwm.put("s'il",  "il");
        wwm.put("qu'il", "il");

        return wwm;
    }

    /**
     * Initialize regex to match common Turkish suffixes.
     *
     * <p>These are the top ~50 Turkish suffixes that appear after apostrophes, plus less
     * common presumed variants (due to Turkish vowel harmony). BTW, "ıl" is not missing
     * from "[iuü]l" ... it doesn't seem to occur in the wild!</p>
     *
     * @return a non-capturing regex group that matches exactly one suffix
     */
    private static String initTurkishSuffixes() {
        return "(?:" + String.join("|",
            "[aeiıuü]", "d[aeiıuü]", "l[aeiıuü]", "n[aeiıuü]", "s[aeiıuü]", "t[aeiıuü]",
            "y[aeiıuü]", "[iuü]l", "[iıuü]n", "n[iıuü]n", "nd[ae]", "d[ae]n", "nd[ae]n",
            "t[ae]n", "d[ae]ki", "nd[ae]ki", "t[ae]ki", "d[iıuü]r", "t[iıuü]r", "ken", "yken",
            "l[ae]r", "l[iıuü]k", "yd[iıuü]", "yl[ae]", "ki"
            ) + ")";
    }
}
157 }