View Javadoc
1   /*
2    * The WMF licenses this file to you under the Apache License, Version
3    * 2.0 (the "License"); you may not use this file except in compliance
4    * with the License. You may obtain a copy of the License at
5    *
6    *      http://www.apache.org/licenses/LICENSE-2.0
7    *
8    * Unless required by applicable law or agreed to in writing, software
9    * distributed under the License is distributed on an "AS IS" BASIS,
10   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11   * See the License for the specific language governing permissions and
12   * limitations under the License.
13   *
14   *
15   * Notes on the algorithm and its development are available here:
16   *    https://www.mediawiki.org/wiki/User:TJones_(WMF)/Notes/Khmer_Reordering
17   *
18   */
19  
20  package org.wikimedia.search.extra.analysis.khmer;
21  
22  import static java.util.Collections.unmodifiableMap;
23  
24  import java.util.ArrayList;
25  import java.util.HashMap;
26  import java.util.Map;
27  import java.util.regex.Matcher;
28  import java.util.regex.Pattern;
29  
30  public final class KhmerSyllableReorderer {
31  
32      private KhmerSyllableReorderer() {
33          // Utility classes should not have a public or default constructor.
34      }
35  
36      // useful characters and character classes
37      // strings are mostly used to build up more complex patterns
38      // grouped into consonants, vowels, and diacritics
39      private static final String  CONSONANT = "[\u1780-\u17A2]";
40      private static final String  RO = "\u179A";
41      private static final String  COENG = "\u17D2";
42      private static final Pattern MULTI_COENG_PAT = Pattern.compile(COENG + "+");
43      private static final Pattern COENG_RO_PAT = Pattern.compile("^" + COENG + RO);
44  
45      private static final String  INDEP_VOWEL = "[\u17A3-\u17B3]";
46      private static final Pattern DEP_VOWEL_PAT = Pattern.compile("[\u17B6-\u17C5]");
47  
48      private static final String  DIACRITIC = "[\u17C6-\u17D1\u17DD]";
49      private static final Pattern REG_SHIFTER_PAT = Pattern.compile("[\u17C9\u17CA]");
50      private static final String  ROBAT = "\u17CC";
51      private static final Pattern NON_SPACING_PAT = Pattern.compile("[\u17C6\u17CB\u17CD-\u17D1\u17DD]");
52      private static final Pattern SPACING_PAT = Pattern.compile("[\u17C7\u17C8]");
53      private static final String  ZERO_WIDTH = "[\u200B-\u200D\u00AD\u2063]";
54  
55      // A syllable is a consonant or independent vowel, followed by zero or more of (i) a
56      // supplementary cons or indep vowel (coeng + cons or indep vowel), (ii) a sequence of
57      // dependent vowels, diacritics, or zero-width characters. We also allow multiple
58      // coengs in (i) because they are usually invisible and typos happen.
59      private static final String SYLL_DEF =
60          "(?:" + CONSONANT + "|" + INDEP_VOWEL + ")" +
61          "(?:" + COENG + "+(?:" + CONSONANT + "|" + INDEP_VOWEL + ")" +
62                "|(?:" + DEP_VOWEL_PAT.pattern() + "|" + DIACRITIC + "|" + ZERO_WIDTH + ")+" +
63          ")*";
64  
65      // Create a Pattern that can be used outside this class to match syllables
66      static final Pattern SYLL_PAT = Pattern.compile(SYLL_DEF);
67  
68      // a "coeng chunk" is a coeng (or several) plus a cons or indep vowel, and an optional
69      // register shifter.
70      private static final String CHUNK_DEF =
71          "(?:" + COENG + "+" +
72          "(?:" + CONSONANT + "|" + INDEP_VOWEL + ")" +
73          REG_SHIFTER_PAT.pattern() + "?)";
74  
75      // Pattern to match a coeng chunk or an individual character.
76      private static final Pattern CHUNK_OR_CHAR_PAT = Pattern.compile(CHUNK_DEF + "|.");
77  
78      // map of vowel characters that need to be merged, for internal use after reordering.
79      private static final Map<String, String> MERGE_VOWELS_MAP = unmodifiableMap(initMergeVowelsMap());
80  
81      // use the keys of MERGE_VOWELS_MAP to build MERGE_VOWELS_PAT; they are sequences of
82      // characters, so build a group (ab|cd|ef).
83      private static final Pattern MERGE_VOWELS_PAT =
84          Pattern.compile("(" + String.join("|", MERGE_VOWELS_MAP.keySet()) + ")");
85  
86      private static Map<String, String> initMergeVowelsMap() {
87          Map<String, String> map = new HashMap<>();
88          map.put("\u17C1\u17B8", "\u17BE");       // replace េ + ី with ើ
89          map.put("\u17B8\u17C1", "\u17BE");       // replace ី + េ with ើ
90          map.put("\u17C1\u17B6", "\u17C4");       // replace េ + ា  with ោ
91          return map;
92      }
93  
94      // rather than multiple calls to replaceAll, lets find and replace everything at once!
95      private static String replacePatWithMap(String s, Pattern pat, Map<String, String> map) {
96          Matcher m = pat.matcher(s);
97          StringBuffer sb = new StringBuffer();
98          while (m.find()) {
99              String charToReplace = m.group();
100             m.appendReplacement(sb, map.get(charToReplace));
101         }
102         m.appendTail(sb);
103         return sb.toString();
104     }
105 
106     // Replace duplicate elements in an array with empty string, which will disappear when
107     // we join them later.
108     private static ArrayList<CharSequence> dedupeArrayList(ArrayList<CharSequence> myList) {
109         for (int i = 1; i < myList.size(); i++) {
110             if (myList.get(i).equals(myList.get(i - 1))) {
111                 myList.set(i - 1, "");
112             }
113         }
114         return myList;
115     }
116 
117     // Input String s is assumed to be a string that was matched by SYLL_PAT. Other input
118     // will likely get mangled, probably truncated to the first character.
119     static String reorderKhmerSyllable(String s) {
120         assert !Character.isHighSurrogate(s.charAt(0)) : "the string s must match the syllable pattern";
121 
122         StringBuilder sb = new StringBuilder(s);
123 
124         ArrayList<CharSequence> coengChunks = new ArrayList<CharSequence>();
125         ArrayList<CharSequence> depVowelChunks = new ArrayList<CharSequence>();
126         ArrayList<CharSequence> regShifterChunks = new ArrayList<CharSequence>();
127         ArrayList<CharSequence> robatChunks = new ArrayList<CharSequence>();
128         ArrayList<CharSequence> nonSpacingChunks = new ArrayList<CharSequence>();
129         ArrayList<CharSequence> spacingChunks = new ArrayList<CharSequence>();
130 
131         // Match chunks for everything after the base char.
132         Matcher m = CHUNK_OR_CHAR_PAT.matcher(sb.subSequence(1, sb.length()));
133         int chunkCount = 1; // count the base character
134 
135         // Collect the various chunks by type, while keeping types in their original order.
136         while (m.find()) {
137             String chunk = m.group();
138             chunkCount++;
139 
140             // The cases below are in decreasing frequency based on a sample from Khmer
141             // Wikipedia. Coeng chunks always start with a coeng, robat is one character,
142             // the rest are regex character classes, so we use matches().
143             if (DEP_VOWEL_PAT.matcher(chunk).find()) {
144                 depVowelChunks.add(chunk);
145             } else if (chunk.startsWith(COENG)) {
146                 // Remove duplicate coengs, if any, first.
147                 chunk = MULTI_COENG_PAT.matcher(chunk).replaceAll(COENG);
148                 coengChunks.add(chunk);
149             } else if (NON_SPACING_PAT.matcher(chunk).find()) {
150                 nonSpacingChunks.add(chunk);
151             } else if (SPACING_PAT.matcher(chunk).find()) {
152                 spacingChunks.add(chunk);
153             } else if (REG_SHIFTER_PAT.matcher(chunk).find()) {
154                 regShifterChunks.add(chunk);
155             } else if (chunk.equals(ROBAT)) {
156                 robatChunks.add(chunk);
157             }
158         }
159 
160         // Reorder coeng chunks/supplementary consonants (ro is always last).
161         int coengNum = coengChunks.size();
162         for (int i = 0; i < coengNum; i++) {
163             if (COENG_RO_PAT.matcher(coengChunks.get(i)).find()) {
164                 coengChunks.add(coengChunks.get(i));
165                 coengChunks.set(i, "");
166             }
167         }
168 
169         // Merge various chunk types in the right order and dedupe.
170         ArrayList<CharSequence> allChunks = new ArrayList<CharSequence>(chunkCount);
171 
172         allChunks.add(sb.subSequence(0, 1)); // the base character
173         allChunks.addAll(regShifterChunks);
174         allChunks.addAll(robatChunks);
175         allChunks.addAll(coengChunks);
176         allChunks.addAll(depVowelChunks);
177         allChunks.addAll(nonSpacingChunks);
178         allChunks.addAll(spacingChunks);
179 
180         allChunks = dedupeArrayList(allChunks);
181 
182         // Re-join chunks in the new order, merge split vowels, and send it back!
183         return replacePatWithMap(
184             String.join("", allChunks),
185             MERGE_VOWELS_PAT, MERGE_VOWELS_MAP
186         );
187     }
188 
189 }