1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.wikimedia.search.extra.analysis.khmer;
21
import static java.util.Collections.unmodifiableMap;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
29
/**
 * Static helper that rewrites one Khmer syllable into a fixed character order.
 *
 * <p>{@link #SYLL_PAT} recognises a syllable: a base consonant or independent vowel followed by
 * any mix of coeng+letter groups, dependent vowels, diacritics and zero-width characters.
 * {@link #reorderKhmerSyllable(String)} takes such a syllable and emits its parts in the order
 * base, register shifters, robat, coeng groups (coeng+RO last), dependent vowels, non-spacing
 * diacritics, spacing diacritics. Along the way it collapses repeated coeng signs, drops
 * characters that belong to none of those groups (for example zero-width characters), removes
 * adjacent duplicate chunks and merges a few two-vowel sequences into a single vowel sign.
 */
public final class KhmerSyllableReorderer {

    private KhmerSyllableReorderer() {
        // static utility class; never instantiated
    }

    // --- consonants and subscript (coeng) handling ---------------------------------------

    /** Character class for the base consonants U+1780..U+17A2. */
    private static final String CONSONANT = "[\u1780-\u17A2]";
    /** The letter RO (U+179A); its subscript form is sorted after the other subscripts. */
    private static final String RO = "\u179A";
    /** The coeng sign (U+17D2), which introduces a subscript letter. */
    private static final String COENG = "\u17D2";
    /** One or more coeng signs in a row; used to collapse repeats down to a single coeng. */
    private static final Pattern MULTI_COENG_PAT = Pattern.compile(COENG + "+");
    /** A chunk that starts with coeng immediately followed by RO. */
    private static final Pattern COENG_RO_PAT = Pattern.compile("^" + COENG + RO);

    // --- vowels --------------------------------------------------------------------------

    /** Character class for the independent vowels U+17A3..U+17B3. */
    private static final String INDEP_VOWEL = "[\u17A3-\u17B3]";
    /** A dependent vowel sign, U+17B6..U+17C5. */
    private static final Pattern DEP_VOWEL_PAT = Pattern.compile("[\u17B6-\u17C5]");

    // --- diacritics and invisible characters ---------------------------------------------

    /** Character class for every diacritic accepted inside a syllable. */
    private static final String DIACRITIC = "[\u17C6-\u17D1\u17DD]";
    /** The two register shifters, U+17C9 and U+17CA. */
    private static final Pattern REG_SHIFTER_PAT = Pattern.compile("[\u17C9\u17CA]");
    /** The robat sign (U+17CC). */
    private static final String ROBAT = "\u17CC";
    /** Diacritics placed in the "non-spacing" output slot. */
    private static final Pattern NON_SPACING_PAT = Pattern.compile("[\u17C6\u17CB\u17CD-\u17D1\u17DD]");
    /** Diacritics placed in the final, "spacing" output slot (U+17C7, U+17C8). */
    private static final Pattern SPACING_PAT = Pattern.compile("[\u17C7\u17C8]");
    /** Zero-width and other invisible characters tolerated inside a syllable. */
    private static final String ZERO_WIDTH = "[\u200B-\u200D\u00AD\u2063]";

    /**
     * Regex source for a syllable: a base letter, then any number of either
     * coeng(s)+letter groups or runs of dependent vowels / diacritics / zero-width characters.
     */
    private static final String SYLL_DEF =
        "(?:" + CONSONANT + "|" + INDEP_VOWEL + ")"
        + "(?:" + COENG + "+(?:" + CONSONANT + "|" + INDEP_VOWEL + ")"
        + "|(?:" + DEP_VOWEL_PAT.pattern() + "|" + DIACRITIC + "|" + ZERO_WIDTH + ")+"
        + ")*";

    /** Compiled form of the syllable definition. */
    static final Pattern SYLL_PAT = Pattern.compile(SYLL_DEF);

    /**
     * Regex source for a subscript chunk: coeng(s), a letter, and an optional register shifter
     * that stays attached to that subscript.
     */
    private static final String CHUNK_DEF =
        "(?:" + COENG + "+"
        + "(?:" + CONSONANT + "|" + INDEP_VOWEL + ")"
        + REG_SHIFTER_PAT.pattern() + "?)";

    /** Tokenizer for the tail of a syllable: a whole subscript chunk, else a single character. */
    private static final Pattern CHUNK_OR_CHAR_PAT = Pattern.compile(CHUNK_DEF + "|.");

    /** Two-vowel sequences and the single vowel sign each one is replaced with. */
    private static final Map<String, String> MERGE_VOWELS_MAP = unmodifiableMap(initMergeVowelsMap());

    /** Alternation of every key of {@link #MERGE_VOWELS_MAP}. */
    private static final Pattern MERGE_VOWELS_PAT =
        Pattern.compile("(" + String.join("|", MERGE_VOWELS_MAP.keySet()) + ")");

    /**
     * Builds the vowel-merge table.
     *
     * <p>A {@link LinkedHashMap} is used so that the alternation generated for
     * {@link #MERGE_VOWELS_PAT} always lists the keys in the same order.
     */
    private static Map<String, String> initMergeVowelsMap() {
        Map<String, String> map = new LinkedHashMap<>();
        map.put("\u17C1\u17B8", "\u17BE"); // U+17C1 U+17B8 -> U+17BE
        map.put("\u17B8\u17C1", "\u17BE"); // same pair in the opposite order
        map.put("\u17C1\u17B6", "\u17C4"); // U+17C1 U+17B6 -> U+17C4
        return map;
    }

    /**
     * Replaces every match of {@code pat} in {@code s} with the value stored in {@code map}
     * under the matched text.
     *
     * <p>The replacement is inserted literally ({@code $} and {@code \} carry no special
     * meaning), and a match that has no entry in {@code map} is left as it was.
     *
     * @param s   text to rewrite
     * @param pat pattern whose matches are looked up in {@code map}
     * @param map matched text to replacement text
     * @return the rewritten text
     */
    private static String replacePatWithMap(String s, Pattern pat, Map<String, String> map) {
        Matcher m = pat.matcher(s);
        // StringBuffer rather than StringBuilder: it is the appendReplacement overload
        // available on every Java version.
        StringBuffer out = new StringBuffer(s.length());
        while (m.find()) {
            String matched = m.group();
            String replacement = map.getOrDefault(matched, matched);
            m.appendReplacement(out, Matcher.quoteReplacement(replacement));
        }
        m.appendTail(out);
        return out.toString();
    }

    /**
     * Concatenates {@code chunks}, skipping every chunk that is immediately followed by an
     * equal chunk, so that a run of identical neighbours contributes a single copy.
     * The list itself is not modified.
     */
    private static String joinDroppingRepeats(List<String> chunks) {
        StringBuilder joined = new StringBuilder();
        int last = chunks.size() - 1;
        for (int i = 0; i <= last; i++) {
            String current = chunks.get(i);
            boolean repeatedByNext = i < last && current.equals(chunks.get(i + 1));
            if (!repeatedByNext) {
                joined.append(current);
            }
        }
        return joined.toString();
    }

    /**
     * Reorders the parts of one syllable into the order: base character, register shifters,
     * robat, coeng chunks (coeng+RO chunks last), dependent vowels, non-spacing diacritics,
     * spacing diacritics.
     *
     * <p>Repeated coeng signs inside a chunk are collapsed to one, characters that fit none of
     * the categories are dropped, adjacent identical chunks are reduced to one, and finally the
     * vowel sequences listed in {@code MERGE_VOWELS_MAP} are merged.
     *
     * @param s the syllable; the first UTF-16 unit is taken as the base character. An empty
     *          string is returned unchanged.
     * @return the reordered syllable
     * @throws NullPointerException if {@code s} is {@code null}
     */
    static String reorderKhmerSyllable(String s) {
        if (s.isEmpty()) {
            // Nothing to reorder; without this guard charAt(0)/substring(1) would throw.
            return s;
        }
        // NOTE(review): the message talks about the syllable pattern, but the condition only
        // verifies that the base character is not the first half of a surrogate pair.
        assert !Character.isHighSurrogate(s.charAt(0)) : "the string s must match the syllable pattern";

        String base = s.substring(0, 1);

        List<String> regShifterChunks = new ArrayList<>();
        List<String> robatChunks = new ArrayList<>();
        List<String> coengChunks = new ArrayList<>();
        List<String> coengRoChunks = new ArrayList<>();
        List<String> depVowelChunks = new ArrayList<>();
        List<String> nonSpacingChunks = new ArrayList<>();
        List<String> spacingChunks = new ArrayList<>();

        // Tokenize everything after the base character and sort each token into its bucket.
        // The order of the tests matters: the first matching category wins.
        Matcher m = CHUNK_OR_CHAR_PAT.matcher(s.substring(1));
        while (m.find()) {
            String chunk = m.group();
            if (DEP_VOWEL_PAT.matcher(chunk).find()) {
                depVowelChunks.add(chunk);
            } else if (chunk.startsWith(COENG)) {
                String singleCoeng = MULTI_COENG_PAT.matcher(chunk).replaceAll(COENG);
                if (COENG_RO_PAT.matcher(singleCoeng).find()) {
                    // coeng+RO goes after all other coeng chunks. An empty placeholder stays
                    // in its old slot, so the chunks on either side of it are not treated as
                    // adjacent when duplicates are dropped.
                    // NOTE(review): confirm whether "X, coeng+RO, X" should collapse to one X.
                    coengRoChunks.add(singleCoeng);
                    coengChunks.add("");
                } else {
                    coengChunks.add(singleCoeng);
                }
            } else if (NON_SPACING_PAT.matcher(chunk).find()) {
                nonSpacingChunks.add(chunk);
            } else if (SPACING_PAT.matcher(chunk).find()) {
                spacingChunks.add(chunk);
            } else if (REG_SHIFTER_PAT.matcher(chunk).find()) {
                regShifterChunks.add(chunk);
            } else if (chunk.equals(ROBAT)) {
                robatChunks.add(chunk);
            }
            // anything else (e.g. zero-width characters) is intentionally not kept
        }

        int total = 1 + regShifterChunks.size() + robatChunks.size() + coengChunks.size()
            + coengRoChunks.size() + depVowelChunks.size() + nonSpacingChunks.size()
            + spacingChunks.size();
        List<String> ordered = new ArrayList<>(total);
        ordered.add(base);
        ordered.addAll(regShifterChunks);
        ordered.addAll(robatChunks);
        ordered.addAll(coengChunks);
        ordered.addAll(coengRoChunks);
        ordered.addAll(depVowelChunks);
        ordered.addAll(nonSpacingChunks);
        ordered.addAll(spacingChunks);

        return replacePatWithMap(joinDroppingRepeats(ordered), MERGE_VOWELS_PAT, MERGE_VOWELS_MAP);
    }

}