1
2
3
4
5
6
7
8
9
10
11
12
13
14 package org.wikimedia.search.extra.analysis.textify;
15
16 import static java.util.Collections.unmodifiableMap;
17
18 import java.util.Arrays;
19 import java.util.HashMap;
20 import java.util.List;
21 import java.util.ListIterator;
22 import java.util.Map;
23
24 import javax.annotation.Nullable;
25
26 import org.apache.lucene.analysis.standard.StandardTokenizer;
27
28 import com.google.common.collect.HashBasedTable;
29 import com.google.common.collect.Table;
30 import com.ibm.icu.lang.UScript;
31
32 public final class TextifyUtils {
33
/*
 * Integer codes for token types. The first eight are taken directly from
 * the type constants of Lucene's {@link StandardTokenizer}.
 */
protected static final int TOKEN_TYPE_ALPHANUM = StandardTokenizer.ALPHANUM;
protected static final int TOKEN_TYPE_NUM = StandardTokenizer.NUM;
protected static final int TOKEN_TYPE_SOUTHEAST_ASIAN = StandardTokenizer.SOUTHEAST_ASIAN;
protected static final int TOKEN_TYPE_IDEOGRAPHIC = StandardTokenizer.IDEOGRAPHIC;
protected static final int TOKEN_TYPE_HIRAGANA = StandardTokenizer.HIRAGANA;
protected static final int TOKEN_TYPE_KATAKANA = StandardTokenizer.KATAKANA;
protected static final int TOKEN_TYPE_HANGUL = StandardTokenizer.HANGUL;
protected static final int TOKEN_TYPE_EMOJI = StandardTokenizer.EMOJI;

/** Code used for any token type name that {@code getTokenType} does not recognize. */
protected static final int TOKEN_TYPE_OTHER = -1;

/** Type name that corresponds to {@link #TOKEN_TYPE_OTHER}. */
protected static final String TOKEN_TYPE_OTHER_WORD = "<OTHER>";

/**
 * Read-only lookup from token type name to integer code, built once when
 * the class is initialized. The initializer only reads compile-time
 * constants and {@code StandardTokenizer.TOKEN_TYPES}, so its position
 * relative to the constants above does not matter.
 */
private static final Map<String, Integer> TOKEN_TYPE_STR2INT = initTokenTypeMappings();

/** Static helpers only; this class is never instantiated. */
private TextifyUtils() {}
58
59 protected static boolean isLetterType(int type) {
60 switch (type) {
61 case Character.LOWERCASE_LETTER:
62 case Character.UPPERCASE_LETTER:
63 case Character.TITLECASE_LETTER:
64 case Character.OTHER_LETTER:
65 case Character.MODIFIER_LETTER:
66 return true;
67 default:
68 return false;
69 }
70 }
71
72 protected static boolean isMarkOrFormatType(int type) {
73 switch (type) {
74 case Character.FORMAT:
75 case Character.COMBINING_SPACING_MARK:
76 case Character.NON_SPACING_MARK:
77 case Character.ENCLOSING_MARK:
78 return true;
79 default:
80 return false;
81 }
82 }
83
84 protected static boolean isPeriodlikeChar(int c) {
85 switch (c) {
86 case '.':
87 case '.':
88 return true;
89 default:
90 return false;
91 }
92 }
93
94
95
96
97 protected static boolean isLeadingUppercaseishType(int type) {
98 switch (type) {
99 case Character.UPPERCASE_LETTER:
100 case Character.TITLECASE_LETTER:
101 return true;
102 default:
103 return false;
104 }
105 }
106
107
108
109
110 protected static boolean isTrailingLowercaseishType(int type) {
111 switch (type) {
112 case Character.LOWERCASE_LETTER:
113 case Character.TITLECASE_LETTER:
114 return true;
115 default:
116 return false;
117 }
118 }
119
120 protected static int getCustomCharType(int c) {
121 if (c == 0x2069) {
122 return Character.FORMAT;
123 }
124 return Character.getType(c);
125 }
126
127 protected static String getTokenTypeName(int typeInt) {
128 switch (typeInt) {
129 case TOKEN_TYPE_OTHER:
130 return TOKEN_TYPE_OTHER_WORD;
131 default:
132 return StandardTokenizer.TOKEN_TYPES[typeInt];
133 }
134 }
135
136 protected static int getTokenType(String typeStr) {
137 return TOKEN_TYPE_STR2INT.getOrDefault(typeStr, TOKEN_TYPE_OTHER);
138 }
139
140 private static Map<String, Integer> initTokenTypeMappings() {
141 Map<String, Integer> map = new HashMap<>();
142 for (int i = 0; i < StandardTokenizer.TOKEN_TYPES.length; i++) {
143 map.put(StandardTokenizer.TOKEN_TYPES[i], i);
144 }
145 map.put(TOKEN_TYPE_OTHER_WORD, TOKEN_TYPE_OTHER);
146 return unmodifiableMap(map);
147 }
148
149
150
151
152
153 protected static Table<Integer, Integer, Boolean> parseICUTokenRepairScriptList(
154 String scriptGroups) {
155 if (scriptGroups.length() == 0) {
156 return parseICUTokenRepairScriptList((List<String>) null);
157 }
158 return parseICUTokenRepairScriptList(Arrays.asList(scriptGroups.split(", *")));
159 }
160
161
162 protected static Table<Integer, Integer, Boolean> parseICUTokenRepairScriptList(
163 @Nullable List<String> listOfScriptGroups) {
164 if (listOfScriptGroups == null || listOfScriptGroups.isEmpty()) {
165 return HashBasedTable.create();
166 }
167
168 ListIterator<String> iter = listOfScriptGroups.listIterator();
169 Table<Integer, Integer, Boolean> scriptTable = HashBasedTable.create();
170
171 while (iter.hasNext()) {
172 String[] group = iter.next().split("\\+");
173 int glen = group.length;
174 int[] groupCode = new int[glen];
175
176 for (int i = 0; i < glen; i++) {
177 if (isJpanScriptName(group[i])) {
178 group[i] = "Jpan";
179 }
180 groupCode[i] = UScript.getCodeFromName(group[i]);
181 if (groupCode[i] == UScript.INVALID_CODE) {
182 throw new IllegalArgumentException("ICU Token Repair invalid argument: " +
183 "unrecognized script " + group[i]);
184 }
185 }
186
187 for (int i = 0; i < glen; i++) {
188 for (int j = i + 1; j < glen; j++) {
189
190 scriptTable.put(groupCode[i], groupCode[j], Boolean.TRUE);
191 scriptTable.put(groupCode[j], groupCode[i], Boolean.TRUE);
192 }
193 }
194 }
195
196 return scriptTable;
197 }
198
199 private static boolean isJpanScriptName(String scr) {
200
201
202
203 switch (scr) {
204 case "Chinese":
205 case "Japanese":
206 case "Chinese/Japanese":
207 return true;
208 default:
209 return false;
210 }
211 }
212
213 }