1 package org.wikimedia.search.extra.analysis.textify;
2
3 import java.io.IOException;
4 import java.util.Set;
5 import java.util.regex.Pattern;
6
7 import javax.annotation.Nullable;
8
9 import org.apache.lucene.analysis.TokenFilter;
10 import org.apache.lucene.analysis.TokenStream;
11 import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
12 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
13 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
14 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
15 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
16
17 import com.google.common.collect.Table;
18 import com.ibm.icu.lang.UScript;
19
20 import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
21
22 @SuppressFBWarnings(value = "EQ_DOESNT_OVERRIDE_EQUALS", justification = "Standard pattern for token filters.")
23 public final class ICUTokenRepairFilter extends TokenFilter {
24
25
26 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
27 private final OffsetAttribute offAtt = addAttribute(OffsetAttribute.class);
28 private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
29 private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
30 private int currType;
31
32
33 @Nullable private final ScriptAttribute scriptAtt = getAttribute(ScriptAttribute.class);
34
35 @Nullable private State prevState;
36 private final TmpTok prevTok = new TmpTok();
37 private boolean inputEnd;
38
39 private final int maxTokenLength;
40 private final boolean keepCamelSplit;
41 private final boolean mergeNumOnly;
42 private final boolean isTypeAllowList;
43 private final Set<Integer> mergeableTypes;
44 private final boolean filterScriptPairs;
45 @Nullable private final Table<Integer, Integer, Boolean> mergeableScriptPairs;
46
47 private static final Pattern HAS_LETTER_PAT = Pattern.compile(".*\\p{L}.*");
48
49 public ICUTokenRepairFilter(TokenStream input) {
50 this(input, new ICUTokenRepairFilterConfig());
51 }
52
53 public ICUTokenRepairFilter(TokenStream input, ICUTokenRepairFilterConfig cfg) {
54 super(input);
55 maxTokenLength = cfg.maxTokenLength;
56 keepCamelSplit = cfg.keepCamelSplit;
57 mergeNumOnly = cfg.mergeNumOnly;
58 isTypeAllowList = cfg.isTypeAllowList;
59 mergeableTypes = cfg.mergeableTypes;
60 filterScriptPairs = cfg.filterScriptPairs;
61 mergeableScriptPairs = cfg.mergeableScriptPairs;
62
63 if (!keepCamelSplit && mergeNumOnly) {
64 throw new IllegalArgumentException("ICU Token Repair invalid argument: Setting " +
65 "'merge numbers only' to true and setting 'keep camelCase split' to false " +
66 "are not compatible");
67 }
68 }
69
70 @SuppressWarnings("CyclomaticComplexity")
71 @Override
72 public boolean incrementToken() throws IOException {
73
74 if (scriptAtt == null) {
75
76 return input.incrementToken();
77 }
78
79
80 while (!inputEnd || prevState != null) {
81
82 if (inputEnd || !input.incrementToken()) {
83
84 inputEnd = true;
85 if (prevState != null) {
86
87 restoreStateEtc();
88 prevState = null;
89 return true;
90 }
91
92
93 return false;
94 }
95
96
97 currType = TextifyUtils.getTokenType(typeAtt.type());
98
99
100
101 if (currType == TextifyUtils.TOKEN_TYPE_NUM && HAS_LETTER_PAT.matcher(termAtt).matches()) {
102 typeAtt.setType(TextifyUtils.getTokenTypeName(TextifyUtils.TOKEN_TYPE_ALPHANUM));
103 currType = TextifyUtils.TOKEN_TYPE_ALPHANUM;
104 }
105
106
107 if (prevState != null) {
108
109
110 if (!prevTok.canMergeWithCurrTok()) {
111
112 prevTok.captureCurrentToken();
113 State tmpState = captureState();
114 restoreStateEtc();
115 prevState = tmpState;
116 return true;
117 }
118
119 prevTok.mergeIntoCurrTok();
120 prevState = captureState();
121 } else {
122
123 prevTok.captureCurrentToken();
124 prevState = captureState();
125 }
126 }
127
128 return false;
129 }
130
131 private void restoreStateEtc() {
132 restoreState(prevState);
133 currType = TextifyUtils.getTokenType(typeAtt.type());
134 if (isWeakTokenType(currType)) {
135 scriptAtt.setCode(UScript.COMMON);
136 }
137 }
138
139
140 private final class TmpTok {
141 final StringBuilder term = new StringBuilder();
142 int startOff;
143 int endOff;
144 int posIncr;
145 int type;
146 int lastType;
147 int script;
148 int lastScript;
149
150 private void captureCurrentToken() {
151
152 term.setLength(0);
153 term.append(termAtt);
154 startOff = offAtt.startOffset();
155 endOff = offAtt.endOffset();
156 posIncr = posAtt.getPositionIncrement();
157 type = currType;
158 lastType = currType;
159 script = scriptAtt.getCode();
160 lastScript = script;
161 }
162
163
164 private int getPrevLastRealCharType() {
165 int prevType = TextifyUtils.TOKEN_TYPE_OTHER;
166 int i;
167 for (i = term.length() - 1; i >= 0; i--) {
168 int codepoint = Character.codePointAt(term, i);
169 if (Character.isLowSurrogate((char) codepoint) && i > 0 &&
170 Character.isHighSurrogate(term.charAt(i - 1))) {
171 i--;
172 codepoint = Character.codePointAt(term, i);
173 }
174 prevType = TextifyUtils.getCustomCharType(codepoint);
175 if (TextifyUtils.isMarkOrFormatType(prevType)) {
176 continue;
177 }
178 return prevType;
179 }
180 return prevType;
181 }
182
183
184 private int getNextFirstRealCharType() {
185 int nextType = TextifyUtils.TOKEN_TYPE_OTHER;
186 for (int i = 0; i < termAtt.length();) {
187 int codepoint = Character.codePointAt(termAtt, i);
188 i += Character.charCount(codepoint);
189 nextType = TextifyUtils.getCustomCharType(codepoint);
190 if (TextifyUtils.isMarkOrFormatType(nextType)) {
191 continue;
192 }
193 return nextType;
194 }
195 return nextType;
196 }
197
198 private boolean isUnmergeableTokenType(int type) {
199
200
201
202
203
204
205 return isTypeAllowList != mergeableTypes.contains(type);
206 }
207
208 private boolean canMergeWithCurrTok() {
209
210
211
212
213 if (endOff != offAtt.startOffset()) {
214 return false;
215 }
216
217
218 if (lastScript == scriptAtt.getCode()) {
219 return false;
220 }
221
222
223 if (camelSplitOrMergeNumCheck()) {
224 return false;
225 }
226
227
228 if (isUnmergeableTokenType(type) || isUnmergeableTokenType(currType)) {
229 return false;
230 }
231
232
233
234
235 if (scriptPairCheck()) {
236 return false;
237 }
238
239
240 if (term.length() + termAtt.length() > maxTokenLength) {
241 return false;
242 }
243
244 return true;
245 }
246
247 private boolean scriptPairCheck() {
248 return filterScriptPairs && lastType != TextifyUtils.TOKEN_TYPE_NUM &&
249 currType != TextifyUtils.TOKEN_TYPE_NUM &&
250 !mergeableScriptPairs.contains(lastScript, scriptAtt.getCode());
251 }
252
253 private boolean camelSplitOrMergeNumCheck() {
254 if (keepCamelSplit || mergeNumOnly) {
255 int prevLastCharType = getPrevLastRealCharType();
256 int nextFirstCharType = getNextFirstRealCharType();
257
258
259 if (keepCamelSplit &&
260 TextifyUtils.isTrailingLowercaseishType(prevLastCharType) &&
261 TextifyUtils.isLeadingUppercaseishType(nextFirstCharType)) {
262 return true;
263 }
264
265
266 if (mergeNumOnly &&
267 (prevLastCharType != Character.DECIMAL_DIGIT_NUMBER) &&
268 (nextFirstCharType != Character.DECIMAL_DIGIT_NUMBER)) {
269 return true;
270 }
271 }
272 return false;
273 }
274
275 private void mergeIntoCurrTok() {
276
277
278
279 term.append(termAtt.buffer(), 0, termAtt.length());
280 termAtt.setEmpty();
281 termAtt.append(term);
282
283 posIncr += posAtt.getPositionIncrement() - 1;
284 posAtt.setPositionIncrement(posIncr);
285
286 endOff = offAtt.endOffset();
287 offAtt.setOffset(startOff, endOff);
288
289 lastScript = scriptAtt.getCode();
290 script = mergeScripts();
291 scriptAtt.setCode(script);
292
293 lastType = currType;
294 type = mergeTokenTypes();
295 typeAtt.setType(TextifyUtils.getTokenTypeName(type));
296 currType = type;
297 }
298
299 private int mergeScripts() {
300
301
302
303 if (isWeakTokenType(type)) {
304 return scriptAtt.getCode();
305 }
306
307 if (isWeakTokenType(currType)) {
308 return script;
309 }
310
311 return UScript.UNKNOWN;
312 }
313
314 private int mergeTokenTypes() {
315 if (type == currType || isWeakTokenType(currType)) {
316 return type;
317 }
318
319 if (isWeakTokenType(type)) {
320 return currType;
321 }
322
323
324 if ((type == TextifyUtils.TOKEN_TYPE_HANGUL && currType == TextifyUtils.TOKEN_TYPE_ALPHANUM) ||
325 (currType == TextifyUtils.TOKEN_TYPE_HANGUL && type == TextifyUtils.TOKEN_TYPE_ALPHANUM)) {
326 return TextifyUtils.TOKEN_TYPE_ALPHANUM;
327 }
328
329
330
331
332 if (isIdeoTokenType(type) && isIdeoTokenType(currType)) {
333 return TextifyUtils.TOKEN_TYPE_IDEOGRAPHIC;
334 }
335
336 return TextifyUtils.TOKEN_TYPE_OTHER;
337 }
338
339 private boolean isIdeoTokenType(int ty) {
340 switch (ty) {
341 case TextifyUtils.TOKEN_TYPE_IDEOGRAPHIC:
342 case TextifyUtils.TOKEN_TYPE_HIRAGANA:
343 case TextifyUtils.TOKEN_TYPE_KATAKANA:
344 return true;
345 default:
346 return false;
347 }
348 }
349
350 }
351
352
353
354
355 private boolean isWeakTokenType(int ty) {
356 switch (ty) {
357 case TextifyUtils.TOKEN_TYPE_NUM:
358 case TextifyUtils.TOKEN_TYPE_EMOJI:
359 return true;
360 default:
361 return false;
362 }
363 }
364
365 @Override
366 public void reset() throws IOException {
367 super.reset();
368 inputEnd = false;
369 prevState = null;
370 currType = 0;
371 }
372 }