View Javadoc
1   package org.wikimedia.search.extra.analysis.textify;
2   
3   import java.io.IOException;
4   import java.util.Set;
5   import java.util.regex.Pattern;
6   
7   import javax.annotation.Nullable;
8   
9   import org.apache.lucene.analysis.TokenFilter;
10  import org.apache.lucene.analysis.TokenStream;
11  import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
12  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
13  import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
14  import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
15  import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
16  
17  import com.google.common.collect.Table;
18  import com.ibm.icu.lang.UScript;
19  
20  import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
21  
22  @SuppressFBWarnings(value = "EQ_DOESNT_OVERRIDE_EQUALS", justification = "Standard pattern for token filters.")
23  public final class ICUTokenRepairFilter extends TokenFilter {
24  
25      // core attributes should be present
26      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
27      private final OffsetAttribute offAtt = addAttribute(OffsetAttribute.class);
28      private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
29      private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
30      private int currType; // numerical code for type in standard and ICU tokenizers
31  
32      // script attribute is only present if ICU tokenizer was used
33      @Nullable private final ScriptAttribute scriptAtt = getAttribute(ScriptAttribute.class);
34  
35      @Nullable private State prevState;
36      private final TmpTok prevTok = new TmpTok();
37      private boolean inputEnd;
38  
39      private final int maxTokenLength;
40      private final boolean keepCamelSplit;
41      private final boolean mergeNumOnly;
42      private final boolean isTypeAllowList;
43      private final Set<Integer> mergeableTypes;
44      private final boolean filterScriptPairs;
45      @Nullable private final Table<Integer, Integer, Boolean> mergeableScriptPairs;
46  
47      private static final Pattern HAS_LETTER_PAT = Pattern.compile(".*\\p{L}.*");
48  
49      public ICUTokenRepairFilter(TokenStream input) {
50          this(input, new ICUTokenRepairFilterConfig());
51      }
52  
53      public ICUTokenRepairFilter(TokenStream input, ICUTokenRepairFilterConfig cfg) {
54          super(input);
55          maxTokenLength = cfg.maxTokenLength;
56          keepCamelSplit = cfg.keepCamelSplit;
57          mergeNumOnly = cfg.mergeNumOnly;
58          isTypeAllowList = cfg.isTypeAllowList;
59          mergeableTypes = cfg.mergeableTypes;
60          filterScriptPairs = cfg.filterScriptPairs;
61          mergeableScriptPairs = cfg.mergeableScriptPairs;
62  
63          if (!keepCamelSplit && mergeNumOnly) {
64              throw new IllegalArgumentException("ICU Token Repair invalid argument: Setting " +
65                  "'merge numbers only' to true and setting 'keep camelCase split' to false  " +
66                  "are not compatible");
67          }
68      }
69  
70      @SuppressWarnings("CyclomaticComplexity") // 11 out of 10
71      @Override
72      public boolean incrementToken() throws IOException {
73  
74          if (scriptAtt == null) {
75              // tokenizer is misconfigured or not using script attributes.. do nothing!
76              return input.incrementToken();
77          }
78  
79          // while we haven't exhausted input or we have a cached token
80          while (!inputEnd || prevState != null) {
81  
82              if (inputEnd || !input.incrementToken()) {
83                  // no more new tokens (& no current token) ...
84                  inputEnd = true;
85                  if (prevState != null) {
86                      // ... but we have a previous one, ship it
87                      restoreStateEtc();
88                      prevState = null;
89                      return true;
90                  }
91  
92                  // all out of tokens
93                  return false;
94              }
95  
96              // on successful input.incrementToken(), capture the int token type
97              currType = TextifyUtils.getTokenType(typeAtt.type());
98  
99              // ICU tokenizer mislabels anything that ends with two+ digits as <NUM>; check
100             // for letters with HAS_LETTER_PAT, and reset to <ALPHANUM> as needed
101             if (currType == TextifyUtils.TOKEN_TYPE_NUM && HAS_LETTER_PAT.matcher(termAtt).matches()) {
102                 typeAtt.setType(TextifyUtils.getTokenTypeName(TextifyUtils.TOKEN_TYPE_ALPHANUM));
103                 currType = TextifyUtils.TOKEN_TYPE_ALPHANUM;
104             }
105 
106             // we have a current token ...
107             if (prevState != null) {
108                 // ... and we have previous token, too; merge them?
109 
110                 if (!prevTok.canMergeWithCurrTok()) {
111                     // can't merge, so cache the new token, restore the old token
112                     prevTok.captureCurrentToken();
113                     State tmpState = captureState();
114                     restoreStateEtc();
115                     prevState = tmpState;
116                     return true;
117                 }
118                 // it's mergin' time...
119                 prevTok.mergeIntoCurrTok();
120                 prevState = captureState();
121             } else {
122                 // ... but no previous token, capture this one and try again
123                 prevTok.captureCurrentToken();
124                 prevState = captureState();
125             }
126         }
127 
128         return false;
129     }
130 
131     private void restoreStateEtc() {
132         restoreState(prevState);
133         currType = TextifyUtils.getTokenType(typeAtt.type());
134         if (isWeakTokenType(currType)) {
135             scriptAtt.setCode(UScript.COMMON);
136         }
137     }
138 
139     /* temp storage for current values while we restore previous values */
140     private final class TmpTok {
141         final StringBuilder term = new StringBuilder();
142         int startOff;
143         int endOff;
144         int posIncr;
145         int type; // type we report for this token
146         int lastType; // type of last piece added to this token
147         int script; // script we report for this token
148         int lastScript; // script of last piece added to this token
149 
150         private void captureCurrentToken() {
151             // pull out useful bits of info for easy access
152             term.setLength(0);
153             term.append(termAtt);
154             startOff = offAtt.startOffset();
155             endOff = offAtt.endOffset();
156             posIncr = posAtt.getPositionIncrement();
157             type = currType;
158             lastType = currType;
159             script = scriptAtt.getCode();
160             lastScript = script;
161         }
162 
163         /* find the last *real* character of previous token, after skipping diacritics & invisibles */
164         private int getPrevLastRealCharType() {
165             int prevType = TextifyUtils.TOKEN_TYPE_OTHER;
166             int i;
167             for (i = term.length() - 1; i >= 0; i--) {
168                 int codepoint = Character.codePointAt(term, i);
169                 if (Character.isLowSurrogate((char) codepoint) && i > 0 &&
170                         Character.isHighSurrogate(term.charAt(i - 1))) {
171                     i--;
172                     codepoint = Character.codePointAt(term, i);
173                 }
174                 prevType = TextifyUtils.getCustomCharType(codepoint);
175                 if (TextifyUtils.isMarkOrFormatType(prevType)) {
176                     continue;
177                 }
178                 return prevType;
179             }
180             return prevType;
181         }
182 
183         /* find the first *real* character of next token, after skipping diacritics & invisibles */
184         private int getNextFirstRealCharType() {
185             int nextType = TextifyUtils.TOKEN_TYPE_OTHER;
186             for (int i = 0; i < termAtt.length();) {
187                 int codepoint = Character.codePointAt(termAtt, i);
188                 i += Character.charCount(codepoint);
189                 nextType = TextifyUtils.getCustomCharType(codepoint);
190                 if (TextifyUtils.isMarkOrFormatType(nextType)) {
191                     continue;
192                 }
193                 return nextType;
194             }
195             return nextType;
196         }
197 
198         private boolean isUnmergeableTokenType(int type) {
199             // This seems a bit tricky; it's an XOR of isAllowList and mergeableSet:
200             // - if allowList is true, mergeableSet says whether it's mergeable,
201             //   so isUnmergeable is the opposite of allowList
202             // - if allowList is false, mergeableSet says whether it's *un*mergeable,
203             //   so isUnmergeable is *also* the opposite of allowList
204             // ... and for booleans, != is the same as XOR
205             return isTypeAllowList != mergeableTypes.contains(type);
206         }
207 
208         private boolean canMergeWithCurrTok() {
209             // this == "previous" token
210             // current AttributeSource == "next" token
211 
212             // end of prev token must equal start of next token
213             if (endOff != offAtt.startOffset()) {
214                 return false;
215             }
216 
217             // scripts must be different, or it's not a ICU tokenizer error
218             if (lastScript == scriptAtt.getCode()) {
219                 return false;
220             }
221 
222             // cases where we have to inspect previous and next characters
223             if (camelSplitOrMergeNumCheck()) {
224                 return false;
225             }
226 
227             // Some types, like EMOJI, always split and shouldn't be remerged
228             if (isUnmergeableTokenType(type) || isUnmergeableTokenType(currType)) {
229                 return false;
230             }
231 
232             // If we are filtering scripts and this pair is not allowed, just say no--
233             // unless one token is a plain number (assume that if <NUM> is not allowed,
234             // it will be filtered by isUnmergeableTokenType() checks above).
235             if (scriptPairCheck()) {
236                 return false;
237             }
238 
239             // don't merge if the new token would be too long
240             if (term.length() + termAtt.length() > maxTokenLength) {
241                 return false;
242             }
243 
244             return true;
245         }
246 
247         private boolean scriptPairCheck() {
248             return filterScriptPairs && lastType != TextifyUtils.TOKEN_TYPE_NUM &&
249                     currType != TextifyUtils.TOKEN_TYPE_NUM &&
250                     !mergeableScriptPairs.contains(lastScript, scriptAtt.getCode());
251         }
252 
253         private boolean camelSplitOrMergeNumCheck() {
254             if (keepCamelSplit || mergeNumOnly) {
255                 int prevLastCharType = getPrevLastRealCharType();
256                 int nextFirstCharType = getNextFirstRealCharType();
257 
258                 // camel|Case split should be left alone
259                 if (keepCamelSplit &&
260                     TextifyUtils.isTrailingLowercaseishType(prevLastCharType) &&
261                     TextifyUtils.isLeadingUppercaseishType(nextFirstCharType)) {
262                     return true;
263                 }
264 
265                 // must be either 3|A or A|۱ or 3|۱ to be rejoined
266                 if (mergeNumOnly &&
267                     (prevLastCharType != Character.DECIMAL_DIGIT_NUMBER) &&
268                     (nextFirstCharType != Character.DECIMAL_DIGIT_NUMBER)) {
269                     return true;
270                 }
271             }
272             return false;
273         }
274 
275         private void mergeIntoCurrTok() {
276             // this == previous token
277             // current AttributeSource == next token
278 
279             term.append(termAtt.buffer(), 0, termAtt.length());
280             termAtt.setEmpty();
281             termAtt.append(term);
282 
283             posIncr += posAtt.getPositionIncrement() - 1;
284             posAtt.setPositionIncrement(posIncr);
285 
286             endOff = offAtt.endOffset();
287             offAtt.setOffset(startOff, endOff);
288 
289             lastScript = scriptAtt.getCode();
290             script = mergeScripts();
291             scriptAtt.setCode(script);
292 
293             lastType = currType;
294             type = mergeTokenTypes();
295             typeAtt.setType(TextifyUtils.getTokenTypeName(type));
296             currType = type;
297         }
298 
299         private int mergeScripts() {
300             // if one token is a weak type, return the other script
301             // if both are weak, it doesn't really matter, since the script
302             // attribute will overwritten as "Common" in restoreStateEtc()
303             if (isWeakTokenType(type)) {
304                 return scriptAtt.getCode();
305             }
306 
307             if (isWeakTokenType(currType)) {
308                 return script;
309             }
310 
311             return UScript.UNKNOWN;
312         }
313 
314         private int mergeTokenTypes() {
315             if (type == currType || isWeakTokenType(currType)) {
316                 return type;
317             }
318 
319             if (isWeakTokenType(type)) {
320                 return currType;
321             }
322 
323             // standard tokenizer combines hangul and alphanum to alphanum, so why not?
324             if ((type == TextifyUtils.TOKEN_TYPE_HANGUL && currType == TextifyUtils.TOKEN_TYPE_ALPHANUM) ||
325                 (currType == TextifyUtils.TOKEN_TYPE_HANGUL && type == TextifyUtils.TOKEN_TYPE_ALPHANUM)) {
326                 return TextifyUtils.TOKEN_TYPE_ALPHANUM;
327             }
328 
329             // ICU tok doesn't seem to return HIRAGANA or KATAKANA, but if it ever does,
330             // they can merge with IDEOGRAPHIC; ICU tok merges Han, Hiragana, and Katakana
331             // scripts to "Chinese/Japanese" (externally) or "Jpan" (internally)
332             if (isIdeoTokenType(type) && isIdeoTokenType(currType)) {
333                 return TextifyUtils.TOKEN_TYPE_IDEOGRAPHIC;
334             }
335 
336             return TextifyUtils.TOKEN_TYPE_OTHER;
337         }
338 
339         private boolean isIdeoTokenType(int ty) {
340             switch (ty) {
341                 case TextifyUtils.TOKEN_TYPE_IDEOGRAPHIC:
342                 case TextifyUtils.TOKEN_TYPE_HIRAGANA:
343                 case TextifyUtils.TOKEN_TYPE_KATAKANA:
344                     return true;
345                 default:
346                     return false;
347             }
348         }
349 
350     }
351 
352     /* "Weak" character/token types, like numbers, can take on a more specific
353      * type if they are in the same token with something else.
354      */
355     private boolean isWeakTokenType(int ty) {
356         switch (ty) {
357             case TextifyUtils.TOKEN_TYPE_NUM:
358             case TextifyUtils.TOKEN_TYPE_EMOJI:
359                 return true;
360             default:
361                 return false;
362         }
363     }
364 
365     @Override
366     public void reset() throws IOException {
367         super.reset();
368         inputEnd = false;
369         prevState = null;
370         currType = 0;
371     }
372 }