View Javadoc
1   package org.wikimedia.search.extra.analysis.homoglyph;
2   
3   import java.io.IOException;
4   import java.util.regex.Pattern;
5   
6   import org.apache.lucene.analysis.TokenFilter;
7   import org.apache.lucene.analysis.TokenStream;
8   import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
9   import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
10  
11  import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
12  
13  
14  @SuppressFBWarnings(value = "EQ_DOESNT_OVERRIDE_EQUALS", justification = "Standard pattern for token filters.")
15  public class HomoglyphTokenFilter extends TokenFilter {
16  
17      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
18      private final PositionIncrementAttribute posIncr = addAttribute(PositionIncrementAttribute.class);
19      private final TranslationTable translationTable;
20      private State state;
21      private final StringBuilder scriptOne = new StringBuilder();
22      private final StringBuilder scriptTwo = new StringBuilder();
23  
24      HomoglyphTokenFilter(TokenStream in, TranslationTable translationTable) {
25          super(in);
26          this.translationTable = translationTable;
27      }
28  
29  
30      /* Marked final because "the TokenStream-API in Lucene is based on the
31       * decorator pattern. Therefore all non-abstract subclasses must be final
32       * or have at least a final implementation of incrementToken()! This is
33       * checked when Java assertions are enabled."
34       * https://lucene.apache.org/core/7_0_0/core/org/apache/lucene/analysis/TokenStream.html
35       */
36      @Override
37      @SuppressWarnings("checkstyle:cyclomaticComplexity") // justification: will not make code more readable
38      public final boolean incrementToken() throws IOException {
39          if (state != null) {
40              restoreState(state);
41              if (scriptOne.length() > 0) {
42                  replayTerm(scriptOne);
43                  if (scriptTwo.length() == 0) {
44                      state = null;
45                  }
46                  return true;
47              }
48              if (scriptTwo.length() > 0) {
49                  replayTerm(scriptTwo);
50                  state = null;
51                  return true;
52              }
53              throw new IllegalStateException("At least one of script one|two should be non empty");
54          }
55          if (!input.incrementToken()) {
56              return false;
57          }
58  
59          boolean matchesBothScripts = hasChars(translationTable.getScript1Reg()) && hasChars(translationTable.getScript2Reg());
60          if (matchesBothScripts) {
61              scriptOne.setLength(0);
62              scriptOne.append(termAtt);
63              translationTable.replaceScriptOne(scriptOne);
64              scriptTwo.setLength(0);
65              scriptTwo.append(termAtt);
66              translationTable.replaceScriptTwo(scriptTwo);
67              if (!translationTable.getScript1Reg().matcher(scriptOne).find()) {
68                  state = captureState();
69              } else {
70                  scriptOne.setLength(0);
71              }
72  
73              if (!translationTable.getScript2Reg().matcher(scriptTwo).find()) {
74                  if (state == null) {
75                      state = captureState();
76                  }
77              } else {
78                  scriptTwo.setLength(0);
79              }
80          }
81          return true;
82      }
83  
84      private void replayTerm(StringBuilder term) {
85          termAtt.setEmpty();
86          termAtt.append(term);
87          posIncr.setPositionIncrement(0);
88          term.setLength(0);
89      }
90  
91      private boolean hasChars(Pattern scriptRegex) {
92          return scriptRegex.matcher(termAtt).find();
93      }
94  
95  }