1 package org.wikimedia.search.extra.analysis.homoglyph;
2
3 import java.io.IOException;
4 import java.util.regex.Pattern;
5
6 import org.apache.lucene.analysis.TokenFilter;
7 import org.apache.lucene.analysis.TokenStream;
8 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
9 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
10
11 import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
12
13
14 @SuppressFBWarnings(value = "EQ_DOESNT_OVERRIDE_EQUALS", justification = "Standard pattern for token filters.")
15 public class HomoglyphTokenFilter extends TokenFilter {
16
17 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
18 private final PositionIncrementAttribute posIncr = addAttribute(PositionIncrementAttribute.class);
19 private final TranslationTable translationTable;
20 private State state;
21 private final StringBuilder scriptOne = new StringBuilder();
22 private final StringBuilder scriptTwo = new StringBuilder();
23
24 HomoglyphTokenFilter(TokenStream in, TranslationTable translationTable) {
25 super(in);
26 this.translationTable = translationTable;
27 }
28
29
30
31
32
33
34
35
36 @Override
37 @SuppressWarnings("checkstyle:cyclomaticComplexity")
38 public final boolean incrementToken() throws IOException {
39 if (state != null) {
40 restoreState(state);
41 if (scriptOne.length() > 0) {
42 replayTerm(scriptOne);
43 if (scriptTwo.length() == 0) {
44 state = null;
45 }
46 return true;
47 }
48 if (scriptTwo.length() > 0) {
49 replayTerm(scriptTwo);
50 state = null;
51 return true;
52 }
53 throw new IllegalStateException("At least one of script one|two should be non empty");
54 }
55 if (!input.incrementToken()) {
56 return false;
57 }
58
59 boolean matchesBothScripts = hasChars(translationTable.getScript1Reg()) && hasChars(translationTable.getScript2Reg());
60 if (matchesBothScripts) {
61 scriptOne.setLength(0);
62 scriptOne.append(termAtt);
63 translationTable.replaceScriptOne(scriptOne);
64 scriptTwo.setLength(0);
65 scriptTwo.append(termAtt);
66 translationTable.replaceScriptTwo(scriptTwo);
67 if (!translationTable.getScript1Reg().matcher(scriptOne).find()) {
68 state = captureState();
69 } else {
70 scriptOne.setLength(0);
71 }
72
73 if (!translationTable.getScript2Reg().matcher(scriptTwo).find()) {
74 if (state == null) {
75 state = captureState();
76 }
77 } else {
78 scriptTwo.setLength(0);
79 }
80 }
81 return true;
82 }
83
84 private void replayTerm(StringBuilder term) {
85 termAtt.setEmpty();
86 termAtt.append(term);
87 posIncr.setPositionIncrement(0);
88 term.setLength(0);
89 }
90
91 private boolean hasChars(Pattern scriptRegex) {
92 return scriptRegex.matcher(termAtt).find();
93 }
94
95 }