View Javadoc
1   package org.wikimedia.search.extra.analysis.homoglyph;
2   
3   import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
4   
5   import java.io.IOException;
6   import java.util.Arrays;
7   import java.util.Collection;
8   
9   import org.apache.lucene.analysis.Analyzer;
10  import org.apache.lucene.analysis.TokenStream;
11  import org.apache.lucene.analysis.Tokenizer;
12  import org.apache.lucene.analysis.core.WhitespaceTokenizer;
13  import org.junit.Test;
14  import org.junit.runner.RunWith;
15  import org.junit.runners.Parameterized;
16  
17  
18  @RunWith(Parameterized.class)
19  public class HomoglyphTokenFilterTest {
20      private final String input;
21      private final String[] expected;
22  
23      public HomoglyphTokenFilterTest(String input, String[] expected) {
24          this.input = input;
25          this.expected = expected;
26      }
27  
28      @Parameterized.Parameters
29      public static Collection<Object[]> homoglyphCases() {
30          return Arrays.asList(new Object[][]{
31                  {"cаt", new String[]{"cаt", "cat"}}, // input: latin c and t with cyrillic a
32                  {"LOL", new String[]{"LOL"}}, // input: all latin characters
33                  {"ЛОЛ", new String[]{"ЛОЛ"}}, // input: all cyrillic (LOL in cyrillic)
34                  {"KOЯN", new String[]{"KOЯN"}}, // input: mixed latin and cyrillic, but not convertible
35                  {"Лa", new String[]{"Лa", "Ла"}}, // input: cyrillic followed by latin
36                  {"aа", new String[]{"aа", "аа", "aa"}}, // input: latin a followed by cyrillic a
37                  {"33", new String[]{"33"}}, // input: neither latin or cyrillic
38                  {"3aа3", new String[]{"3aа3", "3аа3", "3aa3"}}, // input: mixed latin and cyrillic expected: mixed, cyrillic, latin
39                  {"Мoscow", new String[]{"Мoscow", "Moscow"}}, // input: cyrillic M followed by latin characters
40                  {"Аk", new String[]{"Аk", "Ак", "Ak"}}, // input: cyrillic followed by latin k
41                  {"іs", new String[]{"іs", "іѕ", "is"}}, // input: <mixed>, output: <mixed> <cyrillic> <latin>
42                  {"Bа́а́а́", new String[]{"Bа́а́а́", "Ва́а́а́", "Bááá"}} // input: <mixed> (latin B), output: <mixed> <cyrillic> <latin>
43          });
44      }
45  
46      @Test
47      public void testWithParameters() throws IOException {
48          try (Analyzer ws = newHomoglyphFilter()) {
49              TokenStream ts = ws.tokenStream("", input);
50              assertTokenStreamContents(ts,
51                      expected);
52          }
53      }
54  
55      @Test
56      public void testPositionIncrements() throws IOException {
57          try (Analyzer ws = newHomoglyphFilter()) {
58              String input = "All оf Ме іs fаke"; // <latin> <mixed> <cyrillic> <mixed> <mixed>
59              TokenStream ts = ws.tokenStream("", input);
60              assertTokenStreamContents(ts,
61                      new String[]{"All", "оf", "of", "Ме", "іs", "іѕ", "is", "fаke", "fake"},
62                      new int[]{1, 1, 0, 1, 1, 0, 0, 1, 0});
63          }
64      }
65  
66      private Analyzer newHomoglyphFilter() {
67          return new Analyzer() {
68              @Override
69              protected TokenStreamComponents createComponents(String fieldName) {
70                  Tokenizer tok = new WhitespaceTokenizer();
71                  TokenStream ts = new HomoglyphTokenFilter(tok, new TranslationTable(
72                          TranslationTableDictionaries.LATIN_REG, TranslationTableDictionaries.CYR_REG, TranslationTableDictionaries.LATIN_TO_CYRILLIC));
73                  return new TokenStreamComponents(tok, ts);
74              }
75          };
76      }
77  
78  }