1 package org.wikimedia.search.extra.analysis.homoglyph;
2
3 import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
4
5 import java.io.IOException;
6 import java.util.Arrays;
7 import java.util.Collection;
8
9 import org.apache.lucene.analysis.Analyzer;
10 import org.apache.lucene.analysis.TokenStream;
11 import org.apache.lucene.analysis.Tokenizer;
12 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
13 import org.junit.Test;
14 import org.junit.runner.RunWith;
15 import org.junit.runners.Parameterized;
16
17
18 @RunWith(Parameterized.class)
19 public class HomoglyphTokenFilterTest {
20 private final String input;
21 private final String[] expected;
22
23 public HomoglyphTokenFilterTest(String input, String[] expected) {
24 this.input = input;
25 this.expected = expected;
26 }
27
28 @Parameterized.Parameters
29 public static Collection<Object[]> homoglyphCases() {
30 return Arrays.asList(new Object[][]{
31 {"cаt", new String[]{"cаt", "cat"}},
32 {"LOL", new String[]{"LOL"}},
33 {"ЛОЛ", new String[]{"ЛОЛ"}},
34 {"KOЯN", new String[]{"KOЯN"}},
35 {"Лa", new String[]{"Лa", "Ла"}},
36 {"aа", new String[]{"aа", "аа", "aa"}},
37 {"33", new String[]{"33"}},
38 {"3aа3", new String[]{"3aа3", "3аа3", "3aa3"}},
39 {"Мoscow", new String[]{"Мoscow", "Moscow"}},
40 {"Аk", new String[]{"Аk", "Ак", "Ak"}},
41 {"іs", new String[]{"іs", "іѕ", "is"}},
42 {"Bа́а́а́", new String[]{"Bа́а́а́", "Ва́а́а́", "Bááá"}}
43 });
44 }
45
46 @Test
47 public void testWithParameters() throws IOException {
48 try (Analyzer ws = newHomoglyphFilter()) {
49 TokenStream ts = ws.tokenStream("", input);
50 assertTokenStreamContents(ts,
51 expected);
52 }
53 }
54
55 @Test
56 public void testPositionIncrements() throws IOException {
57 try (Analyzer ws = newHomoglyphFilter()) {
58 String input = "All оf Ме іs fаke";
59 TokenStream ts = ws.tokenStream("", input);
60 assertTokenStreamContents(ts,
61 new String[]{"All", "оf", "of", "Ме", "іs", "іѕ", "is", "fаke", "fake"},
62 new int[]{1, 1, 0, 1, 1, 0, 0, 1, 0});
63 }
64 }
65
66 private Analyzer newHomoglyphFilter() {
67 return new Analyzer() {
68 @Override
69 protected TokenStreamComponents createComponents(String fieldName) {
70 Tokenizer tok = new WhitespaceTokenizer();
71 TokenStream ts = new HomoglyphTokenFilter(tok, new TranslationTable(
72 TranslationTableDictionaries.LATIN_REG, TranslationTableDictionaries.CYR_REG, TranslationTableDictionaries.LATIN_TO_CYRILLIC));
73 return new TokenStreamComponents(tok, ts);
74 }
75 };
76 }
77
78 }