1 package org.wikimedia.search.extra.analysis.textify;
2
3 import static org.wikimedia.search.extra.analysis.textify.LimitedMappingCharFilterFactory.parseMappings;
4
5 import java.io.IOException;
6 import java.io.StringReader;
7 import java.util.Arrays;
8 import java.util.HashMap;
9 import java.util.Map;
10
11 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
12 import org.apache.lucene.analysis.TokenStream;
13 import org.elasticsearch.common.settings.SettingsException;
14 import org.junit.Test;
15
16 public class LimitedMappingCharFilterTest extends BaseTokenStreamTestCase {
17
18 private Map<Integer, Integer> map;
19
20 private TokenStream ezTokStream(String s, Map<Integer, Integer> map) throws IOException {
21 return whitespaceMockTokenizer(new LimitedMappingCharFilter(map, new StringReader(s)));
22 }
23
24 @Test
25 public void testSimpleMapping() throws IOException {
26 map = parseMappings(Arrays.asList("`=>'", "‘=>'", "’=>'"));
27 assertTokenStreamContents(
28 ezTokStream("a`b‘c’d", map),
29 new String[]{"a'b'c'd"},
30 new int[]{0},
31 new int[]{7});
32 }
33
34 @Test
35 public void testDeletionMapping() throws IOException {
36 map = parseMappings(Arrays.asList("_=>", "-=>"));
37 assertTokenStreamContents(
38 ezTokStream("a_b-c_d -- _x__y_-_z_", map),
39 new String[]{"abcd", "xyz"},
40 new int[]{0, 12},
41 new int[]{7, 21});
42
43 assertTokenStreamContents(
44 ezTokStream("a_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_b", map),
45 new String[]{"ab"},
46 new int[]{0},
47 new int[]{35});
48 }
49
50 @Test
51 public void testUnicodeMappings() throws IOException {
52 map = parseMappings(Arrays.asList(
53 "e=>\\u00E9",
54 "\\u00FC=>u",
55 "\\u00C5=>\\u00E5"
56 ));
57 assertTokenStreamContents(
58 ezTokStream("eüÅ", map),
59 new String[]{"éuå"},
60 new int[]{0},
61 new int[]{3});
62 }
63
64 @Test
65 public void testEscapes() throws IOException {
66 map = parseMappings(Arrays.asList(
67 "\\=>\"",
68 "\t=>\'",
69 "\r=>,",
70 "\n=>."
71 ));
72 assertTokenStreamContents(
73 ezTokStream("a\\b c\td e\nf g\rh", map),
74 new String[]{"a\"b", "c'd", "e.f", "g,h"},
75 new int[]{0, 4, 8, 12},
76 new int[]{3, 7, 11, 15});
77 }
78
79 @Test
80 public void testEscapedEscapes() throws IOException {
81 map = parseMappings(Arrays.asList(
82 "\\\\=>\\\"",
83 "\\t=>\\\'",
84 "\\r=>\\'",
85 "\\n=>'"
86 ));
87 assertTokenStreamContents(
88 ezTokStream("a\\b c\td e\nf g\rh", map),
89 new String[]{"a\"b", "c'd", "e'f", "g'h"},
90 new int[]{0, 4, 8, 12},
91 new int[]{3, 7, 11, 15});
92 }
93
94 @Test
95 public void testMappingToSpaces() throws IOException {
96 map = parseMappings(Arrays.asList("_=> ", "-=>\u0020", ".=>\\u0020"));
97 assertTokenStreamContents(
98 ezTokStream("_a-b.c_", map),
99 new String[]{"a", "b", "c"},
100 new int[]{1, 3, 5},
101 new int[]{2, 4, 6});
102 }
103
104 @Test
105 public void testWeirdLookingMappings() throws IOException {
106 map = parseMappings(Arrays.asList(
107 "==>",
108 "<=>",
109 ">=>="
110 ));
111 assertTokenStreamContents(
112 ezTokStream("a=b>c<d", map),
113 new String[]{"ab=cd"},
114 new int[]{0},
115 new int[]{7});
116 }
117
118 @Test
119 public void testFlipFlop() throws IOException {
120 map = parseMappings(Arrays.asList(
121 "a=>b", "b=>c", "c=>a",
122 "y=>z", "z=>y"
123 ));
124 assertTokenStreamContents(
125 ezTokStream("abc zzy", map),
126 new String[]{"bca", "yyz"},
127 new int[]{0, 4},
128 new int[]{3, 7});
129 }
130
131 @Test(expected = SettingsException.class)
132 public void testInvalidZeroCharSrc() throws IOException {
133 map = parseMappings(Arrays.asList("=>x"));
134 }
135
136 @Test(expected = SettingsException.class)
137 public void testInvalidMappingSyntax() throws IOException {
138 map = parseMappings(Arrays.asList("a->x"));
139 }
140
141 @Test(expected = SettingsException.class)
142 public void testMultiCharSrc() throws IOException {
143 map = parseMappings(Arrays.asList("ab=>x"));
144 }
145
146 @Test(expected = SettingsException.class)
147 public void testMultiCharDst() throws IOException {
148 map = parseMappings(Arrays.asList("a=>xy"));
149 }
150
151 @Test(expected = SettingsException.class)
152 public void testInvalidShortUnicode() throws IOException {
153 map = parseMappings(Arrays.asList("\\u65=>a"));
154 }
155
156 @Test(expected = SettingsException.class)
157 public void testInvalidLongUnicode() throws IOException {
158 map = parseMappings(Arrays.asList("X=>\\u2EBD6"));
159 }
160
161 @Test(expected = NumberFormatException.class)
162 public void testInvalidUnicode() throws IOException {
163 map = parseMappings(Arrays.asList("X=>\\uQQQQ"));
164 }
165
166 @Test(expected = SettingsException.class)
167 public void testInvalidThirtyTwoBitCharSrc() throws IOException {
168
169 map = parseMappings(Arrays.asList("𝐀=>A"));
170 }
171
172 @Test(expected = SettingsException.class)
173 public void testInvalidThirtyTwoBitCharDst() throws IOException {
174
175 map = parseMappings(Arrays.asList("A=>𝐀"));
176 }
177
178 @Test(expected = SettingsException.class)
179 public void testDuplicateMappings() throws IOException {
180 map = parseMappings(Arrays.asList("x=>a", "x=>b"));
181 }
182
183 @Test(expected = IllegalArgumentException.class)
184 public void testLowKeyBound() throws IOException {
185 map = new HashMap<>();
186 map.put(-1, 65);
187 ezTokStream("xxx", map);
188 }
189
190 @Test(expected = IllegalArgumentException.class)
191 public void testHighKeyBound() throws IOException {
192 map = new HashMap<>();
193 map.put(65537, 65);
194 ezTokStream("xxx", map);
195 }
196
197 @Test(expected = IllegalArgumentException.class)
198 public void testLowValueBound() throws IOException {
199 map = new HashMap<>();
200 map.put(65, -2);
201 ezTokStream("xxx", map);
202 }
203
204 @Test(expected = IllegalArgumentException.class)
205 public void testHighValueBound() throws IOException {
206 map = new HashMap<>();
207 map.put(65, 65537);
208 ezTokStream("xxx", map);
209 }
210
211 }