View Javadoc
1   package org.wikimedia.search.extra.analysis.textify;
2   
3   import static org.wikimedia.search.extra.analysis.textify.LimitedMappingCharFilterFactory.parseMappings;
4   
5   import java.io.IOException;
6   import java.io.StringReader;
7   import java.util.Arrays;
8   import java.util.HashMap;
9   import java.util.Map;
10  
11  import org.apache.lucene.analysis.BaseTokenStreamTestCase;
12  import org.apache.lucene.analysis.TokenStream;
13  import org.elasticsearch.common.settings.SettingsException;
14  import org.junit.Test;
15  
16  public class LimitedMappingCharFilterTest extends BaseTokenStreamTestCase {
17  
18      private Map<Integer, Integer> map;
19  
20      private TokenStream ezTokStream(String s, Map<Integer, Integer> map) throws IOException {
21          return whitespaceMockTokenizer(new LimitedMappingCharFilter(map, new StringReader(s)));
22      }
23  
24      @Test
25      public void testSimpleMapping() throws IOException {
26          map = parseMappings(Arrays.asList("`=>'", "‘=>'", "’=>'"));
27          assertTokenStreamContents(
28              ezTokStream("a`b‘c’d", map),
29              new String[]{"a'b'c'd"},
30              new int[]{0},  // start offsets
31              new int[]{7}); // end offsets
32      }
33  
34      @Test
35      public void testDeletionMapping() throws IOException {
36          map = parseMappings(Arrays.asList("_=>", "-=>"));
37          assertTokenStreamContents(
38              ezTokStream("a_b-c_d -- _x__y_-_z_", map),
39              new String[]{"abcd", "xyz"},
40              new int[]{0, 12},  // start offsets
41              new int[]{7, 21}); // end offsets
42  
43          assertTokenStreamContents(
44              ezTokStream("a_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_b", map),
45              new String[]{"ab"},
46              new int[]{0},   // start offsets
47              new int[]{35}); // end offsets
48      }
49  
50      @Test
51      public void testUnicodeMappings() throws IOException {
52          map = parseMappings(Arrays.asList(
53              "e=>\\u00E9",      // e => é
54              "\\u00FC=>u",      // ü => u
55              "\\u00C5=>\\u00E5" // Å => å
56              ));
57          assertTokenStreamContents(
58              ezTokStream("eüÅ", map),
59              new String[]{"éuå"},
60              new int[]{0},  // start offsets
61              new int[]{3}); // end offsets
62      }
63  
64      @Test
65      public void testEscapes() throws IOException {
66          map = parseMappings(Arrays.asList(
67              "\\=>\"",   //      \ => "
68              "\t=>\'",  //   [tab] => '
69              "\r=>,",  // [return] => ,
70              "\n=>."  // [newline] => .
71              ));
72          assertTokenStreamContents(
73              ezTokStream("a\\b c\td e\nf g\rh", map),
74              new String[]{"a\"b", "c'd", "e.f", "g,h"},
75              new int[]{0, 4,  8, 12},  // start offsets
76              new int[]{3, 7, 11, 15}); // end offsets
77      }
78  
79      @Test
80      public void testEscapedEscapes() throws IOException {
81          map = parseMappings(Arrays.asList(
82              "\\\\=>\\\"", //      \ => "
83              "\\t=>\\\'", //   [tab] => '
84              "\\r=>\\'", // [return] => '
85              "\\n=>'"   // [newline] => '
86              ));
87          assertTokenStreamContents(
88              ezTokStream("a\\b c\td e\nf g\rh", map),
89              new String[]{"a\"b", "c'd", "e'f", "g'h"},
90              new int[]{0, 4,  8, 12},  // start offsets
91              new int[]{3, 7, 11, 15}); // end offsets
92      }
93  
94      @Test
95      public void testMappingToSpaces() throws IOException {
96          map = parseMappings(Arrays.asList("_=> ", "-=>\u0020", ".=>\\u0020"));
97          assertTokenStreamContents(
98              ezTokStream("_a-b.c_", map),
99              new String[]{"a", "b", "c"},
100             new int[]{1, 3, 5},  // start offsets
101             new int[]{2, 4, 6}); // end offsets
102     }
103 
104     @Test
105     public void testWeirdLookingMappings() throws IOException {
106         map = parseMappings(Arrays.asList(
107             "==>", // delete =
108             "<=>", // delete <
109             ">=>=" // map > to =
110             ));
111         assertTokenStreamContents(
112             ezTokStream("a=b>c<d", map),
113             new String[]{"ab=cd"},
114             new int[]{0},  // start offsets
115             new int[]{7}); // end offsets
116     }
117 
118     @Test
119     public void testFlipFlop() throws IOException {
120         map = parseMappings(Arrays.asList(
121             "a=>b", "b=>c", "c=>a",
122             "y=>z", "z=>y"
123             ));
124         assertTokenStreamContents(
125             ezTokStream("abc zzy", map),
126             new String[]{"bca", "yyz"},
127             new int[]{0, 4},  // start offsets
128             new int[]{3, 7}); // end offsets
129     }
130 
131     @Test(expected = SettingsException.class)
132     public void testInvalidZeroCharSrc() throws IOException {
133         map = parseMappings(Arrays.asList("=>x"));
134     }
135 
136     @Test(expected = SettingsException.class)
137     public void testInvalidMappingSyntax() throws IOException {
138         map = parseMappings(Arrays.asList("a->x"));
139     }
140 
141     @Test(expected = SettingsException.class)
142     public void testMultiCharSrc() throws IOException {
143         map = parseMappings(Arrays.asList("ab=>x"));
144     }
145 
146     @Test(expected = SettingsException.class)
147     public void testMultiCharDst() throws IOException {
148         map = parseMappings(Arrays.asList("a=>xy"));
149     }
150 
151     @Test(expected = SettingsException.class)
152     public void testInvalidShortUnicode() throws IOException {
153         map = parseMappings(Arrays.asList("\\u65=>a"));
154     }
155 
156     @Test(expected = SettingsException.class)
157     public void testInvalidLongUnicode() throws IOException {
158         map = parseMappings(Arrays.asList("X=>\\u2EBD6"));
159     }
160 
161     @Test(expected = NumberFormatException.class)
162     public void testInvalidUnicode() throws IOException {
163         map = parseMappings(Arrays.asList("X=>\\uQQQQ"));
164     }
165 
166     @Test(expected = SettingsException.class)
167     public void testInvalidThirtyTwoBitCharSrc() throws IOException {
168         // 32-bit character 𝐀 is not "one character"
169         map = parseMappings(Arrays.asList("𝐀=>A"));
170     }
171 
172     @Test(expected = SettingsException.class)
173     public void testInvalidThirtyTwoBitCharDst() throws IOException {
174         // 32-bit character 𝐀 is not "one character"
175         map = parseMappings(Arrays.asList("A=>𝐀"));
176     }
177 
178     @Test(expected = SettingsException.class)
179     public void testDuplicateMappings() throws IOException {
180         map = parseMappings(Arrays.asList("x=>a", "x=>b"));
181     }
182 
183     @Test(expected = IllegalArgumentException.class)
184     public void testLowKeyBound() throws IOException {
185         map = new HashMap<>();
186         map.put(-1, 65);
187         ezTokStream("xxx", map);
188     }
189 
190     @Test(expected = IllegalArgumentException.class)
191     public void testHighKeyBound() throws IOException {
192         map = new HashMap<>();
193         map.put(65537, 65);
194         ezTokStream("xxx", map);
195     }
196 
197     @Test(expected = IllegalArgumentException.class)
198     public void testLowValueBound() throws IOException {
199         map = new HashMap<>();
200         map.put(65, -2);
201         ezTokStream("xxx", map);
202     }
203 
204     @Test(expected = IllegalArgumentException.class)
205     public void testHighValueBound() throws IOException {
206         map = new HashMap<>();
207         map.put(65, 65537);
208         ezTokStream("xxx", map);
209     }
210 
211 }