1 package org.wikimedia.search.extra.analysis.textify;
2
3 import static org.junit.Assert.assertEquals;
4
5 import java.io.IOException;
6
7 import org.apache.lucene.analysis.Analyzer;
8 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
9 import org.apache.lucene.analysis.CachingTokenFilter;
10 import org.apache.lucene.analysis.TokenStream;
11 import org.apache.lucene.analysis.Tokenizer;
12 import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
13 import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
14 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
15
16 public class ICUTokenRepairFilterTestUtils extends BaseTokenStreamTestCase {
17
18
19 protected static TokenStream makeICUTokStream(String s) throws IOException {
20 Analyzer ana = new Analyzer() {
21 @Override
22 protected TokenStreamComponents createComponents(String fieldName) {
23 Tokenizer tok = new ICUTokenizer();
24 return new TokenStreamComponents(tok);
25 }
26 };
27 return new CachingTokenFilter(ana.tokenStream("", s));
28 }
29
30
31 protected static TokenStream makeRepairedICUTokStream(String s) throws IOException {
32 return makeRepairedICUTokStream(s, new ICUTokenRepairFilterConfig());
33 }
34
35
36
37
38 protected static TokenStream makeRepairedICUTokStream(String s, ICUTokenRepairFilterConfig cfg)
39 throws IOException {
40 Analyzer ana = new Analyzer() {
41 @Override
42 protected TokenStreamComponents createComponents(String fieldName) {
43 Tokenizer tok = new ICUTokenizer();
44 TokenStream ts = new ICUTokenRepairFilter(tok, cfg);
45 return new TokenStreamComponents(tok, ts);
46 }
47 };
48 return new CachingTokenFilter(ana.tokenStream("", s));
49 }
50
51
52 protected static void scriptTypeCheck(TokenStream ts, String[] scripts, String[] types)
53 throws IOException {
54 ScriptAttribute scriptAtt = ts.getAttribute(ScriptAttribute.class);
55 TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
56 ts.reset();
57 int idx = 0;
58 boolean singleScript = scripts.length == 1;
59 boolean singleType = types.length == 1;
60 while (ts.incrementToken()) {
61 assertEquals(singleScript ? scripts[0] : scripts[idx], scriptAtt.getName());
62 assertEquals(singleType ? types[0] : types[idx], typeAtt.type());
63 idx++;
64 }
65 }
66
67
68 protected static void testICUTokenization(String input, String[] repairedICUTokens,
69 String[] scripts, String[] types) throws IOException {
70 testICUTokenization(makeRepairedICUTokStream(input), repairedICUTokens, null, null,
71 scripts, types, null, null, null);
72 }
73
74
75 protected static void testICUTokenization(String input, String[] icuTokens, String[] repairedICUTokens,
76 String[] scripts, String[] types) throws IOException {
77 testICUTokenization(makeRepairedICUTokStream(input), repairedICUTokens,
78 makeICUTokStream(input), icuTokens, scripts, types, null, null, null);
79 }
80
81
82
83
84 protected static void testICUTokenization(String input, String[] icuTokens, String[] repairedICUTokens,
85 String[] scripts, String[] types,
86 int[] startOffsets, int[] endOffsets, int[] posIncrements) throws IOException {
87 testICUTokenization(makeRepairedICUTokStream(input), repairedICUTokens,
88 makeICUTokStream(input), icuTokens,
89 scripts, types, startOffsets, endOffsets, posIncrements);
90 }
91
92
93 protected static void testICUTokenization(String input, ICUTokenRepairFilterConfig cfg,
94 String[] repairedICUTokens, String[] scripts, String[] types) throws IOException {
95 testICUTokenization(makeRepairedICUTokStream(input, cfg), repairedICUTokens, null, null,
96 scripts, types, null, null, null);
97 }
98
99
100 protected static void testICUTokenization(String input, ICUTokenRepairFilterConfig cfg,
101 String[] icuTokens, String[] repairedICUTokens,
102 String[] scripts, String[] types) throws IOException {
103 testICUTokenization(makeRepairedICUTokStream(input, cfg), repairedICUTokens,
104 makeICUTokStream(input), icuTokens, scripts, types, null, null, null);
105 }
106
107
108
109
110 protected static void testICUTokenization(String input, ICUTokenRepairFilterConfig cfg,
111 String[] icuTokens, String[] repairedICUTokens, String[] scripts, String[] types,
112 int[] startOffsets, int[] endOffsets, int[] posIncrements) throws IOException {
113 testICUTokenization(makeRepairedICUTokStream(input, cfg), repairedICUTokens,
114 makeICUTokStream(input), icuTokens, scripts, types, startOffsets, endOffsets, posIncrements);
115 }
116
117
118
119
120
121
122
123 protected static void testICUTokenization(TokenStream repairedICUTokenStream, String[] repairedICUTokens,
124 TokenStream defaultICUTokenStream, String[] icuTokens, String[] scripts, String[] types,
125 int[] startOffsets, int[] endOffsets, int[] posIncrements) throws IOException {
126
127 if (icuTokens != null) {
128 assertTokenStreamContents(defaultICUTokenStream, icuTokens);
129 }
130
131 if (startOffsets == null) {
132 assertTokenStreamContents(repairedICUTokenStream, repairedICUTokens);
133 } else {
134 assertTokenStreamContents(repairedICUTokenStream, repairedICUTokens,
135 startOffsets, endOffsets, posIncrements);
136 }
137
138 repairedICUTokenStream.reset();
139 scriptTypeCheck(repairedICUTokenStream, scripts, types);
140 }
141
142 }