View Javadoc
1   package org.wikimedia.search.extra.analysis.textify;
2   
3   import static org.junit.Assert.assertEquals;
4   
5   import java.io.IOException;
6   
7   import org.apache.lucene.analysis.Analyzer;
8   import org.apache.lucene.analysis.BaseTokenStreamTestCase;
9   import org.apache.lucene.analysis.CachingTokenFilter;
10  import org.apache.lucene.analysis.TokenStream;
11  import org.apache.lucene.analysis.Tokenizer;
12  import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
13  import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
14  import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
15  
16  public class ICUTokenRepairFilterTestUtils extends BaseTokenStreamTestCase {
17  
18      /* create a stream of ICU tokenized tokens */
19      protected static TokenStream makeICUTokStream(String s) throws IOException {
20          Analyzer ana = new Analyzer() {
21              @Override
22              protected TokenStreamComponents createComponents(String fieldName) {
23                  Tokenizer tok = new ICUTokenizer();
24                  return new TokenStreamComponents(tok);
25              }
26          };
27          return new CachingTokenFilter(ana.tokenStream("", s));
28      }
29  
30      /* create a stream of ICU tokenized tokens with default repair options */
31      protected static TokenStream makeRepairedICUTokStream(String s) throws IOException {
32          return makeRepairedICUTokStream(s, new ICUTokenRepairFilterConfig());
33      }
34  
35      /* create a stream of ICU tokenized tokens with custom repair options
36       * null options are not changed from default
37       */
38      protected static TokenStream makeRepairedICUTokStream(String s, ICUTokenRepairFilterConfig cfg)
39              throws IOException {
40          Analyzer ana = new Analyzer() {
41              @Override
42              protected TokenStreamComponents createComponents(String fieldName) {
43                  Tokenizer tok = new ICUTokenizer();
44                  TokenStream ts = new ICUTokenRepairFilter(tok, cfg);
45                  return new TokenStreamComponents(tok, ts);
46              }
47          };
48          return new CachingTokenFilter(ana.tokenStream("", s));
49      }
50  
51      /* check that all the scripts & types are as expected */
52      protected static void scriptTypeCheck(TokenStream ts, String[] scripts, String[] types)
53              throws IOException {
54          ScriptAttribute scriptAtt = ts.getAttribute(ScriptAttribute.class);
55          TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
56          ts.reset();
57          int idx = 0;
58          boolean singleScript = scripts.length == 1;
59          boolean singleType = types.length == 1;
60          while (ts.incrementToken()) {
61              assertEquals(singleScript ? scripts[0] : scripts[idx], scriptAtt.getName());
62              assertEquals(singleType ? types[0] : types[idx], typeAtt.type());
63              idx++;
64          }
65      }
66  
67      /* test shortcut method: input, repaired tokens, scripts & types */
68      protected static void testICUTokenization(String input, String[] repairedICUTokens,
69              String[] scripts, String[] types) throws IOException {
70          testICUTokenization(makeRepairedICUTokStream(input), repairedICUTokens, null, null,
71              scripts, types, null, null, null);
72      }
73  
74      /* test shortcut method: input, default tokens, repaired tokens, scripts & types */
75      protected static void testICUTokenization(String input, String[] icuTokens, String[] repairedICUTokens,
76              String[] scripts, String[] types) throws IOException {
77          testICUTokenization(makeRepairedICUTokStream(input), repairedICUTokens,
78              makeICUTokStream(input), icuTokens, scripts, types, null, null, null);
79      }
80  
81      /* test shortcut method: input, default tokens, repaired tokens, scripts & types,
82       * offsets & position increments
83       */
84      protected static void testICUTokenization(String input, String[] icuTokens, String[] repairedICUTokens,
85              String[] scripts, String[] types,
86              int[] startOffsets, int[] endOffsets, int[] posIncrements) throws IOException {
87          testICUTokenization(makeRepairedICUTokStream(input), repairedICUTokens,
88              makeICUTokStream(input), icuTokens,
89              scripts, types, startOffsets, endOffsets, posIncrements);
90      }
91  
92      /* test shortcut method: input, non-default config, repaired tokens, scripts & types */
93      protected static void testICUTokenization(String input, ICUTokenRepairFilterConfig cfg,
94              String[] repairedICUTokens, String[] scripts, String[] types) throws IOException {
95          testICUTokenization(makeRepairedICUTokStream(input, cfg), repairedICUTokens, null, null,
96              scripts, types, null, null, null);
97      }
98  
99      /* test shortcut method: input, non-default config, default tokens, repaired tokens, scripts & types */
100     protected static void testICUTokenization(String input, ICUTokenRepairFilterConfig cfg,
101             String[] icuTokens, String[] repairedICUTokens,
102             String[] scripts, String[] types) throws IOException {
103         testICUTokenization(makeRepairedICUTokStream(input, cfg), repairedICUTokens,
104             makeICUTokStream(input), icuTokens, scripts, types, null, null, null);
105     }
106 
107     /* test shortcut method: input, non-default config, default tokens, repaired tokens, scripts & types,
108      * offsets & position increments
109      */
110     protected static void testICUTokenization(String input, ICUTokenRepairFilterConfig cfg,
111             String[] icuTokens, String[] repairedICUTokens, String[] scripts, String[] types,
112             int[] startOffsets, int[] endOffsets, int[] posIncrements) throws IOException {
113         testICUTokenization(makeRepairedICUTokStream(input, cfg), repairedICUTokens,
114             makeICUTokStream(input), icuTokens, scripts, types, startOffsets, endOffsets, posIncrements);
115     }
116 
117     /* main test method, with all the bells and whistles:
118      *   repaired stream & tokens
119      *   default stream & tokens (nullable)
120      *   scripts & types (nullable)
121      *   offsets & position increments (nullable)
122      */
123     protected static void testICUTokenization(TokenStream repairedICUTokenStream, String[] repairedICUTokens,
124             TokenStream defaultICUTokenStream, String[] icuTokens, String[] scripts, String[] types,
125             int[] startOffsets, int[] endOffsets, int[] posIncrements) throws IOException {
126 
127         if (icuTokens != null) { // check ICU default results
128             assertTokenStreamContents(defaultICUTokenStream, icuTokens);
129         }
130 
131         if (startOffsets == null) { // check basic repaired ICU results
132             assertTokenStreamContents(repairedICUTokenStream, repairedICUTokens);
133         } else { // check full repaired ICU results
134             assertTokenStreamContents(repairedICUTokenStream, repairedICUTokens,
135                 startOffsets, endOffsets, posIncrements);
136         }
137 
138         repairedICUTokenStream.reset();
139         scriptTypeCheck(repairedICUTokenStream, scripts, types);
140     }
141 
142 }