View Javadoc
1   package org.wikimedia.search.extra.analysis.filters;
2   
3   import java.io.IOException;
4   import java.util.ArrayList;
5   import java.util.Iterator;
6   import java.util.List;
7   
8   import org.apache.lucene.analysis.Analyzer;
9   import org.apache.lucene.analysis.BaseTokenStreamTestCase;
10  import org.apache.lucene.analysis.TokenStream;
11  import org.apache.lucene.analysis.Tokenizer;
12  import org.apache.lucene.analysis.core.WhitespaceTokenizer;
13  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
14  import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
15  import org.elasticsearch.common.collect.Tuple;
16  
17  public class TermFreqTokenFilterTest extends BaseTokenStreamTestCase {
18      public void testSimple() throws IOException {
19          //              0000000000111111111122222222223333333333
20          //              0123456789012345678901234567890123456789
21          String input = " Q|1 Q2|2 Q3 Q4| Q4|A Q5|0 Q10|10000000";
22          try (Analyzer analyzer = newAnalyzer()) {
23              TokenStream ts = analyzer.tokenStream("", input);
24              assertTokenStreamContents(ts,
25                      new String[]{"Q", "Q2", "Q3", "Q4|", "Q4|A", "Q5", "Q10"},
26                      new int[]{1, 5, 10, 13, 17, 22, 27, 27, 27}, // start offsets
27                      new int[]{4, 9, 12, 16, 21, 26, 39, 39, 39}, // end offsets
28                      null, // types, not supported
29                      new int[]{1, 1, 1, 1, 1, 1, 1}, // pos increments
30                      null, // pos size (unsupported)
31                      39, // last offset
32                      null, //keywordAtts, (unsupported)
33                      true);
34  
35          }
36      }
37  
38      public void testAttr() throws IOException {
39          String input = " Q|1 Q2|2 Q3 Q4| Q4|A Q5|0 Q10|10000000";
40          List<Tuple<String, Integer>> expects = new ArrayList<>();
41          expects.add(new Tuple<>("Q", 1));
42          expects.add(new Tuple<>("Q2", 2));
43          expects.add(new Tuple<>("Q3", 1));
44          expects.add(new Tuple<>("Q4|", 1));
45          expects.add(new Tuple<>("Q4|A", 1));
46          expects.add(new Tuple<>("Q5", 1));
47          expects.add(new Tuple<>("Q10", 3));
48          try (Analyzer analyzer = newAnalyzer()) {
49              TokenStream ts = analyzer.tokenStream("", input);
50              CharTermAttribute cattr = ts.getAttribute(CharTermAttribute.class);
51              TermFrequencyAttribute fattr = ts.getAttribute(TermFrequencyAttribute.class);
52              Iterator<Tuple<String, Integer>> ite = expects.iterator();
53              ts.reset();
54              while (ite.hasNext()) {
55                  assertTrue(ts.incrementToken());
56                  Tuple<String, Integer> tuple = ite.next();
57                  assertEquals(tuple.v1(), cattr.toString());
58                  assertEquals((int) tuple.v2(), fattr.getTermFrequency());
59              }
60              assertFalse(ts.incrementToken());
61          }
62      }
63  
64      private Analyzer newAnalyzer() {
65          return new Analyzer() {
66              @Override
67              protected TokenStreamComponents createComponents(String fieldName) {
68                  Tokenizer tok = new WhitespaceTokenizer();
69                  TokenStream ts = new TermFreqTokenFilter(tok, '|', 3);
70                  return new TokenStreamComponents(tok, ts);
71              }
72          };
73      }
74  }