package org.wikimedia.search.extra.analysis.slovak;

import static java.util.Collections.singletonList;

import java.io.IOException;
import java.util.HashSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.junit.Test;

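/**
 * Tests for {@link SlovakStemmerFilter}: the filter alone, behind a stop
 * filter, and behind ASCII folding.
 */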
public class SlovakStemmerFilterTest extends BaseTokenStreamTestCase {

    @Test
    public void simpleTest() throws IOException {
        String input = "Vitajte vo Wikipédii";
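        // Expect lowercasing plus suffix stripping: "Vitajte" -> "vitajt",
        // "Wikipédii" -> "wikipédi"; "vo" passes through unchanged.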
        try (Analyzer ws = newSlovakStemmer()) {
            TokenStream ts = ws.tokenStream("", input);
            assertTokenStreamContents(ts,
                    new String[]{"vitajt", "vo", "wikipédi"},
                    new int[]{0, 8, 11}, // start offsets
                    new int[]{7, 10, 20}, // end offsets
                    null, // types (unsupported)
                    new int[]{1, 1, 1}, // position increments
                    null, // position lengths (unsupported)
                    20, // final offset
                    null, // keywordAtts (unsupported)
                    true);
        }
    }

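    /** Builds the chain under test: whitespace tokenizer, lowercasing, then the Slovak stemmer. */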
    private Analyzer newSlovakStemmer() {
        return new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tok = new WhitespaceTokenizer();
                TokenStream ts = new LowerCaseFilter(tok);
                ts = new SlovakStemmerFilter(ts);
                return new TokenStreamComponents(tok, ts);
            }
        };
    }

    @Test
    public void simpleTestWithStop() throws IOException {
        // Same test but with a stop filter in the chain, verifying that
        // our state remains valid when a term is removed.
        String input = "Vitajte vo Wikipédii";
        try (Analyzer ws = newSlovakStemmerWithStop()) {
            TokenStream ts = ws.tokenStream("", input);
            assertTokenStreamContents(ts,
                    new String[]{"vitajt", "wikipédi"},
                    new int[]{0, 11}, // start offsets
                    new int[]{7, 20}, // end offsets
                    null, // types (unsupported)
                    new int[]{1, 2}, // position increments
                    null, // position lengths (unsupported)
                    20, // final offset
                    null, // keywordAtts (unsupported)
                    true);
        }
    }

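    /** Same chain as {@link #newSlovakStemmer()}, with a stop filter dropping "vo" before stemming. */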
    private Analyzer newSlovakStemmerWithStop() {
        return new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tok = new WhitespaceTokenizer();
                TokenStream ts = new LowerCaseFilter(tok);
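                // Removing "vo" leaves a position gap: the following token
                // carries a position increment of 2, as asserted above.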
                ts = new StopFilter(ts, new CharArraySet(new HashSet<>(singletonList("vo")), true));
                ts = new SlovakStemmerFilter(ts);
                return new TokenStreamComponents(tok, ts);
            }
        };
    }

    @Test
    public void simpleTestWithFolding() throws IOException {
        // Same test but with ASCII folding: "wikipédi" becomes "wikipedi"
        String input = "Vitajte vo Wikipédii";
        try (Analyzer ws = newSlovakStemmerWithFolding()) {
            TokenStream ts = ws.tokenStream("", input);
            assertTokenStreamContents(ts,
                    new String[]{"vitajt", "vo", "wikipedi"},
                    new int[]{0, 8, 11}, // start offsets
                    new int[]{7, 10, 20}, // end offsets
                    null, // types (unsupported)
                    new int[]{1, 1, 1}, // position increments
                    null, // position lengths (unsupported)
                    20, // final offset
                    null, // keywordAtts (unsupported)
                    true);
        }
    }

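    /** Same chain as {@link #newSlovakStemmer()}, with ASCII folding applied before stemming. */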
    private Analyzer newSlovakStemmerWithFolding() {
        return new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tok = new WhitespaceTokenizer();
                TokenStream ts = new LowerCaseFilter(tok);
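                // preserveOriginal=false: emit only the folded token, not a duplicate of the original.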
                ts = new ASCIIFoldingFilter(ts, false);
                ts = new SlovakStemmerFilter(ts);
                return new TokenStreamComponents(tok, ts);
            }
        };
    }

}