View Javadoc
1   package org.wikimedia.search.extra.analysis.ukrainian;
2   
3   import java.io.IOException;
4   
5   import org.apache.lucene.analysis.Analyzer;
6   import org.apache.lucene.analysis.BaseTokenStreamTestCase;
7   import org.apache.lucene.analysis.CharArraySet;
8   import org.apache.lucene.analysis.TokenStream;
9   import org.apache.lucene.analysis.Tokenizer;
10  import org.apache.lucene.analysis.core.LowerCaseFilter;
11  import org.apache.lucene.analysis.core.WhitespaceTokenizer;
12  import org.junit.Test;
13  
14  import morfologik.stemming.Dictionary;
15  
16  public class UkrainianAnalysisTest extends BaseTokenStreamTestCase {
17  
18      private static final Dictionary UK_DICT = UkrainianStemmerFilterFactory.UK_DICT;
19      private static final CharArraySet UK_STOP = UkrainianStopFilterFactory.UK_STOP;
20  
21      @Test
22      public void simpleTest() throws IOException {
23          String input = "Ласкаво просимо до Вікіпедії";
24          try (Analyzer ws = newUkrainianStemmer()) {
25              TokenStream ts = ws.tokenStream("", input);
26              assertTokenStreamContents(ts,
27                      new String[]{"ласкаво", "просити", "до", "вікіпедія"},
28                      new int[]{0, 8, 16, 19}, // start offsets
29                      new int[]{7, 15, 18, 28}, // end offsets
30                      null, // types, not supported
31                      new int[]{1, 1, 1, 1}, // pos increments
32                      null, // pos size (unsupported)
33                      28, // last offset
34                      null, //keywordAtts, (unsupported)
35                      true);
36          }
37      }
38  
39      private Analyzer newUkrainianStemmer() {
40          return new Analyzer() {
41              @Override
42              protected TokenStreamComponents createComponents(String fieldName) {
43                  Tokenizer tok = new WhitespaceTokenizer();
44                  TokenStream ts = new LowerCaseFilter(tok);
45                  ts = new UkrainianStemmerFilter(ts, UK_DICT);
46                  return new TokenStreamComponents(tok, ts);
47              }
48          };
49      }
50  
51      @Test
52      public void simpleTestWithStop() throws IOException {
53          // Same test but with a stop filter wrapped
54          // testing that if a term is removed our states are still valid
55          String input = "Ласкаво просимо до Вікіпедії";
56          try (Analyzer ws = newUkrainianStemmerWithStop()) {
57              TokenStream ts = ws.tokenStream("", input);
58              assertTokenStreamContents(ts,
59                      new String[]{"ласкаво", "просити", "вікіпедія"},
60                      new int[]{0, 8, 19}, // start offsets
61                      new int[]{7, 15, 28}, // end offsets
62                      null, // types, not supported
63                      new int[]{1, 1, 2}, // pos increments
64                      null, // pos size (unsupported)
65                      28, // last offset
66                      null, //keywordAtts, (unsupported)
67                      true);
68          }
69      }
70  
71      private Analyzer newUkrainianStemmerWithStop() {
72          return new Analyzer() {
73              @Override
74              protected TokenStreamComponents createComponents(String fieldName) {
75                  Tokenizer tok = new WhitespaceTokenizer();
76                  TokenStream ts = new LowerCaseFilter(tok);
77                  ts = new UkrainianStopFilter(ts, UK_STOP);
78                  ts = new UkrainianStemmerFilter(ts, UK_DICT);
79                  return new TokenStreamComponents(tok, ts);
80              }
81          };
82      }
83  
84  }