1 package org.wikimedia.search.extra.analysis.ukrainian;
2
3 import java.io.IOException;
4
5 import org.apache.lucene.analysis.Analyzer;
6 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
7 import org.apache.lucene.analysis.CharArraySet;
8 import org.apache.lucene.analysis.TokenStream;
9 import org.apache.lucene.analysis.Tokenizer;
10 import org.apache.lucene.analysis.core.LowerCaseFilter;
11 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
12 import org.junit.Test;
13
14 import morfologik.stemming.Dictionary;
15
16 public class UkrainianAnalysisTest extends BaseTokenStreamTestCase {
17
18 private static final Dictionary UK_DICT = UkrainianStemmerFilterFactory.UK_DICT;
19 private static final CharArraySet UK_STOP = UkrainianStopFilterFactory.UK_STOP;
20
21 @Test
22 public void simpleTest() throws IOException {
23 String input = "Ласкаво просимо до Вікіпедії";
24 try (Analyzer ws = newUkrainianStemmer()) {
25 TokenStream ts = ws.tokenStream("", input);
26 assertTokenStreamContents(ts,
27 new String[]{"ласкаво", "просити", "до", "вікіпедія"},
28 new int[]{0, 8, 16, 19},
29 new int[]{7, 15, 18, 28},
30 null,
31 new int[]{1, 1, 1, 1},
32 null,
33 28,
34 null,
35 true);
36 }
37 }
38
39 private Analyzer newUkrainianStemmer() {
40 return new Analyzer() {
41 @Override
42 protected TokenStreamComponents createComponents(String fieldName) {
43 Tokenizer tok = new WhitespaceTokenizer();
44 TokenStream ts = new LowerCaseFilter(tok);
45 ts = new UkrainianStemmerFilter(ts, UK_DICT);
46 return new TokenStreamComponents(tok, ts);
47 }
48 };
49 }
50
51 @Test
52 public void simpleTestWithStop() throws IOException {
53
54
55 String input = "Ласкаво просимо до Вікіпедії";
56 try (Analyzer ws = newUkrainianStemmerWithStop()) {
57 TokenStream ts = ws.tokenStream("", input);
58 assertTokenStreamContents(ts,
59 new String[]{"ласкаво", "просити", "вікіпедія"},
60 new int[]{0, 8, 19},
61 new int[]{7, 15, 28},
62 null,
63 new int[]{1, 1, 2},
64 null,
65 28,
66 null,
67 true);
68 }
69 }
70
71 private Analyzer newUkrainianStemmerWithStop() {
72 return new Analyzer() {
73 @Override
74 protected TokenStreamComponents createComponents(String fieldName) {
75 Tokenizer tok = new WhitespaceTokenizer();
76 TokenStream ts = new LowerCaseFilter(tok);
77 ts = new UkrainianStopFilter(ts, UK_STOP);
78 ts = new UkrainianStemmerFilter(ts, UK_DICT);
79 return new TokenStreamComponents(tok, ts);
80 }
81 };
82 }
83
84 }