1 package org.wikimedia.search.extra.analysis.slovak;
2
3 import static java.util.Collections.singletonList;
4
5 import java.io.IOException;
6 import java.util.HashSet;
7
8 import org.apache.lucene.analysis.Analyzer;
9 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
10 import org.apache.lucene.analysis.CharArraySet;
11 import org.apache.lucene.analysis.TokenStream;
12 import org.apache.lucene.analysis.Tokenizer;
13 import org.apache.lucene.analysis.core.LowerCaseFilter;
14 import org.apache.lucene.analysis.core.StopFilter;
15 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
16 import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
17 import org.junit.Test;
18
19 public class SlovakStemmerFilterTest extends BaseTokenStreamTestCase {
20
21 @Test
22 public void simpleTest() throws IOException {
23 String input = "Vitajte vo Wikipédii";
24 try (Analyzer ws = newSlovakStemmer()) {
25 TokenStream ts = ws.tokenStream("", input);
26 assertTokenStreamContents(ts,
27 new String[]{"vitajt", "vo", "wikipédi"},
28 new int[]{0, 8, 11},
29 new int[]{7, 10, 20},
30 null,
31 new int[]{1, 1, 1},
32 null,
33 20,
34 null,
35 true);
36 }
37 }
38
39 private Analyzer newSlovakStemmer() {
40 return new Analyzer() {
41 @Override
42 protected TokenStreamComponents createComponents(String fieldName) {
43 Tokenizer tok = new WhitespaceTokenizer();
44 TokenStream ts = new LowerCaseFilter(tok);
45 ts = new SlovakStemmerFilter(ts);
46 return new TokenStreamComponents(tok, ts);
47 }
48 };
49 }
50
51 @Test
52 public void simpleTestWithStop() throws IOException {
53
54
55 String input = "Vitajte vo Wikipédii";
56 try (Analyzer ws = newSlovakStemmerWithStop()) {
57 TokenStream ts = ws.tokenStream("", input);
58 assertTokenStreamContents(ts,
59 new String[]{"vitajt", "wikipédi"},
60 new int[]{0, 11},
61 new int[]{7, 20},
62 null,
63 new int[]{1, 2},
64 null,
65 20,
66 null,
67 true);
68 }
69 }
70
71 private Analyzer newSlovakStemmerWithStop() {
72 return new Analyzer() {
73 @Override
74 protected TokenStreamComponents createComponents(String fieldName) {
75 Tokenizer tok = new WhitespaceTokenizer();
76 TokenStream ts = new LowerCaseFilter(tok);
77 ts = new StopFilter(ts, new CharArraySet(new HashSet<>(singletonList("vo")), true));
78 ts = new SlovakStemmerFilter(ts);
79 return new TokenStreamComponents(tok, ts);
80 }
81 };
82 }
83
84 @Test
85 public void simpleTestWithFolding() throws IOException {
86
87 String input = "Vitajte vo Wikipédii";
88 try (Analyzer ws = newSlovakStemmerWithFolding()) {
89 TokenStream ts = ws.tokenStream("", input);
90 assertTokenStreamContents(ts,
91 new String[]{"vitajt", "vo", "wikipedi"},
92 new int[]{0, 8, 11},
93 new int[]{7, 10, 20},
94 null,
95 new int[]{1, 1, 1},
96 null,
97 20,
98 null,
99 true);
100 }
101 }
102
103 private Analyzer newSlovakStemmerWithFolding() {
104 return new Analyzer() {
105 @Override
106 protected TokenStreamComponents createComponents(String fieldName) {
107 Tokenizer tok = new WhitespaceTokenizer();
108 TokenStream ts = new LowerCaseFilter(tok);
109 ts = new ASCIIFoldingFilter(ts, false);
110 ts = new SlovakStemmerFilter(ts);
111 return new TokenStreamComponents(tok, ts);
112 }
113 };
114 }
115
116 }