1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65 package org.wikimedia.search.extra.analysis.slovak;
66
67 import static org.apache.lucene.analysis.util.StemmerUtil.deleteN;
68 import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
69 import static org.apache.lucene.analysis.util.StemmerUtil.startsWith;
70
71
72 public class SlovakStemmer {
73
74
75
76
77
78
79
80
81
82
83
84 public int stem(char[] s, int len) {
85 len = removeCase(s, len);
86 len = removePossessives(s, len);
87 return removePrefixes(s, len);
88 }
89
90 private int removePrefixes(char[] s, int len) {
91 if (len > 5 && startsWith(s, len, "naj")) {
92 return deleteN(s, 0, len, 3);
93 }
94 return len;
95 }
96
97 @SuppressWarnings({"NPathComplexity", "CyclomaticComplexity"})
98 private int removeCase(char[] s, int len) {
99 if (len > 7 && endsWith(s, len, "atoch")) {
100 return len - 5;
101 }
102
103 if (len > 6 && endsWith(s, len, "aťom")) {
104 return palatalize(s, len - 3);
105 }
106
107 if (len > 5) {
108 if (endsWith(s, len, "och") ||
109 endsWith(s, len, "ich") ||
110 endsWith(s, len, "ích") ||
111 endsWith(s, len, "ého") ||
112 endsWith(s, len, "ami") ||
113 endsWith(s, len, "emi") ||
114 endsWith(s, len, "ému") ||
115 endsWith(s, len, "ete") ||
116 endsWith(s, len, "eti") ||
117 endsWith(s, len, "iho") ||
118 endsWith(s, len, "ího") ||
119 endsWith(s, len, "ími") ||
120 endsWith(s, len, "imu") ||
121 endsWith(s, len, "aťa")) {
122 return palatalize(s, len - 2);
123 }
124 if (endsWith(s, len, "ách") ||
125 endsWith(s, len, "ata") ||
126 endsWith(s, len, "aty") ||
127 endsWith(s, len, "ých") ||
128 endsWith(s, len, "ové") ||
129 endsWith(s, len, "ovi") ||
130 endsWith(s, len, "ými")) {
131 return len - 3;
132 }
133 }
134
135 if (len > 4) {
136 if (endsWith(s, len, "om")) {
137 return palatalize(s, len - 1);
138 }
139 if (endsWith(s, len, "es") ||
140 endsWith(s, len, "ém") ||
141 endsWith(s, len, "ím")) {
142 return palatalize(s, len - 2);
143 }
144 if (endsWith(s, len, "úm") ||
145 endsWith(s, len, "at") ||
146 endsWith(s, len, "ám") ||
147 endsWith(s, len, "os") ||
148 endsWith(s, len, "us") ||
149 endsWith(s, len, "ým") ||
150 endsWith(s, len, "mi") ||
151 endsWith(s, len, "ou") ||
152 endsWith(s, len, "ej")) {
153 return len - 2;
154 }
155 }
156
157 if (len > 3) {
158 switch (s[len - 1]) {
159 case 'e':
160 case 'i':
161 case 'í':
162 return palatalize(s, len);
163 case 'ú':
164 case 'y':
165 case 'a':
166 case 'o':
167 case 'á':
168 case 'é':
169 case 'ý':
170 return len - 1;
171 default:
172 }
173 }
174
175 return len;
176 }
177
178 private int removePossessives(char[] s, int len) {
179 if (len > 5) {
180 if (endsWith(s, len, "ov")) {
181 return len - 2;
182 }
183 if (endsWith(s, len, "in")) {
184 return palatalize(s, len - 1);
185 }
186 }
187
188 return len;
189 }
190
191 @SuppressWarnings({"CyclomaticComplexity"})
192 private int palatalize(char[] s, int len) {
193 assert len > 3;
194
195 if (endsWith(s, len, "ci") ||
196 endsWith(s, len, "ce") ||
197 endsWith(s, len, "či") ||
198 endsWith(s, len, "če")) {
199 s[len - 2] = 'k';
200 } else if (endsWith(s, len, "zi") ||
201 endsWith(s, len, "ze") ||
202 endsWith(s, len, "ži") ||
203 endsWith(s, len, "že")) {
204 s[len - 2] = 'h';
205 } else if (endsWith(s, len, "čte") ||
206 endsWith(s, len, "čti") ||
207 endsWith(s, len, "čtí")) {
208 s[len - 3] = 'c';
209 s[len - 2] = 'k';
210 } else if (endsWith(s, len, "šte") ||
211 endsWith(s, len, "šti") ||
212 endsWith(s, len, "ští")) {
213 s[len - 3] = 's';
214 s[len - 2] = 'k';
215 }
216
217 return len - 1;
218 }
219 }