View Javadoc
1   /*
2    * The WMF licenses this file to you under the Apache License, Version
3    * 2.0 (the "License"); you may not use this file except in compliance
4    * with the License. You may obtain a copy of the License at
5    *
6    *      http://www.apache.org/licenses/LICENSE-2.0
7    *
8    * Unless required by applicable law or agreed to in writing, software
9    * distributed under the License is distributed on an "AS IS" BASIS,
10   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11   * See the License for the specific language governing permissions and
12   * limitations under the License.
13   *
14   * *** Source Information ***
15   *
16   * This code combines implementation details and linguistic information
17   * from two main sources.
18   *
19   * ** Light Stemmer for Czech **
20   *
21   * The implementation is based on the lucene-solr "Light Stemmer for
22   * Czech", which is licensed from ASF under the Apache License, Version
23   * 2.0. Source code is available here:
24   * https://github.com/apache/lucene-solr/blob/master/lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java
25   *
26   * ** stemm-sk **
27   *
28   * The Slovak-specific suffix information is adapted from stemm-sk, which
29   * is Copyright (c) 2015 Marek Šuppa and licensed under the MIT
30   * License (included below, as required). Source code is available here:
31   * https://github.com/mrshu/stemm-sk/
32   *
33   * | Slovak-specific suffix information Copyright (c) 2015 Marek Šuppa
34   * |
35   * | Permission is hereby granted, free of charge, to any
36   * | person obtaining a copy of this software and associated
37   * | documentation files (the "Software"), to deal in the
38   * | Software without restriction, including without limitation
39   * | the rights to use, copy, modify, merge, publish,
40   * | distribute, sublicense, and/or sell copies of the
41   * | Software, and to permit persons to whom the Software is
42   * | furnished to do so, subject to the following conditions:
43   * |
44   * | The above copyright notice and this permission notice
45   * | shall be included in all copies or substantial portions of
46   * | the Software.
47   *
48   * ** Additional Sources **
49   *
50   * The stemm-sk source code includes its own additional sources. The
51   * Light Stemmer for Czech source code references the paper "Indexing
52   * and stemming approaches for the Czech language" by Dolamic and Savoy
53   * (2009), which is also the ultimate source of the main Czech
54   * implementation that stemm-sk is based on. The paper is available
55   * here: http://portal.acm.org/citation.cfm?id=1598600 .
56   *
57   * ** Additional Changes **
58   *
59   * - Updates to resolve findbugs/spotbugs/checkstyle errors.
60   *
61   * - Added prefix stripping based on review of Slovak morphology and
62   * comparison to Polish.
63   */
64  
65  package org.wikimedia.search.extra.analysis.slovak;
66  
67  import static org.apache.lucene.analysis.util.StemmerUtil.deleteN;
68  import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
69  import static org.apache.lucene.analysis.util.StemmerUtil.startsWith;
70  
71  
72  public class SlovakStemmer {
73  
74      /*
75       * Stem an input buffer of Slovak text.
76       *
77       * @param s input buffer
78       * @param len length of input buffer
79       * @return length of input buffer after normalization
80       *
81       * <p><b>NOTE</b>: Input is expected to be in lowercase,
82       * but with diacritical marks</p>
83       */
84      public int stem(char[] s, int len) {
85          len = removeCase(s, len);
86          len = removePossessives(s, len);
87          return removePrefixes(s, len);
88      }
89  
90      private int removePrefixes(char[] s, int len) {
91          if (len > 5 && startsWith(s, len, "naj")) {
92              return deleteN(s, 0, len, 3);
93          }
94          return len;
95      }
96  
97      @SuppressWarnings({"NPathComplexity", "CyclomaticComplexity"})
98      private int removeCase(char[] s, int len) {
99          if (len > 7 && endsWith(s, len, "atoch")) {
100             return len - 5;
101         }
102 
103         if (len > 6 && endsWith(s, len, "aťom")) {
104             return palatalize(s, len - 3);
105         }
106 
107         if (len > 5) {
108             if (endsWith(s, len, "och") ||
109                 endsWith(s, len, "ich") ||
110                 endsWith(s, len, "ích") ||
111                 endsWith(s, len, "ého") ||
112                 endsWith(s, len, "ami") ||
113                 endsWith(s, len, "emi") ||
114                 endsWith(s, len, "ému") ||
115                 endsWith(s, len, "ete") ||
116                 endsWith(s, len, "eti") ||
117                 endsWith(s, len, "iho") ||
118                 endsWith(s, len, "ího") ||
119                 endsWith(s, len, "ími") ||
120                 endsWith(s, len, "imu") ||
121                 endsWith(s, len, "aťa")) {
122                 return palatalize(s, len - 2);
123             }
124             if (endsWith(s, len, "ách") ||
125                 endsWith(s, len, "ata") ||
126                 endsWith(s, len, "aty") ||
127                 endsWith(s, len, "ých") ||
128                 endsWith(s, len, "ové") ||
129                 endsWith(s, len, "ovi") ||
130                 endsWith(s, len, "ými")) {
131                 return len - 3;
132             }
133         }
134 
135         if (len > 4) {
136             if (endsWith(s, len, "om")) {
137                 return palatalize(s, len - 1);
138             }
139             if (endsWith(s, len, "es") ||
140                 endsWith(s, len, "ém") ||
141                 endsWith(s, len, "ím")) {
142                 return palatalize(s, len - 2);
143             }
144             if (endsWith(s, len, "úm") ||
145                 endsWith(s, len, "at") ||
146                 endsWith(s, len, "ám") ||
147                 endsWith(s, len, "os") ||
148                 endsWith(s, len, "us") ||
149                 endsWith(s, len, "ým") ||
150                 endsWith(s, len, "mi") ||
151                 endsWith(s, len, "ou") ||
152                 endsWith(s, len, "ej")) {
153                 return len - 2;
154             }
155         }
156 
157         if (len > 3) {
158             switch (s[len - 1]) {
159                 case 'e':
160                 case 'i':
161                 case 'í':
162                     return palatalize(s, len);
163                 case 'ú':
164                 case 'y':
165                 case 'a':
166                 case 'o':
167                 case 'á':
168                 case 'é':
169                 case 'ý':
170                     return len - 1;
171                 default:
172             }
173         }
174 
175         return len;
176     }
177 
178     private int removePossessives(char[] s, int len) {
179         if (len > 5) {
180             if (endsWith(s, len, "ov")) {
181                 return len - 2;
182             }
183             if (endsWith(s, len, "in")) {
184                 return palatalize(s, len - 1);
185             }
186         }
187 
188         return len;
189     }
190 
191     @SuppressWarnings({"CyclomaticComplexity"})
192     private int palatalize(char[] s, int len) {
193         assert len > 3;
194 
195         if (endsWith(s, len, "ci") ||
196             endsWith(s, len, "ce") ||
197             endsWith(s, len, "či") ||
198             endsWith(s, len, "če")) { // [cč][ie] -> k
199             s[len - 2] = 'k';
200         } else if (endsWith(s, len, "zi") ||
201             endsWith(s, len, "ze") ||
202             endsWith(s, len, "ži") ||
203             endsWith(s, len, "že")) { // [zž][ie] -> h
204             s[len - 2] = 'h';
205         } else if (endsWith(s, len, "čte") ||
206             endsWith(s, len, "čti") ||
207             endsWith(s, len, "čtí")) { // čt[eií] -> ck
208             s[len - 3] = 'c';
209             s[len - 2] = 'k';
210         } else if (endsWith(s, len, "šte") ||
211             endsWith(s, len, "šti") ||
212             endsWith(s, len, "ští")) { // št[eií] -> sk
213             s[len - 3] = 's';
214             s[len - 2] = 'k';
215         }
216 
217         return len - 1;
218     }
219 }