View Javadoc
1   package org.wikimedia.search.extra.analysis.slovak;
2   
3   import static org.junit.Assert.assertEquals;
4   
5   import junit.framework.TestCase;
6   
7   public class SlovakStemmerTest extends TestCase {
8   
9       SlovakStemmer stemmer = new SlovakStemmer();
10  
11      /*
12       * Stem strings rather than character arrays for convenience.
13       *
14       * @param s input string
15       * @return stem as string
16       */
17      private String stemAsString(String str) {
18          int len = str.length();
19          char[] s = str.toCharArray();
20          len = stemmer.stem(s, len);
21          return new String(s, 0, len);
22      }
23  
24      public void testStemmingCaseRemoval() throws Exception {
25          // semi-random selection of case ending tests
26          assertEquals(stemAsString("automatoch"), "autom");
27          assertEquals(stemAsString("dieťaťom"), "dieť");
28          assertEquals(stemAsString("stříbrného"), "stříbrn");
29          assertEquals(stemAsString("horthyovskému"), "horthyovsk");
30          assertEquals(stemAsString("dojčaťa"), "dojč");
31          assertEquals(stemAsString("pruskými"), "prusk");
32          assertEquals(stemAsString("ranených"), "ranen");
33          assertEquals(stemAsString("orkovi"), "ork");
34      }
35  
36      public void testStemmingPossRemoval() throws Exception {
37          // semi-random selection of possessive ending tests
38          assertEquals(stemAsString("draľov"), "draľ");
39          assertEquals(stemAsString("sinigrin"), "sinigr");
40      }
41  
42      public void testStemmingPalatalization() throws Exception {
43          // stem ending changes after suffix removal because of palatalize()
44          assertEquals(stemAsString("venujúcich"), "venujúk"); // k
45          assertEquals(stemAsString("turečtiny"), "tureck"); // k
46          assertEquals(stemAsString("političtí"), "politick"); // ck
47          assertEquals(stemAsString("dokážete"), "dokáh"); // h
48          assertEquals(stemAsString("zapíšte"), "zapísk"); // sk
49      }
50  
51      public void testStemmingShortStrings() throws Exception {
52          // endings match longer affixes, but stem is too short
53          // case
54          assertEquals(stemAsString("očami"), "oča");
55          assertEquals(stemAsString("inému"), "inému");
56          assertEquals(stemAsString("cete"), "cet");
57          assertEquals(stemAsString("noch"), "noch");
58          assertEquals(stemAsString("hrách"), "hrách");
59          assertEquals(stemAsString("maata"), "maat");
60          assertEquals(stemAsString("vami"), "vam");
61          assertEquals(stemAsString("nové"), "nov");
62          // possessive
63          assertEquals(stemAsString("ozov"), "ozov");
64          assertEquals(stemAsString("špin"), "špin");
65          // prefix
66          assertEquals(stemAsString("najmä"), "najmä");
67          assertEquals(stemAsString("najml"), "najml");
68      }
69  
70      public void testStemmingGeneral() throws Exception {
71          // general stemming, with multiple elements
72          assertEquals(stemAsString("najznámejšími"), "známejš"); // prefix + case suffix
73          assertEquals(stemAsString("najat"), "naj"); // prefix too short + case suffix
74          assertEquals(stemAsString("bunkových"), "bunk"); // poss + case suffixes
75          assertEquals(stemAsString("vysočinami"), "vysok"); // palatalized + poss + case
76          assertEquals(stemAsString("príčinách"), "prík"); // palatalized + poss + case
77          assertEquals(stemAsString("najnovšími"), "novš"); // prefix + case
78  
79          // artificial "test" example:
80          assertEquals(stemAsString("najtestciných"), "testk"); // prefix + palatal + poss + case
81      }
82  
83      public void testNonSlovak() throws Exception {
84          // words with non-Slovak Latin characters, or non-Latin characters
85          assertEquals(stemAsString("əliağa"), "əliağ"); // Azerbaijani
86          assertEquals(stemAsString("año"), "año"); // Spanish
87          assertEquals(stemAsString("аблютомания"), "аблютомания"); // Russian
88          assertEquals(stemAsString("вищій"), "вищій"); // Ukrainian
89          assertEquals(stemAsString("βικιπαίδεια"), "βικιπαίδεια"); // Greek
90          assertEquals(stemAsString("ვიკიპედია"), "ვიკიპედია"); // Georgian
91          assertEquals(stemAsString("위키백과"), "위키백과"); // Korean
92          assertEquals(stemAsString("ውክፔዲያ"), "ውክፔዲያ"); // Amharic
93          assertEquals(stemAsString("ᐅᐃᑭᐱᑎᐊ"), "ᐅᐃᑭᐱᑎᐊ"); // Inuktitut
94      }
95  
96  }