SlovakStemmer.java

/*
 * The WMF licenses this file to you under the Apache License, Version
 * 2.0 (the "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * *** Source Information ***
 *
 * This code combines implementation details and linguistic information
 * from two main sources.
 *
 * ** Light Stemmer for Czech **
 *
 * The implementation is based on the lucene-solr "Light Stemmer for
 * Czech", which is licensed from ASF under the Apache License, Version
 * 2.0. Source code is available here:
 * https://github.com/apache/lucene-solr/blob/master/lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java
 *
 * ** stemm-sk **
 *
 * The Slovak-specific suffix information is adapted from stemm-sk, which
 * is Copyright (c) 2015 Marek Šuppa and licensed under the MIT
 * License (included below, as required). Source code is available here:
 * https://github.com/mrshu/stemm-sk/
 *
 * | Slovak-specific suffix information Copyright (c) 2015 Marek Šuppa
 * |
 * | Permission is hereby granted, free of charge, to any
 * | person obtaining a copy of this software and associated
 * | documentation files (the "Software"), to deal in the
 * | Software without restriction, including without limitation
 * | the rights to use, copy, modify, merge, publish,
 * | distribute, sublicense, and/or sell copies of the
 * | Software, and to permit persons to whom the Software is
 * | furnished to do so, subject to the following conditions:
 * |
 * | The above copyright notice and this permission notice
 * | shall be included in all copies or substantial portions of
 * | the Software.
 *
 * ** Additional Sources **
 *
 * The stemm-sk source code includes its own additional sources. The
 * Light Stemmer for Czech source code references the paper "Indexing
 * and stemming approaches for the Czech language" by Dolamic and Savoy
 * (2009), which is also the ultimate source of the main Czech
 * implementation that stemm-sk is based on. The paper is available
 * here: http://portal.acm.org/citation.cfm?id=1598600 .
 *
 * ** Additional Changes **
 *
 * - Updates to resolve findbugs/spotbugs/checkstyle errors.
 *
 * - Added prefix stripping based on review of Slovak morphology and
 * comparison to Polish.
 */

package org.wikimedia.search.extra.analysis.slovak;

import static org.apache.lucene.analysis.util.StemmerUtil.deleteN;
import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
import static org.apache.lucene.analysis.util.StemmerUtil.startsWith;


public class SlovakStemmer {

    /**
     * Stem an input buffer of Slovak text.
     *
     * <p>Three passes run in a fixed order: case endings are removed first,
     * then possessive endings, and finally the prefix "naj". The buffer is
     * modified in place; only the first {@code n} characters, where
     * {@code n} is the returned length, are meaningful afterwards.</p>
     *
     * <p><b>NOTE</b>: Input is expected to be in lowercase,
     * but with diacritical marks.</p>
     *
     * @param s input buffer
     * @param len length of input buffer
     * @return length of input buffer after normalization
     */
    public int stem(char[] s, int len) {
        len = removeCase(s, len);
        len = removePossessives(s, len);
        return removePrefixes(s, len);
    }

    /**
     * Strip the prefix "naj" from words longer than five characters.
     *
     * @param s input buffer
     * @param len current length of the word in the buffer
     * @return length of the word after prefix removal
     */
    private int removePrefixes(char[] s, int len) {
        if (len > 5 && startsWith(s, len, "naj")) {
            // Unlike suffix removal, the remaining characters have to be
            // shifted to the front of the buffer.
            return deleteN(s, 0, len, 3);
        }
        return len;
    }

    /**
     * Remove a case ending, trying the longest suffixes first.
     *
     * <p>Each suffix group has its own minimum word length. Suffixes are
     * dropped by shrinking the returned length; the characters themselves
     * stay in the buffer. Groups that go through {@link #palatalize}
     * deliberately leave the first character of the suffix in place so that
     * palatalize can inspect the consonant + vowel pair before dropping
     * that last character itself.</p>
     *
     * @param s input buffer
     * @param len current length of the word in the buffer
     * @return length of the word after case removal
     */
    @SuppressWarnings({"NPathComplexity", "CyclomaticComplexity"})
    private int removeCase(char[] s, int len) {
        if (len > 7 && endsWith(s, len, "atoch")) {
            return len - 5;
        }

        if (len > 6 && endsWith(s, len, "aťom")) {
            // Keeps the leading "a" of the suffix; palatalize removes it.
            return palatalize(s, len - 3);
        }

        if (len > 5) {
            // Three-character suffixes whose first character is handed to
            // palatalize (net effect: all three characters are removed).
            if (endsWith(s, len, "och") ||
                endsWith(s, len, "ich") ||
                endsWith(s, len, "ích") ||
                endsWith(s, len, "ého") ||
                endsWith(s, len, "ami") ||
                endsWith(s, len, "emi") ||
                endsWith(s, len, "ému") ||
                endsWith(s, len, "ete") ||
                endsWith(s, len, "eti") ||
                endsWith(s, len, "iho") ||
                endsWith(s, len, "ího") ||
                endsWith(s, len, "ími") ||
                endsWith(s, len, "imu") ||
                endsWith(s, len, "aťa")) {
                return palatalize(s, len - 2);
            }
            // Three-character suffixes removed without palatalization.
            if (endsWith(s, len, "ách") ||
                endsWith(s, len, "ata") ||
                endsWith(s, len, "aty") ||
                endsWith(s, len, "ých") ||
                endsWith(s, len, "ové") ||
                endsWith(s, len, "ovi") ||
                endsWith(s, len, "ými")) {
                return len - 3;
            }
        }

        if (len > 4) {
            if (endsWith(s, len, "om")) {
                // Keeps the "o"; palatalize removes it.
                return palatalize(s, len - 1);
            }
            // NOTE(review): this drops the two-character suffix and then
            // palatalize drops one more character, so three characters are
            // removed in total. Presumably inherited from the Czech light
            // stemmer this class is based on - TODO confirm it is intended.
            // For a five-character word this calls palatalize with len == 3.
            if (endsWith(s, len, "es") ||
                endsWith(s, len, "ém") ||
                endsWith(s, len, "ím")) {
                return palatalize(s, len - 2);
            }
            // Two-character suffixes removed without palatalization.
            if (endsWith(s, len, "úm") ||
                endsWith(s, len, "at") ||
                endsWith(s, len, "ám") ||
                endsWith(s, len, "os") ||
                endsWith(s, len, "us") ||
                endsWith(s, len, "ým") ||
                endsWith(s, len, "mi") ||
                endsWith(s, len, "ou") ||
                endsWith(s, len, "ej")) {
                return len - 2;
            }
        }

        if (len > 3) {
            // Single-character endings.
            switch (s[len - 1]) {
                case 'e':
                case 'i':
                case 'í':
                    // Vowels that may follow a palatalized consonant.
                    return palatalize(s, len);
                case 'ú':
                case 'y':
                case 'a':
                case 'o':
                case 'á':
                case 'é':
                case 'ý':
                    return len - 1;
                default:
            }
        }

        return len;
    }

    /**
     * Remove the possessive endings "ov" and "in" from words longer than
     * five characters.
     *
     * @param s input buffer
     * @param len current length of the word in the buffer
     * @return length of the word after possessive removal
     */
    private int removePossessives(char[] s, int len) {
        if (len > 5) {
            if (endsWith(s, len, "ov")) {
                return len - 2;
            }
            if (endsWith(s, len, "in")) {
                // Keeps the "i"; palatalize inspects and then removes it.
                return palatalize(s, len - 1);
            }
        }

        return len;
    }

    /**
     * Undo palatalization at the end of the word and drop its last
     * character.
     *
     * <p>If the word ends in one of the recognized consonant + vowel
     * patterns, the consonant(s) are rewritten in place; in every case the
     * final character is then removed by returning {@code len - 1}.</p>
     *
     * @param s input buffer, possibly modified in place
     * @param len current length of the word in the buffer; callers in this
     *            class never pass less than 3
     * @return {@code len - 1}
     */
    @SuppressWarnings({"CyclomaticComplexity"})
    private int palatalize(char[] s, int len) {
        // The smallest value any caller passes is 3: removeCase strips
        // "es"/"ém"/"ím" from five-character words (e.g. "naším") before
        // calling in. The previous check (len > 3) rejected that valid
        // path when assertions were enabled. The endsWith checks below
        // already guard every array access for such short lengths.
        assert len >= 3 : "palatalize called with len " + len;

        if (endsWith(s, len, "ci") ||
            endsWith(s, len, "ce") ||
            endsWith(s, len, "či") ||
            endsWith(s, len, "če")) { // [cč][ie] -> k
            s[len - 2] = 'k';
        } else if (endsWith(s, len, "zi") ||
            endsWith(s, len, "ze") ||
            endsWith(s, len, "ži") ||
            endsWith(s, len, "že")) { // [zž][ie] -> h
            s[len - 2] = 'h';
        } else if (endsWith(s, len, "čte") ||
            endsWith(s, len, "čti") ||
            endsWith(s, len, "čtí")) { // čt[eií] -> ck
            s[len - 3] = 'c';
            s[len - 2] = 'k';
        } else if (endsWith(s, len, "šte") ||
            endsWith(s, len, "šti") ||
            endsWith(s, len, "ští")) { // št[eií] -> sk
            s[len - 3] = 's';
            s[len - 2] = 'k';
        }

        return len - 1;
    }
}