EsperantoStemmer.java

package org.wikimedia.search.extra.analysis.esperanto;

import static java.util.Collections.unmodifiableSet;

import java.util.Arrays;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.Set;

/*
 * This file was forked from this repo under a GPLv3 license:
 *    https://github.com/wjdeclan/esperanto_stemmer
 *
 *  Copyright (C) 2018 Declan Whitford Jones
 *
 *  Licensed under GPLv3
 *
 * =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
 *
 * Modified to pass WMF style checks, convert to WMF best practices, and use unmodifiable sets.
 * Significant refactoring done to reduce complexity of stemWord(), and to front-load hyphen
 * processing to reduce overall complexity.
 *
 * All modifications are also available under a GPLv3 license.
 *
 */

public class EsperantoStemmer {

    private static final Set<String> STEMMER_SUFFIXES = unmodifiableSet(initStemmerSuffixes());
    private static final Set<String> STEMMER_EXCEPTIONS = unmodifiableSet(initStemmerExceptions());
    private static final Set<String> BASIC_NUMERALS = unmodifiableSet(initNumerals());

    // words with limited inflections
    private static final Set<String> PLURAL_DIRECT_CHECKS = unmodifiableSet(initPluralDirectChecks());

    // bare suffixes: assumes that -j and -n (as in -oj, -on, -ojn) are stripped elsewhere
    private static final Set<String> BARE_SUFFIXES = unmodifiableSet(new HashSet<>(
            Arrays.asList("-o", "-a", "-e", "-")));

    private static final Set<Character> VOWELS = unmodifiableSet(new HashSet<>(
            Arrays.asList('a', 'e', 'i', 'o', 'u')));

    private static final String[] BIG_NUMBER_WORDS = {"mil", "cent", "dek"}; // 1000, 100, 10

    private static final String[] PLURAL_DIR_OBJ_SUFFIXES = {"jn", "n", "j"}; // longest first

    private static final int MAX_SUFFIX_LENGTH = initMaxSuffixLength();
    private static final int MIN_STEM_LENGTH = 2;

    private static final Pattern INFLECTED_NUMBER_PAT = Pattern.compile("^(.*[0-9])(a|an|aj|ajn|j|oj|ojn)$");

    // Given a word, return its stemmed form
    public String stemWord(String word) {

        // Check if it is an exception to stemming
        if (isExceptionOrNumber(word)) {
            return word;
        }

        // match strings ending in numbers that are inflected without a hyphen
        // they really should use a hyphen, but we know what they meant
        Matcher inflectedNumberMatcher = INFLECTED_NUMBER_PAT.matcher(word);
        if (inflectedNumberMatcher.matches()) {
            return inflectedNumberMatcher.group(1);
        }

        int localMinStemLength = Math.max(MIN_STEM_LENGTH, firstVowelPos(word) + 1);
        int pluralDirectOffset = calcPluralDirectOffset(word);

        String stem = word; // make a copy to pare down to the stem

        if (pluralDirectOffset > 0) {
            // remove plural (-j) and direct object (-n) suffixes and check for exceptions
            stem = word.substring(0, word.length() - pluralDirectOffset);
            if (PLURAL_DIRECT_CHECKS.contains(stem) || STEMMER_EXCEPTIONS.contains(stem)) {
                return stem;
            }
        }

        if (BARE_SUFFIXES.contains(stem)) {
            // if the token is a bare suffix, like -o, -a, -e, etc., return original word
            // to include -j and -n removed via pluralDirectOffset
            return word;
        }

        // deal with hyphens, which are used to offset inflections for non-standard words
        // like "1-oj" as the plural of "1". Compare to English apostrophe, as in "3's".
        int lastDash = stem.lastIndexOf('-'); // find the last dash, if there is one

        if (lastDash != -1) {
            String ending = stem.substring(lastDash);
            // if everything after the last dash is a bare suffix, strip it and we are done.
            if (BARE_SUFFIXES.contains(ending)) {
                return stem.substring(0, lastDash);
            }
        }

        // find and remove any known suffix, along with any plural/direct object endings
        return word.substring(0, word.length() - pluralDirectOffset -
            findSuffixLength(stem, localMinStemLength));
    }

    // find the position of the first vowel in the word; it must be part of the stem
    private static int firstVowelPos(String word) {
        for (int i = 0; i < word.length(); i++) {
            if (VOWELS.contains(word.charAt(i))) {
                return i;
            }
        }
        return -1;
    }

    // check exceptions: null, on explicit list, or ends with what looks like a complex number
    private static boolean isExceptionOrNumber(String word) {
        // check lists of explicit exceptions
        if (word == null || STEMMER_EXCEPTIONS.contains(word) || BASIC_NUMERALS.contains(word)) {
            return true;
        }

        // find the last big number element (ten, hundred, thousand) and check if the rest
        // of the word is a basic number. if so, we're done.
        int numeralIndex = -1;
        for (String bigNumWord : BIG_NUMBER_WORDS) {
            int index = word.lastIndexOf(bigNumWord);
            if (index != -1 && index >= numeralIndex) {
                numeralIndex = index + bigNumWord.length();
            }
        }
        if (numeralIndex != -1) {
            return BASIC_NUMERALS.contains(word.substring(numeralIndex));
        }
        return false;
    }

    // calculate the offset of plural marker (-j) and direct object marker (-n) from end of string
    private static int calcPluralDirectOffset(String word) {
        for (String suffix : PLURAL_DIR_OBJ_SUFFIXES) {
            if (word.equals(suffix)) {
                // don't trim the whole string as a suffix
                return 0;
            }
            if (word.endsWith(suffix)) {
                int suffixLength = suffix.length();
                char prevChar = word.charAt(word.length() - suffixLength - 1);
                if (prevChar == '-' || VOWELS.contains(prevChar)) {
                    return suffixLength;
                }
            }
        }
        return 0;
    }

    /*  String findSuffixLength()
     *
     *  find the length of the longest remaining known suffix of input string "stem"
     *
     *  stem: a copy of the word minus any plural -j or direct obj -n that we found, which
     *      we are going to trim down to the best suffix
     *  localMinStemLength: minimum stem length for this word == position of first vowel in
     *      the word or the global MIN_STEM_LENGTH, whichever is greater
     */
    private static int findSuffixLength(String stem, int localMinStemLength) {
        int initialStemLength = stem.length();

        // skip to either the maximum suffix, or after the min stem length,
        // whichever is further along
        int skipOver = Math.max(initialStemLength - MAX_SUFFIX_LENGTH, localMinStemLength);

        if (initialStemLength >= skipOver) {
            // if there's anything we should be skipping, skip it
            stem = stem.substring(skipOver);
        } else {
            // otherwise, there's no plausible suffix
            return 0;
        }

        while ((!STEMMER_SUFFIXES.contains(stem) && !stem.isEmpty())
               || (initialStemLength - stem.length() < localMinStemLength)) {
            // while not a known suffix and not empty, or stem is too short for this suffix,
            // keep trying...
            stem = stem.substring(1);
        }

        return stem.length();
    }

    // Suffixes are sourced from https://en.wikipedia.org/wiki/Esperanto_grammar
    private static Set<String> initStemmerSuffixes() {
        return new HashSet<>(Arrays.asList(
            // Part of speech suffixes
            "o", "a", "e", "i",
            // Verb conjugations
            // Mood
            "u", "us",
            // Indicative
            "is", "as", "os",
            // Voice
            "inta", "anta", "onta", "ita", "ata", "ota",
            // Compound Tense
            "intas", "antas", "ontas", "itas", "atas", "otas", "intis", "antis", "ontis",
            "itis", "atis", "otis", "intos", "antos", "ontos", "itos", "atos", "otos", "intus",
            "antus", "ontus", "itus", "atus", "otus",
            // Nominal participles
            "inte", "ante", "onte", "ite", "ate", "ote", "into", "anto", "onto", "ito", "ato", "oto"
        ));
    }

    // words that look like they get stemmed, but don't
    private static Set<String> initStemmerExceptions() {
        return new HashSet<>(Arrays.asList(
            // The article
            "la",
            // Conjunctions
            "kaj", "ke", "kie", "minus", "plus", "se",
            // Interjections
            "aha", "bis", "damne", "dirlididi", "fi", "forfikiĝu", "ha", "ho", "hola", "hu",
            "hura", "muu", "nedankinde", "nu", "oho", "ve",
            // Pronouns
            "aliu", "ĉio", "ĉiu", "ili", "io", "iŝi", "iu", "kio", "kiu", "nenio", "neniu",
            "oni", "tio", "tiu",
            // Determiners
            "ĉies", "ia", "kelka", "kia", "nenia", "tia", "tie",
            // Prepositions
            "cis", "ĉe", "da", "de", "disde", "ekde", "en", "ĝis", "je", "kun", "na", "po",
            "pri", "pro", "sen", "tra",
            // Adverbs
            "malplej", "malpli", "plej", "pli", "plu", "tamen",
            // Particles
            "ajn", "ĉu", "ĉi", "jen", "ju", "ne",
            // Dates
            "a", "an",
            // Roman numerals to 20
            "i", "ii", "iii", "vi", "vii", "viii", "xi", "xii", "xiii", "xvi", "xvii", "xviii",
            // irregular numeral
            "unu"
        ));
    }

    // list of words with limited inflections
    private static Set<String> initPluralDirectChecks() {
        return new HashSet<>(Arrays.asList(
            // pronouns
            "ci", "ĝi", "gi", "iŝi", "li", "mi", "ni", "ri", "ŝi", "si", "ŝli", "vi",
            // determiners
            "ia", "io", "iu"
        ));
    }

    // basic numerals 1-9
    private static Set<String> initNumerals() {
        return new HashSet<>(Arrays.asList(
            "unu", "du", "tri", "kvar", "kvin", "ses", "sep", "ok", "naŭ"
        ));
    }

    // find the length of the longest suffix on our list of suffixes
    private static int initMaxSuffixLength() {
        int maxLen = -1;
        for (String suffix : STEMMER_SUFFIXES) {
            int len = suffix.length();
            if (len > maxLen) {
                maxLen = len;
            }
        }
        return maxLen;
    }

}