EDConfig.java

package org.wikimedia.search.glent.editdistance;

import java.io.Serializable;
import java.util.Locale;
import java.util.function.Function;

import lombok.Getter;

// EDConfig class to set up TokenAwareEditDistance
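// Illustrative usage sketch (only the Builder and lombok-generated getters below are
// assumed; how the resulting config is passed to TokenAwareEditDistance is not shown):
//
//   EDConfig config = EDConfig.Builder.newInstance()
//       .setLocale(Locale.ENGLISH)
//       .setDefaultLimit(2.0f)
//       .build();
//   String[] tokens = config.getTokenizer().apply("Hello, World!");
//   // with the default tokenizer: tokens == {"hello", "world"}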
@Getter
public final class EDConfig implements Serializable {
    // Edit distance cost params, default values
    private float defaultLimit = 2.0f; // default edit dist limit for early termination
    private float defaultNormLimit; // defaults to 0; default normalized edit dist limit
                                    // for early termination
    private float insDelCost = 1.0f; // cost for insertion or deletion (abc / ac)
    private float substCost = 1.0f; // cost for substitution (abc / adc)
    private float swapCost = 1.25f; // cost for swapping two letters (abc / bac)
    private float duplicateCost = 0.05f; // cost for a duplicated letter (abc / abbc)
    private float digitChangePenalty = 0.33f; // add'l cost for changing one digit to
                    // another. Note: if digitChangePenalty is more than
                    // insDelCost, it will be cheaper to delete one digit and
                    // insert the other so the effective digitChangePenalty will
                    // be == insDelCost
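                    // Illustrative arithmetic under the defaults: a digit-for-digit
                    // substitution costs substCost + digitChangePenalty = 1.33, which
                    // is still cheaper than delete + insert (2 * insDelCost = 2.0).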
    private NormType normType = NormType.MAX; // normalization method

    // Token processing params
    private char tokenSep = ' '; // space -- expected separator between tokens
    private String tokenSepStr = Character.toString(tokenSep);
            // string version of tokenSep (calculated)
    private float tokenInitialPenalty = 0.25f; // add'l cost for changing the first
                                               // char of a token
    private float tokenSepSubstPenalty = 0.50f; // add'l cost for changing the tokenSep
                                                // char to something else
    private float tokenDeltaPenalty = 0.25f; // cost for changing the number of tokens,
                                             // per token
    private float spaceOnlyCost = 0.1f; // cost for inserting/deleting only spaces
    private boolean perTokenLimit = true; // should limits be enforced on each token?
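    // Illustrative, under the defaults above: substituting the first character of a
    // token (e.g., "cat" -> "bat") costs substCost + tokenInitialPenalty = 1.25, since
    // the token penalties here are added on top of the base edit costs.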

    // Default tokenization params
    private Locale locale = Locale.ENGLISH; // locale for lowercasing; null for none
    private String tokenSplit = "[\\p{Z}\\p{P}\\p{S}]+";
        // regex to split tokens: any run of whitespace, punctuation, or symbols, which
        // may be too aggressive in some cases; if \p{S} is removed, add the literal +
        // (a symbol, not punctuation) to the character class instead
    private Tokenizer tokenizer; // tokenizer for strings to be compared;
                                 // constructed in build() if none is supplied.

    public interface Tokenizer extends Function<String, String[]>, Serializable { }
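    // A custom tokenizer can be supplied as a lambda, e.g. this illustrative sketch
    // that splits on whitespace only and skips lowercasing:
    //   Builder.newInstance().setTokenizer(s -> s.trim().split("\\s+")).build();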

    private EDConfig() {}

    public static final class Builder {
        private EDConfig theConfig;

        Builder() {
            this.theConfig = new EDConfig();
        }

        public static Builder newInstance() {
            return new Builder();
        }

        public EDConfig build() {
            EDConfig readyConfig = this.theConfig;

            // reset the builder's config so the current config cannot be
            // modified, but the builder instance can be re-used
            this.theConfig = new EDConfig();

            // build a default tokenizer if none has been provided, using the
            // tokenSplit regex
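            // (trimming leading separators first matters: String.split would otherwise
            // return an empty leading token)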
            if (readyConfig.tokenizer == null) {
                String tokenTrim = "^" + Pattern.quote(readyConfig.tokenSplit)
                    + "|" + readyConfig.tokenSplit + "$";
                readyConfig.tokenizer = makeTokenizer(readyConfig.locale,
                    tokenTrim, readyConfig.tokenSplit);
            }

            return readyConfig;
        }

        public Builder setDefaultLimit(final float defaultLimit) {
            this.theConfig.defaultLimit = defaultLimit;
            return this;
        }

        public Builder setDefaultNormLimit(final float defaultNormLimit) {
            this.theConfig.defaultNormLimit = defaultNormLimit;
            return this;
        }

        public Builder setInsDelCost(final float insDelCost) {
            this.theConfig.insDelCost = insDelCost;
            return this;
        }

        public Builder setSubstCost(final float substCost) {
            this.theConfig.substCost = substCost;
            return this;
        }

        public Builder setSwapCost(final float swapCost) {
            this.theConfig.swapCost = swapCost;
            return this;
        }

        public Builder setDuplicateCost(final float duplicateCost) {
            this.theConfig.duplicateCost = duplicateCost;
            return this;
        }

        public Builder setDigitChangePenalty(final float digitChangePenalty) {
            this.theConfig.digitChangePenalty = digitChangePenalty;
            return this;
        }

        public Builder setTokenSep(final char tokenSep) {
            this.theConfig.tokenSep = tokenSep;
            this.theConfig.tokenSepStr = Character.toString(tokenSep);
            return this;
        }

        public Builder setTokenInitialPenalty(final float tokenInitialPenalty) {
            this.theConfig.tokenInitialPenalty = tokenInitialPenalty;
            return this;
        }

        public Builder setTokenSepSubstPenalty(final float tokenSepSubstPenalty) {
            this.theConfig.tokenSepSubstPenalty = tokenSepSubstPenalty;
            return this;
        }

        public Builder setTokenDeltaPenalty(final float tokenDeltaPenalty) {
            this.theConfig.tokenDeltaPenalty = tokenDeltaPenalty;
            return this;
        }

        public Builder setSpaceOnlyCost(final float spaceOnlyCost) {
            this.theConfig.spaceOnlyCost = spaceOnlyCost;
            return this;
        }

        public Builder setPerTokenLimit(final boolean perTokenLimit) {
            this.theConfig.perTokenLimit = perTokenLimit;
            return this;
        }

        public Builder setLocale(final Locale locale) {
            this.theConfig.locale = locale;
            return this;
        }

        public Builder setTokenSplit(final String tokenSplit) {
            this.theConfig.tokenSplit = tokenSplit;
            return this;
        }

        public Builder setTokenizer(final Tokenizer tokenizer) {
            this.theConfig.tokenizer = tokenizer;
            return this;
        }

        public Builder setNormType(final NormType normType) {
            this.theConfig.normType = normType;
            return this;
        }

        /**
          * Create a simple tokenizer function for alphabetic scripts that uses
          * the given locale, tokenSplit regex, and tokenTrim regex.
          *
          * This is a simple tokenizer that works with scripts that separate words
          * (e.g., English, Russian, Hindi, Hebrew, etc., but not CJK, Thai, etc.)
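          *
          * For example, with Locale.ENGLISH and the default tokenTrim and tokenSplit
          * regexes built in build(), "Hello, World!" is tokenized to ["hello", "world"].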
          *
          * @param lowercaseLocale locale for lowercasing
          * @param tokenTrimRegex regex for trimming the beginning and end of the
          *     string before tokenizing
          * @param tokenSplitRegex regex for splitting a string into tokens
          *
          * @return the tokenizer function
          */
        private static Tokenizer makeTokenizer(Locale lowercaseLocale,
                String tokenTrimRegex, String tokenSplitRegex) {
            return s -> {
                if (lowercaseLocale != null) {
                    s = s.toLowerCase(lowercaseLocale);
                }
                s = s.replaceAll(tokenTrimRegex, "");

                return s.split(tokenSplitRegex);
            };
        }
    }
}