EditDistItem.java

package org.wikimedia.search.glent.editdistance;

import java.util.HashSet;
import java.util.regex.Pattern;

import lombok.Getter;

/**
  * Class to convert a string into an EditDistItem for edit distance
  * computation.
  *
  * To be compatible with 32-bit Unicode characters, we convert strings to
  * an array of ints (representing codepoints). We also calculate various
  * useful stats on the string and compute variants of the string.
  *
  * Future optimization: Consider making this class public and creating a version
  * of computeEditDistance() that takes two EditDistItems, or an EditDistItem and
  * a string, so that when comparing one string against many others the
  * EditDistItem doesn't need to be repeatedly built.
  */
@Getter
final class EditDistItem {
    final int[] text;             // the string as Unicode codepoints
    final boolean[] isDigit;      // for each codepoint, is it a digit?
    final String spacelessText;   // all instances of tokenSep removed for quick comparison
    final float normLength;       // # of codepoints, with duplicates discounted
    final HashSet<Integer> uniqueCodepoints; // hash of unique codepoints
    final int tokenCount;         // number of tokens

    final EDConfig edItemConfig;  // the config for the calling TokenAwareEditDistance

    /**
      * Constructor for EditDistItem
      *
      * Tokenize string, count tokens, create codepoint array and isDigit
      * array, find normalized codepoint length, create spaceless version
      * of the text, and a hash set of unique codepoints.
      *
      * @param s the string to convert to EditDistItem
      * @param edConfig an edit dist config object with all the relevant costs
      *     and tokenizing values.
      */
    EditDistItem(final String s, EDConfig edConfig) {
        edItemConfig = edConfig;
        if (s == null || s.isEmpty()) {
            text = new int[0];
            tokenCount = 0;
            spacelessText = "";
            isDigit = new boolean[0];
            normLength = 0;
            uniqueCodepoints = new HashSet<>();
        } else {
            String[] tokens = edItemConfig.getTokenizer().apply(s);
            tokenCount = tokens.length;
            String tokStr = String.join(edItemConfig.getTokenSepStr(), tokens).trim();

            text = tokStr.codePoints().toArray();
            spacelessText = tokStr.replaceAll(
                Pattern.quote(edItemConfig.getTokenSepStr()), "");
            isDigit = new boolean[text.length];

            // calc normLength of text and init isDigit
            float normLen = 0f;

            for (int i = 0; i < text.length; i++) {
                if (isDuplicate(i)) {
                    normLen += edItemConfig.getDuplicateCost();
                } else {
                    normLen += edItemConfig.getInsDelCost();
                }

                // Note: Character.isDigit() doesn't match everything that the
                // regex \p{N} does; it misses Ethiopic digits, some Tibetan
                // digits, "number" characters (>10) and lots of typographic
                // variants: circled, parens, full stop, etc. All those are
                // generally quite rare, though. isDigit() is fairly slow, so we
                // pre-compute; converting to String to use \p{N} is even
                // slower, alas.
                isDigit[i] = Character.isDigit(text[i]);
            }
            normLength = normLen;

            // create hash of unique codepoints
            uniqueCodepoints = new HashSet<>();
            for (int c : text) {
                uniqueCodepoints.add(c);
            }
        }
    }

    private int codepointAt(final int idx) {
        if (idx < 0) {
            return -1;
        }
        return text[idx];
    }

    /**
      * Note that the codepoint at 0 can't be a duplicate because there is no
      * codepoint before it.
      */
    boolean isDuplicate(final int idx) {
        if (idx == 0) {
            return false;
        }
        return text[idx] == text[idx - 1];
    }

    boolean isTokenSep(final int idx) {
        return text[idx] == edItemConfig.getTokenSep();
    }

    /**
      * return true if the codepoint is the the first codepoint
      *     (idx == 0) or right after a token separator, false if not.
      */
    boolean isTokenStart(final int idx) {
        if (idx == 0) {
            return true;
        }
        return text[idx - 1] == edItemConfig.getTokenSep();
    }

    /**
      * Determine whether two codepoints have been swapped.
      *
      * Let codepointAt() handle index < 0 that are out of range.
      *
      * @param idx index of relevant codepoint in this EditDistItem
      * @param otherItem the other EditDistItem to compare to
      * @param otherIdx index of relevant codepoint in the other
      *     EditDistItem
      *
      * @return true if the codepoints have been swapped, false if
      *     not, or if anything is out of range.
      */
    boolean isSwapped(final int idx, final EditDistItem otherItem,
            final int otherIdx) {
        return this.codepointAt(idx - 1) == otherItem.codepointAt(otherIdx)
            && this.codepointAt(idx) == otherItem.codepointAt(otherIdx - 1);
    }

    /**
      * Caclulate the minimum edit distance based on unique codepoints.
      *
      * Used for early termination of edit distance calculation. Calculate
      * the minimum possible cost between two items based on the raw number of
      * unique codepoints in each item (excesses in either direction incur
      * insDelCost), and the non-overlap in unique codepoint sets
      * (non-matching leftovers incur substCost)
      *
      * @param otherItem the other EditDistItem to compare to
      *
      * @return the minimum cost based on unique codepoint info
      */
    float calcUniqCharMinCost(final EditDistItem otherItem) {
        // find the *size* of the intersection of the unique codepoints
        float charIntersectionCount = 0;
        for (int cp : this.uniqueCodepoints) {
            if (otherItem.uniqueCodepoints.contains(cp)) {
                charIntersectionCount++;
            }
        }

        /* Every unique character has to be inserted, deleted, or changed so
         * diffs in raw counts indicate necessary insertions (insDelCost).
         * remaining unique characters must at least be substituted
         * (substCost).
         *
         * e.g., best case for the unique set of letters {a,b,c,d} vs {a,f,g} is:
         * 1 stays the same (a), plus 2 substitutions (b->f, c->g), plus 1 delete
         * (d). [Note that the b/c/d vs f/g alignment is arbitrary]
         */

        // minimum insertions is difference in set sizes. e.g., {a,b,c,d} has 4,
        // but {a,f,g} has 3, so there has to be one insertion.
        float minInsertions = Math.abs((float)this.uniqueCodepoints.size() -
                    otherItem.uniqueCodepoints.size());
        minInsertions *= edItemConfig.getInsDelCost(); // multiply by cost per insertion

        // minimum substitutions is smaller set size, minus size of overlap.
        // e.g., {a,b,c,d} (4) and {a,f,g} (3) overlap by {a} (1), so the two
        // (3-1) remaining elements {f,g} could at best be substitutions.
        float minSubstitutions = Math.min(this.uniqueCodepoints.size(),
                    otherItem.uniqueCodepoints.size()) -
                    charIntersectionCount;
        minSubstitutions *= edItemConfig.getSubstCost(); // multiply by cost per substitution

        return minInsertions + minSubstitutions;
    }

    /**
      * Caclulate penalty for differing number of tokens.
      *
      * Used for early termination of edit distance calculation. Calculate
      * the penalty incurred for having differing number of tokens, taking
      * into account the discount (i.e., no penalty) for spaceless equality
      * (in which case we just want to find the spaces).
      *
      * @param otherItem the other EditDistItem to compare to
      * @param theCompInfo the pair-specific computed info
      *
      * @return the token difference penalty
      */
    float calcTokenDiffPenalty(final EditDistItem otherItem,
            ComparisonInfo theCompInfo) {
        return theCompInfo.isSpacelessEquals() ? 0 : // no penalty for spaceless equality
            Math.abs((this.tokenCount - otherItem.tokenCount) *
                edItemConfig.getTokenDeltaPenalty());
    }

    /**
      * Caclulate cost for substituting codepoints.
      *
      * Calculate cost for substitution; if codepoints are equal, no cost!
      * otherwise, check for token start and digit penalties.
      *
      * @param idx index of relevant codepoint in this EditDistItem
      * @param otherItem the other EditDistItem to compare to
      * @param otherIdx index of relevant codepoint in the other
      *     EditDistItem
      *
      * @return the final substitution cost
      */
    float calcSubstCost(final int idx, final EditDistItem otherItem,
            final int otherIdx) {
        float cost = 0f;

        if (this.codepointAt(idx) != otherItem.codepointAt(otherIdx)) {
            // if not equal, add cost for substitution
            cost += edItemConfig.getSubstCost();

            // if we are at the start of either token, add penalty
            if (this.isTokenStart(idx) || otherItem.isTokenStart(otherIdx)) {
                cost += edItemConfig.getTokenInitialPenalty();
            }

            // if we are changing a token sep character, add penalty
            if (this.isTokenSep(idx) || otherItem.isTokenSep(otherIdx)) {
                cost += edItemConfig.getTokenSepSubstPenalty();
            }

            // add penalty for changing numbers
            if (this.isDigit[idx] && otherItem.isDigit[otherIdx]) {
                cost += edItemConfig.getDigitChangePenalty();
            }
        }

        return cost;
    }

    /**
      * Caclulate cost for swapping codepoints.
      *
      * Calculate cost for swap; check for digit penalties.
      *
      * @param idx index of relevant codepoint in this EditDistItem
      * @param otherItem the other EditDistItem to compare to
      * @param otherIdx index of relevant codepoint in the other
      *     EditDistItem
      *
      * @return the final swap cost
      */
    float calcSwapCost(final int idx, final EditDistItem otherItem, final int otherIdx) {
        if (this.isDigit[idx] && otherItem.isDigit[otherIdx]) {
            return edItemConfig.getSwapCost() + edItemConfig.getDigitChangePenalty();
        }

        return edItemConfig.getSwapCost();
    }

    /**
      * Caclulate cost for inserting or deleting a codepoint.
      *
      * Handle all possible variations of inserting or deleting a codepoint,
      * including spaceless equality discount, token start and digit
      * penalties, and duplicated character discount.
      *
      * @param idx index of relevant codepoint in this EditDistItem
      * @param theCompInfo the pair-specific computed info
      *
      * @return the final insert or delete cost
      */
    float calcInsDelCost(final int idx, final ComparisonInfo theCompInfo) {
        // if spaceless-equal then only incur discounted cost
        if (theCompInfo.isSpacelessEquals() && this.isTokenSep(idx)) {
            return edItemConfig.getSpaceOnlyCost();
        }

        // extra penalty for changing the first codepoint in a token or
        // inderting/deleting a digit.
        float penalty = 0f;
        if (this.isTokenStart(idx)) {
            penalty += edItemConfig.getTokenInitialPenalty();
        }
        if (this.isDigit[idx]) {
            penalty += edItemConfig.getDigitChangePenalty();
        }

        // give duplicate discount, or full insert/delete cost
        if (this.isDuplicate(idx)) {
            return edItemConfig.getDuplicateCost() + penalty;
        }

        return edItemConfig.getInsDelCost() + penalty;
    }
}