TokenAwareEditDistance.java
package org.wikimedia.search.glent.editdistance;
import java.io.Serializable;
import java.util.Arrays;
import lombok.Builder;
/* Token-Aware Edit Distance
*
* Compute a modified Damerau–Levenshtein edit distance (inserts, deletions,
* substitutions, transpositions) that also accounts for duplicated characters,
* multi-token strings, token-initial character changes, per-string edit limits,
* per-token edit limits, length-proportional edit limits (for strings and
* tokens), token-count differences, and strings that differ only by tokenization.
*/
@Builder
public final class TokenAwareEditDistance implements Serializable {
// use Infinity for being over any edit limits, since you can still do
// math to it.
private static final float OVER_LIMIT = Float.POSITIVE_INFINITY;
// internal configuration, abstracted into a class so we can keep
// track of what is what.
private final EDConfig edConfig;
public TokenAwareEditDistance(final EDConfig edConfig) {
// the builder loses the reference to this config object, so it is
// effectively immutable
this.edConfig = edConfig;
}
/**
* Computes the cost of weighted edits required to transform str1 into str2,
* using the default edit limits for this TokenAwareEditDistance instance.
*
* @param str1 the first string to be compared
* @param str2 the second string to be compared
* @return the computed edit distance between the two strings;
* returns Float.POSITIVE_INFINITY for early termination OR if the edit
* distance is over either limit.
*/
public float calcEditDistance(final String str1, final String str2) {
return calcEditDistance(str1, str2, edConfig.getDefaultLimit(), edConfig.getDefaultNormLimit());
}
/**
* Computes the cost of weighted edits required to transform str1 into str2.
* <p>
* Allowed edits include inserting, deleting, substituting, transposing, or
* (de-)duplicating letters. Additional penalties are applied for changing
* the first character of a token, changing the number of tokens in the
* string, or modifying digits. There is special discounted processing for
* strings that are the same modulo the tokenSep character.
* <p>
* A maximum allowable edit distance (editLimit) or proportional edit
* distance (editNormLimit) can be used to terminate early based on the cost
* of edits (for the whole string or per token) or based on changes to the
* number of tokens. In the case of early termination the edit distance returned
* is Float.POSITIVE_INFINITY.
*
* @param str1 the first string to be compared
* @param str2 the second string to be compared
* @param editLimit the maximum relevant edit distance; 0 = no limit
* @param editNormLimit the maximum relevant edit distance as a percentage;
* 0 = no limit
* @return the computed edit distance between the two strings;
* returns Float.POSITIVE_INFINITY for early termination OR if the edit
* distance is over either limit.
*/
// Cyclomatic Complexity was 66 and NPath Complexity was
// 60,552,907,617,280! It's much lower now. Going further just seems
// to be "optimizing the metric". Suppress the remaining warnings.
@SuppressWarnings({"NPathComplexity", "CyclomaticComplexity"})
public float calcEditDistance(final String str1, final String str2,
final float editLimit, final float editNormLimit) {
final EditDistItem edItem1 = new EditDistItem(str1, edConfig);
final EditDistItem edItem2 = new EditDistItem(str2, edConfig);
/////////////////
// Check for simple equality (after tokenization, not of the original
// strings)
if (Arrays.equals(edItem1.text, edItem2.text)) {
return 0;
}
// copy internal-only per-comparison computed values into a ComparisonInfo
// object so we can pass them around together: absolute edit limit,
// percentage (norm) edit limit, and "spaceless equality" (e.g., same as
// "sp acel esseq uali ty")
boolean spacelessEq = edItem1.spacelessText.equals(edItem2.spacelessText);
ComparisonInfo compInfo = new ComparisonInfo(editLimit, editNormLimit,
spacelessEq);
if (edItem1.text.length == 0 || edItem2.text.length == 0) {
// max is non-zero value
return calcEmptyInputRetVal(
Math.max(edItem1.normLength, edItem2.normLength),
edItem1.text.length,
compInfo
);
}
float tokenDiffPenalty = edItem1.calcTokenDiffPenalty(edItem2, compInfo);
float adjustedEditLimit = 0f;
boolean limitsExist = editLimit > 0 || editNormLimit > 0;
if (limitsExist) {
adjustedEditLimit = calcAdjustedEditLimit(edItem1.normLength, edItem2.normLength, compInfo);
adjustedEditLimit -= tokenDiffPenalty;
if (adjustedEditLimit < edItem1.calcUniqCharMinCost(edItem2)) {
// early termination based on unique characters or token diffs
// (if adjustedEditLimit < 0); note that calcUniqCharMinCost() is
// always >= 0
return OVER_LIMIT;
}
}
// Since we aren't calculating the edit path, just the total distance, we
// only need three working rows for modified Damerau–Levenshtein distance
// with swaps.
//
// Create three rows of length 1 + |str2| to hold our work. 1 + ... because
// we need an additional element for the initial column. As a result,
// row{Curr|Next|Prev}[i] corresponds to str[i - 1]
//
// Prev is row before Curr, for looking back for swaps
// Curr is the current row
// Next is the row we are filling in
EditDistRow rowPrev = new EditDistRow(edItem2, compInfo);
EditDistRow rowCurr = new EditDistRow(edItem2, compInfo);
EditDistRow rowNext = new EditDistRow(edItem2, compInfo);
rowCurr.initFirstRow(edItem2);
// for each character of str1, compute the cost (rowNext) from the
// row before (rowCurr) (with rowPrev available to check for swaps)
for (int i = 0; i < edItem1.text.length; i++) {
// minimum value for a given row (for early termination)
float rowMin = rowNext.initFirstCell(rowCurr, edItem1, i);
// compute the cost of the rest of the row
for (int j = 0; j < edItem2.text.length; j++) {
EditDistCell minCost = new EditDistCell(compInfo);
// minimum cost to get to the current cell
EditDistCell nextCost = new EditDistCell(compInfo);
// temp var for candidate cost
// is either string at the start of a token?
boolean atTokenEdge = edItem1.isTokenSep(i) || edItem2.isTokenSep(j);
/////////////////////////////////
// check substitution vs equality
// initialize minCost assuming equality (no add'l cost; copy diagonally)
// then add substitution cost as needed
minCost.setCostsAndCheckTokenEdge(rowCurr.getRow(j), atTokenEdge,
edConfig.isPerTokenLimit());
minCost.incrementCosts(edItem1.calcSubstCost(i, edItem2, j));
/////////////
// check swap
// check for possible swap (rowPrev has costs from two rows ago); no
// penalty for swapping the first character of a token and a token
// separator
if (edItem1.isSwapped(i, edItem2, j)) {
// copy cost from two back diagonally & add swap cost
nextCost.setCostsAndCheckTokenEdge(rowPrev.getRow(j - 1), atTokenEdge,
edConfig.isPerTokenLimit());
nextCost.incrementCosts(edItem1.calcSwapCost(i, edItem2, j));
minCost.setIfCostsLess(nextCost);
}
//////////////////
// check insertion
// copy from previous column, plus penalty for digit or first char in token
nextCost.setCostsAndCheckTokenEdge(rowNext.getRow(j), atTokenEdge,
edConfig.isPerTokenLimit());
nextCost.incrementCosts(edItem2.calcInsDelCost(j, compInfo));
minCost.setIfCostsLess(nextCost);
/////////////////
// check deletion
// copy from previous row, plus penalty for digit or first char in token
nextCost.setCostsAndCheckTokenEdge(rowCurr.getRow(j + 1), atTokenEdge,
edConfig.isPerTokenLimit());
nextCost.incrementCosts(edItem1.calcInsDelCost(i, compInfo));
minCost.setIfCostsLess(nextCost);
// assign cost of minimum cost path
rowNext.getRow(j + 1).setCosts(minCost);
//////////////////////////
// do the per-token things
// calculate the normalized token length for this token so far
rowNext.getRow(j + 1).setTokenNormLength(calcTokenNormLength(edItem1, edItem2, i, j,
rowNext.getRow(j).getTokenNormLength(),
rowCurr.getRow(j + 1).getTokenNormLength()));
// if we are at a token boundary, start a new token
if (atTokenEdge) {
rowNext.getRow(j + 1).startNewToken();
}
// update rowMin if this is lower
rowMin = Math.min(rowNext.getRow(j + 1).getCost(), rowMin);
}
// rotate rowCurr, rowNext, rowPrev for next round of calculations
EditDistRow rowTemp = rowCurr;
rowCurr = rowNext; // rowCurr is now the most up-to-date
rowNext = rowPrev;
rowPrev = rowTemp;
if (limitsExist && rowMin > adjustedEditLimit) {
// early termination based on too many edits on this row
return OVER_LIMIT;
}
}
// check final edit distance cell against per-token edit limits
if (rowCurr.getRow(edItem2.text.length).isOverTokenEditLimit(edConfig.isPerTokenLimit())) {
// final token has too many edits
return OVER_LIMIT;
}
// if there are too many total edits in the final cell of the table, bail
if (limitsExist && rowCurr.getRow(edItem2.text.length).getCost() > adjustedEditLimit) {
// total edits for the whole string are over the limit
return OVER_LIMIT;
}
// pull out final edit distance, add penalty for differing number of
// tokens, and return
return rowCurr.getRow(edItem2.text.length).getCost() + tokenDiffPenalty;
}
/**
* Calculate the maximum normalized edit limit based on the norm type and
* normalized lengths of the items being compared.
*
* @param len1 normalized length of the first string
* @param len2 normalized length of the second string
* @return maximum normalized edit limit
*/
private float calcEditNormLimitByType(final float len1, final float len2, ComparisonInfo compInfo) {
// if getCurrEditNormLimit() == 0, there is no limit
if (compInfo.getCurrEditNormLimit() <= 0) {
return 0f;
}
switch (edConfig.getNormType()) {
case MIN:
return compInfo.getCurrEditNormLimit() * Math.min(len1, len2);
case FIRST:
return compInfo.getCurrEditNormLimit() * len1;
case MAX:
return compInfo.getCurrEditNormLimit() * Math.max(len1, len2);
default:
throw new UnsupportedOperationException("Don't know how to handle case " + edConfig.getNormType());
}
}
/**
* Calculate the "adjusted" edit limit, which we actually use to check for
* early termination.
* <p>
* Also account for possible decrease in edit cost if insDelCost > swapCost.
*
* @param len1 normalized length of the first string
* @param len2 normalized length of the second string
* @param compInfo per-pair comparison values
* @return the maximum number of edits we can allow
*/
private float calcAdjustedEditLimit(final float len1, final float len2, ComparisonInfo compInfo) {
float normEditMax = calcEditNormLimitByType(len1, len2, compInfo);
float adjLimit;
// if both limits are > 0, i.e., both limits are in effect, then take
// the lower limit.
// if either is <= 0, then take the max, i.e., ignoring the 0 limit
// if both are <= 0, the max will also be <= 0 (indicating no limit).
if (compInfo.getCurrEditLimit() > 0 && normEditMax > 0) {
adjLimit = Math.min(compInfo.getCurrEditLimit(), normEditMax);
} else {
adjLimit = Math.max(compInfo.getCurrEditLimit(), normEditMax);
}
// if swap cost is less than insert/delete cost, then cost can go
// *down* from one row to the next, so adjust edit limit to account
// for that.
if (edConfig.getSwapCost() < edConfig.getInsDelCost()) {
adjLimit += edConfig.getInsDelCost() - edConfig.getSwapCost();
}
return adjLimit;
}
/**
* Calculate the normalized length of the token so far based on the cell
* above and the cell to the left, based on the normalization type.
* <p>
* This requires a lot of information, so we have to pass a lot in. We need
* the editDistItems and indexes into them so we can determine whether
* relevant characters are duplicated or token separators. We need the
* current normalized token length of the cell to the left and the cell
* above to build on.
*
* @param ed1 edit dist item for the first string
* @param ed2 edit dist item for the second string
* @param idx1 index of current char in ed1
* @param idx2 index of current char in ed2
* @param tnlLeft the tokenNormLength from the cell to the left
* @param tnlAbove the tokenNormLength from the cell above
* @return the normalized length of the token so far
*/
private float calcTokenNormLength(final EditDistItem ed1, final EditDistItem ed2,
final int idx1, final int idx2, final float tnlLeft,
final float tnlAbove) {
// if we are at a token boundary, we may need to increment
// tokenNormalizedLength; calculate the increments (with discount for
// duplicates)
float incrCostLeft = ed2.isDuplicate(idx2) ? edConfig.getDuplicateCost() : edConfig.getInsDelCost();
float incrCostAbove = ed1.isDuplicate(idx1) ? edConfig.getDuplicateCost() : edConfig.getInsDelCost();
switch (edConfig.getNormType()) {
case MIN:
// take the min of tokenNormLength to left and above, plus any
// increments
return Math.min(tnlLeft + incrCostLeft, tnlAbove + incrCostAbove);
case FIRST:
if (ed2.isTokenStart(idx2)) {
// if we are the beginning of a token in str2, copy from
// tokenNormLength above and increment tokenNormalizedLength
return tnlAbove + incrCostAbove;
} else {
// just copy from the tokenNormalizedLength to the left
return tnlLeft;
}
case MAX:
if (!ed1.isTokenStart(idx1)) {
incrCostLeft = 0f;
}
if (idx1 != 0 && !ed2.isTokenStart(idx2)) {
incrCostAbove = 0f;
}
// take the max of tokenNormalizedLength to left and above, plus any
// increment
return Math.max(tnlLeft + incrCostLeft, tnlAbove + incrCostAbove);
default:
throw new UnsupportedOperationException("Don't know how to handle case " + edConfig.getNormType());
}
}
/**
* Computes the return value (finite or infinite) when one of the input items is
* empty (or null), based on the proposed return value and the edit distance
* limits, if any.
*
* @param retVal the normalized length of the non-empty string
* @param firstLen the length of the first item
* @param compInfo per-pair comparison values
* @return retVal if the length is not over the raw edit distance or normalized
* edit dist limit; OVER_LIMIT otherwise
*/
private float calcEmptyInputRetVal(float retVal, float firstLen, ComparisonInfo compInfo) {
// retVal can only be zero if both inputs are zero-length, so bail now.
// (This shouldn't happen with the way the code is currently structured, but
// play it safe.)
if (retVal == 0) {
return retVal;
}
// is the raw edit limit relevant? Are we over the raw limit?
if (compInfo.getCurrEditLimit() > 0 && retVal > compInfo.getCurrEditLimit()) {
return OVER_LIMIT;
}
// is the normalized edit limit relevant?
// IF NormType == MIN *OR*
// NormType == FIRST + first item is empty
// THEN limit == 0, and we are over since second item is not also empty
// IF NormType == MAX *OR*
// NormType == FIRST + first item is *not* empty,
// THEN dist == 100%, so the only thing that matters is whether the limit
// is < 100%
if (compInfo.getCurrEditNormLimit() > 0 &&
(edConfig.getNormType() == NormType.MIN ||
(edConfig.getNormType() == NormType.FIRST && firstLen == 0) ||
compInfo.getCurrEditNormLimit() < 1)) {
return OVER_LIMIT;
}
// not over any limits
return retVal;
}
}