EditDistCell.java
package org.wikimedia.search.glent.editdistance;
/**
* Object to hold the info needed in a cell of our crazy edit distance table.
*
* Simple Levenshtein edit distance usually only needs an integer in each cell.
* Damerau–Levenshtein may need floats for non-integral edits (e.g., if
* swap costs 1.5).
*
* We need floats for cost, but we are also tracking per-token normalized length
* and per-token edit cost in each cell.
*/
final class EditDistCell {
private float cost; // the cost to get to this cell of the edit distance table
private float tokenCost; // the cost to get to this cell in the current token
private float tokenNormLength; // the normalized length of the current token so far
private ComparisonInfo edCellCompInfo;
private static final float OVER_LIMIT = Float.POSITIVE_INFINITY;
/**
* Constructor takes initial weights, and reference to the pair-specific
* info object.
*
* @param cost initial value
* @param tokenCost initial value
* @param tokenNormLength initial value
* @param theCompInfo the pair-specific computed info
*/
EditDistCell(final float cost, final float tokenCost, final float tokenNormLength,
ComparisonInfo theCompInfo) {
this.cost = cost;
this.tokenCost = tokenCost;
this.tokenNormLength = tokenNormLength;
this.edCellCompInfo = theCompInfo;
}
EditDistCell(ComparisonInfo theCompInfo) {
this(0f, 0f, 0f, theCompInfo);
}
// accessors for cost, tokenCost, or tokenNormLength
void setTokenNormLength(final float tokenNormLength) {
this.tokenNormLength = tokenNormLength;
}
float getCost() {
return this.cost;
}
float getTokenNormLength() {
return this.tokenNormLength;
}
// set cost and tokenCost from another EditDistCell
void setCosts(final EditDistCell otherCell) {
this.cost = otherCell.cost;
this.tokenCost = otherCell.tokenCost;
}
/*
* Set cost and tokenCost from another EditDistCell, and check if the
* source cell is over per-token limits.
*
* @param otherCell the edit distance cell we are copying costs from
* @param atTokenEdge whether we are at a token edge in either direction
* @param perTokLim whether we care about per-token limits
*/
void setCostsAndCheckTokenEdge(final EditDistCell otherCell,
boolean atTokenEdge, boolean perTokLim) {
this.cost = otherCell.cost;
this.tokenCost = otherCell.tokenCost;
if (atTokenEdge && perTokLim && !edCellCompInfo.isSpacelessEquals() &&
otherCell.isOverTokenEditLimit(perTokLim)) {
this.cost = OVER_LIMIT;
}
}
// new token, same old string; reset per-token values to 0
void startNewToken() {
this.tokenCost = 0f;
this.tokenNormLength = 0f;
}
// update both costs by the same amount
// (cost and tokenCost tend to increment together)
void incrementCosts(final float incr) {
this.cost += incr;
this.tokenCost += incr;
}
/*
* Compare EditDistCell costs and copy if the other one is less (tokenCosts
* are just along for the ride).
*
* @param otherCell the edit distance cell we are copying costs from
*/
void setIfCostsLess(final EditDistCell otherCell) {
if (otherCell.cost < this.cost) {
this.setCosts(otherCell);
}
}
/**
* Determine whether the token has too many edits.
*
* Bail if we are not looking at per-token limits or we have two strings
* that have spaceless equality
*
* @param perTokLim whether we care about per-token limits
*
* @return true/false based on whether the token has too many edits
*/
boolean isOverTokenEditLimit(final boolean perTokLim) {
if (!perTokLim || edCellCompInfo.isSpacelessEquals()) {
return false;
}
if (edCellCompInfo.getCurrEditLimit() > 0
&& tokenCost > edCellCompInfo.getCurrEditLimit()) {
// too many edits by the raw numbers
return true;
}
if (edCellCompInfo.getCurrEditNormLimit() > 0 &&
tokenCost > tokenNormLength * edCellCompInfo.getCurrEditNormLimit()) {
// too many edits as a percentage of this token length
return true;
}
return false;
}
}