// GlentTokenizer.java
package org.wikimedia.search.glent.analysis;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.List;
import java.util.function.Function;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import com.google.common.collect.ImmutableList;
import lombok.SneakyThrows;
/**
 * Base {@link Analyzer} that exposes its analysis chain as a plain
 * {@code List<String>} of tokens. Instances are also usable as a
 * {@code Function<String, List<String>>} (e.g. in Spark pipelines).
 *
 * <p>{@link Externalizable} is implemented as a no-op in both directions:
 * concrete subclasses are expected to be stateless, so nothing needs to be
 * written or read — serialization merely reconstructs the subclass.
 */
public abstract class GlentTokenizer extends Analyzer implements Function<String, List<String>>, Externalizable {
    /**
     * Tokenize {@code input}, optionally inserting a marker token wherever the
     * analyzer dropped characters between two adjacent tokens.
     *
     * @param input string to be tokenized
     * @param omitted token to replace a sequence of omitted characters (whitespace,
     *                punct, etc.); usually space, or null (to ignore omitted chars)
     *
     * @return a list of tokens as strings
     */
    @SneakyThrows
    public final List<String> tokenize(String input, String omitted) {
        Integer lastEndOffset = null;
        try (TokenStream ts = this.tokenStream("", input)) {
            ts.reset();
            ImmutableList.Builder<String> builder = ImmutableList.builder();
            // addAttribute registers the attribute if the chain does not already
            // provide it; getAttribute would throw IllegalArgumentException instead.
            CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            while (ts.incrementToken()) {
                if (omitted != null) {
                    // A gap between the previous token's end offset and this token's
                    // start offset means the analyzer skipped characters; mark it.
                    if (lastEndOffset != null && lastEndOffset < offsetAtt.startOffset()) {
                        builder.add(omitted);
                    }
                    lastEndOffset = offsetAtt.endOffset();
                }
                builder.add(charTermAttribute.toString());
            }
            // TokenStream consumer contract: end() must be called after the last
            // incrementToken() returns false and before close().
            ts.end();
            return builder.build();
        }
    }

    /**
     * Tokenize {@code input} without marking omitted characters.
     *
     * @param input string to be tokenized
     * @return a list of tokens as strings
     */
    public final List<String> tokenize(String input) {
        return this.tokenize(input, null);
    }

    /**
     * Two-argument convenience mirroring {@link #tokenize(String, String)}.
     *
     * @param input string to be tokenized
     * @param omitted marker for omitted character runs, or null to ignore them
     * @return a list of tokens as strings
     */
    public List<String> apply(String input, String omitted) {
        return tokenize(input, omitted);
    }

    /** {@code Function} entry point; equivalent to {@link #tokenize(String)}. */
    @Override
    public List<String> apply(String input) {
        return tokenize(input, null);
    }

    /** No-op: subclasses are assumed stateless, nothing to restore. */
    @Override
    public void readExternal(ObjectInput objectInput) throws IOException, ClassNotFoundException {
    }

    /** No-op: subclasses are assumed stateless, nothing to persist. */
    @Override
    public void writeExternal(ObjectOutput objectOutput) throws IOException {
    }
}