// GlentNormalizer.java
package org.wikimedia.search.glent.analysis;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.function.UnaryOperator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import lombok.SneakyThrows;
/**
 * Base class for string normalizers implemented as a Lucene analysis chain.
 *
 * <p>The complete input string is read by a {@link KeywordTokenizer} and then passed through
 * the stream returned by {@link #filters(TokenStream)}. The term of the single token that comes
 * out of that chain is the normalized string returned by {@link #apply(String)}.
 *
 * <p>This class declares no fields, so its {@link Externalizable} methods write and read
 * nothing. Note that {@link Externalizable} requires concrete subclasses to provide a public
 * no-argument constructor.
 */
public abstract class GlentNormalizer extends Analyzer implements UnaryOperator<String>, Externalizable {

    /**
     * Normalizes {@code input} by running it through this analyzer's token stream.
     *
     * @param input the string to normalize
     * @return the term of the single token produced for {@code input}, or the empty string
     *         when the stream produces no token at all
     * @throws IllegalStateException if the stream produces more than one token
     */
    @Override
    @SneakyThrows
    public final String apply(String input) {
        // createComponents ignores the field name, so the input is simply passed for both
        // arguments.
        try (TokenStream ts = this.tokenStream(input, input)) {
            ts.reset();
            CharTermAttribute charTermAttribute = ts.getAttribute(CharTermAttribute.class);
            if (!ts.incrementToken()) {
                ts.end();
                return "";
            }
            // Snapshot the term before advancing again: attribute contents are only
            // guaranteed to describe the current token, and end() may clear them.
            String normalized = charTermAttribute.toString();
            if (ts.incrementToken()) {
                throw new IllegalStateException("GlentNormalizer should only generate one output token per input string.");
            }
            // Complete the TokenStream consumer workflow (reset, incrementToken*, end, close);
            // close() is handled by try-with-resources.
            ts.end();
            return normalized;
        }
    }

    /**
     * Builds the analysis chain: a {@link KeywordTokenizer} wrapped by {@link #filters(TokenStream)}.
     *
     * @param fieldName ignored; the same chain is built for every field name
     * @return the tokenizer together with the filtered stream that consumes it
     */
    @Override
    protected final TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new KeywordTokenizer();
        return new TokenStreamComponents(source, filters(source));
    }

    /**
     * Extension point for subclasses to wrap the tokenizer output in token filters.
     * The default implementation applies no filtering and returns {@code source} unchanged.
     *
     * <p>{@link #apply(String)} throws if the returned stream yields more than one token
     * for an input.
     *
     * @param source the stream produced by the tokenizer
     * @return the stream whose single token is used as the normalized output
     */
    protected TokenStream filters(TokenStream source) {
        return source;
    }

    /**
     * Reads nothing: this class declares no fields to restore.
     * NOTE(review): subclasses that add state presumably need to override this together with
     * {@link #writeExternal(ObjectOutput)} - confirm against the concrete normalizers.
     */
    @Override
    public void readExternal(ObjectInput objectInput) throws IOException, ClassNotFoundException {
        // Intentionally empty.
    }

    /**
     * Writes nothing: this class declares no fields to persist.
     */
    @Override
    public void writeExternal(ObjectOutput objectOutput) throws IOException {
        // Intentionally empty.
    }
}