GlentNormalizer.java

package org.wikimedia.search.glent.analysis;

import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.function.UnaryOperator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import lombok.SneakyThrows;

/**
 * Base class for string normalizers implemented as a Lucene analysis chain.
 *
 * <p>The chain is a {@link KeywordTokenizer} followed by whatever filters a
 * subclass adds through {@link #filters(TokenStream)}. The class is also a
 * {@code UnaryOperator<String>}, so a normalizer can be used directly as a
 * function from string to string via {@link #apply(String)}.
 *
 * <p>The {@link Externalizable} methods are intentionally empty: this class
 * writes and reads no state of its own.
 */
public abstract class GlentNormalizer extends Analyzer implements UnaryOperator<String>, Externalizable {
    /**
     * Runs {@code input} through the analysis chain and returns the single
     * resulting term.
     *
     * <p>Checked exceptions raised by the token stream (e.g. {@link IOException})
     * are rethrown undeclared via {@code @SneakyThrows}.
     *
     * @param input the string to normalize
     * @return the text of the one token produced by the chain, or the empty
     *         string if the chain produced no token at all
     * @throws IllegalStateException if the chain produces more than one token
     */
    @Override
    @SneakyThrows
    public final String apply(String input) {
        // createComponents() ignores its field name argument, so the input is
        // simply passed in that position as well.
        try (TokenStream ts = this.tokenStream(input, input)) {
            ts.reset();
            CharTermAttribute charTermAttribute = ts.getAttribute(CharTermAttribute.class);

            final String normalized;
            if (ts.incrementToken()) {
                // Copy the term now: attribute contents are only defined while the
                // most recent incrementToken() returned true, and the call below is
                // expected to return false.
                normalized = charTermAttribute.toString();
                if (ts.incrementToken()) {
                    throw new IllegalStateException("GlentNormalizer should only generate one output token per input string.");
                }
            } else {
                normalized = "";
            }

            // Complete the TokenStream consumer workflow (reset, incrementToken*, end,
            // close) so filters get their end-of-stream callback before close().
            ts.end();
            return normalized;
        }
    }

    /**
     * Builds the analysis chain: a {@link KeywordTokenizer} wrapped by the
     * subclass-provided {@link #filters(TokenStream)}.
     *
     * @param fieldName unused; the same chain is built for every field
     * @return the tokenizer together with the filtered stream reading from it
     */
    @Override
    protected final TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new KeywordTokenizer();
        return new TokenStreamComponents(source, filters(source));
    }

    /**
     * Extension point for subclasses to wrap the tokenizer output in token
     * filters. The default implementation adds no filters.
     *
     * <p>{@link #apply(String)} rejects chains that emit more than one token per
     * input, so filters added here must not split the token.
     *
     * @param source the stream to wrap
     * @return the stream to consume; by default {@code source} itself
     */
    protected TokenStream filters(TokenStream source) {
        return source;
    }

    /**
     * Reads nothing: this class has no externalized state.
     */
    @Override
    public void readExternal(ObjectInput objectInput) throws IOException, ClassNotFoundException {
    }

    /**
     * Writes nothing: this class has no externalized state.
     */
    @Override
    public void writeExternal(ObjectOutput objectOutput) throws IOException {
    }
}