GlentTokenizer.java

package org.wikimedia.search.glent.analysis;

import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.List;
import java.util.function.Function;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import com.google.common.collect.ImmutableList;

import lombok.SneakyThrows;

/**
 * An {@link Analyzer} that can also be used as a plain
 * {@code Function<String, List<String>>}: it runs its own analysis chain over
 * an input string and returns the resulting terms as a list of strings.
 *
 * <p>The class implements {@link Externalizable} with empty read/write methods;
 * this class itself declares no fields, so there is nothing to persist here.
 * NOTE(review): subclasses that add state presumably need to override
 * {@link #readExternal(ObjectInput)} / {@link #writeExternal(ObjectOutput)} —
 * confirm against the concrete implementations.
 */
public abstract class GlentTokenizer extends Analyzer implements Function<String, List<String>>, Externalizable {

    /** Sentinel for "no token has been seen yet"; real Lucene offsets are never negative. */
    private static final int NO_PREVIOUS_TOKEN = -1;

    /**
     * Tokenizes {@code input} with this analyzer and returns the terms in stream order.
     *
     * <p>When {@code omitted} is non-null, it is inserted between two consecutive
     * tokens whenever the second token starts after the end offset of the first,
     * i.e. whenever the analyzer dropped some characters between them. Nothing is
     * inserted before the first token or after the last one.
     *
     * @param input         string to be tokenized
     * @param omitted       token to replace a sequence of omitted characters (whitespace,
     *                      punct, etc.); usually space, or null (to ignore omitted chars)
     *
     * @return an immutable list of tokens as strings
     */
    @SneakyThrows
    public final List<String> tokenize(String input, String omitted) {
        try (TokenStream ts = this.tokenStream("", input)) {
            // addAttribute (rather than getAttribute) is the recommended call for consumers:
            // it returns the existing instance, or registers one if the chain has none,
            // instead of throwing IllegalArgumentException.
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            ImmutableList.Builder<String> tokens = ImmutableList.builder();
            int lastEndOffset = NO_PREVIOUS_TOKEN;

            ts.reset();
            while (ts.incrementToken()) {
                if (omitted != null) {
                    // A gap between the previous token's end and this token's start means
                    // the analyzer skipped characters; represent the gap with one marker.
                    if (lastEndOffset != NO_PREVIOUS_TOKEN && lastEndOffset < offsetAtt.startOffset()) {
                        tokens.add(omitted);
                    }
                    lastEndOffset = offsetAtt.endOffset();
                }
                tokens.add(termAtt.toString());
            }
            // The TokenStream contract requires end() after the last incrementToken()
            // and before close(), so the chain can perform its end-of-stream operations.
            ts.end();
            return tokens.build();
        }
    }

    /**
     * Tokenizes {@code input}, ignoring any characters omitted by the analyzer.
     *
     * @param input string to be tokenized
     * @return an immutable list of tokens as strings
     */
    public final List<String> tokenize(String input) {
        return this.tokenize(input, null);
    }

    /**
     * Two-argument counterpart of {@link #apply(String)}; delegates to
     * {@link #tokenize(String, String)}.
     *
     * @param input   string to be tokenized
     * @param omitted marker inserted where characters were omitted between tokens,
     *                or null to ignore omitted characters
     * @return an immutable list of tokens as strings
     */
    public List<String> apply(String input, String omitted) {
        return tokenize(input, omitted);
    }

    /**
     * {@link Function} entry point; tokenizes {@code input} without markers for
     * omitted characters.
     *
     * @param input string to be tokenized
     * @return an immutable list of tokens as strings
     */
    @Override
    public List<String> apply(String input) {
        return tokenize(input, null);
    }

    /**
     * Reads nothing: this class declares no fields to restore.
     */
    @Override
    public void readExternal(ObjectInput objectInput) throws IOException, ClassNotFoundException {
        // Intentionally empty.
    }

    /**
     * Writes nothing: this class declares no fields to persist.
     */
    @Override
    public void writeExternal(ObjectOutput objectOutput) throws IOException {
        // Intentionally empty.
    }
}