// GlentRetokenizer.java

package org.wikimedia.search.glent.analysis;

import java.io.Serializable;
import java.util.function.UnaryOperator;

/**
 * Runs a string through a {@link GlentTokenizer} and glues the pieces it
 * returns back together into a single string.
 *
 * <p>Because it implements {@code UnaryOperator<String>}, an instance can be
 * passed anywhere a string-to-string function is expected; {@link #apply(String)}
 * just forwards to {@link #retokenize(String)}.
 */
public class GlentRetokenizer implements UnaryOperator<String>, Serializable {
    /** Tokenizer used to split the input. */
    private final GlentTokenizer tokenizer;
    /** Separator text used when putting the tokens back together. */
    private final String joiner;
    /**
     * When true, the joiner is placed between every token; when false, the
     * original separation is kept (esp. useful for CJK).
     */
    private final boolean separateTokens;

    /**
     * Convenience constructor: joins with a single space and keeps the
     * original token separation.
     *
     * @param tok tokenizer used to split the input
     */
    public GlentRetokenizer(GlentTokenizer tok) {
        this(tok, "\u0020");
    }

    /**
     * Convenience constructor: uses the given joiner and keeps the original
     * token separation (token separation defaults to false).
     *
     * @param tok  tokenizer used to split the input
     * @param join separator text
     */
    public GlentRetokenizer(GlentTokenizer tok, String join) {
        this(tok, join, false);
    }

    /**
     * Fully specified constructor.
     *
     * @param tok    tokenizer used to split the input
     * @param join   separator text
     * @param sepTok true to put the joiner between every token; false to
     *               keep the original separation, esp. for CJK
     */
    public GlentRetokenizer(GlentTokenizer tok, String join, boolean sepTok) {
        this.tokenizer = tok;
        this.joiner = join;
        this.separateTokens = sepTok;
    }

    /**
     * Functional-interface entry point; identical to {@link #retokenize(String)}.
     *
     * @param input text to retokenize
     * @return the retokenized text
     */
    @Override
    public final String apply(String input) {
        return retokenize(input);
    }

    /**
     * Tokenizes the input and reassembles the tokens into one string.
     *
     * @param input text to retokenize
     * @return the reassembled string
     */
    public final String retokenize(String input) {
        if (!separateTokens) {
            // The joiner is handed to the tokenizer and its output is
            // concatenated as-is. NOTE(review): presumably the tokenizer emits
            // the joiner only where the input had a token break - confirm in
            // GlentTokenizer.
            return String.join("", tokenizer.tokenize(input, joiner));
        }
        // Explicit separation: the joiner goes between every pair of tokens.
        return String.join(joiner, tokenizer.tokenize(input));
    }
}