// GlentRetokenizer.java
package org.wikimedia.search.glent.analysis;
import java.io.Serializable;
import java.util.function.UnaryOperator;
/**
 * String-to-string adapter around a {@link GlentTokenizer}: runs the tokenizer over
 * the input and flattens whatever it returns back into a single string.
 *
 * <p>Two modes are available, chosen at construction time via {@code sepTok}:
 * <ul>
 *   <li>{@code true} - the tokens from {@code tokenizer.tokenize(input)} are joined
 *       with the configured joiner placed between every pair of tokens;</li>
 *   <li>{@code false} (the default) - the joiner is passed to
 *       {@code tokenizer.tokenize(input, joiner)} and the result is concatenated
 *       with no additional separator. Per the original author's note this keeps the
 *       original token separation, especially for CJK; the exact handling of the
 *       joiner is up to the tokenizer (NOTE(review): confirm against
 *       GlentTokenizer).</li>
 * </ul>
 *
 * <p>The class implements {@link UnaryOperator} so that it can be used wherever a
 * plain string transformation is expected, and {@link Serializable} so that
 * instances can be serialized together with their tokenizer.
 */
public class GlentRetokenizer implements UnaryOperator<String>, Serializable {
    // Field names are kept stable: default Java serialization identifies fields by name.
    private final GlentTokenizer tokenizer;
    private final String joiner;
    private final boolean separateTokens;

    /**
     * Creates a retokenizer that uses a single space (U+0020) as joiner and does
     * not force token separation.
     *
     * @param tok tokenizer to delegate to
     */
    public GlentRetokenizer(GlentTokenizer tok) {
        this(tok, "\u0020", false);
    }

    /**
     * Creates a retokenizer with a custom joiner that does not force token
     * separation.
     *
     * @param tok  tokenizer to delegate to
     * @param join string used as the joiner
     */
    public GlentRetokenizer(GlentTokenizer tok, String join) {
        this(tok, join, false);
    }

    /**
     * Creates a fully configured retokenizer.
     *
     * @param tok    tokenizer to delegate to
     * @param join   string used as the joiner
     * @param sepTok {@code true} to put the joiner between every token;
     *               {@code false} to keep the original separation (esp. for CJK)
     */
    public GlentRetokenizer(GlentTokenizer tok, String join, boolean sepTok) {
        this.tokenizer = tok;
        this.joiner = join;
        this.separateTokens = sepTok;
    }

    /**
     * Functional-interface entry point; simply delegates to
     * {@link #retokenize(String)}.
     *
     * @param input text to process
     * @return the retokenized text
     */
    @Override
    public final String apply(String input) {
        return retokenize(input);
    }

    /**
     * Tokenizes {@code input} and reassembles the result into one string according
     * to the mode selected at construction time.
     *
     * @param input text to process
     * @return the tokenizer output flattened into a single string
     */
    public final String retokenize(String input) {
        if (!separateTokens) {
            // The tokenizer receives the joiner itself; its output is glued
            // together with an empty delimiter.
            return String.join("", tokenizer.tokenize(input, joiner));
        }
        // Explicit separation: the joiner goes between every pair of tokens.
        return String.join(joiner, tokenizer.tokenize(input));
    }
}