// Korean.java
package org.wikimedia.search.glent.analysis;
import java.io.Reader;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.ko.KoreanTokenizer;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter;
/**
 * Glent tokenizer for Korean text, built on Lucene's Nori {@link KoreanTokenizer}.
 *
 * <p>Input characters are first normalized by {@link #initReader(String, Reader)}:
 * a mapping char filter applies {@link #CHAR_MAP}, then a pattern char filter
 * strips combining diacritical marks. The token stream is lowercased and
 * empty tokens are dropped.
 */
public class Korean extends GlentTokenizer {
    // Removes combining marks in the range U+0300..U+0331 after char mapping.
    private static final Pattern REPLACE_PATTERN = Pattern.compile("[\u0300-\u0331]");
    private static final NormalizeCharMap CHAR_MAP = initKoreanCharMap();

    /**
     * Builds the pre-tokenization character map: middle dot (U+00B7) and
     * hangul araea (U+318D) become plain spaces, while soft hyphen (U+00AD)
     * and zero-width non-joiner (U+200C) are deleted outright.
     */
    private static NormalizeCharMap initKoreanCharMap() {
        NormalizeCharMap.Builder charMapBuilder = new NormalizeCharMap.Builder();
        // Word-separator-like marks become spaces.
        charMapBuilder.add("\u00B7", "\u0020");
        charMapBuilder.add("\u318D", "\u0020");
        // Invisible formatting characters are removed entirely.
        charMapBuilder.add("\u00AD", "");
        charMapBuilder.add("\u200C", "");
        return charMapBuilder.build();
    }

    @Override
    protected TokenStreamComponents createComponents(String s) {
        // Nori tokenizer with no user dictionary; compound tokens are
        // discarded in favor of their decompounded parts.
        KoreanTokenizer source = new KoreanTokenizer(
                TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY,
                null,
                KoreanTokenizer.DecompoundMode.DISCARD,
                false);
        // For ICU: new ICUNormalizer2Filter(source, Normalizer2.getNFKCCasefoldInstance())
        TokenStream lowered = new LowerCaseFilter(source);
        // Drop zero-length tokens; everything of length >= 1 passes through.
        TokenStream sized = new LengthFilter(lowered, 1, Integer.MAX_VALUE);
        return new TokenStreamComponents(source, sized);
    }

    @Override
    protected Reader initReader(String fieldName, Reader reader) {
        Reader mapped = new MappingCharFilter(CHAR_MAP, reader);
        return new PatternReplaceCharFilter(REPLACE_PATTERN, "", mapped);
    }
}