// SimplifiedChinese.java
package org.wikimedia.search.glent.analysis;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.HMMChineseTokenizer;
/**
 * Analysis chain for Simplified Chinese text, built on Lucene's Smart Chinese
 * {@link HMMChineseTokenizer}.
 *
 * <p>Tokens flow through three stages: HMM-based word segmentation, lower-casing,
 * and removal of every token equal to {@code ","}.
 *
 * <p>NOTE(review): {@code GlentTokenizer} is declared elsewhere; it is presumably a
 * Lucene {@code Analyzer} subclass, since this class overrides
 * {@code createComponents} -- confirm against its definition.
 */
public class SimplifiedChinese extends GlentTokenizer {
    /**
     * Stop set holding only the comma. Smart CN converts *all* punctuation to commas,
     * so dropping this single token removes punctuation from the stream.
     */
    private static final CharArraySet SMART_CN_STOP = StopFilter.makeStopSet(",");

    /**
     * Builds the tokenizer and filter chain used for Simplified Chinese.
     *
     * @param s the field name supplied by the framework; not consulted here, the same
     *          chain is built for every field
     * @return components whose source is the HMM tokenizer and whose sink is the
     *         lower-cased, comma-free token stream
     */
    @Override
    protected TokenStreamComponents createComponents(String s) {
        final HMMChineseTokenizer source = new HMMChineseTokenizer();
        // For ICU: new ICUNormalizer2Filter(tokenizer, Normalizer2.getNFKCCasefoldInstance())
        final TokenStream lowerCased = new LowerCaseFilter(source);
        // Strip the commas that Smart CN substitutes for punctuation.
        final TokenStream withoutPunctuation = new StopFilter(lowerCased, SMART_CN_STOP);
        return new TokenStreamComponents(source, withoutPunctuation);
    }
}