// ICU.java
package org.wikimedia.search.glent.analysis;
import java.io.Reader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import com.ibm.icu.text.Normalizer2;
/**
 * {@link GlentTokenizer} variant that segments text with {@link ICUTokenizer}.
 *
 * <p>Processing order:
 * <ol>
 *   <li>a {@link MappingCharFilter} rewrites a small set of punctuation characters
 *       in the raw input (see {@code CHAR_REPLACEMENTS});</li>
 *   <li>{@link ICUTokenizer} splits the filtered text into tokens;</li>
 *   <li>{@link ICUNormalizer2Filter} applies NFKC case-fold normalization to each token;</li>
 *   <li>{@link LengthFilter} keeps only tokens whose length is in
 *       {@code [1, Integer.MAX_VALUE]}.</li>
 * </ol>
 */
public class ICU extends GlentTokenizer {
    /** Shared NFKC case-fold normalizer handed to every normalization filter. */
    private static final Normalizer2 NFKC_CASEFOLD = Normalizer2.getNFKCCasefoldInstance();

    /**
     * Character rewrites as {@code {match, replacement}} pairs, registered in this order.
     * Must be declared before {@code CHAR_MAP}, whose initializer reads it.
     */
    private static final String[][] CHAR_REPLACEMENTS = {
        // elements of word_break_helper and near_space_flattener
        {"_", "\u0020"},
        {":", "\u0020"},
        {"\u2018", "'"},  // LEFT SINGLE QUOTATION MARK
        {"\u2019", "'"},  // RIGHT SINGLE QUOTATION MARK
        {"\u02BC", "'"},  // MODIFIER LETTER APOSTROPHE
        {"\u201C", "\""}, // LEFT DOUBLE QUOTATION MARK
        {"\u201D", "\""}, // RIGHT DOUBLE QUOTATION MARK
    };

    /** Compiled form of {@code CHAR_REPLACEMENTS}, built once and reused for every reader. */
    private static final NormalizeCharMap CHAR_MAP = buildCharMap();

    /**
     * Compiles the replacement table into a {@link NormalizeCharMap}.
     *
     * @return the map consumed by the char filter in {@link #initReader(String, Reader)}
     */
    private static NormalizeCharMap buildCharMap() {
        NormalizeCharMap.Builder mappings = new NormalizeCharMap.Builder();
        for (String[] pair : CHAR_REPLACEMENTS) {
            mappings.add(pair[0], pair[1]);
        }
        return mappings.build();
    }

    /**
     * Assembles the tokenizer and its filter chain.
     *
     * @param s field name supplied by the framework; not used here
     * @return components pairing a fresh {@link ICUTokenizer} with its filtered stream
     */
    @Override
    protected TokenStreamComponents createComponents(String s) {
        ICUTokenizer source = new ICUTokenizer();
        // Normalize first, then drop anything that falls outside the length range.
        TokenStream sink = new LengthFilter(
                new ICUNormalizer2Filter(source, NFKC_CASEFOLD),
                1,
                Integer.MAX_VALUE);
        return new TokenStreamComponents(source, sink);
    }

    /**
     * Wraps the incoming reader so the character rewrites are applied to the input.
     *
     * @param fieldName field being analyzed; not used here
     * @param reader raw input
     * @return {@code reader} wrapped in a {@link MappingCharFilter} over {@code CHAR_MAP}
     */
    @Override
    protected Reader initReader(String fieldName, Reader reader) {
        return new MappingCharFilter(CHAR_MAP, reader);
    }
}