// ICU.java

package org.wikimedia.search.glent.analysis;

import java.io.Reader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;

import com.ibm.icu.text.Normalizer2;

/**
 * ICU-based analysis chain.
 *
 * <p>Input text first passes through a {@link MappingCharFilter} that rewrites a
 * small set of punctuation characters (see {@link #initICUCharMap()}). The
 * filtered text is split by an {@link ICUTokenizer}, each token is normalized
 * with NFKC_Casefold, and finally a {@link LengthFilter} with bounds
 * {@code [1, Integer.MAX_VALUE]} is applied.
 */
public class ICU extends GlentTokenizer {
    /** NFKC_Casefold normalizer handed to every {@link ICUNormalizer2Filter} built here. */
    private static final Normalizer2 NFKCCF = Normalizer2.getNFKCCasefoldInstance();

    /** Character mapping applied to the raw reader before tokenization. */
    private static final NormalizeCharMap CHAR_MAP = initICUCharMap();

    /**
     * Builds the character map used by {@link #initReader(String, Reader)}.
     *
     * <p>Underscore and colon become a plain space; curly single quotes and the
     * modifier letter apostrophe become an ASCII apostrophe; curly double quotes
     * become an ASCII double quote.
     *
     * @return the immutable mapping produced by the builder
     */
    private static NormalizeCharMap initICUCharMap() {
        // elements of word_break_helper and near_space_flattener
        final String[][] replacements = {
            {"_", "\u0020"},
            {":", "\u0020"},
            {"\u2018", "'"},  // LEFT SINGLE QUOTATION MARK
            {"\u2019", "'"},  // RIGHT SINGLE QUOTATION MARK
            {"\u02BC", "'"},  // MODIFIER LETTER APOSTROPHE
            {"\u201C", "\""}, // LEFT DOUBLE QUOTATION MARK
            {"\u201D", "\""}, // RIGHT DOUBLE QUOTATION MARK
        };

        final NormalizeCharMap.Builder mappings = new NormalizeCharMap.Builder();
        for (String[] pair : replacements) {
            mappings.add(pair[0], pair[1]);
        }
        return mappings.build();
    }

    /**
     * Assembles the tokenizer and token filters for this analyzer.
     *
     * @param s the field name supplied by the framework; not consulted here
     * @return components whose source is the ICU tokenizer and whose sink is the
     *         length-filtered, NFKC_Casefold-normalized stream
     */
    @Override
    protected TokenStreamComponents createComponents(String s) {
        final ICUTokenizer source = new ICUTokenizer();
        final TokenStream normalized = new ICUNormalizer2Filter(source, NFKCCF);
        // Minimum length of 1: tokens left empty after normalization are not emitted.
        final TokenStream sink = new LengthFilter(normalized, 1, Integer.MAX_VALUE);
        return new TokenStreamComponents(source, sink);
    }

    /**
     * Wraps the incoming reader so the punctuation mapping runs before tokenization.
     *
     * @param fieldName the field being analyzed; not consulted here
     * @param reader the raw character source
     * @return {@code reader} wrapped in a {@link MappingCharFilter} over {@code CHAR_MAP}
     */
    @Override
    protected Reader initReader(String fieldName, Reader reader) {
        final Reader mapped = new MappingCharFilter(CHAR_MAP, reader);
        return mapped;
    }

}