// CirrusNearMatchNormalizer.java

package org.wikimedia.search.glent.analysis;

import java.io.Reader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;

import com.ibm.icu.text.Normalizer2;

/**
 * Normalizer that rewrites a handful of "near space" punctuation characters
 * to a plain space, applies NFKC case folding to the input text, and trims
 * leading and trailing whitespace from every token.
 *
 * <p>NOTE(review): the class name suggests this is meant to mirror the
 * CirrusSearch near-match analysis chain; confirm against that configuration
 * before changing the character list or the filter order.
 */
public class CirrusNearMatchNormalizer extends GlentNormalizer {
    /** Replacement text for every flattened character: a single ASCII space. */
    private static final String SPACE = "\u0020";

    /** ICU normalizer performing NFKC normalization combined with case folding. */
    private static final Normalizer2 NFKCCF = Normalizer2.getNFKCCasefoldInstance();

    /** Character mapping that turns each near-space character into {@link #SPACE}. */
    private static final NormalizeCharMap NEAR_SPACE_FLATTENER = buildNearSpaceFlattener();

    /**
     * Builds the character map used by {@link #initReader(String, Reader)}.
     *
     * @return a map sending each listed punctuation character to a space
     */
    private static NormalizeCharMap buildNearSpaceFlattener() {
        final String[] nearSpaces = {
            "'",      // apostrophe
            "\u2019", // right single quote
            "\u02BC", // modifier letter apostrophe
            "_",      // underscore
            "-",      // hyphen
        };

        final NormalizeCharMap.Builder mappings = new NormalizeCharMap.Builder();
        for (String nearSpace : nearSpaces) {
            mappings.add(nearSpace, SPACE);
        }
        return mappings.build();
    }

    /**
     * Wraps the token stream in a {@link TrimFilter}, so whitespace at either
     * end of a token (for example a space produced by the flattening step) is
     * removed.
     *
     * @param source the token stream to decorate
     * @return {@code source} wrapped in a trimming filter
     */
    @Override
    protected TokenStream filters(TokenStream source) {
        final TokenStream trimmed = new TrimFilter(source);
        return trimmed;
    }

    /**
     * Installs the character filters in front of the raw reader. The
     * near-space mapping is applied first and NFKC case folding second, so
     * the folding step sees text that has already been flattened.
     *
     * @param fieldName name of the field being analyzed; not used here
     * @param reader the raw character source
     * @return the reader wrapped by both character filters
     */
    @Override
    protected Reader initReader(String fieldName, Reader reader) {
        final Reader flattened = new MappingCharFilter(NEAR_SPACE_FLATTENER, reader);
        return new ICUNormalizer2CharFilter(flattened, NFKCCF);
    }
}