// Japanese.java

package org.wikimedia.search.glent.analysis;

import java.io.Reader;
import java.util.stream.IntStream;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;

/**
 * Analyzer components for Japanese text: maps fullwidth digits to ASCII in the
 * reader, then tokenizes with Kuromoji ({@link JapaneseTokenizer}), folds CJK
 * width variants, and lowercases.
 */
public class Japanese extends GlentTokenizer {
    /** Char map translating fullwidth digits U+FF10..U+FF19 to ASCII '0'..'9'. */
    private static final NormalizeCharMap CHAR_MAP = initJapaneseCharMap();

    /**
     * Builds the fullwidth-to-ASCII digit map, mirroring the elasticsearch
     * "mapping" char filter configuration shown below.
     *
     * @return char map with the ten digit mappings
     */
    private static NormalizeCharMap initJapaneseCharMap() {
        /*
         * 		"type": "mapping",
         * 		"mappings": [
         * 			"\uff10=>0", "\uff11=>1", "\uff12=>2", "\uff13=>3",
         * 			"\uff14=>4", "\uff15=>5", "\uff16=>6", "\uff17=>7",
         * 			"\uff18=>8", "\uff19=>9",
         * 		]
         */
        int from = '\uff10'; // FULLWIDTH DIGIT ZERO
        int to = '0';
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        IntStream.range(0, 10).forEach(i -> builder.add(codePointToString(from + i), codePointToString(to + i)));
        return builder.build();
    }

    /**
     * Creates the token stream: Kuromoji tokenization in NORMAL mode (no user
     * dictionary, punctuation discarded), then CJK width folding and lowercasing.
     *
     * @param s field name (unused by this analyzer)
     * @return tokenizer and filter chain
     */
    @Override
    protected TokenStreamComponents createComponents(String s) {
        Tokenizer source = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.NORMAL);
        TokenStream result = new CJKWidthFilter(source);
        result = new LowerCaseFilter(result);
        return new TokenStreamComponents(source, result);
    }

    /**
     * Applies the fullwidth-digit mapping before the tokenizer sees the input.
     *
     * @param fieldName field being analyzed (unused)
     * @param reader raw character stream
     * @return reader wrapped with {@link MappingCharFilter} using {@link #CHAR_MAP}
     */
    @Override
    protected Reader initReader(String fieldName, Reader reader) {
        return new MappingCharFilter(CHAR_MAP, reader);
    }

    /**
     * Converts a single code point to its string form.
     *
     * @param codePoint Unicode code point
     * @return string containing exactly that code point
     */
    private static String codePointToString(int codePoint) {
        // replace with Character.toString(int) when available (added in Java 11)
        return String.valueOf(Character.toChars(codePoint));
    }
}