// Japanese.java
package org.wikimedia.search.glent.analysis;
import java.io.Reader;
import java.util.stream.IntStream;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
/**
 * Analyzer components for Japanese text: Kuromoji tokenization preceded by
 * fullwidth-digit normalization, followed by CJK width folding and lowercasing.
 */
public class Japanese extends GlentTokenizer {
    /** Pre-tokenization map folding fullwidth digits (U+FF10–U+FF19) to ASCII '0'–'9'. */
    private static final NormalizeCharMap CHAR_MAP = initJapaneseCharMap();

    /**
     * Builds the fullwidth-digit mapping, replicating this elasticsearch char filter:
     * <pre>
     * "type": "mapping",
     * "mappings": [
     *     "\uff10=>0", "\uff11=>1", "\uff12=>2", "\uff13=>3",
     *     "\uff14=>4", "\uff15=>5", "\uff16=>6", "\uff17=>7",
     *     "\uff18=>8", "\uff19=>9"
     * ]
     * </pre>
     *
     * @return char map translating the ten fullwidth digits to their ASCII equivalents
     */
    private static NormalizeCharMap initJapaneseCharMap() {
        int from = '\uff10'; // FULLWIDTH DIGIT ZERO
        int to = '0';
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        IntStream.range(0, 10).forEach(i -> builder.add(codePointToString(from + i), codePointToString(to + i)));
        return builder.build();
    }

    /**
     * Creates the token stream: Kuromoji segmentation (no user dictionary,
     * punctuation discarded, NORMAL mode), then CJK width folding and lowercasing.
     *
     * @param s field name (unused by the tokenizer chain)
     * @return tokenizer plus filter chain for Japanese analysis
     */
    @Override
    protected TokenStreamComponents createComponents(String s) {
        Tokenizer source = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.NORMAL);
        TokenStream result = new CJKWidthFilter(source);
        result = new LowerCaseFilter(result);
        return new TokenStreamComponents(source, result);
    }

    /**
     * Wraps the input reader so fullwidth digits are normalized to ASCII
     * before the tokenizer sees them.
     *
     * @param fieldName field being analyzed (unused)
     * @param reader raw input
     * @return reader applying {@link #CHAR_MAP}
     */
    @Override
    protected Reader initReader(String fieldName, Reader reader) {
        return new MappingCharFilter(CHAR_MAP, reader);
    }

    /**
     * Converts a single Unicode code point to its String form.
     * Can be replaced with {@code Character.toString(int)} once Java 11+ is the baseline.
     *
     * @param codePoint a valid Unicode code point
     * @return the one- or two-char String for the code point
     */
    private static String codePointToString(int codePoint) {
        return String.valueOf(Character.toChars(codePoint));
    }
}