package org.wikimedia.search.extra.analysis.ukrainian;

import java.io.IOException;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;

/**
 * Token filter factory that wraps a {@link TokenStream} in a
 * {@code UkrainianStopFilter} configured with the bundled Ukrainian
 * stopword list.
 *
 * <p>The stopword list is read once, when this class is initialized, from the
 * {@code stopwords.txt} resource resolved relative to this class, and is
 * shared by every filter this factory creates.
 */
public class UkrainianStopFilterFactory extends AbstractTokenFilterFactory {

    /** Name of the stopword resource, resolved relative to this class. */
    private static final String STOPWORDS_RESOURCE = "stopwords.txt";

    /** Comment marker handed to {@link WordlistLoader} when reading the list. */
    private static final String COMMENT_MARKER = "#";

    /** Unmodifiable set of Ukrainian stopwords, loaded during class initialization. */
    protected static final CharArraySet UK_STOP = loadStopwords();

    /**
     * Creates the factory.
     *
     * @param indexSettings index settings, forwarded to the superclass
     * @param env node environment; not used by this factory
     * @param name name of the filter, forwarded to the superclass
     * @param settings filter settings, forwarded to the superclass
     */
    public UkrainianStopFilterFactory(
            IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
    }

    /**
     * Wraps the given stream in a stop filter backed by {@link #UK_STOP}.
     *
     * @param tokenStream the stream to filter
     * @return a new {@code UkrainianStopFilter} over {@code tokenStream}
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new UkrainianStopFilter(tokenStream, UK_STOP);
    }

    /**
     * Reads the stopword resource as UTF-8 and returns its words as an
     * unmodifiable set.
     *
     * @return the unmodifiable stopword set
     * @throws UncheckedIOException if reading the resource fails with an {@link IOException}
     */
    private static CharArraySet loadStopwords() {
        try (Reader in = IOUtils.getDecodingReader(
                UkrainianStopFilterFactory.class, STOPWORDS_RESOURCE, StandardCharsets.UTF_8)) {
            CharArraySet words = WordlistLoader.getWordSet(in, COMMENT_MARKER);
            return CharArraySet.unmodifiableSet(words);
        } catch (IOException ioe) {
            // Runs from a static initializer, so the checked exception has to be wrapped.
            throw new UncheckedIOException("Could not load the Ukrainian stopword list.", ioe);
        }
    }

}