View Javadoc
1   package org.wikimedia.search.extra.analysis.ukrainian;
2   
3   import java.io.IOException;
4   import java.io.Reader;
5   import java.io.UncheckedIOException;
6   import java.nio.charset.StandardCharsets;
7   
8   import org.apache.lucene.analysis.CharArraySet;
9   import org.apache.lucene.analysis.TokenStream;
10  import org.apache.lucene.analysis.WordlistLoader;
11  import org.apache.lucene.util.IOUtils;
12  import org.elasticsearch.common.settings.Settings;
13  import org.elasticsearch.env.Environment;
14  import org.elasticsearch.index.IndexSettings;
15  import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
16  
17  public class UkrainianStopFilterFactory extends AbstractTokenFilterFactory {
18  
19      protected static final CharArraySet UK_STOP = getStopwords();
20  
21      public UkrainianStopFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
22          super(indexSettings, name, settings);
23      }
24  
25      @Override public TokenStream create(TokenStream tokenStream) {
26          return new UkrainianStopFilter(tokenStream, UK_STOP);
27      }
28  
29      private static CharArraySet getStopwords() {
30          try (
31              Reader reader = IOUtils.getDecodingReader(UkrainianStopFilterFactory.class, "stopwords.txt", StandardCharsets.UTF_8)
32          ) {
33              return CharArraySet.unmodifiableSet(WordlistLoader.getWordSet(reader, "#"));
34          } catch (IOException e) {
35              throw new UncheckedIOException("Could not load the Ukrainian stopword list.", e);
36          }
37      }
38  
39  }