package org.wikimedia.search.extra.analysis.turkish;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
import org.junit.Test;

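/**
 * Tests for {@link BetterApostropheFilter}: the filter should strip an
 * apostrophe (straight ' or curly ’) and the suffix that follows it, so
 * that "Wikipedia'nın" and "ABD’de" reduce to "wikipedia" and "abd" after
 * Turkish lowercasing.
 */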
public class BetterApostropheFilterTest extends BaseTokenStreamTestCase {

    @Test
    public void simpleTest() throws IOException {
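        // The input mixes a straight apostrophe (') and a curly one (’);
        // the expected offsets below span the original tokens, suffixes included.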
        String input = "Wikipedia'nın sunucuları ABD’de";
        try (Analyzer analyzer = newBetterApostrophe()) {
            TokenStream ts = analyzer.tokenStream("", input);
            assertTokenStreamContents(ts,
                    new String[]{"wikipedia", "sunucuları", "abd"},
                    new int[]{0, 14, 25}, // start offsets
                    new int[]{13, 24, 31}, // end offsets
                    null, // types (not supported)
                    new int[]{1, 1, 1}, // position increments
                    null, // position lengths (not supported)
                    31, // final offset
                    null, // keywordAtts (not supported)
                    true); // offsets are correct
        }
    }

    private Analyzer newBetterApostrophe() {
        return new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
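                // Chain: whitespace tokenizer → Turkish lowercasing →
                // apostrophe handling. Whitespace tokenization keeps
                // apostrophes inside tokens so the filter can see them.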
                Tokenizer tok = new WhitespaceTokenizer();
                TokenStream ts = new TurkishLowerCaseFilter(tok);
                ts = new BetterApostropheFilter(ts);
                return new TokenStreamComponents(tok, ts);
            }
        };
    }

}