View Javadoc
1   package org.wikimedia.search.extra.analysis.filters;
2   
3   import java.io.IOException;
4   
5   import org.apache.lucene.analysis.TokenFilter;
6   import org.apache.lucene.analysis.TokenStream;
7   import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
8   import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
9   
10  import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
11  
12  @SuppressFBWarnings(
13          value = "EQ_DOESNT_OVERRIDE_EQUALS",
14          justification = "equals() as defined in org.apache.lucene.util.AttributeSource seems strong enough.")
15  public class TermFreqTokenFilter extends TokenFilter {
16  
17      public static final char DEFAULT_SPLIT_CHAR = '|';
18      public static final int DEFAULT_MAX_TF = 1000;
19  
20      private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
21      private final TermFrequencyAttribute termFreq = addAttribute(TermFrequencyAttribute.class);
22  
23      private final char splitChar;
24      private final int maxTF;
25  
26      public TermFreqTokenFilter(TokenStream input) {
27          this(input, DEFAULT_SPLIT_CHAR, DEFAULT_MAX_TF);
28      }
29  
30      public TermFreqTokenFilter(TokenStream input, char splitChar, int maxTF) {
31          super(input);
32          this.splitChar = splitChar;
33          assert maxTF > 0;
34          this.maxTF = maxTF;
35      }
36  
37      @Override
38      public final boolean incrementToken() throws IOException {
39          if (!input.incrementToken()) {
40              return false;
41          }
42          int sepOffset = findSeparatorOffset();
43          if (sepOffset == -1) {
44              return true;
45          }
46  
47          int freq = extractFreq(sepOffset);
48          if (freq == -1) {
49              return true;
50          }
51          termFreq.setTermFrequency(Math.max(freq, 1));
52          // We cannot store 0 as a term freq...
53          termAttribute.setLength(sepOffset);
54          return true;
55      }
56  
57      private int findSeparatorOffset() {
58          for (int i = termAttribute.length() - 1; i > 0; i--) {
59              if (termAttribute.charAt(i) == splitChar) {
60                  return i;
61              }
62          }
63          return -1;
64      }
65  
66      private int extractFreq(int sepOffset) {
67          int m = -1;
68          int iter = 1;
69          for (int i = termAttribute.length() - 1; i > sepOffset; i--) {
70              int c = termAttribute.charAt(i) - '0';
71              if (c > 9 || c < 0) {
72                  return -1;
73              }
74              if (m < 0) {
75                  m = 0;
76              }
77              m += c*iter;
78              iter *= 10;
79              if (m > maxTF) {
80                  return maxTF;
81              }
82          }
83          return m;
84      }
85  }