1 package org.wikimedia.search.extra.analysis.filters;
2
3 import java.io.IOException;
4
5 import org.apache.lucene.analysis.TokenFilter;
6 import org.apache.lucene.analysis.TokenStream;
7 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
8 import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
9
10 import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
11
12 @SuppressFBWarnings(
13 value = "EQ_DOESNT_OVERRIDE_EQUALS",
14 justification = "equals() as defined in org.apache.lucene.util.AttributeSource seems strong enough.")
15 public class TermFreqTokenFilter extends TokenFilter {
16
17 public static final char DEFAULT_SPLIT_CHAR = '|';
18 public static final int DEFAULT_MAX_TF = 1000;
19
20 private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
21 private final TermFrequencyAttribute termFreq = addAttribute(TermFrequencyAttribute.class);
22
23 private final char splitChar;
24 private final int maxTF;
25
26 public TermFreqTokenFilter(TokenStream input) {
27 this(input, DEFAULT_SPLIT_CHAR, DEFAULT_MAX_TF);
28 }
29
30 public TermFreqTokenFilter(TokenStream input, char splitChar, int maxTF) {
31 super(input);
32 this.splitChar = splitChar;
33 assert maxTF > 0;
34 this.maxTF = maxTF;
35 }
36
37 @Override
38 public final boolean incrementToken() throws IOException {
39 if (!input.incrementToken()) {
40 return false;
41 }
42 int sepOffset = findSeparatorOffset();
43 if (sepOffset == -1) {
44 return true;
45 }
46
47 int freq = extractFreq(sepOffset);
48 if (freq == -1) {
49 return true;
50 }
51 termFreq.setTermFrequency(Math.max(freq, 1));
52
53 termAttribute.setLength(sepOffset);
54 return true;
55 }
56
57 private int findSeparatorOffset() {
58 for (int i = termAttribute.length() - 1; i > 0; i--) {
59 if (termAttribute.charAt(i) == splitChar) {
60 return i;
61 }
62 }
63 return -1;
64 }
65
66 private int extractFreq(int sepOffset) {
67 int m = -1;
68 int iter = 1;
69 for (int i = termAttribute.length() - 1; i > sepOffset; i--) {
70 int c = termAttribute.charAt(i) - '0';
71 if (c > 9 || c < 0) {
72 return -1;
73 }
74 if (m < 0) {
75 m = 0;
76 }
77 m += c*iter;
78 iter *= 10;
79 if (m > maxTF) {
80 return maxTF;
81 }
82 }
83 return m;
84 }
85 }