View Javadoc
1   package org.wikimedia.search.extra.termfreq;
2   
3   import static org.hamcrest.CoreMatchers.containsString;
4   import static org.wikimedia.search.extra.util.ConcreteIntPredicate.eq;
5   import static org.wikimedia.search.extra.util.ConcreteIntPredicate.gt;
6   import static org.wikimedia.search.extra.util.ConcreteIntPredicate.gte;
7   import static org.wikimedia.search.extra.util.ConcreteIntPredicate.lt;
8   import static org.wikimedia.search.extra.util.ConcreteIntPredicate.lte;
9   
10  import java.io.IOException;
11  
12  import org.apache.lucene.analysis.Analyzer;
13  import org.apache.lucene.analysis.TokenStream;
14  import org.apache.lucene.analysis.Tokenizer;
15  import org.apache.lucene.analysis.core.WhitespaceTokenizer;
16  import org.apache.lucene.document.Document;
17  import org.apache.lucene.document.Field;
18  import org.apache.lucene.document.FieldType;
19  import org.apache.lucene.document.StoredField;
20  import org.apache.lucene.index.IndexOptions;
21  import org.apache.lucene.index.IndexReader;
22  import org.apache.lucene.index.RandomIndexWriter;
23  import org.apache.lucene.index.Term;
24  import org.apache.lucene.search.BooleanClause;
25  import org.apache.lucene.search.BooleanQuery;
26  import org.apache.lucene.search.Explanation;
27  import org.apache.lucene.search.IndexSearcher;
28  import org.apache.lucene.search.ScoreDoc;
29  import org.apache.lucene.search.TopDocs;
30  import org.apache.lucene.store.Directory;
31  import org.apache.lucene.util.LuceneTestCase;
32  import org.hamcrest.Matchers;
33  import org.junit.After;
34  import org.junit.Before;
35  import org.wikimedia.search.extra.analysis.filters.TermFreqTokenFilter;
36  
37  @SuppressWarnings("checkstyle:classfanoutcomplexity")
38  public class TermFreqFilterQueryTest extends LuceneTestCase {
39  
40      private IndexSearcher searcherUnderTest;
41      private RandomIndexWriter indexWriterUnderTest;
42      private IndexReader indexReaderUnderTest;
43      private Directory dirUnderTest;
44      private int nbDocs;
45  
46      @Before
47      public void setupIndex() throws IOException {
48          dirUnderTest = newDirectory();
49          nbDocs = random().nextInt(100) + 10;
50          Analyzer analyzer = new Analyzer() {
51              @Override
52              protected TokenStreamComponents createComponents(String s) {
53                  Tokenizer tok = new WhitespaceTokenizer();
54                  TokenStream ts = new TermFreqTokenFilter(tok, '|', nbDocs);
55                  return new TokenStreamComponents(tok, ts);
56              }
57          };
58          indexWriterUnderTest = new RandomIndexWriter(random(), dirUnderTest, newIndexWriterConfig(analyzer));
59  
60          int minDocs = 10;
61          nbDocs = random().nextInt(100) + minDocs;
62          for (int i = 0; i < nbDocs; i++) {
63              int freq = i + 1;
64              Document doc = new Document();
65  
66              doc.add(new StoredField("freq", freq));
67              FieldType type = new FieldType();
68              type.setStored(false);
69              type.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
70              type.setStoreTermVectors(false);
71              type.freeze();
72              Field f = new Field("main_field", "word1|" + freq + " word2|" + (nbDocs - i), type);
73              doc.add(f);
74              indexWriterUnderTest.addDocument(doc);
75          }
76  
77          indexWriterUnderTest.commit();
78          indexWriterUnderTest.flush();
79  
80          indexReaderUnderTest = indexWriterUnderTest.getReader();
81          searcherUnderTest = newSearcher(indexReaderUnderTest);
82      }
83  
84      public void test() throws IOException {
85          Term word1 = new Term("main_field", "word1");
86          Term word2 = new Term("main_field", "word2");
87  
88          TermFreqFilterQuery tQuery = new TermFreqFilterQuery(word1, eq(1));
89          assertEquals(1, searcherUnderTest.count(tQuery));
90  
91          tQuery = new TermFreqFilterQuery(word2, eq(1));
92          assertEquals(1, searcherUnderTest.count(tQuery));
93  
94          tQuery = new TermFreqFilterQuery(word1, gte(nbDocs));
95          assertEquals(1, searcherUnderTest.count(tQuery));
96  
97          tQuery = new TermFreqFilterQuery(word2, gt(nbDocs + 1));
98          assertEquals(0, searcherUnderTest.count(tQuery));
99  
100         tQuery = new TermFreqFilterQuery(word1, lte(nbDocs));
101         assertEquals(nbDocs, searcherUnderTest.count(tQuery));
102 
103         tQuery = new TermFreqFilterQuery(word2, lt(nbDocs));
104         assertEquals(nbDocs - 1, searcherUnderTest.count(tQuery));
105 
106         tQuery = new TermFreqFilterQuery(word1, gte(1)
107                 .and(lte(nbDocs)));
108         assertEquals(nbDocs, searcherUnderTest.count(tQuery));
109 
110         tQuery = new TermFreqFilterQuery(word2, gte(1)
111                 .and(lte(nbDocs)));
112         assertEquals(nbDocs, searcherUnderTest.count(tQuery));
113 
114         tQuery = new TermFreqFilterQuery(word1, gt(1)
115                 .and(lt(nbDocs)));
116         assertEquals(nbDocs - 2, searcherUnderTest.count(tQuery));
117 
118         tQuery = new TermFreqFilterQuery(word1, gte(1)
119                 .and(lt(nbDocs)));
120         assertEquals(nbDocs - 1, searcherUnderTest.count(tQuery));
121 
122         tQuery = new TermFreqFilterQuery(word1, gt(1)
123                 .and(lte(nbDocs)));
124         assertEquals(nbDocs - 1, searcherUnderTest.count(tQuery));
125     }
126 
127     public void testScoring() throws IOException {
128         Term word1 = new Term("main_field", "word1");
129         Term word2 = new Term("main_field", "word2");
130 
131         TermFreqFilterQuery tQuery = new TermFreqFilterQuery(word1,
132                 gte(5).and(lte(nbDocs)));
133         TopDocs docs = searcherUnderTest.search(tQuery, random().nextInt(10) + 10);
134         assertEquals(nbDocs - 4, docs.totalHits.value);
135         int freq = Integer.MAX_VALUE;
136         for (ScoreDoc doc : docs.scoreDocs) {
137             int nfreq = searcherUnderTest.doc(doc.doc).getField("freq").numericValue().intValue();
138             assertThat(freq, Matchers.greaterThan(nfreq));
139             freq = nfreq;
140         }
141 
142 
143         // Filter
144         TermFreqFilterQuery tQuery1 = new TermFreqFilterQuery(word1,
145                 gte(5).and(lte(nbDocs)));
146         // reverse scoring
147         TermFreqFilterQuery tQuery2 = new TermFreqFilterQuery(word2,
148                 gte(1).and(lte(nbDocs)));
149         BooleanQuery.Builder bq = new BooleanQuery.Builder();
150         bq.add(new BooleanClause(tQuery1, BooleanClause.Occur.FILTER));
151         bq.add(new BooleanClause(tQuery2, BooleanClause.Occur.MUST));
152         docs = searcherUnderTest.search(bq.build(), random().nextInt(10) + 10);
153         assertEquals(nbDocs - 4, docs.totalHits.value);
154         freq = Integer.MIN_VALUE;
155         for (ScoreDoc doc : docs.scoreDocs) {
156             int nfreq = searcherUnderTest.doc(doc.doc).getField("freq").numericValue().intValue();
157             assertThat(nfreq, Matchers.greaterThan(freq));
158             freq = nfreq;
159         }
160     }
161 
162     public void testsUnknown() throws IOException {
163         TermFreqFilterQuery tQuery = new TermFreqFilterQuery(new Term("main_field", "unknown"),
164                 gte(5).and(lte(nbDocs)));
165         assertEquals(0, searcherUnderTest.count(tQuery));
166         tQuery = new TermFreqFilterQuery(new Term("unknown_field", "unknown"),
167                 gte(5).and(lte(nbDocs)));
168         assertEquals(0, searcherUnderTest.count(tQuery));
169     }
170 
171     public void testExplain() throws IOException {
172         Term word1 = new Term("main_field", "word1");
173         TermFreqFilterQuery tQuery = new TermFreqFilterQuery(word1, eq(1));
174         TopDocs docs = searcherUnderTest.search(tQuery, 1);
175         assertEquals(1, docs.totalHits.value);
176         Explanation exp = searcherUnderTest.explain(tQuery, docs.scoreDocs[0].doc);
177         assertTrue(exp.isMatch());
178         assertThat(exp.getDescription(), containsString("1 = 1 (main_field:word1)"));
179 
180         tQuery = new TermFreqFilterQuery(word1, eq(nbDocs + 10));
181         exp = searcherUnderTest.explain(tQuery, docs.scoreDocs[0].doc);
182         assertFalse(exp.isMatch());
183         assertThat(exp.getDescription(), containsString("1 = " + (nbDocs + 10) + " (main_field:word1)"));
184 
185         tQuery = new TermFreqFilterQuery(new Term("unk", "unk"), eq(1));
186         exp = searcherUnderTest.explain(tQuery, docs.scoreDocs[0].doc);
187         assertFalse(exp.isMatch());
188         assertThat(exp.getDescription(), containsString("(unk:unk)"));
189     }
190 
191     @After
192     public void closeStuff() throws IOException {
193         indexReaderUnderTest.close();
194         indexWriterUnderTest.close();
195         dirUnderTest.close();
196     }
197 }