1 package org.wikimedia.search.extra.termfreq;
2
3 import static org.hamcrest.CoreMatchers.containsString;
4 import static org.wikimedia.search.extra.util.ConcreteIntPredicate.eq;
5 import static org.wikimedia.search.extra.util.ConcreteIntPredicate.gt;
6 import static org.wikimedia.search.extra.util.ConcreteIntPredicate.gte;
7 import static org.wikimedia.search.extra.util.ConcreteIntPredicate.lt;
8 import static org.wikimedia.search.extra.util.ConcreteIntPredicate.lte;
9
10 import java.io.IOException;
11
12 import org.apache.lucene.analysis.Analyzer;
13 import org.apache.lucene.analysis.TokenStream;
14 import org.apache.lucene.analysis.Tokenizer;
15 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
16 import org.apache.lucene.document.Document;
17 import org.apache.lucene.document.Field;
18 import org.apache.lucene.document.FieldType;
19 import org.apache.lucene.document.StoredField;
20 import org.apache.lucene.index.IndexOptions;
21 import org.apache.lucene.index.IndexReader;
22 import org.apache.lucene.index.RandomIndexWriter;
23 import org.apache.lucene.index.Term;
24 import org.apache.lucene.search.BooleanClause;
25 import org.apache.lucene.search.BooleanQuery;
26 import org.apache.lucene.search.Explanation;
27 import org.apache.lucene.search.IndexSearcher;
28 import org.apache.lucene.search.ScoreDoc;
29 import org.apache.lucene.search.TopDocs;
30 import org.apache.lucene.store.Directory;
31 import org.apache.lucene.util.LuceneTestCase;
32 import org.hamcrest.Matchers;
33 import org.junit.After;
34 import org.junit.Before;
35 import org.wikimedia.search.extra.analysis.filters.TermFreqTokenFilter;
36
37 @SuppressWarnings("checkstyle:classfanoutcomplexity")
38 public class TermFreqFilterQueryTest extends LuceneTestCase {
39
40 private IndexSearcher searcherUnderTest;
41 private RandomIndexWriter indexWriterUnderTest;
42 private IndexReader indexReaderUnderTest;
43 private Directory dirUnderTest;
44 private int nbDocs;
45
46 @Before
47 public void setupIndex() throws IOException {
48 dirUnderTest = newDirectory();
49 nbDocs = random().nextInt(100) + 10;
50 Analyzer analyzer = new Analyzer() {
51 @Override
52 protected TokenStreamComponents createComponents(String s) {
53 Tokenizer tok = new WhitespaceTokenizer();
54 TokenStream ts = new TermFreqTokenFilter(tok, '|', nbDocs);
55 return new TokenStreamComponents(tok, ts);
56 }
57 };
58 indexWriterUnderTest = new RandomIndexWriter(random(), dirUnderTest, newIndexWriterConfig(analyzer));
59
60 int minDocs = 10;
61 nbDocs = random().nextInt(100) + minDocs;
62 for (int i = 0; i < nbDocs; i++) {
63 int freq = i + 1;
64 Document doc = new Document();
65
66 doc.add(new StoredField("freq", freq));
67 FieldType type = new FieldType();
68 type.setStored(false);
69 type.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
70 type.setStoreTermVectors(false);
71 type.freeze();
72 Field f = new Field("main_field", "word1|" + freq + " word2|" + (nbDocs - i), type);
73 doc.add(f);
74 indexWriterUnderTest.addDocument(doc);
75 }
76
77 indexWriterUnderTest.commit();
78 indexWriterUnderTest.flush();
79
80 indexReaderUnderTest = indexWriterUnderTest.getReader();
81 searcherUnderTest = newSearcher(indexReaderUnderTest);
82 }
83
84 public void test() throws IOException {
85 Term word1 = new Term("main_field", "word1");
86 Term word2 = new Term("main_field", "word2");
87
88 TermFreqFilterQuery tQuery = new TermFreqFilterQuery(word1, eq(1));
89 assertEquals(1, searcherUnderTest.count(tQuery));
90
91 tQuery = new TermFreqFilterQuery(word2, eq(1));
92 assertEquals(1, searcherUnderTest.count(tQuery));
93
94 tQuery = new TermFreqFilterQuery(word1, gte(nbDocs));
95 assertEquals(1, searcherUnderTest.count(tQuery));
96
97 tQuery = new TermFreqFilterQuery(word2, gt(nbDocs + 1));
98 assertEquals(0, searcherUnderTest.count(tQuery));
99
100 tQuery = new TermFreqFilterQuery(word1, lte(nbDocs));
101 assertEquals(nbDocs, searcherUnderTest.count(tQuery));
102
103 tQuery = new TermFreqFilterQuery(word2, lt(nbDocs));
104 assertEquals(nbDocs - 1, searcherUnderTest.count(tQuery));
105
106 tQuery = new TermFreqFilterQuery(word1, gte(1)
107 .and(lte(nbDocs)));
108 assertEquals(nbDocs, searcherUnderTest.count(tQuery));
109
110 tQuery = new TermFreqFilterQuery(word2, gte(1)
111 .and(lte(nbDocs)));
112 assertEquals(nbDocs, searcherUnderTest.count(tQuery));
113
114 tQuery = new TermFreqFilterQuery(word1, gt(1)
115 .and(lt(nbDocs)));
116 assertEquals(nbDocs - 2, searcherUnderTest.count(tQuery));
117
118 tQuery = new TermFreqFilterQuery(word1, gte(1)
119 .and(lt(nbDocs)));
120 assertEquals(nbDocs - 1, searcherUnderTest.count(tQuery));
121
122 tQuery = new TermFreqFilterQuery(word1, gt(1)
123 .and(lte(nbDocs)));
124 assertEquals(nbDocs - 1, searcherUnderTest.count(tQuery));
125 }
126
127 public void testScoring() throws IOException {
128 Term word1 = new Term("main_field", "word1");
129 Term word2 = new Term("main_field", "word2");
130
131 TermFreqFilterQuery tQuery = new TermFreqFilterQuery(word1,
132 gte(5).and(lte(nbDocs)));
133 TopDocs docs = searcherUnderTest.search(tQuery, random().nextInt(10) + 10);
134 assertEquals(nbDocs - 4, docs.totalHits.value);
135 int freq = Integer.MAX_VALUE;
136 for (ScoreDoc doc : docs.scoreDocs) {
137 int nfreq = searcherUnderTest.doc(doc.doc).getField("freq").numericValue().intValue();
138 assertThat(freq, Matchers.greaterThan(nfreq));
139 freq = nfreq;
140 }
141
142
143
144 TermFreqFilterQuery tQuery1 = new TermFreqFilterQuery(word1,
145 gte(5).and(lte(nbDocs)));
146
147 TermFreqFilterQuery tQuery2 = new TermFreqFilterQuery(word2,
148 gte(1).and(lte(nbDocs)));
149 BooleanQuery.Builder bq = new BooleanQuery.Builder();
150 bq.add(new BooleanClause(tQuery1, BooleanClause.Occur.FILTER));
151 bq.add(new BooleanClause(tQuery2, BooleanClause.Occur.MUST));
152 docs = searcherUnderTest.search(bq.build(), random().nextInt(10) + 10);
153 assertEquals(nbDocs - 4, docs.totalHits.value);
154 freq = Integer.MIN_VALUE;
155 for (ScoreDoc doc : docs.scoreDocs) {
156 int nfreq = searcherUnderTest.doc(doc.doc).getField("freq").numericValue().intValue();
157 assertThat(nfreq, Matchers.greaterThan(freq));
158 freq = nfreq;
159 }
160 }
161
162 public void testsUnknown() throws IOException {
163 TermFreqFilterQuery tQuery = new TermFreqFilterQuery(new Term("main_field", "unknown"),
164 gte(5).and(lte(nbDocs)));
165 assertEquals(0, searcherUnderTest.count(tQuery));
166 tQuery = new TermFreqFilterQuery(new Term("unknown_field", "unknown"),
167 gte(5).and(lte(nbDocs)));
168 assertEquals(0, searcherUnderTest.count(tQuery));
169 }
170
171 public void testExplain() throws IOException {
172 Term word1 = new Term("main_field", "word1");
173 TermFreqFilterQuery tQuery = new TermFreqFilterQuery(word1, eq(1));
174 TopDocs docs = searcherUnderTest.search(tQuery, 1);
175 assertEquals(1, docs.totalHits.value);
176 Explanation exp = searcherUnderTest.explain(tQuery, docs.scoreDocs[0].doc);
177 assertTrue(exp.isMatch());
178 assertThat(exp.getDescription(), containsString("1 = 1 (main_field:word1)"));
179
180 tQuery = new TermFreqFilterQuery(word1, eq(nbDocs + 10));
181 exp = searcherUnderTest.explain(tQuery, docs.scoreDocs[0].doc);
182 assertFalse(exp.isMatch());
183 assertThat(exp.getDescription(), containsString("1 = " + (nbDocs + 10) + " (main_field:word1)"));
184
185 tQuery = new TermFreqFilterQuery(new Term("unk", "unk"), eq(1));
186 exp = searcherUnderTest.explain(tQuery, docs.scoreDocs[0].doc);
187 assertFalse(exp.isMatch());
188 assertThat(exp.getDescription(), containsString("(unk:unk)"));
189 }
190
191 @After
192 public void closeStuff() throws IOException {
193 indexReaderUnderTest.close();
194 indexWriterUnderTest.close();
195 dirUnderTest.close();
196 }
197 }