Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
94.62% |
123 / 130 |
|
75.00% |
6 / 8 |
CRAP | |
0.00% |
0 / 1 |
EntityFullTextQueryBuilder | |
94.62% |
123 / 130 |
|
75.00% |
6 / 8 |
22.08 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
newFromGlobals | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
build | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
buildDegraded | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
buildEntitySearchQuery | |
100.00% |
65 / 65 |
|
100.00% |
1 / 1 |
9 | |||
buildSimpleAllFilter | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
3 | |||
buildFieldMatch | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
buildPhraseRescore | |
81.25% |
26 / 32 |
|
0.00% |
0 / 1 |
4.11 |
1 | <?php |
2 | |
3 | namespace Wikibase\Search\Elastic; |
4 | |
5 | use CirrusSearch\Extra\Query\TokenCountRouter; |
6 | use CirrusSearch\Query\FullTextQueryBuilder; |
7 | use CirrusSearch\Search\SearchContext; |
8 | use Elastica\Query\AbstractQuery; |
9 | use Elastica\Query\BoolQuery; |
10 | use Elastica\Query\DisMax; |
11 | use Elastica\Query\MatchNone; |
12 | use Elastica\Query\MatchQuery; |
13 | use Elastica\Query\MultiMatch; |
14 | use Elastica\Query\Term; |
15 | use MediaWiki\MediaWikiServices; |
16 | use Wikibase\DataModel\Entity\EntityIdParser; |
17 | use Wikibase\Lib\LanguageFallbackChainFactory; |
18 | use Wikibase\Repo\WikibaseRepo; |
19 | |
20 | /** |
21 | * Builder for entity fulltext queries |
22 | */ |
23 | class EntityFullTextQueryBuilder implements FullTextQueryBuilder { |
24 | public const ENTITY_FULL_TEXT_MARKER = 'entity_full_text'; |
25 | |
26 | /** |
27 | * @var array |
28 | */ |
29 | private $settings; |
30 | /** |
31 | * Stemming settings from the UseStemming config entry, keyed by language code |
32 | * @var array |
33 | */ |
34 | private $stemmingSettings; |
35 | /** |
36 | * @var LanguageFallbackChainFactory |
37 | */ |
38 | private $languageFallbackChainFactory; |
39 | /** |
40 | * @var EntityIdParser |
41 | */ |
42 | private $entityIdParser; |
43 | /** |
44 | * @var string User language code |
45 | */ |
46 | private $userLanguage; |
47 | |
48 | /** |
49 | * @param array $stemmingSettings Stemming settings from UseStemming config entry |
50 | * @param array $settings Settings from EntitySearchProfiles.php |
51 | * @param LanguageFallbackChainFactory $languageFallbackChainFactory |
52 | * @param EntityIdParser $entityIdParser |
53 | * @param string $userLanguage User's language code |
54 | */ |
55 | public function __construct( |
56 | array $stemmingSettings, |
57 | array $settings, |
58 | LanguageFallbackChainFactory $languageFallbackChainFactory, |
59 | EntityIdParser $entityIdParser, |
60 | $userLanguage |
61 | ) { |
62 | $this->stemmingSettings = $stemmingSettings; |
63 | $this->settings = $settings; |
64 | $this->languageFallbackChainFactory = $languageFallbackChainFactory; |
65 | $this->entityIdParser = $entityIdParser; |
66 | $this->userLanguage = $userLanguage; |
67 | } |
68 | |
69 | /** |
70 | * Create fulltext builder from global environment. |
71 | * @param array $settings Configuration from config file |
72 | * @return EntityFullTextQueryBuilder |
73 | */ |
74 | public static function newFromGlobals( array $settings ) { |
75 | $services = MediaWikiServices::getInstance(); |
76 | $config = $services->getConfigFactory()->makeConfig( 'WikibaseCirrusSearch' ); |
77 | return new static( |
78 | $config->get( 'UseStemming' ), |
79 | $settings, |
80 | WikibaseRepo::getLanguageFallbackChainFactory( $services ), |
81 | WikibaseRepo::getEntityIdParser( $services ), |
82 | WikibaseRepo::getUserLanguage( $services )->getCode() |
83 | ); |
84 | } |
85 | |
86 | /** |
87 | * Search articles with provided term. |
88 | * |
89 | * @param SearchContext $searchContext |
90 | * @param string $term term to search |
91 | */ |
92 | public function build( SearchContext $searchContext, $term ) { |
93 | $this->buildEntitySearchQuery( $searchContext, $term ); |
94 | // if we did find advanced query, we keep the old setup but change the result type |
95 | // FIXME: make it dispatch by content model |
96 | $searchContext->setResultsType( new EntityResultType( $this->userLanguage, |
97 | $this->languageFallbackChainFactory->newFromLanguageCode( $this->userLanguage ) ) ); |
98 | } |
99 | |
100 | /** |
101 | * @param SearchContext $searchContext |
102 | * @return bool |
103 | */ |
104 | public function buildDegraded( SearchContext $searchContext ) { |
105 | // Not doing anything for now |
106 | return false; |
107 | } |
108 | |
109 | /** |
110 | * Build a fulltext query for Wikibase entity. |
111 | * @param SearchContext $searchContext |
112 | * @param string $term Search term |
113 | */ |
114 | protected function buildEntitySearchQuery( SearchContext $searchContext, $term ) { |
115 | $searchContext->addSyntaxUsed( self::ENTITY_FULL_TEXT_MARKER, 10 ); |
116 | /* |
117 | * Overall query structure is as follows: |
118 | * - Bool with: |
119 | * Filter of namespace = N |
120 | * OR (Should with 1 minimum) of: |
121 | * title.keyword = QUERY |
122 | * fulltext match query |
123 | * |
124 | * Fulltext match query is: |
125 | * Filter of: |
126 | * at least one of: all, all.plain matching |
127 | * description (for stemmed) or description.en (for non-stemmed) matching, with fallback |
128 | * OR (should with 0 minimum) of: |
129 | * DISMAX query of: all labels.near_match in fallback chain |
130 | * OR (should with 0 minimum) of: |
131 | * all |
132 | * all.plain |
133 | * DISMAX of: all fulltext matches for tokenized fields |
134 | */ |
135 | |
136 | $profile = $this->settings; |
137 | // $fields is collecting all the fields for dismax query to be used in |
138 | // scoring match |
139 | $fields = [ |
140 | [ "labels.{$this->userLanguage}.near_match", $profile['lang-exact'] ], |
141 | [ "labels.{$this->userLanguage}.near_match_folded", $profile['lang-folded'] ], |
142 | ]; |
143 | |
144 | $fieldsTokenized = [ |
145 | [ "labels.{$this->userLanguage}.plain", $profile['lang-partial'] ], |
146 | [ "descriptions.{$this->userLanguage}.plain", $profile['lang-partial'] ], |
147 | ]; |
148 | if ( !empty( $this->stemmingSettings[$this->userLanguage]['query'] ) ) { |
149 | $fieldsTokenized[] = [ "labels.{$this->userLanguage}", $profile['lang-partial'] ]; |
150 | $fieldsTokenized[] = [ "descriptions.{$this->userLanguage}", $profile['lang-partial'] ]; |
151 | } |
152 | |
153 | $searchLanguageCodes = $this->languageFallbackChainFactory->newFromLanguageCode( $this->userLanguage ) |
154 | ->getFetchLanguageCodes(); |
155 | |
156 | $discount = $profile['fallback-discount']; |
157 | $stemFilterFields = []; |
158 | |
159 | foreach ( $searchLanguageCodes as $fallbackCode ) { |
160 | if ( empty( $this->stemmingSettings[$fallbackCode]['query'] ) ) { |
161 | $stemFilterFields[] = "descriptions.{$fallbackCode}.plain"; |
162 | } else { |
163 | $stemFilterFields[] = "descriptions.{$fallbackCode}"; |
164 | // only add the stemmed version in the filter |
165 | // labels should be copied to the text field and thus be captured by the filter on the all field |
166 | $stemFilterFields[] = "labels.{$fallbackCode}"; |
167 | } |
168 | |
169 | if ( $fallbackCode === $this->userLanguage ) { |
170 | continue; |
171 | } |
172 | |
173 | $weight = $profile['fallback-exact'] * $discount; |
174 | $fields[] = [ "labels.{$fallbackCode}.near_match", $weight ]; |
175 | |
176 | $weight = $profile['fallback-folded'] * $discount; |
177 | $fields[] = [ "labels.{$fallbackCode}.near_match_folded", $weight ]; |
178 | |
179 | $weight = $profile['fallback-partial'] * $discount; |
180 | $fieldsTokenized[] = [ "labels.{$fallbackCode}.plain", $weight ]; |
181 | $fieldsTokenized[] = [ "descriptions.{$fallbackCode}.plain", $weight ]; |
182 | if ( !empty( $this->stemmingSettings[$fallbackCode]['query'] ) ) { |
183 | $fieldsTokenized[] = [ "descriptions.{$fallbackCode}", $weight ]; |
184 | $fieldsTokenized[] = [ "labels.{$fallbackCode}", $weight ]; |
185 | } |
186 | |
187 | $discount *= $profile['fallback-discount']; |
188 | } |
189 | |
190 | $titleMatch = new Term( [ |
191 | 'title.keyword' => EntitySearchUtils::normalizeId( $term, $this->entityIdParser ), |
192 | ] ); |
193 | |
194 | // Main query filter |
195 | $filterQuery = $this->buildSimpleAllFilter( $term ); |
196 | foreach ( $stemFilterFields as $filterField ) { |
197 | $filterQuery->addShould( $this->buildFieldMatch( $filterField, $term, 'AND' ) ); |
198 | } |
199 | |
200 | // Near match ones, they use constant score |
201 | $nearMatchQuery = new DisMax(); |
202 | $nearMatchQuery->setTieBreaker( 0 ); |
203 | foreach ( $fields as $field ) { |
204 | $nearMatchQuery->addQuery( EntitySearchUtils::makeConstScoreQuery( $field[0], $field[1], |
205 | $term ) ); |
206 | } |
207 | |
208 | // Tokenized ones |
209 | $tokenizedQuery = $this->buildSimpleAllFilter( $term, 'OR', $profile['any'] ); |
210 | $tokenizedQueryFields = new DisMax(); |
211 | $tokenizedQueryFields->setTieBreaker( 0.2 ); |
212 | foreach ( $fieldsTokenized as $field ) { |
213 | $m = $this->buildFieldMatch( $field[0], $term ); |
214 | $m->setFieldBoost( $field[0], $field[1] ); |
215 | $tokenizedQueryFields->addQuery( $m ); |
216 | } |
217 | $tokenizedQuery->addShould( $tokenizedQueryFields ); |
218 | |
219 | // Main labels/desc query |
220 | $labelsDescQuery = new BoolQuery(); |
221 | $labelsDescQuery->setMinimumShouldMatch( 0 ); |
222 | $labelsDescQuery->addFilter( $filterQuery ); |
223 | $labelsDescQuery->addShould( $nearMatchQuery ); |
224 | $labelsDescQuery->addShould( $tokenizedQuery ); |
225 | |
226 | // Main query |
227 | $query = new BoolQuery(); |
228 | |
229 | // Match either labels or exact match to title |
230 | $query->addShould( $titleMatch ); |
231 | $query->addShould( $labelsDescQuery ); |
232 | $query->setMinimumShouldMatch( 1 ); |
233 | |
234 | $searchContext->setMainQuery( $query ); |
235 | $searchContext->setPhraseRescoreQuery( $this->buildPhraseRescore( $term, $searchContext, $profile ) ); |
236 | } |
237 | |
238 | /** |
239 | * Builds a simple filter on all and all.plain when all terms must match |
240 | * |
241 | * @param string $query |
242 | * @param string $operator |
243 | * @param float|null $boost Optional boost applied to each field match; no boost is set when null |
244 | * @return BoolQuery |
245 | */ |
246 | private function buildSimpleAllFilter( $query, $operator = 'AND', $boost = null ) { |
247 | $filter = new BoolQuery(); |
248 | $filter->setMinimumShouldMatch( 1 ); |
249 | // FIXME: We can't use solely the stem field here |
250 | // - Depending on languages it may lack stopwords, |
251 | // A dedicated field used for filtering would be nice |
252 | foreach ( [ 'all', 'all.plain' ] as $field ) { |
253 | $m = new MatchQuery(); |
254 | $m->setFieldQuery( $field, $query ); |
255 | $m->setFieldOperator( $field, $operator ); |
256 | if ( $boost ) { |
257 | $m->setFieldBoost( $field, $boost ); |
258 | } |
259 | $filter->addShould( $m ); |
260 | } |
261 | return $filter; |
262 | } |
263 | |
264 | /** |
265 | * Build simple match clause, matching field against term |
266 | * @param string $field |
267 | * @param string $term |
268 | * @param string|null $operator |
269 | * @return MatchQuery |
270 | */ |
271 | private function buildFieldMatch( $field, $term, $operator = null ) { |
272 | $m = new MatchQuery(); |
273 | $m->setFieldQuery( $field, $term ); |
274 | if ( $operator ) { |
275 | $m->setFieldOperator( $field, $operator ); |
276 | } |
277 | return $m; |
278 | } |
279 | |
280 | /** |
281 | * Create phrase rescore query for "all" fields |
282 | * @param string $queryText |
283 | * @param SearchContext $context |
284 | * @param float[][] $profile May contain $profile['phrase'] with keys 'all', 'slop', 'all.plain'; when it is missing or empty no rescore query is built |
285 | * @return AbstractQuery|null |
286 | */ |
287 | private function buildPhraseRescore( $queryText, SearchContext $context, array $profile ) { |
288 | if ( empty( $profile['phrase'] ) ) { |
289 | return null; |
290 | } else { |
291 | $phraseProfile = $profile['phrase']; |
292 | } |
293 | $useRouter = $context->getConfig()->getElement( 'CirrusSearchWikimediaExtraPlugin', 'token_count_router' ) === true; |
294 | $phrase = new MultiMatch(); |
295 | $phrase->setParam( 'type', 'phrase' ); |
296 | $phrase->setParam( 'slop', $phraseProfile['slop'] ); |
297 | $fields = [ |
298 | "all^{$phraseProfile['all']}", "all.plain^{$phraseProfile['all.plain']}" |
299 | ]; |
300 | $phrase->setFields( $fields ); |
301 | $phrase->setQuery( $queryText ); |
302 | if ( !$useRouter ) { |
303 | return $phrase; |
304 | } |
305 | $tokCount = new TokenCountRouter( |
306 | // text |
307 | $queryText, |
308 | // fallback |
309 | new MatchNone(), |
310 | // field |
311 | "text" |
312 | ); |
313 | $tokCount->addCondition( |
314 | TokenCountRouter::GT, |
315 | 1, |
316 | $phrase |
317 | ); |
318 | $maxTokens = $context->getConfig()->get( 'CirrusSearchMaxPhraseTokens' ); |
319 | if ( $maxTokens ) { |
320 | $tokCount->addCondition( |
321 | TokenCountRouter::GT, |
322 | $maxTokens, |
323 | new \Elastica\Query\MatchNone() |
324 | ); |
325 | } |
326 | return $tokCount; |
327 | } |
328 | |
329 | } |