Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
94.62% |
123 / 130 |
|
75.00% |
6 / 8 |
CRAP | |
0.00% |
0 / 1 |
EntityFullTextQueryBuilder | |
94.62% |
123 / 130 |
|
75.00% |
6 / 8 |
22.08 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
newFromGlobals | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
build | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
buildDegraded | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
buildEntitySearchQuery | |
100.00% |
65 / 65 |
|
100.00% |
1 / 1 |
9 | |||
buildSimpleAllFilter | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
3 | |||
buildFieldMatch | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
buildPhraseRescore | |
81.25% |
26 / 32 |
|
0.00% |
0 / 1 |
4.11 |
1 | <?php |
2 | |
3 | namespace Wikibase\Search\Elastic; |
4 | |
5 | use CirrusSearch\Extra\Query\TokenCountRouter; |
6 | use CirrusSearch\Query\FullTextQueryBuilder; |
7 | use CirrusSearch\Search\SearchContext; |
8 | use Elastica\Query\AbstractQuery; |
9 | use Elastica\Query\BoolQuery; |
10 | use Elastica\Query\DisMax; |
11 | use Elastica\Query\MatchNone; |
12 | use Elastica\Query\MatchQuery; |
13 | use Elastica\Query\MultiMatch; |
14 | use Elastica\Query\Term; |
15 | use MediaWiki\Context\RequestContext; |
16 | use MediaWiki\MediaWikiServices; |
17 | use Wikibase\DataModel\Entity\EntityIdParser; |
18 | use Wikibase\Lib\LanguageFallbackChainFactory; |
19 | use Wikibase\Repo\WikibaseRepo; |
20 | |
21 | /** |
22 | * Builder for entity fulltext queries |
23 | */ |
class EntityFullTextQueryBuilder implements FullTextQueryBuilder {
	/**
	 * Syntax marker registered on the search context for every query built here.
	 */
	public const ENTITY_FULL_TEXT_MARKER = 'entity_full_text';

	/**
	 * Scoring profile (settings from EntitySearchProfiles.php).
	 *
	 * Keys read by this class: 'lang-exact', 'lang-folded', 'lang-partial',
	 * 'fallback-exact', 'fallback-folded', 'fallback-partial',
	 * 'fallback-discount', 'any' and, optionally, 'phrase'.
	 * @var array
	 */
	private $settings;
	/**
	 * Stemming settings from the UseStemming config entry, keyed by language code.
	 * Only the 'query' flag of each language is consulted here.
	 * @var array
	 */
	private $stemmingSettings;
	/**
	 * @var LanguageFallbackChainFactory
	 */
	private $languageFallbackChainFactory;
	/**
	 * @var EntityIdParser
	 */
	private $entityIdParser;
	/**
	 * @var string User language code
	 */
	private $userLanguage;

	/**
	 * @param array $stemmingSettings Stemming settings from UseStemming config entry
	 * @param array $settings Settings from EntitySearchProfiles.php
	 * @param LanguageFallbackChainFactory $languageFallbackChainFactory
	 * @param EntityIdParser $entityIdParser
	 * @param string $userLanguage User's language code
	 */
	public function __construct(
		array $stemmingSettings,
		array $settings,
		LanguageFallbackChainFactory $languageFallbackChainFactory,
		EntityIdParser $entityIdParser,
		$userLanguage
	) {
		$this->stemmingSettings = $stemmingSettings;
		$this->settings = $settings;
		$this->languageFallbackChainFactory = $languageFallbackChainFactory;
		$this->entityIdParser = $entityIdParser;
		$this->userLanguage = $userLanguage;
	}

	/**
	 * Create fulltext builder from global environment.
	 *
	 * Stemming settings come from the 'WikibaseCirrusSearch' config, the
	 * fallback chain factory and entity ID parser from WikibaseRepo, and the
	 * user language from the main request context.
	 *
	 * @param array $settings Configuration from config file
	 * @return EntityFullTextQueryBuilder
	 */
	public static function newFromGlobals( array $settings ) {
		$services = MediaWikiServices::getInstance();
		$config = $services->getConfigFactory()->makeConfig( 'WikibaseCirrusSearch' );
		// Late static binding is kept on purpose so subclasses get their own type.
		return new static(
			$config->get( 'UseStemming' ),
			$settings,
			WikibaseRepo::getLanguageFallbackChainFactory( $services ),
			WikibaseRepo::getEntityIdParser( $services ),
			RequestContext::getMain()->getLanguage()->getCode()
		);
	}

	/**
	 * Search articles with provided term.
	 *
	 * Sets the main query and the phrase rescore query on the context, then
	 * switches the result type to EntityResultType for the user language.
	 *
	 * @param SearchContext $searchContext
	 * @param string $term term to search
	 */
	public function build( SearchContext $searchContext, $term ) {
		$this->buildEntitySearchQuery( $searchContext, $term );
		// if we did find advanced query, we keep the old setup but change the result type
		// FIXME: make it dispatch by content model
		$searchContext->setResultsType( new EntityResultType( $this->userLanguage,
			$this->languageFallbackChainFactory->newFromLanguageCode( $this->userLanguage ) ) );
	}

	/**
	 * No degraded variant of the entity query exists yet, so the context is
	 * left untouched.
	 *
	 * @param SearchContext $searchContext
	 * @return bool Always false
	 */
	public function buildDegraded( SearchContext $searchContext ) {
		// Not doing anything for now
		return false;
	}

	/**
	 * Build a fulltext query for Wikibase entity.
	 *
	 * @param SearchContext $searchContext Receives the main query and the phrase rescore query
	 * @param string $term Search term
	 */
	protected function buildEntitySearchQuery( SearchContext $searchContext, $term ) {
		$searchContext->addSyntaxUsed( self::ENTITY_FULL_TEXT_MARKER, 10 );
		/*
		 * Overall query structure is as follows:
		 * - Bool (should, minimum 1) of:
		 *     title.keyword = normalized QUERY
		 *     labels/descriptions query
		 *
		 * Labels/descriptions query is a Bool with:
		 *   Filter (should, minimum 1) of:
		 *     all, all.plain matching with every term required (AND)
		 *     for each language of the fallback chain, with every term required:
		 *       descriptions.LANG and labels.LANG when stemming is enabled for LANG,
		 *       descriptions.LANG.plain otherwise
		 *   Should (minimum 0) of:
		 *     DISMAX of constant-score near matches on labels.LANG.near_match and
		 *       labels.LANG.near_match_folded for the whole fallback chain
		 *     Bool (should, minimum 1) of:
		 *       all (any term, boosted by profile 'any')
		 *       all.plain (any term, boosted by profile 'any')
		 *       DISMAX of: all fulltext matches for tokenized fields
		 */

		$profile = $this->settings;
		// $fields is collecting all the fields for dismax query to be used in
		// scoring match
		$fields = [
			[ "labels.{$this->userLanguage}.near_match", $profile['lang-exact'] ],
			[ "labels.{$this->userLanguage}.near_match_folded", $profile['lang-folded'] ],
		];

		// Tokenized fields, scored through a separate dismax query
		$fieldsTokenized = [
			[ "labels.{$this->userLanguage}.plain", $profile['lang-partial'] ],
			[ "descriptions.{$this->userLanguage}.plain", $profile['lang-partial'] ],
		];
		if ( !empty( $this->stemmingSettings[$this->userLanguage]['query'] ) ) {
			$fieldsTokenized[] = [ "labels.{$this->userLanguage}", $profile['lang-partial'] ];
			$fieldsTokenized[] = [ "descriptions.{$this->userLanguage}", $profile['lang-partial'] ];
		}

		$searchLanguageCodes = $this->languageFallbackChainFactory->newFromLanguageCode( $this->userLanguage )
			->getFetchLanguageCodes();

		$discount = $profile['fallback-discount'];
		$stemFilterFields = [];

		foreach ( $searchLanguageCodes as $fallbackCode ) {
			// Filter fields are collected for every language, user language included
			if ( empty( $this->stemmingSettings[$fallbackCode]['query'] ) ) {
				$stemFilterFields[] = "descriptions.{$fallbackCode}.plain";
			} else {
				$stemFilterFields[] = "descriptions.{$fallbackCode}";
				// only add the stemmed version in the filter
				// labels should be copied to the text field and thus be captured by the filter on the all field
				$stemFilterFields[] = "labels.{$fallbackCode}";
			}

			// Scoring fields for the user language were already added above
			if ( $fallbackCode === $this->userLanguage ) {
				continue;
			}

			$weight = $profile['fallback-exact'] * $discount;
			$fields[] = [ "labels.{$fallbackCode}.near_match", $weight ];

			$weight = $profile['fallback-folded'] * $discount;
			$fields[] = [ "labels.{$fallbackCode}.near_match_folded", $weight ];

			$weight = $profile['fallback-partial'] * $discount;
			$fieldsTokenized[] = [ "labels.{$fallbackCode}.plain", $weight ];
			$fieldsTokenized[] = [ "descriptions.{$fallbackCode}.plain", $weight ];
			if ( !empty( $this->stemmingSettings[$fallbackCode]['query'] ) ) {
				$fieldsTokenized[] = [ "descriptions.{$fallbackCode}", $weight ];
				$fieldsTokenized[] = [ "labels.{$fallbackCode}", $weight ];
			}

			// The discount compounds with each further step down the fallback chain
			$discount *= $profile['fallback-discount'];
		}

		$titleMatch = new Term( [
			'title.keyword' => EntitySearchUtils::normalizeId( $term, $this->entityIdParser ),
		] );

		// Main query filter
		$filterQuery = $this->buildSimpleAllFilter( $term );
		foreach ( $stemFilterFields as $filterField ) {
			$filterQuery->addShould( $this->buildFieldMatch( $filterField, $term, 'AND' ) );
		}

		// Near match ones, they use constant score
		$nearMatchQuery = new DisMax();
		$nearMatchQuery->setTieBreaker( 0 );
		foreach ( $fields as $field ) {
			$nearMatchQuery->addQuery( EntitySearchUtils::makeConstScoreQuery( $field[0], $field[1],
				$term ) );
		}

		// Tokenized ones
		$tokenizedQuery = $this->buildSimpleAllFilter( $term, 'OR', $profile['any'] );
		$tokenizedQueryFields = new DisMax();
		$tokenizedQueryFields->setTieBreaker( 0.2 );
		foreach ( $fieldsTokenized as $field ) {
			$m = $this->buildFieldMatch( $field[0], $term );
			$m->setFieldBoost( $field[0], $field[1] );
			$tokenizedQueryFields->addQuery( $m );
		}
		$tokenizedQuery->addShould( $tokenizedQueryFields );

		// Main labels/desc query
		$labelsDescQuery = new BoolQuery();
		$labelsDescQuery->setMinimumShouldMatch( 0 );
		$labelsDescQuery->addFilter( $filterQuery );
		$labelsDescQuery->addShould( $nearMatchQuery );
		$labelsDescQuery->addShould( $tokenizedQuery );

		// Main query
		$query = new BoolQuery();

		// Match either labels or exact match to title
		$query->addShould( $titleMatch );
		$query->addShould( $labelsDescQuery );
		$query->setMinimumShouldMatch( 1 );

		$searchContext->setMainQuery( $query );
		$searchContext->setPhraseRescoreQuery( $this->buildPhraseRescore( $term, $searchContext, $profile ) );
	}

	/**
	 * Builds a simple bool query over the "all" and "all.plain" fields, of
	 * which at least one must match.
	 *
	 * With the default 'AND' operator every term must match in a field; the
	 * caller may pass 'OR' to accept any term.
	 *
	 * @param string $query
	 * @param string $operator Match operator applied to both fields
	 * @param float|null $boost Boost applied to both fields; skipped when empty
	 * @return BoolQuery
	 */
	private function buildSimpleAllFilter( $query, $operator = 'AND', $boost = null ) {
		$filter = new BoolQuery();
		$filter->setMinimumShouldMatch( 1 );
		// FIXME: We can't use solely the stem field here
		// - Depending on languages it may lack stopwords,
		// A dedicated field used for filtering would be nice
		foreach ( [ 'all', 'all.plain' ] as $field ) {
			$m = new MatchQuery();
			$m->setFieldQuery( $field, $query );
			$m->setFieldOperator( $field, $operator );
			if ( $boost ) {
				$m->setFieldBoost( $field, $boost );
			}
			$filter->addShould( $m );
		}
		return $filter;
	}

	/**
	 * Build simple match clause, matching field against term
	 *
	 * @param string $field
	 * @param string $term
	 * @param string|null $operator Match operator; left unset on the query when empty
	 * @return MatchQuery
	 */
	private function buildFieldMatch( $field, $term, $operator = null ) {
		$m = new MatchQuery();
		$m->setFieldQuery( $field, $term );
		if ( $operator ) {
			$m->setFieldOperator( $field, $operator );
		}
		return $m;
	}

	/**
	 * Create phrase rescore query for "all" fields
	 *
	 * When the token_count_router of the extra plugin is enabled, the phrase
	 * query is wrapped so that it only runs for queries with more than one
	 * token and, if CirrusSearchMaxPhraseTokens is set, with no more tokens
	 * than that limit.
	 *
	 * @param string $queryText
	 * @param SearchContext $context
	 * @param float[][] $profile Must contain $profile['phrase'] with keys 'all', 'slop', 'all.plain'
	 * @return AbstractQuery|null Null when the profile has no 'phrase' section
	 */
	private function buildPhraseRescore( $queryText, SearchContext $context, array $profile ) {
		if ( empty( $profile['phrase'] ) ) {
			return null;
		}
		$phraseProfile = $profile['phrase'];

		$phrase = new MultiMatch();
		$phrase->setParam( 'type', 'phrase' );
		$phrase->setParam( 'slop', $phraseProfile['slop'] );
		$phrase->setFields( [
			"all^{$phraseProfile['all']}",
			"all.plain^{$phraseProfile['all.plain']}",
		] );
		$phrase->setQuery( $queryText );

		$config = $context->getConfig();
		$useRouter = $config->getElement( 'CirrusSearchWikimediaExtraPlugin', 'token_count_router' ) === true;
		if ( !$useRouter ) {
			return $phrase;
		}

		$tokCount = new TokenCountRouter(
			// text
			$queryText,
			// fallback
			new MatchNone(),
			// field
			"text"
		);
		// The router picks the first condition that matches (see the
		// search-extra token_count_router documentation), so the upper bound
		// has to be registered before the "more than one token" condition;
		// otherwise every over-long query already matches "> 1" and the
		// limit is never applied.
		$maxTokens = $config->get( 'CirrusSearchMaxPhraseTokens' );
		if ( $maxTokens ) {
			$tokCount->addCondition(
				TokenCountRouter::GT,
				$maxTokens,
				new MatchNone()
			);
		}
		$tokCount->addCondition(
			TokenCountRouter::GT,
			1,
			$phrase
		);
		return $tokCount;
	}

}