Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
96.49% |
110 / 114 |
|
60.00% |
3 / 5 |
CRAP | |
0.00% |
0 / 1 |
EntitySearchElastic | |
96.49% |
110 / 114 |
|
60.00% |
3 / 5 |
23 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
expandGenericProfile | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
3 | |||
loadProfile | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
2 | |||
getElasticSearchQuery | |
95.08% |
58 / 61 |
|
0.00% |
0 / 1 |
12 | |||
getRankedSearchResults | |
94.74% |
18 / 19 |
|
0.00% |
0 / 1 |
3.00 |
1 | <?php |
2 | |
3 | namespace Wikibase\Search\Elastic; |
4 | |
5 | use CirrusSearch\CirrusDebugOptions; |
6 | use CirrusSearch\Search\SearchContext; |
7 | use Elastica\Query\AbstractQuery; |
8 | use Elastica\Query\BoolQuery; |
9 | use Elastica\Query\DisMax; |
10 | use Elastica\Query\MatchQuery; |
11 | use Elastica\Query\Term; |
12 | use Language; |
13 | use MediaWiki\Request\FauxRequest; |
14 | use MediaWiki\Request\WebRequest; |
15 | use Wikibase\DataModel\Entity\EntityIdParser; |
16 | use Wikibase\Lib\LanguageFallbackChainFactory; |
17 | use Wikibase\Repo\Api\EntitySearchException; |
18 | use Wikibase\Repo\Api\EntitySearchHelper; |
19 | |
/**
 * Entity search implementation using ElasticSearch.
 * Requires CirrusSearch extension and $wgEntitySearchUseCirrus to be on.
 *
 * @license GPL-2.0-or-later
 * @author Stas Malyshev
 */
class EntitySearchElastic implements EntitySearchHelper {
	/**
	 * Default rescore profile
	 */
	public const DEFAULT_RESCORE_PROFILE = 'wikibase_prefix';

	/**
	 * Name of the context for profile name resolution in prefix searches.
	 * This is the context used by getRankedSearchResults() when none is given.
	 */
	public const CONTEXT_WIKIBASE_PREFIX = 'wikibase_prefix_search';

	/**
	 * Name of the context for profile name resolution in fulltext searches
	 */
	public const CONTEXT_WIKIBASE_FULLTEXT = 'wikibase_fulltext_search';

	/**
	 * Name of the profile type used to build the elastic query
	 */
	public const WIKIBASE_PREFIX_QUERY_BUILDER = 'wikibase_prefix_querybuilder';

	/**
	 * Default query builder profile for prefix searches
	 */
	public const DEFAULT_QUERY_BUILDER_PROFILE = 'default';

	/**
	 * Default query builder profile for fulltext searches
	 */
	public const DEFAULT_FULL_TEXT_QUERY_BUILDER_PROFILE = 'wikibase';

	/**
	 * Replacement syntax for statement boosting
	 * @see \CirrusSearch\Profile\SearchProfileRepositoryTransformer
	 * and repo/config/ElasticSearchRescoreFunctions.php
	 */
	public const STMT_BOOST_PROFILE_REPL = 'functions.*[type=term_boost].params[statement_keywords=_statementBoost_].statement_keywords';

	/**
	 * @var LanguageFallbackChainFactory
	 */
	private $languageChainFactory;

	/**
	 * @var EntityIdParser
	 */
	private $idParser;

	/**
	 * @var string[] Maps entity type => content model name
	 */
	private $contentModelMap;

	/**
	 * Web request context.
	 * Used for implementing debug features such as cirrusDumpQuery.
	 * @var WebRequest
	 */
	private $request;

	/**
	 * List of fallback codes for search language.
	 * Populated by getElasticSearchQuery() from the loaded profile's 'language-chain'.
	 * @var string[]
	 */
	private $searchLanguageCodes = [];

	/**
	 * @var Language User language for display.
	 */
	private $userLang;

	/**
	 * @var CirrusDebugOptions
	 */
	private $debugOptions;

	/**
	 * @param LanguageFallbackChainFactory $languageChainFactory
	 * @param EntityIdParser $idParser
	 * @param Language $userLang
	 * @param array $contentModelMap Maps entity type => content model name
	 * @param WebRequest|null $request Web request context; a FauxRequest is used when omitted
	 * @param CirrusDebugOptions|null $options Debug options; derived from the request when omitted
	 */
	public function __construct(
		LanguageFallbackChainFactory $languageChainFactory,
		EntityIdParser $idParser,
		Language $userLang,
		array $contentModelMap,
		?WebRequest $request = null,
		?CirrusDebugOptions $options = null
	) {
		$this->languageChainFactory = $languageChainFactory;
		$this->idParser = $idParser;
		$this->userLang = $userLang;
		$this->contentModelMap = $contentModelMap;
		$this->request = $request ?? new FauxRequest();
		$this->debugOptions = $options ?? CirrusDebugOptions::fromRequest( $this->request );
	}

	/**
	 * Expand a language-agnostic profile into a language-specific one.
	 *
	 * The fetch language codes of the fallback chain for $languageCode become the
	 * 'language-chain'. The 'lang-*' weights are assigned to $languageCode itself,
	 * and every other language in the chain receives the 'fallback-*' weights
	 * multiplied by a discount that compounds ('fallback-discount' to the power
	 * of the fallback's position among the non-primary languages).
	 *
	 * @param string $languageCode Primary search language
	 * @param array $profile Generic profile; must provide the keys 'any', 'tie-breaker',
	 *  'lang-exact', 'lang-folded', 'lang-prefix', 'fallback-exact', 'fallback-folded',
	 *  'fallback-prefix' and 'fallback-discount'; 'space-discount' is optional.
	 *
	 * @return array Language-specific profile
	 */
	private function expandGenericProfile( $languageCode, array $profile ): array {
		$res = [
			'language-chain' => $this->languageChainFactory
				->newFromLanguageCode( $languageCode )
				->getFetchLanguageCodes(),
			'any' => $profile['any'],
			'tie-breaker' => $profile['tie-breaker'],
			'space-discount' => $profile['space-discount'] ?? null,
			"{$languageCode}-exact" => $profile['lang-exact'],
			"{$languageCode}-folded" => $profile['lang-folded'],
			"{$languageCode}-prefix" => $profile['lang-prefix'],
		];

		$discount = $profile['fallback-discount'];
		foreach ( $res['language-chain'] as $fallback ) {
			if ( $fallback === $languageCode ) {
				// The primary language keeps its undiscounted 'lang-*' weights
				// and does not advance the discount.
				continue;
			}
			$res["{$fallback}-exact"] = $profile['fallback-exact'] * $discount;
			$res["{$fallback}-folded"] = $profile['fallback-folded'] * $discount;
			$res["{$fallback}-prefix"] = $profile['fallback-prefix'] * $discount;
			// Each further fallback is discounted once more than the previous one.
			$discount *= $profile['fallback-discount'];
		}

		return $res;
	}

	/**
	 * Load the prefix query builder profile for the given search context.
	 *
	 * @param SearchContext $context Supplies the config, profile context and its parameters
	 * @param string $languageCode Language used to expand a generic profile
	 *
	 * @return array Profile that always contains 'language-chain' and 'tie-breaker'
	 */
	private function loadProfile( SearchContext $context, $languageCode ) {
		$profile = $context->getConfig()
			->getProfileService()
			->loadProfile(
				self::WIKIBASE_PREFIX_QUERY_BUILDER,
				$context->getProfileContext(),
				null,
				$context->getProfileContextParams()
			);

		// Set some bc defaults for properties that didn't always exist.
		$profile['tie-breaker'] ??= 0;

		// There are two flavors of profiles: fully specified, and generic
		// fallback. When language-chain is provided we assume a fully
		// specified profile. Otherwise we expand the language agnostic
		// profile into a language specific profile.
		if ( !isset( $profile['language-chain'] ) ) {
			$profile = $this->expandGenericProfile( $languageCode, $profile );
		}

		return $profile;
	}

	/**
	 * Build the weighted label fields queried for a single language.
	 *
	 * @param string $code Language code whose label fields are queried
	 * @param array $profile Profile providing "{$code}-exact", "{$code}-folded" and
	 *  "{$code}-prefix" weights, plus 'space-discount' when $discountTrimmedPrefix is set
	 * @param bool $discountTrimmedPrefix Whether the search text had trailing whitespace
	 *  and the profile defines a space discount. When true, the prefix match on the
	 *  trimmed text is discounted and a full-weight prefix match on the untrimmed
	 *  text is added.
	 *
	 * @return array[] Two lists of [ field name, weight ] pairs: the fields matched
	 *  against the trimmed text, then the fields matched against the exact text
	 */
	private function buildLanguageFields( $code, array $profile, bool $discountTrimmedPrefix ): array {
		$fields = [
			[ "labels.{$code}.near_match", $profile["{$code}-exact"] ],
			[ "labels.{$code}.near_match_folded", $profile["{$code}-folded"] ],
		];
		// Fields to which query applies exactly as stated, without trailing space trimming
		$fieldsExact = [];

		$weight = $profile["{$code}-prefix"];
		if ( $discountTrimmedPrefix ) {
			$fields[] = [ "labels.{$code}.prefix", $weight * $profile['space-discount'] ];
			$fieldsExact[] = [ "labels.{$code}.prefix", $weight ];
		} else {
			$fields[] = [ "labels.{$code}.prefix", $weight ];
		}

		return [ $fields, $fieldsExact ];
	}

	/**
	 * Produce ES query that matches the arguments.
	 *
	 * Side effects: records the original search term on $context and stores the
	 * profile's language chain in $this->searchLanguageCodes. For an entity type
	 * without a content model, the context is flagged as unable to produce results,
	 * a warning is added and an empty bool query is returned.
	 *
	 * @param string $text
	 * @param string $languageCode
	 * @param string $entityType
	 * @param bool $strictLanguage When true, only fields of the primary language are queried
	 * @param SearchContext $context
	 *
	 * @return AbstractQuery
	 */
	protected function getElasticSearchQuery(
		$text,
		$languageCode,
		$entityType,
		$strictLanguage,
		SearchContext $context
	) {
		$query = new BoolQuery();

		$context->setOriginalSearchTerm( $text );
		// The exact-match text only loses leading whitespace (trailing whitespace is
		// kept); everything else uses the text trimmed on both ends.
		$textExact = ltrim( $text );
		$text = trim( $text );
		if ( empty( $this->contentModelMap[$entityType] ) ) {
			$context->setResultsPossible( false );
			$context->addWarning( 'wikibasecirrus-search-bad-entity-type', $entityType );
			return $query;
		}

		$labelsFilter = new MatchQuery( 'labels_all.prefix', $text );

		$profile = $this->loadProfile( $context, $languageCode );
		$this->searchLanguageCodes = $profile['language-chain'];
		if ( $languageCode !== $this->searchLanguageCodes[0] ) {
			// Log a warning? Are there valid reasons for the primary language
			// in the profile to not match the profile request?
			$languageCode = $this->searchLanguageCodes[0];
		}

		// Same for every language, so decide once: trailing whitespace was present
		// and the profile says how much to discount the trimmed prefix match.
		$discountTrimmedPrefix = $textExact !== $text && isset( $profile['space-discount'] );

		[ $fields, $fieldsExact ] =
			$this->buildLanguageFields( $languageCode, $profile, $discountTrimmedPrefix );

		if ( !$strictLanguage ) {
			$fields[] = [ "labels_all.near_match_folded", $profile['any'] ];
			foreach ( $this->searchLanguageCodes as $fallbackCode ) {
				if ( $fallbackCode === $languageCode ) {
					continue;
				}
				[ $fallbackFields, $fallbackFieldsExact ] =
					$this->buildLanguageFields( $fallbackCode, $profile, $discountTrimmedPrefix );
				$fields = array_merge( $fields, $fallbackFields );
				$fieldsExact = array_merge( $fieldsExact, $fallbackFieldsExact );
			}
		}

		$dismax = new DisMax();
		$dismax->setTieBreaker( $profile['tie-breaker'] );
		foreach ( $fields as [ $fieldName, $fieldWeight ] ) {
			$dismax->addQuery(
				EntitySearchUtils::makeConstScoreQuery( $fieldName, $fieldWeight, $text )
			);
		}

		foreach ( $fieldsExact as [ $fieldName, $fieldWeight ] ) {
			$dismax->addQuery(
				EntitySearchUtils::makeConstScoreQuery( $fieldName, $fieldWeight, $textExact )
			);
		}

		$labelsQuery = new BoolQuery();
		$labelsQuery->addFilter( $labelsFilter );
		$labelsQuery->addShould( $dismax );
		$titleMatch = new Term( [
			'title.keyword' => EntitySearchUtils::normalizeId( $text, $this->idParser ),
		] );

		// Match either labels or exact match to title
		$query->addShould( $labelsQuery );
		$query->addShould( $titleMatch );
		$query->setMinimumShouldMatch( 1 );

		// Filter to fetch only given entity type
		$query->addFilter( new Term( [ 'content_model' => $this->contentModelMap[$entityType] ] ) );

		return $query;
	}

	/**
	 * @inheritDoc
	 *
	 * Falls back to self::CONTEXT_WIKIBASE_PREFIX when no profile context is given.
	 * When the searcher reports raw-return mode, the result is passed through the
	 * searcher's processRawReturn() together with the request.
	 *
	 * @throws EntitySearchException When the search status is not OK
	 */
	public function getRankedSearchResults(
		$text,
		$languageCode,
		$entityType,
		$limit,
		$strictLanguage,
		?string $profileContext = null
	) {
		$profileContext ??= self::CONTEXT_WIKIBASE_PREFIX;
		$searcher = new WikibasePrefixSearcher( 0, $limit, $this->debugOptions );
		$searchContext = $searcher->getSearchContext();
		$searchContext->setProfileContext( $profileContext, [ 'language' => $languageCode ] );

		// Must run before the result type is built: it fills $this->searchLanguageCodes.
		$query = $this->getElasticSearchQuery(
			$text,
			$languageCode,
			$entityType,
			$strictLanguage,
			$searcher->getSearchContext()
		);

		$searcher->setResultsType( new ElasticTermResult(
			$this->idParser,
			$this->searchLanguageCodes,
			$this->languageChainFactory->newFromLanguage( $this->userLang )
		) );

		$status = $searcher->performSearch( $query );
		if ( !$status->isOK() ) {
			throw new EntitySearchException( $status );
		}
		$result = $status->getValue();

		if ( $searcher->isReturnRaw() ) {
			$result = $searcher->processRawReturn( $result, $this->request );
		}

		return $result;
	}

}