Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
60.45% |
81 / 134 |
|
12.50% |
1 / 8 |
CRAP | |
0.00% |
0 / 1 |
LexemeFulltextResult | |
60.45% |
81 / 134 |
|
12.50% |
1 / 8 |
63.67 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getSourceFiltering | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
2 | |||
getFields | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getHighlightingConfiguration | |
0.00% |
0 / 37 |
|
0.00% |
0 / 1 |
2 | |||
getFormIdResult | |
90.00% |
18 / 20 |
|
0.00% |
0 / 1 |
5.03 | |||
getFormRepresentationResult | |
88.24% |
15 / 17 |
|
0.00% |
0 / 1 |
4.03 | |||
transformElasticsearchResult | |
95.74% |
45 / 47 |
|
0.00% |
0 / 1 |
11 | |||
createEmptyResult | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | namespace Wikibase\Lexeme\Search\Elastic; |
3 | |
4 | use CirrusSearch\Search\BaseCirrusSearchResultSet; |
5 | use CirrusSearch\Search\BaseResultsType; |
6 | use Elastica\ResultSet; |
7 | use MediaWiki\Language\Language; |
8 | use Wikibase\DataModel\Entity\EntityIdParser; |
9 | use Wikibase\Lexeme\DataAccess\LexemeDescription; |
10 | use Wikibase\Lib\Store\FallbackLabelDescriptionLookupFactory; |
11 | use Wikibase\Search\Elastic\EntitySearchUtils; |
12 | use Wikibase\Search\Elastic\Fields\StatementCountField; |
13 | |
14 | /** |
15 | * This result type implements the result for searching a Lexeme for fulltext search. |
16 | * |
17 | * @license GPL-2.0-or-later |
18 | * @author Stas Malyshev |
19 | */ |
20 | class LexemeFulltextResult extends BaseResultsType { |
21 | |
22 | /** |
23 | * @var EntityIdParser |
24 | */ |
25 | private $idParser; |
26 | |
27 | /** |
28 | * Display language |
29 | * @var Language |
30 | */ |
31 | private $displayLanguage; |
32 | /** |
33 | * @var FallbackLabelDescriptionLookupFactory |
34 | */ |
35 | private $termLookupFactory; |
36 | |
/**
 * Store the collaborators needed to turn Elasticsearch hits into Lexeme results.
 *
 * @param EntityIdParser $idParser Parser applied to ID strings read from hits
 * @param Language $displayLanguage User display language
 * @param FallbackLabelDescriptionLookupFactory $termLookupFactory
 *        Lookup factory for assembling descriptions
 */
public function __construct(
	EntityIdParser $idParser,
	Language $displayLanguage,
	FallbackLabelDescriptionLookupFactory $termLookupFactory
) {
	$this->displayLanguage = $displayLanguage;
	$this->termLookupFactory = $termLookupFactory;
	$this->idParser = $idParser;
}
52 | |
/**
 * List the source fields to load with each hit.
 *
 * The parent's list is extended with the Lexeme-specific fields and 'timestamp'.
 *
 * @return string[]
 */
public function getSourceFiltering() {
	$lexemeFields = [
		LemmaField::NAME,
		LexemeLanguageField::NAME,
		LexemeCategoryField::NAME,
		FormsField::NAME,
		StatementCountField::NAME,
		// The web ui for fulltext search expects this to be returned.
		// Longer term there should probably be some concept where the UI
		// requests additional properties instead of baking it in at these
		// lower levels for each fulltext results type.
		'timestamp',
	];

	return array_merge( parent::getSourceFiltering(), $lexemeFields );
}
72 | |
/**
 * Get the stored fields to load in addition to the filtered source.
 *
 * This result type requests none.
 *
 * @return string[] Always an empty list
 */
public function getFields() {
	$noExtraFields = [];

	return $noExtraFields;
}
82 | |
/**
 * Build the highlighting configuration sent to elasticsearch.
 *
 * Four fields are highlighted: the title, the form IDs, the lemma and the
 * form representations. Pre and post tags are empty strings.
 *
 * @param array $highlightSource configuration for how to highlight the source.
 *  Not read by this implementation.
 * @return array|null highlighting configuration for elasticsearch; this
 *  implementation always returns an array
 */
public function getHighlightingConfiguration( array $highlightSource ) {
	// Shared by every field except the title.
	$skipIfLastMatched = [ 'skip_if_last_matched' => true ];

	return [
		'pre_tags' => [ '' ],
		'post_tags' => [ '' ],
		'fields' => [
			'title' => [
				'type' => 'experimental',
				'fragmenter' => 'none',
				'number_of_fragments' => 0,
				'matched_fields' => [ 'title.keyword' ],
			],
			'lexeme_forms.id' => [
				'type' => 'experimental',
				'fragmenter' => 'none',
				'number_of_fragments' => 0,
				'options' => $skipIfLastMatched,
			],
			'lemma' => [
				'type' => 'experimental',
				'fragmenter' => 'none',
				'number_of_fragments' => 0,
				'options' => $skipIfLastMatched,
			],
			'lexeme_forms.representation' => [
				'type' => 'experimental',
				'fragmenter' => 'none',
				'number_of_fragments' => 30,
				// Hopefully this is enough
				'fragment_size' => 1000,
				'options' => $skipIfLastMatched,
			],
		],
	];
}
130 | |
/**
 * Produce raw result for Form ID match.
 *
 * The first source form whose 'id' equals the highlighted ID is used. Missing
 * 'lexeme_forms', 'id', 'representation' or 'features' entries in the source
 * data are tolerated instead of raising warnings or a TypeError.
 *
 * @param string[][] $highlight Highlighter data; the first entry of
 *  'lexeme_forms.id' is taken as the matched form ID
 * @param array $sourceData Lexeme source data
 * @return array|null Array with keys 'formId' (the ID string as highlighted),
 *  'representation' (first representation of the form) and 'features'
 *  (parsed feature IDs); null if match is bad
 */
private function getFormIdResult( $highlight, $sourceData ) {
	$formId = $highlight['lexeme_forms.id'][0] ?? null;
	if ( $formId === null || !EntitySearchUtils::parseOrNull( $formId, $this->idParser ) ) {
		// Got some bad id?? Weird.
		return null;
	}

	foreach ( $sourceData['lexeme_forms'] ?? [] as $form ) {
		if ( ( $form['id'] ?? null ) !== $formId ) {
			continue;
		}

		// Only the first form carrying this ID is considered.
		// TODO: how we choose one?
		$repr = $form['representation'][0] ?? '';
		if ( $repr === '' ) {
			// No usable representation on the matched form, skip it.
			return null;
		}

		// Convert features to EntityId's, dropping the ones that fail to parse
		$features = array_filter( array_map( function ( $featureId ) {
			return EntitySearchUtils::parseOrNull( $featureId, $this->idParser );
		}, $form['features'] ?? [] ) );

		return [
			// NOTE(review): this is the raw ID string, whereas
			// getFormRepresentationResult() returns the parsed ID - confirm
			// which form the consumer expects.
			'formId' => $formId,
			'representation' => $repr,
			'features' => $features,
		];
	}

	// Didn't find the right id? Weird, skip it.
	return null;
}
168 | |
/**
 * Get data for specific form match from source data.
 *
 * Returns the first form that shares at least one representation with the
 * highlighter data and whose ID can be parsed. Forms lacking 'id',
 * 'representation' or 'features' are tolerated instead of raising warnings
 * or a TypeError.
 *
 * @param array[]|null $sourceForms 'forms' field of the source data
 * @param string[] $highlight Highlighter data about match
 * @return array|null Array with keys 'formId' (parsed ID), 'representation'
 *  (first matching representation) and 'features' (parsed feature IDs);
 *  null if match is bad
 */
private function getFormRepresentationResult( $sourceForms, $highlight ) {
	foreach ( $sourceForms ?? [] as $form ) {
		$reprMatches = array_intersect( $form['representation'] ?? [], $highlight );
		if ( !$reprMatches ) {
			continue;
		}
		// matches the data
		$formIdParsed = isset( $form['id'] )
			? EntitySearchUtils::parseOrNull( $form['id'], $this->idParser )
			: null;
		if ( !$formIdParsed ) {
			// Got some bad id?? Weird.
			continue;
		}
		// Convert features to EntityId's, dropping the ones that fail to parse
		$featureIds = array_filter( array_map( function ( $featureId ) {
			return EntitySearchUtils::parseOrNull( $featureId, $this->idParser );
		}, $form['features'] ?? [] ) );

		return [
			'formId' => $formIdParsed,
			// array_intersect() preserves keys, so reset() picks the first match
			'representation' => reset( $reprMatches ),
			'features' => $featureIds,
		];
	}
	// Didn't find anything
	return null;
}
202 | |
/**
 * Convert search result from ElasticSearch result set to LexemeResultSet.
 *
 * The data inside the set are not rendered yet, but the set is configured with
 * the label lookup that has necessary item labels already loaded.
 *
 * Hits whose title is missing or cannot be parsed as an entity ID are skipped,
 * as are hits matched by Form ID for which no usable form is found.
 *
 * @param ResultSet $result ElasticSearch results
 * @return \ISearchResultSet
 */
public function transformElasticsearchResult( ResultSet $result ) {
	$rawResults = $entityIds = [];
	foreach ( $result->getResults() as $r ) {
		$rawResultKey = spl_object_hash( $r );
		$sourceData = $r->getSource();
		// A hit without a title is treated like one with an unparseable title.
		$entityId = EntitySearchUtils::parseOrNull( $sourceData['title'] ?? '', $this->idParser );
		if ( !$entityId ) {
			// Can not parse entity ID - skip it
			continue;
		}

		// TODO: what we do here if no language code?
		// Not sure we want to index all lemma languages.
		// Should we just fake the term language code?
		$lemmaCode = LexemeTermResult::extractLanguageCode( $sourceData );

		// Highlight part contains information about what has actually been matched.
		$highlight = $r->getHighlights();

		// we accept missing lemma fields (see T365692)
		$lang = $sourceData['lexeme_language']['entity'] ?? '';
		$category = $sourceData['lexical_category'] ?? '';

		$features = [];
		$lexemeData = [
			'lexemeId' => $entityId,
			// Having empty lemma is unusual, but in theory possible
			'lemma' => empty( $sourceData['lemma'] ) ? '' : $sourceData['lemma'][0],
			'lang' => $lang,
			'langcode' => $lemmaCode,
			'category' => $category,
			'elasticResult' => $r
		];

		if ( !empty( $highlight['lexeme_forms.id'] ) ) {
			// If we matched Form ID, this means it's a match by ID
			$idResult = $this->getFormIdResult( $highlight, $sourceData );
			if ( !$idResult ) {
				continue;
			}

			// Form data takes precedence over the lexeme-level keys
			$lexemeData = $idResult + $lexemeData;
			$features = $idResult['features'];
		} elseif ( !empty( $highlight['lemma'] ) ) {
			// TODO: make result display highlight this
			$lexemeData['matchedLemma'] = $highlight['lemma'][0];
		} elseif ( !empty( $highlight['lexeme_forms.representation'] ) ) {
			// For now, find the first form that matches
			$formResult = $this->getFormRepresentationResult(
				$sourceData['lexeme_forms'] ?? [],
				$highlight['lexeme_forms.representation']
			);
			// Unlike the Form ID case, a failed lookup keeps the hit as a plain lexeme result
			if ( $formResult ) {
				$lexemeData = $formResult + $lexemeData;
				$features = $formResult['features'];
			}
		}

		// Doing two-stage resolution here since we want to prefetch all labels for
		// auxiliary entities before using them to construct descriptions.
		$lexemeData['elastica_result_hash'] = $rawResultKey;
		$rawResults[$entityId->getSerialization()] = $lexemeData;
		// Unparseable (including empty) values yield null and are filtered out below
		$entityIds[$lang] = EntitySearchUtils::parseOrNull( $lang, $this->idParser );
		$entityIds[$category] = EntitySearchUtils::parseOrNull( $category, $this->idParser );
		foreach ( $features as $feature ) {
			$entityIds[$feature->getSerialization()] = $feature;
		}
	}

	if ( !$rawResults ) {
		return new \CirrusSearch\Search\ResultSet();
	}
	// Create prefetched lookup
	$termLookup = $this->termLookupFactory->newLabelDescriptionLookup(
		$this->displayLanguage,
		array_filter( $entityIds )
	);
	$descriptionMaker = new LexemeDescription( $termLookup, $this->idParser,
		$this->displayLanguage );

	return new LexemeResultSet( $result, $this->displayLanguage, $descriptionMaker, $rawResults );
}
291 | |
/**
 * Create the result set used when there is nothing to return.
 *
 * @return mixed Empty set of search results, as produced by
 *  BaseCirrusSearchResultSet::emptyResultSet( false )
 */
public function createEmptyResult() {
	$emptySet = BaseCirrusSearchResultSet::emptyResultSet( false );

	return $emptySet;
}
298 | |
299 | } |