Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
73.48% |
97 / 132 |
|
22.22% |
2 / 9 |
CRAP | |
0.00% |
0 / 1 |
FormTermResult | |
73.48% |
97 / 132 |
|
22.22% |
2 / 9 |
34.74 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
getSourceFiltering | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
getFields | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getHighlightingConfiguration | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
2 | |||
getIdResult | |
90.91% |
20 / 22 |
|
0.00% |
0 / 1 |
5.02 | |||
getRepresentationResult | |
94.12% |
16 / 17 |
|
0.00% |
0 / 1 |
3.00 | |||
transformElasticsearchResult | |
93.88% |
46 / 49 |
|
0.00% |
0 / 1 |
10.02 | |||
produceTermResult | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
1 | |||
createEmptyResult | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | namespace Wikibase\Lexeme\Search\Elastic; |
3 | |
4 | use CirrusSearch\Search\BaseResultsType; |
5 | use Elastica\ResultSet; |
6 | use Language; |
7 | use Wikibase\DataModel\Entity\EntityIdParser; |
8 | use Wikibase\DataModel\Term\Term; |
9 | use Wikibase\Lexeme\DataAccess\LexemeDescription; |
10 | use Wikibase\Lib\Interactors\TermSearchResult; |
11 | use Wikibase\Lib\Store\FallbackLabelDescriptionLookupFactory; |
12 | use Wikibase\Search\Elastic\EntitySearchUtils; |
13 | |
/**
 * Result type that converts an ElasticSearch result set into term search
 * results for Wikibase Forms.
 *
 * A hit is a Lexeme document; depending on what the highlighter reports, one
 * hit can yield a single Form (match by Form ID) or several Forms (match by
 * representation).
 *
 * @license GPL-2.0-or-later
 * @author Stas Malyshev
 */
class FormTermResult extends BaseResultsType {

	/**
	 * Parser used for every serialized entity ID found in the search data.
	 * @var EntityIdParser
	 */
	private $idParser;

	/**
	 * Display language
	 * @var Language
	 */
	private $displayLanguage;

	/**
	 * Factory for the label lookup that is handed to the description maker.
	 * @var FallbackLabelDescriptionLookupFactory
	 */
	private $termLookupFactory;

	/**
	 * Limit how many results to produce
	 * @var int
	 */
	private $limit;

	/**
	 * @param EntityIdParser $idParser
	 * @param Language $displayLanguage User display language
	 * @param FallbackLabelDescriptionLookupFactory $termLookupFactory
	 *        Lookup factory for assembling descriptions
	 * @param int $limit How many results to produce
	 */
	public function __construct(
		EntityIdParser $idParser,
		Language $displayLanguage,
		FallbackLabelDescriptionLookupFactory $termLookupFactory,
		$limit
	) {
		$this->idParser = $idParser;
		$this->termLookupFactory = $termLookupFactory;
		$this->displayLanguage = $displayLanguage;
		$this->limit = $limit;
	}

	/**
	 * Get the source filtering to be used loading the result.
	 *
	 * Adds the lemma, lexeme language, lexical category and forms fields to
	 * whatever the parent class already requests.
	 *
	 * @return string[]
	 */
	public function getSourceFiltering() {
		return array_merge( parent::getSourceFiltering(), [
			LemmaField::NAME,
			LexemeLanguageField::NAME,
			LexemeCategoryField::NAME,
			FormsField::NAME
		] );
	}

	/**
	 * Get the fields to load. Most of the time we'll use source filtering instead but
	 * some fields aren't part of the source.
	 *
	 * @return string[] Always empty for this result type.
	 */
	public function getFields() {
		return [];
	}

	/**
	 * Get the highlighting configuration.
	 *
	 * The pre/post tags are empty strings, so highlighted fragments come back
	 * without markup. The rest of this class relies on that: fragments are
	 * compared verbatim against Form IDs and representations from the source.
	 *
	 * @param array $highlightSource configuration for how to highlight the source.
	 *  Empty if source should be ignored. Not used by this implementation.
	 * @return array|null highlighting configuration for elasticsearch
	 */
	public function getHighlightingConfiguration( array $highlightSource ) {
		$config = [
			'pre_tags' => [ '' ],
			'post_tags' => [ '' ],
			'fields' => [],
		];
		$config['fields']['lexeme_forms.id'] = [
			'type' => 'experimental',
			'fragmenter' => "none",
			'number_of_fragments' => 0,
		];
		$config['fields']["lexeme_forms.representation"] = [
			'type' => 'experimental',
			'fragmenter' => "none",
			'number_of_fragments' => 30,
			'fragment_size' => 1000, // Hopefully this is enough
			'matched_fields' => [ 'lexeme_forms.representation.prefix' ],
			'options' => [
				'skip_if_last_matched' => true,
			],
		];

		return $config;
	}

	/**
	 * Convert the serialized grammatical feature IDs of a form to EntityId objects.
	 *
	 * IDs that fail to parse are dropped. A form without a 'features' list
	 * yields an empty array instead of triggering a notice or type error.
	 *
	 * @param array $form Form source data
	 * @return \Wikibase\DataModel\Entity\EntityId[] Keys of the source list are preserved
	 */
	private function parseFeatureIds( $form ) {
		if ( empty( $form['features'] ) || !is_array( $form['features'] ) ) {
			return [];
		}
		return array_filter( array_map( function ( $featureId ) {
			return EntitySearchUtils::parseOrNull( $featureId, $this->idParser );
		}, $form['features'] ) );
	}

	/**
	 * Produce raw result for ID-type match.
	 *
	 * The caller must have checked that $highlight['lexeme_forms.id'] is not empty.
	 *
	 * @param string[][] $highlight Highlighter data
	 * @param array $sourceData Lexeme source data
	 * @return array|null Null if match is bad
	 */
	private function getIdResult( $highlight, $sourceData ) {
		$formId = $highlight['lexeme_forms.id'][0];
		$formIdParsed = EntitySearchUtils::parseOrNull( $formId, $this->idParser );
		if ( !$formIdParsed ) {
			// Got some bad id?? Weird.
			return null;
		}
		if ( empty( $sourceData['lexeme_forms'] ) ) {
			// The document carries no forms at all, so the ID cannot be resolved.
			return null;
		}

		foreach ( $sourceData['lexeme_forms'] as $form ) {
			if ( !isset( $form['id'] ) || $form['id'] !== $formId ) {
				continue;
			}
			$representations = isset( $form['representation'] )
				? (array)$form['representation']
				: [];
			// TODO: how we choose one?
			$repr = reset( $representations );
			if ( !is_string( $repr ) || $repr === '' ) {
				// Matching form has no usable representation, skip it.
				return null;
			}

			return [
				'id' => $formIdParsed,
				'representation' => $repr,
				'features' => $this->parseFeatureIds( $form ),
				'term' => new Term( 'qid', $formId ),
				'type' => 'entityId',
			];
		}

		// Didn't find the right id? Weird, skip it.
		return null;
	}

	/**
	 * Get data for specific form
	 *
	 * The caller must have checked that $highlight['lexeme_forms.representation']
	 * is not empty.
	 *
	 * @param string[][] $highlight Highlighter data
	 * @param array $form Form source data
	 * @param string $lemmaCode Language code for main lemma
	 * @return array|null Null if match is bad
	 */
	private function getRepresentationResult( $highlight, $form, $lemmaCode ) {
		if ( !isset( $form['id'] ) || empty( $form['representation'] ) ) {
			// Incomplete form data can not produce a result.
			return null;
		}
		$representations = (array)$form['representation'];
		$reprMatches = array_intersect( $representations,
			$highlight['lexeme_forms.representation'] );
		if ( !$reprMatches ) {
			return null;
		}
		// matches the data
		$formIdParsed = EntitySearchUtils::parseOrNull( $form['id'], $this->idParser );
		if ( !$formIdParsed ) {
			// Got some bad id?? Weird.
			return null;
		}

		return [
			'id' => $formIdParsed,
			// TODO: how we choose the best one of many?
			'representation' => reset( $representations ),
			'features' => $this->parseFeatureIds( $form ),
			// TODO: This may not be true, since matched representation can be
			// from another language...Not sure what to do about it.
			'term' => new Term( $lemmaCode, reset( $reprMatches ) ),
			'type' => 'label',
		];
	}

	/**
	 * Collect the raw form results one search hit produces.
	 *
	 * A highlighted Form ID takes precedence and yields at most one result.
	 * Otherwise every form with a highlighted representation yields one.
	 *
	 * @param string[][] $highlight Highlighter data
	 * @param array $sourceData Lexeme source data
	 * @param string $lemmaCode Language code for main lemma
	 * @return array[] Raw form results keyed by serialized Form ID, possibly empty
	 */
	private function getFormResults( $highlight, $sourceData, $lemmaCode ) {
		if ( !empty( $highlight['lexeme_forms.id'] ) ) {
			// If we matched Form ID, this means it's a match by ID
			$idResult = $this->getIdResult( $highlight, $sourceData );
			return $idResult ? [ $highlight['lexeme_forms.id'][0] => $idResult ] : [];
		}

		if (
			empty( $highlight['lexeme_forms.representation'] ) ||
			empty( $sourceData['lexeme_forms'] )
		) {
			// TODO: No data to match, skip it. Should we report something?
			return [];
		}

		// We matched form representation, let's see which ones we've got
		// Find all forms whose representations match what we have found.
		// Note this can be more than one.
		$found = [];
		foreach ( $sourceData['lexeme_forms'] as $form ) {
			$formResult = $this->getRepresentationResult( $highlight, $form, $lemmaCode );
			if ( $formResult ) {
				$found[$form['id']] = $formResult;
			}
		}
		return $found;
	}

	/**
	 * Convert search result from ElasticSearch result set to TermSearchResult.
	 *
	 * Hits whose lexeme ID can not be parsed, that lack lemma, language or
	 * lexical category data, or that yield no matching form are skipped.
	 *
	 * @param ResultSet $result
	 * @return TermSearchResult[] At most $limit results, keyed by serialized Form ID.
	 */
	public function transformElasticsearchResult( ResultSet $result ) {
		$rawResults = [];
		$entityIds = [];
		foreach ( $result->getResults() as $r ) {
			$sourceData = $r->getSource();
			if ( !isset( $sourceData['title'] ) ) {
				continue;
			}
			$entityId = EntitySearchUtils::parseOrNull( $sourceData['title'], $this->idParser );
			if ( !$entityId ) {
				// Can not parse entity ID - skip it
				continue;
			}
			if (
				empty( $sourceData['lemma'] ) ||
				!isset( $sourceData['lexeme_language']['entity'] ) ||
				!isset( $sourceData['lexical_category'] )
			) {
				// Incomplete lexeme document, nothing sensible to describe.
				continue;
			}

			// TODO: what we do here if no language code?
			// Not sure we want to index all lemma languages.
			// Should we just fake the term language code?
			$lemmaCode = LexemeTermResult::extractLanguageCode( $sourceData );

			$lemmas = (array)$sourceData['lemma'];
			$lang = $sourceData['lexeme_language']['entity'];
			$category = $sourceData['lexical_category'];
			$lexemeData = [
				'lexemeId' => $entityId,
				'lemma' => reset( $lemmas ),
				'lang' => $lang,
				'langcode' => $lemmaCode,
				'category' => $category
			];

			// Highlight part contains information about what has actually been matched.
			$formResults = $this->getFormResults( $r->getHighlights(), $sourceData, $lemmaCode );
			if ( !$formResults ) {
				continue;
			}

			// Doing two-stage resolution here since we want to prefetch all labels for
			// auxiliary entities before using them to construct descriptions.
			$entityIds[$lang] = EntitySearchUtils::parseOrNull( $lang, $this->idParser );
			$entityIds[$category] = EntitySearchUtils::parseOrNull( $category, $this->idParser );
			foreach ( $formResults as $formId => $formResult ) {
				$rawResults[$formId] = $formResult + $lexemeData;
				foreach ( $formResult['features'] as $feature ) {
					$entityIds[$feature->getSerialization()] = $feature;
				}
			}
		}

		if ( !$rawResults ) {
			return [];
		}

		$langCode = $this->displayLanguage->getCode();
		// Create prefetched lookup; array_filter() drops IDs that failed to parse.
		$termLookup = $this->termLookupFactory->newLabelDescriptionLookup( $this->displayLanguage,
			array_filter( $entityIds ) );
		$descriptionMaker = new LexemeDescription( $termLookup, $this->idParser,
			$this->displayLanguage );
		// Create full descriptions and instantiate TermSearchResult objects
		return array_map(
			function ( $raw ) use ( $descriptionMaker, $langCode ) {
				return $this->produceTermResult( $descriptionMaker, $langCode, $raw );
			},
			array_slice( $rawResults, 0, $this->limit )
		);
	}

	/**
	 * Produce TermSearchResult from raw result data.
	 *
	 * @param LexemeDescription $descriptionMaker
	 * @param string $langCode Code of the display language
	 * @param array $raw Raw form result merged with the data of its lexeme
	 * @return TermSearchResult
	 */
	private function produceTermResult(
		LexemeDescription $descriptionMaker,
		$langCode,
		array $raw
	) {
		return new TermSearchResult(
			$raw['term'],
			$raw['type'],
			$raw['id'],
			// We are lying somewhat here, as description might be from fallback languages,
			// but I am not sure there's any better way here.
			new Term( $raw['langcode'], $raw['representation'] ),
			new Term( $langCode,
				$descriptionMaker->createFormDescription(
					$raw['lexemeId'], $raw['features'], $raw['lemma'], $raw['lang'],
					$raw['category']
				) )
		);
	}

	/**
	 * @return TermSearchResult[] Empty set of search results
	 */
	public function createEmptyResult() {
		return [];
	}

}