Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
55.32% |
52 / 94 |
|
28.57% |
2 / 7 |
CRAP | |
0.00% |
0 / 1 |
LexemeTermResult | |
55.32% |
52 / 94 |
|
28.57% |
2 / 7 |
35.07 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getSourceFiltering | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
getFields | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getHighlightingConfiguration | |
0.00% |
0 / 32 |
|
0.00% |
0 / 1 |
2 | |||
transformElasticsearchResult | |
93.88% |
46 / 49 |
|
0.00% |
0 / 1 |
8.01 | |||
extractLanguageCode | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
createEmptyResult | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | namespace Wikibase\Lexeme\Search\Elastic; |
3 | |
4 | use CirrusSearch\Search\BaseResultsType; |
5 | use Elastica\ResultSet; |
6 | use Language; |
7 | use Wikibase\DataModel\Entity\EntityIdParser; |
8 | use Wikibase\DataModel\Term\Term; |
9 | use Wikibase\Lexeme\DataAccess\LexemeDescription; |
10 | use Wikibase\Lib\Interactors\TermSearchResult; |
11 | use Wikibase\Lib\Store\FallbackLabelDescriptionLookupFactory; |
12 | use Wikibase\Search\Elastic\EntitySearchUtils; |
13 | |
/**
 * Result type used when searching for Wikibase Lexemes: it tells the search
 * backend which source fields and highlights to return, and converts the
 * Elasticsearch hits into TermSearchResult objects.
 *
 * @license GPL-2.0-or-later
 * @author Stas Malyshev
 */
class LexemeTermResult extends BaseResultsType {

	/**
	 * Parser used to turn page titles and auxiliary ids into EntityId objects.
	 * @var EntityIdParser
	 */
	private $idParser;

	/**
	 * Display language
	 * @var Language
	 */
	private $displayLanguage;

	/**
	 * Factory for the label/description lookup used when building descriptions.
	 * @var FallbackLabelDescriptionLookupFactory
	 */
	private $termLookupFactory;

	/**
	 * @param EntityIdParser $idParser
	 * @param Language $displayLanguage User display language
	 * @param FallbackLabelDescriptionLookupFactory $termLookupFactory
	 *        Lookup factory for assembling descriptions
	 */
	public function __construct(
		EntityIdParser $idParser,
		Language $displayLanguage,
		FallbackLabelDescriptionLookupFactory $termLookupFactory
	) {
		$this->idParser = $idParser;
		$this->displayLanguage = $displayLanguage;
		$this->termLookupFactory = $termLookupFactory;
	}

	/**
	 * Get the source filtering to be used loading the result.
	 *
	 * Extends the parent's list with the lemma, lexeme language and lexical
	 * category fields, which transformElasticsearchResult() reads.
	 *
	 * @return string[]
	 */
	public function getSourceFiltering() {
		$lexemeFields = [
			LemmaField::NAME,
			LexemeLanguageField::NAME,
			LexemeCategoryField::NAME,
		];

		return array_merge( parent::getSourceFiltering(), $lexemeFields );
	}

	/**
	 * Get the fields to load. Most of the time we'll use source filtering instead but
	 * some fields aren't part of the source.
	 *
	 * This result type requests no extra fields.
	 *
	 * @return string[]
	 */
	public function getFields() {
		return [];
	}

	/**
	 * Get the highlighting configuration.
	 *
	 * Highlights are requested for the title, the lemma and the form
	 * representations; transformElasticsearchResult() uses them to decide
	 * which of the three actually matched.
	 *
	 * @param array $highlightSource configuration for how to highlight the source.
	 *  Empty if source should be ignored. Not consulted by this implementation.
	 * @return array|null highlighting configuration for elasticsearch
	 */
	public function getHighlightingConfiguration( array $highlightSource ) {
		return [
			// Empty tags: presumably so that fragments come back without markup.
			'pre_tags' => [ '' ],
			'post_tags' => [ '' ],
			'fields' => [
				'title' => [
					'type' => 'experimental',
					'fragmenter' => "none",
					'number_of_fragments' => 0,
					'matched_fields' => [ 'title.keyword' ]
				],
				"lemma" => [
					'type' => 'experimental',
					'fragmenter' => "none",
					'number_of_fragments' => 0,
					'options' => [
						'skip_if_last_matched' => true,
					],
					'matched_fields' => [ 'lemma.prefix' ]
				],
				"lexeme_forms.representation" => [
					'type' => 'experimental',
					'fragmenter' => "none",
					'number_of_fragments' => 0,
					"matched_fields" => [
						"lexeme_forms.representation.prefix",
					],
					'options' => [
						'skip_if_last_matched' => true,
					],
				],
			],
		];
	}

	/**
	 * Convert search result from ElasticSearch result set to TermSearchResult.
	 *
	 * Hits are skipped when their title cannot be parsed as an entity id, when
	 * no highlight data identifies what matched, or when the source document
	 * lacks the title or lemma needed to build a result.
	 *
	 * @param ResultSet $result
	 * @return TermSearchResult[] Search results keyed by the serialization of the
	 *  matched entity id; empty when nothing usable was found.
	 */
	public function transformElasticsearchResult( ResultSet $result ) {
		$rawResults = [];
		$entityIds = [];

		foreach ( $result->getResults() as $r ) {
			$sourceData = $r->getSource();

			// A document without title or lemma cannot be turned into a result.
			// Reading the missing keys would raise PHP notices and hand null to
			// Term (presumably rejected there), so one bad hit could abort the
			// whole result set. Skip such hits instead.
			if ( !isset( $sourceData['title'] ) || !isset( $sourceData['lemma'][0] ) ) {
				continue;
			}

			$entityId = EntitySearchUtils::parseOrNull( $sourceData['title'], $this->idParser );
			if ( !$entityId ) {
				// Can not parse entity ID - skip it
				continue;
			}

			$lemmaCode = self::extractLanguageCode( $sourceData );

			// Highlight part contains information about what has actually been matched.
			$highlight = $r->getHighlights();
			$hasLemmaMatch = !empty( $highlight['lemma'] );
			$hasFormMatch = !empty( $highlight['lexeme_forms.representation'] );

			if ( !empty( $highlight['title'] ) ) {
				// If we matched title, this means it's a match by ID
				$matchedTermType = 'entityId';
				$matchedTerm = new Term( 'qid', $sourceData['title'] );
			} elseif ( $hasLemmaMatch ) {
				// We matched lemma
				$matchedTermType = 'label';
				$matchedTerm = new Term( $lemmaCode, $highlight['lemma'][0] );
			} elseif ( $hasFormMatch ) {
				// matched one of the forms
				$matchedTermType = 'alias';
				// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset
				$matchedTerm = new Term( $lemmaCode, $highlight['lexeme_forms.representation'][0] );
			} else {
				// Something went wrong, we don't have any highlighting data
				continue;
			}

			// Missing values stay null, as before, but without triggering notices.
			// NOTE(review): LexemeDescription::createDescription() then receives
			// null for them - confirm it copes with that.
			$lang = isset( $sourceData['lexeme_language']['entity'] )
				? $sourceData['lexeme_language']['entity']
				: null;
			$category = isset( $sourceData['lexical_category'] )
				? $sourceData['lexical_category']
				: null;

			// Collect the auxiliary entities whose labels should be prefetched.
			// Only strings are usable as ids and as array keys here.
			foreach ( [ $lang, $category ] as $auxiliaryId ) {
				if ( is_string( $auxiliaryId ) ) {
					$entityIds[$auxiliaryId] =
						EntitySearchUtils::parseOrNull( $auxiliaryId, $this->idParser );
				}
			}

			// Doing two-stage resolution here since we want to prefetch all labels for
			// auxiliary entities before using them to construct descriptions.
			$rawResults[$entityId->getSerialization()] = [
				'id' => $entityId,
				// TODO: this assumes we always take the first lemma. Maybe we should use
				// the shortest language code or something. That would require us to index
				// lemma language codes though.
				'lemma' => $sourceData['lemma'][0],
				'term' => $matchedTerm,
				'type' => $matchedTermType,
				'lang' => $lang,
				'langcode' => $lemmaCode,
				'category' => $category
			];
		}

		if ( !$rawResults ) {
			return [];
		}

		$langCode = $this->displayLanguage->getCode();

		// Create prefetched lookup; array_filter() drops ids that failed to parse.
		$termLookup = $this->termLookupFactory->newLabelDescriptionLookup(
			$this->displayLanguage,
			array_filter( $entityIds )
		);
		$descriptionMaker = new LexemeDescription(
			$termLookup,
			$this->idParser,
			$this->displayLanguage
		);

		// Create full descriptions and instantiate TermSearchResult objects
		return array_map( static function ( $raw ) use ( $descriptionMaker, $langCode ) {
			$description = $descriptionMaker->createDescription(
				$raw['id'],
				$raw['lang'],
				$raw['category']
			);

			return new TermSearchResult(
				$raw['term'],
				$raw['type'],
				$raw['id'],
				new Term( $raw['langcode'], $raw['lemma'] ),
				// We are lying somewhat here, as description might be from fallback languages,
				// but I am not sure there's any better way here.
				new Term( $langCode, $description )
			);
		}, $rawResults );
	}

	/**
	 * @param array $sourceData the source data returned by elastic
	 * @return string the lexeme_language code if set, 'und' otherwise.
	 */
	public static function extractLanguageCode( array $sourceData ) {
		return empty( $sourceData['lexeme_language']['code'] )
			? 'und'
			: $sourceData['lexeme_language']['code'];
	}

	/**
	 * @return TermSearchResult[] Empty set of search results
	 */
	public function createEmptyResult() {
		return [];
	}

}