Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
48.15% |
39 / 81 |
|
14.29% |
1 / 7 |
CRAP | |
0.00% |
0 / 1 |
ElasticTermResult | |
48.15% |
39 / 81 |
|
14.29% |
1 / 7 |
69.33 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getSourceFiltering | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
getFields | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getHighlightingConfiguration | |
0.00% |
0 / 31 |
|
0.00% |
0 / 1 |
6 | |||
transformElasticsearchResult | |
87.50% |
21 / 24 |
|
0.00% |
0 / 1 |
6.07 | |||
extractTermFromHighlight | |
93.75% |
15 / 16 |
|
0.00% |
0 / 1 |
6.01 | |||
createEmptyResult | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace Wikibase\Search\Elastic; |
4 | |
5 | use CirrusSearch\Search\BaseResultsType; |
6 | use Wikibase\DataModel\Entity\EntityIdParser; |
7 | use Wikibase\DataModel\Entity\EntityIdParsingException; |
8 | use Wikibase\DataModel\Term\Term; |
9 | use Wikibase\Lib\Interactors\TermSearchResult; |
10 | use Wikibase\Lib\TermLanguageFallbackChain; |
11 | |
/**
 * Result type used when searching for a Wikibase entity by its label or alias.
 *
 * It declares which source fields and highlights should be requested and
 * converts the returned hits into TermSearchResult objects.
 *
 * @license GPL-2.0-or-later
 * @author Stas Malyshev
 */
class ElasticTermResult extends BaseResultsType {

	/**
	 * New highlighter pattern.
	 * The new highlighter can return offsets as: 1:1-XX:YY|Text Snippet
	 * or even SNIPPET_START:MATCH1_START-MATCH1_END,MATCH2_START-MATCH2_END,...:SNIPPET_END|Text
	 */
	public const HIGHLIGHT_PATTERN = '/^\d+:\d+-\d+(?:,\d+-\d+)*:\d+\|(.+)/';

	/**
	 * Parser used to turn the "title" of a hit into an entity ID.
	 * @var EntityIdParser
	 */
	private $idParser;

	/**
	 * List of language codes in the search fallback chain, the first
	 * is the preferred language.
	 * @var string[]
	 */
	private $searchLanguageCodes;

	/**
	 * Display fallback chain.
	 * @var TermLanguageFallbackChain
	 */
	private $termFallbackChain;

	/**
	 * @param EntityIdParser $idParser
	 * @param string[] $searchLanguageCodes Language fallback chain for search
	 * @param TermLanguageFallbackChain $displayFallbackChain Fallback chain for display
	 */
	public function __construct(
		EntityIdParser $idParser,
		array $searchLanguageCodes,
		TermLanguageFallbackChain $displayFallbackChain
	) {
		$this->idParser = $idParser;
		$this->searchLanguageCodes = $searchLanguageCodes;
		$this->termFallbackChain = $displayFallbackChain;
	}

	/**
	 * Get the source filtering to be used loading the result.
	 *
	 * Extends the parent's list with the label and description fields of
	 * every language code the display fallback chain asks to fetch.
	 *
	 * @return string[]
	 */
	public function getSourceFiltering() {
		$fields = parent::getSourceFiltering();
		foreach ( $this->termFallbackChain->getFetchLanguageCodes() as $code ) {
			$fields[] = "labels.$code";
			$fields[] = "descriptions.$code";
		}
		return $fields;
	}

	/**
	 * Get the fields to load. Most of the time we'll use source filtering instead but
	 * some fields aren't part of the source.
	 *
	 * @return string[] Always empty for this result type.
	 */
	public function getFields() {
		return [];
	}

	/**
	 * Get the highlighting configuration.
	 *
	 * Fields are listed in this order: title, then one prefix field per search
	 * language (in fallback-chain order), then the catch-all "labels.*.prefix".
	 *
	 * @param array $highlightSource configuration for how to highlight the source.
	 *  Empty if source should be ignored. Not used by this implementation.
	 * @return array|null highlighting configuration for elasticsearch
	 */
	public function getHighlightingConfiguration( array $highlightSource ) {
		// Every label prefix field shares the same highlighter settings.
		$labelPrefixHighlight = [
			'type' => 'experimental',
			'fragmenter' => "none",
			'number_of_fragments' => 0,
			'options' => [
				'skip_if_last_matched' => true,
				'return_snippets_and_offsets' => true
			],
		];

		$fields = [
			'title' => [
				'type' => 'experimental',
				'fragmenter' => "none",
				'number_of_fragments' => 0,
				'matched_fields' => [ 'title.keyword' ]
			],
		];
		foreach ( $this->searchLanguageCodes as $code ) {
			$fields["labels.$code.prefix"] = $labelPrefixHighlight;
		}
		$fields['labels.*.prefix'] = $labelPrefixHighlight;

		return [
			'pre_tags' => [ '' ],
			'post_tags' => [ '' ],
			'fields' => $fields,
		];
	}

	/**
	 * Convert search result from ElasticSearch result set to TermSearchResult.
	 *
	 * Hits are skipped when they have no usable title, when the title cannot be
	 * parsed as an entity ID, or when they carry no usable highlighting data.
	 *
	 * @param \Elastica\ResultSet $result
	 * @return TermSearchResult[] Search results keyed by entity ID serialization.
	 */
	public function transformElasticsearchResult( \Elastica\ResultSet $result ) {
		$results = [];
		foreach ( $result->getResults() as $r ) {
			$sourceData = $r->getSource();

			// A hit without a string title cannot be mapped to an entity - skip it
			$title = $sourceData['title'] ?? null;
			if ( !is_string( $title ) ) {
				continue;
			}
			try {
				$entityId = $this->idParser->parse( $title );
			} catch ( EntityIdParsingException $e ) {
				// Can not parse entity ID - skip it
				continue;
			}

			// Highlight part contains information about what has actually been matched.
			$highlight = $r->getHighlights();
			$displayLabel = EntitySearchUtils::findTermForDisplay( $sourceData, 'labels', $this->termFallbackChain );
			$displayDescription = EntitySearchUtils::findTermForDisplay( $sourceData, 'descriptions', $this->termFallbackChain );

			if ( !empty( $highlight['title'] ) ) {
				// If we matched title, this means it's a match by ID
				$matchedTermType = 'entityId';
				$matchedTerm = new Term( 'qid', $title );
			} elseif ( !$highlight ) {
				// Something went wrong, we don't have any highlighting data
				continue;
			} else {
				$extracted = $this->extractTermFromHighlight( $highlight, $sourceData );
				if ( $extracted === null ) {
					// Highlighting data is present but has no usable fragment
					continue;
				}
				[ $matchedTermType, $langCode, $term ] = $extracted;
				$matchedTerm = new Term( $langCode, $term );
			}

			if ( !$displayLabel ) {
				// This should not happen, but just in case, it's better to return something
				$displayLabel = $matchedTerm;
			}

			$results[$entityId->getSerialization()] = new TermSearchResult(
				$matchedTerm, $matchedTermType, $entityId, $displayLabel,
				$displayDescription
			);
		}

		return $results;
	}

	/**
	 * Extract term, language and type from highlighter results.
	 *
	 * Only the first highlighted field and its first fragment are considered.
	 *
	 * @param array $highlight Data from highlighter
	 * @param array[] $sourceData Data from _source
	 * @return array|null Array of: [string $termType, string $languageCode, string $term],
	 *  or null when the first highlighted field has no string fragment.
	 */
	private function extractTermFromHighlight( array $highlight, array $sourceData ) {
		/**
		 * Highlighter returns:
		 * {
		 *   labels.en.prefix: [
		 *     "metre" // or "0:0-5:5|metre"
		 *   ]
		 * }
		 */
		$fragments = reset( $highlight ); // Take the first one
		$field = key( $highlight );
		// Highlighter returns array; guard against an empty or malformed one
		$term = is_array( $fragments ) ? ( $fragments[0] ?? null ) : null;
		if ( !is_string( $term ) ) {
			return null;
		}

		if ( !preg_match( '/^labels\.([^.]+)\.prefix$/', $field, $match ) ) {
			// This is weird since we didn't ask to match anything else,
			// but we'll return it anyway for debugging.
			return [ 'label', 'unknown', $term ];
		}
		$langCode = $match[1];

		// Without offset information the fragment is treated as the first value.
		$isFirst = true;
		if ( preg_match( self::HIGHLIGHT_PATTERN, $term, $termMatch ) ) {
			// Snippet start offset begins with "0" only for the first value
			$isFirst = ( $term[0] === '0' );
			$term = $termMatch[1];
		}

		if ( !empty( $sourceData['labels'][$langCode] ) ) {
			// Here we have match in one of the languages we asked for.
			// Primary label always comes first, so if it's not the first one,
			// it's an alias.
			$primaryLabel = $sourceData['labels'][$langCode][0] ?? null;
			$matchedTermType = ( $primaryLabel !== $term ) ? 'alias' : 'label';
		} else {
			// Here we have match in one of the "other" languages.
			// If it's the first one in the list, it's label, otherwise it is alias.
			$matchedTermType = $isFirst ? 'label' : 'alias';
		}

		return [ $matchedTermType, $langCode, $term ];
	}

	/**
	 * @return TermSearchResult[] Empty set of search results
	 */
	public function createEmptyResult() {
		return [];
	}

}