Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
48.75% |
39 / 80 |
|
14.29% |
1 / 7 |
CRAP | |
0.00% |
0 / 1 |
ElasticTermResult | |
48.75% |
39 / 80 |
|
14.29% |
1 / 7 |
67.59 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getSourceFiltering | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
getFields | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getHighlightingConfiguration | |
0.00% |
0 / 32 |
|
0.00% |
0 / 1 |
6 | |||
transformElasticsearchResult | |
95.45% |
21 / 22 |
|
0.00% |
0 / 1 |
6 | |||
getTermSearchResult | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
extractTermFromHighlight | |
93.75% |
15 / 16 |
|
0.00% |
0 / 1 |
6.01 | |||
createEmptyResult | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace Wikibase\Search\Elastic; |
4 | |
5 | use CirrusSearch\Search\BaseResultsType; |
6 | use Wikibase\DataModel\Term\Term; |
7 | use Wikibase\Lib\Interactors\TermSearchResult; |
8 | use Wikibase\Lib\TermLanguageFallbackChain; |
9 | use Wikibase\Search\Elastic\Fields\DescriptionsField; |
10 | use Wikibase\Search\Elastic\Fields\LabelsField; |
11 | |
/**
 * This result type implements the result for searching
 * an entity by its {@link LabelsField label or alias}
 * (also showing {@link DescriptionsField descriptions}).
 *
 * Fully implemented by {@link EntityElasticTermResult} for Wikibase entities.
 * May also be used by other extensions,
 * provided they use those same fields
 * (via {@link \Wikibase\Search\Elastic\Fields\LabelsProviderFieldDefinitions LabelsProviderFieldDefinitions}
 * and {@link \Wikibase\Search\Elastic\Fields\DescriptionsProviderFieldDefinitions DescriptionsProviderFieldDefinitions}).
 *
 * @license GPL-2.0-or-later
 * @author Stas Malyshev
 */
abstract class ElasticTermResult extends BaseResultsType {

	/**
	 * List of language codes in the search fallback chain, the first
	 * is the preferred language.
	 * @var string[]
	 */
	private array $searchLanguageCodes;

	/**
	 * Display fallback chain.
	 */
	private TermLanguageFallbackChain $termFallbackChain;

	/**
	 * Sub-field of the labels field that is highlighted ('prefix' or 'plain').
	 */
	private string $highlightSubField;

	/**
	 * @param string[] $searchLanguageCodes Language fallback chain for search
	 * @param TermLanguageFallbackChain $displayFallbackChain Fallback chain for display
	 * @param string $highlightSubField 'prefix' or 'plain'
	 */
	public function __construct(
		array $searchLanguageCodes,
		TermLanguageFallbackChain $displayFallbackChain,
		string $highlightSubField = 'prefix'
	) {
		$this->searchLanguageCodes = $searchLanguageCodes;
		$this->termFallbackChain = $displayFallbackChain;
		$this->highlightSubField = $highlightSubField;
	}

	/**
	 * Get the source filtering to be used loading the result.
	 *
	 * Extends the parent's list with the label and description sub-field
	 * of every language code the display fallback chain may fetch.
	 *
	 * @return string[]
	 */
	public function getSourceFiltering() {
		$fields = parent::getSourceFiltering();
		foreach ( $this->termFallbackChain->getFetchLanguageCodes() as $code ) {
			$fields[] = LabelsField::NAME . '.' . $code;
			$fields[] = DescriptionsField::NAME . '.' . $code;
		}
		return $fields;
	}

	/**
	 * Get the fields to load. Most of the time we'll use source filtering instead but
	 * some fields aren't part of the source.
	 *
	 * @return string[] Always empty for this result type.
	 */
	public function getFields() {
		return [];
	}

	/**
	 * Get the highlighting configuration.
	 *
	 * The configuration highlights the title, then the labels sub-field of each
	 * search language (in fallback order), and finally the labels sub-field of
	 * any language via a wildcard entry.
	 *
	 * @param array $highlightSource configuration for how to highlight the source.
	 *  Empty if source should be ignored. Not used by this implementation.
	 * @return array|null highlighting configuration for elasticsearch
	 */
	public function getHighlightingConfiguration( array $highlightSource ) {
		$labelsName = LabelsField::NAME;
		$subField = $this->highlightSubField;

		// Settings shared by every label highlight entry (specific languages and wildcard).
		$labelHighlight = [
			'type' => 'experimental',
			'fragmenter' => 'none',
			'number_of_fragments' => 0,
			'options' => [
				'skip_if_last_matched' => true,
				'return_snippets_and_offsets' => true,
			],
		];

		$fields = [
			'title' => [
				'type' => 'experimental',
				'fragmenter' => 'none',
				'number_of_fragments' => 0,
				'matched_fields' => [ 'title.keyword' ],
			],
		];
		// Insertion order is kept as: title, requested languages, wildcard.
		foreach ( $this->searchLanguageCodes as $code ) {
			$fields["$labelsName.$code.$subField"] = $labelHighlight;
		}
		$fields["$labelsName.*.$subField"] = $labelHighlight;

		return [
			// Empty tags: the highlighted text is returned without markup.
			'pre_tags' => [ '' ],
			'post_tags' => [ '' ],
			'fields' => $fields,
		];
	}

	/**
	 * Convert search result from ElasticSearch result set to TermSearchResult.
	 *
	 * Hits without usable highlighting data, and hits for which
	 * {@link getTermSearchResult} returns null, are skipped.
	 *
	 * @param \Elastica\ResultSet $result
	 * @return TermSearchResult[] Set of search results, keyed by entity ID serialization;
	 *  the types of which vary by implementation.
	 */
	public function transformElasticsearchResult( \Elastica\ResultSet $result ) {
		$results = [];
		foreach ( $result->getResults() as $r ) {
			$sourceData = $r->getSource();

			// Highlight part contains information about what has actually been matched.
			$highlight = $r->getHighlights();
			$displayLabel = EntitySearchUtils::findTermForDisplay(
				$sourceData, LabelsField::NAME, $this->termFallbackChain
			);
			$displayDescription = EntitySearchUtils::findTermForDisplay(
				$sourceData, DescriptionsField::NAME, $this->termFallbackChain
			);

			if ( !empty( $highlight['title'] ) ) {
				// If we matched title, this means it's a match by ID
				$matchedTermType = 'entityId';
				$matchedTerm = new Term( 'qid', $sourceData['title'] );
			} elseif ( !$highlight ) {
				// Something went wrong, we don't have any highlighting data
				continue;
			} else {
				$extracted = $this->extractTermFromHighlight( $highlight, $sourceData );
				if ( $extracted === null ) {
					// Highlighting data is present but malformed; treat it like missing data.
					continue;
				}
				[ $matchedTermType, $langCode, $term ] = $extracted;
				$matchedTerm = new Term( $langCode, $term );
			}

			if ( !$displayLabel ) {
				// This should not happen, but just in case, it's better to return something
				$displayLabel = $matchedTerm;
			}

			$termSearchResult = $this->getTermSearchResult(
				$sourceData, $matchedTerm, $matchedTermType, $displayLabel, $displayDescription
			);
			if ( $termSearchResult !== null ) {
				$results[$termSearchResult->getEntityIdSerialization()] = $termSearchResult;
			}
		}

		return $results;
	}

	/**
	 * Turn the given result data into a {@link TermSearchResult}
	 * (or skip this result if null is returned).
	 *
	 * @param array $sourceData Data from _source of the hit
	 * @param Term $matchedTerm Term (or entity ID pseudo-term) that was matched
	 * @param string $matchedTermType One of 'label', 'alias' or 'entityId'
	 * @param Term|null $displayLabel Label to display
	 * @param Term|null $displayDescription Description to display
	 * @return TermSearchResult|null
	 */
	abstract protected function getTermSearchResult(
		array $sourceData,
		Term $matchedTerm,
		string $matchedTermType,
		?Term $displayLabel,
		?Term $displayDescription
	): ?TermSearchResult;

	/**
	 * New highlighter pattern.
	 * The new highlighter can return offsets as: 1:1-XX:YY|Text Snippet
	 * or even SNIPPET_START:MATCH1_START-MATCH1_END,MATCH2_START-MATCH2_END,...:SNIPPET_END|Text
	 */
	public const HIGHLIGHT_PATTERN = '/^\d+:\d+-\d+(?:,\d+-\d+)*:\d+\|(.+)/';

	/**
	 * Extract term, language and type from highlighter results.
	 *
	 * Only the first fragment of the first highlighted field is considered.
	 *
	 * @param array $highlight Data from highlighter
	 * @param array[] $sourceData Data from _source
	 * @return array|null Array of: [string $termType, string $languageCode, string $term],
	 *  or null if the first highlight entry does not contain a string fragment.
	 */
	private function extractTermFromHighlight( array $highlight, array $sourceData ): ?array {
		/**
		 * Highlighter returns:
		 * {
		 *   labels.en.prefix: [
		 *     "metre" // or "0:0-5:5|metre"
		 *   ]
		 * }
		 */
		$fragments = reset( $highlight ); // Take the first one
		$field = (string)key( $highlight );
		if ( !is_array( $fragments ) || !is_string( $fragments[0] ?? null ) ) {
			return null;
		}
		$term = $fragments[0]; // Highlighter returns array

		// Quote both dynamic parts for the '/' delimiter so they always match literally.
		$fieldPattern = '/^' . preg_quote( LabelsField::NAME, '/' )
			. '\.([^.]+)\.'
			. preg_quote( $this->highlightSubField, '/' ) . '$/';

		if ( !preg_match( $fieldPattern, $field, $match ) ) {
			// This is weird since we didn't ask to match anything else,
			// but we'll return it anyway for debugging.
			return [ 'label', 'unknown', $term ];
		}

		$langCode = $match[1];
		if ( preg_match( self::HIGHLIGHT_PATTERN, $term, $termMatch ) ) {
			// The fragment starts with the snippet start offset; 0 means first value in the field.
			$isFirst = ( $term[0] === '0' );
			$term = $termMatch[1];
		} else {
			$isFirst = true;
		}

		if ( !empty( $sourceData[LabelsField::NAME][$langCode] ) ) {
			// Here we have match in one of the languages we asked for.
			// Primary label always comes first, so if it's not the first one,
			// it's an alias.
			$isLabel = ( $sourceData[LabelsField::NAME][$langCode][0] === $term );
		} else {
			// Here we have match in one of the "other" languages.
			// If it's the first one in the list, it's label, otherwise it is alias.
			$isLabel = $isFirst;
		}

		return [ $isLabel ? 'label' : 'alias', $langCode, $term ];
	}

	/**
	 * @return TermSearchResult[] Empty set of search results
	 */
	public function createEmptyResult() {
		return [];
	}

}