Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
45.00% covered (danger)
45.00%
36 / 80
14.29% covered (danger)
14.29%
1 / 7
CRAP
0.00% covered (danger)
0.00%
0 / 1
ElasticTermResult
45.00% covered (danger)
45.00%
36 / 80
14.29% covered (danger)
14.29%
1 / 7
86.55
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getSourceFiltering
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
6
 getFields
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getHighlightingConfiguration
0.00% covered (danger)
0.00%
0 / 35
0.00% covered (danger)
0.00%
0 / 1
12
 transformElasticsearchResult
95.45% covered (success)
95.45%
21 / 22
0.00% covered (danger)
0.00%
0 / 1
6
 getTermSearchResult
n/a
0 / 0
n/a
0 / 0
0
 extractTermFromHighlight
93.33% covered (success)
93.33%
14 / 15
0.00% covered (danger)
0.00%
0 / 1
6.01
 createEmptyResult
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace Wikibase\Search\Elastic;
4
5use CirrusSearch\Search\BaseResultsType;
6use Wikibase\DataModel\Term\Term;
7use Wikibase\Lib\Interactors\TermSearchResult;
8use Wikibase\Lib\TermLanguageFallbackChain;
9use Wikibase\Search\Elastic\Fields\DescriptionsField;
10use Wikibase\Search\Elastic\Fields\LabelsField;
11
12/**
13 * This result type implements the result for searching
14 * an entity by its {@link LabelsField label or alias}
15 * (also showing {@link DescriptionsField descriptions}).
16 *
17 * Fully implemented by {@link EntityElasticTermResult} for Wikibase entities.
18 * May also be used by other extensions,
19 * provided they use those same fields
20 * (via {@link \Wikibase\Search\Elastic\Fields\LabelsProviderFieldDefinitions LabelsProviderFieldDefinitions}
21 * and {@link \Wikibase\Search\Elastic\Fields\DescriptionsProviderFieldDefinitions DescriptionsProviderFieldDefinitions}).
22 *
23 * @license GPL-2.0-or-later
24 * @author Stas Malyshev
25 */
abstract class ElasticTermResult extends BaseResultsType {

    /**
     * @param string[] $searchLanguageCodes List of language codes in the search fallback chain, the
     *  first is the preferred language.
     * @param TermLanguageFallbackChain $termFallbackChain Fallback chain for display
     * @param string $highlightSubField 'prefix' or 'plain'
     */
    public function __construct(
        private readonly array $searchLanguageCodes,
        private readonly TermLanguageFallbackChain $termFallbackChain,
        private readonly string $highlightSubField = 'prefix',
    ) {
    }

    /**
     * Get the source filtering to be used loading the result.
     *
     * Extends the parent's list with the label and description sub-fields of
     * every language code the display fallback chain asks to fetch.
     *
     * @return string[]
     */
    public function getSourceFiltering() {
        $fields = parent::getSourceFiltering();
        foreach ( $this->termFallbackChain->getFetchLanguageCodes() as $code ) {
            $fields[] = LabelsField::NAME . '.' . $code;
            $fields[] = DescriptionsField::NAME . '.' . $code;
        }
        return $fields;
    }

    /**
     * Get the fields to load.  Most of the time we'll use source filtering instead but
     * some fields aren't part of the source.
     *
     * @return string[] Always empty for this result type.
     */
    public function getFields() {
        return [];
    }

    /**
     * Get the highlighting configuration.
     *
     * The returned configuration highlights, in this order: the title, the
     * label sub-field of each search language, and finally a wildcard entry
     * covering the label sub-field of all languages.
     *
     * @param array $highlightSource configuration for how to highlight the source.
     *  Empty if source should be ignored. Not used by this implementation.
     * @return array|null highlighting configuration for elasticsearch
     */
    public function getHighlightingConfiguration( array $highlightSource ) {
        $labelsName = LabelsField::NAME;
        $subField = $this->highlightSubField;

        // Every label field (per-language and wildcard) shares the same settings.
        $labelFieldConfig = [
            'type' => 'experimental',
            'fragmenter' => 'none',
            'order' => $subField === 'plain' ? 'score' : 'none',
            'number_of_fragments' => 0,
            'options' => [
                'skip_if_last_matched' => true,
                'return_snippets_and_offsets' => true,
            ],
        ];

        $fields = [
            'title' => [
                'type' => 'experimental',
                'fragmenter' => 'none',
                'number_of_fragments' => 0,
                'matched_fields' => [ 'title.keyword' ],
            ],
        ];
        foreach ( $this->searchLanguageCodes as $code ) {
            $fields["$labelsName.$code.$subField"] = $labelFieldConfig;
        }
        $fields["$labelsName.*.$subField"] = $labelFieldConfig;

        return [
            'pre_tags' => [ '' ],
            'post_tags' => [ '' ],
            'fields' => $fields,
        ];
    }

    /**
     * Convert search result from ElasticSearch result set to TermSearchResult.
     *
     * Results without any highlighting data are skipped, as are results for
     * which {@link getTermSearchResult} returns null. The returned array is
     * keyed by the entity ID serialization of each result, so a later result
     * with the same ID replaces an earlier one.
     *
     * @param \Elastica\ResultSet $result
     * @return TermSearchResult[] Set of search results, the types of which vary by implementation.
     */
    public function transformElasticsearchResult( \Elastica\ResultSet $result ) {
        $results = [];
        foreach ( $result->getResults() as $r ) {
            $sourceData = $r->getSource();

            // Highlight part contains information about what has actually been matched.
            $highlight = $r->getHighlights();
            $displayLabel = EntitySearchUtils::findTermForDisplay(
                $sourceData, LabelsField::NAME, $this->termFallbackChain
            );
            $displayDescription = EntitySearchUtils::findTermForDisplay(
                $sourceData, DescriptionsField::NAME, $this->termFallbackChain
            );

            if ( !empty( $highlight['title'] ) ) {
                // If we matched title, this means it's a match by ID
                $matchedTermType = 'entityId';
                $matchedTerm = new Term( 'qid', $sourceData['title'] );
            } elseif ( !$highlight ) {
                // Something went wrong, we don't have any highlighting data
                continue;
            } else {
                [ $matchedTermType, $langCode, $term ] =
                    $this->extractTermFromHighlight( $highlight, $sourceData );
                $matchedTerm = new Term( $langCode, $term );
            }

            // This should not happen, but just in case, it's better to return something
            $displayLabel = $displayLabel ?: $matchedTerm;

            $termSearchResult = $this->getTermSearchResult(
                $sourceData, $matchedTerm, $matchedTermType, $displayLabel, $displayDescription
            );
            if ( $termSearchResult !== null ) {
                $results[$termSearchResult->getEntityIdSerialization()] = $termSearchResult;
            }
        }

        return $results;
    }

    /**
     * Turn the given result data into a {@link TermSearchResult}
     * (or skip this result if null is returned).
     */
    abstract protected function getTermSearchResult(
        array $sourceData,
        Term $matchedTerm,
        string $matchedTermType,
        ?Term $displayLabel,
        ?Term $displayDescription
    ): ?TermSearchResult;

    /**
     * New highlighter pattern.
     * The new highlighter can return offsets as: 1:1-XX:YY|Text Snippet
     * or even SNIPPET_START:MATCH1_START-MATCH1_END,MATCH2_START-MATCH2_END,...:SNIPPET_END|Text
     *
     * Capture group 1 is the text following the "|" separator.
     */
    public const HIGHLIGHT_PATTERN = '/^\d+:\d+-\d+(?:,\d+-\d+)*:\d+\|(.+)/';

    /**
     * Extract term, language and type from highlighter results.
     *
     * Only the first highlighted field, and the first fragment of that field,
     * is looked at.
     *
     * @param array<string,string[]> $highlight Data from highlighter
     * @param array[] $sourceData Data from _source
     * @return array Array of: [string $termType, string $languageCode, string $term]
     */
    private function extractTermFromHighlight( array $highlight, array $sourceData ): array {
        /**
         * Highlighter returns:
         * {
         *   labels.en.prefix: [
         *      "metre"  // or "0:0-5:5|metre"
         *   ]
         * }
         */
        $field = array_key_first( $highlight );
        $term = $highlight[$field][0];

        // Quote both dynamic parts with the delimiter so that neither the field
        // name nor the configured sub-field can alter the pattern.
        $fieldPattern = '/^' . preg_quote( LabelsField::NAME, '/' )
            . '\.([^.]+)\.'
            . preg_quote( $this->highlightSubField, '/' ) . '$/';

        if ( !preg_match( $fieldPattern, $field, $match ) ) {
            // This is weird since we didn't ask to match anything else,
            // but we'll return it anyway for debugging.
            return [ 'label', 'unknown', $term ];
        }

        $langCode = $match[1];
        $isFirst = true;
        if ( preg_match( self::HIGHLIGHT_PATTERN, $term, $termMatch ) ) {
            // A leading "0" means the snippet start offset is zero; this is
            // used below as the sign that the match is the first value listed.
            $isFirst = ( $term[0] === '0' );
            $term = $termMatch[1];
        }

        if ( !empty( $sourceData[LabelsField::NAME][$langCode] ) ) {
            // Here we have match in one of the languages we asked for.
            // Primary label always comes first, so if it's not the first one,
            // it's an alias.
            $isLabel = $sourceData[LabelsField::NAME][$langCode][0] === $term;
        } else {
            // Here we have match in one of the "other" languages.
            // If it's the first one in the list, it's label, otherwise it is alias.
            $isLabel = $isFirst;
        }

        return [ $isLabel ? 'label' : 'alias', $langCode, $term ];
    }

    /**
     * @return TermSearchResult[] Empty set of search results
     */
    public function createEmptyResult() {
        return [];
    }

}