Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
46.99% covered (danger)
46.99%
39 / 83
14.29% covered (danger)
14.29%
1 / 7
CRAP
0.00% covered (danger)
0.00%
0 / 1
ElasticTermResult
46.99% covered (danger)
46.99%
39 / 83
14.29% covered (danger)
14.29%
1 / 7
79.59
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 getSourceFiltering
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
6
 getFields
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getHighlightingConfiguration
0.00% covered (danger)
0.00%
0 / 35
0.00% covered (danger)
0.00%
0 / 1
12
 transformElasticsearchResult
95.45% covered (success)
95.45%
21 / 22
0.00% covered (danger)
0.00%
0 / 1
6
 getTermSearchResult
n/a
0 / 0
n/a
0 / 0
0
 extractTermFromHighlight
93.75% covered (success)
93.75%
15 / 16
0.00% covered (danger)
0.00%
0 / 1
6.01
 createEmptyResult
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace Wikibase\Search\Elastic;
4
5use CirrusSearch\Search\BaseResultsType;
6use Wikibase\DataModel\Term\Term;
7use Wikibase\Lib\Interactors\TermSearchResult;
8use Wikibase\Lib\TermLanguageFallbackChain;
9use Wikibase\Search\Elastic\Fields\DescriptionsField;
10use Wikibase\Search\Elastic\Fields\LabelsField;
11
12/**
13 * This result type implements the result for searching
14 * an entity by its {@link LabelsField label or alias}
15 * (also showing {@link DescriptionsField descriptions}).
16 *
17 * Fully implemented by {@link EntityElasticTermResult} for Wikibase entities.
18 * May also be used by other extensions,
19 * provided they use those same fields
20 * (via {@link \Wikibase\Search\Elastic\Fields\LabelsProviderFieldDefinitions LabelsProviderFieldDefinitions}
21 * and {@link \Wikibase\Search\Elastic\Fields\DescriptionsProviderFieldDefinitions DescriptionsProviderFieldDefinitions}).
22 *
23 * @license GPL-2.0-or-later
24 * @author Stas Malyshev
25 */
abstract class ElasticTermResult extends BaseResultsType {

    /**
     * List of language codes in the search fallback chain, the first
     * is the preferred language.
     * @var string[]
     */
    private array $searchLanguageCodes;

    /**
     * Display fallback chain.
     */
    private TermLanguageFallbackChain $termFallbackChain;

    /**
     * Sub-field of the labels field used for highlighting: 'prefix' or 'plain'.
     */
    private string $highlightSubField;

    /**
     * @param string[] $searchLanguageCodes Language fallback chain for search
     * @param TermLanguageFallbackChain $displayFallbackChain Fallback chain for display
     * @param string $highlightSubField 'prefix' or 'plain'
     */
    public function __construct(
        array $searchLanguageCodes,
        TermLanguageFallbackChain $displayFallbackChain,
        string $highlightSubField = 'prefix'
    ) {
        $this->searchLanguageCodes = $searchLanguageCodes;
        $this->termFallbackChain = $displayFallbackChain;
        $this->highlightSubField = $highlightSubField;
    }

    /**
     * Get the source filtering to be used loading the result.
     *
     * Adds the label and description of every language in the display
     * fallback chain to the fields requested by the parent class.
     *
     * @return string[]
     */
    public function getSourceFiltering() {
        $fields = parent::getSourceFiltering();
        foreach ( $this->termFallbackChain->getFetchLanguageCodes() as $code ) {
            $fields[] = LabelsField::NAME . '.' . $code;
            $fields[] = DescriptionsField::NAME . '.' . $code;
        }
        return $fields;
    }

    /**
     * Get the fields to load.  Most of the time we'll use source filtering instead but
     * some fields aren't part of the source.
     *
     * @return string[]
     */
    public function getFields() {
        return [];
    }

    /**
     * Get the highlighting configuration.
     *
     * The title is highlighted first, then the labels sub-field of each search
     * language in fallback order, and finally the same sub-field of every
     * language via a wildcard. The insertion order of the fields is preserved
     * on purpose, because the labels fields use 'skip_if_last_matched'.
     *
     * @param array $highlightSource configuration for how to highlight the source.
     *  Empty if source should be ignored.
     * @return array|null highlighting configuration for elasticsearch
     */
    public function getHighlightingConfiguration( array $highlightSource ) {
        // Shared by every labels field, both the per-language ones and the wildcard.
        $labelsFieldConfig = [
            'type' => 'experimental',
            'fragmenter' => 'none',
            'order' => $this->highlightSubField === 'plain' ? 'score' : 'none',
            'number_of_fragments' => 0,
            'options' => [
                'skip_if_last_matched' => true,
                'return_snippets_and_offsets' => true,
            ],
        ];

        $fields = [
            'title' => [
                'type' => 'experimental',
                'fragmenter' => 'none',
                'number_of_fragments' => 0,
                'matched_fields' => [ 'title.keyword' ],
            ],
        ];
        $labelsName = LabelsField::NAME;
        foreach ( $this->searchLanguageCodes as $code ) {
            $fields["$labelsName.$code.{$this->highlightSubField}"] = $labelsFieldConfig;
        }
        $fields["$labelsName.*.{$this->highlightSubField}"] = $labelsFieldConfig;

        return [
            'pre_tags' => [ '' ],
            'post_tags' => [ '' ],
            'fields' => $fields,
        ];
    }

    /**
     * Convert search result from ElasticSearch result set to TermSearchResult.
     *
     * Hits without usable highlighting data are skipped, as are hits for which
     * {@link getTermSearchResult} returns null. Results are keyed by the entity ID
     * serialization, so a later hit for the same entity replaces an earlier one.
     *
     * @param \Elastica\ResultSet $result
     * @return TermSearchResult[] Set of search results, the types of which vary by implementation.
     */
    public function transformElasticsearchResult( \Elastica\ResultSet $result ) {
        $results = [];
        foreach ( $result->getResults() as $r ) {
            // Highlight part contains information about what has actually been matched.
            $highlight = $r->getHighlights();
            if ( !$highlight ) {
                // Something went wrong, we don't have any highlighting data
                continue;
            }
            $sourceData = $r->getSource();

            if ( !empty( $highlight['title'] ) ) {
                // If we matched title, this means it's a match by ID
                $matchedTermType = 'entityId';
                $matchedTerm = new Term( 'qid', $sourceData['title'] );
            } else {
                $extracted = $this->extractTermFromHighlight( $highlight, $sourceData );
                if ( $extracted === null ) {
                    // The first highlighted field carries no usable snippet
                    continue;
                }
                [ $matchedTermType, $langCode, $term ] = $extracted;
                $matchedTerm = new Term( $langCode, $term );
            }

            $displayLabel = EntitySearchUtils::findTermForDisplay(
                $sourceData, LabelsField::NAME, $this->termFallbackChain
            );
            $displayDescription = EntitySearchUtils::findTermForDisplay(
                $sourceData, DescriptionsField::NAME, $this->termFallbackChain
            );
            if ( !$displayLabel ) {
                // This should not happen, but just in case, it's better to return something
                $displayLabel = $matchedTerm;
            }

            $termSearchResult = $this->getTermSearchResult(
                $sourceData, $matchedTerm, $matchedTermType, $displayLabel, $displayDescription
            );
            if ( $termSearchResult !== null ) {
                $results[$termSearchResult->getEntityIdSerialization()] = $termSearchResult;
            }
        }

        return $results;
    }

    /**
     * Turn the given result data into a {@link TermSearchResult}
     * (or skip this result if null is returned).
     *
     * @param array $sourceData Data from _source of the hit
     * @param Term $matchedTerm The term that was matched
     * @param string $matchedTermType 'entityId', 'label' or 'alias'
     * @param Term|null $displayLabel Label to display
     * @param Term|null $displayDescription Description to display
     * @return TermSearchResult|null
     */
    abstract protected function getTermSearchResult(
        array $sourceData,
        Term $matchedTerm,
        string $matchedTermType,
        ?Term $displayLabel,
        ?Term $displayDescription
    ): ?TermSearchResult;

    /**
     * New highlighter pattern.
     * The new highlighter can return offsets as: 1:1-XX:YY|Text Snippet
     * or even SNIPPET_START:MATCH1_START-MATCH1_END,MATCH2_START-MATCH2_END,...:SNIPPET_END|Text
     */
    public const HIGHLIGHT_PATTERN = '/^\d+:\d+-\d+(?:,\d+-\d+)*:\d+\|(.+)/';

    /**
     * Extract term, language and type from highlighter results.
     *
     * Only the first highlighted field and its first snippet are considered.
     *
     * @param array $highlight Data from highlighter
     * @param array[] $sourceData Data from _source
     * @return array|null Array of: [string $termType, string $languageCode, string $term],
     *  or null if the first highlighted field does not contain a snippet.
     */
    private function extractTermFromHighlight( array $highlight, array $sourceData ): ?array {
        /**
         * Highlighter returns:
         * {
         *   labels.en.prefix: [
         *      "metre"  // or "0:0-5:5|metre"
         *   ]
         * }
         */
        $snippets = reset( $highlight ); // Take the first one
        $field = key( $highlight );
        // Highlighter returns array; guard against an empty or malformed entry
        $term = is_array( $snippets ) ? ( $snippets[0] ?? null ) : null;
        if ( !is_string( $term ) ) {
            return null;
        }

        // Quote both dynamic parts for the '/' delimiter so neither can alter the pattern.
        $fieldPattern = '/^' . preg_quote( LabelsField::NAME, '/' ) . '\.([^.]+)\.'
            . preg_quote( $this->highlightSubField, '/' ) . '$/';
        if ( !preg_match( $fieldPattern, (string)$field, $match ) ) {
            // This is weird since we didn't ask to match anything else,
            // but we'll return it anyway for debugging.
            return [ 'label', 'unknown', $term ];
        }

        $langCode = $match[1];
        if ( preg_match( self::HIGHLIGHT_PATTERN, $term, $termMatch ) ) {
            // The raw snippet starts with its start offset; a leading '0' is
            // taken to mean it is the first value of the field.
            $isFirst = ( $term[0] === '0' );
            $term = $termMatch[1];
        } else {
            $isFirst = true;
        }

        if ( !empty( $sourceData[LabelsField::NAME][$langCode] ) ) {
            // Here we have match in one of the languages we asked for.
            // Primary label always comes first, so if it's not the first one,
            // it's an alias.
            $matchedTermType =
                $sourceData[LabelsField::NAME][$langCode][0] !== $term ? 'alias' : 'label';
        } else {
            // Here we have match in one of the "other" languages.
            // If it's the first one in the list, it's label, otherwise it is alias.
            $matchedTermType = $isFirst ? 'label' : 'alias';
        }

        return [ $matchedTermType, $langCode, $term ];
    }

    /**
     * @return TermSearchResult[] Empty set of search results
     */
    public function createEmptyResult() {
        return [];
    }

}