Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
48.75% covered (danger)
48.75%
39 / 80
14.29% covered (danger)
14.29%
1 / 7
CRAP
0.00% covered (danger)
0.00%
0 / 1
ElasticTermResult
48.75% covered (danger)
48.75%
39 / 80
14.29% covered (danger)
14.29%
1 / 7
67.59
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 getSourceFiltering
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
6
 getFields
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getHighlightingConfiguration
0.00% covered (danger)
0.00%
0 / 32
0.00% covered (danger)
0.00%
0 / 1
6
 transformElasticsearchResult
95.45% covered (success)
95.45%
21 / 22
0.00% covered (danger)
0.00%
0 / 1
6
 getTermSearchResult
n/a
0 / 0
n/a
0 / 0
0
 extractTermFromHighlight
93.75% covered (success)
93.75%
15 / 16
0.00% covered (danger)
0.00%
0 / 1
6.01
 createEmptyResult
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace Wikibase\Search\Elastic;
4
5use CirrusSearch\Search\BaseResultsType;
6use Wikibase\DataModel\Term\Term;
7use Wikibase\Lib\Interactors\TermSearchResult;
8use Wikibase\Lib\TermLanguageFallbackChain;
9use Wikibase\Search\Elastic\Fields\DescriptionsField;
10use Wikibase\Search\Elastic\Fields\LabelsField;
11
12/**
13 * This result type implements the result for searching
14 * an entity by its {@link LabelsField label or alias}
15 * (also showing {@link DescriptionsField descriptions}).
16 *
17 * Fully implemented by {@link EntityElasticTermResult} for Wikibase entities.
18 * May also be used by other extensions,
19 * provided they use those same fields
20 * (via {@link \Wikibase\Search\Elastic\Fields\LabelsProviderFieldDefinitions LabelsProviderFieldDefinitions}
21 * and {@link \Wikibase\Search\Elastic\Fields\DescriptionsProviderFieldDefinitions DescriptionsProviderFieldDefinitions}).
22 *
23 * @license GPL-2.0-or-later
24 * @author Stas Malyshev
25 */
abstract class ElasticTermResult extends BaseResultsType {

    /**
     * Pattern for highlighter output that carries offsets.
     * The highlighter can return offsets as: 1:1-XX:YY|Text Snippet
     * or even SNIPPET_START:MATCH1_START-MATCH1_END,MATCH2_START-MATCH2_END,...:SNIPPET_END|Text
     * The first capture group is the text following the "|".
     */
    public const HIGHLIGHT_PATTERN = '/^\d+:\d+-\d+(?:,\d+-\d+)*:\d+\|(.+)/';

    /**
     * List of language codes in the search fallback chain, the first
     * is the preferred language.
     * @var string[]
     */
    private array $searchLanguageCodes;

    /**
     * Display fallback chain.
     */
    private TermLanguageFallbackChain $termFallbackChain;

    /**
     * Sub-field of the labels field that is highlighted ('prefix' or 'plain').
     */
    private string $highlightSubField;

    /**
     * @param string[] $searchLanguageCodes Language fallback chain for search
     * @param TermLanguageFallbackChain $displayFallbackChain Fallback chain for display
     * @param string $highlightSubField 'prefix' or 'plain'
     */
    public function __construct(
        array $searchLanguageCodes,
        TermLanguageFallbackChain $displayFallbackChain,
        string $highlightSubField = 'prefix'
    ) {
        $this->searchLanguageCodes = $searchLanguageCodes;
        $this->termFallbackChain = $displayFallbackChain;
        $this->highlightSubField = $highlightSubField;
    }

    /**
     * Get the source filtering to be used loading the result.
     *
     * Extends the parent's list with the label and description sub-field
     * of every language code the display fallback chain wants fetched.
     *
     * @return string[]
     */
    public function getSourceFiltering() {
        $fields = parent::getSourceFiltering();
        foreach ( $this->termFallbackChain->getFetchLanguageCodes() as $code ) {
            $fields[] = LabelsField::NAME . '.' . $code;
            $fields[] = DescriptionsField::NAME . '.' . $code;
        }
        return $fields;
    }

    /**
     * Get the fields to load.  Most of the time we'll use source filtering instead but
     * some fields aren't part of the source.
     *
     * @return string[] Always empty for this result type
     */
    public function getFields() {
        return [];
    }

    /**
     * Get the highlighting configuration.
     *
     * The returned 'fields' map is built in this order: the title,
     * then one labels sub-field per search language code, then a wildcard
     * entry covering the labels sub-field of any language.
     *
     * @param array $highlightSource configuration for how to highlight the source.
     *  Empty if source should be ignored. Not used by this implementation.
     * @return array|null highlighting configuration for elasticsearch
     */
    public function getHighlightingConfiguration( array $highlightSource ) {
        // Settings shared by every labels entry (per-language and wildcard).
        $labelHighlight = [
            'type' => 'experimental',
            'fragmenter' => 'none',
            'number_of_fragments' => 0,
            'options' => [
                'skip_if_last_matched' => true,
                'return_snippets_and_offsets' => true,
            ],
        ];

        $fields = [
            'title' => [
                'type' => 'experimental',
                'fragmenter' => 'none',
                'number_of_fragments' => 0,
                'matched_fields' => [ 'title.keyword' ],
            ],
        ];

        $labelsName = LabelsField::NAME;
        $subField = $this->highlightSubField;
        foreach ( $this->searchLanguageCodes as $code ) {
            $fields["$labelsName.$code.$subField"] = $labelHighlight;
        }
        $fields["$labelsName.*.$subField"] = $labelHighlight;

        return [
            // Empty tags: the matched text is returned without any markup.
            'pre_tags' => [ '' ],
            'post_tags' => [ '' ],
            'fields' => $fields,
        ];
    }

    /**
     * Convert search result from ElasticSearch result set to TermSearchResult.
     *
     * Hits without any highlighting data, and hits for which
     * {@link getTermSearchResult} returns null, are left out.
     *
     * @param \Elastica\ResultSet $result
     * @return TermSearchResult[] Set of search results, keyed by entity ID serialization;
     *  the types of which vary by implementation.
     */
    public function transformElasticsearchResult( \Elastica\ResultSet $result ) {
        $results = [];
        foreach ( $result->getResults() as $r ) {
            $sourceData = $r->getSource();

            // Highlight part contains information about what has actually been matched.
            $highlight = $r->getHighlights();
            $displayLabel = EntitySearchUtils::findTermForDisplay(
                $sourceData, LabelsField::NAME, $this->termFallbackChain
            );
            $displayDescription = EntitySearchUtils::findTermForDisplay(
                $sourceData, DescriptionsField::NAME, $this->termFallbackChain
            );

            if ( !empty( $highlight['title'] ) ) {
                // If we matched title, this means it's a match by ID
                $matchedTermType = 'entityId';
                $matchedTerm = new Term( 'qid', $sourceData['title'] );
            } elseif ( !$highlight ) {
                // Something went wrong, we don't have any highlighting data
                continue;
            } else {
                [ $matchedTermType, $langCode, $term ] =
                    $this->extractTermFromHighlight( $highlight, $sourceData );
                $matchedTerm = new Term( $langCode, $term );
            }

            if ( !$displayLabel ) {
                // This should not happen, but just in case, it's better to return something
                $displayLabel = $matchedTerm;
            }

            $termSearchResult = $this->getTermSearchResult(
                $sourceData, $matchedTerm, $matchedTermType, $displayLabel, $displayDescription
            );
            if ( $termSearchResult !== null ) {
                // A later hit with the same entity ID serialization replaces an earlier one.
                $results[$termSearchResult->getEntityIdSerialization()] = $termSearchResult;
            }
        }

        return $results;
    }

    /**
     * Turn the given result data into a {@link TermSearchResult}
     * (or skip this result if null is returned).
     */
    abstract protected function getTermSearchResult(
        array $sourceData,
        Term $matchedTerm,
        string $matchedTermType,
        ?Term $displayLabel,
        ?Term $displayDescription
    ): ?TermSearchResult;

    /**
     * Extract term, language and type from highlighter results.
     *
     * Only the first highlighted field, and its first fragment, is looked at.
     *
     * @param array $highlight Data from highlighter, must not be empty
     * @param array[] $sourceData Data from _source
     * @return array Array of: [string $termType, string $languageCode, string $term]
     */
    private function extractTermFromHighlight( array $highlight, array $sourceData ) {
        /**
         * Highlighter returns:
         * {
         *   labels.en.prefix: [
         *      "metre"  // or "0:0-5:5|metre"
         *   ]
         * }
         */
        $matchedTermType = 'label';
        $term = reset( $highlight ); // Take the first one
        $term = $term[0]; // Highlighter returns array
        $field = key( $highlight );

        // Quote both dynamic parts (with the delimiter) so that neither the field
        // name nor the configured sub-field can alter the meaning of the pattern.
        $fieldPattern = '/^' . preg_quote( LabelsField::NAME, '/' )
            . '\.([^.]+)\.'
            . preg_quote( $this->highlightSubField, '/' ) . '$/';

        if ( !preg_match( $fieldPattern, $field, $match ) ) {
            // This is weird since we didn't ask to match anything else,
            // but we'll return it anyway for debugging.
            return [ $matchedTermType, 'unknown', $term ];
        }

        $langCode = $match[1];
        if ( preg_match( self::HIGHLIGHT_PATTERN, $term, $termMatch ) ) {
            // The offset prefix begins with the snippet start; a leading '0'
            // is taken to mean this snippet is the first value of the field.
            $isFirst = ( $term[0] === '0' );
            $term = $termMatch[1];
        } else {
            // No offset information, treat the snippet as the first value.
            $isFirst = true;
        }

        if ( !empty( $sourceData[LabelsField::NAME][$langCode] ) ) {
            // Here we have match in one of the languages we asked for.
            // Primary label always comes first, so if it's not the first one,
            // it's an alias.
            if ( $sourceData[LabelsField::NAME][$langCode][0] !== $term ) {
                $matchedTermType = 'alias';
            }
        } else {
            // Here we have match in one of the "other" languages.
            // If it's the first one in the list, it's label, otherwise it is alias.
            $matchedTermType = $isFirst ? 'label' : 'alias';
        }

        return [ $matchedTermType, $langCode, $term ];
    }

    /**
     * @return TermSearchResult[] Empty set of search results
     */
    public function createEmptyResult() {
        return [];
    }

}