Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
48.15% covered (danger)
48.15%
39 / 81
14.29% covered (danger)
14.29%
1 / 7
CRAP
0.00% covered (danger)
0.00%
0 / 1
ElasticTermResult
48.15% covered (danger)
48.15%
39 / 81
14.29% covered (danger)
14.29%
1 / 7
69.33
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 getSourceFiltering
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
6
 getFields
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getHighlightingConfiguration
0.00% covered (danger)
0.00%
0 / 31
0.00% covered (danger)
0.00%
0 / 1
6
 transformElasticsearchResult
87.50% covered (warning)
87.50%
21 / 24
0.00% covered (danger)
0.00%
0 / 1
6.07
 extractTermFromHighlight
93.75% covered (success)
93.75%
15 / 16
0.00% covered (danger)
0.00%
0 / 1
6.01
 createEmptyResult
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace Wikibase\Search\Elastic;
4
5use CirrusSearch\Search\BaseResultsType;
6use Wikibase\DataModel\Entity\EntityIdParser;
7use Wikibase\DataModel\Entity\EntityIdParsingException;
8use Wikibase\DataModel\Term\Term;
9use Wikibase\Lib\Interactors\TermSearchResult;
10use Wikibase\Lib\TermLanguageFallbackChain;
11
12/**
13 * This result type implements the result for searching
14 * a Wikibase entity by its label or alias.
15 *
16 * @license GPL-2.0-or-later
17 * @author Stas Malyshev
18 */
class ElasticTermResult extends BaseResultsType {

    /**
     * Parser used to turn the indexed page title into an entity ID.
     * @var EntityIdParser
     */
    private $idParser;

    /**
     * List of language codes in the search fallback chain, the first
     * is the preferred language.
     * @var string[]
     */
    private $searchLanguageCodes;

    /**
     * Display fallback chain.
     * @var TermLanguageFallbackChain
     */
    private $termFallbackChain;

    /**
     * @param EntityIdParser $idParser
     * @param string[] $searchLanguageCodes Language fallback chain for search
     * @param TermLanguageFallbackChain $displayFallbackChain Fallback chain for display
     */
    public function __construct(
        EntityIdParser $idParser,
        array $searchLanguageCodes,
        TermLanguageFallbackChain $displayFallbackChain
    ) {
        $this->idParser = $idParser;
        $this->searchLanguageCodes = $searchLanguageCodes;
        $this->termFallbackChain = $displayFallbackChain;
    }

    /**
     * Get the source filtering to be used loading the result.
     *
     * Extends the parent's field list with the label and description
     * of every language code the display fallback chain may fetch.
     *
     * @return string[]
     */
    public function getSourceFiltering() {
        $fields = parent::getSourceFiltering();
        foreach ( $this->termFallbackChain->getFetchLanguageCodes() as $code ) {
            $fields[] = "labels.$code";
            $fields[] = "descriptions.$code";
        }
        return $fields;
    }

    /**
     * Get the fields to load.  Most of the time we'll use source filtering instead but
     * some fields aren't part of the source.
     *
     * @return string[] Always empty for this result type.
     */
    public function getFields() {
        return [];
    }

    /**
     * Get the highlighting configuration.
     *
     * Highlights the title (ID match), then the prefix field of each search
     * language in fallback order, and finally the prefix field of any language.
     * Tags are empty strings, so matches are returned without markup.
     *
     * @param array $highlightSource configuration for how to highlight the source.
     *  Not used by this result type.
     * @return array highlighting configuration for elasticsearch
     */
    public function getHighlightingConfiguration( array $highlightSource ) {
        // Settings shared by every label prefix field.
        $labelFieldConfig = [
            'type' => 'experimental',
            'fragmenter' => 'none',
            'number_of_fragments' => 0,
            'options' => [
                'skip_if_last_matched' => true,
                'return_snippets_and_offsets' => true
            ],
        ];

        // Insertion order matters: extractTermFromHighlight() takes the first
        // highlighted field, so keep title, then specific languages, then wildcard.
        $fields = [
            'title' => [
                'type' => 'experimental',
                'fragmenter' => 'none',
                'number_of_fragments' => 0,
                'matched_fields' => [ 'title.keyword' ]
            ],
        ];
        foreach ( $this->searchLanguageCodes as $code ) {
            $fields["labels.$code.prefix"] = $labelFieldConfig;
        }
        $fields['labels.*.prefix'] = $labelFieldConfig;

        return [
            'pre_tags' => [ '' ],
            'post_tags' => [ '' ],
            'fields' => $fields,
        ];
    }

    /**
     * Convert search result from ElasticSearch result set to TermSearchResult.
     *
     * Hits are skipped when they have no title in the source, when the title
     * is not a parsable entity ID, or when no usable highlighting data is present.
     *
     * @param \Elastica\ResultSet $result
     * @return TermSearchResult[] Search results keyed by entity ID serialization.
     */
    public function transformElasticsearchResult( \Elastica\ResultSet $result ) {
        $results = [];
        foreach ( $result->getResults() as $r ) {
            $sourceData = $r->getSource();
            if ( !isset( $sourceData['title'] ) ) {
                // No title in the source - there is nothing to build an entity ID from
                continue;
            }

            try {
                $entityId = $this->idParser->parse( $sourceData['title'] );
            } catch ( EntityIdParsingException $e ) {
                // Can not parse entity ID - skip it
                continue;
            }

            // Highlight part contains information about what has actually been matched.
            $highlight = $r->getHighlights();
            if ( !empty( $highlight['title'] ) ) {
                // If we matched title, this means it's a match by ID
                $matchedTermType = 'entityId';
                $matchedTerm = new Term( 'qid', $sourceData['title'] );
            } else {
                $extracted = $this->extractTermFromHighlight( $highlight, $sourceData );
                if ( $extracted === null ) {
                    // Something went wrong, we don't have any usable highlighting data
                    continue;
                }
                [ $matchedTermType, $langCode, $term ] = $extracted;
                $matchedTerm = new Term( $langCode, $term );
            }

            // Display terms are looked up only for hits that are actually returned.
            $displayLabel = EntitySearchUtils::findTermForDisplay(
                $sourceData, 'labels', $this->termFallbackChain
            );
            if ( !$displayLabel ) {
                // This should not happen, but just in case, it's better to return something
                $displayLabel = $matchedTerm;
            }
            $displayDescription = EntitySearchUtils::findTermForDisplay(
                $sourceData, 'descriptions', $this->termFallbackChain
            );

            $results[$entityId->getSerialization()] = new TermSearchResult(
                $matchedTerm, $matchedTermType, $entityId, $displayLabel,
                $displayDescription
            );
        }

        return $results;
    }

    /**
     * New highlighter pattern.
     * The new highlighter can return offsets as: 1:1-XX:YY|Text Snippet
     * or even SNIPPET_START:MATCH1_START-MATCH1_END,MATCH2_START-MATCH2_END,...:SNIPPET_END|Text
     */
    public const HIGHLIGHT_PATTERN = '/^\d+:\d+-\d+(?:,\d+-\d+)*:\d+\|(.+)/';

    /**
     * Extract term, language and type from highlighter results.
     *
     * Highlighter returns:
     * {
     *   labels.en.prefix: [
     *      "metre"  // or "0:0-5:5|metre"
     *   ]
     * }
     *
     * The first field that carries a string fragment is used.
     *
     * @param array $highlight Data from highlighter
     * @param array[] $sourceData Data from _source
     * @return array|null Array of: [string $termType, string $languageCode, string $term],
     *  or null if the highlight data contains no usable fragment.
     */
    private function extractTermFromHighlight( array $highlight, array $sourceData ): ?array {
        $field = null;
        $term = null;
        foreach ( $highlight as $candidateField => $fragments ) {
            // Highlighter returns an array of fragments per field; take the first one
            if ( is_array( $fragments ) && isset( $fragments[0] ) && is_string( $fragments[0] ) ) {
                $field = (string)$candidateField;
                $term = $fragments[0];
                break;
            }
        }
        if ( $term === null ) {
            return null;
        }

        if ( !preg_match( '/^labels\.([^.]+)\.prefix$/', $field, $match ) ) {
            // This is weird since we didn't ask to match anything else,
            // but we'll return it anyway for debugging.
            return [ 'label', 'unknown', $term ];
        }
        $langCode = $match[1];

        // Without offsets we can not tell the position, so assume the first value.
        $isFirst = true;
        if ( preg_match( self::HIGHLIGHT_PATTERN, $term, $termMatch ) ) {
            // The pattern guarantees the string starts with the snippet start offset,
            // so a leading '0' means the snippet starts at offset 0.
            $isFirst = ( $term[0] === '0' );
            $term = $termMatch[1];
        }

        if ( !empty( $sourceData['labels'][$langCode] ) ) {
            // Here we have match in one of the languages we asked for.
            // Primary label always comes first, so if it's not the first one,
            // it's an alias.
            $isLabel = ( ( $sourceData['labels'][$langCode][0] ?? null ) === $term );
        } else {
            // Here we have match in one of the "other" languages.
            // If it's the first one in the list, it's label, otherwise it is alias.
            $isLabel = $isFirst;
        }

        return [ $isLabel ? 'label' : 'alias', $langCode, $term ];
    }

    /**
     * @return TermSearchResult[] Empty set of search results
     */
    public function createEmptyResult() {
        return [];
    }

}