Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
55.32% covered (warning)
55.32%
52 / 94
28.57% covered (danger)
28.57%
2 / 7
CRAP
0.00% covered (danger)
0.00%
0 / 1
LexemeTermResult
55.32% covered (warning)
55.32%
52 / 94
28.57% covered (danger)
28.57%
2 / 7
35.07
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 getSourceFiltering
0.00% covered (danger)
0.00%
0 / 5
0.00% covered (danger)
0.00%
0 / 1
2
 getFields
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getHighlightingConfiguration
0.00% covered (danger)
0.00%
0 / 32
0.00% covered (danger)
0.00%
0 / 1
2
 transformElasticsearchResult
93.88% covered (success)
93.88%
46 / 49
0.00% covered (danger)
0.00%
0 / 1
8.01
 extractLanguageCode
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
2
 createEmptyResult
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2namespace Wikibase\Lexeme\Search\Elastic;
3
4use CirrusSearch\Search\BaseResultsType;
5use Elastica\ResultSet;
6use Language;
7use Wikibase\DataModel\Entity\EntityIdParser;
8use Wikibase\DataModel\Term\Term;
9use Wikibase\Lexeme\DataAccess\LexemeDescription;
10use Wikibase\Lib\Interactors\TermSearchResult;
11use Wikibase\Lib\Store\FallbackLabelDescriptionLookupFactory;
12use Wikibase\Search\Elastic\EntitySearchUtils;
13
14/**
15 * This result type implements the result for searching a Wikibase Lexeme.
16 *
17 * @license GPL-2.0-or-later
18 * @author Stas Malyshev
19 */
class LexemeTermResult extends BaseResultsType {

    /**
     * Parser used for the hit title as well as for the language and
     * lexical category IDs found in the hit source.
     * @var EntityIdParser
     */
    private $idParser;

    /**
     * Display language
     * @var Language
     */
    private $displayLanguage;

    /**
     * Factory producing the label/description lookup handed to LexemeDescription.
     * @var FallbackLabelDescriptionLookupFactory
     */
    private $termLookupFactory;

    /**
     * @param EntityIdParser $idParser
     * @param Language $displayLanguage User display language
     * @param FallbackLabelDescriptionLookupFactory $termLookupFactory
     *        Lookup factory for assembling descriptions
     */
    public function __construct(
        EntityIdParser $idParser,
        Language $displayLanguage,
        FallbackLabelDescriptionLookupFactory $termLookupFactory
    ) {
        $this->idParser = $idParser;
        $this->termLookupFactory = $termLookupFactory;
        $this->displayLanguage = $displayLanguage;
    }

    /**
     * Get the source filtering to be used loading the result.
     *
     * Extends whatever the parent class requests with the lemma, lexeme language
     * and lexical category fields.
     *
     * @return string[]
     */
    public function getSourceFiltering() {
        return array_merge( parent::getSourceFiltering(), [
                LemmaField::NAME,
                LexemeLanguageField::NAME,
                LexemeCategoryField::NAME,
        ] );
    }

    /**
     * Get the fields to load.  Most of the time we'll use source filtering instead but
     * some fields aren't part of the source.
     *
     * @return string[] Always empty for this result type.
     */
    public function getFields() {
        return [];
    }

    /**
     * Get the highlighting configuration.
     *
     * Highlighting is requested for the title, the lemma and the form representations.
     * Pre and post tags are empty strings, so no markup is wrapped around the highlighted
     * text; transformElasticsearchResult() uses the highlights to tell which field matched.
     *
     * @param array $highlightSource configuration for how to highlight the source.
     *  Empty if source should be ignored. Not used by this implementation.
     * @return array|null highlighting configuration for elasticsearch
     *  (this implementation always returns an array)
     */
    public function getHighlightingConfiguration( array $highlightSource ) {
        return [
            'pre_tags' => [ '' ],
            'post_tags' => [ '' ],
            'fields' => [
                'title' => [
                    'type' => 'experimental',
                    'fragmenter' => "none",
                    'number_of_fragments' => 0,
                    'matched_fields' => [ 'title.keyword' ]
                ],
                "lemma" => [
                    'type' => 'experimental',
                    'fragmenter' => "none",
                    'number_of_fragments' => 0,
                    'options' => [
                        'skip_if_last_matched' => true,
                    ],
                    'matched_fields' => [ 'lemma.prefix' ]
                ],
                "lexeme_forms.representation" => [
                    'type' => 'experimental',
                    'fragmenter' => "none",
                    'number_of_fragments' => 0,
                    "matched_fields" => [
                        "lexeme_forms.representation.prefix",
                    ],
                    'options' => [
                        'skip_if_last_matched' => true,
                    ],
                ],
            ],
        ];
    }

    /**
     * Convert search result from ElasticSearch result set to TermSearchResult.
     *
     * Hits are skipped when their title cannot be parsed as an entity ID or when they
     * carry no usable highlight. The returned array is keyed by the serialization of
     * the lexeme ID.
     *
     * @param ResultSet $result
     * @return TermSearchResult[] Set of search results, the types of which vary by implementation.
     */
    public function transformElasticsearchResult( ResultSet $result ) {
        $rawResults = [];
        $entityIds = [];
        foreach ( $result->getResults() as $r ) {
            $sourceData = $r->getSource();
            $entityId = EntitySearchUtils::parseOrNull( $sourceData['title'], $this->idParser );
            if ( !$entityId ) {
                // Can not parse entity ID - skip it
                continue;
            }

            $lemmaCode = self::extractLanguageCode( $sourceData );

            // Highlight part contains information about what has actually been matched.
            $match = self::resolveMatch( $sourceData['title'], $lemmaCode, $r->getHighlights() );
            if ( $match === null ) {
                // Something went wrong, we don't have any highlighting data
                continue;
            }

            $lang = $sourceData['lexeme_language']['entity'];
            $category = $sourceData['lexical_category'];

            // Unparseable IDs are stored as null here and filtered out before the lookup
            // is created below.
            $entityIds[$lang] = EntitySearchUtils::parseOrNull( $lang, $this->idParser );
            $entityIds[$category] = EntitySearchUtils::parseOrNull( $category, $this->idParser );

            // Doing two-stage resolution here since we want to prefetch all labels for
            // auxiliary entities before using them to construct descriptions.
            $rawResults[$entityId->getSerialization()] = [
                'id' => $entityId,
                // TODO: this assumes we always take the first lemma. Maybe we should use
                // the shortest language code or something. That would require us to index
                // lemma language codes though.
                'lemma' => $sourceData['lemma'][0],
                'term' => $match['term'],
                'type' => $match['type'],
                'lang' => $lang,
                'langcode' => $lemmaCode,
                'category' => $category
            ];
        }

        if ( !$entityIds ) {
            // No hit survived the filtering above, nothing to describe.
            return [];
        }

        $langCode = $this->displayLanguage->getCode();
        // Create prefetched lookup
        $termLookup = $this->termLookupFactory->newLabelDescriptionLookup( $this->displayLanguage,
            array_filter( $entityIds ) );
        $descriptionMaker = new LexemeDescription( $termLookup, $this->idParser,
            $this->displayLanguage );
        // Create full descriptions and instantiate TermSearchResult objects
        return array_map( static function ( $raw ) use ( $descriptionMaker, $langCode ) {
            return new TermSearchResult(
                $raw['term'],
                $raw['type'],
                $raw['id'],
                new Term( $raw['langcode'], $raw['lemma'] ),
                // We are lying somewhat here, as description might be from fallback languages,
                // but I am not sure there's any better way here.
                new Term( $langCode,
                    $descriptionMaker->createDescription( $raw['id'], $raw['lang'],
                        $raw['category'] ) )
            );
        }, $rawResults );
    }

    /**
     * Work out which part of a hit was matched, based on its highlight data.
     *
     * Precedence is title (match by ID), then lemma, then form representation.
     *
     * @param string $title Title from the hit source, used as the term text for ID matches
     * @param string $lemmaCode Language code to attach to lemma and form matches
     * @param array $highlight Highlight data of the hit
     * @return array|null Map with keys 'type' (string) and 'term' (Term), or null when
     *  none of the three highlighted fields has data.
     */
    private static function resolveMatch( $title, $lemmaCode, $highlight ) {
        if ( !empty( $highlight['title'] ) ) {
            // If we matched title, this means it's a match by ID
            return [ 'type' => 'entityId', 'term' => new Term( 'qid', $title ) ];
        }

        if ( !empty( $highlight['lemma'] ) ) {
            // We matched lemma
            return [ 'type' => 'label', 'term' => new Term( $lemmaCode, $highlight['lemma'][0] ) ];
        }

        if ( !empty( $highlight['lexeme_forms.representation'] ) ) {
            // matched one of the forms
            return [
                'type' => 'alias',
                'term' => new Term( $lemmaCode, $highlight['lexeme_forms.representation'][0] ),
            ];
        }

        return null;
    }

    /**
     * @param array $sourceData the source data returned by elastic
     * @return string the lexeme_language code if set, 'und' otherwise.
     */
    public static function extractLanguageCode( array $sourceData ) {
        return empty( $sourceData['lexeme_language']['code'] )
            ? 'und'
            : $sourceData['lexeme_language']['code'];
    }

    /**
     * @return TermSearchResult[] Empty set of search results
     */
    public function createEmptyResult() {
        return [];
    }

}