Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
73.48% covered (warning)
73.48%
97 / 132
22.22% covered (danger)
22.22%
2 / 9
CRAP
0.00% covered (danger)
0.00%
0 / 1
FormTermResult
73.48% covered (warning)
73.48%
97 / 132
22.22% covered (danger)
22.22%
2 / 9
34.74
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 getSourceFiltering
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
2
 getFields
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getHighlightingConfiguration
0.00% covered (danger)
0.00%
0 / 21
0.00% covered (danger)
0.00%
0 / 1
2
 getIdResult
90.91% covered (success)
90.91%
20 / 22
0.00% covered (danger)
0.00%
0 / 1
5.02
 getRepresentationResult
94.12% covered (success)
94.12%
16 / 17
0.00% covered (danger)
0.00%
0 / 1
3.00
 transformElasticsearchResult
93.88% covered (success)
93.88%
46 / 49
0.00% covered (danger)
0.00%
0 / 1
10.02
 produceTermResult
100.00% covered (success)
100.00%
11 / 11
100.00% covered (success)
100.00%
1 / 1
1
 createEmptyResult
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2namespace Wikibase\Lexeme\Search\Elastic;
3
4use CirrusSearch\Search\BaseResultsType;
5use Elastica\ResultSet;
6use Language;
7use Wikibase\DataModel\Entity\EntityIdParser;
8use Wikibase\DataModel\Term\Term;
9use Wikibase\Lexeme\DataAccess\LexemeDescription;
10use Wikibase\Lib\Interactors\TermSearchResult;
11use Wikibase\Lib\Store\FallbackLabelDescriptionLookupFactory;
12use Wikibase\Search\Elastic\EntitySearchUtils;
13
/**
 * Results type that converts ElasticSearch hits on Lexeme documents into
 * TermSearchResult objects describing the Wikibase Forms that matched.
 *
 * A hit is treated either as a match by Form ID or as a match by Form
 * representation; which one applies is decided from the highlighting data
 * returned together with the hit.
 *
 * @license GPL-2.0-or-later
 * @author Stas Malyshev
 */
class FormTermResult extends BaseResultsType {

    /**
     * Parser used for every entity ID string found in the index data.
     * @var EntityIdParser
     */
    private $idParser;

    /**
     * Display language
     * @var Language
     */
    private $displayLanguage;

    /**
     * Factory for the lookup that is handed to the description builder.
     * @var FallbackLabelDescriptionLookupFactory
     */
    private $termLookupFactory;

    /**
     * Limit how many results to produce
     * @var int
     */
    private $limit;

    /**
     * @param EntityIdParser $idParser
     * @param Language $displayLanguage User display language
     * @param FallbackLabelDescriptionLookupFactory $termLookupFactory
     *        Lookup factory for assembling descriptions
     * @param int $limit How many results to produce
     */
    public function __construct(
        EntityIdParser $idParser,
        Language $displayLanguage,
        FallbackLabelDescriptionLookupFactory $termLookupFactory,
        $limit
    ) {
        $this->idParser = $idParser;
        $this->termLookupFactory = $termLookupFactory;
        $this->displayLanguage = $displayLanguage;
        $this->limit = $limit;
    }

    /**
     * Get the source filtering to be used loading the result.
     *
     * Adds the lemma, language, lexical category and forms fields to whatever
     * the parent results type already requests.
     *
     * @return string[]
     */
    public function getSourceFiltering() {
        return array_merge( parent::getSourceFiltering(), [
                LemmaField::NAME,
                LexemeLanguageField::NAME,
                LexemeCategoryField::NAME,
                FormsField::NAME
        ] );
    }

    /**
     * Get the fields to load.  Most of the time we'll use source filtering instead but
     * some fields aren't part of the source.
     *
     * @return string[] Always empty for this results type.
     */
    public function getFields() {
        return [];
    }

    /**
     * Get the highlighting configuration.
     *
     * The configuration highlights the Form ID field and the Form
     * representation field, with empty pre/post tags so that the returned
     * fragments can be compared verbatim against the source data.
     *
     * @param array $highlightSource configuration for how to highlight the source.
     *  Not used by this implementation.
     * @return array|null highlighting configuration for elasticsearch
     */
    public function getHighlightingConfiguration( array $highlightSource ) {
        return [
            'pre_tags' => [ '' ],
            'post_tags' => [ '' ],
            'fields' => [
                'lexeme_forms.id' => [
                    'type' => 'experimental',
                    'fragmenter' => "none",
                    'number_of_fragments' => 0,
                ],
                "lexeme_forms.representation" => [
                    'type' => 'experimental',
                    'fragmenter' => "none",
                    'number_of_fragments' => 30,
                    'fragment_size' => 1000, // Hopefully this is enough
                    'matched_fields' => [ 'lexeme_forms.representation.prefix' ],
                    'options' => [
                        'skip_if_last_matched' => true,
                    ],
                ],
            ],
        ];
    }

    /**
     * Convert the feature ID strings of a form into parsed entity IDs.
     *
     * Strings that cannot be parsed are dropped. A form without a 'features'
     * entry yields an empty list instead of an error.
     *
     * @param array $form Form source data
     * @return array Parsed feature IDs; keys of the original list are preserved
     */
    private function parseFeatureIds( $form ) {
        if ( empty( $form['features'] ) ) {
            return [];
        }

        return array_filter( array_map( function ( $featureId ) {
            return EntitySearchUtils::parseOrNull( $featureId, $this->idParser );
        }, (array)$form['features'] ) );
    }

    /**
     * Produce raw result for ID-type match.
     *
     * Only the first highlighted Form ID is considered.
     *
     * @param string[][] $highlight Highlighter data
     * @param array $sourceData Lexeme source data
     * @return array|null Null if match is bad: the ID cannot be parsed, no form
     *  with that ID is present in the source data, or that form has no usable
     *  (non-empty string) representation.
     */
    private function getIdResult( $highlight, $sourceData ) {
        $formId = $highlight['lexeme_forms.id'][0];
        $formIdParsed = EntitySearchUtils::parseOrNull( $formId, $this->idParser );
        if ( !$formIdParsed ) {
            // Got some bad id?? Weird.
            return null;
        }

        $forms = isset( $sourceData['lexeme_forms'] ) ? $sourceData['lexeme_forms'] : [];
        foreach ( $forms as $form ) {
            if ( !isset( $form['id'] ) || $form['id'] !== $formId ) {
                continue;
            }

            // TODO: how we choose one?
            $representations = isset( $form['representation'] )
                ? (array)$form['representation']
                : [];
            $repr = reset( $representations );
            if ( !is_string( $repr ) || $repr === '' ) {
                // The form carries no usable representation. Bail out here,
                // otherwise a non-string value would be handed to Term later.
                return null;
            }

            return [
                'id' => $formIdParsed,
                'representation' => $repr,
                'features' => $this->parseFeatureIds( $form ),
                'term' => new Term( 'qid', $formId ),
                'type' => 'entityId',
            ];
        }

        // Didn't find the right id? Weird, skip it.
        return null;
    }

    /**
     * Get data for specific form
     *
     * @param string[][] $highlight  Highlighter data; must contain the
     *  'lexeme_forms.representation' entry
     * @param array $form Form source data
     * @param string $lemmaCode Language code for main lemma
     * @return array|null Null if match is bad: the form data is incomplete, none
     *  of its representations were highlighted, or its ID cannot be parsed.
     */
    private function getRepresentationResult( $highlight, $form, $lemmaCode ) {
        if ( !isset( $form['id'], $form['representation'] ) ) {
            // Incomplete form data, nothing we could match against.
            return null;
        }

        $representations = (array)$form['representation'];
        $reprMatches = array_intersect(
            $representations,
            $highlight['lexeme_forms.representation']
        );
        if ( !$reprMatches ) {
            return null;
        }

        // matches the data
        $formIdParsed = EntitySearchUtils::parseOrNull( $form['id'], $this->idParser );
        if ( !$formIdParsed ) {
            // Got some bad id?? Weird.
            return null;
        }

        return [
            'id' => $formIdParsed,
            // TODO: how we choose the best one of many?
            'representation' => reset( $representations ),
            'features' => $this->parseFeatureIds( $form ),
            // TODO: This may not be true, since matched representation can be
            // from another language...Not sure what to do about it.
            'term' => new Term( $lemmaCode, reset( $reprMatches ) ),
            'type' => 'label',
        ];
    }

    /**
     * Convert search result from ElasticSearch result set to TermSearchResult.
     *
     * Works in two stages: first every hit is reduced to raw result arrays
     * (keyed by Form ID) while the IDs of all auxiliary entities (language,
     * lexical category, grammatical features) are collected; then a lookup is
     * created for those entities and the final results are built.
     *
     * @param ResultSet $result
     * @return TermSearchResult[] Search results keyed by Form ID, at most
     *  as many as the configured limit.
     */
    public function transformElasticsearchResult( ResultSet $result ) {
        $rawResults = [];
        $entityIds = [];

        foreach ( $result->getResults() as $r ) {
            $sourceData = $r->getSource();
            $entityId = EntitySearchUtils::parseOrNull( $sourceData['title'], $this->idParser );
            if ( !$entityId ) {
                // Can not parse entity ID - skip it
                continue;
            }

            // TODO: what we do here if no language code?
            // Not sure we want to index all lemma languages.
            // Should we just fake the term language code?
            $lemmaCode = LexemeTermResult::extractLanguageCode( $sourceData );

            // Highlight part contains information about what has actually been matched.
            $highlight = $r->getHighlights();

            $lang = $sourceData['lexeme_language']['entity'];
            $category = $sourceData['lexical_category'];

            $lexemeData = [
                'lexemeId' => $entityId,
                'lemma' => $sourceData['lemma'][0],
                'lang' => $lang,
                'langcode' => $lemmaCode,
                'category' => $category
            ];

            // Doing two-stage resolution here since we want to prefetch all labels for
            // auxiliary entities before using them to construct descriptions.
            $features = [];
            if ( !empty( $highlight['lexeme_forms.id'] ) ) {
                // If we matched Form ID, this means it's a match by ID
                $idResult = $this->getIdResult( $highlight, $sourceData );
                if ( !$idResult ) {
                    continue;
                }

                $rawResults[$highlight['lexeme_forms.id'][0]] = $idResult + $lexemeData;
                $features = $idResult['features'];
            } elseif ( !empty( $highlight['lexeme_forms.representation'] ) ) {
                // We matched form representation, let's see which ones we've got
                // Find all forms whose representations match what we have found.
                // Note this can be more than one.
                $forms = isset( $sourceData['lexeme_forms'] )
                    ? $sourceData['lexeme_forms']
                    : [];
                foreach ( $forms as $form ) {
                    $formResult = $this->getRepresentationResult( $highlight, $form, $lemmaCode );
                    if ( !$formResult ) {
                        continue;
                    }
                    $rawResults[$form['id']] = $formResult + $lexemeData;
                    $features = array_merge( $features, $formResult['features'] );
                }
            } else {
                // TODO: No data to match, skip it. Should we report something?
                continue;
            }

            // Keyed by serialization so that every entity is listed only once.
            $entityIds[$lang] = EntitySearchUtils::parseOrNull( $lang, $this->idParser );
            $entityIds[$category] = EntitySearchUtils::parseOrNull( $category, $this->idParser );
            foreach ( $features as $feature ) {
                $entityIds[$feature->getSerialization()] = $feature;
            }
        }

        if ( !$rawResults ) {
            return [];
        }

        // Create prefetched lookup; IDs that failed to parse are filtered out.
        $termLookup = $this->termLookupFactory->newLabelDescriptionLookup(
            $this->displayLanguage,
            array_filter( $entityIds )
        );
        $descriptionMaker = new LexemeDescription(
            $termLookup,
            $this->idParser,
            $this->displayLanguage
        );
        $langCode = $this->displayLanguage->getCode();

        // Create full descriptions and instantiate TermSearchResult objects
        return array_map(
            function ( $raw ) use ( $descriptionMaker, $langCode ) {
                return $this->produceTermResult( $descriptionMaker, $langCode, $raw );
            },
            array_slice( $rawResults, 0, $this->limit )
        );
    }

    /**
     * Produce TermSearchResult from raw result data.
     *
     * @param LexemeDescription $descriptionMaker
     * @param string $langCode Code of the display language, used for the description term
     * @param array $raw Raw result as assembled in transformElasticsearchResult()
     * @return TermSearchResult
     */
    private function produceTermResult(
        LexemeDescription $descriptionMaker,
        $langCode,
        array $raw
    ) {
        return new TermSearchResult(
            $raw['term'],
            $raw['type'],
            $raw['id'],
            // We are lying somewhat here, as description might be from fallback languages,
            // but I am not sure there's any better way here.
            new Term( $raw['langcode'], $raw['representation'] ),
            new Term( $langCode,
                $descriptionMaker->createFormDescription(
                    $raw['lexemeId'], $raw['features'], $raw['lemma'], $raw['lang'],
                    $raw['category']
                ) )
        );
    }

    /**
     * @return TermSearchResult[] Empty set of search results
     */
    public function createEmptyResult() {
        return [];
    }

}