Code Coverage for /workspace/src/extensions/WikibaseLexemeCirrusSearch/src/LexemeTermResult.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	55.32% covered (warning)	55.32%	52 / 94	28.57% covered (danger)	28.57%	2 / 7	CRAP	0.00% covered (danger)	0.00%	0 / 1
LexemeTermResult	55.32% covered (warning)	55.32%	52 / 94	28.57% covered (danger)	28.57%	2 / 7	35.07	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
getSourceFiltering	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	2
getFields	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
getHighlightingConfiguration	0.00% covered (danger)	0.00%	0 / 32	0.00% covered (danger)	0.00%	0 / 1	2
transformElasticsearchResult	93.88% covered (success)	93.88%	46 / 49	0.00% covered (danger)	0.00%	0 / 1	8.01
extractLanguageCode	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	2
createEmptyResult	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2

1	<?php
2	namespace Wikibase\Lexeme\Search\Elastic;
3
4	use CirrusSearch\Search\BaseResultsType;
5	use Elastica\ResultSet;
6	use Language;
7	use Wikibase\DataModel\Entity\EntityIdParser;
8	use Wikibase\DataModel\Term\Term;
9	use Wikibase\Lexeme\DataAccess\LexemeDescription;
10	use Wikibase\Lib\Interactors\TermSearchResult;
11	use Wikibase\Lib\Store\FallbackLabelDescriptionLookupFactory;
12	use Wikibase\Search\Elastic\EntitySearchUtils;
13
/**
 * Results type that turns the hits of a Wikibase Lexeme search in
 * Elasticsearch into TermSearchResult objects.
 *
 * @license GPL-2.0-or-later
 * @author Stas Malyshev
 */
class LexemeTermResult extends BaseResultsType {

	/**
	 * Parser applied to the serialized IDs found in the search hits.
	 * @var EntityIdParser
	 */
	private $idParser;

	/**
	 * Language the descriptions are produced for.
	 * @var Language
	 */
	private $displayLanguage;

	/**
	 * Factory for the label/description lookup used when building descriptions.
	 * @var FallbackLabelDescriptionLookupFactory
	 */
	private $termLookupFactory;

	/**
	 * @param EntityIdParser $idParser
	 * @param Language $displayLanguage User display language
	 * @param FallbackLabelDescriptionLookupFactory $termLookupFactory
	 *        Lookup factory for assembling descriptions
	 */
	public function __construct(
		EntityIdParser $idParser,
		Language $displayLanguage,
		FallbackLabelDescriptionLookupFactory $termLookupFactory
	) {
		$this->idParser = $idParser;
		$this->displayLanguage = $displayLanguage;
		$this->termLookupFactory = $termLookupFactory;
	}

	/**
	 * Source fields to request from Elasticsearch: whatever the parent type
	 * asks for, followed by the lemma, lexeme language and lexical category fields.
	 *
	 * @return string[]
	 */
	public function getSourceFiltering() {
		$lexemeFields = [
			LemmaField::NAME,
			LexemeLanguageField::NAME,
			LexemeCategoryField::NAME,
		];

		return array_merge( parent::getSourceFiltering(), $lexemeFields );
	}

	/**
	 * Fields to load in addition to the filtered source. This result type
	 * requests none.
	 *
	 * @return string[]
	 */
	public function getFields() {
		return [];
	}

	/**
	 * Build the highlighting configuration for the title, the lemma and the
	 * form representations. Pre and post tags are empty strings, so the
	 * highlighted fragments come back without any markup added around matches.
	 *
	 * @param array $highlightSource configuration for how to highlight the source.
	 *  Not consulted by this implementation.
	 * @return array|null highlighting configuration for elasticsearch
	 */
	public function getHighlightingConfiguration( array $highlightSource ) {
		return [
			'pre_tags' => [ '' ],
			'post_tags' => [ '' ],
			'fields' => [
				'title' => [
					'type' => 'experimental',
					'fragmenter' => 'none',
					'number_of_fragments' => 0,
					'matched_fields' => [ 'title.keyword' ],
				],
				'lemma' => [
					'type' => 'experimental',
					'fragmenter' => 'none',
					'number_of_fragments' => 0,
					'options' => [
						'skip_if_last_matched' => true,
					],
					'matched_fields' => [ 'lemma.prefix' ],
				],
				'lexeme_forms.representation' => [
					'type' => 'experimental',
					'fragmenter' => 'none',
					'number_of_fragments' => 0,
					'matched_fields' => [
						'lexeme_forms.representation.prefix',
					],
					'options' => [
						'skip_if_last_matched' => true,
					],
				],
			],
		];
	}

	/**
	 * Convert an Elasticsearch result set into TermSearchResult objects.
	 *
	 * Works in two passes: the first collects the raw data of every usable hit
	 * together with the language and category IDs it refers to, the second
	 * builds the descriptions once a lookup for all those IDs has been created.
	 * Hits whose ID cannot be parsed, or which carry no highlight for the title,
	 * the lemma or a form representation, are dropped.
	 *
	 * @param ResultSet $result
	 * @return TermSearchResult[] Search results keyed by the serialized ID of the
	 *  matched entity; empty if no hit was usable.
	 */
	public function transformElasticsearchResult( ResultSet $result ) {
		$pending = [];
		$auxiliaryIds = [];

		foreach ( $result->getResults() as $hit ) {
			$source = $hit->getSource();

			$lexemeId = EntitySearchUtils::parseOrNull( $source['title'], $this->idParser );
			if ( !$lexemeId ) {
				// Unparseable entity ID, nothing we can do with this hit.
				continue;
			}

			$lemmaLangCode = self::extractLanguageCode( $source );

			// The highlights tell us which field was actually matched.
			$highlights = $hit->getHighlights();

			if ( !empty( $highlights['title'] ) ) {
				// A title match means the search matched by ID.
				$matchType = 'entityId';
				$matchTerm = new Term( 'qid', $source['title'] );
			} elseif ( !empty( $highlights['lemma'] ) ) {
				// Lemma match; takes precedence over a form match.
				$matchType = 'label';
				$matchTerm = new Term( $lemmaLangCode, $highlights['lemma'][0] );
			} elseif ( !empty( $highlights['lexeme_forms.representation'] ) ) {
				// One of the forms matched.
				$matchType = 'alias';
				$matchTerm = new Term(
					$lemmaLangCode,
					$highlights['lexeme_forms.representation'][0]
				);
			} else {
				// No highlighting data at all - something went wrong, skip the hit.
				continue;
			}

			$languageId = $source['lexeme_language']['entity'];
			$categoryId = $source['lexical_category'];

			// Keyed by the serialized ID so each auxiliary entity is recorded once.
			// Unparseable IDs are stored as null and filtered out before the lookup.
			$auxiliaryIds[$languageId] = EntitySearchUtils::parseOrNull( $languageId, $this->idParser );
			$auxiliaryIds[$categoryId] = EntitySearchUtils::parseOrNull( $categoryId, $this->idParser );

			// Only raw data is stored for now: labels of all auxiliary entities
			// should be prefetched before any description is constructed.
			$pending[$lexemeId->getSerialization()] = [
				'id' => $lexemeId,
				// TODO: this assumes we always take the first lemma. Maybe we should use
				// the shortest language code or something. That would require us to index
				// lemma language codes though.
				'lemma' => $source['lemma'][0],
				'term' => $matchTerm,
				'type' => $matchType,
				'lang' => $languageId,
				'langcode' => $lemmaLangCode,
				'category' => $categoryId,
			];
		}

		$displayCode = $this->displayLanguage->getCode();
		if ( !$auxiliaryIds ) {
			return [];
		}

		// Lookup with the labels of all referenced entities prefetched.
		$termLookup = $this->termLookupFactory->newLabelDescriptionLookup(
			$this->displayLanguage,
			array_filter( $auxiliaryIds )
		);
		$descriptionMaker = new LexemeDescription(
			$termLookup,
			$this->idParser,
			$this->displayLanguage
		);

		// Build the full descriptions and instantiate the TermSearchResult objects.
		$results = [];
		foreach ( $pending as $serializedId => $raw ) {
			$results[$serializedId] = new TermSearchResult(
				$raw['term'],
				$raw['type'],
				$raw['id'],
				new Term( $raw['langcode'], $raw['lemma'] ),
				// Not entirely accurate: the description may come from a fallback
				// language, yet it is labelled with the display language code.
				new Term(
					$displayCode,
					$descriptionMaker->createDescription( $raw['id'], $raw['lang'], $raw['category'] )
				)
			);
		}

		return $results;
	}

	/**
	 * Read the language code of the lexeme out of a hit's source data.
	 *
	 * @param array $sourceData the source data returned by elastic
	 * @return string the lexeme_language code if set and non-empty, 'und' otherwise.
	 */
	public static function extractLanguageCode( array $sourceData ) {
		$code = $sourceData['lexeme_language']['code'] ?? null;

		return $code ?: 'und';
	}

	/**
	 * @return TermSearchResult[] Empty set of search results
	 */
	public function createEmptyResult() {
		return [];
	}

}