Code Coverage for /workspace/src/extensions/WikibaseMediaInfo/src/Search/MediaSearchASTQueryBuilder.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	75.43% covered (warning)	75.43%	175 / 232	34.78% covered (danger)	34.78%	8 / 23	CRAP	0.00% covered (danger)	0.00%	0 / 1
MediaSearchASTQueryBuilder	75.43% covered (warning)	75.43%	175 / 232	34.78% covered (danger)	34.78%	8 / 23	82.17	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	42 / 42	100.00% covered (success)	100.00%	1 / 1	1
getQuery	87.50% covered (warning)	87.50%	14 / 16	0.00% covered (danger)	0.00%	0 / 1	3.02
applyLogisticFunction	16.67% covered (danger)	16.67%	2 / 12	0.00% covered (danger)	0.00%	0 / 1	4.31
normalizeMultiClauseScores	21.43% covered (danger)	21.43%	3 / 14	0.00% covered (danger)	0.00%	0 / 1	11.76
visitParsedBooleanNode	70.00% covered (warning)	70.00%	14 / 20	0.00% covered (danger)	0.00%	0 / 1	11.19
visitBooleanClause	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
visitWordsQueryNode	100.00% covered (success)	100.00%	33 / 33	100.00% covered (success)	100.00%	1 / 1	1
visitPhraseQueryNode	0.00% covered (danger)	0.00%	0 / 9	0.00% covered (danger)	0.00%	0 / 1	2
visitPhrasePrefixNode	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
visitNegatedNode	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
visitFuzzyNode	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
visitPrefixNode	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
visitWildcardNode	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
visitEmptyQueryNode	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
visitKeywordFeatureNode	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
visitNamespaceHeader	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
getWikibaseEntitiesHandler	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	1
getSynonyms	16.67% covered (danger)	16.67%	2 / 12	0.00% covered (danger)	0.00%	0 / 1	13.26
canonicalizeTerm	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
filterTermsTooShort	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	1
filterTermsTooDissimilarCanonicalized	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	1
filterTermsTooSimilar	100.00% covered (success)	100.00%	21 / 21	100.00% covered (success)	100.00%	1 / 1	5
filterTermsSupersets	100.00% covered (success)	100.00%	18 / 18	100.00% covered (success)	100.00%	1 / 1	5

1	<?php
2
3	namespace Wikibase\MediaInfo\Search;
4
5	use CirrusSearch\Parser\AST\BooleanClause;
6	use CirrusSearch\Parser\AST\EmptyQueryNode;
7	use CirrusSearch\Parser\AST\FuzzyNode;
8	use CirrusSearch\Parser\AST\KeywordFeatureNode;
9	use CirrusSearch\Parser\AST\NamespaceHeaderNode;
10	use CirrusSearch\Parser\AST\NegatedNode;
11	use CirrusSearch\Parser\AST\ParsedBooleanNode;
12	use CirrusSearch\Parser\AST\ParsedNode;
13	use CirrusSearch\Parser\AST\ParsedQuery;
14	use CirrusSearch\Parser\AST\PhrasePrefixNode;
15	use CirrusSearch\Parser\AST\PhraseQueryNode;
16	use CirrusSearch\Parser\AST\PrefixNode;
17	use CirrusSearch\Parser\AST\Visitor\Visitor;
18	use CirrusSearch\Parser\AST\WildcardNode;
19	use CirrusSearch\Parser\AST\WordsQueryNode;
20	use CirrusSearch\Query\Builder\NearMatchFieldQueryBuilder;
21	use Elastica\Query\AbstractQuery;
22	use Elastica\Query\BoolQuery;
23	use Elastica\Query\FunctionScore;
24	use Elastica\Query\MatchNone;
25	use Elastica\Script\Script;
26	use SplObjectStorage;
27	use Wikibase\MediaInfo\Search\ASTQueryBuilder\PhraseQueryNodeHandler;
28	use Wikibase\MediaInfo\Search\ASTQueryBuilder\WikibaseEntitiesHandler;
29	use Wikibase\MediaInfo\Search\ASTQueryBuilder\WordsQueryNodeHandler;
30	use Wikimedia\Assert\Assert;
31
/**
 * AST visitor that turns a parsed CirrusSearch query into an Elastica query
 * for media search.
 *
 * Every visited node that can be translated gets its resulting query stored in
 * an object map; boolean nodes then assemble the queries of their clauses.
 * Words nodes are optionally expanded with synonyms obtained from the entities
 * extractor, after those synonyms have gone through a series of filters.
 */
class MediaSearchASTQueryBuilder implements Visitor {
	/** @var SplObjectStorage Maps a visited AST node to the query built for it */
	private $map;

	/** @var ParsedQuery The query currently being transformed (set by getQuery) */
	private $parsedQuery;

	/** @var MediaSearchASTEntitiesExtractor */
	private $entitiesExtractor;

	/** @var array[] */
	private $stemmingSettings;

	/** @var string[] */
	private $languages;

	/** @var string */
	private $contentLanguage;

	/** @var float[] */
	private $boosts;

	/** @var float[] */
	private $decays;

	/** @var array */
	private $options;

	/**
	 * @param MediaSearchASTEntitiesExtractor $entitiesExtractor
	 * @param array[] $stemmingSettings Stemming settings (see $wgWBCSUseStemming)
	 * @param string[] $languages Languages to search text in
	 * @param string $contentLanguage Content language code
	 * @param array $settings Optional overrides: 'boost' and 'decay' are arrays of
	 *  per-field weights that take precedence over the defaults below; every other
	 *  recognised key is a scalar option that is cast to the type listed below
	 */
	public function __construct(
		MediaSearchASTEntitiesExtractor $entitiesExtractor,
		array $stemmingSettings,
		array $languages,
		string $contentLanguage,
		array $settings = []
	) {
		$this->entitiesExtractor = $entitiesExtractor;
		$this->stemmingSettings = $stemmingSettings;
		$this->languages = $languages;
		$this->contentLanguage = $contentLanguage;
		// array union: values provided in $settings win, defaults fill the gaps
		$this->boosts = ( $settings['boost'] ?? [] ) + [
			'statement' => 1.0,
			'descriptions.$language' => 1.0,
			'descriptions.$language.plain' => 1.0,
			'title' => 1.0,
			'title.plain' => 1.0,
			'category' => 1.0,
			'category.plain' => 1.0,
			'heading' => 1.0,
			'heading.plain' => 1.0,
			'auxiliary_text' => 1.0,
			'auxiliary_text.plain' => 1.0,
			'file_text' => 1.0,
			'file_text.plain' => 1.0,
			'redirect.title' => 1.0,
			'redirect.title.plain' => 1.0,
			'text' => 1.0,
			'text.plain' => 1.0,
			'suggest' => 1.0,
		];
		$this->decays = ( $settings['decay'] ?? [] ) + [
			'descriptions.$language' => 1.0,
			'descriptions.$language.plain' => 1.0,
			'synonyms' => 1.0,
		];
		$this->options = [
			'normalizeMultiClauseScores' => (bool)( $settings['normalizeMultiClauseScores'] ?? false ),
			'entitiesVariableBoost' => (bool)( $settings['entitiesVariableBoost'] ?? true ),
			'applyLogisticFunction' => (bool)( $settings['applyLogisticFunction'] ?? false ),
			'useSynonyms' => (bool)( $settings['useSynonyms'] ?? false ),
			'logisticRegressionIntercept' => (float)( $settings['logisticRegressionIntercept'] ?? 0 ),
			'synonymsMaxAmount' => (float)( $settings['synonymsMaxAmount'] ?? 0 ),
			'synonymsMinScoreThreshold' => (float)( $settings['synonymsMinScoreThreshold'] ?? 0 ),
			'synonymsMinByteLength' => (float)( $settings['synonymsMinByteLength'] ?? 0 ),
			'synonymsMinSimilarityToCanonicalForm' => (float)( $settings['synonymsMinSimilarityToCanonicalForm'] ?? 0 ),
			'synonymsMinDifferenceFromOthers' => (float)( $settings['synonymsMinDifferenceFromOthers'] ?? 0 ),
			'nearMatchBoost' => (float)( $settings['nearMatchBoost'] ?? 5.0 ),
		];
	}

	/**
	 * Builds the query for a parsed search query.
	 *
	 * The AST is visited to build the main query, a near match query is built
	 * separately, and both are combined: if either one is a MatchNone the other
	 * is returned as is, otherwise they become the two "should" clauses of a
	 * bool query of which at least one must match.
	 *
	 * @param ParsedQuery $parsedQuery
	 * @return AbstractQuery
	 */
	public function getQuery( ParsedQuery $parsedQuery ): AbstractQuery {
		$this->map = new SplObjectStorage();
		$this->parsedQuery = $parsedQuery;

		$root = $parsedQuery->getRoot();
		$root->accept( $this );

		$nearMatchQuery = NearMatchFieldQueryBuilder::defaultFromWeight( $this->options['nearMatchBoost'] )
			->buildFromParsedQuery( $parsedQuery );
		// nodes that could not be translated never made it into the map
		$mainQuery = $this->map[$root] ?? new MatchNone();

		if ( $mainQuery instanceof MatchNone ) {
			return $nearMatchQuery;
		}
		if ( $nearMatchQuery instanceof MatchNone ) {
			return $mainQuery;
		}

		$combined = new BoolQuery();
		$combined->addShould( $nearMatchQuery );
		$combined->addShould( $mainQuery );
		$combined->setMinimumShouldMatch( 1 );

		return $combined;
	}

	/**
	 * Applies a logistic function to the sum of the scores minus a constant
	 *
	 * Returns the query untouched when the 'applyLogisticFunction' option is off.
	 *
	 * @see https://phabricator.wikimedia.org/T271799
	 * @param AbstractQuery $query
	 * @return AbstractQuery
	 */
	private function applyLogisticFunction( AbstractQuery $query ): AbstractQuery {
		if ( !$this->options['applyLogisticFunction'] ) {
			return $query;
		}

		$script = new Script(
			// this will produce scores in the 0-100 range
			'100 / ( 1 + exp( -1 * ( _score + intercept ) ) )',
			[ 'intercept' => $this->options['logisticRegressionIntercept'] ],
			'expression'
		);

		return ( new FunctionScore() )
			->setQuery( $query )
			->addScriptScoreFunction( $script )
			->setBoostMode( FunctionScore::BOOST_MODE_REPLACE );
	}

	/**
	 * If we've applied a logistic function to the scores, then we expect the score to be
	 * between 0 and 100, HOWEVER if we have >1 text nodes we get a score of 0-100 for each,
	 * and therefore end up with a final score between 0 and 100*(number of nodes)
	 * Wrap the root node inside a function that divides the score by the number of nodes
	 *
	 * Nothing is wrapped unless both the 'applyLogisticFunction' and the
	 * 'normalizeMultiClauseScores' options are on and there is more than 1 clause.
	 *
	 * @param BoolQuery $query
	 * @param int|null $scoringClauses Number of clauses contributing to the score;
	 *  falls back to $query->count() when omitted
	 * @return AbstractQuery
	 */
	private function normalizeMultiClauseScores( BoolQuery $query, ?int $scoringClauses = null ): AbstractQuery {
		if (
			!$this->options['applyLogisticFunction']
			|| !$this->options['normalizeMultiClauseScores']
		) {
			return $query;
		}

		// NOTE(review): BoolQuery::count() appears to be inherited from Elastica's Param
		// and to count top-level parameters (should/must/...) rather than individual
		// clauses - confirm against the Elastica version in use. Callers that know the
		// real number of scoring clauses should pass it in.
		$count = $scoringClauses ?? $query->count();
		if ( $count <= 1 ) {
			return $query;
		}

		return ( new FunctionScore() )
			->setQuery( $query )
			->addScriptScoreFunction(
				new Script(
					'_score / count',
					[ 'count' => $count ],
					'expression'
				)
			)
			// replace (instead of the default combination with the query score) so the
			// result really is the score divided by the number of clauses, consistent
			// with applyLogisticFunction
			->setBoostMode( FunctionScore::BOOST_MODE_REPLACE );
	}

	/**
	 * Visits every clause of a boolean node and assembles the queries built for
	 * them into a bool query; clauses whose node produced no query are skipped.
	 * Nothing is stored for this node if none of its clauses produced a query.
	 *
	 * @param ParsedBooleanNode $node
	 */
	public function visitParsedBooleanNode( ParsedBooleanNode $node ) {
		$query = new BoolQuery();

		$should = 0;
		$must = 0;
		foreach ( $node->getClauses() as $clause ) {
			$clauseNode = $clause->getNode();
			$clauseNode->accept( $this );
			if ( !isset( $this->map[$clauseNode] ) ) {
				continue;
			}

			switch ( $clause->getOccur() ) {
				case BooleanClause::SHOULD:
					$query->addShould( $this->map[$clauseNode] );
					$should++;
					break;
				case BooleanClause::MUST:
					$query->addMust( $this->map[$clauseNode] );
					$must++;
					break;
				case BooleanClause::MUST_NOT:
					$query->addMustNot( $this->map[$clauseNode] );
					break;
			}
		}

		if ( $should && !$must ) {
			// If we have must and should clauses allow 0 should clauses to match. If we
			// only have should clauses require at least 1 to match.
			$query->setMinimumShouldMatch( 1 );
		}

		if ( $query->count() > 0 ) {
			// only should & must clauses add to the score; must_not merely excludes
			$this->map[$node] = $this->normalizeMultiClauseScores( $query, $should + $must );
		}
	}

	/**
	 * @param BooleanClause $clause
	 */
	public function visitBooleanClause( BooleanClause $clause ) {
		// BooleanClause is being handled in visitParsedBooleanNode already,
		// this will not be visited
	}

	/**
	 * Builds the query for a words node, expanded with a filtered & capped
	 * selection of synonyms, and wraps it in the (optional) logistic function.
	 *
	 * @param WordsQueryNode $node
	 */
	public function visitWordsQueryNode( WordsQueryNode $node ) {
		$words = $node->getWords();

		// the original term will be removed again later, but we should also consider
		// it when clearing out synonyms that are too similar
		// (array_replace instead of array_merge: purely numeric terms like "2020" are
		// stored under integer keys, which array_merge would renumber from 0)
		$synonyms = array_replace(
			[ $words => 10 ],
			$this->getSynonyms( $node, $this->options['synonymsMinScoreThreshold'] )
		);

		$synonyms = $this->filterTermsTooDissimilarCanonicalized(
			$synonyms,
			$this->options['synonymsMinSimilarityToCanonicalForm']
		);

		// collapse terms onto their canonical form, preserving the highest score in
		// case multiple terms end up with the same canonical form
		$canonicalized = [];
		foreach ( $synonyms as $term => $score ) {
			$canonical = $this->canonicalizeTerm( (string)$term );
			$canonicalized[$canonical] = max( $score, $canonicalized[$canonical] ?? 0 );
		}

		// the numeric options are stored as floats; cast for the int parameters
		$synonyms = $this->filterTermsTooShort( $canonicalized, (int)$this->options['synonymsMinByteLength'] );
		$synonyms = $this->filterTermsTooSimilar( $synonyms, $this->options['synonymsMinDifferenceFromOthers'] );
		$synonyms = $this->filterTermsSupersets( $synonyms );

		// remove original term (and duplicates thereof)
		unset( $synonyms[$this->canonicalizeTerm( $words )] );

		// preserve keys, or numeric terms would be renumbered here as well
		$synonyms = array_slice( $synonyms, 0, (int)$this->options['synonymsMaxAmount'], true );

		// $synonyms is [term => score], so the per-synonym languages must be keyed by
		// its keys; array_fill_keys( $synonyms, ... ) would key them by score instead
		// NOTE(review): assumes WordsQueryNodeHandler looks languages up by term - confirm
		$synonymsLanguages = array_fill_keys( array_keys( $synonyms ), [ $this->contentLanguage ] );

		$nodeHandler = new WordsQueryNodeHandler(
			$node,
			$this->getWikibaseEntitiesHandler( $node ),
			$this->languages,
			$synonyms,
			$synonymsLanguages,
			$this->stemmingSettings,
			$this->boosts,
			$this->decays
		);
		$this->map[$node] = $this->applyLogisticFunction( $nodeHandler->transform() );
	}

	/**
	 * Builds the query for a phrase node; unlike words nodes, no synonyms are
	 * added and the logistic function is not applied.
	 *
	 * @param PhraseQueryNode $node
	 */
	public function visitPhraseQueryNode( PhraseQueryNode $node ) {
		$nodeHandler = new PhraseQueryNodeHandler(
			$node,
			$this->getWikibaseEntitiesHandler( $node ),
			$this->languages,
			$this->stemmingSettings,
			$this->boosts,
			$this->decays
		);
		$this->map[$node] = $nodeHandler->transform();
	}

	/**
	 * @param PhrasePrefixNode $node
	 */
	public function visitPhrasePrefixNode( PhrasePrefixNode $node ) {
		// @phan-suppress-next-line PhanImpossibleCondition
		Assert::invariant( false, 'PhrasePrefixNode not (yet) supported.' );
	}

	/**
	 * @param NegatedNode $node
	 */
	public function visitNegatedNode( NegatedNode $node ) {
		// @phan-suppress-next-line PhanImpossibleCondition
		Assert::invariant( false, 'NegatedNode not (yet) supported.' );
	}

	/**
	 * @param FuzzyNode $node
	 */
	public function visitFuzzyNode( FuzzyNode $node ) {
		// @phan-suppress-next-line PhanImpossibleCondition
		Assert::invariant( false, 'FuzzyNode not (yet) supported.' );
	}

	/**
	 * @param PrefixNode $node
	 */
	public function visitPrefixNode( PrefixNode $node ) {
		// @phan-suppress-next-line PhanImpossibleCondition
		Assert::invariant( false, 'PrefixNode not (yet) supported.' );
	}

	/**
	 * @param WildcardNode $node
	 */
	public function visitWildcardNode( WildcardNode $node ) {
		// @phan-suppress-next-line PhanImpossibleCondition
		Assert::invariant( false, 'WildcardNode not (yet) supported.' );
	}

	/**
	 * @param EmptyQueryNode $node
	 */
	public function visitEmptyQueryNode( EmptyQueryNode $node ) {
		// nothing...
	}

	/**
	 * @param KeywordFeatureNode $node
	 */
	public function visitKeywordFeatureNode( KeywordFeatureNode $node ) {
		// this is already dealt with elsewhere in the query building process
	}

	/**
	 * @param NamespaceHeaderNode $node
	 */
	public function visitNamespaceHeader( NamespaceHeaderNode $node ) {
		// this is already dealt with elsewhere in the query building process
	}

	/**
	 * @param ParsedNode $node
	 * @return WikibaseEntitiesHandler
	 */
	private function getWikibaseEntitiesHandler( ParsedNode $node ): WikibaseEntitiesHandler {
		return new WikibaseEntitiesHandler(
			$node,
			$this->parsedQuery,
			$this->entitiesExtractor,
			$this->boosts,
			$this->options
		);
	}

	/**
	 * Collects the synonyms of all entities found for the node that pass the
	 * relevance threshold; always empty when the 'useSynonyms' option is off.
	 *
	 * @param WordsQueryNode $node
	 * @param float $threshold relevance percentage below which not to include synonyms
	 * @return array [synonym => score]
	 */
	private function getSynonyms( WordsQueryNode $node, float $threshold = 0.5 ): array {
		if ( !$this->options['useSynonyms'] ) {
			return [];
		}

		$synonyms = [];
		foreach ( $this->entitiesExtractor->getEntities( $this->parsedQuery, $node ) as $entity ) {
			$score = $entity['score'];
			if ( $score < $threshold ) {
				// skip entities that don't pass relevance threshold
				continue;
			}

			// assign per key rather than array_merge'ing: that keeps purely numeric
			// synonyms (stored under integer keys) from being renumbered, and lets a
			// synonym shared by multiple entities retain its highest score instead of
			// whichever entity happened to come last
			foreach ( $entity['synonyms'] ?? [] as $synonym ) {
				$synonyms[$synonym] = max( $score, $synonyms[$synonym] ?? $score );
			}
		}

		return $synonyms;
	}

	/**
	 * Lowercases the term and replaces every run of punctuation and separators
	 * by a single space, with leading & trailing whitespace trimmed.
	 *
	 * @param string $term
	 * @return string
	 */
	private function canonicalizeTerm( string $term ): string {
		// multibyte-aware, so that non-ASCII case variants are unified as well
		$canonical = mb_strtolower( $term, 'UTF-8' );
		// replace punctuation (\p{P}) and separators (\p{Z}) by a single space
		$canonical = preg_replace( '/[\p{P}\p{Z}]+/u', ' ', $canonical );
		// preg_replace returns null on failure (e.g. malformed UTF-8)
		return trim( $canonical ?? '' );
	}

	/**
	 * Removes the terms that are shorter than the given amount of bytes.
	 *
	 * @param array $synonyms [term => score]
	 * @param int $threshold minimum length, in bytes
	 * @return array [term => score]
	 */
	private function filterTermsTooShort( array $synonyms, int $threshold ): array {
		return array_filter(
			$synonyms,
			static function ( $term ) use ( $threshold ) {
				// discard terms that are too short (e.g. 1-letter latin characters) -
				// they're too generic & expensive
				return strlen( (string)$term ) >= $threshold;
			},
			ARRAY_FILTER_USE_KEY
		);
	}

	/**
	 * Removes the terms that are too different from their own canonical form.
	 *
	 * @param array $synonyms [term => score]
	 * @param float $threshold minimum similarity (0-1) between term and canonical form
	 * @return array [term => score]
	 */
	private function filterTermsTooDissimilarCanonicalized( array $synonyms, float $threshold ): array {
		return array_filter(
			$synonyms,
			function ( $term ) use ( $threshold ) {
				$term = (string)$term;
				// discard terms where a significant portion was punctuation or separators,
				// the canonical form likely is no longer representative enough (e.g `c#` != `c`)
				// (lowercased the same way canonicalizeTerm does, so that case alone
				// never counts as a difference)
				// @phan-suppress-next-line PhanPluginUseReturnValueInternalKnown
				similar_text( $this->canonicalizeTerm( $term ), mb_strtolower( $term, 'UTF-8' ), $similarity );
				return $similarity / 100 >= $threshold;
			},
			ARRAY_FILTER_USE_KEY
		);
	}

	/**
	 * Removes the terms that are too similar to any of the terms preceding them,
	 * and sorts the remainder by weight first, "different-ness" second.
	 *
	 * @param array $synonyms [term => score]
	 * @param float $threshold minimum difference (0-1) from all preceding terms
	 * @return array [term => score]
	 */
	private function filterTermsTooSimilar( array $synonyms, float $threshold ): array {
		// calculate the similarity to the terms that come before (with same or higher
		// weight) and get rid of terms that are simply too similar (e.g. 'cat' and
		// 'cats', or 'house cat' and 'housecat' are too similar; we'd rather spend our
		// resources looking for more significantly different terms)
		$differences = [];
		// lowercased version of every term processed so far (including those that
		// were discarded); tracking these as we go avoids having to look up the
		// position of each term with a loose (==) array_search
		$previousTerms = [];
		foreach ( array_keys( $synonyms ) as $term ) {
			$lowercased = mb_strtolower( (string)$term, 'UTF-8' );

			// smallest difference from any preceding term; null for the very first
			$minimum = null;
			foreach ( $previousTerms as $otherTerm ) {
				// @phan-suppress-next-line PhanPluginUseReturnValueInternalKnown
				similar_text( $lowercased, $otherTerm, $similarity );
				$difference = 1 - $similarity / 100;
				$minimum = $minimum === null ? $difference : min( $minimum, $difference );
			}
			$previousTerms[] = $lowercased;

			$differences[$term] = $minimum;
			if ( $minimum !== null && $minimum < $threshold ) {
				unset( $synonyms[$term] );
			}
		}

		// now re-sort them by difference compared to other terms (by weight),
		// so that we get the "more different" terms first; then sort by weight
		// again so that we end up with an array sorted by weight first, and
		// "different-ness" second
		uksort( $synonyms, static function ( $a, $b ) use ( $differences ) {
			return $differences[$b] <=> $differences[$a];
		} );
		arsort( $synonyms );

		return $synonyms;
	}

	/**
	 * Removes the terms that contain all words of another term of equal or
	 * higher weight.
	 *
	 * @param array $synonyms [term => score]
	 * @return array [term => score]
	 */
	private function filterTermsSupersets( array $synonyms ): array {
		// sort synonyms by descending weight & ascending term length, so that the
		// shorter of 2 equally weighted terms is the one that gets to stay
		uksort( $synonyms, static function ( $a, $b ) {
			return strlen( (string)$a ) <=> strlen( (string)$b );
		} );
		arsort( $synonyms );

		// remove synonyms that are a superset of something we're already searching
		// (unless said superset has a higher weight)
		// e.g. if we're already matching "commons", then trying to find documents
		// with "wikimedia commons" would yield no additional results - they'd
		// already be found with "commons"...
		// (yes, they would get a higher score for "wikimedia commons", but that's
		// no more or less correct than "commons" in this case - it's just as good
		// a description as the longer form as far as we know, both referring to the
		// exact same concept)
		$result = [];
		foreach ( $synonyms as $term => $weight ) {
			if ( !self::containsAllWordsOfAny( (string)$term, array_keys( $result ) ) ) {
				// this synonym turned out to be different enough from all others;
				// include it
				$result[$term] = $weight;
			}
		}

		return $result;
	}

	/**
	 * Whether the term contains, as whole words, every word of at least one of
	 * the other terms.
	 *
	 * @param string $term
	 * @param array $others Terms to compare against
	 * @return bool
	 */
	private static function containsAllWordsOfAny( string $term, array $others ): bool {
		foreach ( $others as $other ) {
			if ( !preg_match_all( '/\b[^\p{P}\p{Z}]+?\b/u', (string)$other, $matches ) ) {
				// nothing to compare against
				continue;
			}

			foreach ( $matches[0] as $word ) {
				// `u` modifier, like the pattern the words were extracted with, or
				// word boundaries would not be recognised around non-ASCII characters
				if ( !preg_match( '/\b' . preg_quote( $word, '/' ) . '\b/u', $term ) ) {
					// at least one of the words of the other term does not occur in
					// this term, so it's at least more exclusive in some way = this
					// term is no superset of that other
					continue 2;
				}
			}

			// another term of equal or higher weight already matches this
			return true;
		}

		return false;
	}

}