Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
75.43% covered (warning)
75.43%
175 / 232
34.78% covered (danger)
34.78%
8 / 23
CRAP
0.00% covered (danger)
0.00%
0 / 1
MediaSearchASTQueryBuilder
75.43% covered (warning)
75.43%
175 / 232
34.78% covered (danger)
34.78%
8 / 23
82.17
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
42 / 42
100.00% covered (success)
100.00%
1 / 1
1
 getQuery
87.50% covered (warning)
87.50%
14 / 16
0.00% covered (danger)
0.00%
0 / 1
3.02
 applyLogisticFunction
16.67% covered (danger)
16.67%
2 / 12
0.00% covered (danger)
0.00%
0 / 1
4.31
 normalizeMultiClauseScores
21.43% covered (danger)
21.43%
3 / 14
0.00% covered (danger)
0.00%
0 / 1
11.76
 visitParsedBooleanNode
70.00% covered (warning)
70.00%
14 / 20
0.00% covered (danger)
0.00%
0 / 1
11.19
 visitBooleanClause
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 visitWordsQueryNode
100.00% covered (success)
100.00%
33 / 33
100.00% covered (success)
100.00%
1 / 1
1
 visitPhraseQueryNode
0.00% covered (danger)
0.00%
0 / 9
0.00% covered (danger)
0.00%
0 / 1
2
 visitPhrasePrefixNode
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 visitNegatedNode
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 visitFuzzyNode
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 visitPrefixNode
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 visitWildcardNode
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 visitEmptyQueryNode
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 visitKeywordFeatureNode
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 visitNamespaceHeader
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getWikibaseEntitiesHandler
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
1
 getSynonyms
16.67% covered (danger)
16.67%
2 / 12
0.00% covered (danger)
0.00%
0 / 1
13.26
 canonicalizeTerm
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 filterTermsTooShort
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
1
 filterTermsTooDissimilarCanonicalized
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
1
 filterTermsTooSimilar
100.00% covered (success)
100.00%
21 / 21
100.00% covered (success)
100.00%
1 / 1
5
 filterTermsSupersets
100.00% covered (success)
100.00%
18 / 18
100.00% covered (success)
100.00%
1 / 1
5
1<?php
2
3namespace Wikibase\MediaInfo\Search;
4
5use CirrusSearch\Parser\AST\BooleanClause;
6use CirrusSearch\Parser\AST\EmptyQueryNode;
7use CirrusSearch\Parser\AST\FuzzyNode;
8use CirrusSearch\Parser\AST\KeywordFeatureNode;
9use CirrusSearch\Parser\AST\NamespaceHeaderNode;
10use CirrusSearch\Parser\AST\NegatedNode;
11use CirrusSearch\Parser\AST\ParsedBooleanNode;
12use CirrusSearch\Parser\AST\ParsedNode;
13use CirrusSearch\Parser\AST\ParsedQuery;
14use CirrusSearch\Parser\AST\PhrasePrefixNode;
15use CirrusSearch\Parser\AST\PhraseQueryNode;
16use CirrusSearch\Parser\AST\PrefixNode;
17use CirrusSearch\Parser\AST\Visitor\Visitor;
18use CirrusSearch\Parser\AST\WildcardNode;
19use CirrusSearch\Parser\AST\WordsQueryNode;
20use CirrusSearch\Query\Builder\NearMatchFieldQueryBuilder;
21use Elastica\Query\AbstractQuery;
22use Elastica\Query\BoolQuery;
23use Elastica\Query\FunctionScore;
24use Elastica\Query\MatchNone;
25use Elastica\Script\Script;
26use SplObjectStorage;
27use Wikibase\MediaInfo\Search\ASTQueryBuilder\PhraseQueryNodeHandler;
28use Wikibase\MediaInfo\Search\ASTQueryBuilder\WikibaseEntitiesHandler;
29use Wikibase\MediaInfo\Search\ASTQueryBuilder\WordsQueryNodeHandler;
30use Wikimedia\Assert\Assert;
31
32class MediaSearchASTQueryBuilder implements Visitor {
33    /** @var SplObjectStorage */
34    private $map;
35
36    /** @var ParsedQuery */
37    private $parsedQuery;
38
39    /** @var MediaSearchASTEntitiesExtractor */
40    private $entitiesExtractor;
41
42    /** @var array[] */
43    private $stemmingSettings;
44
45    /** @var string[] */
46    private $languages;
47
48    /** @var string */
49    private $contentLanguage;
50
51    /** @var float[] */
52    private $boosts;
53
54    /** @var float[] */
55    private $decays;
56
57    /** @var array */
58    private $options;
59
60    /**
61     * @param MediaSearchASTEntitiesExtractor $entitiesExtractor
62     * @param array[] $stemmingSettings Stemming settings (see $wgWBCSUseStemming)
63     * @param string[] $languages Languages to search text in
64     * @param string $contentLanguage Content language code
65     * @param array[] $settings Optional weight/decay overrides, plus some options
66     */
67    public function __construct(
68        MediaSearchASTEntitiesExtractor $entitiesExtractor,
69        array $stemmingSettings,
70        array $languages,
71        string $contentLanguage,
72        array $settings = []
73    ) {
74        $this->entitiesExtractor = $entitiesExtractor;
75        $this->stemmingSettings = $stemmingSettings;
76        $this->languages = $languages;
77        $this->contentLanguage = $contentLanguage;
78        $this->boosts = ( $settings['boost'] ?? [] ) + [
79            'statement' => 1.0,
80            'descriptions.$language' => 1.0,
81            'descriptions.$language.plain' => 1.0,
82            'title' => 1.0,
83            'title.plain' => 1.0,
84            'category' => 1.0,
85            'category.plain' => 1.0,
86            'heading' => 1.0,
87            'heading.plain' => 1.0,
88            'auxiliary_text' => 1.0,
89            'auxiliary_text.plain' => 1.0,
90            'file_text' => 1.0,
91            'file_text.plain' => 1.0,
92            'redirect.title' => 1.0,
93            'redirect.title.plain' => 1.0,
94            'text' => 1.0,
95            'text.plain' => 1.0,
96            'suggest' => 1.0,
97        ];
98        $this->decays = ( $settings['decay'] ?? [] ) + [
99            'descriptions.$language' => 1.0,
100            'descriptions.$language.plain' => 1.0,
101            'synonyms' => 1.0,
102        ];
103        $this->options = [
104            'normalizeMultiClauseScores' => (bool)( $settings['normalizeMultiClauseScores'] ?? false ),
105            'entitiesVariableBoost' => (bool)( $settings['entitiesVariableBoost'] ?? true ),
106            'applyLogisticFunction' => (bool)( $settings['applyLogisticFunction'] ?? false ),
107            'useSynonyms' => (bool)( $settings['useSynonyms'] ?? false ),
108            'logisticRegressionIntercept' => (float)( $settings['logisticRegressionIntercept'] ?? 0 ),
109            'synonymsMaxAmount' => (float)( $settings['synonymsMaxAmount'] ?? 0 ),
110            'synonymsMinScoreThreshold' => (float)( $settings['synonymsMinScoreThreshold'] ?? 0 ),
111            'synonymsMinByteLength' => (float)( $settings['synonymsMinByteLength'] ?? 0 ),
112            'synonymsMinSimilarityToCanonicalForm' => (float)( $settings['synonymsMinSimilarityToCanonicalForm'] ?? 0 ),
113            'synonymsMinDifferenceFromOthers' => (float)( $settings['synonymsMinDifferenceFromOthers'] ?? 0 ),
114            'nearMatchBoost' => (float)( $settings['nearMatchBoost'] ?? 5.0 ),
115        ];
116    }
117
118    public function getQuery( ParsedQuery $parsedQuery ): AbstractQuery {
119        $this->map = new SplObjectStorage();
120        $this->parsedQuery = $parsedQuery;
121        $root = $parsedQuery->getRoot();
122        $root->accept( $this );
123        $nearMatchQuery = NearMatchFieldQueryBuilder::defaultFromWeight( $this->options["nearMatchBoost"] )
124            ->buildFromParsedQuery( $parsedQuery );
125        $mainQuery = $this->map[$root] ?? new MatchNone();
126        if ( $mainQuery instanceof MatchNone ) {
127            $actualQuery = $nearMatchQuery;
128        } elseif ( $nearMatchQuery instanceof MatchNone ) {
129            $actualQuery = $mainQuery;
130        } else {
131            $actualQuery = new BoolQuery();
132            $actualQuery->addShould( $nearMatchQuery );
133            $actualQuery->addShould( $mainQuery );
134            $actualQuery->setMinimumShouldMatch( 1 );
135        }
136
137        return $actualQuery;
138    }
139
140    /**
141     * Applies a logistic function to the sum of the scores minus a constant
142     *
143     * @see https://phabricator.wikimedia.org/T271799
144     * @param AbstractQuery $query
145     * @return AbstractQuery
146     */
147    private function applyLogisticFunction( AbstractQuery $query ): AbstractQuery {
148        if ( !$this->options[ 'applyLogisticFunction' ] ) {
149            return $query;
150        }
151
152        return ( new FunctionScore() )
153            ->setQuery( $query )
154            ->addScriptScoreFunction(
155                new Script(
156                    // this will produce scores in the 0-100 range
157                    '100 / ( 1 + exp( -1 * ( _score + intercept ) ) )',
158                    [ 'intercept' => $this->options['logisticRegressionIntercept'] ],
159                    'expression'
160                )
161            )
162            ->setBoostMode( FunctionScore::BOOST_MODE_REPLACE );
163    }
164
165    /**
166     * If we've applied a logistic function to the scores, then we expect the score to be
167     * between 0 and 100, HOWEVER if we have >1 text nodes we get a score of 0-1 for each,
168     * and therefore end up with a final score between 0 and 100*(number of nodes)
169     * Wrap the root node inside a function that divides the score by the number of nodes
170     *
171     * @param BoolQuery $query
172     * @return AbstractQuery
173     */
174    private function normalizeMultiClauseScores( BoolQuery $query ): AbstractQuery {
175        if (
176            !$this->options[ 'applyLogisticFunction' ]
177             || !$this->options[ 'normalizeMultiClauseScores' ]
178        ) {
179            return $query;
180        }
181
182        if ( $query->count() <= 1 ) {
183            return $query;
184        }
185
186        return ( new FunctionScore() )
187            ->setQuery( $query )
188            ->addScriptScoreFunction(
189                new Script(
190                    '_score / count',
191                    [ 'count' => $query->count() ],
192                    'expression'
193                )
194            );
195    }
196
197    public function visitParsedBooleanNode( ParsedBooleanNode $node ) {
198        $query = new BoolQuery();
199
200        $should = $must = 0;
201        foreach ( $node->getClauses() as $clause ) {
202            $clauseNode = $clause->getNode();
203            $clauseNode->accept( $this );
204            if ( isset( $this->map[$clauseNode] ) ) {
205                switch ( $clause->getOccur() ) {
206                    case BooleanClause::SHOULD:
207                        $query->addShould( $this->map[$clauseNode] );
208                        $should++;
209                        break;
210                    case BooleanClause::MUST:
211                        $query->addMust( $this->map[$clauseNode] );
212                        $must++;
213                        break;
214                    case BooleanClause::MUST_NOT:
215                        $query->addMustNot( $this->map[$clauseNode] );
216                        break;
217                }
218            }
219        }
220        if ( $should && !$must ) {
221            // If we have must and should clauses allow 0 should clauses to match. If we
222            // only have should clauses require at least 1 to match.
223            $query->setMinimumShouldMatch( 1 );
224        }
225
226        if ( $query->count() > 0 ) {
227            $query = $this->normalizeMultiClauseScores( $query );
228            $this->map[$node] = $query;
229        }
230    }
231
232    public function visitBooleanClause( BooleanClause $clause ) {
233        // BooleanClause is being handled in visitParsedBooleanNode already,
234        // this will not be visited
235    }
236
237    public function visitWordsQueryNode( WordsQueryNode $node ) {
238        $synonyms = array_merge(
239            // the original term (below) will be removed again later, but we should
240            // also consider it when clearing out synonyms that are too similar
241            [ $node->getWords() => 10 ],
242            $this->getSynonyms( $node, $this->options['synonymsMinScoreThreshold'] )
243        );
244
245        $synonyms = $this->filterTermsTooDissimilarCanonicalized(
246            $synonyms,
247            $this->options['synonymsMinSimilarityToCanonicalForm']
248        );
249        $synonyms = array_reduce(
250            array_keys( $synonyms ),
251            function ( $result, $term ) use ( $synonyms ) {
252                $canonical = $this->canonicalizeTerm( $term );
253                $result[$canonical] = max( $synonyms[$term], $result[$canonical] ?? 0 );
254                return $result;
255            },
256            []
257        );
258        $synonyms = $this->filterTermsTooShort( $synonyms, $this->options['synonymsMinByteLength'] );
259        $synonyms = $this->filterTermsTooSimilar( $synonyms, $this->options['synonymsMinDifferenceFromOthers'] );
260        $synonyms = $this->filterTermsSupersets( $synonyms );
261
262        // remove original term (and duplicates thereof)
263        unset( $synonyms[$this->canonicalizeTerm( $node->getWords() )] );
264
265        $synonyms = array_slice( $synonyms, 0, $this->options['synonymsMaxAmount'] );
266
267        $nodeHandler = new WordsQueryNodeHandler(
268            $node,
269            $this->getWikibaseEntitiesHandler( $node ),
270            $this->languages,
271            $synonyms,
272            array_fill_keys( $synonyms, [ $this->contentLanguage ] ),
273            $this->stemmingSettings,
274            $this->boosts,
275            $this->decays
276        );
277        $this->map[$node] = $this->applyLogisticFunction( $nodeHandler->transform() );
278    }
279
280    public function visitPhraseQueryNode( PhraseQueryNode $node ) {
281        $nodeHandler = new PhraseQueryNodeHandler(
282            $node,
283            $this->getWikibaseEntitiesHandler( $node ),
284            $this->languages,
285            $this->stemmingSettings,
286            $this->boosts,
287            $this->decays
288        );
289        $this->map[$node] = $nodeHandler->transform();
290    }
291
292    public function visitPhrasePrefixNode( PhrasePrefixNode $node ) {
293        // @phan-suppress-next-line PhanImpossibleCondition
294        Assert::invariant( false, 'PhrasePrefixNode not (yet) supported.' );
295    }
296
297    public function visitNegatedNode( NegatedNode $node ) {
298        // @phan-suppress-next-line PhanImpossibleCondition
299        Assert::invariant( false, 'NegatedNode not (yet) supported.' );
300    }
301
302    public function visitFuzzyNode( FuzzyNode $node ) {
303        // @phan-suppress-next-line PhanImpossibleCondition
304        Assert::invariant( false, 'FuzzyNode not (yet) supported.' );
305    }
306
307    public function visitPrefixNode( PrefixNode $node ) {
308        // @phan-suppress-next-line PhanImpossibleCondition
309        Assert::invariant( false, 'PrefixNode not (yet) supported.' );
310    }
311
312    public function visitWildcardNode( WildcardNode $node ) {
313        // @phan-suppress-next-line PhanImpossibleCondition
314        Assert::invariant( false, 'WildcardNode not (yet) supported.' );
315    }
316
317    public function visitEmptyQueryNode( EmptyQueryNode $node ) {
318        // nothing...
319    }
320
321    public function visitKeywordFeatureNode( KeywordFeatureNode $node ) {
322        // this is already dealt with elsewhere in the query building process
323    }
324
325    public function visitNamespaceHeader( NamespaceHeaderNode $node ) {
326        // this is already dealt with elsewhere in the query building process
327    }
328
329    private function getWikibaseEntitiesHandler( ParsedNode $node ) {
330        return new WikibaseEntitiesHandler(
331            $node,
332            $this->parsedQuery,
333            $this->entitiesExtractor,
334            $this->boosts,
335            $this->options
336        );
337    }
338
339    /**
340     * @param WordsQueryNode $node
341     * @param float $threshold relevance percentage below which not to include synonyms
342     * @return array [synonym => score]
343     */
344    private function getSynonyms( WordsQueryNode $node, float $threshold = 0.5 ): array {
345        if ( !$this->options[ 'useSynonyms' ] ) {
346            return [];
347        }
348
349        $entities = $this->entitiesExtractor->getEntities( $this->parsedQuery, $node );
350
351        $synonyms = [];
352        foreach ( $entities as $entity ) {
353            if ( $entity['score'] < $threshold ) {
354                // skip entities that don't pass relevance threshold
355                continue;
356            }
357
358            $synonyms = array_merge(
359                $synonyms,
360                array_fill_keys( $entity['synonyms'] ?? [], $entity['score'] )
361            );
362        }
363
364        return $synonyms;
365    }
366
367    private function canonicalizeTerm( string $term ): string {
368        $canonical = strtolower( $term );
369        // replace punctuation (\p{P}) and separators (\p{Z}) by a single space
370        $canonical = preg_replace( '/[\p{P}\p{Z}]+/u', ' ', $canonical );
371        return trim( $canonical );
372    }
373
374    private function filterTermsTooShort( array $synonyms, int $threshold ): array {
375        // remove variations, preserving the highest value in case of duplicates
376        return array_filter(
377            $synonyms,
378            static function ( $term ) use ( $threshold ) {
379                // discard 1-letter latin characters - they're too generic & expensive
380                return strlen( $term ) >= $threshold;
381            },
382            ARRAY_FILTER_USE_KEY
383        );
384    }
385
386    private function filterTermsTooDissimilarCanonicalized( array $synonyms, float $threshold ): array {
387        // remove variations, preserving the highest value in case of duplicates
388        return array_filter(
389            $synonyms,
390            function ( $term ) use ( $threshold ) {
391                $canonical = $this->canonicalizeTerm( $term );
392                // discard terms where a significant portion was punctuation or separators,
393                // the canonical form likely is no longer representative enough (e.g `c#` != `c`)
394                // @phan-suppress-next-line PhanPluginUseReturnValueInternalKnown
395                similar_text( strtolower( $canonical ), strtolower( $term ), $similarity );
396                return $similarity / 100 >= $threshold;
397            },
398            ARRAY_FILTER_USE_KEY
399        );
400    }
401
402    private function filterTermsTooSimilar( array $synonyms, float $threshold ): array {
403        // now calculate the similarity to other terms (with same or higher weight)
404        // and get rid of terms that are simply too similar (e.g. 'cat' and 'cats',
405        // or 'house cat' and 'housecat' are too similar; we'd rather spend our
406        // resources looking for more significantly different terms)
407        $terms = array_keys( $synonyms );
408        $differences = [];
409        foreach ( $synonyms as $term => $score ) {
410            $index = array_search( $term, $terms );
411            $previousTerms = array_slice( $terms, 0, $index );
412            $differences[$term] = array_reduce(
413                $previousTerms,
414                static function ( $min, $otherTerm ) use ( $term ) {
415                    // @phan-suppress-next-line PhanPluginUseReturnValueInternalKnown
416                    similar_text( strtolower( $term ), strtolower( $otherTerm ), $similarity );
417                    $difference = 1 - $similarity / 100;
418                    return $min === null ? $difference : min( $min, $difference );
419                },
420                null
421            );
422            if ( $differences[$term] !== null && $differences[$term] < $threshold ) {
423                unset( $synonyms[$term] );
424            }
425        }
426
427        // now re-sort them by difference compared to other terms (by weight),
428        // so that we get the "more different" terms first; then sort by weight
429        // again so that we end up with an array sorted by weight first, and
430        // "different-ness" second
431        uksort( $synonyms, static function ( $a, $b ) use ( $differences ) {
432            return $differences[ $b ] <=> $differences[ $a ];
433        } );
434        arsort( $synonyms );
435
436        return $synonyms;
437    }
438
439    private function filterTermsSupersets( array $synonyms ): array {
440        // sort synonyms by descending weight & descending term length
441        uksort( $synonyms, static function ( $a, $b ) {
442            return strlen( $a ) <=> strlen( $b );
443        } );
444        arsort( $synonyms );
445
446        // remove synonyms that are a superset of something we're already searching
447        // (unless said superset has a higher weight)
448        // e.g. if we're already matching "commons", then trying to find documents
449        // with "wikimedia commons" would yield no additional results - they'd
450        // already be found with "commons"...
451        // (yes, they would get a higher score for "wikimedia commons", but that's
452        // no more or less correct than "commons" in this case - it's just as good
453        // a description as the longer form as far as we know, both referring to the
454        // exact same concept
455        return array_reduce(
456            array_keys( $synonyms ),
457            static function ( $result, $term ) use ( $synonyms ) {
458                foreach ( $result as $existing => $weight ) {
459                    if ( preg_match_all( '/\b[^\p{P}\p{Z}]+?\b/u', $existing, $matches ) ) {
460                        foreach ( $matches[0] as $word ) {
461                            if ( !preg_match( '/\b' . preg_quote( $word, '/' ) . '\b/', $term ) ) {
462                                // at least one of the words of another synonym do not
463                                // occur in this term, so it's at least more exclusive
464                                // in some way = this term is no superset of that other
465                                continue 2;
466                            }
467                        }
468                        // another term of equal or higher weight already matches this
469                        return $result;
470                    }
471                }
472                // this synonym turned out to be different enough from all others;
473                // include it
474                $result[$term] = $synonyms[$term];
475                return $result;
476            },
477            []
478        );
479    }
480
481}