Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 72 |
|
0.00% |
0 / 3 |
CRAP | |
0.00% |
0 / 1 |
WordsQueryNodeHandler | |
0.00% |
0 / 72 |
|
0.00% |
0 / 3 |
72 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 30 |
|
0.00% |
0 / 1 |
6 | |||
transform | |
0.00% |
0 / 36 |
|
0.00% |
0 / 1 |
30 | |||
getTermScoringFieldQueryBuilder | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace Wikibase\MediaInfo\Search\ASTQueryBuilder; |
4 | |
5 | use CirrusSearch\Parser\AST\WordsQueryNode; |
6 | use Elastica\Query\AbstractQuery; |
7 | use Elastica\Query\BoolQuery; |
8 | use Elastica\Query\DisMax; |
9 | use Elastica\Query\MatchQuery; |
10 | use Elastica\Query\MultiMatch; |
11 | |
/**
 * Turns a WordsQueryNode (plain search words) into an Elastica query.
 *
 * The resulting query scores documents on the original search words and,
 * optionally, on a set of synonyms, and additionally ORs in whatever query
 * the given WikibaseEntitiesHandler produces.
 */
class WordsQueryNodeHandler implements ParsedNodeHandlerInterface {
	/** @var WordsQueryNode */
	private $node;

	/** @var WikibaseEntitiesHandler */
	private $entitiesHandler;

	/** @var float[] */
	private $decays;

	/**
	 * Scoring-clause iterators, keyed by the term they score (the original
	 * search words, plus one entry per synonym).
	 *
	 * @var FieldIterator[]
	 */
	private $termScoringFieldIterators = [];

	/**
	 * @param WordsQueryNode $node
	 * @param WikibaseEntitiesHandler $entitiesHandler
	 * @param array $languages Languages passed to the original term's FieldIterator
	 * @param array $synonyms Map of synonym term => score; the score is
	 *  multiplied into every field boost for that synonym
	 * @param array $synonymsLanguages Map of synonym term => languages for that
	 *  synonym (a synonym without an entry gets an empty language list)
	 * @param array $stemmingSettings
	 * @param array $boosts Map of field => boost; only fields listed in
	 *  FieldIterator::LANGUAGE_AGNOSTIC_FIELDS or
	 *  FieldIterator::LANGUAGE_AWARE_FIELDS are used here
	 * @param array $decays
	 */
	public function __construct(
		WordsQueryNode $node,
		WikibaseEntitiesHandler $entitiesHandler,
		array $languages,
		array $synonyms,
		array $synonymsLanguages,
		array $stemmingSettings,
		array $boosts,
		array $decays
	) {
		// only keep the boosts for fields this handler knows how to query
		$fulltextBoosts = array_intersect_key(
			$boosts,
			array_flip(
				array_merge(
					FieldIterator::LANGUAGE_AGNOSTIC_FIELDS,
					FieldIterator::LANGUAGE_AWARE_FIELDS
				)
			)
		);
		$fulltextFields = array_keys( $fulltextBoosts );
		$words = $node->getWords();

		$this->node = $node;
		$this->entitiesHandler = $entitiesHandler;
		$this->decays = $decays;

		$this->termScoringFieldIterators[$words] = new FieldIterator(
			$this->getTermScoringFieldQueryBuilder( $words ),
			$fulltextFields,
			$languages,
			$stemmingSettings,
			$fulltextBoosts,
			$decays
		);

		// create iterators for all synonyms, where the scores are applied to the boost
		foreach ( $synonyms as $term => $score ) {
			// PHP casts numeric-string array keys (e.g. "2020") to int; the
			// query builders expect the term as text
			$termText = (string)$term;
			if ( $termText === (string)$words ) {
				// a synonym identical to the search words must not replace
				// the (full-boost) iterator registered for the original term
				continue;
			}

			$this->termScoringFieldIterators[$term] = new FieldIterator(
				$this->getTermScoringFieldQueryBuilder( $termText ),
				$fulltextFields,
				$synonymsLanguages[$term] ?? [],
				$stemmingSettings,
				array_map( static function ( $boost ) use ( $score ) {
					return $boost * $score;
				}, $fulltextBoosts ),
				$decays
			);
		}
	}

	/**
	 * Builds the query for the search words, their synonyms and the
	 * wikibase entities.
	 *
	 * @return AbstractQuery
	 */
	public function transform(): AbstractQuery {
		$words = $this->node->getWords();

		// we (may) have multiple terms to match (the original search term,
		// but also synonyms), which we'll wrap all in a dis_max to ensure
		// that the scores don't spiral out of control and grow too large
		// with too many synonyms
		// that said, if/when a document matches multiple synonyms, that's
		// a fairly strong indication that it's a pretty good match for the
		// subject, so we'll add a tiebreaker to allow some additional boost
		// (though these additional matches won't be worth as much)
		$termsDisMax = new DisMax();
		$termsDisMax->setTieBreaker( $this->decays['synonyms'] ?? 0 );

		// search term: all words must be present (filter), scoring is done
		// per field by the should clauses
		$termsDisMax->addQuery(
			$this->buildScoredTermQuery(
				( new MultiMatch() )
					->setQuery( $words )
					->setFields( [ 'all', 'all.plain' ] )
					->setOperator( MultiMatch::OPERATOR_AND ),
				$words
			)
		);

		// synonyms for search term
		// this is very similar to the original search term above, except
		// that we'll be more strict in the filter & expect a phrase match
		// they'll be wrapped inside another dis_max group to make sure
		// that only the single best synonym can contribute to the score,
		// because synonyms are often minor variations of similar text
		// and could lead to massively inflated text matches in such case
		$synonyms = array_diff( array_keys( $this->termScoringFieldIterators ), [ $words ] );
		if ( count( $synonyms ) > 0 ) {
			$synonymsDisMax = new DisMax();
			foreach ( $synonyms as $synonym ) {
				$synonymsDisMax->addQuery(
					$this->buildScoredTermQuery(
						( new MultiMatch() )
							// array keys may have been cast to int by PHP
							->setQuery( (string)$synonym )
							->setFields( [ 'all' ] )
							// needs to be exact (phrase) match to avoid, as much as
							// possible, false positives
							->setType( 'phrase' ),
						$synonym
					)
				);
			}
			$termsDisMax->addQuery( $synonymsDisMax );
		}

		$query = new BoolQuery();
		$query->setMinimumShouldMatch( 1 );
		// search term + synonyms
		$query->addShould( $termsDisMax );
		// wikibase entities
		$query->addShould( $this->entitiesHandler->transform() );

		return $query;
	}

	/**
	 * Wraps a filter plus the per-field scoring clauses registered for
	 * $term into a single bool query. The should clauses are optional
	 * (minimum_should_match = 0): they only contribute to the score.
	 *
	 * @param AbstractQuery $filter
	 * @param string|int $term Key into $this->termScoringFieldIterators
	 * @return BoolQuery
	 */
	private function buildScoredTermQuery( AbstractQuery $filter, $term ): BoolQuery {
		$termQuery = new BoolQuery();
		$termQuery->setMinimumShouldMatch( 0 );
		$termQuery->addFilter( $filter );
		foreach ( $this->termScoringFieldIterators[$term] as $fieldQuery ) {
			$termQuery->addShould( $fieldQuery );
		}

		return $termQuery;
	}

	/**
	 * Returns a builder that produces a boosted match query for $term
	 * against whichever field it is asked about.
	 *
	 * @param string $term
	 * @return FieldQueryBuilderInterface
	 */
	private function getTermScoringFieldQueryBuilder( $term ): FieldQueryBuilderInterface {
		return new class( $term ) implements FieldQueryBuilderInterface {
			/** @var string */
			private $term;

			public function __construct( $term ) {
				$this->term = $term;
			}

			public function getQuery( $field, $boost ): AbstractQuery {
				return ( new MatchQuery() )
					->setFieldQuery( $field, $this->term )
					->setFieldBoost( $field, $boost );
			}
		};
	}
}