Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
94.62% covered (success)
94.62%
123 / 130
75.00% covered (warning)
75.00%
6 / 8
CRAP
0.00% covered (danger)
0.00%
0 / 1
EntityFullTextQueryBuilder
94.62% covered (success)
94.62%
123 / 130
75.00% covered (warning)
75.00%
6 / 8
22.08
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
1
 newFromGlobals
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
1
 build
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 buildDegraded
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 buildEntitySearchQuery
100.00% covered (success)
100.00%
65 / 65
100.00% covered (success)
100.00%
1 / 1
9
 buildSimpleAllFilter
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
3
 buildFieldMatch
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 buildPhraseRescore
81.25% covered (warning)
81.25%
26 / 32
0.00% covered (danger)
0.00%
0 / 1
4.11
1<?php
2
3namespace Wikibase\Search\Elastic;
4
5use CirrusSearch\Extra\Query\TokenCountRouter;
6use CirrusSearch\Query\FullTextQueryBuilder;
7use CirrusSearch\Search\SearchContext;
8use Elastica\Query\AbstractQuery;
9use Elastica\Query\BoolQuery;
10use Elastica\Query\DisMax;
11use Elastica\Query\MatchNone;
12use Elastica\Query\MatchQuery;
13use Elastica\Query\MultiMatch;
14use Elastica\Query\Term;
15use MediaWiki\Context\RequestContext;
16use MediaWiki\MediaWikiServices;
17use Wikibase\DataModel\Entity\EntityIdParser;
18use Wikibase\Lib\LanguageFallbackChainFactory;
19use Wikibase\Repo\WikibaseRepo;
20
/**
 * Builder for entity fulltext queries.
 *
 * Produces the main Elasticsearch query and the phrase rescore query used for
 * fulltext search over Wikibase entities, and switches the result type of the
 * search context to EntityResultType.
 */
class EntityFullTextQueryBuilder implements FullTextQueryBuilder {
    public const ENTITY_FULL_TEXT_MARKER = 'entity_full_text';

    /**
     * Search profile settings (field weights, discounts, phrase rescore
     * settings) from EntitySearchProfiles.php
     * @var array
     */
    private $settings;
    /**
     * Stemming settings from the UseStemming config entry, keyed by language code
     * @var array
     */
    private $stemmingSettings;
    /**
     * @var LanguageFallbackChainFactory
     */
    private $languageFallbackChainFactory;
    /**
     * @var EntityIdParser
     */
    private $entityIdParser;
    /**
     * @var string User language code
     */
    private $userLanguage;

    /**
     * @param array $stemmingSettings Stemming settings from UseStemming config entry
     * @param array $settings Settings from EntitySearchProfiles.php
     * @param LanguageFallbackChainFactory $languageFallbackChainFactory
     * @param EntityIdParser $entityIdParser
     * @param string $userLanguage User's language code
     */
    public function __construct(
        array $stemmingSettings,
        array $settings,
        LanguageFallbackChainFactory $languageFallbackChainFactory,
        EntityIdParser $entityIdParser,
        $userLanguage
    ) {
        $this->stemmingSettings = $stemmingSettings;
        $this->settings = $settings;
        $this->languageFallbackChainFactory = $languageFallbackChainFactory;
        $this->entityIdParser = $entityIdParser;
        $this->userLanguage = $userLanguage;
    }

    /**
     * Create fulltext builder from global environment.
     *
     * Reads 'UseStemming' from the 'WikibaseCirrusSearch' config and takes the
     * user language from the main request context.
     *
     * @param array $settings Configuration from config file
     * @return static
     */
    public static function newFromGlobals( array $settings ) {
        $services = MediaWikiServices::getInstance();
        $config = $services->getConfigFactory()->makeConfig( 'WikibaseCirrusSearch' );
        return new static(
            $config->get( 'UseStemming' ),
            $settings,
            WikibaseRepo::getLanguageFallbackChainFactory( $services ),
            WikibaseRepo::getEntityIdParser( $services ),
            RequestContext::getMain()->getLanguage()->getCode()
        );
    }

    /**
     * Search entities with provided term.
     *
     * Sets the main query and the phrase rescore query on the search context,
     * then replaces the result type with an entity-specific one.
     *
     * @param SearchContext $searchContext
     * @param string $term term to search
     */
    public function build( SearchContext $searchContext, $term ) {
        $this->buildEntitySearchQuery( $searchContext, $term );
        // Entity results are always rendered through the entity result type,
        // using the user language and its fallback chain.
        // FIXME: make it dispatch by content model
        $searchContext->setResultsType( new EntityResultType(
            $this->userLanguage,
            $this->languageFallbackChainFactory->newFromLanguageCode( $this->userLanguage )
        ) );
    }

    /**
     * @param SearchContext $searchContext
     * @return bool Always false: no degraded query is provided
     */
    public function buildDegraded( SearchContext $searchContext ) {
        // Not doing anything for now
        return false;
    }

    /**
     * Build a fulltext query for Wikibase entity.
     * @param SearchContext $searchContext
     * @param string $term Search term
     */
    protected function buildEntitySearchQuery( SearchContext $searchContext, $term ) {
        $searchContext->addSyntaxUsed( self::ENTITY_FULL_TEXT_MARKER, 10 );
        /*
         * Overall query structure is as follows:
         * - Bool (should with 1 minimum) of:
         *     title.keyword = normalized QUERY
         *     labels/descriptions query
         *
         * Labels/descriptions query is a Bool with:
         *   Filter (should with 1 minimum) of:
         *      all, all.plain matching with AND operator
         *      for each language of the fallback chain, with AND operator:
         *        descriptions.LANG and labels.LANG (stemmed language) or
         *        descriptions.LANG.plain (non-stemmed language)
         *   OR (should with 0 minimum) of:
         *     DISMAX query of: all labels near_match/near_match_folded in fallback chain
         *     Bool (should with 1 minimum) of:
         *        all
         *        all.plain
         *        DISMAX of: all fulltext matches for tokenized fields
         */

        $profile = $this->settings;
        $userLanguage = $this->userLanguage;

        // $fields is collecting all the fields for dismax query to be used in
        // scoring match
        $fields = [
            [ "labels.{$userLanguage}.near_match", $profile['lang-exact'] ],
            [ "labels.{$userLanguage}.near_match_folded", $profile['lang-folded'] ],
        ];

        // $fieldsTokenized collects the analyzed fields (with their boost) for
        // the tokenized dismax query
        $fieldsTokenized = [
            [ "labels.{$userLanguage}.plain", $profile['lang-partial'] ],
            [ "descriptions.{$userLanguage}.plain", $profile['lang-partial'] ],
        ];
        if ( !empty( $this->stemmingSettings[$userLanguage]['query'] ) ) {
            $fieldsTokenized[] = [ "labels.{$userLanguage}", $profile['lang-partial'] ];
            $fieldsTokenized[] = [ "descriptions.{$userLanguage}", $profile['lang-partial'] ];
        }

        $searchLanguageCodes = $this->languageFallbackChainFactory
            ->newFromLanguageCode( $userLanguage )
            ->getFetchLanguageCodes();

        // The discount compounds: it is multiplied by 'fallback-discount' after
        // every fallback language other than the user language.
        $discount = $profile['fallback-discount'];
        $stemFilterFields = [];

        foreach ( $searchLanguageCodes as $fallbackCode ) {
            $useStemming = !empty( $this->stemmingSettings[$fallbackCode]['query'] );

            if ( $useStemming ) {
                $stemFilterFields[] = "descriptions.{$fallbackCode}";
                // only add the stemmed version in the filter
                // labels should be copied to the text field and thus be captured by the filter on the all field
                $stemFilterFields[] = "labels.{$fallbackCode}";
            } else {
                $stemFilterFields[] = "descriptions.{$fallbackCode}.plain";
            }

            if ( $fallbackCode === $userLanguage ) {
                // Scoring fields for the user language were added above with
                // the undiscounted 'lang-*' weights.
                continue;
            }

            $weight = $profile['fallback-exact'] * $discount;
            $fields[] = [ "labels.{$fallbackCode}.near_match", $weight ];

            $weight = $profile['fallback-folded'] * $discount;
            $fields[] = [ "labels.{$fallbackCode}.near_match_folded", $weight ];

            $weight = $profile['fallback-partial'] * $discount;
            $fieldsTokenized[] = [ "labels.{$fallbackCode}.plain", $weight ];
            $fieldsTokenized[] = [ "descriptions.{$fallbackCode}.plain", $weight ];
            if ( $useStemming ) {
                $fieldsTokenized[] = [ "descriptions.{$fallbackCode}", $weight ];
                $fieldsTokenized[] = [ "labels.{$fallbackCode}", $weight ];
            }

            $discount *= $profile['fallback-discount'];
        }

        $titleMatch = new Term( [
            'title.keyword' => EntitySearchUtils::normalizeId( $term, $this->entityIdParser ),
        ] );

        // Main query filter
        $filterQuery = $this->buildSimpleAllFilter( $term );
        foreach ( $stemFilterFields as $filterField ) {
            $filterQuery->addShould( $this->buildFieldMatch( $filterField, $term, 'AND' ) );
        }

        // Near match ones, they use constant score
        $nearMatchQuery = new DisMax();
        $nearMatchQuery->setTieBreaker( 0 );
        foreach ( $fields as [ $fieldName, $fieldWeight ] ) {
            $nearMatchQuery->addQuery(
                EntitySearchUtils::makeConstScoreQuery( $fieldName, $fieldWeight, $term )
            );
        }

        // Tokenized ones
        $tokenizedQuery = $this->buildSimpleAllFilter( $term, 'OR', $profile['any'] );
        $tokenizedQueryFields = new DisMax();
        $tokenizedQueryFields->setTieBreaker( 0.2 );
        foreach ( $fieldsTokenized as [ $fieldName, $fieldWeight ] ) {
            $m = $this->buildFieldMatch( $fieldName, $term );
            $m->setFieldBoost( $fieldName, $fieldWeight );
            $tokenizedQueryFields->addQuery( $m );
        }
        $tokenizedQuery->addShould( $tokenizedQueryFields );

        // Main labels/desc query
        $labelsDescQuery = new BoolQuery();
        $labelsDescQuery->setMinimumShouldMatch( 0 );
        $labelsDescQuery->addFilter( $filterQuery );
        $labelsDescQuery->addShould( $nearMatchQuery );
        $labelsDescQuery->addShould( $tokenizedQuery );

        // Main query
        $query = new BoolQuery();

        // Match either labels or exact match to title
        $query->addShould( $titleMatch );
        $query->addShould( $labelsDescQuery );
        $query->setMinimumShouldMatch( 1 );

        $searchContext->setMainQuery( $query );
        $searchContext->setPhraseRescoreQuery( $this->buildPhraseRescore( $term, $searchContext, $profile ) );
    }

    /**
     * Builds a bool query on all and all.plain where at least one of the two
     * fields must match; with the default AND operator all terms must match.
     *
     * @param string $query
     * @param string $operator Match operator applied to both fields
     * @param float|null $boost Boost applied to both fields; no boost is set
     *  when the value is empty (null or 0)
     * @return BoolQuery
     */
    private function buildSimpleAllFilter( $query, $operator = 'AND', $boost = null ): BoolQuery {
        $filter = new BoolQuery();
        $filter->setMinimumShouldMatch( 1 );
        // FIXME: We can't use solely the stem field here
        // - Depending on languages it may lack stopwords,
        // A dedicated field used for filtering would be nice
        foreach ( [ 'all', 'all.plain' ] as $field ) {
            $m = new MatchQuery();
            $m->setFieldQuery( $field, $query );
            $m->setFieldOperator( $field, $operator );
            if ( $boost ) {
                $m->setFieldBoost( $field, $boost );
            }
            $filter->addShould( $m );
        }
        return $filter;
    }

    /**
     * Build simple match clause, matching field against term
     * @param string $field
     * @param string $term
     * @param string|null $operator Match operator; left at the query default when empty
     * @return MatchQuery
     */
    private function buildFieldMatch( $field, $term, $operator = null ): MatchQuery {
        $m = new MatchQuery();
        $m->setFieldQuery( $field, $term );
        if ( $operator ) {
            $m->setFieldOperator( $field, $operator );
        }
        return $m;
    }

    /**
     * Create phrase rescore query for "all" fields
     *
     * Without the token_count_router feature of the extra plugin the phrase
     * query is returned as is. Otherwise it is wrapped in a router which falls
     * back to match_none, so the phrase is only used for queries with more
     * than one token and at most CirrusSearchMaxPhraseTokens tokens (when that
     * limit is configured).
     *
     * @param string $queryText
     * @param SearchContext $context
     * @param array $profile Search profile; rescoring is skipped when
     *  $profile['phrase'] is empty, otherwise it must contain the keys
     *  'all', 'slop' and 'all.plain'
     * @return AbstractQuery|null
     */
    private function buildPhraseRescore( $queryText, SearchContext $context, array $profile ): ?AbstractQuery {
        if ( empty( $profile['phrase'] ) ) {
            return null;
        }
        $phraseProfile = $profile['phrase'];
        $config = $context->getConfig();

        $phrase = new MultiMatch();
        $phrase->setParam( 'type', 'phrase' );
        $phrase->setParam( 'slop', $phraseProfile['slop'] );
        $phrase->setFields( [
            "all^{$phraseProfile['all']}",
            "all.plain^{$phraseProfile['all.plain']}",
        ] );
        $phrase->setQuery( $queryText );

        $useRouter = $config->getElement( 'CirrusSearchWikimediaExtraPlugin', 'token_count_router' ) === true;
        if ( !$useRouter ) {
            return $phrase;
        }

        $tokCount = new TokenCountRouter(
            // text
            $queryText,
            // fallback
            new MatchNone(),
            // field
            "text"
        );

        // The router uses the first condition that matches, so the upper bound
        // has to be registered before the generic "more than one token"
        // condition; in the opposite order it would never be reached.
        $maxTokens = $config->get( 'CirrusSearchMaxPhraseTokens' );
        if ( $maxTokens ) {
            $tokCount->addCondition(
                TokenCountRouter::GT,
                $maxTokens,
                new MatchNone()
            );
        }
        $tokCount->addCondition(
            TokenCountRouter::GT,
            1,
            $phrase
        );
        return $tokCount;
    }

}