Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
94.62% covered (success)
94.62%
123 / 130
75.00% covered (warning)
75.00%
6 / 8
CRAP
0.00% covered (danger)
0.00%
0 / 1
EntityFullTextQueryBuilder
94.62% covered (success)
94.62%
123 / 130
75.00% covered (warning)
75.00%
6 / 8
22.08
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
1
 newFromGlobals
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
1
 build
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 buildDegraded
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 buildEntitySearchQuery
100.00% covered (success)
100.00%
65 / 65
100.00% covered (success)
100.00%
1 / 1
9
 buildSimpleAllFilter
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
3
 buildFieldMatch
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 buildPhraseRescore
81.25% covered (warning)
81.25%
26 / 32
0.00% covered (danger)
0.00%
0 / 1
4.11
1<?php
2
3namespace Wikibase\Search\Elastic;
4
5use CirrusSearch\Extra\Query\TokenCountRouter;
6use CirrusSearch\Query\FullTextQueryBuilder;
7use CirrusSearch\Search\SearchContext;
8use Elastica\Query\AbstractQuery;
9use Elastica\Query\BoolQuery;
10use Elastica\Query\DisMax;
11use Elastica\Query\MatchNone;
12use Elastica\Query\MatchQuery;
13use Elastica\Query\MultiMatch;
14use Elastica\Query\Term;
15use MediaWiki\MediaWikiServices;
16use Wikibase\DataModel\Entity\EntityIdParser;
17use Wikibase\Lib\LanguageFallbackChainFactory;
18use Wikibase\Repo\WikibaseRepo;
19
20/**
21 * Builder for entity fulltext queries
22 */
23class EntityFullTextQueryBuilder implements FullTextQueryBuilder {
    /**
     * Marker passed to SearchContext::addSyntaxUsed() when this builder
     * constructs the query.
     */
    public const ENTITY_FULL_TEXT_MARKER = 'entity_full_text';

    /**
     * Search profile settings (documented on the constructor as coming from
     * EntitySearchProfiles.php). Read for the weights 'lang-exact',
     * 'lang-folded', 'lang-partial', 'fallback-exact', 'fallback-folded',
     * 'fallback-partial', 'fallback-discount', 'any' and the optional
     * 'phrase' section.
     * @var array
     */
    private $settings;
    /**
     * Stemming settings from the UseStemming config entry, indexed by
     * language code. The 'query' entry of a language decides whether the
     * stemmed fields of that language are queried.
     * @var array
     */
    private $stemmingSettings;
    /**
     * Used to obtain the fallback chain for the user language.
     * @var LanguageFallbackChainFactory
     */
    private $languageFallbackChainFactory;
    /**
     * Used to normalize the search term into an entity ID for the title match.
     * @var EntityIdParser
     */
    private $entityIdParser;
    /**
     * @var string User language code
     */
    private $userLanguage;
47
48    /**
49     * @param array $stemmingSettings Stemming settings from UseStemming config entry
50     * @param array $settings Settings from EntitySearchProfiles.php
51     * @param LanguageFallbackChainFactory $languageFallbackChainFactory
52     * @param EntityIdParser $entityIdParser
53     * @param string $userLanguage User's language code
54     */
55    public function __construct(
56        array $stemmingSettings,
57        array $settings,
58        LanguageFallbackChainFactory $languageFallbackChainFactory,
59        EntityIdParser $entityIdParser,
60        $userLanguage
61    ) {
62        $this->stemmingSettings = $stemmingSettings;
63        $this->settings = $settings;
64        $this->languageFallbackChainFactory = $languageFallbackChainFactory;
65        $this->entityIdParser = $entityIdParser;
66        $this->userLanguage = $userLanguage;
67    }
68
69    /**
70     * Create fulltext builder from global environment.
71     * @param array $settings Configuration from config file
72     * @return EntityFullTextQueryBuilder
73     */
74    public static function newFromGlobals( array $settings ) {
75        $services = MediaWikiServices::getInstance();
76        $config = $services->getConfigFactory()->makeConfig( 'WikibaseCirrusSearch' );
77        return new static(
78            $config->get( 'UseStemming' ),
79            $settings,
80            WikibaseRepo::getLanguageFallbackChainFactory( $services ),
81            WikibaseRepo::getEntityIdParser( $services ),
82            WikibaseRepo::getUserLanguage( $services )->getCode()
83        );
84    }
85
86    /**
87     * Search articles with provided term.
88     *
89     * @param SearchContext $searchContext
90     * @param string $term term to search
91     */
92    public function build( SearchContext $searchContext, $term ) {
93        $this->buildEntitySearchQuery( $searchContext, $term );
94        // if we did find advanced query, we keep the old setup but change the result type
95        // FIXME: make it dispatch by content model
96        $searchContext->setResultsType( new EntityResultType( $this->userLanguage,
97            $this->languageFallbackChainFactory->newFromLanguageCode( $this->userLanguage ) ) );
98    }
99
100    /**
101     * @param SearchContext $searchContext
102     * @return bool
103     */
104    public function buildDegraded( SearchContext $searchContext ) {
105        // Not doing anything for now
106        return false;
107    }
108
109    /**
110     * Build a fulltext query for Wikibase entity.
111     * @param SearchContext $searchContext
112     * @param string $term Search term
113     */
114    protected function buildEntitySearchQuery( SearchContext $searchContext, $term ) {
115        $searchContext->addSyntaxUsed( self::ENTITY_FULL_TEXT_MARKER, 10 );
116        /*
117         * Overall query structure is as follows:
118         * - Bool with:
119         *   Filter of namespace = N
120         *   OR (Should with 1 mininmum) of:
121         *     title.keyword = QUERY
122         *     fulltext match query
123         *
124         * Fulltext match query is:
125         *   Filter of:
126         *      at least one of: all, all.plain matching
127         *      description (for stemmed) or description.en (for non-stemmed) matching, with fallback
128         *   OR (should with 0 minimum) of:
129         *     DISMAX query of: all labels.near_match in fallback chain
130         *     OR (should with 0 minimum) of:
131         *        all
132         *        all.plain
133         *        DISMAX of: all fulltext matches for tokenized fields
134         */
135
136        $profile = $this->settings;
137        // $fields is collecting all the fields for dismax query to be used in
138        // scoring match
139        $fields = [
140            [ "labels.{$this->userLanguage}.near_match", $profile['lang-exact'] ],
141            [ "labels.{$this->userLanguage}.near_match_folded", $profile['lang-folded'] ],
142        ];
143
144        $fieldsTokenized = [
145            [ "labels.{$this->userLanguage}.plain", $profile['lang-partial'] ],
146            [ "descriptions.{$this->userLanguage}.plain", $profile['lang-partial'] ],
147        ];
148        if ( !empty( $this->stemmingSettings[$this->userLanguage]['query'] ) ) {
149            $fieldsTokenized[] = [ "labels.{$this->userLanguage}", $profile['lang-partial'] ];
150            $fieldsTokenized[] = [ "descriptions.{$this->userLanguage}", $profile['lang-partial'] ];
151        }
152
153        $searchLanguageCodes = $this->languageFallbackChainFactory->newFromLanguageCode( $this->userLanguage )
154                ->getFetchLanguageCodes();
155
156        $discount = $profile['fallback-discount'];
157        $stemFilterFields = [];
158
159        foreach ( $searchLanguageCodes as $fallbackCode ) {
160            if ( empty( $this->stemmingSettings[$fallbackCode]['query'] ) ) {
161                $stemFilterFields[] = "descriptions.{$fallbackCode}.plain";
162            } else {
163                $stemFilterFields[] = "descriptions.{$fallbackCode}";
164                // only add the stemmed version in the filter
165                // labels should be copied to the text field and thus be captured by the filter on the all field
166                $stemFilterFields[] = "labels.{$fallbackCode}";
167            }
168
169            if ( $fallbackCode === $this->userLanguage ) {
170                continue;
171            }
172
173            $weight = $profile['fallback-exact'] * $discount;
174            $fields[] = [ "labels.{$fallbackCode}.near_match", $weight ];
175
176            $weight = $profile['fallback-folded'] * $discount;
177            $fields[] = [ "labels.{$fallbackCode}.near_match_folded", $weight ];
178
179            $weight = $profile['fallback-partial'] * $discount;
180            $fieldsTokenized[] = [ "labels.{$fallbackCode}.plain", $weight ];
181            $fieldsTokenized[] = [ "descriptions.{$fallbackCode}.plain", $weight ];
182            if ( !empty( $this->stemmingSettings[$fallbackCode]['query'] ) ) {
183                $fieldsTokenized[] = [ "descriptions.{$fallbackCode}", $weight ];
184                $fieldsTokenized[] = [ "labels.{$fallbackCode}", $weight ];
185            }
186
187            $discount *= $profile['fallback-discount'];
188        }
189
190        $titleMatch = new Term( [
191            'title.keyword' => EntitySearchUtils::normalizeId( $term, $this->entityIdParser ),
192        ] );
193
194        // Main query filter
195        $filterQuery = $this->buildSimpleAllFilter( $term );
196        foreach ( $stemFilterFields as $filterField ) {
197            $filterQuery->addShould( $this->buildFieldMatch( $filterField, $term, 'AND' ) );
198        }
199
200        // Near match ones, they use constant score
201        $nearMatchQuery = new DisMax();
202        $nearMatchQuery->setTieBreaker( 0 );
203        foreach ( $fields as $field ) {
204            $nearMatchQuery->addQuery( EntitySearchUtils::makeConstScoreQuery( $field[0], $field[1],
205                $term ) );
206        }
207
208        // Tokenized ones
209        $tokenizedQuery = $this->buildSimpleAllFilter( $term, 'OR', $profile['any'] );
210        $tokenizedQueryFields = new DisMax();
211        $tokenizedQueryFields->setTieBreaker( 0.2 );
212        foreach ( $fieldsTokenized as $field ) {
213            $m = $this->buildFieldMatch( $field[0], $term );
214            $m->setFieldBoost( $field[0], $field[1] );
215            $tokenizedQueryFields->addQuery( $m );
216        }
217        $tokenizedQuery->addShould( $tokenizedQueryFields );
218
219        // Main labels/desc query
220        $labelsDescQuery = new BoolQuery();
221        $labelsDescQuery->setMinimumShouldMatch( 0 );
222        $labelsDescQuery->addFilter( $filterQuery );
223        $labelsDescQuery->addShould( $nearMatchQuery );
224        $labelsDescQuery->addShould( $tokenizedQuery );
225
226        // Main query
227        $query = new BoolQuery();
228
229        // Match either labels or exact match to title
230        $query->addShould( $titleMatch );
231        $query->addShould( $labelsDescQuery );
232        $query->setMinimumShouldMatch( 1 );
233
234        $searchContext->setMainQuery( $query );
235        $searchContext->setPhraseRescoreQuery( $this->buildPhraseRescore( $term, $searchContext, $profile ) );
236    }
237
238    /**
239     * Builds a simple filter on all and all.plain when all terms must match
240     *
241     * @param string $query
242     * @param string $operator
243     * @param null $boost
244     * @return BoolQuery
245     */
246    private function buildSimpleAllFilter( $query, $operator = 'AND', $boost = null ) {
247        $filter = new BoolQuery();
248        $filter->setMinimumShouldMatch( 1 );
249        // FIXME: We can't use solely the stem field here
250        // - Depending on languages it may lack stopwords,
251        // A dedicated field used for filtering would be nice
252        foreach ( [ 'all', 'all.plain' ] as $field ) {
253            $m = new MatchQuery();
254            $m->setFieldQuery( $field, $query );
255            $m->setFieldOperator( $field, $operator );
256            if ( $boost ) {
257                $m->setFieldBoost( $field, $boost );
258            }
259            $filter->addShould( $m );
260        }
261        return $filter;
262    }
263
264    /**
265     * Build simple match clause, matching field against term
266     * @param string $field
267     * @param string $term
268     * @param string|null $operator
269     * @return MatchQuery
270     */
271    private function buildFieldMatch( $field, $term, $operator = null ) {
272        $m = new MatchQuery();
273        $m->setFieldQuery( $field, $term );
274        if ( $operator ) {
275            $m->setFieldOperator( $field, $operator );
276        }
277        return $m;
278    }
279
280    /**
281     * Create phrase rescore query for "all" fields
282     * @param string $queryText
283     * @param SearchContext $context
284     * @param float[][] $profile Must contain $profile['phrase'] with keys 'all', 'slop', 'all.plain'
285     * @return AbstractQuery|null
286     */
287    private function buildPhraseRescore( $queryText, SearchContext $context, array $profile ) {
288        if ( empty( $profile['phrase'] ) ) {
289            return null;
290        } else {
291            $phraseProfile = $profile['phrase'];
292        }
293        $useRouter = $context->getConfig()->getElement( 'CirrusSearchWikimediaExtraPlugin', 'token_count_router' ) === true;
294        $phrase = new MultiMatch();
295        $phrase->setParam( 'type', 'phrase' );
296        $phrase->setParam( 'slop', $phraseProfile['slop'] );
297        $fields = [
298            "all^{$phraseProfile['all']}", "all.plain^{$phraseProfile['all.plain']}"
299        ];
300        $phrase->setFields( $fields );
301        $phrase->setQuery( $queryText );
302        if ( !$useRouter ) {
303            return $phrase;
304        }
305        $tokCount = new TokenCountRouter(
306        // text
307            $queryText,
308            // fallback
309            new MatchNone(),
310            // field
311            "text"
312        );
313        $tokCount->addCondition(
314            TokenCountRouter::GT,
315            1,
316            $phrase
317        );
318        $maxTokens = $context->getConfig()->get( 'CirrusSearchMaxPhraseTokens' );
319        if ( $maxTokens ) {
320            $tokCount->addCondition(
321                TokenCountRouter::GT,
322                $maxTokens,
323                new \Elastica\Query\MatchNone()
324            );
325        }
326        return $tokCount;
327    }
328
329}