Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
94.62% |
123 / 130 |
|
75.00% |
6 / 8 |
CRAP | |
0.00% |
0 / 1 |
| EntityFullTextQueryBuilder | |
94.62% |
123 / 130 |
|
75.00% |
6 / 8 |
22.08 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
| newFromGlobals | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
| build | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| buildDegraded | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| buildEntitySearchQuery | |
100.00% |
65 / 65 |
|
100.00% |
1 / 1 |
9 | |||
| buildSimpleAllFilter | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
3 | |||
| buildFieldMatch | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
| buildPhraseRescore | |
81.25% |
26 / 32 |
|
0.00% |
0 / 1 |
4.11 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace Wikibase\Search\Elastic; |
| 4 | |
| 5 | use CirrusSearch\Extra\Query\TokenCountRouter; |
| 6 | use CirrusSearch\Query\FullTextQueryBuilder; |
| 7 | use CirrusSearch\Search\SearchContext; |
| 8 | use Elastica\Query\AbstractQuery; |
| 9 | use Elastica\Query\BoolQuery; |
| 10 | use Elastica\Query\DisMax; |
| 11 | use Elastica\Query\MatchNone; |
| 12 | use Elastica\Query\MatchQuery; |
| 13 | use Elastica\Query\MultiMatch; |
| 14 | use Elastica\Query\Term; |
| 15 | use MediaWiki\Context\RequestContext; |
| 16 | use MediaWiki\MediaWikiServices; |
| 17 | use Wikibase\DataModel\Entity\EntityIdParser; |
| 18 | use Wikibase\Lib\LanguageFallbackChainFactory; |
| 19 | use Wikibase\Repo\WikibaseRepo; |
| 20 | |
| 21 | /** |
| 22 | * Builder for entity fulltext queries |
| 23 | */ |
| 24 | class EntityFullTextQueryBuilder implements FullTextQueryBuilder { |
| 25 | public const ENTITY_FULL_TEXT_MARKER = 'entity_full_text'; |
| 26 | |
| 27 | /** |
| 28 | * @var array |
| 29 | */ |
| 30 | private $settings; |
| 31 | /** |
| 32 | * Stemming settings from the 'UseStemming' config entry |
| 33 | * @var array |
| 34 | */ |
| 35 | private $stemmingSettings; |
| 36 | /** |
| 37 | * @var LanguageFallbackChainFactory |
| 38 | */ |
| 39 | private $languageFallbackChainFactory; |
| 40 | /** |
| 41 | * @var EntityIdParser |
| 42 | */ |
| 43 | private $entityIdParser; |
| 44 | /** |
| 45 | * @var string User language code |
| 46 | */ |
| 47 | private $userLanguage; |
| 48 | |
/**
 * Store the configuration and collaborators used to assemble entity fulltext queries.
 *
 * @param array $stemmingSettings Stemming settings from UseStemming config entry,
 *  looked up per language code (the 'query' key decides whether stemmed fields are queried)
 * @param array $settings Settings from EntitySearchProfiles.php (field weights, discount, phrase profile)
 * @param LanguageFallbackChainFactory $languageFallbackChainFactory Provides the fallback
 *  languages for the user language
 * @param EntityIdParser $entityIdParser Handed to EntitySearchUtils::normalizeId() together
 *  with the search term
 * @param string $userLanguage User's language code
 */
public function __construct(
	array $stemmingSettings,
	array $settings,
	LanguageFallbackChainFactory $languageFallbackChainFactory,
	EntityIdParser $entityIdParser,
	$userLanguage
) {
	// Plain configuration arrays
	$this->settings = $settings;
	$this->stemmingSettings = $stemmingSettings;
	// Language the query is built for
	$this->userLanguage = $userLanguage;
	// Service objects
	$this->entityIdParser = $entityIdParser;
	$this->languageFallbackChainFactory = $languageFallbackChainFactory;
}
| 69 | |
/**
 * Factory wiring the builder from the global service container and request context.
 *
 * Reads 'UseStemming' from the 'WikibaseCirrusSearch' config, fetches the language
 * fallback chain factory and entity ID parser from WikibaseRepo, and takes the
 * language code from the main request context.
 *
 * @param array $settings Configuration from config file
 * @return EntityFullTextQueryBuilder
 */
public static function newFromGlobals( array $settings ) {
	$services = MediaWikiServices::getInstance();

	$stemmingSettings = $services->getConfigFactory()
		->makeConfig( 'WikibaseCirrusSearch' )
		->get( 'UseStemming' );
	$fallbackChainFactory = WikibaseRepo::getLanguageFallbackChainFactory( $services );
	$entityIdParser = WikibaseRepo::getEntityIdParser( $services );
	$userLanguage = RequestContext::getMain()->getLanguage()->getCode();

	// Late static binding is kept so subclasses get an instance of their own type.
	return new static(
		$stemmingSettings,
		$settings,
		$fallbackChainFactory,
		$entityIdParser,
		$userLanguage
	);
}
| 86 | |
/**
 * Build the entity search query for the given term and switch the context
 * to entity-specific result handling.
 *
 * @param SearchContext $searchContext Context receiving the query and the results type
 * @param string $term term to search
 */
public function build( SearchContext $searchContext, $term ) {
	$this->buildEntitySearchQuery( $searchContext, $term );

	// if we did find advanced query, we keep the old setup but change the result type
	// FIXME: make it dispatch by content model
	$fallbackChain = $this->languageFallbackChainFactory->newFromLanguageCode( $this->userLanguage );
	$resultsType = new EntityResultType( $this->userLanguage, $fallbackChain );
	$searchContext->setResultsType( $resultsType );
}
| 100 | |
/**
 * Degraded query building is not implemented for entity fulltext search.
 *
 * @param SearchContext $searchContext Not read or modified by this implementation
 * @return bool Always false
 */
public function buildDegraded( SearchContext $searchContext ) {
	// Intentionally a no-op for now: the context is left untouched.
	return false;
}
| 109 | |
/**
 * Assemble the fulltext query for Wikibase entities and install it on the search context.
 *
 * Marks the context with the entity fulltext syntax marker, then sets both the main
 * query and the phrase rescore query on it.
 *
 * @param SearchContext $searchContext
 * @param string $term Search term
 */
protected function buildEntitySearchQuery( SearchContext $searchContext, $term ) {
	$searchContext->addSyntaxUsed( self::ENTITY_FULL_TEXT_MARKER, 10 );
	/*
	 * Shape of the generated query:
	 * - Bool, at least one "should" clause must match:
	 *     title.keyword = normalized QUERY
	 *     labels/descriptions query
	 *
	 * The labels/descriptions query is a Bool with:
	 *   Filter (at least one must match):
	 *     all, all.plain (all terms required)
	 *     descriptions.LANG (stemmed languages, together with labels.LANG) or
	 *     descriptions.LANG.plain (non-stemmed), for every language of the fallback chain
	 *   Should (minimum 0, scoring only):
	 *     DISMAX of constant-score labels.LANG.near_match / near_match_folded
	 *       over the fallback chain
	 *     Bool, at least one "should" clause must match:
	 *       all
	 *       all.plain
	 *       DISMAX of the fulltext matches on the tokenized fields
	 */

	$profile = $this->settings;
	$userLang = $this->userLanguage;

	// [ field name, weight ] pairs scored through the near-match dismax
	$nearMatchFields = [
		[ "labels.{$userLang}.near_match", $profile['lang-exact'] ],
		[ "labels.{$userLang}.near_match_folded", $profile['lang-folded'] ],
	];

	// [ field name, weight ] pairs scored through the tokenized dismax
	$tokenizedFields = [
		[ "labels.{$userLang}.plain", $profile['lang-partial'] ],
		[ "descriptions.{$userLang}.plain", $profile['lang-partial'] ],
	];
	if ( !empty( $this->stemmingSettings[$userLang]['query'] ) ) {
		$tokenizedFields[] = [ "labels.{$userLang}", $profile['lang-partial'] ];
		$tokenizedFields[] = [ "descriptions.{$userLang}", $profile['lang-partial'] ];
	}

	$chainLanguageCodes = $this->languageFallbackChainFactory
		->newFromLanguageCode( $userLang )
		->getFetchLanguageCodes();

	// Fields that only take part in filtering, never in scoring
	$stemFilterFields = [];
	// Grows by one 'fallback-discount' factor for every fallback language processed
	$discount = $profile['fallback-discount'];

	foreach ( $chainLanguageCodes as $langCode ) {
		$stemmed = !empty( $this->stemmingSettings[$langCode]['query'] );

		if ( $stemmed ) {
			$stemFilterFields[] = "descriptions.{$langCode}";
			// only add the stemmed version in the filter
			// labels should be copied to the text field and thus be captured by the filter on the all field
			$stemFilterFields[] = "labels.{$langCode}";
		} else {
			$stemFilterFields[] = "descriptions.{$langCode}.plain";
		}

		// The user language already has its scoring fields (with the 'lang-*' weights);
		// only real fallback languages get discounted scoring fields.
		if ( $langCode !== $userLang ) {
			$nearMatchFields[] = [
				"labels.{$langCode}.near_match",
				$profile['fallback-exact'] * $discount
			];
			$nearMatchFields[] = [
				"labels.{$langCode}.near_match_folded",
				$profile['fallback-folded'] * $discount
			];

			$partialWeight = $profile['fallback-partial'] * $discount;
			$tokenizedFields[] = [ "labels.{$langCode}.plain", $partialWeight ];
			$tokenizedFields[] = [ "descriptions.{$langCode}.plain", $partialWeight ];
			if ( $stemmed ) {
				$tokenizedFields[] = [ "descriptions.{$langCode}", $partialWeight ];
				$tokenizedFields[] = [ "labels.{$langCode}", $partialWeight ];
			}

			$discount *= $profile['fallback-discount'];
		}
	}

	// Filter part: all/all.plain plus the per-language description (and stemmed label) fields
	$filterQuery = $this->buildSimpleAllFilter( $term );
	foreach ( $stemFilterFields as $stemField ) {
		$filterQuery->addShould( $this->buildFieldMatch( $stemField, $term, 'AND' ) );
	}

	// Near match fields, scored with a constant score each
	$nearMatchQuery = new DisMax();
	$nearMatchQuery->setTieBreaker( 0 );
	foreach ( $nearMatchFields as [ $fieldName, $fieldWeight ] ) {
		$nearMatchQuery->addQuery(
			EntitySearchUtils::makeConstScoreQuery( $fieldName, $fieldWeight, $term )
		);
	}

	// Tokenized fields, each boosted by its own weight
	$tokenizedDisMax = new DisMax();
	$tokenizedDisMax->setTieBreaker( 0.2 );
	foreach ( $tokenizedFields as [ $fieldName, $fieldWeight ] ) {
		$match = $this->buildFieldMatch( $fieldName, $term );
		$match->setFieldBoost( $fieldName, $fieldWeight );
		$tokenizedDisMax->addQuery( $match );
	}
	$tokenizedQuery = $this->buildSimpleAllFilter( $term, 'OR', $profile['any'] );
	$tokenizedQuery->addShould( $tokenizedDisMax );

	// Labels/descriptions query: filter decides what matches, should clauses only score
	$labelsDescQuery = new BoolQuery();
	$labelsDescQuery->setMinimumShouldMatch( 0 );
	$labelsDescQuery->addFilter( $filterQuery );
	$labelsDescQuery->addShould( $nearMatchQuery );
	$labelsDescQuery->addShould( $tokenizedQuery );

	// Exact match of the (normalized) term against the title
	$titleMatch = new Term( [
		'title.keyword' => EntitySearchUtils::normalizeId( $term, $this->entityIdParser ),
	] );

	// Main query: match either the title exactly or the labels/descriptions query
	$mainQuery = new BoolQuery();
	$mainQuery->addShould( $titleMatch );
	$mainQuery->addShould( $labelsDescQuery );
	$mainQuery->setMinimumShouldMatch( 1 );

	$searchContext->setMainQuery( $mainQuery );
	$searchContext->setPhraseRescoreQuery( $this->buildPhraseRescore( $term, $searchContext, $profile ) );
}
| 238 | |
/**
 * Builds a simple bool query on the 'all' and 'all.plain' fields.
 *
 * One match clause is created per field and at least one of them must match.
 *
 * @param string $query Text to match
 * @param string $operator Operator applied to the terms within each field;
 *  with the default 'AND' all terms must match
 * @param float|null $boost Boost applied to each match clause, or null to leave
 *  the boost unset
 * @return BoolQuery
 */
private function buildSimpleAllFilter( $query, $operator = 'AND', $boost = null ) {
	$filter = new BoolQuery();
	$filter->setMinimumShouldMatch( 1 );
	// FIXME: We can't use solely the stem field here
	// - Depending on languages it may lack stopwords,
	// A dedicated field used for filtering would be nice
	foreach ( [ 'all', 'all.plain' ] as $field ) {
		$m = new MatchQuery();
		$m->setFieldQuery( $field, $query );
		$m->setFieldOperator( $field, $operator );
		// Compare against null instead of testing truthiness: a configured boost
		// of 0 must be sent as 0, not silently dropped (which would leave the
		// clause with the default boost).
		if ( $boost !== null ) {
			$m->setFieldBoost( $field, $boost );
		}
		$filter->addShould( $m );
	}
	return $filter;
}
| 264 | |
/**
 * Create a match clause for a single field.
 *
 * @param string $field Field to match against
 * @param string $term Text to match
 * @param string|null $operator Operator to set on the clause; when empty the
 *  operator is left unset
 * @return MatchQuery
 */
private function buildFieldMatch( $field, $term, $operator = null ) {
	$match = new MatchQuery();
	$match->setFieldQuery( $field, $term );
	if ( !$operator ) {
		// No explicit operator requested
		return $match;
	}
	$match->setFieldOperator( $field, $operator );
	return $match;
}
| 280 | |
/**
 * Create phrase rescore query for "all" fields
 *
 * Without the token_count_router feature of the extra plugin the bare phrase
 * query is returned. With it, the phrase query is wrapped in a router that only
 * runs it for queries of more than one token and, when CirrusSearchMaxPhraseTokens
 * is set, of at most that many tokens; otherwise the router falls back to
 * match_none.
 *
 * @param string $queryText
 * @param SearchContext $context
 * @param float[][] $profile Phrase rescoring is configured by $profile['phrase'],
 *  which must then contain the keys 'all', 'slop', 'all.plain'
 * @return AbstractQuery|null Null when the profile has no (or an empty) 'phrase' entry
 */
private function buildPhraseRescore( $queryText, SearchContext $context, array $profile ) {
	if ( empty( $profile['phrase'] ) ) {
		return null;
	}
	$phraseProfile = $profile['phrase'];

	$phrase = new MultiMatch();
	$phrase->setParam( 'type', 'phrase' );
	$phrase->setParam( 'slop', $phraseProfile['slop'] );
	$phrase->setFields( [
		"all^{$phraseProfile['all']}",
		"all.plain^{$phraseProfile['all.plain']}",
	] );
	$phrase->setQuery( $queryText );

	$config = $context->getConfig();
	$useRouter = $config->getElement( 'CirrusSearchWikimediaExtraPlugin', 'token_count_router' ) === true;
	if ( !$useRouter ) {
		return $phrase;
	}

	$tokCount = new TokenCountRouter(
		// text
		$queryText,
		// fallback
		new MatchNone(),
		// field
		"text"
	);

	// The router applies the first condition that matches, so the upper bound has
	// to be registered before the generic "more than one token" condition;
	// registered after it, the bound could never take effect.
	$maxTokens = $config->get( 'CirrusSearchMaxPhraseTokens' );
	if ( $maxTokens ) {
		$tokCount->addCondition(
			TokenCountRouter::GT,
			$maxTokens,
			new MatchNone()
		);
	}
	$tokCount->addCondition(
		TokenCountRouter::GT,
		1,
		$phrase
	);
	return $tokCount;
}
| 329 | |
| 330 | } |