Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
92.46% |
184 / 199 |
|
62.50% |
5 / 8 |
CRAP | |
0.00% |
0 / 1 |
| MappingConfigBuilder | |
92.46% |
184 / 199 |
|
62.50% |
5 / 8 |
25.27 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
2 | |||
| validatePlugins | |
27.27% |
3 / 11 |
|
0.00% |
0 / 1 |
14.62 | |||
| getDefaultFields | |
94.74% |
108 / 114 |
|
0.00% |
0 / 1 |
6.01 | |||
| buildConfig | |
100.00% |
39 / 39 |
|
100.00% |
1 / 1 |
5 | |||
| setupCopyTo | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
4 | |||
| buildSourceTextStringField | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getTextFieldMapping | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
1 | |||
| canOptimizeAnalysisConfig | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace CirrusSearch\Maintenance; |
| 4 | |
| 5 | use CirrusSearch\CirrusSearch; |
| 6 | use CirrusSearch\CirrusSearchHookRunner; |
| 7 | use CirrusSearch\Search\CirrusIndexField; |
| 8 | use CirrusSearch\Search\CirrusSearchIndexFieldFactory; |
| 9 | use CirrusSearch\Search\SourceTextIndexField; |
| 10 | use CirrusSearch\Search\TextIndexField; |
| 11 | use CirrusSearch\SearchConfig; |
| 12 | use MediaWiki\Language\Language; |
| 13 | use MediaWiki\MediaWikiServices; |
| 14 | use SearchIndexField; |
| 15 | |
/**
 * Builds search mapping configuration arrays.
 *
 * The mapping is assembled from a set of always-present default fields
 * (see getDefaultFields()), the fields reported by the search engine
 * (CirrusSearch::getSearchIndexFields()) and two aggregate fields, "all" and
 * "all_near_match", which are populated through copy_to.
 *
 * @license GPL-2.0-or-later
 */
class MappingConfigBuilder {
	// Bit flags accepted by the constructor's $flags parameter. They are stored
	// in $this->flags and handed to the index fields through setMappingFlags().

	/** Also add a 'word_prefix' analyzed sub field to the title fields. */
	public const PREFIX_START_WITH_ANY = 1;
	/** Not read by this class; only forwarded to the fields through setMappingFlags(). */
	public const PHRASE_SUGGEST_USE_TEXT = 2;
	/** Set automatically by the constructor when $optimizeForExperimentalHighlighter is true. */
	public const OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER = 4;

	/**
	 * Version number for the core analysis. Increment the major
	 * version when the analysis changes in an incompatible way,
	 * and change the minor version when it changes but isn't
	 * incompatible
	 */
	public const VERSION = '1.10';

	/**
	 * @var bool should the index be optimized for the experimental highlighter?
	 */
	private $optimizeForExperimentalHighlighter;

	/**
	 * @var SearchConfig configuration obtained from the engine (see constructor)
	 */
	private $config;

	/**
	 * @var CirrusSearch
	 */
	protected $engine;

	/**
	 * @var CirrusSearchIndexFieldFactory
	 */
	protected $searchIndexFieldFactory;

	/**
	 * @var int bit field built from the class constants above
	 */
	protected $flags = 0;
	/**
	 * @var CirrusSearchHookRunner
	 */
	private $cirrusSearchHookRunner;

	/** @var bool if the icu plugin is available */
	private bool $icu;
	/**
	 * @var Language the content language
	 */
	private Language $language;

	/**
	 * @param bool $optimizeForExperimentalHighlighter should the index be optimized for the experimental highlighter?
	 * @param array $plugins list of installed plugins
	 * @param int $flags bit field of the class constants; OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER
	 *  is added automatically when $optimizeForExperimentalHighlighter is true
	 * @param SearchConfig|null $config passed as-is to the CirrusSearch engine; the
	 *  config used by this class is the one the engine reports back
	 * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner defaults to a runner
	 *  built on the global hook container
	 * @param Language|null $language defaults to the content language
	 * @throws \InvalidArgumentException when the configuration requires a plugin
	 *  that is not in $plugins (see validatePlugins())
	 */
	public function __construct(
		bool $optimizeForExperimentalHighlighter,
		array $plugins,
		int $flags = 0,
		?SearchConfig $config = null,
		?CirrusSearchHookRunner $cirrusSearchHookRunner = null,
		?Language $language = null
	) {
		$this->optimizeForExperimentalHighlighter = $optimizeForExperimentalHighlighter;
		if ( $this->optimizeForExperimentalHighlighter ) {
			$flags |= self::OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER;
		}
		$this->flags = $flags;
		$this->icu = Plugins::contains( 'analysis-icu', $plugins );
		$this->engine = new CirrusSearch( $config );
		$this->config = $this->engine->getConfig();
		$this->searchIndexFieldFactory = new CirrusSearchIndexFieldFactory( $this->config );
		$this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?? new CirrusSearchHookRunner(
			MediaWikiServices::getInstance()->getHookContainer() );
		$this->language = $language ?? MediaWikiServices::getInstance()->getContentLanguage();

		// Must run last: it relies on $this->config and $this->icu being set.
		$this->validatePlugins( $plugins );
	}

	/**
	 * Ensure the plugins required by the current configuration are installed.
	 *
	 * @param array $plugins list of installed plugins
	 * @throws \InvalidArgumentException when a required plugin is missing
	 */
	private function validatePlugins( array $plugins ): void {
		// The key must match the setting named in the error message below
		// (note the "Index" part). The key used previously lacked it, so the
		// lookup did not target that setting.
		if ( $this->config->get( 'CirrusSearchOptimizeIndexForExperimentalHighlighter' ) &&
			!Plugins::contains( 'experimental-highlighter', $plugins )
		) {
			throw new \InvalidArgumentException(
				"wgCirrusSearchOptimizeIndexForExperimentalHighlighter is set to true but the " .
				"'experimental-highlighter' plugin is not available."
			);
		}

		if ( $this->config->getElement( 'CirrusSearchNaturalTitleSort', 'build' ) && !$this->icu ) {
			throw new \InvalidArgumentException(
				"wgCirrusSearchNaturalTitleSort is set to build but the 'analysis-icu' plugin " .
				"is not available."
			);
		}
	}

	/**
	 * Get definitions for default index fields.
	 * These fields are always present in the index.
	 * @return array mapping with 'dynamic' => false and a 'properties' map
	 */
	private function getDefaultFields() {
		// Note never to set something as type='object' here because that isn't returned
		// by the search engine and is inferred anyway.

		// Extra sub fields shared by 'title' and 'redirect.title'.
		$titleExtraAnalyzers = [
			[ 'analyzer' => 'prefix', 'search_analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => false ],
			[
				'analyzer' => 'prefix_asciifolding',
				'search_analyzer' => 'near_match_asciifolding',
				'index_options' => 'docs',
				'norms' => false
			],
			[ 'analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => false ],
			[ 'analyzer' => 'near_match_asciifolding', 'index_options' => 'docs', 'norms' => false ],
			[ 'type' => 'keyword', 'normalizer' => 'keyword' ],
		];
		if ( $this->flags & self::PREFIX_START_WITH_ANY ) {
			$titleExtraAnalyzers[] = [
				'analyzer' => 'word_prefix',
				'search_analyzer' => 'plain_search',
				'index_options' => 'docs'
			];
		}
		// validatePlugins() already rejected "build without icu", the icu check
		// here simply keeps this method safe on its own.
		if ( $this->icu && $this->config->getElement( 'CirrusSearchNaturalTitleSort', 'build' ) ) {
			$titleExtraAnalyzers[] = [
				'fieldName' => 'natural_sort',
				'type' => 'icu_collation_keyword',
				'ignore_above' => AnalysisConfigBuilder::KEYWORD_IGNORE_ABOVE,
				// doc values only
				'index' => false,
				'numeric' => true,
				'strength' => 'tertiary',
				// icu_collation_keyword will use new ULocale(String $l) if only provided the language
				// which supports BCP 47 language code.
				'language' => $this->language->toBcp47Code()
			];
		}

		$suggestField = [
			'type' => 'text',
			'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest' ),
			'index_options' => 'freqs',
			'analyzer' => 'suggest',
		];

		if ( $this->config->getElement( 'CirrusSearchPhraseSuggestReverseField', 'build' ) ) {
			$suggestField['fields'] = [
				'reverse' => [
					'type' => 'text',
					'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest', 'reverse' ),
					'index_options' => 'freqs',
					'analyzer' => 'suggest_reverse',
				],
			];
		}

		$page = [
			'dynamic' => false,
			'properties' => [
				'timestamp' => [
					'type' => 'date',
					'format' => 'dateOptionalTime',
				],
				'create_timestamp' => [
					'type' => 'date',
					'format' => 'dateOptionalTime',
				],
				'page_id' => [
					'type' => 'long',
					'index' => false,
				],
				'wiki' => $this->searchIndexFieldFactory
					->newKeywordField( 'wiki' )
					->getMapping( $this->engine ),
				'namespace' => $this->searchIndexFieldFactory
					->newLongField( 'namespace' )
					->getMapping( $this->engine ),
				'namespace_text' => $this->searchIndexFieldFactory
					->newKeywordField( 'namespace_text' )
					->withDocValues()
					->getMapping( $this->engine ),
				'title' => $this->searchIndexFieldFactory
					->newStringField( 'title',
						TextIndexField::ENABLE_NORMS
						| TextIndexField::COPY_TO_SUGGEST
						| TextIndexField::COPY_TO_SUGGEST_VARIANT
						| TextIndexField::SUPPORT_REGEX,
						$titleExtraAnalyzers )
					->setMappingFlags( $this->flags )
					->getMapping( $this->engine ),
				'text' => $this->getTextFieldMapping(),
				'text_bytes' => $this->searchIndexFieldFactory
					->newLongField( 'text_bytes' )
					->getMapping( $this->engine ),
				'source_text' => $this->buildSourceTextStringField( 'source_text' )
					->setMappingFlags( $this->flags )->getMapping( $this->engine ),
				'redirect' => [
					'dynamic' => false,
					'properties' => [
						'namespace' => $this->searchIndexFieldFactory
							->newLongField( 'namespace' )
							->getMapping( $this->engine ),
						'title' => $this->searchIndexFieldFactory
							->newStringField( 'redirect.title', TextIndexField::ENABLE_NORMS
								| TextIndexField::SPEED_UP_HIGHLIGHTING
								| TextIndexField::COPY_TO_SUGGEST
								| TextIndexField::COPY_TO_SUGGEST_VARIANT
								| TextIndexField::SUPPORT_REGEX,
								$titleExtraAnalyzers
							)
							->setMappingFlags( $this->flags )
							->getMapping( $this->engine ),
					]
				],
				'incoming_links' => $this->searchIndexFieldFactory
					->newLongField( 'incoming_links' )
					->getMapping( $this->engine ),
				'local_sites_with_dupe' => $this->searchIndexFieldFactory
					->newKeywordField( 'local_sites_with_dupe' )
					->setFlag( SearchIndexField::FLAG_CASEFOLD )
					->getMapping( $this->engine ),
				'suggest' => $suggestField,
			]
		];

		// The variant field reuses the exact same definition as 'suggest'.
		if ( $this->config->get( 'CirrusSearchPhraseSuggestBuildVariant' ) ) {
			$page['properties']['suggest_variant'] = $suggestField;
		}

		return $page;
	}

	/**
	 * Build the mapping config.
	 *
	 * Field weights are read from the $wgCirrusSearchWeights global, not from
	 * the injected SearchConfig.
	 *
	 * @return array the mapping config
	 */
	public function buildConfig() {
		global $wgCirrusSearchWeights;

		$page = $this->getDefaultFields();

		$fields = $this->engine->getSearchIndexFields();

		// Fields reported by the engine are layered over the defaults; a field
		// returning an empty mapping is left out (or keeps its default definition).
		foreach ( $fields as $fieldName => $field ) {
			if ( $field instanceof CirrusIndexField ) {
				$field->setMappingFlags( $this->flags );
			}
			$config = $field->getMapping( $this->engine );
			if ( $config ) {
				$page['properties'][$fieldName] = $config;
			}
		}

		// Unclear how this would otherwise fit into the process to construct the mapping.
		// Not used directly in cirrus, supports queries from 'add-a-link' (T301096).
		if ( isset( $page['properties']['outgoing_link'] ) ) {
			$page['properties']['outgoing_link']['fields']['token_count'] = [
				'type' => 'token_count',
				'analyzer' => 'keyword',
			];
		}

		// Now layer all the fields into the all field once per weight. Querying it isn't strictly the
		// same as querying each field - in some ways it is better! In others it is worse....

		// Better because theoretically tf/idf based scoring works better this way.
		// Worse because we have to analyze each field multiple times.... Bleh!
		// This field can't be used for the fvh/experimental highlighter for several reasons:
		// 1. It is built with copy_to and not stored.
		// 2. The term frequency information is all whoppy compared to the "real" source text.
		$allField = $this->searchIndexFieldFactory->
			newStringField( 'all', TextIndexField::ENABLE_NORMS );
		$page['properties']['all'] =
			$allField->setMappingFlags( $this->flags )->getMapping( $this->engine );
		$page = $this->setupCopyTo( $page, $wgCirrusSearchWeights, 'all' );

		// Now repeat for near_match fields. The same considerations above apply except near_match
		// is never used in phrase queries or highlighting.
		$page[ 'properties' ][ 'all_near_match' ] = [
			'type' => 'text',
			'analyzer' => 'near_match',
			'index_options' => 'freqs',
			'norms' => false,
			'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match' ),
			'fields' => [
				'asciifolding' => [
					'type' => 'text',
					'analyzer' => 'near_match_asciifolding',
					'index_options' => 'freqs',
					'norms' => false,
					'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match', 'asciifolding' ),
				],
			],
		];
		// Only the title and the redirect titles feed all_near_match.
		$nearMatchFields = [
			'title' => $wgCirrusSearchWeights[ 'title' ],
			'redirect' => $wgCirrusSearchWeights[ 'redirect' ],
		];
		return $this->setupCopyTo( $page, $nearMatchFields, 'all_near_match' );
	}

	/**
	 * Setup copy_to for some fields to $destination.
	 *
	 * $destination is appended to a field's copy_to list once per unit of
	 * weight, so a field with weight 3 is copied three times.
	 *
	 * @param array $config to modify
	 * @param array $fields field name to number of times copied
	 * @param string $destination destination of the copy
	 * @return array $config modified with the copy_to setup
	 */
	private function setupCopyTo( $config, $fields, $destination ) {
		foreach ( $fields as $field => $weight ) {
			// Note that this causes weights that are not whole numbers to be rounded up
			// (e.g. 2.5 yields three copies). We're ok with that because we don't have a choice.
			for ( $r = 0; $r < $weight; $r++ ) {
				if ( $field === 'redirect' ) {
					// Redirect is in a funky place: the text lives in the nested
					// redirect.title property rather than on the field itself.
					$config[ 'properties' ][ 'redirect' ][ 'properties' ][ 'title' ][ 'copy_to' ][] = $destination;
				} else {
					$config[ 'properties' ][ $field ][ 'copy_to' ][] = $destination;
				}
			}
		}

		return $config;
	}

	/**
	 * Build the source_text index field
	 *
	 * Protected so that subclasses can supply a different field implementation.
	 *
	 * @param string $fieldName usually "source_text"
	 * @return SourceTextIndexField
	 */
	protected function buildSourceTextStringField( $fieldName ) {
		return new SourceTextIndexField( $fieldName, SearchIndexField::INDEX_TYPE_TEXT, $this->config );
	}

	/**
	 * Build the mapping of the 'text' field: the regular string field mapping
	 * plus a 'word_count' sub field of type token_count.
	 *
	 * @return array
	 */
	private function getTextFieldMapping() {
		$stringFieldMapping = $this->searchIndexFieldFactory->newStringField(
			'text',
			null,
			[]
		)->setMappingFlags( $this->flags )->getMapping( $this->engine );

		$extraFieldMapping = [
			'fields' => [
				'word_count' => [
					'type' => 'token_count',
					'analyzer' => 'plain',
				]
			]
		];

		// Recursive merge so that 'word_count' is added next to any sub fields
		// already present under 'fields' instead of replacing them.
		$textFieldMapping = array_merge_recursive( $stringFieldMapping, $extraFieldMapping );

		return $textFieldMapping;
	}

	/**
	 * Whether or not it's safe to optimize the analysis config.
	 * It's generally safe to optimize if all the analyzers needed are
	 * properly referenced in the mapping.
	 * In the case an analyzer is used directly in a query but not referenced
	 * in the mapping it's not safe to optimize.
	 *
	 * @return bool always true for this builder
	 */
	public function canOptimizeAnalysisConfig() {
		return true;
	}
}