Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
92.46% covered (success)
92.46%
184 / 199
62.50% covered (warning)
62.50%
5 / 8
CRAP
0.00% covered (danger)
0.00%
0 / 1
MappingConfigBuilder
92.46% covered (success)
92.46%
184 / 199
62.50% covered (warning)
62.50%
5 / 8
25.27
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
12 / 12
100.00% covered (success)
100.00%
1 / 1
2
 validatePlugins
27.27% covered (danger)
27.27%
3 / 11
0.00% covered (danger)
0.00%
0 / 1
14.62
 getDefaultFields
94.74% covered (success)
94.74%
108 / 114
0.00% covered (danger)
0.00%
0 / 1
6.01
 buildConfig
100.00% covered (success)
100.00%
39 / 39
100.00% covered (success)
100.00%
1 / 1
5
 setupCopyTo
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
4
 buildSourceTextStringField
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getTextFieldMapping
100.00% covered (success)
100.00%
15 / 15
100.00% covered (success)
100.00%
1 / 1
1
 canOptimizeAnalysisConfig
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace CirrusSearch\Maintenance;
4
5use CirrusSearch\CirrusSearch;
6use CirrusSearch\CirrusSearchHookRunner;
7use CirrusSearch\Search\CirrusIndexField;
8use CirrusSearch\Search\CirrusSearchIndexFieldFactory;
9use CirrusSearch\Search\SourceTextIndexField;
10use CirrusSearch\Search\TextIndexField;
11use CirrusSearch\SearchConfig;
12use MediaWiki\Language\Language;
13use MediaWiki\MediaWikiServices;
14use SearchIndexField;
15
/**
 * Builds search mapping configuration arrays.
 *
 * The mapping is assembled from a fixed set of default fields
 * (see getDefaultFields()), the fields reported by the search engine
 * (CirrusSearch::getSearchIndexFields()) and the aggregate "all" and
 * "all_near_match" fields that are populated through copy_to.
 *
 * @license GPL-2.0-or-later
 */
class MappingConfigBuilder {
    // Bit field parameters for buildConfig
    /** Also index titles with the 'word_prefix' analyzer (see getDefaultFields()). */
    public const PREFIX_START_WITH_ANY = 1;
    /** NOTE(review): not read anywhere in this class; presumably consumed by the field classes - confirm. */
    public const PHRASE_SUGGEST_USE_TEXT = 2;
    /** Set automatically by the constructor when $optimizeForExperimentalHighlighter is true. */
    public const OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER = 4;

    /**
     * Version number for the core analysis. Increment the major
     * version when the analysis changes in an incompatible way,
     * and change the minor version when it changes but isn't
     * incompatible
     */
    public const VERSION = '1.10';

    /**
     * @var bool should the index be optimized for the experimental highlighter?
     */
    private $optimizeForExperimentalHighlighter;

    /**
     * @var SearchConfig configuration obtained from $engine
     */
    private $config;

    /**
     * @var CirrusSearch engine handed to every field's getMapping() call
     */
    protected $engine;

    /**
     * @var CirrusSearchIndexFieldFactory
     */
    protected $searchIndexFieldFactory;

    /**
     * @var int bit field built from the class constants above
     */
    protected $flags = 0;
    /**
     * @var CirrusSearchHookRunner
     */
    private $cirrusSearchHookRunner;

    /** @var bool if the icu plugin is available */
    private bool $icu;
    /**
     * @var Language the content language
     */
    private Language $language;

    /**
     * @param bool $optimizeForExperimentalHighlighter should the index be optimized for the experimental highlighter?
     * @param array $plugins list of installed plugins
     * @param int $flags bit field of the class constants; OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER
     *  is added automatically when $optimizeForExperimentalHighlighter is true
     * @param SearchConfig|null $config passed as-is to the CirrusSearch engine; the builder then
     *  uses whatever config the engine reports
     * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner defaults to a runner built on the
     *  global hook container
     * @param Language|null $language defaults to the wiki content language
     * @throws \InvalidArgumentException when the configuration requires a plugin that is
     *  not in $plugins (see validatePlugins())
     */
    public function __construct(
        bool $optimizeForExperimentalHighlighter,
        array $plugins,
        int $flags = 0,
        ?SearchConfig $config = null,
        ?CirrusSearchHookRunner $cirrusSearchHookRunner = null,
        ?Language $language = null
    ) {
        $this->optimizeForExperimentalHighlighter = $optimizeForExperimentalHighlighter;
        if ( $this->optimizeForExperimentalHighlighter ) {
            $flags |= self::OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER;
        }
        $this->flags = $flags;
        $this->icu = Plugins::contains( 'analysis-icu', $plugins );
        $this->engine = new CirrusSearch( $config );
        $this->config = $this->engine->getConfig();
        $this->searchIndexFieldFactory = new CirrusSearchIndexFieldFactory( $this->config );
        $this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?? new CirrusSearchHookRunner(
            MediaWikiServices::getInstance()->getHookContainer() );
        $this->language = $language ?? MediaWikiServices::getInstance()->getContentLanguage();

        // Must run last: it relies on $this->config and $this->icu being initialised.
        $this->validatePlugins( $plugins );
    }

    /**
     * Fail early when the configuration asks for a feature whose plugin is missing.
     *
     * @param array $plugins list of installed plugins
     * @throws \InvalidArgumentException if a required plugin is not available
     */
    private function validatePlugins( array $plugins ): void {
        // The key has to match the setting named in the error message below
        // (wgCirrusSearchOptimizeIndexForExperimentalHighlighter). It was previously
        // spelled without "Index", which does not match that setting name.
        if ( $this->config->get( 'CirrusSearchOptimizeIndexForExperimentalHighlighter' ) &&
            !Plugins::contains( 'experimental-highlighter', $plugins )
        ) {
            throw new \InvalidArgumentException(
                "wgCirrusSearchOptimizeIndexForExperimentalHighlighter is set to true but the " .
                "'experimental-highlighter' plugin is not available."
            );
        }

        if ( $this->config->getElement( 'CirrusSearchNaturalTitleSort', 'build' ) && !$this->icu ) {
            throw new \InvalidArgumentException(
                "wgCirrusSearchNaturalTitleSort is set to build but the 'analysis-icu' plugin " .
                "is not available."
            );
        }
    }

    /**
     * Get definitions for default index fields.
     * These fields are always present in the index.
     * @return array mapping with 'dynamic' => false and a 'properties' map keyed by field name
     */
    private function getDefaultFields(): array {
        // Note never to set something as type='object' here because that isn't returned
        // by the search engine and is inferred anyway.
        // Extra sub-field definitions shared by 'title' and 'redirect.title'.
        $titleExtraAnalyzers = [
            [ 'analyzer' => 'prefix', 'search_analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => false ],
            [
                'analyzer' => 'prefix_asciifolding',
                'search_analyzer' => 'near_match_asciifolding',
                'index_options' => 'docs',
                'norms' => false
            ],
            [ 'analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => false ],
            [ 'analyzer' => 'near_match_asciifolding', 'index_options' => 'docs', 'norms' => false ],
            [ 'type' => 'keyword', 'normalizer' => 'keyword' ],
        ];
        if ( $this->flags & self::PREFIX_START_WITH_ANY ) {
            $titleExtraAnalyzers[] = [
                'analyzer' => 'word_prefix',
                'search_analyzer' => 'plain_search',
                'index_options' => 'docs'
            ];
        }
        // validatePlugins() already rejected "build requested but icu missing", so the
        // icu check here only guards the case where the sort field is not requested.
        if ( $this->icu && $this->config->getElement( 'CirrusSearchNaturalTitleSort', 'build' ) ) {
            $titleExtraAnalyzers[] = [
                'fieldName' => 'natural_sort',
                'type' => 'icu_collation_keyword',
                'ignore_above' => AnalysisConfigBuilder::KEYWORD_IGNORE_ABOVE,
                // doc values only
                'index' => false,
                'numeric' => true,
                'strength' => 'tertiary',
                // icu_collation_keyword will use new ULocale(String $l) if only provided the language
                // which supports BCP 47 language code.
                'language' => $this->language->toBcp47Code()
            ];
        }

        $suggestField = [
            'type' => 'text',
            'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest' ),
            'index_options' => 'freqs',
            'analyzer' => 'suggest',
        ];

        if ( $this->config->getElement( 'CirrusSearchPhraseSuggestReverseField', 'build' ) ) {
            $suggestField['fields'] = [
                'reverse' => [
                    'type' => 'text',
                    'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest', 'reverse' ),
                    'index_options' => 'freqs',
                    'analyzer' => 'suggest_reverse',
                ],
            ];
        }

        $page = [
            'dynamic' => false,
            'properties' => [
                'timestamp' => [
                    'type' => 'date',
                    'format' => 'dateOptionalTime',
                ],
                'create_timestamp' => [
                    'type' => 'date',
                    'format' => 'dateOptionalTime',
                ],
                'page_id' => [
                    'type' => 'long',
                    'index' => false,
                ],
                'wiki' => $this->searchIndexFieldFactory
                    ->newKeywordField( 'wiki' )
                    ->getMapping( $this->engine ),
                'namespace' => $this->searchIndexFieldFactory
                    ->newLongField( 'namespace' )
                    ->getMapping( $this->engine ),
                'namespace_text' => $this->searchIndexFieldFactory
                    ->newKeywordField( 'namespace_text' )
                    ->withDocValues()
                    ->getMapping( $this->engine ),
                'title' => $this->searchIndexFieldFactory
                    ->newStringField( 'title',
                        TextIndexField::ENABLE_NORMS
                        | TextIndexField::COPY_TO_SUGGEST
                        | TextIndexField::COPY_TO_SUGGEST_VARIANT
                        | TextIndexField::SUPPORT_REGEX,
                        $titleExtraAnalyzers )
                    ->setMappingFlags( $this->flags )
                    ->getMapping( $this->engine ),
                'text' => $this->getTextFieldMapping(),
                'text_bytes' => $this->searchIndexFieldFactory
                    ->newLongField( 'text_bytes' )
                    ->getMapping( $this->engine ),
                'source_text' => $this->buildSourceTextStringField( 'source_text' )
                    ->setMappingFlags( $this->flags )->getMapping( $this->engine ),
                'redirect' => [
                    'dynamic' => false,
                    'properties' => [
                        'namespace' => $this->searchIndexFieldFactory
                            ->newLongField( 'namespace' )
                            ->getMapping( $this->engine ),
                        'title' => $this->searchIndexFieldFactory
                            ->newStringField( 'redirect.title', TextIndexField::ENABLE_NORMS
                                | TextIndexField::SPEED_UP_HIGHLIGHTING
                                | TextIndexField::COPY_TO_SUGGEST
                                | TextIndexField::COPY_TO_SUGGEST_VARIANT
                                | TextIndexField::SUPPORT_REGEX,
                                $titleExtraAnalyzers
                            )
                            ->setMappingFlags( $this->flags )
                            ->getMapping( $this->engine ),
                    ]
                ],
                'incoming_links' => $this->searchIndexFieldFactory
                    ->newLongField( 'incoming_links' )
                    ->getMapping( $this->engine ),
                'local_sites_with_dupe' => $this->searchIndexFieldFactory
                    ->newKeywordField( 'local_sites_with_dupe' )
                    ->setFlag( SearchIndexField::FLAG_CASEFOLD )
                    ->getMapping( $this->engine ),
                'suggest' => $suggestField,
            ]
        ];

        // The variant field reuses the exact same definition as 'suggest'.
        if ( $this->config->get( 'CirrusSearchPhraseSuggestBuildVariant' ) ) {
            $page['properties']['suggest_variant'] = $suggestField;
        }

        return $page;
    }

    /**
     * Build the mapping config.
     *
     * Starts from the default fields, overlays the mappings of every field reported
     * by the engine, then adds the aggregate 'all' and 'all_near_match' fields and
     * wires the copy_to entries that feed them.
     *
     * Reads the global $wgCirrusSearchWeights; its 'title' and 'redirect' entries
     * are accessed unconditionally.
     *
     * @return array the mapping config
     */
    public function buildConfig() {
        global $wgCirrusSearchWeights;

        $page = $this->getDefaultFields();

        $fields = $this->engine->getSearchIndexFields();

        foreach ( $fields as $fieldName => $field ) {
            if ( $field instanceof CirrusIndexField ) {
                $field->setMappingFlags( $this->flags );
            }
            $config = $field->getMapping( $this->engine );
            // Fields that return an empty mapping are left out (or keep their default definition).
            if ( $config ) {
                $page['properties'][$fieldName] = $config;
            }
        }

        // Unclear how this would otherwise fit into the process to construct the mapping.
        // Not used directly in cirrus, supports queries from 'add-a-link' (T301096).
        if ( isset( $page['properties']['outgoing_link'] ) ) {
            $page['properties']['outgoing_link']['fields']['token_count'] = [
                'type' => 'token_count',
                'analyzer' => 'keyword',
            ];
        }

        // Now layer all the fields into the all field once per weight.  Querying it isn't strictly the
        // same as querying each field - in some ways it is better!  In others it is worse....

        // Better because theoretically tf/idf based scoring works better this way.
        // Worse because we have to analyze each field multiple times....  Bleh!
        // This field can't be used for the fvh/experimental highlighter for several reasons:
        // 1. It is built with copy_to and not stored.
        // 2. The term frequency information is skewed compared to the "real" source text.
        $allField = $this->searchIndexFieldFactory->
            newStringField( 'all', TextIndexField::ENABLE_NORMS );
        $page['properties']['all'] =
            $allField->setMappingFlags( $this->flags )->getMapping( $this->engine );
        $page = $this->setupCopyTo( $page, $wgCirrusSearchWeights, 'all' );

        // Now repeat for near_match fields.  The same considerations above apply except near_match
        // is never used in phrase queries or highlighting.
        $page[ 'properties' ][ 'all_near_match' ] = [
            'type' => 'text',
            'analyzer' => 'near_match',
            'index_options' => 'freqs',
            'norms' => false,
            'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match' ),
            'fields' => [
                'asciifolding' => [
                    'type' => 'text',
                    'analyzer' => 'near_match_asciifolding',
                    'index_options' => 'freqs',
                    'norms' => false,
                    'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match', 'asciifolding' ),
                ],
            ],
        ];
        // Only titles and redirect titles feed the near match field.
        $nearMatchFields = [
            'title' => $wgCirrusSearchWeights[ 'title' ],
            'redirect' => $wgCirrusSearchWeights[ 'redirect' ],
        ];
        return $this->setupCopyTo( $page, $nearMatchFields, 'all_near_match' );
    }

    /**
     * Setup copy_to for some fields to $destination.
     *
     * $destination is appended to a field's copy_to list once per unit of weight, so a
     * field with weight 3 is copied three times. The field is not checked for existence
     * in $config; a missing one ends up with an entry holding only 'copy_to'.
     *
     * @param array $config to modify
     * @param array $fields field name to number of times copied
     * @param string $destination destination of the copy
     * @return array $config modified with the copy_to setup
     */
    private function setupCopyTo( $config, $fields, $destination ): array {
        foreach ( $fields as $field => $weight ) {
            // Note that this causes weights that are not whole numbers to be rounded up.
            // We're ok with that because we don't have a choice.
            for ( $r = 0; $r < $weight; $r++ ) {
                if ( $field === 'redirect' ) {
                    // Redirect is in a funky place: the text lives in the nested 'title' property.
                    $config[ 'properties' ][ 'redirect' ][ 'properties' ][ 'title' ][ 'copy_to' ][] = $destination;
                } else {
                    $config[ 'properties' ][ $field ][ 'copy_to' ][] = $destination;
                }
            }
        }

        return $config;
    }

    /**
     * Build the source_text index field
     *
     * @param string $fieldName usually "source_text"
     * @return SourceTextIndexField
     */
    protected function buildSourceTextStringField( $fieldName ) {
        return new SourceTextIndexField( $fieldName, SearchIndexField::INDEX_TYPE_TEXT, $this->config );
    }

    /**
     * Build the mapping of the 'text' field: the regular string field mapping plus a
     * 'word_count' token_count sub-field.
     *
     * @return array
     */
    private function getTextFieldMapping(): array {
        $stringFieldMapping = $this->searchIndexFieldFactory->newStringField(
            'text',
            null,
            []
        )->setMappingFlags( $this->flags )->getMapping( $this->engine );

        $extraFieldMapping = [
            'fields' => [
                'word_count' => [
                    'type' => 'token_count',
                    'analyzer' => 'plain',
                ]
            ]
        ];

        // Recursive merge so 'word_count' is added next to any sub-fields the
        // string field mapping already defines under 'fields'.
        return array_merge_recursive( $stringFieldMapping, $extraFieldMapping );
    }

    /**
     * Whether or not it's safe to optimize the analysis config.
     * It's generally safe to optimize if all the analyzers needed are
     * properly referenced in the mapping.
     * In the case an analyzer is used directly in a query but not referenced
     * in the mapping it's not safe to optimize.
     *
     * @return bool always true for this builder
     */
    public function canOptimizeAnalysisConfig() {
        return true;
    }
}