Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
99.10% covered (success)
99.10%
1205 / 1216
72.41% covered (warning)
72.41%
21 / 29
CRAP
0.00% covered (danger)
0.00%
0 / 1
AnalysisConfigBuilder
99.10% covered (success)
99.10%
1205 / 1216
72.41% covered (warning)
72.41%
21 / 29
220
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
26 / 26
100.00% covered (success)
100.00%
1 / 1
6
 shouldActivateIcuFolding
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
9
 shouldActivateIcuTokenization
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
7
 buildConfig
100.00% covered (success)
100.00%
12 / 12
100.00% covered (success)
100.00%
1 / 1
4
 buildSimilarityConfig
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 enableICUTokenizer
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
6
 standardTokenizerOnlyCleanup
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
6
 disableLimitedMappings
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
4
 enableICUFolding
100.00% covered (success)
100.00%
35 / 35
100.00% covered (success)
100.00%
1 / 1
13
 switchFiltersToICUFolding
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 switchFiltersToICUFoldingPreserve
94.12% covered (success)
94.12%
16 / 17
0.00% covered (danger)
0.00%
0 / 1
7.01
 addRemoveEmpty
93.75% covered (success)
93.75%
15 / 16
0.00% covered (danger)
0.00%
0 / 1
8.02
 getICUSetFilter
66.67% covered (warning)
66.67%
2 / 3
0.00% covered (danger)
0.00%
0 / 1
2.15
 getICUNormSetFilter
80.00% covered (warning)
80.00%
4 / 5
0.00% covered (danger)
0.00%
0 / 1
4.13
 defaults
100.00% covered (success)
100.00%
322 / 322
100.00% covered (success)
100.00%
1 / 1
7
 customize
100.00% covered (success)
100.00%
635 / 635
100.00% covered (success)
100.00%
1 / 1
100
 enableLangNormCharMap
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 getDefaultTextAnalyzerType
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getDefaultFilters
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
5
 resolveFilters
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
4
 replaceFilter
87.50% covered (warning)
87.50%
7 / 8
0.00% covered (danger)
0.00%
0 / 1
4.03
 mergeAnalyzerConfig
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 mergeNormalizerConfig
0.00% covered (danger)
0.00%
0 / 4
0.00% covered (danger)
0.00%
0 / 1
6
 mergeAnalysisComponent
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
6
 buildLanguageConfigs
91.67% covered (success)
91.67%
11 / 12
0.00% covered (danger)
0.00%
0 / 1
5.01
 isIcuAvailable
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 isTextifyAvailable
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 enableGlobalCustomFilters
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 buildGlobalCustomFilters
100.00% covered (success)
100.00%
37 / 37
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3namespace CirrusSearch\Maintenance;
4
5use CirrusSearch\CirrusSearch;
6use CirrusSearch\CirrusSearchHookRunner;
7use CirrusSearch\Profile\SearchProfileService;
8use CirrusSearch\SearchConfig;
9use MediaWiki\MediaWikiServices;
10
11/**
12 * Builds search analysis config arrays.
13 *
14 * @license GPL-2.0-or-later
15 */
16class AnalysisConfigBuilder {
17    /**
18     * Maximum number of characters allowed in keyword terms.
19     */
20    public const KEYWORD_IGNORE_ABOVE = 5000;
21
22    /**
23     * Temporary magic value to prevent enabling ICU tokenizer in specific analyzers
24     */
25    private const STANDARD_TOKENIZER_ONLY = 'std_only';
26
27    /**
28     * @var bool is the icu plugin available?
29     */
30    private $icu;
31
32    /**
33     * @var bool is the textify plugin available?
34     */
35    private $textify;
36
37    /**
38     * @var string which ICU tokenizer should be used
39     */
40    private $icu_tokenizer = 'icu_tokenizer';
41
42    /**
43     * @var array Similarity algo (tf/idf, bm25, etc) configuration
44     */
45    private $similarity;
46
47    /**
48     * @var SearchConfig cirrus config
49     */
50    protected $config;
51
52    /**
53     * @var string[]
54     */
55    private $plugins;
56
57    /**
58     * @var string
59     */
60    protected $defaultLanguage;
61
62    /**
63     * @var CirrusSearchHookRunner
64     */
65    private $cirrusSearchHookRunner;
66
67    /**
68     * @var GlobalCustomFilter[]
69     */
70    public $globalCustomFilters;
71
72    /**
73     * @param string $langCode The language code to build config for
74     * @param string[] $plugins list of installed plugins
75     * @param SearchConfig|null $config
76     * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner
77     */
78    public function __construct(
79        $langCode,
80        array $plugins,
81        ?SearchConfig $config = null,
82        ?CirrusSearchHookRunner $cirrusSearchHookRunner = null
83    ) {
84        $this->globalCustomFilters = $this->buildGlobalCustomFilters();
85
86        $this->defaultLanguage = $langCode;
87        $this->plugins = $plugins;
88        foreach ( $this->searchLanguageAnalyzersFromPlugins as $pluginSpec => $extra ) {
89            $pluginsPresent = 1;
90            $pluginList = explode( ',', $pluginSpec );
91            foreach ( $pluginList as $plugin ) {
92                if ( !Plugins::contains( $plugin, $plugins ) ) {
93                    $pluginsPresent = 0;
94                    break;
95                }
96            }
97            if ( $pluginsPresent ) {
98                $this->customSearchLanguageAnalyzers =
99                    array_merge( $this->customSearchLanguageAnalyzers, $extra );
100            }
101        }
102        $this->icu = Plugins::contains( 'analysis-icu', $plugins );
103        $this->textify = Plugins::contains( 'extra-analysis-textify', $plugins );
104        if ( $this->isTextifyAvailable() ) {
105            // icu_token_repair can only work with the textify icu_tokenizer clone
106            $this->icu_tokenizer = 'textify_icu_tokenizer';
107        }
108        $config ??= MediaWikiServices::getInstance()->getConfigFactory()
109            ->makeConfig( 'CirrusSearch' );
110        $similarity = $config->getProfileService()->loadProfile( SearchProfileService::SIMILARITY );
111        $similarity['similarity'] ??= [];
112        $this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?? new CirrusSearchHookRunner(
113            MediaWikiServices::getInstance()->getHookContainer() );
114        $this->cirrusSearchHookRunner->onCirrusSearchSimilarityConfig( $similarity['similarity'] );
115        $this->similarity = $similarity;
116
117        $this->config = $config;
118    }
119
120    /**
121     * Determine if asciifolding should be upgraded to icu_folding, or icu_folding should
122     * be stripped.
123     * @param string $language Config language
124     * @return bool true if icu folding should be enabled
125     */
126    public function shouldActivateIcuFolding( $language ) {
127        if ( !$this->isIcuAvailable() || !Plugins::contains( 'extra', $this->plugins ) ) {
128            // ICU folding requires the icu plugin and the extra plugin
129            return false;
130        }
131        $in_config = $this->config->get( 'CirrusSearchUseIcuFolding' );
132        // BC code, this config var was originally a simple boolean
133        if ( $in_config === true ) {
134            $in_config = 'yes';
135        }
136        if ( $in_config === false ) {
137            $in_config = 'no';
138        }
139        switch ( $in_config ) {
140            case 'yes':
141                return true;
142            case 'no':
143                return false;
144            case 'default':
145                return $this->languagesWithIcuFolding[$language] ?? false;
146            default:
147                return false;
148        }
149    }
150
151    /**
152     * Determine if the icu_tokenizer can replace the standard tokenizer for this language
153     * @param string $language Config language
154     * @return bool
155     */
156    public function shouldActivateIcuTokenization( $language ) {
157        if ( !$this->isIcuAvailable() && !$this->isTextifyAvailable() ) {
158            // requires the icu or textify plugin
159            return false;
160        }
161        $in_config = $this->config->get( 'CirrusSearchUseIcuTokenizer' );
162        switch ( $in_config ) {
163            case 'yes':
164                return true;
165            case 'no':
166                return false;
167            case 'default':
168                // languagesWithIcuTokenization[] gives absolute answers for specific languages.
169                // If the textify plugin is available, the default is 'yes'/true because we
170                // have icu_token_repair available; if not, the default is 'no'/false
171                return $this->languagesWithIcuTokenization[$language] ?? $this->isTextifyAvailable();
172            default:
173                return false;
174        }
175    }
176
177    /**
178     * Build the analysis config.
179     *
180     * @param string|null $language Config language
181     * @return array the analysis config
182     */
183    public function buildConfig( $language = null ) {
184        $language ??= $this->defaultLanguage;
185        $config = $this->customize( $this->defaults( $language ), $language );
186        $this->cirrusSearchHookRunner->onCirrusSearchAnalysisConfig( $config, $this );
187
188        if ( $this->shouldActivateIcuTokenization( $language ) ) {
189            $config = $this->enableICUTokenizer( $config );
190        }
191
192        if ( $this->shouldActivateIcuFolding( $language ) ) {
193            $config = $this->enableICUFolding( $config, $language );
194        }
195
196        $config = $this->standardTokenizerOnlyCleanup( $config );
197        if ( !$this->isTextifyAvailable() ) {
198            $config = $this->disableLimitedMappings( $config );
199        }
200
201        // should come after other upgrades to get the full context
202        $config = $this->enableGlobalCustomFilters( $config, $language );
203
204        return $config;
205    }
206
207    /**
208     * @return array|null the similarity config
209     */
210    public function buildSimilarityConfig() {
211        return $this->similarity['similarity'] ?? null;
212    }
213
214    /**
215     * replace the standard tokenizer with icu_tokenizer
216     * @param mixed[] $config
217     * @return mixed[] update config
218     */
219    public function enableICUTokenizer( array $config ) {
220        foreach ( $config[ 'analyzer' ] as $name => &$value ) {
221            if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) {
222                continue;
223            }
224            if ( isset( $value[ 'tokenizer' ] ) && $value[ 'tokenizer' ] === 'standard' ) {
225                $value[ 'tokenizer' ] = $this->icu_tokenizer;
226            }
227        }
228        return $config;
229    }
230
231    /**
232     * replace STANDARD_TOKENIZER_ONLY with the actual standard tokenizer
233     * @param mixed[] $config
234     * @return mixed[] update config
235     */
236    public function standardTokenizerOnlyCleanup( array $config ) {
237        foreach ( $config[ 'analyzer' ] as $name => &$value ) {
238            if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) {
239                continue;
240            }
241            if ( isset( $value[ 'tokenizer' ] ) &&
242                    $value[ 'tokenizer' ] === self::STANDARD_TOKENIZER_ONLY ) {
243                // if we blocked upgrades/changes to the standard tokenizer,
244                // replace the magic value with the actual standard tokenizer
245                $value[ 'tokenizer' ] = 'standard';
246            }
247        }
248        return $config;
249    }
250
251    /**
252     * replace limited_mappings with mappings if limited_mapping is unavailable
253     * @param mixed[] $config
254     * @return mixed[] update config
255     */
256    public function disableLimitedMappings( array $config ) {
257        foreach ( $config[ 'char_filter' ] as $name => &$value ) {
258            if ( !isset( $value[ 'type' ] ) || $value[ 'type' ] != 'limited_mapping' ) {
259                continue;
260            }
261            $value[ 'type' ] = 'mapping';
262        }
263        return $config;
264    }
265
266    /**
267     * Activate ICU folding instead of asciifolding
268     * @param mixed[] $config
269     * @param string $language Config language
270     * @return mixed[] update config
271     */
272    public function enableICUFolding( array $config, $language ) {
273        $unicodeSetFilter = $this->getICUSetFilter( $language );
274        $filter = [
275            'type' => 'icu_folding',
276        ];
277        if ( $unicodeSetFilter !== null ) {
278            $filter[ 'unicode_set_filter' ] = $unicodeSetFilter;
279        }
280        $config[ 'filter' ][ 'icu_folding' ] = $filter;
281
282        // Adds a simple nfkc normalizer for cases where
283        // we preserve original but the lowercase filter
284        // is not used before
285        $config[ 'filter' ][ 'icu_nfkc_normalization' ] = [
286            'type' => 'icu_normalizer',
287            'name' => 'nfkc',
288        ];
289
290        $newfilters = [];
291        foreach ( $config[ 'analyzer' ] as $name => $value ) {
292            if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) {
293                continue;
294            }
295            if ( !isset( $value[ 'filter' ] ) ) {
296                continue;
297            }
298            if ( in_array( 'asciifolding', $value[ 'filter' ] ) ) {
299                $newfilters[ $name ] = $this->switchFiltersToICUFolding( $value[ 'filter' ] );
300            }
301            if ( in_array( 'asciifolding_preserve', $value[ 'filter' ] ) ) {
302                $newfilters[ $name ] = $this->switchFiltersToICUFoldingPreserve( $value[ 'filter' ] );
303            }
304        }
305
306        foreach ( $newfilters as $name => $filters ) {
307            $config[ 'analyzer' ][ $name ][ 'filter' ] = $filters;
308        }
309        // Explicitly enable icu_folding on plain analyzers if it's not
310        // already enabled
311        if ( isset( $config[ 'analyzer' ][ 'plain' ] ) ) {
312            if ( !isset( $config[ 'analyzer' ][ 'plain' ][ 'filter' ] ) ) {
313                $config[ 'analyzer' ][ 'plain' ][ 'filter' ] = [];
314            }
315            $config[ 'analyzer' ][ 'plain' ][ 'filter' ] =
316                $this->switchFiltersToICUFoldingPreserve(
317                    $config[ 'analyzer' ][ 'plain' ][ 'filter' ], true );
318        }
319
320        // if lowercase_keyword exists, add icu_folding
321        if ( isset( $config[ 'analyzer' ][ 'lowercase_keyword' ] ) ) {
322            $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'icu_folding';
323        }
324        if ( isset( $config[ 'normalizer' ][ 'lowercase_keyword' ] ) ) {
325            $config[ 'normalizer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'icu_folding';
326        }
327
328        // add remove_empty everywhere icu_folding happens, not just the ones we added here
329        $config = $this->addRemoveEmpty( $config );
330
331        return $config;
332    }
333
334    /**
335     * Replace occurrence of asciifolding to icu_folding
336     * @param string[] $filters
337     * @return string[] new list of filters
338     */
339    private function switchFiltersToICUFolding( array $filters ) {
340        return array_replace( $filters,
341            [ array_search( 'asciifolding', $filters ) => 'icu_folding' ] );
342    }
343
344    /**
345     * Replace occurrence of asciifolding_preserve with a set
346     * of compatible filters to enable icu_folding
347     * @param string[] $filters
348     * @param bool $append append icu_folding even if asciifolding is not present
349     * @return string[] new list of filters
350     */
351    private function switchFiltersToICUFoldingPreserve( array $filters, $append = false ) {
352        if ( in_array( 'icu_folding', $filters ) ) {
353            // ICU folding already here
354            return $filters;
355        }
356        $ap_idx = array_search( 'asciifolding_preserve', $filters );
357        if ( $ap_idx === false && $append ) {
358            $ap_idx = count( $filters );
359            // fake an asciifolding_preserve so we can
360            // reuse code that replaces it
361            $filters[] = 'asciifolding_preserve';
362        }
363        if ( $ap_idx === false ) {
364            return $filters;
365        }
366        // with ICU lowercase is replaced by icu_normalizer/nfkc_cf
367        // thus unicode normalization is already done.
368        $lc_idx = array_search( 'icu_normalizer', $filters );
369        $newfilters = [];
370        if ( $lc_idx === false || $lc_idx > $ap_idx ) {
371            // If lowercase is not detected before we
372            // will have to do some icu normalization
373            // this is to prevent preserving "un-normalized"
374            // unicode chars.
375            $newfilters[] = 'icu_nfkc_normalization';
376        }
377        $newfilters[] = 'preserve_original_recorder';
378        $newfilters[] = 'icu_folding';
379        $newfilters[] = 'preserve_original';
380        array_splice( $filters, $ap_idx, 1, $newfilters );
381        return $filters;
382    }
383
384    /**
385     * Add remove_empty as needed after icu_folding/preserve_original
386     * @param mixed[] $config
387     * @return mixed[] update config
388     */
389    protected function addRemoveEmpty( array $config ) {
390        foreach ( $config[ 'analyzer' ] as $name => $value ) {
391            if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) {
392                continue;
393            }
394            if ( !isset( $value[ 'filter' ] ) ) {
395                continue;
396            }
397
398            $filters = $value[ 'filter' ];
399            $target_idx = array_search( 'icu_folding', $filters );
400            $re_idx = array_search( 'remove_empty', $filters );
401            if ( !$target_idx || $re_idx > $target_idx ) {
402                // if remove_empty is after icu_folding, we don't need to do anything
403                continue;
404            }
405
406            $po_idx = array_search( 'preserve_original', $filters );
407            if ( $po_idx == $target_idx + 1 ) {
408                // if preserve_original comes right after icu_folding, add remove_empty
409                // after preserve_original rather than icu_folding
410                $target_idx += 1;
411            }
412
413            array_splice( $filters, $target_idx + 1, 0, 'remove_empty' );
414            $config[ 'analyzer' ][ $name ][ 'filter' ] = $filters;
415        }
416        return $config;
417    }
418
419    /**
420     * Return the list of chars to exclude from ICU folding
421     * @param string $language Config language
422     * @return null|string
423     */
424    protected function getICUSetFilter( $language ) {
425        if ( $this->config->get( 'CirrusSearchICUFoldingUnicodeSetFilter' ) !== null ) {
426            return $this->config->get( 'CirrusSearchICUFoldingUnicodeSetFilter' );
427        }
428        return $this->icuSetFilters[ $language ] ?? null;
429    }
430
431    /**
432     * Return the list of chars to exclude from ICU normalization
433     * @param string $language Config language
434     * @return null|string
435     */
436    protected function getICUNormSetFilter( $language ) {
437        if ( $this->config->get( 'CirrusSearchICUNormalizationUnicodeSetFilter' ) !== null ) {
438            return $this->config->get( 'CirrusSearchICUNormalizationUnicodeSetFilter' );
439        }
440        switch ( $language ) {
441            case 'de':
442                return '[^ẞß]'; // T281379 Capital áºž is lowercased to ÃŸ by german_charfilter
443                                // lowercase ÃŸ is normalized to ss by german_normalization
444            default:
445                return null;
446        }
447    }
448
449    /**
450     * Build an analysis config with sane defaults.
451     *
452     * @param string $language Config language
453     * @return array
454     */
455    private function defaults( $language ) {
456        $defaults = [
457            'analyzer' => [
458                'text' => [