Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
99.26% covered (success)
99.26%
944 / 951
72.00% covered (warning)
72.00%
18 / 25
CRAP
0.00% covered (danger)
0.00%
0 / 1
AnalysisConfigBuilder
99.26% covered (success)
99.26%
944 / 951
72.00% covered (warning)
72.00%
18 / 25
206
0.00% covered (danger)
0.00%
0 / 1
 __construct
95.83% covered (success)
95.83%
23 / 24
0.00% covered (danger)
0.00%
0 / 1
7
 shouldActivateIcuFolding
92.86% covered (success)
92.86%
13 / 14
0.00% covered (danger)
0.00%
0 / 1
9.03
 shouldActivateIcuTokenization
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
6
 buildConfig
100.00% covered (success)
100.00%
13 / 13
100.00% covered (success)
100.00%
1 / 1
4
 buildSimilarityConfig
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 enableICUTokenizer
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
6
 standardTokenizerOnlyCleanup
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
6
 disableLimitedMappings
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
4
 enableICUFolding
100.00% covered (success)
100.00%
32 / 32
100.00% covered (success)
100.00%
1 / 1
12
 switchFiltersToICUFolding
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 switchFiltersToICUFoldingPreserve
94.44% covered (success)
94.44%
17 / 18
0.00% covered (danger)
0.00%
0 / 1
7.01
 getICUSetFilter
97.96% covered (success)
97.96%
48 / 49
0.00% covered (danger)
0.00%
0 / 1
28
 getICUNormSetFilter
80.00% covered (warning)
80.00%
4 / 5
0.00% covered (danger)
0.00%
0 / 1
4.13
 defaults
100.00% covered (success)
100.00%
247 / 247
100.00% covered (success)
100.00%
1 / 1
5
 customize
100.00% covered (success)
100.00%
422 / 422
100.00% covered (success)
100.00%
1 / 1
65
 fixAsciiFolding
100.00% covered (success)
100.00%
16 / 16
100.00% covered (success)
100.00%
1 / 1
7
 getDefaultTextAnalyzerType
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
2
 getDefaultFilters
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
5
 resolveFilters
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
4
 replaceFilter
87.50% covered (warning)
87.50%
7 / 8
0.00% covered (danger)
0.00%
0 / 1
4.03
 mergeConfig
100.00% covered (success)
100.00%
17 / 17
100.00% covered (success)
100.00%
1 / 1
12
 buildLanguageConfigs
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
4
 isIcuAvailable
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 enableGlobalCustomFilters
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 buildGlobalCustomFilters
100.00% covered (success)
100.00%
20 / 20
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3namespace CirrusSearch\Maintenance;
4
5use CirrusSearch\CirrusSearch;
6use CirrusSearch\CirrusSearchHookRunner;
7use CirrusSearch\Profile\SearchProfileService;
8use CirrusSearch\SearchConfig;
9use MediaWiki\MediaWikiServices;
10
11/**
12 * Builds elasticsearch analysis config arrays.
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License along
25 * with this program; if not, write to the Free Software Foundation, Inc.,
26 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
27 * http://www.gnu.org/copyleft/gpl.html
28 */
29class AnalysisConfigBuilder {
30    /**
31     * Version number for the core analysis. Increment the major
32     * version when the analysis changes in an incompatible way,
33     * and change the minor version when it changes but isn't
34     * incompatible.
35     *
36     * You may also need to increment MetaStoreIndex::METASTORE_VERSION
37     * manually as well.
38     */
39    public const VERSION = '0.12';
40
41    /**
42     * Maximum number of characters allowed in keyword terms.
43     */
44    private const KEYWORD_IGNORE_ABOVE = 5000;
45
46    /**
47     * Temporary magic value to prevent enabling ICU tokenizer in specific analyzers
48     */
49    private const STANDARD_TOKENIZER_ONLY = 'std_only';
50
51    /**
52     * @var bool is the icu plugin available?
53     */
54    private $icu;
55
56    /**
57     * @var array Similarity algo (tf/idf, bm25, etc) configuration
58     */
59    private $similarity;
60
61    /**
62     * @var SearchConfig cirrus config
63     */
64    protected $config;
65
66    /**
67     * @var string[] list of plugins installed in Elasticsearch
68     */
69    private $plugins;
70
71    /**
72     * @var string language code used when buildConfig() is called without a language
73     */
74    protected $defaultLanguage;
75
76    /**
77     * @var CirrusSearchHookRunner
78     */
79    private $cirrusSearchHookRunner;
80
81    /**
82     * @var GlobalCustomFilter[] filters returned by buildGlobalCustomFilters() at construction time
83     */
84    public $globalCustomFilters;
85
86    /**
87     * @param string $langCode The language code to build config for
88     * @param string[] $plugins list of plugins installed in Elasticsearch
89     * @param SearchConfig|null $config
90     * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner
91     */
92    public function __construct(
93        $langCode,
94        array $plugins,
95        SearchConfig $config = null,
96        CirrusSearchHookRunner $cirrusSearchHookRunner = null
97    ) {
98        $this->globalCustomFilters = $this->buildGlobalCustomFilters();
99
100        $this->defaultLanguage = $langCode;
101        $this->plugins = $plugins;
102        foreach ( $this->elasticsearchLanguageAnalyzersFromPlugins as $pluginSpec => $extra ) {
103            $pluginsPresent = 1;
104            $pluginList = explode( ',', $pluginSpec );
105            foreach ( $pluginList as $plugin ) {
106                if ( !in_array( $plugin, $plugins ) ) {
107                    $pluginsPresent = 0;
108                    break;
109                }
110            }
111            if ( $pluginsPresent ) {
112                $this->elasticsearchLanguageAnalyzers =
113                    array_merge( $this->elasticsearchLanguageAnalyzers, $extra );
114            }
115        }
116        $this->icu = in_array( 'analysis-icu', $plugins );
117        $config ??= MediaWikiServices::getInstance()->getConfigFactory()
118            ->makeConfig( 'CirrusSearch' );
119        $similarity = $config->getProfileService()->loadProfile( SearchProfileService::SIMILARITY );
120        if ( !array_key_exists( 'similarity', $similarity ) ) {
121            $similarity['similarity'] = [];
122        }
123        $this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?: new CirrusSearchHookRunner(
124            MediaWikiServices::getInstance()->getHookContainer() );
125        $this->cirrusSearchHookRunner->onCirrusSearchSimilarityConfig( $similarity['similarity'] );
126        $this->similarity = $similarity;
127
128        $this->config = $config;
129    }
130
131    /**
132     * Determine if ascii folding should be used
133     * @param string $language Config language
134     * @return bool true if icu folding should be enabled
135     */
136    public function shouldActivateIcuFolding( $language ) {
137        if ( !$this->isIcuAvailable() || !in_array( 'extra', $this->plugins ) ) {
138            // ICU folding requires the icu plugin and the extra plugin
139            return false;
140        }
141        $in_config = $this->config->get( 'CirrusSearchUseIcuFolding' );
142        // BC code, this config var was originally a simple boolean
143        if ( $in_config === true ) {
144            $in_config = 'yes';
145        }
146        if ( $in_config === false ) {
147            $in_config = 'no';
148        }
149        switch ( $in_config ) {
150        case 'yes':
151            return true;
152        case 'no':
153            return false;
154        case 'default':
155            return $this->languagesWithIcuFolding[$language] ?? false;
156        default:
157            return false;
158        }
159    }
160
161    /**
162     * Determine if the icu tokenizer can be enabled
163     * @param string $language Config language
164     * @return bool
165     */
166    public function shouldActivateIcuTokenization( $language ) {
167        if ( !$this->isIcuAvailable() ) {
168            // requires the icu plugin
169            return false;
170        }
171        $in_config = $this->config->get( 'CirrusSearchUseIcuTokenizer' );
172        switch ( $in_config ) {
173        case 'yes':
174            return true;
175        case 'no':
176            return false;
177        case 'default':
178            return $this->languagesWithIcuTokenization[$language] ?? false;
179        default:
180            return false;
181        }
182    }
183
184    /**
185     * Build the analysis config.
186     *
187     * @param string|null $language Config language
188     * @return array the analysis config
189     */
190    public function buildConfig( $language = null ) {
191        $language ??= $this->defaultLanguage;
192        $config = $this->customize( $this->defaults( $language ), $language );
193        $this->cirrusSearchHookRunner->onCirrusSearchAnalysisConfig( $config, $this );
194        $config = $this->enableGlobalCustomFilters( $config, $language );
195        if ( $this->shouldActivateIcuTokenization( $language ) ) {
196            $config = $this->enableICUTokenizer( $config );
197        }
198        if ( $this->shouldActivateIcuFolding( $language ) ) {
199            $config = $this->enableICUFolding( $config, $language );
200        }
201        $config = $this->fixAsciiFolding( $config );
202        $config = $this->standardTokenizerOnlyCleanup( $config );
203        if ( !in_array( 'extra-analysis-textify', $this->plugins ) ) {
204            $config = $this->disableLimitedMappings( $config );
205        }
206
207        return $config;
208    }
209
210    /**
211     * @return array|null the similarity config
212     */
213    public function buildSimilarityConfig() {
214        return $this->similarity['similarity'] ?? null;
215    }
216
217    /**
218     * replace the standard tokenizer with icu_tokenizer
219     * @param mixed[] $config
220     * @return mixed[] update config
221     */
222    public function enableICUTokenizer( array $config ) {
223        foreach ( $config[ 'analyzer' ] as $name => &$value ) {
224            if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) {
225                continue;
226            }
227            if ( isset( $value[ 'tokenizer' ] ) && $value[ 'tokenizer' ] === 'standard' ) {
228                $value[ 'tokenizer' ] = 'icu_tokenizer';
229            }
230        }
231        return $config;
232    }
233
234    /**
235     * replace STANDARD_TOKENIZER_ONLY with the actual standard tokenizer
236     * @param mixed[] $config
237     * @return mixed[] update config
238     */
239    public function standardTokenizerOnlyCleanup( array $config ) {
240        foreach ( $config[ 'analyzer' ] as $name => &$value ) {
241            if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) {
242                continue;
243            }
244            if ( isset( $value[ 'tokenizer' ] ) &&
245                    $value[ 'tokenizer' ] === self::STANDARD_TOKENIZER_ONLY ) {
246                // if we blocked upgrades/changes to the standard tokenizer,
247                // replace the magic value with the actual standard tokenizer
248                $value[ 'tokenizer' ] = 'standard';
249            }
250        }
251        return $config;
252    }
253
254    /**
255     * replace limited_mappings with mappings if limited_mapping is unavailable
256     * @param mixed[] $config
257     * @return mixed[] update config
258     */
259    public function disableLimitedMappings( array $config ) {
260        foreach ( $config[ 'char_filter' ] as $name => &$value ) {
261            if ( !isset( $value[ 'type' ] ) || $value[ 'type' ] != 'limited_mapping' ) {
262                continue;
263            }
264            $value[ 'type' ] = 'mapping';
265        }
266        return $config;
267    }
268
269    /**
270     * Activate ICU folding instead of asciifolding
271     * @param mixed[] $config
272     * @param string $language Config language
273     * @return mixed[] update config
274     */
275    public function enableICUFolding( array $config, $language ) {
276        $unicodeSetFilter = $this->getICUSetFilter( $language );
277        $filter = [
278            'type' => 'icu_folding',
279        ];
280        if ( $unicodeSetFilter !== null ) {
281            $filter[ 'unicodeSetFilter' ] = $unicodeSetFilter;
282        }
283        $config[ 'filter' ][ 'icu_folding' ] = $filter;
284
285        // Adds a simple nfkc normalizer for cases where
286        // we preserve original but the lowercase filter
287        // is not used before
288        $config[ 'filter' ][ 'icu_nfkc_normalization' ] = [
289            'type' => 'icu_normalizer',
290            'name' => 'nfkc',
291        ];
292
293        $newfilters = [];
294        foreach ( $config[ 'analyzer' ] as $name => $value ) {
295            if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) {
296                continue;
297            }
298            if ( !isset( $value[ 'filter' ] ) ) {
299                continue;
300            }
301            if ( in_array( 'asciifolding', $value[ 'filter' ] ) ) {
302                $newfilters[ $name ] = $this->switchFiltersToICUFolding( $value[ 'filter' ] );
303            }
304            if ( in_array( 'asciifolding_preserve', $value[ 'filter' ] ) ) {
305                $newfilters[ $name ] = $this->switchFiltersToICUFoldingPreserve( $value[ 'filter' ] );
306            }
307        }
308
309        foreach ( $newfilters as $name => $filters ) {
310            $config[ 'analyzer' ][ $name ][ 'filter' ] = $filters;
311        }
312        // Explicitly enable icu_folding on plain analyzers if it's not
313        // already enabled
314        foreach ( [ 'plain' ] as $analyzer ) {
315            if ( !isset( $config[ 'analyzer' ][ $analyzer ] ) ) {
316                continue;
317            }