Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
99.28% covered (success)
99.28%
965 / 972
72.00% covered (warning)
72.00%
18 / 25
CRAP
0.00% covered (danger)
0.00%
0 / 1
AnalysisConfigBuilder
99.28% covered (success)
99.28%
965 / 972
72.00% covered (warning)
72.00%
18 / 25
210
0.00% covered (danger)
0.00%
0 / 1
 __construct
95.83% covered (success)
95.83%
23 / 24
0.00% covered (danger)
0.00%
0 / 1
7
 shouldActivateIcuFolding
92.86% covered (success)
92.86%
13 / 14
0.00% covered (danger)
0.00%
0 / 1
9.03
 shouldActivateIcuTokenization
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
6
 buildConfig
100.00% covered (success)
100.00%
11 / 11
100.00% covered (success)
100.00%
1 / 1
3
 buildSimilarityConfig
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 enableICUTokenizer
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
6
 standardTokenizerOnlyCleanup
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
6
 enableICUFolding
100.00% covered (success)
100.00%
32 / 32
100.00% covered (success)
100.00%
1 / 1
12
 switchFiltersToICUFolding
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 switchFiltersToICUFoldingPreserve
94.44% covered (success)
94.44%
17 / 18
0.00% covered (danger)
0.00%
0 / 1
7.01
 getICUSetFilter
97.96% covered (success)
97.96%
48 / 49
0.00% covered (danger)
0.00%
0 / 1
28
 getICUNormSetFilter
80.00% covered (warning)
80.00%
4 / 5
0.00% covered (danger)
0.00%
0 / 1
4.13
 defaults
100.00% covered (success)
100.00%
210 / 210
100.00% covered (success)
100.00%
1 / 1
5
 customize
100.00% covered (success)
100.00%
484 / 484
100.00% covered (success)
100.00%
1 / 1
66
 fixAsciiFolding
100.00% covered (success)
100.00%
16 / 16
100.00% covered (success)
100.00%
1 / 1
7
 getDefaultTextAnalyzerType
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
2
 getDefaultFilters
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
5
 resolveFilters
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
4
 replaceFilter
87.50% covered (warning)
87.50%
7 / 8
0.00% covered (danger)
0.00%
0 / 1
4.03
 mergeConfig
100.00% covered (success)
100.00%
16 / 16
100.00% covered (success)
100.00%
1 / 1
11
 buildLanguageConfigs
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
4
 isIcuAvailable
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 enableGlobalCustomFilters
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
4
 insertGlobalCustomFilter
100.00% covered (success)
100.00%
13 / 13
100.00% covered (success)
100.00%
1 / 1
6
 buildGlobalCustomFilters
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3namespace CirrusSearch\Maintenance;
4
5use CirrusSearch\CirrusSearch;
6use CirrusSearch\CirrusSearchHookRunner;
7use CirrusSearch\Profile\SearchProfileService;
8use CirrusSearch\SearchConfig;
9use MediaWiki\MediaWikiServices;
10
11/**
12 * Builds elasticsearch analysis config arrays.
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License along
25 * with this program; if not, write to the Free Software Foundation, Inc.,
26 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
27 * http://www.gnu.org/copyleft/gpl.html
28 */
29class AnalysisConfigBuilder {
30    /**
31     * Version number for the core analysis. Increment the major
32     * version when the analysis changes in an incompatible way,
33     * and change the minor version when it changes but isn't
34     * incompatible.
35     *
36     * You may also need to increment MetaStoreIndex::METASTORE_VERSION
37     * manually as well.
38     */
39    public const VERSION = '0.12';
40
41    /**
42     * Maximum number of characters allowed in keyword terms.
43     */
44    private const KEYWORD_IGNORE_ABOVE = 5000;
45
46    /**
47     * Temporary magic value to prevent enabling ICU tokenizer in specific analyzers
48     */
49    private const STANDARD_TOKENIZER_ONLY = 'std_only';
50
51    /**
52     * @var bool is the icu plugin available?
53     */
54    private $icu;
55
56    /**
57     * @var array Similarity algo (tf/idf, bm25, etc) configuration
58     */
59    private $similarity;
60
61    /**
62     * @var SearchConfig cirrus config
63     */
64    protected $config;
65    /**
66     * @var string[]
67     */
68    private $plugins;
69
70    /**
71     * @var string
72     */
73    protected $defaultLanguage;
74
75    /**
76     * @var CirrusSearchHookRunner
77     */
78    private $cirrusSearchHookRunner;
79
80    /**
81     * @var GlobalCustomFilter[]
82     */
83    public $globalCustomFilters;
84
85    /**
86     * @param string $langCode The language code to build config for
87     * @param string[] $plugins list of plugins installed in Elasticsearch
88     * @param SearchConfig|null $config
89     * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner
90     */
91    public function __construct(
92        $langCode,
93        array $plugins,
94        SearchConfig $config = null,
95        CirrusSearchHookRunner $cirrusSearchHookRunner = null
96    ) {
97        $this->globalCustomFilters = $this->buildGlobalCustomFilters();
98
99        $this->defaultLanguage = $langCode;
100        $this->plugins = $plugins;
101        foreach ( $this->elasticsearchLanguageAnalyzersFromPlugins as $pluginSpec => $extra ) {
102            $pluginsPresent = 1;
103            $pluginList = explode( ',', $pluginSpec );
104            foreach ( $pluginList as $plugin ) {
105                if ( !in_array( $plugin, $plugins ) ) {
106                    $pluginsPresent = 0;
107                    break;
108                }
109            }
110            if ( $pluginsPresent ) {
111                $this->elasticsearchLanguageAnalyzers =
112                    array_merge( $this->elasticsearchLanguageAnalyzers, $extra );
113            }
114        }
115        $this->icu = in_array( 'analysis-icu', $plugins );
116        $config ??= MediaWikiServices::getInstance()->getConfigFactory()
117            ->makeConfig( 'CirrusSearch' );
118        $similarity = $config->getProfileService()->loadProfile( SearchProfileService::SIMILARITY );
119        if ( !array_key_exists( 'similarity', $similarity ) ) {
120            $similarity['similarity'] = [];
121        }
122        $this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?: new CirrusSearchHookRunner(
123            MediaWikiServices::getInstance()->getHookContainer() );
124        $this->cirrusSearchHookRunner->onCirrusSearchSimilarityConfig( $similarity['similarity'] );
125        $this->similarity = $similarity;
126
127        $this->config = $config;
128    }
129
130    /**
131     * Determine if ascii folding should be used
132     * @param string $language Config language
133     * @return bool true if icu folding should be enabled
134     */
135    public function shouldActivateIcuFolding( $language ) {
136        if ( !$this->isIcuAvailable() || !in_array( 'extra', $this->plugins ) ) {
137            // ICU folding requires the icu plugin and the extra plugin
138            return false;
139        }
140        $in_config = $this->config->get( 'CirrusSearchUseIcuFolding' );
141        // BC code, this config var was originally a simple boolean
142        if ( $in_config === true ) {
143            $in_config = 'yes';
144        }
145        if ( $in_config === false ) {
146            $in_config = 'no';
147        }
148        switch ( $in_config ) {
149        case 'yes':
150            return true;
151        case 'no':
152            return false;
153        case 'default':
154            return $this->languagesWithIcuFolding[$language] ?? false;
155        default:
156            return false;
157        }
158    }
159
160    /**
161     * Determine if the icu tokenizer can be enabled
162     * @param string $language Config language
163     * @return bool
164     */
165    public function shouldActivateIcuTokenization( $language ) {
166        if ( !$this->isIcuAvailable() ) {
167            // requires the icu plugin
168            return false;
169        }
170        $in_config = $this->config->get( 'CirrusSearchUseIcuTokenizer' );
171        switch ( $in_config ) {
172        case 'yes':
173            return true;
174        case 'no':
175            return false;
176        case 'default':
177            return $this->languagesWithIcuTokenization[$language] ?? false;
178        default:
179            return false;
180        }
181    }
182
183    /**
184     * Build the analysis config.
185     *
186     * @param string|null $language Config language
187     * @return array the analysis config
188     */
189    public function buildConfig( $language = null ) {
190        $language ??= $this->defaultLanguage;
191        $config = $this->customize( $this->defaults( $language ), $language );
192        $this->cirrusSearchHookRunner->onCirrusSearchAnalysisConfig( $config, $this );
193        $config = $this->enableGlobalCustomFilters( $config, $language );
194        if ( $this->shouldActivateIcuTokenization( $language ) ) {
195            $config = $this->enableICUTokenizer( $config );
196        }
197        if ( $this->shouldActivateIcuFolding( $language ) ) {
198            $config = $this->enableICUFolding( $config, $language );
199        }
200        $config = $this->fixAsciiFolding( $config );
201        $config = $this->standardTokenizerOnlyCleanup( $config );
202
203        return $config;
204    }
205
206    /**
207     * @return array|null the similarity config
208     */
209    public function buildSimilarityConfig() {
210        return $this->similarity['similarity'] ?? null;
211    }
212
213    /**
214     * replace the standard tokenizer with icu_tokenizer
215     * @param mixed[] $config
216     * @return mixed[] update config
217     */
218    public function enableICUTokenizer( array $config ) {
219        foreach ( $config[ 'analyzer' ] as $name => &$value ) {
220            if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) {
221                continue;
222            }
223            if ( isset( $value[ 'tokenizer' ] ) && $value[ 'tokenizer' ] === 'standard' ) {
224                $value[ 'tokenizer' ] = 'icu_tokenizer';
225            }
226        }
227        return $config;
228    }
229
230    /**
231     * replace STANDARD_TOKENIZER_ONLY with the actual standard tokenizer
232     * @param mixed[] $config
233     * @return mixed[] update config
234     */
235    public function standardTokenizerOnlyCleanup( array $config ) {
236        foreach ( $config[ 'analyzer' ] as $name => &$value ) {
237            if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) {
238                continue;
239            }
240            if ( isset( $value[ 'tokenizer' ] ) &&
241                    $value[ 'tokenizer' ] === self::STANDARD_TOKENIZER_ONLY ) {
242                // if we blocked upgrades/changes to the standard tokenizer,
243                // replace the magic value with the actual standard tokenizer
244                $value[ 'tokenizer' ] = 'standard';
245            }
246        }
247        return $config;
248    }
249
250    /**
251     * Activate ICU folding instead of asciifolding
252     * @param mixed[] $config
253     * @param string $language Config language
254     * @return mixed[] update config
255     */
256    public function enableICUFolding( array $config, $language ) {
257        $unicodeSetFilter = $this->getICUSetFilter( $language );
258        $filter = [
259            'type' => 'icu_folding',
260        ];
261        if ( !empty( $unicodeSetFilter ) ) {
262            $filter[ 'unicodeSetFilter' ] = $unicodeSetFilter;
263        }
264        $config[ 'filter' ][ 'icu_folding' ] = $filter;
265
266        // Adds a simple nfkc normalizer for cases where
267        // we preserve original but the lowercase filter
268        // is not used before
269        $config[ 'filter' ][ 'icu_nfkc_normalization' ] = [
270            'type' => 'icu_normalizer',
271            'name' => 'nfkc',
272        ];
273
274        $newfilters = [];
275        foreach ( $config[ 'analyzer' ] as $name => $value ) {
276            if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) {
277                continue;
278            }
279            if ( !isset( $value[ 'filter' ] ) ) {
280                continue;
281            }
282            if ( in_array( 'asciifolding', $value[ 'filter' ] ) ) {
283                $newfilters[ $name ] = $this->switchFiltersToICUFolding( $value[ 'filter' ] );
284            }
285            if ( in_array( 'asciifolding_preserve', $value[ 'filter' ] ) ) {
286                $newfilters[ $name ] = $this->switchFiltersToICUFoldingPreserve( $value[ 'filter' ] );
287            }
288        }
289
290        foreach ( $newfilters as $name => $filters ) {
291            $config[ 'analyzer' ][ $name ][ 'filter' ] = $filters;
292        }
293        // Explicitly enable icu_folding on plain analyzers if it's not
294        // already enabled
295        foreach ( [ 'plain' ] as $analyzer ) {
296            if ( !isset( $config[ 'analyzer' ][ $analyzer ] ) ) {
297                continue;
298            }
299            if ( !isset( $config[ 'analyzer' ][ $analyzer ][ 'filter' ] ) ) {
300                $config[ 'analyzer' ][ $analyzer ][ 'filter' ] = [];
301            }
302            $config[ 'analyzer' ][ $analyzer ][ 'filter' ] =
303                $this->switchFiltersToICUFoldingPreserve(
304                    // @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset
305                    $config[ 'analyzer' ][ $analyzer ][ 'filter' ], true );
306        }
307
308        return $config;
309    }
310
311    /**
312     * Replace occurrence of asciifolding to icu_folding
313     * @param string[] $filters
314     * @return string[] new list of filters
315     */
316    private function switchFiltersToICUFolding( array $filters ) {
317        array_splice( $filters, array_search( 'asciifolding', $filters ), 1,
318            [ 'icu_folding', 'remove_empty' ] );
319        return $filters;
320    }
321
322    /**
323     * Replace occurrence of asciifolding_preserve with a set
324     * of compatible filters to enable icu_folding
325     * @param string[] $filters
326     * @param bool $append append icu_folding even if asciifolding is not present
327     * @return string[] new list of filters
328     */
329    private function switchFiltersToICUFoldingPreserve( array $filters, $append = false ) {
330        if ( in_array( 'icu_folding', $filters ) ) {
331            // ICU folding already here
332            return $filters;
333        }
334        $ap_idx = array_search( 'asciifolding_preserve', $filters );
335        if ( $ap_idx === false && $append ) {
336            $ap_idx = count( $filters );
337            // fake an asciifolding_preserve so we can
338            // reuse code that replaces it
339            $filters[] = 'asciifolding_preserve';
340        }
341        if ( $ap_idx === false ) {
342            return $filters;
343        }
344