Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
99.39% covered (success)
99.39%
983 / 989
76.92% covered (warning)
76.92%
20 / 26
CRAP
0.00% covered (danger)
0.00%
0 / 1
AnalysisConfigBuilder
99.39% covered (success)
99.39%
983 / 989
76.92% covered (warning)
76.92%
20 / 26
218
0.00% covered (danger)
0.00%
0 / 1
 __construct
96.30% covered (success)
96.30%
26 / 27
0.00% covered (danger)
0.00%
0 / 1
8
 shouldActivateIcuFolding
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
9
 shouldActivateIcuTokenization
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
7
 buildConfig
100.00% covered (success)
100.00%
13 / 13
100.00% covered (success)
100.00%
1 / 1
4
 buildSimilarityConfig
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 enableICUTokenizer
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
6
 standardTokenizerOnlyCleanup
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
6
 disableLimitedMappings
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
4
 enableICUFolding
100.00% covered (success)
100.00%
32 / 32
100.00% covered (success)
100.00%
1 / 1
12
 switchFiltersToICUFolding
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 switchFiltersToICUFoldingPreserve
94.44% covered (success)
94.44%
17 / 18
0.00% covered (danger)
0.00%
0 / 1
7.01
 getICUSetFilter
98.00% covered (success)
98.00%
49 / 50
0.00% covered (danger)
0.00%
0 / 1
29
 getICUNormSetFilter
80.00% covered (warning)
80.00%
4 / 5
0.00% covered (danger)
0.00%
0 / 1
4.13
 defaults
100.00% covered (success)
100.00%
263 / 263
100.00% covered (success)
100.00%
1 / 1
7
 customize
100.00% covered (success)
100.00%
430 / 430
100.00% covered (success)
100.00%
1 / 1
71
 fixAsciiFolding
100.00% covered (success)
100.00%
16 / 16
100.00% covered (success)
100.00%
1 / 1
7
 getDefaultTextAnalyzerType
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
2
 getDefaultFilters
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
5
 resolveFilters
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
4
 replaceFilter
87.50% covered (warning)
87.50%
7 / 8
0.00% covered (danger)
0.00%
0 / 1
4.03
 mergeConfig
100.00% covered (success)
100.00%
17 / 17
100.00% covered (success)
100.00%
1 / 1
12
 buildLanguageConfigs
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
4
 isIcuAvailable
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 isTextifyAvailable
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 enableGlobalCustomFilters
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 buildGlobalCustomFilters
100.00% covered (success)
100.00%
29 / 29
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3namespace CirrusSearch\Maintenance;
4
5use CirrusSearch\CirrusSearch;
6use CirrusSearch\CirrusSearchHookRunner;
7use CirrusSearch\Profile\SearchProfileService;
8use CirrusSearch\SearchConfig;
9use MediaWiki\MediaWikiServices;
10
11/**
12 * Builds elasticsearch analysis config arrays.
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License along
25 * with this program; if not, write to the Free Software Foundation, Inc.,
26 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
27 * http://www.gnu.org/copyleft/gpl.html
28 */
29class AnalysisConfigBuilder {
30    /**
31     * Version number for the core analysis. Increment the major
32     * version when the analysis changes in an incompatible way,
33     * and change the minor version when it changes but isn't
34     * incompatible.
35     *
36     * You may also need to increment MetaStoreIndex::METASTORE_VERSION
37     * manually as well.
38     */
39    public const VERSION = '0.12';
40
41    /**
42     * Maximum number of characters allowed in keyword terms.
43     */
44    private const KEYWORD_IGNORE_ABOVE = 5000;
45
46    /**
47     * Temporary magic value to prevent enabling ICU tokenizer in specific analyzers
48     */
49    private const STANDARD_TOKENIZER_ONLY = 'std_only';
50
51    /**
52     * @var bool is the icu plugin available?
53     */
54    private $icu;
55
56    /**
57     * @var bool is the textify plugin available?
58     */
59    private $textify;
60
61    /**
62     * @var string which ICU tokenizer should be used
63     */
64    private $icu_tokenizer = 'icu_tokenizer';
65
66    /**
67     * @var array Similarity algo (tf/idf, bm25, etc) configuration
68     */
69    private $similarity;
70
71    /**
72     * @var SearchConfig cirrus config
73     */
74    protected $config;
75
76    /**
77     * @var string[]
78     */
79    private $plugins;
80
81    /**
82     * @var string
83     */
84    protected $defaultLanguage;
85
86    /**
87     * @var CirrusSearchHookRunner
88     */
89    private $cirrusSearchHookRunner;
90
91    /**
92     * @var GlobalCustomFilter[]
93     */
94    public $globalCustomFilters;
95
96    /**
97     * @param string $langCode The language code to build config for
98     * @param string[] $plugins list of plugins installed in Elasticsearch
99     * @param SearchConfig|null $config
100     * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner
101     */
102    public function __construct(
103        $langCode,
104        array $plugins,
105        SearchConfig $config = null,
106        CirrusSearchHookRunner $cirrusSearchHookRunner = null
107    ) {
108        $this->globalCustomFilters = $this->buildGlobalCustomFilters();
109
110        $this->defaultLanguage = $langCode;
111        $this->plugins = $plugins;
112        foreach ( $this->elasticsearchLanguageAnalyzersFromPlugins as $pluginSpec => $extra ) {
113            $pluginsPresent = 1;
114            $pluginList = explode( ',', $pluginSpec );
115            foreach ( $pluginList as $plugin ) {
116                if ( !in_array( $plugin, $plugins ) ) {
117                    $pluginsPresent = 0;
118                    break;
119                }
120            }
121            if ( $pluginsPresent ) {
122                $this->elasticsearchLanguageAnalyzers =
123                    array_merge( $this->elasticsearchLanguageAnalyzers, $extra );
124            }
125        }
126        $this->icu = in_array( 'analysis-icu', $plugins );
127        $this->textify = in_array( 'extra-analysis-textify', $plugins );
128        if ( $this->isTextifyAvailable() ) {
129            // icu_token_repair can only work with the textify icu_tokenizer clone
130            $this->icu_tokenizer = 'textify_icu_tokenizer';
131        }
132        $config ??= MediaWikiServices::getInstance()->getConfigFactory()
133            ->makeConfig( 'CirrusSearch' );
134        $similarity = $config->getProfileService()->loadProfile( SearchProfileService::SIMILARITY );
135        if ( !array_key_exists( 'similarity', $similarity ) ) {
136            $similarity['similarity'] = [];
137        }
138        $this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?: new CirrusSearchHookRunner(
139            MediaWikiServices::getInstance()->getHookContainer() );
140        $this->cirrusSearchHookRunner->onCirrusSearchSimilarityConfig( $similarity['similarity'] );
141        $this->similarity = $similarity;
142
143        $this->config = $config;
144    }
145
    /**
     * Determine if ICU folding should be used
     * @param string $language Config language
     * @return bool true if icu folding should be enabled
     */
151    public function shouldActivateIcuFolding( $language ) {
152        if ( !$this->isIcuAvailable() || !in_array( 'extra', $this->plugins ) ) {
153            // ICU folding requires the icu plugin and the extra plugin
154            return false;
155        }
156        $in_config = $this->config->get( 'CirrusSearchUseIcuFolding' );
157        // BC code, this config var was originally a simple boolean
158        if ( $in_config === true ) {
159            $in_config = 'yes';
160        }
161        if ( $in_config === false ) {
162            $in_config = 'no';
163        }
164        switch ( $in_config ) {
165            case 'yes':
166                return true;
167            case 'no':
168                return false;
169            case 'default':
170                return $this->languagesWithIcuFolding[$language] ?? false;
171            default:
172                return false;
173        }
174    }
175
176    /**
177     * Determine if the icu_tokenizer can replace the standard tokenizer for this language
178     * @param string $language Config language
179     * @return bool
180     */
181    public function shouldActivateIcuTokenization( $language ) {
182        if ( !$this->isIcuAvailable() && !$this->isTextifyAvailable() ) {
183            // requires the icu or textify plugin
184            return false;
185        }
186        $in_config = $this->config->get( 'CirrusSearchUseIcuTokenizer' );
187        switch ( $in_config ) {
188            case 'yes':
189                return true;
190            case 'no':
191                return false;
192            case 'default':
193                // languagesWithIcuTokenization[] gives absolute answers for specific languages.
194                // If the textify plugin is available, the default is 'yes'/true because we
195                // have icu_token_repair available; if not, the default is 'no'/false
196                return $this->languagesWithIcuTokenization[$language] ?? $this->isTextifyAvailable();
197            default:
198                return false;
199        }
200    }
201
202    /**
203     * Build the analysis config.
204     *
205     * @param string|null $language Config language
206     * @return array the analysis config
207     */
208    public function buildConfig( $language = null ) {
209        $language ??= $this->defaultLanguage;
210        $config = $this->customize( $this->defaults( $language ), $language );
211        $this->cirrusSearchHookRunner->onCirrusSearchAnalysisConfig( $config, $this );
212
213        if ( $this->shouldActivateIcuTokenization( $language ) ) {
214            $config = $this->enableICUTokenizer( $config );
215        }
216
217        if ( $this->shouldActivateIcuFolding( $language ) ) {
218            $config = $this->enableICUFolding( $config, $language );
219        }
220        $config = $this->fixAsciiFolding( $config );
221        $config = $this->standardTokenizerOnlyCleanup( $config );
222        if ( !$this->isTextifyAvailable() ) {
223            $config = $this->disableLimitedMappings( $config );
224        }
225
226        // should come after other upgrades to get the full context
227        $config = $this->enableGlobalCustomFilters( $config, $language );
228
229        return $config;
230    }
231
232    /**
233     * @return array|null the similarity config
234     */
235    public function buildSimilarityConfig() {
236        return $this->similarity['similarity'] ?? null;
237    }
238
239    /**
240     * replace the standard tokenizer with icu_tokenizer
241     * @param mixed[] $config
242     * @return mixed[] update config
243     */
244    public function enableICUTokenizer( array $config ) {
245        foreach ( $config[ 'analyzer' ] as $name => &$value ) {
246            if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) {
247                continue;
248            }
249            if ( isset( $value[ 'tokenizer' ] ) && $value[ 'tokenizer' ] === 'standard' ) {
250                $value[ 'tokenizer' ] = $this->icu_tokenizer;
251            }
252        }
253        return $config;
254    }
255
256    /**
257     * replace STANDARD_TOKENIZER_ONLY with the actual standard tokenizer
258     * @param mixed[] $config
259     * @return mixed[] update config
260     */
261    public function standardTokenizerOnlyCleanup( array $config ) {
262        foreach ( $config[ 'analyzer' ] as $name => &$value ) {
263            if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) {
264                continue;
265            }
266            if ( isset( $value[ 'tokenizer' ] ) &&
267                    $value[ 'tokenizer' ] === self::STANDARD_TOKENIZER_ONLY ) {
268                // if we blocked upgrades/changes to the standard tokenizer,
269                // replace the magic value with the actual standard tokenizer
270                $value[ 'tokenizer' ] = 'standard';
271            }
272        }
273        return $config;
274    }
275
276    /**
277     * replace limited_mappings with mappings if limited_mapping is unavailable
278     * @param mixed[] $config
279     * @return mixed[] update config
280     */
281    public function disableLimitedMappings( array $config ) {
282        foreach ( $config[ 'char_filter' ] as $name => &$value ) {
283            if ( !isset( $value[ 'type' ] ) || $value[ 'type' ] != 'limited_mapping' ) {
284                continue;
285            }
286            $value[ 'type' ] = 'mapping';
287        }
288        return $config;
289    }
290
291    /**
292     * Activate ICU folding instead of asciifolding
293     * @param mixed[] $config
294     * @param string $language Config language
295     * @return mixed[] update config
296     */
297    public function enableICUFolding( array $config, $language ) {
298        $unicodeSetFilter = $this->getICUSetFilter( $language );
299        $filter = [
300            'type' => 'icu_folding',
301        ];
302        if ( $unicodeSetFilter !== null ) {
303            $filter[ 'unicodeSetFilter' ] = $unicodeSetFilter;
304        }
305        $config[ 'filter' ][ 'icu_folding' ] = $filter;
306
307        // Adds a simple nfkc normalizer for cases where
308        // we preserve original but the lowercase filter
309        // is not used before
310        $config[ 'filter' ][ 'icu_nfkc_normalization' ] = [
311            'type' => 'icu_normalizer',
312            'name' => 'nfkc',
313        ];
314
315        $newfilters = [];
316        foreach ( $config[ 'analyzer' ] as $name => $value ) {
317            if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) {
318                continue;
319            }
320            if ( !isset( $value[ 'filter' ] ) ) {
321                continue;
322            }
323            if ( in_array( 'asciifolding', $value[ 'filter' ] ) ) {
324                $newfilters[ $name ] = $this->switchFiltersToICUFolding( $value[ 'filter' ] );
325            }
326            if ( in_array( 'asciifolding_preserve', $value[ 'filter' ] ) ) {
327                $newfilters[ $name ] = $this->switchFiltersToICUFoldingPreserve( $value[ 'filter' ] );
328            }
329        }
330
331        foreach ( $newfilters as $name => $filters ) {
332            $config[ 'analyzer' ][ $name ][ 'filter' ] = $filters;
333        }
334        // Explicitly enable icu_folding on plain analyzers if it's not
335        // already enabled
336        foreach ( [ 'plain' ] as $analyzer ) {
337            if ( !isset( $config[ 'analyzer' ][ $analyzer ] ) ) {
338                continue;
339            }
340            if ( !isset( $config[ 'analyzer' ][ $analyzer ][ 'filter' ] ) ) {
341                $config[ 'analyzer' ][ $analyzer ][ 'filter' ] = [];
342            }
343            $config[ 'analyzer' ][ $analyzer ][ 'filter' ] =
344                $this->switchFiltersToICUFoldingPreserve(
345                    // @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset
346                    $config[ 'analyzer' ][ $analyzer ][ 'filter' ], true );
347        }
348
349        return $config;
350    }
351
    /**
     * Replace the first occurrence of asciifolding with icu_folding
     * followed by remove_empty
     * @param string[] $filters
     * @return string[] new list of filters
     */
357    private function switchFiltersToICUFolding( array $filters ) {
358        array_splice( $filters, array_search( 'asciifolding', $filters ), 1,
359            [ 'icu_folding', 'remove_empty' ] );
360        return $filters;
361    }
362
363    /**
364     * Replace occurrence of asciifolding_preserve with a set
365     * of compatible filters to enable icu_folding
366     * @param string[] $filters
367     * @param bool $append append icu_folding even if asciifolding is not present
368     * @return string[] new list of filters
369     */
370    private function switchFiltersToICUFoldingPreserve( array $filters, $append = false ) {
371        if ( in_array( 'icu_folding', $filters ) ) {
372            // ICU folding already here
373            return $filters;
374        }
375        $ap_idx = array_search( 'asciifolding_preserve', $filters );
376        if ( $ap_idx === false && $append ) {
377            $ap_idx = count( $filters );
378            // fake an asciifolding_preserve so we can
379            // reuse code that replaces it
380            $filters[] = 'asciifolding_preserve';
381        }
382        if ( $ap_idx === false ) {
383            return $filters;
384        }
385        // with ICU lowercase is replaced by icu_normalizer/nfkc_cf
386        // thus unicode normalization is already done.
387        $lc_idx = array_search( 'icu_normalizer', $filters );
388        $newfilters = [];
389        if ( $lc_idx === false || $lc_idx > $ap_idx ) {
390            // If lowercase is not detected before we
391            // will have to do some icu normalization
392            // this is to prevent preserving "un-normalized"
393            // unicode chars.
394            $newfilters[] = 'icu_nfkc_normalization';
395        }
396        $newfilters[] = 'preserve_original_recorder';
397        $newfilters[] = 'icu_folding';
398        $newfilters[] = 'preserve_original';
399        $newfilters[] = 'remove_empty';
400        array_splice( $filters, $ap_idx, 1, $newfilters );
401        return $filters;
402    }
403
404    /**
405     * Return the list of chars to exclude from ICU folding
406     * @param string $language Config language
407     * @return null|string
408     */
409    protected function getICUSetFilter( $language ) {
410        if ( $this->config->get( 'CirrusSearchICUFoldingUnicodeSetFilter' ) !== null ) {
411            return $this->config->get( 'CirrusSearchICUFoldingUnicodeSetFilter' );
412        }
413        switch ( $language ) {
414            /* @todo: complete the default filters per language
415             *
416             * For Slovak (sk)—which has no folding configured here!—see:
417             *   https://www.mediawiki.org/wiki/User:TJones_(WMF)/T223787
418             *
419             * Exceptions are generally listed as Unicode characters for ease of
420             *   inspection. However, combining characters (such as for Thai (th))
421             *   are \u encoded to prevent problems with display or editing
422             */
423            case 'bg': // T325090
424                return '[^Йй]';
425            case 'bs': // T192395
426            case 'hr': // T192395
427            case 'sh': // T192395
428            case 'sr': // T183015
429                return '[^ĐđŽžĆ抚Čč]';
430            case 'cs': // T284578
431                return '[^ÁáČčĎďÉéĚěÍíŇňÓóŘřŠšŤťÚúŮůÝýŽž]';
432            case 'da': // T283366
433                return '[^ÆæØøÅå]';
434            case 'de': // T281379
435                return '[^ÄäÖöÜüẞß]';
436            case 'eo': // T202173
437                return '[^ĈĉĜĝĤĥĴĵŜŝŬŭ]';
438            case 'es': // T277699
439                return '[^Ññ]';
440            case 'et': // T332322
441                return '[^ŠšŽžÕõÄäÖöÜü]';
442            case 'eu': // T283366
443                return '[^Ññ]';
444            case 'fi': // T284578
445                return '[^ÅåÄäÖö]';
446            case 'gl': // T284578
447                return '[^Ññ]';
448            case 'hu': // T325089
449                return '[^ÁáÉéÍíÓóÖöŐőÚúÜüŰű]';
450            case 'ja': // T326822
451                // This range includes characters that don't currently get ICU folded, in
452                // order to keep the overall regex a lot simpler. The specific targets are
453                // characters with dakuten and handakuten, the separate (han)dakuten
454                // characters (regular and combining) and the prolonged sound mark (chōonpu).
455                return '[^が-ヾ]';
456            case 'lt': // T325090
457                return '[^ĄąČčĘęĖėĮįŠšŲųŪūŽž]';
458            case 'lv': // T325089
459                return '[^ĀāČčĒēĢģĪīĶķĻļŅņŠšŪūŽž]';
460            case 'nb': // T289612
461            case 'nn': // T289612
462            case 'no':
463                return '[^ÆæØøÅå]';
464            case 'ro': // T325091
465                // including s&t with cedilla because we (have to) use it internally T330893
466                return '[^ĂăÂâÎîȘșȚțŞşŢţ]';
467            case 'ru':
468                return '[^Йй]';
469            case 'sv': // T160562
470                return '[^ÅåÄäÖö]';
471            case 'th': // T294147
472                return '[^\u0E47-\u0E4E]';
473            case 'tr': // T329762
474                // (I and i aren't strictly necessary but they keep the Turkish upper/lower
475                // pairs Iı & İi together and makes it clear both are intended.)
476                return '[^ÇçĞğIıİiÖöŞşÜü]';
477            default:
478                return null;
479        }
480    }
481
482    /**
483     * Return the list of chars to exclude from ICU normalization
484     * @param string $language Config language
485     * @return null|string
486     */
487    protected function getICUNormSetFilter( $language ) {
488        if ( $this->config->get( 'CirrusSearchICUNormalizationUnicodeSetFilter' ) !== null ) {
489            return $this->config->get( 'CirrusSearchICUNormalizationUnicodeSetFilter' );
490        }
491        switch ( $language ) {
492            /* For German (de), see T281379
493             */
494            case 'de':
495                return '[^ẞß]'; // Capital ẞ is lowercased to ß by german_charfilter
496                                // lowercase ß is normalized to ss by german_normalization
497            default:
498                return null;
499        }
500    }
501
502    /**
503     * Build an analysis config with sane defaults.
504     *
505     * @param string $language Config language
506     * @return array
507     */
508    private function defaults( $language ) {
509        $defaults = [
510            'analyzer' => [
511                'text' => [
512                    'type' => $this->getDefaultTextAnalyzerType( $language ),
513                ],
514                // text_search is not configured here because it will be copied from text
515                'plain' => [
516                    // Surprisingly, the Lucene docs claim this works for
517                    // Chinese, Japanese, and Thai as well.
518                    // The difference between this and the 'standard'
519                    // analyzer is the lack of english stop words.
520                    'type' => 'custom',
521                    'char_filter' => [ 'nnbsp_norm', 'word_break_helper' ],
522                    'tokenizer' => 'standard',
523                    'filter' => [ 'lowercase' ],
524                ],
525                'plain_search' => [
526                    // In accent squashing languages this will not contain accent
527                    // squashing to allow searches with accents to only find accents
528                    // and searches without accents to find both.
529                    'type' => 'custom',
530                    'char_filter' => [ 'nnbsp_norm', 'word_break_helper' ],
531                    'tokenizer' => 'standard',
532                    'filter' => [ 'lowercase' ],
533                ],
534                // Used by ShortTextIndexField
535                'short_text' => [
536                    'type' => 'custom',
537                    'tokenizer' => 'whitespace',
538                    'filter' => [ 'lowercase', 'aggressive_splitting', 'asciifolding_preserve' ],
539                ],
540                'short_text_search' => [
541                    'type' => 'custom',
542                    'tokenizer' => 'whitespace',
543                    'filter' => [ 'lowercase', 'aggressive_splitting' ],
544                ],
545                'source_text_plain' => [
546                    'type' => 'custom',
547                    'char_filter' => [ 'word_break_helper_source_text' ],
548                    'tokenizer' => 'standard',
549                    'filter' => [ 'lowercase' ],
550                ],
551                'source_text_plain_search' => [
552                    'type' => 'custom',
553                    'char_filter' => [ 'word_break_helper_source_text' ],
554                    'tokenizer' => 'standard',
555                    'filter' => [ 'lowercase' ],
556                ],
557                'suggest' => [
558                    'type' => 'custom',
559                    'tokenizer' => 'standard',
560                    'filter' => [ 'lowercase', 'suggest_shingle' ],
561                ],
562                'suggest_reverse' => [
563                    'type' => 'custom',
564                    'tokenizer' => 'standard',
565                    'filter' => [ 'lowercase', 'suggest_shingle', 'reverse' ],
566                ],
567                'token_reverse' => [
568                    'type' => 'custom',
569                    'tokenizer' => 'no_splitting',
570                    'filter' => [ 'reverse' ]
571                ],
572                'near_match' => [
573                    'type' => 'custom',
574                    'char_filter' => [ 'near_space_flattener' ],
575                    'tokenizer' => 'no_splitting',
576                    'filter' => [ 'lowercase' ],
577                ],
578                'near_match_asciifolding' => [
579                    'type' => 'custom',
580                    'char_filter' => [ 'near_space_flattener' ],
581                    'tokenizer' => 'no_splitting',
582                    'filter' => [ 'truncate_keyword', 'lowercase', 'asciifolding' ],
583                ],
584                'prefix' => [
585                    'type' => 'custom',
586                    'char_filter' => [ 'near_space_flattener' ],
587                    'tokenizer' => 'prefix',
588                    'filter' => [ 'lowercase' ],
589                ],
590                'prefix_asciifolding' => [
591                    'type' => 'custom',
592                    'char_filter' => [ 'near_space_flattener' ],
593                    'tokenizer' => 'prefix',
594                    'filter' => [ 'lowercase', 'asciifolding' ],
595                ],
596                'word_prefix' => [
597                    'type' => 'custom',
598                    'tokenizer' => 'standard',
599                    'filter' => [ 'lowercase', 'prefix_ngram_filter' ],
600                ],
601                'keyword' => [
602                    'type' => 'custom',
603                    'tokenizer' => 'no_splitting',
604                    'filter' => [ 'truncate_keyword' ],
605                ],
606                'lowercase_keyword' => [
607                    'type' => 'custom',
608                    'tokenizer' => 'no_splitting',
609                    'filter' => [ 'truncate_keyword', 'lowercase' ],
610                ],
611                'trigram' => [
612                    'type' => 'custom',
613                    'tokenizer' => 'trigram',
614                    'filter' => [ 'lowercase' ],
615                ],
616            ],
617            'filter' => [
618                'suggest_shingle' => [
619                    'type' => 'shingle',
620                    'min_shingle_size' => 2,
621                    'max_shingle_size' => 3,
622                    'output_unigrams' => true,
623                ],
624                'lowercase' => [
625                    'type' => 'lowercase',
626                ],
627                'aggressive_splitting' => [
628                    'type' => 'word_delimiter_graph',
629                    'stem_english_possessive' => false,
630                    'preserve_original' => false
631                ],
632                'prefix_ngram_filter' => [
633                    'type' => 'edgeNGram',
634                    'max_gram' => CirrusSearch::MAX_TITLE_SEARCH,
635                ],
636                'asciifolding' => [
637                    'type' => 'asciifolding',
638                    'preserve_original' => false
639                ],
640                'asciifolding_preserve' => [
641                    'type' => 'asciifolding',
642                    'preserve_original' => true
643                ],
644                // The 'keyword' type in ES seems like a hack
645                // and doesn't allow normalization (like lowercase)
646                // prior to 5.2. Instead we consistently use 'text'
647                // and truncate where necessary.
648                'truncate_keyword' => [
649                    'type' => 'truncate',
650                    'length' => self::KEYWORD_IGNORE_ABOVE,
651                ],
652                'remove_empty' => [
653                    'type' => 'length',
654                    'min' => 1,
655                ],
656            ],
657            'tokenizer' => [
658                'prefix' => [
659                    'type' => 'edgeNGram',
660                    'max_gram' => CirrusSearch::MAX_TITLE_SEARCH,
661                ],
662                'no_splitting' => [ // Just grab the whole term.
663                    'type' => 'keyword',
664                ],
665                'trigram' => [
666                    'type' => 'nGram',
667                    'min_gram' => 3,
668                    'max_gram' => 3,
669                ],
670            ],
671            'char_filter' => [
672                // Flattens things that are space like to spaces in the near_match style analyzers
673                'near_space_flattener' => [
674                    'type' => 'limited_mapping',
675                    'mappings' => [
676                        "'=>\u0020", // Useful for finding names
677                        '\u2019=>\u0020', // Unicode right single quote
678                        '\u02BC=>\u0020', // Unicode modifier letter apostrophe
679                        '_=>\u0020', // MediaWiki loves _ and people are used to it but it
680                                     // usually means space
681                        '-=>\u0020', // Useful for finding hyphenated names unhyphenated
682                    ],
683                ],
684                // map narrow no-break space to plain space to compensate for ES6.x+
685                // analyzers generally not doing so
686                'nnbsp_norm' => [
687                    'type' => 'limited_mapping',
688                    'mappings' => [
689                        '\u202F=>\u0020',
690                    ],
691                ],
692                // Add a space between lowercase letter {Ll} and uppercase {Lu} or
693                // titlecase {Lt} letter, allowing for optional combining marks {M}
694                // or invisibles {Cf}. This is expensive, so use camelCase_splitter
695                // in extra-analysis-textify instead, if available (T219108/T346051)
696                'regex_camelCase' => [
697                    'type' => 'pattern_replace',
698                    'pattern' => '(\\p{Ll}[\\p{M}\\p{Cf}]*)([\\p{Lu}\\p{Lt}])',
699                    'replacement' => '$1 $2'
700                ],
701                // Replace period (regular or fullwidth) between [non-letter +
702                // letter] and [letter + non-letter]. This is slow, and also only
703                // handles the simplest case. Use acronym_fixer in
704                // extra-analysis-textify instead, if available (T170625/T346051)
705                'regex_acronym_fixer' => [
706                    'type' => 'pattern_replace',
707                    'pattern' => '(?<=(?:^|\\P{L})\\p{L})[..](\\p{L})(?=\\P{L}|$)',
708                    'replacement' => '$1'
709                ],
710                // combine universally-applied mappings into one mapping to save on the
711                // overhead of calling multiple mappings
712                'globo_norm' => [
713                    'type' => 'limited_mapping',
714                    'mappings' => [
715                        // map lots of apostrophe-like characters to apostrophe (T315118);
716                        // formerly apostrophe_norm
717                        "`=>'", // grave accent
718                        "´=>'", // acute accent
719                        "ʹ=>'", // modifier letter prime
720                        "ʻ=>'", // modifier letter turned comma
721                        "ʼ=>'", // modifier letter apostrophe
722                        "ʽ=>'", // modifier letter reversed comma
723                        "ʾ=>'", // modifier letter right half ring
724                        "ʿ=>'", // modifier letter left half ring
725                        "ˋ=>'", // modifier letter grave accent
726                        "՚=>'", // Armenian apostrophe
727                        "\u05F3=>'", // Hebrew punctuation geresh
728                        "‘=>'", // left single quotation mark
729                        "’=>'", // right single quotation mark
730                        "‛=>'", // single high-reversed-9 quotation mark
731                        "′=>'", // prime
732                        "‵=>'", // reversed prime
733                        "ꞌ=>'", // Latin small letter saltillo
734                        "'=>'", // fullwidth apostrophe
735                        "`=>'", // fullwidth grave accent
736                        // map narrow no-break space to plain space to compensate for ES6.x+
737                        // analyzers generally not doing so; copied from nnbsp_norm, which
738                        // is still needed elsewhere
739                        '\u202F=>\u0020',
740                        // Delete primary and secondary stress markers, which are
741                        // inconsistently used across phonetic transcriptions
742                        "ˈ=>", // modifier letter vertical line
743                        "ˌ=>", // modifier letter low vertical line
744                        // Delete Arabic tatweel (ـ) (used largely for cosmetic purposes)
745                        "\u0640=>", // tatweel
746                        // Convert Arabic thousand separator and Arabic comma to comma for
747                        // more consistent number parsing
748                        "٬=>,", // Arabic thousands separator
749                        "،=>,", // Arabic comma
750                        // delete Armenian emphasis marks, exclamation marks, and question
751                        // marks, since they modify words rather than follow them.
752                        "՛=>", // Armenian emphasis mark
753                        "՜=>", // Armenian exclamation mark
754                        "՞=>", // Armenian question mark
755                        // micro sign to mu, to prevent some unneeded ICU tokenizer splits
756                        // icu_normalizer does this, too, just later
757                        "µ=>μ",
758                    ],
759                ],
760                // Converts things that don't always count as word breaks into spaces
761                // which (almost) always count as word breaks (e.g., the Nori and SmartCN
762                // tokenizers do not always count spaces as word breaks!)
763                'word_break_helper' => [
764                    'type' => 'limited_mapping',
765                    'mappings' => [
766                        '_=>\u0020',
767                        ':=>\u0020',
768                        // These are more useful for code:
769                        '.=>\u0020',
770                        '(=>\u0020',
771                        ')=>\u0020',
772                        // fullwidth variants
773                        '.=>\u0020',
774                        '_=>\u0020',
775                        ':=>\u0020',
776                        // middle dot
777                        '·=>\u0020',
778                    ],
779                ],
780                'word_break_helper_source_text' => [
781                    'type' => 'limited_mapping',
782                    'mappings' => [
783                        '_=>\u0020',
784                        // These are more useful for code:
785                        '.=>\u0020',
786                        '(=>\u0020',
787                        ')=>\u0020',
788                        ':=>\u0020', // T145023
789                    ],
790                ],
791                'dotted_I_fix' => [
792                    // A common regression caused by unpacking is that İ is no longer
793                    // treated correctly, so specify the mapping just once and re-use
794                    // in analyzer/text/char_filter as needed.
795                    'type' => 'limited_mapping',
796                    'mappings' => [
797                        'İ=>I',
798                    ],
799                ],
800            ],
801        ];
802        foreach ( $defaults[ 'analyzer' ] as &$analyzer ) {
803            if ( $analyzer[ 'type' ] === 'default' ) {
804                $analyzer = [
805                    'type' => 'custom',
806                    'tokenizer' => 'standard',
807                    'filter' => [ 'lowercase' ],
808                ];
809            }
810        }
811        if ( $this->isTextifyAvailable() && $this->shouldActivateIcuTokenization( $language ) ) {
812            $defaults[ 'filter' ][ 'icutokrep_no_camel_split' ] = [
813                'type' => 'icu_token_repair',
814                'keep_camel_split' => false
815            ];
816        }
817        if ( $this->isIcuAvailable() ) {
818            $defaults[ 'filter' ][ 'icu_normalizer' ] = [
819                'type' => 'icu_normalizer',
820                'name' => 'nfkc_cf',
821            ];
822            $unicodeSetFilter = $this->getICUNormSetFilter( $language );
823            if ( $unicodeSetFilter !== null ) {
824                $defaults[ 'filter' ][ 'icu_normalizer' ][ 'unicodeSetFilter' ] = $unicodeSetFilter;
825            }
826        }
827
828        return $defaults;
829    }
830
831    /**
832     * Customize the default config for the language.
833     *
834     * @param array $config
835     * @param string $language Config language
836     * @return array
837     */
838    private function customize( $config, $language ) {
839        $langName = $this->getDefaultTextAnalyzerType( $language );
840        switch ( $langName ) {
841            // Please add languages in alphabetical order.
842
843            // usual unpacked languages
844            case 'basque':     // Unpack Basque analyzer T283366
845            case 'brazilian':  // Unpack Brazilian analyzer T325092
846            case 'bulgarian':  // Unpack Bulgarian analyzer T325090
847            case 'czech':      // Unpack Czech analyzer T284578
848            case 'danish':     // Unpack Danish analyzer T283366
849            case 'estonian':   // Unpack Estonian analyzer T332322
850            case 'finnish':    // Unpack Finnish analyzer T284578
851            case 'galician':   // Unpack Galician analyzer T284578
852            case 'hungarian':  // Unpack Hungarian analyzer T325089
853            case 'latvian':    // Unpack Latvian analyzer T325089
854            case 'lithuanian': // Unpack Lithuanian analyzer T325090
855            case 'norwegian':  // Unpack Norwegian analyzer T289612
856                $config = ( new AnalyzerBuilder( $langName ) )->
857                    withUnpackedAnalyzer()->
858                    build( $config );
859                break;
860
861            // usual unpacked languages, with "light" variant stemmer
862            case 'portuguese':  // Unpack Portuguese analyzer T281379
863            case 'spanish':     // Unpack Spanish analyzer T277699
864                $config = ( new AnalyzerBuilder( $langName ) )->
865                    withUnpackedAnalyzer()->
866                    withLightStemmer()->
867                    build( $config );
868                break;
869
870            // customized languages
871            case 'arabic':
872            case 'arabic-egyptian':
873            case 'arabic-moroccan':
874                // Unpack Arabic analyzer T294147
875                $arBuilder = ( new AnalyzerBuilder( 'arabic' ) )->
876                    withUnpackedAnalyzer()->
877                    withDecimalDigit()->
878                    insertFiltersBefore( 'arabic_stemmer', [ 'arabic_normalization' ] );
879
880                // load extra stopwords for Arabic varieties
881                if ( $langName == 'arabic-egyptian' || $langName == 'arabic-moroccan' ) {
882                    $arStopwords = require __DIR__ . '/AnalysisLanguageData/arabicStopwords.php';
883                    $arBuilder->withExtraStop( $arStopwords, 'arz_ary_stop', 'arabic_stop' );
884                }
885
886                $config = $arBuilder->build( $config );
887                break;
888            case 'armenian':  // Unpack Armenian analyzer T325089
889                // char map: Armenian uses ․ ("one-dot leader") about 10% as often as . (period)
890                // stopwords նաև & և get normalized to նաեւ & եւ, so pick those up, too.
891                $config = ( new AnalyzerBuilder( $langName ) )->
892                    withUnpackedAnalyzer()->
893                    withLimitedCharMap( [ '․=>.' ] )->
894                    withExtraStop( [ 'նաեւ', 'եւ' ], 'armenian_norm_stop', 'armenian_stop' )->
895                    build( $config );
896                break;
897            case 'azerbaijani':
898            case 'crimean-tatar':
899            case 'gagauz':
900            case 'kazakh':
901            case 'tatar':
902                // Turkic languages that use I/ı & İ/i, so need Turkish lowercasing
903                $config = ( new AnalyzerBuilder( $langName ) )->
904                    withFilters( [ 'lowercase' ] )->
905                    withLangLowercase( 'turkish' )->
906                    build( $config );
907                break;
908            case 'bengali': // Unpack Bengali analyzer T294067
909                $config = ( new AnalyzerBuilder( $langName ) )->
910                    withUnpackedAnalyzer()->
911                    withDecimalDigit()->
912                    insertFiltersBefore( 'bengali_stop', [ 'indic_normalization' ] )->
913                    build( $config );
914                break;
915            case 'bosnian':
916            case 'croatian':
917            case 'serbian':
918            case 'serbo-croatian':
919                // Unpack default analyzer to add Serbian stemming and custom folding
920                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T183015
921                // and https://www.mediawiki.org/wiki/User:TJones_(WMF)/T192395
922                $config = ( new AnalyzerBuilder( $langName ) )->
923                    withFilters( [ 'lowercase', 'asciifolding', 'serbian_stemmer' ] )->
924                    build( $config );
925                break;
926            case 'catalan':
927                // Unpack Catalan analyzer T283366
928                $config = ( new AnalyzerBuilder( $langName ) )->
929                    withUnpackedAnalyzer()->
930                    withElision( [ 'd', 'l', 'm', 'n', 's', 't' ] )->
931                    build( $config );
932                break;
933            case 'chinese':
934                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T158203
935                $config[ 'char_filter' ][ 'tsconvert' ] = [
936                    'type' => 'stconvert',
937                    'delimiter' => '#',
938                    'keep_both' => false,
939                    'convert_type' => 't2s',
940                ];
941
942                // char map: hack for STConvert errors (still present as of July 2023)
943                // see https://github.com/medcl/elasticsearch-analysis-stconvert/issues/13
944                // stop: SmartCN converts lots of punctuation to ',' but we don't want to index it
945                $config = ( new AnalyzerBuilder( $langName ) )->
946                    withCharMap( [ '\u606d\u5f18=>\u606d \u5f18', '\u5138=>\u3469' ], 'stconvertfix' )->
947                    withCharFilters( [ 'stconvertfix', 'tsconvert' ] )->
948                    withTokenizer( 'smartcn_tokenizer' )->
949                    withStop( [ ',' ], 'smartcn_stop' )->
950                    withFilters( [ 'smartcn_stop', 'lowercase' ] )->
951                    build( $config );
952
953                $config[ 'analyzer' ][ 'plain' ][ 'filter' ] = [ 'smartcn_stop', 'lowercase' ];
954                $config[ 'analyzer' ][ 'plain_search' ][ 'filter' ] =
955                    $config[ 'analyzer' ][ 'plain' ][ 'filter' ];
956                break;
957            case 'cjk':
958                // Unpack CJK analyzer T326822
959                // map (han)dakuten to combining forms or icu_normalizer will add spaces
960                $dakutenMap = [ '゛=>\u3099', '゜=>\u309a' ];
961
962                // cjk_bigram negates the benefits of the icu_tokenizer for CJK text. The
963                // icu_tokenizer also has a few bad side effects, so don't use it for cjk.
964                // Default cjk stop words are almost the same as _english_ (add s & t; drop
965                // an). Stop words are searchable via 'plain' anyway, so just use _english_
966                $config = ( new AnalyzerBuilder( 'cjk' ) )->
967                    withUnpackedAnalyzer()->
968                    withLimitedCharMap( $dakutenMap )->
969                    withTokenizer( self::STANDARD_TOKENIZER_ONLY )->
970                    withStop( '_english_' )->
971                    omitStemmer()->
972                    insertFiltersBefore( 'lowercase', [ 'cjk_width' ] )->
973                    insertFiltersBefore( 'cjk_stop', [ 'cjk_bigram' ] )->
974                    build( $config );
975                break;
976            case 'dutch':
977                // Unpack Dutch analyzer T281379
978                $nlOverride = [ // these are in the default Dutch analyzer
979                    'fiets=>fiets',
980                    'bromfiets=>bromfiets',
981                    'ei=>eier',
982                    'kind=>kinder'
983                ];
984                $config = ( new AnalyzerBuilder( $langName ) )->
985                    withUnpackedAnalyzer()->
986                    withStemmerOverride( $nlOverride )->
987                    build( $config );
988                break;
989            case 'english':
990                // Map hiragana (\u3041-\u3096) to katakana (\u30a1-\u30f6), currently only for
991                // English
992                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T176197
993                $hkmap = [];
994                for ( $i = 0x3041; $i <= 0x3096; $i++ ) {
995                    $hkmap[] = sprintf( '\\u%04x=>\\u%04x', $i, $i + 0x60 );
996                }
997
998                // Replace English analyzer with a rebuilt copy with asciifolding inserted
999                // before stemming
1000                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T142037
1001                $config = ( new AnalyzerBuilder( $langName ) )->
1002                    withLimitedCharMap( $hkmap, 'kana_map' )->
1003                    withCharFilters( [ 'kana_map' ] )->
1004                    withExtraStemmer( 'possessive_english' )->
1005                    withStemmerOverride( 'guidelines => guideline', 'custom_stem' )->
1006                    withFilters( [ 'possessive_english', 'lowercase', 'stop', 'asciifolding',
1007                        'kstem', 'custom_stem' ] )->
1008                    build( $config );
1009
1010                // Add asciifolding_preserve to the plain analyzer as well (but not plain_search)
1011                $config[ 'analyzer' ][ 'plain' ][ 'filter' ][] = 'asciifolding_preserve';
1012                // Add asciifolding_preserve to lowercase_keyword filters
1013                $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve';
1014                break;
1015            case 'esperanto':
1016                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T202173
1017                $config = ( new AnalyzerBuilder( $langName ) )->
1018                    withFilters( [ 'lowercase', 'asciifolding', 'esperanto_stemmer' ] )->
1019                    build( $config );
1020                break;
1021            case 'french':
1022                // Add asciifolding_preserve to filters
1023                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T142620
1024                $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve';
1025
1026                $config = ( new AnalyzerBuilder( $langName ) )->
1027                    withUnpackedAnalyzer()->
1028                    withLimitedCharMap( [ '\u02BC=>\u0027' ] )->
1029                    withElision( [ 'l', 'm', 't', 'qu', 'n', 's', 'j', 'd', 'c',
1030                                    'jusqu', 'quoiqu', 'lorsqu', 'puisqu' ] )->
1031                    withLightStemmer()->
1032                    withAsciifoldingPreserve()->
1033                    build( $config );
1034                break;
1035            case 'german':
1036                // Unpack German analyzer T281379
1037                // char map: We have to explicitly map capital ẞ to lowercase ß
1038                $config = ( new AnalyzerBuilder( $langName ) )->
1039                    withUnpackedAnalyzer()->
1040                    withLimitedCharMap( [ 'ẞ=>ß' ] )->
1041                    withLightStemmer()->
1042                    insertFiltersBefore( 'german_stemmer', [ 'german_normalization' ] )->
1043                    build( $config );
1044
1045                $config[ 'analyzer' ][ 'plain' ][ 'char_filter' ][] = 'german_charfilter';
1046                $config[ 'analyzer' ][ 'plain_search' ][ 'char_filter' ][] = 'german_charfilter';
1047                break;
1048            case 'greek':
1049                $config = ( new AnalyzerBuilder( $langName ) )->
1050                    withUnpackedAnalyzer()->
1051                    omitAsciifolding()->
1052                    withLangLowercase()->
1053                    withRemoveEmpty()->
1054                    build( $config );
1055                break;
1056            case 'hebrew':
1057                $config = ( new AnalyzerBuilder( $langName ) )->
1058                    withTokenizer( 'hebrew' )->
1059                    withFilters( [ 'niqqud', 'hebrew_lemmatizer', 'remove_duplicates', 'lowercase',
1060                        'asciifolding' ] )->
1061                    build( $config );
1062                break;
1063            case 'hindi':
1064                // Unpack Hindi analyzer T289612
1065                $config = ( new AnalyzerBuilder( $langName ) )->
1066                    withUnpackedAnalyzer()->
1067                    withDecimalDigit()->
1068                    insertFiltersBefore( 'hindi_stop',
1069                        [ 'indic_normalization', 'hindi_normalization' ] )->
1070                    build( $config );
1071                break;
1072            case 'indonesian':
1073            case 'malay':
1074                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T196780
1075                $config = ( new AnalyzerBuilder( 'indonesian' ) )->
1076                    withUnpackedAnalyzer()->
1077                    omitAsciifolding()->
1078                    build( $config );
1079                break;
1080            case 'irish':
1081                $gaCharMap = [ 'ḃ=>bh', 'ċ=>ch', 'ḋ=>dh', 'ḟ=>fh', 'ġ=>gh', 'ṁ=>mh', 'ṗ=>ph',
1082                      'ṡ=>sh', 'ẛ=>sh', 'ṫ=>th', 'Ḃ=>BH', 'Ċ=>CH', 'Ḋ=>DH', 'Ḟ=>FH', 'Ġ=>GH',
1083                      'Ṁ=>MH', 'Ṗ=>PH', 'Ṡ=>SH', 'Ṫ=>TH' ];
1084
1085                // Add b, bh, g, m for camelCase cleanup
1086                $gaHyphenStop = [ 'h', 'n', 't', 'b', 'bh', 'g', 'm' ];
1087
1088                // Unpack Irish analyzer T289612
1089                // See also https://www.mediawiki.org/wiki/User:TJones_(WMF)/T217602
1090                $config = ( new AnalyzerBuilder( $langName ) )->
1091                    withUnpackedAnalyzer()->
1092                    withCharMap( $gaCharMap )->
1093                    withExtraStop( $gaHyphenStop, 'irish_hyphenation', 'irish_elision', true )->
1094                    withElision( [ 'd', 'm', 'b' ] )->
1095                    withLangLowercase()->
1096                    build( $config );
1097                break;
1098            case 'italian':
1099                // Replace the default Italian analyzer with a rebuilt copy with additional filters
1100                $itElision = [ 'c', 'l', 'all', 'dall', 'dell', 'nell', 'sull', 'coll', 'pell',
1101                    'gl', 'agl', 'dagl', 'degl', 'negl', 'sugl', 'un', 'm', 't', 's', 'v', 'd' ];
1102                $config = ( new AnalyzerBuilder( $langName ) )->
1103                    withUnpackedAnalyzer()->
1104                    withElision( $itElision )->
1105                    withLightStemmer()->
1106                    build( $config );
1107
1108                // Add asciifolding_preserve to the plain analyzer as well (but not plain_search)
1109                $config[ 'analyzer' ][ 'plain' ][ 'filter' ][] = 'asciifolding_preserve';
1110                // Add asciifolding_preserve to filters
1111                $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve';
1112                break;
1113            case 'japanese':
1114                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T166731
1115                // pre-convert fullwidth numbers because Kuromoji tokenizer treats them weirdly
1116                $config = ( new AnalyzerBuilder( $langName ) )->
1117                    withNumberCharFilter( 0xff10, 'fullwidthnumfix' )->
1118                    withCharFilters( [ 'fullwidthnumfix' ] )->
1119                    withTokenizer( 'kuromoji_tokenizer' )->
1120                    withFilters( [ 'kuromoji_baseform', 'cjk_width', 'ja_stop', 'kuromoji_stemmer',
1121                        'lowercase' ] )->
1122                    build( $config );
1123                break;
1124            case 'khmer':
1125                // See Khmer: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T185721
1126                $config = ( new AnalyzerBuilder( $langName ) )->
1127                    withNumberCharFilter( 0x17e0 )->
1128                    withCharFilters( [ 'khmer_syll_reorder', 'khmer_numbers' ] )->
1129                    withFilters( [ 'lowercase' ] )->
1130                    build( $config );
1131                break;
1132            case 'korean':
1133                // Unpack nori analyzer to add ICU normalization and custom filters
1134                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T206874
1135
1136                // Nori-specific character filter
1137                $noriMap = [
1138                    '\u00B7=>\u0020', // convert middle dot to space
1139                    '\u318D=>\u0020', // arae-a to space
1140                    '\u00AD=>', // remove soft hyphens
1141                    '\u200C=>', // remove zero-width non-joiners
1142                ];
1143
1144                // Nori-specific pattern_replace to strip combining diacritics
1145                $config[ 'char_filter' ][ 'nori_combo_filter' ] =
1146                    AnalyzerBuilder::patternFilter( '[\\u0300-\\u0331]' );
1147
1148                // 'mixed' mode keeps the original token plus the compound parts
1149                // the default is 'discard' which only keeps the parts
1150                $config[ 'tokenizer' ][ 'nori_tok' ] = [
1151                    'type' => 'nori_tokenizer',
1152                    'decompound_mode' => 'mixed',
1153                ];
1154
1155                // Nori-specific part of speech filter (add 'VCP', 'VCN', 'VX' to default)
1156                $config[ 'filter' ][ 'nori_posfilter' ] = [
1157                    'type' => 'nori_part_of_speech',
1158                    'stoptags' => [ 'E', 'IC', 'J', 'MAG', 'MAJ', 'MM', 'SP', 'SSC', 'SSO',
1159                        'SC', 'SE', 'XPN', 'XSA', 'XSN', 'XSV', 'UNA', 'NA', 'VSV', 'VCP',
1160                        'VCN', 'VX' ],
1161                ];
1162
1163                $config = ( new AnalyzerBuilder( $langName ) )->
1164                    withLimitedCharMap( $noriMap, 'nori_charfilter' )->
1165                    withCharFilters( [ 'nori_charfilter', 'nori_combo_filter' ] )->
1166                    withTokenizer( 'nori_tok' )->
1167                    withFilters( [ 'nori_posfilter', 'nori_readingform', 'lowercase',
1168                        'remove_empty' ] )->
1169                    build( $config );
1170                break;
1171            case 'mirandese':
1172                // Unpack default analyzer to add Mirandese-specific elision and stop words
1173                // See phab ticket T194941
1174                $mwlStopwords = require __DIR__ . '/AnalysisLanguageData/mirandeseStopwords.php';
1175                $config = ( new AnalyzerBuilder( $langName ) )->
1176                    withElision( [ 'l', 'd', 'qu' ] )->
1177                    withStop( $mwlStopwords )->
1178                    withFilters( [ 'lowercase', 'mirandese_elision', 'mirandese_stop' ] )->
1179                    build( $config );
1180                break;
1181            case 'persian': // Unpack Persian analyzer T325090
1182                $config = ( new AnalyzerBuilder( $langName ) )->
1183                    withUnpackedAnalyzer()->
1184                    withLimitedCharMap( [ '\u200C=>\u0020' ], 'zero_width_spaces' )->
1185                    withDecimalDigit()->
1186                    omitStemmer()->
1187                    insertFiltersBefore( 'persian_stop',
1188                        [ 'arabic_normalization', 'persian_normalization' ] )->
1189                    build( $config );
1190                break;
1191            case 'polish':
1192                // these are real stop words for Polish
1193                $plStopwords = require __DIR__ . '/AnalysisLanguageData/polishStopwords.php';
1194
1195                // Stempel-specific stop words--additional unreliable stems
1196                $stempelStopwords = [ 'ować', 'iwać', 'obić', 'snąć', 'ywać', 'ium', 'my', 'um' ];
1197
1198                // Stempel is statistical, and certain stems are really terrible, so we filter them
1199                // after stemming. See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T186046
1200                $config[ 'filter' ][ 'stempel_pattern_filter' ] =
1201                    AnalyzerBuilder::patternFilter( '^([a-zął]?[a-zćń]|..ć|\d.*ć)$' );
1202
1203                $config = ( new AnalyzerBuilder( $langName ) )->
1204                    withUnpackedAnalyzer()->
1205                    withStop( $plStopwords )->
1206                    omitStemmer()->
1207                    omitAsciiFolding()->
1208                    appendFilters( [ 'polish_stem', 'stempel_pattern_filter', 'remove_empty' ] )->
1209                    withExtraStop( $stempelStopwords, 'stempel_stop' )->
1210                    build( $config );
1211                break;
1212            case 'romanian':  // Unpack Romanian analyzer T325091 / T330893
1213                // Counterintuitively, we need to map correct s&t (with commas) to older
1214                // incorrect forms (with cedilla) so that the old Snowball stemmer (from before
1215                // comma forms were available) will work; also normalize versions with
1216                // combining diacritics to single characters.
1217                $cedillaMap = [
1218                    'ș=>ş', 's\u0326=>ş', 's\u0327=>ş', 'ț=>ţ', 't\u0326=>ţ', 't\u0327=>ţ',
1219                    'Ș=>Ş', 'S\u0326=>Ş', 'S\u0327=>Ş', 'Ț=>Ţ', 'T\u0326=>Ţ', 'T\u0327=>Ţ',
1220                ];
1221
1222                // Add stopword variants with modern commas instead of old cedillas so that
1223                // both are handled, regardless of the character mapping needed for the
1224                // stemmer. In the future, Lucene should update their stopwords and these will
1225                // be included.
1226                $roStopwords = require __DIR__ . '/AnalysisLanguageData/romanianStopwords.php';
1227
1228                $config = ( new AnalyzerBuilder( $langName ) )->
1229                    withUnpackedAnalyzer()->
1230                    withCharMap( $cedillaMap )->
1231                    withExtraStop( $roStopwords, 'ro_comma_stop', 'romanian_stemmer' )->
1232                    build( $config );
1233                break;
1234            case 'russian':
1235                // unpack built-in Russian analyzer and add character filter
1236                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T124592
1237                $ruCharMap = [
1238                    '\u0301=>',    // combining acute accent, only used to show stress T102298
1239                    '\u0435\u0308=>\u0435',    // T124592 fold ё=>е and Ё=>Е, with combining
1240                    '\u0415\u0308=>\u0415',    // diacritic...
1241                    '\u0451=>\u0435', // ... or precomposed
1242                    '\u0401=>\u0415',
1243                ];
1244                $config = ( new AnalyzerBuilder( $langName ) )->
1245                    withUnpackedAnalyzer()->
1246                    withCharMap( $ruCharMap )->
1247                    build( $config );
1248
1249                // add Russian character mappings to near_space_flattener, and convert it from
1250                // limited_mapping to mapping to handle multi-char maps
1251                $config[ 'char_filter' ][ 'near_space_flattener' ][ 'type' ] = 'mapping';
1252                array_push( $config[ 'char_filter' ][ 'near_space_flattener' ][ 'mappings' ],
1253                    ...$ruCharMap );
1254
1255                // Drop acute stress marks and fold ё=>е everywhere
1256                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T124592
1257                $config[ 'analyzer' ][ 'plain' ][ 'char_filter' ][] = 'russian_charfilter';
1258                $config[ 'analyzer' ][ 'plain_search' ][ 'char_filter' ][] = 'russian_charfilter';
1259
1260                $config[ 'analyzer' ][ 'suggest' ][ 'char_filter' ][] = 'russian_charfilter';
1261                $config[ 'analyzer' ][ 'suggest_reverse' ][ 'char_filter' ][] = 'russian_charfilter';
1262                break;
1263            case 'slovak':
1264                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T190815
1265                // and https://www.mediawiki.org/wiki/User:TJones_(WMF)/T223787
1266                $config = ( new AnalyzerBuilder( $langName ) )->
1267                    withFilters( [ 'lowercase', 'slovak_stemmer', 'asciifolding' ] )->
1268                    build( $config );
1269                break;
1270            case 'sorani':    // Unpack Sorani analyzer T325091
1271                $config = ( new AnalyzerBuilder( $langName ) )->
1272                    withUnpackedAnalyzer()->
1273                    withDecimalDigit()->
1274                    insertFiltersBefore( 'lowercase', [ 'sorani_normalization' ] )->
1275                    build( $config );
1276                break;
1277            case 'swedish':
1278                // Add asciifolding_preserve to lowercase_keyword
1279                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T160562
1280                $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve';
1281
1282                // Unpack built-in swedish analyzer to add asciifolding_preserve
1283                $config = ( new AnalyzerBuilder( $langName ) )->
1284                    withUnpackedAnalyzer()->
1285                    withAsciifoldingPreserve()->
1286                    build( $config );
1287                break;
1288            case 'thai':
1289                // Unpack and improve Thai analyzer: T294147
1290                $thCharMap = [
1291                    '_=>\u0020', // split tokens on underscore ..
1292                    ';=>\u0020', // .. semicolon
1293                    ':=>\u0020', // .. colon
1294                    '·=>\u0020', // .. middle dot
1295                    '‧=>\u0020', // .. & hyphenation point
1296                    'ฃ=>ข', // replace obsolete ฃ
1297                    'ฅ=>ค', // replace obsolete ฅ
1298                    '\u0e4d\u0e32=>\u0e33', // compose nikhahit + sara aa = sara am
1299                    '\u0e4d\u0e48\u0e32=>\u0e48\u0e33', // recompose sara am split around..
1300                    '\u0e4d\u0e49\u0e32=>\u0e49\u0e33', // .. other diacritics
1301                    '\u0e33\u0e48=>\u0e48\u0e33', // sara am should consistently..
1302                    '\u0e33\u0e49=>\u0e49\u0e33', // .. come after other diacritics
1303                    '\u0E34\u0E4D=>\u0E36', // compose sara i + nikhahit = sara ue..
1304                    '\u0E4D\u0E34=>\u0E36', // .. in either order
1305                ];
1306
1307                // instantiate basic unpacked analyzer builder, plus thai tokenizer by default
1308                $thBuilder = ( new AnalyzerBuilder( $langName ) )
1309                    ->withUnpackedAnalyzer()
1310                    ->withTokenizer( 'thai' );
1311
1312                if ( $this->isIcuAvailable() ) {
1313                    // ICU tokenizer is preferred in general. If it is available, replace
1314                    // default tokenizer. Also add thai_repl_pat char filter to accommodate
1315                    // some of its weaknesses.
1316                    $thBuilder->withTokenizer( $this->icu_tokenizer );
1317
1318                    $thaiLetterPat = '[ก-๏]'; // Thai characters, except for digits.
1319                    $config[ 'char_filter' ][ 'thai_repl_pat' ] =
1320                        // break between any digits and Thai letters, or vice versa
1321                        // break *Thai* tokens on periods (by making them spaces)
1322                        // (regex look-behind is okay, but look-ahead breaks offsets)
1323                        AnalyzerBuilder::patternFilter( "(?<=\\p{Nd})($thaiLetterPat)" .
1324                            "|(?<=$thaiLetterPat)(\\p{Nd})" .
1325                            "|(?<=$thaiLetterPat)\.($thaiLetterPat)",
1326                            ' $1$2$3' );
1327                    $thBuilder->withCharFilters( [ 'thai_repl_pat' ] );
1328
1329                    // if icu_token_repair (in the textify plugin) is available, we need a
1330                    // reverse number map so it doesn't rejoin split-off Arabic numbers.
1331                    if ( $this->isTextifyAvailable() ) {
1332                        $thBuilder->withReversedNumberCharFilter( 0x0e50 );
1333                    }
1334                } else {
1335                    // if we have to settle for the Thai tokenizer, add some additional
1336                    // character filters to accommodate some of its weaknesses
1337                    $thThaiTokSplits = [
1338                        '\u200B=>', // delete zero width space
1339                        '-=>\u0020', // split tokens on hyphen-minus ..
1340                        '‐=>\u0020', // .. hyphen
1341                        '–=>\u0020', // .. en dash
1342                        '—=>\u0020', // .. em dash
1343                        '―=>\u0020', // .. horizontal bar
1344                        '-=>\u0020', // .. fullwidth hyphen
1345                        '"=>\u0020', // .. & double quote
1346                    ];
1347                    array_push( $thCharMap, ...$thThaiTokSplits );
1348                }
1349
1350                // add in the rest of the bits that are always needed, and build
1351                $config = $thBuilder->withCharMap( $thCharMap )->
1352                    withDecimalDigit()->
1353                    omitStemmer()->
1354                    build( $config );
1355                break;
1356            case 'turkish':
1357                $trAposFilter = 'apostrophe';
1358                if ( in_array( 'extra-analysis-turkish', $this->plugins ) ) {
1359                    $trAposFilter = 'better_apostrophe';
1360                }
1361                $config = ( new AnalyzerBuilder( $langName ) )->
1362                    withUnpackedAnalyzer()->
1363                    withLangLowercase()->
1364                    insertFiltersBefore( 'turkish_stop', [ $trAposFilter ] )->
1365                    build( $config );
1366                break;
1367            case 'ukrainian-unpacked':
1368                $this->languagesWithIcuFolding['uk'] = true;
1369                $ukCharMap = [
1370                    '‘=>\'', // normalize apostrophes
1371                    '’=>\'',
1372                    '`=>\'',
1373                    '´=>\'',
1374                    'ʼ=>\'',
1375                    '\u0301=>', // delete combining acute and soft hyphen
1376                    '\u00AD=>',
1377                    'ґ=>г', // normalize ghe with upturn
1378                    'Ґ=>Г',
1379                ];
1380                // lowercase twice because stopwords are case sensitive, and the stemmer
1381                // generates some output with uppercase initial letters, even for
1382                // lowercase input (usually proper names)
1383                $ukFilters = [ 'lowercase', 'ukrainian_stop', 'ukrainian_stemmer',
1384                               'lowercase', 'remove_duplicates', 'asciifolding' ];
1385                $config = ( new AnalyzerBuilder( 'ukrainian' ) )->
1386                    withLimitedCharMap( $ukCharMap )->
1387                    withCharFilters( [ 'ukrainian_charfilter' ] )->
1388                    withFilters( $ukFilters )->
1389                    build( $config );
1390                break;
1391            default:
1392                // do nothing--default config is already set up
1393                break;
1394        }
1395
1396        // text_search is just a copy of text
1397        // @phan-suppress-next-line PhanTypeInvalidDimOffset
1398        $config[ 'analyzer' ][ 'text_search' ] = $config[ 'analyzer' ][ 'text' ];
1399
1400        // replace lowercase filters with icu_normalizer filter
1401        if ( $this->isIcuAvailable() ) {
1402            foreach ( $config[ 'analyzer' ] as &$analyzer ) {
1403                if ( !isset( $analyzer[ 'filter'  ] ) ) {
1404                    continue;
1405                }
1406
1407                $tmpFilters = [];
1408                foreach ( $analyzer[ 'filter' ] as $filter ) {
1409                    if ( $filter === 'lowercase' ) {
1410                        // If lowercase filter has language-specific processing, keep it,
1411                        // and do it before ICU normalization, particularly for Greek,
1412                        // Irish, and Turkish
1413                        // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T203117
1414                        // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T217602
1415                        if ( isset( $config[ 'filter' ][ 'lowercase' ][ 'language' ] ) ) {
1416                            $tmpFilters[] = 'lowercase';
1417                        }
1418                        $tmpFilters[] = 'icu_normalizer';
1419                    } else {
1420                        $tmpFilters[] = $filter;
1421                    }
1422                }
1423                $analyzer[ 'filter' ] = $tmpFilters;
1424
1425            }
1426        }
1427
1428        return $config;
1429    }
1430
1431    /**
1432     * Workaround for https://issues.apache.org/jira/browse/LUCENE-7468
1433     * The preserve_original duplicates token even if they are
1434     * not modified, leading to more space used and wrong term frequencies.
1435     * Workaround is to append a unique filter to remove the dups.
1436     * (made public for unit tests)
1437     *
1438     * @param mixed[] $config
1439     * @return mixed[] update mapping
1440     */
1441    public function fixAsciiFolding( array $config ) {
1442        $needDedupFilter = false;
1443        foreach ( $config[ 'analyzer' ] as $name => &$value ) {
1444            if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) {
1445                continue;
1446            }
1447            if ( !isset( $value[ 'filter' ] ) ) {
1448                continue;
1449            }
1450            $ascii_idx = array_search( 'asciifolding_preserve', $value[ 'filter' ] );
1451            if ( $ascii_idx !== false ) {
1452                $needDedupFilter = true;
1453                array_splice( $value[ 'filter' ], $ascii_idx + 1, 0, [ 'dedup_asciifolding' ] );
1454            }
1455        }
1456        if ( $needDedupFilter ) {
1457            $config[ 'filter' ][ 'dedup_asciifolding' ] = [
1458                'type' => 'unique',
1459                'only_on_same_position' => true,
1460            ];
1461        }
1462        return $config;
1463    }
1464
1465    /**
1466     * Pick the appropriate default analyzer based on the language.  Rather than think of
1467     * this as per language customization you should think of this as an effort to pick a
1468     * reasonably default in case CirrusSearch isn't customized for the language.
1469     *
1470     * @param string $language Config language
1471     * @return string the analyzer type
1472     */
1473    public function getDefaultTextAnalyzerType( $language ) {
1474        // If we match a language exactly, use it
1475        if ( array_key_exists( $language, $this->elasticsearchLanguageAnalyzers ) ) {
1476            return $this->elasticsearchLanguageAnalyzers[ $language ];
1477        }
1478
1479        return 'default';
1480    }
1481
1482    /**
1483     * Get list of filters that are mentioned in analyzers but not defined
1484     * explicitly.
1485     * @param array[] &$config Full configuration array
1486     * @param string[] $analyzers List of analyzers to consider.
1487     * @return array List of default filters, each containing only filter type
1488     */
1489    private function getDefaultFilters( array &$config, array $analyzers ) {
1490        $defaultFilters = [];
1491        foreach ( $analyzers as $analyzer ) {
1492            if ( empty( $config[ 'analyzer' ][ $analyzer ][ 'filter' ] ) ) {
1493                continue;
1494            }
1495            foreach ( $config[ 'analyzer' ][ $analyzer ][ 'filter' ] as $filterName ) {
1496                if ( !isset( $config[ 'filter' ][ $filterName ] ) ) {
1497                    // This is default definition for the built-in filter
1498                    $defaultFilters[ $filterName ] = [ 'type' => $filterName ];
1499                }
1500            }
1501        }
1502        return $defaultFilters;
1503    }
1504
1505    /**
1506     * Check every filter in the config - if it's the same as in old config,
1507     * ignore it. If it has the same name, but different content - create new filter
1508     * with different name by prefixing it with language code.
1509     *
1510     * @param array[] &$config Configuration being processed
1511     * @param array[] $standardFilters Existing filters list
1512     * @param array[] $defaultFilters List of default filters already mentioned in the config
1513     * @param string $prefix Prefix for disambiguation
1514     * @return array[] The list of filters not in the old config.
1515     */
1516    private function resolveFilters( array &$config, array $standardFilters, array $defaultFilters,
1517            string $prefix ) {
1518        $resultFilters = [];
1519        foreach ( $config[ 'filter' ] as $name => $filter ) {
1520            $existingFilter = $standardFilters[$name] ?? $defaultFilters[$name] ?? null;
1521            if ( $existingFilter ) { // Filter with this name already exists
1522                if ( $existingFilter != $filter ) {
1523                    // filter with the same name but different config - need to
1524                    // rename by adding prefix
1525                    $newName = $prefix . '_' . $name;
1526                    $this->replaceFilter( $config, $name, $newName );
1527                    $resultFilters[ $newName ] = $filter;
1528                }
1529            } else {
1530                $resultFilters[ $name ] = $filter;
1531            }
1532        }
1533        return $resultFilters;
1534    }
1535
1536    /**
1537     * Replace certain filter name in all configs with different name.
1538     * @param array[] &$config Configuration being processed
1539     * @param string $oldName
1540     * @param string $newName
1541     */
1542    private function replaceFilter( array &$config, $oldName, $newName ) {
1543        foreach ( $config[ 'analyzer' ] as &$analyzer ) {
1544            if ( !isset( $analyzer[ 'filter' ] ) ) {
1545                continue;
1546            }
1547            $analyzer[ 'filter' ] = array_map( static function ( $filter ) use ( $oldName, $newName ) {
1548                if ( $filter === $oldName ) {
1549                    return $newName;
1550                }
1551                return $filter;
1552            }, $analyzer[ 'filter' ] );
1553        }
1554    }
1555
1556    /**
1557     * Merge per-language config into the main config.
1558     * It will copy specific analyzer and all dependant filters and char_filters.
1559     * @param array &$config Main config
1560     * @param array $langConfig Per-language config
1561     * @param string $name Name for analyzer whose config we're merging
1562     * @param string $prefix Prefix for this configuration
1563     */
1564    private function mergeConfig( array &$config, array $langConfig, $name, $prefix ) {
1565        $analyzer = $langConfig[ 'analyzer' ][ $name ];
1566        $config[ 'analyzer' ][ $prefix . '_' . $name ] = $analyzer;
1567        if ( !empty( $analyzer[ 'filter' ] ) ) {
1568            // Add private filters for this analyzer
1569            foreach ( $analyzer[ 'filter' ] as $filter ) {
1570                // Copy filters that are in language config but not in the main config.
1571                // We would not copy the same filter into the main config since due to
1572                // the resolution step we know they are the same (otherwise we would have
1573                // renamed it).
1574                if ( isset( $langConfig[ 'filter' ][ $filter ] ) &&
1575                    !isset( $config[ 'filter' ][ $filter ] ) ) {
1576                    $config[ 'filter' ][ $filter ] = $langConfig[ 'filter' ][ $filter ];
1577                }
1578            }
1579        }
1580        if ( !empty( $analyzer[ 'char_filter' ] ) ) {
1581            // Add private char_filters for this analyzer
1582            foreach ( $analyzer[ 'char_filter' ] as $filter ) {
1583                // Copy char_filters that are in lang config but not in the main config.
1584                // Need to check whether the filter exists in langConfig because some
1585                // non-configurable filters are defined in plugins and do not have a
1586                // local definition (e.g., camelCase_splitter)
1587                if ( isset( $langConfig[ 'char_filter' ][ $filter ] ) &&
1588                    !isset( $config[ 'char_filter' ][ $filter ] ) ) {
1589                    $config[ 'char_filter' ][ $filter ] = $langConfig[ 'char_filter' ][ $filter ];
1590                }
1591            }
1592        }
1593        if ( !empty( $analyzer[ 'tokenizer' ] ) ) {
1594            $tokenizer = $analyzer[ 'tokenizer' ];
1595            if ( isset( $langConfig[ 'tokenizer' ][ $tokenizer ] ) &&
1596                    !isset( $config[ 'tokenizer' ][ $tokenizer ] ) ) {
1597                $config[ 'tokenizer' ][ $tokenizer ] = $langConfig[ 'tokenizer' ][ $tokenizer ];
1598            }
1599        }
1600    }
1601
1602    /**
1603     * Create per-language configs for specific analyzers which separates and namespaces
1604     * filters that are different between languages.
1605     * @param array &$config Existing config, will be modified
1606     * @param string[] $languages List of languages to process
1607     * @param string[] $analyzers List of analyzers to process
1608     */
1609    public function buildLanguageConfigs( array &$config, array $languages, array $analyzers ) {
1610        $defaultFilters = $this->getDefaultFilters( $config, $analyzers );
1611        foreach ( $languages as $lang ) {
1612            $langConfig = $this->buildConfig( $lang );
1613            $defaultFilters += $this->getDefaultFilters( $langConfig, $analyzers );
1614        }
1615        foreach ( $languages as $lang ) {
1616            $langConfig = $this->buildConfig( $lang );
1617            // Analyzer is: tokenizer + filter + char_filter
1618            // Char filters & Tokenizers are nicely namespaced
1619            // Filters are NOT - e.g. lowercase & icu_folding filters are different for different
1620            // languages! So we need to do some disambiguation here.
1621            $langConfig[ 'filter' ] =
1622                $this->resolveFilters( $langConfig, $config[ 'filter' ], $defaultFilters, $lang );
1623            // Merge configs
1624            foreach ( $analyzers as $analyzer ) {
1625                $this->mergeConfig( $config, $langConfig, $analyzer, $lang );
1626            }
1627        }
1628    }
1629
1630    /**
1631     * @return bool true if the icu analyzer is available.
1632     */
1633    public function isIcuAvailable() {
1634        return $this->icu;
1635    }
1636
1637    /**
1638     * @return bool true if the textify plugin is available.
1639     */
1640    public function isTextifyAvailable() {
1641        return $this->textify;
1642    }
1643
1644    /**
1645     * update languages with global custom filters (e.g., homoglyph & nnbsp filters)
1646     *
1647     * @param mixed[] $config
1648     * @param string $language language to add plugin to
1649     * @return mixed[] updated config
1650     */
1651    public function enableGlobalCustomFilters( array $config, string $language ) {
1652        return GlobalCustomFilter::enableGlobalCustomFilters( $config, $language,
1653            $this->globalCustomFilters, $this->plugins );
1654    }
1655
    /**
     * Languages for which we have a custom analysis chain (Elastic built-in or our
     * own custom analysis), keyed by language code with the analyzer name as the
     * value. All other languages default to the default analyzer which isn't too
     * good. Note that this array is kept roughly alphabetical by value; a few
     * entries (e.g. 'arabic-egyptian', 'cjk', 'latvian' and 'thai') are not in
     * strict alphabetical position. The Elastic list is sourced from
     * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html
     *
     * @var string[]
     */
    private $elasticsearchLanguageAnalyzers = [
        'ar' => 'arabic',
        'ary' => 'arabic-moroccan',
        'arz' => 'arabic-egyptian',
        'hy' => 'armenian',
        'az' => 'azerbaijani',
        'eu' => 'basque',
        'bn' => 'bengali',
        'pt-br' => 'brazilian',
        'bg' => 'bulgarian',
        'ca' => 'catalan',
        'crh' => 'crimean-tatar',
        'ja' => 'cjk',
        'ko' => 'cjk',
        'cs' => 'czech',
        'da' => 'danish',
        'nl' => 'dutch',
        'en' => 'english',
        'en-ca' => 'english',
        'en-gb' => 'english',
        'simple' => 'english',
        'et' => 'estonian',
        'fi' => 'finnish',
        'fr' => 'french',
        'gag' => 'gagauz',
        'gl' => 'galician',
        'de' => 'german',
        'el' => 'greek',
        'hi' => 'hindi',
        'hu' => 'hungarian',
        'id' => 'indonesian',
        'ga' => 'irish',
        'it' => 'italian',
        'kk' => 'kazakh',
        'lt' => 'lithuanian',
        'lv' => 'latvian',
        'ms' => 'malay',
        'mwl' => 'mirandese',
        'nb' => 'norwegian',
        'nn' => 'norwegian',
        'no' => 'norwegian',
        'fa' => 'persian',
        'pt' => 'portuguese',
        'ro' => 'romanian',
        'ru' => 'russian',
        'ckb' => 'sorani',
        'es' => 'spanish',
        'sv' => 'swedish',
        'tt' => 'tatar',
        'tr' => 'turkish',
        'th' => 'thai',
    ];
1717
    /**
     * @var bool[] indexed by language code, languages where ICU folding
     * can be enabled by default. This is only the initial list: the
     * 'ukrainian-unpacked' customization adds 'uk' at runtime.
     */
    private $languagesWithIcuFolding = [
        'ar' => true,
        'ary' => true,
        'arz' => true,
        'bg' => true,
        'bn' => true,
        'bs' => true,
        'ca' => true,
        'ckb' => true,
        'cs' => true,
        'da' => true,
        'de' => true,
        'el' => true,
        'en' => true,
        'en-ca' => true,
        'en-gb' => true,
        'simple' => true,
        'eo' => true,
        'es' => true,
        'et' => true,
        'eu' => true,
        'fa' => true,
        'fi' => true,
        'fr' => true,
        'ga' => true,
        'gl' => true,
        'he' => true,
        'hi' => true,
        'hr' => true,
        'hu' => true,
        'hy' => true,
        'ja' => true,
        'lt' => true,
        'lv' => true,
        'nb' => true,
        'nl' => true,
        'nn' => true,
        'no' => true,
        'pt' => true,
        'pt-br' => true,
        'ro' => true,
        'ru' => true,
        'sh' => true,
        'sk' => true,
        'sr' => true,
        'sv' => true,
        'th' => true,
        'tr' => true,
    ];
1771
    /**
     * @var bool[] indexed by language code, indicates whether languages should always
     * replace the standard tokenizer with the icu_tokenizer by default (true), or should
     * never use any version of the icu_tokenizer, even when icu_token_repair is
     * available (false). (Reminder to future readers of this code: languages with
     * non-standard tokenizers in the text field, like zh/Chinese, still use icu_tokenizer
     * in the plain fields & suggest fields.)
     */
    private $languagesWithIcuTokenization = [
        // true => use any version of icu_tokenizer available over the standard tokenizer
        'bo' => true,
        'dz' => true,
        'gan' => true,
        'ja' => true,
        'km' => true,
        'lo' => true,
        'my' => true,
        'th' => true,
        'wuu' => true,
        'zh' => true,
        'lzh' => true, // zh-classical
        'zh-classical' => true, // deprecated code for lzh
        'yue' => true, // zh-yue
        'zh-yue' => true, // deprecated code for yue
        // The languages listed below may use mixed scripts
        'bug' => true,
        'cdo' => true,
        'cr' => true,
        'hak' => true,
        'jv' => true,
        'nan' => true, // zh-min-nan
        'zh-min-nan' => true, // deprecated code for nan

        // false => do not use any version of icu_tokenizer (i.e., textify_icu_tokenizer)
        // over the standard tokenizer, even when icu_token_repair is available
        // 'xyz' => false, // <-- example entry for now, since there are no actual instances
    ];
1809
    /**
     * Language analyzers that depend on plugins. Each key names the required
     * plugin(s); each value maps language codes to the analyzer name for that
     * language.
     *
     * @var array[]
     */
    private $elasticsearchLanguageAnalyzersFromPlugins = [
        /**
         * multiple plugin requirements can be comma separated
         *
         * Polish: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T154517
         * Ukrainian: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T160106
         * Chinese: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T158203
         * Hebrew: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T162741
         * Serbian: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T183015
         * Bosnian, Croatian, and Serbo-Croatian:
         *    https://www.mediawiki.org/wiki/User:TJones_(WMF)/T192395
         * Slovak: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T190815
         * Esperanto: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T202173
         * Korean: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T206874
         * Khmer: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T185721
         *
         * extra-analysis-ukrainian should follow analysis-ukrainian, so that
         * ukrainian-unpacked can overwrite value for uk if both are present.
         */

        'analysis-stempel' => [ 'pl' => 'polish' ],
        'analysis-kuromoji' => [ 'ja' => 'japanese' ],
        'analysis-stconvert,analysis-smartcn' => [ 'zh' => 'chinese' ],
        'analysis-hebrew' => [ 'he' => 'hebrew' ],
        'analysis-ukrainian' => [ 'uk' => 'ukrainian' ],
        'extra-analysis-ukrainian' => [ 'uk' => 'ukrainian-unpacked' ],
        'extra-analysis-esperanto' => [ 'eo' => 'esperanto' ],
        'extra-analysis-serbian' => [ 'bs' => 'bosnian', 'hr' => 'croatian',
            'sh' => 'serbo-croatian', 'sr' => 'serbian' ],
        'extra-analysis-slovak' => [ 'sk' => 'slovak' ],
        'analysis-nori' => [ 'ko' => 'korean' ],
        'extra-analysis-khmer' => [ 'km' => 'khmer' ],
    ];
1846
1847    /**
1848     * Set up global custom filters
1849     *
1850     * @return array
1851     */
1852    private static function buildGlobalCustomFilters(): array {
1853        $gcf = [
1854            //////////////////////////
1855            // char filters
1856            'globo_norm' => new GlobalCustomFilter( 'char_filter' ),
1857
1858            'acronym_fixer' => ( new GlobalCustomFilter( 'char_filter' ) )->
1859                // follow armenian_charfilter, which normalizes another period-like
1860                // character, if it is being used
1861                setRequiredPlugins( [ 'extra-analysis-textify' ] )->
1862                setFallbackFilter( 'regex_acronym_fixer' )->
1863                setMustFollowFilters( [ 'armenian_charfilter' ] ),
1864
1865            'camelCase_splitter' => ( new GlobalCustomFilter( 'char_filter' ) )->
1866                // camelCase should generally follow acronyms so a.c.r.o.C.a.m.e.l.
1867                // is treated the same as acroCamel (real example: G.m.b.H. vs GmbH)
1868                setRequiredPlugins( [ 'extra-analysis-textify' ] )->
1869                setFallbackFilter( 'regex_camelCase' )->
1870                setMustFollowFilters( [ 'acronym_fixer', 'regex_acronym_fixer' ] ),
1871
1872            'word_break_helper' => ( new GlobalCustomFilter( 'char_filter' ) )->
1873                // * acronyms should be fixed before converting period to spaces
1874                // * follow armenian_charfilter, which normalizes another period-like
1875                //   character, if it is being used
1876                setMustFollowFilters( [ 'acronym_fixer', 'regex_acronym_fixer',
1877                    'armenian_charfilter' ] )->
1878                setLanguageDenyList( [ 'ko', 'zh' ] ),
1879
1880            'dotted_I_fix' => ( new GlobalCustomFilter( 'char_filter' ) )->
1881                // - if lowercase is present (because analysis-icu is not available, or
1882                // as a language-specific version) we don't need dotted_I_fix, because
1883                // lowercase prevents the problem.
1884                // - if icu_folding is present, we don't need dotted_I_fix, because
1885                // icu_folding also fixes it.
1886                setDisallowedTokenFilters( [ 'lowercase', 'icu_folding' ] ),
1887
1888            //////////////////////////
1889            // token filters
1890            'icu_token_repair' => ( new GlobalCustomFilter( 'filter' ) )->
1891                // apply icu_token_repair to icu_tokenizer-using analyzers
1892                // (default == text & text_search)
1893                setRequiredPlugins( [ 'extra-analysis-textify' ] )->
1894                setRequiredTokenizer( 'textify_icu_tokenizer' ),
1895
1896            'icutokrep_no_camel_split' => ( new GlobalCustomFilter( 'filter' ) )->
1897                // apply icu_token_repair variant to non-camelCase-splitting
1898                // icu_tokenizer-using analyzers when textify_icu_tokenizer is used
1899                setRequiredPlugins( [ 'extra-analysis-textify' ] )->
1900                setApplyToAnalyzers( [ 'plain', 'plain_search', 'suggest', 'suggest_reverse',
1901                    'source_text_plain', 'source_text_plain_search', 'word_prefix' ] )->
1902                setRequiredTokenizer( 'textify_icu_tokenizer' ),
1903
1904            'homoglyph_norm' => ( new GlobalCustomFilter( 'filter' ) )->
1905                // aggressive_splitting has weird graph problems and creating
1906                // multiple tokens makes it blow up
1907                setRequiredPlugins( [ 'extra-analysis-homoglyph' ] )->
1908                setMustFollowFilters( [ 'aggressive_splitting' ] ),
1909        ];
1910        // reverse the array so that items are ordered (approximately, modulo incompatible
1911        // filters) in the order specified here
1912        return array_reverse( $gcf );
1913    }
1914
1915}