Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
99.40% covered (success)
99.40%
1002 / 1008
76.92% covered (warning)
76.92%
20 / 26
CRAP
0.00% covered (danger)
0.00%
0 / 1
AnalysisConfigBuilder
99.40% covered (success)
99.40%
1002 / 1008
76.92% covered (warning)
76.92%
20 / 26
215
0.00% covered (danger)
0.00%
0 / 1
 __construct
96.30% covered (success)
96.30%
26 / 27
0.00% covered (danger)
0.00%
0 / 1
8
 shouldActivateIcuFolding
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
9
 shouldActivateIcuTokenization
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
7
 buildConfig
100.00% covered (success)
100.00%
13 / 13
100.00% covered (success)
100.00%
1 / 1
4
 buildSimilarityConfig
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 enableICUTokenizer
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
6
 standardTokenizerOnlyCleanup
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
6
 disableLimitedMappings
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
4
 enableICUFolding
100.00% covered (success)
100.00%
32 / 32
100.00% covered (success)
100.00%
1 / 1
12
 switchFiltersToICUFolding
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 switchFiltersToICUFoldingPreserve
94.44% covered (success)
94.44%
17 / 18
0.00% covered (danger)
0.00%
0 / 1
7.01
 getICUSetFilter
98.00% covered (success)
98.00%
49 / 50
0.00% covered (danger)
0.00%
0 / 1
29
 getICUNormSetFilter
80.00% covered (warning)
80.00%
4 / 5
0.00% covered (danger)
0.00%
0 / 1
4.13
 defaults
100.00% covered (success)
100.00%
286 / 286
100.00% covered (success)
100.00%
1 / 1
7
 customize
100.00% covered (success)
100.00%
424 / 424
100.00% covered (success)
100.00%
1 / 1
68
 fixAsciiFolding
100.00% covered (success)
100.00%
16 / 16
100.00% covered (success)
100.00%
1 / 1
7
 getDefaultTextAnalyzerType
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
2
 getDefaultFilters
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
5
 resolveFilters
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
4
 replaceFilter
87.50% covered (warning)
87.50%
7 / 8
0.00% covered (danger)
0.00%
0 / 1
4.03
 mergeConfig
100.00% covered (success)
100.00%
17 / 17
100.00% covered (success)
100.00%
1 / 1
12
 buildLanguageConfigs
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
4
 isIcuAvailable
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 isTextifyAvailable
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 enableGlobalCustomFilters
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 buildGlobalCustomFilters
100.00% covered (success)
100.00%
31 / 31
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3namespace CirrusSearch\Maintenance;
4
5use CirrusSearch\CirrusSearch;
6use CirrusSearch\CirrusSearchHookRunner;
7use CirrusSearch\Profile\SearchProfileService;
8use CirrusSearch\SearchConfig;
9use MediaWiki\MediaWikiServices;
10
11/**
12 * Builds elasticsearch analysis config arrays.
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License along
25 * with this program; if not, write to the Free Software Foundation, Inc.,
26 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
27 * http://www.gnu.org/copyleft/gpl.html
28 */
29class AnalysisConfigBuilder {
30    /**
31     * Version number for the core analysis. Increment the major
32     * version when the analysis changes in an incompatible way,
33     * and change the minor version when it changes but isn't
34     * incompatible.
35     *
36     * You may also need to increment MetaStoreIndex::METASTORE_VERSION
37     * manually as well.
38     */
39    public const VERSION = '0.12';
40
41    /**
42     * Maximum number of characters allowed in keyword terms.
43     */
44    private const KEYWORD_IGNORE_ABOVE = 5000;
45
46    /**
47     * Temporary magic value to prevent enabling ICU tokenizer in specific analyzers
48     */
49    private const STANDARD_TOKENIZER_ONLY = 'std_only';
50
51    /**
52     * @var bool is the icu plugin available?
53     */
54    private $icu;
55
56    /**
57     * @var bool is the textify plugin available?
58     */
59    private $textify;
60
61    /**
62     * @var string which ICU tokenizer should be used
63     */
64    private $icu_tokenizer = 'icu_tokenizer';
65
66    /**
67     * @var array Similarity algo (tf/idf, bm25, etc) configuration
68     */
69    private $similarity;
70
71    /**
72     * @var SearchConfig cirrus config
73     */
74    protected $config;
75
76    /**
77     * @var string[]
78     */
79    private $plugins;
80
81    /**
82     * @var string
83     */
84    protected $defaultLanguage;
85
86    /**
87     * @var CirrusSearchHookRunner
88     */
89    private $cirrusSearchHookRunner;
90
91    /**
92     * @var GlobalCustomFilter[]
93     */
94    public $globalCustomFilters;
95
96    /**
97     * @param string $langCode The language code to build config for
98     * @param string[] $plugins list of plugins installed in Elasticsearch
99     * @param SearchConfig|null $config
100     * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner
101     */
public function __construct(
    $langCode,
    array $plugins,
    ?SearchConfig $config = null,
    ?CirrusSearchHookRunner $cirrusSearchHookRunner = null
) {
    $this->globalCustomFilters = $this->buildGlobalCustomFilters();

    $this->defaultLanguage = $langCode;
    $this->plugins = $plugins;

    // Each key of elasticsearchLanguageAnalyzersFromPlugins is a comma-separated
    // list of plugin names; its analyzers are merged into the known language
    // analyzers only when every plugin of that list is installed.
    foreach ( $this->elasticsearchLanguageAnalyzersFromPlugins as $pluginSpec => $extra ) {
        $allPresent = true;
        foreach ( explode( ',', $pluginSpec ) as $plugin ) {
            if ( !in_array( $plugin, $plugins, true ) ) {
                $allPresent = false;
                break;
            }
        }
        if ( $allPresent ) {
            $this->elasticsearchLanguageAnalyzers =
                array_merge( $this->elasticsearchLanguageAnalyzers, $extra );
        }
    }

    $this->icu = in_array( 'analysis-icu', $plugins, true );
    $this->textify = in_array( 'extra-analysis-textify', $plugins, true );
    if ( $this->isTextifyAvailable() ) {
        // icu_token_repair can only work with the textify icu_tokenizer clone
        $this->icu_tokenizer = 'textify_icu_tokenizer';
    }

    // Fall back to the globally registered CirrusSearch config when none is injected.
    $config ??= MediaWikiServices::getInstance()->getConfigFactory()
        ->makeConfig( 'CirrusSearch' );

    $similarity = $config->getProfileService()->loadProfile( SearchProfileService::SIMILARITY );
    if ( !array_key_exists( 'similarity', $similarity ) ) {
        // make sure the hook below always receives an array
        $similarity['similarity'] = [];
    }

    // Fall back to a hook runner built from the global hook container when none is injected.
    $this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?? new CirrusSearchHookRunner(
        MediaWikiServices::getInstance()->getHookContainer() );
    // NOTE(review): the hook presumably receives the similarity settings by
    // reference so handlers can adjust them - confirm against the hook runner.
    $this->cirrusSearchHookRunner->onCirrusSearchSimilarityConfig( $similarity['similarity'] );
    $this->similarity = $similarity;

    $this->config = $config;
}
145
146    /**
147     * Determine if ICU folding should be used
148     * @param string $language Config language
149     * @return bool true if icu folding should be enabled
150     */
public function shouldActivateIcuFolding( $language ) {
    if ( !$this->isIcuAvailable() || !in_array( 'extra', $this->plugins, true ) ) {
        // ICU folding requires the icu plugin and the extra plugin
        return false;
    }

    $setting = $this->config->get( 'CirrusSearchUseIcuFolding' );

    // Strict comparisons on purpose: with a loose switch an integer 0 would
    // compare equal to 'yes' on PHP 7 and silently turn folding on.
    // BC: this config var was originally a simple boolean, so true means 'yes'
    // (and false, like 'no' or any unknown value, falls through to "off").
    if ( $setting === true || $setting === 'yes' ) {
        return true;
    }
    if ( $setting === 'default' ) {
        // per-language default; languages that are not listed do not get folding
        return $this->languagesWithIcuFolding[$language] ?? false;
    }
    return false;
}
175
176    /**
177     * Determine if the icu_tokenizer can replace the standard tokenizer for this language
178     * @param string $language Config language
179     * @return bool
180     */
public function shouldActivateIcuTokenization( $language ) {
    // At least one of the icu or textify plugins has to be installed.
    $pluginAvailable = $this->isIcuAvailable() || $this->isTextifyAvailable();
    if ( !$pluginAvailable ) {
        return false;
    }

    switch ( $this->config->get( 'CirrusSearchUseIcuTokenizer' ) ) {
        case 'yes':
            return true;
        case 'default':
            // languagesWithIcuTokenization[] is authoritative for the languages it
            // lists. For every other language the answer follows the textify
            // plugin: with it icu_token_repair is available, so the default is
            // true; without it the default is false.
            return $this->languagesWithIcuTokenization[$language] ?? $this->isTextifyAvailable();
        case 'no':
        default:
            // explicit opt-out, or a value we do not understand
            return false;
    }
}
201
202    /**
203     * Build the analysis config.
204     *
205     * @param string|null $language Config language
206     * @return array the analysis config
207     */
public function buildConfig( $language = null ) {
    if ( $language === null ) {
        $language = $this->defaultLanguage;
    }

    // Start from the defaults, apply the language customizations, then hand
    // the result to the CirrusSearchAnalysisConfig hook.
    $analysis = $this->customize( $this->defaults( $language ), $language );
    $this->cirrusSearchHookRunner->onCirrusSearchAnalysisConfig( $analysis, $this );

    // Optional ICU upgrades, tokenizer first and folding second.
    if ( $this->shouldActivateIcuTokenization( $language ) ) {
        $analysis = $this->enableICUTokenizer( $analysis );
    }
    if ( $this->shouldActivateIcuFolding( $language ) ) {
        $analysis = $this->enableICUFolding( $analysis, $language );
    }

    $analysis = $this->fixAsciiFolding( $analysis );
    $analysis = $this->standardTokenizerOnlyCleanup( $analysis );

    // limited_mapping char filters are only usable with the textify plugin
    if ( !$this->isTextifyAvailable() ) {
        $analysis = $this->disableLimitedMappings( $analysis );
    }

    // should come after other upgrades to get the full context
    return $this->enableGlobalCustomFilters( $analysis, $language );
}
231
232    /**
233     * @return array|null the similarity config
234     */
public function buildSimilarityConfig() {
    // The loaded profile keeps the actual settings under its 'similarity' key.
    $profile = $this->similarity;
    return $profile['similarity'] ?? null;
}
238
239    /**
240     * replace the standard tokenizer with icu_tokenizer
241     * @param mixed[] $config
242     * @return mixed[] updated config
243     */
public function enableICUTokenizer( array $config ) {
    // Iterate by key and write back through $config rather than using a
    // by-reference foreach, so no reference stays bound into the returned
    // array. A config without an 'analyzer' section is returned untouched.
    foreach ( $config['analyzer'] ?? [] as $name => $analyzer ) {
        // only custom analyzers (explicitly typed, or with no type at all)
        // have a tokenizer that can be swapped
        if ( isset( $analyzer['type'] ) && $analyzer['type'] !== 'custom' ) {
            continue;
        }
        if ( isset( $analyzer['tokenizer'] ) && $analyzer['tokenizer'] === 'standard' ) {
            $config['analyzer'][$name]['tokenizer'] = $this->icu_tokenizer;
        }
    }
    return $config;
}
255
256    /**
257     * replace STANDARD_TOKENIZER_ONLY with the actual standard tokenizer
258     * @param mixed[] $config
259     * @return mixed[] updated config
260     */
public function standardTokenizerOnlyCleanup( array $config ) {
    // Iterate by key and write back through $config rather than using a
    // by-reference foreach, so no reference stays bound into the returned
    // array. A config without an 'analyzer' section is returned untouched.
    foreach ( $config['analyzer'] ?? [] as $name => $analyzer ) {
        // only custom analyzers (explicitly typed, or with no type at all)
        // are considered
        if ( isset( $analyzer['type'] ) && $analyzer['type'] !== 'custom' ) {
            continue;
        }
        if ( isset( $analyzer['tokenizer'] ) &&
            $analyzer['tokenizer'] === self::STANDARD_TOKENIZER_ONLY
        ) {
            // if we blocked upgrades/changes to the standard tokenizer,
            // replace the magic value with the actual standard tokenizer
            $config['analyzer'][$name]['tokenizer'] = 'standard';
        }
    }
    return $config;
}
275
276    /**
277     * replace limited_mapping char filters with plain mapping ones if limited_mapping is unavailable
278     * @param mixed[] $config
279     * @return mixed[] updated config
280     */
public function disableLimitedMappings( array $config ) {
    // Iterate by key and write back through $config rather than using a
    // by-reference foreach, so no reference stays bound into the returned
    // array. A config without a 'char_filter' section is returned untouched.
    foreach ( $config['char_filter'] ?? [] as $name => $charFilter ) {
        if ( ( $charFilter['type'] ?? null ) === 'limited_mapping' ) {
            // only the type changes; the 'mappings' list itself is kept as is
            $config['char_filter'][$name]['type'] = 'mapping';
        }
    }
    return $config;
}
290
291    /**
292     * Activate ICU folding instead of asciifolding
293     * @param mixed[] $config
294     * @param string $language Config language
295     * @return mixed[] updated config
296     */
public function enableICUFolding( array $config, $language ) {
    // Declare the icu_folding filter, restricted by the per-language
    // exclusion set when there is one.
    $foldingFilter = [ 'type' => 'icu_folding' ];
    $exclusions = $this->getICUSetFilter( $language );
    if ( $exclusions !== null ) {
        $foldingFilter['unicodeSetFilter'] = $exclusions;
    }
    $config['filter']['icu_folding'] = $foldingFilter;

    // A simple nfkc normalizer, for the cases where the original is
    // preserved but no lowercase filter ran before it.
    $config['filter']['icu_nfkc_normalization'] = [
        'type' => 'icu_normalizer',
        'name' => 'nfkc',
    ];

    // foreach iterates over a snapshot of the analyzers, so writing the new
    // filter chains straight into $config does not disturb the iteration.
    foreach ( $config['analyzer'] as $name => $analyzer ) {
        $isCustom = !isset( $analyzer['type'] ) || $analyzer['type'] == 'custom';
        if ( !$isCustom || !isset( $analyzer['filter'] ) ) {
            continue;
        }
        $chain = $analyzer['filter'];
        if ( in_array( 'asciifolding', $chain ) ) {
            $config['analyzer'][$name]['filter'] = $this->switchFiltersToICUFolding( $chain );
        }
        // Computed from the original chain as well, so when both filters are
        // present this result is the one that is kept.
        if ( in_array( 'asciifolding_preserve', $chain ) ) {
            $config['analyzer'][$name]['filter'] =
                $this->switchFiltersToICUFoldingPreserve( $chain );
        }
    }

    // The plain analyzer gets icu_folding explicitly, appended when its
    // chain has no asciifolding_preserve to replace.
    if ( isset( $config['analyzer']['plain'] ) ) {
        $plainChain = $config['analyzer']['plain']['filter'] ?? [];
        $config['analyzer']['plain']['filter'] =
            $this->switchFiltersToICUFoldingPreserve( $plainChain, true );
    }

    return $config;
}
351
352    /**
353     * Replace occurrence of asciifolding to icu_folding
354     * @param string[] $filters
355     * @return string[] new list of filters
356     */
private function switchFiltersToICUFolding( array $filters ) {
    $position = array_search( 'asciifolding', $filters, true );
    if ( $position === false ) {
        // Nothing to replace. Without this guard the false would be used as
        // offset 0 by array_splice() and overwrite the first filter.
        return $filters;
    }
    // only the first occurrence is replaced
    array_splice( $filters, $position, 1, [ 'icu_folding', 'remove_empty' ] );
    return $filters;
}
362
363    /**
364     * Replace occurrence of asciifolding_preserve with a set
365     * of compatible filters to enable icu_folding
366     * @param string[] $filters
367     * @param bool $append append icu_folding even if asciifolding is not present
368     * @return string[] new list of filters
369     */
private function switchFiltersToICUFoldingPreserve( array $filters, $append = false ) {
    if ( in_array( 'icu_folding', $filters ) ) {
        // ICU folding already here
        return $filters;
    }

    $preserveAt = array_search( 'asciifolding_preserve', $filters );
    if ( $preserveAt === false ) {
        if ( !$append ) {
            return $filters;
        }
        // Append a placeholder asciifolding_preserve so the replacement
        // below can be shared by both code paths.
        $preserveAt = count( $filters );
        $filters[] = 'asciifolding_preserve';
    }

    // With ICU, lowercase is replaced by icu_normalizer/nfkc_cf, which already
    // takes care of unicode normalization. When no such normalizer sits in
    // front of the folding step we have to normalize ourselves, otherwise
    // "un-normalized" unicode chars would be preserved.
    $normalizerAt = array_search( 'icu_normalizer', $filters );
    $needsNormalization = $normalizerAt === false || $normalizerAt > $preserveAt;

    $replacement = [
        'preserve_original_recorder',
        'icu_folding',
        'preserve_original',
        'remove_empty',
    ];
    if ( $needsNormalization ) {
        array_unshift( $replacement, 'icu_nfkc_normalization' );
    }

    array_splice( $filters, $preserveAt, 1, $replacement );
    return $filters;
}
403
404    /**
405     * Return the list of chars to exclude from ICU folding
406     * @param string $language Config language
407     * @return null|string
408     */
protected function getICUSetFilter( $language ) {
    // An explicitly configured filter wins over every per-language default.
    $configured = $this->config->get( 'CirrusSearchICUFoldingUnicodeSetFilter' );
    if ( $configured !== null ) {
        return $configured;
    }

    /* @todo: complete the default filters per language
     *
     * For Slovak (sk), which has no folding configured here, see:
     *   https://www.mediawiki.org/wiki/User:TJones_(WMF)/T223787
     *
     * Exceptions are generally written as plain Unicode characters so they
     *   are easy to inspect. Combining characters (such as for Thai (th))
     *   are \u encoded instead, to avoid display and editing problems.
     *
     * Languages that share the same exception set are grouped together.
     */
    switch ( $language ) {
        case 'bg': // T325090
        case 'ru':
            return '[^Йй]';
        case 'bs': // T192395
        case 'hr': // T192395
        case 'sh': // T192395
        case 'sr': // T183015
            return '[^ĐđŽžĆ抚Čč]';
        case 'cs': // T284578
            return '[^ÁáČčĎďÉéĚěÍíŇňÓóŘřŠšŤťÚúŮůÝýŽž]';
        case 'da': // T283366
        case 'nb': // T289612
        case 'nn': // T289612
        case 'no':
            return '[^ÆæØøÅå]';
        case 'de': // T281379
            return '[^ÄäÖöÜüẞß]';
        case 'eo': // T202173
            return '[^ĈĉĜĝĤĥĴĵŜŝŬŭ]';
        case 'es': // T277699
        case 'eu': // T283366
        case 'gl': // T284578
            return '[^Ññ]';
        case 'et': // T332322
            return '[^ŠšŽžÕõÄäÖöÜü]';
        case 'fi': // T284578
        case 'sv': // T160562
            return '[^ÅåÄäÖö]';
        case 'hu': // T325089
            return '[^ÁáÉéÍíÓóÖöŐőÚúÜüŰű]';
        case 'ja': // T326822
            // The range deliberately includes characters that are not ICU folded
            // today, which keeps the expression much simpler. The real targets
            // are characters with dakuten and handakuten, the separate
            // (han)dakuten characters (regular and combining) and the prolonged
            // sound mark (chōonpu).
            return '[^が-ヾ]';
        case 'lt': // T325090
            return '[^ĄąČčĘęĖėĮįŠšŲųŪūŽž]';
        case 'lv': // T325089
            return '[^ĀāČčĒēĢģĪīĶķĻļŅņŠšŪūŽž]';
        case 'ro': // T325091
            // including s&t with cedilla because we (have to) use it internally T330893
            return '[^ĂăÂâÎîȘșȚțŞşŢţ]';
        case 'th': // T294147
            return '[^\u0E47-\u0E4E]';
        case 'tr': // T329762
            // (I and i aren't strictly necessary but they keep the Turkish upper/lower
            // pairs Iı & İi together and make it clear both are intended.)
            return '[^ÇçĞğIıİiÖöŞşÜü]';
        default:
            return null;
    }
}
481
482    /**
483     * Return the list of chars to exclude from ICU normalization
484     * @param string $language Config language
485     * @return null|string
486     */
protected function getICUNormSetFilter( $language ) {
    // An explicitly configured filter wins over the per-language default.
    $configured = $this->config->get( 'CirrusSearchICUNormalizationUnicodeSetFilter' );
    if ( $configured !== null ) {
        return $configured;
    }

    // German (de), see T281379: capital ẞ is lowercased to ß by
    // german_charfilter and lowercase ß is normalized to ss by
    // german_normalization, so both are excluded here.
    // (loose comparison, exactly as a switch statement would compare)
    if ( $language == 'de' ) {
        return '[^ẞß]';
    }

    return null;
}
501
502    /**
503     * Build an analysis config with sane defaults.
504     *
505     * @param string $language Config language
506     * @return array
507     */
508    private function defaults( $language ) {
509        $defaults = [
510            'analyzer' => [
511                'text' => [
512                    'type' => $this->getDefaultTextAnalyzerType( $language ),
513                ],
514                // text_search is not configured here because it will be copied from text
515                'plain' => [
516                    // Surprisingly, the Lucene docs claim this works for
517                    // Chinese, Japanese, and Thai as well.
518                    // The difference between this and the 'standard'
519                    // analyzer is the lack of english stop words.
520                    'type' => 'custom',
521                    'char_filter' => [ 'nnbsp_norm', 'word_break_helper' ],
522                    'tokenizer' => 'standard',
523                    'filter' => [ 'lowercase' ],
524                ],
525                'plain_search' => [
526                    // In accent squashing languages this will not contain accent
527                    // squashing to allow searches with accents to only find accents
528                    // and searches without accents to find both.
529                    'type' => 'custom',
530                    'char_filter' => [ 'nnbsp_norm', 'word_break_helper' ],
531                    'tokenizer' => 'standard',
532                    'filter' => [ 'lowercase' ],
533                ],
534                // Used by ShortTextIndexField
535                'short_text' => [
536                    'type' => 'custom',
537                    'tokenizer' => 'whitespace',
538                    'filter' => [ 'lowercase', 'aggressive_splitting', 'asciifolding_preserve' ],
539                ],
540                'short_text_search' => [
541                    'type' => 'custom',
542                    'tokenizer' => 'whitespace',
543                    'filter' => [ 'lowercase', 'aggressive_splitting' ],
544                ],
545                'source_text_plain' => [
546                    'type' => 'custom',
547                    'char_filter' => [ 'word_break_helper_source_text' ],
548                    'tokenizer' => 'standard',
549                    'filter' => [ 'lowercase' ],
550                ],
551                'source_text_plain_search' => [
552                    'type' => 'custom',
553                    'char_filter' => [ 'word_break_helper_source_text' ],
554                    'tokenizer' => 'standard',
555                    'filter' => [ 'lowercase' ],
556                ],
557                'suggest' => [
558                    'type' => 'custom',
559                    'tokenizer' => 'standard',
560                    'filter' => [ 'lowercase', 'suggest_shingle' ],
561                ],
562                'suggest_reverse' => [
563                    'type' => 'custom',
564                    'tokenizer' => 'standard',
565                    'filter' => [ 'lowercase', 'suggest_shingle', 'reverse' ],
566                ],
567                'token_reverse' => [
568                    'type' => 'custom',
569                    'tokenizer' => 'no_splitting',
570                    'filter' => [ 'reverse' ]
571                ],
572                'near_match' => [
573                    'type' => 'custom',
574                    'char_filter' => [ 'near_space_flattener' ],
575                    'tokenizer' => 'no_splitting',
576                    'filter' => [ 'lowercase' ],
577                ],
578                'near_match_asciifolding' => [
579                    'type' => 'custom',
580                    'char_filter' => [ 'near_space_flattener' ],
581                    'tokenizer' => 'no_splitting',
582                    'filter' => [ 'truncate_keyword', 'lowercase', 'asciifolding' ],
583                ],
584                'prefix' => [
585                    'type' => 'custom',
586                    'char_filter' => [ 'near_space_flattener' ],
587                    'tokenizer' => 'prefix',
588                    'filter' => [ 'lowercase' ],
589                ],
590                'prefix_asciifolding' => [
591                    'type' => 'custom',
592                    'char_filter' => [ 'near_space_flattener' ],
593                    'tokenizer' => 'prefix',
594                    'filter' => [ 'lowercase', 'asciifolding' ],
595                ],
596                'word_prefix' => [
597                    'type' => 'custom',
598                    'tokenizer' => 'standard',
599                    'filter' => [ 'lowercase', 'prefix_ngram_filter' ],
600                ],
601                'keyword' => [
602                    'type' => 'custom',
603                    'tokenizer' => 'no_splitting',
604                    'filter' => [ 'truncate_keyword' ],
605                ],
606                'lowercase_keyword' => [
607                    'type' => 'custom',
608                    'tokenizer' => 'no_splitting',
609                    'filter' => [ 'truncate_keyword', 'lowercase' ],
610                ],
611                'trigram' => [
612                    'type' => 'custom',
613                    'tokenizer' => 'trigram',
614                    'filter' => [ 'lowercase' ],
615                ],
616            ],
617            'filter' => [
618                'suggest_shingle' => [
619                    'type' => 'shingle',
620                    'min_shingle_size' => 2,
621                    'max_shingle_size' => 3,
622                    'output_unigrams' => true,
623                ],
624                'lowercase' => [
625                    'type' => 'lowercase',
626                ],
627                'aggressive_splitting' => [
628                    'type' => 'word_delimiter_graph',
629                    'stem_english_possessive' => false,
630                    'preserve_original' => false
631                ],
632                'prefix_ngram_filter' => [
633                    'type' => 'edgeNGram',
634                    'max_gram' => CirrusSearch::MAX_TITLE_SEARCH,
635                ],
636                'asciifolding' => [
637                    'type' => 'asciifolding',
638                    'preserve_original' => false
639                ],
640                'asciifolding_preserve' => [
641                    'type' => 'asciifolding',
642                    'preserve_original' => true
643                ],
644                // The 'keyword' type in ES seems like a hack
645                // and doesn't allow normalization (like lowercase)
646                // prior to 5.2. Instead we consistently use 'text'
647                // and truncate where necessary.
648                'truncate_keyword' => [
649                    'type' => 'truncate',
650                    'length' => self::KEYWORD_IGNORE_ABOVE,
651                ],
652                'remove_empty' => [
653                    'type' => 'length',
654                    'min' => 1,
655                ],
656            ],
657            'tokenizer' => [
658                'prefix' => [
659                    'type' => 'edgeNGram',
660                    'max_gram' => CirrusSearch::MAX_TITLE_SEARCH,
661                ],
662                'no_splitting' => [ // Just grab the whole term.
663                    'type' => 'keyword',
664                ],
665                'trigram' => [
666                    'type' => 'nGram',
667                    'min_gram' => 3,
668                    'max_gram' => 3,
669                ],
670            ],
671            'char_filter' => [
672                // Flattens things that are space like to spaces in the near_match style analyzers
673                'near_space_flattener' => [
674                    'type' => 'limited_mapping',
675                    'mappings' => [
676                        "'=>\u0020", // Useful for finding names
677                        '\u2019=>\u0020', // Unicode right single quote
678                        '\u02BC=>\u0020', // Unicode modifier letter apostrophe
679                        '_=>\u0020', // MediaWiki loves _ and people are used to it but it
680                                     // usually means space
681                        '-=>\u0020', // Useful for finding hyphenated names unhyphenated
682                    ],
683                ],
684                // map narrow no-break space to plain space to compensate for ES6.x+
685                // analyzers generally not doing so
686                'nnbsp_norm' => [
687                    'type' => 'limited_mapping',
688                    'mappings' => [
689                        '\u202F=>\u0020',
690                    ],
691                ],
692                // Add a space between lowercase letter {Ll} and uppercase {Lu} or
693                // titlecase {Lt} letter, allowing for optional combining marks {M}
694                // or invisibles {Cf}. This is expensive, so use camelCase_splitter
695                // in extra-analysis-textify instead, if available (T219108/T346051)
696                'regex_camelCase' => [
697                    'type' => 'pattern_replace',
698                    'pattern' => '(\\p{Ll}[\\p{M}\\p{Cf}]*)([\\p{Lu}\\p{Lt}])',
699                    'replacement' => '$1 $2'
700                ],
701                // Replace period (regular or fullwidth) between [non-letter +
702                // letter] and [letter + non-letter]. This is slow, and also only
703                // handles the simplest case. Use acronym_fixer in
704                // extra-analysis-textify instead, if available (T170625/T346051)
705                'regex_acronym_fixer' => [
706                    'type' => 'pattern_replace',
707                    'pattern' => '(?<=(?:^|\\P{L})\\p{L})[.．](\\p{L})(?=\\P{L}|$)',
708                    'replacement' => '$1'
709                ],
710                // combine universally-applied mappings into one mapping to save on the
711                // overhead of calling multiple mappings
712                'globo_norm' => [
713                    'type' => 'mapping',
714                    'mappings' => [
715                        // map lots of apostrophe-like characters to apostrophe (T315118);
716                        // formerly apostrophe_norm
717                        "`=>'", // grave accent
718                        "´=>'", // acute accent
719                        "ʹ=>'", // modifier letter prime
720                        "ʻ=>'", // modifier letter turned comma
721                        "ʼ=>'", // modifier letter apostrophe
722                        "ʽ=>'", // modifier letter reversed comma
723                        "ʾ=>'", // modifier letter right half ring
724                        "ʿ=>'", // modifier letter left half ring
725                        "ˋ=>'", // modifier letter grave accent
726                        "՚=>'", // Armenian apostrophe
727                        "\u05F3=>'", // Hebrew punctuation geresh
728                        "‘=>'", // left single quotation mark
729                        "’=>'", // right single quotation mark
730                        "‛=>'", // single high-reversed-9 quotation mark
731                        "′=>'", // prime
732                        "‵=>'", // reversed prime
733                        "ꞌ=>'", // Latin small letter saltillo
734                        "＇=>'", // fullwidth apostrophe
735                        "｀=>'", // fullwidth grave accent
736                        // map narrow no-break space to plain space to compensate for ES6.x+
737                        // analyzers generally not doing so; copied from nnbsp_norm, which
738                        // is still needed elsewhere
739                        '\u202F=>\u0020',
740                        // Delete primary and secondary stress markers, which are
741                        // inconsistently used across phonetic transcriptions
742                        "ˈ=>", // modifier letter vertical line
743                        "ˌ=>", // modifier letter low vertical line
744                        // Delete Arabic tatweel (ـ) (used largely for cosmetic purposes)
745                        "\u0640=>", // tatweel
746                        // Convert Arabic thousand separator and Arabic comma to comma for
747                        // more consistent number parsing
748                        "٬=>,", // Arabic thousands separator
749                        "،=>,", // Arabic comma
750                        // delete Armenian emphasis marks, exclamation marks, and question
751                        // marks, since they modify words rather than follow them.
752                        "՛=>", // Armenian emphasis mark
753                        "՜=>", // Armenian exclamation mark
754                        "՞=>", // Armenian question mark
755                        // micro sign to mu, to prevent some unneeded ICU tokenizer splits
756                        // icu_normalizer does this, too, just later
757                        "µ=>μ",
758                        // Yiddish Ligatures (T362501)
759                        "\u05F0=>\u05D5\u05D5", // double vav
760                        "\u05F1=>\u05D5\u05D9", // vav yod
761                        "\u05F2=>\u05D9\u05D9", // double yod
762                        "\uFB1F=>\u05D9\u05D9\u05B7", // single char yod-yod-patah decomposed
763                        "\u05D9\u05B7\u05D9=>\u05D9\u05D9\u05B7", // rarer alternate order
764                    ],
765                ],
766                'arabic_extended_norm' => [
767                    'type' => 'limited_mapping',
768                    'mappings' => [
769                        '\uFB8E=>\u0643', '\uFB8F=>\u0643', '\uFB90=>\u0643', // kaf
770                        '\uFB91=>\u0643', '\u06A9=>\u0643', '\u06AA=>\u0643',
771                        '\uFEDB=>\u0643', '\uFEDC=>\u0643', '\uFED9=>\u0643',
772                        '\uFEDA=>\u0643',
773
774                        '\uFBFC=>\u064A', '\uFBFD=>\u064A', '\uFBFE=>\u064A', // yeh
775                        '\uFBFF=>\u064A', '\u06CC=>\u064A', '\uFBE8=>\u064A',
776                        '\uFBE9=>\u064A', '\uFEEF=>\u064A', '\uFEF0=>\u064A',
777                        '\u0649=>\u064A', '\u06CD=>\u064A', '\uFBE4=>\u064A',
778                        '\uFBE5=>\u064A', '\uFBE6=>\u064A', '\uFBE7=>\u064A',
779                        '\u06D0=>\u064A',
780
781                        '\uFBA6=>\u0647', '\uFBA7=>\u0647', '\uFBA8=>\u0647', // heh
782                        '\uFBA9=>\u0647', '\u06C1=>\u0647', '\u06C0=>\u0647',
783                        '\uFBA4=>\u0647', '\uFBA5=>\u0647', '\u06D5=>\u0647',
784                    ],
785                ],
786                // Converts things that don't always count as word breaks into spaces
787                // which (almost) always count as word breaks (e.g., the Nori and SmartCN
788                // tokenizers do not always count spaces as word breaks!)
789                'word_break_helper' => [
790                    'type' => 'limited_mapping',
791                    'mappings' => [
792                        '_=>\u0020',
793                        ':=>\u0020',
794                        // These are more useful for code:
795                        '.=>\u0020',
796                        '(=>\u0020',
797                        ')=>\u0020',
798                        // fullwidth variants
799                        '．=>\u0020',
800                        '＿=>\u0020',
801                        '：=>\u0020',
802                        // middle dot
803                        '·=>\u0020',
804                    ],
805                ],
806                'word_break_helper_source_text' => [
807                    'type' => 'limited_mapping',
808                    'mappings' => [
809                        '_=>\u0020',
810                        // These are more useful for code:
811                        '.=>\u0020',
812                        '(=>\u0020',
813                        ')=>\u0020',
814                        ':=>\u0020', // T145023
815                    ],
816                ],
817                'dotted_I_fix' => [
818                    // A common regression caused by unpacking is that İ is no longer
819                    // treated correctly, so specify the mapping just once and re-use
820                    // in analyzer/text/char_filter as needed.
821                    'type' => 'limited_mapping',
822                    'mappings' => [
823                        'İ=>I',
824                    ],
825                ],
826            ],
827        ];
828        foreach ( $defaults[ 'analyzer' ] as &$analyzer ) {
829            if ( $analyzer[ 'type' ] === 'default' ) {
830                $analyzer = [
831                    'type' => 'custom',
832                    'tokenizer' => 'standard',
833                    'filter' => [ 'lowercase' ],
834                ];
835            }
836        }
837        if ( $this->isTextifyAvailable() && $this->shouldActivateIcuTokenization( $language ) ) {
838            $defaults[ 'filter' ][ 'icutokrep_no_camel_split' ] = [
839                'type' => 'icu_token_repair',
840                'keep_camel_split' => false
841            ];
842        }
843        if ( $this->isIcuAvailable() ) {
844            $defaults[ 'filter' ][ 'icu_normalizer' ] = [
845                'type' => 'icu_normalizer',
846                'name' => 'nfkc_cf',
847            ];
848            $unicodeSetFilter = $this->getICUNormSetFilter( $language );
849            if ( $unicodeSetFilter !== null ) {
850                $defaults[ 'filter' ][ 'icu_normalizer' ][ 'unicodeSetFilter' ] = $unicodeSetFilter;
851            }
852        }
853
854        return $defaults;
855    }
856
857    /**
858     * Customize the default config for the language.
859     *
860     * @param array $config
861     * @param string $language Config language
862     * @return array
863     */
864    private function customize( $config, $language ) {
865        $langName = $this->getDefaultTextAnalyzerType( $language );
866        switch ( $langName ) {
867            // Please add languages in alphabetical order.
868
869            // usual unpacked languages
870            case 'basque':     // Unpack Basque analyzer T283366
871            case 'brazilian':  // Unpack Brazilian analyzer T325092
872            case 'bulgarian':  // Unpack Bulgarian analyzer T325090
873            case 'czech':      // Unpack Czech analyzer T284578
874            case 'danish':     // Unpack Danish analyzer T283366
875            case 'estonian':   // Unpack Estonian analyzer T332322
876            case 'finnish':    // Unpack Finnish analyzer T284578
877            case 'galician':   // Unpack Galician analyzer T284578
878            case 'hungarian':  // Unpack Hungarian analyzer T325089
879            case 'latvian':    // Unpack Latvian analyzer T325089
880            case 'lithuanian': // Unpack Lithuanian analyzer T325090
881            case 'norwegian':  // Unpack Norwegian analyzer T289612
882                $config = ( new AnalyzerBuilder( $langName ) )->
883                    withUnpackedAnalyzer()->
884                    build( $config );
885                break;
886
887            // usual unpacked languages, with "light" variant stemmer
888            case 'portuguese':  // Unpack Portuguese analyzer T281379
889            case 'spanish':     // Unpack Spanish analyzer T277699
890                $config = ( new AnalyzerBuilder( $langName ) )->
891                    withUnpackedAnalyzer()->
892                    withLightStemmer()->
893                    build( $config );
894                break;
895
896            // customized languages
897            case 'arabic':
898            case 'arabic-egyptian':
899            case 'arabic-moroccan':
900                // Unpack Arabic analyzer T294147
901                $arBuilder = ( new AnalyzerBuilder( 'arabic' ) )->
902                    withUnpackedAnalyzer()->
903                    withDecimalDigit()->
904                    insertFiltersBefore( 'arabic_stemmer', [ 'arabic_normalization' ] );
905
906                // load extra stopwords for Arabic
907                $arabicExtraStopwords = require __DIR__ . '/AnalysisLanguageData/arabicStopwords.php';
908                $arBuilder->withExtraStop( $arabicExtraStopwords, 'arabic_extra_stop', 'arabic_stop' );
909
910                $config = $arBuilder->build( $config );
911                break;
912            case 'armenian':  // Unpack Armenian analyzer T325089
913                // char map: Armenian uses ․ ("one-dot leader") about 10% as often as . (period)
914                // stopwords նաև & և get normalized to նաեւ & եւ, so pick those up, too.
915                $config = ( new AnalyzerBuilder( $langName ) )->
916                    withUnpackedAnalyzer()->
917                    withLimitedCharMap( [ '․=>.' ] )->
918                    withExtraStop( [ 'նաեւ', 'եւ' ], 'armenian_norm_stop', 'armenian_stop' )->
919                    build( $config );
920                break;
921            case 'azerbaijani':
922            case 'crimean-tatar':
923            case 'gagauz':
924            case 'kazakh':
925            case 'tatar':
926                // Turkic languages that use I/ı & İ/i, so need Turkish lowercasing
927                $config = ( new AnalyzerBuilder( $langName ) )->
928                    withFilters( [ 'lowercase' ] )->
929                    withLangLowercase( 'turkish' )->
930                    build( $config );
931                break;
932            case 'bengali': // Unpack Bengali analyzer T294067
933                $config = ( new AnalyzerBuilder( $langName ) )->
934                    withUnpackedAnalyzer()->
935                    withDecimalDigit()->
936                    insertFiltersBefore( 'bengali_stop', [ 'indic_normalization' ] )->
937                    build( $config );
938                break;
939            case 'bosnian':
940            case 'croatian':
941            case 'serbian':
942            case 'serbo-croatian':
943                // Unpack default analyzer to add Serbian stemming and custom folding
944                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T183015
945                // and https://www.mediawiki.org/wiki/User:TJones_(WMF)/T192395
946                $config = ( new AnalyzerBuilder( $langName ) )->
947                    withFilters( [ 'lowercase', 'asciifolding', 'serbian_stemmer' ] )->
948                    build( $config );
949                break;
950            case 'catalan':
951                // Unpack Catalan analyzer T283366
952                $config = ( new AnalyzerBuilder( $langName ) )->
953                    withUnpackedAnalyzer()->
954                    withElision( [ 'd', 'l', 'm', 'n', 's', 't' ] )->
955                    build( $config );
956                break;
957            case 'chinese':
958                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T158203
959                $config[ 'char_filter' ][ 'tsconvert' ] = [
960                    'type' => 'stconvert',
961                    'delimiter' => '#',
962                    'keep_both' => false,
963                    'convert_type' => 't2s',
964                ];
965
966                // char map: hack for STConvert errors (still present as of July 2023)
967                // see https://github.com/medcl/elasticsearch-analysis-stconvert/issues/13
968                // stop: SmartCN converts lots of punctuation to ',' but we don't want to index it
969                $config = ( new AnalyzerBuilder( $langName ) )->
970                    withCharMap( [ '\u606d\u5f18=>\u606d \u5f18', '\u5138=>\u3469' ], 'stconvertfix' )->
971                    withCharFilters( [ 'stconvertfix', 'tsconvert' ] )->
972                    withTokenizer( 'smartcn_tokenizer' )->
973                    withStop( [ ',' ], 'smartcn_stop' )->
974                    withFilters( [ 'smartcn_stop', 'lowercase' ] )->
975                    build( $config );
976
977                $config[ 'analyzer' ][ 'plain' ][ 'filter' ] = [ 'smartcn_stop', 'lowercase' ];
978                $config[ 'analyzer' ][ 'plain_search' ][ 'filter' ] =
979                    $config[ 'analyzer' ][ 'plain' ][ 'filter' ];
980                break;
981            case 'cjk':
982                // Unpack CJK analyzer T326822
983                // map (han)dakuten to combining forms or icu_normalizer will add spaces
984                $dakutenMap = [ '゛=>\u3099', '゜=>\u309a' ];
985
986                // cjk_bigram negates the benefits of the icu_tokenizer for CJK text. The
987                // icu_tokenizer also has a few bad side effects, so don't use it for cjk.
988                // Default cjk stop words are almost the same as _english_ (add s & t; drop
989                // an). Stop words are searchable via 'plain' anyway, so just use _english_
990                $config = ( new AnalyzerBuilder( 'cjk' ) )->
991                    withUnpackedAnalyzer()->
992                    withLimitedCharMap( $dakutenMap )->
993                    withTokenizer( self::STANDARD_TOKENIZER_ONLY )->
994                    withStop( '_english_' )->
995                    omitStemmer()->
996                    insertFiltersBefore( 'lowercase', [ 'cjk_width' ] )->
997                    insertFiltersBefore( 'cjk_stop', [ 'cjk_bigram' ] )->
998                    build( $config );
999                break;
1000            case 'dutch':
1001                // Unpack Dutch analyzer T281379
1002                $nlOverride = [ // these are in the default Dutch analyzer
1003                    'fiets=>fiets',
1004                    'bromfiets=>bromfiets',
1005                    'ei=>eier',
1006                    'kind=>kinder'
1007                ];
1008                $config = ( new AnalyzerBuilder( $langName ) )->
1009                    withUnpackedAnalyzer()->
1010                    withStemmerOverride( $nlOverride )->
1011                    build( $config );
1012                break;
1013            case 'english':
1014                // Replace English analyzer with a rebuilt copy with asciifolding inserted
1015                // before stemming
1016                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T142037
1017                $config = ( new AnalyzerBuilder( $langName ) )->
1018                    withExtraStemmer( 'possessive_english' )->
1019                    withStemmerOverride( 'guidelines => guideline', 'custom_stem' )->
1020                    withFilters( [ 'possessive_english', 'lowercase', 'stop', 'asciifolding',
1021                        'kstem', 'custom_stem' ] )->
1022                    build( $config );
1023
1024                // Add asciifolding_preserve to the plain analyzer as well (but not plain_search)
1025                $config[ 'analyzer' ][ 'plain' ][ 'filter' ][] = 'asciifolding_preserve';
1026                // Add asciifolding_preserve to the lowercase_keyword analyzer's filters
1027                $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve';
1028                break;
1029            case 'esperanto':
1030                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T202173
1031                $config = ( new AnalyzerBuilder( $langName ) )->
1032                    withFilters( [ 'lowercase', 'asciifolding', 'esperanto_stemmer' ] )->
1033                    build( $config );
1034                break;
1035            case 'french':
1036                // Add asciifolding_preserve to filters
1037                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T142620
1038                $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve';
1039
1040                $config = ( new AnalyzerBuilder( $langName ) )->
1041                    withUnpackedAnalyzer()->
1042                    withLimitedCharMap( [ '\u02BC=>\u0027' ] )->
1043                    withElision( [ 'l', 'm', 't', 'qu', 'n', 's', 'j', 'd', 'c',
1044                                    'jusqu', 'quoiqu', 'lorsqu', 'puisqu' ] )->
1045                    withLightStemmer()->
1046                    withAsciifoldingPreserve()->
1047                    build( $config );
1048                break;
1049            case 'german':
1050                // Unpack German analyzer T281379
1051                // char map: We have to explicitly map capital ẞ to lowercase ß
1052                $config = ( new AnalyzerBuilder( $langName ) )->
1053                    withUnpackedAnalyzer()->
1054                    withLimitedCharMap( [ 'ẞ=>ß' ] )->
1055                    withLightStemmer()->
1056                    insertFiltersBefore( 'german_stemmer', [ 'german_normalization' ] )->
1057                    build( $config );
1058
1059                $config[ 'analyzer' ][ 'plain' ][ 'char_filter' ][] = 'german_charfilter';
1060                $config[ 'analyzer' ][ 'plain_search' ][ 'char_filter' ][] = 'german_charfilter';
1061                break;
1062            case 'greek':
1063                $config = ( new AnalyzerBuilder( $langName ) )->
1064                    withUnpackedAnalyzer()->
1065                    omitAsciifolding()->
1066                    withLangLowercase()->
1067                    withRemoveEmpty()->
1068                    build( $config );
1069                break;
1070            case 'hebrew':
1071                $config = ( new AnalyzerBuilder( $langName ) )->
1072                    withTokenizer( 'hebrew' )->
1073                    withFilters( [ 'niqqud', 'hebrew_lemmatizer', 'remove_duplicates', 'lowercase',
1074                        'asciifolding' ] )->
1075                    build( $config );
1076                break;
1077            case 'hindi':
1078                // Unpack Hindi analyzer T289612
1079                $config = ( new AnalyzerBuilder( $langName ) )->
1080                    withUnpackedAnalyzer()->
1081                    withDecimalDigit()->
1082                    insertFiltersBefore( 'hindi_stop',
1083                        [ 'indic_normalization', 'hindi_normalization' ] )->
1084                    build( $config );
1085                break;
1086            case 'indonesian':
1087            case 'malay':
1088                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T196780
1089                $config = ( new AnalyzerBuilder( 'indonesian' ) )->
1090                    withUnpackedAnalyzer()->
1091                    omitAsciifolding()->
1092                    build( $config );
1093                break;
1094            case 'irish':
1095                $gaCharMap = [ 'ḃ=>bh', 'ċ=>ch', 'ḋ=>dh', 'ḟ=>fh', 'ġ=>gh', 'ṁ=>mh', 'ṗ=>ph',
1096                      'ṡ=>sh', 'ẛ=>sh', 'ṫ=>th', 'Ḃ=>BH', 'Ċ=>CH', 'Ḋ=>DH', 'Ḟ=>FH', 'Ġ=>GH',
1097                      'Ṁ=>MH', 'Ṗ=>PH', 'Ṡ=>SH', 'Ṫ=>TH' ];
1098
1099                // Add b, bh, g, m for camelCase cleanup
1100                $gaHyphenStop = [ 'h', 'n', 't', 'b', 'bh', 'g', 'm' ];
1101
1102                // Unpack Irish analyzer T289612
1103                // See also https://www.mediawiki.org/wiki/User:TJones_(WMF)/T217602
1104                $config = ( new AnalyzerBuilder( $langName ) )->
1105                    withUnpackedAnalyzer()->
1106                    withCharMap( $gaCharMap )->
1107                    withExtraStop( $gaHyphenStop, 'irish_hyphenation', 'irish_elision', true )->
1108                    withElision( [ 'd', 'm', 'b' ] )->
1109                    withLangLowercase()->
1110                    build( $config );
1111                break;
1112            case 'italian':
1113                // Replace the default Italian analyzer with a rebuilt copy with additional filters
1114                $itElision = [ 'c', 'l', 'all', 'dall', 'dell', 'nell', 'sull', 'coll', 'pell',
1115                    'gl', 'agl', 'dagl', 'degl', 'negl', 'sugl', 'un', 'm', 't', 's', 'v', 'd' ];
1116                $config = ( new AnalyzerBuilder( $langName ) )->
1117                    withUnpackedAnalyzer()->
1118                    withElision( $itElision )->
1119                    withLightStemmer()->
1120                    build( $config );
1121
1122                // Add asciifolding_preserve to the plain analyzer as well (but not plain_search)
1123                $config[ 'analyzer' ][ 'plain' ][ 'filter' ][] = 'asciifolding_preserve';
1124                // Add asciifolding_preserve to filters
1125                $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve';
1126                break;
1127            case 'japanese':
1128                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T166731
1129                // pre-convert fullwidth numbers because Kuromoji tokenizer treats them weirdly
1130                $config = ( new AnalyzerBuilder( $langName ) )->
1131                    withNumberCharFilter( 0xff10, 'fullwidthnumfix' )->
1132                    withCharFilters( [ 'fullwidthnumfix' ] )->
1133                    withTokenizer( 'kuromoji_tokenizer' )->
1134                    withFilters( [ 'kuromoji_baseform', 'cjk_width', 'ja_stop', 'kuromoji_stemmer',
1135                        'lowercase' ] )->
1136                    build( $config );
1137                break;
1138            case 'khmer':
1139                // See Khmer: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T185721
1140                $config = ( new AnalyzerBuilder( $langName ) )->
1141                    withNumberCharFilter( 0x17e0 )->
1142                    withCharFilters( [ 'khmer_syll_reorder', 'khmer_numbers' ] )->
1143                    withFilters( [ 'lowercase' ] )->
1144                    build( $config );
1145                break;
1146            case 'korean':
1147                // Unpack nori analyzer to add ICU normalization and custom filters
1148                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T206874
1149
1150                // Nori-specific character filter
1151                $noriMap = [
1152                    '\u00B7=>\u0020', // convert middle dot to space
1153                    '\u318D=>\u0020', // arae-a to space
1154                    '\u00AD=>', // remove soft hyphens
1155                    '\u200C=>', // remove zero-width non-joiners
1156                ];
1157
1158                // Nori-specific pattern_replace to strip combining diacritics
1159                $config[ 'char_filter' ][ 'nori_combo_filter' ] =
1160                    AnalyzerBuilder::patternFilter( '[\\u0300-\\u0331]' );
1161
1162                // 'mixed' mode keeps the original token plus the compound parts
1163                // the default is 'discard' which only keeps the parts
1164                $config[ 'tokenizer' ][ 'nori_tok' ] = [
1165                    'type' => 'nori_tokenizer',
1166                    'decompound_mode' => 'mixed',
1167                ];
1168
1169                // Nori-specific part of speech filter (add 'VCP', 'VCN', 'VX' to default)
1170                $config[ 'filter' ][ 'nori_posfilter' ] = [
1171                    'type' => 'nori_part_of_speech',
1172                    'stoptags' => [ 'E', 'IC', 'J', 'MAG', 'MAJ', 'MM', 'SP', 'SSC', 'SSO',
1173                        'SC', 'SE', 'XPN', 'XSA', 'XSN', 'XSV', 'UNA', 'NA', 'VSV', 'VCP',
1174                        'VCN', 'VX' ],
1175                ];
1176
1177                $config = ( new AnalyzerBuilder( $langName ) )->
1178                    withLimitedCharMap( $noriMap, 'nori_charfilter' )->
1179                    withCharFilters( [ 'nori_charfilter', 'nori_combo_filter' ] )->
1180                    withTokenizer( 'nori_tok' )->
1181                    withFilters( [ 'nori_posfilter', 'nori_readingform', 'lowercase',
1182                        'remove_empty' ] )->
1183                    build( $config );
1184                break;
1185            case 'mirandese':
1186                // Unpack default analyzer to add Mirandese-specific elision and stop words
1187                // See phab ticket T194941
1188                $mwlStopwords = require __DIR__ . '/AnalysisLanguageData/mirandeseStopwords.php';
1189                $config = ( new AnalyzerBuilder( $langName ) )->
1190                    withElision( [ 'l', 'd', 'qu' ] )->
1191                    withStop( $mwlStopwords )->
1192                    withFilters( [ 'lowercase', 'mirandese_elision', 'mirandese_stop' ] )->
1193                    build( $config );
1194                break;
1195            case 'persian': // Unpack Persian analyzer T325090
1196                $config = ( new AnalyzerBuilder( $langName ) )->
1197                    withUnpackedAnalyzer()->
1198                    withLimitedCharMap( [ '\u200C=>\u0020' ], 'zero_width_spaces' )->
1199                    withDecimalDigit()->
1200                    omitStemmer()->
1201                    insertFiltersBefore( 'persian_stop',
1202                        [ 'arabic_normalization', 'persian_normalization' ] )->
1203                    build( $config );
1204                break;
1205            case 'polish':
1206                // these are real stop words for Polish
1207                $plStopwords = require __DIR__ . '/AnalysisLanguageData/polishStopwords.php';
1208
1209                // Stempel-specific stop words--additional unreliable stems
1210                $stempelStopwords = [ 'ować', 'iwać', 'obić', 'snąć', 'ywać', 'ium', 'my', 'um' ];
1211
1212                // Stempel is statistical, and certain stems are really terrible, so we filter them
1213                // after stemming. See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T186046
1214                $config[ 'filter' ][ 'stempel_pattern_filter' ] =
1215                    AnalyzerBuilder::patternFilter( '^([a-zął]?[a-zćń]|..ć|\d.*ć)$' );
1216
1217                $config = ( new AnalyzerBuilder( $langName ) )->
1218                    withUnpackedAnalyzer()->
1219                    withStop( $plStopwords )->
1220                    omitStemmer()->
1221                    omitAsciiFolding()->
1222                    appendFilters( [ 'polish_stem', 'stempel_pattern_filter', 'remove_empty' ] )->
1223                    withExtraStop( $stempelStopwords, 'stempel_stop' )->
1224                    build( $config );
1225                break;
1226            case 'romanian':  // Unpack Romanian analyzer T325091 / T330893
1227                // Counterintuitively, we need to map correct s&t (with commas) to older
1228                // incorrect forms (with cedilla) so that the old Snowball stemmer (from before
1229                // comma forms were available) will work; also normalize versions with
1230                // combining diacritics to single characters.
1231                $cedillaMap = [
1232                    'ș=>ş', 's\u0326=>ş', 's\u0327=>ş', 'ț=>ţ', 't\u0326=>ţ', 't\u0327=>ţ',
1233                    'Ș=>Ş', 'S\u0326=>Ş', 'S\u0327=>Ş', 'Ț=>Ţ', 'T\u0326=>Ţ', 'T\u0327=>Ţ',
1234                ];
1235
1236                // Add stopword variants with modern commas instead of old cedillas so that
1237                // both are handled, regardless of the character mapping needed for the
1238                // stemmer. In the future, Lucene should update their stopwords and these will
1239                // be included.
1240                $roStopwords = require __DIR__ . '/AnalysisLanguageData/romanianStopwords.php';
1241
1242                $config = ( new AnalyzerBuilder( $langName ) )->
1243                    withUnpackedAnalyzer()->
1244                    withCharMap( $cedillaMap )->
1245                    withExtraStop( $roStopwords, 'ro_comma_stop', 'romanian_stemmer' )->
1246                    build( $config );
1247                break;
1248            case 'russian':
1249                // unpack built-in Russian analyzer and add character filter
1250                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T124592
1251                $ruCharMap = [
1252                    '\u0301=>',    // combining acute accent, only used to show stress T102298
1253                    '\u0435\u0308=>\u0435',    // T124592 fold ё=>е and Ё=>Е, with combining
1254                    '\u0415\u0308=>\u0415',    // diacritic...
1255                    '\u0451=>\u0435', // ... or precomposed
1256                    '\u0401=>\u0415',
1257                ];
1258                $config = ( new AnalyzerBuilder( $langName ) )->
1259                    withUnpackedAnalyzer()->
1260                    withCharMap( $ruCharMap )->
1261                    build( $config );
1262
1263                // add Russian character mappings to near_space_flattener, and convert it from
1264                // limited_mapping to mapping to handle multi-char maps
1265                $config[ 'char_filter' ][ 'near_space_flattener' ][ 'type' ] = 'mapping';
1266                array_push( $config[ 'char_filter' ][ 'near_space_flattener' ][ 'mappings' ],
1267                    ...$ruCharMap );
1268
1269                // Drop acute stress marks and fold ё=>е everywhere
1270                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T124592
1271                $config[ 'analyzer' ][ 'plain' ][ 'char_filter' ][] = 'russian_charfilter';
1272                $config[ 'analyzer' ][ 'plain_search' ][ 'char_filter' ][] = 'russian_charfilter';
1273
1274                $config[ 'analyzer' ][ 'suggest' ][ 'char_filter' ][] = 'russian_charfilter';
1275                $config[ 'analyzer' ][ 'suggest_reverse' ][ 'char_filter' ][] = 'russian_charfilter';
1276                break;
1277            case 'slovak':
1278                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T190815
1279                // and https://www.mediawiki.org/wiki/User:TJones_(WMF)/T223787
1280                $config = ( new AnalyzerBuilder( $langName ) )->
1281                    withFilters( [ 'lowercase', 'slovak_stemmer', 'asciifolding' ] )->
1282                    build( $config );
1283                break;
1284            case 'sorani':    // Unpack Sorani analyzer T325091
1285                $config = ( new AnalyzerBuilder( $langName ) )->
1286                    withUnpackedAnalyzer()->
1287                    withDecimalDigit()->
1288                    insertFiltersBefore( 'lowercase', [ 'sorani_normalization' ] )->
1289                    build( $config );
1290                break;
1291            case 'swedish':
1292                // Add asciifolding_preserve to lowercase_keyword
1293                // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T160562
1294                $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve';
1295
1296                // Unpack built-in swedish analyzer to add asciifolding_preserve
1297                $config = ( new AnalyzerBuilder( $langName ) )->
1298                    withUnpackedAnalyzer()->
1299                    withAsciifoldingPreserve()->
1300                    build( $config );
1301                break;
1302            case 'thai':
1303                // Unpack and improve Thai analyzer: T294147
1304                $thCharMap = [
1305                    '_=>\u0020', // split tokens on underscore ..
1306                    ';=>\u0020', // .. semicolon
1307                    ':=>\u0020', // .. colon
1308                    '·=>\u0020', // .. middle dot
1309                    '‧=>\u0020', // .. & hyphenation point
1310                    'ฃ=>ข', // replace obsolete ฃ
1311                    'ฅ=>ค', // replace obsolete ฅ
1312                    '\u0e4d\u0e32=>\u0e33', // compose nikhahit + sara aa = sara am
1313                    '\u0e4d\u0e48\u0e32=>\u0e48\u0e33', // recompose sara am split around..
1314                    '\u0e4d\u0e49\u0e32=>\u0e49\u0e33', // .. other diacritics
1315                    '\u0e33\u0e48=>\u0e48\u0e33', // sara am should consistently..
1316                    '\u0e33\u0e49=>\u0e49\u0e33', // .. come after other diacritics
1317                    '\u0E34\u0E4D=>\u0E36', // compose sara i + nikhahit = sara ue..
1318                    '\u0E4D\u0E34=>\u0E36', // .. in either order
1319                ];
1320
1321                // instantiate basic unpacked analyzer builder, plus thai tokenizer by default
1322                $thBuilder = ( new AnalyzerBuilder( $langName ) )
1323                    ->withUnpackedAnalyzer()
1324                    ->withTokenizer( 'thai' );
1325
1326                if ( $this->isIcuAvailable() ) {
1327                    // ICU tokenizer is preferred in general. If it is available, replace
1328                    // default tokenizer. Also add thai_repl_pat char filter to accommodate
1329                    // some of its weaknesses.
1330                    $thBuilder->withTokenizer( $this->icu_tokenizer );
1331
1332                    $thaiLetterPat = '[ก-๏]'; // Thai characters, except for digits.
1333                    $config[ 'char_filter' ][ 'thai_repl_pat' ] =
1334                        // break between any digits and Thai letters, or vice versa
1335                        // break *Thai* tokens on periods (by making them spaces)
1336                        // (regex look-behind is okay, but look-ahead breaks offsets)
1337                        AnalyzerBuilder::patternFilter( "(?<=\\p{Nd})($thaiLetterPat)" .
1338                            "|(?<=$thaiLetterPat)(\\p{Nd})" .
1339                            "|(?<=$thaiLetterPat)\.($thaiLetterPat)",
1340                            ' $1$2$3' );
1341                    $thBuilder->withCharFilters( [ 'thai_repl_pat' ] );
1342
1343                    // if icu_token_repair (in the textify plugin) is available, we need a
1344                    // reverse number map so it doesn't rejoin split-off Arabic numbers.
1345                    if ( $this->isTextifyAvailable() ) {
1346                        $thBuilder->withReversedNumberCharFilter( 0x0e50 );
1347                    }
1348                } else {
1349                    // if we have to settle for the Thai tokenizer, add some additional
1350                    // character filters to accommodate some of its weaknesses
1351                    $thThaiTokSplits = [
1352                        '\u200B=>', // delete zero width space
1353                        '-=>\u0020', // split tokens on hyphen-minus ..
1354                        '‐=>\u0020', // .. hyphen
1355                        '–=>\u0020', // .. en dash
1356                        '—=>\u0020', // .. em dash
1357                        '―=>\u0020', // .. horizontal bar
1358                        '-=>\u0020', // .. fullwidth hyphen
1359                        '"=>\u0020', // .. & double quote
1360                    ];
1361                    array_push( $thCharMap, ...$thThaiTokSplits );
1362                }
1363
1364                // add in the rest of the bits that are always needed, and build
1365                $config = $thBuilder->withCharMap( $thCharMap )->
1366                    withDecimalDigit()->
1367                    omitStemmer()->
1368                    build( $config );
1369                break;
1370            case 'turkish':
1371                $trAposFilter = 'apostrophe';
1372                if ( in_array( 'extra-analysis-turkish', $this->plugins ) ) {
1373                    $trAposFilter = 'better_apostrophe';
1374                }
1375                $config = ( new AnalyzerBuilder( $langName ) )->
1376                    withUnpackedAnalyzer()->
1377                    withLangLowercase()->
1378                    insertFiltersBefore( 'turkish_stop', [ $trAposFilter ] )->
1379                    build( $config );
1380                break;
1381            case 'ukrainian-unpacked':
1382                $this->languagesWithIcuFolding['uk'] = true;
1383                $ukCharMap = [
1384                    '‘=>\'', // normalize apostrophes
1385                    '’=>\'',
1386                    '`=>\'',
1387                    '´=>\'',
1388                    'ʼ=>\'',
1389                    '\u0301=>', // delete combining acute and soft hyphen
1390                    '\u00AD=>',
1391                    'ґ=>г', // normalize ghe with upturn
1392                    'Ґ=>Г',
1393                ];
1394                // lowercase twice because stopwords are case sensitive, and the stemmer
1395                // generates some output with uppercase initial letters, even for
1396                // lowercase input (usually proper names)
1397                $ukFilters = [ 'lowercase', 'ukrainian_stop', 'ukrainian_stemmer',
1398                               'lowercase', 'remove_duplicates', 'asciifolding' ];
1399                $config = ( new AnalyzerBuilder( 'ukrainian' ) )->
1400                    withLimitedCharMap( $ukCharMap )->
1401                    withCharFilters( [ 'ukrainian_charfilter' ] )->
1402                    withFilters( $ukFilters )->
1403                    build( $config );
1404                break;
1405            default:
1406                // do nothing--default config is already set up
1407                break;
1408        }
1409
1410        // text_search is just a copy of text
1411        // @phan-suppress-next-line PhanTypeInvalidDimOffset
1412        $config[ 'analyzer' ][ 'text_search' ] = $config[ 'analyzer' ][ 'text' ];
1413
1414        // replace lowercase filters with icu_normalizer filter
1415        if ( $this->isIcuAvailable() ) {
1416            foreach ( $config[ 'analyzer' ] as &$analyzer ) {
1417                if ( !isset( $analyzer[ 'filter'  ] ) ) {
1418                    continue;
1419                }
1420
1421                $tmpFilters = [];
1422                foreach ( $analyzer[ 'filter' ] as $filter ) {
1423                    if ( $filter === 'lowercase' ) {
1424                        // If lowercase filter has language-specific processing, keep it,
1425                        // and do it before ICU normalization, particularly for Greek,
1426                        // Irish, and Turkish
1427                        // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T203117
1428                        // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T217602
1429                        if ( isset( $config[ 'filter' ][ 'lowercase' ][ 'language' ] ) ) {
1430                            $tmpFilters[] = 'lowercase';
1431                        }
1432                        $tmpFilters[] = 'icu_normalizer';
1433                    } else {
1434                        $tmpFilters[] = $filter;
1435                    }
1436                }
1437                $analyzer[ 'filter' ] = $tmpFilters;
1438
1439            }
1440        }
1441
1442        return $config;
1443    }
1444
1445    /**
1446     * Workaround for https://issues.apache.org/jira/browse/LUCENE-7468
1447     * The preserve_original duplicates token even if they are
1448     * not modified, leading to more space used and wrong term frequencies.
1449     * Workaround is to append a unique filter to remove the dups.
1450     * (made public for unit tests)
1451     *
1452     * @param mixed[] $config
1453     * @return mixed[] update mapping
1454     */
1455    public function fixAsciiFolding( array $config ) {
1456        $needDedupFilter = false;
1457        foreach ( $config[ 'analyzer' ] as $name => &$value ) {
1458            if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) {
1459                continue;
1460            }
1461            if ( !isset( $value[ 'filter' ] ) ) {
1462                continue;
1463            }
1464            $ascii_idx = array_search( 'asciifolding_preserve', $value[ 'filter' ] );
1465            if ( $ascii_idx !== false ) {
1466                $needDedupFilter = true;
1467                array_splice( $value[ 'filter' ], $ascii_idx + 1, 0, [ 'dedup_asciifolding' ] );
1468            }
1469        }
1470        if ( $needDedupFilter ) {
1471            $config[ 'filter' ][ 'dedup_asciifolding' ] = [
1472                'type' => 'unique',
1473                'only_on_same_position' => true,
1474            ];
1475        }
1476        return $config;
1477    }
1478
1479    /**
1480     * Pick the appropriate default analyzer based on the language.  Rather than think of
1481     * this as per language customization you should think of this as an effort to pick a
1482     * reasonably default in case CirrusSearch isn't customized for the language.
1483     *
1484     * @param string $language Config language
1485     * @return string the analyzer type
1486     */
1487    public function getDefaultTextAnalyzerType( $language ) {
1488        // If we match a language exactly, use it
1489        if ( array_key_exists( $language, $this->elasticsearchLanguageAnalyzers ) ) {
1490            return $this->elasticsearchLanguageAnalyzers[ $language ];
1491        }
1492
1493        return 'default';
1494    }
1495
1496    /**
1497     * Get list of filters that are mentioned in analyzers but not defined
1498     * explicitly.
1499     * @param array[] &$config Full configuration array
1500     * @param string[] $analyzers List of analyzers to consider.
1501     * @return array List of default filters, each containing only filter type
1502     */
1503    private function getDefaultFilters( array &$config, array $analyzers ) {
1504        $defaultFilters = [];
1505        foreach ( $analyzers as $analyzer ) {
1506            if ( empty( $config[ 'analyzer' ][ $analyzer ][ 'filter' ] ) ) {
1507                continue;
1508            }
1509            foreach ( $config[ 'analyzer' ][ $analyzer ][ 'filter' ] as $filterName ) {
1510                if ( !isset( $config[ 'filter' ][ $filterName ] ) ) {
1511                    // This is default definition for the built-in filter
1512                    $defaultFilters[ $filterName ] = [ 'type' => $filterName ];
1513                }
1514            }
1515        }
1516        return $defaultFilters;
1517    }
1518
1519    /**
1520     * Check every filter in the config - if it's the same as in old config,
1521     * ignore it. If it has the same name, but different content - create new filter
1522     * with different name by prefixing it with language code.
1523     *
1524     * @param array[] &$config Configuration being processed
1525     * @param array[] $standardFilters Existing filters list
1526     * @param array[] $defaultFilters List of default filters already mentioned in the config
1527     * @param string $prefix Prefix for disambiguation
1528     * @return array[] The list of filters not in the old config.
1529     */
1530    private function resolveFilters( array &$config, array $standardFilters, array $defaultFilters,
1531            string $prefix ) {
1532        $resultFilters = [];
1533        foreach ( $config[ 'filter' ] as $name => $filter ) {
1534            $existingFilter = $standardFilters[$name] ?? $defaultFilters[$name] ?? null;
1535            if ( $existingFilter ) { // Filter with this name already exists
1536                if ( $existingFilter != $filter ) {
1537                    // filter with the same name but different config - need to
1538                    // rename by adding prefix
1539                    $newName = $prefix . '_' . $name;
1540                    $this->replaceFilter( $config, $name, $newName );
1541                    $resultFilters[ $newName ] = $filter;
1542                }
1543            } else {
1544                $resultFilters[ $name ] = $filter;
1545            }
1546        }
1547        return $resultFilters;
1548    }
1549
1550    /**
1551     * Replace certain filter name in all configs with different name.
1552     * @param array[] &$config Configuration being processed
1553     * @param string $oldName
1554     * @param string $newName
1555     */
1556    private function replaceFilter( array &$config, $oldName, $newName ) {
1557        foreach ( $config[ 'analyzer' ] as &$analyzer ) {
1558            if ( !isset( $analyzer[ 'filter' ] ) ) {
1559                continue;
1560            }
1561            $analyzer[ 'filter' ] = array_map( static function ( $filter ) use ( $oldName, $newName ) {
1562                if ( $filter === $oldName ) {
1563                    return $newName;
1564                }
1565                return $filter;
1566            }, $analyzer[ 'filter' ] );
1567        }
1568    }
1569
1570    /**
1571     * Merge per-language config into the main config.
1572     * It will copy specific analyzer and all dependant filters and char_filters.
1573     * @param array &$config Main config
1574     * @param array $langConfig Per-language config
1575     * @param string $name Name for analyzer whose config we're merging
1576     * @param string $prefix Prefix for this configuration
1577     */
1578    private function mergeConfig( array &$config, array $langConfig, $name, $prefix ) {
1579        $analyzer = $langConfig[ 'analyzer' ][ $name ];
1580        $config[ 'analyzer' ][ $prefix . '_' . $name ] = $analyzer;
1581        if ( !empty( $analyzer[ 'filter' ] ) ) {
1582            // Add private filters for this analyzer
1583            foreach ( $analyzer[ 'filter' ] as $filter ) {
1584                // Copy filters that are in language config but not in the main config.
1585                // We would not copy the same filter into the main config since due to
1586                // the resolution step we know they are the same (otherwise we would have
1587                // renamed it).
1588                if ( isset( $langConfig[ 'filter' ][ $filter ] ) &&
1589                    !isset( $config[ 'filter' ][ $filter ] ) ) {
1590                    $config[ 'filter' ][ $filter ] = $langConfig[ 'filter' ][ $filter ];
1591                }
1592            }
1593        }
1594        if ( !empty( $analyzer[ 'char_filter' ] ) ) {
1595            // Add private char_filters for this analyzer
1596            foreach ( $analyzer[ 'char_filter' ] as $filter ) {
1597                // Copy char_filters that are in lang config but not in the main config.
1598                // Need to check whether the filter exists in langConfig because some
1599                // non-configurable filters are defined in plugins and do not have a
1600                // local definition (e.g., camelCase_splitter)
1601                if ( isset( $langConfig[ 'char_filter' ][ $filter ] ) &&
1602                    !isset( $config[ 'char_filter' ][ $filter ] ) ) {
1603                    $config[ 'char_filter' ][ $filter ] = $langConfig[ 'char_filter' ][ $filter ];
1604                }
1605            }
1606        }
1607        if ( !empty( $analyzer[ 'tokenizer' ] ) ) {
1608            $tokenizer = $analyzer[ 'tokenizer' ];
1609            if ( isset( $langConfig[ 'tokenizer' ][ $tokenizer ] ) &&
1610                    !isset( $config[ 'tokenizer' ][ $tokenizer ] ) ) {
1611                $config[ 'tokenizer' ][ $tokenizer ] = $langConfig[ 'tokenizer' ][ $tokenizer ];
1612            }
1613        }
1614    }
1615
1616    /**
1617     * Create per-language configs for specific analyzers which separates and namespaces
1618     * filters that are different between languages.
1619     * @param array &$config Existing config, will be modified
1620     * @param string[] $languages List of languages to process
1621     * @param string[] $analyzers List of analyzers to process
1622     */
1623    public function buildLanguageConfigs( array &$config, array $languages, array $analyzers ) {
1624        $defaultFilters = $this->getDefaultFilters( $config, $analyzers );
1625        foreach ( $languages as $lang ) {
1626            $langConfig = $this->buildConfig( $lang );
1627            $defaultFilters += $this->getDefaultFilters( $langConfig, $analyzers );
1628        }
1629        foreach ( $languages as $lang ) {
1630            $langConfig = $this->buildConfig( $lang );
1631            // Analyzer is: tokenizer + filter + char_filter
1632            // Char filters & Tokenizers are nicely namespaced
1633            // Filters are NOT - e.g. lowercase & icu_folding filters are different for different
1634            // languages! So we need to do some disambiguation here.
1635            $langConfig[ 'filter' ] =
1636                $this->resolveFilters( $langConfig, $config[ 'filter' ], $defaultFilters, $lang );
1637            // Merge configs
1638            foreach ( $analyzers as $analyzer ) {
1639                $this->mergeConfig( $config, $langConfig, $analyzer, $lang );
1640            }
1641        }
1642    }
1643
    /**
     * Accessor for the builder's ICU flag. Within this class the flag decides,
     * among other things, whether 'lowercase' filters get replaced by
     * 'icu_normalizer'.
     *
     * @return bool true if the icu analyzer is available.
     */
    public function isIcuAvailable() {
        return $this->icu;
    }
1650
    /**
     * Accessor for the builder's textify flag. Within this class the flag is
     * consulted, for example, by the Thai customization before it adds a
     * reversed number char filter.
     *
     * @return bool true if the textify plugin is available.
     */
    public function isTextifyAvailable() {
        return $this->textify;
    }
1657
    /**
     * Update a language's config with the global custom filters (e.g., homoglyph
     * & nnbsp filters).
     *
     * The work is delegated to GlobalCustomFilter::enableGlobalCustomFilters(),
     * which is also handed this builder's list of global custom filters and its
     * list of plugins.
     *
     * @param mixed[] $config
     * @param string $language language to add plugin to
     * @return mixed[] updated config
     */
    public function enableGlobalCustomFilters( array $config, string $language ) {
        return GlobalCustomFilter::enableGlobalCustomFilters( $config, $language,
            $this->globalCustomFilters, $this->plugins );
    }
1669
    /**
     * Languages for which we have a custom analysis chain (Elastic built-in or our
     * own custom analysis), as a map of language code => analyzer type. All other
     * languages default to the default analyzer which isn't too good.
     *
     * The entries are kept roughly in alphabetical order by value (analyzer
     * type); a few of them (e.g., cjk, latvian, thai) are not in strict order.
     *
     * The Elastic list is sourced from
     * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html
     *
     * @var string[]
     */
    private $elasticsearchLanguageAnalyzers = [
        'ar' => 'arabic',
        'ary' => 'arabic-moroccan',
        'arz' => 'arabic-egyptian',
        'hy' => 'armenian',
        'az' => 'azerbaijani',
        'eu' => 'basque',
        'bn' => 'bengali',
        'pt-br' => 'brazilian',
        'bg' => 'bulgarian',
        'ca' => 'catalan',
        'crh' => 'crimean-tatar',
        'ja' => 'cjk',
        'ko' => 'cjk',
        'cs' => 'czech',
        'da' => 'danish',
        'nl' => 'dutch',
        'en' => 'english',
        'en-ca' => 'english',
        'en-gb' => 'english',
        'simple' => 'english',
        'et' => 'estonian',
        'fi' => 'finnish',
        'fr' => 'french',
        'gag' => 'gagauz',
        'gl' => 'galician',
        'de' => 'german',
        'el' => 'greek',
        'hi' => 'hindi',
        'hu' => 'hungarian',
        'id' => 'indonesian',
        'ga' => 'irish',
        'it' => 'italian',
        'kk' => 'kazakh',
        'lt' => 'lithuanian',
        'lv' => 'latvian',
        'ms' => 'malay',
        'mwl' => 'mirandese',
        'nb' => 'norwegian',
        'nn' => 'norwegian',
        'no' => 'norwegian',
        'fa' => 'persian',
        'pt' => 'portuguese',
        'ro' => 'romanian',
        'ru' => 'russian',
        'ckb' => 'sorani',
        'es' => 'spanish',
        'sv' => 'swedish',
        'tt' => 'tatar',
        'tr' => 'turkish',
        'th' => 'thai',
    ];
1731
    /**
     * @var bool[] indexed by language code, languages where ICU folding
     * can be enabled by default. This is not a constant list: the
     * 'ukrainian-unpacked' customization adds 'uk' to it at runtime.
     */
    private $languagesWithIcuFolding = [
        'ar' => true,
        'ary' => true,
        'arz' => true,
        'bg' => true,
        'bn' => true,
        'bs' => true,
        'ca' => true,
        'ckb' => true,
        'cs' => true,
        'da' => true,
        'de' => true,
        'el' => true,
        'en' => true,
        'en-ca' => true,
        'en-gb' => true,
        'simple' => true,
        'eo' => true,
        'es' => true,
        'et' => true,
        'eu' => true,
        'fa' => true,
        'fi' => true,
        'fr' => true,
        'ga' => true,
        'gl' => true,
        'he' => true,
        'hi' => true,
        'hr' => true,
        'hu' => true,
        'hy' => true,
        'ja' => true,
        'lt' => true,
        'lv' => true,
        'nb' => true,
        'nl' => true,
        'nn' => true,
        'no' => true,
        'pt' => true,
        'pt-br' => true,
        'ro' => true,
        'ru' => true,
        'sh' => true,
        'sk' => true,
        'sr' => true,
        'sv' => true,
        'th' => true,
        'tr' => true,
    ];
1785
    /**
     * @var bool[] indexed by language code, indicates whether languages should always
     * replace the standard tokenizer with the icu_tokenizer by default (true), or should
     * never use any version of the icu_tokenizer, even when icu_token_repair is
     * available (false). (Reminder to future readers of this code: languages with
     * non-standard tokenizers in the text field, like zh/Chinese, still use icu_tokenizer
     * in the plain fields & suggest fields.)
     */
    private $languagesWithIcuTokenization = [
        // true => use any version of icu_tokenizer available over the standard tokenizer
        'bo' => true,
        'dz' => true,
        'gan' => true,
        'ja' => true,
        'km' => true,
        'lo' => true,
        'my' => true,
        'th' => true,
        'wuu' => true,
        'zh' => true,
        'lzh' => true, // zh-classical
        'zh-classical' => true, // deprecated code for lzh
        'yue' => true, // zh-yue
        'zh-yue' => true, // deprecated code for yue
        // The languages listed below may use mixed scripts
        'bug' => true,
        'cdo' => true,
        'cr' => true,
        'hak' => true,
        'jv' => true,
        'nan' => true, // zh-min-nan
        'zh-min-nan' => true, // deprecated code for nan

        // false => do not use any version of icu_tokenizer (i.e., textify_icu_tokenizer)
        // over the standard tokenizer, even when icu_token_repair is available
        // 'xyz' => false, // <-- example entry for now, since there are no actual instances
    ];
1823
    /**
     * Analyzers that are only usable when certain plugins are present.
     * Each key names the required plugin(s); each value maps language code =>
     * analyzer type, in the same shape as the language analyzer map.
     *
     * @var array[]
     */
    private $elasticsearchLanguageAnalyzersFromPlugins = [
        /**
         * Multiple required plugins can be given in one key, comma separated.
         *
         * Polish: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T154517
         * Ukrainian: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T160106
         * Chinese: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T158203
         * Hebrew: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T162741
         * Serbian: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T183015
         * Bosnian, Croatian, and Serbo-Croatian:
         *    https://www.mediawiki.org/wiki/User:TJones_(WMF)/T192395
         * Slovak: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T190815
         * Esperanto: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T202173
         * Korean: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T206874
         * Khmer: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T185721
         *
         * extra-analysis-ukrainian should follow analysis-ukrainian, so that
         * ukrainian-unpacked can overwrite the value for uk if both are present.
         */

        'analysis-stempel' => [ 'pl' => 'polish' ],
        'analysis-kuromoji' => [ 'ja' => 'japanese' ],
        'analysis-stconvert,analysis-smartcn' => [ 'zh' => 'chinese' ],
        'analysis-hebrew' => [ 'he' => 'hebrew' ],
        'analysis-ukrainian' => [ 'uk' => 'ukrainian' ],
        'extra-analysis-ukrainian' => [ 'uk' => 'ukrainian-unpacked' ],
        'extra-analysis-esperanto' => [ 'eo' => 'esperanto' ],
        'extra-analysis-serbian' => [ 'bs' => 'bosnian', 'hr' => 'croatian',
            'sh' => 'serbo-croatian', 'sr' => 'serbian' ],
        'extra-analysis-slovak' => [ 'sk' => 'slovak' ],
        'analysis-nori' => [ 'ko' => 'korean' ],
        'extra-analysis-khmer' => [ 'km' => 'khmer' ],
    ];
1860
1861    /**
1862     * Set up global custom filters
1863     *
1864     * @return array
1865     */
1866    private static function buildGlobalCustomFilters(): array {
1867        $gcf = [
1868            //////////////////////////
1869            // char filters
1870            'globo_norm' => new GlobalCustomFilter( 'char_filter' ),
1871
1872            'acronym_fixer' => ( new GlobalCustomFilter( 'char_filter' ) )->
1873                // follow armenian_charfilter, which normalizes another period-like
1874                // character, if it is being used
1875                setRequiredPlugins( [ 'extra-analysis-textify' ] )->
1876                setFallbackFilter( 'regex_acronym_fixer' )->
1877                setMustFollowFilters( [ 'armenian_charfilter' ] ),
1878
1879            'camelCase_splitter' => ( new GlobalCustomFilter( 'char_filter' ) )->
1880                // camelCase should generally follow acronyms so a.c.r.o.C.a.m.e.l.
1881                // is treated the same as acroCamel (real example: G.m.b.H. vs GmbH)
1882                setRequiredPlugins( [ 'extra-analysis-textify' ] )->
1883                setFallbackFilter( 'regex_camelCase' )->
1884                setMustFollowFilters( [ 'acronym_fixer', 'regex_acronym_fixer' ] ),
1885
1886            'word_break_helper' => ( new GlobalCustomFilter( 'char_filter' ) )->
1887                // * acronyms should be fixed before converting period to spaces
1888                // * follow armenian_charfilter, which normalizes another period-like
1889                //   character, if it is being used
1890                setMustFollowFilters( [ 'acronym_fixer', 'regex_acronym_fixer',
1891                    'armenian_charfilter' ] )->
1892                setLanguageDenyList( [ 'ko', 'zh' ] ),
1893
1894            'dotted_I_fix' => ( new GlobalCustomFilter( 'char_filter' ) )->
1895                // - if lowercase is present (because analysis-icu is not available, or
1896                // as a language-specific version) we don't need dotted_I_fix, because
1897                // lowercase prevents the problem.
1898                // - if icu_folding is present, we don't need dotted_I_fix, because
1899                // icu_folding also fixes it.
1900                setDisallowedTokenFilters( [ 'lowercase', 'icu_folding' ] ),
1901
1902            'arabic_extended_norm' => ( new GlobalCustomFilter( 'char_filter' ) )->
1903                // Mappings that are best for Arabic and Persian; default for any other
1904                // language except Sorani (ckb), which prefers Persian characters and
1905                // has it's own mapping (TT72899)
1906                setLanguageDenyList( [ 'ckb' ] ),
1907
1908            //////////////////////////
1909            // token filters
1910            'icu_token_repair' => ( new GlobalCustomFilter( 'filter' ) )->
1911                // apply icu_token_repair to icu_tokenizer-using analyzers
1912                // (default == text & text_search)
1913                setRequiredPlugins( [ 'extra-analysis-textify' ] )->
1914                setRequiredTokenizer( 'textify_icu_tokenizer' ),
1915
1916            'icutokrep_no_camel_split' => ( new GlobalCustomFilter( 'filter' ) )->
1917                // apply icu_token_repair variant to non-camelCase-splitting
1918                // icu_tokenizer-using analyzers when textify_icu_tokenizer is used
1919                setRequiredPlugins( [ 'extra-analysis-textify' ] )->
1920                setApplyToAnalyzers( [ 'plain', 'plain_search', 'suggest', 'suggest_reverse',
1921                    'source_text_plain', 'source_text_plain_search', 'word_prefix' ] )->
1922                setRequiredTokenizer( 'textify_icu_tokenizer' ),
1923
1924            'homoglyph_norm' => ( new GlobalCustomFilter( 'filter' ) )->
1925                // aggressive_splitting has weird graph problems and creating
1926                // multiple tokens makes it blow up
1927                setRequiredPlugins( [ 'extra-analysis-homoglyph' ] )->
1928                setMustFollowFilters( [ 'aggressive_splitting' ] ),
1929        ];
1930        // reverse the array so that items are ordered (approximately, modulo incompatible
1931        // filters) in the order specified here
1932        return array_reverse( $gcf );
1933    }
1934
1935}