Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
78.61% covered (warning)
78.61%
147 / 187
40.00% covered (danger)
40.00%
2 / 5
CRAP
0.00% covered (danger)
0.00%
0 / 1
SuggesterAnalysisConfigBuilder
78.61% covered (warning)
78.61%
147 / 187
40.00% covered (danger)
40.00%
2 / 5
21.17
0.00% covered (danger)
0.00%
0 / 1
 defaults
79.43% covered (warning)
79.43%
112 / 141
0.00% covered (danger)
0.00%
0 / 1
6.31
 customize
76.19% covered (warning)
76.19%
32 / 42
0.00% covered (danger)
0.00%
0 / 1
10.09
 buildConfig
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 getDefaultStopSet
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 hasStopWords
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2/**
3 * @license GPL-2.0-or-later
4 */
5
6namespace CirrusSearch\Maintenance;
7
/**
 * Builds search analysis config arrays for the completion suggester
 * index.
 *
 * The config is assembled in two steps: defaults() produces the
 * language-independent skeleton (char filters, token filters and
 * analyzers) and customize() then applies the per-language stop word
 * set and a few language specific char filters.
 */
class SuggesterAnalysisConfigBuilder extends AnalysisConfigBuilder {
    public const VERSION = "1.4";

    /**
     * Build an analysis config with sane defaults
     *
     * The returned array has three top level keys: 'char_filter',
     * 'filter' and 'analyzer'. ICU based components replace the default
     * ones only when the corresponding checks on this builder
     * (isIcuAvailable(), shouldActivateIcuFolding(),
     * shouldActivateIcuTokenization()) say so.
     *
     * @param string $language Config language
     * @return array
     */
    protected function defaults( $language ) {
        // Use default lowercase filter, or the ICU normalizer when available
        $lowercase_type = $this->isIcuAvailable()
            ? [
                "type" => "icu_normalizer",
                "name" => "nfkc_cf",
            ]
            : [ 'type' => 'lowercase' ];

        // Use the default Lucene ASCII filter
        $folding_type = [ 'type' => 'asciifolding' ];
        if ( $this->shouldActivateIcuFolding( $language ) ) {
            // Use ICU Folding if the plugin is available and activated in the config
            $folding_type = [ 'type' => 'icu_folding' ];
            $unicodeSetFilter = $this->getICUSetFilter( $language );
            if ( $unicodeSetFilter !== null ) {
                $folding_type['unicodeSetFilter'] = $unicodeSetFilter;
            }
        }

        // We cannot use the icu_tokenizer for plain here
        // even if icu tokenization is mostly needed for languages
        // where space is not used to break words. We don't want
        // to break some punctuation chars like ':'
        $plainTokenizer = 'whitespace';
        $textTokenizer = $this->shouldActivateIcuTokenization( $language )
            ? 'icu_tokenizer'
            : 'standard';

        // Analyzer that keeps stop words. It is used as-is for
        // stop_analyzer_search and, when enabled, for both subphrases
        // analyzers. PHP arrays are copied by value so sharing the
        // definition is safe.
        $keepStopWordsAnalyzer = [
            "type" => "custom",
            "filter" => [
                "lowercase",
                "accentfolding",
                "remove_empty",
                "token_limit"
            ],
            "tokenizer" => $textTokenizer,
        ];

        // Analyzer shared by plain and plain_search
        $plainAnalyzer = [
            "type" => "custom",
            "char_filter" => [ 'word_break_helper' ],
            "filter" => [
                "remove_empty",
                "token_limit",
                "lowercase"
            ],
            "tokenizer" => $plainTokenizer,
        ];

        $defaults = [
            'char_filter' => [
                'word_break_helper' => [
                    'type' => 'mapping',
                    'mappings' => [
                        '_=>\u0020', // a space for mw
                        ',=>\u0020', // useful for "Lastname, Firstname"
                        '"=>\u0020', // " certainly phrase search?
                        '-=>\u0020', // useful for hyphenated names
                        "'=>\u0020", // Useful for finding names
                        '\u2019=>\u0020', // Unicode right single quote
                        '\u02BC=>\u0020', // Unicode modifier letter apostrophe
                        // Not sure about ( and )...
                        // very useful to search for :
                        // "john smith explo" instead of "john smith (expl"
                        // but annoying to search for "(C)"
                        // ')=>\u0020',
                        // '(=>\u0020',
                        // Ignoring : can be misleading for expert users
                        // Because we will return unrelated pages when the user
                        // search for "magic keywords" like WP:WP which are sometimes
                        // pages in the main namespace that redirect to other namespace
                        // ':=>\u0020',
                        // Others are the ones ignored by common search engines
                        ';=>\u0020',
                        '\\[=>\u0020',
                        '\\]=>\u0020',
                        '{=>\u0020',
                        '}=>\u0020',
                        '\\\\=>\u0020',
                        // Unicode white spaces
                        // cause issues with completion
                        // only few of them were actually
                        // identified as problematic but
                        // more are added for extra safety
                        // see: T156234
                        // TODO: reevaluate with es5
                        '\u00a0=>\u0020',
                        '\u1680=>\u0020',
                        '\u180e=>\u0020',
                        '\u2000=>\u0020',
                        '\u2001=>\u0020',
                        '\u2002=>\u0020',
                        '\u2003=>\u0020',
                        '\u2004=>\u0020',
                        '\u2005=>\u0020',
                        '\u2006=>\u0020',
                        '\u2007=>\u0020',
                        '\u2008=>\u0020',
                        '\u2009=>\u0020',
                        '\u200a=>\u0020',
                        '\u200b=>\u0020', // causes issue
                        '\u200c=>\u0020', // causes issue
                        '\u200d=>\u0020', // causes issue
                        '\u202f=>\u0020',
                        '\u205f=>\u0020',
                        '\u3000=>\u0020',
                        '\ufeff=>\u0020', // causes issue
                    ],
                ],
            ],
            'filter' => [
                // The stop word set is a placeholder here, customize()
                // overwrites it with the set chosen for the language.
                "stop_filter" => [
                    "type" => "stop",
                    "stopwords" => "_none_",
                    "remove_trailing" => "true"
                ],
                "lowercase" => $lowercase_type,
                "accentfolding" => $folding_type,
                "token_limit" => [
                    "type" => "limit",
                    "max_token_count" => "20"
                ],
                // Workaround what seems to be a bug in the
                // completion suggester, empty tokens cause an
                // issue similar to
                // https://github.com/elastic/elasticsearch/pull/11158
                // can be removed with es5 if we want
                // note that icu_folding can introduce empty tokens, so
                // maybe it is best to leave this in place
                "remove_empty" => [
                    "type" => "length",
                    "min" => 1,
                ],
            ],
            'analyzer' => [
                "stop_analyzer" => [
                    "type" => "custom",
                    "filter" => [
                        "lowercase",
                        "stop_filter",
                        "accentfolding",
                        "remove_empty",
                        "token_limit"
                    ],
                    "tokenizer" => $textTokenizer,
                ],
                // We do not remove stop words when searching,
                // this leads to extremely weird behaviors while
                // writing "to be or not to be"
                "stop_analyzer_search" => $keepStopWordsAnalyzer,
                "plain" => $plainAnalyzer,
                "plain_search" => $plainAnalyzer,
            ],
        ];

        if ( $this->config->getElement( 'CirrusSearchCompletionSuggesterSubphrases', 'build' ) ) {
            $defaults['analyzer']['subphrases'] = $keepStopWordsAnalyzer;
            $defaults['analyzer']['subphrases_search'] = $keepStopWordsAnalyzer;
        }
        return $defaults;
    }

    /**
     * Apply language specific settings on top of the defaults.
     *
     * Sets the stop word set of 'stop_filter', registers an extra mapping
     * char filter on the plain analyzers for the text analyzer types that
     * need one, and, when ICU is available, swaps the 'lowercase' filter
     * for 'icu_normalizer' in the two stop analyzers.
     *
     * @param array $config Config as returned by defaults()
     * @param string $language Config language
     * @return array The customized config
     */
    private function customize( array $config, $language ) {
        $config['filter']['stop_filter']['stopwords'] = $this->getDefaultStopSet( $language );

        switch ( $this->getDefaultTextAnalyzerType( $language ) ) {
            // Please add languages in alphabetical order.
            case 'arabic':
                $config[ 'char_filter' ][ 'arabic_numeral_map' ] = [
                    // T117217 fold Eastern Arabic Numerals (٠۱۲۳...) into Western (0123...)
                    'type' => 'mapping',
                    'mappings' => [
                        '\u0660=>0', '\u0661=>1', '\u0662=>2',
                        '\u0663=>3', '\u0664=>4', '\u0665=>5',
                        '\u0666=>6', '\u0667=>7', '\u0668=>8',
                        '\u0669=>9',
                    ],
                ];

                // add arabic_numeral_map to plain and copy plain to plain_search
                $config[ 'analyzer' ][ 'plain' ][ 'char_filter' ][] = 'arabic_numeral_map';
                $config[ 'analyzer' ][ 'plain_search' ] = $config[ 'analyzer' ][ 'plain' ];
                break;
            case 'russian':
                $config[ 'char_filter' ][ 'russian_diacritic_map' ] = [
                    // Drop stress accents and fold ё/Ё into е/Е
                    'type' => 'mapping',
                    'mappings' => [
                        // T102298 ignore combining acute / stress accents
                        '\u0301=>',
                        // T124592 fold ё=>е and Ё=>Е, precomposed or with combining diacritic
                        '\u0451=>\u0435',
                        '\u0401=>\u0415',
                        '\u0435\u0308=>\u0435',
                        '\u0415\u0308=>\u0415',
                    ],
                ];

                // add russian_diacritic_map to plain and copy plain to plain_search
                $config[ 'analyzer' ][ 'plain' ][ 'char_filter' ][] = 'russian_diacritic_map';
                $config[ 'analyzer' ][ 'plain_search' ] = $config[ 'analyzer' ][ 'plain' ];
                break;
        }

        if ( $this->isIcuAvailable() ) {
            // Only the two stop analyzers are rewritten. They are addressed
            // by key rather than by iterating every analyzer by reference,
            // which avoids leaving a dangling reference behind and any
            // loose comparison on the array keys.
            foreach ( [ 'stop_analyzer', 'stop_analyzer_search' ] as $name ) {
                if ( !isset( $config[ 'analyzer' ][ $name ][ 'filter' ] ) ) {
                    continue;
                }
                $config[ 'analyzer' ][ $name ][ 'filter' ] = array_map(
                    static fn ( $filter ) => $filter === 'lowercase' ? 'icu_normalizer' : $filter,
                    $config[ 'analyzer' ][ $name ][ 'filter' ]
                );
            }
        }
        return $config;
    }

    /**
     * Build the analysis config.
     *
     * @param string|null $language Config language, the builder's
     *  default language is used when null
     * @return array the analysis config
     */
    public function buildConfig( $language = null ) {
        $language ??= $this->defaultLanguage;
        return $this->customize( $this->defaults( $language ), $language );
    }

    /**
     * Name of the stop word set to use, keyed by language code.
     * Languages missing from this list get no stop word set.
     *
     * @var string[]
     */
    private static $stopwords = [
        'ar' => '_arabic_',
        'hy' => '_armenian_',
        'eu' => '_basque_',
        'pt-br' => '_brazilian_',
        'bg' => '_bulgarian_',
        'ca' => '_catalan_',
        'cs' => '_czech_',
        'da' => '_danish_',
        'nl' => '_dutch_',
        'en' => '_english_',
        'en-ca' => '_english_',
        'en-gb' => '_english_',
        'simple' => '_english_',
        'fi' => '_finnish_',
        'fr' => '_french_',
        'gl' => '_galician_',
        'de' => '_german_',
        'el' => '_greek_',
        'hi' => '_hindi_',
        'hu' => '_hungarian_',
        'id' => '_indonesian_',
        'lt' => '_lithuanian_',
        'lv' => '_latvian_',
        'ga' => '_irish_',
        'it' => '_italian_',
        'nb' => '_norwegian_',
        'nn' => '_norwegian_',
        'fa' => '_persian_',
        'pt' => '_portuguese_',
        'ro' => '_romanian_',
        'ru' => '_russian_',
        'ckb' => '_sorani_',
        'es' => '_spanish_',
        'sv' => '_swedish_',
        'th' => '_thai_',
        'tr' => '_turkish_'
    ];

    /**
     * Get the stop word set configured for a language.
     *
     * @param string $lang Language code
     * @return string Name of the stop word set, '_none_' when the
     *  language has no entry in self::$stopwords
     */
    private function getDefaultStopSet( $lang ) {
        return self::$stopwords[$lang] ?? '_none_';
    }

    /**
     * Tell whether a stop word set is configured for a language.
     *
     * @param string $lang Language code
     * @return bool true when the language has an entry in self::$stopwords
     */
    public static function hasStopWords( $lang ) {
        return isset( self::$stopwords[$lang] );
    }
}