Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
78.61% covered (warning)
78.61%
147 / 187
40.00% covered (danger)
40.00%
2 / 5
CRAP
0.00% covered (danger)
0.00%
0 / 1
SuggesterAnalysisConfigBuilder
78.61% covered (warning)
78.61%
147 / 187
40.00% covered (danger)
40.00%
2 / 5
21.17
0.00% covered (danger)
0.00%
0 / 1
 defaults
79.43% covered (warning)
79.43%
112 / 141
0.00% covered (danger)
0.00%
0 / 1
6.31
 customize
76.19% covered (warning)
76.19%
32 / 42
0.00% covered (danger)
0.00%
0 / 1
10.09
 buildConfig
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 getDefaultStopSet
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 hasStopWords
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
<?php

namespace CirrusSearch\Maintenance;

/**
 * Builds elasticsearch analysis config arrays for the completion suggester
 * index.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */

class SuggesterAnalysisConfigBuilder extends AnalysisConfigBuilder {
    public const VERSION = "1.4";

    /**
     * Build the language-independent base of the analysis config.
     *
     * The returned array has three top-level keys:
     *  - 'char_filter': the 'word_break_helper' mapping filter that turns
     *    punctuation and unicode white spaces into plain spaces;
     *  - 'filter': the token filters referenced by the analyzers;
     *  - 'analyzer': 'stop_analyzer', 'stop_analyzer_search', 'plain' and
     *    'plain_search', plus 'subphrases' and 'subphrases_search' when the
     *    'build' element of CirrusSearchCompletionSuggesterSubphrases is set.
     *
     * The stop filter is created with the '_none_' stop word list here; the
     * language specific list is set later by customize().
     *
     * @param string $language Config language
     * @return array
     */
    protected function defaults( $language ) {
        // Use default lowercase filter, or the ICU normalizer when available
        $lowercase_type = [ 'type' => 'lowercase' ];
        if ( $this->isIcuAvailable() ) {
            $lowercase_type = [
                "type" => "icu_normalizer",
                "name" => "nfkc_cf",
            ];
        }

        // Use the default Lucene ASCII filter
        $folding_type = [ 'type' => 'asciifolding' ];
        if ( $this->shouldActivateIcuFolding( $language ) ) {
            // Use ICU Folding if the plugin is available and activated in the config
            $folding_type = [ 'type' => 'icu_folding' ];
            $unicodeSetFilter = $this->getICUSetFilter( $language );
            if ( $unicodeSetFilter !== null ) {
                $folding_type['unicodeSetFilter'] = $unicodeSetFilter;
            }
        }

        // We cannot use the icu_tokenizer for plain here
        // even if icu tokenization is mostly needed for languages
        // where space is not used to break words. We don't want
        // to break some punctuation chars like ':'
        $plainTokenizer = 'whitespace';
        $textTokenizer = $this->shouldActivateIcuTokenization( $language )
            ? 'icu_tokenizer'
            : 'standard';

        // Analyzer that folds accents but keeps stop words. It is used as-is
        // for 'stop_analyzer_search' and for both 'subphrases' analyzers.
        $foldingAnalyzer = [
            "type" => "custom",
            "filter" => [
                "lowercase",
                "accentfolding",
                "remove_empty",
                "token_limit"
            ],
            "tokenizer" => $textTokenizer,
        ];

        // Analyzer without accent folding, shared by 'plain' and 'plain_search'.
        $plainAnalyzer = [
            "type" => "custom",
            "char_filter" => [ 'word_break_helper' ],
            "filter" => [
                "remove_empty",
                "token_limit",
                "lowercase"
            ],
            "tokenizer" => $plainTokenizer,
        ];

        $defaults = [
            'char_filter' => [
                'word_break_helper' => [
                    'type' => 'mapping',
                    'mappings' => [
                        '_=>\u0020', // a space for mw
                        ',=>\u0020', // useful for "Lastname, Firstname"
                        '"=>\u0020', // " certainly phrase search?
                        '-=>\u0020', // useful for hyphenated names
                        "'=>\u0020", // Useful for finding names
                        '\u2019=>\u0020', // Unicode right single quote
                        '\u02BC=>\u0020', // Unicode modifier letter apostrophe
                        // Not sure about ( and )...
                        // very useful to search for :
                        // "john smith explo" instead of "john smith (expl"
                        // but annoying to search for "(C)"
                        // ')=>\u0020',
                        // '(=>\u0020',
                        // Ignoring : can be misleading for expert users
                        // Because we will return unrelated pages when the user
                        // search for "magic keywords" like WP:WP which are sometimes
                        // pages in the main namespace that redirect to other namespace
                        // ':=>\u0020',
                        // Others are the ones ignored by common search engines
                        ';=>\u0020',
                        '\\[=>\u0020',
                        '\\]=>\u0020',
                        '{=>\u0020',
                        '}=>\u0020',
                        '\\\\=>\u0020',
                        // Unicode white spaces
                        // cause issues with completion
                        // only few of them where actually
                        // identified as problematic but
                        // more are added for extra safety
                        // see: T156234
                        // TODO: reevaluate with es5
                        '\u00a0=>\u0020',
                        '\u1680=>\u0020',
                        '\u180e=>\u0020',
                        '\u2000=>\u0020',
                        '\u2001=>\u0020',
                        '\u2002=>\u0020',
                        '\u2003=>\u0020',
                        '\u2004=>\u0020',
                        '\u2005=>\u0020',
                        '\u2006=>\u0020',
                        '\u2007=>\u0020',
                        '\u2008=>\u0020',
                        '\u2009=>\u0020',
                        '\u200a=>\u0020',
                        '\u200b=>\u0020', // causes issue
                        '\u200c=>\u0020', // causes issue
                        '\u200d=>\u0020', // causes issue
                        '\u202f=>\u0020',
                        '\u205f=>\u0020',
                        '\u3000=>\u0020',
                        '\ufeff=>\u0020', // causes issue
                    ],
                ],
            ],
            'filter' => [
                "stop_filter" => [
                    "type" => "stop",
                    "stopwords" => "_none_",
                    "remove_trailing" => "true"
                ],
                "lowercase" => $lowercase_type,
                "accentfolding" => $folding_type,
                "token_limit" => [
                    "type" => "limit",
                    "max_token_count" => "20"
                ],
                // Workaround what seems to be a bug in the
                // completion suggester, empty tokens cause an
                // issue similar to
                // https://github.com/elastic/elasticsearch/pull/11158
                // can be removed with es5 if we want
                // note that icu_folding can introduce empty tokens, so
                // maybe it is best to leave this in place
                "remove_empty" => [
                    "type" => "length",
                    "min" => 1,
                ],
            ],
            'analyzer' => [
                // Index-time analyzer: the only one that removes stop words
                "stop_analyzer" => [
                    "type" => "custom",
                    "filter" => [
                        "lowercase",
                        "stop_filter",
                        "accentfolding",
                        "remove_empty",
                        "token_limit"
                    ],
                    "tokenizer" => $textTokenizer,
                ],
                // We do not remove stop words when searching,
                // this leads to extremely weird behaviors while
                // writing "to be or not to be"
                "stop_analyzer_search" => $foldingAnalyzer,
                "plain" => $plainAnalyzer,
                "plain_search" => $plainAnalyzer,
            ],
        ];

        if ( $this->config->getElement( 'CirrusSearchCompletionSuggesterSubphrases', 'build' ) ) {
            $defaults['analyzer']['subphrases'] = $foldingAnalyzer;
            $defaults['analyzer']['subphrases_search'] = $foldingAnalyzer;
        }

        return $defaults;
    }

    /**
     * Apply language specific adjustments to a config built by defaults().
     *
     * Sets the stop word list of 'stop_filter', adds the character mapping
     * filters needed by some languages to the 'plain' analyzers and, when
     * ICU is available, makes the stop analyzers use 'icu_normalizer' in
     * place of 'lowercase'.
     *
     * @param array $config
     * @param string $language
     * @return array
     */
    private function customize( array $config, $language ) {
        $config['filter']['stop_filter']['stopwords'] = $this->getDefaultStopSet( $language );

        switch ( $this->getDefaultTextAnalyzerType( $language ) ) {
            // Please add languages in alphabetical order.
            case 'arabic':
                $config[ 'char_filter' ][ 'arabic_numeral_map' ] = [
                    // T117217 fold Eastern Arabic Numerals (٠۱۲۳...) into Western (0123...)
                    'type' => 'mapping',
                    'mappings' => [
                        '\u0660=>0', '\u0661=>1', '\u0662=>2',
                        '\u0663=>3', '\u0664=>4', '\u0665=>5',
                        '\u0666=>6', '\u0667=>7', '\u0668=>8',
                        '\u0669=>9',
                    ],
                ];

                // add arabic_numeral_map to plain and copy plain to plain_search
                $config[ 'analyzer' ][ 'plain' ][ 'char_filter' ][] = 'arabic_numeral_map';
                $config[ 'analyzer' ][ 'plain_search' ] = $config[ 'analyzer' ][ 'plain' ];
                break;
            case 'russian':
                $config[ 'char_filter' ][ 'russian_diacritic_map' ] = [
                    'type' => 'mapping',
                    'mappings' => [
                        // T102298 ignore combining acute / stress accents
                        '\u0301=>',
                        // T124592 fold ё=>е and Ё=>Е, precomposed or with combining diacritic
                        '\u0451=>\u0435',
                        '\u0401=>\u0415',
                        '\u0435\u0308=>\u0435',
                        '\u0415\u0308=>\u0415',
                    ],
                ];

                // add russian_diacritic_map to plain and copy plain to plain_search
                $config[ 'analyzer' ][ 'plain' ][ 'char_filter' ][] = 'russian_diacritic_map';
                $config[ 'analyzer' ][ 'plain_search' ] = $config[ 'analyzer' ][ 'plain' ];
                break;
        }

        if ( $this->isIcuAvailable() ) {
            // Only the stop analyzers are rewritten. They are addressed by
            // name rather than by iterating over every analyzer by reference,
            // so no reference is left dangling and no loose key comparison
            // is needed.
            foreach ( [ 'stop_analyzer', 'stop_analyzer_search' ] as $name ) {
                if ( !isset( $config[ 'analyzer' ][ $name ][ 'filter' ] ) ) {
                    continue;
                }
                $config[ 'analyzer' ][ $name ][ 'filter' ] = array_map(
                    static fn ( $filter ) => $filter === 'lowercase' ? 'icu_normalizer' : $filter,
                    $config[ 'analyzer' ][ $name ][ 'filter' ]
                );
            }
        }

        return $config;
    }

    /**
     * Build the analysis config.
     *
     * @param string|null $language Config language, the default language
     *  of this builder is used when null
     * @return array the analysis config
     */
    public function buildConfig( $language = null ) {
        $language ??= $this->defaultLanguage;
        return $this->customize( $this->defaults( $language ), $language );
    }

    /**
     * Stop word list name to use for 'stop_filter', keyed by language code.
     *
     * @var string[]
     */
    private static $stopwords = [
        'ar' => '_arabic_',
        'hy' => '_armenian_',
        'eu' => '_basque_',
        'pt-br' => '_brazilian_',
        'bg' => '_bulgarian_',
        'ca' => '_catalan_',
        'cs' => '_czech_',
        'da' => '_danish_',
        'nl' => '_dutch_',
        'en' => '_english_',
        'en-ca' => '_english_',
        'en-gb' => '_english_',
        'simple' => '_english_',
        'fi' => '_finnish_',
        'fr' => '_french_',
        'gl' => '_galician_',
        'de' => '_german_',
        'el' => '_greek_',
        'hi' => '_hindi_',
        'hu' => '_hungarian_',
        'id' => '_indonesian_',
        'lt' => '_lithuanian_',
        'lv' => '_latvian_',
        'ga' => '_irish_',
        'it' => '_italian_',
        'nb' => '_norwegian_',
        'nn' => '_norwegian_',
        'fa' => '_persian_',
        'pt' => '_portuguese_',
        'ro' => '_romanian_',
        'ru' => '_russian_',
        'ckb' => '_sorani_',
        'es' => '_spanish_',
        'sv' => '_swedish_',
        'th' => '_thai_',
        'tr' => '_turkish_'
    ];

    /**
     * Get the stop word list name for a language.
     *
     * @param string $lang
     * @return string the entry of self::$stopwords for $lang, or '_none_'
     *  when the language has no entry
     */
    private function getDefaultStopSet( $lang ) {
        return self::$stopwords[$lang] ?? '_none_';
    }

    /**
     * Check whether a stop word list is declared for a language.
     *
     * @param string $lang
     * @return bool true when self::$stopwords has an entry for $lang
     */
    public static function hasStopWords( $lang ) {
        return isset( self::$stopwords[$lang] );
    }
}