Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
78.61% |
147 / 187 |
|
40.00% |
2 / 5 |
CRAP | |
0.00% |
0 / 1 |
SuggesterAnalysisConfigBuilder | |
78.61% |
147 / 187 |
|
40.00% |
2 / 5 |
21.17 | |
0.00% |
0 / 1 |
defaults | |
79.43% |
112 / 141 |
|
0.00% |
0 / 1 |
6.31 | |||
customize | |
76.19% |
32 / 42 |
|
0.00% |
0 / 1 |
10.09 | |||
buildConfig | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
getDefaultStopSet | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
hasStopWords | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Maintenance; |
4 | |
5 | /** |
6 | * Builds elasticsearch analysis config arrays for the completion suggester |
7 | * index. |
8 | * |
9 | * This program is free software; you can redistribute it and/or modify |
10 | * it under the terms of the GNU General Public License as published by |
11 | * the Free Software Foundation; either version 2 of the License, or |
12 | * (at your option) any later version. |
13 | * |
14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU General Public License along |
20 | * with this program; if not, write to the Free Software Foundation, Inc., |
21 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
22 | * http://www.gnu.org/copyleft/gpl.html |
23 | */ |
24 | |
25 | class SuggesterAnalysisConfigBuilder extends AnalysisConfigBuilder { |
26 | public const VERSION = "1.4"; |
27 | |
28 | /** |
29 | * Build an analysis config with sane defaults |
30 | * |
31 | * @param string $language Config language |
32 | * @return array |
33 | */ |
34 | protected function defaults( $language ) { |
35 | // Use default lowercase filter |
36 | $lowercase_type = [ 'type' => 'lowercase' ]; |
37 | if ( $this->isIcuAvailable() ) { |
38 | $lowercase_type = [ |
39 | "type" => "icu_normalizer", |
40 | "name" => "nfkc_cf", |
41 | ]; |
42 | } |
43 | // Use the default Lucene ASCII filter |
44 | $folding_type = [ 'type' => 'asciifolding' ]; |
45 | if ( $this->shouldActivateIcuFolding( $language ) ) { |
46 | // Use ICU Folding if the plugin is available and activated in the config |
47 | $folding_type = [ 'type' => 'icu_folding' ]; |
48 | $unicodeSetFilter = $this->getICUSetFilter( $language ); |
49 | if ( $unicodeSetFilter !== null ) { |
50 | $folding_type['unicodeSetFilter'] = $unicodeSetFilter; |
51 | } |
52 | } |
53 | $textTokenizer = 'standard'; |
54 | $plainTokenizer = 'whitespace'; |
55 | if ( $this->shouldActivateIcuTokenization( $language ) ) { |
56 | $textTokenizer = 'icu_tokenizer'; |
57 | // We cannot use the icu_tokenizer for plain here |
58 | // even if icu tokenization is mostly needed for languages |
59 | // where space is not used to break words. We don't want |
60 | // to break some punctuation chars like ':' |
61 | } |
62 | $defaults = [ |
63 | 'char_filter' => [ |
64 | 'word_break_helper' => [ |
65 | 'type' => 'mapping', |
66 | 'mappings' => [ |
67 | '_=>\u0020', // a space for mw |
68 | ',=>\u0020', // useful for "Lastname, Firstname" |
69 | '"=>\u0020', // " certainly phrase search? |
70 | '-=>\u0020', // useful for hyphenated names |
71 | "'=>\u0020", // Useful for finding names |
72 | '\u2019=>\u0020', // Unicode right single quote |
73 | '\u02BC=>\u0020', // Unicode modifier letter apostrophe |
74 | // Not sure about ( and )... |
75 | // very useful to search for : |
76 | // "john smith explo" instead of "john smith (expl" |
77 | // but annoying to search for "(C)" |
78 | // ')=>\u0020', |
79 | // '(=>\u0020', |
80 | // Ignoring : can be misleading for expert users |
81 | // Because we will return unrelated pages when the user |
82 | // searches for "magic keywords" like WP:WP which are sometimes |
83 | // pages in the main namespace that redirect to another namespace |
84 | // ':=>\u0020', |
85 | // Others are the ones ignored by common search engines |
86 | ';=>\u0020', |
87 | '\\[=>\u0020', |
88 | '\\]=>\u0020', |
89 | '{=>\u0020', |
90 | '}=>\u0020', |
91 | '\\\\=>\u0020', |
92 | // Unicode white spaces |
93 | // cause issues with completion |
94 | // only a few of them were actually |
95 | // identified as problematic but |
96 | // more are added for extra safety |
97 | // see: T156234 |
98 | // TODO: reevaluate with es5 |
99 | '\u00a0=>\u0020', |
100 | '\u1680=>\u0020', |
101 | '\u180e=>\u0020', |
102 | '\u2000=>\u0020', |
103 | '\u2001=>\u0020', |
104 | '\u2002=>\u0020', |
105 | '\u2003=>\u0020', |
106 | '\u2004=>\u0020', |
107 | '\u2005=>\u0020', |
108 | '\u2006=>\u0020', |
109 | '\u2007=>\u0020', |
110 | '\u2008=>\u0020', |
111 | '\u2009=>\u0020', |
112 | '\u200a=>\u0020', |
113 | '\u200b=>\u0020', // causes issue |
114 | '\u200c=>\u0020', // causes issue |
115 | '\u200d=>\u0020', // causes issue |
116 | '\u202f=>\u0020', |
117 | '\u205f=>\u0020', |
118 | '\u3000=>\u0020', |
119 | '\ufeff=>\u0020', // causes issue |
120 | ], |
121 | ], |
122 | ], |
123 | 'filter' => [ |
124 | "stop_filter" => [ |
125 | "type" => "stop", |
126 | "stopwords" => "_none_", |
127 | "remove_trailing" => "true" |
128 | ], |
129 | "lowercase" => $lowercase_type, |
130 | "accentfolding" => $folding_type, |
131 | "token_limit" => [ |
132 | "type" => "limit", |
133 | "max_token_count" => "20" |
134 | ], |
135 | // Work around what seems to be a bug in the |
136 | // completion suggester, empty tokens cause an |
137 | // issue similar to |
138 | // https://github.com/elastic/elasticsearch/pull/11158 |
139 | // can be removed with es5 if we want |
140 | // note that icu_folding can introduce empty tokens, so |
141 | // maybe it is best to leave this in place |
142 | "remove_empty" => [ |
143 | "type" => "length", |
144 | "min" => 1, |
145 | ], |
146 | ], |
147 | 'analyzer' => [ |
148 | "stop_analyzer" => [ |
149 | "type" => "custom", |
150 | "filter" => [ |
151 | "lowercase", |
152 | "stop_filter", |
153 | "accentfolding", |
154 | "remove_empty", |
155 | "token_limit" |
156 | ], |
157 | "tokenizer" => $textTokenizer, |
158 | ], |
159 | // We do not remove stop words when searching, |
160 | // this leads to extremely weird behaviors while |
161 | // writing "to be or not to be" |
162 | "stop_analyzer_search" => [ |
163 | "type" => "custom", |
164 | "filter" => [ |
165 | "lowercase", |
166 | "accentfolding", |
167 | "remove_empty", |
168 | "token_limit" |
169 | ], |
170 | "tokenizer" => $textTokenizer, |
171 | ], |
172 | "plain" => [ |
173 | "type" => "custom", |
174 | "char_filter" => [ 'word_break_helper' ], |
175 | "filter" => [ |
176 | "remove_empty", |
177 | "token_limit", |
178 | "lowercase" |
179 | ], |
180 | "tokenizer" => $plainTokenizer, |
181 | ], |
182 | "plain_search" => [ |
183 | "type" => "custom", |
184 | "char_filter" => [ 'word_break_helper' ], |
185 | "filter" => [ |
186 | "remove_empty", |
187 | "token_limit", |
188 | "lowercase" |
189 | ], |
190 | "tokenizer" => $plainTokenizer, |
191 | ], |
192 | ], |
193 | ]; |
194 | if ( $this->config->getElement( 'CirrusSearchCompletionSuggesterSubphrases', 'build' ) ) { |
195 | $defaults['analyzer']['subphrases'] = [ |
196 | "type" => "custom", |
197 | "filter" => [ |
198 | "lowercase", |
199 | "accentfolding", |
200 | "remove_empty", |
201 | "token_limit" |
202 | ], |
203 | "tokenizer" => $textTokenizer, |
204 | ]; |
205 | $defaults['analyzer']['subphrases_search'] = [ |
206 | "type" => "custom", |
207 | "filter" => [ |
208 | "lowercase", |
209 | "accentfolding", |
210 | "remove_empty", |
211 | "token_limit" |
212 | ], |
213 | "tokenizer" => $textTokenizer, |
214 | ]; |
215 | } |
216 | return $defaults; |
217 | } |
218 | |
219 | /** |
220 | * @param array $config |
221 | * @param string $language |
222 | * @return array |
223 | */ |
224 | private function customize( array $config, $language ) { |
225 | $defaultStopSet = $this->getDefaultStopSet( $language ); |
226 | $config['filter']['stop_filter']['stopwords'] = $defaultStopSet; |
227 | |
228 | switch ( $this->getDefaultTextAnalyzerType( $language ) ) { |
229 | // Please add languages in alphabetical order. |
230 | case 'arabic': |
231 | $config[ 'char_filter' ][ 'arabic_numeral_map' ] = [ |
232 | // T117217 fold Eastern Arabic Numerals (٠۱۲۳...) into Western (0123...) |
233 | 'type' => 'mapping', |
234 | 'mappings' => [ |
235 | '\u0660=>0', '\u0661=>1', '\u0662=>2', |
236 | '\u0663=>3', '\u0664=>4', '\u0665=>5', |
237 | '\u0666=>6', '\u0667=>7', '\u0668=>8', |
238 | '\u0669=>9', |
239 | ], |
240 | ]; |
241 | |
242 | // add arabic_numeral_map to plain and copy plain to plain_search |
243 | $config[ 'analyzer' ][ 'plain' ][ 'char_filter' ][] = 'arabic_numeral_map'; |
244 | $config[ 'analyzer' ][ 'plain_search' ] = $config[ 'analyzer' ][ 'plain' ]; |
245 | break; |
246 | case 'russian': |
247 | $config[ 'char_filter' ][ 'russian_diacritic_map' ] = [ |
248 | // Strip combining stress accents and fold ё/Ё into е/Е (T102298, T124592; see mappings below) |
249 | 'type' => 'mapping', |
250 | 'mappings' => [ |
251 | // T102298 ignore combining acute / stress accents |
252 | '\u0301=>', |
253 | // T124592 fold ё=>е and Ё=>Е, precomposed or with combining diacritic |
254 | '\u0451=>\u0435', |
255 | '\u0401=>\u0415', |
256 | '\u0435\u0308=>\u0435', |
257 | '\u0415\u0308=>\u0415', |
258 | |
259 | ], |
260 | ]; |
261 | |
262 | // add russian_diacritic_map to plain and copy plain to plain_search |
263 | $config[ 'analyzer' ][ 'plain' ][ 'char_filter' ][] = 'russian_diacritic_map'; |
264 | $config[ 'analyzer' ][ 'plain_search' ] = $config[ 'analyzer' ][ 'plain' ]; |
265 | break; |
266 | } |
267 | |
268 | if ( $this->isIcuAvailable() ) { |
269 | foreach ( $config[ 'analyzer' ] as $k => &$analyzer ) { |
270 | if ( $k != "stop_analyzer" && $k != "stop_analyzer_search" ) { |
271 | continue; |
272 | } |
273 | if ( !isset( $analyzer[ 'filter' ] ) ) { |
274 | continue; |
275 | } |
276 | $analyzer[ 'filter' ] = array_map( static function ( $filter ) { |
277 | if ( $filter === 'lowercase' ) { |
278 | return 'icu_normalizer'; |
279 | } |
280 | return $filter; |
281 | }, $analyzer[ 'filter' ] ); |
282 | } |
283 | } |
284 | return $config; |
285 | } |
286 | |
287 | /** |
288 | * Build the analysis config. |
289 | * |
290 | * @param string|null $language Config language |
291 | * @return array the analysis config |
292 | */ |
293 | public function buildConfig( $language = null ) { |
294 | $language ??= $this->defaultLanguage; |
295 | return $this->customize( $this->defaults( $language ), $language ); |
296 | } |
297 | |
298 | /** @var string[] */ |
299 | private static $stopwords = [ |
300 | 'ar' => '_arabic_', |
301 | 'hy' => '_armenian_', |
302 | 'eu' => '_basque_', |
303 | 'pt-br' => '_brazilian_', |
304 | 'bg' => '_bulgarian_', |
305 | 'ca' => '_catalan_', |
306 | 'cs' => '_czech_', |
307 | 'da' => '_danish_', |
308 | 'nl' => '_dutch_', |
309 | 'en' => '_english_', |
310 | 'en-ca' => '_english_', |
311 | 'en-gb' => '_english_', |
312 | 'simple' => '_english_', |
313 | 'fi' => '_finnish_', |
314 | 'fr' => '_french_', |
315 | 'gl' => '_galician_', |
316 | 'de' => '_german_', |
317 | 'el' => '_greek_', |
318 | 'hi' => '_hindi_', |
319 | 'hu' => '_hungarian_', |
320 | 'id' => '_indonesian_', |
321 | 'lt' => '_lithuanian_', |
322 | 'lv' => '_latvian_', |
323 | 'ga' => '_irish_', |
324 | 'it' => '_italian_', |
325 | 'nb' => '_norwegian_', |
326 | 'nn' => '_norwegian_', |
327 | 'fa' => '_persian_', |
328 | 'pt' => '_portuguese_', |
329 | 'ro' => '_romanian_', |
330 | 'ru' => '_russian_', |
331 | 'ckb' => '_sorani_', |
332 | 'es' => '_spanish_', |
333 | 'sv' => '_swedish_', |
334 | 'th' => '_thai_', |
335 | 'tr' => '_turkish_' |
336 | ]; |
337 | |
338 | /** |
339 | * @param string $lang |
340 | * @return string |
341 | */ |
342 | private function getDefaultStopSet( $lang ) { |
343 | return self::$stopwords[$lang] ?? '_none_'; |
344 | } |
345 | |
346 | /** |
347 | * @param string $lang |
348 | * @return bool |
349 | */ |
350 | public static function hasStopWords( $lang ) { |
351 | return isset( self::$stopwords[$lang] ); |
352 | } |
353 | } |