Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
99.39% |
983 / 989 |
|
76.92% |
20 / 26 |
CRAP | |
0.00% |
0 / 1 |
AnalysisConfigBuilder | |
99.39% |
983 / 989 |
|
76.92% |
20 / 26 |
218 | |
0.00% |
0 / 1 |
__construct | |
96.30% |
26 / 27 |
|
0.00% |
0 / 1 |
8 | |||
shouldActivateIcuFolding | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
9 | |||
shouldActivateIcuTokenization | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
7 | |||
buildConfig | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
4 | |||
buildSimilarityConfig | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
enableICUTokenizer | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
6 | |||
standardTokenizerOnlyCleanup | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
6 | |||
disableLimitedMappings | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
4 | |||
enableICUFolding | |
100.00% |
32 / 32 |
|
100.00% |
1 / 1 |
12 | |||
switchFiltersToICUFolding | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
switchFiltersToICUFoldingPreserve | |
94.44% |
17 / 18 |
|
0.00% |
0 / 1 |
7.01 | |||
getICUSetFilter | |
98.00% |
49 / 50 |
|
0.00% |
0 / 1 |
29 | |||
getICUNormSetFilter | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
4.13 | |||
defaults | |
100.00% |
263 / 263 |
|
100.00% |
1 / 1 |
7 | |||
customize | |
100.00% |
430 / 430 |
|
100.00% |
1 / 1 |
71 | |||
fixAsciiFolding | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
7 | |||
getDefaultTextAnalyzerType | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
getDefaultFilters | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
5 | |||
resolveFilters | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
replaceFilter | |
87.50% |
7 / 8 |
|
0.00% |
0 / 1 |
4.03 | |||
mergeConfig | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
12 | |||
buildLanguageConfigs | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
isIcuAvailable | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
isTextifyAvailable | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
enableGlobalCustomFilters | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
buildGlobalCustomFilters | |
100.00% |
29 / 29 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Maintenance; |
4 | |
5 | use CirrusSearch\CirrusSearch; |
6 | use CirrusSearch\CirrusSearchHookRunner; |
7 | use CirrusSearch\Profile\SearchProfileService; |
8 | use CirrusSearch\SearchConfig; |
9 | use MediaWiki\MediaWikiServices; |
10 | |
11 | /** |
12 | * Builds elasticsearch analysis config arrays. |
13 | * |
14 | * This program is free software; you can redistribute it and/or modify |
15 | * it under the terms of the GNU General Public License as published by |
16 | * the Free Software Foundation; either version 2 of the License, or |
17 | * (at your option) any later version. |
18 | * |
19 | * This program is distributed in the hope that it will be useful, |
20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
22 | * GNU General Public License for more details. |
23 | * |
24 | * You should have received a copy of the GNU General Public License along |
25 | * with this program; if not, write to the Free Software Foundation, Inc., |
26 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
27 | * http://www.gnu.org/copyleft/gpl.html |
28 | */ |
29 | class AnalysisConfigBuilder { |
30 | /** |
31 | * Version number for the core analysis. Increment the major |
32 | * version when the analysis changes in an incompatible way, |
33 | * and change the minor version when it changes but isn't |
34 | * incompatible. |
35 | * |
36 | * You may also need to increment MetaStoreIndex::METASTORE_VERSION |
37 | * manually as well. |
38 | */ |
39 | public const VERSION = '0.12'; |
40 | |
41 | /** |
42 | * Maximum number of characters allowed in keyword terms. |
43 | */ |
44 | private const KEYWORD_IGNORE_ABOVE = 5000; |
45 | |
46 | /** |
47 | * Temporary magic value to prevent enabling ICU tokenizer in specific analyzers |
48 | */ |
49 | private const STANDARD_TOKENIZER_ONLY = 'std_only'; |
50 | |
51 | /** |
52 | * @var bool is the icu plugin available? |
53 | */ |
54 | private $icu; |
55 | |
56 | /** |
57 | * @var bool is the textify plugin available? |
58 | */ |
59 | private $textify; |
60 | |
61 | /** |
62 | * @var string which ICU tokenizer should be used |
63 | */ |
64 | private $icu_tokenizer = 'icu_tokenizer'; |
65 | |
66 | /** |
67 | * @var array Similarity algo (tf/idf, bm25, etc) configuration |
68 | */ |
69 | private $similarity; |
70 | |
71 | /** |
72 | * @var SearchConfig cirrus config |
73 | */ |
74 | protected $config; |
75 | |
76 | /** |
77 | * @var string[] |
78 | */ |
79 | private $plugins; |
80 | |
81 | /** |
82 | * @var string |
83 | */ |
84 | protected $defaultLanguage; |
85 | |
86 | /** |
87 | * @var CirrusSearchHookRunner |
88 | */ |
89 | private $cirrusSearchHookRunner; |
90 | |
91 | /** |
92 | * @var GlobalCustomFilter[] |
93 | */ |
94 | public $globalCustomFilters; |
95 | |
/**
 * Record the installed plugins, merge in the language analyzers that
 * depend on those plugins, and load the similarity profile.
 *
 * @param string $langCode The language code to build config for
 * @param string[] $plugins list of plugins installed in Elasticsearch
 * @param SearchConfig|null $config cirrus config; when null the
 *  'CirrusSearch' config is fetched from MediaWikiServices
 * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner hook runner;
 *  when null one is built from the MediaWikiServices hook container
 */
public function __construct(
	$langCode,
	array $plugins,
	?SearchConfig $config = null,
	?CirrusSearchHookRunner $cirrusSearchHookRunner = null
) {
	$this->globalCustomFilters = $this->buildGlobalCustomFilters();

	$this->defaultLanguage = $langCode;
	$this->plugins = $plugins;
	foreach ( $this->elasticsearchLanguageAnalyzersFromPlugins as $pluginSpec => $extra ) {
		// $pluginSpec is a comma-separated list of plugin names; the extra
		// analyzers are only merged in when every one of them is installed.
		$requiredPlugins = explode( ',', $pluginSpec );
		if ( array_diff( $requiredPlugins, $plugins ) === [] ) {
			$this->elasticsearchLanguageAnalyzers =
				array_merge( $this->elasticsearchLanguageAnalyzers, $extra );
		}
	}
	$this->icu = in_array( 'analysis-icu', $plugins, true );
	$this->textify = in_array( 'extra-analysis-textify', $plugins, true );
	if ( $this->isTextifyAvailable() ) {
		// icu_token_repair can only work with the textify icu_tokenizer clone
		$this->icu_tokenizer = 'textify_icu_tokenizer';
	}
	$config ??= MediaWikiServices::getInstance()->getConfigFactory()
		->makeConfig( 'CirrusSearch' );
	$similarity = $config->getProfileService()->loadProfile( SearchProfileService::SIMILARITY );
	if ( !array_key_exists( 'similarity', $similarity ) ) {
		// make sure the hook below always receives an array to work on
		$similarity['similarity'] = [];
	}
	$this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?? new CirrusSearchHookRunner(
		MediaWikiServices::getInstance()->getHookContainer() );
	$this->cirrusSearchHookRunner->onCirrusSearchSimilarityConfig( $similarity['similarity'] );
	$this->similarity = $similarity;

	$this->config = $config;
}
145 | |
/**
 * Determine if ICU folding should be used (in place of ascii folding).
 *
 * The decision is driven by the CirrusSearchUseIcuFolding setting:
 * 'yes' and 'no' are absolute, 'default' defers to the per-language
 * list, and any other value disables ICU folding.
 *
 * @param string $language Config language
 * @return bool true if icu folding should be enabled
 */
public function shouldActivateIcuFolding( $language ) {
	if ( !$this->isIcuAvailable() || !in_array( 'extra', $this->plugins, true ) ) {
		// ICU folding requires the icu plugin and the extra plugin
		return false;
	}
	$in_config = $this->config->get( 'CirrusSearchUseIcuFolding' );
	// BC code, this config var was originally a simple boolean
	if ( $in_config === true ) {
		$in_config = 'yes';
	} elseif ( $in_config === false ) {
		$in_config = 'no';
	}
	// Strict comparisons: a loose switch would let non-string values
	// (e.g. integer 0 on PHP 7) match 'yes'.
	if ( $in_config === 'yes' ) {
		return true;
	}
	if ( $in_config === 'default' ) {
		return $this->languagesWithIcuFolding[$language] ?? false;
	}
	// 'no' and every unrecognised value
	return false;
}
175 | |
/**
 * Decide whether the ICU tokenizer may take the place of the standard
 * tokenizer for the given language.
 *
 * @param string $language Config language
 * @return bool
 */
public function shouldActivateIcuTokenization( $language ) {
	$pluginPresent = $this->isIcuAvailable() || $this->isTextifyAvailable();
	if ( !$pluginPresent ) {
		// requires the icu or textify plugin
		return false;
	}

	$setting = $this->config->get( 'CirrusSearchUseIcuTokenizer' );
	// The comparisons below are deliberately loose (==), tested in this order.
	if ( $setting == 'yes' ) {
		return true;
	}
	if ( $setting == 'no' ) {
		return false;
	}
	if ( $setting == 'default' ) {
		// languagesWithIcuTokenization[] gives absolute answers for specific languages.
		// If the textify plugin is available, the default is 'yes'/true because we
		// have icu_token_repair available; if not, the default is 'no'/false
		return $this->languagesWithIcuTokenization[$language] ?? $this->isTextifyAvailable();
	}
	return false;
}
201 | |
/**
 * Assemble the complete analysis config for a language.
 *
 * Starts from the customized defaults, lets the CirrusSearchAnalysisConfig
 * hook run, then applies the ICU upgrades and the clean-up passes.
 *
 * @param string|null $language Config language; null selects the language
 *  given to the constructor
 * @return array the analysis config
 */
public function buildConfig( $language = null ) {
	if ( $language === null ) {
		$language = $this->defaultLanguage;
	}

	$analysis = $this->defaults( $language );
	$analysis = $this->customize( $analysis, $language );
	$this->cirrusSearchHookRunner->onCirrusSearchAnalysisConfig( $analysis, $this );

	if ( $this->shouldActivateIcuTokenization( $language ) ) {
		$analysis = $this->enableICUTokenizer( $analysis );
	}
	if ( $this->shouldActivateIcuFolding( $language ) ) {
		$analysis = $this->enableICUFolding( $analysis, $language );
	}

	$analysis = $this->standardTokenizerOnlyCleanup( $this->fixAsciiFolding( $analysis ) );

	if ( !$this->isTextifyAvailable() ) {
		$analysis = $this->disableLimitedMappings( $analysis );
	}

	// should come after other upgrades to get the full context
	return $this->enableGlobalCustomFilters( $analysis, $language );
}
231 | |
/**
 * Expose the 'similarity' section of the similarity profile stored by
 * the constructor.
 *
 * @return array|null the similarity config
 */
public function buildSimilarityConfig() {
	return isset( $this->similarity['similarity'] )
		? $this->similarity['similarity']
		: null;
}
238 | |
/**
 * Swap the standard tokenizer for the ICU tokenizer in every custom
 * analyzer (analyzers without a 'type' are treated as custom).
 *
 * @param mixed[] $config
 * @return mixed[] update config
 */
public function enableICUTokenizer( array $config ) {
	foreach ( $config['analyzer'] as $analyzerName => $analyzer ) {
		$isCustom = !isset( $analyzer['type'] ) || $analyzer['type'] == 'custom';
		$usesStandard = isset( $analyzer['tokenizer'] ) && $analyzer['tokenizer'] === 'standard';
		if ( $isCustom && $usesStandard ) {
			$config['analyzer'][$analyzerName]['tokenizer'] = $this->icu_tokenizer;
		}
	}
	return $config;
}
255 | |
/**
 * Turn the STANDARD_TOKENIZER_ONLY placeholder back into the real
 * standard tokenizer in every custom analyzer.
 *
 * @param mixed[] $config
 * @return mixed[] update config
 */
public function standardTokenizerOnlyCleanup( array $config ) {
	foreach ( $config['analyzer'] as $analyzerName => $analyzer ) {
		$isCustom = !isset( $analyzer['type'] ) || $analyzer['type'] == 'custom';
		if ( !$isCustom || !isset( $analyzer['tokenizer'] ) ) {
			continue;
		}
		if ( $analyzer['tokenizer'] === self::STANDARD_TOKENIZER_ONLY ) {
			// the magic value blocked upgrades/changes to the standard
			// tokenizer; now put the actual standard tokenizer back
			$config['analyzer'][$analyzerName]['tokenizer'] = 'standard';
		}
	}
	return $config;
}
275 | |
/**
 * Downgrade every limited_mapping char filter to a plain mapping char
 * filter, for use when limited_mapping is unavailable.
 *
 * @param mixed[] $config
 * @return mixed[] update config
 */
public function disableLimitedMappings( array $config ) {
	foreach ( $config['char_filter'] as $filterName => $charFilter ) {
		if ( isset( $charFilter['type'] ) && $charFilter['type'] == 'limited_mapping' ) {
			$config['char_filter'][$filterName]['type'] = 'mapping';
		}
	}
	return $config;
}
290 | |
/**
 * Switch the config from asciifolding to ICU folding.
 *
 * Registers the icu_folding and icu_nfkc_normalization filters, rewrites
 * the filter chain of every custom analyzer that uses asciifolding or
 * asciifolding_preserve, and makes sure the 'plain' analyzer folds too.
 *
 * @param mixed[] $config
 * @param string $language Config language
 * @return mixed[] update config
 */
public function enableICUFolding( array $config, $language ) {
	$foldingFilter = [ 'type' => 'icu_folding' ];
	$exclusions = $this->getICUSetFilter( $language );
	if ( $exclusions !== null ) {
		$foldingFilter['unicodeSetFilter'] = $exclusions;
	}
	$config['filter']['icu_folding'] = $foldingFilter;

	// Adds a simple nfkc normalizer for cases where
	// we preserve original but the lowercase filter
	// is not used before
	$config['filter']['icu_nfkc_normalization'] = [
		'type' => 'icu_normalizer',
		'name' => 'nfkc',
	];

	// foreach iterates over a copy, so writing into $config here is safe;
	// both rewrites start from the analyzer's original filter chain.
	foreach ( $config['analyzer'] as $analyzerName => $analyzer ) {
		$isCustom = !isset( $analyzer['type'] ) || $analyzer['type'] == 'custom';
		if ( !$isCustom || !isset( $analyzer['filter'] ) ) {
			continue;
		}
		$chain = $analyzer['filter'];
		if ( in_array( 'asciifolding', $chain ) ) {
			$config['analyzer'][$analyzerName]['filter'] =
				$this->switchFiltersToICUFolding( $chain );
		}
		if ( in_array( 'asciifolding_preserve', $chain ) ) {
			$config['analyzer'][$analyzerName]['filter'] =
				$this->switchFiltersToICUFoldingPreserve( $chain );
		}
	}

	// Explicitly enable icu_folding on the plain analyzer if it's not
	// already enabled
	if ( isset( $config['analyzer']['plain'] ) ) {
		$plainChain = $config['analyzer']['plain']['filter'] ?? [];
		$config['analyzer']['plain']['filter'] =
			$this->switchFiltersToICUFoldingPreserve( $plainChain, true );
	}

	return $config;
}
351 | |
/**
 * Replace the first occurrence of asciifolding with icu_folding
 * followed by remove_empty.
 *
 * @param string[] $filters
 * @return string[] new list of filters; returned unchanged when it does
 *  not contain asciifolding
 */
private function switchFiltersToICUFolding( array $filters ) {
	$position = array_search( 'asciifolding', $filters, true );
	if ( $position === false ) {
		// Without this guard array_splice() would treat false as offset 0
		// and overwrite the first filter in the list.
		return $filters;
	}
	array_splice( $filters, $position, 1, [ 'icu_folding', 'remove_empty' ] );
	return $filters;
}
362 | |
/**
 * Replace asciifolding_preserve with the equivalent chain of filters
 * built around icu_folding.
 *
 * @param string[] $filters
 * @param bool $append append icu_folding even if asciifolding_preserve
 *  is not present
 * @return string[] new list of filters
 */
private function switchFiltersToICUFoldingPreserve( array $filters, $append = false ) {
	if ( in_array( 'icu_folding', $filters ) ) {
		// ICU folding already here
		return $filters;
	}

	$preserveIdx = array_search( 'asciifolding_preserve', $filters );
	if ( $preserveIdx === false ) {
		if ( !$append ) {
			return $filters;
		}
		// fake an asciifolding_preserve at the end so the replacement
		// below can be reused
		$preserveIdx = count( $filters );
		$filters[] = 'asciifolding_preserve';
	}

	// with ICU, lowercase is replaced by icu_normalizer/nfkc_cf,
	// in which case unicode normalization is already done.
	$normalizerIdx = array_search( 'icu_normalizer', $filters );
	$needsNormalization = $normalizerIdx === false || $normalizerIdx > $preserveIdx;

	$replacement = [
		'preserve_original_recorder',
		'icu_folding',
		'preserve_original',
		'remove_empty',
	];
	if ( $needsNormalization ) {
		// No icu_normalizer runs before this point, so normalize first
		// to avoid preserving "un-normalized" unicode chars.
		array_unshift( $replacement, 'icu_nfkc_normalization' );
	}

	array_splice( $filters, $preserveIdx, 1, $replacement );
	return $filters;
}
403 | |
/**
 * Return the list of chars to exclude from ICU folding.
 *
 * A non-null CirrusSearchICUFoldingUnicodeSetFilter setting overrides
 * the per-language defaults below.
 *
 * @param string $language Config language
 * @return null|string a negated set ('[^...]') of the characters to leave
 *  unfolded, or null when nothing is configured for the language
 */
protected function getICUSetFilter( $language ) {
	$override = $this->config->get( 'CirrusSearchICUFoldingUnicodeSetFilter' );
	if ( $override !== null ) {
		return $override;
	}
	switch ( $language ) {
		/* @todo: complete the default filters per language
		 *
		 * For Slovak (sk)—which has no folding configured here!—see:
		 * https://www.mediawiki.org/wiki/User:TJones_(WMF)/T223787
		 *
		 * Exceptions are generally listed as Unicode characters for ease of
		 * inspection. However, combining characters (such as for Thai (th))
		 * are \u encoded to prevent problems with display or editing
		 */
		case 'bg': // T325090
			return '[^Йй]';
		case 'bs': // T192395
		case 'hr': // T192395
		case 'sh': // T192395
		case 'sr': // T183015
			// upper/lower pairs: Đđ Žž Ćć Šš Čč
			return '[^ĐđŽžĆćŠšČč]';
		case 'cs': // T284578
			return '[^ÁáČčĎďÉéĚěÍíŇňÓóŘřŠšŤťÚúŮůÝýŽž]';
		case 'da': // T283366
			return '[^ÆæØøÅå]';
		case 'de': // T281379
			return '[^ÄäÖöÜüẞß]';
		case 'eo': // T202173
			return '[^ĈĉĜĝĤĥĴĵŜŝŬŭ]';
		case 'es': // T277699
			return '[^Ññ]';
		case 'et': // T332322
			return '[^ŠšŽžÕõÄäÖöÜü]';
		case 'eu': // T283366
			return '[^Ññ]';
		case 'fi': // T284578
			return '[^ÅåÄäÖö]';
		case 'gl': // T284578
			return '[^Ññ]';
		case 'hu': // T325089
			return '[^ÁáÉéÍíÓóÖöŐőÚúÜüŰű]';
		case 'ja': // T326822
			// This range includes characters that don't currently get ICU folded, in
			// order to keep the overall regex a lot simpler. The specific targets are
			// characters with dakuten and handakuten, the separate (han)dakuten
			// characters (regular and combining) and the prolonged sound mark (chōonpu).
			return '[^が-ヾ]';
		case 'lt': // T325090
			return '[^ĄąČčĘęĖėĮįŠšŲųŪūŽž]';
		case 'lv': // T325089
			return '[^ĀāČčĒēĢģĪīĶķĻļŅņŠšŪūŽž]';
		case 'nb': // T289612
		case 'nn': // T289612
		case 'no':
			return '[^ÆæØøÅå]';
		case 'ro': // T325091
			// including s&t with cedilla because we (have to) use it internally T330893
			return '[^ĂăÂâÎîȘșȚțŞşŢţ]';
		case 'ru':
			return '[^Йй]';
		case 'sv': // T160562
			return '[^ÅåÄäÖö]';
		case 'th': // T294147
			return '[^\u0E47-\u0E4E]';
		case 'tr': // T329762
			// (I and i aren't strictly necessary but they keep the Turkish upper/lower
			// pairs Iı & İi together and makes it clear both are intended.)
			return '[^ÇçĞğIıİiÖöŞşÜü]';
		default:
			return null;
	}
}
481 | |
/**
 * Return the list of chars to exclude from ICU normalization.
 *
 * A non-null CirrusSearchICUNormalizationUnicodeSetFilter setting takes
 * precedence over the per-language default.
 *
 * @param string $language Config language
 * @return null|string
 */
protected function getICUNormSetFilter( $language ) {
	$override = $this->config->get( 'CirrusSearchICUNormalizationUnicodeSetFilter' );
	if ( $override !== null ) {
		return $override;
	}

	// For German (de), see T281379.
	// Capital ẞ is lowercased to ß by german_charfilter;
	// lowercase ß is normalized to ss by german_normalization.
	// (loose comparison, as a switch statement would do)
	if ( $language == 'de' ) {
		return '[^ẞß]';
	}

	return null;
}
501 | |
/**
 * Build an analysis config with sane defaults.
 *
 * The result has four sections ('analyzer', 'filter', 'tokenizer' and
 * 'char_filter'). Plugin-dependent filters (icu_token_repair,
 * icu_normalizer) are only added when the matching plugin is available.
 *
 * @param string $language Config language
 * @return array
 */
private function defaults( $language ) {
	$defaults = [
		'analyzer' => [
			'text' => [
				'type' => $this->getDefaultTextAnalyzerType( $language ),
			],
			// text_search is not configured here because it will be copied from text
			'plain' => [
				// Surprisingly, the Lucene docs claim this works for
				// Chinese, Japanese, and Thai as well.
				// The difference between this and the 'standard'
				// analyzer is the lack of english stop words.
				'type' => 'custom',
				'char_filter' => [ 'nnbsp_norm', 'word_break_helper' ],
				'tokenizer' => 'standard',
				'filter' => [ 'lowercase' ],
			],
			'plain_search' => [
				// In accent squashing languages this will not contain accent
				// squashing to allow searches with accents to only find accents
				// and searches without accents to find both.
				'type' => 'custom',
				'char_filter' => [ 'nnbsp_norm', 'word_break_helper' ],
				'tokenizer' => 'standard',
				'filter' => [ 'lowercase' ],
			],
			// Used by ShortTextIndexField
			'short_text' => [
				'type' => 'custom',
				'tokenizer' => 'whitespace',
				'filter' => [ 'lowercase', 'aggressive_splitting', 'asciifolding_preserve' ],
			],
			'short_text_search' => [
				'type' => 'custom',
				'tokenizer' => 'whitespace',
				'filter' => [ 'lowercase', 'aggressive_splitting' ],
			],
			'source_text_plain' => [
				'type' => 'custom',
				'char_filter' => [ 'word_break_helper_source_text' ],
				'tokenizer' => 'standard',
				'filter' => [ 'lowercase' ],
			],
			'source_text_plain_search' => [
				'type' => 'custom',
				'char_filter' => [ 'word_break_helper_source_text' ],
				'tokenizer' => 'standard',
				'filter' => [ 'lowercase' ],
			],
			'suggest' => [
				'type' => 'custom',
				'tokenizer' => 'standard',
				'filter' => [ 'lowercase', 'suggest_shingle' ],
			],
			'suggest_reverse' => [
				'type' => 'custom',
				'tokenizer' => 'standard',
				'filter' => [ 'lowercase', 'suggest_shingle', 'reverse' ],
			],
			'token_reverse' => [
				'type' => 'custom',
				'tokenizer' => 'no_splitting',
				'filter' => [ 'reverse' ]
			],
			'near_match' => [
				'type' => 'custom',
				'char_filter' => [ 'near_space_flattener' ],
				'tokenizer' => 'no_splitting',
				'filter' => [ 'lowercase' ],
			],
			'near_match_asciifolding' => [
				'type' => 'custom',
				'char_filter' => [ 'near_space_flattener' ],
				'tokenizer' => 'no_splitting',
				'filter' => [ 'truncate_keyword', 'lowercase', 'asciifolding' ],
			],
			'prefix' => [
				'type' => 'custom',
				'char_filter' => [ 'near_space_flattener' ],
				'tokenizer' => 'prefix',
				'filter' => [ 'lowercase' ],
			],
			'prefix_asciifolding' => [
				'type' => 'custom',
				'char_filter' => [ 'near_space_flattener' ],
				'tokenizer' => 'prefix',
				'filter' => [ 'lowercase', 'asciifolding' ],
			],
			'word_prefix' => [
				'type' => 'custom',
				'tokenizer' => 'standard',
				'filter' => [ 'lowercase', 'prefix_ngram_filter' ],
			],
			'keyword' => [
				'type' => 'custom',
				'tokenizer' => 'no_splitting',
				'filter' => [ 'truncate_keyword' ],
			],
			'lowercase_keyword' => [
				'type' => 'custom',
				'tokenizer' => 'no_splitting',
				'filter' => [ 'truncate_keyword', 'lowercase' ],
			],
			'trigram' => [
				'type' => 'custom',
				'tokenizer' => 'trigram',
				'filter' => [ 'lowercase' ],
			],
		],
		'filter' => [
			'suggest_shingle' => [
				'type' => 'shingle',
				'min_shingle_size' => 2,
				'max_shingle_size' => 3,
				'output_unigrams' => true,
			],
			'lowercase' => [
				'type' => 'lowercase',
			],
			'aggressive_splitting' => [
				'type' => 'word_delimiter_graph',
				'stem_english_possessive' => false,
				'preserve_original' => false
			],
			'prefix_ngram_filter' => [
				'type' => 'edgeNGram',
				'max_gram' => CirrusSearch::MAX_TITLE_SEARCH,
			],
			'asciifolding' => [
				'type' => 'asciifolding',
				'preserve_original' => false
			],
			'asciifolding_preserve' => [
				'type' => 'asciifolding',
				'preserve_original' => true
			],
			// The 'keyword' type in ES seems like a hack
			// and doesn't allow normalization (like lowercase)
			// prior to 5.2. Instead we consistently use 'text'
			// and truncate where necessary.
			'truncate_keyword' => [
				'type' => 'truncate',
				'length' => self::KEYWORD_IGNORE_ABOVE,
			],
			'remove_empty' => [
				'type' => 'length',
				'min' => 1,
			],
		],
		'tokenizer' => [
			'prefix' => [
				'type' => 'edgeNGram',
				'max_gram' => CirrusSearch::MAX_TITLE_SEARCH,
			],
			'no_splitting' => [ // Just grab the whole term.
				'type' => 'keyword',
			],
			'trigram' => [
				'type' => 'nGram',
				'min_gram' => 3,
				'max_gram' => 3,
			],
		],
		'char_filter' => [
			// Flattens things that are space like to spaces in the near_match style analyzers
			'near_space_flattener' => [
				'type' => 'limited_mapping',
				'mappings' => [
					"'=>\u0020", // Useful for finding names
					'\u2019=>\u0020', // Unicode right single quote
					'\u02BC=>\u0020', // Unicode modifier letter apostrophe
					'_=>\u0020', // MediaWiki loves _ and people are used to it but it
					// usually means space
					'-=>\u0020', // Useful for finding hyphenated names unhyphenated
				],
			],
			// map narrow no-break space to plain space to compensate for ES6.x+
			// analyzers generally not doing so
			'nnbsp_norm' => [
				'type' => 'limited_mapping',
				'mappings' => [
					'\u202F=>\u0020',
				],
			],
			// Add a space between lowercase letter {Ll} and uppercase {Lu} or
			// titlecase {Lt} letter, allowing for optional combining marks {M}
			// or invisibles {Cf}. This is expensive, so use camelCase_splitter
			// in extra-analysis-textify instead, if available (T219108/T346051)
			'regex_camelCase' => [
				'type' => 'pattern_replace',
				'pattern' => '(\\p{Ll}[\\p{M}\\p{Cf}]*)([\\p{Lu}\\p{Lt}])',
				'replacement' => '$1 $2'
			],
			// Replace period (regular or fullwidth) between [non-letter +
			// letter] and [letter + non-letter]. This is slow, and also only
			// handles the simplest case. Use acronym_fixer in
			// extra-analysis-textify instead, if available (T170625/T346051)
			'regex_acronym_fixer' => [
				'type' => 'pattern_replace',
				'pattern' => '(?<=(?:^|\\P{L})\\p{L})[.．](\\p{L})(?=\\P{L}|$)',
				'replacement' => '$1'
			],
			// combine universally-applied mappings into one mapping to save on the
			// overhead of calling multiple mappings
			'globo_norm' => [
				'type' => 'limited_mapping',
				'mappings' => [
					// map lots of apostrophe-like characters to apostrophe (T315118);
					// formerly apostrophe_norm
					"`=>'", // grave accent
					"´=>'", // acute accent
					"ʹ=>'", // modifier letter prime
					"ʻ=>'", // modifier letter turned comma
					"ʼ=>'", // modifier letter apostrophe
					"ʽ=>'", // modifier letter reversed comma
					"ʾ=>'", // modifier letter right half ring
					"ʿ=>'", // modifier letter left half ring
					"ˋ=>'", // modifier letter grave accent
					"՚=>'", // Armenian apostrophe
					"\u05F3=>'", // Hebrew punctuation geresh
					"‘=>'", // left single quotation mark
					"’=>'", // right single quotation mark
					"‛=>'", // single high-reversed-9 quotation mark
					"′=>'", // prime
					"‵=>'", // reversed prime
					"ꞌ=>'", // Latin small letter saltillo
					"＇=>'", // fullwidth apostrophe
					"｀=>'", // fullwidth grave accent
					// map narrow no-break space to plain space to compensate for ES6.x+
					// analyzers generally not doing so; copied from nnbsp_norm, which
					// is still needed elsewhere
					'\u202F=>\u0020',
					// Delete primary and secondary stress markers, which are
					// inconsistently used across phonetic transcriptions
					"ˈ=>", // modifier letter vertical line
					"ˌ=>", // modifier letter low vertical line
					// Delete Arabic tatweel (ـ) (used largely for cosmetic purposes)
					"\u0640=>", // tatweel
					// Convert Arabic thousand separator and Arabic comma to comma for
					// more consistent number parsing
					"٬=>,", // Arabic thousands separator
					"،=>,", // Arabic comma
					// delete Armenian emphasis marks, exclamation marks, and question
					// marks, since they modify words rather than follow them.
					"՛=>", // Armenian emphasis mark
					"՜=>", // Armenian exclamation mark
					"՞=>", // Armenian question mark
					// micro sign to mu, to prevent some unneeded ICU tokenizer splits
					// icu_normalize does this, too.. just later
					"µ=>μ",
				],
			],
			// Converts things that don't always count as word breaks into spaces
			// which (almost) always count as word breaks (e.g., the Nori and SmartCN
			// tokenizers do not always count spaces as word breaks!)
			'word_break_helper' => [
				'type' => 'limited_mapping',
				'mappings' => [
					'_=>\u0020',
					':=>\u0020',
					// These are more useful for code:
					'.=>\u0020',
					'(=>\u0020',
					')=>\u0020',
					// fullwidth variants
					'．=>\u0020',
					'＿=>\u0020',
					'：=>\u0020',
					// middle dot
					'·=>\u0020',
				],
			],
			'word_break_helper_source_text' => [
				'type' => 'limited_mapping',
				'mappings' => [
					'_=>\u0020',
					// These are more useful for code:
					'.=>\u0020',
					'(=>\u0020',
					')=>\u0020',
					':=>\u0020', // T145023
				],
			],
			'dotted_I_fix' => [
				// A common regression caused by unpacking is that İ is no longer
				// treated correctly, so specify the mapping just once and re-use
				// in analyzer/text/char_filter as needed.
				'type' => 'limited_mapping',
				'mappings' => [
					'İ=>I',
				],
			],
		],
	];
	// Any analyzer still typed 'default' gets a minimal custom definition
	foreach ( $defaults[ 'analyzer' ] as &$analyzer ) {
		if ( $analyzer[ 'type' ] === 'default' ) {
			$analyzer = [
				'type' => 'custom',
				'tokenizer' => 'standard',
				'filter' => [ 'lowercase' ],
			];
		}
	}
	// drop the reference so later code cannot write through it by accident
	unset( $analyzer );
	if ( $this->isTextifyAvailable() && $this->shouldActivateIcuTokenization( $language ) ) {
		$defaults[ 'filter' ][ 'icutokrep_no_camel_split' ] = [
			'type' => 'icu_token_repair',
			'keep_camel_split' => false
		];
	}
	if ( $this->isIcuAvailable() ) {
		$defaults[ 'filter' ][ 'icu_normalizer' ] = [
			'type' => 'icu_normalizer',
			'name' => 'nfkc_cf',
		];
		$unicodeSetFilter = $this->getICUNormSetFilter( $language );
		if ( $unicodeSetFilter !== null ) {
			$defaults[ 'filter' ][ 'icu_normalizer' ][ 'unicodeSetFilter' ] = $unicodeSetFilter;
		}
	}

	return $defaults;
}
830 | |
/**
 * Customize the default config for the language.
 *
 * The analyzer type for $language is looked up with
 * getDefaultTextAnalyzerType() and used to select one of the per-language
 * cases below; most cases rebuild the analysis config through an
 * AnalyzerBuilder and a few also patch individual entries of $config
 * directly. Languages without a case keep the config they were given.
 *
 * After the per-language step, two things always happen:
 *  - the 'text_search' analyzer is set to a copy of the 'text' analyzer;
 *  - if ICU is available, every 'lowercase' entry in an analyzer's filter
 *    list is replaced by 'icu_normalizer' (preceded by 'lowercase' when the
 *    'lowercase' filter has a language-specific definition).
 *
 * Side effect: the 'ukrainian-unpacked' case sets
 * $this->languagesWithIcuFolding['uk'] to true.
 *
 * @param array $config
 * @param string $language Config language
 * @return array
 */
private function customize( $config, $language ) {
	$langName = $this->getDefaultTextAnalyzerType( $language );
	switch ( $langName ) {
		// Please add languages in alphabetical order.

		// usual unpacked languages
		case 'basque': // Unpack Basque analyzer T283366
		case 'brazilian': // Unpack Brazilian analyzer T325092
		case 'bulgarian': // Unpack Bulgarian analyzer T325090
		case 'czech': // Unpack Czech analyzer T284578
		case 'danish': // Unpack Danish analyzer T283366
		case 'estonian': // Unpack Estonian analyzer T332322
		case 'finnish': // Unpack Finnish analyzer T284578
		case 'galician': // Unpack Galician analyzer T284578
		case 'hungarian': // Unpack Hungarian analyzer T325089
		case 'latvian': // Unpack Latvian analyzer T325089
		case 'lithuanian': // Unpack Lithuanian analyzer T325090
		case 'norwegian': // Unpack Norwegian analyzer T289612
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				build( $config );
			break;

		// usual unpacked languages, with "light" variant stemmer
		case 'portuguese': // Unpack Portuguese analyzer T281379
		case 'spanish': // Unpack Spanish analyzer T277699
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withLightStemmer()->
				build( $config );
			break;

		// customized languages
		case 'arabic':
		case 'arabic-egyptian':
		case 'arabic-moroccan':
			// Unpack Arabic analyzer T294147
			$arBuilder = ( new AnalyzerBuilder( 'arabic' ) )->
				withUnpackedAnalyzer()->
				withDecimalDigit()->
				insertFiltersBefore( 'arabic_stemmer', [ 'arabic_normalization' ] );

			// load extra stopwords for Arabic varieties
			if ( $langName === 'arabic-egyptian' || $langName === 'arabic-moroccan' ) {
				$arStopwords = require __DIR__ . '/AnalysisLanguageData/arabicStopwords.php';
				$arBuilder->withExtraStop( $arStopwords, 'arz_ary_stop', 'arabic_stop' );
			}

			$config = $arBuilder->build( $config );
			break;
		case 'armenian': // Unpack Armenian analyzer T325089
			// char map: Armenian uses ․ ("one-dot leader") about 10% as often as . (period)
			// stopwords նաև & և get normalized to նաեւ & եւ, so pick those up, too.
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withLimitedCharMap( [ '․=>.' ] )->
				withExtraStop( [ 'նաեւ', 'եւ' ], 'armenian_norm_stop', 'armenian_stop' )->
				build( $config );
			break;
		case 'azerbaijani':
		case 'crimean-tatar':
		case 'gagauz':
		case 'kazakh':
		case 'tatar':
			// Turkic languages that use I/ı & İ/i, so need Turkish lowercasing
			$config = ( new AnalyzerBuilder( $langName ) )->
				withFilters( [ 'lowercase' ] )->
				withLangLowercase( 'turkish' )->
				build( $config );
			break;
		case 'bengali': // Unpack Bengali analyzer T294067
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withDecimalDigit()->
				insertFiltersBefore( 'bengali_stop', [ 'indic_normalization' ] )->
				build( $config );
			break;
		case 'bosnian':
		case 'croatian':
		case 'serbian':
		case 'serbo-croatian':
			// Unpack default analyzer to add Serbian stemming and custom folding
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T183015
			// and https://www.mediawiki.org/wiki/User:TJones_(WMF)/T192395
			$config = ( new AnalyzerBuilder( $langName ) )->
				withFilters( [ 'lowercase', 'asciifolding', 'serbian_stemmer' ] )->
				build( $config );
			break;
		case 'catalan':
			// Unpack Catalan analyzer T283366
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withElision( [ 'd', 'l', 'm', 'n', 's', 't' ] )->
				build( $config );
			break;
		case 'chinese':
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T158203
			$config[ 'char_filter' ][ 'tsconvert' ] = [
				'type' => 'stconvert',
				'delimiter' => '#',
				'keep_both' => false,
				'convert_type' => 't2s',
			];

			// char map: hack for STConvert errors (still present as of July 2023)
			// see https://github.com/medcl/elasticsearch-analysis-stconvert/issues/13
			// stop: SmartCN converts lots of punctuation to ',' but we don't want to index it
			$config = ( new AnalyzerBuilder( $langName ) )->
				withCharMap( [ '\u606d\u5f18=>\u606d \u5f18', '\u5138=>\u3469' ], 'stconvertfix' )->
				withCharFilters( [ 'stconvertfix', 'tsconvert' ] )->
				withTokenizer( 'smartcn_tokenizer' )->
				withStop( [ ',' ], 'smartcn_stop' )->
				withFilters( [ 'smartcn_stop', 'lowercase' ] )->
				build( $config );

			$config[ 'analyzer' ][ 'plain' ][ 'filter' ] = [ 'smartcn_stop', 'lowercase' ];
			$config[ 'analyzer' ][ 'plain_search' ][ 'filter' ] =
				$config[ 'analyzer' ][ 'plain' ][ 'filter' ];
			break;
		case 'cjk':
			// Unpack CJK analyzer T326822
			// map (han)dakuten to combining forms or icu_normalizer will add spaces
			$dakutenMap = [ '゛=>\u3099', '゜=>\u309a' ];

			// cjk_bigram negates the benefits of the icu_tokenizer for CJK text. The
			// icu_tokenizer also has a few bad side effects, so don't use it for cjk.
			// Default cjk stop words are almost the same as _english_ (add s & t; drop
			// an). Stop words are searchable via 'plain' anyway, so just use _english_
			$config = ( new AnalyzerBuilder( 'cjk' ) )->
				withUnpackedAnalyzer()->
				withLimitedCharMap( $dakutenMap )->
				withTokenizer( self::STANDARD_TOKENIZER_ONLY )->
				withStop( '_english_' )->
				omitStemmer()->
				insertFiltersBefore( 'lowercase', [ 'cjk_width' ] )->
				insertFiltersBefore( 'cjk_stop', [ 'cjk_bigram' ] )->
				build( $config );
			break;
		case 'dutch':
			// Unpack Dutch analyzer T281379
			$nlOverride = [ // these are in the default Dutch analyzer
				'fiets=>fiets',
				'bromfiets=>bromfiets',
				'ei=>eier',
				'kind=>kinder'
			];
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withStemmerOverride( $nlOverride )->
				build( $config );
			break;
		case 'english':
			// Map hiragana (\u3041-\u3096) to katakana (\u30a1-\u30f6), currently only for
			// English
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T176197
			$hkmap = [];
			for ( $i = 0x3041; $i <= 0x3096; $i++ ) {
				$hkmap[] = sprintf( '\\u%04x=>\\u%04x', $i, $i + 0x60 );
			}

			// Replace English analyzer with a rebuilt copy with asciifolding inserted
			// before stemming
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T142037
			$config = ( new AnalyzerBuilder( $langName ) )->
				withLimitedCharMap( $hkmap, 'kana_map' )->
				withCharFilters( [ 'kana_map' ] )->
				withExtraStemmer( 'possessive_english' )->
				withStemmerOverride( 'guidelines => guideline', 'custom_stem' )->
				withFilters( [ 'possessive_english', 'lowercase', 'stop', 'asciifolding',
					'kstem', 'custom_stem' ] )->
				build( $config );

			// Add asciifolding_preserve to the plain analyzer as well (but not plain_search)
			$config[ 'analyzer' ][ 'plain' ][ 'filter' ][] = 'asciifolding_preserve';
			// Add asciifolding_preserve filters
			$config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve';
			break;
		case 'esperanto':
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T202173
			$config = ( new AnalyzerBuilder( $langName ) )->
				withFilters( [ 'lowercase', 'asciifolding', 'esperanto_stemmer' ] )->
				build( $config );
			break;
		case 'french':
			// Add asciifolding_preserve to filters
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T142620
			$config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve';

			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withLimitedCharMap( [ '\u02BC=>\u0027' ] )->
				withElision( [ 'l', 'm', 't', 'qu', 'n', 's', 'j', 'd', 'c',
					'jusqu', 'quoiqu', 'lorsqu', 'puisqu' ] )->
				withLightStemmer()->
				withAsciifoldingPreserve()->
				build( $config );
			break;
		case 'german':
			// Unpack German analyzer T281379
			// char map: We have to explicitly map capital ẞ to lowercase ß
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withLimitedCharMap( [ 'ẞ=>ß' ] )->
				withLightStemmer()->
				insertFiltersBefore( 'german_stemmer', [ 'german_normalization' ] )->
				build( $config );

			$config[ 'analyzer' ][ 'plain' ][ 'char_filter' ][] = 'german_charfilter';
			$config[ 'analyzer' ][ 'plain_search' ][ 'char_filter' ][] = 'german_charfilter';
			break;
		case 'greek':
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				omitAsciifolding()->
				withLangLowercase()->
				withRemoveEmpty()->
				build( $config );
			break;
		case 'hebrew':
			$config = ( new AnalyzerBuilder( $langName ) )->
				withTokenizer( 'hebrew' )->
				withFilters( [ 'niqqud', 'hebrew_lemmatizer', 'remove_duplicates', 'lowercase',
					'asciifolding' ] )->
				build( $config );
			break;
		case 'hindi':
			// Unpack Hindi analyzer T289612
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withDecimalDigit()->
				insertFiltersBefore( 'hindi_stop',
					[ 'indic_normalization', 'hindi_normalization' ] )->
				build( $config );
			break;
		case 'indonesian':
		case 'malay':
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T196780
			$config = ( new AnalyzerBuilder( 'indonesian' ) )->
				withUnpackedAnalyzer()->
				omitAsciifolding()->
				build( $config );
			break;
		case 'irish':
			$gaCharMap = [ 'ḃ=>bh', 'ċ=>ch', 'ḋ=>dh', 'ḟ=>fh', 'ġ=>gh', 'ṁ=>mh', 'ṗ=>ph',
				'ṡ=>sh', 'ẛ=>sh', 'ṫ=>th', 'Ḃ=>BH', 'Ċ=>CH', 'Ḋ=>DH', 'Ḟ=>FH', 'Ġ=>GH',
				'Ṁ=>MH', 'Ṗ=>PH', 'Ṡ=>SH', 'Ṫ=>TH' ];

			// Add b, bh, g, m for camelCase cleanup
			$gaHyphenStop = [ 'h', 'n', 't', 'b', 'bh', 'g', 'm' ];

			// Unpack Irish analyzer T289612
			// See also https://www.mediawiki.org/wiki/User:TJones_(WMF)/T217602
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withCharMap( $gaCharMap )->
				withExtraStop( $gaHyphenStop, 'irish_hyphenation', 'irish_elision', true )->
				withElision( [ 'd', 'm', 'b' ] )->
				withLangLowercase()->
				build( $config );
			break;
		case 'italian':
			// Replace the default Italian analyzer with a rebuilt copy with additional filters
			$itElision = [ 'c', 'l', 'all', 'dall', 'dell', 'nell', 'sull', 'coll', 'pell',
				'gl', 'agl', 'dagl', 'degl', 'negl', 'sugl', 'un', 'm', 't', 's', 'v', 'd' ];
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withElision( $itElision )->
				withLightStemmer()->
				build( $config );

			// Add asciifolding_preserve to the plain analyzer as well (but not plain_search)
			$config[ 'analyzer' ][ 'plain' ][ 'filter' ][] = 'asciifolding_preserve';
			// Add asciifolding_preserve to filters
			$config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve';
			break;
		case 'japanese':
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T166731
			// pre-convert fullwidth numbers because Kuromoji tokenizer treats them weirdly
			$config = ( new AnalyzerBuilder( $langName ) )->
				withNumberCharFilter( 0xff10, 'fullwidthnumfix' )->
				withCharFilters( [ 'fullwidthnumfix' ] )->
				withTokenizer( 'kuromoji_tokenizer' )->
				withFilters( [ 'kuromoji_baseform', 'cjk_width', 'ja_stop', 'kuromoji_stemmer',
					'lowercase' ] )->
				build( $config );
			break;
		case 'khmer':
			// See Khmer: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T185721
			$config = ( new AnalyzerBuilder( $langName ) )->
				withNumberCharFilter( 0x17e0 )->
				withCharFilters( [ 'khmer_syll_reorder', 'khmer_numbers' ] )->
				withFilters( [ 'lowercase' ] )->
				build( $config );
			break;
		case 'korean':
			// Unpack nori analyzer to add ICU normalization and custom filters
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T206874

			// Nori-specific character filter
			$noriMap = [
				'\u00B7=>\u0020', // convert middle dot to space
				'\u318D=>\u0020', // arae-a to space
				'\u00AD=>', // remove soft hyphens
				'\u200C=>', // remove zero-width non-joiners
			];

			// Nori-specific pattern_replace to strip combining diacritics
			$config[ 'char_filter' ][ 'nori_combo_filter' ] =
				AnalyzerBuilder::patternFilter( '[\\u0300-\\u0331]' );

			// 'mixed' mode keeps the original token plus the compound parts
			// the default is 'discard' which only keeps the parts
			$config[ 'tokenizer' ][ 'nori_tok' ] = [
				'type' => 'nori_tokenizer',
				'decompound_mode' => 'mixed',
			];

			// Nori-specific part of speech filter (add 'VCP', 'VCN', 'VX' to default)
			$config[ 'filter' ][ 'nori_posfilter' ] = [
				'type' => 'nori_part_of_speech',
				'stoptags' => [ 'E', 'IC', 'J', 'MAG', 'MAJ', 'MM', 'SP', 'SSC', 'SSO',
					'SC', 'SE', 'XPN', 'XSA', 'XSN', 'XSV', 'UNA', 'NA', 'VSV', 'VCP',
					'VCN', 'VX' ],
			];

			$config = ( new AnalyzerBuilder( $langName ) )->
				withLimitedCharMap( $noriMap, 'nori_charfilter' )->
				withCharFilters( [ 'nori_charfilter', 'nori_combo_filter' ] )->
				withTokenizer( 'nori_tok' )->
				withFilters( [ 'nori_posfilter', 'nori_readingform', 'lowercase',
					'remove_empty' ] )->
				build( $config );
			break;
		case 'mirandese':
			// Unpack default analyzer to add Mirandese-specific elision and stop words
			// See phab ticket T194941
			$mwlStopwords = require __DIR__ . '/AnalysisLanguageData/mirandeseStopwords.php';
			$config = ( new AnalyzerBuilder( $langName ) )->
				withElision( [ 'l', 'd', 'qu' ] )->
				withStop( $mwlStopwords )->
				withFilters( [ 'lowercase', 'mirandese_elision', 'mirandese_stop' ] )->
				build( $config );
			break;
		case 'persian': // Unpack Persian analyzer T325090
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withLimitedCharMap( [ '\u200C=>\u0020' ], 'zero_width_spaces' )->
				withDecimalDigit()->
				omitStemmer()->
				insertFiltersBefore( 'persian_stop',
					[ 'arabic_normalization', 'persian_normalization' ] )->
				build( $config );
			break;
		case 'polish':
			// these are real stop words for Polish
			$plStopwords = require __DIR__ . '/AnalysisLanguageData/polishStopwords.php';

			// Stempel-specific stop words--additional unreliable stems
			$stempelStopwords = [ 'ować', 'iwać', 'obić', 'snąć', 'ywać', 'ium', 'my', 'um' ];

			// Stempel is statistical, and certain stems are really terrible, so we filter them
			// after stemming. See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T186046
			$config[ 'filter' ][ 'stempel_pattern_filter' ] =
				AnalyzerBuilder::patternFilter( '^([a-zął]?[a-zćń]|..ć|\d.*ć)$' );

			// Same builder method as the Greek and Indonesian cases; spelled
			// with the same casing for consistency.
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withStop( $plStopwords )->
				omitStemmer()->
				omitAsciifolding()->
				appendFilters( [ 'polish_stem', 'stempel_pattern_filter', 'remove_empty' ] )->
				withExtraStop( $stempelStopwords, 'stempel_stop' )->
				build( $config );
			break;
		case 'romanian': // Unpack Romanian analyzer T325091 / T330893
			// Counterintuitively, we need to map correct s&t (with commas) to older
			// incorrect forms (with cedilla) so that the old Snowball stemmer (from before
			// comma forms were available) will work; also normalize versions with
			// combining diacritics to single characters.
			$cedillaMap = [
				'ș=>ş', 's\u0326=>ş', 's\u0327=>ş', 'ț=>ţ', 't\u0326=>ţ', 't\u0327=>ţ',
				'Ș=>Ş', 'S\u0326=>Ş', 'S\u0327=>Ş', 'Ț=>Ţ', 'T\u0326=>Ţ', 'T\u0327=>Ţ',
			];

			// Add stopword variants with modern commas instead of old cedillas so that
			// both are handled, regardless of the character mapping needed for the
			// stemmer. In the future, Lucene should update their stopwords and these will
			// be included.
			$roStopwords = require __DIR__ . '/AnalysisLanguageData/romanianStopwords.php';

			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withCharMap( $cedillaMap )->
				withExtraStop( $roStopwords, 'ro_comma_stop', 'romanian_stemmer' )->
				build( $config );
			break;
		case 'russian':
			// unpack built-in Russian analyzer and add character filter
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T124592
			$ruCharMap = [
				'\u0301=>', // combining acute accent, only used to show stress T102298
				'\u0435\u0308=>\u0435', // T124592 fold ё=>е and Ё=>Е, with combining
				'\u0415\u0308=>\u0415', // diacritic...
				'\u0451=>\u0435', // ... or precomposed
				'\u0401=>\u0415',
			];
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withCharMap( $ruCharMap )->
				build( $config );

			// add Russian character mappings to near_space_flattener, and convert it from
			// limited_mapping to mapping to handle multi-char maps
			$config[ 'char_filter' ][ 'near_space_flattener' ][ 'type' ] = 'mapping';
			array_push( $config[ 'char_filter' ][ 'near_space_flattener' ][ 'mappings' ],
				...$ruCharMap );

			// Drop acute stress marks and fold ё=>е everywhere
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T124592
			$config[ 'analyzer' ][ 'plain' ][ 'char_filter' ][] = 'russian_charfilter';
			$config[ 'analyzer' ][ 'plain_search' ][ 'char_filter' ][] = 'russian_charfilter';

			$config[ 'analyzer' ][ 'suggest' ][ 'char_filter' ][] = 'russian_charfilter';
			$config[ 'analyzer' ][ 'suggest_reverse' ][ 'char_filter' ][] = 'russian_charfilter';
			break;
		case 'slovak':
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T190815
			// and https://www.mediawiki.org/wiki/User:TJones_(WMF)/T223787
			$config = ( new AnalyzerBuilder( $langName ) )->
				withFilters( [ 'lowercase', 'slovak_stemmer', 'asciifolding' ] )->
				build( $config );
			break;
		case 'sorani': // Unpack Sorani analyzer T325091
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withDecimalDigit()->
				insertFiltersBefore( 'lowercase', [ 'sorani_normalization' ] )->
				build( $config );
			break;
		case 'swedish':
			// Add asciifolding_preserve to lowercase_keyword
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T160562
			$config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve';

			// Unpack built-in swedish analyzer to add asciifolding_preserve
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withAsciifoldingPreserve()->
				build( $config );
			break;
		case 'thai':
			// Unpack and improve Thai analyzer: T294147
			$thCharMap = [
				'_=>\u0020', // split tokens on underscore ..
				';=>\u0020', // .. semicolon
				':=>\u0020', // .. colon
				'·=>\u0020', // .. middle dot
				'‧=>\u0020', // .. & hyphenation point
				'ฃ=>ข', // replace obsolete ฃ
				'ฅ=>ค', // replace obsolete ฅ
				'\u0e4d\u0e32=>\u0e33', // compose nikhahit + sara aa = sara am
				'\u0e4d\u0e48\u0e32=>\u0e48\u0e33', // recompose sara am split around..
				'\u0e4d\u0e49\u0e32=>\u0e49\u0e33', // .. other diacritics
				'\u0e33\u0e48=>\u0e48\u0e33', // sara am should consistently..
				'\u0e33\u0e49=>\u0e49\u0e33', // .. come after other diacritics
				'\u0E34\u0E4D=>\u0E36', // compose sara i + nikhahit = sara ue..
				'\u0E4D\u0E34=>\u0E36', // .. in either order
			];

			// instantiate basic unpacked analyzer builder, plus thai tokenizer by default
			$thBuilder = ( new AnalyzerBuilder( $langName ) )
				->withUnpackedAnalyzer()
				->withTokenizer( 'thai' );

			if ( $this->isIcuAvailable() ) {
				// ICU tokenizer is preferred in general. If it is available, replace
				// default tokenizer. Also add thai_repl_pat char filter to accommodate
				// some of its weaknesses.
				$thBuilder->withTokenizer( $this->icu_tokenizer );

				$thaiLetterPat = '[ก-๏]'; // Thai characters, except for digits.
				$config[ 'char_filter' ][ 'thai_repl_pat' ] =
					// break between any digits and Thai letters, or vice versa
					// break *Thai* tokens on periods (by making them spaces)
					// (regex look-behind is okay, but look-ahead breaks offsets)
					AnalyzerBuilder::patternFilter( "(?<=\\p{Nd})($thaiLetterPat)" .
						"|(?<=$thaiLetterPat)(\\p{Nd})" .
						"|(?<=$thaiLetterPat)\.($thaiLetterPat)",
						' $1$2$3' );
				$thBuilder->withCharFilters( [ 'thai_repl_pat' ] );

				// if icu_token_repair (in the textify plugin) is available, we need a
				// reverse number map so it doesn't rejoin split-off Arabic numbers.
				if ( $this->isTextifyAvailable() ) {
					$thBuilder->withReversedNumberCharFilter( 0x0e50 );
				}
			} else {
				// if we have to settle for the Thai tokenizer, add some additional
				// character filters to accommodate some of its weaknesses
				$thThaiTokSplits = [
					'\u200B=>', // delete zero width space
					'-=>\u0020', // split tokens on hyphen-minus ..
					'‐=>\u0020', // .. hyphen
					'–=>\u0020', // .. en dash
					'—=>\u0020', // .. em dash
					'―=>\u0020', // .. horizontal bar
					// Written as an escape so the entry cannot collapse into the
					// ASCII hyphen-minus above: that would leave U+FF0D unmapped
					// and register the same match string twice.
					'\uFF0D=>\u0020', // .. fullwidth hyphen
					'"=>\u0020', // .. & double quote
				];
				array_push( $thCharMap, ...$thThaiTokSplits );
			}

			// add in the rest of the bits that are always needed, and build
			$config = $thBuilder->withCharMap( $thCharMap )->
				withDecimalDigit()->
				omitStemmer()->
				build( $config );
			break;
		case 'turkish':
			$trAposFilter = 'apostrophe';
			if ( in_array( 'extra-analysis-turkish', $this->plugins, true ) ) {
				$trAposFilter = 'better_apostrophe';
			}
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withLangLowercase()->
				insertFiltersBefore( 'turkish_stop', [ $trAposFilter ] )->
				build( $config );
			break;
		case 'ukrainian-unpacked':
			$this->languagesWithIcuFolding['uk'] = true;
			$ukCharMap = [
				'‘=>\'', // normalize apostrophes
				'’=>\'',
				'`=>\'',
				'´=>\'',
				'ʼ=>\'',
				'\u0301=>', // delete combining acute and soft hyphen
				'\u00AD=>',
				'ґ=>г', // normalize ghe with upturn
				'Ґ=>Г',
			];
			// lowercase twice because stopwords are case sensitive, and the stemmer
			// generates some output with uppercase initial letters, even for
			// lowercase input (usually proper names)
			$ukFilters = [ 'lowercase', 'ukrainian_stop', 'ukrainian_stemmer',
				'lowercase', 'remove_duplicates', 'asciifolding' ];
			$config = ( new AnalyzerBuilder( 'ukrainian' ) )->
				withLimitedCharMap( $ukCharMap )->
				withCharFilters( [ 'ukrainian_charfilter' ] )->
				withFilters( $ukFilters )->
				build( $config );
			break;
		default:
			// do nothing--default config is already set up
			break;
	}

	// text_search is just a copy of text
	// @phan-suppress-next-line PhanTypeInvalidDimOffset
	$config[ 'analyzer' ][ 'text_search' ] = $config[ 'analyzer' ][ 'text' ];

	// replace lowercase filters with icu_normalizer filter
	if ( $this->isIcuAvailable() ) {
		foreach ( $config[ 'analyzer' ] as &$analyzer ) {
			if ( !isset( $analyzer[ 'filter' ] ) ) {
				continue;
			}

			$tmpFilters = [];
			foreach ( $analyzer[ 'filter' ] as $filter ) {
				if ( $filter === 'lowercase' ) {
					// If lowercase filter has language-specific processing, keep it,
					// and do it before ICU normalization, particularly for Greek,
					// Irish, and Turkish
					// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T203117
					// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T217602
					if ( isset( $config[ 'filter' ][ 'lowercase' ][ 'language' ] ) ) {
						$tmpFilters[] = 'lowercase';
					}
					$tmpFilters[] = 'icu_normalizer';
				} else {
					$tmpFilters[] = $filter;
				}
			}
			$analyzer[ 'filter' ] = $tmpFilters;
		}
		// Break the reference so later writes to $analyzer cannot alias the
		// last analyzer entry.
		unset( $analyzer );
	}

	return $config;
}
1430 | |
/**
 * Workaround for https://issues.apache.org/jira/browse/LUCENE-7468
 * The preserve_original duplicates token even if they are
 * not modified, leading to more space used and wrong term frequencies.
 * Workaround is to append a unique filter to remove the dups.
 * (made public for unit tests)
 *
 * For every analyzer that is either untyped or of type 'custom' and has a
 * filter list, 'dedup_asciifolding' is inserted directly after the first
 * 'asciifolding_preserve' entry. If at least one analyzer was changed, the
 * 'dedup_asciifolding' filter definition is added to $config['filter'].
 *
 * @param mixed[] $config
 * @return mixed[] update mapping
 */
public function fixAsciiFolding( array $config ) {
	$needDedupFilter = false;
	foreach ( $config[ 'analyzer' ] as &$value ) {
		if ( isset( $value[ 'type' ] ) && $value[ 'type' ] !== 'custom' ) {
			continue;
		}
		if ( !isset( $value[ 'filter' ] ) ) {
			continue;
		}
		// array_search() returns a key while array_splice() expects an
		// offset; reindex first so the two coincide even when the filter
		// list has gaps in its keys. array_splice() renumbers the list
		// anyway, so the result shape is unchanged.
		$filters = array_values( $value[ 'filter' ] );
		$ascii_idx = array_search( 'asciifolding_preserve', $filters, true );
		if ( $ascii_idx !== false ) {
			$needDedupFilter = true;
			array_splice( $filters, $ascii_idx + 1, 0, [ 'dedup_asciifolding' ] );
			$value[ 'filter' ] = $filters;
		}
	}
	// Release the by-reference loop variable before touching $config again.
	unset( $value );

	if ( $needDedupFilter ) {
		$config[ 'filter' ][ 'dedup_asciifolding' ] = [
			'type' => 'unique',
			'only_on_same_position' => true,
		];
	}
	return $config;
}
1464 | |
/**
 * Look up the analyzer type registered for a language.
 *
 * This is a best-effort default rather than a per-language customization:
 * only an exact match on the language key counts, and anything else falls
 * back to 'default'.
 *
 * @param string $language Config language
 * @return string the analyzer type
 */
public function getDefaultTextAnalyzerType( $language ) {
	$knownAnalyzers = $this->elasticsearchLanguageAnalyzers;

	// Exact key match only; no fallback chain is attempted here.
	return array_key_exists( $language, $knownAnalyzers )
		? $knownAnalyzers[ $language ]
		: 'default';
}
1481 | |
/**
 * Collect the filters that the given analyzers reference by name but that
 * have no explicit definition under $config['filter'].
 *
 * Each such filter is reported with a minimal definition whose type is the
 * filter's own name.
 *
 * @param array[] &$config Full configuration array
 * @param string[] $analyzers List of analyzers to consider.
 * @return array List of default filters, each containing only filter type
 */
private function getDefaultFilters( array &$config, array $analyzers ) {
	$implicitFilters = [];
	foreach ( $analyzers as $analyzerName ) {
		// Missing or empty filter lists contribute nothing.
		$referenced = $config[ 'analyzer' ][ $analyzerName ][ 'filter' ] ?? [];
		foreach ( $referenced ?: [] as $filterName ) {
			if ( isset( $config[ 'filter' ][ $filterName ] ) ) {
				// Explicitly defined, so not a built-in default.
				continue;
			}
			// This is default definition for the built-in filter
			$implicitFilters[ $filterName ] = [ 'type' => $filterName ];
		}
	}
	return $implicitFilters;
}
1504 | |
/**
 * Reconcile the filters of a config with an already-known set of filters.
 *
 * A filter whose name is unknown is returned as-is. A filter whose name is
 * known and whose definition is equal to the known one is dropped. A filter
 * whose name is known but whose definition differs is returned under a new
 * name ("<prefix>_<name>"), and every analyzer in $config that referenced
 * the old name is updated to use the new one.
 *
 * @param array[] &$config Configuration being processed
 * @param array[] $standardFilters Existing filters list
 * @param array[] $defaultFilters List of default filters already mentioned in the config
 * @param string $prefix Prefix for disambiguation
 * @return array[] The list of filters not in the old config.
 */
private function resolveFilters( array &$config, array $standardFilters, array $defaultFilters,
	string $prefix ) {
	$unresolved = [];
	foreach ( $config[ 'filter' ] as $name => $filter ) {
		// Standard filters take precedence over default ones.
		$known = $standardFilters[$name] ?? $defaultFilters[$name] ?? null;

		if ( !$known ) {
			// No filter with this name yet: keep it under its own name.
			$unresolved[ $name ] = $filter;
			continue;
		}
		if ( $known == $filter ) {
			// Same name, same definition: nothing to add.
			continue;
		}

		// Same name but different config - need to rename by adding prefix.
		$renamed = $prefix . '_' . $name;
		$this->replaceFilter( $config, $name, $renamed );
		$unresolved[ $renamed ] = $filter;
	}
	return $unresolved;
}
1535 | |
/**
 * Rename a filter in the filter list of every analyzer in the config.
 *
 * Analyzers without a filter list are left alone; only entries strictly
 * equal to $oldName are rewritten.
 *
 * @param array[] &$config Configuration being processed
 * @param string $oldName
 * @param string $newName
 */
private function replaceFilter( array &$config, $oldName, $newName ) {
	$rename = static function ( $filter ) use ( $oldName, $newName ) {
		return $filter === $oldName ? $newName : $filter;
	};

	foreach ( array_keys( $config[ 'analyzer' ] ) as $analyzerName ) {
		if ( isset( $config[ 'analyzer' ][ $analyzerName ][ 'filter' ] ) ) {
			$config[ 'analyzer' ][ $analyzerName ][ 'filter' ] =
				array_map( $rename, $config[ 'analyzer' ][ $analyzerName ][ 'filter' ] );
		}
	}
}
1555 | |
/**
 * Merge one analyzer of a per-language config into the main config.
 *
 * The analyzer is stored in the main config as "{$prefix}_{$name}". The filters,
 * char_filters and the tokenizer it refers to are then copied over, but only when
 * the per-language config defines them and the main config does not yet have an
 * entry of that name.
 *
 * @param array &$config Main config
 * @param array $langConfig Per-language config
 * @param string $name Name for analyzer whose config we're merging
 * @param string $prefix Prefix for this configuration
 */
private function mergeConfig( array &$config, array $langConfig, $name, $prefix ) {
	$analyzer = $langConfig[ 'analyzer' ][ $name ];
	$config[ 'analyzer' ][ $prefix . '_' . $name ] = $analyzer;

	// Names of the components the analyzer depends on, grouped by config section.
	// The tokenizer is a single name, so it is wrapped in a list.
	$dependencies = [
		'filter' => empty( $analyzer[ 'filter' ] ) ? [] : $analyzer[ 'filter' ],
		'char_filter' => empty( $analyzer[ 'char_filter' ] ) ? [] : $analyzer[ 'char_filter' ],
		'tokenizer' => empty( $analyzer[ 'tokenizer' ] ) ? [] : [ $analyzer[ 'tokenizer' ] ],
	];

	foreach ( $dependencies as $section => $componentNames ) {
		foreach ( $componentNames as $component ) {
			// Some components are defined in plugins and have no local definition
			// (e.g., camelCase_splitter), so there is nothing to copy for them.
			if ( !isset( $langConfig[ $section ][ $component ] ) ) {
				continue;
			}
			// An entry of the same name in the main config is not overwritten: for
			// filters, the resolution step has already renamed any that differ.
			if ( isset( $config[ $section ][ $component ] ) ) {
				continue;
			}
			$config[ $section ][ $component ] = $langConfig[ $section ][ $component ];
		}
	}
}
1601 | |
/**
 * Add per-language variants of the given analyzers to an existing config,
 * namespacing the filters that differ between languages.
 *
 * Works in two passes over $languages: the first collects the default filters
 * mentioned by the analyzers of the main config and of every language config;
 * the second resolves filter name clashes for each language and merges the
 * requested analyzers into $config, using the language code as prefix.
 *
 * @param array &$config Existing config, will be modified
 * @param string[] $languages List of languages to process
 * @param string[] $analyzers List of analyzers to process
 */
public function buildLanguageConfigs( array &$config, array $languages, array $analyzers ) {
	// Pass 1: default filters of the main config, completed (without overriding
	// entries already present) by those of each language config.
	$builtInFilters = $this->getDefaultFilters( $config, $analyzers );
	foreach ( $languages as $code ) {
		$perLanguage = $this->buildConfig( $code );
		$builtInFilters = $builtInFilters + $this->getDefaultFilters( $perLanguage, $analyzers );
	}

	// Pass 2: an analyzer is tokenizer + filter + char_filter. Char filters and
	// tokenizers are nicely namespaced, but filters are NOT - e.g. lowercase and
	// icu_folding filters are different for different languages - so filters
	// have to be disambiguated before merging.
	foreach ( $languages as $code ) {
		$perLanguage = $this->buildConfig( $code );
		$resolvedFilters =
			$this->resolveFilters( $perLanguage, $config[ 'filter' ], $builtInFilters, $code );
		$perLanguage[ 'filter' ] = $resolvedFilters;
		foreach ( $analyzers as $analyzerName ) {
			$this->mergeConfig( $config, $perLanguage, $analyzerName, $code );
		}
	}
}
1629 | |
/**
 * Tell whether this builder considers the icu analyzer available.
 *
 * @return bool the builder's icu flag; true when the icu analyzer is available.
 */
public function isIcuAvailable() {
	return $this->icu;
}
1636 | |
/**
 * Tell whether this builder considers the textify plugin available.
 *
 * @return bool the builder's textify flag; true when the textify plugin is available.
 */
public function isTextifyAvailable() {
	return $this->textify;
}
1643 | |
/**
 * Apply the global custom filters (e.g., homoglyph & nnbsp filters) to the
 * config of a language.
 *
 * Delegates to GlobalCustomFilter::enableGlobalCustomFilters(), handing it this
 * builder's global custom filter list and plugin list.
 *
 * @param mixed[] $config
 * @param string $language language to add plugin to
 * @return mixed[] updated config
 */
public function enableGlobalCustomFilters( array $config, string $language ) {
	$updatedConfig = GlobalCustomFilter::enableGlobalCustomFilters(
		$config,
		$language,
		$this->globalCustomFilters,
		$this->plugins
	);
	return $updatedConfig;
}
1655 | |
/**
 * Languages for which we have a custom analysis chain (Elastic built-in or our
 * own custom analysis), as a map from language code to analyzer name. All other
 * languages default to the default analyzer which isn't too good.
 *
 * Several language codes may map to the same analyzer name (e.g. 'en', 'en-ca',
 * 'en-gb' and 'simple' all map to 'english').
 *
 * The array is meant to be sorted alphabetically by value, but a few neighbouring
 * entries are currently not in strict order ('arabic-moroccan' before
 * 'arabic-egyptian', 'crimean-tatar' before 'cjk', 'lithuanian' before 'latvian',
 * 'turkish' before 'thai').
 *
 * The Elastic list is sourced from
 * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html
 *
 * @var string[] analyzer names indexed by language code
 */
private $elasticsearchLanguageAnalyzers = [
	'ar' => 'arabic',
	'ary' => 'arabic-moroccan',
	'arz' => 'arabic-egyptian',
	'hy' => 'armenian',
	'az' => 'azerbaijani',
	'eu' => 'basque',
	'bn' => 'bengali',
	'pt-br' => 'brazilian',
	'bg' => 'bulgarian',
	'ca' => 'catalan',
	'crh' => 'crimean-tatar',
	'ja' => 'cjk',
	'ko' => 'cjk',
	'cs' => 'czech',
	'da' => 'danish',
	'nl' => 'dutch',
	'en' => 'english',
	'en-ca' => 'english',
	'en-gb' => 'english',
	'simple' => 'english',
	'et' => 'estonian',
	'fi' => 'finnish',
	'fr' => 'french',
	'gag' => 'gagauz',
	'gl' => 'galician',
	'de' => 'german',
	'el' => 'greek',
	'hi' => 'hindi',
	'hu' => 'hungarian',
	'id' => 'indonesian',
	'ga' => 'irish',
	'it' => 'italian',
	'kk' => 'kazakh',
	'lt' => 'lithuanian',
	'lv' => 'latvian',
	'ms' => 'malay',
	'mwl' => 'mirandese',
	'nb' => 'norwegian',
	'nn' => 'norwegian',
	'no' => 'norwegian',
	'fa' => 'persian',
	'pt' => 'portuguese',
	'ro' => 'romanian',
	'ru' => 'russian',
	'ckb' => 'sorani',
	'es' => 'spanish',
	'sv' => 'swedish',
	'tt' => 'tatar',
	'tr' => 'turkish',
	'th' => 'thai',
];
1717 | |
/**
 * Languages where ICU folding can be enabled by default.
 *
 * Every entry listed here has the value true; languages that are not listed have
 * no entry at all.
 * NOTE(review): how a missing entry is interpreted is decided by the code that
 * reads this list (presumably shouldActivateIcuFolding) - confirm there.
 *
 * @var bool[] indexed by language code
 */
private $languagesWithIcuFolding = [
	'ar' => true,
	'ary' => true,
	'arz' => true,
	'bg' => true,
	'bn' => true,
	'bs' => true,
	'ca' => true,
	'ckb' => true,
	'cs' => true,
	'da' => true,
	'de' => true,
	'el' => true,
	'en' => true,
	'en-ca' => true,
	'en-gb' => true,
	'simple' => true,
	'eo' => true,
	'es' => true,
	'et' => true,
	'eu' => true,
	'fa' => true,
	'fi' => true,
	'fr' => true,
	'ga' => true,
	'gl' => true,
	'he' => true,
	'hi' => true,
	'hr' => true,
	'hu' => true,
	'hy' => true,
	'ja' => true,
	'lt' => true,
	'lv' => true,
	'nb' => true,
	'nl' => true,
	'nn' => true,
	'no' => true,
	'pt' => true,
	'pt-br' => true,
	'ro' => true,
	'ru' => true,
	'sh' => true,
	'sk' => true,
	'sr' => true,
	'sv' => true,
	'th' => true,
	'tr' => true,
];
1771 | |
/**
 * Per-language override for the choice between the standard tokenizer and the
 * icu_tokenizer.
 *
 * A value of true means the language should always replace the standard tokenizer
 * with the icu_tokenizer by default; a value of false means the language should
 * never use any version of the icu_tokenizer, even when icu_token_repair is
 * available. (Reminder to future readers of this code: languages with non-standard
 * tokenizers in the text field, like zh/Chinese, still use icu_tokenizer in the
 * plain fields & suggest fields.)
 *
 * @var bool[] indexed by language code
 */
private $languagesWithIcuTokenization = [
	// true => use any version of icu_tokenizer available over the standard tokenizer
	'bo' => true,
	'dz' => true,
	'gan' => true,
	'ja' => true,
	'km' => true,
	'lo' => true,
	'my' => true,
	'th' => true,
	'wuu' => true,
	'zh' => true,
	'lzh' => true, // zh-classical
	'zh-classical' => true, // deprecated code for lzh
	'yue' => true, // zh-yue
	'zh-yue' => true, // deprecated code for yue
	// The languages listed below may use mixed scripts
	'bug' => true,
	'cdo' => true,
	'cr' => true,
	'hak' => true,
	'jv' => true,
	'nan' => true, // zh-min-nan
	'zh-min-nan' => true, // deprecated code for nan

	// false => do not use any version of icu_tokenizer (i.e., textify_icu_tokenizer)
	// over the standard tokenizer, even when icu_token_repair is available
	// 'xyz' => false, // <-- example entry for now, since there are no actual instances
];
1809 | |
/**
 * Language analyzers that are provided by plugins.
 *
 * Each key names the required plugin (or several plugins, comma separated); each
 * value maps language codes to the analyzer name to use for them.
 *
 * @var array[] [ language code => analyzer name ] maps, indexed by plugin requirement
 */
private $elasticsearchLanguageAnalyzersFromPlugins = [
	/**
	 * Multiple plugin requirements can be comma separated.
	 *
	 * Background on the individual analyzers:
	 * Polish: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T154517
	 * Ukrainian: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T160106
	 * Chinese: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T158203
	 * Hebrew: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T162741
	 * Serbian: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T183015
	 * Bosnian, Croatian, and Serbo-Croatian:
	 * https://www.mediawiki.org/wiki/User:TJones_(WMF)/T192395
	 * Slovak: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T190815
	 * Esperanto: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T202173
	 * Korean: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T206874
	 * Khmer: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T185721
	 *
	 * Entry order matters: extra-analysis-ukrainian should follow
	 * analysis-ukrainian, so that ukrainian-unpacked can overwrite the value for
	 * uk if both are present.
	 */

	'analysis-stempel' => [ 'pl' => 'polish' ],
	'analysis-kuromoji' => [ 'ja' => 'japanese' ],
	'analysis-stconvert,analysis-smartcn' => [ 'zh' => 'chinese' ],
	'analysis-hebrew' => [ 'he' => 'hebrew' ],
	'analysis-ukrainian' => [ 'uk' => 'ukrainian' ],
	'extra-analysis-ukrainian' => [ 'uk' => 'ukrainian-unpacked' ],
	'extra-analysis-esperanto' => [ 'eo' => 'esperanto' ],
	'extra-analysis-serbian' => [ 'bs' => 'bosnian', 'hr' => 'croatian',
		'sh' => 'serbo-croatian', 'sr' => 'serbian' ],
	'extra-analysis-slovak' => [ 'sk' => 'slovak' ],
	'analysis-nori' => [ 'ko' => 'korean' ],
	'extra-analysis-khmer' => [ 'km' => 'khmer' ],
];
1846 | |
/**
 * Build the list of global custom filters.
 *
 * The filters are declared below in the order they should (approximately) end up
 * in; the list is returned reversed, with its keys preserved.
 *
 * @return array GlobalCustomFilter objects indexed by filter name, in reverse
 *  declaration order
 */
private static function buildGlobalCustomFilters(): array {
	$filters = [];

	//////////////////////////
	// char filters
	$filters[ 'globo_norm' ] = new GlobalCustomFilter( 'char_filter' );

	// follow armenian_charfilter, which normalizes another period-like
	// character, if it is being used
	$filters[ 'acronym_fixer' ] = ( new GlobalCustomFilter( 'char_filter' ) )
		->setRequiredPlugins( [ 'extra-analysis-textify' ] )
		->setFallbackFilter( 'regex_acronym_fixer' )
		->setMustFollowFilters( [ 'armenian_charfilter' ] );

	// camelCase should generally follow acronyms so a.c.r.o.C.a.m.e.l.
	// is treated the same as acroCamel (real example: G.m.b.H. vs GmbH)
	$filters[ 'camelCase_splitter' ] = ( new GlobalCustomFilter( 'char_filter' ) )
		->setRequiredPlugins( [ 'extra-analysis-textify' ] )
		->setFallbackFilter( 'regex_camelCase' )
		->setMustFollowFilters( [ 'acronym_fixer', 'regex_acronym_fixer' ] );

	// * acronyms should be fixed before converting period to spaces
	// * follow armenian_charfilter, which normalizes another period-like
	//   character, if it is being used
	$filters[ 'word_break_helper' ] = ( new GlobalCustomFilter( 'char_filter' ) )
		->setMustFollowFilters( [ 'acronym_fixer', 'regex_acronym_fixer',
			'armenian_charfilter' ] )
		->setLanguageDenyList( [ 'ko', 'zh' ] );

	// - if lowercase is present (because analysis-icu is not available, or
	//   as a language-specific version) we don't need dotted_I_fix, because
	//   lowercase prevents the problem.
	// - if icu_folding is present, we don't need dotted_I_fix, because
	//   icu_folding also fixes it.
	$filters[ 'dotted_I_fix' ] = ( new GlobalCustomFilter( 'char_filter' ) )
		->setDisallowedTokenFilters( [ 'lowercase', 'icu_folding' ] );

	//////////////////////////
	// token filters

	// apply icu_token_repair to icu_tokenizer-using analyzers
	// (default == text & text_search)
	$filters[ 'icu_token_repair' ] = ( new GlobalCustomFilter( 'filter' ) )
		->setRequiredPlugins( [ 'extra-analysis-textify' ] )
		->setRequiredTokenizer( 'textify_icu_tokenizer' );

	// apply icu_token_repair variant to non-camelCase-splitting
	// icu_tokenizer-using analyzers when textify_icu_tokenizer is used
	$filters[ 'icutokrep_no_camel_split' ] = ( new GlobalCustomFilter( 'filter' ) )
		->setRequiredPlugins( [ 'extra-analysis-textify' ] )
		->setApplyToAnalyzers( [ 'plain', 'plain_search', 'suggest', 'suggest_reverse',
			'source_text_plain', 'source_text_plain_search', 'word_prefix' ] )
		->setRequiredTokenizer( 'textify_icu_tokenizer' );

	// aggressive_splitting has weird graph problems and creating
	// multiple tokens makes it blow up
	$filters[ 'homoglyph_norm' ] = ( new GlobalCustomFilter( 'filter' ) )
		->setRequiredPlugins( [ 'extra-analysis-homoglyph' ] )
		->setMustFollowFilters( [ 'aggressive_splitting' ] );

	// reverse the array so that items are ordered (approximately, modulo incompatible
	// filters) in the order specified here
	return array_reverse( $filters );
}
1914 | |
1915 | } |