Code Coverage for /workspace/src/extensions/CirrusSearch/includes/Maintenance/AnalysisConfigBuilder.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	99.40% covered (success)	99.40%	1002 / 1008	76.92% covered (warning)	76.92%	20 / 26	CRAP	0.00% covered (danger)	0.00%	0 / 1
AnalysisConfigBuilder	99.40% covered (success)	99.40%	1002 / 1008	76.92% covered (warning)	76.92%	20 / 26	215	0.00% covered (danger)	0.00%	0 / 1
__construct	96.30% covered (success)	96.30%	26 / 27	0.00% covered (danger)	0.00%	0 / 1	8
shouldActivateIcuFolding	100.00% covered (success)	100.00%	14 / 14	100.00% covered (success)	100.00%	1 / 1	9
shouldActivateIcuTokenization	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	7
buildConfig	100.00% covered (success)	100.00%	13 / 13	100.00% covered (success)	100.00%	1 / 1	4
buildSimilarityConfig	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
enableICUTokenizer	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	6
standardTokenizerOnlyCleanup	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	6
disableLimitedMappings	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	4
enableICUFolding	100.00% covered (success)	100.00%	32 / 32	100.00% covered (success)	100.00%	1 / 1	12
switchFiltersToICUFolding	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
switchFiltersToICUFoldingPreserve	94.44% covered (success)	94.44%	17 / 18	0.00% covered (danger)	0.00%	0 / 1	7.01
getICUSetFilter	98.00% covered (success)	98.00%	49 / 50	0.00% covered (danger)	0.00%	0 / 1	29
getICUNormSetFilter	80.00% covered (warning)	80.00%	4 / 5	0.00% covered (danger)	0.00%	0 / 1	4.13
defaults	100.00% covered (success)	100.00%	286 / 286	100.00% covered (success)	100.00%	1 / 1	7
customize	100.00% covered (success)	100.00%	424 / 424	100.00% covered (success)	100.00%	1 / 1	68
fixAsciiFolding	100.00% covered (success)	100.00%	16 / 16	100.00% covered (success)	100.00%	1 / 1	7
getDefaultTextAnalyzerType	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	2
getDefaultFilters	100.00% covered (success)	100.00%	8 / 8	100.00% covered (success)	100.00%	1 / 1	5
resolveFilters	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	4
replaceFilter	87.50% covered (warning)	87.50%	7 / 8	0.00% covered (danger)	0.00%	0 / 1	4.03
mergeConfig	100.00% covered (success)	100.00%	17 / 17	100.00% covered (success)	100.00%	1 / 1	12
buildLanguageConfigs	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	4
isIcuAvailable	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
isTextifyAvailable	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
enableGlobalCustomFilters	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
buildGlobalCustomFilters	100.00% covered (success)	100.00%	31 / 31	100.00% covered (success)	100.00%	1 / 1	1

1	<?php
2
3	namespace CirrusSearch\Maintenance;
4
5	use CirrusSearch\CirrusSearch;
6	use CirrusSearch\CirrusSearchHookRunner;
7	use CirrusSearch\Profile\SearchProfileService;
8	use CirrusSearch\SearchConfig;
9	use MediaWiki\MediaWikiServices;
10
11	/**
12	* Builds elasticsearch analysis config arrays.
13	*
14	* This program is free software; you can redistribute it and/or modify
15	* it under the terms of the GNU General Public License as published by
16	* the Free Software Foundation; either version 2 of the License, or
17	* (at your option) any later version.
18	*
19	* This program is distributed in the hope that it will be useful,
20	* but WITHOUT ANY WARRANTY; without even the implied warranty of
21	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	* GNU General Public License for more details.
23	*
24	* You should have received a copy of the GNU General Public License along
25	* with this program; if not, write to the Free Software Foundation, Inc.,
26	* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
27	* http://www.gnu.org/copyleft/gpl.html
28	*/
29	class AnalysisConfigBuilder {
30	/**
31	* Version number for the core analysis. Increment the major
32	* version when the analysis changes in an incompatible way,
33	* and change the minor version when it changes but isn't
34	* incompatible.
35	*
36	* You may also need to increment MetaStoreIndex::METASTORE_VERSION
37	* manually as well.
38	*/
39	public const VERSION = '0.12';
40
41	/**
42	* Maximum number of characters allowed in keyword terms.
43	*/
44	private const KEYWORD_IGNORE_ABOVE = 5000;
45
46	/**
47	* Temporary magic value to prevent enabling ICU tokenizer in specific analyzers
48	*/
49	private const STANDARD_TOKENIZER_ONLY = 'std_only';
50
51	/**
52	* @var bool is the icu plugin available?
53	*/
54	private $icu;
55
56	/**
57	* @var bool is the textify plugin available?
58	*/
59	private $textify;
60
61	/**
62	* @var string which ICU tokenizer should be used
63	*/
64	private $icu_tokenizer = 'icu_tokenizer';
65
66	/**
67	* @var array Similarity algo (tf/idf, bm25, etc) configuration
68	*/
69	private $similarity;
70
71	/**
72	* @var SearchConfig cirrus config
73	*/
74	protected $config;
75
76	/**
77	* @var string[]
78	*/
79	private $plugins;
80
81	/**
82	* @var string
83	*/
84	protected $defaultLanguage;
85
86	/**
87	* @var CirrusSearchHookRunner
88	*/
89	private $cirrusSearchHookRunner;
90
91	/**
92	* @var GlobalCustomFilter[]
93	*/
94	public $globalCustomFilters;
95
/**
 * Set up the builder: record the language and installed plugins, enable the
 * plugin-provided language analyzers whose plugins are all installed, detect
 * the icu/textify plugins, and load the similarity profile.
 *
 * @param string $langCode The language code to build config for
 * @param string[] $plugins list of plugins installed in Elasticsearch
 * @param SearchConfig|null $config cirrus config; when null the 'CirrusSearch'
 *  config is fetched from the MediaWiki config factory
 * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner hook runner; when
 *  null one is created from the MediaWiki hook container
 */
public function __construct(
	$langCode,
	array $plugins,
	?SearchConfig $config = null,
	?CirrusSearchHookRunner $cirrusSearchHookRunner = null
) {
	$this->globalCustomFilters = $this->buildGlobalCustomFilters();

	$this->defaultLanguage = $langCode;
	$this->plugins = $plugins;
	foreach ( $this->elasticsearchLanguageAnalyzersFromPlugins as $pluginSpec => $extra ) {
		// A spec is a comma-separated plugin list; every plugin in it must
		// be installed before the extra analyzers are merged in.
		$allPresent = true;
		foreach ( explode( ',', $pluginSpec ) as $plugin ) {
			if ( !in_array( $plugin, $plugins, true ) ) {
				$allPresent = false;
				break;
			}
		}
		if ( $allPresent ) {
			$this->elasticsearchLanguageAnalyzers =
				array_merge( $this->elasticsearchLanguageAnalyzers, $extra );
		}
	}
	$this->icu = in_array( 'analysis-icu', $plugins, true );
	$this->textify = in_array( 'extra-analysis-textify', $plugins, true );
	if ( $this->isTextifyAvailable() ) {
		// icu_token_repair can only work with the textify icu_tokenizer clone
		$this->icu_tokenizer = 'textify_icu_tokenizer';
	}
	$config ??= MediaWikiServices::getInstance()->getConfigFactory()
		->makeConfig( 'CirrusSearch' );
	$similarity = $config->getProfileService()->loadProfile( SearchProfileService::SIMILARITY );
	if ( !array_key_exists( 'similarity', $similarity ) ) {
		// make sure the hook below always receives an entry to work on
		$similarity['similarity'] = [];
	}
	$this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?? new CirrusSearchHookRunner(
		MediaWikiServices::getInstance()->getHookContainer() );
	$this->cirrusSearchHookRunner->onCirrusSearchSimilarityConfig( $similarity['similarity'] );
	$this->similarity = $similarity;

	$this->config = $config;
}
145
/**
 * Determine if ICU folding should be used (in place of ascii folding).
 *
 * Reads 'CirrusSearchUseIcuFolding', which may be 'yes', 'no' or 'default'
 * (or a legacy boolean). With 'default' the answer comes from the
 * per-language languagesWithIcuFolding table. Any other value disables
 * ICU folding.
 *
 * @param string $language Config language
 * @return bool true if icu folding should be enabled
 */
public function shouldActivateIcuFolding( $language ) {
	if ( !$this->isIcuAvailable() || !in_array( 'extra', $this->plugins, true ) ) {
		// ICU folding requires the icu plugin and the extra plugin
		return false;
	}
	$setting = $this->config->get( 'CirrusSearchUseIcuFolding' );
	// BC code, this config var was originally a simple boolean:
	// true behaves like 'yes', false like 'no'.
	// Strict comparisons are used so that unexpected non-string values
	// (e.g. integer 0) can never be mistaken for 'yes'.
	if ( $setting === true || $setting === 'yes' ) {
		return true;
	}
	if ( $setting === 'default' ) {
		return $this->languagesWithIcuFolding[$language] ?? false;
	}
	// 'no', false, and anything unrecognized
	return false;
}
175
/**
 * Decide whether the ICU tokenizer may take the place of the standard
 * tokenizer for the given language.
 *
 * Driven by 'CirrusSearchUseIcuTokenizer' ('yes', 'no' or 'default').
 *
 * @param string $language Config language
 * @return bool
 */
public function shouldActivateIcuTokenization( $language ) {
	$pluginAvailable = $this->isIcuAvailable() || $this->isTextifyAvailable();
	if ( !$pluginAvailable ) {
		// requires the icu or textify plugin
		return false;
	}

	$setting = $this->config->get( 'CirrusSearchUseIcuTokenizer' );
	// Loose comparisons on purpose: they are checked in this order exactly
	// as a switch statement would check them.
	if ( $setting == 'yes' ) {
		return true;
	}
	if ( $setting == 'no' ) {
		return false;
	}
	if ( $setting == 'default' ) {
		// languagesWithIcuTokenization[] gives absolute answers for specific
		// languages. Otherwise the answer is 'yes'/true when the textify
		// plugin is available (because icu_token_repair is then available)
		// and 'no'/false when it is not.
		return $this->languagesWithIcuTokenization[$language] ?? $this->isTextifyAvailable();
	}

	// unrecognized setting
	return false;
}
201
/**
 * Assemble the full analysis config for a language.
 *
 * Steps run in a fixed order: language defaults + customization, the
 * CirrusSearchAnalysisConfig hook, optional ICU tokenizer and ICU folding
 * upgrades, cleanup passes, and finally the global custom filters.
 *
 * @param string|null $language Config language; null means the language
 *  given to the constructor
 * @return array the analysis config
 */
public function buildConfig( $language = null ) {
	if ( $language === null ) {
		$language = $this->defaultLanguage;
	}

	$analysis = $this->defaults( $language );
	$analysis = $this->customize( $analysis, $language );
	$this->cirrusSearchHookRunner->onCirrusSearchAnalysisConfig( $analysis, $this );

	if ( $this->shouldActivateIcuTokenization( $language ) ) {
		$analysis = $this->enableICUTokenizer( $analysis );
	}
	if ( $this->shouldActivateIcuFolding( $language ) ) {
		$analysis = $this->enableICUFolding( $analysis, $language );
	}

	$analysis = $this->fixAsciiFolding( $analysis );
	$analysis = $this->standardTokenizerOnlyCleanup( $analysis );

	if ( !$this->isTextifyAvailable() ) {
		$analysis = $this->disableLimitedMappings( $analysis );
	}

	// should come after other upgrades to get the full context
	return $this->enableGlobalCustomFilters( $analysis, $language );
}
231
/**
 * Expose the 'similarity' section of the similarity profile loaded by the
 * constructor.
 *
 * @return array|null the similarity config, or null when there is none
 */
public function buildSimilarityConfig() {
	if ( isset( $this->similarity['similarity'] ) ) {
		return $this->similarity['similarity'];
	}
	return null;
}
238
/**
 * Replace the standard tokenizer with the configured ICU tokenizer.
 *
 * Only 'custom' analyzers (or analyzers without an explicit type) are
 * touched. A config without an 'analyzer' section is returned unchanged.
 *
 * @param mixed[] $config
 * @return mixed[] update config
 */
public function enableICUTokenizer( array $config ) {
	// Write back by key instead of iterating by reference, so no dangling
	// reference is left inside the returned array.
	foreach ( $config[ 'analyzer' ] ?? [] as $name => $analyzer ) {
		if ( ( $analyzer[ 'type' ] ?? 'custom' ) !== 'custom' ) {
			continue;
		}
		if ( ( $analyzer[ 'tokenizer' ] ?? null ) === 'standard' ) {
			$config[ 'analyzer' ][ $name ][ 'tokenizer' ] = $this->icu_tokenizer;
		}
	}
	return $config;
}
255
/**
 * Replace STANDARD_TOKENIZER_ONLY with the actual standard tokenizer.
 *
 * Only 'custom' analyzers (or analyzers without an explicit type) are
 * touched. A config without an 'analyzer' section is returned unchanged.
 *
 * @param mixed[] $config
 * @return mixed[] update config
 */
public function standardTokenizerOnlyCleanup( array $config ) {
	// Write back by key instead of iterating by reference, so no dangling
	// reference is left inside the returned array.
	foreach ( $config[ 'analyzer' ] ?? [] as $name => $analyzer ) {
		if ( ( $analyzer[ 'type' ] ?? 'custom' ) !== 'custom' ) {
			continue;
		}
		if ( ( $analyzer[ 'tokenizer' ] ?? null ) === self::STANDARD_TOKENIZER_ONLY ) {
			// if we blocked upgrades/changes to the standard tokenizer,
			// replace the magic value with the actual standard tokenizer
			$config[ 'analyzer' ][ $name ][ 'tokenizer' ] = 'standard';
		}
	}
	return $config;
}
275
/**
 * Replace limited_mapping char filters with plain mapping char filters,
 * for use when limited_mapping is unavailable.
 *
 * Char filters of any other type (or without a type) are left alone. A
 * config without a 'char_filter' section is returned unchanged.
 *
 * @param mixed[] $config
 * @return mixed[] update config
 */
public function disableLimitedMappings( array $config ) {
	// Write back by key instead of iterating by reference, so no dangling
	// reference is left inside the returned array.
	foreach ( $config[ 'char_filter' ] ?? [] as $name => $charFilter ) {
		if ( ( $charFilter[ 'type' ] ?? null ) === 'limited_mapping' ) {
			$config[ 'char_filter' ][ $name ][ 'type' ] = 'mapping';
		}
	}
	return $config;
}
290
/**
 * Switch the analysis config from asciifolding to ICU folding.
 *
 * Registers the 'icu_folding' and 'icu_nfkc_normalization' filters,
 * rewrites the filter chain of every custom analyzer that uses
 * 'asciifolding' or 'asciifolding_preserve', and makes sure the 'plain'
 * analyzer (when present) folds with ICU while preserving the original.
 *
 * @param mixed[] $config
 * @param string $language Config language
 * @return mixed[] update config
 */
public function enableICUFolding( array $config, $language ) {
	$foldingFilter = [ 'type' => 'icu_folding' ];
	$exclusions = $this->getICUSetFilter( $language );
	if ( $exclusions !== null ) {
		$foldingFilter[ 'unicodeSetFilter' ] = $exclusions;
	}
	$config[ 'filter' ][ 'icu_folding' ] = $foldingFilter;

	// Adds a simple nfkc normalizer for cases where
	// we preserve original but the lowercase filter
	// is not used before
	$config[ 'filter' ][ 'icu_nfkc_normalization' ] = [
		'type' => 'icu_normalizer',
		'name' => 'nfkc',
	];

	// foreach iterates over a snapshot of the analyzers, so each rewrite
	// below is computed from the analyzer's original filter chain.
	foreach ( $config[ 'analyzer' ] as $analyzerName => $definition ) {
		$isCustom = !isset( $definition[ 'type' ] ) || $definition[ 'type' ] == 'custom';
		if ( !$isCustom || !isset( $definition[ 'filter' ] ) ) {
			continue;
		}
		$chain = $definition[ 'filter' ];
		if ( in_array( 'asciifolding', $chain ) ) {
			$config[ 'analyzer' ][ $analyzerName ][ 'filter' ] =
				$this->switchFiltersToICUFolding( $chain );
		}
		if ( in_array( 'asciifolding_preserve', $chain ) ) {
			// takes precedence if the chain holds both folding filters
			$config[ 'analyzer' ][ $analyzerName ][ 'filter' ] =
				$this->switchFiltersToICUFoldingPreserve( $chain );
		}
	}

	// Explicitly enable icu_folding on the plain analyzer if it's not
	// already enabled
	if ( isset( $config[ 'analyzer' ][ 'plain' ] ) ) {
		$plainChain = $config[ 'analyzer' ][ 'plain' ][ 'filter' ] ?? [];
		$config[ 'analyzer' ][ 'plain' ][ 'filter' ] =
			$this->switchFiltersToICUFoldingPreserve( $plainChain, true );
	}

	return $config;
}
351
/**
 * Replace the first occurrence of asciifolding with icu_folding followed
 * by remove_empty.
 *
 * @param string[] $filters
 * @return string[] new list of filters; unchanged when asciifolding is
 *  not present
 */
private function switchFiltersToICUFolding( array $filters ) {
	$position = array_search( 'asciifolding', $filters );
	if ( $position === false ) {
		// Without this guard, false would be coerced to offset 0 and the
		// first filter of the chain would be silently replaced.
		return $filters;
	}
	array_splice( $filters, $position, 1, [ 'icu_folding', 'remove_empty' ] );
	return $filters;
}
362
/**
 * Swap asciifolding_preserve for the group of filters that performs
 * icu_folding while keeping the original token.
 *
 * The chain is returned untouched when it already contains icu_folding,
 * or when asciifolding_preserve is absent and $append is not set.
 *
 * @param string[] $filters
 * @param bool $append append icu_folding even if asciifolding is not present
 * @return string[] new list of filters
 */
private function switchFiltersToICUFoldingPreserve( array $filters, $append = false ) {
	if ( in_array( 'icu_folding', $filters ) ) {
		// ICU folding already here
		return $filters;
	}

	$preserveAt = array_search( 'asciifolding_preserve', $filters );
	if ( $preserveAt === false ) {
		if ( !$append ) {
			return $filters;
		}
		// fake an asciifolding_preserve at the end so the replacement
		// below can be reused
		$preserveAt = count( $filters );
		$filters[] = 'asciifolding_preserve';
	}

	$replacement = [
		'preserve_original_recorder',
		'icu_folding',
		'preserve_original',
		'remove_empty',
	];

	// with ICU lowercase is replaced by icu_normalizer/nfkc_cf
	// thus unicode normalization is already done.
	$normalizerAt = array_search( 'icu_normalizer', $filters );
	if ( $normalizerAt === false || $normalizerAt > $preserveAt ) {
		// No normalizer runs before the preserved position, so do some
		// icu normalization first to prevent preserving "un-normalized"
		// unicode chars.
		array_unshift( $replacement, 'icu_nfkc_normalization' );
	}

	array_splice( $filters, $preserveAt, 1, $replacement );
	return $filters;
}
403
/**
 * Look up the characters that must be excluded from ICU folding.
 *
 * A non-null 'CirrusSearchICUFoldingUnicodeSetFilter' config value takes
 * precedence over the built-in per-language table.
 *
 * @param string $language Config language
 * @return null|string a unicode set filter, or null when nothing is excluded
 */
protected function getICUSetFilter( $language ) {
	$configured = $this->config->get( 'CirrusSearchICUFoldingUnicodeSetFilter' );
	if ( $configured !== null ) {
		return $configured;
	}

	/* @todo: complete the default filters per language
	 *
	 * For Slovak (sk)—which has no folding configured here!—see:
	 * https://www.mediawiki.org/wiki/User:TJones_(WMF)/T223787
	 *
	 * Exceptions are generally listed as Unicode characters for ease of
	 * inspection. However, combining characters (such as for Thai (th))
	 * are \u encoded to prevent problems with display or editing
	 */
	$serboCroatian = '[^ĐđŽžĆćŠšČč]';
	$danishNorwegian = '[^ÆæØøÅå]';
	$finnishSwedish = '[^ÅåÄäÖö]';
	$enye = '[^Ññ]';
	$shortI = '[^Йй]';

	$exclusionsByLanguage = [
		'bg' => $shortI, // T325090
		'bs' => $serboCroatian, // T192395
		'hr' => $serboCroatian, // T192395
		'sh' => $serboCroatian, // T192395
		'sr' => $serboCroatian, // T183015
		'cs' => '[^ÁáČčĎďÉéĚěÍíŇňÓóŘřŠšŤťÚúŮůÝýŽž]', // T284578
		'da' => $danishNorwegian, // T283366
		'de' => '[^ÄäÖöÜüẞß]', // T281379
		'eo' => '[^ĈĉĜĝĤĥĴĵŜŝŬŭ]', // T202173
		'es' => $enye, // T277699
		'et' => '[^ŠšŽžÕõÄäÖöÜü]', // T332322
		'eu' => $enye, // T283366
		'fi' => $finnishSwedish, // T284578
		'gl' => $enye, // T284578
		'hu' => '[^ÁáÉéÍíÓóÖöŐőÚúÜüŰű]', // T325089
		// T326822
		// This range includes characters that don't currently get ICU folded, in
		// order to keep the overall regex a lot simpler. The specific targets are
		// characters with dakuten and handakuten, the separate (han)dakuten
		// characters (regular and combining) and the prolonged sound mark (chōonpu).
		'ja' => '[^が-ヾ]',
		'lt' => '[^ĄąČčĘęĖėĮįŠšŲųŪūŽž]', // T325090
		'lv' => '[^ĀāČčĒēĢģĪīĶķĻļŅņŠšŪūŽž]', // T325089
		'nb' => $danishNorwegian, // T289612
		'nn' => $danishNorwegian, // T289612
		'no' => $danishNorwegian,
		// T325091
		// including s&t with cedilla because we (have to) use it internally T330893
		'ro' => '[^ĂăÂâÎîȘșȚțŞşŢţ]',
		'ru' => $shortI,
		'sv' => $finnishSwedish, // T160562
		'th' => '[^\u0E47-\u0E4E]', // T294147
		// T329762
		// (I and i aren't strictly necessary but they keep the Turkish upper/lower
		// pairs Iı & İi together and makes it clear both are intended.)
		'tr' => '[^ÇçĞğIıİiÖöŞşÜü]',
	];

	return $exclusionsByLanguage[ $language ] ?? null;
}
481
/**
 * Look up the characters that must be excluded from ICU normalization.
 *
 * A non-null 'CirrusSearchICUNormalizationUnicodeSetFilter' config value
 * takes precedence over the built-in per-language exceptions.
 *
 * @param string $language Config language
 * @return null|string a unicode set filter, or null when nothing is excluded
 */
protected function getICUNormSetFilter( $language ) {
	$configured = $this->config->get( 'CirrusSearchICUNormalizationUnicodeSetFilter' );
	if ( $configured !== null ) {
		return $configured;
	}

	// German (de) is the only language with a built-in exception, see T281379.
	// Loose comparison, exactly as a switch statement would compare.
	if ( $language == 'de' ) {
		// Capital ẞ is lowercased to ß by german_charfilter
		// lowercase ß is normalized to ss by german_normalization
		return '[^ẞß]';
	}

	return null;
}
501
/**
 * Build an analysis config with sane defaults.
 *
 * The result holds the language-independent analyzers, filters, tokenizers
 * and char filters. The 'text' analyzer gets the language's default type.
 * Plugin-dependent filters (icu_token_repair, icu_normalizer) are added
 * only when the matching plugin is available.
 *
 * @param string $language Config language
 * @return array
 */
private function defaults( $language ) {
	$defaults = [
		'analyzer' => [
			'text' => [
				'type' => $this->getDefaultTextAnalyzerType( $language ),
			],
			// text_search is not configured here because it will be copied from text
			'plain' => [
				// Surprisingly, the Lucene docs claim this works for
				// Chinese, Japanese, and Thai as well.
				// The difference between this and the 'standard'
				// analyzer is the lack of english stop words.
				'type' => 'custom',
				'char_filter' => [ 'nnbsp_norm', 'word_break_helper' ],
				'tokenizer' => 'standard',
				'filter' => [ 'lowercase' ],
			],
			'plain_search' => [
				// In accent squashing languages this will not contain accent
				// squashing to allow searches with accents to only find accents
				// and searches without accents to find both.
				'type' => 'custom',
				'char_filter' => [ 'nnbsp_norm', 'word_break_helper' ],
				'tokenizer' => 'standard',
				'filter' => [ 'lowercase' ],
			],
			// Used by ShortTextIndexField
			'short_text' => [
				'type' => 'custom',
				'tokenizer' => 'whitespace',
				'filter' => [ 'lowercase', 'aggressive_splitting', 'asciifolding_preserve' ],
			],
			'short_text_search' => [
				'type' => 'custom',
				'tokenizer' => 'whitespace',
				'filter' => [ 'lowercase', 'aggressive_splitting' ],
			],
			'source_text_plain' => [
				'type' => 'custom',
				'char_filter' => [ 'word_break_helper_source_text' ],
				'tokenizer' => 'standard',
				'filter' => [ 'lowercase' ],
			],
			'source_text_plain_search' => [
				'type' => 'custom',
				'char_filter' => [ 'word_break_helper_source_text' ],
				'tokenizer' => 'standard',
				'filter' => [ 'lowercase' ],
			],
			'suggest' => [
				'type' => 'custom',
				'tokenizer' => 'standard',
				'filter' => [ 'lowercase', 'suggest_shingle' ],
			],
			'suggest_reverse' => [
				'type' => 'custom',
				'tokenizer' => 'standard',
				'filter' => [ 'lowercase', 'suggest_shingle', 'reverse' ],
			],
			'token_reverse' => [
				'type' => 'custom',
				'tokenizer' => 'no_splitting',
				'filter' => [ 'reverse' ]
			],
			'near_match' => [
				'type' => 'custom',
				'char_filter' => [ 'near_space_flattener' ],
				'tokenizer' => 'no_splitting',
				'filter' => [ 'lowercase' ],
			],
			'near_match_asciifolding' => [
				'type' => 'custom',
				'char_filter' => [ 'near_space_flattener' ],
				'tokenizer' => 'no_splitting',
				'filter' => [ 'truncate_keyword', 'lowercase', 'asciifolding' ],
			],
			'prefix' => [
				'type' => 'custom',
				'char_filter' => [ 'near_space_flattener' ],
				'tokenizer' => 'prefix',
				'filter' => [ 'lowercase' ],
			],
			'prefix_asciifolding' => [
				'type' => 'custom',
				'char_filter' => [ 'near_space_flattener' ],
				'tokenizer' => 'prefix',
				'filter' => [ 'lowercase', 'asciifolding' ],
			],
			'word_prefix' => [
				'type' => 'custom',
				'tokenizer' => 'standard',
				'filter' => [ 'lowercase', 'prefix_ngram_filter' ],
			],
			'keyword' => [
				'type' => 'custom',
				'tokenizer' => 'no_splitting',
				'filter' => [ 'truncate_keyword' ],
			],
			'lowercase_keyword' => [
				'type' => 'custom',
				'tokenizer' => 'no_splitting',
				'filter' => [ 'truncate_keyword', 'lowercase' ],
			],
			'trigram' => [
				'type' => 'custom',
				'tokenizer' => 'trigram',
				'filter' => [ 'lowercase' ],
			],
		],
		'filter' => [
			'suggest_shingle' => [
				'type' => 'shingle',
				'min_shingle_size' => 2,
				'max_shingle_size' => 3,
				'output_unigrams' => true,
			],
			'lowercase' => [
				'type' => 'lowercase',
			],
			'aggressive_splitting' => [
				'type' => 'word_delimiter_graph',
				'stem_english_possessive' => false,
				'preserve_original' => false
			],
			'prefix_ngram_filter' => [
				'type' => 'edgeNGram',
				'max_gram' => CirrusSearch::MAX_TITLE_SEARCH,
			],
			'asciifolding' => [
				'type' => 'asciifolding',
				'preserve_original' => false
			],
			'asciifolding_preserve' => [
				'type' => 'asciifolding',
				'preserve_original' => true
			],
			// The 'keyword' type in ES seems like a hack
			// and doesn't allow normalization (like lowercase)
			// prior to 5.2. Instead we consistently use 'text'
			// and truncate where necessary.
			'truncate_keyword' => [
				'type' => 'truncate',
				'length' => self::KEYWORD_IGNORE_ABOVE,
			],
			'remove_empty' => [
				'type' => 'length',
				'min' => 1,
			],
		],
		'tokenizer' => [
			'prefix' => [
				'type' => 'edgeNGram',
				'max_gram' => CirrusSearch::MAX_TITLE_SEARCH,
			],
			'no_splitting' => [ // Just grab the whole term.
				'type' => 'keyword',
			],
			'trigram' => [
				'type' => 'nGram',
				'min_gram' => 3,
				'max_gram' => 3,
			],
		],
		'char_filter' => [
			// Flattens things that are space like to spaces in the near_match style analyzers
			'near_space_flattener' => [
				'type' => 'limited_mapping',
				'mappings' => [
					"'=>\u0020", // Useful for finding names
					'\u2019=>\u0020', // Unicode right single quote
					'\u02BC=>\u0020', // Unicode modifier letter apostrophe
					'_=>\u0020', // MediaWiki loves _ and people are used to it but it
					// usually means space
					'-=>\u0020', // Useful for finding hyphenated names unhyphenated
				],
			],
			// map narrow no-break space to plain space to compensate for ES6.x+
			// analyzers generally not doing so
			'nnbsp_norm' => [
				'type' => 'limited_mapping',
				'mappings' => [
					'\u202F=>\u0020',
				],
			],
			// Add a space between lowercase letter {Ll} and uppercase {Lu} or
			// titlecase {Lt} letter, allowing for optional combining marks {M}
			// or invisibles {Cf}. This is expensive, so use camelCase_splitter
			// in extra-analysis-textify instead, if available (T219108/T346051)
			'regex_camelCase' => [
				'type' => 'pattern_replace',
				'pattern' => '(\\p{Ll}[\\p{M}\\p{Cf}]*)([\\p{Lu}\\p{Lt}])',
				'replacement' => '$1 $2'
			],
			// Replace period (regular or fullwidth) between [non-letter +
			// letter] and [letter + non-letter]. This is slow, and also only
			// handles the simplest case. Use acronym_fixer in
			// extra-analysis-textify instead, if available (T170625/T346051)
			'regex_acronym_fixer' => [
				'type' => 'pattern_replace',
				'pattern' => '(?<=(?:^|\\P{L})\\p{L})[.．](\\p{L})(?=\\P{L}|$)',
				'replacement' => '$1'
			],
			// combine universally-applied mappings into one mapping to save on the
			// overhead of calling multiple mappings
			'globo_norm' => [
				'type' => 'mapping',
				'mappings' => [
					// map lots of apostrophe-like characters to apostrophe (T315118);
					// formerly apostrophe_norm
					"`=>'", // grave accent
					"´=>'", // acute accent
					"ʹ=>'", // modifier letter prime
					"ʻ=>'", // modifier letter turned comma
					"ʼ=>'", // modifier letter apostrophe
					"ʽ=>'", // modifier letter reversed comma
					"ʾ=>'", // modifier letter right half ring
					"ʿ=>'", // modifier letter left half ring
					"ˋ=>'", // modifier letter grave accent
					"՚=>'", // Armenian apostrophe
					"\u05F3=>'", // Hebrew punctuation geresh
					"‘=>'", // left single quotation mark
					"’=>'", // right single quotation mark
					"‛=>'", // single high-reversed-9 quotation mark
					"′=>'", // prime
					"‵=>'", // reversed prime
					"ꞌ=>'", // Latin small letter saltillo
					"＇=>'", // fullwidth apostrophe
					"｀=>'", // fullwidth grave accent
					// map narrow no-break space to plain space to compensate for ES6.x+
					// analyzers generally not doing so; copied from nnbsp_norm, which
					// is still needed elsewhere
					'\u202F=>\u0020',
					// Delete primary and secondary stress markers, which are
					// inconsistently used across phonetic transcriptions
					"ˈ=>", // modifier letter vertical line
					"ˌ=>", // modifier letter low vertical line
					// Delete Arabic tatweel (ـ) (used largely for cosmetic purposes)
					"\u0640=>", // tatweel
					// Convert Arabic thousand separator and Arabic comma to comma for
					// more consistent number parsing
					"٬=>,", // Arabic thousands separator
					"،=>,", // Arabic comma
					// delete Armenian emphasis marks, exclamation marks, and question
					// marks, since they modify words rather than follow them.
					"՛=>", // Armenian emphasis mark
					"՜=>", // Armenian exclamation mark
					"՞=>", // Armenian question mark
					// micro sign to mu, to prevent some unneeded ICU tokenizer splits;
					// icu_normalizer does this, too, just later
					"µ=>μ",
					// Yiddish Ligatures (T362501)
					"\u05F0=>\u05D5\u05D5", // double vav
					"\u05F1=>\u05D5\u05D9", // vav yod
					"\u05F2=>\u05D9\u05D9", // double yod
					"\uFB1F=>\u05D9\u05D9\u05B7", // single char yod-yod-patah decomposed
					"\u05D9\u05B7\u05D9=>\u05D9\u05D9\u05B7", // rarer alternate order
				],
			],
			'arabic_extended_norm' => [
				'type' => 'limited_mapping',
				'mappings' => [
					'\uFB8E=>\u0643', '\uFB8F=>\u0643', '\uFB90=>\u0643', // kaf
					'\uFB91=>\u0643', '\u06A9=>\u0643', '\u06AA=>\u0643',
					'\uFEDB=>\u0643', '\uFEDC=>\u0643', '\uFED9=>\u0643',
					'\uFEDA=>\u0643',

					'\uFBFC=>\u064A', '\uFBFD=>\u064A', '\uFBFE=>\u064A', // yeh
					'\uFBFF=>\u064A', '\u06CC=>\u064A', '\uFBE8=>\u064A',
					'\uFBE9=>\u064A', '\uFEEF=>\u064A', '\uFEF0=>\u064A',
					'\u0649=>\u064A', '\u06CD=>\u064A', '\uFBE4=>\u064A',
					'\uFBE5=>\u064A', '\uFBE6=>\u064A', '\uFBE7=>\u064A',
					'\u06D0=>\u064A',

					'\uFBA6=>\u0647', '\uFBA7=>\u0647', '\uFBA8=>\u0647', // heh
					'\uFBA9=>\u0647', '\u06C1=>\u0647', '\u06C0=>\u0647',
					'\uFBA4=>\u0647', '\uFBA5=>\u0647', '\u06D5=>\u0647',
				],
			],
			// Converts things that don't always count as word breaks into spaces
			// which (almost) always count as word breaks (e.g., the Nori and SmartCN
			// tokenizers do not always count spaces as word breaks!)
			'word_break_helper' => [
				'type' => 'limited_mapping',
				'mappings' => [
					'_=>\u0020',
					':=>\u0020',
					// These are more useful for code:
					'.=>\u0020',
					'(=>\u0020',
					')=>\u0020',
					// fullwidth variants
					'．=>\u0020',
					'＿=>\u0020',
					'：=>\u0020',
					// middle dot
					'·=>\u0020',
				],
			],
			'word_break_helper_source_text' => [
				'type' => 'limited_mapping',
				'mappings' => [
					'_=>\u0020',
					// These are more useful for code:
					'.=>\u0020',
					'(=>\u0020',
					')=>\u0020',
					':=>\u0020', // T145023
				],
			],
			'dotted_I_fix' => [
				// A common regression caused by unpacking is that İ is no longer
				// treated correctly, so specify the mapping just once and re-use
				// in analyzer/text/char_filter as needed.
				'type' => 'limited_mapping',
				'mappings' => [
					'İ=>I',
				],
			],
		],
	];

	// Analyzers declared with the 'default' type become a plain custom
	// analyzer. Entries are replaced by key (not through a by-reference
	// loop variable) so no dangling reference outlives the loop.
	foreach ( $defaults[ 'analyzer' ] as $name => $analyzer ) {
		if ( $analyzer[ 'type' ] === 'default' ) {
			$defaults[ 'analyzer' ][ $name ] = [
				'type' => 'custom',
				'tokenizer' => 'standard',
				'filter' => [ 'lowercase' ],
			];
		}
	}

	if ( $this->isTextifyAvailable() && $this->shouldActivateIcuTokenization( $language ) ) {
		$defaults[ 'filter' ][ 'icutokrep_no_camel_split' ] = [
			'type' => 'icu_token_repair',
			'keep_camel_split' => false
		];
	}

	if ( $this->isIcuAvailable() ) {
		$defaults[ 'filter' ][ 'icu_normalizer' ] = [
			'type' => 'icu_normalizer',
			'name' => 'nfkc_cf',
		];
		// some languages exclude specific characters from normalization
		$unicodeSetFilter = $this->getICUNormSetFilter( $language );
		if ( $unicodeSetFilter !== null ) {
			$defaults[ 'filter' ][ 'icu_normalizer' ][ 'unicodeSetFilter' ] = $unicodeSetFilter;
		}
	}

	return $defaults;
}
856
/**
 * Customize the default config for the language.
 *
 * The analyzer type for $language is looked up with
 * getDefaultTextAnalyzerType() and the matching switch case rebuilds the
 * config (mostly through AnalyzerBuilder). Afterwards, regardless of language:
 *  - the 'text_search' analyzer is set to a copy of the 'text' analyzer;
 *  - if ICU is available, every 'lowercase' entry in every analyzer's filter
 *    list is replaced by 'icu_normalizer' (the 'lowercase' entry is kept in
 *    front of it when the configured lowercase filter has a 'language' set).
 *
 * @param array $config
 * @param string $language Config language
 * @return array
 */
private function customize( $config, $language ) {
	$langName = $this->getDefaultTextAnalyzerType( $language );
	switch ( $langName ) {
		// Please add languages in alphabetical order.

		// usual unpacked languages
		case 'basque': // Unpack Basque analyzer T283366
		case 'brazilian': // Unpack Brazilian analyzer T325092
		case 'bulgarian': // Unpack Bulgarian analyzer T325090
		case 'czech': // Unpack Czech analyzer T284578
		case 'danish': // Unpack Danish analyzer T283366
		case 'estonian': // Unpack Estonian analyzer T332322
		case 'finnish': // Unpack Finnish analyzer T284578
		case 'galician': // Unpack Galician analyzer T284578
		case 'hungarian': // Unpack Hungarian analyzer T325089
		case 'latvian': // Unpack Latvian analyzer T325089
		case 'lithuanian': // Unpack Lithuanian analyzer T325090
		case 'norwegian': // Unpack Norwegian analyzer T289612
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				build( $config );
			break;

		// usual unpacked languages, with "light" variant stemmer
		case 'portuguese': // Unpack Portuguese analyzer T281379
		case 'spanish': // Unpack Spanish analyzer T277699
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withLightStemmer()->
				build( $config );
			break;

		// customized languages
		case 'arabic':
		case 'arabic-egyptian':
		case 'arabic-moroccan':
			// Unpack Arabic analyzer T294147
			$arBuilder = ( new AnalyzerBuilder( 'arabic' ) )->
				withUnpackedAnalyzer()->
				withDecimalDigit()->
				insertFiltersBefore( 'arabic_stemmer', [ 'arabic_normalization' ] );

			// load extra stopwords for Arabic
			$arabicExtraStopwords = require __DIR__ . '/AnalysisLanguageData/arabicStopwords.php';
			$arBuilder->withExtraStop( $arabicExtraStopwords, 'arabic_extra_stop', 'arabic_stop' );

			$config = $arBuilder->build( $config );
			break;
		case 'armenian': // Unpack Armenian analyzer T325089
			// char map: Armenian uses ․ ("one-dot leader") about 10% as often as . (period)
			// stopwords նաև & և get normalized to նաեւ & եւ, so pick those up, too.
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withLimitedCharMap( [ '․=>.' ] )->
				withExtraStop( [ 'նաեւ', 'եւ' ], 'armenian_norm_stop', 'armenian_stop' )->
				build( $config );
			break;
		case 'azerbaijani':
		case 'crimean-tatar':
		case 'gagauz':
		case 'kazakh':
		case 'tatar':
			// Turkic languages that use I/ı & İ/i, so need Turkish lowercasing
			$config = ( new AnalyzerBuilder( $langName ) )->
				withFilters( [ 'lowercase' ] )->
				withLangLowercase( 'turkish' )->
				build( $config );
			break;
		case 'bengali': // Unpack Bengali analyzer T294067
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withDecimalDigit()->
				insertFiltersBefore( 'bengali_stop', [ 'indic_normalization' ] )->
				build( $config );
			break;
		case 'bosnian':
		case 'croatian':
		case 'serbian':
		case 'serbo-croatian':
			// Unpack default analyzer to add Serbian stemming and custom folding
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T183015
			// and https://www.mediawiki.org/wiki/User:TJones_(WMF)/T192395
			$config = ( new AnalyzerBuilder( $langName ) )->
				withFilters( [ 'lowercase', 'asciifolding', 'serbian_stemmer' ] )->
				build( $config );
			break;
		case 'catalan':
			// Unpack Catalan analyzer T283366
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withElision( [ 'd', 'l', 'm', 'n', 's', 't' ] )->
				build( $config );
			break;
		case 'chinese':
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T158203
			$config[ 'char_filter' ][ 'tsconvert' ] = [
				'type' => 'stconvert',
				'delimiter' => '#',
				'keep_both' => false,
				'convert_type' => 't2s',
			];

			// char map: hack for STConvert errors (still present as of July 2023)
			// see https://github.com/medcl/elasticsearch-analysis-stconvert/issues/13
			// stop: SmartCN converts lots of punctuation to ',' but we don't want to index it
			$config = ( new AnalyzerBuilder( $langName ) )->
				withCharMap( [ '\u606d\u5f18=>\u606d \u5f18', '\u5138=>\u3469' ], 'stconvertfix' )->
				withCharFilters( [ 'stconvertfix', 'tsconvert' ] )->
				withTokenizer( 'smartcn_tokenizer' )->
				withStop( [ ',' ], 'smartcn_stop' )->
				withFilters( [ 'smartcn_stop', 'lowercase' ] )->
				build( $config );

			$config[ 'analyzer' ][ 'plain' ][ 'filter' ] = [ 'smartcn_stop', 'lowercase' ];
			$config[ 'analyzer' ][ 'plain_search' ][ 'filter' ] =
				$config[ 'analyzer' ][ 'plain' ][ 'filter' ];
			break;
		case 'cjk':
			// Unpack CJK analyzer T326822
			// map (han)dakuten to combining forms or icu_normalizer will add spaces
			$dakutenMap = [ '゛=>\u3099', '゜=>\u309a' ];

			// cjk_bigram negates the benefits of the icu_tokenizer for CJK text. The
			// icu_tokenizer also has a few bad side effects, so don't use it for cjk.
			// Default cjk stop words are almost the same as _english_ (add s & t; drop
			// an). Stop words are searchable via 'plain' anyway, so just use _english_
			$config = ( new AnalyzerBuilder( 'cjk' ) )->
				withUnpackedAnalyzer()->
				withLimitedCharMap( $dakutenMap )->
				withTokenizer( self::STANDARD_TOKENIZER_ONLY )->
				withStop( '_english_' )->
				omitStemmer()->
				insertFiltersBefore( 'lowercase', [ 'cjk_width' ] )->
				insertFiltersBefore( 'cjk_stop', [ 'cjk_bigram' ] )->
				build( $config );
			break;
		case 'dutch':
			// Unpack Dutch analyzer T281379
			$nlOverride = [ // these are in the default Dutch analyzer
				'fiets=>fiets',
				'bromfiets=>bromfiets',
				'ei=>eier',
				'kind=>kinder'
			];
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withStemmerOverride( $nlOverride )->
				build( $config );
			break;
		case 'english':
			// Replace English analyzer with a rebuilt copy with asciifolding inserted
			// before stemming
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T142037
			$config = ( new AnalyzerBuilder( $langName ) )->
				withExtraStemmer( 'possessive_english' )->
				withStemmerOverride( 'guidelines => guideline', 'custom_stem' )->
				withFilters( [ 'possessive_english', 'lowercase', 'stop', 'asciifolding',
					'kstem', 'custom_stem' ] )->
				build( $config );

			// Add asciifolding_preserve to the plain analyzer as well (but not plain_search)
			$config[ 'analyzer' ][ 'plain' ][ 'filter' ][] = 'asciifolding_preserve';
			// Add asciifolding_preserve filters
			$config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve';
			break;
		case 'esperanto':
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T202173
			$config = ( new AnalyzerBuilder( $langName ) )->
				withFilters( [ 'lowercase', 'asciifolding', 'esperanto_stemmer' ] )->
				build( $config );
			break;
		case 'french':
			// Add asciifolding_preserve to filters
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T142620
			$config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve';

			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withLimitedCharMap( [ '\u02BC=>\u0027' ] )->
				withElision( [ 'l', 'm', 't', 'qu', 'n', 's', 'j', 'd', 'c',
					'jusqu', 'quoiqu', 'lorsqu', 'puisqu' ] )->
				withLightStemmer()->
				withAsciifoldingPreserve()->
				build( $config );
			break;
		case 'german':
			// Unpack German analyzer T281379
			// char map: We have to explicitly map capital ẞ to lowercase ß
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withLimitedCharMap( [ 'ẞ=>ß' ] )->
				withLightStemmer()->
				insertFiltersBefore( 'german_stemmer', [ 'german_normalization' ] )->
				build( $config );

			$config[ 'analyzer' ][ 'plain' ][ 'char_filter' ][] = 'german_charfilter';
			$config[ 'analyzer' ][ 'plain_search' ][ 'char_filter' ][] = 'german_charfilter';
			break;
		case 'greek':
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				omitAsciifolding()->
				withLangLowercase()->
				withRemoveEmpty()->
				build( $config );
			break;
		case 'hebrew':
			$config = ( new AnalyzerBuilder( $langName ) )->
				withTokenizer( 'hebrew' )->
				withFilters( [ 'niqqud', 'hebrew_lemmatizer', 'remove_duplicates', 'lowercase',
					'asciifolding' ] )->
				build( $config );
			break;
		case 'hindi':
			// Unpack Hindi analyzer T289612
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withDecimalDigit()->
				insertFiltersBefore( 'hindi_stop',
					[ 'indic_normalization', 'hindi_normalization' ] )->
				build( $config );
			break;
		case 'indonesian':
		case 'malay':
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T196780
			$config = ( new AnalyzerBuilder( 'indonesian' ) )->
				withUnpackedAnalyzer()->
				omitAsciifolding()->
				build( $config );
			break;
		case 'irish':
			$gaCharMap = [ 'ḃ=>bh', 'ċ=>ch', 'ḋ=>dh', 'ḟ=>fh', 'ġ=>gh', 'ṁ=>mh', 'ṗ=>ph',
				'ṡ=>sh', 'ẛ=>sh', 'ṫ=>th', 'Ḃ=>BH', 'Ċ=>CH', 'Ḋ=>DH', 'Ḟ=>FH', 'Ġ=>GH',
				'Ṁ=>MH', 'Ṗ=>PH', 'Ṡ=>SH', 'Ṫ=>TH' ];

			// Add b, bh, g, m for camelCase cleanup
			$gaHyphenStop = [ 'h', 'n', 't', 'b', 'bh', 'g', 'm' ];

			// Unpack Irish analyzer T289612
			// See also https://www.mediawiki.org/wiki/User:TJones_(WMF)/T217602
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withCharMap( $gaCharMap )->
				withExtraStop( $gaHyphenStop, 'irish_hyphenation', 'irish_elision', true )->
				withElision( [ 'd', 'm', 'b' ] )->
				withLangLowercase()->
				build( $config );
			break;
		case 'italian':
			// Replace the default Italian analyzer with a rebuilt copy with additional filters
			$itElision = [ 'c', 'l', 'all', 'dall', 'dell', 'nell', 'sull', 'coll', 'pell',
				'gl', 'agl', 'dagl', 'degl', 'negl', 'sugl', 'un', 'm', 't', 's', 'v', 'd' ];
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withElision( $itElision )->
				withLightStemmer()->
				build( $config );

			// Add asciifolding_preserve to the plain analyzer as well (but not plain_search)
			$config[ 'analyzer' ][ 'plain' ][ 'filter' ][] = 'asciifolding_preserve';
			// Add asciifolding_preserve to filters
			$config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve';
			break;
		case 'japanese':
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T166731
			// pre-convert fullwidth numbers because Kuromoji tokenizer treats them weirdly
			$config = ( new AnalyzerBuilder( $langName ) )->
				withNumberCharFilter( 0xff10, 'fullwidthnumfix' )->
				withCharFilters( [ 'fullwidthnumfix' ] )->
				withTokenizer( 'kuromoji_tokenizer' )->
				withFilters( [ 'kuromoji_baseform', 'cjk_width', 'ja_stop', 'kuromoji_stemmer',
					'lowercase' ] )->
				build( $config );
			break;
		case 'khmer':
			// See Khmer: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T185721
			$config = ( new AnalyzerBuilder( $langName ) )->
				withNumberCharFilter( 0x17e0 )->
				withCharFilters( [ 'khmer_syll_reorder', 'khmer_numbers' ] )->
				withFilters( [ 'lowercase' ] )->
				build( $config );
			break;
		case 'korean':
			// Unpack nori analyzer to add ICU normalization and custom filters
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T206874

			// Nori-specific character filter
			$noriMap = [
				'\u00B7=>\u0020', // convert middle dot to space
				'\u318D=>\u0020', // arae-a to space
				'\u00AD=>', // remove soft hyphens
				'\u200C=>', // remove zero-width non-joiners
			];

			// Nori-specific pattern_replace to strip combining diacritics
			$config[ 'char_filter' ][ 'nori_combo_filter' ] =
				AnalyzerBuilder::patternFilter( '[\\u0300-\\u0331]' );

			// 'mixed' mode keeps the original token plus the compound parts
			// the default is 'discard' which only keeps the parts
			$config[ 'tokenizer' ][ 'nori_tok' ] = [
				'type' => 'nori_tokenizer',
				'decompound_mode' => 'mixed',
			];

			// Nori-specific part of speech filter (add 'VCP', 'VCN', 'VX' to default)
			$config[ 'filter' ][ 'nori_posfilter' ] = [
				'type' => 'nori_part_of_speech',
				'stoptags' => [ 'E', 'IC', 'J', 'MAG', 'MAJ', 'MM', 'SP', 'SSC', 'SSO',
					'SC', 'SE', 'XPN', 'XSA', 'XSN', 'XSV', 'UNA', 'NA', 'VSV', 'VCP',
					'VCN', 'VX' ],
			];

			$config = ( new AnalyzerBuilder( $langName ) )->
				withLimitedCharMap( $noriMap, 'nori_charfilter' )->
				withCharFilters( [ 'nori_charfilter', 'nori_combo_filter' ] )->
				withTokenizer( 'nori_tok' )->
				withFilters( [ 'nori_posfilter', 'nori_readingform', 'lowercase',
					'remove_empty' ] )->
				build( $config );
			break;
		case 'mirandese':
			// Unpack default analyzer to add Mirandese-specific elision and stop words
			// See phab ticket T194941
			$mwlStopwords = require __DIR__ . '/AnalysisLanguageData/mirandeseStopwords.php';
			$config = ( new AnalyzerBuilder( $langName ) )->
				withElision( [ 'l', 'd', 'qu' ] )->
				withStop( $mwlStopwords )->
				withFilters( [ 'lowercase', 'mirandese_elision', 'mirandese_stop' ] )->
				build( $config );
			break;
		case 'persian': // Unpack Persian analyzer T325090
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withLimitedCharMap( [ '\u200C=>\u0020' ], 'zero_width_spaces' )->
				withDecimalDigit()->
				omitStemmer()->
				insertFiltersBefore( 'persian_stop',
					[ 'arabic_normalization', 'persian_normalization' ] )->
				build( $config );
			break;
		case 'polish':
			// these are real stop words for Polish
			$plStopwords = require __DIR__ . '/AnalysisLanguageData/polishStopwords.php';

			// Stempel-specific stop words--additional unreliable stems
			$stempelStopwords = [ 'ować', 'iwać', 'obić', 'snąć', 'ywać', 'ium', 'my', 'um' ];

			// Stempel is statistical, and certain stems are really terrible, so we filter them
			// after stemming. See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T186046
			// The three alternatives must be separated by an unescaped '|': an escaped
			// '\|' would only match a literal pipe and the filter would never fire.
			$config[ 'filter' ][ 'stempel_pattern_filter' ] =
				AnalyzerBuilder::patternFilter( '^([a-zął]?[a-zćń]|..ć|\d.*ć)$' );

			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withStop( $plStopwords )->
				omitStemmer()->
				omitAsciiFolding()->
				appendFilters( [ 'polish_stem', 'stempel_pattern_filter', 'remove_empty' ] )->
				withExtraStop( $stempelStopwords, 'stempel_stop' )->
				build( $config );
			break;
		case 'romanian': // Unpack Romanian analyzer T325091 / T330893
			// Counterintuitively, we need to map correct s&t (with commas) to older
			// incorrect forms (with cedilla) so that the old Snowball stemmer (from before
			// comma forms were available) will work; also normalize versions with
			// combining diacritics to single characters.
			$cedillaMap = [
				'ș=>ş', 's\u0326=>ş', 's\u0327=>ş', 'ț=>ţ', 't\u0326=>ţ', 't\u0327=>ţ',
				'Ș=>Ş', 'S\u0326=>Ş', 'S\u0327=>Ş', 'Ț=>Ţ', 'T\u0326=>Ţ', 'T\u0327=>Ţ',
			];

			// Add stopword variants with modern commas instead of old cedillas so that
			// both are handled, regardless of the character mapping needed for the
			// stemmer. In the future, Lucene should update their stopwords and these will
			// be included.
			$roStopwords = require __DIR__ . '/AnalysisLanguageData/romanianStopwords.php';

			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withCharMap( $cedillaMap )->
				withExtraStop( $roStopwords, 'ro_comma_stop', 'romanian_stemmer' )->
				build( $config );
			break;
		case 'russian':
			// unpack built-in Russian analyzer and add character filter
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T124592
			$ruCharMap = [
				'\u0301=>', // combining acute accent, only used to show stress T102298
				'\u0435\u0308=>\u0435', // T124592 fold ё=>е and Ё=>Е, with combining
				'\u0415\u0308=>\u0415', // diacritic...
				'\u0451=>\u0435', // ... or precomposed
				'\u0401=>\u0415',
			];
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withCharMap( $ruCharMap )->
				build( $config );

			// add Russian character mappings to near_space_flattener, and convert it from
			// limited_mapping to mapping to handle multi-char maps
			$config[ 'char_filter' ][ 'near_space_flattener' ][ 'type' ] = 'mapping';
			array_push( $config[ 'char_filter' ][ 'near_space_flattener' ][ 'mappings' ],
				...$ruCharMap );

			// Drop acute stress marks and fold ё=>е everywhere
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T124592
			$config[ 'analyzer' ][ 'plain' ][ 'char_filter' ][] = 'russian_charfilter';
			$config[ 'analyzer' ][ 'plain_search' ][ 'char_filter' ][] = 'russian_charfilter';

			$config[ 'analyzer' ][ 'suggest' ][ 'char_filter' ][] = 'russian_charfilter';
			$config[ 'analyzer' ][ 'suggest_reverse' ][ 'char_filter' ][] = 'russian_charfilter';
			break;
		case 'slovak':
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T190815
			// and https://www.mediawiki.org/wiki/User:TJones_(WMF)/T223787
			$config = ( new AnalyzerBuilder( $langName ) )->
				withFilters( [ 'lowercase', 'slovak_stemmer', 'asciifolding' ] )->
				build( $config );
			break;
		case 'sorani': // Unpack Sorani analyzer T325091
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withDecimalDigit()->
				insertFiltersBefore( 'lowercase', [ 'sorani_normalization' ] )->
				build( $config );
			break;
		case 'swedish':
			// Add asciifolding_preserve to lowercase_keyword
			// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T160562
			$config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve';

			// Unpack built-in swedish analyzer to add asciifolding_preserve
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withAsciifoldingPreserve()->
				build( $config );
			break;
		case 'thai':
			// Unpack and improve Thai analyzer: T294147
			$thCharMap = [
				'_=>\u0020', // split tokens on underscore ..
				';=>\u0020', // .. semicolon
				':=>\u0020', // .. colon
				'·=>\u0020', // .. middle dot
				'‧=>\u0020', // .. & hyphenation point
				'ฃ=>ข', // replace obsolete ฃ
				'ฅ=>ค', // replace obsolete ฅ
				'\u0e4d\u0e32=>\u0e33', // compose nikhahit + sara aa = sara am
				'\u0e4d\u0e48\u0e32=>\u0e48\u0e33', // recompose sara am split around..
				'\u0e4d\u0e49\u0e32=>\u0e49\u0e33', // .. other diacritics
				'\u0e33\u0e48=>\u0e48\u0e33', // sara am should consistently..
				'\u0e33\u0e49=>\u0e49\u0e33', // .. come after other diacritics
				'\u0E34\u0E4D=>\u0E36', // compose sara i + nikhahit = sara ue..
				'\u0E4D\u0E34=>\u0E36', // .. in either order
			];

			// instantiate basic unpacked analyzer builder, plus thai tokenizer by default
			$thBuilder = ( new AnalyzerBuilder( $langName ) )
				->withUnpackedAnalyzer()
				->withTokenizer( 'thai' );

			if ( $this->isIcuAvailable() ) {
				// ICU tokenizer is preferred in general. If it is available, replace
				// default tokenizer. Also add thai_repl_pat char filter to accommodate
				// some of its weaknesses.
				$thBuilder->withTokenizer( $this->icu_tokenizer );

				$thaiLetterPat = '[ก-๏]'; // Thai characters, except for digits.
				$config[ 'char_filter' ][ 'thai_repl_pat' ] =
					// break between any digits and Thai letters, or vice versa
					// break Thai tokens on periods (by making them spaces)
					// (regex look-behind is okay, but look-ahead breaks offsets)
					// The alternatives are joined with an unescaped '|'; exactly one
					// of $1/$2/$3 is non-empty for any given match.
					AnalyzerBuilder::patternFilter( "(?<=\\p{Nd})($thaiLetterPat)" .
						"|(?<=$thaiLetterPat)(\\p{Nd})" .
						"|(?<=$thaiLetterPat)\.($thaiLetterPat)",
						' $1$2$3' );
				$thBuilder->withCharFilters( [ 'thai_repl_pat' ] );

				// if icu_token_repair (in the textify plugin) is available, we need a
				// reverse number map so it doesn't rejoin split-off Arabic numbers.
				if ( $this->isTextifyAvailable() ) {
					$thBuilder->withReversedNumberCharFilter( 0x0e50 );
				}
			} else {
				// if we have to settle for the Thai tokenizer, add some additional
				// character filters to accommodate some of its weaknesses
				$thThaiTokSplits = [
					'\u200B=>', // delete zero width space
					'-=>\u0020', // split tokens on hyphen-minus ..
					'‐=>\u0020', // .. hyphen
					'–=>\u0020', // .. en dash
					'—=>\u0020', // .. em dash
					'―=>\u0020', // .. horizontal bar
					'－=>\u0020', // .. fullwidth hyphen
					'"=>\u0020', // .. & double quote
				];
				array_push( $thCharMap, ...$thThaiTokSplits );
			}

			// add in the rest of the bits that are always needed, and build
			$config = $thBuilder->withCharMap( $thCharMap )->
				withDecimalDigit()->
				omitStemmer()->
				build( $config );
			break;
		case 'turkish':
			$trAposFilter = 'apostrophe';
			if ( in_array( 'extra-analysis-turkish', $this->plugins, true ) ) {
				$trAposFilter = 'better_apostrophe';
			}
			$config = ( new AnalyzerBuilder( $langName ) )->
				withUnpackedAnalyzer()->
				withLangLowercase()->
				insertFiltersBefore( 'turkish_stop', [ $trAposFilter ] )->
				build( $config );
			break;
		case 'ukrainian-unpacked':
			$this->languagesWithIcuFolding['uk'] = true;
			$ukCharMap = [
				'‘=>\'', // normalize apostrophes
				'’=>\'',
				'`=>\'',
				'´=>\'',
				'ʼ=>\'',
				'\u0301=>', // delete combining acute and soft hyphen
				'\u00AD=>',
				'ґ=>г', // normalize ghe with upturn
				'Ґ=>Г',
			];
			// lowercase twice because stopwords are case sensitive, and the stemmer
			// generates some output with uppercase initial letters, even for
			// lowercase input (usually proper names)
			$ukFilters = [ 'lowercase', 'ukrainian_stop', 'ukrainian_stemmer',
				'lowercase', 'remove_duplicates', 'asciifolding' ];
			$config = ( new AnalyzerBuilder( 'ukrainian' ) )->
				withLimitedCharMap( $ukCharMap )->
				withCharFilters( [ 'ukrainian_charfilter' ] )->
				withFilters( $ukFilters )->
				build( $config );
			break;
		default:
			// do nothing--default config is already set up
			break;
	}

	// text_search is just a copy of text
	// @phan-suppress-next-line PhanTypeInvalidDimOffset
	$config[ 'analyzer' ][ 'text_search' ] = $config[ 'analyzer' ][ 'text' ];

	// replace lowercase filters with icu_normalizer filter
	if ( $this->isIcuAvailable() ) {
		foreach ( $config[ 'analyzer' ] as &$analyzer ) {
			if ( !isset( $analyzer[ 'filter' ] ) ) {
				continue;
			}

			$tmpFilters = [];
			foreach ( $analyzer[ 'filter' ] as $filter ) {
				if ( $filter === 'lowercase' ) {
					// If lowercase filter has language-specific processing, keep it,
					// and do it before ICU normalization, particularly for Greek,
					// Irish, and Turkish
					// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T203117
					// See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T217602
					if ( isset( $config[ 'filter' ][ 'lowercase' ][ 'language' ] ) ) {
						$tmpFilters[] = 'lowercase';
					}
					$tmpFilters[] = 'icu_normalizer';
				} else {
					$tmpFilters[] = $filter;
				}
			}
			$analyzer[ 'filter' ] = $tmpFilters;
		}
		// Break the reference so the last analyzer entry is no longer aliased
		// by the loop variable.
		unset( $analyzer );
	}

	return $config;
}
1444
/**
 * Workaround for https://issues.apache.org/jira/browse/LUCENE-7468
 * The preserve_original duplicates token even if they are
 * not modified, leading to more space used and wrong term frequencies.
 * Workaround is to append a unique filter to remove the dups.
 * (made public for unit tests)
 *
 * Only analyzers that have a filter list and whose 'type' is either unset or
 * 'custom' are touched. In each of those, 'dedup_asciifolding' is inserted
 * right after the first 'asciifolding_preserve' entry. The
 * 'dedup_asciifolding' filter definition is added to $config['filter'] only
 * if at least one analyzer was changed.
 *
 * @param mixed[] $config
 * @return mixed[] update mapping
 */
public function fixAsciiFolding( array $config ) {
	// Nothing to scan; also avoids iterating by reference over a missing key,
	// which would add a null 'analyzer' entry to the returned config.
	if ( empty( $config[ 'analyzer' ] ) ) {
		return $config;
	}

	$needDedupFilter = false;
	foreach ( $config[ 'analyzer' ] as &$analyzer ) {
		if ( isset( $analyzer[ 'type' ] ) && $analyzer[ 'type' ] !== 'custom' ) {
			continue;
		}
		if ( !isset( $analyzer[ 'filter' ] ) ) {
			continue;
		}
		$asciiIdx = array_search( 'asciifolding_preserve', $analyzer[ 'filter' ], true );
		if ( $asciiIdx !== false ) {
			$needDedupFilter = true;
			array_splice( $analyzer[ 'filter' ], $asciiIdx + 1, 0, [ 'dedup_asciifolding' ] );
		}
	}
	// Break the reference so the last analyzer is not aliased after the loop.
	unset( $analyzer );

	if ( $needDedupFilter ) {
		$config[ 'filter' ][ 'dedup_asciifolding' ] = [
			'type' => 'unique',
			'only_on_same_position' => true,
		];
	}
	return $config;
}
1478
/**
 * Map a language code to the name of the analysis chain to use for it.
 *
 * This is a plain exact-match lookup in $this->elasticsearchLanguageAnalyzers;
 * any code that is not a key of that table yields 'default'.
 *
 * @param string $language Config language
 * @return string the analyzer type
 */
public function getDefaultTextAnalyzerType( $language ) {
	// Exact match on the language code wins; everything else is 'default'.
	return array_key_exists( $language, $this->elasticsearchLanguageAnalyzers )
		? $this->elasticsearchLanguageAnalyzers[ $language ]
		: 'default';
}
1495
/**
 * Collect the filters that the given analyzers use by name but that have no
 * entry in $config['filter'].
 *
 * @param array[] &$config Full configuration array
 * @param string[] $analyzers List of analyzers to consider.
 * @return array Map of filter name => [ 'type' => filter name ]
 */
private function getDefaultFilters( array &$config, array $analyzers ) {
	$implicit = [];
	foreach ( $analyzers as $analyzerName ) {
		if ( !empty( $config[ 'analyzer' ][ $analyzerName ][ 'filter' ] ) ) {
			foreach ( $config[ 'analyzer' ][ $analyzerName ][ 'filter' ] as $candidate ) {
				if ( isset( $config[ 'filter' ][ $candidate ] ) ) {
					// Explicitly configured, so not a built-in default.
					continue;
				}
				// Built-in filter: its default definition is just its own type.
				$implicit[ $candidate ] = [ 'type' => $candidate ];
			}
		}
	}
	return $implicit;
}
1518
/**
 * Compare each filter in $config['filter'] with the already-known filter of
 * the same name (looked up in $standardFilters first, then $defaultFilters).
 *
 *  - No known filter of that name: the filter is returned under its own name.
 *  - Known and equal: the filter is dropped from the result.
 *  - Known but different: the filter is returned under "<prefix>_<name>" and
 *    every analyzer in $config is updated to reference the new name.
 *
 * @param array[] &$config Configuration being processed
 * @param array[] $standardFilters Existing filters list
 * @param array[] $defaultFilters List of default filters already mentioned in the config
 * @param string $prefix Prefix for disambiguation
 * @return array[] The list of filters not in the old config.
 */
private function resolveFilters( array &$config, array $standardFilters, array $defaultFilters,
	string $prefix ) {
	$novel = [];
	foreach ( $config[ 'filter' ] as $filterName => $definition ) {
		$known = $standardFilters[$filterName] ?? $defaultFilters[$filterName] ?? null;
		if ( !$known ) {
			// Name is not taken yet: keep the filter as is.
			$novel[ $filterName ] = $definition;
			continue;
		}
		if ( $known == $definition ) {
			// Same name, same content: already covered.
			continue;
		}
		// Same name, different content: disambiguate with the prefix.
		$prefixedName = $prefix . '_' . $filterName;
		$this->replaceFilter( $config, $filterName, $prefixedName );
		$novel[ $prefixedName ] = $definition;
	}
	return $novel;
}
1549
/**
 * Rename a filter in the filter list of every analyzer in $config.
 * Analyzers without a filter list are left alone.
 *
 * @param array[] &$config Configuration being processed
 * @param string $oldName
 * @param string $newName
 */
private function replaceFilter( array &$config, $oldName, $newName ) {
	$rename = static function ( $filterName ) use ( $oldName, $newName ) {
		return $filterName === $oldName ? $newName : $filterName;
	};
	foreach ( $config[ 'analyzer' ] as $analyzerName => $definition ) {
		if ( isset( $definition[ 'filter' ] ) ) {
			$config[ 'analyzer' ][ $analyzerName ][ 'filter' ] =
				array_map( $rename, $definition[ 'filter' ] );
		}
	}
}
1569
/**
 * Copy one analyzer from a per-language config into the main config under
 * the name "<prefix>_<name>", together with the filter, char_filter and
 * tokenizer definitions it depends on.
 *
 * @param array &$config Main config
 * @param array $langConfig Per-language config
 * @param string $name Name for analyzer whose config we're merging
 * @param string $prefix Prefix for this configuration
 */
private function mergeConfig( array &$config, array $langConfig, $name, $prefix ) {
	$definition = $langConfig[ 'analyzer' ][ $name ];
	$config[ 'analyzer' ][ $prefix . '_' . $name ] = $definition;

	// Filters and char_filters are handled the same way: copy a component only
	// if the language config defines it and the main config does not have it yet.
	// - Filters already present in the main config are known to be identical
	//   because of the resolution step (otherwise they would have been renamed).
	// - Some non-configurable char_filters come from plugins and have no local
	//   definition (e.g., camelCase_splitter), hence the existence check.
	foreach ( [ 'filter', 'char_filter' ] as $section ) {
		if ( empty( $definition[ $section ] ) ) {
			continue;
		}
		foreach ( $definition[ $section ] as $component ) {
			if ( !isset( $langConfig[ $section ][ $component ] ) ) {
				continue;
			}
			if ( !isset( $config[ $section ][ $component ] ) ) {
				$config[ $section ][ $component ] = $langConfig[ $section ][ $component ];
			}
		}
	}

	// An analyzer has at most one tokenizer; same copy rule as above.
	if ( empty( $definition[ 'tokenizer' ] ) ) {
		return;
	}
	$tokenizerName = $definition[ 'tokenizer' ];
	if ( isset( $langConfig[ 'tokenizer' ][ $tokenizerName ] ) &&
		!isset( $config[ 'tokenizer' ][ $tokenizerName ] )
	) {
		$config[ 'tokenizer' ][ $tokenizerName ] = $langConfig[ 'tokenizer' ][ $tokenizerName ];
	}
}
1615
/**
 * Add per-language copies of the given analyzers to $config, renaming
 * filters whose definition differs between languages so they do not clash.
 *
 * @param array &$config Existing config, will be modified
 * @param string[] $languages List of languages to process
 * @param string[] $analyzers List of analyzers to process
 */
public function buildLanguageConfigs( array &$config, array $languages, array $analyzers ) {
	// First pass: gather the implicitly-defined (built-in) filters used by the
	// main config and by every language config.
	$knownDefaults = $this->getDefaultFilters( $config, $analyzers );
	foreach ( $languages as $code ) {
		$perLanguage = $this->buildConfig( $code );
		$knownDefaults = $knownDefaults + $this->getDefaultFilters( $perLanguage, $analyzers );
	}

	// Second pass: resolve name clashes, then merge each requested analyzer.
	foreach ( $languages as $code ) {
		$perLanguage = $this->buildConfig( $code );
		// Analyzer is: tokenizer + filter + char_filter
		// Char filters & Tokenizers are nicely namespaced
		// Filters are NOT - e.g. lowercase & icu_folding filters are different for different
		// languages! So we need to do some disambiguation here.
		$resolved = $this->resolveFilters( $perLanguage, $config[ 'filter' ], $knownDefaults, $code );
		$perLanguage[ 'filter' ] = $resolved;

		foreach ( $analyzers as $analyzerName ) {
			$this->mergeConfig( $config, $perLanguage, $analyzerName, $code );
		}
	}
}
1643
/**
 * Report the value of the builder's ICU flag.
 *
 * @return bool true if the icu analyzer is available.
 */
public function isIcuAvailable() {
	return $this->icu;
}
1650
/**
 * Report the value of the builder's textify flag.
 *
 * @return bool true if the textify plugin is available.
 */
public function isTextifyAvailable() {
	return $this->textify;
}
1657
/**
 * Apply the global custom filters (e.g., homoglyph & nnbsp filters) to the
 * config for one language.
 *
 * Delegates to GlobalCustomFilter::enableGlobalCustomFilters(), passing this
 * builder's global custom filter list and plugin list.
 *
 * @param mixed[] $config
 * @param string $language language to add plugin to
 * @return mixed[] updated config
 */
public function enableGlobalCustomFilters( array $config, string $language ) {
	return GlobalCustomFilter::enableGlobalCustomFilters(
		$config,
		$language,
		$this->globalCustomFilters,
		$this->plugins
	);
}
1669
/**
 * Map from language code to the name of its custom analysis chain (either an
 * Elastic built-in analyzer or our own custom analysis). Languages that are
 * not listed fall back to the default analyzer, which isn't too good.
 *
 * The entries are kept roughly in alphabetical order of the value (analyzer
 * name), not of the language code. The Elastic list is sourced from
 * https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html
 *
 * @var string[]
 */
private $elasticsearchLanguageAnalyzers = [
	'ar' => 'arabic',
	'ary' => 'arabic-moroccan',
	'arz' => 'arabic-egyptian',
	'hy' => 'armenian',
	'az' => 'azerbaijani',
	'eu' => 'basque',
	'bn' => 'bengali',
	'pt-br' => 'brazilian',
	'bg' => 'bulgarian',
	'ca' => 'catalan',
	'crh' => 'crimean-tatar',
	'ja' => 'cjk',
	'ko' => 'cjk',
	'cs' => 'czech',
	'da' => 'danish',
	'nl' => 'dutch',
	'en' => 'english',
	'en-ca' => 'english',
	'en-gb' => 'english',
	'simple' => 'english',
	'et' => 'estonian',
	'fi' => 'finnish',
	'fr' => 'french',
	'gag' => 'gagauz',
	'gl' => 'galician',
	'de' => 'german',
	'el' => 'greek',
	'hi' => 'hindi',
	'hu' => 'hungarian',
	'id' => 'indonesian',
	'ga' => 'irish',
	'it' => 'italian',
	'kk' => 'kazakh',
	'lt' => 'lithuanian',
	'lv' => 'latvian',
	'ms' => 'malay',
	'mwl' => 'mirandese',
	'nb' => 'norwegian',
	'nn' => 'norwegian',
	'no' => 'norwegian',
	'fa' => 'persian',
	'pt' => 'portuguese',
	'ro' => 'romanian',
	'ru' => 'russian',
	'ckb' => 'sorani',
	'es' => 'spanish',
	'sv' => 'swedish',
	'tt' => 'tatar',
	'tr' => 'turkish',
	'th' => 'thai',
];
1731
/**
 * Languages for which ICU folding can be enabled by default.
 *
 * @var bool[] indexed by language code
 */
private $languagesWithIcuFolding = [
	'ar' => true,
	'ary' => true,
	'arz' => true,
	'bg' => true,
	'bn' => true,
	'bs' => true,
	'ca' => true,
	'ckb' => true,
	'cs' => true,
	'da' => true,
	'de' => true,
	'el' => true,
	'en' => true,
	'en-ca' => true,
	'en-gb' => true,
	'simple' => true,
	'eo' => true,
	'es' => true,
	'et' => true,
	'eu' => true,
	'fa' => true,
	'fi' => true,
	'fr' => true,
	'ga' => true,
	'gl' => true,
	'he' => true,
	'hi' => true,
	'hr' => true,
	'hu' => true,
	'hy' => true,
	'ja' => true,
	'lt' => true,
	'lv' => true,
	'nb' => true,
	'nl' => true,
	'nn' => true,
	'no' => true,
	'pt' => true,
	'pt-br' => true,
	'ro' => true,
	'ru' => true,
	'sh' => true,
	'sk' => true,
	'sr' => true,
	'sv' => true,
	'th' => true,
	'tr' => true,
];
1785
/**
 * Per-language override for ICU tokenization.
 *
 * A value of true means the language should always replace the standard
 * tokenizer with the icu_tokenizer by default. A value of false means the
 * language should never use any version of the icu_tokenizer, even when
 * icu_token_repair is available.
 *
 * (Reminder to future readers of this code: languages with non-standard
 * tokenizers in the text field, like zh/Chinese, still use icu_tokenizer in
 * the plain fields & suggest fields.)
 *
 * @var bool[] indexed by language code
 */
private $languagesWithIcuTokenization = [
	// true => prefer any available version of icu_tokenizer over the
	// standard tokenizer
	'bo' => true,
	'dz' => true,
	'gan' => true,
	'ja' => true,
	'km' => true,
	'lo' => true,
	'my' => true,
	'th' => true,
	'wuu' => true,
	'zh' => true,
	'lzh' => true, // zh-classical
	'zh-classical' => true, // deprecated code for lzh
	'yue' => true, // zh-yue
	'zh-yue' => true, // deprecated code for yue
	// The languages listed below may use mixed scripts
	'bug' => true,
	'cdo' => true,
	'cr' => true,
	'hak' => true,
	'jv' => true,
	'nan' => true, // zh-min-nan
	'zh-min-nan' => true, // deprecated code for nan

	// false => do not use any version of icu_tokenizer (i.e., textify_icu_tokenizer)
	// over the standard tokenizer, even when icu_token_repair is available
	// 'xyz' => false, // <-- example entry for now, since there are no actual instances
];
1823
/**
 * Language analyzers that are provided by plugins. Each key names the
 * required plugin(s); a multiple plugin requirement can be comma separated.
 * Each value maps language codes to the analyzer name to use.
 *
 * Background notes per language:
 * Polish: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T154517
 * Ukrainian: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T160106
 * Chinese: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T158203
 * Hebrew: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T162741
 * Serbian: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T183015
 * Bosnian, Croatian, and Serbo-Croatian:
 *   https://www.mediawiki.org/wiki/User:TJones_(WMF)/T192395
 * Slovak: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T190815
 * Esperanto: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T202173
 * Korean: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T206874
 * Khmer: https://www.mediawiki.org/wiki/User:TJones_(WMF)/T185721
 *
 * Entry order matters: extra-analysis-ukrainian should follow
 * analysis-ukrainian, so that ukrainian-unpacked can overwrite the value for
 * uk if both plugins are present.
 *
 * @var array[]
 */
private $elasticsearchLanguageAnalyzersFromPlugins = [
	'analysis-stempel' => [ 'pl' => 'polish' ],
	'analysis-kuromoji' => [ 'ja' => 'japanese' ],
	'analysis-stconvert,analysis-smartcn' => [ 'zh' => 'chinese' ],
	'analysis-hebrew' => [ 'he' => 'hebrew' ],
	'analysis-ukrainian' => [ 'uk' => 'ukrainian' ],
	'extra-analysis-ukrainian' => [ 'uk' => 'ukrainian-unpacked' ],
	'extra-analysis-esperanto' => [ 'eo' => 'esperanto' ],
	'extra-analysis-serbian' => [
		'bs' => 'bosnian',
		'hr' => 'croatian',
		'sh' => 'serbo-croatian',
		'sr' => 'serbian',
	],
	'extra-analysis-slovak' => [ 'sk' => 'slovak' ],
	'analysis-nori' => [ 'ko' => 'korean' ],
	'extra-analysis-khmer' => [ 'km' => 'khmer' ],
];
1860
/**
 * Set up global custom filters.
 *
 * The filters are registered one by one in the intended order and the list
 * is reversed before it is returned; string keys are preserved by the
 * reversal.
 *
 * @return array GlobalCustomFilter instances keyed by filter name
 */
private static function buildGlobalCustomFilters(): array {
	$gcf = [];

	//////////////////////////
	// char filters
	$gcf[ 'globo_norm' ] = new GlobalCustomFilter( 'char_filter' );

	// acronym_fixer must follow armenian_charfilter, which normalizes another
	// period-like character, if it is being used
	$gcf[ 'acronym_fixer' ] = ( new GlobalCustomFilter( 'char_filter' ) )
		->setRequiredPlugins( [ 'extra-analysis-textify' ] )
		->setFallbackFilter( 'regex_acronym_fixer' )
		->setMustFollowFilters( [ 'armenian_charfilter' ] );

	// camelCase should generally follow acronyms so a.c.r.o.C.a.m.e.l. is
	// treated the same as acroCamel (real example: G.m.b.H. vs GmbH)
	$gcf[ 'camelCase_splitter' ] = ( new GlobalCustomFilter( 'char_filter' ) )
		->setRequiredPlugins( [ 'extra-analysis-textify' ] )
		->setFallbackFilter( 'regex_camelCase' )
		->setMustFollowFilters( [ 'acronym_fixer', 'regex_acronym_fixer' ] );

	// word_break_helper:
	// * acronyms should be fixed before converting periods to spaces
	// * follow armenian_charfilter, which normalizes another period-like
	//   character, if it is being used
	$gcf[ 'word_break_helper' ] = ( new GlobalCustomFilter( 'char_filter' ) )
		->setMustFollowFilters( [
			'acronym_fixer',
			'regex_acronym_fixer',
			'armenian_charfilter',
		] )
		->setLanguageDenyList( [ 'ko', 'zh' ] );

	// dotted_I_fix:
	// - if lowercase is present (because analysis-icu is not available, or as
	//   a language-specific version) we don't need dotted_I_fix, because
	//   lowercase prevents the problem.
	// - if icu_folding is present, we don't need dotted_I_fix, because
	//   icu_folding also fixes it.
	$gcf[ 'dotted_I_fix' ] = ( new GlobalCustomFilter( 'char_filter' ) )
		->setDisallowedTokenFilters( [ 'lowercase', 'icu_folding' ] );

	// Mappings that are best for Arabic and Persian; default for any other
	// language except Sorani (ckb), which prefers Persian characters and has
	// its own mapping (task reference given as "TT72899" - possibly a typo
	// for T72899, TODO confirm)
	$gcf[ 'arabic_extended_norm' ] = ( new GlobalCustomFilter( 'char_filter' ) )
		->setLanguageDenyList( [ 'ckb' ] );

	//////////////////////////
	// token filters

	// apply icu_token_repair to icu_tokenizer-using analyzers
	// (default == text & text_search)
	$gcf[ 'icu_token_repair' ] = ( new GlobalCustomFilter( 'filter' ) )
		->setRequiredPlugins( [ 'extra-analysis-textify' ] )
		->setRequiredTokenizer( 'textify_icu_tokenizer' );

	// apply the icu_token_repair variant to non-camelCase-splitting
	// icu_tokenizer-using analyzers when textify_icu_tokenizer is used
	$gcf[ 'icutokrep_no_camel_split' ] = ( new GlobalCustomFilter( 'filter' ) )
		->setRequiredPlugins( [ 'extra-analysis-textify' ] )
		->setApplyToAnalyzers( [
			'plain',
			'plain_search',
			'suggest',
			'suggest_reverse',
			'source_text_plain',
			'source_text_plain_search',
			'word_prefix',
		] )
		->setRequiredTokenizer( 'textify_icu_tokenizer' );

	// aggressive_splitting has weird graph problems and creating multiple
	// tokens makes it blow up
	$gcf[ 'homoglyph_norm' ] = ( new GlobalCustomFilter( 'filter' ) )
		->setRequiredPlugins( [ 'extra-analysis-homoglyph' ] )
		->setMustFollowFilters( [ 'aggressive_splitting' ] );

	// reverse the array so that items are ordered (approximately, modulo
	// incompatible filters) in the order specified here
	return array_reverse( $gcf );
}
1934
1935	}