Code Coverage for /workspace/src/extensions/CirrusSearch/includes/Maintenance/SuggesterAnalysisConfigBuilder.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	78.61% covered (warning)	78.61%	147 / 187	40.00% covered (danger)	40.00%	2 / 5	CRAP	0.00% covered (danger)	0.00%	0 / 1
SuggesterAnalysisConfigBuilder	78.61% covered (warning)	78.61%	147 / 187	40.00% covered (danger)	40.00%	2 / 5	21.17	0.00% covered (danger)	0.00%	0 / 1
defaults	79.43% covered (warning)	79.43%	112 / 141	0.00% covered (danger)	0.00%	0 / 1	6.31
customize	76.19% covered (warning)	76.19%	32 / 42	0.00% covered (danger)	0.00%	0 / 1	10.09
buildConfig	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
getDefaultStopSet	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
hasStopWords	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2

1	<?php
2
3	namespace CirrusSearch\Maintenance;
4
5	/**
6	* Builds elasticsearch analysis config arrays for the completion suggester
7	* index.
8	*
9	* This program is free software; you can redistribute it and/or modify
10	* it under the terms of the GNU General Public License as published by
11	* the Free Software Foundation; either version 2 of the License, or
12	* (at your option) any later version.
13	*
14	* This program is distributed in the hope that it will be useful,
15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17	* GNU General Public License for more details.
18	*
19	* You should have received a copy of the GNU General Public License along
20	* with this program; if not, write to the Free Software Foundation, Inc.,
21	* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22	* http://www.gnu.org/copyleft/gpl.html
23	*/
24
25	class SuggesterAnalysisConfigBuilder extends AnalysisConfigBuilder {
26	public const VERSION = "1.4";
27
/**
 * Assemble the baseline analysis config shared by every language.
 *
 * The returned array has three sections: 'char_filter', 'filter' and
 * 'analyzer'. ICU based components replace the stock ones when the
 * corresponding checks on this builder succeed.
 *
 * @param string $language Config language
 * @return array
 */
protected function defaults( $language ) {
	// Lowercasing: ICU normalizer when ICU is available, stock lowercase filter otherwise
	$lowercaseFilter = $this->isIcuAvailable()
		? [ "type" => "icu_normalizer", "name" => "nfkc_cf" ]
		: [ 'type' => 'lowercase' ];

	// Accent folding: ICU folding if the plugin is available and activated
	// in the config, the default Lucene ASCII filter otherwise
	if ( $this->shouldActivateIcuFolding( $language ) ) {
		$foldingFilter = [ 'type' => 'icu_folding' ];
		$icuSetFilter = $this->getICUSetFilter( $language );
		if ( $icuSetFilter !== null ) {
			$foldingFilter['unicodeSetFilter'] = $icuSetFilter;
		}
	} else {
		$foldingFilter = [ 'type' => 'asciifolding' ];
	}

	// We cannot use the icu_tokenizer for plain here even if icu
	// tokenization is mostly needed for languages where space is not
	// used to break words. We don't want to break some punctuation
	// chars like ':'
	$wordTokenizer = $this->shouldActivateIcuTokenization( $language ) ? 'icu_tokenizer' : 'standard';
	$plainTokenizer = 'whitespace';

	$charFilters = [
		'word_break_helper' => [
			'type' => 'mapping',
			'mappings' => [
				'_=>\u0020', // a space for mw
				',=>\u0020', // useful for "Lastname, Firstname"
				'"=>\u0020', // " certainly phrase search?
				'-=>\u0020', // useful for hyphenated names
				"'=>\u0020", // Useful for finding names
				'\u2019=>\u0020', // Unicode right single quote
				'\u02BC=>\u0020', // Unicode modifier letter apostrophe
				// "(" and ")" are deliberately not mapped: doing so would help
				// queries like "john smith explo" match "john smith (expl"
				// but would be annoying when searching for "(C)".
				// ":" is deliberately not mapped either: ignoring it can be
				// misleading for expert users because unrelated pages would
				// be returned when searching for "magic keywords" like WP:WP,
				// which are sometimes pages in the main namespace that
				// redirect to another namespace.
				// Others are the ones ignored by common search engines
				';=>\u0020',
				'\\[=>\u0020',
				'\\]=>\u0020',
				'{=>\u0020',
				'}=>\u0020',
				'\\\\=>\u0020',
				// Unicode white spaces cause issues with completion.
				// Only a few of them were actually identified as
				// problematic but more are added for extra safety.
				// see: T156234
				// TODO: reevaluate with es5
				'\u00a0=>\u0020',
				'\u1680=>\u0020',
				'\u180e=>\u0020',
				'\u2000=>\u0020',
				'\u2001=>\u0020',
				'\u2002=>\u0020',
				'\u2003=>\u0020',
				'\u2004=>\u0020',
				'\u2005=>\u0020',
				'\u2006=>\u0020',
				'\u2007=>\u0020',
				'\u2008=>\u0020',
				'\u2009=>\u0020',
				'\u200a=>\u0020',
				'\u200b=>\u0020', // causes issue
				'\u200c=>\u0020', // causes issue
				'\u200d=>\u0020', // causes issue
				'\u202f=>\u0020',
				'\u205f=>\u0020',
				'\u3000=>\u0020',
				'\ufeff=>\u0020', // causes issue
			],
		],
	];

	$tokenFilters = [
		"stop_filter" => [
			"type" => "stop",
			"stopwords" => "_none_",
			"remove_trailing" => "true"
		],
		"lowercase" => $lowercaseFilter,
		"accentfolding" => $foldingFilter,
		"token_limit" => [
			"type" => "limit",
			"max_token_count" => "20"
		],
		// Workaround what seems to be a bug in the completion suggester,
		// empty tokens cause an issue similar to
		// https://github.com/elastic/elasticsearch/pull/11158
		// Can be removed with es5 if we want; note that icu_folding can
		// introduce empty tokens, so maybe it is best to leave this in place
		"remove_empty" => [
			"type" => "length",
			"min" => 1,
		],
	];

	// Analyzer that lowercases and folds but keeps stop words
	$foldingAnalyzer = [
		"type" => "custom",
		"filter" => [ "lowercase", "accentfolding", "remove_empty", "token_limit" ],
		"tokenizer" => $wordTokenizer,
	];
	// Analyzer that only breaks words and lowercases
	$plainAnalyzer = [
		"type" => "custom",
		"char_filter" => [ 'word_break_helper' ],
		"filter" => [ "remove_empty", "token_limit", "lowercase" ],
		"tokenizer" => $plainTokenizer,
	];

	$analyzers = [
		"stop_analyzer" => [
			"type" => "custom",
			"filter" => [ "lowercase", "stop_filter", "accentfolding", "remove_empty", "token_limit" ],
			"tokenizer" => $wordTokenizer,
		],
		// We do not remove stop words when searching, this leads to
		// extremely weird behaviors while writing "to be or no to be"
		"stop_analyzer_search" => $foldingAnalyzer,
		"plain" => $plainAnalyzer,
		"plain_search" => $plainAnalyzer,
	];

	if ( $this->config->getElement( 'CirrusSearchCompletionSuggesterSubphrases', 'build' ) ) {
		$analyzers['subphrases'] = $foldingAnalyzer;
		$analyzers['subphrases_search'] = $foldingAnalyzer;
	}

	return [
		'char_filter' => $charFilters,
		'filter' => $tokenFilters,
		'analyzer' => $analyzers,
	];
}
218
/**
 * Apply language specific adjustments to a baseline analysis config.
 *
 * - sets the stop word list used by 'stop_filter' for the language
 * - for the 'arabic' and 'russian' text analyzer types, registers an extra
 *   mapping char filter and appends it to the 'plain' and 'plain_search'
 *   analyzers
 * - when ICU is available, replaces the 'lowercase' filter with
 *   'icu_normalizer' in the 'stop_analyzer' and 'stop_analyzer_search'
 *   analyzers
 *
 * @param array $config Config to customize; modified on a copy, the result is returned
 * @param string $language
 * @return array The customized config
 */
private function customize( array $config, $language ) {
	$config['filter']['stop_filter']['stopwords'] = $this->getDefaultStopSet( $language );

	switch ( $this->getDefaultTextAnalyzerType( $language ) ) {
		// Please add languages in alphabetical order.
		case 'arabic':
			$config['char_filter']['arabic_numeral_map'] = [
				// T117217 fold Eastern Arabic Numerals (٠۱۲۳...) into Western (0123...)
				'type' => 'mapping',
				'mappings' => [
					'\u0660=>0', '\u0661=>1', '\u0662=>2',
					'\u0663=>3', '\u0664=>4', '\u0665=>5',
					'\u0666=>6', '\u0667=>7', '\u0668=>8',
					'\u0669=>9',
				],
			];

			// add arabic_numeral_map to plain and copy plain to plain_search
			$config['analyzer']['plain']['char_filter'][] = 'arabic_numeral_map';
			$config['analyzer']['plain_search'] = $config['analyzer']['plain'];
			break;
		case 'russian':
			$config['char_filter']['russian_diacritic_map'] = [
				'type' => 'mapping',
				'mappings' => [
					// T102298 ignore combining acute / stress accents
					'\u0301=>',
					// T124592 fold ё=>е and Ё=>Е, precomposed or with combining diacritic
					'\u0451=>\u0435',
					'\u0401=>\u0415',
					'\u0435\u0308=>\u0435',
					'\u0415\u0308=>\u0415',
				],
			];

			// add russian_diacritic_map to plain and copy plain to plain_search
			$config['analyzer']['plain']['char_filter'][] = 'russian_diacritic_map';
			$config['analyzer']['plain_search'] = $config['analyzer']['plain'];
			break;
	}

	if ( $this->isIcuAvailable() ) {
		// Only the stop analyzers are rewritten. They are addressed by key
		// rather than through a by-reference loop so that no reference
		// into $config is left behind.
		foreach ( [ 'stop_analyzer', 'stop_analyzer_search' ] as $analyzerName ) {
			if ( !isset( $config['analyzer'][$analyzerName]['filter'] ) ) {
				continue;
			}
			$config['analyzer'][$analyzerName]['filter'] = array_map(
				static fn ( $filter ) => $filter === 'lowercase' ? 'icu_normalizer' : $filter,
				$config['analyzer'][$analyzerName]['filter']
			);
		}
	}

	return $config;
}
286
/**
 * Build the analysis config.
 *
 * Falls back to the builder's default language when no language is given,
 * then customizes the baseline config for that language.
 *
 * @param string|null $language Config language
 * @return array the analysis config
 */
public function buildConfig( $language = null ) {
	if ( $language === null ) {
		$language = $this->defaultLanguage;
	}
	$baseline = $this->defaults( $language );
	return $this->customize( $baseline, $language );
}
297
/**
 * Stop word set name for each supported language code.
 *
 * Keys are language codes, values are the stop word set names written to
 * the 'stopwords' setting of 'stop_filter'. Entries are sorted by stop
 * word set name.
 *
 * @var string[]
 */
private static $stopwords = [
	'ar' => '_arabic_',
	'hy' => '_armenian_',
	'eu' => '_basque_',
	'pt-br' => '_brazilian_',
	'bg' => '_bulgarian_',
	'ca' => '_catalan_',
	'cs' => '_czech_',
	'da' => '_danish_',
	'nl' => '_dutch_',
	'en' => '_english_',
	'en-ca' => '_english_',
	'en-gb' => '_english_',
	'simple' => '_english_',
	'fi' => '_finnish_',
	'fr' => '_french_',
	'gl' => '_galician_',
	'de' => '_german_',
	'el' => '_greek_',
	'hi' => '_hindi_',
	'hu' => '_hungarian_',
	'id' => '_indonesian_',
	'ga' => '_irish_',
	'it' => '_italian_',
	'lv' => '_latvian_',
	'lt' => '_lithuanian_',
	'nb' => '_norwegian_',
	'nn' => '_norwegian_',
	'fa' => '_persian_',
	'pt' => '_portuguese_',
	'ro' => '_romanian_',
	'ru' => '_russian_',
	'ckb' => '_sorani_',
	'es' => '_spanish_',
	'sv' => '_swedish_',
	'th' => '_thai_',
	'tr' => '_turkish_',
];
337
/**
 * Look up the stop word set name for a language.
 *
 * @param string $lang Language code
 * @return string The stop word set name, or '_none_' when the language
 *  has no entry in the stop word table
 */
private function getDefaultStopSet( $lang ) {
	if ( isset( self::$stopwords[$lang] ) ) {
		return self::$stopwords[$lang];
	}
	return '_none_';
}
345
/**
 * Tell whether a language has an entry in the stop word table.
 *
 * @param string $lang Language code
 * @return bool
 */
public static function hasStopWords( $lang ) {
	$stopSet = self::$stopwords[$lang] ?? null;
	return $stopSet !== null;
}
353	}