Code Coverage for /workspace/src/extensions/CirrusSearch/includes/Maintenance/AnalyzerBuilder.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	100.00% covered (success)	100.00%	162 / 162	100.00% covered (success)	100.00%	34 / 34	CRAP	100.00% covered (success)	100.00%	1 / 1
AnalyzerBuilder	100.00% covered (success)	100.00%	162 / 162	100.00% covered (success)	100.00%	34 / 34	64	100.00% covered (success)	100.00%	1 / 1
__construct	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
withLangName	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
withCharFilters	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
withTokenizer	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
withFilters	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
withCharMap	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	1
withLimitedCharMap	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
withReversedNumberCharFilter	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
withNumberCharFilter	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	2
withElision	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	1
withLangLowercase	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	2
withStop	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
withExtraStop	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	1
withExtraStemmer	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
withStemmerOverride	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
withUnpackedAnalyzer	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
unpackedCheck	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	2
insertFiltersBefore	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
appendFilters	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
prependFilters	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
withLightStemmer	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
omitStemmer	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
withAsciifolding	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
omitFolding	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
withRemoveEmpty	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
withDecimalDigit	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
build	100.00% covered (success)	100.00%	71 / 71	100.00% covered (success)	100.00%	1 / 1	24
patternFilter	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
mappingCharFilter	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	2
numberCharFilter	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	3
elisionFilter	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
stopFilterFromList	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	2
overrideFilter	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
stemmerFilter	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1

1	<?php
2
3	namespace CirrusSearch\Maintenance;
4
5	use MediaWiki\Config\ConfigException;
6
7	/**
8	* Builds one elasticsearch analyzer to add to an analysis config array.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License along
21	* with this program; if not, write to the Free Software Foundation, Inc.,
22	* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
23	* http://www.gnu.org/copyleft/gpl.html
24	*/
class AnalyzerBuilder {
	/**
	 * Indicate that filters should be automatically appended or prepended, rather
	 * than inserted before a given filter.
	 */
	public const APPEND = 1;
	public const PREPEND = 2;

	/** @var string language name; used as the prefix of generated filter names */
	private $langName;

	/** @var string key under $config['analyzer'] that build() writes to */
	private $analyzerName = 'text';

	/** @var bool whether icu_folding may be kept in the filter list */
	private $icuEnabled;

	/** @var string[]|null list of char_filters */
	private $charFilters;

	/** @var string|null name of tokenizer */
	private $tokenizer = 'standard';

	/** @var string[]|null list of filters */
	private $filters;

	/** @var string[]|null list of lang-specific character filter mappings */
	private $charMap;

	/** @var bool use a 'limited_mapping' rather than a 'mapping' char filter for $charMap */
	private $charMapLimited = false;

	/** @var string|null name of the char filter built from $charMap */
	private $charMapName;

	/** @var int|null Unicode value for script-specific zero */
	private $langZero;

	/** @var bool should langZero's map be reversed (Arabic to non-Arabic)? */
	private $numCharMapReversed = false;

	/** @var string|null name of char filter mapping digits (using $langZero) */
	private $numCharMapName;

	/** @var bool is elision processing case INsensitive? */
	private $elisionArticleCase = true;

	/** @var string[]|null list of articles to elide */
	private $elisionArticles;

	/** @var string|null name of the elision filter */
	private $elisionName;

	/** @var string|null value for the 'language' option of the lowercase filter */
	private $langLowercase;

	/** @var mixed|null stopword _list_ or array of stopwords */
	private $customStopList;

	/** @var string|null name of the stop filter */
	private $stopName;

	/** @var mixed|null stopword _list_ or array of stopwords */
	private $extraStopList;

	/** @var string|null name of the extra stop filter */
	private $extraStopName;

	/** @var bool|null 'ignore_case' value of the extra stop filter (null: option omitted) */
	private $extraStopIgnoreCase;

	/** @var string|null language of the extra stemmer filter */
	private $extraStemmerLang;

	/** @var string|null name of the extra stemmer filter */
	private $extraStemmerName;

	/** @var string|string[]|null stemmer override rule, or list of such rules */
	private $overrideRules;

	/** @var string|null name of the stemmer override filter */
	private $overrideName;

	/**********
	 * The properties below are only used by unpacked analyzers
	 */

	/** @var bool */
	private $unpacked = false;

	/**
	 * @var array<int, array<int|string, string[]>> ordered list of single-entry maps
	 *  from "insert before" target (filter name, APPEND, or PREPEND) to filters to insert
	 */
	private $insertFilterList = [];

	/** @var bool */
	private $useStemmer = true;

	/** @var string|null */
	private $stemmerLang;

	/** @var string|null folding flavor to use (null or '' for none) */
	private $folding = 'icu_folding';

	/** @var string|null */
	private $removeEmpty;

	/** @var string|null */
	private $decimalDigit;

	/**
	 * @param string $langName
	 * @param bool $icuEnabled
	 */
	public function __construct( string $langName, bool $icuEnabled = false ) {
		$this->langName = $langName;
		$this->icuEnabled = $icuEnabled;
	}

	/**
	 * Names derived from the language name by earlier with..() calls are not
	 * recomputed when the language name changes.
	 *
	 * @param string $langName
	 * @return self
	 */
	public function withLangName( string $langName ): self {
		$this->langName = $langName;
		return $this;
	}

	/**
	 * @param string[] $charFilters
	 * @return self
	 */
	public function withCharFilters( array $charFilters ): self {
		$this->charFilters = $charFilters;
		return $this;
	}

	/**
	 * @param string $tokenizer
	 * @return self
	 */
	public function withTokenizer( string $tokenizer ): self {
		$this->tokenizer = $tokenizer;
		return $this;
	}

	/**
	 * @param string[] $filters
	 * @return self
	 */
	public function withFilters( array $filters ): self {
		$this->filters = $filters;
		return $this;
	}

	/**
	 * Configure the language-specific character mapping filter.
	 *
	 * @param string[] $mappings
	 * @param string|null $name filter name; defaults to "{langName}_charfilter"
	 * @param bool $limited build a 'limited_mapping' filter instead of a 'mapping' filter
	 * @return self
	 */
	public function withCharMap( array $mappings, ?string $name = null, bool $limited = false ): self {
		$this->charMap = $mappings;
		$this->charMapName = $name ?? "{$this->langName}_charfilter";
		// Honor the caller's choice; withLimitedCharMap() relies on this.
		$this->charMapLimited = $limited;
		return $this;
	}

	/**
	 * Same as withCharMap(), but always builds a 'limited_mapping' filter.
	 *
	 * @param string[] $mappings
	 * @param string|null $name
	 * @return self
	 */
	public function withLimitedCharMap( array $mappings, ?string $name = null ): self {
		return $this->withCharMap( $mappings, $name, true );
	}

	/**
	 * Same as withNumberCharFilter(), with the mapping reversed.
	 *
	 * @param int $langZero
	 * @param string|null $name
	 * @return self
	 */
	public function withReversedNumberCharFilter( int $langZero, ?string $name = null ): self {
		return $this->withNumberCharFilter( $langZero, $name, true );
	}

	/**
	 * @param int $langZero
	 * @param string|null $name filter name; defaults to "{langName}_numbers" or
	 *  "{langName}_reversed_numbers"
	 * @param bool $reversed reverse the mapping from Arabic to non-Arabic
	 * @return self
	 */
	public function withNumberCharFilter( int $langZero, ?string $name = null, bool $reversed = false ): self {
		$suffix = $reversed ? '_reversed_numbers' : '_numbers';
		$this->langZero = $langZero;
		$this->numCharMapName = $name ?? ( $this->langName . $suffix );
		$this->numCharMapReversed = $reversed;
		return $this;
	}

	/**
	 * @param string[] $articles "articles" to be elided
	 * @param bool $articleCase whether elision is case insensitive
	 * @return self
	 */
	public function withElision( array $articles, bool $articleCase = true ): self {
		$this->elisionArticleCase = $articleCase;
		$this->elisionArticles = $articles;
		$this->elisionName = "{$this->langName}_elision";
		return $this;
	}

	/**
	 * @param string|null $name language for the lowercase filter; a null or empty
	 *  value falls back to the language name
	 * @return self
	 */
	public function withLangLowercase( ?string $name = null ): self {
		$this->langLowercase = $name ?: $this->langName;
		return $this;
	}

	/**
	 * @param mixed $stop pre-defined list like _french_ or an array of stopwords
	 * @param string|null $name filter name; defaults to "{langName}_stop"
	 * @return self
	 */
	public function withStop( $stop, ?string $name = null ): self {
		$this->customStopList = $stop;
		$this->stopName = $name ?? "{$this->langName}_stop";
		return $this;
	}

	/**
	 * Only compatible with unpacked analyzers, because the extra stop filter is
	 * placed via insertFiltersBefore().
	 *
	 * @param mixed $stop pre-defined list like _french_ or an array of stopwords
	 * @param string $name
	 * @param mixed $beforeFilter filter to insert extra stop before
	 * @param bool|null $ignoreCase
	 * @return self
	 */
	public function withExtraStop( $stop, string $name, $beforeFilter = self::APPEND,
		?bool $ignoreCase = null ): self {
		// Check before touching any state, so a failure leaves the builder unchanged
		// and the error message names this method.
		$this->unpackedCheck();
		$this->extraStopList = $stop;
		$this->extraStopName = $name;
		$this->extraStopIgnoreCase = $ignoreCase;
		return $this->insertFiltersBefore( $beforeFilter, [ $name ] );
	}

	/**
	 * @param string $lang
	 * @param string|null $name filter name; defaults to $lang
	 * @return self
	 */
	public function withExtraStemmer( string $lang, ?string $name = null ): self {
		$this->extraStemmerLang = $lang;
		$this->extraStemmerName = $name ?? $lang;
		return $this;
	}

	/**
	 * Rules can be a single rule string, or an array of rules
	 *
	 * @param mixed $rules stemmer override rules
	 * @param string|null $name filter name; defaults to "{langName}_override"
	 * @return self
	 */
	public function withStemmerOverride( $rules, ?string $name = null ): self {
		$this->overrideRules = $rules;
		$this->overrideName = $name ?? "{$this->langName}_override";
		return $this;
	}

	/**********
	 * The with.., omit.., and insert.. methods below are only used by unpacked analyzers
	 */

	/** @return self */
	public function withUnpackedAnalyzer(): self {
		$this->unpacked = true;
		return $this;
	}

	/**
	 * Guard for methods that only make sense for unpacked analyzers.
	 *
	 * @throws ConfigException if withUnpackedAnalyzer() has not been called
	 */
	private function unpackedCheck(): void {
		if ( $this->unpacked ) {
			return;
		}
		// Frame 0 is this method; frame 1 is the method that called it. Arguments
		// are not needed, so skip collecting them and stop after two frames.
		$caller = debug_backtrace( DEBUG_BACKTRACE_IGNORE_ARGS, 2 )[1]['function'];
		throw new ConfigException( "$caller() is only compatible with unpacked analyzers; " .
			"call withUnpackedAnalyzer() before calling $caller()." );
	}

	/**
	 * @param mixed $beforeFilter specific filter to insert $filters before; use APPEND
	 *  or PREPEND to always add to beginning or end of the list
	 * @param string[] $filterList list of additional filters to insert
	 * @return self
	 */
	public function insertFiltersBefore( $beforeFilter, array $filterList ): self {
		$this->unpackedCheck();
		$this->insertFilterList[] = [ $beforeFilter => $filterList ];
		return $this;
	}

	/**
	 * @param string[] $filterList list of additional filters to append
	 * @return self
	 */
	public function appendFilters( array $filterList ): self {
		// Checked here (as well as in insertFiltersBefore) so errors name this method.
		$this->unpackedCheck();
		return $this->insertFiltersBefore( self::APPEND, $filterList );
	}

	/**
	 * @param string[] $filterList list of additional filters to prepend
	 * @return self
	 */
	public function prependFilters( array $filterList ): self {
		// Checked here (as well as in insertFiltersBefore) so errors name this method.
		$this->unpackedCheck();
		return $this->insertFiltersBefore( self::PREPEND, $filterList );
	}

	/** @return self */
	public function withLightStemmer(): self {
		$this->unpackedCheck();
		$this->stemmerLang = "light_{$this->langName}";
		return $this;
	}

	/** @return self */
	public function omitStemmer(): self {
		$this->unpackedCheck();
		$this->useStemmer = false;
		return $this;
	}

	/** @return self */
	public function withAsciifolding(): self {
		$this->unpackedCheck();
		$this->folding = 'asciifolding';
		return $this;
	}

	/** @return self */
	public function omitFolding(): self {
		$this->unpackedCheck();
		$this->folding = '';
		return $this;
	}

	/** @return self */
	public function withRemoveEmpty(): self {
		$this->unpackedCheck();
		$this->removeEmpty = 'remove_empty';
		return $this;
	}

	/** @return self */
	public function withDecimalDigit(): self {
		$this->unpackedCheck();
		$this->decimalDigit = 'decimal_digit';
		return $this;
	}

	/**
	 * Create a basic analyzer with support for various common options
	 *
	 * Can create various filters and character filters as specified.
	 * For analyzers that are not unpacked, none are automatically added to the
	 * char_filter or filter list because the best order for these basic analyzers
	 * depends on the details of various third-party plugins.
	 *
	 * type: custom
	 * tokenizer: as per $this->tokenizer ('standard' unless changed)
	 * char_filter: as per $this->charFilters
	 * filter: as per $this->filters
	 *
	 * Note that this method updates the builder's own char_filter and filter lists;
	 * for unpacked analyzers the default filters are appended on every call.
	 *
	 * @param mixed[] $config to be updated
	 * @return mixed[] updated config
	 */
	public function build( array $config ): array {
		$langStem = "{$this->langName}_stemmer";

		if ( $this->unpacked ) {
			// Analyzer config for char_filter and filter will be in the order below,
			// if the relevant filters are enabled/configured.
			//
			// type: custom
			// tokenizer: standard
			// char_filter: lang_charfilter, lang_numbers
			// filter: elision, lowercase, decimal_digit, stopwords, stemmer_override,
			//   stemmer, folding, remove_empty
			if ( $this->useStemmer ) {
				$this->stemmerLang ??= $this->langName;
			} else {
				$langStem = '';
			}
			// A stop filter is required; default to the language's pre-defined list.
			// This also resets the stop filter name to its default.
			$this->withStop( $this->customStopList ?? "_{$this->langName}_" );

			// remove icu_folding if icu plugin unavailable or unwanted
			if ( $this->folding === 'icu_folding' && !$this->icuEnabled ) {
				$this->folding = '';
			}

			// build up the char_filter list--everything is optional
			$this->charFilters[] = $this->charMapName;
			$this->charFilters[] = $this->numCharMapName;

			// remove 'falsey' (== not configured) values from the list
			$this->charFilters = array_values( array_filter( $this->charFilters ) );

			// build up the filter list--lowercase, stop, and stem are required
			$this->filters[] = $this->elisionName;
			$this->filters[] = 'lowercase';
			$this->filters[] = $this->decimalDigit;
			$this->filters[] = $this->stopName;
			$this->filters[] = $this->overrideName;
			$this->filters[] = $langStem;
			$this->filters[] = $this->folding;
			$this->filters[] = $this->removeEmpty;

			// remove 'falsey' (== not configured) values from the list
			$this->filters = array_values( array_filter( $this->filters ) );

			$this->applyFilterInsertions();
		} elseif ( !$this->icuEnabled && $this->filters ) {
			// for simple filter lists, remove icu_folding if ICU not enabled; there is
			// nothing to remove when no filter list was configured
			$icuIdx = array_search( 'icu_folding', $this->filters, true );
			if ( $icuIdx !== false ) {
				array_splice( $this->filters, $icuIdx, 1 );
			}
		}

		$config[ 'analyzer' ][ $this->analyzerName ] = [
			'type' => 'custom',
			'tokenizer' => $this->tokenizer,
		];

		if ( $this->charMapName ) {
			$config[ 'char_filter' ][ $this->charMapName ] =
				self::mappingCharFilter( $this->charMap, $this->charMapLimited );
		}

		if ( $this->numCharMapName ) {
			$config[ 'char_filter' ][ $this->numCharMapName ] =
				self::numberCharFilter( $this->langZero, $this->numCharMapReversed );
		}

		if ( $this->elisionName ) {
			$config[ 'filter' ][ $this->elisionName ] =
				self::elisionFilter( $this->elisionArticles, $this->elisionArticleCase );
		}

		if ( $this->langLowercase ) {
			$config[ 'filter' ][ 'lowercase' ][ 'language' ] = $this->langLowercase;
		}

		if ( $this->overrideName ) {
			$config[ 'filter' ][ $this->overrideName ] =
				$this->overrideFilter( $this->overrideRules );
		}

		if ( $this->stopName ) {
			$config[ 'filter' ][ $this->stopName ] =
				self::stopFilterFromList( $this->customStopList );
		}

		if ( $this->extraStopName ) {
			$config[ 'filter' ][ $this->extraStopName ] =
				self::stopFilterFromList( $this->extraStopList, $this->extraStopIgnoreCase );
		}

		if ( $this->charFilters ) {
			$config[ 'analyzer' ][ $this->analyzerName ][ 'char_filter' ] = $this->charFilters;
		}

		if ( $this->filters ) {
			$config[ 'analyzer' ][ $this->analyzerName ][ 'filter' ] = $this->filters;
		}

		if ( $this->stemmerLang && $this->useStemmer ) {
			$config[ 'filter' ][ $langStem ] =
				self::stemmerFilter( $this->stemmerLang );
		}

		if ( $this->extraStemmerName ) {
			$config[ 'filter' ][ $this->extraStemmerName ] =
				self::stemmerFilter( $this->extraStemmerLang );
		}

		return $config;
	}

	/**
	 * Apply, in the order they were requested, all insertions recorded by
	 * insertFiltersBefore() to the (already assembled) filter list.
	 *
	 * Each set of filters is inserted before the specified filter. If no such
	 * filter exists, array_search() returns false and the filters are prepended,
	 * but you shouldn't count on that. APPEND and PREPEND constants can be used
	 * to add to beginning or end, regardless of other filters.
	 */
	private function applyFilterInsertions(): void {
		foreach ( $this->insertFilterList as $filterPatch ) {
			foreach ( $filterPatch as $beforeFilter => $filterList ) {
				if ( $beforeFilter === self::APPEND ) {
					$this->filters = array_merge( $this->filters, $filterList );
				} elseif ( $beforeFilter === self::PREPEND ) {
					$this->filters = array_merge( $filterList, $this->filters );
				} else {
					// Array keys that look like integers are stored as ints, so cast
					// back to string before comparing against filter names.
					$idx = array_search( (string)$beforeFilter, $this->filters, true );
					array_splice( $this->filters, $idx === false ? 0 : $idx, 0, $filterList );
				}
			}
		}
	}

	/**
	 * Create a pattern_replace filter/char_filter with the mappings provided.
	 *
	 * @param string $pat
	 * @param string $repl
	 * @return mixed[] filter
	 */
	public static function patternFilter( string $pat, string $repl = '' ): array {
		return [ 'type' => 'pattern_replace', 'pattern' => $pat, 'replacement' => $repl ];
	}

	/**
	 * Create a mapping or limited_mapping character filter with the mappings provided.
	 *
	 * @param string[] $mappings
	 * @param bool $limited
	 * @return mixed[] character filter
	 */
	public static function mappingCharFilter( array $mappings, bool $limited ): array {
		$type = $limited ? 'limited_mapping' : 'mapping';
		return [ 'type' => $type, 'mappings' => $mappings ];
	}

	/**
	 * Create a character filter that maps non-Arabic digits (e.g., ០-៩ or ０-９) to
	 * Arabic digits (0-9). Since they are usually all in a row, we just need the
	 * starting digit (equal to 0).
	 *
	 * Optionally reverse the mapping from Arabic to non-Arabic. For example, the ICU
	 * tokenizer works better on tokenizing Thai digits in Thai text than it does on
	 * Arabic digits.
	 *
	 * @param int $langZero
	 * @param bool $reversed reverse the mapping from Arabic to non-Arabic
	 * @return mixed[] character filter (always of type limited_mapping)
	 */
	public static function numberCharFilter( int $langZero, bool $reversed = false ): array {
		$numMap = [];
		for ( $digit = 0; $digit <= 9; $digit++ ) {
			$numMap[] = $reversed
				? sprintf( '%d=>\\u%04x', $digit, $langZero + $digit )
				: sprintf( '\\u%04x=>%d', $langZero + $digit, $digit );
		}
		return self::mappingCharFilter( $numMap, true );
	}

	/**
	 * Create an elision filter with the "articles" provided; $case determines whether
	 * stripping is case sensitive or not
	 *
	 * @param string[] $articles
	 * @param bool $case
	 * @return mixed[] token filter
	 */
	public static function elisionFilter( array $articles, bool $case = true ): array {
		return [ 'type' => 'elision', 'articles_case' => $case, 'articles' => $articles ];
	}

	/**
	 * Create a stop word filter with the provided config. The config can be an array
	 * of stop words, or a string like _french_ that refers to a pre-defined list.
	 *
	 * @param mixed $stopwords
	 * @param bool|null $ignoreCase 'ignore_case' value; the option is omitted when null
	 * @return mixed[] token filter
	 */
	public static function stopFilterFromList( $stopwords, ?bool $ignoreCase = null ): array {
		$retArray = [ 'type' => 'stop', 'stopwords' => $stopwords ];
		if ( $ignoreCase !== null ) {
			$retArray['ignore_case'] = $ignoreCase;
		}
		return $retArray;
	}

	/**
	 * Create a stemming override filter with the rules provided, which can be a string
	 * with one rule or an array of such rules
	 *
	 * @param mixed $rules
	 * @return mixed[] token filter
	 */
	private function overrideFilter( $rules ): array {
		return [ 'type' => 'stemmer_override', 'rules' => $rules ];
	}

	/**
	 * Create a stemmer filter with the provided config.
	 *
	 * @param string $stemmer
	 * @return mixed[] token filter
	 */
	public static function stemmerFilter( string $stemmer ): array {
		return [ 'type' => 'stemmer', 'language' => $stemmer ];
	}

}