Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
100.00% covered (success)
100.00%
162 / 162
100.00% covered (success)
100.00%
34 / 34
CRAP
100.00% covered (success)
100.00%
1 / 1
AnalyzerBuilder
100.00% covered (success)
100.00%
162 / 162
100.00% covered (success)
100.00%
34 / 34
64
100.00% covered (success)
100.00%
1 / 1
 __construct
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withLangName
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withCharFilters
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withTokenizer
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withFilters
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withCharMap
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 withLimitedCharMap
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 withReversedNumberCharFilter
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withNumberCharFilter
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 withElision
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 withLangLowercase
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
2
 withStop
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withExtraStop
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
1
 withExtraStemmer
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withStemmerOverride
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withUnpackedAnalyzer
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 unpackedCheck
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 insertFiltersBefore
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 appendFilters
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 prependFilters
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withLightStemmer
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 omitStemmer
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withAsciifolding
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 omitFolding
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withRemoveEmpty
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withDecimalDigit
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 build
100.00% covered (success)
100.00%
71 / 71
100.00% covered (success)
100.00%
1 / 1
24
 patternFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 mappingCharFilter
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
2
 numberCharFilter
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
3
 elisionFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 stopFilterFromList
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 overrideFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 stemmerFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3namespace CirrusSearch\Maintenance;
4
5use MediaWiki\Config\ConfigException;
6
7/**
8 * Builds one elasticsearch analyzer to add to an analysis config array.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License along
21 * with this program; if not, write to the Free Software Foundation, Inc.,
22 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 * http://www.gnu.org/copyleft/gpl.html
24 */
class AnalyzerBuilder {
    /**
     * Pseudo-positions accepted by insertFiltersBefore() (and withExtraStop()):
     * the filters are added at the end (APPEND) or the beginning (PREPEND) of
     * the filter list, rather than inserted before a named filter.
     */
    public const APPEND = 1;
    public const PREPEND = 2;

    /** @var string language name; used as the prefix of default filter names */
    private $langName;

    /** @var string key under $config['analyzer'] that build() writes to */
    private $analyzerName = 'text';

    /** @var bool when false, build() drops 'icu_folding' from the filter list */
    private $icuEnabled;

    /** @var string[]|null list of char_filters */
    private $charFilters;

    /** @var string|null name of tokenizer */
    private $tokenizer = 'standard';

    /** @var string[]|null list of filters */
    private $filters;

    /** @var string[]|null list of lang-specific character filter mappings */
    private $charMap;

    /** @var bool build a 'limited_mapping' (true) or 'mapping' (false) char filter */
    private $charMapLimited = false;

    /** @var string|null name of the char filter built from $charMap */
    private $charMapName;

    /** @var int|null Unicode value for script-specific zero */
    private $langZero;

    /** @var bool should langZero's map be reversed (Arabic to non-Arabic)? */
    private $numCharMapReversed = false;

    /** @var string|null name of char filter mapping digits (using $langZero) */
    private $numCharMapName;

    /** @var bool value emitted as 'articles_case' in the elision filter */
    private $elisionArticleCase = true;

    /** @var string[]|null list of articles to elide */
    private $elisionArticles;

    /** @var string|null name of the elision filter */
    private $elisionName;

    /** @var string|null value emitted as the 'language' of the lowercase filter */
    private $langLowercase;

    /** @var mixed|null stopword _list_ or array of stopwords */
    private $customStopList;

    /** @var string|null name of the stop filter */
    private $stopName;

    /** @var mixed|null stopword _list_ or array of stopwords */
    private $extraStopList;

    /** @var string|null name of the extra stop filter */
    private $extraStopName;

    /** @var bool|null 'ignore_case' for the extra stop filter (omitted when null) */
    private $extraStopIgnoreCase;

    /** @var string|null language of the extra stemmer filter */
    private $extraStemmerLang;

    /** @var string|null name of the extra stemmer filter */
    private $extraStemmerName;

    /** @var string|string[]|null a single stemmer override rule, or a list of rules */
    private $overrideRules;

    /** @var string|null name of the stemmer override filter */
    private $overrideName;

    /**********
     * The properties below are only used by unpacked analyzers
     */

    /** @var bool */
    private $unpacked = false;

    /**
     * Ordered list of single-entry maps: position (APPEND, PREPEND, or the name
     * of the filter to insert before) => filters to insert.
     *
     * @var array<int, array<int|string, string[]>>
     */
    private $insertFilterList = [];

    /** @var bool */
    private $useStemmer = true;

    /** @var string|null */
    private $stemmerLang;

    /** @var string|null folding flavor to use (empty for none) */
    private $folding = 'icu_folding';

    /** @var string|null */
    private $removeEmpty;

    /** @var string|null */
    private $decimalDigit;

    /**
     * @param string $langName
     * @param bool $icuEnabled
     */
    public function __construct( string $langName, bool $icuEnabled = false ) {
        $this->langName = $langName;
        $this->icuEnabled = $icuEnabled;
    }

    /**
     * Change the language name. Default filter names are derived from the
     * language name at the time each with..() method is called, so names set by
     * earlier calls are not updated.
     *
     * @param string $langName
     * @return self
     */
    public function withLangName( string $langName ): self {
        $this->langName = $langName;
        return $this;
    }

    /**
     * @param string[] $charFilters
     * @return self
     */
    public function withCharFilters( array $charFilters ): self {
        $this->charFilters = $charFilters;
        return $this;
    }

    /**
     * @param string $tokenizer
     * @return self
     */
    public function withTokenizer( string $tokenizer ): self {
        $this->tokenizer = $tokenizer;
        return $this;
    }

    /**
     * @param string[] $filters
     * @return self
     */
    public function withFilters( array $filters ): self {
        $this->filters = $filters;
        return $this;
    }

    /**
     * Configure the language-specific mapping character filter.
     *
     * @param string[] $mappings
     * @param string|null $name defaults to "{langName}_charfilter"
     * @param bool $limited build a limited_mapping filter rather than a mapping filter
     * @return self
     */
    public function withCharMap( array $mappings, ?string $name = null, bool $limited = false ): self {
        $this->charMap = $mappings;
        $this->charMapName = $name ?? "{$this->langName}_charfilter";
        // Honor the caller's choice; this used to be hard-coded to false, which
        // silently turned withLimitedCharMap() into a plain withCharMap().
        $this->charMapLimited = $limited;
        return $this;
    }

    /**
     * Same as withCharMap(), but builds a limited_mapping character filter.
     *
     * @param string[] $mappings
     * @param string|null $name
     * @return self
     */
    public function withLimitedCharMap( array $mappings, ?string $name = null ): self {
        return $this->withCharMap( $mappings, $name, true );
    }

    /**
     * Same as withNumberCharFilter(), with the mapping reversed.
     *
     * @param int $langZero
     * @param string|null $name
     * @return self
     */
    public function withReversedNumberCharFilter( int $langZero, ?string $name = null ): self {
        return $this->withNumberCharFilter( $langZero, $name, true );
    }

    /**
     * @param int $langZero
     * @param string|null $name defaults to "{langName}_numbers", or
     *  "{langName}_reversed_numbers" when $reversed is set
     * @param bool $reversed reverse the mapping from Arabic to non-Arabic
     * @return self
     */
    public function withNumberCharFilter( int $langZero, ?string $name = null, bool $reversed = false ): self {
        $suffix = $reversed ? 'reversed_numbers' : 'numbers';
        $this->langZero = $langZero;
        $this->numCharMapName = $name ?? "{$this->langName}_{$suffix}";
        $this->numCharMapReversed = $reversed;
        return $this;
    }

    /**
     * @param string[] $articles "articles" to be elided
     * @param bool $articleCase whether elision is case insensitive
     * @return self
     */
    public function withElision( array $articles, bool $articleCase = true ): self {
        $this->elisionArticles = $articles;
        $this->elisionArticleCase = $articleCase;
        $this->elisionName = "{$this->langName}_elision";
        return $this;
    }

    /**
     * @param string|null $name language for the lowercase filter; null or an
     *  empty string falls back to the language name
     * @return self
     */
    public function withLangLowercase( ?string $name = null ): self {
        $this->langLowercase = $name ?: $this->langName;
        return $this;
    }

    /**
     * @param mixed $stop pre-defined list like _french_ or an array of stopwords
     * @param string|null $name defaults to "{langName}_stop"
     * @return self
     */
    public function withStop( $stop, ?string $name = null ): self {
        $this->customStopList = $stop;
        $this->stopName = $name ?? "{$this->langName}_stop";
        return $this;
    }

    /**
     * Configure a second stop filter and register it for insertion into the
     * filter list. Because it goes through insertFiltersBefore(), this is only
     * compatible with unpacked analyzers.
     *
     * @param mixed $stop pre-defined list like _french_ or an array of stopwords
     * @param string $name
     * @param mixed $beforeFilter filter to insert extra stop before
     * @param bool|null $ignoreCase
     * @return self
     */
    public function withExtraStop( $stop, string $name, $beforeFilter = self::APPEND,
            ?bool $ignoreCase = null ): self {
        $this->extraStopList = $stop;
        $this->extraStopName = $name;
        $this->extraStopIgnoreCase = $ignoreCase;
        return $this->insertFiltersBefore( $beforeFilter, [ $name ] );
    }

    /**
     * Define an additional stemmer filter. build() only adds its definition to
     * the config; it is not added to the analyzer's filter list.
     *
     * @param string $lang
     * @param string|null $name defaults to $lang
     * @return self
     */
    public function withExtraStemmer( string $lang, ?string $name = null ): self {
        $this->extraStemmerLang = $lang;
        $this->extraStemmerName = $name ?? $lang;
        return $this;
    }

    /**
     * Rules can be a single rule string, or an array of rules
     *
     * @param mixed $rules stemmer override rules
     * @param string|null $name defaults to "{langName}_override"
     * @return self
     */
    public function withStemmerOverride( $rules, ?string $name = null ): self {
        $this->overrideRules = $rules;
        $this->overrideName = $name ?? "{$this->langName}_override";
        return $this;
    }

    /**********
     * The with.., omit.., and insert.. methods below are only used by unpacked analyzers
     */

    /** @return self */
    public function withUnpackedAnalyzer(): self {
        $this->unpacked = true;
        return $this;
    }

    /**
     * Guard for methods that only make sense for unpacked analyzers. The error
     * message names the method that called this check.
     *
     * @throws ConfigException if withUnpackedAnalyzer() has not been called
     */
    private function unpackedCheck(): void {
        if ( $this->unpacked ) {
            return;
        }
        // Only the immediate caller's name is needed: skip argument capture and
        // stop after two frames.
        $caller = debug_backtrace( DEBUG_BACKTRACE_IGNORE_ARGS, 2 )[1]['function'];
        throw new ConfigException( "$caller() is only compatible with unpacked analyzers; " .
            "call withUnpackedAnalyzer() before calling $caller()." );
    }

    /**
     * @param mixed $beforeFilter specific filter to insert $filters before; use APPEND
     *                            or PREPEND to always add to beginning or end of the list
     * @param string[] $filterList list of additional filters to insert
     * @return self
     */
    public function insertFiltersBefore( $beforeFilter, array $filterList ): self {
        $this->unpackedCheck();
        $this->insertFilterList[] = [ $beforeFilter => $filterList ];
        return $this;
    }

    /**
     * @param string[] $filterList list of additional filters to append
     * @return self
     */
    public function appendFilters( array $filterList ): self {
        // checked here as well so that the error message names this method
        $this->unpackedCheck();
        return $this->insertFiltersBefore( self::APPEND, $filterList );
    }

    /**
     * @param string[] $filterList list of additional filters to prepend
     * @return self
     */
    public function prependFilters( array $filterList ): self {
        // checked here as well so that the error message names this method
        $this->unpackedCheck();
        return $this->insertFiltersBefore( self::PREPEND, $filterList );
    }

    /** @return self */
    public function withLightStemmer(): self {
        $this->unpackedCheck();
        $this->stemmerLang = "light_{$this->langName}";
        return $this;
    }

    /** @return self */
    public function omitStemmer(): self {
        $this->unpackedCheck();
        $this->useStemmer = false;
        return $this;
    }

    /** @return self */
    public function withAsciifolding(): self {
        $this->unpackedCheck();
        $this->folding = 'asciifolding';
        return $this;
    }

    /** @return self */
    public function omitFolding(): self {
        $this->unpackedCheck();
        $this->folding = '';
        return $this;
    }

    /** @return self */
    public function withRemoveEmpty(): self {
        $this->unpackedCheck();
        $this->removeEmpty = 'remove_empty';
        return $this;
    }

    /** @return self */
    public function withDecimalDigit(): self {
        $this->unpackedCheck();
        $this->decimalDigit = 'decimal_digit';
        return $this;
    }

    /**
     * Create a basic analyzer with support for various common options
     *
     * Can create various filters and character filters as specified.
     * For packed (default) analyzers none are automatically added to the
     * char_filter or filter list because the best order for these basic
     * analyzers depends on the details of various third-party plugins.
     *
     * type: custom
     * tokenizer: as per $this->tokenizer
     * char_filter: as per $this->charFilters
     * filter: as per $this->filters
     *
     * For unpacked analyzers this method appends to the builder's own
     * char_filter and filter lists, so a second call on the same builder
     * repeats those entries.
     *
     * @param mixed[] $config to be updated
     * @return mixed[] updated config
     */
    public function build( array $config ): array {
        $langStem = "{$this->langName}_stemmer";

        if ( $this->unpacked ) {
            // Analyzer config for char_filter and filter will be in the order below,
            // if the relevant filters are enabled/configured.
            //
            // type: custom
            // tokenizer: standard
            // char_filter: lang_charfilter, lang_numbers
            // filter: elision, lowercase, decimal_digit, stopwords,
            //         stemmer_override, stemmer, folding, remove_empty
            if ( $this->useStemmer ) {
                $this->stemmerLang ??= $this->langName;
            } else {
                $langStem = '';
            }

            // NOTE(review): this re-registers the stop filter under the default
            // "{langName}_stop" name, so a custom name passed to withStop() is
            // discarded for unpacked analyzers. Left as-is because changing it
            // would rename filters in generated configs -- confirm intent.
            $this->withStop( $this->customStopList ?? "_{$this->langName}_" );

            // remove icu_folding if icu plugin unavailable
            if ( $this->folding === 'icu_folding' && !$this->icuEnabled ) {
                $this->folding = '';
            }

            // build up the char_filter list--everything is optional
            $this->charFilters[] = $this->charMapName;
            $this->charFilters[] = $this->numCharMapName;

            // remove 'falsey' (== not configured) values from the list
            $this->charFilters = array_values( array_filter( $this->charFilters ) );

            // build up the filter list--lowercase and stop are always present;
            // the stemmer is present unless omitStemmer() was called
            $this->filters[] = $this->elisionName;
            $this->filters[] = 'lowercase';
            $this->filters[] = $this->decimalDigit;
            $this->filters[] = $this->stopName;
            $this->filters[] = $this->overrideName;
            $this->filters[] = $langStem;
            $this->filters[] = $this->folding;
            $this->filters[] = $this->removeEmpty;

            // remove 'falsey' (== not configured) values from the list
            $this->filters = array_values( array_filter( $this->filters ) );

            $this->applyFilterInsertions();
        } elseif ( !$this->icuEnabled && $this->filters ) {
            // for simple filter lists, remove icu_folding if ICU not enabled;
            // nothing to do when no filter list was ever configured
            $foldingIdx = array_search( 'icu_folding', $this->filters, true );
            if ( $foldingIdx !== false ) {
                array_splice( $this->filters, $foldingIdx, 1 );
            }
        }

        $config[ 'analyzer' ][ $this->analyzerName ] = [
            'type' => 'custom',
            'tokenizer' => $this->tokenizer,
        ];

        if ( $this->charMapName ) {
            $config[ 'char_filter' ][ $this->charMapName ] =
                self::mappingCharFilter( $this->charMap, $this->charMapLimited );
        }

        if ( $this->numCharMapName ) {
            $config[ 'char_filter' ][ $this->numCharMapName ] =
                self::numberCharFilter( $this->langZero, $this->numCharMapReversed );
        }

        if ( $this->elisionName ) {
            $config[ 'filter' ][ $this->elisionName ] =
                self::elisionFilter( $this->elisionArticles, $this->elisionArticleCase );
        }

        if ( $this->langLowercase ) {
            $config[ 'filter' ][ 'lowercase' ][ 'language' ] = $this->langLowercase;
        }

        if ( $this->overrideName ) {
            $config[ 'filter' ][ $this->overrideName ] =
                $this->overrideFilter( $this->overrideRules );
        }

        if ( $this->stopName ) {
            $config[ 'filter' ][ $this->stopName ] =
                self::stopFilterFromList( $this->customStopList );
        }

        if ( $this->extraStopName ) {
            $config[ 'filter' ][ $this->extraStopName ] =
                self::stopFilterFromList( $this->extraStopList, $this->extraStopIgnoreCase );
        }

        if ( $this->charFilters ) {
            $config[ 'analyzer' ][ $this->analyzerName ][ 'char_filter' ] = $this->charFilters;
        }

        if ( $this->filters ) {
            $config[ 'analyzer' ][ $this->analyzerName ][ 'filter' ] = $this->filters;
        }

        if ( $this->stemmerLang && $this->useStemmer ) {
            $config[ 'filter' ][ $langStem ] =
                self::stemmerFilter( $this->stemmerLang );
        }

        if ( $this->extraStemmerName ) {
            $config[ 'filter' ][ $this->extraStemmerName ] =
                self::stemmerFilter( $this->extraStemmerLang );
        }

        return $config;
    }

    /**
     * Apply, in registration order, every set of filters queued by
     * insertFiltersBefore() to $this->filters.
     *
     * APPEND and PREPEND add to the end or beginning of the list regardless of
     * other filters. Otherwise the filters are inserted before the named filter;
     * if no such filter exists they are prepended, but you shouldn't count on
     * that.
     */
    private function applyFilterInsertions(): void {
        foreach ( $this->insertFilterList as $filterPatch ) {
            foreach ( $filterPatch as $beforeFilter => $filterList ) {
                if ( $beforeFilter === self::APPEND ) {
                    $this->filters = array_merge( $this->filters, $filterList );
                } elseif ( $beforeFilter === self::PREPEND ) {
                    $this->filters = array_merge( $filterList, $this->filters );
                } else {
                    // array_search() returns false (not -1) when there is no
                    // match; make the fall back to position 0 explicit
                    $idx = array_search( $beforeFilter, $this->filters, true );
                    array_splice( $this->filters, $idx === false ? 0 : $idx, 0, $filterList );
                }
            }
        }
    }

    /**
     * Create a pattern_replace filter/char_filter with the mappings provided.
     *
     * @param string $pat
     * @param string $repl
     * @return mixed[] filter
     */
    public static function patternFilter( string $pat, string $repl = '' ): array {
        return [ 'type' => 'pattern_replace', 'pattern' => $pat, 'replacement' => $repl ];
    }

    /**
     * Create a mapping or limited_mapping character filter with the mappings provided.
     *
     * @param string[] $mappings
     * @param bool $limited
     * @return mixed[] character filter
     */
    public static function mappingCharFilter( array $mappings, bool $limited ): array {
        return [
            'type' => $limited ? 'limited_mapping' : 'mapping',
            'mappings' => $mappings,
        ];
    }

    /**
     * Create a character filter that maps non-Arabic digits (e.g., ០-៩ or 0-9) to
     * Arabic digits (0-9). Since they are usually all in a row, we just need the
     * starting digit (equal to 0).
     *
     * Optionally reverse the mapping from Arabic to non-Arabic. For example, the ICU
     * tokenizer works better on tokenizing Thai digits in Thai text than it does on
     * Arabic digits.
     *
     * @param int $langZero
     * @param bool $reversed reverse the mapping from Arabic to non-Arabic
     * @return mixed[] character filter
     */
    public static function numberCharFilter( int $langZero, bool $reversed = false ): array {
        $numMap = [];
        for ( $digit = 0; $digit <= 9; $digit++ ) {
            // each entry is one "from=>to" rule, with the script-specific digit
            // written as a \uXXXX escape
            $numMap[] = $reversed
                ? sprintf( '%d=>\\u%04x', $digit, $langZero + $digit )
                : sprintf( '\\u%04x=>%d', $langZero + $digit, $digit );
        }
        return self::mappingCharFilter( $numMap, true );
    }

    /**
     * Create an elision filter with the "articles" provided; $case determines whether
     * stripping is case sensitive or not
     *
     * @param string[] $articles
     * @param bool $case
     * @return mixed[] token filter
     */
    public static function elisionFilter( array $articles, bool $case = true ): array {
        return [ 'type' => 'elision', 'articles_case' => $case, 'articles' => $articles ];
    }

    /**
     * Create a stop word filter with the provided config. The config can be an array
     * of stop words, or a string like _french_ that refers to a pre-defined list.
     *
     * @param mixed $stopwords
     * @param bool|null $ignoreCase emitted as 'ignore_case' unless null
     * @return mixed[] token filter
     */
    public static function stopFilterFromList( $stopwords, ?bool $ignoreCase = null ): array {
        $stopFilter = [ 'type' => 'stop', 'stopwords' => $stopwords ];
        if ( $ignoreCase !== null ) {
            $stopFilter['ignore_case'] = $ignoreCase;
        }
        return $stopFilter;
    }

    /**
     * Create a stemming override filter with the rules provided, which can be a string
     * with one rule or an array of such rules
     *
     * @param mixed $rules
     * @return mixed[] token filter
     */
    private function overrideFilter( $rules ): array {
        return [ 'type' => 'stemmer_override', 'rules' => $rules ];
    }

    /**
     * Create a stemmer filter with the provided config.
     *
     * @param string $stemmer
     * @return mixed[] token filter
     */
    public static function stemmerFilter( string $stemmer ): array {
        return [ 'type' => 'stemmer', 'language' => $stemmer ];
    }

}