Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
100.00% covered (success)
100.00%
153 / 153
100.00% covered (success)
100.00%
33 / 33
CRAP
100.00% covered (success)
100.00%
1 / 1
AnalyzerBuilder
100.00% covered (success)
100.00%
153 / 153
100.00% covered (success)
100.00%
33 / 33
59
100.00% covered (success)
100.00%
1 / 1
 __construct
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withCharFilters
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withTokenizer
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withFilters
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withCharMap
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 withLimitedCharMap
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 withReversedNumberCharFilter
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withNumberCharFilter
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 withElision
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 withLangLowercase
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
2
 withStop
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withExtraStop
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
1
 withExtraStemmer
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withStemmerOverride
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withUnpackedAnalyzer
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 unpackedCheck
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 insertFiltersBefore
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 appendFilters
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 prependFilters
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withLightStemmer
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 omitStemmer
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withAsciifoldingPreserve
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 omitAsciifolding
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withRemoveEmpty
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withDecimalDigit
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 build
100.00% covered (success)
100.00%
64 / 64
100.00% covered (success)
100.00%
1 / 1
20
 patternFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 mappingCharFilter
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
2
 numberCharFilter
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
3
 elisionFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 stopFilterFromList
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 overrideFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 stemmerFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3namespace CirrusSearch\Maintenance;
4
5use MediaWiki\Config\ConfigException;
6
7/**
8 * Builds one elasticsearch analyzer to add to an analysis config array.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License along
21 * with this program; if not, write to the Free Software Foundation, Inc.,
22 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 * http://www.gnu.org/copyleft/gpl.html
24 */
class AnalyzerBuilder {
    /**
     * Indicate that filters should be automatically appended or prepended, rather
     * than inserted before a given filter.
     */
    public const APPEND = 1;
    public const PREPEND = 2;

    /** @var string */
    private $langName;

    /** @var string */
    private $analyzerName;

    /** @var string[]|null list of char_filters */
    private $charFilters;

    /** @var string|null name of tokenizer */
    private $tokenizer = 'standard';

    /** @var string[]|null list of filters */
    private $filters;

    /** @var string[]|null list of lang-specific character filter mappings */
    private $charMap;

    /** @var bool whether $charMap is emitted as 'limited_mapping' rather than 'mapping' */
    private $charMapLimited = false;

    /** @var string|null */
    private $charMapName;

    /** @var int|null Unicode value for script-specific zero */
    private $langZero;

    /** @var bool should langZero's map be reversed (Arabic to non-Arabic)? */
    private $numCharMapReversed = false;

    /** @var string|null name of char filter mapping digits (using $langZero) */
    private $numCharMapName;

    /** @var bool is elision processing case INsensitive? */
    private $elisionArticleCase = true;

    /** @var string[]|null list of articles to elide */
    private $elisionArticles;

    /** @var string|null */
    private $elisionName;

    /** @var string|null */
    private $langLowercase;

    /** @var mixed|null stopword _list_ or array of stopwords */
    private $customStopList;

    /** @var string|null */
    private $stopName;

    /** @var mixed|null stopword _list_ or array of stopwords */
    private $extraStopList;

    /** @var string|null */
    private $extraStopName;

    /** @var bool|null */
    private $extraStopIgnoreCase;

    /** @var string|null */
    private $extraStemmerLang;

    /** @var string|null */
    private $extraStemmerName;

    /** @var string|string[]|null a single stemmer override rule, or a list of rules */
    private $overrideRules;

    /** @var string|null */
    private $overrideName;

    /**********
     * The properties below are only used by unpacked analyzers
     */

    /** @var bool */
    private $unpacked = false;

    /**
     * Ordered list of single-entry maps: APPEND, PREPEND, or the name of the filter
     * to insert before => list of filters to insert.
     *
     * @var array<int, array<int|string, string[]>>
     */
    private $insertFilterList = [];

    /** @var bool */
    private $useStemmer = true;

    /** @var string|null */
    private $stemmerLang;

    /** @var string|null asciifolding flavor to use (null or '' for none) */
    private $asciifolding = 'asciifolding';

    /** @var string|null */
    private $removeEmpty;

    /** @var string|null */
    private $decimalDigit;

    /**
     * @param string $langName
     * @param string $analyzerName (default to 'text')
     */
    public function __construct( string $langName, string $analyzerName = 'text' ) {
        $this->langName = $langName;
        $this->analyzerName = $analyzerName;
    }

    /**
     * @param string[] $charFilters
     * @return self
     */
    public function withCharFilters( array $charFilters ): self {
        $this->charFilters = $charFilters;
        return $this;
    }

    /**
     * @param string $tokenizer
     * @return self
     */
    public function withTokenizer( string $tokenizer ): self {
        $this->tokenizer = $tokenizer;
        return $this;
    }

    /**
     * @param string[] $filters
     * @return self
     */
    public function withFilters( array $filters ): self {
        $this->filters = $filters;
        return $this;
    }

    /**
     * Configure a language-specific mapping character filter.
     *
     * @param string[] $mappings
     * @param string|null $name filter name; defaults to "{langName}_charfilter"
     * @param bool $limited emit a 'limited_mapping' filter instead of 'mapping'
     * @return self
     */
    public function withCharMap( array $mappings, ?string $name = null, bool $limited = false ): self {
        $this->charMap = $mappings;
        $this->charMapName = $name ?? "{$this->langName}_charfilter";
        // Honor the caller's choice; withLimitedCharMap() depends on this flag.
        $this->charMapLimited = $limited;
        return $this;
    }

    /**
     * Same as withCharMap(), but always produces a 'limited_mapping' filter.
     *
     * @param string[] $mappings
     * @param string|null $name filter name; defaults to "{langName}_charfilter"
     * @return self
     */
    public function withLimitedCharMap( array $mappings, ?string $name = null ): self {
        return $this->withCharMap( $mappings, $name, true );
    }

    /**
     * Same as withNumberCharFilter(), but with the mapping reversed.
     *
     * @param int $langZero
     * @param string|null $name filter name; defaults to "{langName}_reversed_numbers"
     * @return self
     */
    public function withReversedNumberCharFilter( int $langZero, ?string $name = null ): self {
        return $this->withNumberCharFilter( $langZero, $name, true );
    }

    /**
     * @param int $langZero
     * @param string|null $name filter name; defaults to "{langName}_numbers", or
     *                          "{langName}_reversed_numbers" when $reversed is set
     * @param bool $reversed reverse the mapping from Arabic to non-Arabic
     * @return self
     */
    public function withNumberCharFilter( int $langZero, ?string $name = null, bool $reversed = false ): self {
        $defName = $reversed ? "{$this->langName}_reversed_numbers" : "{$this->langName}_numbers";
        $this->langZero = $langZero;
        $this->numCharMapName = $name ?? $defName;
        $this->numCharMapReversed = $reversed;
        return $this;
    }

    /**
     * @param string[] $articles "articles" to be elided
     * @param bool $articleCase whether elision is case insensitive
     * @return self
     */
    public function withElision( array $articles, bool $articleCase = true ): self {
        $this->elisionArticleCase = $articleCase;
        $this->elisionArticles = $articles;
        $this->elisionName = "{$this->langName}_elision";
        return $this;
    }

    /**
     * @param string|null $name language for the lowercase filter; null or an empty
     *                          string falls back to the builder's language name
     * @return self
     */
    public function withLangLowercase( ?string $name = null ): self {
        $this->langLowercase = $name ?: $this->langName;
        return $this;
    }

    /**
     * @param mixed $stop pre-defined list like _french_ or an array of stopwords
     * @param string|null $name filter name; defaults to "{langName}_stop"
     * @return self
     */
    public function withStop( $stop, ?string $name = null ): self {
        $this->customStopList = $stop;
        $this->stopName = $name ?? "{$this->langName}_stop";
        return $this;
    }

    /**
     * Only compatible with unpacked analyzers, because the extra stop filter is
     * registered through insertFiltersBefore().
     *
     * @param mixed $stop pre-defined list like _french_ or an array of stopwords
     * @param string $name
     * @param mixed $beforeFilter filter to insert extra stop before
     * @param bool|null $ignoreCase
     * @return self
     * @throws ConfigException if withUnpackedAnalyzer() has not been called
     */
    public function withExtraStop( $stop, string $name, $beforeFilter = self::APPEND,
            ?bool $ignoreCase = null ): self {
        $this->extraStopList = $stop;
        $this->extraStopName = $name;
        $this->extraStopIgnoreCase = $ignoreCase;
        $this->insertFiltersBefore( $beforeFilter, [ $name ] );
        return $this;
    }

    /**
     * @param string $lang
     * @param string|null $name filter name; defaults to $lang
     * @return self
     */
    public function withExtraStemmer( string $lang, ?string $name = null ): self {
        $this->extraStemmerLang = $lang;
        $this->extraStemmerName = $name ?? $lang;
        return $this;
    }

    /**
     * Rules can be a single rule string, or an array of rules
     *
     * @param mixed $rules stemmer override rules
     * @param string|null $name filter name; defaults to "{langName}_override"
     * @return self
     */
    public function withStemmerOverride( $rules, ?string $name = null ): self {
        $this->overrideRules = $rules;
        $this->overrideName = $name ?? "{$this->langName}_override";
        return $this;
    }

    /**********
     * The with.., omit.., and insert.. methods below are only used by unpacked analyzers
     */

    /** @return self */
    public function withUnpackedAnalyzer(): self {
        $this->unpacked = true;
        return $this;
    }

    /**
     * Guard for methods that only make sense on unpacked analyzers.
     *
     * @throws ConfigException naming the calling method, if the builder has not
     *         been switched to unpacked mode
     */
    private function unpackedCheck(): void {
        if ( $this->unpacked ) {
            return;
        }
        // Frame 0 is this method and frame 1 is the method that called it; only the
        // function name is needed, so skip argument collection and deeper frames.
        $caller = debug_backtrace( DEBUG_BACKTRACE_IGNORE_ARGS, 2 )[1]['function'];
        throw new ConfigException( "$caller() is only compatible with unpacked analyzers; " .
            "call withUnpackedAnalyzer() before calling $caller()." );
    }

    /**
     * @param mixed $beforeFilter specific filter to insert $filters before; use APPEND
     *                            or PREPEND to always add to beginning or end of the list
     * @param string[] $filterList list of additional filters to insert
     * @return self
     * @throws ConfigException if withUnpackedAnalyzer() has not been called
     */
    public function insertFiltersBefore( $beforeFilter, array $filterList ): self {
        $this->unpackedCheck();
        $this->insertFilterList[] = [ $beforeFilter => $filterList ];
        return $this;
    }

    /**
     * @param string[] $filterList list of additional filters to append
     * @return self
     * @throws ConfigException if withUnpackedAnalyzer() has not been called
     */
    public function appendFilters( array $filterList ): self {
        // Checked here as well so the exception names this method as the caller.
        $this->unpackedCheck();
        return $this->insertFiltersBefore( self::APPEND, $filterList );
    }

    /**
     * @param string[] $filterList list of additional filters to prepend
     * @return self
     * @throws ConfigException if withUnpackedAnalyzer() has not been called
     */
    public function prependFilters( array $filterList ): self {
        // Checked here as well so the exception names this method as the caller.
        $this->unpackedCheck();
        return $this->insertFiltersBefore( self::PREPEND, $filterList );
    }

    /**
     * @return self
     * @throws ConfigException if withUnpackedAnalyzer() has not been called
     */
    public function withLightStemmer(): self {
        $this->unpackedCheck();
        $this->stemmerLang = "light_{$this->langName}";
        return $this;
    }

    /**
     * @return self
     * @throws ConfigException if withUnpackedAnalyzer() has not been called
     */
    public function omitStemmer(): self {
        $this->unpackedCheck();
        $this->useStemmer = false;
        return $this;
    }

    /**
     * @return self
     * @throws ConfigException if withUnpackedAnalyzer() has not been called
     */
    public function withAsciifoldingPreserve(): self {
        $this->unpackedCheck();
        $this->asciifolding = 'asciifolding_preserve';
        return $this;
    }

    /**
     * @return self
     * @throws ConfigException if withUnpackedAnalyzer() has not been called
     */
    public function omitAsciifolding(): self {
        $this->unpackedCheck();
        // An empty name is dropped when build() filters out unconfigured entries.
        $this->asciifolding = '';
        return $this;
    }

    /**
     * @return self
     * @throws ConfigException if withUnpackedAnalyzer() has not been called
     */
    public function withRemoveEmpty(): self {
        $this->unpackedCheck();
        $this->removeEmpty = 'remove_empty';
        return $this;
    }

    /**
     * @return self
     * @throws ConfigException if withUnpackedAnalyzer() has not been called
     */
    public function withDecimalDigit(): self {
        $this->unpackedCheck();
        $this->decimalDigit = 'decimal_digit';
        return $this;
    }

    /**
     * Create a basic analyzer with support for various common options
     *
     * Can create various filters and character filters as specified.
     * For packed (default) analyzers, none are automatically added to the
     * char_filter or filter list because the best order for these basic
     * analyzers depends on the details of various third-party plugins.
     * For unpacked analyzers the lists are assembled in a fixed order; see
     * unpackedCharFilters() and unpackedFilters().
     *
     * type: custom
     * tokenizer: as per $this->tokenizer ('standard' unless overridden)
     * char_filter: as per $this->charFilters
     * filter: as per $this->filters
     *
     * The configured char_filter and filter lists are not modified, so build()
     * may be called more than once with the same result.
     *
     * @param mixed[] $config to be updated
     * @return mixed[] updated config
     */
    public function build( array $config ): array {
        $langStem = "{$this->langName}_stemmer";
        $charFilters = $this->charFilters;
        $filters = $this->filters;

        if ( $this->unpacked ) {
            if ( $this->useStemmer ) {
                $this->stemmerLang ??= $this->langName;
            } else {
                // empty name == not configured; dropped from the filter list below
                $langStem = '';
            }
            // Stop words are required for unpacked analyzers; fall back to the
            // pre-defined list for the language. NOTE: withStop() is re-invoked
            // without a name here, so the stop filter is always named
            // "{langName}_stop" in unpacked mode.
            $this->withStop( $this->customStopList ?? "_{$this->langName}_" );

            $charFilters = $this->unpackedCharFilters();
            $filters = $this->unpackedFilters( $langStem );
        }

        $config[ 'analyzer' ][ $this->analyzerName ] = [
            'type' => 'custom',
            'tokenizer' => $this->tokenizer,
        ];

        if ( $this->charMapName ) {
            $config[ 'char_filter' ][ $this->charMapName ] =
                self::mappingCharFilter( $this->charMap, $this->charMapLimited );
        }

        if ( $this->numCharMapName ) {
            $config[ 'char_filter' ][ $this->numCharMapName ] =
                self::numberCharFilter( $this->langZero, $this->numCharMapReversed );
        }

        if ( $this->elisionName ) {
            $config[ 'filter' ][ $this->elisionName ] =
                self::elisionFilter( $this->elisionArticles, $this->elisionArticleCase );
        }

        if ( $this->langLowercase ) {
            $config[ 'filter' ][ 'lowercase' ][ 'language' ] = $this->langLowercase;
        }

        if ( $this->overrideName ) {
            $config[ 'filter' ][ $this->overrideName ] =
                self::overrideFilter( $this->overrideRules );
        }

        if ( $this->stopName ) {
            $config[ 'filter' ][ $this->stopName ] =
                self::stopFilterFromList( $this->customStopList );
        }

        if ( $this->extraStopName ) {
            $config[ 'filter' ][ $this->extraStopName ] =
                self::stopFilterFromList( $this->extraStopList, $this->extraStopIgnoreCase );
        }

        if ( $charFilters ) {
            $config[ 'analyzer' ][ $this->analyzerName ][ 'char_filter' ] = $charFilters;
        }

        if ( $filters ) {
            $config[ 'analyzer' ][ $this->analyzerName ][ 'filter' ] = $filters;
        }

        if ( $this->stemmerLang && $this->useStemmer ) {
            $config[ 'filter' ][ $langStem ] =
                self::stemmerFilter( $this->stemmerLang );
        }

        if ( $this->extraStemmerName ) {
            $config[ 'filter' ][ $this->extraStemmerName ] =
                self::stemmerFilter( $this->extraStemmerLang );
        }

        return $config;
    }

    /**
     * Assemble the char_filter list for an unpacked analyzer: any explicitly
     * configured char filters, then lang_charfilter, then lang_numbers.
     * Everything is optional; unconfigured entries are dropped.
     *
     * @return string[]
     */
    private function unpackedCharFilters(): array {
        $charFilters = $this->charFilters ?? [];
        $charFilters[] = $this->charMapName;
        $charFilters[] = $this->numCharMapName;

        // remove 'falsey' (== not configured) values from the list
        return array_values( array_filter( $charFilters ) );
    }

    /**
     * Assemble the filter list for an unpacked analyzer.
     *
     * Any explicitly configured filters come first, followed by (when configured):
     * elision, lowercase, decimal_digit, stopwords, stemmer_override, stemmer,
     * asciifolding, remove_empty. The insertions recorded by insertFiltersBefore()
     * are then applied in the order they were requested.
     *
     * @param string $langStem name of the stemmer filter, or '' to omit it
     * @return string[]
     */
    private function unpackedFilters( string $langStem ): array {
        $filters = $this->filters ?? [];
        array_push(
            $filters,
            $this->elisionName,
            'lowercase',
            $this->decimalDigit,
            $this->stopName,
            $this->overrideName,
            $langStem,
            $this->asciifolding,
            $this->removeEmpty
        );

        // remove 'falsey' (== not configured) values from the list
        $filters = array_values( array_filter( $filters ) );

        foreach ( $this->insertFilterList as $filterPatch ) {
            foreach ( $filterPatch as $beforeFilter => $filterList ) {
                if ( $beforeFilter === self::APPEND ) {
                    $filters = array_merge( $filters, $filterList );
                } elseif ( $beforeFilter === self::PREPEND ) {
                    $filters = array_merge( $filterList, $filters );
                } else {
                    // array_search() returns false when the target filter is absent;
                    // the filters are then prepended, but callers shouldn't count on
                    // that. Array keys that look like integers come back as ints,
                    // hence the cast before the strict search.
                    $idx = array_search( (string)$beforeFilter, $filters, true );
                    array_splice( $filters, $idx === false ? 0 : $idx, 0, $filterList );
                }
            }
        }

        return $filters;
    }

    /**
     * Create a pattern_replace filter/char_filter with the mappings provided.
     *
     * @param string $pat
     * @param string $repl
     * @return mixed[] filter
     */
    public static function patternFilter( string $pat, string $repl = '' ): array {
        return [ 'type' => 'pattern_replace', 'pattern' => $pat, 'replacement' => $repl ];
    }

    /**
     * Create a mapping or limited_mapping character filter with the mappings provided.
     *
     * @param string[] $mappings
     * @param bool $limited
     * @return mixed[] character filter
     */
    public static function mappingCharFilter( array $mappings, bool $limited ): array {
        $type = $limited ? 'limited_mapping' : 'mapping';
        return [ 'type' => $type, 'mappings' => $mappings ];
    }

    /**
     * Create a character filter that maps non-Arabic digits (e.g., ០-៩ or 0-9) to
     * Arabic digits (0-9). Since they are usually all in a row, we just need the
     * starting digit (equal to 0).
     *
     * Optionally reverse the mapping from Arabic to non-Arabic. For example, the ICU
     * tokenizer works better on tokenizing Thai digits in Thai text than it does on
     * Arabic digits.
     *
     * The result is always a limited_mapping character filter.
     *
     * @param int $langZero
     * @param bool $reversed reverse the mapping from Arabic to non-Arabic
     * @return mixed[] character filter
     */
    public static function numberCharFilter( int $langZero, bool $reversed = false ): array {
        $numMap = [];
        for ( $i = 0; $i <= 9; $i++ ) {
            if ( $reversed ) {
                $numMap[] = sprintf( '%d=>\\u%04x', $i, $langZero + $i );
            } else {
                $numMap[] = sprintf( '\\u%04x=>%d', $langZero + $i, $i );
            }
        }
        return self::mappingCharFilter( $numMap, true );
    }

    /**
     * Create an elision filter with the "articles" provided; $case is passed
     * through as the filter's articles_case setting
     *
     * @param string[] $articles
     * @param bool $case
     * @return mixed[] token filter
     */
    public static function elisionFilter( array $articles, bool $case = true ): array {
        return [ 'type' => 'elision', 'articles_case' => $case, 'articles' => $articles ];
    }

    /**
     * Create a stop word filter with the provided config. The config can be an array
     * of stop words, or a string like _french_ that refers to a pre-defined list.
     *
     * @param mixed $stopwords
     * @param bool|null $ignoreCase when null, ignore_case is left out of the filter
     * @return mixed[] token filter
     */
    public static function stopFilterFromList( $stopwords, ?bool $ignoreCase = null ): array {
        $retArray = [ 'type' => 'stop', 'stopwords' => $stopwords ];
        if ( $ignoreCase !== null ) {
            $retArray['ignore_case'] = $ignoreCase;
        }
        return $retArray;
    }

    /**
     * Create an stemming override filter with the rules provided, which can be a string
     * with one rule or an array of such rules
     *
     * @param mixed $rules
     * @return mixed[] token filter
     */
    public static function overrideFilter( $rules ): array {
        return [ 'type' => 'stemmer_override', 'rules' => $rules ];
    }

    /**
     * Create a stemmer filter with the provided config.
     *
     * @param string $stemmer
     * @return mixed[] token filter
     */
    public static function stemmerFilter( string $stemmer ): array {
        return [ 'type' => 'stemmer', 'language' => $stemmer ];
    }

}