Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
100.00% covered (success)
100.00%
196 / 196
100.00% covered (success)
100.00%
35 / 35
CRAP
100.00% covered (success)
100.00%
1 / 1
AnalyzerBuilder
100.00% covered (success)
100.00%
196 / 196
100.00% covered (success)
100.00%
35 / 35
68
100.00% covered (success)
100.00%
1 / 1
 __construct
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withLangName
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withCharFilters
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withTokenizer
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withFilters
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withCharMap
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 withLimitedCharMap
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 withInvisCharMap
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withReversedNumberCharFilter
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withNumberCharFilter
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 withElision
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 withLangLowercase
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
2
 withStop
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withExtraStop
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
1
 withExtraStemmer
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withStemmerOverride
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withUnpackedAnalyzer
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 unpackedCheck
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 insertFiltersBefore
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 appendFilters
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 prependFilters
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withLightStemmer
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 omitStemmer
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withAsciifolding
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 omitFolding
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withRemoveEmpty
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withDecimalDigit
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 build
100.00% covered (success)
100.00%
103 / 103
100.00% covered (success)
100.00%
1 / 1
27
 patternFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 mappingCharFilter
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
2
 numberCharFilter
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
3
 elisionFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 stopFilterFromList
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 overrideFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 stemmerFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3namespace CirrusSearch\Maintenance;
4
5use MediaWiki\Config\ConfigException;
6
/**
 * Builds one search analyzer to add to an analysis config array.
 *
 * Configure the builder through its fluent with*(), omit*() and *Filters()
 * methods, then call build() to write the analyzer (and the filter and
 * char_filter definitions it needs) into a config array.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */
class AnalyzerBuilder {
    /**
     * Indicate that filters should be automatically appended or prepended, rather
     * than inserted before a given filter.
     */
    public const APPEND = 1;
    public const PREPEND = 2;

    /** @var string */
    private $langName;

    /** @var string */
    private $analyzerName = 'text';

    /** @var bool */
    private $icuEnabled;

    /** @var string[]|null list of char_filters */
    private $charFilters;

    /** @var string|null name of tokenizer */
    private $tokenizer = 'standard';

    /** @var string[]|null list of filters */
    private $filters;

    /** @var string[]|null list of lang-specific character filter mappings */
    private $charMap;

    /** @var bool */
    private $charMapLimited = false;

    /** @var string|null */
    private $charMapName;

    /** @var int|null Unicode value for script-specific zero */
    private $langZero;

    /** @var bool should langZero's map be reversed (Arabic to non-Arabic)? */
    private $numCharMapReversed = false;

    /** @var string|null name of char filter mapping digits (using $langZero) */
    private $numCharMapName;

    /** @var string|null name of char filter for cleaning up invisibles */
    private $invisCharMapName;

    /** @var bool is elision processing case INsensitive? */
    private $elisionArticleCase = true;

    /** @var string[]|null list of articles to elide */
    private $elisionArticles;

    /** @var string|null */
    private $elisionName;

    /** @var string|null */
    private $langLowercase;

    /** @var mixed|null stopword _list_ or array of stopwords */
    private $customStopList;

    /** @var string|null */
    private $stopName;

    /** @var mixed|null stopword _list_ or array of stopwords */
    private $extraStopList;

    /** @var string|null */
    private $extraStopName;

    /** @var bool|null */
    private $extraStopIgnoreCase;

    /** @var string|null */
    private $extraStemmerLang;

    /** @var string|null */
    private $extraStemmerName;

    /** @var string|string[]|null a single stemmer override rule, or a list of rules */
    private $overrideRules;

    /** @var string|null */
    private $overrideName;

    /**********
     * The properties below are only used by unpacked analyzers
     */

    /** @var bool */
    private $unpacked = false;

    /**
     * Ordered list of single-entry patches: anchor (filter name, APPEND or
     * PREPEND) => filters to insert.
     *
     * @var array<int, array<int|string, string[]>>
     */
    private $insertFilterList = [];

    /** @var bool */
    private $useStemmer = true;

    /** @var string|null */
    private $stemmerLang;

    /** @var string|null folding flavor to use (null for none) */
    private $folding = 'icu_folding';

    /** @var string|null */
    private $removeEmpty;

    /** @var string|null */
    private $decimalDigit;

    /**
     * @param string $langName language name, used to derive default filter names
     * @param bool $icuEnabled whether icu_folding may be kept in the filter list
     */
    public function __construct( string $langName, bool $icuEnabled = false ) {
        $this->langName = $langName;
        $this->icuEnabled = $icuEnabled;
    }

    /**
     * Replace the language name. Names already derived from the previous
     * language name by earlier with*() calls are not recomputed.
     *
     * @param string $langName
     * @return self
     */
    public function withLangName( string $langName ): self {
        $this->langName = $langName;
        return $this;
    }

    /**
     * @param string[] $charFilters
     * @return self
     */
    public function withCharFilters( array $charFilters ): self {
        $this->charFilters = $charFilters;
        return $this;
    }

    /**
     * @param string $tokenizer
     * @return self
     */
    public function withTokenizer( string $tokenizer ): self {
        $this->tokenizer = $tokenizer;
        return $this;
    }

    /**
     * @param string[] $filters
     * @return self
     */
    public function withFilters( array $filters ): self {
        $this->filters = $filters;
        return $this;
    }

    /**
     * @param string[] $mappings
     * @param string|null $name defaults to "{langName}_charfilter"
     * @param bool $limited use a limited_mapping filter rather than a mapping filter
     * @return self
     */
    public function withCharMap( array $mappings, ?string $name = null, bool $limited = false ): self {
        $this->charMap = $mappings;
        $this->charMapName = $name ?? "{$this->langName}_charfilter";
        $this->charMapLimited = $limited;
        return $this;
    }

    /**
     * @param string[] $mappings
     * @param string|null $name
     * @return self
     */
    public function withLimitedCharMap( array $mappings, ?string $name = null ): self {
        return $this->withCharMap( $mappings, $name, true );
    }

    /**
     * @param string|null $name
     * @return self
     */
    public function withInvisCharMap( ?string $name = 'invis_cleanup' ): self {
        $this->invisCharMapName = $name;
        return $this;
    }

    /**
     * @param int $langZero
     * @param string|null $name
     * @return self
     */
    public function withReversedNumberCharFilter( int $langZero, ?string $name = null ): self {
        return $this->withNumberCharFilter( $langZero, $name, true );
    }

    /**
     * @param int $langZero
     * @param string|null $name defaults to "{langName}_numbers", or
     *  "{langName}_reversed_numbers" when $reversed is set
     * @param bool $reversed reverse the mapping from Arabic to non-Arabic
     * @return self
     */
    public function withNumberCharFilter( int $langZero, ?string $name = null, bool $reversed = false ): self {
        $defName = $reversed ? "{$this->langName}_reversed_numbers" : "{$this->langName}_numbers";
        $this->langZero = $langZero;
        $this->numCharMapName = $name ?? $defName;
        $this->numCharMapReversed = $reversed;
        return $this;
    }

    /**
     * @param string[] $articles "articles" to be elided
     * @param bool $articleCase whether elision is case insensitive
     * @return self
     */
    public function withElision( array $articles, bool $articleCase = true ): self {
        $this->elisionArticleCase = $articleCase;
        $this->elisionArticles = $articles;
        $this->elisionName = "{$this->langName}_elision";
        return $this;
    }

    /**
     * @param string|null $name language for the lowercase filter; any falsy value
     *  (null or an empty string) falls back to the builder's language name
     * @return self
     */
    public function withLangLowercase( ?string $name = null ): self {
        $this->langLowercase = $name ?: $this->langName;
        return $this;
    }

    /**
     * @param mixed $stop pre-defined list like _french_ or an array of stopwords
     * @param string|null $name defaults to "{langName}_stop"
     * @return self
     */
    public function withStop( $stop, ?string $name = null ): self {
        $this->customStopList = $stop;
        $this->stopName = $name ?? "{$this->langName}_stop";
        return $this;
    }

    /**
     * Only compatible with unpacked analyzers.
     *
     * @param mixed $stop pre-defined list like _french_ or an array of stopwords
     * @param string $name
     * @param mixed $beforeFilter filter to insert extra stop before
     * @param bool|null $ignoreCase
     * @return self
     */
    public function withExtraStop( $stop, string $name, $beforeFilter = self::APPEND,
            ?bool $ignoreCase = null ): self {
        // Check before touching any state, so that a failure leaves the builder
        // unchanged and the error message names this method.
        $this->unpackedCheck();
        $this->extraStopList = $stop;
        $this->extraStopName = $name;
        $this->extraStopIgnoreCase = $ignoreCase;
        $this->insertFiltersBefore( $beforeFilter, [ $name ] );
        return $this;
    }

    /**
     * @param string $lang
     * @param string|null $name defaults to $lang
     * @return self
     */
    public function withExtraStemmer( string $lang, ?string $name = null ): self {
        $this->extraStemmerLang = $lang;
        $this->extraStemmerName = $name ?? $lang;
        return $this;
    }

    /**
     * Rules can be a single rule string, or an array of rules
     *
     * @param mixed $rules stemmer override rules
     * @param string|null $name defaults to "{langName}_override"
     * @return self
     */
    public function withStemmerOverride( $rules, ?string $name = null ): self {
        $this->overrideRules = $rules;
        $this->overrideName = $name ?? "{$this->langName}_override";
        return $this;
    }

    /**
     * Switch to an unpacked analyzer, whose filter list is assembled by build().
     *
     * @return self
     */
    public function withUnpackedAnalyzer(): self {
        $this->unpacked = true;
        return $this;
    }

    /**
     * Throw unless withUnpackedAnalyzer() has been called. The error message names
     * the method that called this check.
     */
    private function unpackedCheck(): void {
        if ( $this->unpacked ) {
            return;
        }
        // Frame 0 is this method and frame 1 is its caller; nothing else is needed.
        $caller = debug_backtrace( DEBUG_BACKTRACE_IGNORE_ARGS, 2 )[1]['function'];
        throw new ConfigException( "$caller() is only compatible with unpacked analyzers; " .
            "call withUnpackedAnalyzer() before calling $caller()." );
    }

    /**
     * @param mixed $beforeFilter specific filter to insert $filters before; use APPEND
     *                            or PREPEND to always add to beginning or end of the list
     * @param string[] $filterList list of additional filters to insert
     * @return self
     */
    public function insertFiltersBefore( $beforeFilter, array $filterList ): self {
        $this->unpackedCheck();
        $this->insertFilterList[] = [ $beforeFilter => $filterList ];
        return $this;
    }

    /**
     * @param string[] $filterList list of additional filters to append
     * @return self
     */
    public function appendFilters( array $filterList ): self {
        // checked here as well so that the error message names appendFilters()
        $this->unpackedCheck();
        return $this->insertFiltersBefore( self::APPEND, $filterList );
    }

    /**
     * @param string[] $filterList list of additional filters to prepend
     * @return self
     */
    public function prependFilters( array $filterList ): self {
        // checked here as well so that the error message names prependFilters()
        $this->unpackedCheck();
        return $this->insertFiltersBefore( self::PREPEND, $filterList );
    }

    /**
     * Use the "light_{langName}" stemmer. Only compatible with unpacked analyzers.
     *
     * @return self
     */
    public function withLightStemmer(): self {
        $this->unpackedCheck();
        $this->stemmerLang = "light_{$this->langName}";
        return $this;
    }

    /**
     * Leave the stemmer out. Only compatible with unpacked analyzers.
     *
     * @return self
     */
    public function omitStemmer(): self {
        $this->unpackedCheck();
        $this->useStemmer = false;
        return $this;
    }

    /**
     * Use asciifolding instead of icu_folding. Only compatible with unpacked analyzers.
     *
     * @return self
     */
    public function withAsciifolding(): self {
        $this->unpackedCheck();
        $this->folding = 'asciifolding';
        return $this;
    }

    /**
     * Leave the folding filter out. Only compatible with unpacked analyzers.
     *
     * @return self
     */
    public function omitFolding(): self {
        $this->unpackedCheck();
        $this->folding = '';
        return $this;
    }

    /**
     * Add the remove_empty filter. Only compatible with unpacked analyzers.
     *
     * @return self
     */
    public function withRemoveEmpty(): self {
        $this->unpackedCheck();
        $this->removeEmpty = 'remove_empty';
        return $this;
    }

    /**
     * Add the decimal_digit filter. Only compatible with unpacked analyzers.
     *
     * @return self
     */
    public function withDecimalDigit(): self {
        $this->unpackedCheck();
        $this->decimalDigit = 'decimal_digit';
        return $this;
    }

    /**
     * Create a basic analyzer with support for various common options
     *
     * Can create various filters and character filters as specified.
     * For analyzers that are not unpacked, none are automatically added
     * to the char_filter or filter list because the best order for these
     * basic analyzers depends on the details of various third-party plugins.
     *
     * type: custom
     * tokenizer: as per $this->tokenizer (standard by default)
     * char_filter: as per $this->charFilters
     * filter: as per $this->filters
     *
     * The builder itself is not modified, so calling build() repeatedly gives
     * the same result each time.
     *
     * @param mixed[] $config to be updated
     * @return mixed[] updated config
     */
    public function build( array $config ): array {
        $langStem = "{$this->langName}_stemmer";
        $stemmerLang = $this->stemmerLang;
        $stopList = $this->customStopList;
        $stopName = $this->stopName;
        // Work on copies: appending to the properties would duplicate entries if
        // build() were called again, and the lists may never have been set (null).
        $charFilters = $this->charFilters ?? [];
        $filters = $this->filters ?? [];

        if ( $this->unpacked ) {
            // Analyzer config for char_filter and filter will be in the order below,
            // if the relevant filters are enabled/configured.
            //
            // type: custom
            // tokenizer: standard
            // char_filter: lang_charfilter, lang_numbers
            // filter: elision, lowercase, decimal_digit, stopwords, stemmer_override,
            //         stemmer, folding, remove_empty
            if ( $this->useStemmer ) {
                $stemmerLang ??= $this->langName;
            } else {
                $langStem = '';
            }

            // A stop filter is required; fall back to the language's pre-defined list.
            // NOTE(review): the default name is always used here, so a custom name
            // given to withStop() is discarded for unpacked analyzers. Kept as-is
            // because changing it would rename filters in generated configs.
            $stopList ??= "_{$this->langName}_";
            $stopName = "{$this->langName}_stop";

            // remove icu_folding if icu plugin unavailable or unwanted
            $folding = $this->folding;
            if ( $folding === 'icu_folding' && !$this->icuEnabled ) {
                $folding = '';
            }

            // build up the char_filter list--everything is optional
            $charFilters[] = $this->charMapName;
            $charFilters[] = $this->numCharMapName;

            // remove 'falsey' (== not configured) values from the list
            $charFilters = array_values( array_filter( $charFilters ) );

            // build up the filter list--lowercase and stop are always present, and
            // the stemmer too unless omitStemmer() was called
            $filters[] = $this->elisionName;
            $filters[] = 'lowercase';
            $filters[] = $this->decimalDigit;
            $filters[] = $stopName;
            $filters[] = $this->overrideName;
            $filters[] = $langStem;
            $filters[] = $folding;
            $filters[] = $this->removeEmpty;

            // remove 'falsey' (== not configured) values from the list
            $filters = array_values( array_filter( $filters ) );

            $filters = $this->applyFilterInsertions( $filters );
        } elseif ( !$this->icuEnabled ) {
            // for simple filter lists, remove icu_folding if ICU not enabled
            $foldingIdx = array_search( 'icu_folding', $filters, true );
            if ( $foldingIdx !== false ) {
                array_splice( $filters, $foldingIdx, 1 );
            }
        }

        $config[ 'analyzer' ][ $this->analyzerName ] = [
            'type' => 'custom',
            'tokenizer' => $this->tokenizer,
        ];

        if ( $this->charMapName ) {
            $config[ 'char_filter' ][ $this->charMapName ] =
                self::mappingCharFilter( $this->charMap, $this->charMapLimited );
        }

        if ( $this->numCharMapName ) {
            $config[ 'char_filter' ][ $this->numCharMapName ] =
                self::numberCharFilter( $this->langZero, $this->numCharMapReversed );
        }

        if ( $this->invisCharMapName ) {
            $config[ 'char_filter' ][ $this->invisCharMapName ] = self::invisCharFilter();
        }

        if ( $this->elisionName ) {
            $config[ 'filter' ][ $this->elisionName ] =
                self::elisionFilter( $this->elisionArticles, $this->elisionArticleCase );
        }

        if ( $this->langLowercase ) {
            $config[ 'filter' ][ 'lowercase' ][ 'language' ] = $this->langLowercase;
        }

        if ( $this->overrideName ) {
            $config[ 'filter' ][ $this->overrideName ] =
                self::overrideFilter( $this->overrideRules );
        }

        if ( $stopName ) {
            $config[ 'filter' ][ $stopName ] = self::stopFilterFromList( $stopList );
        }

        if ( $this->extraStopName ) {
            $config[ 'filter' ][ $this->extraStopName ] =
                self::stopFilterFromList( $this->extraStopList, $this->extraStopIgnoreCase );
        }

        if ( $charFilters ) {
            $config[ 'analyzer' ][ $this->analyzerName ][ 'char_filter' ] = $charFilters;
        }

        if ( $filters ) {
            $config[ 'analyzer' ][ $this->analyzerName ][ 'filter' ] = $filters;
        }

        if ( $stemmerLang && $this->useStemmer ) {
            $config[ 'filter' ][ $langStem ] = self::stemmerFilter( $stemmerLang );
        }

        if ( $this->extraStemmerName ) {
            $config[ 'filter' ][ $this->extraStemmerName ] =
                self::stemmerFilter( $this->extraStemmerLang );
        }

        return $config;
    }

    /**
     * Apply the queued insertFiltersBefore() / appendFilters() / prependFilters()
     * patches, in the order they were requested.
     *
     * Each set of filters is inserted before its anchor filter. If the anchor is not
     * in the list the filters are prepended, but you shouldn't count on that. The
     * APPEND and PREPEND constants add to the end or beginning regardless of the
     * other filters.
     *
     * @param string[] $filters base filter list
     * @return string[] filter list with all insertions applied
     */
    private function applyFilterInsertions( array $filters ): array {
        foreach ( $this->insertFilterList as $filterPatch ) {
            foreach ( $filterPatch as $beforeFilter => $filterList ) {
                if ( $beforeFilter === self::APPEND ) {
                    $filters = array_merge( $filters, $filterList );
                } elseif ( $beforeFilter === self::PREPEND ) {
                    $filters = array_merge( $filterList, $filters );
                } else {
                    // Array keys that look like integers come back as ints, so cast
                    // before the strict search. array_search() returns false (not -1)
                    // when the anchor is missing; map that to offset 0 explicitly.
                    $idx = array_search( (string)$beforeFilter, $filters, true );
                    array_splice( $filters, $idx === false ? 0 : $idx, 0, $filterList );
                }
            }
        }
        return $filters;
    }

    /**
     * Create the mapping character filter that cleans up invisible characters:
     * zero-width space becomes a regular space, and the other listed characters,
     * plus all 256 variation selectors, are removed.
     *
     * @return mixed[] character filter
     */
    private static function invisCharFilter(): array {
        $mappings = [
            // split on ...
            '\u200B=>\u0020', // ... zero-width space
            // remove ...
            '\u00AD=>', // ... soft hyphen
            '\u200C=>', // ... zero-width non-joiner
            '\u200D=>', // ... zero-width joiner
            '\u2060=>', // ... word joiner
            '\uFEFF=>', // ... zero-width non-breaking space
            '\u200E=>', // ... LTR mark
            '\u200F=>', // ... RTL mark
            '\u202A=>', // ... LTR embedding
            '\u202B=>', // ... RTL embedding
            '\u202C=>', // ... pop directional formatting
            '\u202D=>', // ... LTR override
            '\u202E=>', // ... RTL override
            '\u2066=>', // ... LTR isolate
            '\u2067=>', // ... RTL isolate
            '\u2068=>', // ... first strong isolate
            '\u2069=>', // ... pop directional isolate
            '\u2061=>', // ... function application
            '\u2062=>', // ... invisible times
            '\u2063=>', // ... invisible separator
            '\u2064=>', // ... invisible plus
            // remove variation selectors 1-16 & 17-256 (below)
        ];

        // add all 256 variation selectors, as literal characters
        for ( $varIdx = 1; $varIdx <= 16; $varIdx++ ) { // 1-16 (FE00-FE0F)
            $chr = mb_chr( 65023 + $varIdx, 'UTF-8' ); // xFE00 = 65024
            $mappings[] = "$chr=>";
        }
        for ( $varIdx = 17; $varIdx <= 256; $varIdx++ ) { // 17-256 (E0100-E01EF)
            $chr = mb_chr( 917743 + $varIdx, 'UTF-8' ); // E0100 = 917760
            $mappings[] = "$chr=>";
        }

        return [ 'type' => 'mapping', 'mappings' => $mappings ];
    }

    /**
     * Create a pattern_replace filter/char_filter with the mappings provided.
     *
     * @param string $pat
     * @param string $repl
     * @return mixed[] filter
     */
    public static function patternFilter( string $pat, string $repl = '' ): array {
        return [ 'type' => 'pattern_replace', 'pattern' => $pat, 'replacement' => $repl ];
    }

    /**
     * Create a mapping or limited_mapping character filter with the mappings provided.
     *
     * @param string[] $mappings
     * @param bool $limited
     * @return mixed[] character filter
     */
    public static function mappingCharFilter( array $mappings, bool $limited ): array {
        $type = $limited ? 'limited_mapping' : 'mapping';
        return [ 'type' => $type, 'mappings' => $mappings ];
    }

    /**
     * Create a character filter that maps non-Arabic digits (e.g., ០-៩ or 0-9) to
     * Arabic digits (0-9). Since they are usually all in a row, we just need the
     * starting digit (equal to 0).
     *
     * Optionally reverse the mapping from Arabic to non-Arabic. For example, the ICU
     * tokenizer works better on tokenizing Thai digits in Thai text than it does on
     * Arabic digits.
     *
     * @param int $langZero
     * @param bool $reversed reverse the mapping from Arabic to non-Arabic
     * @return mixed[] character filter
     */
    public static function numberCharFilter( int $langZero, bool $reversed = false ): array {
        $numMap = [];
        for ( $i = 0; $i <= 9; $i++ ) {
            if ( $reversed ) {
                $numMap[] = sprintf( '%d=>\\u%04x', $i, $langZero + $i );
            } else {
                $numMap[] = sprintf( '\\u%04x=>%d', $langZero + $i, $i );
            }
        }
        return self::mappingCharFilter( $numMap, true );
    }

    /**
     * Create an elision filter with the "articles" provided; $case is passed through
     * as the filter's articles_case setting
     *
     * @param string[] $articles
     * @param bool $case
     * @return mixed[] token filter
     */
    public static function elisionFilter( array $articles, bool $case = true ): array {
        return [ 'type' => 'elision', 'articles_case' => $case, 'articles' => $articles ];
    }

    /**
     * Create a stop word filter with the provided config. The config can be an array
     * of stop words, or a string like _french_ that refers to a pre-defined list.
     *
     * @param mixed $stopwords
     * @param bool|null $ignoreCase only written to the filter when not null
     * @return mixed[] token filter
     */
    public static function stopFilterFromList( $stopwords, ?bool $ignoreCase = null ): array {
        $retArray = [ 'type' => 'stop', 'stopwords' => $stopwords ];
        if ( $ignoreCase !== null ) {
            $retArray['ignore_case'] = $ignoreCase;
        }
        return $retArray;
    }

    /**
     * Create a stemming override filter with the rules provided, which can be a string
     * with one rule or an array of such rules
     *
     * @param mixed $rules
     * @return mixed[] token filter
     */
    private static function overrideFilter( $rules ): array {
        return [ 'type' => 'stemmer_override', 'rules' => $rules ];
    }

    /**
     * Create a stemmer filter with the provided config.
     *
     * @param string $stemmer
     * @return mixed[] token filter
     */
    public static function stemmerFilter( string $stemmer ): array {
        return [ 'type' => 'stemmer', 'language' => $stemmer ];
    }

}