Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
100.00% covered (success)
100.00%
151 / 151
100.00% covered (success)
100.00%
33 / 33
CRAP
100.00% covered (success)
100.00%
1 / 1
AnalyzerBuilder
100.00% covered (success)
100.00%
151 / 151
100.00% covered (success)
100.00%
33 / 33
56
100.00% covered (success)
100.00%
1 / 1
 __construct
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withCharFilters
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withTokenizer
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withFilters
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withCharMap
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 withLimitedCharMap
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 withNumberCharFilter
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withElision
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 withLangLowercase
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withStop
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withExtraStop
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
1
 withExtraStemmer
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withStemmerOverride
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withUnpackedAnalyzer
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 unpackedCheck
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 insertFiltersBefore
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 appendFilters
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 prependFilters
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 omitDottedI
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withLightStemmer
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 omitStemmer
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withAsciifoldingPreserve
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 omitAsciifolding
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withRemoveEmpty
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withDecimalDigit
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 build
100.00% covered (success)
100.00%
65 / 65
100.00% covered (success)
100.00%
1 / 1
20
 patternFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 mappingCharFilter
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
2
 numberCharFilter
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 elisionFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 stopFilterFromList
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 overrideFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 stemmerFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3namespace CirrusSearch\Maintenance;
4
5/**
6 * Builds one elasticsearch analyzer to add to an analysis config array.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 */
class AnalyzerBuilder {
    /**
     * Pseudo-positions accepted by insertFiltersBefore() (and withExtraStop()):
     * add the filters to the end (APPEND) or the beginning (PREPEND) of the
     * filter list, rather than inserting them before a named filter.
     */
    public const APPEND = 1;
    public const PREPEND = 2;

    /** @var string */
    private $langName;

    /** @var string */
    private $analyzerName;

    /** @var string[]|null list of char_filters */
    private $charFilters;

    /** @var string|null name of tokenizer */
    private $tokenizer = 'standard';

    /** @var string[]|null list of filters */
    private $filters;

    /** @var string[]|null list of lang-specific character filter mappings */
    private $charMap;

    /** @var bool whether the char map is built as 'limited_mapping' instead of 'mapping' */
    private $charMapLimited = false;

    /** @var string|null */
    private $charMapName;

    /** @var int|null Unicode value for script-specific zero */
    private $langZero;

    /** @var string|null name of char filter mapping digits (using $langZero) */
    private $numCharMapName;

    /** @var bool value emitted as 'articles_case' in the elision filter */
    private $elisionArticleCase = true;

    /** @var string[]|null list of articles to elide */
    private $elisionArticles;

    /** @var string|null */
    private $elisionName;

    /** @var bool use language-specific lowercasing? */
    private $langLowercase = false;

    /** @var mixed|null stopword _list_ or array of stopwords */
    private $customStopList;

    /** @var string|null */
    private $stopName;

    /** @var mixed|null stopword _list_ or array of stopwords */
    private $extraStopList;

    /** @var string|null */
    private $extraStopName;

    /** @var bool|null */
    private $extraStopIgnoreCase;

    /** @var string|null */
    private $extraStemmerLang;

    /** @var string|null */
    private $extraStemmerName;

    /** @var string|string[]|null a single stemmer override rule, or a list of rules */
    private $overrideRules;

    /** @var string|null */
    private $overrideName;

    /**********
     * The properties below are only used by unpacked analyzers
     */

    /** @var bool */
    private $unpacked = false;

    /** @var array<int, array<int|string, string[]>> ordered list of [ position => filters ] patches */
    private $insertFilterList = [];

    /** @var string name of the dotted-I char filter ('' to omit it) */
    private $dottedIFix = 'dotted_I_fix';

    /** @var bool */
    private $useStemmer = true;

    /** @var string|null */
    private $stemmerLang;

    /** @var string|null asciifolding flavor to use ('' or null for none) */
    private $asciifolding = 'asciifolding';

    /** @var string|null */
    private $removeEmpty;

    /** @var string|null */
    private $decimalDigit;

    /**
     * @param string $langName language name, used as the prefix of generated
     *  filter names (e.g. "{$langName}_stop")
     * @param string $analyzerName key of the analyzer in the config (default 'text')
     */
    public function __construct( string $langName, string $analyzerName = 'text' ) {
        $this->langName = $langName;
        $this->analyzerName = $analyzerName;
    }

    /**
     * Set the analyzer's char_filter list.
     *
     * @param string[] $charFilters
     * @return self
     */
    public function withCharFilters( array $charFilters ): self {
        $this->charFilters = $charFilters;
        return $this;
    }

    /**
     * Set the analyzer's tokenizer (defaults to 'standard').
     *
     * @param string $tokenizer
     * @return self
     */
    public function withTokenizer( string $tokenizer ): self {
        $this->tokenizer = $tokenizer;
        return $this;
    }

    /**
     * Set the analyzer's token filter list.
     *
     * @param string[] $filters
     * @return self
     */
    public function withFilters( array $filters ): self {
        $this->filters = $filters;
        return $this;
    }

    /**
     * Define a character mapping filter for this analyzer.
     *
     * @param string[] $mappings
     * @param string|null $name filter name; defaults to "{$langName}_charfilter"
     * @param bool $limited build a 'limited_mapping' filter instead of 'mapping'
     * @return self
     */
    public function withCharMap( array $mappings, ?string $name = null, bool $limited = false ): self {
        $this->charMap = $mappings;
        $this->charMapName = $name ?? "{$this->langName}_charfilter";
        // Honor the caller's choice; withLimitedCharMap() relies on this flag.
        $this->charMapLimited = $limited;
        return $this;
    }

    /**
     * Same as withCharMap(), but the filter is built as 'limited_mapping'.
     *
     * @param string[] $mappings
     * @param string|null $name filter name; defaults to "{$langName}_charfilter"
     * @return self
     */
    public function withLimitedCharMap( array $mappings, ?string $name = null ): self {
        return $this->withCharMap( $mappings, $name, true );
    }

    /**
     * Define a digit-mapping character filter (see numberCharFilter()).
     *
     * @param int $langZero Unicode value of the script-specific zero
     * @param string|null $name filter name; defaults to "{$langName}_numbers"
     * @return self
     */
    public function withNumberCharFilter( int $langZero, ?string $name = null ): self {
        $this->langZero = $langZero;
        $this->numCharMapName = $name ?? "{$this->langName}_numbers";
        return $this;
    }

    /**
     * Define an elision filter named "{$langName}_elision".
     *
     * @param string[] $articles "articles" to be elided
     * @param bool $articleCase value of the filter's 'articles_case' setting
     * @return self
     */
    public function withElision( array $articles, bool $articleCase = true ): self {
        $this->elisionArticleCase = $articleCase;
        $this->elisionArticles = $articles;
        $this->elisionName = "{$this->langName}_elision";
        return $this;
    }

    /**
     * Configure the 'lowercase' filter with this builder's language.
     *
     * @return self
     */
    public function withLangLowercase(): self {
        $this->langLowercase = true;
        return $this;
    }

    /**
     * Define the analyzer's stop word filter.
     *
     * @param mixed $stop pre-defined list like _french_ or an array of stopwords
     * @param string|null $name filter name; defaults to "{$langName}_stop"
     * @return self
     */
    public function withStop( $stop, ?string $name = null ): self {
        $this->customStopList = $stop;
        $this->stopName = $name ?? "{$this->langName}_stop";
        return $this;
    }

    /**
     * Define an additional stop word filter and schedule its insertion into the
     * filter list. Only compatible with unpacked analyzers.
     *
     * @param mixed $stop pre-defined list like _french_ or an array of stopwords
     * @param string $name
     * @param mixed $beforeFilter filter to insert extra stop before, or
     *  APPEND / PREPEND
     * @param bool|null $ignoreCase 'ignore_case' setting; omitted from the
     *  filter when null
     * @return self
     */
    public function withExtraStop( $stop, string $name, $beforeFilter = self::APPEND,
            ?bool $ignoreCase = null ): self {
        // Check up front so a failure names this method and leaves the builder untouched.
        $this->unpackedCheck();
        $this->extraStopList = $stop;
        $this->extraStopName = $name;
        $this->extraStopIgnoreCase = $ignoreCase;
        $this->insertFiltersBefore( $beforeFilter, [ $name ] );
        return $this;
    }

    /**
     * Define an additional stemmer filter. The filter is only defined in the
     * config; it is not added to the analyzer's filter list.
     *
     * @param string $lang stemmer language
     * @param string|null $name filter name; defaults to $lang
     * @return self
     */
    public function withExtraStemmer( string $lang, ?string $name = null ): self {
        $this->extraStemmerLang = $lang;
        $this->extraStemmerName = $name ?? $lang;
        return $this;
    }

    /**
     * Define a stemmer override filter.
     * Rules can be a single rule string, or an array of rules
     *
     * @param mixed $rules stemmer override rules
     * @param string|null $name filter name; defaults to "{$langName}_override"
     * @return self
     */
    public function withStemmerOverride( $rules, ?string $name = null ): self {
        $this->overrideRules = $rules;
        $this->overrideName = $name ?? "{$this->langName}_override";
        return $this;
    }

    /**********
     * The with.., omit.., and insert.. methods below are only used by unpacked analyzers
     */

    /**
     * Switch the builder to unpacked mode, in which build() assembles the
     * char_filter and filter lists itself.
     *
     * @return self
     */
    public function withUnpackedAnalyzer(): self {
        $this->unpacked = true;
        return $this;
    }

    /**
     * Guard for methods that only make sense for unpacked analyzers; the
     * exception message names the method that called this guard.
     *
     * @throws \ConfigException if withUnpackedAnalyzer() has not been called
     */
    private function unpackedCheck(): void {
        if ( !$this->unpacked ) {
            // Frame 0 is this method, frame 1 is its caller; no deeper frames or args needed.
            $caller = debug_backtrace( DEBUG_BACKTRACE_IGNORE_ARGS, 2 )[1]['function'];
            throw new \ConfigException( "$caller() is only compatible with unpacked analyzers; " .
                "call withUnpackedAnalyzer() before calling $caller()." );
        }
    }

    /**
     * Schedule filters for insertion into the unpacked filter list. Insertions
     * are applied by build(), in the order they were requested.
     *
     * @param mixed $beforeFilter specific filter to insert $filters before; use APPEND
     *                            or PREPEND to always add to beginning or end of the list
     * @param string[] $filterList list of additional filters to insert
     * @return self
     */
    public function insertFiltersBefore( $beforeFilter, array $filterList ): self {
        $this->unpackedCheck();
        $this->insertFilterList[] = [ $beforeFilter => $filterList ];
        return $this;
    }

    /**
     * @param string[] $filterList list of additional filters to append
     * @return self
     */
    public function appendFilters( array $filterList ): self {
        $this->unpackedCheck();
        $this->insertFiltersBefore( self::APPEND, $filterList );
        return $this;
    }

    /**
     * @param string[] $filterList list of additional filters to prepend
     * @return self
     */
    public function prependFilters( array $filterList ): self {
        $this->unpackedCheck();
        $this->insertFiltersBefore( self::PREPEND, $filterList );
        return $this;
    }

    /**
     * Leave the dotted-I char filter out of the char_filter list.
     *
     * @return self
     */
    public function omitDottedI(): self {
        $this->unpackedCheck();
        $this->dottedIFix = '';
        return $this;
    }

    /**
     * Use the "light_{$langName}" stemmer instead of the default one.
     *
     * @return self
     */
    public function withLightStemmer(): self {
        $this->unpackedCheck();
        $this->stemmerLang = "light_{$this->langName}";
        return $this;
    }

    /**
     * Leave the stemmer out of the filter list and the config.
     *
     * @return self
     */
    public function omitStemmer(): self {
        $this->unpackedCheck();
        $this->useStemmer = false;
        return $this;
    }

    /**
     * Use 'asciifolding_preserve' instead of 'asciifolding'.
     *
     * @return self
     */
    public function withAsciifoldingPreserve(): self {
        $this->unpackedCheck();
        $this->asciifolding = 'asciifolding_preserve';
        return $this;
    }

    /**
     * Leave asciifolding out of the filter list.
     *
     * @return self
     */
    public function omitAsciifolding(): self {
        $this->unpackedCheck();
        $this->asciifolding = '';
        return $this;
    }

    /**
     * Add 'remove_empty' to the end of the generated filter list.
     *
     * @return self
     */
    public function withRemoveEmpty(): self {
        $this->unpackedCheck();
        $this->removeEmpty = 'remove_empty';
        return $this;
    }

    /**
     * Add 'decimal_digit' to the generated filter list (after 'lowercase').
     *
     * @return self
     */
    public function withDecimalDigit(): self {
        $this->unpackedCheck();
        $this->decimalDigit = 'decimal_digit';
        return $this;
    }

    /**
     * Create a basic analyzer with support for various common options
     *
     * Can create various filters and character filters as specified.
     * For analyzers that are not unpacked, none are automatically added to
     * the char_filter or filter list because the best order for these basic
     * analyzers depends on the details of various third-party plugins.
     *
     * type: custom
     * tokenizer: as per $this->tokenizer ('standard' by default)
     * char_filter: as per $this->charFilters
     * filter: as per $this->filters
     *
     * The char_filter and filter lists are assembled in local copies, so
     * calling build() more than once yields the same analyzer definition
     * each time.
     *
     * @param mixed[] $config to be updated
     * @return mixed[] updated config
     */
    public function build( array $config ): array {
        $langStem = "{$this->langName}_stemmer";

        // Local working copies: build() must not accumulate entries in the
        // builder's own lists across repeated calls.
        $charFilters = $this->charFilters ?? [];
        $filters = $this->filters ?? [];

        if ( $this->unpacked ) {
            // Analyzer config for char_filter and filter will be in the order below,
            // if the relevant filters are enabled/configured.
            //
            // type: custom
            // tokenizer: as configured
            // char_filter: dotted_I_fix, lang_charfilter, lang_numbers
            // filter: elision, lowercase, decimal_digit, stopwords,
            //         stemmer_override, stemmer, asciifolding, remove_empty
            if ( $this->useStemmer ) {
                $this->stemmerLang ??= $this->langName;
            } else {
                $langStem = '';
            }
            // NOTE(review): this resets a custom stop filter name given to withStop()
            // back to the default "{$langName}_stop" -- confirm that is intended.
            $this->withStop( $this->customStopList ?? "_{$this->langName}_" );

            // build up the char_filter list--everything is optional
            $charFilters[] = $this->dottedIFix;
            $charFilters[] = $this->charMapName;
            $charFilters[] = $this->numCharMapName;

            // remove 'falsey' (== not configured) values from the list
            $charFilters = array_values( array_filter( $charFilters ) );

            // build up the filter list--lowercase and stop are always present; the
            // stemmer is present unless omitStemmer() was called
            $filters[] = $this->elisionName;
            $filters[] = 'lowercase';
            $filters[] = $this->decimalDigit;
            $filters[] = $this->stopName;
            $filters[] = $this->overrideName;
            $filters[] = $langStem;
            $filters[] = $this->asciifolding;
            $filters[] = $this->removeEmpty;

            // remove 'falsey' (== not configured) values from the list
            $filters = array_values( array_filter( $filters ) );

            $filters = $this->applyFilterInsertions( $filters );
        }

        $config[ 'analyzer' ][ $this->analyzerName ] = [
            'type' => 'custom',
            'tokenizer' => $this->tokenizer,
        ];

        if ( $this->charMapName ) {
            $config[ 'char_filter' ][ $this->charMapName ] =
                self::mappingCharFilter( $this->charMap, $this->charMapLimited );
        }

        if ( $this->numCharMapName ) {
            $config[ 'char_filter' ][ $this->numCharMapName ] =
                self::numberCharFilter( $this->langZero );
        }

        if ( $this->elisionName ) {
            $config[ 'filter' ][ $this->elisionName ] =
                self::elisionFilter( $this->elisionArticles, $this->elisionArticleCase );
        }

        if ( $this->langLowercase ) {
            $config[ 'filter' ][ 'lowercase' ][ 'language' ] = $this->langName;
        }

        if ( $this->overrideName ) {
            $config[ 'filter' ][ $this->overrideName ] =
                self::overrideFilter( $this->overrideRules );
        }

        if ( $this->stopName ) {
            $config[ 'filter' ][ $this->stopName ] =
                self::stopFilterFromList( $this->customStopList );
        }

        if ( $this->extraStopName ) {
            $config[ 'filter' ][ $this->extraStopName ] =
                self::stopFilterFromList( $this->extraStopList, $this->extraStopIgnoreCase );
        }

        // Empty lists are left out of the analyzer definition entirely.
        if ( $charFilters ) {
            $config[ 'analyzer' ][ $this->analyzerName ][ 'char_filter' ] = $charFilters;
        }

        if ( $filters ) {
            $config[ 'analyzer' ][ $this->analyzerName ][ 'filter' ] = $filters;
        }

        if ( $this->stemmerLang && $this->useStemmer ) {
            $config[ 'filter' ][ $langStem ] =
                self::stemmerFilter( $this->stemmerLang );
        }

        if ( $this->extraStemmerName ) {
            $config[ 'filter' ][ $this->extraStemmerName ] =
                self::stemmerFilter( $this->extraStemmerLang );
        }

        return $config;
    }

    /**
     * Apply every insertion scheduled via insertFiltersBefore(), in request order.
     *
     * APPEND and PREPEND add to the end or beginning of the list regardless of
     * its contents. Any other position is looked up by name; if no such filter
     * exists the new filters are prepended, but you shouldn't count on that.
     *
     * @param string[] $filters current filter list
     * @return string[] filter list with all insertions applied
     */
    private function applyFilterInsertions( array $filters ): array {
        foreach ( $this->insertFilterList as $filterPatch ) {
            foreach ( $filterPatch as $beforeFilter => $filterList ) {
                if ( $beforeFilter === self::APPEND ) {
                    $filters = array_merge( $filters, $filterList );
                } elseif ( $beforeFilter === self::PREPEND ) {
                    $filters = array_merge( $filterList, $filters );
                } else {
                    // Array keys that look like integers come back as int, so compare
                    // as strings against the (string) filter names.
                    $idx = array_search( (string)$beforeFilter, $filters, true );
                    array_splice( $filters, $idx === false ? 0 : $idx, 0, $filterList );
                }
            }
        }
        return $filters;
    }

    /**
     * Create a pattern_replace filter/char_filter with the mappings provided.
     *
     * @param string $pat
     * @param string $repl
     * @return mixed[] filter
     */
    public static function patternFilter( string $pat, string $repl = '' ): array {
        return [ 'type' => 'pattern_replace', 'pattern' => $pat, 'replacement' => $repl ];
    }

    /**
     * Create a mapping or limited_mapping character filter with the mappings provided.
     *
     * @param string[] $mappings
     * @param bool $limited
     * @return mixed[] character filter
     */
    public static function mappingCharFilter( array $mappings, bool $limited ): array {
        $type = $limited ? 'limited_mapping' : 'mapping';
        return [ 'type' => $type, 'mappings' => $mappings ];
    }

    /**
     * Create a limited_mapping character filter that maps the ten consecutive
     * code points starting at $langZero to the digits 0-9. Since a script's
     * digits are usually all in a row, only the starting digit (equal to 0)
     * is needed.
     *
     * @param int $langZero Unicode value of the script-specific zero
     * @return mixed[] character filter
     */
    public static function numberCharFilter( int $langZero ): array {
        $numMap = [];
        for ( $i = 0; $i <= 9; $i++ ) {
            // e.g. "\u0660=>0": a literal backslash-u escape, then the target digit
            $numMap[] = sprintf( '\\u%04x=>%d', $langZero + $i, $i );
        }
        return self::mappingCharFilter( $numMap, true );
    }

    /**
     * Create an elision filter with the "articles" provided; $case is passed
     * through as the filter's 'articles_case' setting
     *
     * @param string[] $articles
     * @param bool $case
     * @return mixed[] token filter
     */
    public static function elisionFilter( array $articles, bool $case = true ): array {
        return [ 'type' => 'elision', 'articles_case' => $case, 'articles' => $articles ];
    }

    /**
     * Create a stop word filter with the provided config. The config can be an array
     * of stop words, or a string like _french_ that refers to a pre-defined list.
     *
     * @param mixed $stopwords
     * @param bool|null $ignoreCase 'ignore_case' setting; left out when null
     * @return mixed[] token filter
     */
    public static function stopFilterFromList( $stopwords, ?bool $ignoreCase = null ): array {
        $retArray = [ 'type' => 'stop', 'stopwords' => $stopwords ];
        if ( $ignoreCase !== null ) {
            $retArray['ignore_case'] = $ignoreCase;
        }
        return $retArray;
    }

    /**
     * Create a stemming override filter with the rules provided, which can be a string
     * with one rule or an array of such rules
     *
     * @param mixed $rules
     * @return mixed[] token filter
     */
    private static function overrideFilter( $rules ): array {
        return [ 'type' => 'stemmer_override', 'rules' => $rules ];
    }

    /**
     * Create a stemmer filter with the provided config.
     *
     * @param string $stemmer
     * @return mixed[] token filter
     */
    public static function stemmerFilter( string $stemmer ): array {
        return [ 'type' => 'stemmer', 'language' => $stemmer ];
    }

}