Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
100.00% covered (success)
100.00%
136 / 136
100.00% covered (success)
100.00%
30 / 30
CRAP
100.00% covered (success)
100.00%
1 / 1
AnalyzerBuilder
100.00% covered (success)
100.00%
136 / 136
100.00% covered (success)
100.00%
30 / 30
50
100.00% covered (success)
100.00%
1 / 1
 __construct
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withCharFilters
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withTokenizer
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withFilters
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withCharMap
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withNumberCharFilter
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withElision
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 withLangLowercase
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 withStop
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withStemmerOverride
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withUnpackedAnalyzer
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
1
 unpackedCheck
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 insertFiltersBefore
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 omitDottedI
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withWordBreakHelper
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withAggressiveSplitting
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withLightStemmer
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 omitStemmer
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withAsciifoldingPreserve
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 omitAsciifolding
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withRemoveEmpty
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 withDecimalDigit
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 build
100.00% covered (success)
100.00%
61 / 61
100.00% covered (success)
100.00%
1 / 1
18
 patternFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 mappingCharFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 numberCharFilter
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 elisionFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 stopFilterFromList
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
2
 overrideFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 stemmerFilter
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
1<?php
2
3namespace CirrusSearch\Maintenance;
4
5/**
6 * Builds one elasticsearch analyzer to add to an analysis config array.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 */
class AnalyzerBuilder {
    /**
     * Pseudo-filter markers for insertFiltersBefore(): indicate that filters
     * should be automatically appended or prepended, rather than inserted
     * before a given filter.
     */
    public const APPEND = 1;
    public const PREPEND = 2;

    /** @var string language name; used as the prefix of generated filter names */
    private $langName;

    /** @var string key of the analyzer within $config['analyzer'] */
    private $analyzerName;

    /** @var string[]|null list of char_filters */
    private $charFilters;

    /** @var string|null name of tokenizer */
    private $tokenizer = 'standard';

    /** @var string[]|null list of filters */
    private $filters;

    /** @var string[]|null list of lang-specific character filter mappings */
    private $charMap;

    /** @var string|null name of the mapping char filter built from $charMap */
    private $charMapName;

    /** @var int|null Unicode value for script-specific zero */
    private $langZero;

    /** @var string|null name of char filter mapping digits (using $langZero) */
    private $numCharMapName;

    /** @var bool value emitted as the elision filter's 'articles_case' setting */
    private $elisionArticleCase = true;

    /** @var string[]|null list of articles to elide */
    private $elisionArticles;

    /** @var string|null name of the elision filter */
    private $elisionName;

    /** @var bool use language-specific lowercasing? */
    private $langLowercase = false;

    /** @var mixed|null stopword _list_ or array of stopwords */
    private $customStopList;

    /** @var string|null name of the stop filter */
    private $stopName;

    /** @var string[]|null list of stemmer override rules */
    private $overrideRules;

    /** @var string|null name of the stemmer override filter */
    private $overrideName;

    /**********
     * The properties below are only used by unpacked analyzers
     */

    /** @var bool */
    private $unpacked = false;

    /** @var array<int, array<string, string[]>> ordered list of [ beforeFilter => filters ] patches */
    private $insertFilterList = [];

    /** @var string char filter name; empty string when omitted */
    private $dottedIFix = 'dotted_I_fix';

    /** @var string|null */
    private $wordBreakHelper;

    /** @var string|null */
    private $aggressiveSplitting;

    /** @var bool */
    private $useStemmer = true;

    /** @var string|null */
    private $stemmerName;

    /** @var string|null asciifolding flavor to use (null or '' for none) */
    private $asciifolding = 'asciifolding';

    /** @var string|null */
    private $removeEmpty;

    /** @var string|null */
    private $decimalDigit;

    /**
     * @param string $langName
     * @param string $analyzerName (default to 'text')
     */
    public function __construct( string $langName, string $analyzerName = 'text' ) {
        $this->langName = $langName;
        $this->analyzerName = $analyzerName;
    }

    /**
     * Set the initial char_filter list of the analyzer.
     *
     * @param string[] $charFilters
     * @return self
     */
    public function withCharFilters( array $charFilters ): self {
        $this->charFilters = $charFilters;
        return $this;
    }

    /**
     * Replace the default 'standard' tokenizer.
     *
     * @param string $tokenizer
     * @return self
     */
    public function withTokenizer( string $tokenizer ): self {
        $this->tokenizer = $tokenizer;
        return $this;
    }

    /**
     * Set the initial filter list of the analyzer.
     *
     * @param string[] $filters
     * @return self
     */
    public function withFilters( array $filters ): self {
        $this->filters = $filters;
        return $this;
    }

    /**
     * Define a "{lang}_charfilter" mapping char filter.
     *
     * @param string[] $mappings
     * @return self
     */
    public function withCharMap( array $mappings ): self {
        $this->charMap = $mappings;
        $this->charMapName = "{$this->langName}_charfilter";
        return $this;
    }

    /**
     * Define a "{lang}_numbers" char filter mapping script-specific digits.
     *
     * @param int $langZero Unicode value of the script's digit zero
     * @return self
     */
    public function withNumberCharFilter( int $langZero ): self {
        $this->langZero = $langZero;
        $this->numCharMapName = "{$this->langName}_numbers";
        return $this;
    }

    /**
     * Define a "{lang}_elision" filter.
     *
     * @param string[] $articles "articles" to be elided
     * @param bool $articleCase whether elision is case insensitive
     * @return self
     */
    public function withElision( array $articles, bool $articleCase = true ): self {
        $this->elisionArticleCase = $articleCase;
        $this->elisionArticles = $articles;
        $this->elisionName = "{$this->langName}_elision";
        return $this;
    }

    /**
     * Make the 'lowercase' filter language-specific.
     *
     * @return self
     */
    public function withLangLowercase(): self {
        $this->langLowercase = true;
        return $this;
    }

    /**
     * Define a "{lang}_stop" filter.
     *
     * @param mixed $stop pre-defined list like _french_ or an array of stopwords
     * @return self
     */
    public function withStop( $stop ): self {
        $this->customStopList = $stop;
        $this->stopName = "{$this->langName}_stop";
        return $this;
    }

    /**
     * Define a "{lang}_override" stemmer override filter.
     *
     * @param string[] $rules stemmer override rules
     * @return self
     */
    public function withStemmerOverride( array $rules ): self {
        $this->overrideRules = $rules;
        $this->overrideName = "{$this->langName}_override";
        return $this;
    }

    /**********
     * The with.., omit.., and insert.. methods below are only used by unpacked analyzers
     */

    /**
     * Switch to "unpacked" mode, in which build() assembles the char_filter
     * and filter lists itself.
     *
     * @return self
     */
    public function withUnpackedAnalyzer(): self {
        $this->unpacked = true;
        return $this;
    }

    /**
     * Guard for the unpacked-only methods.
     *
     * @throws \ConfigException when withUnpackedAnalyzer() has not been called
     */
    private function unpackedCheck(): void {
        if ( $this->unpacked ) {
            return;
        }
        // Frame 0 is unpackedCheck() itself; frame 1 is the builder method that
        // called it. Arguments are not needed, so skip collecting them.
        $caller = debug_backtrace( DEBUG_BACKTRACE_IGNORE_ARGS, 2 )[1]['function'];
        throw new \ConfigException( "$caller() is only compatible with unpacked analyzers; " .
            "call withUnpackedAnalyzer() before calling $caller()." );
    }

    /**
     * Queue extra filters to be spliced into the generated filter list by build().
     *
     * @param mixed $beforeFilter specific filter to insert $filters before; use APPEND
     *                            or PREPEND to always add to beginning or end of the list
     * @param string[] $filterList list of additional filters to insert
     * @return self
     */
    public function insertFiltersBefore( $beforeFilter, array $filterList ): self {
        $this->unpackedCheck();
        $this->insertFilterList[] = [ $beforeFilter => $filterList ];
        return $this;
    }

    /** @return self */
    public function omitDottedI(): self {
        $this->unpackedCheck();
        $this->dottedIFix = '';
        return $this;
    }

    /** @return self */
    public function withWordBreakHelper(): self {
        $this->unpackedCheck();
        $this->wordBreakHelper = 'word_break_helper';
        return $this;
    }

    /** @return self */
    public function withAggressiveSplitting(): self {
        $this->unpackedCheck();
        $this->aggressiveSplitting = 'aggressive_splitting';
        return $this;
    }

    /** @return self */
    public function withLightStemmer(): self {
        $this->unpackedCheck();
        $this->stemmerName = "light_{$this->langName}";
        return $this;
    }

    /** @return self */
    public function omitStemmer(): self {
        $this->unpackedCheck();
        $this->useStemmer = false;
        return $this;
    }

    /** @return self */
    public function withAsciifoldingPreserve(): self {
        $this->unpackedCheck();
        $this->asciifolding = 'asciifolding_preserve';
        return $this;
    }

    /** @return self */
    public function omitAsciifolding(): self {
        $this->unpackedCheck();
        $this->asciifolding = '';
        return $this;
    }

    /** @return self */
    public function withRemoveEmpty(): self {
        $this->unpackedCheck();
        $this->removeEmpty = 'remove_empty';
        return $this;
    }

    /** @return self */
    public function withDecimalDigit(): self {
        $this->unpackedCheck();
        $this->decimalDigit = 'decimal_digit';
        return $this;
    }

    /**
     * Create a basic analyzer with support for various common options
     *
     * Can create various filters and character filters as specified.
     * For packed (default) analyzers none are automatically added to the
     * char_filter or filter list because the best order for these basic
     * analyzers depends on the details of various third-party plugins.
     *
     * type: custom
     * tokenizer: as per $this->tokenizer ('standard' by default)
     * char_filter: as per $this->charFilters
     * filter: as per $this->filters
     *
     * The builder's own state is not modified, so build() may be called
     * repeatedly and yields the same result each time.
     *
     * @param mixed[] $config to be updated
     * @return mixed[] updated config
     */
    public function build( array $config ): array {
        // Work on local copies so that building is repeatable.
        $charFilters = $this->charFilters;
        $filters = $this->filters;
        $stopName = $this->stopName;
        $stopList = $this->customStopList;
        $stemmerName = $this->stemmerName;
        $langStem = "{$this->langName}_stemmer";

        if ( $this->unpacked ) {
            // Analyzer config for char_filter and filter will be in the order below,
            // if the relevant filters are enabled/configured.
            //
            // type: custom
            // tokenizer: standard
            // char_filter: dotted_I_fix, lang_charfilter, lang_numbers, word_break_helper
            // filter: elision, aggressive_splitting, lowercase, decimal_digit, stop,
            //         stemmer_override, stemmer, asciifolding, remove_empty
            if ( $this->useStemmer ) {
                $stemmerName = $stemmerName ?? $this->langName;
            } else {
                $langStem = '';
            }

            // unpacked analyzers always get a stop filter; default to the
            // pre-defined list for the language
            $stopList = $stopList ?? "_{$this->langName}_";
            $stopName = "{$this->langName}_stop";

            // build up the char_filter list--everything is optional
            $charFilters = self::compactList( array_merge( $charFilters ?? [], [
                $this->dottedIFix,
                $this->charMapName,
                $this->numCharMapName,
                $this->wordBreakHelper,
            ] ) );

            // build up the filter list--lowercase and stop are always present,
            // and so is the stemmer unless omitStemmer() was called
            $filters = self::compactList( array_merge( $filters ?? [], [
                $this->elisionName,
                $this->aggressiveSplitting,
                'lowercase',
                $this->decimalDigit,
                $stopName,
                $this->overrideName,
                $langStem,
                $this->asciifolding,
                $this->removeEmpty,
            ] ) );

            $filters = $this->applyFilterInsertions( $filters );
        }

        $config[ 'analyzer' ][ $this->analyzerName ] = [
            'type' => 'custom',
            'tokenizer' => $this->tokenizer,
        ];

        if ( $this->charMapName ) {
            $config[ 'char_filter' ][ $this->charMapName ] =
                self::mappingCharFilter( $this->charMap );
        }

        if ( $this->numCharMapName ) {
            $config[ 'char_filter' ][ $this->numCharMapName ] =
                self::numberCharFilter( $this->langZero );
        }

        if ( $this->elisionName ) {
            $config[ 'filter' ][ $this->elisionName ] =
                self::elisionFilter( $this->elisionArticles, $this->elisionArticleCase );
        }

        if ( $this->langLowercase ) {
            $config[ 'filter' ][ 'lowercase' ][ 'language' ] = $this->langName;
        }

        if ( $this->overrideName ) {
            $config[ 'filter' ][ $this->overrideName ] =
                self::overrideFilter( $this->overrideRules );
        }

        if ( $stopName ) {
            $config[ 'filter' ][ $stopName ] = self::stopFilterFromList( $stopList );
        }

        if ( $charFilters ) {
            $config[ 'analyzer' ][ $this->analyzerName ][ 'char_filter' ] = $charFilters;
        }

        if ( $filters ) {
            $config[ 'analyzer' ][ $this->analyzerName ][ 'filter' ] = $filters;
        }

        if ( $stemmerName && $this->useStemmer ) {
            $config[ 'filter' ][ $langStem ] = self::stemmerFilter( $stemmerName );
        }

        return $config;
    }

    /**
     * Remove 'falsey' (== not configured) values from a list and reindex it.
     *
     * @param array $list
     * @return array
     */
    private static function compactList( array $list ): array {
        return array_values( array_filter( $list ) );
    }

    /**
     * Apply all patches queued by insertFiltersBefore(), in order.
     *
     * Each patch inserts its filters before the named filter. If no such filter
     * exists the filters are prepended, but you shouldn't count on that. APPEND
     * and PREPEND can be used to add to the end or beginning, regardless of
     * other filters.
     *
     * @param string[] $filters generated filter list
     * @return string[] filter list with insertions applied
     */
    private function applyFilterInsertions( array $filters ): array {
        foreach ( $this->insertFilterList as $filterPatch ) {
            foreach ( $filterPatch as $beforeFilter => $filterList ) {
                if ( $beforeFilter === self::APPEND ) {
                    $filters = array_merge( $filters, $filterList );
                } elseif ( $beforeFilter === self::PREPEND ) {
                    $filters = array_merge( $filterList, $filters );
                } else {
                    // Array keys that look like integers are stored as ints, so
                    // cast back to string before the strict search.
                    $found = array_search( (string)$beforeFilter, $filters, true );
                    // array_search() yields false when absent; fall back to index 0
                    array_splice( $filters, $found === false ? 0 : $found, 0, $filterList );
                }
            }
        }
        return $filters;
    }

    /**
     * Create a pattern_replace filter/char_filter with the mappings provided.
     *
     * @param string $pat
     * @param string $repl
     * @return mixed[] filter
     */
    public static function patternFilter( string $pat, string $repl ): array {
        return [ 'type' => 'pattern_replace', 'pattern' => $pat, 'replacement' => $repl ];
    }

    /**
     * Create a mapping character filter with the mappings provided.
     *
     * @param string[] $mappings
     * @return mixed[] character filter
     */
    public static function mappingCharFilter( array $mappings ): array {
        return [ 'type' => 'mapping', 'mappings' => $mappings ];
    }

    /**
     * Create a character filter that maps non-Arabic digits (e.g., Khmer ០-៩) to
     * Arabic digits (0-9). Since they are usually all in a row, we just need the
     * starting digit (equal to 0)
     *
     * NOTE(review): '%04x' emits more than four hex digits for code points above
     * U+FFFF; confirm whether the consumer of these mappings accepts that before
     * passing such a $langZero.
     *
     * @param int $langZero
     * @return mixed[] character filter
     */
    public static function numberCharFilter( int $langZero ): array {
        $numMap = [];
        for ( $i = 0; $i <= 9; $i++ ) {
            $numMap[] = sprintf( '\\u%04x=>%d', $langZero + $i, $i );
        }
        return self::mappingCharFilter( $numMap );
    }

    /**
     * Create an elision filter with the "articles" provided; $case is emitted
     * as the filter's 'articles_case' setting
     *
     * @param string[] $articles
     * @param bool $case
     * @return mixed[] token filter
     */
    public static function elisionFilter( array $articles, bool $case = true ): array {
        return [ 'type' => 'elision', 'articles_case' => $case, 'articles' => $articles ];
    }

    /**
     * Create a stop word filter with the provided config. The config can be an array
     * of stop words, or a string like _french_ that refers to a pre-defined list.
     *
     * @param mixed $stopwords
     * @param bool|null $ignoreCase emitted as 'ignore_case' only when not null
     * @return mixed[] token filter
     */
    public static function stopFilterFromList( $stopwords, ?bool $ignoreCase = null ): array {
        $retArray = [ 'type' => 'stop', 'stopwords' => $stopwords ];
        if ( $ignoreCase !== null ) {
            $retArray['ignore_case'] = $ignoreCase;
        }
        return $retArray;
    }

    /**
     * Create a stemming override filter with the rules provided
     *
     * @param string[] $rules
     * @return mixed[] token filter
     */
    private static function overrideFilter( array $rules ): array {
        return [ 'type' => 'stemmer_override', 'rules' => $rules ];
    }

    /**
     * Create a stemmer filter with the provided config.
     *
     * @param string $stemmer
     * @return mixed[] token filter
     */
    public static function stemmerFilter( string $stemmer ): array {
        return [ 'type' => 'stemmer', 'language' => $stemmer ];
    }

}