Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
136 / 136 |
|
100.00% |
30 / 30 |
CRAP | |
100.00% |
1 / 1 |
AnalyzerBuilder | |
100.00% |
136 / 136 |
|
100.00% |
30 / 30 |
50 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
withCharFilters | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
withTokenizer | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
withFilters | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
withCharMap | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withNumberCharFilter | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withElision | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
withLangLowercase | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
withStop | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withStemmerOverride | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withUnpackedAnalyzer | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
unpackedCheck | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
insertFiltersBefore | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
omitDottedI | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withWordBreakHelper | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withAggressiveSplitting | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withLightStemmer | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
omitStemmer | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withAsciifoldingPreserve | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
omitAsciifolding | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withRemoveEmpty | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withDecimalDigit | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
build | |
100.00% |
61 / 61 |
|
100.00% |
1 / 1 |
18 | |||
patternFilter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
mappingCharFilter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
numberCharFilter | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
elisionFilter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
stopFilterFromList | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
overrideFilter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
stemmerFilter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Maintenance; |
4 | |
5 | /** |
6 | * Builds one elasticsearch analyzer to add to an analysis config array. |
7 | * |
8 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or |
11 | * (at your option) any later version. |
12 | * |
13 | * This program is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | * GNU General Public License for more details. |
17 | * |
18 | * You should have received a copy of the GNU General Public License along |
19 | * with this program; if not, write to the Free Software Foundation, Inc., |
20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
21 | * http://www.gnu.org/copyleft/gpl.html |
22 | */ |
class AnalyzerBuilder {
	/**
	 * Pseudo-positions for insertFiltersBefore(): indicate that filters should be
	 * automatically appended or prepended, rather than inserted before a given filter.
	 */
	public const APPEND = 1;
	public const PREPEND = 2;

	/** @var string language name, used as the prefix of generated filter names */
	private $langName;

	/** @var string key of the analyzer inside $config['analyzer'] */
	private $analyzerName;

	/** @var string[]|null list of char_filters */
	private $charFilters;

	/** @var string|null name of tokenizer */
	private $tokenizer = 'standard';

	/** @var string[]|null list of filters */
	private $filters;

	/** @var string[]|null list of lang-specific character filter mappings */
	private $charMap;

	/** @var string|null name of the mapping char filter built from $charMap */
	private $charMapName;

	/** @var int|null Unicode value for script-specific zero */
	private $langZero;

	/** @var string|null name of char filter mapping digits (using $langZero) */
	private $numCharMapName;

	/** @var bool is elision processing case INsensitive? */
	private $elisionArticleCase = true;

	/** @var string[]|null list of articles to elide */
	private $elisionArticles;

	/** @var string|null name of the elision filter built from $elisionArticles */
	private $elisionName;

	/** @var bool use language-specific lowercasing? */
	private $langLowercase = false;

	/** @var mixed|null stopword _list_ or array of stopwords */
	private $customStopList;

	/** @var string|null name of the stop filter built from $customStopList */
	private $stopName;

	/** @var string[]|null list of stemmer override rules */
	private $overrideRules;

	/** @var string|null name of the stemmer_override filter built from $overrideRules */
	private $overrideName;

	/**********
	 * The properties below are only used by unpacked analyzers
	 */

	/** @var bool */
	private $unpacked = false;

	/**
	 * Ordered list of single-entry patches [ beforeFilter => filters to insert ];
	 * the key is a filter name or one of the APPEND / PREPEND constants.
	 *
	 * @var array<int, array<int|string, string[]>>
	 */
	private $insertFilterList = [];

	/** @var string name of the dotted-I char filter ('' when omitted) */
	private $dottedIFix = 'dotted_I_fix';

	/** @var string|null */
	private $wordBreakHelper;

	/** @var string|null */
	private $aggressiveSplitting;

	/** @var bool */
	private $useStemmer = true;

	/** @var string|null stemmer language; build() falls back to $langName when unset */
	private $stemmerName;

	/** @var string|null asciifolding flavor to use ('' or null for none) */
	private $asciifolding = 'asciifolding';

	/** @var string|null */
	private $removeEmpty;

	/** @var string|null */
	private $decimalDigit;

	/**
	 * @param string $langName
	 * @param string $analyzerName (default to 'text')
	 */
	public function __construct( string $langName, string $analyzerName = 'text' ) {
		$this->langName = $langName;
		$this->analyzerName = $analyzerName;
	}

	/**
	 * @param string[] $charFilters
	 * @return self
	 */
	public function withCharFilters( array $charFilters ): self {
		$this->charFilters = $charFilters;
		return $this;
	}

	/**
	 * @param string $tokenizer
	 * @return self
	 */
	public function withTokenizer( string $tokenizer ): self {
		$this->tokenizer = $tokenizer;
		return $this;
	}

	/**
	 * @param string[] $filters
	 * @return self
	 */
	public function withFilters( array $filters ): self {
		$this->filters = $filters;
		return $this;
	}

	/**
	 * Configure a "<lang>_charfilter" mapping char filter.
	 *
	 * @param string[] $mappings
	 * @return self
	 */
	public function withCharMap( array $mappings ): self {
		$this->charMap = $mappings;
		$this->charMapName = "{$this->langName}_charfilter";
		return $this;
	}

	/**
	 * Configure a "<lang>_numbers" char filter mapping script-specific digits.
	 *
	 * @param int $langZero
	 * @return self
	 */
	public function withNumberCharFilter( int $langZero ): self {
		$this->langZero = $langZero;
		$this->numCharMapName = "{$this->langName}_numbers";
		return $this;
	}

	/**
	 * Configure a "<lang>_elision" filter.
	 *
	 * @param string[] $articles "articles" to be elided
	 * @param bool $articleCase whether elision is case insensitive
	 * @return self
	 */
	public function withElision( array $articles, bool $articleCase = true ): self {
		$this->elisionArticleCase = $articleCase;
		$this->elisionArticles = $articles;
		$this->elisionName = "{$this->langName}_elision";
		return $this;
	}

	/** @return self */
	public function withLangLowercase(): self {
		$this->langLowercase = true;
		return $this;
	}

	/**
	 * Configure a "<lang>_stop" filter.
	 *
	 * @param mixed $stop pre-defined list like _french_ or an array of stopwords
	 * @return self
	 */
	public function withStop( $stop ): self {
		$this->customStopList = $stop;
		$this->stopName = "{$this->langName}_stop";
		return $this;
	}

	/**
	 * Configure a "<lang>_override" stemmer override filter.
	 *
	 * @param string[] $rules stemmer override rules
	 * @return self
	 */
	public function withStemmerOverride( array $rules ): self {
		$this->overrideRules = $rules;
		$this->overrideName = "{$this->langName}_override";
		return $this;
	}

	/**********
	 * The with.., omit.., and insert.. methods below are only used by unpacked analyzers
	 */

	/** @return self */
	public function withUnpackedAnalyzer(): self {
		$this->unpacked = true;
		return $this;
	}

	/**
	 * Guard for the unpacked-only configuration methods: throws unless
	 * withUnpackedAnalyzer() has been called. The name of the calling method is
	 * taken from the backtrace so the message points at the offending call.
	 *
	 * @throws \ConfigException
	 */
	private function unpackedCheck(): void {
		if ( $this->unpacked ) {
			return;
		}
		// Frame 0 is this method, frame 1 is the public method that called it;
		// arguments are not needed, so keep the backtrace as small as possible.
		$caller = debug_backtrace( DEBUG_BACKTRACE_IGNORE_ARGS, 2 )[1]['function'];
		throw new \ConfigException( "$caller() is only compatible with unpacked analyzers; " .
			"call withUnpackedAnalyzer() before calling $caller()." );
	}

	/**
	 * @param mixed $beforeFilter specific filter to insert $filters before; use APPEND
	 *  or PREPEND to always add to beginning or end of the list
	 * @param string[] $filterList list of additional filters to insert
	 * @return self
	 */
	public function insertFiltersBefore( $beforeFilter, array $filterList ): self {
		$this->unpackedCheck();
		$this->insertFilterList[] = [ $beforeFilter => $filterList ];
		return $this;
	}

	/** @return self */
	public function omitDottedI(): self {
		$this->unpackedCheck();
		$this->dottedIFix = '';
		return $this;
	}

	/** @return self */
	public function withWordBreakHelper(): self {
		$this->unpackedCheck();
		$this->wordBreakHelper = 'word_break_helper';
		return $this;
	}

	/** @return self */
	public function withAggressiveSplitting(): self {
		$this->unpackedCheck();
		$this->aggressiveSplitting = 'aggressive_splitting';
		return $this;
	}

	/** @return self */
	public function withLightStemmer(): self {
		$this->unpackedCheck();
		$this->stemmerName = "light_{$this->langName}";
		return $this;
	}

	/** @return self */
	public function omitStemmer(): self {
		$this->unpackedCheck();
		$this->useStemmer = false;
		return $this;
	}

	/** @return self */
	public function withAsciifoldingPreserve(): self {
		$this->unpackedCheck();
		$this->asciifolding = 'asciifolding_preserve';
		return $this;
	}

	/** @return self */
	public function omitAsciifolding(): self {
		$this->unpackedCheck();
		$this->asciifolding = '';
		return $this;
	}

	/** @return self */
	public function withRemoveEmpty(): self {
		$this->unpackedCheck();
		$this->removeEmpty = 'remove_empty';
		return $this;
	}

	/** @return self */
	public function withDecimalDigit(): self {
		$this->unpackedCheck();
		$this->decimalDigit = 'decimal_digit';
		return $this;
	}

	/**
	 * Create a basic analyzer with support for various common options
	 *
	 * Can create various filters and character filters as specified.
	 * For basic (not unpacked) analyzers none are automatically added to
	 * the char_filter or filter list because the best order for these
	 * basic analyzers depends on the details of various third-party plugins.
	 *
	 *   type: custom
	 *   tokenizer: as per $this->tokenizer ('standard' unless overridden)
	 *   char_filter: as per $this->charFilters
	 *   filter: as per $this->filters
	 *
	 * This method does not modify the builder, so it may be called repeatedly
	 * (e.g. against several configs) with identical results.
	 *
	 * @param mixed[] $config to be updated
	 * @return mixed[] updated config
	 */
	public function build( array $config ): array {
		// Work on local copies of everything the unpacked branch adjusts; writing
		// back to $this would make a second build() emit duplicated filters.
		$langStem = "{$this->langName}_stemmer";
		$charFilters = $this->charFilters;
		$filters = $this->filters;
		$stopList = $this->customStopList;
		$stopName = $this->stopName;
		$stemmerName = $this->stemmerName;

		if ( $this->unpacked ) {
			// Analyzer config for char_filter and filter will be in the order below,
			// if the relevant filters are enabled/configured.
			//
			// type: custom
			// tokenizer: standard (unless overridden)
			// char_filter: dotted_I_fix, lang_charfilter, lang_numbers, word_break_helper
			// filter: elision, aggressive_splitting, lowercase, decimal_digit, stopwords,
			//         stemmer_override, stemmer, asciifolding, remove_empty
			if ( $this->useStemmer ) {
				$stemmerName = $stemmerName ?? $this->langName;
			} else {
				$langStem = '';
			}

			// unpacked analyzers always get a stop filter; default to the
			// pre-defined list for the language
			$stopList = $stopList ?? "_{$this->langName}_";
			$stopName = "{$this->langName}_stop";

			// build up the char_filter list--everything is optional--and remove
			// 'falsey' (== not configured) values from the list
			$charFilters = array_values( array_filter( array_merge(
				$charFilters ?? [],
				[
					$this->dottedIFix,
					$this->charMapName,
					$this->numCharMapName,
					$this->wordBreakHelper,
				]
			) ) );

			// build up the filter list--lowercase, stop, and stem are required--and
			// remove 'falsey' (== not configured) values from the list
			$filters = array_values( array_filter( array_merge(
				$filters ?? [],
				[
					$this->elisionName,
					$this->aggressiveSplitting,
					'lowercase',
					$this->decimalDigit,
					$stopName,
					$this->overrideName,
					$langStem,
					$this->asciifolding,
					$this->removeEmpty,
				]
			) ) );

			$filters = $this->applyFilterInsertions( $filters );
		}

		$config[ 'analyzer' ][ $this->analyzerName ] = [
			'type' => 'custom',
			'tokenizer' => $this->tokenizer,
		];

		if ( $this->charMapName ) {
			$config[ 'char_filter' ][ $this->charMapName ] =
				self::mappingCharFilter( $this->charMap );
		}

		if ( $this->numCharMapName ) {
			$config[ 'char_filter' ][ $this->numCharMapName ] =
				self::numberCharFilter( $this->langZero );
		}

		if ( $this->elisionName ) {
			$config[ 'filter' ][ $this->elisionName ] =
				self::elisionFilter( $this->elisionArticles, $this->elisionArticleCase );
		}

		if ( $this->langLowercase ) {
			$config[ 'filter' ][ 'lowercase' ][ 'language' ] = $this->langName;
		}

		if ( $this->overrideName ) {
			$config[ 'filter' ][ $this->overrideName ] =
				self::overrideFilter( $this->overrideRules );
		}

		if ( $stopName ) {
			$config[ 'filter' ][ $stopName ] = self::stopFilterFromList( $stopList );
		}

		if ( $charFilters ) {
			$config[ 'analyzer' ][ $this->analyzerName ][ 'char_filter' ] = $charFilters;
		}

		if ( $filters ) {
			$config[ 'analyzer' ][ $this->analyzerName ][ 'filter' ] = $filters;
		}

		if ( $stemmerName && $this->useStemmer ) {
			$config[ 'filter' ][ $langStem ] = self::stemmerFilter( $stemmerName );
		}

		return $config;
	}

	/**
	 * Apply the patches registered through insertFiltersBefore(), in the order
	 * they were registered, to a filter list.
	 *
	 * APPEND and PREPEND add to the end or beginning regardless of other filters.
	 * Otherwise the filters are inserted before the named filter; if no such
	 * filter exists array_search() returns false and the filters are prepended,
	 * but you shouldn't count on that.
	 *
	 * @param string[] $filters current filter list
	 * @return string[] filter list with all insertions applied
	 */
	private function applyFilterInsertions( array $filters ): array {
		foreach ( $this->insertFilterList as $filterPatch ) {
			foreach ( $filterPatch as $beforeFilter => $filterList ) {
				// strict comparisons: a filter *name* must never be mistaken for
				// one of the integer APPEND / PREPEND constants
				if ( $beforeFilter === self::APPEND ) {
					$filters = array_merge( $filters, $filterList );
				} elseif ( $beforeFilter === self::PREPEND ) {
					$filters = array_merge( $filterList, $filters );
				} else {
					$idx = array_search( (string)$beforeFilter, $filters, true );
					array_splice( $filters, $idx === false ? 0 : $idx, 0, $filterList );
				}
			}
		}
		return $filters;
	}

	/**
	 * Create a pattern_replace filter/char_filter with the mappings provided.
	 *
	 * @param string $pat
	 * @param string $repl
	 * @return mixed[] filter
	 */
	public static function patternFilter( string $pat, string $repl ): array {
		return [ 'type' => 'pattern_replace', 'pattern' => $pat, 'replacement' => $repl ];
	}

	/**
	 * Create a mapping character filter with the mappings provided.
	 *
	 * @param string[] $mappings
	 * @return mixed[] character filter
	 */
	public static function mappingCharFilter( array $mappings ): array {
		return [ 'type' => 'mapping', 'mappings' => $mappings ];
	}

	/**
	 * Create a character filter that maps non-Arabic digits (e.g., ០-៩ or ０-９) to
	 * Arabic digits (0-9). Since they are usually all in a row, we just need the
	 * starting digit (equal to 0)
	 *
	 * @param int $langZero
	 * @return mixed[] character filter
	 */
	public static function numberCharFilter( int $langZero ): array {
		$numMap = [];
		for ( $i = 0; $i <= 9; $i++ ) {
			$numMap[] = sprintf( '\\u%04x=>%d', $langZero + $i, $i );
		}
		return self::mappingCharFilter( $numMap );
	}

	/**
	 * Create an elision filter with the "articles" provided; $case determines whether
	 * stripping is case sensitive or not
	 *
	 * @param string[] $articles
	 * @param bool $case
	 * @return mixed[] token filter
	 */
	public static function elisionFilter( array $articles, bool $case = true ): array {
		return [ 'type' => 'elision', 'articles_case' => $case, 'articles' => $articles ];
	}

	/**
	 * Create a stop word filter with the provided config. The config can be an array
	 * of stop words, or a string like _french_ that refers to a pre-defined list.
	 *
	 * @param mixed $stopwords
	 * @param bool|null $ignoreCase only emitted as 'ignore_case' when not null
	 * @return mixed[] token filter
	 */
	public static function stopFilterFromList( $stopwords, ?bool $ignoreCase = null ): array {
		$retArray = [ 'type' => 'stop', 'stopwords' => $stopwords ];
		if ( $ignoreCase !== null ) {
			$retArray['ignore_case'] = $ignoreCase;
		}
		return $retArray;
	}

	/**
	 * Create a stemming override filter with the rules provided
	 *
	 * @param string[] $rules
	 * @return mixed[] token filter
	 */
	private static function overrideFilter( array $rules ): array {
		return [ 'type' => 'stemmer_override', 'rules' => $rules ];
	}

	/**
	 * Create a stemmer filter with the provided config.
	 *
	 * @param string $stemmer
	 * @return mixed[] token filter
	 */
	public static function stemmerFilter( string $stemmer ): array {
		return [ 'type' => 'stemmer', 'language' => $stemmer ];
	}

}