Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
153 / 153 |
|
100.00% |
33 / 33 |
CRAP | |
100.00% |
1 / 1 |
AnalyzerBuilder | |
100.00% |
153 / 153 |
|
100.00% |
33 / 33 |
59 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
withCharFilters | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
withTokenizer | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
withFilters | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
withCharMap | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
withLimitedCharMap | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
withReversedNumberCharFilter | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
withNumberCharFilter | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
withElision | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
withLangLowercase | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
withStop | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withExtraStop | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
withExtraStemmer | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withStemmerOverride | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withUnpackedAnalyzer | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
unpackedCheck | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
insertFiltersBefore | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
appendFilters | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
prependFilters | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withLightStemmer | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
omitStemmer | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withAsciifoldingPreserve | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
omitAsciifolding | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withRemoveEmpty | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withDecimalDigit | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
build | |
100.00% |
64 / 64 |
|
100.00% |
1 / 1 |
20 | |||
patternFilter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
mappingCharFilter | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
numberCharFilter | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
elisionFilter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
stopFilterFromList | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
overrideFilter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
stemmerFilter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Maintenance; |
4 | |
5 | use MediaWiki\Config\ConfigException; |
6 | |
7 | /** |
8 | * Builds one elasticsearch analyzer to add to an analysis config array. |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License as published by |
12 | * the Free Software Foundation; either version 2 of the License, or |
13 | * (at your option) any later version. |
14 | * |
15 | * This program is distributed in the hope that it will be useful, |
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
18 | * GNU General Public License for more details. |
19 | * |
20 | * You should have received a copy of the GNU General Public License along |
21 | * with this program; if not, write to the Free Software Foundation, Inc., |
22 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
23 | * http://www.gnu.org/copyleft/gpl.html |
24 | */ |
class AnalyzerBuilder {
	/**
	 * Indicate that filters should be automatically appended or prepended, rather
	 * than inserted before a given filter.
	 */
	public const APPEND = 1;
	public const PREPEND = 2;

	/** @var string */
	private $langName;

	/** @var string */
	private $analyzerName;

	/** @var string[]|null list of char_filters */
	private $charFilters;

	/** @var string|null name of tokenizer */
	private $tokenizer = 'standard';

	/** @var string[]|null list of filters */
	private $filters;

	/** @var string[]|null list of lang-specific character filter mappings */
	private $charMap;

	/** @var bool use a limited_mapping char filter instead of a mapping one? */
	private $charMapLimited = false;

	/** @var string|null */
	private $charMapName;

	/** @var int|null Unicode value for script-specific zero */
	private $langZero;

	/** @var bool should langZero's map be reversed (Arabic to non-Arabic)? */
	private $numCharMapReversed = false;

	/** @var string|null name of char filter mapping digits (using $langZero) */
	private $numCharMapName;

	/** @var bool is elision processing case INsensitive? */
	private $elisionArticleCase = true;

	/** @var string[]|null list of articles to elide */
	private $elisionArticles;

	/** @var string|null */
	private $elisionName;

	/** @var string|null */
	private $langLowercase;

	/** @var mixed|null stopword _list_ or array of stopwords */
	private $customStopList;

	/** @var string|null */
	private $stopName;

	/** @var mixed|null stopword _list_ or array of stopwords */
	private $extraStopList;

	/** @var string|null */
	private $extraStopName;

	/** @var bool|null */
	private $extraStopIgnoreCase;

	/** @var string|null */
	private $extraStemmerLang;

	/** @var string|null */
	private $extraStemmerName;

	/** @var string|string[]|null a single stemmer override rule, or a list of rules */
	private $overrideRules;

	/** @var string|null */
	private $overrideName;

	/**********
	 * The properties below are only used by unpacked analyzers
	 */

	/** @var bool */
	private $unpacked = false;

	/**
	 * Ordered list of single-entry patches; each maps a filter name (or APPEND /
	 * PREPEND) to the filters to insert at that position.
	 *
	 * @var array<int, array<int|string, string[]>>
	 */
	private $insertFilterList = [];

	/** @var bool */
	private $useStemmer = true;

	/** @var string|null */
	private $stemmerLang;

	/** @var string|null asciifolding flavor to use (null or '' for none) */
	private $asciifolding = 'asciifolding';

	/** @var string|null */
	private $removeEmpty;

	/** @var string|null */
	private $decimalDigit;

	/**
	 * @param string $langName
	 * @param string $analyzerName (default to 'text')
	 */
	public function __construct( string $langName, string $analyzerName = 'text' ) {
		$this->langName = $langName;
		$this->analyzerName = $analyzerName;
	}

	/**
	 * @param string[] $charFilters
	 * @return self
	 */
	public function withCharFilters( array $charFilters ): self {
		$this->charFilters = $charFilters;
		return $this;
	}

	/**
	 * @param string $tokenizer
	 * @return self
	 */
	public function withTokenizer( string $tokenizer ): self {
		$this->tokenizer = $tokenizer;
		return $this;
	}

	/**
	 * @param string[] $filters
	 * @return self
	 */
	public function withFilters( array $filters ): self {
		$this->filters = $filters;
		return $this;
	}

	/**
	 * Configure a language-specific mapping character filter.
	 *
	 * @param string[] $mappings
	 * @param string|null $name defaults to "{langName}_charfilter"
	 * @param bool $limited build a limited_mapping filter rather than a mapping filter
	 * @return self
	 */
	public function withCharMap( array $mappings, ?string $name = null, bool $limited = false ): self {
		$this->charMap = $mappings;
		$this->charMapName = $name ?? "{$this->langName}_charfilter";
		// Honor the caller's choice; withLimitedCharMap() depends on this.
		$this->charMapLimited = $limited;
		return $this;
	}

	/**
	 * Same as withCharMap(), but always builds a limited_mapping filter.
	 *
	 * @param string[] $mappings
	 * @param string|null $name
	 * @return self
	 */
	public function withLimitedCharMap( array $mappings, ?string $name = null ): self {
		return $this->withCharMap( $mappings, $name, true );
	}

	/**
	 * @param int $langZero
	 * @param string|null $name
	 * @return self
	 */
	public function withReversedNumberCharFilter( int $langZero, ?string $name = null ): self {
		return $this->withNumberCharFilter( $langZero, $name, true );
	}

	/**
	 * @param int $langZero
	 * @param string|null $name defaults to "{langName}_numbers", or
	 *  "{langName}_reversed_numbers" when $reversed is set
	 * @param bool $reversed reverse the mapping from Arabic to non-Arabic
	 * @return self
	 */
	public function withNumberCharFilter( int $langZero, ?string $name = null, bool $reversed = false ): self {
		$defName = $reversed ? "{$this->langName}_reversed_numbers" : "{$this->langName}_numbers";
		$this->langZero = $langZero;
		$this->numCharMapName = $name ?? $defName;
		$this->numCharMapReversed = $reversed;
		return $this;
	}

	/**
	 * @param string[] $articles "articles" to be elided
	 * @param bool $articleCase whether elision is case insensitive
	 * @return self
	 */
	public function withElision( array $articles, bool $articleCase = true ): self {
		$this->elisionArticleCase = $articleCase;
		$this->elisionArticles = $articles;
		$this->elisionName = "{$this->langName}_elision";
		return $this;
	}

	/**
	 * @param string|null $name language for the lowercase filter; null or an empty
	 *  string falls back to the builder's language name
	 * @return self
	 */
	public function withLangLowercase( ?string $name = null ): self {
		$this->langLowercase = $name ?: $this->langName;
		return $this;
	}

	/**
	 * @param mixed $stop pre-defined list like _french_ or an array of stopwords
	 * @param string|null $name defaults to "{langName}_stop"
	 * @return self
	 */
	public function withStop( $stop, ?string $name = null ): self {
		$this->customStopList = $stop;
		$this->stopName = $name ?? "{$this->langName}_stop";
		return $this;
	}

	/**
	 * Only compatible with unpacked analyzers, because the extra stop filter is
	 * registered through insertFiltersBefore().
	 *
	 * @param mixed $stop pre-defined list like _french_ or an array of stopwords
	 * @param string $name
	 * @param mixed $beforeFilter filter to insert extra stop before
	 * @param bool|null $ignoreCase
	 * @return self
	 */
	public function withExtraStop( $stop, string $name, $beforeFilter = self::APPEND,
		?bool $ignoreCase = null
	): self {
		$this->extraStopList = $stop;
		$this->extraStopName = $name;
		$this->extraStopIgnoreCase = $ignoreCase;
		$this->insertFiltersBefore( $beforeFilter, [ $name ] );
		return $this;
	}

	/**
	 * @param string $lang
	 * @param string|null $name defaults to $lang
	 * @return self
	 */
	public function withExtraStemmer( string $lang, ?string $name = null ): self {
		$this->extraStemmerLang = $lang;
		$this->extraStemmerName = $name ?? $lang;
		return $this;
	}

	/**
	 * Rules can be a single rule string, or an array of rules
	 *
	 * @param mixed $rules stemmer override rules
	 * @param string|null $name defaults to "{langName}_override"
	 * @return self
	 */
	public function withStemmerOverride( $rules, ?string $name = null ): self {
		$this->overrideRules = $rules;
		$this->overrideName = $name ?? "{$this->langName}_override";
		return $this;
	}

	/**********
	 * The with.., omit.., and insert.. methods below are only used by unpacked analyzers
	 */

	/** @return self */
	public function withUnpackedAnalyzer(): self {
		$this->unpacked = true;
		return $this;
	}

	/**
	 * Guard for methods that only make sense for unpacked analyzers.
	 *
	 * @throws ConfigException if withUnpackedAnalyzer() has not been called
	 */
	private function unpackedCheck(): void {
		if ( !$this->unpacked ) {
			// Frame 0 is this method; frame 1 is the method that called it. Only
			// the name is needed, so skip args and deeper frames.
			$caller = debug_backtrace( DEBUG_BACKTRACE_IGNORE_ARGS, 2 )[1]['function'];
			throw new ConfigException( "$caller() is only compatible with unpacked analyzers; " .
				"call withUnpackedAnalyzer() before calling $caller()." );
		}
	}

	/**
	 * @param mixed $beforeFilter specific filter to insert $filters before; use APPEND
	 *  or PREPEND to always add to beginning or end of the list
	 * @param string[] $filterList list of additional filters to insert
	 * @return self
	 */
	public function insertFiltersBefore( $beforeFilter, array $filterList ): self {
		$this->unpackedCheck();
		$this->insertFilterList[] = [ $beforeFilter => $filterList ];
		return $this;
	}

	/**
	 * @param string[] $filterList list of additional filters to append
	 * @return self
	 */
	public function appendFilters( array $filterList ): self {
		// Checked here as well so the error message names this method.
		$this->unpackedCheck();
		$this->insertFiltersBefore( self::APPEND, $filterList );
		return $this;
	}

	/**
	 * @param string[] $filterList list of additional filters to prepend
	 * @return self
	 */
	public function prependFilters( array $filterList ): self {
		// Checked here as well so the error message names this method.
		$this->unpackedCheck();
		$this->insertFiltersBefore( self::PREPEND, $filterList );
		return $this;
	}

	/** @return self */
	public function withLightStemmer(): self {
		$this->unpackedCheck();
		$this->stemmerLang = "light_{$this->langName}";
		return $this;
	}

	/** @return self */
	public function omitStemmer(): self {
		$this->unpackedCheck();
		$this->useStemmer = false;
		return $this;
	}

	/** @return self */
	public function withAsciifoldingPreserve(): self {
		$this->unpackedCheck();
		$this->asciifolding = 'asciifolding_preserve';
		return $this;
	}

	/** @return self */
	public function omitAsciifolding(): self {
		$this->unpackedCheck();
		$this->asciifolding = '';
		return $this;
	}

	/** @return self */
	public function withRemoveEmpty(): self {
		$this->unpackedCheck();
		$this->removeEmpty = 'remove_empty';
		return $this;
	}

	/** @return self */
	public function withDecimalDigit(): self {
		$this->unpackedCheck();
		$this->decimalDigit = 'decimal_digit';
		return $this;
	}

	/**
	 * Create a basic analyzer with support for various common options
	 *
	 * Can create various filters and character filters as specified.
	 * For analyzers that are not unpacked, none are automatically added to the
	 * char_filter or filter list because the best order for these basic
	 * analyzers depends on the details of various third-party plugins.
	 *
	 * type: custom
	 * tokenizer: as per $this->tokenizer ('standard' unless changed)
	 * char_filter: as per $this->charFilters
	 * filter: as per $this->filters
	 *
	 * The builder itself is not modified, so build() may be called repeatedly
	 * with identical results.
	 *
	 * @param mixed[] $config to be updated
	 * @return mixed[] updated config
	 */
	public function build( array $config ): array {
		$langStem = "{$this->langName}_stemmer";

		// Work on local copies so repeated build() calls do not accumulate filters.
		$charFilters = $this->charFilters;
		$filters = $this->filters;
		$stopList = $this->customStopList;
		$stopName = $this->stopName;
		$stemmerLang = null;

		if ( $this->unpacked ) {
			// Analyzer config for char_filter and filter will be in the order below,
			// if the relevant filters are enabled/configured.
			//
			// type: custom
			// tokenizer: as configured
			// char_filter: (withCharFilters list), lang_charfilter, lang_numbers
			// filter: (withFilters list), elision, lowercase, decimal_digit, stop,
			//         stemmer_override, stemmer, asciifolding, remove_empty
			if ( $this->useStemmer ) {
				$stemmerLang = $this->stemmerLang ?? $this->langName;
			} else {
				$langStem = '';
			}

			// A stop filter is required; fall back to the language's pre-defined
			// list and the default name, keeping any custom list/name from withStop().
			$stopList ??= "_{$this->langName}_";
			$stopName ??= "{$this->langName}_stop";

			// build up the char_filter list--everything is optional
			$charFilters[] = $this->charMapName;
			$charFilters[] = $this->numCharMapName;

			// remove 'falsey' (== not configured) values from the list
			$charFilters = array_values( array_filter( $charFilters ) );

			// build up the filter list--lowercase, stop, and stem are required
			$filters[] = $this->elisionName;
			$filters[] = 'lowercase';
			$filters[] = $this->decimalDigit;
			$filters[] = $stopName;
			$filters[] = $this->overrideName;
			$filters[] = $langStem;
			$filters[] = $this->asciifolding;
			$filters[] = $this->removeEmpty;

			// remove 'falsey' (== not configured) values from the list
			$filters = array_values( array_filter( $filters ) );

			// Iterate over all lists of sets of filters to insert, in order, and insert
			// them before the specified filter. If no such filter exists, the filters
			// are prepended, but you shouldn't count on that. APPEND and PREPEND
			// constants can be used to add to beginning or end, regardless of other
			// filters.
			foreach ( $this->insertFilterList as $filterPatch ) {
				foreach ( $filterPatch as $beforeFilter => $filterList ) {
					// Strict comparison: a filter name must never be mistaken for
					// one of the integer constants.
					if ( $beforeFilter === self::APPEND ) {
						$filters = array_merge( $filters, $filterList );
					} elseif ( $beforeFilter === self::PREPEND ) {
						$filters = array_merge( $filterList, $filters );
					} else {
						// Array keys that look like integers are stored as ints, so
						// cast back to string before the strict search.
						$idx = array_search( (string)$beforeFilter, $filters, true );
						array_splice( $filters, $idx === false ? 0 : $idx, 0, $filterList );
					}
				}
			}
		}

		$config[ 'analyzer' ][ $this->analyzerName ] = [
			'type' => 'custom',
			'tokenizer' => $this->tokenizer,
		];

		if ( $this->charMapName ) {
			$config[ 'char_filter' ][ $this->charMapName ] =
				self::mappingCharFilter( $this->charMap, $this->charMapLimited );
		}

		if ( $this->numCharMapName ) {
			$config[ 'char_filter' ][ $this->numCharMapName ] =
				self::numberCharFilter( $this->langZero, $this->numCharMapReversed );
		}

		if ( $this->elisionName ) {
			$config[ 'filter' ][ $this->elisionName ] =
				self::elisionFilter( $this->elisionArticles, $this->elisionArticleCase );
		}

		if ( $this->langLowercase ) {
			// Only the language is set here; any other keys already present on the
			// 'lowercase' filter in $config are left untouched.
			$config[ 'filter' ][ 'lowercase' ][ 'language' ] = $this->langLowercase;
		}

		if ( $this->overrideName ) {
			$config[ 'filter' ][ $this->overrideName ] =
				$this->overrideFilter( $this->overrideRules );
		}

		if ( $stopName ) {
			$config[ 'filter' ][ $stopName ] = self::stopFilterFromList( $stopList );
		}

		if ( $this->extraStopName ) {
			$config[ 'filter' ][ $this->extraStopName ] =
				self::stopFilterFromList( $this->extraStopList, $this->extraStopIgnoreCase );
		}

		if ( $charFilters ) {
			$config[ 'analyzer' ][ $this->analyzerName ][ 'char_filter' ] = $charFilters;
		}

		if ( $filters ) {
			$config[ 'analyzer' ][ $this->analyzerName ][ 'filter' ] = $filters;
		}

		// $stemmerLang is only set for unpacked analyzers that keep their stemmer.
		if ( $stemmerLang ) {
			$config[ 'filter' ][ $langStem ] = self::stemmerFilter( $stemmerLang );
		}

		if ( $this->extraStemmerName ) {
			$config[ 'filter' ][ $this->extraStemmerName ] =
				self::stemmerFilter( $this->extraStemmerLang );
		}

		return $config;
	}

	/**
	 * Create a pattern_replace filter/char_filter with the mappings provided.
	 *
	 * @param string $pat
	 * @param string $repl
	 * @return mixed[] filter
	 */
	public static function patternFilter( string $pat, string $repl = '' ): array {
		return [ 'type' => 'pattern_replace', 'pattern' => $pat, 'replacement' => $repl ];
	}

	/**
	 * Create a mapping or limited_mapping character filter with the mappings provided.
	 *
	 * @param string[] $mappings
	 * @param bool $limited
	 * @return mixed[] character filter
	 */
	public static function mappingCharFilter( array $mappings, bool $limited ): array {
		$type = $limited ? 'limited_mapping' : 'mapping';
		return [ 'type' => $type, 'mappings' => $mappings ];
	}

	/**
	 * Create a character filter that maps non-Arabic digits (e.g., ០-៩ or ０-９) to
	 * Arabic digits (0-9). Since they are usually all in a row, we just need the
	 * starting digit (equal to 0).
	 *
	 * Optionally reverse the mapping from Arabic to non-Arabic. For example, the ICU
	 * tokenizer works better on tokenizing Thai digits in Thai text than it does on
	 * Arabic digits.
	 *
	 * The result is always a limited_mapping filter.
	 *
	 * @param int $langZero
	 * @param bool $reversed reverse the mapping from Arabic to non-Arabic
	 * @return mixed[] character filter
	 */
	public static function numberCharFilter( int $langZero, bool $reversed = false ): array {
		$numMap = [];
		for ( $i = 0; $i <= 9; $i++ ) {
			if ( $reversed ) {
				$numMap[] = sprintf( '%d=>\\u%04x', $i, $langZero + $i );
			} else {
				$numMap[] = sprintf( '\\u%04x=>%d', $langZero + $i, $i );
			}
		}
		return self::mappingCharFilter( $numMap, true );
	}

	/**
	 * Create an elision filter with the "articles" provided; $case determines whether
	 * stripping is case sensitive or not
	 *
	 * @param string[] $articles
	 * @param bool $case
	 * @return mixed[] token filter
	 */
	public static function elisionFilter( array $articles, bool $case = true ): array {
		return [ 'type' => 'elision', 'articles_case' => $case, 'articles' => $articles ];
	}

	/**
	 * Create a stop word filter with the provided config. The config can be an array
	 * of stop words, or a string like _french_ that refers to a pre-defined list.
	 *
	 * @param mixed $stopwords
	 * @param bool|null $ignoreCase only written to the filter when not null
	 * @return mixed[] token filter
	 */
	public static function stopFilterFromList( $stopwords, ?bool $ignoreCase = null ): array {
		$retArray = [ 'type' => 'stop', 'stopwords' => $stopwords ];
		if ( $ignoreCase !== null ) {
			$retArray['ignore_case'] = $ignoreCase;
		}
		return $retArray;
	}

	/**
	 * Create a stemming override filter with the rules provided, which can be a string
	 * with one rule or an array of such rules
	 *
	 * @param mixed $rules
	 * @return mixed[] token filter
	 */
	private function overrideFilter( $rules ): array {
		return [ 'type' => 'stemmer_override', 'rules' => $rules ];
	}

	/**
	 * Create a stemmer filter with the provided config.
	 *
	 * @param string $stemmer
	 * @return mixed[] token filter
	 */
	public static function stemmerFilter( string $stemmer ): array {
		return [ 'type' => 'stemmer', 'language' => $stemmer ];
	}

}