Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
162 / 162 |
|
100.00% |
34 / 34 |
CRAP | |
100.00% |
1 / 1 |
AnalyzerBuilder | |
100.00% |
162 / 162 |
|
100.00% |
34 / 34 |
64 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
withLangName | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
withCharFilters | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
withTokenizer | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
withFilters | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
withCharMap | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
withLimitedCharMap | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
withReversedNumberCharFilter | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
withNumberCharFilter | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
withElision | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
withLangLowercase | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
withStop | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withExtraStop | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
withExtraStemmer | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withStemmerOverride | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withUnpackedAnalyzer | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
unpackedCheck | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
insertFiltersBefore | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
appendFilters | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
prependFilters | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withLightStemmer | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
omitStemmer | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withAsciifolding | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
omitFolding | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withRemoveEmpty | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
withDecimalDigit | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
build | |
100.00% |
71 / 71 |
|
100.00% |
1 / 1 |
24 | |||
patternFilter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
mappingCharFilter | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
numberCharFilter | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
elisionFilter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
stopFilterFromList | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
overrideFilter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
stemmerFilter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Maintenance; |
4 | |
5 | use MediaWiki\Config\ConfigException; |
6 | |
7 | /** |
8 | * Builds one elasticsearch analyzer to add to an analysis config array. |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify |
11 | * it under the terms of the GNU General Public License as published by |
12 | * the Free Software Foundation; either version 2 of the License, or |
13 | * (at your option) any later version. |
14 | * |
15 | * This program is distributed in the hope that it will be useful, |
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
18 | * GNU General Public License for more details. |
19 | * |
20 | * You should have received a copy of the GNU General Public License along |
21 | * with this program; if not, write to the Free Software Foundation, Inc., |
22 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
23 | * http://www.gnu.org/copyleft/gpl.html |
24 | */ |
25 | class AnalyzerBuilder { |
26 | /** |
27 | * Indicate that filters should be automatically appended or prepended, rather |
28 | * than inserted before a given filter. |
29 | */ |
30 | public const APPEND = 1; |
31 | public const PREPEND = 2; |
32 | |
33 | /** @var string */ |
34 | private $langName; |
35 | |
36 | /** @var string */ |
37 | private $analyzerName = 'text'; |
38 | |
39 | /** @var bool */ |
40 | private $icuEnabled; |
41 | |
42 | /** @var string[]|null list of char_filters */ |
43 | private $charFilters; |
44 | |
45 | /** @var string|null name of tokenizer */ |
46 | private $tokenizer = 'standard'; |
47 | |
48 | /** @var string[]|null list of filters */ |
49 | private $filters; |
50 | |
51 | /** @var string[]|null list of lang-specific character filter mappings */ |
52 | private $charMap; |
53 | |
54 | /** @var bool */ |
55 | private $charMapLimited = false; |
56 | |
57 | /** @var string|null */ |
58 | private $charMapName; |
59 | |
60 | /** @var int|null Unicode value for script-specific zero */ |
61 | private $langZero; |
62 | |
63 | /** @var bool should langZero's map be reversed (Arabic to non-Arabic)? */ |
64 | private $numCharMapReversed = false; |
65 | |
66 | /** @var string|null name of char filter mapping digits (using $langZero) */ |
67 | private $numCharMapName; |
68 | |
69 | /** @var bool is elision processing case INsensitive? */ |
70 | private $elisionArticleCase = true; |
71 | |
72 | /** @var string[]|null list of articles to elide */ |
73 | private $elisionArticles; |
74 | |
75 | /** @var string|null */ |
76 | private $elisionName; |
77 | |
78 | /** @var string|null */ |
79 | private $langLowercase; |
80 | |
81 | /** @var mixed|null stopword _list_ or array of stopwords */ |
82 | private $customStopList; |
83 | |
84 | /** @var string|null */ |
85 | private $stopName; |
86 | |
87 | /** @var mixed|null stopword _list_ or array of stopwords */ |
88 | private $extraStopList; |
89 | |
90 | /** @var string|null */ |
91 | private $extraStopName; |
92 | |
93 | /** @var bool|null */ |
94 | private $extraStopIgnoreCase; |
95 | |
96 | /** @var string|null */ |
97 | private $extraStemmerLang; |
98 | |
99 | /** @var string|null */ |
100 | private $extraStemmerName; |
101 | |
102 | /** @var string[]|null list of stemmer override rules */ |
103 | private $overrideRules; |
104 | |
105 | /** @var string|null */ |
106 | private $overrideName; |
107 | |
108 | /********** |
109 | * The properties below are only used by unpacked analyzers |
110 | */ |
111 | |
112 | /** @var bool */ |
113 | private $unpacked = false; |
114 | |
115 | /** @var array<int, array<string, string[]>> */ |
116 | private $insertFilterList = []; |
117 | |
118 | /** @var bool */ |
119 | private $useStemmer = true; |
120 | |
121 | /** @var string|null */ |
122 | private $stemmerLang; |
123 | |
124 | /** @var string|null folding flavor to use (null for none) */ |
125 | private $folding = 'icu_folding'; |
126 | |
127 | /** @var string|null */ |
128 | private $removeEmpty; |
129 | |
130 | /** @var string|null */ |
131 | private $decimalDigit; |
132 | |
/**
 * Start a builder for one language.
 *
 * @param string $langName language name; used as the prefix of the default
 *  names given to filters and char_filters
 * @param bool $icuEnabled whether icu_folding may be kept in the filter list
 *  when the analyzer is built
 */
public function __construct( string $langName, bool $icuEnabled = false ) {
	$this->icuEnabled = $icuEnabled;
	$this->langName = $langName;
}
141 | |
/**
 * Replace the language name. Default filter names are derived from the
 * language name at the time each with...() method is called, so names that
 * were already generated are not updated.
 *
 * @param string $langName
 * @return self this builder, for chaining
 */
public function withLangName( string $langName ): self {
	$this->langName = $langName;

	return $this;
}
150 | |
/**
 * Set the analyzer's char_filter list, replacing any previous list.
 *
 * @param string[] $charFilters names of char_filters, in order
 * @return self this builder, for chaining
 */
public function withCharFilters( array $charFilters ): self {
	$this->charFilters = $charFilters;

	return $this;
}
159 | |
/**
 * Choose the tokenizer for the analyzer ('standard' if never called).
 *
 * @param string $tokenizer tokenizer name
 * @return self this builder, for chaining
 */
public function withTokenizer( string $tokenizer ): self {
	$this->tokenizer = $tokenizer;

	return $this;
}
168 | |
/**
 * Set the analyzer's token filter list, replacing any previous list.
 *
 * @param string[] $filters names of token filters, in order
 * @return self this builder, for chaining
 */
public function withFilters( array $filters ): self {
	$this->filters = $filters;

	return $this;
}
177 | |
/**
 * Configure a language-specific mapping character filter.
 *
 * @param string[] $mappings mapping rules for the char filter
 * @param string|null $name name of the char filter; defaults to
 *  "<langName>_charfilter"
 * @param bool $limited build a limited_mapping filter rather than a
 *  mapping filter
 * @return self this builder, for chaining
 */
public function withCharMap( array $mappings, ?string $name = null, bool $limited = false ): self {
	$this->charMap = $mappings;
	$this->charMapName = $name ?? "{$this->langName}_charfilter";
	// honor the caller's choice; this used to be hard-coded to false, which
	// silently turned every limited char map into a plain mapping filter
	$this->charMapLimited = $limited;
	return $this;
}
190 | |
/**
 * Shorthand for withCharMap() with the $limited flag set.
 *
 * @param string[] $mappings mapping rules for the char filter
 * @param string|null $name name of the char filter, or null for the default
 * @return self this builder, for chaining
 */
public function withLimitedCharMap( array $mappings, ?string $name = null ): self {
	$limited = true;

	return $this->withCharMap( $mappings, $name, $limited );
}
199 | |
/**
 * Shorthand for withNumberCharFilter() with the mapping reversed (Arabic
 * digits to the script-specific digits).
 *
 * @param int $langZero Unicode value of the script-specific zero
 * @param string|null $name name of the char filter, or null for the default
 * @return self this builder, for chaining
 */
public function withReversedNumberCharFilter( int $langZero, ?string $name = null ): self {
	return $this->withNumberCharFilter( $langZero, $name, true );
}
209 | |
/**
 * Configure a char filter that maps between script-specific digits and
 * Arabic digits (0-9).
 *
 * @param int $langZero Unicode value of the script-specific zero
 * @param string|null $name name of the char filter; defaults to
 *  "<langName>_numbers", or "<langName>_reversed_numbers" when reversed
 * @param bool $reversed reverse the mapping from Arabic to non-Arabic
 * @return self this builder, for chaining
 */
public function withNumberCharFilter( int $langZero, ?string $name = null, bool $reversed = false ): self {
	$this->langZero = $langZero;
	$this->numCharMapReversed = $reversed;

	if ( $name === null ) {
		$suffix = $reversed ? 'reversed_numbers' : 'numbers';
		$name = "{$this->langName}_{$suffix}";
	}
	$this->numCharMapName = $name;

	return $this;
}
223 | |
/**
 * Configure an elision filter, named "<langName>_elision".
 *
 * @param string[] $articles "articles" to be elided
 * @param bool $articleCase whether elision is case insensitive
 * @return self this builder, for chaining
 */
public function withElision( array $articles, bool $articleCase = true ): self {
	$this->elisionName = "{$this->langName}_elision";
	$this->elisionArticles = $articles;
	$this->elisionArticleCase = $articleCase;

	return $this;
}
235 | |
/**
 * Request language-specific lowercasing.
 *
 * @param string|null $name language to give the lowercase filter; a null or
 *  otherwise falsy value means the builder's own language name is used
 * @return self this builder, for chaining
 */
public function withLangLowercase( ?string $name = null ): self {
	if ( $name ) {
		$this->langLowercase = $name;
	} else {
		$this->langLowercase = $this->langName;
	}

	return $this;
}
244 | |
/**
 * Configure the stop word filter.
 *
 * @param mixed $stop pre-defined list like _french_ or an array of stopwords
 * @param string|null $name name of the stop filter; defaults to
 *  "<langName>_stop"
 * @return self this builder, for chaining
 */
public function withStop( $stop, ?string $name = null ): self {
	if ( $name === null ) {
		$name = "{$this->langName}_stop";
	}
	$this->stopName = $name;
	$this->customStopList = $stop;

	return $this;
}
255 | |
/**
 * Configure an additional stop word filter and queue it for insertion into
 * the filter list. The insertion goes through insertFiltersBefore(), so
 * this is only usable on unpacked analyzers.
 *
 * @param mixed $stop pre-defined list like _french_ or an array of stopwords
 * @param string $name name of the extra stop filter
 * @param mixed $beforeFilter filter to insert extra stop before; APPEND
 *  (the default) or PREPEND may also be used
 * @param bool|null $ignoreCase value for the filter's ignore_case setting;
 *  null leaves the setting out
 * @return self this builder, for chaining
 */
public function withExtraStop(
	$stop,
	string $name,
	$beforeFilter = self::APPEND,
	?bool $ignoreCase = null
): self {
	$this->extraStopList = $stop;
	$this->extraStopName = $name;
	$this->extraStopIgnoreCase = $ignoreCase;

	$newFilters = [ $name ];
	$this->insertFiltersBefore( $beforeFilter, $newFilters );

	return $this;
}
271 | |
/**
 * Define an additional stemmer filter. Only the filter definition is
 * created; this method does not add it to the filter list.
 *
 * @param string $lang stemmer language
 * @param string|null $name name of the stemmer filter; defaults to $lang
 * @return self this builder, for chaining
 */
public function withExtraStemmer( string $lang, ?string $name = null ): self {
	$this->extraStemmerName = $name ?? $lang;
	$this->extraStemmerLang = $lang;

	return $this;
}
282 | |
/**
 * Configure a stemmer override filter. Rules can be a single rule string,
 * or an array of rules.
 *
 * @param mixed $rules stemmer override rules
 * @param string|null $name name of the override filter; defaults to
 *  "<langName>_override"
 * @return self this builder, for chaining
 */
public function withStemmerOverride( $rules, ?string $name = null ): self {
	if ( $name === null ) {
		$name = "{$this->langName}_override";
	}
	$this->overrideName = $name;
	$this->overrideRules = $rules;

	return $this;
}
295 | |
296 | /********** |
297 | * The with.., omit.., and insert.. methods below are only used by unpacked analyzers |
298 | */ |
299 | |
/**
 * Mark this analyzer as unpacked, which makes build() assemble the
 * char_filter and filter lists itself and enables the unpacked-only methods.
 *
 * @return self this builder, for chaining
 */
public function withUnpackedAnalyzer(): self {
	$this->unpacked = true;

	return $this;
}
305 | |
/**
 * Guard for methods that only make sense for unpacked analyzers.
 *
 * @throws ConfigException naming the calling method, if
 *  withUnpackedAnalyzer() has not been called on this builder
 */
private function unpackedCheck(): void {
	if ( $this->unpacked ) {
		return;
	}
	// Frame 0 is this method and frame 1 is the method that called it; only
	// that one function name is needed, so skip arguments and deeper frames.
	$caller = debug_backtrace( DEBUG_BACKTRACE_IGNORE_ARGS, 2 )[1]['function'];
	// note the space after the semicolon: the two halves of the message used
	// to be joined as "analyzers;call"
	throw new ConfigException( "$caller() is only compatible with unpacked analyzers; " .
		"call withUnpackedAnalyzer() before calling $caller()." );
}
313 | |
/**
 * Queue filters for insertion into the filter list when the analyzer is
 * built. Unpacked analyzers only.
 *
 * @param mixed $beforeFilter specific filter to insert $filters before; use APPEND
 *  or PREPEND to always add to beginning or end of the list
 * @param string[] $filterList list of additional filters to insert
 * @return self this builder, for chaining
 */
public function insertFiltersBefore( $beforeFilter, array $filterList ): self {
	$this->unpackedCheck();

	$patch = [ $beforeFilter => $filterList ];
	$this->insertFilterList[] = $patch;

	return $this;
}
325 | |
/**
 * Queue filters to be added to the end of the filter list when the analyzer
 * is built. Unpacked analyzers only.
 *
 * @param string[] $filterList list of additional filters to append
 * @return self this builder, for chaining
 */
public function appendFilters( array $filterList ): self {
	// checked here as well, so that a failure names this method as the caller
	$this->unpackedCheck();

	return $this->insertFiltersBefore( self::APPEND, $filterList );
}
335 | |
/**
 * Queue filters to be added to the start of the filter list when the
 * analyzer is built. Unpacked analyzers only.
 *
 * @param string[] $filterList list of additional filters to prepend
 * @return self this builder, for chaining
 */
public function prependFilters( array $filterList ): self {
	// checked here as well, so that a failure names this method as the caller
	$this->unpackedCheck();

	return $this->insertFiltersBefore( self::PREPEND, $filterList );
}
345 | |
/**
 * Use the "light_<langName>" stemmer instead of the default one.
 * Unpacked analyzers only.
 *
 * @return self this builder, for chaining
 */
public function withLightStemmer(): self {
	$this->unpackedCheck();
	$this->stemmerLang = 'light_' . $this->langName;

	return $this;
}
352 | |
/**
 * Build the analyzer without a stemmer filter. Unpacked analyzers only.
 *
 * @return self this builder, for chaining
 */
public function omitStemmer(): self {
	$this->unpackedCheck();
	$this->useStemmer = false;

	return $this;
}
359 | |
/**
 * Use asciifolding as the folding filter in place of the default
 * icu_folding. Unpacked analyzers only.
 *
 * @return self this builder, for chaining
 */
public function withAsciifolding(): self {
	$this->unpackedCheck();
	$this->folding = 'asciifolding';

	return $this;
}
366 | |
/**
 * Build the analyzer without any folding filter. Unpacked analyzers only.
 *
 * @return self this builder, for chaining
 */
public function omitFolding(): self {
	$this->unpackedCheck();
	// an empty name is dropped from the filter list when the analyzer is built
	$this->folding = '';

	return $this;
}
373 | |
/**
 * Add the remove_empty filter to the filter list. Unpacked analyzers only.
 *
 * @return self this builder, for chaining
 */
public function withRemoveEmpty(): self {
	$this->unpackedCheck();
	$this->removeEmpty = 'remove_empty';

	return $this;
}
380 | |
/**
 * Add the decimal_digit filter to the filter list. Unpacked analyzers only.
 *
 * @return self this builder, for chaining
 */
public function withDecimalDigit(): self {
	$this->unpackedCheck();
	$this->decimalDigit = 'decimal_digit';

	return $this;
}
387 | |
/**
 * Add this builder's analyzer, and the filters and character filters it
 * defines, to an analysis config array.
 *
 * The analyzer always has the form
 *
 * type: custom
 * tokenizer: as per $this->tokenizer
 * char_filter: as per $this->charFilters
 * filter: as per $this->filters
 *
 * For basic (not unpacked) analyzers, filters and character filters are
 * defined as specified, but none are automatically added to the char_filter
 * or filter list because the best order for these basic analyzers depends
 * on the details of various third-party plugins. The only change made to
 * the filter list is that icu_folding is removed when ICU is not enabled.
 *
 * For unpacked analyzers the char_filter and filter lists are assembled
 * here, in a fixed order, from whatever has been configured.
 *
 * Building appends to the builder's own char_filter and filter lists, so an
 * unpacked builder is meant to be built once.
 *
 * @param mixed[] $config to be updated
 * @return mixed[] updated config
 */
public function build( array $config ): array {
	$langStem = "{$this->langName}_stemmer";

	if ( $this->unpacked ) {
		// Analyzer config for char_filter and filter will be in the order below,
		// if the relevant filters are enabled/configured.
		//
		// type: custom
		// tokenizer: as configured
		// char_filter: lang_charfilter, lang_numbers
		// filter: elision, lowercase, decimal_digit, stopwords, stemmer_override,
		//         stemmer, folding, remove_empty
		if ( $this->useStemmer ) {
			$this->stemmerLang ??= $this->langName;
		} else {
			$langStem = '';
		}
		// NOTE(review): no name is passed, so a custom stop filter name given
		// to an earlier withStop() call is replaced by the default here.
		$this->withStop( $this->customStopList ?? "_{$this->langName}_" );

		// remove icu_folding if icu plugin unavailable or unwanted
		if ( $this->folding === 'icu_folding' && !$this->icuEnabled ) {
			$this->folding = '';
		}

		// build up the char_filter list--everything is optional
		$this->charFilters[] = $this->charMapName;
		$this->charFilters[] = $this->numCharMapName;

		// remove 'falsey' (== not configured) values from the list
		$this->charFilters = array_values( array_filter( $this->charFilters ) );

		// build up the filter list--lowercase and stop are always present; the
		// stemmer is present unless omitted
		$this->filters[] = $this->elisionName;
		$this->filters[] = 'lowercase';
		$this->filters[] = $this->decimalDigit;
		$this->filters[] = $this->stopName;
		$this->filters[] = $this->overrideName;
		$this->filters[] = $langStem;
		$this->filters[] = $this->folding;
		$this->filters[] = $this->removeEmpty;

		// remove 'falsey' (== not configured) values from the list
		$this->filters = array_values( array_filter( $this->filters ) );

		// iterate over all lists of sets of filters to insert, in order, and insert
		// them before the specified filter. If no such filter exists, the filters
		// will be prepended, but you shouldn't count on that. APPEND and PREPEND
		// constants can be used to add to beginning or end, regardless of other
		// filters
		foreach ( $this->insertFilterList as $filterPatch ) {
			foreach ( $filterPatch as $beforeFilter => $filterList ) {
				// $beforeFilter has been an array key, so APPEND and PREPEND
				// arrive as ints and can be compared strictly
				if ( $beforeFilter === self::APPEND ) {
					$this->filters = array_merge( $this->filters, $filterList );
				} elseif ( $beforeFilter === self::PREPEND ) {
					$this->filters = array_merge( $filterList, $this->filters );
				} else {
					// cast back to string because numeric-looking names become
					// int keys; filter names themselves are strings
					$idx = array_search( (string)$beforeFilter, $this->filters, true );
					// array_search() returns false, not an index, on a miss
					array_splice( $this->filters, $idx === false ? 0 : $idx, 0, $filterList );
				}
			}
		}
	} elseif ( !$this->icuEnabled && $this->filters ) {
		// for simple filter lists, remove icu_folding if ICU not enabled; the
		// list may never have been set, in which case there is nothing to do
		$foldingIdx = array_search( 'icu_folding', $this->filters, true );
		if ( $foldingIdx !== false ) {
			array_splice( $this->filters, $foldingIdx, 1 );
		}
	}

	$config[ 'analyzer' ][ $this->analyzerName ] = [
		'type' => 'custom',
		'tokenizer' => $this->tokenizer,
	];

	if ( $this->charMapName ) {
		$config[ 'char_filter' ][ $this->charMapName ] =
			$this->mappingCharFilter( $this->charMap, $this->charMapLimited );
	}

	if ( $this->numCharMapName ) {
		$config[ 'char_filter' ][ $this->numCharMapName ] =
			$this->numberCharFilter( $this->langZero, $this->numCharMapReversed );
	}

	if ( $this->elisionName ) {
		$config[ 'filter' ][ $this->elisionName ] =
			$this->elisionFilter( $this->elisionArticles, $this->elisionArticleCase );
	}

	if ( $this->langLowercase ) {
		// only the language is set; nothing else about 'lowercase' is touched
		$config[ 'filter' ][ 'lowercase' ][ 'language' ] = $this->langLowercase;
	}

	if ( $this->overrideName ) {
		$config[ 'filter' ][ $this->overrideName ] =
			$this->overrideFilter( $this->overrideRules );
	}

	if ( $this->stopName ) {
		$config[ 'filter' ][ $this->stopName ] =
			$this->stopFilterFromList( $this->customStopList );
	}

	if ( $this->extraStopName ) {
		$config[ 'filter' ][ $this->extraStopName ] =
			$this->stopFilterFromList( $this->extraStopList, $this->extraStopIgnoreCase );
	}

	if ( $this->charFilters ) {
		$config[ 'analyzer' ][ $this->analyzerName ][ 'char_filter' ] = $this->charFilters;
	}

	if ( $this->filters ) {
		$config[ 'analyzer' ][ $this->analyzerName ][ 'filter' ] = $this->filters;
	}

	if ( $this->stemmerLang && $this->useStemmer ) {
		$config[ 'filter' ][ $langStem ] =
			$this->stemmerFilter( $this->stemmerLang );
	}

	if ( $this->extraStemmerName ) {
		$config[ 'filter' ][ $this->extraStemmerName ] =
			$this->stemmerFilter( $this->extraStemmerLang );
	}

	return $config;
}
541 | |
/**
 * Create a pattern_replace filter/char_filter from a pattern and its
 * replacement.
 *
 * @param string $pat pattern to match
 * @param string $repl replacement text; defaults to the empty string
 * @return mixed[] filter
 */
public static function patternFilter( string $pat, string $repl = '' ): array {
	$filter = [ 'type' => 'pattern_replace' ];
	$filter['pattern'] = $pat;
	$filter['replacement'] = $repl;

	return $filter;
}
552 | |
/**
 * Create a mapping or limited_mapping character filter with the mappings provided.
 *
 * @param string[] $mappings mapping rules
 * @param bool $limited true for a limited_mapping filter, false for mapping
 * @return mixed[] character filter
 */
public static function mappingCharFilter( array $mappings, bool $limited ): array {
	if ( $limited ) {
		$type = 'limited_mapping';
	} else {
		$type = 'mapping';
	}

	return [ 'type' => $type, 'mappings' => $mappings ];
}
564 | |
/**
 * Create a limited_mapping character filter that maps the ten digits
 * starting at $langZero (e.g., ០-៩) to Arabic digits (0-9). The digits are
 * taken to be consecutive code points, so only the code point of the zero
 * digit is needed. Each code point is written as a \u escape with at least
 * four hex digits.
 *
 * Optionally reverse the mapping from Arabic to non-Arabic. For example, the ICU
 * tokenizer works better on tokenizing Thai digits in Thai text than it does on
 * Arabic digits.
 *
 * @param int $langZero Unicode value of the script-specific zero
 * @param bool $reversed reverse the mapping from Arabic to non-Arabic
 * @return mixed[] character filter
 */
public static function numberCharFilter( int $langZero, bool $reversed = false ): array {
	$rules = [];
	foreach ( range( 0, 9 ) as $digit ) {
		$codePoint = $langZero + $digit;
		$rules[] = $reversed
			? sprintf( '%d=>\\u%04x', $digit, $codePoint )
			: sprintf( '\\u%04x=>%d', $codePoint, $digit );
	}

	return self::mappingCharFilter( $rules, true );
}
589 | |
/**
 * Create an elision filter with the "articles" provided; $case is passed
 * through as the filter's articles_case setting.
 *
 * @param string[] $articles "articles" to be elided
 * @param bool $case value for articles_case
 * @return mixed[] token filter
 */
public static function elisionFilter( array $articles, bool $case = true ): array {
	$filter = [ 'type' => 'elision' ];
	$filter['articles_case'] = $case;
	$filter['articles'] = $articles;

	return $filter;
}
601 | |
/**
 * Create a stop word filter with the provided config. The config can be an array
 * of stop words, or a string like _french_ that refers to a pre-defined list.
 *
 * @param mixed $stopwords stop word list, or name of a pre-defined list
 * @param bool|null $ignoreCase value for ignore_case; when null the setting
 *  is left out of the filter entirely
 * @return mixed[] token filter
 */
public static function stopFilterFromList( $stopwords, ?bool $ignoreCase = null ): array {
	$filter = [ 'type' => 'stop', 'stopwords' => $stopwords ];

	if ( $ignoreCase === null ) {
		return $filter;
	}

	return $filter + [ 'ignore_case' => $ignoreCase ];
}
617 | |
/**
 * Create a stemmer_override filter with the rules provided, which can be a
 * string with one rule or an array of such rules.
 *
 * @param mixed $rules stemmer override rules
 * @return mixed[] token filter
 */
private function overrideFilter( $rules ): array {
	$filter = [ 'type' => 'stemmer_override' ];
	$filter['rules'] = $rules;

	return $filter;
}
628 | |
/**
 * Create a stemmer filter for the given stemmer language.
 *
 * @param string $stemmer value for the filter's language setting
 * @return mixed[] token filter
 */
public static function stemmerFilter( string $stemmer ): array {
	$filter = [ 'type' => 'stemmer' ];
	$filter['language'] = $stemmer;

	return $filter;
}
638 | |
639 | } |