Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
100.00% |
196 / 196 |
|
100.00% |
35 / 35 |
CRAP | |
100.00% |
1 / 1 |
| AnalyzerBuilder | |
100.00% |
196 / 196 |
|
100.00% |
35 / 35 |
68 | |
100.00% |
1 / 1 |
| __construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| withLangName | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| withCharFilters | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| withTokenizer | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| withFilters | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| withCharMap | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
| withLimitedCharMap | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| withInvisCharMap | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| withReversedNumberCharFilter | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| withNumberCharFilter | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
| withElision | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
| withLangLowercase | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
| withStop | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| withExtraStop | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
| withExtraStemmer | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| withStemmerOverride | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| withUnpackedAnalyzer | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| unpackedCheck | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
| insertFiltersBefore | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| appendFilters | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| prependFilters | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| withLightStemmer | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| omitStemmer | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| withAsciifolding | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| omitFolding | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| withRemoveEmpty | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| withDecimalDigit | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| build | |
100.00% |
103 / 103 |
|
100.00% |
1 / 1 |
27 | |||
| patternFilter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| mappingCharFilter | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
| numberCharFilter | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
| elisionFilter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| stopFilterFromList | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
| overrideFilter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| stemmerFilter | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace CirrusSearch\Maintenance; |
| 4 | |
| 5 | use MediaWiki\Config\ConfigException; |
| 6 | |
| 7 | /** |
| 8 | * Builds one search analyzer to add to an analysis config array. |
| 9 | * |
| 10 | * This program is free software; you can redistribute it and/or modify |
| 11 | * it under the terms of the GNU General Public License as published by |
| 12 | * the Free Software Foundation; either version 2 of the License, or |
| 13 | * (at your option) any later version. |
| 14 | * |
| 15 | * This program is distributed in the hope that it will be useful, |
| 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 18 | * GNU General Public License for more details. |
| 19 | * |
| 20 | * You should have received a copy of the GNU General Public License along |
| 21 | * with this program; if not, write to the Free Software Foundation, Inc., |
| 22 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
| 23 | * http://www.gnu.org/copyleft/gpl.html |
| 24 | */ |
| 25 | class AnalyzerBuilder { |
| 26 | /** |
| 27 | * Indicate that filters should be automatically appended or prepended, rather |
| 28 | * than inserted before a given filter. |
| 29 | */ |
| 30 | public const APPEND = 1; |
| 31 | public const PREPEND = 2; |
| 32 | |
| 33 | /** @var string */ |
| 34 | private $langName; |
| 35 | |
| 36 | /** @var string */ |
| 37 | private $analyzerName = 'text'; |
| 38 | |
| 39 | /** @var bool */ |
| 40 | private $icuEnabled; |
| 41 | |
| 42 | /** @var string[]|null list of char_filters */ |
| 43 | private $charFilters; |
| 44 | |
| 45 | /** @var string|null name of tokenizer */ |
| 46 | private $tokenizer = 'standard'; |
| 47 | |
| 48 | /** @var string[]|null list of filters */ |
| 49 | private $filters; |
| 50 | |
| 51 | /** @var string[]|null list of lang-specific character filter mappings */ |
| 52 | private $charMap; |
| 53 | |
| 54 | /** @var bool */ |
| 55 | private $charMapLimited = false; |
| 56 | |
| 57 | /** @var string|null */ |
| 58 | private $charMapName; |
| 59 | |
| 60 | /** @var int|null Unicode value for script-specific zero */ |
| 61 | private $langZero; |
| 62 | |
| 63 | /** @var bool should langZero's map be reversed (Arabic to non-Arabic)? */ |
| 64 | private $numCharMapReversed = false; |
| 65 | |
| 66 | /** @var string|null name of char filter mapping digits (using $langZero) */ |
| 67 | private $numCharMapName; |
| 68 | |
| 69 | /** @var string|null name of char filter for cleaning up invisibles */ |
| 70 | private $invisCharMapName; |
| 71 | |
| 72 | /** @var bool is elision processing case INsensitive? */ |
| 73 | private $elisionArticleCase = true; |
| 74 | |
| 75 | /** @var string[]|null list of articles to elide */ |
| 76 | private $elisionArticles; |
| 77 | |
| 78 | /** @var string|null */ |
| 79 | private $elisionName; |
| 80 | |
| 81 | /** @var string|null */ |
| 82 | private $langLowercase; |
| 83 | |
| 84 | /** @var mixed|null stopword _list_ or array of stopwords */ |
| 85 | private $customStopList; |
| 86 | |
| 87 | /** @var string|null */ |
| 88 | private $stopName; |
| 89 | |
| 90 | /** @var mixed|null stopword _list_ or array of stopwords */ |
| 91 | private $extraStopList; |
| 92 | |
| 93 | /** @var string|null */ |
| 94 | private $extraStopName; |
| 95 | |
| 96 | /** @var bool|null */ |
| 97 | private $extraStopIgnoreCase; |
| 98 | |
| 99 | /** @var string|null */ |
| 100 | private $extraStemmerLang; |
| 101 | |
| 102 | /** @var string|null */ |
| 103 | private $extraStemmerName; |
| 104 | |
| 105 | /** @var string|string[]|null a single stemmer override rule, or a list of such rules */ |
| 106 | private $overrideRules; |
| 107 | |
| 108 | /** @var string|null */ |
| 109 | private $overrideName; |
| 110 | |
| 111 | /********** |
| 112 | * The properties below are only used by unpacked analyzers |
| 113 | */ |
| 114 | |
| 115 | /** @var bool */ |
| 116 | private $unpacked = false; |
| 117 | |
| 118 | /** @var array<int, array<string, string[]>> */ |
| 119 | private $insertFilterList = []; |
| 120 | |
| 121 | /** @var bool */ |
| 122 | private $useStemmer = true; |
| 123 | |
| 124 | /** @var string|null */ |
| 125 | private $stemmerLang; |
| 126 | |
| 127 | /** @var string|null folding flavor to use (null or '' for none) */ |
| 128 | private $folding = 'icu_folding'; |
| 129 | |
| 130 | /** @var string|null */ |
| 131 | private $removeEmpty; |
| 132 | |
| 133 | /** @var string|null */ |
| 134 | private $decimalDigit; |
| 135 | |
/**
 * Start a builder for one language.
 *
 * @param string $langName language name; used as the prefix of default filter names
 * @param bool $icuEnabled whether icu_folding may be kept in the analyzer's filter list
 */
public function __construct( string $langName, bool $icuEnabled = false ) {
	[ $this->langName, $this->icuEnabled ] = [ $langName, $icuEnabled ];
}
| 144 | |
/**
 * Change the language name used to derive default filter names.
 *
 * Names already derived by earlier with*() calls are not recomputed.
 *
 * @param string $langName
 * @return self
 */
public function withLangName( string $langName ): self {
	$builder = $this;
	$builder->langName = $langName;
	return $builder;
}
| 149 | |
/**
 * Set the analyzer's initial char_filter list, replacing any previous list.
 *
 * @param string[] $charFilters names of character filters
 * @return self
 */
public function withCharFilters( array $charFilters ): self {
	$builder = $this;
	$builder->charFilters = $charFilters;
	return $builder;
}
| 158 | |
/**
 * Choose the analyzer's tokenizer (the default is 'standard').
 *
 * @param string $tokenizer name of tokenizer
 * @return self
 */
public function withTokenizer( string $tokenizer ): self {
	$builder = $this;
	$builder->tokenizer = $tokenizer;
	return $builder;
}
| 163 | |
/**
 * Set the analyzer's initial filter list, replacing any previous list.
 *
 * @param string[] $filters names of token filters
 * @return self
 */
public function withFilters( array $filters ): self {
	$builder = $this;
	$builder->filters = $filters;
	return $builder;
}
| 172 | |
/**
 * Define a language-specific mapping character filter.
 *
 * @param string[] $mappings character mappings for the filter
 * @param string|null $name filter name; null means "<langName>_charfilter"
 * @param bool $limited create a limited_mapping filter rather than a mapping filter
 * @return self
 */
public function withCharMap( array $mappings, ?string $name = null, bool $limited = false ): self {
	if ( $name === null ) {
		$name = "{$this->langName}_charfilter";
	}
	$this->charMapLimited = $limited;
	$this->charMapName = $name;
	$this->charMap = $mappings;
	return $this;
}
| 185 | |
/**
 * Define a language-specific limited_mapping character filter; shorthand for
 * withCharMap() with $limited set to true.
 *
 * @param string[] $mappings character mappings for the filter
 * @param string|null $name filter name; null means "<langName>_charfilter"
 * @return self
 */
public function withLimitedCharMap( array $mappings, ?string $name = null ): self {
	$limited = true;
	$this->withCharMap( $mappings, $name, $limited );
	return $this;
}
| 194 | |
/**
 * Request a character filter that cleans up invisible characters.
 *
 * @param string|null $name name for the filter; a null (or empty) name means
 *  build() will not create it
 * @return self
 */
public function withInvisCharMap( ?string $name = 'invis_cleanup' ): self {
	$builder = $this;
	$builder->invisCharMapName = $name;
	return $builder;
}
| 203 | |
/**
 * Define a digit-mapping character filter with the mapping reversed (Arabic to
 * non-Arabic); shorthand for withNumberCharFilter() with $reversed set to true.
 *
 * @param int $langZero Unicode value for the script-specific zero
 * @param string|null $name filter name; null means "<langName>_reversed_numbers"
 * @return self
 */
public function withReversedNumberCharFilter( int $langZero, ?string $name = null ): self {
	return $this->withNumberCharFilter( $langZero, $name, true );
}
| 213 | |
/**
 * Define a character filter that maps between script-specific digits and
 * Arabic digits (0-9).
 *
 * @param int $langZero Unicode value for the script-specific zero
 * @param string|null $name filter name; null means "<langName>_numbers", or
 *  "<langName>_reversed_numbers" when $reversed is set
 * @param bool $reversed reverse the mapping from Arabic to non-Arabic
 * @return self
 */
public function withNumberCharFilter( int $langZero, ?string $name = null, bool $reversed = false ): self {
	if ( $name === null ) {
		$suffix = $reversed ? 'reversed_numbers' : 'numbers';
		$name = "{$this->langName}_{$suffix}";
	}
	$this->numCharMapReversed = $reversed;
	$this->numCharMapName = $name;
	$this->langZero = $langZero;
	return $this;
}
| 227 | |
/**
 * Define an elision filter named "<langName>_elision".
 *
 * @param string[] $articles "articles" to be elided
 * @param bool $articleCase whether elision is case insensitive
 * @return self
 */
public function withElision( array $articles, bool $articleCase = true ): self {
	$this->elisionName = $this->langName . '_elision';
	$this->elisionArticles = $articles;
	$this->elisionArticleCase = $articleCase;
	return $this;
}
| 239 | |
/**
 * Make the 'lowercase' filter language-specific.
 *
 * @param string|null $name language to give the lowercase filter; a null or
 *  otherwise empty value falls back to the builder's language name
 * @return self
 */
public function withLangLowercase( ?string $name = null ): self {
	if ( !$name ) {
		$name = $this->langName;
	}
	$this->langLowercase = $name;
	return $this;
}
| 248 | |
/**
 * Define the stop word filter.
 *
 * @param mixed $stop pre-defined list like _french_ or an array of stopwords
 * @param string|null $name filter name; null means "<langName>_stop"
 * @return self
 */
public function withStop( $stop, ?string $name = null ): self {
	$this->stopName = $name === null ? "{$this->langName}_stop" : $name;
	$this->customStopList = $stop;
	return $this;
}
| 259 | |
/**
 * Define an additional stop word filter and schedule it for insertion into the
 * filter list of an unpacked analyzer.
 *
 * Only compatible with unpacked analyzers. The check is made before any state is
 * changed, so a failed call leaves the builder untouched and the error message
 * names withExtraStop() rather than insertFiltersBefore().
 *
 * @param mixed $stop pre-defined list like _french_ or an array of stopwords
 * @param string $name name of the extra stop filter
 * @param mixed $beforeFilter filter to insert extra stop before; use APPEND or
 *  PREPEND to always add to the end or beginning of the list
 * @param bool|null $ignoreCase value for the filter's ignore_case setting; null
 *  leaves the setting out
 * @return self
 */
public function withExtraStop( $stop, string $name, $beforeFilter = self::APPEND,
	?bool $ignoreCase = null ): self {
	$this->unpackedCheck();
	$this->extraStopList = $stop;
	$this->extraStopName = $name;
	$this->extraStopIgnoreCase = $ignoreCase;
	$this->insertFiltersBefore( $beforeFilter, [ $name ] );
	return $this;
}
| 275 | |
/**
 * Define an additional stemmer filter.
 *
 * @param string $lang stemmer language
 * @param string|null $name filter name; null means the filter is named after $lang
 * @return self
 */
public function withExtraStemmer( string $lang, ?string $name = null ): self {
	if ( $name === null ) {
		$name = $lang;
	}
	$this->extraStemmerName = $name;
	$this->extraStemmerLang = $lang;
	return $this;
}
| 286 | |
/**
 * Define a stemmer override filter.
 *
 * @param mixed $rules stemmer override rules: either a single rule string or an
 *  array of rules
 * @param string|null $name filter name; null means "<langName>_override"
 * @return self
 */
public function withStemmerOverride( $rules, ?string $name = null ): self {
	$this->overrideName = $name === null ? "{$this->langName}_override" : $name;
	$this->overrideRules = $rules;
	return $this;
}
| 299 | |
/**
 * Mark this analyzer as unpacked, so that build() assembles the char_filter and
 * filter lists itself and the unpacked-only options become available.
 *
 * @return self
 */
public function withUnpackedAnalyzer(): self {
	$builder = $this;
	$builder->unpacked = true;
	return $builder;
}
| 304 | |
/**
 * Guard for options that only make sense for unpacked analyzers.
 *
 * @throws ConfigException if withUnpackedAnalyzer() has not been called; the
 *  message names the method that called this check
 */
private function unpackedCheck(): void {
	if ( $this->unpacked ) {
		return;
	}
	// Frame 0 is this method; frame 1 is the builder method that called it.
	// Only those two frames are needed, and their arguments are not.
	$caller = debug_backtrace( DEBUG_BACKTRACE_IGNORE_ARGS, 2 )[1]['function'];
	throw new ConfigException( "$caller() is only compatible with unpacked analyzers; " .
		"call withUnpackedAnalyzer() before calling $caller()." );
}
| 312 | |
/**
 * Schedule extra filters for insertion into the filter list when build() runs.
 * Only compatible with unpacked analyzers.
 *
 * @param mixed $beforeFilter specific filter to insert $filterList before; use
 *  APPEND or PREPEND to always add to the end or beginning of the list
 * @param string[] $filterList list of additional filters to insert
 * @return self
 */
public function insertFiltersBefore( $beforeFilter, array $filterList ): self {
	$this->unpackedCheck();
	$patch = [ $beforeFilter => $filterList ];
	array_push( $this->insertFilterList, $patch );
	return $this;
}
| 324 | |
/**
 * Schedule extra filters to be added at the end of the filter list.
 * Only compatible with unpacked analyzers.
 *
 * @param string[] $filterList list of additional filters to append
 * @return self
 */
public function appendFilters( array $filterList ): self {
	// checked here as well, so that a failure is reported against appendFilters()
	$this->unpackedCheck();
	return $this->insertFiltersBefore( self::APPEND, $filterList );
}
| 334 | |
/**
 * Schedule extra filters to be added at the beginning of the filter list.
 * Only compatible with unpacked analyzers.
 *
 * @param string[] $filterList list of additional filters to prepend
 * @return self
 */
public function prependFilters( array $filterList ): self {
	// checked here as well, so that a failure is reported against prependFilters()
	$this->unpackedCheck();
	return $this->insertFiltersBefore( self::PREPEND, $filterList );
}
| 344 | |
/**
 * Use the "light_<langName>" stemmer. Only compatible with unpacked analyzers.
 *
 * @return self
 */
public function withLightStemmer(): self {
	$this->unpackedCheck();
	$this->stemmerLang = 'light_' . $this->langName;
	return $this;
}
| 350 | |
/**
 * Leave the stemmer out of the analyzer. Only compatible with unpacked analyzers.
 *
 * @return self
 */
public function omitStemmer(): self {
	$this->unpackedCheck();
	$builder = $this;
	$builder->useStemmer = false;
	return $builder;
}
| 356 | |
/**
 * Use asciifolding as the folding filter, in place of the default icu_folding.
 * Only compatible with unpacked analyzers.
 *
 * @return self
 */
public function withAsciifolding(): self {
	$this->unpackedCheck();
	$builder = $this;
	$builder->folding = 'asciifolding';
	return $builder;
}
| 362 | |
/**
 * Leave the folding filter out of the analyzer. Only compatible with unpacked
 * analyzers.
 *
 * @return self
 */
public function omitFolding(): self {
	$this->unpackedCheck();
	$builder = $this;
	// the empty string is dropped from the filter list by build()
	$builder->folding = '';
	return $builder;
}
| 368 | |
/**
 * Add the remove_empty filter at the end of the standard filter list. Only
 * compatible with unpacked analyzers.
 *
 * @return self
 */
public function withRemoveEmpty(): self {
	$this->unpackedCheck();
	$builder = $this;
	$builder->removeEmpty = 'remove_empty';
	return $builder;
}
| 374 | |
/**
 * Add the decimal_digit filter, placed right after lowercase in the standard
 * filter list. Only compatible with unpacked analyzers.
 *
 * @return self
 */
public function withDecimalDigit(): self {
	$this->unpackedCheck();
	$builder = $this;
	$builder->decimalDigit = 'decimal_digit';
	return $builder;
}
| 380 | |
/**
 * Add this builder's analyzer, and the filters and character filters it defines,
 * to an analysis config array.
 *
 * The analyzer is always of type "custom" and is stored under the name held in
 * $this->analyzerName.
 *
 * For unpacked analyzers, the char_filter and filter lists are assembled in the
 * order below, skipping anything that is not enabled/configured; filters
 * registered via insertFiltersBefore() (including APPEND/PREPEND) are then
 * spliced in, in the order they were registered:
 *
 *   char_filter: <withCharFilters() list>, lang_charfilter, lang_numbers
 *   filter: <withFilters() list>, elision, lowercase, decimal_digit, stopwords,
 *       stemmer_override, stemmer, folding, remove_empty
 *
 * For all other analyzers, the lists given to withCharFilters() and withFilters()
 * are used as they are, except that the first icu_folding entry is removed when
 * ICU is not enabled. Filters and character filters defined on the builder are
 * created in the config, but none are automatically added to those lists.
 *
 * The lists are assembled in local variables, so the builder itself is not
 * modified and calling build() again produces the same result.
 *
 * @param mixed[] $config to be updated
 * @return mixed[] updated config
 */
public function build( array $config ): array {
	// Work on local copies; a never-configured list (null) is treated as empty.
	$charFilters = $this->charFilters ?? [];
	$filters = $this->filters ?? [];
	$stemmerLang = $this->stemmerLang;
	$stopName = $this->stopName;
	$stopList = $this->customStopList;
	$langStem = "{$this->langName}_stemmer";

	if ( $this->unpacked ) {
		if ( $this->useStemmer ) {
			$stemmerLang ??= $this->langName;
		} else {
			$langStem = '';
		}

		// Unpacked analyzers always get a stop filter under the default name: the
		// configured list if there is one, otherwise the language's pre-defined
		// list. A custom name passed to withStop() is not used here.
		$stopList ??= "_{$this->langName}_";
		$stopName = "{$this->langName}_stop";

		// remove icu_folding if icu plugin unavailable
		$folding = $this->folding;
		if ( $folding === 'icu_folding' && !$this->icuEnabled ) {
			$folding = '';
		}

		// build up the char_filter list--everything is optional; 'falsey'
		// (== not configured) values are removed from the list
		$charFilters[] = $this->charMapName;
		$charFilters[] = $this->numCharMapName;
		$charFilters = array_values( array_filter( $charFilters ) );

		// build up the filter list--only lowercase and stop are always present
		array_push(
			$filters,
			$this->elisionName,
			'lowercase',
			$this->decimalDigit,
			$stopName,
			$this->overrideName,
			$langStem,
			$folding,
			$this->removeEmpty
		);
		$filters = array_values( array_filter( $filters ) );

		$filters = $this->applyFilterPatches( $filters );
	} elseif ( !$this->icuEnabled ) {
		// for simple filter lists, remove icu_folding if ICU not enabled
		$icuIdx = array_search( 'icu_folding', $filters, true );
		if ( $icuIdx !== false ) {
			array_splice( $filters, $icuIdx, 1 );
		}
	}

	$config[ 'analyzer' ][ $this->analyzerName ] = [
		'type' => 'custom',
		'tokenizer' => $this->tokenizer,
	];

	if ( $this->charMapName ) {
		$config[ 'char_filter' ][ $this->charMapName ] =
			self::mappingCharFilter( $this->charMap, $this->charMapLimited );
	}

	if ( $this->numCharMapName ) {
		$config[ 'char_filter' ][ $this->numCharMapName ] =
			self::numberCharFilter( $this->langZero, $this->numCharMapReversed );
	}

	if ( $this->invisCharMapName ) {
		$config[ 'char_filter' ][ $this->invisCharMapName ] = self::invisCharFilter();
	}

	if ( $this->elisionName ) {
		$config[ 'filter' ][ $this->elisionName ] =
			self::elisionFilter( $this->elisionArticles, $this->elisionArticleCase );
	}

	if ( $this->langLowercase ) {
		$config[ 'filter' ][ 'lowercase' ][ 'language' ] = $this->langLowercase;
	}

	if ( $this->overrideName ) {
		$config[ 'filter' ][ $this->overrideName ] =
			$this->overrideFilter( $this->overrideRules );
	}

	if ( $stopName ) {
		$config[ 'filter' ][ $stopName ] = self::stopFilterFromList( $stopList );
	}

	if ( $this->extraStopName ) {
		$config[ 'filter' ][ $this->extraStopName ] =
			self::stopFilterFromList( $this->extraStopList, $this->extraStopIgnoreCase );
	}

	if ( $charFilters ) {
		$config[ 'analyzer' ][ $this->analyzerName ][ 'char_filter' ] = $charFilters;
	}

	if ( $filters ) {
		$config[ 'analyzer' ][ $this->analyzerName ][ 'filter' ] = $filters;
	}

	if ( $stemmerLang && $this->useStemmer ) {
		$config[ 'filter' ][ $langStem ] = self::stemmerFilter( $stemmerLang );
	}

	if ( $this->extraStemmerName ) {
		$config[ 'filter' ][ $this->extraStemmerName ] =
			self::stemmerFilter( $this->extraStemmerLang );
	}

	return $config;
}

/**
 * Apply the insertions registered via insertFiltersBefore(), in registration order.
 *
 * If the filter to insert before is not in the list, the new filters are added
 * at the beginning of the list, but you shouldn't count on that; use APPEND or
 * PREPEND to add to the end or beginning regardless of other filters.
 *
 * @param string[] $filters filter list to patch
 * @return string[] patched filter list
 */
private function applyFilterPatches( array $filters ): array {
	foreach ( $this->insertFilterList as $filterPatch ) {
		foreach ( $filterPatch as $beforeFilter => $filterList ) {
			if ( $beforeFilter === self::APPEND ) {
				$filters = array_merge( $filters, $filterList );
			} elseif ( $beforeFilter === self::PREPEND ) {
				$filters = array_merge( $filterList, $filters );
			} else {
				// array_search() returns false (not -1) when there is no match
				$idx = array_search( (string)$beforeFilter, $filters, true );
				array_splice( $filters, $idx === false ? 0 : $idx, 0, $filterList );
			}
		}
	}
	return $filters;
}

/**
 * Create the mapping character filter that cleans up invisible characters.
 *
 * @return mixed[] character filter
 */
private static function invisCharFilter(): array {
	$mappings = [
		// split on ...
		'\u200B=>\u0020', // ... zero-width space
		// remove ...
		'\u00AD=>', // ... soft hyphen
		'\u200C=>', // ... zero-width non-joiner
		'\u200D=>', // ... zero-width joiner
		'\u2060=>', // ... word joiner
		'\uFEFF=>', // ... zero-width non-breaking space
		'\u200E=>', // ... LTR mark
		'\u200F=>', // ... RTL mark
		'\u202A=>', // ... LTR embedding
		'\u202B=>', // ... RTL embedding
		'\u202C=>', // ... pop directional formatting
		'\u202D=>', // ... LTR override
		'\u202E=>', // ... RTL override
		'\u2066=>', // ... LTR isolate
		'\u2067=>', // ... RTL isolate
		'\u2068=>', // ... first strong isolate
		'\u2069=>', // ... pop directional isolate
		'\u2061=>', // ... function application
		'\u2062=>', // ... invisible times
		'\u2063=>', // ... invisible separator
		'\u2064=>', // ... invisible plus
	];

	// remove all 256 variation selectors, given as literal characters
	for ( $codePoint = 0xFE00; $codePoint <= 0xFE0F; $codePoint++ ) { // 1-16
		$mappings[] = mb_chr( $codePoint, 'UTF-8' ) . '=>';
	}
	for ( $codePoint = 0xE0100; $codePoint <= 0xE01EF; $codePoint++ ) { // 17-256
		$mappings[] = mb_chr( $codePoint, 'UTF-8' ) . '=>';
	}

	return [ 'type' => 'mapping', 'mappings' => $mappings ];
}
| 576 | |
/**
 * Create a pattern_replace filter/char_filter.
 *
 * @param string $pat pattern to match
 * @param string $repl replacement; defaults to the empty string
 * @return mixed[] filter
 */
public static function patternFilter( string $pat, string $repl = '' ): array {
	$filter = [ 'type' => 'pattern_replace' ];
	$filter['pattern'] = $pat;
	$filter['replacement'] = $repl;
	return $filter;
}
| 587 | |
/**
 * Create a character filter of type mapping, or of type limited_mapping when
 * $limited is set, with the mappings provided.
 *
 * @param string[] $mappings
 * @param bool $limited
 * @return mixed[] character filter
 */
public static function mappingCharFilter( array $mappings, bool $limited ): array {
	if ( $limited ) {
		return [ 'type' => 'limited_mapping', 'mappings' => $mappings ];
	}
	return [ 'type' => 'mapping', 'mappings' => $mappings ];
}
| 599 | |
/**
 * Create a limited_mapping character filter that maps the ten digits of another
 * script to Arabic digits (0-9). The script's digits are taken to be ten
 * consecutive code points, so only the code point of its zero is needed.
 *
 * Optionally reverse the mapping, from Arabic digits to the script's digits. For
 * example, the ICU tokenizer works better on tokenizing Thai digits in Thai text
 * than it does on Arabic digits.
 *
 * Code points are written as \u escapes with at least four lowercase hex digits.
 *
 * @param int $langZero Unicode value of the script-specific zero
 * @param bool $reversed reverse the mapping from Arabic to non-Arabic
 * @return mixed[] character filter
 */
public static function numberCharFilter( int $langZero, bool $reversed = false ): array {
	$pairs = [];
	foreach ( range( 0, 9 ) as $digit ) {
		$escaped = sprintf( '\\u%04x', $langZero + $digit );
		$pairs[] = $reversed ? "{$digit}=>{$escaped}" : "{$escaped}=>{$digit}";
	}
	return self::mappingCharFilter( $pairs, true );
}
| 624 | |
/**
 * Create an elision filter for the "articles" provided.
 *
 * @param string[] $articles
 * @param bool $case value for the filter's articles_case setting
 * @return mixed[] token filter
 */
public static function elisionFilter( array $articles, bool $case = true ): array {
	$filter = [ 'type' => 'elision' ];
	$filter['articles_case'] = $case;
	$filter['articles'] = $articles;
	return $filter;
}
| 636 | |
/**
 * Create a stop word filter. The stop words can be given as an array of words,
 * or as a string like _french_ that refers to a pre-defined list.
 *
 * @param mixed $stopwords
 * @param bool|null $ignoreCase value for ignore_case; null leaves the setting out
 * @return mixed[] token filter
 */
public static function stopFilterFromList( $stopwords, ?bool $ignoreCase = null ): array {
	$filter = [ 'type' => 'stop', 'stopwords' => $stopwords ];
	return $ignoreCase === null ? $filter : $filter + [ 'ignore_case' => $ignoreCase ];
}
| 652 | |
/**
 * Create a stemmer_override filter with the rules provided.
 *
 * @param mixed $rules a string with one rule, or an array of such rules
 * @return mixed[] token filter
 */
private function overrideFilter( $rules ): array {
	$filter = [ 'type' => 'stemmer_override' ];
	$filter['rules'] = $rules;
	return $filter;
}
| 663 | |
/**
 * Create a stemmer filter for the given stemmer language.
 *
 * @param string $stemmer value for the filter's language setting
 * @return mixed[] token filter
 */
public static function stemmerFilter( string $stemmer ): array {
	$filter = [ 'type' => 'stemmer' ];
	$filter['language'] = $stemmer;
	return $filter;
}
| 673 | |
| 674 | } |