Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
78.61% |
147 / 187 |
|
40.00% |
2 / 5 |
CRAP | |
0.00% |
0 / 1 |
| SuggesterAnalysisConfigBuilder | |
78.61% |
147 / 187 |
|
40.00% |
2 / 5 |
21.17 | |
0.00% |
0 / 1 |
| defaults | |
79.43% |
112 / 141 |
|
0.00% |
0 / 1 |
6.31 | |||
| customize | |
76.19% |
32 / 42 |
|
0.00% |
0 / 1 |
10.09 | |||
| buildConfig | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| getDefaultStopSet | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| hasStopWords | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * @license GPL-2.0-or-later |
| 4 | */ |
| 5 | |
| 6 | namespace CirrusSearch\Maintenance; |
| 7 | |
| 8 | /** |
| 9 | * Builds search analysis config arrays for the completion suggester |
| 10 | * index. |
| 11 | */ |
| 12 | class SuggesterAnalysisConfigBuilder extends AnalysisConfigBuilder { |
| 13 | public const VERSION = "1.4"; |
| 14 | |
/**
 * Assemble the language-independent baseline of the completion suggester
 * analysis config: one mapping char filter, the token filters and the
 * stop/plain analyzers (plus the subphrases analyzers when enabled).
 *
 * @param string $language Config language
 * @return array
 */
protected function defaults( $language ) {
	// Lowercasing: ICU normalizer when ICU is available, otherwise the
	// default lowercase filter.
	$lowercaseFilter = $this->isIcuAvailable()
		? [ "type" => "icu_normalizer", "name" => "nfkc_cf" ]
		: [ 'type' => 'lowercase' ];

	// Folding: ICU folding if the plugin is available and activated in the
	// config, otherwise the default Lucene ASCII filter.
	if ( $this->shouldActivateIcuFolding( $language ) ) {
		$foldingFilter = [ 'type' => 'icu_folding' ];
		$icuSetFilter = $this->getICUSetFilter( $language );
		if ( $icuSetFilter !== null ) {
			$foldingFilter['unicodeSetFilter'] = $icuSetFilter;
		}
	} else {
		$foldingFilter = [ 'type' => 'asciifolding' ];
	}

	// Only the text tokenizer switches to ICU. We cannot use the
	// icu_tokenizer for plain, even if icu tokenization is mostly needed
	// for languages where space is not used to break words: we don't want
	// to break on some punctuation chars like ':'.
	$textTokenizer = $this->shouldActivateIcuTokenization( $language )
		? 'icu_tokenizer'
		: 'standard';
	$plainTokenizer = 'whitespace';

	// Characters rewritten to a plain space before tokenization.
	$wordBreakMappings = [
		'_=>\u0020', // a space for mw
		',=>\u0020', // useful for "Lastname, Firstname"
		'"=>\u0020', // " certainly phrase search?
		'-=>\u0020', // useful for hyphenated names
		"'=>\u0020", // Useful for finding names
		'\u2019=>\u0020', // Unicode right single quote
		'\u02BC=>\u0020', // Unicode modifier letter apostrophe
		// ( and ) are deliberately left alone for now: mapping them is
		// very useful to search for "john smith explo" instead of
		// "john smith (expl" but annoying to search for "(C)".
		// ':' is deliberately left alone too: ignoring it can be
		// misleading for expert users because we would return unrelated
		// pages when the user searches for "magic keywords" like WP:WP,
		// which are sometimes pages in the main namespace that redirect
		// to another namespace.
		// Others are the ones ignored by common search engines
		';=>\u0020',
		'\\[=>\u0020',
		'\\]=>\u0020',
		'{=>\u0020',
		'}=>\u0020',
		'\\\\=>\u0020',
		// Unicode white spaces cause issues with completion. Only a few
		// of them were actually identified as problematic but more are
		// added for extra safety, see: T156234
		// TODO: reevaluate with es5
		'\u00a0=>\u0020',
		'\u1680=>\u0020',
		'\u180e=>\u0020',
		'\u2000=>\u0020',
		'\u2001=>\u0020',
		'\u2002=>\u0020',
		'\u2003=>\u0020',
		'\u2004=>\u0020',
		'\u2005=>\u0020',
		'\u2006=>\u0020',
		'\u2007=>\u0020',
		'\u2008=>\u0020',
		'\u2009=>\u0020',
		'\u200a=>\u0020',
		'\u200b=>\u0020', // causes issue
		'\u200c=>\u0020', // causes issue
		'\u200d=>\u0020', // causes issue
		'\u202f=>\u0020',
		'\u205f=>\u0020',
		'\u3000=>\u0020',
		'\ufeff=>\u0020', // causes issue
	];

	// Analyzer that lowercases and folds but keeps stop words. Used at
	// search time: removing stop words there leads to extremely weird
	// behaviors while writing "to be or not to be".
	$foldingAnalyzer = [
		"type" => "custom",
		"filter" => [
			"lowercase",
			"accentfolding",
			"remove_empty",
			"token_limit",
		],
		"tokenizer" => $textTokenizer,
	];

	// Analyzer shared by "plain" and "plain_search".
	$plainAnalyzer = [
		"type" => "custom",
		"char_filter" => [ 'word_break_helper' ],
		"filter" => [
			"remove_empty",
			"token_limit",
			"lowercase",
		],
		"tokenizer" => $plainTokenizer,
	];

	$analysis = [
		'char_filter' => [
			'word_break_helper' => [
				'type' => 'mapping',
				'mappings' => $wordBreakMappings,
			],
		],
		'filter' => [
			"stop_filter" => [
				"type" => "stop",
				"stopwords" => "_none_",
				"remove_trailing" => "true",
			],
			"lowercase" => $lowercaseFilter,
			"accentfolding" => $foldingFilter,
			"token_limit" => [
				"type" => "limit",
				"max_token_count" => "20",
			],
			// Workaround what seems to be a bug in the completion
			// suggester, empty tokens cause an issue similar to
			// https://github.com/elastic/elasticsearch/pull/11158
			// Can be removed with es5 if we want; note that icu_folding
			// can introduce empty tokens, so maybe it is best to leave
			// this in place.
			"remove_empty" => [
				"type" => "length",
				"min" => 1,
			],
		],
		'analyzer' => [
			"stop_analyzer" => [
				"type" => "custom",
				"filter" => [
					"lowercase",
					"stop_filter",
					"accentfolding",
					"remove_empty",
					"token_limit",
				],
				"tokenizer" => $textTokenizer,
			],
			"stop_analyzer_search" => $foldingAnalyzer,
			"plain" => $plainAnalyzer,
			"plain_search" => $plainAnalyzer,
		],
	];

	if ( $this->config->getElement( 'CirrusSearchCompletionSuggesterSubphrases', 'build' ) ) {
		$analysis['analyzer']['subphrases'] = $foldingAnalyzer;
		$analysis['analyzer']['subphrases_search'] = $foldingAnalyzer;
	}

	return $analysis;
}
| 205 | |
/**
 * Apply language-specific adjustments to an analysis config.
 *
 * - sets the stop word list of the "stop_filter" filter for $language;
 * - for the arabic and russian text analyzer types, registers an extra
 *   mapping char filter, appends it to the "plain" analyzer and copies
 *   "plain" over "plain_search";
 * - when ICU is available, replaces "lowercase" with "icu_normalizer" in
 *   the filter chains of "stop_analyzer" and "stop_analyzer_search".
 *
 * @param array $config Analysis config, e.g. the result of defaults()
 * @param string $language
 * @return array The customized config
 */
private function customize( array $config, $language ) {
	$config['filter']['stop_filter']['stopwords'] = $this->getDefaultStopSet( $language );

	switch ( $this->getDefaultTextAnalyzerType( $language ) ) {
		// Please add languages in alphabetical order.
		case 'arabic':
			// T117217: fold the Arabic-Indic digits U+0660..U+0669 into 0-9
			$config['char_filter']['arabic_numeral_map'] = [
				'type' => 'mapping',
				'mappings' => [
					'\u0660=>0', '\u0661=>1', '\u0662=>2',
					'\u0663=>3', '\u0664=>4', '\u0665=>5',
					'\u0666=>6', '\u0667=>7', '\u0668=>8',
					'\u0669=>9',
				],
			];

			// add arabic_numeral_map to plain and copy plain to plain_search
			$config['analyzer']['plain']['char_filter'][] = 'arabic_numeral_map';
			$config['analyzer']['plain_search'] = $config['analyzer']['plain'];
			break;
		case 'russian':
			$config['char_filter']['russian_diacritic_map'] = [
				'type' => 'mapping',
				'mappings' => [
					// T102298 ignore combining acute / stress accents
					'\u0301=>',
					// T124592 fold ё=>е and Ё=>Е, precomposed or with combining diacritic
					'\u0451=>\u0435',
					'\u0401=>\u0415',
					'\u0435\u0308=>\u0435',
					'\u0415\u0308=>\u0415',
				],
			];

			// add russian_diacritic_map to plain and copy plain to plain_search
			$config['analyzer']['plain']['char_filter'][] = 'russian_diacritic_map';
			$config['analyzer']['plain_search'] = $config['analyzer']['plain'];
			break;
	}

	if ( $this->isIcuAvailable() ) {
		// Address the two stop analyzers by name instead of walking every
		// analyzer by reference: no dangling reference is left in $config
		// and no loose key comparison is needed.
		foreach ( [ 'stop_analyzer', 'stop_analyzer_search' ] as $analyzerName ) {
			if ( !isset( $config['analyzer'][$analyzerName]['filter'] ) ) {
				continue;
			}
			$config['analyzer'][$analyzerName]['filter'] = array_map(
				static function ( $filter ) {
					return $filter === 'lowercase' ? 'icu_normalizer' : $filter;
				},
				$config['analyzer'][$analyzerName]['filter']
			);
		}
	}

	return $config;
}
| 273 | |
/**
 * Build the analysis config: the defaults for the language, then the
 * language-specific customizations applied on top.
 *
 * @param string|null $language Config language; when null the builder's
 *  $defaultLanguage is used
 * @return array the analysis config
 */
public function buildConfig( $language = null ) {
	if ( $language === null ) {
		$language = $this->defaultLanguage;
	}
	$baseConfig = $this->defaults( $language );
	return $this->customize( $baseConfig, $language );
}
| 284 | |
/**
 * Stop word set identifier per language code, sorted by set name.
 * Languages without an entry have no stop word set.
 *
 * @var string[]
 */
private static $stopwords = [
	'ar' => '_arabic_',
	'hy' => '_armenian_',
	'eu' => '_basque_',
	'pt-br' => '_brazilian_',
	'bg' => '_bulgarian_',
	'ca' => '_catalan_',
	'cs' => '_czech_',
	'da' => '_danish_',
	'nl' => '_dutch_',
	'en' => '_english_',
	'en-ca' => '_english_',
	'en-gb' => '_english_',
	'simple' => '_english_',
	'fi' => '_finnish_',
	'fr' => '_french_',
	'gl' => '_galician_',
	'de' => '_german_',
	'el' => '_greek_',
	'hi' => '_hindi_',
	'hu' => '_hungarian_',
	'id' => '_indonesian_',
	'ga' => '_irish_',
	'it' => '_italian_',
	'lv' => '_latvian_',
	'lt' => '_lithuanian_',
	'nb' => '_norwegian_',
	'nn' => '_norwegian_',
	'fa' => '_persian_',
	'pt' => '_portuguese_',
	'ro' => '_romanian_',
	'ru' => '_russian_',
	'ckb' => '_sorani_',
	'es' => '_spanish_',
	'sv' => '_swedish_',
	'th' => '_thai_',
	'tr' => '_turkish_',
];
| 324 | |
/**
 * Look up the stop word set configured for a language.
 *
 * @param string $lang Language code
 * @return string The stop word set for $lang, or '_none_' when the
 *  language has no entry in self::$stopwords
 */
private function getDefaultStopSet( $lang ) {
	if ( isset( self::$stopwords[$lang] ) ) {
		return self::$stopwords[$lang];
	}
	return '_none_';
}
| 332 | |
/**
 * Tell whether a stop word set is registered for a language.
 *
 * @param string $lang Language code
 * @return bool True when self::$stopwords has an entry for $lang
 */
public static function hasStopWords( $lang ) {
	$stopSet = self::$stopwords[$lang] ?? null;
	return $stopSet !== null;
}
| 340 | } |