Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
99.26% |
944 / 951 |
|
72.00% |
18 / 25 |
CRAP | |
0.00% |
0 / 1 |
AnalysisConfigBuilder | |
99.26% |
944 / 951 |
|
72.00% |
18 / 25 |
206 | |
0.00% |
0 / 1 |
__construct | |
95.83% |
23 / 24 |
|
0.00% |
0 / 1 |
7 | |||
shouldActivateIcuFolding | |
92.86% |
13 / 14 |
|
0.00% |
0 / 1 |
9.03 | |||
shouldActivateIcuTokenization | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
6 | |||
buildConfig | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
4 | |||
buildSimilarityConfig | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
enableICUTokenizer | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
6 | |||
standardTokenizerOnlyCleanup | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
6 | |||
disableLimitedMappings | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
4 | |||
enableICUFolding | |
100.00% |
32 / 32 |
|
100.00% |
1 / 1 |
12 | |||
switchFiltersToICUFolding | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
switchFiltersToICUFoldingPreserve | |
94.44% |
17 / 18 |
|
0.00% |
0 / 1 |
7.01 | |||
getICUSetFilter | |
97.96% |
48 / 49 |
|
0.00% |
0 / 1 |
28 | |||
getICUNormSetFilter | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
4.13 | |||
defaults | |
100.00% |
247 / 247 |
|
100.00% |
1 / 1 |
5 | |||
customize | |
100.00% |
422 / 422 |
|
100.00% |
1 / 1 |
65 | |||
fixAsciiFolding | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
7 | |||
getDefaultTextAnalyzerType | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
getDefaultFilters | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
5 | |||
resolveFilters | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
replaceFilter | |
87.50% |
7 / 8 |
|
0.00% |
0 / 1 |
4.03 | |||
mergeConfig | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
12 | |||
buildLanguageConfigs | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
isIcuAvailable | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
enableGlobalCustomFilters | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
buildGlobalCustomFilters | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Maintenance; |
4 | |
5 | use CirrusSearch\CirrusSearch; |
6 | use CirrusSearch\CirrusSearchHookRunner; |
7 | use CirrusSearch\Profile\SearchProfileService; |
8 | use CirrusSearch\SearchConfig; |
9 | use MediaWiki\MediaWikiServices; |
10 | |
11 | /** |
12 | * Builds elasticsearch analysis config arrays. |
13 | * |
14 | * This program is free software; you can redistribute it and/or modify |
15 | * it under the terms of the GNU General Public License as published by |
16 | * the Free Software Foundation; either version 2 of the License, or |
17 | * (at your option) any later version. |
18 | * |
19 | * This program is distributed in the hope that it will be useful, |
20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
22 | * GNU General Public License for more details. |
23 | * |
24 | * You should have received a copy of the GNU General Public License along |
25 | * with this program; if not, write to the Free Software Foundation, Inc., |
26 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
27 | * http://www.gnu.org/copyleft/gpl.html |
28 | */ |
29 | class AnalysisConfigBuilder { |
30 | /** |
31 | * Version number for the core analysis. Increment the major |
32 | * version when the analysis changes in an incompatible way, |
33 | * and change the minor version when it changes but isn't |
34 | * incompatible. |
35 | * |
36 | * You may also need to increment MetaStoreIndex::METASTORE_VERSION |
37 | * manually as well. |
38 | */ |
39 | public const VERSION = '0.12'; |
40 | |
41 | /** |
42 | * Maximum number of characters allowed in keyword terms. |
43 | */ |
44 | private const KEYWORD_IGNORE_ABOVE = 5000; |
45 | |
46 | /** |
47 | * Temporary magic value to prevent enabling ICU tokenizer in specific analyzers |
48 | */ |
49 | private const STANDARD_TOKENIZER_ONLY = 'std_only'; |
50 | |
51 | /** |
52 | * @var bool is the icu plugin available? |
53 | */ |
54 | private $icu; |
55 | |
56 | /** |
57 | * @var array Similarity algo (tf/idf, bm25, etc) configuration |
58 | */ |
59 | private $similarity; |
60 | |
61 | /** |
62 | * @var SearchConfig cirrus config |
63 | */ |
64 | protected $config; |
65 | |
66 | /** |
67 | * @var string[] |
68 | */ |
69 | private $plugins; |
70 | |
71 | /** |
72 | * @var string |
73 | */ |
74 | protected $defaultLanguage; |
75 | |
76 | /** |
77 | * @var CirrusSearchHookRunner |
78 | */ |
79 | private $cirrusSearchHookRunner; |
80 | |
81 | /** |
82 | * @var GlobalCustomFilter[] |
83 | */ |
84 | public $globalCustomFilters; |
85 | |
/**
 * Set up the builder for one language and one set of installed plugins.
 *
 * Language analyzers contributed by plugins are merged into the known
 * analyzers only when every plugin they require is installed. The
 * similarity profile is loaded, guaranteed to have a 'similarity' key,
 * and handed to the CirrusSearchSimilarityConfig hook.
 *
 * @param string $langCode The language code to build config for
 * @param string[] $plugins list of plugins installed in Elasticsearch
 * @param SearchConfig|null $config When null, the 'CirrusSearch' config is
 *  fetched from the MediaWikiServices config factory
 * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner When null, a
 *  runner is created on top of the MediaWikiServices hook container
 */
public function __construct(
	$langCode,
	array $plugins,
	?SearchConfig $config = null,
	?CirrusSearchHookRunner $cirrusSearchHookRunner = null
) {
	$this->globalCustomFilters = $this->buildGlobalCustomFilters();

	$this->defaultLanguage = $langCode;
	$this->plugins = $plugins;
	foreach ( $this->elasticsearchLanguageAnalyzersFromPlugins as $pluginSpec => $extra ) {
		// The key is a comma-separated list of plugins; all of them must
		// be installed for the extra analyzers to be usable.
		$allPresent = true;
		foreach ( explode( ',', $pluginSpec ) as $requiredPlugin ) {
			if ( !in_array( $requiredPlugin, $plugins, true ) ) {
				$allPresent = false;
				break;
			}
		}
		if ( $allPresent ) {
			$this->elasticsearchLanguageAnalyzers =
				array_merge( $this->elasticsearchLanguageAnalyzers, $extra );
		}
	}
	$this->icu = in_array( 'analysis-icu', $plugins, true );
	$config ??= MediaWikiServices::getInstance()->getConfigFactory()
		->makeConfig( 'CirrusSearch' );
	$similarity = $config->getProfileService()->loadProfile( SearchProfileService::SIMILARITY );
	if ( !array_key_exists( 'similarity', $similarity ) ) {
		// Make sure the hook always receives a 'similarity' entry
		$similarity['similarity'] = [];
	}
	$this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?? new CirrusSearchHookRunner(
		MediaWikiServices::getInstance()->getHookContainer() );
	$this->cirrusSearchHookRunner->onCirrusSearchSimilarityConfig( $similarity['similarity'] );
	$this->similarity = $similarity;

	$this->config = $config;
}
130 | |
/**
 * Determine if ICU folding should be used in place of ascii folding.
 *
 * Reads the CirrusSearchUseIcuFolding setting, which is expected to be
 * 'yes', 'no' or 'default' (or a legacy boolean). With 'default' the
 * answer comes from the per-language table; any other value disables
 * folding.
 *
 * @param string $language Config language
 * @return bool true if icu folding should be enabled
 */
public function shouldActivateIcuFolding( $language ) {
	// ICU folding requires the icu plugin and the extra plugin
	if ( !$this->isIcuAvailable() || !in_array( 'extra', $this->plugins, true ) ) {
		return false;
	}

	$in_config = $this->config->get( 'CirrusSearchUseIcuFolding' );
	// BC code, this config var was originally a simple boolean
	if ( $in_config === true ) {
		$in_config = 'yes';
	} elseif ( $in_config === false ) {
		$in_config = 'no';
	}

	// Compare strictly: a loose switch lets unrelated values match
	// (on PHP < 8 the integer 0 is == 'yes').
	if ( $in_config === 'yes' ) {
		return true;
	}
	if ( $in_config === 'default' ) {
		return $this->languagesWithIcuFolding[$language] ?? false;
	}
	// 'no' and every unrecognised value
	return false;
}
160 | |
/**
 * Determine if the icu tokenizer can be enabled.
 *
 * Reads the CirrusSearchUseIcuTokenizer setting, which is expected to be
 * 'yes', 'no' or 'default'. With 'default' the answer comes from the
 * per-language table; any other value disables the tokenizer.
 *
 * @param string $language Config language
 * @return bool
 */
public function shouldActivateIcuTokenization( $language ) {
	// requires the icu plugin
	if ( !$this->isIcuAvailable() ) {
		return false;
	}

	$in_config = $this->config->get( 'CirrusSearchUseIcuTokenizer' );
	// Booleans used to work only through loose comparison; keep accepting
	// them explicitly, the same way shouldActivateIcuFolding() does.
	if ( $in_config === true ) {
		$in_config = 'yes';
	} elseif ( $in_config === false ) {
		$in_config = 'no';
	}

	// Compare strictly: a loose switch lets unrelated values match
	// (on PHP < 8 the integer 0 is == 'yes').
	if ( $in_config === 'yes' ) {
		return true;
	}
	if ( $in_config === 'default' ) {
		return $this->languagesWithIcuTokenization[$language] ?? false;
	}
	// 'no' and every unrecognised value
	return false;
}
183 | |
/**
 * Assemble the complete analysis config for a language.
 *
 * Starts from the customized defaults, lets the CirrusSearchAnalysisConfig
 * hook see the result, then applies the global custom filters, the optional
 * ICU tokenizer and ICU folding upgrades, and the final cleanup passes.
 *
 * @param string|null $language Config language; the language given to the
 *  constructor is used when null
 * @return array the analysis config
 */
public function buildConfig( $language = null ) {
	if ( $language === null ) {
		$language = $this->defaultLanguage;
	}

	$analysis = $this->customize( $this->defaults( $language ), $language );
	$this->cirrusSearchHookRunner->onCirrusSearchAnalysisConfig( $analysis, $this );
	$analysis = $this->enableGlobalCustomFilters( $analysis, $language );

	// Optional ICU upgrades, tokenizer first and folding second
	$useIcuTokenizer = $this->shouldActivateIcuTokenization( $language );
	if ( $useIcuTokenizer ) {
		$analysis = $this->enableICUTokenizer( $analysis );
	}
	$useIcuFolding = $this->shouldActivateIcuFolding( $language );
	if ( $useIcuFolding ) {
		$analysis = $this->enableICUFolding( $analysis, $language );
	}

	// Cleanup passes that always run
	$analysis = $this->standardTokenizerOnlyCleanup( $this->fixAsciiFolding( $analysis ) );

	// limited_mapping is only usable when extra-analysis-textify is installed
	if ( in_array( 'extra-analysis-textify', $this->plugins ) ) {
		return $analysis;
	}
	return $this->disableLimitedMappings( $analysis );
}
209 | |
/**
 * Fetch the 'similarity' section of the similarity profile stored by the
 * constructor.
 *
 * @return array|null the similarity config, or null when it is unset
 */
public function buildSimilarityConfig() {
	if ( isset( $this->similarity['similarity'] ) ) {
		return $this->similarity['similarity'];
	}
	return null;
}
216 | |
/**
 * Replace the standard tokenizer with icu_tokenizer.
 *
 * Only analyzers that are custom (type 'custom' or no type at all) and
 * whose tokenizer is exactly 'standard' are changed.
 *
 * @param mixed[] $config
 * @return mixed[] update config
 */
public function enableICUTokenizer( array $config ) {
	foreach ( $config[ 'analyzer' ] as &$analyzer ) {
		if ( isset( $analyzer[ 'type' ] ) && $analyzer[ 'type' ] !== 'custom' ) {
			// Built-in analyzer types do not take a tokenizer setting
			continue;
		}
		if ( ( $analyzer[ 'tokenizer' ] ?? null ) === 'standard' ) {
			$analyzer[ 'tokenizer' ] = 'icu_tokenizer';
		}
	}
	// Drop the reference so the last analyzer is no longer aliased
	unset( $analyzer );

	return $config;
}
233 | |
/**
 * Replace STANDARD_TOKENIZER_ONLY with the actual standard tokenizer.
 *
 * Only analyzers that are custom (type 'custom' or no type at all) are
 * inspected.
 *
 * @param mixed[] $config
 * @return mixed[] update config
 */
public function standardTokenizerOnlyCleanup( array $config ) {
	foreach ( $config[ 'analyzer' ] as &$analyzer ) {
		if ( isset( $analyzer[ 'type' ] ) && $analyzer[ 'type' ] !== 'custom' ) {
			continue;
		}
		if ( ( $analyzer[ 'tokenizer' ] ?? null ) === self::STANDARD_TOKENIZER_ONLY ) {
			// if we blocked upgrades/changes to the standard tokenizer,
			// replace the magic value with the actual standard tokenizer
			$analyzer[ 'tokenizer' ] = 'standard';
		}
	}
	// Drop the reference so the last analyzer is no longer aliased
	unset( $analyzer );

	return $config;
}
253 | |
/**
 * Replace limited_mapping char filters with plain mapping char filters.
 *
 * The rewrite itself is unconditional; buildConfig() only calls this when
 * the extra-analysis-textify plugin is not installed.
 *
 * @param mixed[] $config
 * @return mixed[] update config
 */
public function disableLimitedMappings( array $config ) {
	foreach ( $config[ 'char_filter' ] as &$charFilter ) {
		if ( ( $charFilter[ 'type' ] ?? null ) === 'limited_mapping' ) {
			$charFilter[ 'type' ] = 'mapping';
		}
	}
	// Drop the reference so the last char filter is no longer aliased
	unset( $charFilter );

	return $config;
}
268 | |
269 | /** |
270 | * Activate ICU folding instead of asciifolding |
271 | * @param mixed[] $config |
272 | * @param string $language Config language |
273 | * @return mixed[] update config |
274 | */ |
275 | public function enableICUFolding( array $config, $language ) { |
276 | $unicodeSetFilter = $this->getICUSetFilter( $language ); |
277 | $filter = [ |
278 | 'type' => 'icu_folding', |
279 | ]; |
280 | if ( $unicodeSetFilter !== null ) { |
281 | $filter[ 'unicodeSetFilter' ] = $unicodeSetFilter; |
282 | } |
283 | $config[ 'filter' ][ 'icu_folding' ] = $filter; |
284 | |
285 | // Adds a simple nfkc normalizer for cases where |
286 | // we preserve original but the lowercase filter |
287 | // is not used before |
288 | $config[ 'filter' ][ 'icu_nfkc_normalization' ] = [ |
289 | 'type' => 'icu_normalizer', |
290 | 'name' => 'nfkc', |
291 | ]; |
292 | |
293 | $newfilters = []; |
294 | foreach ( $config[ 'analyzer' ] as $name => $value ) { |
295 | if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) { |
296 | continue; |
297 | } |
298 | if ( !isset( $value[ 'filter' ] ) ) { |
299 | continue; |
300 | } |
301 | if ( in_array( 'asciifolding', $value[ 'filter' ] ) ) { |
302 | $newfilters[ $name ] = $this->switchFiltersToICUFolding( $value[ 'filter' ] ); |
303 | } |
304 | if ( in_array( 'asciifolding_preserve', $value[ 'filter' ] ) ) { |
305 | $newfilters[ $name ] = $this->switchFiltersToICUFoldingPreserve( $value[ 'filter' ] ); |
306 | } |
307 | } |
308 | |
309 | foreach ( $newfilters as $name => $filters ) { |
310 | $config[ 'analyzer' ][ $name ][ 'filter' ] = $filters; |
311 | } |
312 | // Explicitly enable icu_folding on plain analyzers if it's not |
313 | // already enabled |
314 | foreach ( [ 'plain' ] as $analyzer ) { |
315 | if ( !isset( $config[ 'analyzer' ][ $analyzer ] ) ) { |
316 | continue; |
317 | } |