Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
99.28% |
965 / 972 |
|
72.00% |
18 / 25 |
CRAP | |
0.00% |
0 / 1 |
AnalysisConfigBuilder | |
99.28% |
965 / 972 |
|
72.00% |
18 / 25 |
210 | |
0.00% |
0 / 1 |
__construct | |
95.83% |
23 / 24 |
|
0.00% |
0 / 1 |
7 | |||
shouldActivateIcuFolding | |
92.86% |
13 / 14 |
|
0.00% |
0 / 1 |
9.03 | |||
shouldActivateIcuTokenization | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
6 | |||
buildConfig | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
3 | |||
buildSimilarityConfig | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
enableICUTokenizer | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
6 | |||
standardTokenizerOnlyCleanup | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
6 | |||
enableICUFolding | |
100.00% |
32 / 32 |
|
100.00% |
1 / 1 |
12 | |||
switchFiltersToICUFolding | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
switchFiltersToICUFoldingPreserve | |
94.44% |
17 / 18 |
|
0.00% |
0 / 1 |
7.01 | |||
getICUSetFilter | |
97.96% |
48 / 49 |
|
0.00% |
0 / 1 |
28 | |||
getICUNormSetFilter | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
4.13 | |||
defaults | |
100.00% |
210 / 210 |
|
100.00% |
1 / 1 |
5 | |||
customize | |
100.00% |
484 / 484 |
|
100.00% |
1 / 1 |
66 | |||
fixAsciiFolding | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
7 | |||
getDefaultTextAnalyzerType | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
getDefaultFilters | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
5 | |||
resolveFilters | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
replaceFilter | |
87.50% |
7 / 8 |
|
0.00% |
0 / 1 |
4.03 | |||
mergeConfig | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
11 | |||
buildLanguageConfigs | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
isIcuAvailable | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
enableGlobalCustomFilters | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
4 | |||
insertGlobalCustomFilter | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
6 | |||
buildGlobalCustomFilters | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Maintenance; |
4 | |
5 | use CirrusSearch\CirrusSearch; |
6 | use CirrusSearch\CirrusSearchHookRunner; |
7 | use CirrusSearch\Profile\SearchProfileService; |
8 | use CirrusSearch\SearchConfig; |
9 | use MediaWiki\MediaWikiServices; |
10 | |
11 | /** |
12 | * Builds elasticsearch analysis config arrays. |
13 | * |
14 | * This program is free software; you can redistribute it and/or modify |
15 | * it under the terms of the GNU General Public License as published by |
16 | * the Free Software Foundation; either version 2 of the License, or |
17 | * (at your option) any later version. |
18 | * |
19 | * This program is distributed in the hope that it will be useful, |
20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
22 | * GNU General Public License for more details. |
23 | * |
24 | * You should have received a copy of the GNU General Public License along |
25 | * with this program; if not, write to the Free Software Foundation, Inc., |
26 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
27 | * http://www.gnu.org/copyleft/gpl.html |
28 | */ |
29 | class AnalysisConfigBuilder { |
30 | /** |
31 | * Version number for the core analysis. Increment the major |
32 | * version when the analysis changes in an incompatible way, |
33 | * and change the minor version when it changes but isn't |
34 | * incompatible. |
35 | * |
36 | * You may also need to increment MetaStoreIndex::METASTORE_VERSION |
37 | * manually as well. |
38 | */ |
39 | public const VERSION = '0.12'; |
40 | |
41 | /** |
42 | * Maximum number of characters allowed in keyword terms. |
43 | */ |
44 | private const KEYWORD_IGNORE_ABOVE = 5000; |
45 | |
46 | /** |
47 | * Temporary magic value to prevent enabling ICU tokenizer in specific analyzers |
48 | */ |
49 | private const STANDARD_TOKENIZER_ONLY = 'std_only'; |
50 | |
51 | /** |
52 | * @var bool is the icu plugin available? |
53 | */ |
54 | private $icu; |
55 | |
56 | /** |
57 | * @var array Similarity algo (tf/idf, bm25, etc) configuration |
58 | */ |
59 | private $similarity; |
60 | |
61 | /** |
62 | * @var SearchConfig cirrus config |
63 | */ |
64 | protected $config; |
65 | /** |
66 | * @var string[] |
67 | */ |
68 | private $plugins; |
69 | |
70 | /** |
71 | * @var string |
72 | */ |
73 | protected $defaultLanguage; |
74 | |
75 | /** |
76 | * @var CirrusSearchHookRunner |
77 | */ |
78 | private $cirrusSearchHookRunner; |
79 | |
80 | /** |
81 | * @var GlobalCustomFilter[] |
82 | */ |
83 | public $globalCustomFilters; |
84 | |
/**
 * Set up the builder for one language and one set of installed plugins.
 *
 * Plugin-provided language analyzers are merged into the known language
 * analyzers only when every plugin named in their comma-separated
 * requirement key is present in $plugins.
 *
 * @param string $langCode The language code to build config for
 * @param string[] $plugins list of plugins installed in Elasticsearch
 * @param SearchConfig|null $config when null, the 'CirrusSearch' config is
 *  obtained from the MediaWikiServices config factory
 * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner when null, a
 *  runner is built on the MediaWikiServices hook container
 */
public function __construct(
	$langCode,
	array $plugins,
	?SearchConfig $config = null,
	?CirrusSearchHookRunner $cirrusSearchHookRunner = null
) {
	$this->globalCustomFilters = $this->buildGlobalCustomFilters();

	$this->defaultLanguage = $langCode;
	$this->plugins = $plugins;
	foreach ( $this->elasticsearchLanguageAnalyzersFromPlugins as $pluginSpec => $extra ) {
		// The key is a comma-separated list of plugins that must all be installed.
		$missing = array_diff( explode( ',', $pluginSpec ), $plugins );
		if ( $missing === [] ) {
			$this->elasticsearchLanguageAnalyzers =
				array_merge( $this->elasticsearchLanguageAnalyzers, $extra );
		}
	}
	$this->icu = in_array( 'analysis-icu', $plugins, true );
	$config ??= MediaWikiServices::getInstance()->getConfigFactory()
		->makeConfig( 'CirrusSearch' );
	$similarity = $config->getProfileService()->loadProfile( SearchProfileService::SIMILARITY );
	if ( !array_key_exists( 'similarity', $similarity ) ) {
		// Make sure the hook below always receives an array to work on.
		$similarity['similarity'] = [];
	}
	$this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?? new CirrusSearchHookRunner(
		MediaWikiServices::getInstance()->getHookContainer() );
	$this->cirrusSearchHookRunner->onCirrusSearchSimilarityConfig( $similarity['similarity'] );
	$this->similarity = $similarity;

	$this->config = $config;
}
129 | |
/**
 * Determine if ICU folding should be used.
 *
 * ICU folding needs both the ICU plugin and the 'extra' plugin. When both
 * are available the decision is driven by the CirrusSearchUseIcuFolding
 * setting: 'yes' (or boolean true) enables it, 'default' defers to the
 * per-language table, and anything else disables it.
 *
 * @param string $language Config language
 * @return bool true if icu folding should be enabled
 */
public function shouldActivateIcuFolding( $language ) {
	// ICU folding requires the icu plugin and the extra plugin
	if ( !$this->isIcuAvailable() || !in_array( 'extra', $this->plugins, true ) ) {
		return false;
	}
	$setting = $this->config->get( 'CirrusSearchUseIcuFolding' );
	// BC code, this config var was originally a simple boolean.
	// Comparisons are strict so that unrelated values (e.g. integer 0)
	// can never be mistaken for one of the keywords.
	if ( $setting === true || $setting === 'yes' ) {
		return true;
	}
	if ( $setting === 'default' ) {
		return $this->languagesWithIcuFolding[$language] ?? false;
	}
	// 'no', boolean false and any unrecognised value disable ICU folding
	return false;
}
159 | |
/**
 * Determine if the icu tokenizer can be enabled.
 *
 * Requires the ICU plugin. When it is available the decision is driven by
 * the CirrusSearchUseIcuTokenizer setting: 'yes' (or boolean true) enables
 * it, 'default' defers to the per-language table, and anything else
 * disables it.
 *
 * @param string $language Config language
 * @return bool true if the icu tokenizer should be enabled
 */
public function shouldActivateIcuTokenization( $language ) {
	if ( !$this->isIcuAvailable() ) {
		// requires the icu plugin
		return false;
	}
	$setting = $this->config->get( 'CirrusSearchUseIcuTokenizer' );
	// Boolean true is accepted as an alias of 'yes'. Comparisons are strict
	// so that unrelated values (e.g. integer 0) can never be mistaken for
	// one of the keywords.
	if ( $setting === true || $setting === 'yes' ) {
		return true;
	}
	if ( $setting === 'default' ) {
		return $this->languagesWithIcuTokenization[$language] ?? false;
	}
	// 'no', boolean false and any unrecognised value disable the icu tokenizer
	return false;
}
182 | |
/**
 * Build the analysis config.
 *
 * Steps, in order: language defaults, language customization, the
 * CirrusSearchAnalysisConfig hook, global custom filters, optional ICU
 * tokenizer, optional ICU folding, ascii folding fix-up and finally the
 * standard-tokenizer-only cleanup.
 *
 * @param string|null $language Config language, the language given to the
 *  constructor is used when null
 * @return array the analysis config
 */
public function buildConfig( $language = null ) {
	if ( $language === null ) {
		$language = $this->defaultLanguage;
	}

	$analysis = $this->defaults( $language );
	$analysis = $this->customize( $analysis, $language );
	$this->cirrusSearchHookRunner->onCirrusSearchAnalysisConfig( $analysis, $this );
	$analysis = $this->enableGlobalCustomFilters( $analysis, $language );

	if ( $this->shouldActivateIcuTokenization( $language ) ) {
		$analysis = $this->enableICUTokenizer( $analysis );
	}
	if ( $this->shouldActivateIcuFolding( $language ) ) {
		$analysis = $this->enableICUFolding( $analysis, $language );
	}

	return $this->standardTokenizerOnlyCleanup( $this->fixAsciiFolding( $analysis ) );
}
205 | |
/**
 * Get the 'similarity' section of the similarity profile held by this builder.
 *
 * @return array|null the similarity config, null when there is none
 */
public function buildSimilarityConfig() {
	if ( isset( $this->similarity['similarity'] ) ) {
		return $this->similarity['similarity'];
	}
	return null;
}
212 | |
/**
 * Replace the standard tokenizer with icu_tokenizer.
 *
 * Only analyzers that are of type 'custom' (or declare no type at all) and
 * whose tokenizer is exactly 'standard' are changed.
 *
 * @param mixed[] $config
 * @return mixed[] updated config
 */
public function enableICUTokenizer( array $config ) {
	foreach ( $config[ 'analyzer' ] as &$analyzer ) {
		$isCustom = !isset( $analyzer[ 'type' ] ) || $analyzer[ 'type' ] == 'custom';
		$usesStandard = isset( $analyzer[ 'tokenizer' ] ) && $analyzer[ 'tokenizer' ] === 'standard';
		if ( $isCustom && $usesStandard ) {
			$analyzer[ 'tokenizer' ] = 'icu_tokenizer';
		}
	}
	// drop the reference left behind by the by-reference loop
	unset( $analyzer );

	return $config;
}
229 | |
/**
 * Replace STANDARD_TOKENIZER_ONLY with the actual standard tokenizer.
 *
 * Only analyzers that are of type 'custom' (or declare no type at all) are
 * inspected; every other analyzer is left untouched.
 *
 * @param mixed[] $config
 * @return mixed[] updated config
 */
public function standardTokenizerOnlyCleanup( array $config ) {
	foreach ( $config[ 'analyzer' ] as &$analyzer ) {
		$isCustom = !isset( $analyzer[ 'type' ] ) || $analyzer[ 'type' ] == 'custom';
		if ( $isCustom && ( $analyzer[ 'tokenizer' ] ?? null ) === self::STANDARD_TOKENIZER_ONLY ) {
			// upgrades/changes to the standard tokenizer were blocked with the
			// magic value, put the actual standard tokenizer back in its place
			$analyzer[ 'tokenizer' ] = 'standard';
		}
	}
	// drop the reference left behind by the by-reference loop
	unset( $analyzer );

	return $config;
}
249 | |
/**
 * Activate ICU folding instead of asciifolding.
 *
 * Registers the 'icu_folding' and 'icu_nfkc_normalization' filters, then
 * rewrites the filter chain of every custom (or untyped) analyzer that uses
 * 'asciifolding' or 'asciifolding_preserve'. The 'plain' analyzer, when
 * defined, always goes through the preserve rewrite in append mode.
 *
 * @param mixed[] $config
 * @param string $language Config language
 * @return mixed[] updated config
 */
public function enableICUFolding( array $config, $language ) {
	$setFilter = $this->getICUSetFilter( $language );
	$foldingFilter = [ 'type' => 'icu_folding' ];
	if ( !empty( $setFilter ) ) {
		$foldingFilter[ 'unicodeSetFilter' ] = $setFilter;
	}
	$config[ 'filter' ][ 'icu_folding' ] = $foldingFilter;

	// Simple nfkc normalizer for the cases where the original is
	// preserved but no lowercase filter ran beforehand
	$config[ 'filter' ][ 'icu_nfkc_normalization' ] = [
		'type' => 'icu_normalizer',
		'name' => 'nfkc',
	];

	// The loop walks a by-value snapshot of the analyzers, so writing the
	// rewritten chains straight back into $config is safe.
	foreach ( $config[ 'analyzer' ] as $name => $analyzer ) {
		$isCustom = !isset( $analyzer[ 'type' ] ) || $analyzer[ 'type' ] == 'custom';
		if ( !$isCustom || !isset( $analyzer[ 'filter' ] ) ) {
			continue;
		}
		$chain = $analyzer[ 'filter' ];
		if ( in_array( 'asciifolding', $chain ) ) {
			$config[ 'analyzer' ][ $name ][ 'filter' ] = $this->switchFiltersToICUFolding( $chain );
		}
		// Both rewrites start from the original chain; when both markers are
		// present the preserve variant is the one that is kept.
		if ( in_array( 'asciifolding_preserve', $chain ) ) {
			$config[ 'analyzer' ][ $name ][ 'filter' ] =
				$this->switchFiltersToICUFoldingPreserve( $chain );
		}
	}

	// Explicitly enable icu_folding on the plain analyzer if it's not
	// already enabled
	if ( isset( $config[ 'analyzer' ][ 'plain' ] ) ) {
		$plainChain = $config[ 'analyzer' ][ 'plain' ][ 'filter' ] ?? [];
		$config[ 'analyzer' ][ 'plain' ][ 'filter' ] =
			$this->switchFiltersToICUFoldingPreserve( $plainChain, true );
	}

	return $config;
}
310 | |
/**
 * Replace the first occurrence of asciifolding with icu_folding followed
 * by remove_empty.
 *
 * The list is returned unchanged when it does not contain asciifolding.
 *
 * @param string[] $filters
 * @return string[] new list of filters
 */
private function switchFiltersToICUFolding( array $filters ) {
	// Search the re-indexed values: array_splice() expects a position, not a key.
	$position = array_search( 'asciifolding', array_values( $filters ), true );
	if ( $position === false ) {
		// Without this guard false would be used as offset 0 and the
		// first filter of the list would be overwritten.
		return $filters;
	}
	array_splice( $filters, $position, 1, [ 'icu_folding', 'remove_empty' ] );
	return $filters;
}
321 | |
322 | /** |
323 | * Replace occurrence of asciifolding_preserve with a set |
324 | * of compatible filters to enable icu_folding |
325 | * @param string[] $filters |
326 | * @param bool $append append icu_folding even if asciifolding is not present |
327 | * @return string[] new list of filters |
328 | */ |
329 | private function switchFiltersToICUFoldingPreserve( array $filters, $append = false ) { |
330 | if ( in_array( 'icu_folding', $filters ) ) { |
331 | // ICU folding already here |
332 | return $filters; |
333 | } |
334 | $ap_idx = array_search( 'asciifolding_preserve', $filters ); |
335 | if ( $ap_idx === false && $append ) { |
336 | $ap_idx = count( $filters ); |
337 | // fake an asciifolding_preserve so we can |
338 | // reuse code that replaces it |
339 | $filters[] = 'asciifolding_preserve'; |
340 | } |
341 | if ( $ap_idx === false ) { |
342 | return $filters; |
343 | } |
344 |   |