Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
94.79% |
91 / 96 |
|
80.00% |
8 / 10 |
CRAP | |
0.00% |
0 / 1 |
AnalysisFilter | |
94.79% |
91 / 96 |
|
80.00% |
8 / 10 |
50.35 | |
0.00% |
0 / 1 |
findUsedFromField | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
6 | |||
findUsedAnalyzersInMappings | |
75.00% |
6 / 8 |
|
0.00% |
0 / 1 |
3.14 | |||
pushAnalyzerAliasesIntoField | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
8 | |||
pushAnalyzerAliasesIntoMappings | |
66.67% |
6 / 9 |
|
0.00% |
0 / 1 |
3.33 | |||
filter | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
filterUnusedAnalysisChain | |
100.00% |
18 / 18 |
|
100.00% |
1 / 1 |
8 | |||
recursiveKsort | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
calcDeduplicationAliases | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
5 | |||
deduplicateAnalysisConfig | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
9 | |||
filterAnalysis | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Maintenance; |
4 | |
5 | use MediaWiki\Json\FormatJson; |
6 | |
/**
 * Filter unused and duplicate entries from elasticsearch index configuration.
 *
 * The class is stateless: every method works on the arrays handed to it and
 * returns the result, nothing is retained between calls.
 */
class AnalysisFilter {
	/** @var string[] List of keys in mappings that reference analyzers */
	private const ANALYZER_FIELDS = [ 'analyzer', 'search_analyzer', 'search_quote_analyzer' ];

	/** @var string[] List of keys in mappings that must be recursively searched */
	private const SUBFIELD_FIELDS = [ 'fields', 'properties' ];

	/**
	 * Recursively finds used analyzers from elasticsearch mappings
	 *
	 * @param array $properties a 'properties' or 'fields' list from the mappings
	 * @return Set The set of referenced analyzers
	 */
	private function findUsedFromField( array $properties ): Set {
		$analyzers = new Set();
		foreach ( $properties as $config ) {
			foreach ( self::ANALYZER_FIELDS as $key ) {
				if ( isset( $config[$key] ) ) {
					$analyzers->add( $config[$key] );
				}
			}
			foreach ( self::SUBFIELD_FIELDS as $key ) {
				// Same guard as pushAnalyzerAliasesIntoField(): only descend
				// into sub-lists that really are arrays.
				if ( isset( $config[$key] ) && is_array( $config[$key] ) ) {
					$analyzers->union( $this->findUsedFromField( $config[$key] ) );
				}
			}
		}
		return $analyzers;
	}

	/**
	 * Collect every analyzer name referenced by the mappings.
	 *
	 * Handles both the type-less layout (a top level 'properties' key) and the
	 * legacy layout where the top level is keyed by index type.
	 *
	 * @param array[] $mappings Elasticsearch mapping configuration
	 * @return Set The set of analyzer names referenced in $mappings
	 */
	public function findUsedAnalyzersInMappings( array $mappings ) {
		$analyzers = new Set();
		if ( isset( $mappings['properties'] ) ) {
			// modern elastic, no index types
			$analyzers->union(
				$this->findUsedFromField( $mappings['properties'] ) );
		} else {
			// BC for parts still using index types
			foreach ( $mappings as $config ) {
				// A type without a 'properties' list references no analyzers.
				if ( isset( $config['properties'] ) && is_array( $config['properties'] ) ) {
					$analyzers->union(
						$this->findUsedFromField( $config['properties'] ) );
				}
			}
		}
		return $analyzers;
	}

	/**
	 * Recursively applies analyzer aliases to elasticsearch mappings
	 *
	 * @param array $properties a 'properties' or 'fields' list from the mappings
	 * @param string[] $aliases Map from current analyzer name to replacement name
	 * @return array $properties with analyzer aliases applied
	 */
	private function pushAnalyzerAliasesIntoField( array $properties, array $aliases ): array {
		foreach ( $properties as &$config ) {
			foreach ( self::ANALYZER_FIELDS as $key ) {
				if ( isset( $config[$key] ) && isset( $aliases[$config[$key]] ) ) {
					$config[$key] = $aliases[$config[$key]];
				}
			}
			foreach ( self::SUBFIELD_FIELDS as $key ) {
				if ( isset( $config[$key] ) && is_array( $config[$key] ) ) {
					$config[$key] = $this->pushAnalyzerAliasesIntoField(
						$config[$key], $aliases
					);
				}
			}
		}
		// Break the reference so the last element is not left aliased to $config.
		unset( $config );
		return $properties;
	}

	/**
	 * Rewrite analyzer references in the mappings according to $aliases.
	 *
	 * Handles both the type-less layout (a top level 'properties' key) and the
	 * legacy layout where the top level is keyed by index type. Entries without
	 * a 'properties' list are returned unchanged.
	 *
	 * @param array[] $mappings Elasticsearch index mapping configuration
	 * @param string[] $aliases Mapping from old name to new name for analyzers
	 * @return array Updated index mapping configuration
	 */
	public function pushAnalyzerAliasesIntoMappings( array $mappings, $aliases ) {
		if ( isset( $mappings['properties'] ) ) {
			// modern elastic, no index types
			$mappings['properties'] = $this->pushAnalyzerAliasesIntoField(
				$mappings['properties'], $aliases
			);
		} else {
			// BC for parts still using index types
			foreach ( $mappings as $mappingType => $config ) {
				// Nothing to rewrite, and we must not invent a 'properties' key.
				if ( !isset( $config['properties'] ) || !is_array( $config['properties'] ) ) {
					continue;
				}
				$mappings[$mappingType]['properties'] = $this->pushAnalyzerAliasesIntoField(
					$config['properties'], $aliases
				);
			}
		}
		return $mappings;
	}

	/**
	 * Drop every entry of $data whose key is not contained in $keysToKeep.
	 *
	 * @param array $data Map to filter
	 * @param Set $keysToKeep Keys allowed to remain in $data
	 * @return array $data restricted to the kept keys
	 */
	private function filter( $data, Set $keysToKeep ) {
		foreach ( $data as $k => $v ) {
			if ( !$keysToKeep->contains( $k ) ) {
				unset( $data[$k] );
			}
		}
		return $data;
	}

	/**
	 * Remove analyzers that are not in $usedAnalyzers, along with the filters,
	 * char filters and tokenizers that only the removed analyzers referenced.
	 *
	 * Sections other than analyzer, filter, char_filter and tokenizer are
	 * passed through untouched.
	 *
	 * @param array $analysis The index.analysis field of elasticsearch index settings
	 * @param Set $usedAnalyzers Set of analyzers to keep configurations for
	 * @return array The $analysis array filtered to only pieces needed for $usedAnalyzers
	 */
	public function filterUnusedAnalysisChain( $analysis, Set $usedAnalyzers ) {
		$sets = [
			'analyzer' => $usedAnalyzers,
			'filter' => new Set(),
			'char_filter' => new Set(),
			'tokenizer' => new Set(),
		];
		// Tolerate configuration without an analyzer section.
		foreach ( $analysis['analyzer'] ?? [] as $name => $config ) {
			if ( !$usedAnalyzers->contains( $name ) ) {
				continue;
			}
			foreach ( [ 'filter', 'char_filter' ] as $k ) {
				if ( isset( $config[$k] ) ) {
					$sets[$k]->addAll( $config[$k] );
				}
			}
			if ( isset( $config['tokenizer'] ) ) {
				$sets['tokenizer']->add( $config['tokenizer'] );
			}
		}

		foreach ( $sets as $k => $used ) {
			if ( isset( $analysis[$k] ) ) {
				$analysis[$k] = $this->filter( $analysis[$k], $used );
			}
		}

		return $analysis;
	}

	/**
	 * Sort an array by key at every nesting level, so that two arrays with the
	 * same content but different key order end up identical.
	 *
	 * @param array $array
	 * @return array Sorted copy of $array
	 */
	private function recursiveKsort( array $array ): array {
		foreach ( $array as $k => $v ) {
			if ( is_array( $v ) ) {
				$array[$k] = $this->recursiveKsort( $v );
			}
		}
		ksort( $array );
		return $array;
	}

	/**
	 * Group the entries of $input by content and elect one name per group.
	 *
	 * Two entries are considered equal when their key-sorted JSON encodings
	 * match.
	 *
	 * @param array[] $input Map from element name to element configuration
	 * @return array Map from every name in $input to the winning name of its
	 *  group; a name without duplicates maps to itself.
	 */
	private function calcDeduplicationAliases( array $input ): array {
		$keysByContent = [];
		foreach ( $input as $k => $v ) {
			$sorted = $this->recursiveKsort( $v );
			$content = FormatJson::encode( $sorted );
			$keysByContent[$content][] = $k;
		}
		$aliases = [];
		foreach ( $keysByContent as $keys ) {
			// Min to give a stable winner for each group. The array form of
			// min() also covers groups with a single member.
			$winner = min( $keys );
			foreach ( $keys as $key ) {
				$aliases[$key] = $winner;
			}
		}
		return $aliases;
	}

	/**
	 * Find duplicate analysis chain elements and report the analyzer aliases
	 * that need to be applied to mapping configuration.
	 *
	 * This is necessary for indices such as wikibase that eagerly create
	 * analysis chains for many languages. Quite a few languages result in the
	 * same elements and this deduplication can remove a large fraction of the
	 * configuration.
	 *
	 * $analysis is received by value: the normalization done here only serves
	 * to compare analyzers and is not visible to the caller.
	 *
	 * @param array $analysis The index.analysis field of elasticsearch index settings
	 * @return string[] map from old analyzer name to new analyzer name.
	 */
	public function deduplicateAnalysisConfig( array $analysis ) {
		// Tolerate configuration without an analyzer section.
		$analysis['analyzer'] ??= [];

		// Deduplicate children first to normalize analyzer configuration.
		foreach ( [ 'tokenizer', 'filter', 'char_filter' ] as $k ) {
			if ( !isset( $analysis[$k] ) ) {
				continue;
			}
			$aliases = $this->calcDeduplicationAliases( $analysis[$k] );
			$analysis[$k] = $this->filter( $analysis[$k], new Set( $aliases ) );

			// Push deduplications into analyzers that reference them
			foreach ( $analysis['analyzer'] as $name => $analyzerConfig ) {
				if ( !isset( $analyzerConfig[$k] ) ) {
					continue;
				}
				if ( is_array( $analyzerConfig[$k] ) ) {
					// filter, char_filter
					foreach ( $analyzerConfig[$k] as $i => $value ) {
						// Names that are not defined in $analysis[$k] have no
						// alias and are kept as they are.
						// TODO: in theory, all values should be set already?
						if ( isset( $aliases[$value] ) ) {
							$analysis['analyzer'][$name][$k][$i] = $aliases[$value];
						}
					}
				} elseif ( isset( $aliases[$analyzerConfig[$k]] ) ) {
					// tokenizer
					$analysis['analyzer'][$name][$k] = $aliases[$analyzerConfig[$k]];
				}
			}
		}

		// Once the analyzer configuration has been normalized by deduplication
		// we can figure out which of the analyzers are duplicates as well.
		return $this->calcDeduplicationAliases( $analysis['analyzer'] );
	}

	/**
	 * Shrink the size of elasticsearch index configuration
	 *
	 * Removes analysis chain elements that are defined but never referenced
	 * from the mappings. Optionally deduplicates analyzers, rewriting the
	 * mappings to reference a single analyzer per group of duplicates.
	 *
	 * @param array $analysis Elasticsearch index analysis configuration
	 * @param array $mappings Elasticsearch index mapping configuration
	 * @param bool $deduplicate When true deduplicate the analysis chain
	 * @param string[] $protected list of named analyzers that should not be removed.
	 * @return array [$analysis, $mappings]
	 */
	public function filterAnalysis( array $analysis, array $mappings, $deduplicate = false, array $protected = [] ) {
		if ( $deduplicate ) {
			$aliases = $this->deduplicateAnalysisConfig( $analysis );
			$mappings = $this->pushAnalyzerAliasesIntoMappings( $mappings, $aliases );
		}
		$usedAnalyzers = $this->findUsedAnalyzersInMappings( $mappings );
		// protected analyzers may be renamed in the mappings, but this retains them in the config as well
		// to ensure they are available for query-time.
		$usedAnalyzers->addAll( $protected );
		$analysis = $this->filterUnusedAnalysisChain( $analysis, $usedAnalyzers );
		return [ $analysis, $mappings ];
	}
}