Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
94.79% |
91 / 96 |
|
80.00% |
8 / 10 |
CRAP | |
0.00% |
0 / 1 |
AnalysisFilter | |
94.79% |
91 / 96 |
|
80.00% |
8 / 10 |
50.35 | |
0.00% |
0 / 1 |
findUsedFromField | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
6 | |||
findUsedAnalyzersInMappings | |
75.00% |
6 / 8 |
|
0.00% |
0 / 1 |
3.14 | |||
pushAnalyzerAliasesIntoField | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
8 | |||
pushAnalyzerAliasesIntoMappings | |
66.67% |
6 / 9 |
|
0.00% |
0 / 1 |
3.33 | |||
filter | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
filterUnusedAnalysisChain | |
100.00% |
18 / 18 |
|
100.00% |
1 / 1 |
8 | |||
recursiveKsort | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
calcDeduplicationAliases | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
5 | |||
deduplicateAnalysisConfig | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
9 | |||
filterAnalysis | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Maintenance; |
4 | |
/**
 * Filter unused and duplicate entries from elasticsearch index configuration
 */
class AnalysisFilter {
	/** @var string[] List of keys in mappings that reference analyzers */
	private static $ANALYZER_FIELDS = [ 'analyzer', 'search_analyzer', 'search_quote_analyzer' ];

	/** @var string[] List of keys in mappings that must be recursively searched */
	private static $SUBFIELD_FIELDS = [ 'fields', 'properties' ];

	/**
	 * Recursively finds used analyzers from elasticsearch mappings
	 *
	 * @param array $properties a 'properties' or 'fields' list from the mappings
	 * @return Set The set of referenced analyzers
	 */
	private function findUsedFromField( array $properties ) {
		$analyzers = new Set();
		foreach ( $properties as $config ) {
			foreach ( self::$ANALYZER_FIELDS as $key ) {
				if ( isset( $config[$key] ) ) {
					$analyzers->add( $config[$key] );
				}
			}
			foreach ( self::$SUBFIELD_FIELDS as $key ) {
				// Only descend into actual sub-lists, matching the check done
				// in pushAnalyzerAliasesIntoField(). A non-array value here
				// would otherwise fail the array type hint on recursion.
				if ( isset( $config[$key] ) && is_array( $config[$key] ) ) {
					$analyzers->union( $this->findUsedFromField( $config[$key] ) );
				}
			}
		}
		return $analyzers;
	}

	/**
	 * Collects every analyzer name referenced from the mappings.
	 *
	 * Accepts both the typeless layout (top level 'properties' key) and the
	 * older layout keyed by index type. In the typed layout, entries without
	 * a 'properties' list are skipped.
	 *
	 * @param array[] $mappings Elasticsearch mapping configuration
	 * @return Set The set of analyzer names referenced in $mappings
	 */
	public function findUsedAnalyzersInMappings( array $mappings ) {
		$analyzers = new Set();
		if ( isset( $mappings['properties'] ) ) {
			// modern elastic, no index types
			$analyzers->union(
				$this->findUsedFromField( $mappings['properties'] ) );
		} else {
			// BC for parts still using index types
			foreach ( $mappings as $config ) {
				// A type may carry only settings and no field list at all.
				if ( isset( $config['properties'] ) && is_array( $config['properties'] ) ) {
					$analyzers->union(
						$this->findUsedFromField( $config['properties'] ) );
				}
			}
		}
		return $analyzers;
	}

	/**
	 * Recursively applies analyzer aliases to elasticsearch mappings
	 *
	 * @param array $properties a 'properties' or 'fields' list from the mappings
	 * @param string[] $aliases Map from current analyzer name to replacement name
	 * @return array $properties with analyzer aliases applied
	 */
	private function pushAnalyzerAliasesIntoField( array $properties, array $aliases ) {
		foreach ( $properties as &$config ) {
			foreach ( self::$ANALYZER_FIELDS as $key ) {
				if ( isset( $config[$key] ) && isset( $aliases[$config[$key]] ) ) {
					$config[$key] = $aliases[$config[$key]];
				}
			}
			foreach ( self::$SUBFIELD_FIELDS as $key ) {
				if ( isset( $config[$key] ) && is_array( $config[$key] ) ) {
					$config[$key] = $this->pushAnalyzerAliasesIntoField(
						$config[$key], $aliases
					);
				}
			}
		}
		// Break the reference so the last element of $properties is not
		// left bound to $config.
		unset( $config );
		return $properties;
	}

	/**
	 * Rewrites analyzer references in the mappings according to $aliases.
	 *
	 * Accepts both the typeless layout (top level 'properties' key) and the
	 * older layout keyed by index type. In the typed layout, entries without
	 * a 'properties' list are returned untouched.
	 *
	 * @param array[] $mappings Elasticsearch index mapping configuration
	 * @param string[] $aliases Mapping from old name to new name for analyzers
	 * @return array Updated index mapping configuration
	 */
	public function pushAnalyzerAliasesIntoMappings( array $mappings, $aliases ) {
		if ( isset( $mappings['properties'] ) ) {
			// modern elastic, no index types
			$mappings['properties'] = $this->pushAnalyzerAliasesIntoField(
				$mappings['properties'], $aliases
			);
		} else {
			// BC for parts still using index types
			foreach ( $mappings as $mappingType => $config ) {
				// Skip types without a field list rather than inventing a
				// 'properties' key for them.
				if ( !isset( $config['properties'] ) || !is_array( $config['properties'] ) ) {
					continue;
				}
				$mappings[$mappingType]['properties'] = $this->pushAnalyzerAliasesIntoField(
					$config['properties'], $aliases
				);
			}
		}
		return $mappings;
	}

	/**
	 * Drops every entry of $data whose key is not contained in $keysToKeep.
	 *
	 * @param array $data Map to filter
	 * @param Set $keysToKeep Keys that must survive the filtering
	 * @return array $data restricted to the keys found in $keysToKeep
	 */
	private function filter( $data, Set $keysToKeep ) {
		foreach ( $data as $k => $v ) {
			if ( !$keysToKeep->contains( $k ) ) {
				unset( $data[$k] );
			}
		}
		return $data;
	}

	/**
	 * @param array $analysis The index.analysis field of elasticsearch index settings
	 * @param Set $usedAnalyzers Set of analyzers to keep configurations for
	 * @return array The $analysis array filtered to only pieces needed for $usedAnalyzers
	 */
	public function filterUnusedAnalysisChain( $analysis, Set $usedAnalyzers ) {
		$sets = [
			'analyzer' => $usedAnalyzers,
			'filter' => new Set(),
			'char_filter' => new Set(),
			'tokenizer' => new Set(),
		];
		// An analysis config is not required to declare any analyzers.
		$analyzers = isset( $analysis['analyzer'] ) ? $analysis['analyzer'] : [];
		foreach ( $analyzers as $name => $config ) {
			if ( !$usedAnalyzers->contains( $name ) ) {
				continue;
			}
			// Record the chain elements each retained analyzer depends on.
			foreach ( [ 'filter', 'char_filter' ] as $k ) {
				if ( isset( $config[$k] ) ) {
					$sets[$k]->addAll( $config[$k] );
				}
			}
			if ( isset( $config['tokenizer'] ) ) {
				$sets['tokenizer']->add( $config['tokenizer'] );
			}
		}

		foreach ( $sets as $k => $used ) {
			if ( isset( $analysis[$k] ) ) {
				$analysis[$k] = $this->filter( $analysis[$k], $used );
			}
		}

		return $analysis;
	}

	/**
	 * Sorts an array by key at every nesting level, so that two arrays with
	 * the same content but different key order serialize identically.
	 *
	 * @param array $array
	 * @return array Copy of $array with keys sorted recursively
	 */
	private function recursiveKsort( array $array ) {
		foreach ( $array as $k => $v ) {
			if ( is_array( $v ) ) {
				$array[$k] = $this->recursiveKsort( $v );
			}
		}
		ksort( $array );
		return $array;
	}

	/**
	 * Groups the entries of $input by their key-order-normalized JSON
	 * encoding and maps every key to the winning key of its group.
	 *
	 * @param array[] $input Map from element name to element configuration
	 * @return array Map from each key of $input to the key chosen to
	 *  represent all entries with the same configuration
	 */
	private function calcDeduplicationAliases( array $input ) {
		$keysByContent = [];
		foreach ( $input as $k => $v ) {
			$sorted = $this->recursiveKsort( $v );
			$content = \FormatJson::encode( $sorted );
			$keysByContent[$content][] = $k;
		}
		$aliases = [];
		foreach ( $keysByContent as $keys ) {
			// Min to give a stable winner for each group. Every group holds
			// at least one key, so min() over the list is always defined.
			$winner = min( $keys );
			foreach ( $keys as $key ) {
				$aliases[$key] = $winner;
			}
		}
		return $aliases;
	}

	/**
	 * Find duplicate analysis chain elements and report the analyzer aliases
	 * that need to be applied to mapping configuration.
	 *
	 * This is necessary for indices such as wikibase that eagerly create
	 * analysis chains for many languages. Quite a few languages result in the
	 * same elements and this deduplication can remove a large fraction of the
	 * configuration.
	 *
	 * $analysis is received by value: the deduplication of tokenizers, filters
	 * and char filters performed here only normalizes a local copy so that
	 * equivalent analyzers compare equal. The caller's array is not modified.
	 *
	 * @param array $analysis The index.analysis field of elasticsearch index settings
	 * @return string[] map from old analyzer name to new analyzer name.
	 */
	public function deduplicateAnalysisConfig( array $analysis ) {
		// An analysis config is not required to declare any analyzers.
		if ( !isset( $analysis['analyzer'] ) ) {
			$analysis['analyzer'] = [];
		}

		// Deduplicate children first to normalize analyzer configuration.
		foreach ( [ 'tokenizer', 'filter', 'char_filter' ] as $k ) {
			if ( !isset( $analysis[$k] ) ) {
				continue;
			}
			$aliases = $this->calcDeduplicationAliases( $analysis[$k] );
			$analysis[$k] = $this->filter( $analysis[$k], new Set( $aliases ) );

			// Push deduplications into analyzers that reference them
			foreach ( $analysis['analyzer'] as $name => $analyzerConfig ) {
				if ( !isset( $analyzerConfig[$k] ) ) {
					continue;
				}
				if ( is_array( $analyzerConfig[$k] ) ) {
					// filter, char_filter
					foreach ( $analyzerConfig[$k] as $i => $value ) {
						// TODO: in theory, all values should be set already?
						if ( isset( $aliases[$value] ) ) {
							$analysis['analyzer'][$name][$k][$i] = $aliases[$value];
						}
					}
				} elseif ( isset( $aliases[$analyzerConfig[$k]] ) ) {
					// tokenizer
					$analysis['analyzer'][$name][$k] = $aliases[$analyzerConfig[$k]];
				}
			}
		}

		// Once the analyzer configuration has been normalized by deduplication
		// we can figure out which of the analyzers are duplicates as well.
		return $this->calcDeduplicationAliases( $analysis['analyzer'] );
	}

	/**
	 * Shrink the size of elasticsearch index configuration
	 *
	 * Removes analysis chain elements that are defined but never referenced
	 * from the mappings. Optionally deduplicates elements of the analysis
	 * chain.
	 *
	 * @param array $analysis Elasticsearch index analysis configuration
	 * @param array $mappings Elasticsearch index mapping configuration
	 * @param bool $deduplicate When true deduplicate the analysis chain
	 * @param string[] $protected list of named analyzers that should not be removed.
	 * @return array [$analysis, $mappings]
	 */
	public function filterAnalysis( array $analysis, array $mappings, $deduplicate = false, array $protected = [] ) {
		if ( $deduplicate ) {
			$aliases = $this->deduplicateAnalysisConfig( $analysis );
			$mappings = $this->pushAnalyzerAliasesIntoMappings( $mappings, $aliases );
		}
		$usedAnalyzers = $this->findUsedAnalyzersInMappings( $mappings );
		// protected analyzers may be renamed in the mappings, but this retains them in the config as well
		// to ensure they are available for query-time.
		$usedAnalyzers->addAll( $protected );
		$analysis = $this->filterUnusedAnalysisChain( $analysis, $usedAnalyzers );
		return [ $analysis, $mappings ];
	}
}