Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
94.79% |
91 / 96 |
|
80.00% |
8 / 10 |
CRAP | |
0.00% |
0 / 1 |
| AnalysisFilter | |
94.79% |
91 / 96 |
|
80.00% |
8 / 10 |
50.35 | |
0.00% |
0 / 1 |
| findUsedFromField | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
6 | |||
| findUsedAnalyzersInMappings | |
75.00% |
6 / 8 |
|
0.00% |
0 / 1 |
3.14 | |||
| pushAnalyzerAliasesIntoField | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
8 | |||
| pushAnalyzerAliasesIntoMappings | |
66.67% |
6 / 9 |
|
0.00% |
0 / 1 |
3.33 | |||
| filter | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
| filterUnusedAnalysisChain | |
100.00% |
18 / 18 |
|
100.00% |
1 / 1 |
8 | |||
| recursiveKsort | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
| calcDeduplicationAliases | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
5 | |||
| deduplicateAnalysisConfig | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
9 | |||
| filterAnalysis | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
2 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace CirrusSearch\Maintenance; |
| 4 | |
| 5 | use MediaWiki\Json\FormatJson; |
| 6 | |
| 7 | /** |
| 8 | * Filter unused and duplicate entries from search index configuration |
| 9 | */ |
| 10 | class AnalysisFilter { |
| 11 | /** @var string[] List of keys in mappings that reference analyzers */ |
| 12 | private static $ANALYZER_FIELDS = [ 'analyzer', 'search_analyzer', 'search_quote_analyzer' ]; |
| 13 | |
| 14 | /** @var string[] List of keys in mappings that must be recursively searched */ |
| 15 | private static $SUBFIELD_FIELDS = [ 'fields', 'properties' ]; |
| 16 | |
/**
 * Recursively finds used analyzers from search mappings
 *
 * Every field configuration is checked for the keys listed in
 * self::$ANALYZER_FIELDS, and each nested list named in
 * self::$SUBFIELD_FIELDS is searched the same way.
 *
 * @param array $properties a 'properties' or 'fields' list from the mappings
 * @return Set The set of referenced analyzers
 */
private function findUsedFromField( array $properties ) {
	$analyzers = new Set();
	foreach ( $properties as $config ) {
		foreach ( self::$ANALYZER_FIELDS as $key ) {
			if ( isset( $config[$key] ) ) {
				$analyzers->add( $config[$key] );
			}
		}
		foreach ( self::$SUBFIELD_FIELDS as $key ) {
			// Same guard as pushAnalyzerAliasesIntoField(): only recurse into
			// real lists, a non-array value cannot satisfy the array type of
			// the recursive call.
			if ( isset( $config[$key] ) && is_array( $config[$key] ) ) {
				$analyzers->union( $this->findUsedFromField( $config[$key] ) );
			}
		}
	}
	return $analyzers;
}
| 39 | |
/**
 * Collect the analyzer names referenced anywhere in the mappings.
 *
 * Two layouts are accepted: a type-less mapping that has a top level
 * 'properties' list, and the older layout that is a map from index type
 * to a per-type mapping with its own 'properties' list.
 *
 * @param array[] $mappings search mapping configuration
 * @return Set The set of analyzer names referenced in $mappings
 */
public function findUsedAnalyzersInMappings( array $mappings ) {
	$analyzers = new Set();
	if ( isset( $mappings['properties'] ) ) {
		// modern search, no index types
		$analyzers->union(
			$this->findUsedFromField( $mappings['properties'] ) );
		return $analyzers;
	}

	// BC for parts still using index types
	foreach ( $mappings as $config ) {
		// A type without a 'properties' list references no analyzers;
		// skip it rather than handing null to findUsedFromField().
		if ( isset( $config['properties'] ) && is_array( $config['properties'] ) ) {
			$analyzers->union(
				$this->findUsedFromField( $config['properties'] ) );
		}
	}
	return $analyzers;
}
| 59 | |
/**
 * Recursively applies analyzer aliases to search mappings
 *
 * For every field, each key listed in self::$ANALYZER_FIELDS whose value
 * has an entry in $aliases is replaced by the alias target. Nested lists
 * named in self::$SUBFIELD_FIELDS are rewritten the same way.
 *
 * @param array $properties a 'properties' or 'fields' list from the mappings
 * @param string[] $aliases Map from current analyzer name to replacement name
 * @return array $properties with analyzer aliases applied
 */
private function pushAnalyzerAliasesIntoField( array $properties, array $aliases ) {
	// Write back through the key instead of iterating by reference, so no
	// reference binding is left on the last element of the returned array.
	foreach ( $properties as $name => $config ) {
		foreach ( self::$ANALYZER_FIELDS as $key ) {
			if ( isset( $config[$key] ) && isset( $aliases[$config[$key]] ) ) {
				$properties[$name][$key] = $aliases[$config[$key]];
			}
		}
		foreach ( self::$SUBFIELD_FIELDS as $key ) {
			if ( isset( $config[$key] ) && is_array( $config[$key] ) ) {
				$properties[$name][$key] = $this->pushAnalyzerAliasesIntoField(
					$config[$key], $aliases
				);
			}
		}
	}
	return $properties;
}
| 84 | |
/**
 * Rename analyzers referenced by the mappings according to $aliases.
 *
 * Handles both a type-less mapping with a top level 'properties' list and
 * the older layout keyed by index type.
 *
 * @param array[] $mappings search index mapping configuration
 * @param string[] $aliases Mapping from old name to new name for analyzers
 * @return array Updated index mapping configuration
 */
public function pushAnalyzerAliasesIntoMappings( array $mappings, $aliases ) {
	if ( isset( $mappings['properties'] ) ) {
		// modern search, no index types
		$mappings['properties'] = $this->pushAnalyzerAliasesIntoField(
			$mappings['properties'], $aliases
		);
		return $mappings;
	}

	// BC for parts still using index types
	foreach ( $mappings as $mappingType => $config ) {
		if ( !isset( $config['properties'] ) || !is_array( $config['properties'] ) ) {
			// Nothing to rename for this type; leave the entry exactly as
			// it is instead of reading a missing 'properties' list.
			continue;
		}
		$mappings[$mappingType]['properties'] = $this->pushAnalyzerAliasesIntoField(
			$config['properties'], $aliases
		);
	}
	return $mappings;
}
| 106 | |
/**
 * Keep only the entries of $data whose key is contained in $keysToKeep.
 *
 * @param array $data Map to reduce; keys and order of kept entries are preserved
 * @param Set $keysToKeep Keys that are allowed to remain
 * @return array
 */
private function filter( array $data, Set $keysToKeep ): array {
	return array_filter(
		$data,
		static function ( $key ) use ( $keysToKeep ) {
			return $keysToKeep->contains( $key );
		},
		ARRAY_FILTER_USE_KEY
	);
}
| 115 | |
/**
 * Drop analysis chain elements that no used analyzer depends on.
 *
 * The filters, char filters and tokenizers referenced by the analyzers in
 * $usedAnalyzers are collected first. Each of the 'analyzer', 'filter',
 * 'char_filter' and 'tokenizer' sections present in $analysis is then
 * reduced to the referenced names.
 *
 * @param array $analysis The index.analysis field of the search index settings
 * @param Set $usedAnalyzers Set of analyzers to keep configurations for
 * @return array The $analysis array filtered to only pieces needed for $usedAnalyzers
 */
public function filterUnusedAnalysisChain( $analysis, Set $usedAnalyzers ) {
	$sets = [
		'analyzer' => $usedAnalyzers,
		'filter' => new Set(),
		'char_filter' => new Set(),
		'tokenizer' => new Set(),
	];
	// An analysis config without an 'analyzer' section has no references
	// to collect; the remaining sections are still filtered below.
	foreach ( $analysis['analyzer'] ?? [] as $name => $config ) {
		if ( !$usedAnalyzers->contains( $name ) ) {
			continue;
		}
		foreach ( [ 'filter', 'char_filter' ] as $k ) {
			if ( isset( $config[$k] ) ) {
				// The cast is a no-op for lists and lets a single name
				// given as a plain string be recorded as well.
				$sets[$k]->addAll( (array)$config[$k] );
			}
		}
		if ( isset( $config['tokenizer'] ) ) {
			$sets['tokenizer']->add( $config['tokenizer'] );
		}
	}

	foreach ( $sets as $k => $used ) {
		if ( isset( $analysis[$k] ) ) {
			$analysis[$k] = $this->filter( $analysis[$k], $used );
		}
	}

	return $analysis;
}
| 150 | |
/**
 * Sort an array by key at every nesting level.
 *
 * @param array $array Array to sort; the argument itself is not modified
 * @return array Copy of $array with nested arrays sorted first, then the top level
 */
private function recursiveKsort( array $array ): array {
	$sorted = [];
	foreach ( $array as $key => $value ) {
		$sorted[$key] = is_array( $value )
			? $this->recursiveKsort( $value )
			: $value;
	}
	ksort( $sorted );
	return $sorted;
}
| 160 | |
/**
 * Group entries with identical content and pick one name per group.
 *
 * Entries are compared by the JSON encoding of their recursively
 * key-sorted configuration, so key order does not affect equality.
 *
 * @param array $input Map from element name to its configuration array
 * @return array Map from every name in $input to the name chosen for its
 *  group; a name without duplicates maps to itself.
 */
private function calcDeduplicationAliases( array $input ): array {
	$keysByContent = [];
	$aliases = [];
	foreach ( $input as $k => $v ) {
		$content = FormatJson::encode( $this->recursiveKsort( $v ) );
		if ( $content === false ) {
			// Encoding failed, so this entry cannot be compared. Keep it
			// under its own name: using false as an array key would lump
			// all unencodable entries together as one group of duplicates.
			$aliases[$k] = $k;
			continue;
		}
		$keysByContent[$content][] = $k;
	}
	foreach ( $keysByContent as $keys ) {
		// Min to give a stable winner for each group. The array form of
		// min() also covers groups with a single member.
		$winner = min( $keys );
		foreach ( $keys as $key ) {
			$aliases[$key] = $winner;
		}
	}
	return $aliases;
}
| 178 | |
/**
 * Remove duplicate analysis chain elements and report aliases that need
 * to be applied to mapping configuration.
 *
 * This is necessary for indices such as wikibase that eagerly create
 * analysis chains for many languages. Quite a few languages result in the
 * same elements and this deduplication can remove a large fraction of the
 * configuration.
 *
 * $analysis is received by value: the deduplicated copy built here is only
 * used to compare analyzers and is not returned.
 *
 * @param array $analysis The index.analysis field of the search index settings
 * @return string[] map from old analyzer name to new analyzer name. Empty
 *  when $analysis has no 'analyzer' section.
 */
public function deduplicateAnalysisConfig( array $analysis ) {
	if ( !isset( $analysis['analyzer'] ) || !is_array( $analysis['analyzer'] ) ) {
		// Without analyzers there is nothing that could be renamed.
		return [];
	}

	// Deduplicate children first to normalize analyzer configuration.
	foreach ( [ 'tokenizer', 'filter', 'char_filter' ] as $k ) {
		if ( !isset( $analysis[$k] ) ) {
			continue;
		}
		$aliases = $this->calcDeduplicationAliases( $analysis[$k] );
		// The alias values are the surviving names of each duplicate group.
		$analysis[$k] = $this->filter( $analysis[$k], new Set( $aliases ) );

		// Push deduplications into analyzers that reference them
		foreach ( $analysis['analyzer'] as $name => $analyzerConfig ) {
			if ( !isset( $analyzerConfig[$k] ) ) {
				continue;
			}
			if ( is_array( $analyzerConfig[$k] ) ) {
				// filter, char_filter
				foreach ( $analyzerConfig[$k] as $i => $value ) {
					// TODO: in theory, all values should be set already?
					if ( isset( $aliases[$value] ) ) {
						$analysis['analyzer'][$name][$k][$i] = $aliases[$value];
					}
				}
			} elseif ( isset( $aliases[$analyzerConfig[$k]] ) ) {
				// tokenizer
				$analysis['analyzer'][$name][$k] = $aliases[$analyzerConfig[$k]];
			}
		}
	}

	// Once the analyzer configuration has been normalized by deduplication
	// we can figure out which of the analyzers are duplicates as well.
	return $this->calcDeduplicationAliases( $analysis['analyzer'] );
}
| 224 | |
/**
 * Shrink the size of the search index configuration
 *
 * Analysis chain elements that the mappings never reference are removed.
 * When $deduplicate is set, analyzers with identical configuration are
 * first collapsed and the mappings are rewritten to use the surviving
 * names.
 *
 * @param array $analysis search index analysis configuration
 * @param array $mappings search index mapping configuration
 * @param bool $deduplicate When true deduplicate the analysis chain
 * @param string[] $protected list of named analyzers that should not be removed.
 * @return array Two element list: the filtered $analysis followed by the
 *  (possibly rewritten) $mappings
 */
public function filterAnalysis( array $analysis, array $mappings, $deduplicate = false, array $protected = [] ) {
	if ( $deduplicate ) {
		$mappings = $this->pushAnalyzerAliasesIntoMappings(
			$mappings,
			$this->deduplicateAnalysisConfig( $analysis )
		);
	}

	$keep = $this->findUsedAnalyzersInMappings( $mappings );
	// Protected analyzers may have been renamed in the mappings; keeping
	// their original names in the config ensures they stay available at
	// query-time.
	$keep->addAll( $protected );

	return [
		$this->filterUnusedAnalysisChain( $analysis, $keep ),
		$mappings,
	];
}
| 250 | } |