Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
96.59% |
170 / 176 |
|
71.43% |
5 / 7 |
CRAP | |
0.00% |
0 / 1 |
MappingConfigBuilder | |
96.59% |
170 / 176 |
|
71.43% |
5 / 7 |
19 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
3 | |||
getDefaultFields | |
95.24% |
100 / 105 |
|
0.00% |
0 / 1 |
4 | |||
buildConfig | |
100.00% |
39 / 39 |
|
100.00% |
1 / 1 |
5 | |||
setupCopyTo | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
4 | |||
buildSourceTextStringField | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getTextFieldMapping | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
1 | |||
canOptimizeAnalysisConfig | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Maintenance; |
4 | |
5 | use CirrusSearch\CirrusSearch; |
6 | use CirrusSearch\CirrusSearchHookRunner; |
7 | use CirrusSearch\Search\CirrusIndexField; |
8 | use CirrusSearch\Search\CirrusSearchIndexFieldFactory; |
9 | use CirrusSearch\Search\SourceTextIndexField; |
10 | use CirrusSearch\Search\TextIndexField; |
11 | use CirrusSearch\SearchConfig; |
12 | use MediaWiki\MediaWikiServices; |
13 | use SearchIndexField; |
14 | |
15 | /** |
16 | * Builds elasticsearch mapping configuration arrays. |
17 | * |
18 | * This program is free software; you can redistribute it and/or modify |
19 | * it under the terms of the GNU General Public License as published by |
20 | * the Free Software Foundation; either version 2 of the License, or |
21 | * (at your option) any later version. |
22 | * |
23 | * This program is distributed in the hope that it will be useful, |
24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
26 | * GNU General Public License for more details. |
27 | * |
28 | * You should have received a copy of the GNU General Public License along |
29 | * with this program; if not, write to the Free Software Foundation, Inc., |
30 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
31 | * http://www.gnu.org/copyleft/gpl.html |
32 | */ |
33 | class MappingConfigBuilder { |
34 | // Bit field parameters for buildConfig |
35 | public const PREFIX_START_WITH_ANY = 1; |
36 | public const PHRASE_SUGGEST_USE_TEXT = 2; |
37 | public const OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER = 4; |
38 | |
39 | /** |
40 | * Version number for the core analysis. Increment the major |
41 | * version when the analysis changes in an incompatible way, |
42 | * and change the minor version when it changes but isn't |
43 | * incompatible |
44 | */ |
45 | public const VERSION = '1.10'; |
46 | |
47 | /** |
48 | * @var bool should the index be optimized for the experimental highlighter? |
49 | */ |
50 | private $optimizeForExperimentalHighlighter; |
51 | |
52 | /** |
53 | * @var SearchConfig |
54 | */ |
55 | private $config; |
56 | |
57 | /** |
58 | * @var CirrusSearch |
59 | */ |
60 | protected $engine; |
61 | |
62 | /** |
63 | * @var CirrusSearchIndexFieldFactory |
64 | */ |
65 | protected $searchIndexFieldFactory; |
66 | |
67 | /** |
68 | * @var int |
69 | */ |
70 | protected $flags = 0; |
71 | /** |
72 | * @var CirrusSearchHookRunner |
73 | */ |
74 | private $cirrusSearchHookRunner; |
75 | |
/**
 * Wire up the search engine, its configuration and the field factory used
 * to produce the mapping.
 *
 * @param bool $optimizeForExperimentalHighlighter should the index be optimized for the
 *  experimental highlighter? When truthy, self::OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER
 *  is OR-ed into $flags.
 * @param int $flags bit field built from the constants declared on this class
 * @param SearchConfig|null $config handed to the CirrusSearch engine; the configuration
 *  actually used by the builder is read back from that engine
 * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner when null, a runner is
 *  created from the hook container of the global MediaWikiServices instance
 */
public function __construct(
	$optimizeForExperimentalHighlighter,
	$flags = 0,
	?SearchConfig $config = null,
	?CirrusSearchHookRunner $cirrusSearchHookRunner = null
) {
	$this->optimizeForExperimentalHighlighter = $optimizeForExperimentalHighlighter;
	$this->flags = $optimizeForExperimentalHighlighter
		? $flags | self::OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER
		: $flags;

	// The engine resolves the effective config (it may have been given null).
	$engine = new CirrusSearch( $config );
	$this->engine = $engine;
	$this->config = $engine->getConfig();
	$this->searchIndexFieldFactory = new CirrusSearchIndexFieldFactory( $this->config );

	if ( $cirrusSearchHookRunner === null ) {
		$cirrusSearchHookRunner = new CirrusSearchHookRunner(
			MediaWikiServices::getInstance()->getHookContainer()
		);
	}
	$this->cirrusSearchHookRunner = $cirrusSearchHookRunner;
}
99 | |
/**
 * Build the mapping of the default index fields, i.e. the fields that are
 * always present in the index.
 *
 * @return array mapping with 'dynamic' set to false and the default 'properties'
 */
private function getDefaultFields() {
	// Never declare something as type='object' in here: elasticsearch does not
	// return it and infers it anyway.
	$factory = $this->searchIndexFieldFactory;
	$engine = $this->engine;

	// Options shared by the always-present title sub-fields.
	$docsNoNorms = [ 'index_options' => 'docs', 'norms' => false ];
	$titleExtraAnalyzers = [
		[ 'analyzer' => 'prefix', 'search_analyzer' => 'near_match' ] + $docsNoNorms,
		[
			'analyzer' => 'prefix_asciifolding',
			'search_analyzer' => 'near_match_asciifolding',
		] + $docsNoNorms,
		[ 'analyzer' => 'near_match' ] + $docsNoNorms,
		[ 'analyzer' => 'near_match_asciifolding' ] + $docsNoNorms,
		[ 'analyzer' => 'keyword' ] + $docsNoNorms,
	];
	if ( $this->flags & self::PREFIX_START_WITH_ANY ) {
		$titleExtraAnalyzers[] = [
			'analyzer' => 'word_prefix',
			'search_analyzer' => 'plain_search',
			'index_options' => 'docs'
		];
	}
	$naturalSortSetting = 'CirrusSearchNaturalTitleSort';
	if ( $this->config->getElement( $naturalSortSetting, 'build' ) ) {
		$titleExtraAnalyzers[] = [
			'fieldName' => 'natural_sort',
			'type' => 'icu_collation_keyword',
			// doc values only
			'index' => false,
			'numeric' => true,
			'strength' => 'tertiary',
			// Does icu support all the language codes?
			'language' => $this->config->getElement( $naturalSortSetting, 'language' ),
			'country' => $this->config->getElement( $naturalSortSetting, 'country' ),
		];
	}

	$suggestField = [
		'type' => 'text',
		'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest' ),
		'index_options' => 'freqs',
		'analyzer' => 'suggest',
	];
	if ( $this->config->getElement( 'CirrusSearchPhraseSuggestReverseField', 'build' ) ) {
		$suggestField['fields']['reverse'] = [
			'type' => 'text',
			'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest', 'reverse' ),
			'index_options' => 'freqs',
			'analyzer' => 'suggest_reverse',
		];
	}

	// Small helpers for the two mapping shapes that are repeated below.
	$longMapping = static function ( $name ) use ( $factory, $engine ) {
		return $factory->newLongField( $name )->getMapping( $engine );
	};
	$keywordMapping = static function ( $name ) use ( $factory, $engine ) {
		return $factory->newKeywordField( $name )->getMapping( $engine );
	};
	$dateMapping = [
		'type' => 'date',
		'format' => 'dateOptionalTime',
	];
	$titleOptions = TextIndexField::ENABLE_NORMS
		| TextIndexField::COPY_TO_SUGGEST
		| TextIndexField::SUPPORT_REGEX;

	// Insertion order below defines the order of the keys in the mapping.
	$properties = [];
	$properties['timestamp'] = $dateMapping;
	$properties['create_timestamp'] = $dateMapping;
	$properties['page_id'] = [
		'type' => 'long',
		'index' => false,
	];
	$properties['wiki'] = $keywordMapping( 'wiki' );
	$properties['namespace'] = $longMapping( 'namespace' );
	$properties['namespace_text'] = $keywordMapping( 'namespace_text' );
	$properties['title'] = $factory
		->newStringField( 'title', $titleOptions, $titleExtraAnalyzers )
		->setMappingFlags( $this->flags )
		->getMapping( $engine );
	$properties['text'] = $this->getTextFieldMapping();
	$properties['text_bytes'] = $longMapping( 'text_bytes' );
	$properties['source_text'] = $this->buildSourceTextStringField( 'source_text' )
		->setMappingFlags( $this->flags )
		->getMapping( $engine );

	// Redirects are stored as nested namespace/title pairs.
	$redirectProperties = [];
	$redirectProperties['namespace'] = $longMapping( 'namespace' );
	$redirectProperties['title'] = $factory
		->newStringField(
			'redirect.title',
			$titleOptions | TextIndexField::SPEED_UP_HIGHLIGHTING,
			$titleExtraAnalyzers
		)
		->setMappingFlags( $this->flags )
		->getMapping( $engine );
	$properties['redirect'] = [
		'dynamic' => false,
		'properties' => $redirectProperties,
	];

	$properties['incoming_links'] = $longMapping( 'incoming_links' );
	$properties['local_sites_with_dupe'] = $factory
		->newKeywordField( 'local_sites_with_dupe' )
		->setFlag( SearchIndexField::FLAG_CASEFOLD )
		->getMapping( $engine );
	$properties['suggest'] = $suggestField;

	return [
		'dynamic' => false,
		'properties' => $properties,
	];
}
223 | |
/**
 * Build the mapping config.
 *
 * Starts from the default fields, adds the mapping of every search index field
 * reported by the engine, then declares the 'all' and 'all_near_match' fields
 * and the copy_to rules feeding them.
 *
 * Note: the weights are read from the $wgCirrusSearchWeights global, not from
 * the SearchConfig held by this builder.
 *
 * @return array the mapping config
 */
public function buildConfig() {
	global $wgCirrusSearchWeights;

	$mapping = $this->getDefaultFields();

	foreach ( $this->engine->getSearchIndexFields() as $name => $indexField ) {
		if ( $indexField instanceof CirrusIndexField ) {
			$indexField->setMappingFlags( $this->flags );
		}
		$fieldMapping = $indexField->getMapping( $this->engine );
		if ( !$fieldMapping ) {
			// Fields without a mapping are left out.
			continue;
		}
		$mapping['properties'][$name] = $fieldMapping;
	}

	// Unclear how this would otherwise fit into the process to construct the mapping.
	// Not used directly in cirrus, supports queries from 'add-a-link' (T301096).
	if ( isset( $mapping['properties']['outgoing_link'] ) ) {
		$mapping['properties']['outgoing_link']['fields']['token_count'] = [
			'type' => 'token_count',
			'analyzer' => 'keyword',
		];
	}

	// Layer all the fields into the 'all' field, once per weight. Querying it is not
	// strictly the same as querying each field: in some ways better, in others worse.
	//
	// Better because theoretically tf/idf based scoring works better this way.
	// Worse because each field has to be analyzed multiple times.
	// This field can't be used for the fvh/experimental highlighter for several reasons:
	// 1. It is built with copy_to and not stored.
	// 2. The term frequency information is skewed compared to the "real" source text.
	$mapping['properties']['all'] = $this->searchIndexFieldFactory
		->newStringField( 'all', TextIndexField::ENABLE_NORMS )
		->setMappingFlags( $this->flags )
		->getMapping( $this->engine );
	$mapping = $this->setupCopyTo( $mapping, $wgCirrusSearchWeights, 'all' );

	// Same again for the near_match fields. The considerations above apply, except
	// that near_match is never used in phrase queries or highlighting.
	$nearMatchSimilarity = TextIndexField::getSimilarity( $this->config, 'all_near_match' );
	$foldedSimilarity = TextIndexField::getSimilarity(
		$this->config, 'all_near_match', 'asciifolding'
	);
	$mapping['properties']['all_near_match'] = [
		'type' => 'text',
		'analyzer' => 'near_match',
		'index_options' => 'freqs',
		'norms' => false,
		'similarity' => $nearMatchSimilarity,
		'fields' => [
			'asciifolding' => [
				'type' => 'text',
				'analyzer' => 'near_match_asciifolding',
				'index_options' => 'freqs',
				'norms' => false,
				'similarity' => $foldedSimilarity,
			],
		],
	];

	$nearMatchWeights = [
		'title' => $wgCirrusSearchWeights['title'],
		'redirect' => $wgCirrusSearchWeights['redirect'],
	];

	return $this->setupCopyTo( $mapping, $nearMatchWeights, 'all_near_match' );
}
292 | |
/**
 * Add copy_to entries pointing at $destination to a set of fields.
 *
 * Each field receives one copy_to entry per unit of weight, so a field with
 * weight 3 lists $destination three times.
 *
 * @param array $config mapping to modify
 * @param array $fields field name => number of times the field is copied
 * @param string $destination destination of the copy
 * @return array $config with the copy_to entries appended
 */
private function setupCopyTo( $config, $fields, $destination ) {
	foreach ( $fields as $name => $copies ) {
		// The redirect titles live one level deeper than the other fields.
		$isRedirect = $name === 'redirect';

		// Weights that are not whole numbers end up rounded up by this loop.
		// That is accepted: a field cannot be copied a fractional number of times.
		for ( $i = 0; $i < $copies; ++$i ) {
			if ( !$isRedirect ) {
				$config['properties'][$name]['copy_to'][] = $destination;
				continue;
			}
			$config['properties']['redirect']['properties']['title']['copy_to'][] = $destination;
		}
	}

	return $config;
}
316 | |
/**
 * Create the index field object describing the source_text field.
 *
 * Protected so that subclasses can provide a different field.
 *
 * @param string $fieldName usually "source_text"
 * @return SourceTextIndexField text-typed field bound to this builder's config
 */
protected function buildSourceTextStringField( $fieldName ) {
	$type = SearchIndexField::INDEX_TYPE_TEXT;

	return new SourceTextIndexField( $fieldName, $type, $this->config );
}
326 | |
/**
 * Build the mapping of the 'text' field: the regular string field mapping
 * plus a 'word_count' sub-field of type token_count using the 'plain' analyzer.
 *
 * @return array
 */
private function getTextFieldMapping() {
	$baseMapping = $this->searchIndexFieldFactory
		->newStringField( 'text', null, [] )
		->setMappingFlags( $this->flags )
		->getMapping( $this->engine );

	// Merged recursively so that sub-fields already declared under 'fields'
	// in the base mapping are kept next to word_count.
	return array_merge_recursive( $baseMapping, [
		'fields' => [
			'word_count' => [
				'type' => 'token_count',
				'analyzer' => 'plain',
			],
		],
	] );
}
350 | |
/**
 * Tell whether the analysis config may safely be optimized.
 *
 * Optimizing is generally safe as long as every analyzer that is needed is
 * referenced from the mapping. It is not safe when an analyzer is used
 * directly in a query without being referenced in the mapping.
 *
 * This implementation always answers true.
 *
 * @return bool
 */
public function canOptimizeAnalysisConfig() {
	return true;
}
363 | } |