Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
96.41% |
161 / 167 |
|
71.43% |
5 / 7 |
CRAP | |
0.00% |
0 / 1 |
MappingConfigBuilder | |
96.41% |
161 / 167 |
|
71.43% |
5 / 7 |
18 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
3 | |||
getDefaultFields | |
94.79% |
91 / 96 |
|
0.00% |
0 / 1 |
3.00 | |||
buildConfig | |
100.00% |
39 / 39 |
|
100.00% |
1 / 1 |
5 | |||
setupCopyTo | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
4 | |||
buildSourceTextStringField | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getTextFieldMapping | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
1 | |||
canOptimizeAnalysisConfig | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Maintenance; |
4 | |
5 | use CirrusSearch\CirrusSearch; |
6 | use CirrusSearch\CirrusSearchHookRunner; |
7 | use CirrusSearch\Search\CirrusIndexField; |
8 | use CirrusSearch\Search\CirrusSearchIndexFieldFactory; |
9 | use CirrusSearch\Search\SourceTextIndexField; |
10 | use CirrusSearch\Search\TextIndexField; |
11 | use CirrusSearch\SearchConfig; |
12 | use MediaWiki\MediaWikiServices; |
13 | use SearchIndexField; |
14 | |
15 | /** |
16 | * Builds elasticsearch mapping configuration arrays. |
17 | * |
18 | * This program is free software; you can redistribute it and/or modify |
19 | * it under the terms of the GNU General Public License as published by |
20 | * the Free Software Foundation; either version 2 of the License, or |
21 | * (at your option) any later version. |
22 | * |
23 | * This program is distributed in the hope that it will be useful, |
24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
26 | * GNU General Public License for more details. |
27 | * |
28 | * You should have received a copy of the GNU General Public License along |
29 | * with this program; if not, write to the Free Software Foundation, Inc., |
30 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
31 | * http://www.gnu.org/copyleft/gpl.html |
32 | */ |
class MappingConfigBuilder {
	// Bit flags accepted by the constructor's $flags argument. They are stored
	// on the builder and forwarded to the index fields via setMappingFlags().
	public const PREFIX_START_WITH_ANY = 1;
	public const PHRASE_SUGGEST_USE_TEXT = 2;
	public const OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER = 4;

	/**
	 * Version number for the core analysis. Increment the major
	 * version when the analysis changes in an incompatible way,
	 * and change the minor version when it changes but isn't
	 * incompatible
	 */
	public const VERSION = '1.10';

	/**
	 * @var bool should the index be optimized for the experimental highlighter?
	 */
	private $optimizeForExperimentalHighlighter;

	/**
	 * @var SearchConfig configuration obtained from the search engine instance
	 */
	private $config;

	/**
	 * @var CirrusSearch engine handed to every field when its mapping is requested
	 */
	protected $engine;

	/**
	 * @var CirrusSearchIndexFieldFactory
	 */
	protected $searchIndexFieldFactory;

	/**
	 * @var int bitwise OR of the flag constants declared on this class
	 */
	protected $flags = 0;

	/**
	 * @var CirrusSearchHookRunner
	 */
	private $cirrusSearchHookRunner;

	/**
	 * @param bool $optimizeForExperimentalHighlighter should the index be optimized for the
	 *  experimental highlighter? When truthy, OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER is added
	 *  to $flags.
	 * @param int $flags bitwise OR of the flag constants declared on this class
	 * @param SearchConfig|null $config forwarded as-is to the CirrusSearch engine; the builder
	 *  then uses whatever config the engine reports
	 * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner hook runner to use; when null
	 *  one is created from the global MediaWikiServices hook container
	 */
	public function __construct(
		$optimizeForExperimentalHighlighter,
		$flags = 0,
		?SearchConfig $config = null,
		?CirrusSearchHookRunner $cirrusSearchHookRunner = null
	) {
		$this->optimizeForExperimentalHighlighter = $optimizeForExperimentalHighlighter;
		if ( $this->optimizeForExperimentalHighlighter ) {
			$flags |= self::OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER;
		}
		$this->flags = $flags;
		$this->engine = new CirrusSearch( $config );
		// Always read the config back from the engine so both share the same instance.
		$this->config = $this->engine->getConfig();
		$this->searchIndexFieldFactory = new CirrusSearchIndexFieldFactory( $this->config );
		$this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?? new CirrusSearchHookRunner(
			MediaWikiServices::getInstance()->getHookContainer() );
	}

	/**
	 * Get definitions for default index fields.
	 * These fields are always present in the index.
	 *
	 * @return array mapping with 'dynamic' => false and a 'properties' map keyed by field name
	 */
	private function getDefaultFields() {
		// Note never to set something as type='object' here because that isn't returned by elasticsearch
		// and is inferred anyway.

		// Extra sub-field analyzers shared by the title and redirect.title fields.
		$titleExtraAnalyzers = [
			[ 'analyzer' => 'prefix', 'search_analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => false ],
			[
				'analyzer' => 'prefix_asciifolding',
				'search_analyzer' => 'near_match_asciifolding',
				'index_options' => 'docs',
				'norms' => false
			],
			[ 'analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => false ],
			[ 'analyzer' => 'near_match_asciifolding', 'index_options' => 'docs', 'norms' => false ],
			[ 'analyzer' => 'keyword', 'index_options' => 'docs', 'norms' => false ],
		];
		if ( $this->flags & self::PREFIX_START_WITH_ANY ) {
			$titleExtraAnalyzers[] = [
				'analyzer' => 'word_prefix',
				'search_analyzer' => 'plain_search',
				'index_options' => 'docs'
			];
		}

		$suggestField = [
			'type' => 'text',
			'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest' ),
			'index_options' => 'freqs',
			'analyzer' => 'suggest',
		];

		// The reverse sub-field is only added when the configuration asks for it to be built.
		if ( $this->config->getElement( 'CirrusSearchPhraseSuggestReverseField', 'build' ) ) {
			$suggestField['fields'] = [
				'reverse' => [
					'type' => 'text',
					'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest', 'reverse' ),
					'index_options' => 'freqs',
					'analyzer' => 'suggest_reverse',
				],
			];
		}

		$factory = $this->searchIndexFieldFactory;

		return [
			'dynamic' => false,
			'properties' => [
				'timestamp' => [
					'type' => 'date',
					'format' => 'dateOptionalTime',
				],
				'create_timestamp' => [
					'type' => 'date',
					'format' => 'dateOptionalTime',
				],
				'page_id' => [
					'type' => 'long',
					'index' => false,
					'doc_values' => true,
				],
				'wiki' => $factory
					->newKeywordField( 'wiki' )
					->getMapping( $this->engine ),
				'namespace' => $factory
					->newLongField( 'namespace' )
					->getMapping( $this->engine ),
				'namespace_text' => $factory
					->newKeywordField( 'namespace_text' )
					->getMapping( $this->engine ),
				'title' => $factory
					->newStringField(
						'title',
						TextIndexField::ENABLE_NORMS
							| TextIndexField::COPY_TO_SUGGEST
							| TextIndexField::SUPPORT_REGEX,
						$titleExtraAnalyzers
					)
					->setMappingFlags( $this->flags )
					->getMapping( $this->engine ),
				'text' => $this->getTextFieldMapping(),
				'text_bytes' => $factory
					->newLongField( 'text_bytes' )
					->getMapping( $this->engine ),
				'source_text' => $this->buildSourceTextStringField( 'source_text' )
					->setMappingFlags( $this->flags )
					->getMapping( $this->engine ),
				'redirect' => [
					'dynamic' => false,
					'properties' => [
						'namespace' => $factory
							->newLongField( 'namespace' )
							->getMapping( $this->engine ),
						'title' => $factory
							->newStringField(
								'redirect.title',
								TextIndexField::ENABLE_NORMS
									| TextIndexField::SPEED_UP_HIGHLIGHTING
									| TextIndexField::COPY_TO_SUGGEST
									| TextIndexField::SUPPORT_REGEX,
								$titleExtraAnalyzers
							)
							->setMappingFlags( $this->flags )
							->getMapping( $this->engine ),
					]
				],
				'incoming_links' => $factory
					->newLongField( 'incoming_links' )
					->getMapping( $this->engine ),
				'local_sites_with_dupe' => $factory
					->newKeywordField( 'local_sites_with_dupe' )
					->setFlag( SearchIndexField::FLAG_CASEFOLD )
					->getMapping( $this->engine ),
				'suggest' => $suggestField,
			]
		];
	}

	/**
	 * Build the mapping config.
	 *
	 * Starts from the default fields, overlays the mappings of every field reported by the
	 * search engine, then adds the 'all' and 'all_near_match' aggregate fields and wires
	 * copy_to entries into them according to $wgCirrusSearchWeights.
	 *
	 * @return array the mapping config
	 */
	public function buildConfig() {
		// NOTE(review): the weights come from the global rather than from $this->config;
		// confirm this is intended when a non-default SearchConfig is injected.
		global $wgCirrusSearchWeights;

		$page = $this->getDefaultFields();

		foreach ( $this->engine->getSearchIndexFields() as $fieldName => $field ) {
			if ( $field instanceof CirrusIndexField ) {
				$field->setMappingFlags( $this->flags );
			}
			$fieldMapping = $field->getMapping( $this->engine );
			// Fields that report no (empty) mapping are left out of the index mapping.
			if ( $fieldMapping ) {
				$page['properties'][$fieldName] = $fieldMapping;
			}
		}

		// Unclear how this would otherwise fit into the process to construct the mapping.
		// Not used directly in cirrus, supports queries from 'add-a-link' (T301096).
		if ( isset( $page['properties']['outgoing_link'] ) ) {
			$page['properties']['outgoing_link']['fields']['token_count'] = [
				'type' => 'token_count',
				'analyzer' => 'keyword',
			];
		}

		// Now layer all the fields into the all field once per weight. Querying it isn't strictly the
		// same as querying each field - in some ways it is better! In others it is worse....

		// Better because theoretically tf/idf based scoring works better this way.
		// Worse because we have to analyze each field multiple times.... Bleh!
		// This field can't be used for the fvh/experimental highlighter for several reasons:
		// 1. It is built with copy_to and not stored.
		// 2. The term frequency information is skewed compared to the "real" source text.
		$allField = $this->searchIndexFieldFactory
			->newStringField( 'all', TextIndexField::ENABLE_NORMS );
		$page['properties']['all'] = $allField
			->setMappingFlags( $this->flags )
			->getMapping( $this->engine );
		$page = $this->setupCopyTo( $page, $wgCirrusSearchWeights, 'all' );

		// Now repeat for near_match fields. The same considerations above apply except near_match
		// is never used in phrase queries or highlighting.
		$page['properties']['all_near_match'] = [
			'type' => 'text',
			'analyzer' => 'near_match',
			'index_options' => 'freqs',
			'norms' => false,
			'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match' ),
			'fields' => [
				'asciifolding' => [
					'type' => 'text',
					'analyzer' => 'near_match_asciifolding',
					'index_options' => 'freqs',
					'norms' => false,
					'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match', 'asciifolding' ),
				],
			],
		];
		// Only titles and redirect titles feed the near match aggregate.
		$nearMatchFields = [
			'title' => $wgCirrusSearchWeights['title'],
			'redirect' => $wgCirrusSearchWeights['redirect'],
		];

		return $this->setupCopyTo( $page, $nearMatchFields, 'all_near_match' );
	}

	/**
	 * Setup copy_to for some fields to $destination.
	 *
	 * $destination is appended to a field's copy_to list once per unit of weight, so a field
	 * with weight 3 lists the destination three times.
	 *
	 * @param array $config mapping to modify (taken by value; the modified copy is returned)
	 * @param array $fields field name to number of times copied
	 * @param string $destination destination of the copy
	 * @return array $config modified with the copy_to setup
	 */
	private function setupCopyTo( $config, $fields, $destination ) {
		foreach ( $fields as $field => $weight ) {
			// Note that the loop condition causes weights that are not whole numbers to be
			// rounded up (e.g. 1.5 yields two copies).
			// We're ok with that because we don't have a choice.
			for ( $r = 0; $r < $weight; $r++ ) {
				if ( $field === 'redirect' ) {
					// Redirect is in a funky place: the text lives in the nested title property.
					$config['properties']['redirect']['properties']['title']['copy_to'][] = $destination;
				} else {
					$config['properties'][$field]['copy_to'][] = $destination;
				}
			}
		}

		return $config;
	}

	/**
	 * Build the source_text index field
	 *
	 * Protected so that subclasses can substitute a different field implementation.
	 *
	 * @param string $fieldName usually "source_text"
	 * @return SourceTextIndexField
	 */
	protected function buildSourceTextStringField( $fieldName ) {
		return new SourceTextIndexField( $fieldName, SearchIndexField::INDEX_TYPE_TEXT, $this->config );
	}

	/**
	 * Build the mapping of the 'text' field: the regular string field mapping with an extra
	 * 'word_count' token_count sub-field merged in.
	 *
	 * @return array
	 */
	private function getTextFieldMapping() {
		$stringFieldMapping = $this->searchIndexFieldFactory
			->newStringField( 'text', null, [] )
			->setMappingFlags( $this->flags )
			->getMapping( $this->engine );

		$extraFieldMapping = [
			'fields' => [
				'word_count' => [
					'type' => 'token_count',
					'analyzer' => 'plain',
				]
			]
		];

		// Recursive merge keeps any sub-fields the string field mapping already defines.
		return array_merge_recursive( $stringFieldMapping, $extraFieldMapping );
	}

	/**
	 * Whether or not it's safe to optimize the analysis config.
	 * It's generally safe to optimize if all the analyzers needed are
	 * properly referenced in the mapping.
	 * In the case an analyzer is used directly in a query but not referenced
	 * in the mapping it's not safe to optimize.
	 *
	 * @return bool always true for this builder
	 */
	public function canOptimizeAnalysisConfig() {
		return true;
	}
}
351 | } |