Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
96.45% |
163 / 169 |
|
71.43% |
5 / 7 |
CRAP | |
0.00% |
0 / 1 |
MappingConfigBuilder | |
96.45% |
163 / 169 |
|
71.43% |
5 / 7 |
19 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
3 | |||
getDefaultFields | |
94.79% |
91 / 96 |
|
0.00% |
0 / 1 |
3.00 | |||
buildConfig | |
100.00% |
41 / 41 |
|
100.00% |
1 / 1 |
6 | |||
setupCopyTo | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
4 | |||
buildSourceTextStringField | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getTextFieldMapping | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
1 | |||
canOptimizeAnalysisConfig | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Maintenance; |
4 | |
5 | use CirrusSearch\CirrusSearch; |
6 | use CirrusSearch\CirrusSearchHookRunner; |
7 | use CirrusSearch\Search\CirrusIndexField; |
8 | use CirrusSearch\Search\CirrusSearchIndexFieldFactory; |
9 | use CirrusSearch\Search\SourceTextIndexField; |
10 | use CirrusSearch\Search\TextIndexField; |
11 | use CirrusSearch\SearchConfig; |
12 | use MediaWiki\MediaWikiServices; |
13 | use SearchIndexField; |
14 | |
15 | /** |
16 | * Builds elasticsearch mapping configuration arrays. |
17 | * |
18 | * This program is free software; you can redistribute it and/or modify |
19 | * it under the terms of the GNU General Public License as published by |
20 | * the Free Software Foundation; either version 2 of the License, or |
21 | * (at your option) any later version. |
22 | * |
23 | * This program is distributed in the hope that it will be useful, |
24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
26 | * GNU General Public License for more details. |
27 | * |
28 | * You should have received a copy of the GNU General Public License along |
29 | * with this program; if not, write to the Free Software Foundation, Inc., |
30 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
31 | * http://www.gnu.org/copyleft/gpl.html |
32 | */ |
class MappingConfigBuilder {
	// Bit field parameters for buildConfig
	public const PREFIX_START_WITH_ANY = 1;
	public const PHRASE_SUGGEST_USE_TEXT = 2;
	public const OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER = 4;

	/**
	 * Version number for the core analysis. Increment the major
	 * version when the analysis changes in an incompatible way,
	 * and change the minor version when it changes but isn't
	 * incompatible
	 */
	public const VERSION = '1.10';

	/**
	 * @var bool should the index be optimized for the experimental highlighter?
	 */
	private $optimizeForExperimentalHighlighter;

	/**
	 * @var SearchConfig the config reported by the engine (see constructor)
	 */
	private $config;

	/**
	 * @var CirrusSearch engine handed to every field's getMapping() call
	 */
	protected $engine;

	/**
	 * @var CirrusSearchIndexFieldFactory
	 */
	protected $searchIndexFieldFactory;

	/**
	 * @var int bit field built from the class constants above
	 */
	protected $flags = 0;

	/**
	 * @var CirrusSearchHookRunner
	 */
	private $cirrusSearchHookRunner;

	/**
	 * @param bool $optimizeForExperimentalHighlighter should the index be optimized for the experimental highlighter?
	 *  When truthy, OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER is OR-ed into $flags.
	 * @param int $flags bit field of the PREFIX_START_WITH_ANY / PHRASE_SUGGEST_USE_TEXT /
	 *  OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER constants
	 * @param SearchConfig|null $config passed as-is to the CirrusSearch engine constructor
	 * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner when null, a runner is created
	 *  from the hook container of the global MediaWikiServices instance
	 */
	public function __construct(
		$optimizeForExperimentalHighlighter,
		$flags = 0,
		?SearchConfig $config = null,
		?CirrusSearchHookRunner $cirrusSearchHookRunner = null
	) {
		$this->optimizeForExperimentalHighlighter = $optimizeForExperimentalHighlighter;
		if ( $this->optimizeForExperimentalHighlighter ) {
			$flags |= self::OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER;
		}
		$this->flags = $flags;
		$this->engine = new CirrusSearch( $config );
		// Always use the config the engine reports, so the builder and the engine agree
		// even when $config was null.
		$this->config = $this->engine->getConfig();
		$this->searchIndexFieldFactory = new CirrusSearchIndexFieldFactory( $this->config );
		$this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?? new CirrusSearchHookRunner(
			MediaWikiServices::getInstance()->getHookContainer() );
	}

	/**
	 * Get definitions for default index fields.
	 * These fields are always present in the index.
	 * @return array mapping with 'dynamic' => false and a 'properties' map keyed by field name
	 */
	private function getDefaultFields(): array {
		// Note never to set something as type='object' here because that isn't returned by elasticsearch
		// and is inferred anyway.
		// Extra analyzers shared by the 'title' and 'redirect.title' fields.
		$titleExtraAnalyzers = [
			[ 'analyzer' => 'prefix', 'search_analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => false ],
			[
				'analyzer' => 'prefix_asciifolding',
				'search_analyzer' => 'near_match_asciifolding',
				'index_options' => 'docs',
				'norms' => false
			],
			[ 'analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => false ],
			[ 'analyzer' => 'near_match_asciifolding', 'index_options' => 'docs', 'norms' => false ],
			[ 'analyzer' => 'keyword', 'index_options' => 'docs', 'norms' => false ],
		];
		if ( $this->flags & self::PREFIX_START_WITH_ANY ) {
			$titleExtraAnalyzers[] = [
				'analyzer' => 'word_prefix',
				'search_analyzer' => 'plain_search',
				'index_options' => 'docs'
			];
		}

		$suggestField = [
			'type' => 'text',
			'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest' ),
			'index_options' => 'freqs',
			'analyzer' => 'suggest',
		];

		// The 'reverse' sub-field is only added when the config asks for it to be built.
		if ( $this->config->getElement( 'CirrusSearchPhraseSuggestReverseField', 'build' ) ) {
			$suggestField['fields'] = [
				'reverse' => [
					'type' => 'text',
					'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest', 'reverse' ),
					'index_options' => 'freqs',
					'analyzer' => 'suggest_reverse',
				],
			];
		}

		$page = [
			'dynamic' => false,
			'properties' => [
				'timestamp' => [
					'type' => 'date',
					'format' => 'dateOptionalTime',
				],
				'create_timestamp' => [
					'type' => 'date',
					'format' => 'dateOptionalTime',
				],
				'page_id' => [
					'type' => 'long',
					'index' => false,
					'doc_values' => true,
				],
				'wiki' => $this->searchIndexFieldFactory
					->newKeywordField( 'wiki' )
					->getMapping( $this->engine ),
				'namespace' => $this->searchIndexFieldFactory
					->newLongField( 'namespace' )
					->getMapping( $this->engine ),
				'namespace_text' => $this->searchIndexFieldFactory
					->newKeywordField( 'namespace_text' )
					->getMapping( $this->engine ),
				'title' => $this->searchIndexFieldFactory->newStringField( 'title',
					TextIndexField::ENABLE_NORMS | TextIndexField::COPY_TO_SUGGEST |
					TextIndexField::SUPPORT_REGEX,
					$titleExtraAnalyzers )->setMappingFlags( $this->flags )->getMapping( $this->engine ),
				'text' => $this->getTextFieldMapping(),
				'text_bytes' => $this->searchIndexFieldFactory
					->newLongField( 'text_bytes' )
					->getMapping( $this->engine ),
				'source_text' => $this->buildSourceTextStringField( 'source_text' )
					->setMappingFlags( $this->flags )->getMapping( $this->engine ),
				'redirect' => [
					'dynamic' => false,
					'properties' => [
						'namespace' => $this->searchIndexFieldFactory
							->newLongField( 'namespace' )
							->getMapping( $this->engine ),
						'title' => $this->searchIndexFieldFactory
							->newStringField( 'redirect.title', TextIndexField::ENABLE_NORMS
								| TextIndexField::SPEED_UP_HIGHLIGHTING
								| TextIndexField::COPY_TO_SUGGEST
								| TextIndexField::SUPPORT_REGEX,
								$titleExtraAnalyzers
							)
							->setMappingFlags( $this->flags )
							->getMapping( $this->engine ),
					]
				],
				'incoming_links' => $this->searchIndexFieldFactory
					->newLongField( 'incoming_links' )
					->getMapping( $this->engine ),
				'local_sites_with_dupe' => $this->searchIndexFieldFactory
					->newKeywordField( 'local_sites_with_dupe' )
					->setFlag( SearchIndexField::FLAG_CASEFOLD )
					->getMapping( $this->engine ),
				'suggest' => $suggestField,
			]
		];

		return $page;
	}

	/**
	 * Build the mapping config.
	 *
	 * Starts from the default fields, overlays every field reported by the engine's
	 * getSearchIndexFields() that yields a non-empty mapping, and, when
	 * $wgCirrusSearchAllFields['build'] is truthy, adds the 'all' and 'all_near_match'
	 * fields together with the copy_to entries that feed them.
	 *
	 * NOTE(review): the all-field settings and weights are read from globals while the rest
	 * of the class reads $this->config; confirm whether that is intentional.
	 *
	 * @return array the mapping config
	 */
	public function buildConfig() {
		global $wgCirrusSearchAllFields, $wgCirrusSearchWeights;

		$page = $this->getDefaultFields();

		$fields = $this->engine->getSearchIndexFields();

		foreach ( $fields as $fieldName => $field ) {
			if ( $field instanceof CirrusIndexField ) {
				$field->setMappingFlags( $this->flags );
			}
			$config = $field->getMapping( $this->engine );
			// Fields with an empty mapping are skipped; others replace any default of the same name.
			if ( $config ) {
				$page['properties'][$fieldName] = $config;
			}
		}

		// Unclear how this would otherwise fit into the process to construct the mapping.
		// Not used directly in cirrus, supports queries from 'add-a-link' (T301096).
		if ( isset( $page['properties']['outgoing_link'] ) ) {
			$page['properties']['outgoing_link']['fields']['token_count'] = [
				'type' => 'token_count',
				'analyzer' => 'keyword',
			];
		}

		if ( $wgCirrusSearchAllFields[ 'build' ] ) {
			// Now layer all the fields into the all field once per weight. Querying it isn't strictly the
			// same as querying each field - in some ways it is better! In others it is worse....

			// Better because theoretically tf/idf based scoring works better this way.
			// Worse because we have to analyze each field multiple times.... Bleh!
			// This field can't be used for the fvh/experimental highlighter for several reasons:
			// 1. It is built with copy_to and not stored.
			// 2. The term frequency information is all whoppy compared to the "real" source text.
			$allField = $this->searchIndexFieldFactory->
				newStringField( 'all', TextIndexField::ENABLE_NORMS );
			$page['properties']['all'] =
				$allField->setMappingFlags( $this->flags )->getMapping( $this->engine );
			$page = $this->setupCopyTo( $page, $wgCirrusSearchWeights, 'all' );

			// Now repeat for near_match fields. The same considerations above apply except near_match
			// is never used in phrase queries or highlighting.
			$page[ 'properties' ][ 'all_near_match' ] = [
				'type' => 'text',
				'analyzer' => 'near_match',
				'index_options' => 'freqs',
				'norms' => false,
				'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match' ),
				'fields' => [
					'asciifolding' => [
						'type' => 'text',
						'analyzer' => 'near_match_asciifolding',
						'index_options' => 'freqs',
						'norms' => false,
						'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match', 'asciifolding' ),
					],
				],
			];
			// Only title and redirect titles feed the near match field.
			$nearMatchFields = [
				'title' => $wgCirrusSearchWeights[ 'title' ],
				'redirect' => $wgCirrusSearchWeights[ 'redirect' ],
			];
			$page = $this->setupCopyTo( $page, $nearMatchFields, 'all_near_match' );
		}

		return $page;
	}

	/**
	 * Setup copy_to for some fields to $destination.
	 *
	 * $destination is appended to a field's copy_to list once per unit of weight, so a
	 * field with weight 3 lists the destination three times.
	 *
	 * @param array $config to modify
	 * @param array $fields field name to number of times copied
	 * @param string $destination destination of the copy
	 * @return array $config modified with the copy_to setup
	 */
	private function setupCopyTo( $config, $fields, $destination ): array {
		foreach ( $fields as $field => $weight ) {
			// Note that this loop causes weights that are not whole numbers to be rounded up
			// (e.g. 2.5 yields three copies), and weights of zero or less to yield no copy.
			// We're ok with that because we don't have a choice.
			for ( $r = 0; $r < $weight; $r++ ) {
				if ( $field === 'redirect' ) {
					// Redirect is in a funky place: the text lives in the nested redirect.title field
					$config[ 'properties' ][ 'redirect' ][ 'properties' ][ 'title' ][ 'copy_to' ][] = $destination;
				} else {
					$config[ 'properties' ][ $field ][ 'copy_to' ][] = $destination;
				}
			}
		}

		return $config;
	}

	/**
	 * Build the source_text index field
	 *
	 * Protected so that subclasses can supply a different field implementation.
	 *
	 * @param string $fieldName usually "source_text"
	 * @return SourceTextIndexField
	 */
	protected function buildSourceTextStringField( $fieldName ) {
		return new SourceTextIndexField( $fieldName, SearchIndexField::INDEX_TYPE_TEXT, $this->config );
	}

	/**
	 * Build the mapping of the 'text' field: the string field mapping produced by the
	 * field factory, plus a 'word_count' token_count sub-field using the 'plain' analyzer.
	 *
	 * @return array
	 */
	private function getTextFieldMapping(): array {
		$stringFieldMapping = $this->searchIndexFieldFactory->newStringField(
			'text',
			null,
			[]
		)->setMappingFlags( $this->flags )->getMapping( $this->engine );

		$extraFieldMapping = [
			'fields' => [
				'word_count' => [
					'type' => 'token_count',
					'analyzer' => 'plain',
				]
			]
		];

		// Recursive merge so that any 'fields' already present in the string mapping are
		// kept and 'word_count' is added alongside them.
		return array_merge_recursive( $stringFieldMapping, $extraFieldMapping );
	}

	/**
	 * Whether or not it's safe to optimize the analysis config.
	 * It's generally safe to optimize if all the analyzers needed are
	 * properly referenced in the mapping.
	 * In the case an analyzer is used directly in a query but not referenced
	 * in the mapping it's not safe to optimize.
	 *
	 * This implementation always answers true.
	 *
	 * @return bool
	 */
	public function canOptimizeAnalysisConfig() {
		return true;
	}
}