Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
96.47% covered (success)
96.47%
164 / 170
71.43% covered (warning)
71.43%
5 / 7
CRAP
0.00% covered (danger)
0.00%
0 / 1
MappingConfigBuilder
96.47% covered (success)
96.47%
164 / 170
71.43% covered (warning)
71.43%
5 / 7
19
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
3
 getDefaultFields
94.85% covered (success)
94.85%
92 / 97
0.00% covered (danger)
0.00%
0 / 1
3.00
 buildConfig
100.00% covered (success)
100.00%
41 / 41
100.00% covered (success)
100.00%
1 / 1
6
 setupCopyTo
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
4
 buildSourceTextStringField
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getTextFieldMapping
100.00% covered (success)
100.00%
15 / 15
100.00% covered (success)
100.00%
1 / 1
1
 canOptimizeAnalysisConfig
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace CirrusSearch\Maintenance;
4
5use CirrusSearch\CirrusSearch;
6use CirrusSearch\CirrusSearchHookRunner;
7use CirrusSearch\Search\CirrusIndexField;
8use CirrusSearch\Search\CirrusSearchIndexFieldFactory;
9use CirrusSearch\Search\SourceTextIndexField;
10use CirrusSearch\Search\TextIndexField;
11use CirrusSearch\SearchConfig;
12use MediaWiki\MediaWikiServices;
13use SearchIndexField;
14
15/**
16 * Builds elasticsearch mapping configuration arrays.
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * This program is distributed in the hope that it will be useful,
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 * GNU General Public License for more details.
27 *
28 * You should have received a copy of the GNU General Public License along
29 * with this program; if not, write to the Free Software Foundation, Inc.,
30 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
31 * http://www.gnu.org/copyleft/gpl.html
32 */
class MappingConfigBuilder {
    // Bit field parameters for buildConfig. The combined flags are handed to
    // every index field through setMappingFlags().

    /** When set, the title fields get an extra 'word_prefix' analyzer (see getDefaultFields()). */
    public const PREFIX_START_WITH_ANY = 1;

    /** Not consulted directly by this class; forwarded to the fields with the rest of the flags. */
    public const PHRASE_SUGGEST_USE_TEXT = 2;

    /** Set automatically by the constructor when $optimizeForExperimentalHighlighter is truthy. */
    public const OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER = 4;

    /**
     * Version number for the core analysis. Increment the major
     * version when the analysis changes in an incompatible way,
     * and change the minor version when it changes but isn't
     * incompatible
     */
    public const VERSION = '1.10';

    /**
     * @var bool should the index be optimized for the experimental highlighter?
     */
    private $optimizeForExperimentalHighlighter;

    /**
     * @var SearchConfig the configuration reported by $engine (not necessarily
     *  the object handed to the constructor)
     */
    private $config;

    /**
     * @var CirrusSearch search engine passed to every field's getMapping() and
     *  queried for the list of index fields
     */
    protected $engine;

    /**
     * @var CirrusSearchIndexFieldFactory factory used to create the default fields
     */
    protected $searchIndexFieldFactory;

    /**
     * @var int bit field made of the PREFIX_START_WITH_ANY, PHRASE_SUGGEST_USE_TEXT
     *  and OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER constants
     */
    protected $flags = 0;

    /**
     * @var CirrusSearchHookRunner
     */
    private $cirrusSearchHookRunner;

    /**
     * @param bool $optimizeForExperimentalHighlighter should the index be optimized for the experimental highlighter?
     * @param int $flags bit field of the class constants; OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER
     *  is added to it when $optimizeForExperimentalHighlighter is truthy
     * @param SearchConfig|null $config passed as-is to the CirrusSearch engine; the
     *  config actually used afterwards is the one the engine reports
     * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner when null a runner
     *  is created from the global hook container
     */
    public function __construct(
        $optimizeForExperimentalHighlighter,
        $flags = 0,
        ?SearchConfig $config = null,
        ?CirrusSearchHookRunner $cirrusSearchHookRunner = null
    ) {
        $this->optimizeForExperimentalHighlighter = $optimizeForExperimentalHighlighter;
        if ( $this->optimizeForExperimentalHighlighter ) {
            $flags |= self::OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER;
        }
        $this->flags = $flags;
        $this->engine = new CirrusSearch( $config );
        // Read the config back from the engine so that a null $config resolves
        // to whatever the engine chose.
        $this->config = $this->engine->getConfig();
        $this->searchIndexFieldFactory = new CirrusSearchIndexFieldFactory( $this->config );
        $this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?? new CirrusSearchHookRunner(
            MediaWikiServices::getInstance()->getHookContainer() );
    }

    /**
     * Get definitions for default index fields.
     * These fields are always present in the index.
     *
     * @return array mapping with 'dynamic' => false and a 'properties' map keyed by field name
     */
    private function getDefaultFields(): array {
        // Note never to set something as type='object' here because that isn't returned by elasticsearch
        // and is inferred anyway.
        $titleExtraAnalyzers = [
            [ 'analyzer' => 'prefix', 'search_analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => false ],
            [
                'analyzer' => 'prefix_asciifolding',
                'search_analyzer' => 'near_match_asciifolding',
                'index_options' => 'docs',
                'norms' => false
            ],
            [ 'analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => false ],
            [ 'analyzer' => 'near_match_asciifolding', 'index_options' => 'docs', 'norms' => false ],
            [ 'analyzer' => 'keyword', 'index_options' => 'docs', 'norms' => false ],
        ];
        if ( $this->flags & self::PREFIX_START_WITH_ANY ) {
            $titleExtraAnalyzers[] = [
                'analyzer' => 'word_prefix',
                'search_analyzer' => 'plain_search',
                'index_options' => 'docs'
            ];
        }

        $suggestField = [
            'type' => 'text',
            'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest' ),
            'index_options' => 'freqs',
            'analyzer' => 'suggest',
        ];

        // The reverse sub-field is only built when the configuration asks for it.
        if ( $this->config->getElement( 'CirrusSearchPhraseSuggestReverseField', 'build' ) ) {
            $suggestField['fields'] = [
                'reverse' => [
                    'type' => 'text',
                    'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest', 'reverse' ),
                    'index_options' => 'freqs',
                    'analyzer' => 'suggest_reverse',
                ],
            ];
        }

        return [
            'dynamic' => false,
            'properties' => [
                'timestamp' => [
                    'type' => 'date',
                    'format' => 'dateOptionalTime',
                ],
                'create_timestamp' => [
                    'type' => 'date',
                    'format' => 'dateOptionalTime',
                ],
                'page_id' => [
                    'type' => 'long',
                    'index' => false,
                    'doc_values' => true,
                ],
                'wiki' => $this->searchIndexFieldFactory
                    ->newKeywordField( 'wiki' )
                    ->getMapping( $this->engine ),
                'namespace' => $this->searchIndexFieldFactory
                    ->newLongField( 'namespace' )
                    ->getMapping( $this->engine ),
                'namespace_text' => $this->searchIndexFieldFactory
                    ->newKeywordField( 'namespace_text' )
                    ->getMapping( $this->engine ),
                'title' => $this->searchIndexFieldFactory->newStringField( 'title',
                    TextIndexField::ENABLE_NORMS | TextIndexField::COPY_TO_SUGGEST |
                    TextIndexField::SUPPORT_REGEX,
                    $titleExtraAnalyzers )->setMappingFlags( $this->flags )->getMapping( $this->engine ),
                'text' => $this->getTextFieldMapping(),
                'text_bytes' => $this->searchIndexFieldFactory
                    ->newLongField( 'text_bytes' )
                    ->setFlag( SearchIndexField::FLAG_NO_INDEX )
                    ->getMapping( $this->engine ),
                'source_text' => $this->buildSourceTextStringField( 'source_text' )
                    ->setMappingFlags( $this->flags )->getMapping( $this->engine ),
                'redirect' => [
                    'dynamic' => false,
                    'properties' => [
                        'namespace' => $this->searchIndexFieldFactory
                            ->newLongField( 'namespace' )
                            ->getMapping( $this->engine ),
                        'title' => $this->searchIndexFieldFactory
                            ->newStringField( 'redirect.title', TextIndexField::ENABLE_NORMS
                                | TextIndexField::SPEED_UP_HIGHLIGHTING
                                | TextIndexField::COPY_TO_SUGGEST
                                | TextIndexField::SUPPORT_REGEX,
                                $titleExtraAnalyzers
                            )
                            ->setMappingFlags( $this->flags )
                            ->getMapping( $this->engine ),
                    ]
                ],
                'incoming_links' => $this->searchIndexFieldFactory
                    ->newLongField( 'incoming_links' )
                    ->getMapping( $this->engine ),
                'local_sites_with_dupe' => $this->searchIndexFieldFactory
                    ->newKeywordField( 'local_sites_with_dupe' )
                    ->setFlag( SearchIndexField::FLAG_CASEFOLD )
                    ->getMapping( $this->engine ),
                'suggest' => $suggestField,
            ]
        ];
    }

    /**
     * Build the mapping config.
     *
     * Starts from the default fields, then adds (or overwrites with) the mapping
     * of every field reported by the engine, and finally, when
     * $wgCirrusSearchAllFields['build'] is truthy, adds the 'all' and
     * 'all_near_match' fields together with the copy_to entries feeding them.
     *
     * @return array the mapping config
     */
    public function buildConfig() {
        global $wgCirrusSearchAllFields, $wgCirrusSearchWeights;

        $page = $this->getDefaultFields();

        $fields = $this->engine->getSearchIndexFields();

        foreach ( $fields as $fieldName => $field ) {
            // Only CirrusIndexField instances accept the mapping flags.
            if ( $field instanceof CirrusIndexField ) {
                $field->setMappingFlags( $this->flags );
            }
            $config = $field->getMapping( $this->engine );
            // Fields that produce no mapping are left out of the index.
            if ( $config ) {
                $page['properties'][$fieldName] = $config;
            }
        }

        // Unclear how this would otherwise fit into the process to construct the mapping.
        // Not used directly in cirrus, supports queries from 'add-a-link' (T301096).
        if ( isset( $page['properties']['outgoing_link'] ) ) {
            $page['properties']['outgoing_link']['fields']['token_count'] = [
                'type' => 'token_count',
                'analyzer' => 'keyword',
            ];
        }

        if ( $wgCirrusSearchAllFields[ 'build' ] ) {
            // Now layer all the fields into the all field once per weight.  Querying it isn't strictly the
            // same as querying each field - in some ways it is better!  In others it is worse....

            // Better because theoretically tf/idf based scoring works better this way.
            // Worse because we have to analyze each field multiple times....  Bleh!
            // This field can't be used for the fvh/experimental highlighter for several reasons:
            // 1. It is built with copy_to and not stored.
            // 2. The term frequency information is skewed compared to the "real" source text.
            $allField = $this->searchIndexFieldFactory->
                newStringField( 'all', TextIndexField::ENABLE_NORMS );
            $page['properties']['all'] =
                $allField->setMappingFlags( $this->flags )->getMapping( $this->engine );
            $page = $this->setupCopyTo( $page, $wgCirrusSearchWeights, 'all' );

            // Now repeat for near_match fields.  The same considerations above apply except near_match
            // is never used in phrase queries or highlighting.
            $page[ 'properties' ][ 'all_near_match' ] = [
                'type' => 'text',
                'analyzer' => 'near_match',
                'index_options' => 'freqs',
                'norms' => false,
                'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match' ),
                'fields' => [
                    'asciifolding' => [
                        'type' => 'text',
                        'analyzer' => 'near_match_asciifolding',
                        'index_options' => 'freqs',
                        'norms' => false,
                        'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match', 'asciifolding' ),
                    ],
                ],
            ];
            // Only title and redirect titles feed the near match field.
            $nearMatchFields = [
                'title' => $wgCirrusSearchWeights[ 'title' ],
                'redirect' => $wgCirrusSearchWeights[ 'redirect' ],
            ];
            $page = $this->setupCopyTo( $page, $nearMatchFields, 'all_near_match' );
        }

        return $page;
    }

    /**
     * Setup copy_to for some fields to $destination.
     *
     * $destination is appended to a field's 'copy_to' list once per unit of
     * weight, so a field with weight 3 is copied three times.
     *
     * @param array $config to modify
     * @param array $fields field name to number of times copied
     * @param string $destination destination of the copy
     * @return array $config modified with the copy_to setup
     */
    private function setupCopyTo( $config, $fields, $destination ): array {
        foreach ( $fields as $field => $weight ) {
            // Weights that are not whole numbers are effectively rounded up: the
            // loop keeps going while $r is below $weight, so 1.5 yields two copies.
            // We're ok with that because we don't have a choice.
            for ( $r = 0; $r < $weight; $r++ ) {
                if ( $field === 'redirect' ) {
                    // Redirect is in a funky place: the text lives in the nested title property.
                    $config[ 'properties' ][ 'redirect' ][ 'properties' ][ 'title' ][ 'copy_to' ][] = $destination;
                } else {
                    $config[ 'properties' ][ $field ][ 'copy_to' ][] = $destination;
                }
            }
        }

        return $config;
    }

    /**
     * Build the source_text index field
     *
     * Protected so that subclasses can supply a different field.
     *
     * @param string $fieldName usually "source_text"
     * @return SourceTextIndexField
     */
    protected function buildSourceTextStringField( $fieldName ) {
        return new SourceTextIndexField( $fieldName, SearchIndexField::INDEX_TYPE_TEXT, $this->config );
    }

    /**
     * Build the mapping of the 'text' field: the regular string field mapping
     * with an additional 'word_count' token_count sub-field merged in.
     *
     * @return array
     */
    private function getTextFieldMapping(): array {
        $stringFieldMapping = $this->searchIndexFieldFactory->newStringField(
            'text',
            null,
            []
        )->setMappingFlags( $this->flags )->getMapping( $this->engine );

        // Merged recursively so that any 'fields' already present in the string
        // mapping are kept alongside word_count.
        return array_merge_recursive( $stringFieldMapping, [
            'fields' => [
                'word_count' => [
                    'type' => 'token_count',
                    'analyzer' => 'plain',
                ]
            ]
        ] );
    }

    /**
     * Whether or not it's safe to optimize the analysis config.
     * It's generally safe to optimize if all the analyzers needed are
     * properly referenced in the mapping.
     * In the case an analyzer is used directly in a query but not referenced
     * in the mapping it's not safe to optimize.
     *
     * @return bool always false in this class
     */
    public function canOptimizeAnalysisConfig() {
        return false;
    }
}