Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
96.59% covered (success)
96.59%
170 / 176
71.43% covered (warning)
71.43%
5 / 7
CRAP
0.00% covered (danger)
0.00%
0 / 1
MappingConfigBuilder
96.59% covered (success)
96.59%
170 / 176
71.43% covered (warning)
71.43%
5 / 7
19
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
3
 getDefaultFields
95.24% covered (success)
95.24%
100 / 105
0.00% covered (danger)
0.00%
0 / 1
4
 buildConfig
100.00% covered (success)
100.00%
39 / 39
100.00% covered (success)
100.00%
1 / 1
5
 setupCopyTo
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
4
 buildSourceTextStringField
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getTextFieldMapping
100.00% covered (success)
100.00%
15 / 15
100.00% covered (success)
100.00%
1 / 1
1
 canOptimizeAnalysisConfig
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace CirrusSearch\Maintenance;
4
5use CirrusSearch\CirrusSearch;
6use CirrusSearch\CirrusSearchHookRunner;
7use CirrusSearch\Search\CirrusIndexField;
8use CirrusSearch\Search\CirrusSearchIndexFieldFactory;
9use CirrusSearch\Search\SourceTextIndexField;
10use CirrusSearch\Search\TextIndexField;
11use CirrusSearch\SearchConfig;
12use MediaWiki\MediaWikiServices;
13use SearchIndexField;
14
15/**
16 * Builds elasticsearch mapping configuration arrays.
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * This program is distributed in the hope that it will be useful,
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 * GNU General Public License for more details.
27 *
28 * You should have received a copy of the GNU General Public License along
29 * with this program; if not, write to the Free Software Foundation, Inc.,
30 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
31 * http://www.gnu.org/copyleft/gpl.html
32 */
33class MappingConfigBuilder {
34    // Bit field parameters for buildConfig
35    public const PREFIX_START_WITH_ANY = 1;
36    public const PHRASE_SUGGEST_USE_TEXT = 2;
37    public const OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER = 4;
38
39    /**
40     * Version number for the core analysis. Increment the major
41     * version when the analysis changes in an incompatible way,
42     * and change the minor version when it changes but isn't
43     * incompatible
44     */
45    public const VERSION = '1.10';
46
47    /**
48     * @var bool should the index be optimized for the experimental highlighter?
49     */
50    private $optimizeForExperimentalHighlighter;
51
52    /**
53     * @var SearchConfig
54     */
55    private $config;
56
57    /**
58     * @var CirrusSearch
59     */
60    protected $engine;
61
62    /**
63     * @var CirrusSearchIndexFieldFactory
64     */
65    protected $searchIndexFieldFactory;
66
67    /**
68     * @var int
69     */
70    protected $flags = 0;
71    /**
72     * @var CirrusSearchHookRunner
73     */
74    private $cirrusSearchHookRunner;
75
76    /**
77     * @param bool $optimizeForExperimentalHighlighter should the index be optimized for the experimental highlighter?
78     * @param int $flags
79     * @param SearchConfig|null $config
80     * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner
81     */
82    public function __construct(
83        $optimizeForExperimentalHighlighter,
84        $flags = 0,
85        ?SearchConfig $config = null,
86        ?CirrusSearchHookRunner $cirrusSearchHookRunner = null
87    ) {
88        $this->optimizeForExperimentalHighlighter = $optimizeForExperimentalHighlighter;
89        if ( $this->optimizeForExperimentalHighlighter ) {
90            $flags |= self::OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER;
91        }
92        $this->flags = $flags;
93        $this->engine = new CirrusSearch( $config );
94        $this->config = $this->engine->getConfig();
95        $this->searchIndexFieldFactory = new CirrusSearchIndexFieldFactory( $this->config );
96        $this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?: new CirrusSearchHookRunner(
97            MediaWikiServices::getInstance()->getHookContainer() );
98    }
99
100    /**
101     * Get definitions for default index fields.
102     * These fields are always present in the index.
103     * @return array
104     */
105    private function getDefaultFields() {
106        // Note never to set something as type='object' here because that isn't returned by elasticsearch
107        // and is inferred anyway.
108        $titleExtraAnalyzers = [
109            [ 'analyzer' => 'prefix', 'search_analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => false ],
110            [
111                'analyzer' => 'prefix_asciifolding',
112                'search_analyzer' => 'near_match_asciifolding',
113                'index_options' => 'docs',
114                'norms' => false
115            ],
116            [ 'analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => false ],
117            [ 'analyzer' => 'near_match_asciifolding', 'index_options' => 'docs', 'norms' => false ],
118            [ 'analyzer' => 'keyword', 'index_options' => 'docs', 'norms' => false ],
119        ];
120        if ( $this->flags & self::PREFIX_START_WITH_ANY ) {
121            $titleExtraAnalyzers[] = [
122                'analyzer' => 'word_prefix',
123                'search_analyzer' => 'plain_search',
124                'index_options' => 'docs'
125            ];
126        }
127        if ( $this->config->getElement( 'CirrusSearchNaturalTitleSort', 'build' ) ) {
128            $titleExtraAnalyzers[] = [
129                'fieldName' => 'natural_sort',
130                'type' => 'icu_collation_keyword',
131                // doc values only
132                'index' => false,
133                'numeric' => true,
134                'strength' => 'tertiary',
135                // Does icu support all the language codes?
136                'language' => $this->config->getElement( 'CirrusSearchNaturalTitleSort', 'language' ),
137                'country' => $this->config->getElement( 'CirrusSearchNaturalTitleSort', 'country' ),
138            ];
139        }
140
141        $suggestField = [
142            'type' => 'text',
143            'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest' ),
144            'index_options' => 'freqs',
145            'analyzer' => 'suggest',
146        ];
147
148        if ( $this->config->getElement( 'CirrusSearchPhraseSuggestReverseField', 'build' ) ) {
149            $suggestField['fields'] = [
150                'reverse' => [
151                    'type' => 'text',
152                    'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest', 'reverse' ),
153                    'index_options' => 'freqs',
154                    'analyzer' => 'suggest_reverse',
155                ],
156            ];
157        }
158
159        $page = [
160            'dynamic' => false,
161            'properties' => [
162                'timestamp' => [
163                    'type' => 'date',
164                    'format' => 'dateOptionalTime',
165                ],
166                'create_timestamp' => [
167                    'type' => 'date',
168                    'format' => 'dateOptionalTime',
169                ],
170                'page_id' => [
171                    'type' => 'long',
172                    'index' => false,
173                ],
174                'wiki' => $this->searchIndexFieldFactory
175                    ->newKeywordField( 'wiki' )
176                    ->getMapping( $this->engine ),
177                'namespace' => $this->searchIndexFieldFactory
178                    ->newLongField( 'namespace' )
179                    ->getMapping( $this->engine ),
180                'namespace_text' => $this->searchIndexFieldFactory
181                    ->newKeywordField( 'namespace_text' )
182                    ->getMapping( $this->engine ),
183                'title' => $this->searchIndexFieldFactory->newStringField( 'title',
184                    TextIndexField::ENABLE_NORMS | TextIndexField::COPY_TO_SUGGEST |
185                    TextIndexField::SUPPORT_REGEX,
186                    $titleExtraAnalyzers )->setMappingFlags( $this->flags )->getMapping( $this->engine ),
187                'text' => $this->getTextFieldMapping(),
188                'text_bytes' => $this->searchIndexFieldFactory
189                    ->newLongField( 'text_bytes' )
190                    ->getMapping( $this->engine ),
191                'source_text' => $this->buildSourceTextStringField( 'source_text' )
192                    ->setMappingFlags( $this->flags )->getMapping( $this->engine ),
193                'redirect' => [
194                    'dynamic' => false,
195                    'properties' => [
196                        'namespace' => $this->searchIndexFieldFactory
197                            ->newLongField( 'namespace' )
198                            ->getMapping( $this->engine ),
199                        'title' => $this->searchIndexFieldFactory
200                            ->newStringField( 'redirect.title', TextIndexField::ENABLE_NORMS
201                                | TextIndexField::SPEED_UP_HIGHLIGHTING
202                                | TextIndexField::COPY_TO_SUGGEST
203                                | TextIndexField::SUPPORT_REGEX,
204                                $titleExtraAnalyzers
205                            )
206                            ->setMappingFlags( $this->flags )
207                            ->getMapping( $this->engine ),
208                    ]
209                ],
210                'incoming_links' => $this->searchIndexFieldFactory
211                    ->newLongField( 'incoming_links' )
212                    ->getMapping( $this->engine ),
213                'local_sites_with_dupe' => $this->searchIndexFieldFactory
214                    ->newKeywordField( 'local_sites_with_dupe' )
215                    ->setFlag( SearchIndexField::FLAG_CASEFOLD )
216                    ->getMapping( $this->engine ),
217                'suggest' => $suggestField,
218            ]
219        ];
220
221        return $page;
222    }
223
224    /**
225     * Build the mapping config.
226     * @return array the mapping config
227     */
228    public function buildConfig() {
229        global $wgCirrusSearchWeights;
230
231        $page = $this->getDefaultFields();
232
233        $fields = $this->engine->getSearchIndexFields();
234
235        foreach ( $fields as $fieldName => $field ) {
236            if ( $field instanceof CirrusIndexField ) {
237                $field->setMappingFlags( $this->flags );
238            }
239            $config = $field->getMapping( $this->engine );
240            if ( $config ) {
241                $page['properties'][$fieldName] = $config;
242            }
243        }
244
245        // Unclear how this would otherwise fit into the process to construct the mapping.
246        // Not used directly in cirrus, supports queries from 'add-a-link' (T301096).
247        if ( isset( $page['properties']['outgoing_link'] ) ) {
248            $page['properties']['outgoing_link']['fields']['token_count'] = [
249                'type' => 'token_count',
250                'analyzer' => 'keyword',
251            ];
252        }
253
254        // Now layer all the fields into the all field once per weight.  Querying it isn't strictly the
255        // same as querying each field - in some ways it is better!  In others it is worse....
256
257        // Better because theoretically tf/idf based scoring works better this way.
258        // Worse because we have to analyze each field multiple times....  Bleh!
259        // This field can't be used for the fvh/experimental highlighter for several reasons:
260        // 1. It is built with copy_to and not stored.
261        // 2. The term frequency information is all whoppy compared to the "real" source text.
262        $allField = $this->searchIndexFieldFactory->
263            newStringField( 'all', TextIndexField::ENABLE_NORMS );
264        $page['properties']['all'] =
265            $allField->setMappingFlags( $this->flags )->getMapping( $this->engine );
266        $page = $this->setupCopyTo( $page, $wgCirrusSearchWeights, 'all' );
267
268        // Now repeat for near_match fields.  The same considerations above apply except near_match
269        // is never used in phrase queries or highlighting.
270        $page[ 'properties' ][ 'all_near_match' ] = [
271            'type' => 'text',
272            'analyzer' => 'near_match',
273            'index_options' => 'freqs',
274            'norms' => false,
275            'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match' ),
276            'fields' => [
277                'asciifolding' => [
278                    'type' => 'text',
279                    'analyzer' => 'near_match_asciifolding',
280                    'index_options' => 'freqs',
281                    'norms' => false,
282                    'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match', 'asciifolding' ),
283                ],
284            ],
285        ];
286        $nearMatchFields = [
287            'title' => $wgCirrusSearchWeights[ 'title' ],
288            'redirect' => $wgCirrusSearchWeights[ 'redirect' ],
289        ];
290        return $this->setupCopyTo( $page, $nearMatchFields, 'all_near_match' );
291    }
292
293    /**
294     * Setup copy_to for some fields to $destination.
295     * @param array $config to modify
296     * @param array $fields field name to number of times copied
297     * @param string $destination destination of the copy
298     * @return array $config modified with the copy_to setup
299     */
300    private function setupCopyTo( $config, $fields, $destination ) {
301        foreach ( $fields as $field => $weight ) {
302            // Note that weights this causes weights that are not whole numbers to be rounded up.
303            // We're ok with that because we don't have a choice.
304            for ( $r = 0; $r < $weight; $r++ ) {
305                if ( $field === 'redirect' ) {
306                    // Redirect is in a funky place
307                    $config[ 'properties' ][ 'redirect' ][ 'properties' ][ 'title' ][ 'copy_to' ][] = $destination;
308                } else {
309                    $config[ 'properties' ][ $field ][ 'copy_to' ][] = $destination;
310                }
311            }
312        }
313
314        return $config;
315    }
316
317    /**
318     * Build the source_text index field
319     *
320     * @param string $fieldName usually "source_text"
321     * @return SourceTextIndexField
322     */
323    protected function buildSourceTextStringField( $fieldName ) {
324        return new SourceTextIndexField( $fieldName, SearchIndexField::INDEX_TYPE_TEXT, $this->config );
325    }
326
327    /**
328     * @return array
329     */
330    private function getTextFieldMapping() {
331        $stringFieldMapping = $this->searchIndexFieldFactory->newStringField(
332            'text',
333            null,
334            []
335        )->setMappingFlags( $this->flags )->getMapping( $this->engine );
336
337        $extraFieldMapping = [
338            'fields' => [
339                'word_count' => [
340                    'type' => 'token_count',
341                    'analyzer' => 'plain',
342                ]
343            ]
344        ];
345
346        $textFieldMapping = array_merge_recursive( $stringFieldMapping, $extraFieldMapping );
347
348        return $textFieldMapping;
349    }
350
351    /**
352     * Whether or not it's safe to optimize the analysis config.
353     * It's generally safe to optimize if all the analyzers needed are
354     * properly referenced in the mapping.
355     * In the case an analyzer is used directly in a query but not referenced
356     * in the mapping it's not safe to optimize.
357     *
358     * @return bool
359     */
360    public function canOptimizeAnalysisConfig() {
361        return true;
362    }
363}