Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
96.41% covered (success)
96.41%
161 / 167
71.43% covered (warning)
71.43%
5 / 7
CRAP
0.00% covered (danger)
0.00%
0 / 1
MappingConfigBuilder
96.41% covered (success)
96.41%
161 / 167
71.43% covered (warning)
71.43%
5 / 7
18
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
3
 getDefaultFields
94.79% covered (success)
94.79%
91 / 96
0.00% covered (danger)
0.00%
0 / 1
3.00
 buildConfig
100.00% covered (success)
100.00%
39 / 39
100.00% covered (success)
100.00%
1 / 1
5
 setupCopyTo
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
4
 buildSourceTextStringField
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getTextFieldMapping
100.00% covered (success)
100.00%
15 / 15
100.00% covered (success)
100.00%
1 / 1
1
 canOptimizeAnalysisConfig
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace CirrusSearch\Maintenance;
4
5use CirrusSearch\CirrusSearch;
6use CirrusSearch\CirrusSearchHookRunner;
7use CirrusSearch\Search\CirrusIndexField;
8use CirrusSearch\Search\CirrusSearchIndexFieldFactory;
9use CirrusSearch\Search\SourceTextIndexField;
10use CirrusSearch\Search\TextIndexField;
11use CirrusSearch\SearchConfig;
12use MediaWiki\MediaWikiServices;
13use SearchIndexField;
14
15/**
16 * Builds elasticsearch mapping configuration arrays.
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * This program is distributed in the hope that it will be useful,
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 * GNU General Public License for more details.
27 *
28 * You should have received a copy of the GNU General Public License along
29 * with this program; if not, write to the Free Software Foundation, Inc.,
30 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
31 * http://www.gnu.org/copyleft/gpl.html
32 */
class MappingConfigBuilder {
    // Bit flags accepted by the constructor's $flags parameter. The combined
    // value is forwarded to the index fields through setMappingFlags().
    public const PREFIX_START_WITH_ANY = 1;
    public const PHRASE_SUGGEST_USE_TEXT = 2;
    public const OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER = 4;

    /**
     * Version number for the core analysis. Increment the major
     * version when the analysis changes in an incompatible way,
     * and change the minor version when it changes but isn't
     * incompatible
     */
    public const VERSION = '1.10';

    /**
     * @var bool should the index be optimized for the experimental highlighter?
     */
    private $optimizeForExperimentalHighlighter;

    /**
     * @var SearchConfig
     */
    private $config;

    /**
     * @var CirrusSearch
     */
    protected $engine;

    /**
     * @var CirrusSearchIndexFieldFactory
     */
    protected $searchIndexFieldFactory;

    /**
     * @var int bit field built from the class constants above
     */
    protected $flags = 0;

    /**
     * @var CirrusSearchHookRunner
     */
    private $cirrusSearchHookRunner;

    /**
     * @param bool $optimizeForExperimentalHighlighter should the index be optimized for the experimental highlighter?
     *  When truthy, self::OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER is OR'ed into $flags.
     * @param int $flags bit field of the PREFIX_START_WITH_ANY / PHRASE_SUGGEST_USE_TEXT /
     *  OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER constants
     * @param SearchConfig|null $config passed as-is to the CirrusSearch engine; the builder then
     *  uses whatever config the engine reports through getConfig()
     * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner when null, a runner is created from
     *  the global MediaWikiServices hook container
     */
    public function __construct(
        $optimizeForExperimentalHighlighter,
        $flags = 0,
        ?SearchConfig $config = null,
        ?CirrusSearchHookRunner $cirrusSearchHookRunner = null
    ) {
        $this->optimizeForExperimentalHighlighter = $optimizeForExperimentalHighlighter;
        if ( $this->optimizeForExperimentalHighlighter ) {
            $flags |= self::OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER;
        }
        $this->flags = $flags;
        $this->engine = new CirrusSearch( $config );
        // Read the config back from the engine so both always agree, even when $config was null.
        $this->config = $this->engine->getConfig();
        $this->searchIndexFieldFactory = new CirrusSearchIndexFieldFactory( $this->config );
        $this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?? new CirrusSearchHookRunner(
            MediaWikiServices::getInstance()->getHookContainer() );
    }

    /**
     * Get definitions for default index fields.
     * These fields are always present in the index.
     *
     * @return array mapping with 'dynamic' => false and a 'properties' map keyed by field name
     */
    private function getDefaultFields() {
        // Note never to set something as type='object' here because that isn't returned by elasticsearch
        // and is inferred anyway.

        // Extra sub-field analyzers shared by 'title' and 'redirect.title'.
        $titleExtraAnalyzers = [
            [ 'analyzer' => 'prefix', 'search_analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => false ],
            [
                'analyzer' => 'prefix_asciifolding',
                'search_analyzer' => 'near_match_asciifolding',
                'index_options' => 'docs',
                'norms' => false
            ],
            [ 'analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => false ],
            [ 'analyzer' => 'near_match_asciifolding', 'index_options' => 'docs', 'norms' => false ],
            [ 'analyzer' => 'keyword', 'index_options' => 'docs', 'norms' => false ],
        ];
        if ( $this->flags & self::PREFIX_START_WITH_ANY ) {
            $titleExtraAnalyzers[] = [
                'analyzer' => 'word_prefix',
                'search_analyzer' => 'plain_search',
                'index_options' => 'docs'
            ];
        }

        $suggestField = [
            'type' => 'text',
            'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest' ),
            'index_options' => 'freqs',
            'analyzer' => 'suggest',
        ];

        // The reverse sub-field is only added when the configuration asks for it to be built.
        if ( $this->config->getElement( 'CirrusSearchPhraseSuggestReverseField', 'build' ) ) {
            $suggestField['fields'] = [
                'reverse' => [
                    'type' => 'text',
                    'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest', 'reverse' ),
                    'index_options' => 'freqs',
                    'analyzer' => 'suggest_reverse',
                ],
            ];
        }

        $page = [
            'dynamic' => false,
            'properties' => [
                'timestamp' => [
                    'type' => 'date',
                    'format' => 'dateOptionalTime',
                ],
                'create_timestamp' => [
                    'type' => 'date',
                    'format' => 'dateOptionalTime',
                ],
                'page_id' => [
                    'type' => 'long',
                    'index' => false,
                    'doc_values' => true,
                ],
                'wiki' => $this->searchIndexFieldFactory
                    ->newKeywordField( 'wiki' )
                    ->getMapping( $this->engine ),
                'namespace' => $this->searchIndexFieldFactory
                    ->newLongField( 'namespace' )
                    ->getMapping( $this->engine ),
                'namespace_text' => $this->searchIndexFieldFactory
                    ->newKeywordField( 'namespace_text' )
                    ->getMapping( $this->engine ),
                'title' => $this->searchIndexFieldFactory->newStringField( 'title',
                    TextIndexField::ENABLE_NORMS | TextIndexField::COPY_TO_SUGGEST |
                    TextIndexField::SUPPORT_REGEX,
                    $titleExtraAnalyzers )->setMappingFlags( $this->flags )->getMapping( $this->engine ),
                'text' => $this->getTextFieldMapping(),
                'text_bytes' => $this->searchIndexFieldFactory
                    ->newLongField( 'text_bytes' )
                    ->getMapping( $this->engine ),
                'source_text' => $this->buildSourceTextStringField( 'source_text' )
                    ->setMappingFlags( $this->flags )->getMapping( $this->engine ),
                'redirect' => [
                    'dynamic' => false,
                    'properties' => [
                        'namespace' => $this->searchIndexFieldFactory
                            ->newLongField( 'namespace' )
                            ->getMapping( $this->engine ),
                        'title' => $this->searchIndexFieldFactory
                            ->newStringField( 'redirect.title', TextIndexField::ENABLE_NORMS
                                | TextIndexField::SPEED_UP_HIGHLIGHTING
                                | TextIndexField::COPY_TO_SUGGEST
                                | TextIndexField::SUPPORT_REGEX,
                                $titleExtraAnalyzers
                            )
                            ->setMappingFlags( $this->flags )
                            ->getMapping( $this->engine ),
                    ]
                ],
                'incoming_links' => $this->searchIndexFieldFactory
                    ->newLongField( 'incoming_links' )
                    ->getMapping( $this->engine ),
                'local_sites_with_dupe' => $this->searchIndexFieldFactory
                    ->newKeywordField( 'local_sites_with_dupe' )
                    ->setFlag( SearchIndexField::FLAG_CASEFOLD )
                    ->getMapping( $this->engine ),
                'suggest' => $suggestField,
            ]
        ];

        return $page;
    }

    /**
     * Build the mapping config.
     *
     * Starts from the default fields, adds the mapping of every field reported by
     * the engine, then adds the 'all' and 'all_near_match' fields and wires
     * copy_to entries into them.
     *
     * @return array the mapping config
     */
    public function buildConfig() {
        global $wgCirrusSearchWeights;

        $page = $this->getDefaultFields();

        $fields = $this->engine->getSearchIndexFields();

        foreach ( $fields as $fieldName => $field ) {
            if ( $field instanceof CirrusIndexField ) {
                $field->setMappingFlags( $this->flags );
            }
            $config = $field->getMapping( $this->engine );
            // Fields that return an empty mapping are left out of the index mapping.
            if ( $config ) {
                $page['properties'][$fieldName] = $config;
            }
        }

        // Unclear how this would otherwise fit into the process to construct the mapping.
        // Not used directly in cirrus, supports queries from 'add-a-link' (T301096).
        if ( isset( $page['properties']['outgoing_link'] ) ) {
            $page['properties']['outgoing_link']['fields']['token_count'] = [
                'type' => 'token_count',
                'analyzer' => 'keyword',
            ];
        }

        // Now layer all the fields into the all field once per weight.  Querying it isn't strictly the
        // same as querying each field - in some ways it is better!  In others it is worse....

        // Better because theoretically tf/idf based scoring works better this way.
        // Worse because we have to analyze each field multiple times....  Bleh!
        // This field can't be used for the fvh/experimental highlighter for several reasons:
        // 1. It is built with copy_to and not stored.
        // 2. The term frequency information is all whoppy compared to the "real" source text.
        $allField = $this->searchIndexFieldFactory->
            newStringField( 'all', TextIndexField::ENABLE_NORMS );
        $page['properties']['all'] =
            $allField->setMappingFlags( $this->flags )->getMapping( $this->engine );
        $page = $this->setupCopyTo( $page, $wgCirrusSearchWeights, 'all' );

        // Now repeat for near_match fields.  The same considerations above apply except near_match
        // is never used in phrase queries or highlighting.
        $page[ 'properties' ][ 'all_near_match' ] = [
            'type' => 'text',
            'analyzer' => 'near_match',
            'index_options' => 'freqs',
            'norms' => false,
            'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match' ),
            'fields' => [
                'asciifolding' => [
                    'type' => 'text',
                    'analyzer' => 'near_match_asciifolding',
                    'index_options' => 'freqs',
                    'norms' => false,
                    'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match', 'asciifolding' ),
                ],
            ],
        ];
        // Only title and redirect titles feed the near match field.
        $nearMatchFields = [
            'title' => $wgCirrusSearchWeights[ 'title' ],
            'redirect' => $wgCirrusSearchWeights[ 'redirect' ],
        ];
        return $this->setupCopyTo( $page, $nearMatchFields, 'all_near_match' );
    }

    /**
     * Setup copy_to for some fields to $destination.
     *
     * $destination is appended to a field's copy_to list once per unit of weight,
     * so a field with weight 3 is copied three times.
     *
     * @param array $config to modify
     * @param array $fields field name to number of times copied
     * @param string $destination destination of the copy
     * @return array $config modified with the copy_to setup
     */
    private function setupCopyTo( $config, $fields, $destination ) {
        foreach ( $fields as $field => $weight ) {
            // Note that this causes weights that are not whole numbers to be rounded up
            // (e.g. 2.5 results in 3 copies). We're ok with that because we don't have a choice.
            for ( $r = 0; $r < $weight; $r++ ) {
                if ( $field === 'redirect' ) {
                    // Redirect is in a funky place: the text lives in the nested redirect.title field.
                    $config[ 'properties' ][ 'redirect' ][ 'properties' ][ 'title' ][ 'copy_to' ][] = $destination;
                } else {
                    $config[ 'properties' ][ $field ][ 'copy_to' ][] = $destination;
                }
            }
        }

        return $config;
    }

    /**
     * Build the source_text index field
     *
     * Protected so that subclasses can supply a different field implementation.
     *
     * @param string $fieldName usually "source_text"
     * @return SourceTextIndexField
     */
    protected function buildSourceTextStringField( $fieldName ) {
        return new SourceTextIndexField( $fieldName, SearchIndexField::INDEX_TYPE_TEXT, $this->config );
    }

    /**
     * Build the mapping of the 'text' field: the regular string field mapping
     * plus a 'word_count' sub-field of type token_count using the 'plain' analyzer.
     *
     * @return array
     */
    private function getTextFieldMapping() {
        $stringFieldMapping = $this->searchIndexFieldFactory->newStringField(
            'text',
            null,
            []
        )->setMappingFlags( $this->flags )->getMapping( $this->engine );

        $extraFieldMapping = [
            'fields' => [
                'word_count' => [
                    'type' => 'token_count',
                    'analyzer' => 'plain',
                ]
            ]
        ];

        // Recursive merge so that any 'fields' already present in the string mapping are kept
        // alongside word_count instead of being overwritten.
        $textFieldMapping = array_merge_recursive( $stringFieldMapping, $extraFieldMapping );

        return $textFieldMapping;
    }

    /**
     * Whether or not it's safe to optimize the analysis config.
     * It's generally safe to optimize if all the analyzers needed are
     * properly referenced in the mapping.
     * In the case an analyzer is used directly in a query but not referenced
     * in the mapping it's not safe to optimize.
     *
     * @return bool always true for this builder
     */
    public function canOptimizeAnalysisConfig() {
        return true;
    }
}