Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
96.39% covered (success)
96.39%
160 / 166
71.43% covered (warning)
71.43%
5 / 7
CRAP
0.00% covered (danger)
0.00%
0 / 1
MappingConfigBuilder
96.39% covered (success)
96.39%
160 / 166
71.43% covered (warning)
71.43%
5 / 7
18
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
3
 getDefaultFields
94.74% covered (success)
94.74%
90 / 95
0.00% covered (danger)
0.00%
0 / 1
3.00
 buildConfig
100.00% covered (success)
100.00%
39 / 39
100.00% covered (success)
100.00%
1 / 1
5
 setupCopyTo
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
4
 buildSourceTextStringField
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getTextFieldMapping
100.00% covered (success)
100.00%
15 / 15
100.00% covered (success)
100.00%
1 / 1
1
 canOptimizeAnalysisConfig
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
1<?php
2
3namespace CirrusSearch\Maintenance;
4
5use CirrusSearch\CirrusSearch;
6use CirrusSearch\CirrusSearchHookRunner;
7use CirrusSearch\Search\CirrusIndexField;
8use CirrusSearch\Search\CirrusSearchIndexFieldFactory;
9use CirrusSearch\Search\SourceTextIndexField;
10use CirrusSearch\Search\TextIndexField;
11use CirrusSearch\SearchConfig;
12use MediaWiki\MediaWikiServices;
13use SearchIndexField;
14
15/**
16 * Builds elasticsearch mapping configuration arrays.
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * This program is distributed in the hope that it will be useful,
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 * GNU General Public License for more details.
27 *
28 * You should have received a copy of the GNU General Public License along
29 * with this program; if not, write to the Free Software Foundation, Inc.,
30 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
31 * http://www.gnu.org/copyleft/gpl.html
32 */
class MappingConfigBuilder {
    // Bit flags accepted by the constructor's $flags argument. They are stored in
    // $this->flags and forwarded to the index fields through setMappingFlags().
    public const PREFIX_START_WITH_ANY = 1;
    public const PHRASE_SUGGEST_USE_TEXT = 2;
    public const OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER = 4;

    /**
     * Version number for the core analysis. Increment the major
     * version when the analysis changes in an incompatible way,
     * and change the minor version when it changes but isn't
     * incompatible
     */
    public const VERSION = '1.10';

    /**
     * @var bool should the index be optimized for the experimental highlighter?
     */
    private $optimizeForExperimentalHighlighter;

    /**
     * @var SearchConfig configuration as reported by the engine (see constructor)
     */
    private $config;

    /**
     * @var CirrusSearch engine handed to every field's getMapping() call
     */
    protected $engine;

    /**
     * @var CirrusSearchIndexFieldFactory factory used to create the built-in fields
     */
    protected $searchIndexFieldFactory;

    /**
     * @var int bit field made of the class constants declared above
     */
    protected $flags = 0;

    /**
     * @var CirrusSearchHookRunner
     */
    private $cirrusSearchHookRunner;

    /**
     * @param bool $optimizeForExperimentalHighlighter should the index be optimized for the
     *  experimental highlighter? When truthy, OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER is added
     *  to $flags.
     * @param int $flags bit field of the class constants
     * @param SearchConfig|null $config passed as-is to the CirrusSearch engine; the config
     *  used by this builder is the one the engine returns from getConfig()
     * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner when null, a runner is built
     *  from the hook container of the global MediaWikiServices instance
     */
    public function __construct(
        $optimizeForExperimentalHighlighter,
        $flags = 0,
        ?SearchConfig $config = null,
        ?CirrusSearchHookRunner $cirrusSearchHookRunner = null
    ) {
        $this->optimizeForExperimentalHighlighter = $optimizeForExperimentalHighlighter;
        if ( $this->optimizeForExperimentalHighlighter ) {
            $flags |= self::OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER;
        }
        $this->flags = $flags;
        $this->engine = new CirrusSearch( $config );
        $this->config = $this->engine->getConfig();
        $this->searchIndexFieldFactory = new CirrusSearchIndexFieldFactory( $this->config );
        // The argument is either an object or null, so a null check is all that is needed.
        $this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?? new CirrusSearchHookRunner(
            MediaWikiServices::getInstance()->getHookContainer() );
    }

    /**
     * Extra analyzer definitions shared by the 'title' and 'redirect.title' fields.
     *
     * @return array[] list of analyzer option arrays
     */
    private function getTitleExtraAnalyzers(): array {
        $analyzers = [
            [ 'analyzer' => 'prefix', 'search_analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => false ],
            [
                'analyzer' => 'prefix_asciifolding',
                'search_analyzer' => 'near_match_asciifolding',
                'index_options' => 'docs',
                'norms' => false
            ],
            [ 'analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => false ],
            [ 'analyzer' => 'near_match_asciifolding', 'index_options' => 'docs', 'norms' => false ],
            [ 'analyzer' => 'keyword', 'index_options' => 'docs', 'norms' => false ],
        ];
        // Only added when the builder was created with the PREFIX_START_WITH_ANY flag.
        if ( $this->flags & self::PREFIX_START_WITH_ANY ) {
            $analyzers[] = [
                'analyzer' => 'word_prefix',
                'search_analyzer' => 'plain_search',
                'index_options' => 'docs'
            ];
        }

        return $analyzers;
    }

    /**
     * Mapping of the 'suggest' field, including the optional 'reverse' sub field.
     *
     * @return array
     */
    private function getSuggestFieldMapping(): array {
        $suggestField = [
            'type' => 'text',
            'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest' ),
            'index_options' => 'freqs',
            'analyzer' => 'suggest',
        ];

        // The reverse sub field is controlled by the 'build' element of
        // CirrusSearchPhraseSuggestReverseField.
        if ( $this->config->getElement( 'CirrusSearchPhraseSuggestReverseField', 'build' ) ) {
            $suggestField['fields'] = [
                'reverse' => [
                    'type' => 'text',
                    'similarity' => TextIndexField::getSimilarity( $this->config, 'suggest', 'reverse' ),
                    'index_options' => 'freqs',
                    'analyzer' => 'suggest_reverse',
                ],
            ];
        }

        return $suggestField;
    }

    /**
     * Get definitions for default index fields.
     * These fields are always present in the index.
     * @return array
     */
    private function getDefaultFields(): array {
        // Note never to set something as type='object' here because that isn't returned by elasticsearch
        // and is inferred anyway.
        $titleExtraAnalyzers = $this->getTitleExtraAnalyzers();
        $suggestField = $this->getSuggestFieldMapping();

        return [
            'dynamic' => false,
            'properties' => [
                'timestamp' => [
                    'type' => 'date',
                    'format' => 'dateOptionalTime',
                ],
                'create_timestamp' => [
                    'type' => 'date',
                    'format' => 'dateOptionalTime',
                ],
                'page_id' => [
                    'type' => 'long',
                    'index' => false,
                ],
                'wiki' => $this->searchIndexFieldFactory
                    ->newKeywordField( 'wiki' )
                    ->getMapping( $this->engine ),
                'namespace' => $this->searchIndexFieldFactory
                    ->newLongField( 'namespace' )
                    ->getMapping( $this->engine ),
                'namespace_text' => $this->searchIndexFieldFactory
                    ->newKeywordField( 'namespace_text' )
                    ->getMapping( $this->engine ),
                'title' => $this->searchIndexFieldFactory->newStringField( 'title',
                    TextIndexField::ENABLE_NORMS | TextIndexField::COPY_TO_SUGGEST |
                    TextIndexField::SUPPORT_REGEX,
                    $titleExtraAnalyzers )->setMappingFlags( $this->flags )->getMapping( $this->engine ),
                'text' => $this->getTextFieldMapping(),
                'text_bytes' => $this->searchIndexFieldFactory
                    ->newLongField( 'text_bytes' )
                    ->getMapping( $this->engine ),
                'source_text' => $this->buildSourceTextStringField( 'source_text' )
                    ->setMappingFlags( $this->flags )->getMapping( $this->engine ),
                'redirect' => [
                    'dynamic' => false,
                    'properties' => [
                        'namespace' => $this->searchIndexFieldFactory
                            ->newLongField( 'namespace' )
                            ->getMapping( $this->engine ),
                        'title' => $this->searchIndexFieldFactory
                            ->newStringField( 'redirect.title', TextIndexField::ENABLE_NORMS
                                | TextIndexField::SPEED_UP_HIGHLIGHTING
                                | TextIndexField::COPY_TO_SUGGEST
                                | TextIndexField::SUPPORT_REGEX,
                                $titleExtraAnalyzers
                            )
                            ->setMappingFlags( $this->flags )
                            ->getMapping( $this->engine ),
                    ]
                ],
                'incoming_links' => $this->searchIndexFieldFactory
                    ->newLongField( 'incoming_links' )
                    ->getMapping( $this->engine ),
                'local_sites_with_dupe' => $this->searchIndexFieldFactory
                    ->newKeywordField( 'local_sites_with_dupe' )
                    ->setFlag( SearchIndexField::FLAG_CASEFOLD )
                    ->getMapping( $this->engine ),
                'suggest' => $suggestField,
            ]
        ];
    }

    /**
     * Build the mapping config.
     *
     * Starts from the default fields, adds the mapping of every field reported by the
     * engine, then wires the 'all' and 'all_near_match' fields through copy_to.
     *
     * @return array the mapping config
     */
    public function buildConfig() {
        // NOTE(review): the weights are read from the global rather than from $this->config;
        // confirm that is intended when the builder is given a custom SearchConfig.
        global $wgCirrusSearchWeights;

        $page = $this->getDefaultFields();

        $fields = $this->engine->getSearchIndexFields();

        foreach ( $fields as $fieldName => $field ) {
            if ( $field instanceof CirrusIndexField ) {
                $field->setMappingFlags( $this->flags );
            }
            $config = $field->getMapping( $this->engine );
            // Fields that return a falsy mapping are left out of the index mapping.
            if ( $config ) {
                $page['properties'][$fieldName] = $config;
            }
        }

        // Unclear how this would otherwise fit into the process to construct the mapping.
        // Not used directly in cirrus, supports queries from 'add-a-link' (T301096).
        if ( isset( $page['properties']['outgoing_link'] ) ) {
            $page['properties']['outgoing_link']['fields']['token_count'] = [
                'type' => 'token_count',
                'analyzer' => 'keyword',
            ];
        }

        // Now layer all the fields into the all field once per weight.  Querying it isn't strictly the
        // same as querying each field - in some ways it is better!  In others it is worse....

        // Better because theoretically tf/idf based scoring works better this way.
        // Worse because we have to analyze each field multiple times....  Bleh!
        // This field can't be used for the fvh/experimental highlighter for several reasons:
        // 1. It is built with copy_to and not stored.
        // 2. The term frequency information is all whoppy compared to the "real" source text.
        $allField = $this->searchIndexFieldFactory->
            newStringField( 'all', TextIndexField::ENABLE_NORMS );
        $page['properties']['all'] =
            $allField->setMappingFlags( $this->flags )->getMapping( $this->engine );
        $page = $this->setupCopyTo( $page, $wgCirrusSearchWeights, 'all' );

        // Now repeat for near_match fields.  The same considerations above apply except near_match
        // is never used in phrase queries or highlighting.
        $page[ 'properties' ][ 'all_near_match' ] = [
            'type' => 'text',
            'analyzer' => 'near_match',
            'index_options' => 'freqs',
            'norms' => false,
            'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match' ),
            'fields' => [
                'asciifolding' => [
                    'type' => 'text',
                    'analyzer' => 'near_match_asciifolding',
                    'index_options' => 'freqs',
                    'norms' => false,
                    'similarity' => TextIndexField::getSimilarity( $this->config, 'all_near_match', 'asciifolding' ),
                ],
            ],
        ];
        $nearMatchFields = [
            'title' => $wgCirrusSearchWeights[ 'title' ],
            'redirect' => $wgCirrusSearchWeights[ 'redirect' ],
        ];
        return $this->setupCopyTo( $page, $nearMatchFields, 'all_near_match' );
    }

    /**
     * Setup copy_to for some fields to $destination.
     *
     * $destination is appended to a field's copy_to list once per unit of weight, so a
     * field with weight 3 is copied three times.
     *
     * @param array $config to modify
     * @param array $fields field name to number of times copied
     * @param string $destination destination of the copy
     * @return array $config modified with the copy_to setup
     */
    private function setupCopyTo( array $config, array $fields, string $destination ): array {
        foreach ( $fields as $field => $weight ) {
            // Note that this causes weights that are not whole numbers to be rounded up.
            // We're ok with that because we don't have a choice.
            for ( $r = 0; $r < $weight; $r++ ) {
                if ( $field === 'redirect' ) {
                    // Redirect is in a funky place: the text lives in the nested 'title' property.
                    $config[ 'properties' ][ 'redirect' ][ 'properties' ][ 'title' ][ 'copy_to' ][] = $destination;
                } else {
                    $config[ 'properties' ][ $field ][ 'copy_to' ][] = $destination;
                }
            }
        }

        return $config;
    }

    /**
     * Build the source_text index field
     *
     * @param string $fieldName usually "source_text"
     * @return SourceTextIndexField
     */
    protected function buildSourceTextStringField( $fieldName ) {
        return new SourceTextIndexField( $fieldName, SearchIndexField::INDEX_TYPE_TEXT, $this->config );
    }

    /**
     * Mapping of the 'text' field: the factory's string field mapping with an extra
     * 'word_count' token_count sub field merged in.
     *
     * @return array
     */
    private function getTextFieldMapping(): array {
        $stringFieldMapping = $this->searchIndexFieldFactory->newStringField(
            'text',
            null,
            []
        )->setMappingFlags( $this->flags )->getMapping( $this->engine );

        $extraFieldMapping = [
            'fields' => [
                'word_count' => [
                    'type' => 'token_count',
                    'analyzer' => 'plain',
                ]
            ]
        ];

        // Recursive merge so that any 'fields' entry already in the string mapping is
        // combined with word_count rather than replaced by it.
        return array_merge_recursive( $stringFieldMapping, $extraFieldMapping );
    }

    /**
     * Whether or not it's safe to optimize the analysis config.
     * It's generally safe to optimize if all the analyzers needed are
     * properly referenced in the mapping.
     * In the case an analyzer is used directly in a query but not referenced
     * in the mapping it's not safe to optimize.
     *
     * @return bool always true in this class
     */
    public function canOptimizeAnalysisConfig() {
        return true;
    }
}