Code Coverage
 
Classes and Traits
Functions and Methods
Lines
Total
0.00% covered (danger)
0.00%
0 / 1
69.57% covered (warning)
69.57%
16 / 23
CRAP
97.72% covered (success)
97.72%
472 / 483
AnalysisConfigBuilder
0.00% covered (danger)
0.00%
0 / 1
69.57% covered (warning)
69.57%
16 / 23
168
97.72% covered (success)
97.72%
472 / 483
 __construct
0.00% covered (danger)
0.00%
0 / 1
8.23
84.62% covered (warning)
84.62%
22 / 26
 shouldActivateIcuFolding
0.00% covered (danger)
0.00%
0 / 1
8.02
92.86% covered (success)
92.86%
13 / 14
 shouldActivateIcuTokenization
100.00% covered (success)
100.00%
1 / 1
5
100.00% covered (success)
100.00%
10 / 10
 buildConfig
0.00% covered (danger)
0.00%
0 / 1
4.01
90.91% covered (success)
90.91%
10 / 11
 buildSimilarityConfig
0.00% covered (danger)
0.00%
0 / 1
2
0.00% covered (danger)
0.00%
0 / 1
 enableICUTokenizer
100.00% covered (success)
100.00%
1 / 1
6
100.00% covered (success)
100.00%
6 / 6
 enableICUFolding
100.00% covered (success)
100.00%
1 / 1
12
100.00% covered (success)
100.00%
27 / 27
 switchFiltersToICUFolding
100.00% covered (success)
100.00%
1 / 1
1
100.00% covered (success)
100.00%
3 / 3
 switchFiltersToICUFoldingPreserve
0.00% covered (danger)
0.00%
0 / 1
7.01
94.44% covered (success)
94.44%
17 / 18
 getICUSetFilter
0.00% covered (danger)
0.00%
0 / 1
18.09
93.55% covered (success)
93.55%
29 / 31
 getICUNormSetFilter
0.00% covered (danger)
0.00%
0 / 1
3.07
80.00% covered (warning)
80.00%
4 / 5
 defaults
100.00% covered (success)
100.00%
1 / 1
5
100.00% covered (success)
100.00%
12 / 12
 customize
100.00% covered (success)
100.00%
1 / 1
43
100.00% covered (success)
100.00%
230 / 230
 fixAsciiFolding
100.00% covered (success)
100.00%
1 / 1
7
100.00% covered (success)
100.00%
13 / 13
 getDefaultTextAnalyzerType
100.00% covered (success)
100.00%
1 / 1
2
100.00% covered (success)
100.00%
3 / 3
 getDefaultFilters
100.00% covered (success)
100.00%
1 / 1
5
100.00% covered (success)
100.00%
8 / 8
 resolveFilters
100.00% covered (success)
100.00%
1 / 1
4
100.00% covered (success)
100.00%
10 / 10
 replaceFilter
100.00% covered (success)
100.00%
1 / 1
4
100.00% covered (success)
100.00%
8 / 8
 mergeConfig
100.00% covered (success)
100.00%
1 / 1
11
100.00% covered (success)
100.00%
17 / 17
 buildLanguageConfigs
100.00% covered (success)
100.00%
1 / 1
4
100.00% covered (success)
100.00%
11 / 11
 isIcuAvailable
100.00% covered (success)
100.00%
1 / 1
1
100.00% covered (success)
100.00%
1 / 1
 enableHomoglyphPlugin
100.00% covered (success)
100.00%
1 / 1
3
100.00% covered (success)
100.00%
5 / 5
 insertHomoglyphFilter
100.00% covered (success)
100.00%
1 / 1
6
100.00% covered (success)
100.00%
13 / 13
<?php
namespace CirrusSearch\Maintenance;
use CirrusSearch\CirrusSearch;
use CirrusSearch\CirrusSearchHookRunner;
use CirrusSearch\Profile\SearchProfileService;
use CirrusSearch\SearchConfig;
use MediaWiki\MediaWikiServices;
/**
 * Builds elasticsearch analysis config arrays.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */
class AnalysisConfigBuilder {
    /**
     * Version number for the core analysis. Increment the major
     * version when the analysis changes in an incompatible way,
     * and change the minor version when it changes but isn't
     * incompatible.
     *
     * You may also need to increment MetaStoreIndex::METASTORE_VERSION
     * manually.
     */
    public const VERSION = '0.12';
    /**
     * Maximum number of characters allowed in keyword terms.
     */
    private const KEYWORD_IGNORE_ABOVE = 5000;
    /**
     * @var bool is the icu plugin available?
     */
    private $icu;
    /**
     * @var array Similarity algo (tf/idf, bm25, etc) configuration
     */
    private $similarity;
    /**
     * @var SearchConfig cirrus config
     */
    protected $config;
    /**
     * @var string[]
     */
    private $plugins;
    /**
     * @var string
     */
    protected $defaultLanguage;
    /**
     * @var CirrusSearchHookRunner
     */
    private $cirrusSearchHookRunner;
    /**
     * @param string $langCode The language code to build config for
     * @param string[] $plugins list of plugins installed in Elasticsearch
     * @param SearchConfig|null $config
     * @param CirrusSearchHookRunner|null $cirrusSearchHookRunner
     */
    public function __construct(
        $langCode,
        array $plugins,
        SearchConfig $config = null,
        CirrusSearchHookRunner $cirrusSearchHookRunner = null
    ) {
        $this->defaultLanguage = $langCode;
        $this->plugins = $plugins;
        foreach ( $this->elasticsearchLanguageAnalyzersFromPlugins as $pluginSpec => $extra ) {
            $pluginsPresent = 1;
            $pluginList = explode( ',', $pluginSpec );
            foreach ( $pluginList as $plugin ) {
                if ( !in_array( $plugin, $plugins ) ) {
                    $pluginsPresent = 0;
                    break;
                }
            }
            if ( $pluginsPresent ) {
                $this->elasticsearchLanguageAnalyzers =
                    array_merge( $this->elasticsearchLanguageAnalyzers, $extra );
            }
        }
        $this->icu = in_array( 'analysis-icu', $plugins );
        if ( $config === null ) {
            $config = MediaWikiServices::getInstance()
                ->getConfigFactory()
                ->makeConfig( 'CirrusSearch' );
        }
        $similarity = $config->getProfileService()->loadProfile( SearchProfileService::SIMILARITY );
        if ( !array_key_exists( 'similarity', $similarity ) ) {
            $similarity['similarity'] = [];
        }
        $this->cirrusSearchHookRunner = $cirrusSearchHookRunner ?: new CirrusSearchHookRunner(
            MediaWikiServices::getInstance()->getHookContainer() );
        $this->cirrusSearchHookRunner->onCirrusSearchSimilarityConfig( $similarity['similarity'] );
        $this->similarity = $similarity;
        $this->config = $config;
    }
    /**
     * Determine if ascii folding should be used
     * @param string $language Config language
     * @return bool true if icu folding should be enabled
     */
    public function shouldActivateIcuFolding( $language ) {
        if ( !$this->isIcuAvailable() || !in_array( 'extra', $this->plugins ) ) {
            // ICU folding requires the icu plugin and the extra plugin
            return false;
        }
        $in_config = $this->config->get( 'CirrusSearchUseIcuFolding' );
        // BC code, this config var was originally a simple boolean
        if ( $in_config === true ) {
            $in_config = 'yes';
        }
        if ( $in_config === false ) {
            $in_config = 'no';
        }
        switch ( $in_config ) {
        case 'yes':
            return true;
        case 'no':
            return false;
        case 'default':
            return $this->languagesWithIcuFolding[$language] ?? false;
        default:
            return false;
        }
    }
    /**
     * Determine if the icu tokenizer can be enabled
     * @param string $language Config language
     * @return bool
     */
    public function shouldActivateIcuTokenization( $language ) {
        if ( !$this->isIcuAvailable() ) {
            // requires the icu plugin
            return false;
        }
        $in_config = $this->config->get( 'CirrusSearchUseIcuTokenizer' );
        switch ( $in_config ) {
        case 'yes':
            return true;
        case 'no':
            return false;
        case 'default':
            return $this->languagesWithIcuTokenization[$language] ?? false;
        default:
            return false;
        }
    }
    /**
     * Build the analysis config.
     *
     * @param string|null $language Config language
     * @return array the analysis config
     */
    public function buildConfig( $language = null ) {
        if ( $language === null ) {
            $language = $this->defaultLanguage;
        }
        $config = $this->customize( $this->defaults( $language ), $language );
        $this->cirrusSearchHookRunner->onCirrusSearchAnalysisConfig( $config, $this );
        $config = $this->enableHomoglyphPlugin( $config, $language );
        if ( $this->shouldActivateIcuTokenization( $language ) ) {
            $config = $this->enableICUTokenizer( $config );
        }
        if ( $this->shouldActivateIcuFolding( $language ) ) {
            $config = $this->enableICUFolding( $config, $language );
        }
        $config = $this->fixAsciiFolding( $config );
        return $config;
    }
    /**
     * @return array|null the similarity config
     */
    public function buildSimilarityConfig() {
        return $this->similarity['similarity'] ?? null;
    }
    /**
     * replace the standard tokenizer with icu_tokenizer
     * @param mixed[] $config
     * @return mixed[] update config
     */
    public function enableICUTokenizer( array $config ) {
        foreach ( $config[ 'analyzer' ] as $name => &$value ) {
            if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) {
                continue;
            }
            if ( isset( $value[ 'tokenizer' ] ) && $value[ 'tokenizer' ] === 'standard' ) {
                $value[ 'tokenizer' ] = 'icu_tokenizer';
            }
        }
        return $config;
    }
    /**
     * Activate ICU folding instead of asciifolding
     * @param mixed[] $config
     * @param string $language Config language
     * @return mixed[] update config
     */
    public function enableICUFolding( array $config, $language ) {
        $unicodeSetFilter = $this->getICUSetFilter( $language );
        $filter = [
            'type' => 'icu_folding',
        ];
        if ( !empty( $unicodeSetFilter ) ) {
            $filter[ 'unicodeSetFilter' ] = $unicodeSetFilter;
        }
        $config[ 'filter' ][ 'icu_folding' ] = $filter;
        // Adds a simple nfkc normalizer for cases where
        // we preserve original but the lowercase filter
        // is not used before
        $config[ 'filter' ][ 'icu_nfkc_normalization' ] = [
            'type' => 'icu_normalizer',
            'name' => 'nfkc',
        ];
        $newfilters = [];
        foreach ( $config[ 'analyzer' ] as $name => $value ) {
            if ( isset( $value[ 'type' ] ) && $value[ 'type' ] != 'custom' ) {
                continue;
            }
            if ( !isset( $value[ 'filter' ] ) ) {
                continue;
            }
            if ( in_array( 'asciifolding', $value[ 'filter' ] ) ) {
                $newfilters[ $name ] = $this->switchFiltersToICUFolding( $value[ 'filter' ] );
            }
            if ( in_array( 'asciifolding_preserve', $value[ 'filter' ] ) ) {
                $newfilters[ $name ] = $this->switchFiltersToICUFoldingPreserve( $value[ 'filter' ] );
            }
        }
        foreach ( $newfilters as $name => $filters ) {
            $config[ 'analyzer' ][ $name ][ 'filter' ] = $filters;
        }
        // Explicitly enable icu_folding on plain analyzers if it's not
        // already enabled
        foreach ( [ 'plain' ] as $analyzer ) {
            if ( !isset( $config[ 'analyzer' ][ $analyzer ] ) ) {
                continue;
            }
            if ( !isset( $config[ 'analyzer' ][ $analyzer ][ 'filter' ] ) ) {
                $config[ 'analyzer' ][ $analyzer ][ 'filter' ] = [];
            }
            $config[ 'analyzer' ][ $analyzer ][ 'filter' ] =
                $this->switchFiltersToICUFoldingPreserve(
                    // @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset
                    $config[ 'analyzer' ][ $analyzer ][ 'filter' ], true );
        }
        return $config;
    }
    /**
     * Replace occurrence of asciifolding to icu_folding
     * @param string[] $filters
     * @return string[] new list of filters
     */
    private function switchFiltersToICUFolding( array $filters ) {
        array_splice( $filters, array_search( 'asciifolding', $filters ), 1,
            [ 'icu_folding', 'remove_empty' ] );
        return $filters;
    }
    /**
     * Replace occurrence of asciifolding_preserve with a set
     * of compatible filters to enable icu_folding
     * @param string[] $filters
     * @param bool $append append icu_folding even if asciifolding is not present
     * @return string[] new list of filters
     */
    private function switchFiltersToICUFoldingPreserve( array $filters, $append = false ) {
        if ( in_array( 'icu_folding', $filters ) ) {
            // ICU folding already here
            return $filters;
        }
        $ap_idx = array_search( 'asciifolding_preserve', $filters );
        if ( $ap_idx === false && $append ) {
            $ap_idx = count( $filters );
            // fake an asciifolding_preserve so we can
            // reuse code that replaces it
            $filters[] = 'asciifolding_preserve';
        }
        if ( $ap_idx === false ) {
            return $filters;
        }
        // with ICU lowercase is replaced by icu_normalizer/nfkc_cf
        // thus unicode normalization is already done.
        $lc_idx = array_search( 'icu_normalizer', $filters );
        $newfilters = [];
        if ( $lc_idx === false || $lc_idx > $ap_idx ) {
            // If lowercase is not detected before we
            // will have to do some icu normalization
            // this is to prevent preserving "un-normalized"
            // unicode chars.
            $newfilters[] = 'icu_nfkc_normalization';
        }
        $newfilters[] = 'preserve_original_recorder';
        $newfilters[] = 'icu_folding';
        $newfilters[] = 'preserve_original';
        $newfilters[] = 'remove_empty';
        array_splice( $filters, $ap_idx, 1, $newfilters );
        return $filters;
    }
    /**
     * Return the list of chars to exclude from ICU folding
     * @param string $language Config language
     * @return null|string
     */
    protected function getICUSetFilter( $language ) {
        if ( $this->config->get( 'CirrusSearchICUFoldingUnicodeSetFilter' ) !== null ) {
            return $this->config->get( 'CirrusSearchICUFoldingUnicodeSetFilter' );
        }
        switch ( $language ) {
        /* @todo: complete the default filters per language
         * For Swedish (sv), see https://www.mediawiki.org/wiki/User:TJones_(WMF)/T160562
         * For Serbian (sr), see https://www.mediawiki.org/wiki/User:TJones_(WMF)/T183015
         * For Bosnian (bs), Croatian (hr), and Serbo-Croatian (sh),
         *   see https://www.mediawiki.org/wiki/User:TJones_(WMF)/T192395
         * For Esperanto (eo), see https://www.mediawiki.org/wiki/User:TJones_(WMF)/T202173
         * For Slovak (sk)—which has no folding configured here!—see:
         *   https://www.mediawiki.org/wiki/User:TJones_(WMF)/T223787
         * For Spanish (es), see T277699
         * For German (de), see T281379
         * For Basque (eu) and Danish (da), see T283366
         * For Czech (cs), Finnish (fi), and Galician (gl), see T284578
         * For Norwegian (nb, nn), see T289612
         */
        case 'bs':
        case 'hr':
        case 'sh':
        case 'sr':
            return '[^ĐđŽžĆ抚Čč]';
        case 'cs':
            return '[^ÁáČčĎďÉéĚěÍíŇňÓóŘřŠšŤťÚúŮůÝýŽž]';
        case 'da':
            return '[^ÆæØøÅå]';
        case 'de':
            return '[^ÄäÖöÜüẞß]';
        case 'eo':
            return '[^ĈĉĜĝĤĥĴĵŜŝŬŭ]';
        case 'es':
            return '[^Ññ]';
        case 'eu':
            return '[^Ññ]';
        case 'fi':
            return '[^ÅåÄäÖö]';
        case 'gl':
            return '[^Ññ]';
        case 'nb':
        case 'nn':
            return '[^ÆæØøÅå]';
        case 'ru':
            return '[^Йй]';
        case 'sv':
            return '[^ÅåÄäÖö]';
        default:
            return null;
        }
    }
    /**
     * Return the list of chars to exclude from ICU normalization
     * @param string $language Config language
     * @return null|string
     */
    protected function getICUNormSetFilter( $language ) {
        if ( $this->config->get( 'CirrusSearchICUNormalizationUnicodeSetFilter' ) !== null ) {
            return $this->config->get( 'CirrusSearchICUNormalizationUnicodeSetFilter' );
        }
        switch ( $language ) {
        /* For German (de), see T281379
         */
        case 'de':
            return '[^ẞß]'; // Capital ẞ is lowercased to ß by german_charfilter
                            // lowercase ß is normalized to ss by german_normalization
        default:
            return null;
        }
    }
    /**
     * Build an analysis config with sane defaults.
     *
     * @param string $language Config language
     * @return array
     */
    private function defaults( $language ) {
        $defaults = [
            'analyzer' => [
                'text' => [
                    // These defaults are not applied to non-custom
                    // analysis chains, i.e., those that use the
                    // default language analyzers on 'text'
                    'type' => $this->getDefaultTextAnalyzerType( $language ),
                    'char_filter' => [ 'word_break_helper' ],
                ],
                // text_search is not configured here because it will be copied from text
                'plain' => [
                    // Surprisingly, the Lucene docs claim this works for
                    // Chinese, Japanese, and Thai as well.
                    // The difference between this and the 'standard'
                    // analyzer is the lack of english stop words.
                    'type' => 'custom',
                    'char_filter' => [ 'word_break_helper' ],
                    'tokenizer' => 'standard',
                    'filter' => [ 'lowercase' ],
                ],
                'plain_search' => [
                    // In accent squashing languages this will not contain accent
                    // squashing to allow searches with accents to only find accents
                    // and searches without accents to find both.
                    'type' => 'custom',
                    'char_filter' => [ 'word_break_helper' ],
                    'tokenizer' => 'standard',
                    'filter' => [ 'lowercase' ],
                ],
                // Used by ShortTextIndexField
                'short_text' => [
                    'type' => 'custom',
                    'tokenizer' => 'whitespace',
                    'filter' => [ 'lowercase', 'aggressive_splitting', 'asciifolding_preserve' ],
                ],
                'short_text_search' => [
                    'type' => 'custom',
                    'tokenizer' => 'whitespace',
                    'filter' => [ 'lowercase', 'aggressive_splitting' ],
                ],
                'source_text_plain' => [
                    'type' => 'custom',
                    'tokenizer' => 'standard',
                    'filter' => [ 'lowercase' ],
                    'char_filter' => [ 'word_break_helper_source_text' ],
                ],
                'source_text_plain_search' => [
                    'type' => 'custom',
                    'char_filter' => [ 'word_break_helper_source_text' ],
                    'tokenizer' => 'standard',
                    'filter' => [ 'lowercase' ],
                ],
                'suggest' => [
                    'type' => 'custom',
                    'tokenizer' => 'standard',
                    'filter' => [ 'lowercase', 'suggest_shingle' ],
                ],
                'suggest_reverse' => [
                    'type' => 'custom',
                    'tokenizer' => 'standard',
                    'filter' => [ 'lowercase', 'suggest_shingle', 'reverse' ],
                ],
                'token_reverse' => [
                    'type' => 'custom',
                    'tokenizer' => 'no_splitting',
                    'filter' => [ 'reverse' ]
                ],
                'near_match' => [
                    'type' => 'custom',
                    'char_filter' => [ 'near_space_flattener' ],
                    'tokenizer' => 'no_splitting',
                    'filter' => [ 'lowercase' ],
                ],
                'near_match_asciifolding' => [
                    'type' => 'custom',
                    'char_filter' => [ 'near_space_flattener' ],
                    'tokenizer' => 'no_splitting',
                    'filter' => [ 'truncate_keyword', 'lowercase', 'asciifolding' ],
                ],
                'prefix' => [
                    'type' => 'custom',
                    'char_filter' => [ 'near_space_flattener' ],
                    'tokenizer' => 'prefix',
                    'filter' => [ 'lowercase' ],
                ],
                'prefix_asciifolding' => [
                    'type' => 'custom',
                    'char_filter' => [ 'near_space_flattener' ],
                    'tokenizer' => 'prefix',
                    'filter' => [ 'lowercase', 'asciifolding' ],
                ],
                'word_prefix' => [
                    'type' => 'custom',
                    'tokenizer' => 'standard',
                    'filter' => [ 'lowercase', 'prefix_ngram_filter' ],
                ],
                'keyword' => [
                    'type' => 'custom',
                    'tokenizer' => 'no_splitting',
                    'filter' => [ 'truncate_keyword' ],
                ],
                'lowercase_keyword' => [
                    'type' => 'custom',
                    'tokenizer' => 'no_splitting',
                    'filter' => [ 'truncate_keyword', 'lowercase' ],
                ],
                'trigram' => [
                    'type' => 'custom',
                    'tokenizer' => 'trigram',
                    'filter' => [ 'lowercase' ],
                ],
            ],
            'filter' => [
                'suggest_shingle' => [
                    'type' => 'shingle',
                    'min_shingle_size' => 2,
                    'max_shingle_size' => 3,
                    'output_unigrams' => true,
                ],
                'lowercase' => [
                    'type' => 'lowercase',
                ],
                'aggressive_splitting' => [
                    'type' => 'word_delimiter',
                    'stem_english_possessive' => false,
                    // 'catenate_words' => true, // Might be useful but causes errors on indexing
                    // 'catenate_numbers' => true, // Might be useful but causes errors on indexing
                    // 'catenate_all' => true, // Might be useful but causes errors on indexing
                    'preserve_original' => false // "wi-fi-555" finds "wi-fi-555".
                                                 // Not needed because of plain analysis.
                ],
                'prefix_ngram_filter' => [
                    'type' => 'edgeNGram',
                    'max_gram' => CirrusSearch::MAX_TITLE_SEARCH,
                ],
                'asciifolding' => [
                    'type' => 'asciifolding',
                    'preserve_original' => false
                ],
                'asciifolding_preserve' => [
                    'type' => 'asciifolding',
                    'preserve_original' => true
                ],
                // The 'keyword' type in ES seems like a hack
                // and doesn't allow normalization (like lowercase)
                // prior to 5.2. Instead we consistently use 'text'
                // and truncate where necessary.
                'truncate_keyword' => [
                    'type' => 'truncate',
                    'length' => self::KEYWORD_IGNORE_ABOVE,
                ],
                'remove_empty' => [
                    'type' => 'length',
                    'min' => 1,
                ],
            ],
            'tokenizer' => [
                'prefix' => [
                    'type' => 'edgeNGram',
                    'max_gram' => CirrusSearch::MAX_TITLE_SEARCH,
                ],
                'no_splitting' => [ // Just grab the whole term.
                    'type' => 'keyword',
                ],
                'trigram' => [
                    'type' => 'nGram',
                    'min_gram' => 3,
                    'max_gram' => 3,
                ],
            ],
            'char_filter' => [
                // Flattens things that are space like to spaces in the near_match style analyzers
                'near_space_flattener' => [
                    'type' => 'mapping',
                    'mappings' => [
                        "'=>\u0020",       // Useful for finding names
                        '\u2019=>\u0020',  // Unicode right single quote
                        '\u02BC=>\u0020',  // Unicode modifier letter apostrophe
                        '_=>\u0020',       // Mediawiki loves _ and people are used to it but
                                           // it usually means space
                        '-=>\u0020',       // Useful for finding hyphenated names unhyphenated
                    ],
                ],
                // Converts things that don't always count as word breaks into spaces which always
                // count as word breaks.
                'word_break_helper' => [
                    'type' => 'mapping',
                    'mappings' => [
                        '_=>\u0020',
                        // These are more useful for code:
                        '.=>\u0020',
                        '(=>\u0020',
                        ')=>\u0020',
                    ],
                ],
                'word_break_helper_source_text' => [
                    'type' => 'mapping',
                    'mappings' => [
                        '_=>\u0020',
                        // These are more useful for code:
                        '.=>\u0020',
                        '(=>\u0020',
                        ')=>\u0020',
                        ':=>\u0020', // T145023
                    ],