Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
97.96% covered (success)
97.96%
48 / 49
66.67% covered (warning)
66.67%
2 / 3
CRAP
0.00% covered (danger)
0.00%
0 / 1
ArticlePredictionKeyword
97.96% covered (success)
97.96%
48 / 49
66.67% covered (warning)
66.67%
2 / 3
12
0.00% covered (danger)
0.00%
0 / 1
 parseValue
96.88% covered (success)
96.88%
31 / 32
0.00% covered (danger)
0.00%
0 / 1
6
 getKeywords
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 doApply
100.00% covered (success)
100.00%
16 / 16
100.00% covered (success)
100.00%
1 / 1
5
1<?php
2
3namespace CirrusSearch\Query;
4
5use CirrusSearch\Search\SearchContext;
6use CirrusSearch\Search\WeightedTagsHooks;
7use CirrusSearch\WarningCollector;
8use Elastica\Query\DisMax;
9use Elastica\Query\Terms;
10use MediaWiki\Message\Message;
11use Wikimedia\Message\ListType;
12
/**
 * Keyword feature that selects pages by ML-model prediction tags
 * (e.g. articletopic:term, articlecountry:term). The scores come from
 * (Wikimedia-specific) ML models.
 *
 * Several terms may be combined with "|"; each term is translated into one or
 * more weighted-tag values and the alternatives are combined in a DisMax query.
 *
 * @see WeightedTagsHooks
 * @see https://www.mediawiki.org/wiki/Help:CirrusSearch#Articletopic
 */
class ArticlePredictionKeyword extends SimpleKeywordFeature {
    public const ARTICLE_TOPIC_TAG_PREFIX = 'classification.prediction.articletopic';
    public const DRAFT_TOPIC_TAG_PREFIX = 'classification.prediction.drafttopic';
    public const ARTICLE_COUNTRY_TAG_PREFIX = 'classification.prediction.articlecountry';

    /**
     * Tag prefix to query for each supported keyword. The keys of this map are
     * also the list of keywords handled by this feature.
     */
    private const PREFIX_PER_KEYWORD = [
        'articletopic' => self::ARTICLE_TOPIC_TAG_PREFIX,
        'drafttopic' => self::DRAFT_TOPIC_TAG_PREFIX,
        'articlecountry' => self::ARTICLE_COUNTRY_TAG_PREFIX,
    ];

    /**
     * For each keyword, the accepted (lowercase) search terms mapped to the
     * value or list of values that parseValue() turns into index terms.
     *
     * @var array<string, array<string, string|array<string>>>
     */
    private const TERMS_PER_KEYWORD = [
        'articletopic' => ArticleTopicFeature::TERMS_TO_LABELS,
        'drafttopic' => ArticleTopicFeature::TERMS_TO_LABELS,
        // The two country maps are combined with the + operator because
        // array_merge() is not allowed in a constant expression. Phan reports
        // the right-hand operand as useless while
        // ArticleCountryFeature::AREA_CODES_TO_COUNTRY_CODES is empty, hence
        // the suppression.
        // @phan-suppress-next-line PhanUselessBinaryAddRight
        'articlecountry' => ArticleCountryFeature::COUNTRY_CODES_TO_LABELS +
            ArticleCountryFeature::AREA_CODES_TO_COUNTRY_CODES,
    ];

    /** Message key of the warning emitted when unknown terms are supplied. */
    private const WARN_MESSAGE_PER_KEYWORD = "cirrussearch-articleprediction-invalid-keyword";

    /**
     * Splits the keyword value on "|", resolves every known term to its index
     * terms and reports the unknown ones through the warning collector.
     *
     * @inheritDoc
     * @phan-return array{keywords:array<array{terms:string[], boost:float|null}>,tag_prefix:string}
     */
    public function parseValue(
        $key, $value, $quotedValue, $valueDelimiter, $suffix, WarningCollector $warningCollector
    ) {
        $lookup = self::TERMS_PER_KEYWORD[$key];
        $addSpaceVariants = ( $key === 'articletopic' );

        // Parse every alternative up front so that anything parseBoost() reports
        // is collected before the unknown-term warning below.
        $candidates = [];
        foreach ( explode( '|', mb_strtolower( $value ) ) as $position => $chunk ) {
            $candidates[$position] = $this->parseBoost( $chunk, $warningCollector );
        }

        // Partition into known and unknown terms, keeping the original positions
        // as array keys in both result sets.
        $accepted = [];
        $rejected = [];
        foreach ( $candidates as $position => $candidate ) {
            $name = $candidate['term'];
            if ( !array_key_exists( $name, $lookup ) ) {
                $rejected[$position] = $name;
                continue;
            }

            $resolved = $lookup[$name];
            if ( is_string( $resolved ) ) {
                $resolved = [ $resolved ];
            }

            // articletopic predictions switched from spaces to underscores at
            // some point and the live indexes still hold both spellings. Until
            // a reindex makes them consistent, also search for the spaced form.
            if ( $addSpaceVariants ) {
                $spaced = [];
                foreach ( $resolved as $indexTerm ) {
                    if ( str_contains( $indexTerm, '_' ) ) {
                        $spaced[] = str_replace( '_', ' ', $indexTerm );
                    }
                }
                $resolved = array_merge( $resolved, $spaced );
            }

            $accepted[$position] = [
                'terms' => $resolved,
                'boost' => $candidate['boost']
            ];
        }

        if ( $rejected !== [] ) {
            $warningCollector->addWarning(
                self::WARN_MESSAGE_PER_KEYWORD,
                Message::listParam( $rejected, ListType::COMMA ),
                count( $rejected ),
                $key
            );
        }

        return [ 'keywords' => $accepted, 'tag_prefix' => self::PREFIX_PER_KEYWORD[$key] ];
    }

    /** @inheritDoc */
    protected function getKeywords() {
        // Every keyword that has a tag prefix is handled by this feature.
        return array_keys( self::PREFIX_PER_KEYWORD );
    }

    /** @inheritDoc */
    protected function doApply( SearchContext $context, $key, $value, $quotedValue, $negated ) {
        // The search context doubles as the warning collector.
        $parsed = $this->parseValue( $key, $value, $quotedValue, '', '', $context );

        // No recognised term at all: flag the search as unable to return results.
        if ( $parsed['keywords'] === [] ) {
            $context->setResultsPossible( false );
            return [ null, true ];
        }

        $tagPrefix = $parsed['tag_prefix'];
        $disMax = new DisMax();
        foreach ( $parsed['keywords'] as $entry ) {
            // Weighted tags are matched as "<prefix>/<term>".
            $tags = [];
            foreach ( $entry['terms'] as $idx => $indexTerm ) {
                $tags[$idx] = $tagPrefix . '/' . $indexTerm;
            }

            $clause = new Terms( WeightedTagsHooks::FIELD_NAME, $tags );
            $boost = $entry['boost'];
            if ( $boost !== null ) {
                $clause->setBoost( $boost );
            }
            $disMax->addQuery( $clause );
        }

        // When negated, return the query to the caller instead of registering
        // it on the context.
        if ( $negated ) {
            return [ $disMax, false ];
        }

        $context->addNonTextQuery( $disMax );
        return [ null, false ];
    }

}