Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
97.96% |
48 / 49 |
|
66.67% |
2 / 3 |
CRAP | |
0.00% |
0 / 1 |
| ArticlePredictionKeyword | |
97.96% |
48 / 49 |
|
66.67% |
2 / 3 |
12 | |
0.00% |
0 / 1 |
| parseValue | |
96.88% |
31 / 32 |
|
0.00% |
0 / 1 |
6 | |||
| getKeywords | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| doApply | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
5 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace CirrusSearch\Query; |
| 4 | |
| 5 | use CirrusSearch\Search\SearchContext; |
| 6 | use CirrusSearch\Search\WeightedTagsHooks; |
| 7 | use CirrusSearch\WarningCollector; |
| 8 | use Elastica\Query\DisMax; |
| 9 | use Elastica\Query\Terms; |
| 10 | use MediaWiki\Message\Message; |
| 11 | use Wikimedia\Message\ListType; |
| 12 | |
/**
 * Keyword feature that selects pages by how well they match a given keyword
 * (e.g. articletopic:term, articlecountry:term). The match quality is derived
 * from scores provided by (Wikimedia-specific) ML models.
 * @see WeightedTagsHooks
 * @see https://www.mediawiki.org/wiki/Help:CirrusSearch#Articletopic
 */
class ArticlePredictionKeyword extends SimpleKeywordFeature {
	public const ARTICLE_TOPIC_TAG_PREFIX = 'classification.prediction.articletopic';
	public const DRAFT_TOPIC_TAG_PREFIX = 'classification.prediction.drafttopic';
	public const ARTICLE_COUNTRY_TAG_PREFIX = 'classification.prediction.articlecountry';

	/**
	 * Tag prefix queried for each supported keyword. The keys of this map are
	 * also the keywords reported by getKeywords().
	 */
	private const PREFIX_PER_KEYWORD = [
		'articletopic' => self::ARTICLE_TOPIC_TAG_PREFIX,
		'drafttopic' => self::DRAFT_TOPIC_TAG_PREFIX,
		'articlecountry' => self::ARTICLE_COUNTRY_TAG_PREFIX,
	];

	/**
	 * For each supported keyword, the accepted search terms mapped to the
	 * tag name (a string) or tag names (a list of strings) they expand to.
	 *
	 * @var array<string, array<string, string|array<string>>>
	 */
	private const TERMS_PER_KEYWORD = [
		'articletopic' => ArticleTopicFeature::TERMS_TO_LABELS,
		'drafttopic' => ArticleTopicFeature::TERMS_TO_LABELS,
		// array_merge() cannot appear in a constant definition, so the two maps are
		// united at compile time with the + operator. The suppression silences the
		// warning raised when ArticleCountryFeature::AREA_CODES_TO_COUNTRY_CODES is empty.
		// @phan-suppress-next-line PhanUselessBinaryAddRight
		'articlecountry' => ArticleCountryFeature::COUNTRY_CODES_TO_LABELS +
			ArticleCountryFeature::AREA_CODES_TO_COUNTRY_CODES,
	];

	/** Identifier passed to WarningCollector::addWarning() for unrecognised terms. */
	private const WARN_MESSAGE_PER_KEYWORD = "cirrussearch-articleprediction-invalid-keyword";

	/**
	 * Lowercases the value, splits it on '|', runs every piece through
	 * parseBoost() and keeps the pieces whose term is known for $key. Known terms
	 * are expanded to their tag names; unknown terms are reported through a
	 * single warning on $warningCollector.
	 *
	 * @inheritDoc
	 * @phan-return array{keywords:array<array{terms:string[], boost:float|null}>,tag_prefix:string}
	 */
	public function parseValue(
		$key, $value, $quotedValue, $valueDelimiter, $suffix, WarningCollector $warningCollector
	) {
		$termsToLabels = self::TERMS_PER_KEYWORD[$key];

		$parseChunk = function ( string $chunk ) use ( $warningCollector ): array {
			return $this->parseBoost( $chunk, $warningCollector );
		};
		$parsedChunks = array_map( $parseChunk, explode( '|', mb_strtolower( $value ) ) );

		// Terms the user asked for that are not part of the allowed vocabulary.
		$requestedTerms = array_map(
			static function ( array $chunk ): string {
				return $chunk['term'];
			},
			$parsedChunks
		);
		$unknownTerms = array_diff( $requestedTerms, array_keys( $termsToLabels ) );

		$knownChunks = array_filter(
			$parsedChunks,
			static function ( array $chunk ) use ( $termsToLabels ): bool {
				return array_key_exists( $chunk['term'], $termsToLabels );
			}
		);

		$withLegacySpelling = $key === 'articletopic';
		$expanded = [];
		// Keys of the filtered array are carried over as-is.
		foreach ( $knownChunks as $position => $chunk ) {
			$labels = $termsToLabels[$chunk['term']];
			if ( is_string( $labels ) ) {
				$labels = [ $labels ];
			}

			if ( $withLegacySpelling ) {
				// articletopic predictions switched from spaces to underscores at
				// some point, yet the index still holds both spellings. This
				// fallback can go once the live indexes have been reindexed into
				// a consistent state.
				$spacedLabels = [];
				foreach ( $labels as $label ) {
					if ( !str_contains( $label, '_' ) ) {
						continue;
					}
					$spacedLabels[] = strtr( $label, [ '_' => ' ' ] );
				}
				$labels = array_merge( $labels, $spacedLabels );
			}

			$expanded[$position] = [
				'terms' => $labels,
				'boost' => $chunk['boost']
			];
		}

		$unknownCount = count( $unknownTerms );
		if ( $unknownCount > 0 ) {
			$warningCollector->addWarning(
				self::WARN_MESSAGE_PER_KEYWORD,
				Message::listParam( $unknownTerms, ListType::COMMA ),
				$unknownCount,
				$key
			);
		}

		return [
			'keywords' => $expanded,
			'tag_prefix' => self::PREFIX_PER_KEYWORD[$key],
		];
	}

	/**
	 * The supported keywords are the keys of PREFIX_PER_KEYWORD.
	 *
	 * @inheritDoc
	 */
	protected function getKeywords() {
		$supported = array_keys( self::PREFIX_PER_KEYWORD );
		return $supported;
	}

	/**
	 * Builds a DisMax query holding one Terms clause per recognised keyword.
	 * When nothing was recognised the context is flagged as unable to produce
	 * results. A non-negated query is attached to the context as a non-text
	 * query, whereas a negated one is handed back to the caller.
	 *
	 * @inheritDoc
	 */
	protected function doApply( SearchContext $context, $key, $value, $quotedValue, $negated ) {
		[ 'keywords' => $keywords, 'tag_prefix' => $tagPrefix ] =
			$this->parseValue( $key, $value, $quotedValue, '', '', $context );

		if ( $keywords === [] ) {
			$context->setResultsPossible( false );
			return [ null, true ];
		}

		$addPrefix = static function ( string $label ) use ( $tagPrefix ): string {
			return $tagPrefix . '/' . $label;
		};

		$disMax = new DisMax();
		foreach ( $keywords as $keyword ) {
			$clause = new Terms(
				WeightedTagsHooks::FIELD_NAME,
				array_map( $addPrefix, $keyword['terms'] )
			);
			$boost = $keyword['boost'];
			if ( $boost !== null ) {
				$clause->setBoost( $boost );
			}
			$disMax->addQuery( $clause );
		}

		if ( $negated ) {
			return [ $disMax, false ];
		}

		$context->addNonTextQuery( $disMax );
		return [ null, false ];
	}

}