Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
32 / 32 |
|
100.00% |
4 / 4 |
CRAP | |
100.00% |
1 / 1 |
ArticleTopicFeature | |
100.00% |
32 / 32 |
|
100.00% |
4 / 4 |
9 | |
100.00% |
1 / 1 |
getTopicScores | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
2 | |||
parseValue | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
2 | |||
getKeywords | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
doApply | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
4 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Query; |
4 | |
5 | use CirrusSearch\Search\SearchContext; |
6 | use CirrusSearch\WarningCollector; |
7 | use CirrusSearch\Wikimedia\WeightedTagsHooks; |
8 | use Elastica\Query\DisMax; |
9 | use Elastica\Query\Term; |
10 | use MediaWiki\Message\Message; |
11 | |
/**
 * Finds pages based on how well they match a given topic, based on scores provided by the
 * (Wikimedia-specific) articletopic ORES model.
 *
 * Handles the "articletopic" and "drafttopic" keywords; each keyword maps to its own
 * tag prefix (see PREFIX_PER_KEYWORD).
 * @package CirrusSearch\Query
 * @see WeightedTagsHooks
 * @see https://www.mediawiki.org/wiki/Help:CirrusSearch#Articletopic
 */
class ArticleTopicFeature extends SimpleKeywordFeature {
	public const ARTICLE_TOPIC_TAG_PREFIX = 'classification.ores.articletopic';
	public const DRAFT_TOPIC_TAG_PREFIX = 'classification.ores.drafttopic';

	/** Search keyword => tag prefix used when building the term query. */
	private const PREFIX_PER_KEYWORD = [
		'articletopic' => self::ARTICLE_TOPIC_TAG_PREFIX,
		'drafttopic' => self::DRAFT_TOPIC_TAG_PREFIX
	];

	/** Search term accepted after the keyword => ORES topic label. */
	public const TERMS_TO_LABELS = [
		'biography' => 'Culture.Biography.Biography*',
		'women' => 'Culture.Biography.Women',
		'food-and-drink' => 'Culture.Food and drink',
		'internet-culture' => 'Culture.Internet culture',
		'linguistics' => 'Culture.Linguistics',
		'literature' => 'Culture.Literature',
		'books' => 'Culture.Media.Books',
		'entertainment' => 'Culture.Media.Entertainment',
		'films' => 'Culture.Media.Films',
		'media' => 'Culture.Media.Media*',
		'music' => 'Culture.Media.Music',
		'radio' => 'Culture.Media.Radio',
		'software' => 'Culture.Media.Software',
		'television' => 'Culture.Media.Television',
		'video-games' => 'Culture.Media.Video games',
		'performing-arts' => 'Culture.Performing arts',
		'philosophy-and-religion' => 'Culture.Philosophy and religion',
		'sports' => 'Culture.Sports',
		'architecture' => 'Culture.Visual arts.Architecture',
		'comics-and-anime' => 'Culture.Visual arts.Comics and Anime',
		'fashion' => 'Culture.Visual arts.Fashion',
		'visual-arts' => 'Culture.Visual arts.Visual arts*',
		'geographical' => 'Geography.Geographical',
		'africa' => 'Geography.Regions.Africa.Africa*',
		'central-africa' => 'Geography.Regions.Africa.Central Africa',
		'eastern-africa' => 'Geography.Regions.Africa.Eastern Africa',
		'northern-africa' => 'Geography.Regions.Africa.Northern Africa',
		'southern-africa' => 'Geography.Regions.Africa.Southern Africa',
		'western-africa' => 'Geography.Regions.Africa.Western Africa',
		'central-america' => 'Geography.Regions.Americas.Central America',
		'north-america' => 'Geography.Regions.Americas.North America',
		'south-america' => 'Geography.Regions.Americas.South America',
		'asia' => 'Geography.Regions.Asia.Asia*',
		'central-asia' => 'Geography.Regions.Asia.Central Asia',
		'east-asia' => 'Geography.Regions.Asia.East Asia',
		'north-asia' => 'Geography.Regions.Asia.North Asia',
		'south-asia' => 'Geography.Regions.Asia.South Asia',
		'southeast-asia' => 'Geography.Regions.Asia.Southeast Asia',
		'west-asia' => 'Geography.Regions.Asia.West Asia',
		'eastern-europe' => 'Geography.Regions.Europe.Eastern Europe',
		'europe' => 'Geography.Regions.Europe.Europe*',
		'northern-europe' => 'Geography.Regions.Europe.Northern Europe',
		'southern-europe' => 'Geography.Regions.Europe.Southern Europe',
		'western-europe' => 'Geography.Regions.Europe.Western Europe',
		'oceania' => 'Geography.Regions.Oceania',
		'business-and-economics' => 'History and Society.Business and economics',
		'education' => 'History and Society.Education',
		'history' => 'History and Society.History',
		'military-and-warfare' => 'History and Society.Military and warfare',
		'politics-and-government' => 'History and Society.Politics and government',
		'society' => 'History and Society.Society',
		'transportation' => 'History and Society.Transportation',
		'biology' => 'STEM.Biology',
		'chemistry' => 'STEM.Chemistry',
		'computing' => 'STEM.Computing',
		'earth-and-environment' => 'STEM.Earth and environment',
		'engineering' => 'STEM.Engineering',
		'libraries-and-information' => 'STEM.Libraries & Information',
		'mathematics' => 'STEM.Mathematics',
		'medicine-and-health' => 'STEM.Medicine & Health',
		'physics' => 'STEM.Physics',
		'stem' => 'STEM.STEM*',
		'space' => 'STEM.Space',
		'technology' => 'STEM.Technology',
	];

	/**
	 * Helper method for turning raw ORES score data (as stored in the Cirrus document) into
	 * search terms, for analytics/debugging.
	 *
	 * Each entry is expected to look like "<ORES label>|<scaled score>". Entries without a
	 * "|" separator, or whose label is not listed in TERMS_TO_LABELS, are skipped instead of
	 * raising an undefined offset/key warning and polluting the result with an empty key.
	 *
	 * @param array $rawTopicData The unprefixed content of the document's weighted_tags field
	 * @return array corresponding search term => ORES score (scaled score divided by 1000)
	 */
	public static function getTopicScores( array $rawTopicData ): array {
		$labelsToTerms = array_flip( self::TERMS_TO_LABELS );
		$topicScores = [];
		foreach ( $rawTopicData as $rawTopic ) {
			$parts = explode( '|', $rawTopic );
			if ( count( $parts ) < 2 ) {
				// No score attached to this entry, nothing to report.
				continue;
			}
			[ $oresLabel, $scaledScore ] = $parts;
			if ( !isset( $labelsToTerms[$oresLabel] ) ) {
				// Label has no corresponding search term.
				continue;
			}
			$topicScores[$labelsToTerms[$oresLabel]] = (int)$scaledScore / 1000;
		}
		return $topicScores;
	}

	/**
	 * Splits the keyword value on "|" and translates every known search term into its
	 * ORES label. Unknown terms are reported through the warning collector (once, as a
	 * comma list) and left out of the result.
	 *
	 * The 'topics' array keeps the position each term had in the "|"-separated input as
	 * its key, so it is not necessarily a zero-based list.
	 *
	 * @inheritDoc
	 * @phan-return array{topics:string[],tag_prefix:string}
	 */
	public function parseValue(
		$key, $value, $quotedValue, $valueDelimiter, $suffix, WarningCollector $warningCollector
	) {
		$validTopics = [];
		$invalidTopics = [];
		foreach ( explode( '|', $value ) as $position => $topic ) {
			if ( isset( self::TERMS_TO_LABELS[$topic] ) ) {
				$validTopics[$position] = self::TERMS_TO_LABELS[$topic];
			} else {
				$invalidTopics[$position] = $topic;
			}
		}

		if ( $invalidTopics ) {
			$warningCollector->addWarning( 'cirrussearch-articletopic-invalid-topic',
				Message::listParam( $invalidTopics, 'comma' ), count( $invalidTopics ) );
		}
		return [ 'topics' => $validTopics, 'tag_prefix' => self::PREFIX_PER_KEYWORD[$key] ];
	}

	/**
	 * The keywords handled by this feature are exactly the keys of PREFIX_PER_KEYWORD.
	 * @inheritDoc
	 */
	protected function getKeywords() {
		return array_keys( self::PREFIX_PER_KEYWORD );
	}

	/**
	 * Builds a DisMax query holding one Term query per requested topic against the
	 * weighted tags field, each term being "<tag prefix>/<ORES label>".
	 *
	 * - No valid topic: marks the context as unable to produce results and returns
	 *   [ null, true ].
	 * - Not negated: the query is attached to the context as a non-text query and
	 *   [ null, false ] is returned.
	 * - Negated: the query is handed back to the caller as [ $query, false ].
	 *
	 * @inheritDoc
	 */
	protected function doApply( SearchContext $context, $key, $value, $quotedValue, $negated ) {
		// The search context doubles as the warning collector for invalid topics.
		$parsed = $this->parseValue( $key, $value, $quotedValue, '', '', $context );
		$topics = $parsed['topics'];
		$tagPrefix = $parsed['tag_prefix'];
		if ( $topics === [] ) {
			$context->setResultsPossible( false );
			return [ null, true ];
		}

		$query = new DisMax();
		foreach ( $topics as $topic ) {
			$topicQuery = new Term();
			$topicQuery->setTerm( WeightedTagsHooks::FIELD_NAME, $tagPrefix . '/' . $topic );
			$query->addQuery( $topicQuery );
		}

		if ( $negated ) {
			return [ $query, false ];
		}

		$context->addNonTextQuery( $query );
		return [ null, false ];
	}

}