Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
92.75% |
64 / 69 |
|
72.73% |
8 / 11 |
CRAP | |
0.00% |
0 / 1 |
SearchStrategy | |
92.75% |
64 / 69 |
|
72.73% |
8 / 11 |
30.34 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getQueries | |
91.89% |
34 / 37 |
|
0.00% |
0 / 1 |
16.14 | |||
validateParams | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
getTemplateTerm | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getTopicTerm | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
4 | |||
getPageIdTerm | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getExcludedPageIdTerm | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
getOresBasedTopicTerm | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
getMorelikeBasedTopicTerm | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
escapeSearchTitleList | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
shuffleQueryOrder | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace GrowthExperiments\NewcomerTasks\TaskSuggester\SearchStrategy; |
4 | |
5 | use GrowthExperiments\NewcomerTasks\TaskType\LinkRecommendationTaskType; |
6 | use GrowthExperiments\NewcomerTasks\TaskType\TaskType; |
7 | use GrowthExperiments\NewcomerTasks\TaskType\TaskTypeHandlerRegistry; |
8 | use GrowthExperiments\NewcomerTasks\Topic\CampaignTopic; |
9 | use GrowthExperiments\NewcomerTasks\Topic\MorelikeBasedTopic; |
10 | use GrowthExperiments\NewcomerTasks\Topic\OresBasedTopic; |
11 | use GrowthExperiments\NewcomerTasks\Topic\Topic; |
12 | use MediaWiki\Linker\LinkTarget; |
13 | use Wikimedia\Assert\Assert; |
14 | |
/**
 * SearchStrategy turns requirements from the user (such as task types and topics)
 * into a series of search query strings.
 */
class SearchStrategy {

	public const TOPIC_MATCH_MODE_OR = 'OR';
	public const TOPIC_MATCH_MODE_AND = 'AND';
	public const TOPIC_MATCH_MODES = [
		self::TOPIC_MATCH_MODE_OR,
		self::TOPIC_MATCH_MODE_AND
	];

	/** @var TaskTypeHandlerRegistry */
	private $taskTypeHandlerRegistry;

	/**
	 * @param TaskTypeHandlerRegistry $taskTypeHandlerRegistry Used to look up the handler
	 *   which provides the search term for each task type.
	 */
	public function __construct(
		TaskTypeHandlerRegistry $taskTypeHandlerRegistry
	) {
		$this->taskTypeHandlerRegistry = $taskTypeHandlerRegistry;
	}

	/**
	 * Get the search queries for searching for a given user requirement
	 * (set of task types and topics).
	 *
	 * In 'AND' mode one query is generated per task type, combining all topic terms.
	 * In any other mode one query is generated per task type + topic pair.
	 * The returned queries are in random order.
	 *
	 * @param TaskType[] $taskTypes Task types to limit search results to
	 * @param Topic[] $topics Topics to limit search results to
	 * @param array|null $pageIds List of PageIds search results should be restricted to.
	 * @param array|null $excludePageIds List of PageIds to exclude from search.
	 * @param string|null $topicsFilterMode Join mode for the topics search. One of ('AND', 'OR').
	 * @return SearchQuery[] Array of queries, indexed by query ID.
	 */
	public function getQueries(
		array $taskTypes,
		array $topics,
		?array $pageIds = null,
		?array $excludePageIds = null,
		?string $topicsFilterMode = null
	) {
		$this->validateParams( $taskTypes, $topics );
		$queries = [];
		// FIXME Ideally we should do a single search for all topics, but currently this
		// runs into query length limits (T242560)
		// Empty topic array means doing a single search with no topic filter
		$topics = $topics ?: [ null ];

		// The page ID restrictions are the same for every task type, so build them once.
		$pageIdTerm = $pageIds ? $this->getPageIdTerm( $pageIds ) : null;
		$excludedPageIdTerm = $excludePageIds ? $this->getExcludedPageIdTerm( $excludePageIds ) : null;

		foreach ( $taskTypes as $taskType ) {
			$typeTerm = $this->taskTypeHandlerRegistry->getByTaskType( $taskType )
				->getSearchTerm( $taskType );
			// All queries generated for the current task type, indexed by query ID.
			$taskTypeQueries = [];

			if ( $topicsFilterMode === self::TOPIC_MATCH_MODE_AND ) {
				$allTopicsAreOres = true;
				$topicTerms = [];
				foreach ( $topics as $topic ) {
					$topicTerms[] = $this->getTopicTerm( $topic );
					$allTopicsAreOres = $allTopicsAreOres && $topic instanceof OresBasedTopic;
				}
				$topicTerm = $this->joinSearchTerms( $topicTerms );
				$queryString = $this->joinSearchTerms(
					[ $typeTerm, $topicTerm, $pageIdTerm, $excludedPageIdTerm ]
				);

				$queryId = $taskType->getId() . ':multiple-topics';
				// The query is associated with the first topic only. reset() is used instead
				// of $topics[0] so that arrays which are not 0-indexed work as well.
				$firstTopic = reset( $topics );
				$query = new SearchQuery( $queryId, $queryString, $taskType, $firstTopic );
				// don't randomize if we use topic matching with the morelike backend, which itself
				// is a kind of sorting. Topic matching with the ORES backend already uses
				// thresholds per topic so applying a random sort should be safe.
				// NOTE(review): unlike the per-topic branch below, a search without any topic
				// is not randomized here - confirm whether that difference is intended.
				if ( $allTopicsAreOres ) {
					$query->setSort( 'random' );
				}
				$taskTypeQueries[$queryId] = $query;
			} else {
				foreach ( $topics as $topic ) {
					$topicTerm = $this->getTopicTerm( $topic );
					$queryString = $this->joinSearchTerms(
						[ $typeTerm, $topicTerm, $pageIdTerm, $excludedPageIdTerm ]
					);

					$queryId = $taskType->getId() . ':' . ( $topic ? $topic->getId() : '-' );
					$query = new SearchQuery( $queryId, $queryString, $taskType, $topic );
					// don't randomize if we use topic matching with the morelike backend, which itself
					// is a kind of sorting. Topic matching with the ORES backend already uses
					// thresholds per topic so applying a random sort should be safe.
					if ( !$topic || $topic instanceof OresBasedTopic ) {
						$query->setSort( 'random' );
					}
					$taskTypeQueries[$queryId] = $query;
				}
			}

			if (
				$taskType instanceof LinkRecommendationTaskType
				&& $taskType->getUnderlinkedWeight() > 0
				&& !$pageIdTerm
			) {
				// Apply the rescoring to every query of this task type, not just the one
				// which happened to be created last.
				foreach ( $taskTypeQueries as $taskTypeQuery ) {
					$this->applyUnderlinkedRescoring( $taskTypeQuery );
				}
			}

			foreach ( $taskTypeQueries as $queryId => $taskTypeQuery ) {
				$queries[$queryId] = $taskTypeQuery;
			}
		}
		return $this->shuffleQueryOrder( $queries );
	}

	/**
	 * Join search terms with spaces, skipping the empty (null / empty string) ones.
	 * @param (string|null)[] $terms
	 * @return string
	 */
	private function joinSearchTerms( array $terms ): string {
		return implode( ' ', array_filter( $terms ) );
	}

	/**
	 * Make a link recommendation query sort its results by underlinkedness.
	 * @param SearchQuery $query Query to modify in place.
	 */
	private function applyUnderlinkedRescoring( SearchQuery $query ): void {
		// Sort link recommendation tasks by underlinkedness.
		// Cirrus will only rescore when the sort mode is 'relevance' so we can't use
		// random sorting. It probably doesn't matter much: we are typically aiming for
		// 32K tasks per wiki, and the top <rescore window size> * <shard count> results
		// will be rescored; in practice, that's $wmgCirrusSearchShardCount * 8K results,
		// so a fairly large part of the total result set will be included anyway.
		$query->setSort( 'relevance' );
		$query->setRescoreProfile( SearchQuery::RESCORE_UNDERLINKED );
	}

	/**
	 * Assert that the task types and topics are of the classes this strategy can handle.
	 * @param TaskType[] $taskTypes
	 * @param Topic[] $topics
	 */
	protected function validateParams( array $taskTypes, array $topics ) {
		Assert::parameterElementType( TaskType::class, $taskTypes, '$taskTypes' );
		Assert::parameterElementType( [ OresBasedTopic::class, MorelikeBasedTopic::class,
			CampaignTopic::class ], $topics, '$topics' );
	}

	/**
	 * Build a 'hastemplate:' search term from a list of templates.
	 * @param LinkTarget[] $templates
	 * @return string
	 */
	protected function getTemplateTerm( array $templates ) {
		return 'hastemplate:' . $this->escapeSearchTitleList( $templates );
	}

	/**
	 * Build the search term for a single topic, based on the topic's class.
	 * @param Topic|null $topic
	 * @return string|null Null when there is no topic or its class is not handled here.
	 */
	protected function getTopicTerm( ?Topic $topic ): ?string {
		$topicTerm = null;
		if ( $topic instanceof OresBasedTopic ) {
			$topicTerm = $this->getOresBasedTopicTerm( [ $topic ] );
		} elseif ( $topic instanceof MorelikeBasedTopic ) {
			$topicTerm = $this->getMorelikeBasedTopicTerm( [ $topic ] );
		} elseif ( $topic instanceof CampaignTopic ) {
			$topicTerm = $topic->getSearchExpression();
		}
		return $topicTerm;
	}

	/**
	 * Build a 'pageid:' search term (pipe-separated) from a list of page IDs.
	 * @param array $pageIds
	 * @return string
	 */
	private function getPageIdTerm( array $pageIds ): string {
		return 'pageid:' . implode( '|', $pageIds );
	}

	/**
	 * Build a negated '-pageid:' search term (pipe-separated) from a list of page IDs.
	 * @param array $pageIds
	 * @return string
	 */
	private function getExcludedPageIdTerm( array $pageIds ): string {
		return '-pageid:' . implode( '|', $pageIds );
	}

	/**
	 * Build an 'articletopic:' search term from the ORES topics of all the given topics.
	 * @param OresBasedTopic[] $topics
	 * @return string
	 */
	protected function getOresBasedTopicTerm( array $topics ) {
		return 'articletopic:' . implode( '|', array_reduce( $topics,
			static function ( array $carry, OresBasedTopic $topic ) {
				return array_merge( $carry, $topic->getOresTopics() );
			}, [] ) );
	}

	/**
	 * Build a 'morelikethis:' search term from the reference pages of all the given topics.
	 * @param MorelikeBasedTopic[] $topics
	 * @return string
	 * @see https://www.mediawiki.org/wiki/Help:CirrusSearch#Morelike
	 * @see https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-mlt-query.html
	 */
	protected function getMorelikeBasedTopicTerm( array $topics ) {
		return 'morelikethis:' . $this->escapeSearchTitleList(
			array_reduce( $topics, static function ( array $carry, MorelikeBasedTopic $topic ) {
				return array_merge( $carry, $topic->getReferencePages() );
			}, [] ) );
	}

	/**
	 * Turns an array of pages into a CirrusSearch keyword value (pipe-separated, escaped).
	 * Namespaces are omitted entirely.
	 * @param LinkTarget[] $titles
	 * @return string The double-quoted list; '"' and '?' in the titles are backslash-escaped.
	 */
	protected function escapeSearchTitleList( array $titles ) {
		return '"' . implode( '|', array_map( static function ( LinkTarget $title ) {
			return str_replace( [ '"', '?' ], [ '\"', '\?' ], $title->getDBkey() );
		}, $titles ) ) . '"';
	}

	/**
	 * Shuffle the list of queries, preserving keys (T248106)
	 *
	 * PHP's shuffle() is insufficient as we need to preserve the keys.
	 *
	 * @param array $queries
	 * @return array
	 */
	protected function shuffleQueryOrder( array $queries ): array {
		$keys = array_keys( $queries );
		shuffle( $keys );
		$shuffled = [];
		foreach ( $keys as $key ) {
			$shuffled[$key] = $queries[$key];
		}
		return $shuffled;
	}

}
227 | } |