Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
92.75% covered (success)
92.75%
64 / 69
72.73% covered (warning)
72.73%
8 / 11
CRAP
0.00% covered (danger)
0.00%
0 / 1
SearchStrategy
92.75% covered (success)
92.75%
64 / 69
72.73% covered (warning)
72.73%
8 / 11
30.34
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getQueries
91.89% covered (success)
91.89%
34 / 37
0.00% covered (danger)
0.00%
0 / 1
16.14
 validateParams
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 getTemplateTerm
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getTopicTerm
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
4
 getPageIdTerm
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 getExcludedPageIdTerm
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getOresBasedTopicTerm
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 getMorelikeBasedTopicTerm
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 escapeSearchTitleList
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 shuffleQueryOrder
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
1<?php
2
3namespace GrowthExperiments\NewcomerTasks\TaskSuggester\SearchStrategy;
4
5use GrowthExperiments\NewcomerTasks\TaskType\LinkRecommendationTaskType;
6use GrowthExperiments\NewcomerTasks\TaskType\TaskType;
7use GrowthExperiments\NewcomerTasks\TaskType\TaskTypeHandlerRegistry;
8use GrowthExperiments\NewcomerTasks\Topic\CampaignTopic;
9use GrowthExperiments\NewcomerTasks\Topic\MorelikeBasedTopic;
10use GrowthExperiments\NewcomerTasks\Topic\OresBasedTopic;
11use GrowthExperiments\NewcomerTasks\Topic\Topic;
12use MediaWiki\Linker\LinkTarget;
13use Wikimedia\Assert\Assert;
14
/**
 * SearchStrategy turns requirements from the user (such as task types and topics)
 * into a series of search query strings.
 */
class SearchStrategy {

    public const TOPIC_MATCH_MODE_OR = 'OR';
    public const TOPIC_MATCH_MODE_AND = 'AND';
    public const TOPIC_MATCH_MODES = [
        self::TOPIC_MATCH_MODE_OR,
        self::TOPIC_MATCH_MODE_AND
    ];

    /** @var TaskTypeHandlerRegistry Used to look up the handler that provides a task type's search term */
    private $taskTypeHandlerRegistry;

    /**
     * @param TaskTypeHandlerRegistry $taskTypeHandlerRegistry
     */
    public function __construct(
        TaskTypeHandlerRegistry $taskTypeHandlerRegistry
    ) {
        $this->taskTypeHandlerRegistry = $taskTypeHandlerRegistry;
    }

    /**
     * Get the search queries for searching for a given user requirement
     * (set of task types and topics).
     *
     * With the 'AND' topic filter mode one query is built per task type, containing the
     * terms of all topics. In every other case (including null) one query is built per
     * task type / topic pair. The returned queries are in random order.
     *
     * @param TaskType[] $taskTypes Task types to limit search results to
     * @param Topic[] $topics Topics to limit search results to
     * @param array|null $pageIds List of PageIds search results should be restricted to.
     * @param array|null $excludePageIds List of PageIds to exclude from search.
     * @param string|null $topicsFilterMode Join mode for the topics search. One of ('AND', 'OR').
     * @return SearchQuery[] Array of queries, indexed by query ID.
     */
    public function getQueries(
        array $taskTypes,
        array $topics,
        ?array $pageIds = null,
        ?array $excludePageIds = null,
        ?string $topicsFilterMode = null
    ) {
        $this->validateParams( $taskTypes, $topics );
        $queries = [];
        // FIXME Ideally we should do a single search for all topics, but currently this
        //   runs into query length limits (T242560)
        // Empty topic array means doing a single search with no topic filter
        $topics = $topics ?: [ null ];
        // The page ID restrictions are the same for every task type, so build them once.
        $pageIdTerm = $pageIds ? $this->getPageIdTerm( $pageIds ) : null;
        $excludedPageIdTerm = $excludePageIds ? $this->getExcludedPageIdTerm( $excludePageIds ) : null;

        foreach ( $taskTypes as $taskType ) {
            $typeTerm = $this->taskTypeHandlerRegistry->getByTaskType( $taskType )
                ->getSearchTerm( $taskType );
            // All queries built for this task type, so that task type specific settings
            // (see the link recommendation handling below) can be applied to each of them.
            $taskTypeQueries = [];

            if ( $topicsFilterMode === self::TOPIC_MATCH_MODE_AND ) {
                $allTopicsAreOres = true;
                $topicTerms = [];
                foreach ( $topics as $topic ) {
                    $topicTerms[] = $this->getTopicTerm( $topic );
                    $allTopicsAreOres = $allTopicsAreOres && $topic instanceof OresBasedTopic;
                }
                $topicTerm = $this->joinTerms( $topicTerms );
                $queryString = $this->joinTerms( [ $typeTerm, $topicTerm,
                    $pageIdTerm, $excludedPageIdTerm ] );

                $queryId = $taskType->getId() . ':multiple-topics';
                // SearchQuery takes a single topic, so the first one stands in for all of them.
                $query = new SearchQuery( $queryId, $queryString, $taskType, $topics[0] );
                // don't randomize if we use topic matching with the morelike backend, which itself
                // is a kind of sorting. Topic matching with the ORES backend already uses
                // thresholds per topic so applying a random sort should be safe.
                // NOTE(review): when no topics were given, $topics is [ null ] and this flag
                // ends up false, so the query is not randomized, unlike in the per-topic
                // branch below. Confirm whether that difference is intended.
                if ( $allTopicsAreOres ) {
                    $query->setSort( 'random' );
                }
                $taskTypeQueries[$queryId] = $query;
            } else {
                foreach ( $topics as $topic ) {
                    $topicTerm = $this->getTopicTerm( $topic );
                    $queryString = $this->joinTerms( [ $typeTerm, $topicTerm,
                        $pageIdTerm, $excludedPageIdTerm ] );

                    $queryId = $taskType->getId() . ':' . ( $topic ? $topic->getId() : '-' );
                    $query = new SearchQuery( $queryId, $queryString, $taskType, $topic );
                    // don't randomize if we use topic matching with the morelike backend, which itself
                    // is a kind of sorting. Topic matching with the ORES backend already uses
                    // thresholds per topic so applying a random sort should be safe.
                    if ( !$topic || $topic instanceof OresBasedTopic ) {
                        $query->setSort( 'random' );
                    }
                    $taskTypeQueries[$queryId] = $query;
                }
            }

            if (
                $taskType instanceof LinkRecommendationTaskType
                && $taskType->getUnderlinkedWeight() > 0
                && !$pageIdTerm
            ) {
                // Sort link recommendation tasks by underlinkedness.
                // Cirrus will only rescore when the sort mode is 'relevance' so we can't use
                // random sorting. It probably doesn't matter much: we are typically aiming for
                // 32K tasks per wiki, and the top <rescore window size> * <shard count> results
                // will be rescored; in practice, that's $wmgCirrusSearchShardCount * 8K results,
                // so a fairly large part of the total result set will be included anyway.
                //
                // This must be applied to every query of the task type; when there is one
                // query per topic, touching only the most recently built query would leave
                // the queries of all other topics randomly sorted and without rescoring.
                foreach ( $taskTypeQueries as $taskTypeQuery ) {
                    $taskTypeQuery->setSort( 'relevance' );
                    $taskTypeQuery->setRescoreProfile( SearchQuery::RESCORE_UNDERLINKED );
                }
            }

            foreach ( $taskTypeQueries as $queryId => $taskTypeQuery ) {
                $queries[$queryId] = $taskTypeQuery;
            }
        }
        return $this->shuffleQueryOrder( $queries );
    }

    /**
     * Assert that the parameters passed to getQueries() contain elements of the expected types.
     * @param TaskType[] $taskTypes
     * @param Topic[] $topics
     */
    protected function validateParams( array $taskTypes, array $topics ) {
        Assert::parameterElementType( TaskType::class, $taskTypes, '$taskTypes' );
        Assert::parameterElementType( [ OresBasedTopic::class, MorelikeBasedTopic::class,
            CampaignTopic::class ], $topics, '$topics' );
    }

    /**
     * Build a 'hastemplate:' search term from a list of templates.
     * @param LinkTarget[] $templates
     * @return string
     */
    protected function getTemplateTerm( array $templates ) {
        return 'hastemplate:' . $this->escapeSearchTitleList( $templates );
    }

    /**
     * Build the search term for a single topic, based on the kind of topic.
     * @param Topic|null $topic
     * @return string|null The search term; null when there is no topic or the topic is
     *   not one of the handled classes.
     */
    protected function getTopicTerm( ?Topic $topic ): ?string {
        $topicTerm = null;
        if ( $topic instanceof OresBasedTopic ) {
            $topicTerm = $this->getOresBasedTopicTerm( [ $topic ] );
        } elseif ( $topic instanceof MorelikeBasedTopic ) {
            $topicTerm = $this->getMorelikeBasedTopicTerm( [ $topic ] );
        } elseif ( $topic instanceof CampaignTopic ) {
            $topicTerm = $topic->getSearchExpression();
        }
        return $topicTerm;
    }

    /**
     * Build a 'pageid:' search term from a list of page IDs (pipe-separated).
     * @param array $pageIds
     * @return string
     */
    private function getPageIdTerm( array $pageIds ): string {
        return 'pageid:' . implode( '|', $pageIds );
    }

    /**
     * Build a '-pageid:' search term from a list of page IDs (pipe-separated).
     * @param array $pageIds
     * @return string
     */
    private function getExcludedPageIdTerm( array $pageIds ): string {
        return '-pageid:' . implode( '|', $pageIds );
    }

    /**
     * Build an 'articletopic:' search term from the ORES topics of all the given topics
     * (pipe-separated).
     * @param OresBasedTopic[] $topics
     * @return string
     */
    protected function getOresBasedTopicTerm( array $topics ) {
        return 'articletopic:' . implode( '|', array_reduce( $topics,
            static function ( array $carry, OresBasedTopic $topic ) {
                return array_merge( $carry, $topic->getOresTopics() );
            }, [] ) );
    }

    /**
     * Build a 'morelikethis:' search term from the reference pages of all the given topics.
     * @param MorelikeBasedTopic[] $topics
     * @return string
     * @see https://www.mediawiki.org/wiki/Help:CirrusSearch#Morelike
     * @see https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-mlt-query.html
     */
    protected function getMorelikeBasedTopicTerm( array $topics ) {
        return 'morelikethis:' . $this->escapeSearchTitleList(
            array_reduce( $topics, static function ( array $carry, MorelikeBasedTopic $topic ) {
                return array_merge( $carry, $topic->getReferencePages() );
            }, [] ) );
    }

    /**
     * Turns an array of pages into a CirrusSearch keyword value (pipe-separated, escaped).
     * Namespaces are omitted entirely.
     * @param LinkTarget[] $titles
     * @return string The double-quoted list of DB keys, with '"' and '?' backslash-escaped.
     */
    protected function escapeSearchTitleList( array $titles ) {
        return '"' . implode( '|', array_map( static function ( LinkTarget $title ) {
            return str_replace( [ '"', '?' ], [ '\"', '\?' ], $title->getDBkey() );
        }, $titles ) ) . '"';
    }

    /**
     * Shuffle the list of queries, preserving keys (T248106)
     *
     * PHP's shuffle() is insufficient as we need to preserve the keys.
     *
     * @param array $queries
     * @return array The same key => value pairs in random order.
     */
    protected function shuffleQueryOrder( array $queries ): array {
        $keys = array_keys( $queries );
        shuffle( $keys );
        $shuffled = [];
        foreach ( $keys as $key ) {
            $shuffled[$key] = $queries[$key];
        }
        return $shuffled;
    }

    /**
     * Join search terms with a space, skipping the empty ones.
     * @param array $terms Search terms; null and other falsy values are dropped.
     * @return string
     */
    private function joinTerms( array $terms ): string {
        return implode( ' ', array_filter( $terms ) );
    }

}