Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
95.59% |
65 / 68 |
|
84.62% |
11 / 13 |
CRAP | |
0.00% |
0 / 1 |
DeepcatFeature | |
95.59% |
65 / 68 |
|
84.62% |
11 / 13 |
25 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
4 | |||
getCrossSearchStrategy | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getKeywords | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getFeatureName | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
doApply | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
expand | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
doExpand | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
3 | |||
decideUiWarning | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
2.06 | |||
getCategoryPrefix | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
logRequest | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
fetchCategories | |
91.30% |
21 / 23 |
|
0.00% |
0 / 1 |
3.01 | |||
getFilterQuery | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
doGetFilterQuery | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Query; |
4 | |
5 | use CirrusSearch\CrossSearchStrategy; |
6 | use CirrusSearch\Parser\AST\KeywordFeatureNode; |
7 | use CirrusSearch\Query\Builder\QueryBuildingContext; |
8 | use CirrusSearch\Search\SearchContext; |
9 | use CirrusSearch\SearchConfig; |
10 | use CirrusSearch\WarningCollector; |
11 | use Config; |
12 | use Elastica\Query\AbstractQuery; |
13 | use MediaWiki\Logger\LoggerFactory; |
14 | use MediaWiki\MediaWikiServices; |
15 | use MediaWiki\Sparql\SparqlClient; |
16 | use MediaWiki\Sparql\SparqlException; |
17 | use MediaWiki\Title\Title; |
18 | |
/**
 * Filters by category or its subcategories. E.g. if category Vehicles includes Cars
 * and Boats, then search for Vehicles would match pages in Vehicles, Cars and Boats.
 *
 * The subcategory list is obtained from a SPARQL service; when no endpoint is
 * configured the keyword produces a warning and no filter.
 *
 * Syntax:
 *  deepcat:Vehicles
 */
class DeepcatFeature extends SimpleKeywordFeature implements FilterQueryFeature {
	/**
	 * Max lookup depth
	 * @var int
	 */
	private $depth;
	/**
	 * Max number of categories
	 * @var int
	 */
	private $limit;
	/**
	 * Category URL prefix for this wiki
	 * @var string|null (lazy loaded)
	 */
	private $prefix;
	/**
	 * SPARQL client; stays null when no category endpoint is configured.
	 * @var SparqlClient|null
	 */
	private $client;

	/**
	 * User agent to use for SPARQL queries
	 */
	public const USER_AGENT = 'CirrusSearch deepcat feature';
	/**
	 * Timeout (in seconds) for SPARQL query.
	 * TODO: make configurable?
	 */
	public const TIMEOUT = 3;
	/**
	 * Stats key for SPARQL requests
	 */
	private const STATSD_SPARQL_KEY = 'CirrusSearch.deepcat.sparql';
	/**
	 * Stats key for reporting too many categories
	 */
	private const STATSD_TOOMANY_KEY = 'CirrusSearch.deepcat.toomany';

	/**
	 * Reads the depth, category limit and endpoint settings from configuration.
	 * A client is only set up when 'CirrusSearchCategoryEndpoint' is non-empty.
	 *
	 * @param Config $config
	 * @param SparqlClient|null $client Client to use; when null the
	 *  'CirrusCategoriesClient' service is used instead.
	 */
	public function __construct( Config $config, ?SparqlClient $client = null ) {
		$this->depth = (int)$config->get( 'CirrusSearchCategoryDepth' );
		$this->limit = (int)$config->get( 'CirrusSearchCategoryMax' );
		$endpoint = $config->get( 'CirrusSearchCategoryEndpoint' );
		if ( $endpoint !== null && $endpoint !== '' ) {
			$this->client = $client
				?? MediaWikiServices::getInstance()->getService( 'CirrusCategoriesClient' );
		}
	}

	/**
	 * @param KeywordFeatureNode $node
	 * @return CrossSearchStrategy
	 */
	public function getCrossSearchStrategy( KeywordFeatureNode $node ) {
		// the category tree is wiki specific
		return CrossSearchStrategy::hostWikiOnlyStrategy();
	}

	/**
	 * @return string[] The list of keywords this feature is supposed to match
	 */
	protected function getKeywords() {
		return [ 'deepcat', 'deepcategory' ];
	}

	/**
	 * Both keywords report the same feature name.
	 *
	 * @param string $key
	 * @param string $valueDelimiter
	 * @return string
	 */
	public function getFeatureName( $key, $valueDelimiter ) {
		return 'deepcategory';
	}

	/**
	 * Applies the detected keyword from the search term. May apply changes
	 * either to $context directly, or return a filter to be added.
	 *
	 * When the expansion yields no categories there is no filter to apply and
	 * the context is flagged as unable to produce results.
	 *
	 * @param SearchContext $context
	 * @param string $key The keyword
	 * @param string $value The value attached to the keyword with quotes stripped and escaped
	 *  quotes un-escaped.
	 * @param string $quotedValue The original value in the search string, including quotes if used
	 * @param bool $negated Is the search negated? Not used to generate the returned AbstractQuery,
	 *  that will be negated as necessary. Used for any other building/context necessary.
	 * @return array Two element array, first an AbstractQuery or null to apply to the
	 *  query. Second a boolean indicating if the quotedValue should be kept in the search
	 *  string.
	 */
	protected function doApply( SearchContext $context, $key, $value, $quotedValue, $negated ) {
		$filter = $this->doGetFilterQuery( $this->doExpand( $value, $context ) );
		if ( $filter === null ) {
			$context->setResultsPossible( false );
		}

		return [ $filter, false ];
	}

	/**
	 * @param KeywordFeatureNode $node
	 * @param SearchConfig $config
	 * @param WarningCollector $warningCollector
	 * @return array List of category names the keyword expands to
	 */
	public function expand( KeywordFeatureNode $node, SearchConfig $config, WarningCollector $warningCollector ) {
		return $this->doExpand( $node->getValue(), $warningCollector );
	}

	/**
	 * Expand a category name into the list of categories to filter on.
	 *
	 * Returns an empty list (with a warning) when no client is available. If
	 * the SPARQL request fails, a warning is recorded and the list falls back
	 * to the requested category alone.
	 *
	 * @param string $value
	 * @param WarningCollector $warningCollector
	 * @return array
	 */
	private function doExpand( $value, WarningCollector $warningCollector ) {
		if ( !$this->client ) {
			$warningCollector->addWarning( 'cirrussearch-feature-deepcat-endpoint' );
			return [];
		}

		$startQueryTime = microtime( true );
		try {
			$categories = $this->fetchCategories( $value, $warningCollector );
		} catch ( SparqlException $e ) {
			// Not publishing exception here because it can contain too many details including IPs, etc.
			$warningCollector->addWarning( $this->decideUiWarning( $e ) );
			LoggerFactory::getInstance( 'CirrusSearch' )
				->warning( 'Deepcat SPARQL Exception: ' . $e->getMessage() );
			$categories = [ $value ];
		}
		// Timing is recorded for failed requests as well as successful ones.
		$this->logRequest( $startQueryTime );
		return $categories;
	}

	/**
	 * Pick the user-facing warning message key for a failed SPARQL request.
	 *
	 * @param SparqlException $e
	 * @return string Message key: the timeout warning when the exception message
	 *  reports an HTTP timeout, the generic exception warning otherwise.
	 */
	private function decideUiWarning( SparqlException $e ): string {
		$message = $e->getMessage();
		// This could alternatively be a 500 error if blazegraph timed out
		// prior to the http client timing out, but that doesn't happen due
		// to http and blazegraph timeouts being set to the same value.
		if ( strpos( $message, 'HTTP request timed out.' ) !== false ) {
			return 'cirrussearch-feature-deepcat-timeout';
		}
		return 'cirrussearch-feature-deepcat-exception';
	}

	/**
	 * Get URL prefix for full category URL for this wiki.
	 *
	 * Computed once from the canonical URL of a two-letter placeholder
	 * category with the placeholder name cut off, then cached.
	 *
	 * @return bool|string
	 */
	private function getCategoryPrefix() {
		if ( $this->prefix === null ) {
			$title = Title::makeTitle( NS_CATEGORY, 'ZZ' );
			$fullName = $title->getFullURL( '', false, PROTO_CANONICAL );
			// Strip the trailing 'ZZ' placeholder.
			$this->prefix = substr( $fullName, 0, -2 );
		}
		return $this->prefix;
	}

	/**
	 * Record stats data for the request.
	 * @param float $startQueryTime Value of microtime( true ) taken before the request
	 */
	private function logRequest( $startQueryTime ) {
		// Elapsed time in whole milliseconds.
		$timeTaken = intval( 1000 * ( microtime( true ) - $startQueryTime ) );
		MediaWikiServices::getInstance()->getStatsdDataFactory()->timing(
			self::STATSD_SPARQL_KEY, $timeTaken
		);
	}

	/**
	 * Get child categories using SPARQL service.
	 *
	 * Returns an empty list (with a warning) when the category name is not a
	 * valid title or when the service returns more than the configured limit.
	 *
	 * @param string $rootCategory Category to start looking from
	 * @param WarningCollector $warningCollector
	 * @return string[] List of subcategories.
	 * Note that the list may be incomplete due to limitations of the service.
	 * @throws SparqlException
	 */
	private function fetchCategories( $rootCategory, WarningCollector $warningCollector ) {
		$title = Title::makeTitleSafe( NS_CATEGORY, $rootCategory );
		if ( $title === null ) {
			$warningCollector->addWarning( 'cirrussearch-feature-deepcat-invalid-title' );
			return [];
		}
		$fullName = $title->getFullURL( '', false, PROTO_CANONICAL );
		// Ask for one row more than allowed so that going over the limit is detectable.
		$limit1 = $this->limit + 1;
		$query = <<<SPARQL
SELECT ?out WHERE {
      SERVICE mediawiki:categoryTree {
          bd:serviceParam mediawiki:start <$fullName> .
          bd:serviceParam mediawiki:direction "Reverse" .
          bd:serviceParam mediawiki:depth {$this->depth} .
      }
} ORDER BY ASC(?depth)
LIMIT $limit1
SPARQL;
		$result = $this->client->query( $query );

		if ( count( $result ) > $this->limit ) {
			// We went over the limit.
			// According to T181549 this means we fail the filter application
			$warningCollector->addWarning( 'cirrussearch-feature-deepcat-toomany' );
			MediaWikiServices::getInstance()
				->getStatsdDataFactory()
				->increment( self::STATSD_TOOMANY_KEY );
			return [];
		}

		$prefixLen = strlen( $this->getCategoryPrefix() );
		return array_map( static function ( $row ) use ( $prefixLen ) {
			// TODO: maybe we want to check the prefix is indeed the same?
			// It should be but who knows...
			return rawurldecode( substr( $row['out'], $prefixLen ) );
		}, $result );
	}

	/**
	 * @param KeywordFeatureNode $node
	 * @param QueryBuildingContext $context
	 * @return AbstractQuery|null
	 */
	public function getFilterQuery( KeywordFeatureNode $node, QueryBuildingContext $context ) {
		return $this->doGetFilterQuery( $context->getKeywordExpandedData( $node ) );
	}

	/**
	 * Build a filter matching pages that are in any of the given categories.
	 *
	 * @param array $categories
	 * @return \Elastica\Query\BoolQuery|null Null when the category list is empty
	 */
	protected function doGetFilterQuery( array $categories ) {
		if ( $categories === [] ) {
			return null;
		}

		$filter = new \Elastica\Query\BoolQuery();
		foreach ( $categories as $cat ) {
			$filter->addShould( QueryHelper::matchPage( 'category.lowercase_keyword', $cat ) );
		}

		return $filter;
	}
}