Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
95.71% |
67 / 70 |
|
84.62% |
11 / 13 |
CRAP | |
0.00% |
0 / 1 |
DeepcatFeature | |
95.71% |
67 / 70 |
|
84.62% |
11 / 13 |
25 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
4 | |||
getCrossSearchStrategy | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getKeywords | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getFeatureName | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
doApply | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
expand | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
doExpand | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
3 | |||
decideUiWarning | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
2.06 | |||
getCategoryPrefix | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
logRequest | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
fetchCategories | |
91.67% |
22 / 24 |
|
0.00% |
0 / 1 |
3.01 | |||
getFilterQuery | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
doGetFilterQuery | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Query; |
4 | |
5 | use CirrusSearch\CrossSearchStrategy; |
6 | use CirrusSearch\Parser\AST\KeywordFeatureNode; |
7 | use CirrusSearch\Query\Builder\QueryBuildingContext; |
8 | use CirrusSearch\Search\SearchContext; |
9 | use CirrusSearch\SearchConfig; |
10 | use CirrusSearch\Util; |
11 | use CirrusSearch\WarningCollector; |
12 | use Elastica\Query\AbstractQuery; |
13 | use MediaWiki\Config\Config; |
14 | use MediaWiki\Logger\LoggerFactory; |
15 | use MediaWiki\MediaWikiServices; |
16 | use MediaWiki\Sparql\SparqlClient; |
17 | use MediaWiki\Sparql\SparqlException; |
18 | use MediaWiki\Title\Title; |
19 | |
/**
 * Filters by category or its subcategories. E.g. if category Vehicles includes Cars
 * and Boats, then search for Vehicles would match pages in Vehicles, Cars and Boats.
 *
 * The list of subcategories is obtained by sending a SPARQL query through a
 * SparqlClient. The lookup is bounded by a maximum depth and a maximum number of
 * categories, both read from configuration in the constructor.
 *
 * Syntax:
 *  deepcat:Vehicles
 *  deepcategory:Vehicles
 */
class DeepcatFeature extends SimpleKeywordFeature implements FilterQueryFeature {
	/**
	 * Max lookup depth
	 * @var int
	 */
	private $depth;
	/**
	 * Max number of categories
	 * @var int
	 */
	private $limit;
	/**
	 * Category URL prefix for this wiki
	 * @var string|null (lazy loaded)
	 */
	private $prefix;
	/**
	 * Client used to run the SPARQL queries. Stays null when no category
	 * endpoint is configured, in which case the keyword only emits a warning.
	 * @var SparqlClient|null
	 */
	private $client;

	/**
	 * User agent to use for SPARQL queries
	 */
	public const USER_AGENT = 'CirrusSearch deepcat feature';
	/**
	 * Timeout (in seconds) for SPARQL query.
	 * TODO: make configurable?
	 */
	public const TIMEOUT = 3;

	/**
	 * Reads the depth/limit settings and, only when a category endpoint is
	 * configured, sets up the SPARQL client.
	 *
	 * @param Config $config Must provide CirrusSearchCategoryDepth,
	 *  CirrusSearchCategoryMax and CirrusSearchCategoryEndpoint
	 * @param SparqlClient|null $client Client to use; when null the
	 *  'CirrusCategoriesClient' service is used instead. Ignored when no
	 *  endpoint is configured.
	 */
	public function __construct( Config $config, ?SparqlClient $client = null ) {
		$this->depth = (int)$config->get( 'CirrusSearchCategoryDepth' );
		$this->limit = (int)$config->get( 'CirrusSearchCategoryMax' );
		$endpoint = $config->get( 'CirrusSearchCategoryEndpoint' );
		if ( $endpoint !== null && $endpoint !== '' ) {
			// Explicit null check: an injected client always wins over the service.
			$this->client = $client ??
				MediaWikiServices::getInstance()->getService( 'CirrusCategoriesClient' );
		}
	}

	/**
	 * @param KeywordFeatureNode $node
	 * @return CrossSearchStrategy Always the host-wiki-only strategy
	 */
	public function getCrossSearchStrategy( KeywordFeatureNode $node ) {
		// the category tree is wiki specific
		return CrossSearchStrategy::hostWikiOnlyStrategy();
	}

	/**
	 * @return string[] The list of keywords this feature is supposed to match
	 */
	protected function getKeywords() {
		return [ 'deepcat', 'deepcategory' ];
	}

	/**
	 * Both keywords are reported under the same feature name.
	 *
	 * @param string $key
	 * @param string $valueDelimiter
	 * @return string
	 */
	public function getFeatureName( $key, $valueDelimiter ) {
		return 'deepcategory';
	}

	/**
	 * Applies the detected keyword from the search term. May apply changes
	 * either to $context directly, or return a filter to be added.
	 *
	 * When no filter can be built (the expansion produced no categories) the
	 * context is flagged as unable to return results.
	 *
	 * @param SearchContext $context
	 * @param string $key The keyword
	 * @param string $value The value attached to the keyword with quotes stripped and escaped
	 *  quotes un-escaped.
	 * @param string $quotedValue The original value in the search string, including quotes if used
	 * @param bool $negated Is the search negated? Not used to generate the returned AbstractQuery,
	 *  that will be negated as necessary. Used for any other building/context necessary.
	 * @return array Two element array, first an AbstractQuery or null to apply to the
	 *  query. Second a boolean indicating if the quotedValue should be kept in the search
	 *  string.
	 */
	protected function doApply( SearchContext $context, $key, $value, $quotedValue, $negated ) {
		$filter = $this->doGetFilterQuery( $this->doExpand( $value, $context ) );
		if ( $filter === null ) {
			$context->setResultsPossible( false );
		}

		return [ $filter, false ];
	}

	/**
	 * Expands the keyword value into the list of categories to filter on.
	 *
	 * @param KeywordFeatureNode $node
	 * @param SearchConfig $config
	 * @param WarningCollector $warningCollector
	 * @return array List of category names, possibly empty
	 */
	public function expand( KeywordFeatureNode $node, SearchConfig $config, WarningCollector $warningCollector ) {
		return $this->doExpand( $node->getValue(), $warningCollector );
	}

	/**
	 * Resolves a category into itself plus its subcategories via SPARQL.
	 *
	 * Without a client a warning is recorded and nothing is returned. When the
	 * SPARQL query fails, a warning is recorded and the list falls back to the
	 * requested category alone. The time spent is reported in both cases.
	 *
	 * @param string $value Category name as typed by the user
	 * @param WarningCollector $warningCollector
	 * @return array
	 */
	private function doExpand( $value, WarningCollector $warningCollector ): array {
		if ( !$this->client ) {
			$warningCollector->addWarning( 'cirrussearch-feature-deepcat-endpoint' );
			return [];
		}

		$startQueryTime = microtime( true );
		try {
			$categories = $this->fetchCategories( $value, $warningCollector );
		} catch ( SparqlException $e ) {
			// Not publishing exception here because it can contain too many details including IPs, etc.
			$warningCollector->addWarning( $this->decideUiWarning( $e ) );
			LoggerFactory::getInstance( 'CirrusSearch' )
				->warning( 'Deepcat SPARQL Exception: ' . $e->getMessage() );
			$categories = [ $value ];
		}
		$this->logRequest( $startQueryTime );
		return $categories;
	}

	/**
	 * Picks the user-facing warning message key for a failed SPARQL query.
	 *
	 * @param SparqlException $e
	 * @return string Message key: the timeout one when the exception message
	 *  mentions an HTTP timeout, the generic one otherwise
	 */
	private function decideUiWarning( SparqlException $e ): string {
		$message = $e->getMessage();
		// This could alternatively be a 500 error if blazegraph timed out
		// prior to the http client timing out, but that doesn't happen due
		// to http and blazegraph timeouts being set to the same value.
		if ( strpos( $message, 'HTTP request timed out.' ) !== false ) {
			return 'cirrussearch-feature-deepcat-timeout';
		}
		return 'cirrussearch-feature-deepcat-exception';
	}

	/**
	 * Get URL prefix for full category URL for this wiki.
	 *
	 * Computed once by building the canonical URL of a dummy two-letter
	 * category and cutting those two letters off the end.
	 *
	 * @return string
	 */
	private function getCategoryPrefix(): string {
		if ( $this->prefix === null ) {
			$title = Title::makeTitle( NS_CATEGORY, 'ZZ' );
			$fullName = $title->getFullURL( '', false, PROTO_CANONICAL );
			// Drop the trailing 'ZZ' to keep only the part shared by all categories.
			$this->prefix = substr( $fullName, 0, -2 );
		}
		return $this->prefix;
	}

	/**
	 * Record stats data for the request.
	 *
	 * The observed value is the elapsed wall-clock time since $startQueryTime,
	 * in whole milliseconds.
	 *
	 * @param float $startQueryTime Value of microtime( true ) taken before the query
	 */
	private function logRequest( $startQueryTime ): void {
		$timeTaken = intval( 1000 * ( microtime( true ) - $startQueryTime ) );
		Util::getStatsFactory()
			->getTiming( 'deepcat_sparql_query_seconds' )
			->copyToStatsdAt( 'CirrusSearch.deepcat.sparql' )
			->observe( $timeTaken );
	}

	/**
	 * Get child categories using SPARQL service.
	 *
	 * One row more than the configured limit is requested so that an overflow
	 * can be detected. On overflow a warning is recorded, a counter is
	 * incremented and only the first `limit` rows are kept.
	 *
	 * @param string $rootCategory Category to start looking from
	 * @param WarningCollector $warningCollector
	 * @return string[] List of subcategories; empty when $rootCategory is not a valid title.
	 *  Note that the list may be incomplete due to limitations of the service.
	 * @throws SparqlException
	 */
	private function fetchCategories( $rootCategory, WarningCollector $warningCollector ): array {
		$title = Title::makeTitleSafe( NS_CATEGORY, $rootCategory );
		if ( $title === null ) {
			$warningCollector->addWarning( 'cirrussearch-feature-deepcat-invalid-title' );
			return [];
		}
		$fullName = $title->getFullURL( '', false, PROTO_CANONICAL );
		// Ask for one extra row so going over the limit is detectable.
		$limit1 = $this->limit + 1;
		$query = <<<SPARQL
SELECT ?out WHERE {
SERVICE mediawiki:categoryTree {
bd:serviceParam mediawiki:start <$fullName> .
bd:serviceParam mediawiki:direction "Reverse" .
bd:serviceParam mediawiki:depth {$this->depth} .
}
} ORDER BY ASC(?depth)
LIMIT $limit1
SPARQL;
		$result = $this->client->query( $query );

		if ( count( $result ) > $this->limit ) {
			// We went over the limit (see T181549): warn the user, count the
			// event and keep only the first $this->limit categories.
			$warningCollector->addWarning( 'cirrussearch-feature-deepcat-toomany' );
			Util::getStatsFactory()
				->getCounter( 'deepcat_too_many_total' )
				->copyToStatsdAt( 'CirrusSearch.deepcat.toomany' )
				->increment();
			$result = array_slice( $result, 0, $this->limit );
		}

		$prefixLen = strlen( $this->getCategoryPrefix() );
		return array_map( static function ( $row ) use ( $prefixLen ) {
			// TODO: maybe we want to check the prefix is indeed the same?
			// It should be but who knows...
			// Strip the URL prefix and undo URL-encoding to get the category name.
			return rawurldecode( substr( $row['out'], $prefixLen ) );
		}, $result );
	}

	/**
	 * Builds the filter from the categories previously expanded for this node.
	 *
	 * @param KeywordFeatureNode $node
	 * @param QueryBuildingContext $context
	 * @return AbstractQuery|null
	 */
	public function getFilterQuery( KeywordFeatureNode $node, QueryBuildingContext $context ) {
		return $this->doGetFilterQuery( $context->getKeywordExpandedData( $node ) );
	}

	/**
	 * Builds a bool query with one "should" clause per category.
	 *
	 * @param array $categories Category names
	 * @return \Elastica\Query\BoolQuery|null Null when there is no category to match
	 */
	protected function doGetFilterQuery( array $categories ) {
		if ( $categories === [] ) {
			return null;
		}

		$filter = new \Elastica\Query\BoolQuery();
		foreach ( $categories as $cat ) {
			$filter->addShould( QueryHelper::matchPage( 'category.lowercase_keyword', $cat ) );
		}

		return $filter;
	}
}