Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
95.59% |
65 / 68 |
|
84.62% |
11 / 13 |
CRAP | |
0.00% |
0 / 1 |
| DeepcatFeature | |
95.59% |
65 / 68 |
|
84.62% |
11 / 13 |
25 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
4 | |||
| getCrossSearchStrategy | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getKeywords | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getFeatureName | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| doApply | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
| expand | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| doExpand | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
3 | |||
| decideUiWarning | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
2.06 | |||
| getCategoryPrefix | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
| logRequest | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
| fetchCategories | |
91.30% |
21 / 23 |
|
0.00% |
0 / 1 |
3.01 | |||
| getFilterQuery | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| doGetFilterQuery | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace CirrusSearch\Query; |
| 4 | |
| 5 | use CirrusSearch\CrossSearchStrategy; |
| 6 | use CirrusSearch\Parser\AST\KeywordFeatureNode; |
| 7 | use CirrusSearch\Query\Builder\QueryBuildingContext; |
| 8 | use CirrusSearch\Search\SearchContext; |
| 9 | use CirrusSearch\SearchConfig; |
| 10 | use CirrusSearch\Util; |
| 11 | use CirrusSearch\WarningCollector; |
| 12 | use Elastica\Query\AbstractQuery; |
| 13 | use MediaWiki\Config\Config; |
| 14 | use MediaWiki\Logger\LoggerFactory; |
| 15 | use MediaWiki\MediaWikiServices; |
| 16 | use MediaWiki\Sparql\SparqlClient; |
| 17 | use MediaWiki\Sparql\SparqlException; |
| 18 | use MediaWiki\Title\Title; |
| 19 | |
/**
 * Filters by category or its subcategories. E.g. if category Vehicles includes Cars
 * and Boats, then search for Vehicles would match pages in Vehicles, Cars and Boats.
 *
 * The subcategory tree is resolved by sending a SPARQL query to the configured
 * category endpoint. The resulting category names are then OR-ed together into a
 * filter on the 'category.lowercase_keyword' field.
 *
 * Syntax:
 *  deepcat:Vehicles
 */
class DeepcatFeature extends SimpleKeywordFeature implements FilterQueryFeature {
	/**
	 * Max lookup depth
	 * @var int
	 */
	private $depth;
	/**
	 * Max number of categories
	 * @var int
	 */
	private $limit;
	/**
	 * Category URL prefix for this wiki
	 * @var string|null (lazy loaded)
	 */
	private $prefix;
	/**
	 * SPARQL client used to resolve subcategories. Left unset when no
	 * endpoint is configured, in which case the keyword only emits a warning.
	 * @var SparqlClient|null
	 */
	private $client;

	/**
	 * User agent to use for SPARQL queries
	 */
	public const USER_AGENT = 'CirrusSearch deepcat feature';
	/**
	 * Timeout (in seconds) for SPARQL query.
	 * TODO: make configurable?
	 */
	public const TIMEOUT = 3;

	/**
	 * @param Config $config Source of CirrusSearchCategoryDepth, CirrusSearchCategoryMax
	 *  and CirrusSearchCategoryEndpoint
	 * @param SparqlClient|null $client Client to use; when omitted the
	 *  'CirrusCategoriesClient' service is used. Only consulted when an endpoint
	 *  is configured.
	 */
	public function __construct( Config $config, ?SparqlClient $client = null ) {
		$this->depth = (int)$config->get( 'CirrusSearchCategoryDepth' );
		$this->limit = (int)$config->get( 'CirrusSearchCategoryMax' );
		$endpoint = $config->get( 'CirrusSearchCategoryEndpoint' );
		// Without an endpoint the client stays unset and doExpand() short-circuits.
		if ( $endpoint !== null && $endpoint !== '' ) {
			$this->client = $client ?: MediaWikiServices::getInstance()->getService( 'CirrusCategoriesClient' );
		}
	}

	/**
	 * @param KeywordFeatureNode $node
	 * @return CrossSearchStrategy
	 */
	public function getCrossSearchStrategy( KeywordFeatureNode $node ) {
		// the category tree is wiki specific
		return CrossSearchStrategy::hostWikiOnlyStrategy();
	}

	/**
	 * @return string[] The list of keywords this feature is supposed to match
	 */
	protected function getKeywords() {
		return [ 'deepcat', 'deepcategory' ];
	}

	/**
	 * Both keyword spellings are reported under the same feature name.
	 *
	 * @param string $key
	 * @param string $valueDelimiter
	 * @return string
	 */
	public function getFeatureName( $key, $valueDelimiter ) {
		return 'deepcategory';
	}

	/**
	 * Applies the detected keyword from the search term. May apply changes
	 * either to $context directly, or return a filter to be added.
	 *
	 * When the expansion yields no categories no filter can be built and the
	 * context is flagged as unable to produce results.
	 *
	 * @param SearchContext $context
	 * @param string $key The keyword
	 * @param string $value The value attached to the keyword with quotes stripped and escaped
	 *  quotes un-escaped.
	 * @param string $quotedValue The original value in the search string, including quotes if used
	 * @param bool $negated Is the search negated? Not used to generate the returned AbstractQuery,
	 *  that will be negated as necessary. Used for any other building/context necessary.
	 * @return array Two element array, first an AbstractQuery or null to apply to the
	 *  query. Second a boolean indicating if the quotedValue should be kept in the search
	 *  string.
	 */
	protected function doApply( SearchContext $context, $key, $value, $quotedValue, $negated ) {
		$filter = $this->doGetFilterQuery( $this->doExpand( $value, $context ) );
		if ( $filter === null ) {
			$context->setResultsPossible( false );
		}

		return [ $filter, false ];
	}

	/**
	 * Resolve the keyword value into the list of categories to filter on.
	 *
	 * @param KeywordFeatureNode $node
	 * @param SearchConfig $config
	 * @param WarningCollector $warningCollector
	 * @return array List of category names, possibly empty
	 */
	public function expand( KeywordFeatureNode $node, SearchConfig $config, WarningCollector $warningCollector ) {
		return $this->doExpand( $node->getValue(), $warningCollector );
	}

	/**
	 * Shared implementation of the category expansion.
	 *
	 * - No client configured: warn and return no categories.
	 * - SPARQL failure: warn, log, and fall back to the requested category alone.
	 * - Otherwise: whatever fetchCategories() returns.
	 *
	 * @param string $value Root category as typed by the user
	 * @param WarningCollector $warningCollector
	 * @return array List of category names, possibly empty
	 */
	private function doExpand( $value, WarningCollector $warningCollector ) {
		if ( !$this->client ) {
			$warningCollector->addWarning( 'cirrussearch-feature-deepcat-endpoint' );
			return [];
		}

		$startQueryTime = microtime( true );
		try {
			$categories = $this->fetchCategories( $value, $warningCollector );
		} catch ( SparqlException $e ) {
			// Not publishing exception here because it can contain too many details including IPs, etc.
			$warningCollector->addWarning( $this->decideUiWarning( $e ) );
			LoggerFactory::getInstance( 'CirrusSearch' )
				->warning( 'Deepcat SPARQL Exception: ' . $e->getMessage() );
			// Degrade to a plain single-category filter rather than failing the search.
			$categories = [ $value ];
		}
		$this->logRequest( $startQueryTime );
		return $categories;
	}

	/**
	 * Pick the user-facing warning message key for a SPARQL failure.
	 *
	 * @param SparqlException $e
	 * @return string Message key: the timeout variant when the exception message
	 *  mentions an HTTP timeout, the generic exception variant otherwise
	 */
	private function decideUiWarning( SparqlException $e ): string {
		$message = $e->getMessage();
		// This could alternatively be a 500 error if blazegraph timed out
		// prior to the http client timing out, but that doesn't happen due
		// to http and blazegraph timeouts being set to the same value.
		if ( strpos( $message, 'HTTP request timed out.' ) !== false ) {
			return 'cirrussearch-feature-deepcat-timeout';
		} else {
			return 'cirrussearch-feature-deepcat-exception';
		}
	}

	/**
	 * Get URL prefix for full category URL for this wiki.
	 *
	 * Computed once by building the canonical URL of a placeholder category
	 * named 'ZZ' and cutting off the last two characters (the placeholder).
	 *
	 * @return bool|string
	 */
	private function getCategoryPrefix() {
		if ( $this->prefix === null ) {
			$title = Title::makeTitle( NS_CATEGORY, 'ZZ' );
			$fullName = $title->getFullURL( '', false, PROTO_CANONICAL );
			$this->prefix = substr( $fullName, 0, -2 );
		}
		return $this->prefix;
	}

	/**
	 * Record stats data for the request.
	 *
	 * The elapsed wall-clock time since $startQueryTime is converted to whole
	 * milliseconds and handed to the 'deepcat_sparql_query_seconds' timing metric.
	 *
	 * @param float $startQueryTime Value of microtime( true ) taken before the query
	 */
	private function logRequest( $startQueryTime ) {
		$timeTaken = intval( 1000 * ( microtime( true ) - $startQueryTime ) );
		Util::getStatsFactory()
			->getTiming( 'deepcat_sparql_query_seconds' )
			->observe( $timeTaken );
	}

	/**
	 * Get child categories using SPARQL service.
	 *
	 * One row more than the configured maximum is requested so that exceeding
	 * the maximum can be detected. When that happens the lookup is treated as
	 * failed (see T181549): a warning is recorded, a counter is incremented and
	 * no categories are returned, so that no partial filter is ever applied.
	 *
	 * @param string $rootCategory Category to start looking from
	 * @param WarningCollector $warningCollector
	 * @return string[] List of subcategories, with the wiki's category URL prefix
	 *  removed and URL-decoded. Empty when the root title is invalid or when the
	 *  service returned more categories than allowed.
	 *  Note that the list may be incomplete due to limitations of the service.
	 * @throws SparqlException
	 */
	private function fetchCategories( $rootCategory, WarningCollector $warningCollector ) {
		$title = Title::makeTitleSafe( NS_CATEGORY, $rootCategory );
		if ( $title === null ) {
			$warningCollector->addWarning( 'cirrussearch-feature-deepcat-invalid-title' );
			return [];
		}
		$fullName = $title->getFullURL( '', false, PROTO_CANONICAL );
		// Ask for limit + 1 rows: getting them all back proves we went over the limit.
		$limit1 = $this->limit + 1;
		$query = <<<SPARQL
SELECT ?out WHERE {
SERVICE mediawiki:categoryTree {
bd:serviceParam mediawiki:start <$fullName> .
bd:serviceParam mediawiki:direction "Reverse" .
bd:serviceParam mediawiki:depth {$this->depth} .
}
} ORDER BY ASC(?depth)
LIMIT $limit1
SPARQL;
		$result = $this->client->query( $query );

		if ( count( $result ) > $this->limit ) {
			// We went over the limit.
			// According to T181549 this means we fail the filter application
			$warningCollector->addWarning( 'cirrussearch-feature-deepcat-toomany' );
			Util::getStatsFactory()
				->getCounter( 'deepcat_too_many_total' )
				->increment();
			// Returning nothing makes doGetFilterQuery() yield null, which callers
			// treat as "no results possible" instead of applying a truncated filter.
			return [];
		}

		$prefixLen = strlen( $this->getCategoryPrefix() );
		return array_map( static function ( $row ) use ( $prefixLen ) {
			// TODO: maybe we want to check the prefix is indeed the same?
			// It should be but who knows...
			return rawurldecode( substr( $row['out'], $prefixLen ) );
		}, $result );
	}

	/**
	 * Build the filter from the categories previously produced by expand().
	 *
	 * @param KeywordFeatureNode $node
	 * @param QueryBuildingContext $context
	 * @return AbstractQuery|null
	 */
	public function getFilterQuery( KeywordFeatureNode $node, QueryBuildingContext $context ) {
		return $this->doGetFilterQuery( $context->getKeywordExpandedData( $node ) );
	}

	/**
	 * Build a bool query with one "should" clause per category.
	 *
	 * @param array $categories Category names to match
	 * @return \Elastica\Query\BoolQuery|null Null when there is nothing to match on
	 */
	protected function doGetFilterQuery( array $categories ) {
		if ( $categories === [] ) {
			return null;
		}

		$filter = new \Elastica\Query\BoolQuery();
		foreach ( $categories as $cat ) {
			$filter->addShould( QueryHelper::matchPage( 'category.lowercase_keyword', $cat ) );
		}

		return $filter;
	}
}