Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
92.66% |
101 / 109 |
|
71.43% |
5 / 7 |
CRAP | |
0.00% |
0 / 1 |
| InLabelFeature | |
92.66% |
101 / 109 |
|
71.43% |
5 / 7 |
30.36 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
| getKeywords | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| doApply | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
2 | |||
| makeQuery | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
2 | |||
| parseLanguages | |
100.00% |
31 / 31 |
|
100.00% |
1 / 1 |
13 | |||
| parseValue | |
84.78% |
39 / 46 |
|
0.00% |
0 / 1 |
7.17 | |||
| getFilterQuery | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
3.14 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace Wikibase\Search\Elastic\Query; |
| 4 | |
| 5 | use CirrusSearch\Parser\AST\KeywordFeatureNode; |
| 6 | use CirrusSearch\Query\Builder\QueryBuildingContext; |
| 7 | use CirrusSearch\Query\FilterQueryFeature; |
| 8 | use CirrusSearch\Query\SimpleKeywordFeature; |
| 9 | use CirrusSearch\Search\SearchContext; |
| 10 | use CirrusSearch\WarningCollector; |
| 11 | use Elastica\Query\AbstractQuery; |
| 12 | use Elastica\Query\MultiMatch; |
| 13 | use Wikibase\Lib\LanguageFallbackChainFactory; |
| 14 | use Wikibase\Search\Elastic\Fields\AllLabelsField; |
| 15 | use Wikibase\Search\Elastic\Fields\DescriptionsField; |
| 16 | use Wikibase\Search\Elastic\Fields\LabelsField; |
| 17 | use Wikimedia\Assert\Assert; |
| 18 | |
/**
 * Handles the search keywords 'inlabel:' and 'incaption:'.
 *
 * Allows the user to search for pages that have wikibase labels, optionally in user specified
 * languages.
 *
 * Value syntax understood by parseValue() / parseLanguages():
 *  - `inlabel:foo`        search "foo" in the all-languages field(s)
 *  - `inlabel:foo@`       same as above (empty language list)
 *  - `inlabel:foo@*`      same as above
 *  - `inlabel:foo@en,de`  search only the per-language fields for 'en' and 'de'
 *  - `inlabel:foo@de*`    'de' plus every language of its fallback chain
 *  - `inlabel:foo@de+`    'de' plus its fallback chain, excluding 'en'
 *
 * The text after the LAST '@' in the value is taken as the language list.
 *
 * @uses CirrusSearch
 * @see https://phabricator.wikimedia.org/T215967
 */
class InLabelFeature extends SimpleKeywordFeature implements FilterQueryFeature {
	/** @var int A limit to the number of fields that can be queried at once */
	public const MAX_FIELDS = 30;

	/** @var LanguageFallbackChainFactory */
	private $languageChainFactory;

	/** @var true[] Keyed by known language codes for set membership check */
	private $languages;

	/**
	 * Fields to query, per keyword.
	 *  - 'fields': field name prefixes used when explicit languages are requested;
	 *    parseLanguages() appends '.<language code>.plain' to each of them.
	 *  - 'all_fields': complete field names used when no language is requested.
	 *
	 * @var array
	 */
	private static $FIELDS_PER_KEYWORD = [
		// Query both the label and the description fields to give the
		// sanitizer time to catch up (ref T226722)
		'incaption' => [
			'fields' => [ LabelsField::NAME, DescriptionsField::NAME ],
			'all_fields' => [ AllLabelsField::NAME . '.plain', DescriptionsField::NAME . '.*.plain' ]
		],
		'inlabel' => [
			'fields' => [ LabelsField::NAME ],
			'all_fields' => [ AllLabelsField::NAME . '.plain' ]
		]
	];

	/**
	 * @param LanguageFallbackChainFactory $languageChainFactory
	 * @param string[] $languages list of languages indexed in elastic. Must all be lowercase.
	 */
	public function __construct( LanguageFallbackChainFactory $languageChainFactory, $languages ) {
		$this->languageChainFactory = $languageChainFactory;
		// Flip the list into a set so membership checks are a simple isset().
		$this->languages = [];
		foreach ( $languages as $lang ) {
			$this->languages[$lang] = true;
		}
	}

	/**
	 * @return string[]
	 */
	protected function getKeywords() {
		// When using WikibaseMediaInfo extension the labels are referred to
		// more concretely as captions. While perhaps slightly messy, there
		// doesn't seem to be much downside to allowing `incaption` everywhere.
		return [ 'inlabel', 'incaption' ];
	}

	/**
	 * @param SearchContext $context
	 * @param string $key The keyword
	 * @param string $value The value attached to the keyword with quotes stripped
	 * @param string $quotedValue The original value in the search string, including quotes if used
	 * @param bool $negated Is the search negated? Not used to generate the returned AbstractQuery,
	 *  that will be negated as necessary. Used for any other building/context necessary.
	 * @return array Two element array, first an AbstractQuery or null to apply to the
	 *  query. Second a boolean indicating if the quotedValue should be kept in the search
	 *  string.
	 */
	protected function doApply( SearchContext $context, $key, $value, $quotedValue, $negated ) {
		// The SearchContext is also passed as the WarningCollector argument.
		$parsedValue = $this->parseValue(
			$key,
			$value,
			$quotedValue,
			'',
			'',
			$context
		);
		if ( $parsedValue['fields'] === [] ) {
			// Nothing to query (empty search string or no usable language):
			// flag the whole search as unable to return results.
			$context->setResultsPossible( false );
			return [ null, false ];
		}
		$query = $this->makeQuery( $parsedValue );
		// The query will only be used in the filter context. To enable highlighting
		// we need to provide the query to the highlighter as well.
		// TODO: How does this work with the new parser that only calls parseValue / getFilterQuery?
		$context->addNonTextHighlightQuery( $query );

		// TODO: This false should be true, but it's not quite right. It will keep
		// the whole quotedValue, but we want it to only keep the search query
		// portion. Possibly we want to influence ranking with the language
		// chain as well?
		return [ $query, false ];
	}

	/**
	 * Builds an OR between the fields in $parsedValue. The
	 * search terms must exist wholly within a single field.
	 *
	 * @param array $parsedValue Result of parseValue() with a non-empty 'fields' list
	 * @return \Elastica\Query\AbstractQuery
	 */
	private function makeQuery( array $parsedValue ) {
		$query = ( new MultiMatch() )
			->setQuery( $parsedValue['string'] )
			// AND means all terms must exist in one language label.
			// Only 1 of the provided fields must match.
			->setOperator( MultiMatch::OPERATOR_AND )
			->setFields( $parsedValue['fields'] );
		if ( $parsedValue['phrase'] ) {
			$query->setType( MultiMatch::TYPE_PHRASE );
		}
		return $query;
	}

	/**
	 * Expands a comma separated list of language codes into the list of
	 * per-language '.plain' fields to query.
	 *
	 * Each code is lowercased. A trailing '*' also adds the languages of the
	 * code's fallback chain; a trailing '+' does the same but leaves 'en' out.
	 * Codes that are not in the set of known languages are reported to the
	 * warning collector and skipped. Duplicate fields are collapsed.
	 *
	 * @param array $useFields Field name prefixes to combine with each language code
	 * @param string $languageString Comma separated language codes
	 * @param WarningCollector $warningCollector
	 * @param string $keyword Keyword being parsed, reported in warnings
	 * @return string[]
	 */
	private function parseLanguages(
		array $useFields,
		$languageString,
		WarningCollector $warningCollector,
		string $keyword = 'inlabel'
	): array {
		// Field names are collected as array keys so that duplicates collapse.
		$fields = [];
		foreach ( explode( ',', $languageString ) as $languageCode ) {
			$languageCode = mb_strtolower( $languageCode );
			$withFallbacks = false;
			$withoutEnFallback = false;
			// A lone '*' or '+' is not a suffix; it is treated as an (unknown) code.
			$len = strlen( $languageCode );
			if ( $len > 1 && str_ends_with( $languageCode, '*' ) ) {
				$languageCode = substr( $languageCode, 0, -1 );
				$withFallbacks = true;
			} elseif ( $len > 1 && str_ends_with( $languageCode, '+' ) ) {
				$languageCode = substr( $languageCode, 0, -1 );
				$withFallbacks = true;
				$withoutEnFallback = true;
			}

			if ( !isset( $this->languages[$languageCode] ) ) {
				// Report the keyword the user actually typed, not always 'inlabel'.
				$warningCollector->addWarning(
					'wikibasecirrus-keywordfeature-unknown-language-code',
					$keyword,
					$languageCode );
				continue;
			}

			foreach ( $useFields as $field ) {
				$fields[$field . '.' . $languageCode . '.plain'] = true;
			}
			if ( $withFallbacks ) {
				$fallbacks = $this->languageChainFactory
					->newFromLanguageCode( $languageCode )
					->getFetchLanguageCodes();
				foreach ( $fallbacks as $fallbackCode ) {
					if ( $withoutEnFallback && $fallbackCode === 'en' ) {
						continue;
					}
					foreach ( $useFields as $field ) {
						$fields[$field . '.' . $fallbackCode . '.plain'] = true;
					}
				}
			}
		}
		return array_keys( $fields );
	}

	/**
	 * Splits the keyword value into the search string and the fields to query.
	 *
	 * An empty 'fields' list in the result means there is nothing to query; in
	 * the empty-search-string cases a warning has been added to the collector.
	 *
	 * @param string $key
	 * @param string $value
	 * @param string $quotedValue
	 * @param string $valueDelimiter
	 * @param string $suffix
	 * @param WarningCollector $warningCollector
	 * @return array [
	 *     'string' => string to search for
	 *     'fields' => array of document fields to run the query against,
	 *     'phrase' => boolean indicating if a phrase query should be issued
	 * ]
	 */
	public function parseValue(
		$key,
		$value,
		$quotedValue,
		$valueDelimiter,
		$suffix,
		WarningCollector $warningCollector
	) {
		// The value was quoted if stripping the quotes changed it.
		$isPhrase = $quotedValue !== $value;
		Assert::precondition( isset( self::$FIELDS_PER_KEYWORD[$key] ), "Must have the list of fields for $key defined" );
		$allLabelFields = self::$FIELDS_PER_KEYWORD[$key]['all_fields'];
		if ( strlen( $value ) === 0 ) {
			$warningCollector->addWarning(
				'wikibasecirrus-inlabel-no-query-provided' );
			return [
				'fields' => [],
				'string' => $value,
				'phrase' => $isPhrase,
			];
		}
		// The language list follows the last '@', so the search string itself may contain '@'.
		$atPos = strrpos( $value, '@' );
		if ( $atPos === false ) {
			return [
				'fields' => $allLabelFields,
				'string' => $value,
				'phrase' => $isPhrase,
			];
		}
		$search = substr( $value, 0, $atPos );
		if ( strlen( $search ) === 0 ) {
			$warningCollector->addWarning(
				'wikibasecirrus-inlabel-no-query-provided' );
			return [
				'fields' => [],
				'string' => $search,
				'phrase' => $isPhrase,
			];
		}

		$languages = substr( $value, $atPos + 1 );
		// when $atPos + 1 === strlen( $value ) then php will return ''
		if ( $languages === '' || $languages === '*' ) {
			return [
				'fields' => $allLabelFields,
				'string' => $search,
				'phrase' => $isPhrase,
			];
		}
		$fieldsToUse = self::$FIELDS_PER_KEYWORD[$key]['fields'];
		$fields = $this->parseLanguages( $fieldsToUse, $languages, $warningCollector, $key );
		if ( count( $fields ) > self::MAX_FIELDS ) {
			// Warn, then query only the first MAX_FIELDS fields.
			$warningCollector->addWarning(
				'wikibasecirrus-keywordfeature-too-many-language-codes',
				$key, self::MAX_FIELDS, count( $fields ) );
			$fields = array_slice( $fields, 0, self::MAX_FIELDS );
		}

		return [
			'fields' => $fields,
			'string' => $search,
			'phrase' => $isPhrase,
		];
	}

	/**
	 * Builds the filter query from the value parsed by parseValue().
	 *
	 * @param KeywordFeatureNode $node
	 * @param QueryBuildingContext $context
	 * @return AbstractQuery|null null when there is no parsed value or no field to query
	 */
	public function getFilterQuery( KeywordFeatureNode $node, QueryBuildingContext $context ) {
		$parsedValue = $node->getParsedValue();
		if ( $parsedValue === null || $parsedValue['fields'] === [] ) {
			return null;
		}
		return $this->makeQuery( $parsedValue );
	}

}