Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
92.79% |
103 / 111 |
|
71.43% |
5 / 7 |
CRAP | |
0.00% |
0 / 1 |
InLabelFeature | |
92.79% |
103 / 111 |
|
71.43% |
5 / 7 |
31.36 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
getKeywords | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
doApply | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
2 | |||
makeQuery | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
2 | |||
parseLanguages | |
100.00% |
33 / 33 |
|
100.00% |
1 / 1 |
13 | |||
parseValue | |
84.78% |
39 / 46 |
|
0.00% |
0 / 1 |
8.23 | |||
getFilterQuery | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
3.14 |
1 | <?php |
2 | |
3 | namespace Wikibase\Search\Elastic\Query; |
4 | |
5 | use CirrusSearch\Parser\AST\KeywordFeatureNode; |
6 | use CirrusSearch\Query\Builder\QueryBuildingContext; |
7 | use CirrusSearch\Query\FilterQueryFeature; |
8 | use CirrusSearch\Query\SimpleKeywordFeature; |
9 | use CirrusSearch\Search\SearchContext; |
10 | use CirrusSearch\WarningCollector; |
11 | use Elastica\Query\AbstractQuery; |
12 | use Elastica\Query\MultiMatch; |
13 | use Wikibase\Lib\LanguageFallbackChainFactory; |
14 | use Wikibase\Search\Elastic\Fields\AllLabelsField; |
15 | use Wikibase\Search\Elastic\Fields\DescriptionsField; |
16 | use Wikibase\Search\Elastic\Fields\LabelsField; |
17 | use Wikimedia\Assert\Assert; |
18 | |
19 | /** |
20 | * Handles the search keyword 'inlabel:' |
21 | * |
22 | * Allows the user to search for pages that have wikibase labels, optionally in user specified |
23 | * languages. |
24 | * |
25 | * @uses CirrusSearch |
26 | * @see https://phabricator.wikimedia.org/T215967 |
27 | */ |
class InLabelFeature extends SimpleKeywordFeature implements FilterQueryFeature {
	/** @var int A limit to the number of fields that can be queried at once */
	public const MAX_FIELDS = 30;

	/** @var LanguageFallbackChainFactory Used to expand a language code into its fallback codes */
	private $languageChainFactory;

	/** @var true[] Keyed by known language codes for set membership check */
	private $languages;

	/**
	 * Fields to query, keyed by keyword.
	 *
	 * For each keyword:
	 *  - 'fields' holds the base field names that are expanded to
	 *    "<field>.<language code>.plain" when the user names languages;
	 *  - 'all_fields' holds the fields queried when no language restriction applies.
	 *
	 * @var array[]
	 */
	private static $FIELDS_PER_KEYWORD = [
		// query both label and description to allow time for the sanitizer
		// to catch up (ref T226722)
		'incaption' => [
			'fields' => [ LabelsField::NAME, DescriptionsField::NAME ],
			'all_fields' => [ AllLabelsField::NAME . '.plain', DescriptionsField::NAME . '.*.plain' ]
		],
		'inlabel' => [
			'fields' => [ LabelsField::NAME ],
			'all_fields' => [ AllLabelsField::NAME . '.plain' ]
		]
	];

	/**
	 * @param LanguageFallbackChainFactory $languageChainFactory
	 * @param string[] $languages list of languages indexed in elastic. Must all be lowercase.
	 */
	public function __construct( LanguageFallbackChainFactory $languageChainFactory, $languages ) {
		$this->languageChainFactory = $languageChainFactory;
		// Flip the list into a lookup table so membership checks are a plain isset().
		$this->languages = [];
		foreach ( $languages as $lang ) {
			$this->languages[$lang] = true;
		}
	}

	/**
	 * @return string[]
	 */
	protected function getKeywords() {
		// When using WikibaseMediaInfo extension the labels are referred to
		// more concretely as captions. While perhaps slightly messy, there
		// doesn't seem to be much downside to allowing `incaption` everywhere.
		return [ 'inlabel', 'incaption' ];
	}

	/**
	 * @param SearchContext $context
	 * @param string $key The keyword
	 * @param string $value The value attached to the keyword with quotes stripped
	 * @param string $quotedValue The original value in the search string, including quotes if used
	 * @param bool $negated Is the search negated? Not used to generate the returned AbstractQuery,
	 *  that will be negated as necessary. Used for any other building/context necessary.
	 * @return array Two element array, first an AbstractQuery or null to apply to the
	 *  query. Second a boolean indicating if the quotedValue should be kept in the search
	 *  string.
	 */
	protected function doApply( SearchContext $context, $key, $value, $quotedValue, $negated ) {
		// The SearchContext is passed as the warning collector for parseValue().
		$parsedValue = $this->parseValue(
			$key,
			$value,
			$quotedValue,
			'',
			'',
			$context
		);
		if ( $parsedValue['fields'] === [] ) {
			// Nothing to query against: flag the search as unable to return results.
			$context->setResultsPossible( false );
			return [ null, false ];
		}
		$query = $this->makeQuery( $parsedValue );
		// The query will only be used in the filter context. To enable highlighting
		// we need to provide the query to the highlighter as well.
		// TODO: How does this work with the new parser that only calls parseValue / getFilterQuery?
		//
		$context->addNonTextHighlightQuery( $query );

		// TODO: This false should be true, but it's not quite right. It will keep
		// the whole quotedValue, but we want it to only keep the search query
		// portion. Possibly we want to influence ranking with the language
		// chain as well?
		return [ $query, false ];
	}

	/**
	 * Builds an OR between the fields in $parsedValue. The
	 * search terms must exist wholly within a single field.
	 *
	 * @param array $parsedValue Array with 'string', 'fields' and 'phrase' keys, as
	 *  returned by parseValue()
	 * @return \Elastica\Query\AbstractQuery
	 */
	private function makeQuery( array $parsedValue ) {
		$query = ( new MultiMatch() )
			->setQuery( $parsedValue['string'] )
			// AND means all terms must exist in one language label.
			// Only 1 of the provided fields must match.
			->setOperator( MultiMatch::OPERATOR_AND )
			->setFields( $parsedValue['fields'] );
		if ( $parsedValue['phrase'] ) {
			$query->setType( MultiMatch::TYPE_PHRASE );
		}
		return $query;
	}

	/**
	 * Expands a comma separated list of language codes into document field names.
	 *
	 * Each code is lowercased. A code may end in '*' to also include the codes of its
	 * fallback chain, or in '+' to include the fallback chain except for 'en'. Codes
	 * that are not among the known languages are reported through $warningCollector
	 * and skipped.
	 *
	 * @param string[] $useFields Base field names; each is expanded to
	 *  "<field>.<language code>.plain"
	 * @param string $languageString Comma separated language codes
	 * @param WarningCollector $warningCollector
	 * @param string $key The keyword being parsed, reported in warnings
	 * @return string[] Unique field names, in order of first appearance
	 */
	private function parseLanguages(
		array $useFields,
		$languageString,
		WarningCollector $warningCollector,
		$key
	): array {
		$fields = [];
		foreach ( explode( ',', $languageString ) as $languageCode ) {
			$languageCode = mb_strtolower( $languageCode );
			$withFallbacks = false;
			$withoutEnFallback = false;
			// A trailing '*' or '+' only counts as a modifier when a code precedes it.
			// Both characters are single byte, so byte-wise access is safe here.
			$len = strlen( $languageCode );
			$modifier = $len > 1 ? $languageCode[$len - 1] : '';
			if ( $modifier === '*' || $modifier === '+' ) {
				$languageCode = substr( $languageCode, 0, -1 );
				$withFallbacks = true;
				$withoutEnFallback = $modifier === '+';
			}

			if ( !isset( $this->languages[$languageCode] ) ) {
				// Report the keyword the user actually typed (inlabel or incaption).
				$warningCollector->addWarning(
					'wikibasecirrus-keywordfeature-unknown-language-code',
					$key,
					$languageCode );
				continue;
			}

			// Fields are collected as array keys so duplicates collapse.
			foreach ( $useFields as $field ) {
				$fields[$field . '.' . $languageCode . '.plain'] = true;
			}
			if ( $withFallbacks ) {
				$fallbacks = $this->languageChainFactory
					->newFromLanguageCode( $languageCode )
					->getFetchLanguageCodes();
				foreach ( $fallbacks as $fallbackCode ) {
					if ( $withoutEnFallback && $fallbackCode === 'en' ) {
						continue;
					}
					foreach ( $useFields as $field ) {
						$fields[$field . '.' . $fallbackCode . '.plain'] = true;
					}
				}
			}
		}
		return array_keys( $fields );
	}

	/**
	 * Splits the keyword value into the search string and the fields to search.
	 *
	 * The value has the form "search" or "search@languages", split on the last '@'.
	 * An empty search string yields an empty 'fields' list and a warning. A missing,
	 * empty or '*' language part selects the keyword's 'all_fields'. Otherwise the
	 * language part is expanded by parseLanguages() and truncated to MAX_FIELDS
	 * fields, with a warning when truncation happens.
	 *
	 * @param string $key
	 * @param string $value
	 * @param string $quotedValue
	 * @param string $valueDelimiter
	 * @param string $suffix
	 * @param WarningCollector $warningCollector
	 * @return array [
	 *     'string' => string to search for
	 *     'fields' => array of document fields to run the query against,
	 *     'phrase' => boolean indicating if a phrase query should be issued
	 *   ]
	 */
	public function parseValue(
		$key,
		$value,
		$quotedValue,
		$valueDelimiter,
		$suffix,
		WarningCollector $warningCollector
	) {
		// The value differs from the quoted value only when quotes were stripped.
		$isPhrase = $quotedValue !== $value;
		Assert::precondition( isset( self::$FIELDS_PER_KEYWORD[$key] ), "Must have the list of fields for $key defined" );
		$allLabelFields = self::$FIELDS_PER_KEYWORD[$key]['all_fields'];
		if ( strlen( $value ) === 0 ) {
			$warningCollector->addWarning(
				'wikibasecirrus-inlabel-no-query-provided' );
			return [
				'fields' => [],
				'string' => $value,
				'phrase' => $isPhrase,
			];
		}
		// Split on the last '@' so the search string itself may contain '@'.
		$atPos = strrpos( $value, '@' );
		if ( $atPos === false ) {
			return [
				'fields' => $allLabelFields,
				'string' => $value,
				'phrase' => $isPhrase,
			];
		}
		$search = substr( $value, 0, $atPos );
		if ( strlen( $search ) === 0 ) {
			$warningCollector->addWarning(
				'wikibasecirrus-inlabel-no-query-provided' );
			return [
				'fields' => [],
				'string' => $search,
				'phrase' => $isPhrase,
			];
		}

		$languages = substr( $value, $atPos + 1 );
		// when $atPos + 1 === strlen( $value ) then php will return ''
		if ( $languages === false || $languages === '' || $languages === '*' ) {
			return [
				'fields' => $allLabelFields,
				'string' => $search,
				'phrase' => $isPhrase,
			];
		}
		$fieldsToUse = self::$FIELDS_PER_KEYWORD[$key]['fields'];
		$fields = $this->parseLanguages( $fieldsToUse, $languages, $warningCollector, $key );
		$fieldCount = count( $fields );
		if ( $fieldCount > self::MAX_FIELDS ) {
			// Report the keyword the user actually typed (inlabel or incaption).
			$warningCollector->addWarning(
				'wikibasecirrus-keywordfeature-too-many-language-codes',
				$key, self::MAX_FIELDS, $fieldCount );
			$fields = array_slice( $fields, 0, self::MAX_FIELDS );
		}

		return [
			'fields' => $fields,
			'string' => $search,
			'phrase' => $isPhrase,
		];
	}

	/**
	 * Builds the filter query from a value previously produced by parseValue().
	 *
	 * @param KeywordFeatureNode $node
	 * @param QueryBuildingContext $context
	 * @return AbstractQuery|null null when the node has no parsed value or no fields to query
	 */
	public function getFilterQuery( KeywordFeatureNode $node, QueryBuildingContext $context ) {
		$parsedValue = $node->getParsedValue();
		if ( $parsedValue === null || $parsedValue['fields'] === [] ) {
			return null;
		}
		return $this->makeQuery( $parsedValue );
	}

}