Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
62.50% |
65 / 104 |
|
44.44% |
4 / 9 |
CRAP | |
0.00% |
0 / 1 |
StatementsField | |
62.50% |
65 / 104 |
|
44.44% |
4 / 9 |
77.46 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
getMappingField | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
getStatements | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
3.07 | |||
getFieldData | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
5 | |||
getSnakAsPropertyIdAndValue | |
61.54% |
16 / 26 |
|
0.00% |
0 / 1 |
11.64 | |||
getSnakAsString | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
getWhitelistedSnakAsString | |
50.00% |
10 / 20 |
|
0.00% |
0 / 1 |
10.50 | |||
snakHasKnownValue | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getMapping | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | |
3 | namespace Wikibase\Search\Elastic\Fields; |
4 | |
5 | use CirrusSearch\CirrusSearch; |
6 | use OutOfBoundsException; |
7 | use Psr\Log\LoggerInterface; |
8 | use Psr\Log\NullLogger; |
9 | use SearchEngine; |
10 | use SearchIndexField; |
11 | use SearchIndexFieldDefinition; |
12 | use UnexpectedValueException; |
13 | use Wikibase\DataModel\Entity\EntityDocument; |
14 | use Wikibase\DataModel\Services\Lookup\PropertyDataTypeLookup; |
15 | use Wikibase\DataModel\Services\Lookup\PropertyDataTypeLookupException; |
16 | use Wikibase\DataModel\Snak\PropertyValueSnak; |
17 | use Wikibase\DataModel\Snak\Snak; |
18 | use Wikibase\DataModel\Statement\Statement; |
19 | use Wikibase\DataModel\Statement\StatementListProvider; |
20 | use Wikibase\Lib\DataTypeFactory; |
21 | use Wikibase\Repo\Search\Fields\WikibaseIndexField; |
22 | |
/**
 * Search index field that stores the statements of an entity as keyword strings.
 *
 * A statement is rendered as '<property id>=<value>'. Each qualifier with a
 * representable value adds one more entry of the form
 * '<property id>=<value>[<qualifier property id>=<qualifier value>]'.
 *
 * @license GPL-2.0-or-later
 * @author Stas Malyshev
 */
class StatementsField extends SearchIndexFieldDefinition implements WikibaseIndexField {

	/**
	 * Name of the field in the search index.
	 */
	public const NAME = 'statement_keywords';

	/**
	 * Separator placed between the property ID and the value of a statement.
	 * The string must:
	 *  - never occur in a serialized property ID
	 *  - be safe to use inside a regular expression
	 */
	public const STATEMENT_SEPARATOR = '=';

	/**
	 * Delimiters wrapped around a qualifier that is appended to a statement string.
	 */
	private const QUALIFIER_START = '[';
	private const QUALIFIER_END = ']';

	/**
	 * @var array Properties to index; flipped, so the property IDs are the keys.
	 */
	protected $propertyIds;

	/**
	 * @var array Property data types to index; flipped, so the type IDs are the keys.
	 */
	private $indexedTypes;

	/**
	 * @var callable[] Search index data formatters, keyed by data type ID.
	 */
	protected $searchIndexDataFormatters;

	/**
	 * @var PropertyDataTypeLookup
	 */
	private $propertyDataTypeLookup;

	/**
	 * @var DataTypeFactory
	 */
	private $dataTypeFactory;

	/**
	 * @var array Properties never to index; flipped, so the property IDs are the keys.
	 */
	private $excludedIds;

	/**
	 * @var ?callable Receives an EntityDocument and returns an iterable
	 *  of the Statement instances that should be indexed.
	 */
	private $statementProvider;

	private LoggerInterface $logger;

	/**
	 * @param DataTypeFactory $dataTypeFactory
	 * @param PropertyDataTypeLookup $propertyDataTypeLookup
	 * @param string[] $propertyIds IDs of the properties to index
	 * @param string[] $indexedTypes Property data types to index. A property of one of these
	 *  types is indexed even when it is not listed in $propertyIds.
	 * @param string[] $excludedIds IDs of the properties that must not be indexed
	 * @param callable[] $searchIndexDataFormatters Search formatters, keyed by data type name
	 * @param ?LoggerInterface $logger Falls back to a NullLogger when omitted
	 * @param ?callable $statementProvider Callable that receives an EntityDocument and returns
	 *  an iterable of the Statement instances to index
	 */
	public function __construct(
		DataTypeFactory $dataTypeFactory,
		PropertyDataTypeLookup $propertyDataTypeLookup,
		array $propertyIds,
		array $indexedTypes,
		array $excludedIds,
		array $searchIndexDataFormatters,
		?LoggerInterface $logger = null,
		?callable $statementProvider = null
	) {
		parent::__construct( static::NAME, SearchIndexField::INDEX_TYPE_KEYWORD );

		// Collaborators
		$this->dataTypeFactory = $dataTypeFactory;
		$this->propertyDataTypeLookup = $propertyDataTypeLookup;
		$this->searchIndexDataFormatters = $searchIndexDataFormatters;
		$this->statementProvider = $statementProvider;
		$this->logger = $logger ?? new NullLogger();

		// The ID lists are flipped so that membership can be tested by key
		$this->propertyIds = array_flip( $propertyIds );
		$this->indexedTypes = array_flip( $indexedTypes );
		$this->excludedIds = array_flip( $excludedIds );
	}

	/**
	 * Get the field object that produces the mapping for the given engine.
	 *
	 * @param SearchEngine $engine
	 * @param string $name
	 *
	 * @return SearchIndexField|null This field for CirrusSearch, null for any other engine
	 */
	public function getMappingField( SearchEngine $engine, $name ) {
		// For now only Cirrus/Elastic is supported
		return $engine instanceof CirrusSearch ? $this : null;
	}

	/**
	 * Collect the statements of an entity that are candidates for indexing.
	 *
	 * A configured statement provider takes precedence; otherwise the statements
	 * come from the entity itself when it is a StatementListProvider.
	 *
	 * @param EntityDocument $entity
	 * @return iterable Statement instances; empty when the entity provides none
	 */
	private function getStatements( EntityDocument $entity ): iterable {
		$provider = $this->statementProvider;
		if ( $provider !== null ) {
			return $provider( $entity );
		}

		return $entity instanceof StatementListProvider ? $entity->getStatements() : [];
	}

	/**
	 * Build the list of keyword strings to index for an entity.
	 *
	 * The result holds one string per indexable main snak, one further string per
	 * qualifier of such a statement that could be rendered, and finally the bare
	 * property ID of every property for which no statement could be indexed at all.
	 *
	 * @param EntityDocument $entity
	 *
	 * @return mixed The value to index for this field; here a list of strings
	 */
	public function getFieldData( EntityDocument $entity ) {
		$entries = [];
		// Both maps are keyed by property ID so that duplicates collapse for free
		$indexedProperties = [];
		$skippedProperties = [];

		/** @var Statement $statement */
		foreach ( $this->getStatements( $entity ) as $statement ) {
			$mainSnak = $statement->getMainSnak();
			$statementString = $this->getWhitelistedSnakAsString( $mainSnak, $statement->getGuid() );
			$propertyId = $mainSnak->getPropertyId()->getSerialization();

			if ( $statementString === null ) {
				$skippedProperties[$propertyId] = true;
				continue;
			}

			$indexedProperties[$propertyId] = true;
			$entries[] = $statementString;

			foreach ( $statement->getQualifiers() as $qualifier ) {
				$qualifierString = $this->getSnakAsString( $qualifier );
				if ( $qualifierString === null ) {
					continue;
				}
				$entries[] = $statementString . self::QUALIFIER_START . $qualifierString . self::QUALIFIER_END;
			}
		}

		// There are entities with thousands of properties, so work on the keys only.
		// A property counts as missing when it was skipped and never indexed.
		$missing = array_keys( array_diff_key( $skippedProperties, $indexedProperties ) );

		return array_merge( $entries, $missing );
	}

	/**
	 * Split a snak into its property ID and its formatted value.
	 *
	 * e.g. [ 'propertyId' => 'P180', 'value' => 'Q999' ]
	 *
	 * Null is returned when the snak has no known value, when the data type of the
	 * property cannot be resolved, when the value type does not fit the data type,
	 * when no formatter is registered for the data type, or when the formatted
	 * value is the empty string.
	 *
	 * @param Snak $snak
	 * @param ?string $propType Data type of the property, if the caller already knows it
	 * @return array|null
	 * @throws UnexpectedValueException If the formatter returns something other than a string
	 */
	protected function getSnakAsPropertyIdAndValue( Snak $snak, ?string $propType = null ) {
		if ( !$this->snakHasKnownValue( $snak ) ) {
			return null;
		}
		/** @var PropertyValueSnak $snak */

		if ( $propType === null ) {
			try {
				$propType = $this->propertyDataTypeLookup->getDataTypeIdForProperty( $snak->getPropertyId() );
			} catch ( PropertyDataTypeLookupException $e ) {
				// The data type of the property could not be looked up
				return null;
			}
		}

		try {
			$dataType = $this->dataTypeFactory->getType( $propType );
		} catch ( OutOfBoundsException $e ) {
			// The factory could not provide this data type
			return null;
		}

		/* @phan-suppress-next-line PhanUndeclaredMethod */
		$dataValue = $snak->getDataValue();
		if ( $dataValue::getType() !== $dataType->getDataValueType() ) {
			// The value type does not fit the data type of the property (T372993)
			return null;
		}

		$format = $this->searchIndexDataFormatters[$propType] ?? null;
		if ( $format === null ) {
			// No formatter is registered for values of this data type
			return null;
		}

		$formatted = $format( $dataValue );
		if ( !is_string( $formatted ) ) {
			throw new UnexpectedValueException(
				"Search index data formatter callback for data type '$propType' " .
				" didn't return a string"
			);
		}

		if ( $formatted === '' ) {
			return null;
		}

		return [
			'propertyId' => $snak->getPropertyId()->getSerialization(),
			'value' => $formatted,
		];
	}

	/**
	 * Render a snak as '<property id>=<value>'.
	 *
	 * @param Snak $snak
	 * @param ?string $propType Data type of the property, if the caller already knows it
	 * @return ?string Null when the snak cannot be split into property ID and value
	 */
	protected function getSnakAsString( Snak $snak, ?string $propType = null ): ?string {
		$parts = $this->getSnakAsPropertyIdAndValue( $snak, $propType );

		return $parts === null
			? null
			: $parts['propertyId'] . self::STATEMENT_SEPARATOR . $parts['value'];
	}

	/**
	 * Render a snak as '<property id>=<value>', but only if the property itself or its
	 * data type has been whitelisted and the property has not been explicitly excluded.
	 *
	 * e.g. P180=Q537, P240=1234567
	 *
	 * @param Snak $snak
	 * @param string|null $guid GUID of the statement the snak belongs to; used for logging only
	 * @return null|string
	 */
	protected function getWhitelistedSnakAsString( Snak $snak, $guid ) {
		if ( !$this->snakHasKnownValue( $snak ) ) {
			return null;
		}

		$propertyId = $snak->getPropertyId()->getSerialization();
		if ( array_key_exists( $propertyId, $this->excludedIds ) ) {
			return null;
		}

		try {
			$propType = $this->propertyDataTypeLookup->getDataTypeIdForProperty( $snak->getPropertyId() );
		} catch ( PropertyDataTypeLookupException $e ) {
			// T198091: the lookup occasionally fails during indexing.
			// Record the failure, but keep indexing the remaining data.
			$context = [
				'propertyId' => $e->getPropertyId()->getSerialization(),
				'guid' => $guid,
				'exception' => $e,
			];
			$this->logger->warning(
				__METHOD__ . ': Failed to look up property {propertyId} for {guid}',
				$context
			);
			return null;
		}

		$isWhitelisted = array_key_exists( $propType, $this->indexedTypes )
			|| array_key_exists( $propertyId, $this->propertyIds );
		if ( !$isWhitelisted ) {
			return null;
		}

		return $this->getSnakAsString( $snak, $propType );
	}

	/**
	 * Tell whether the snak carries a known value, i.e. whether it is a PropertyValueSnak
	 * rather than a PropertyNoValueSnak or a PropertySomeValueSnak.
	 *
	 * @param Snak $snak
	 * @return bool
	 */
	protected function snakHasKnownValue( Snak $snak ) {
		return $snak instanceof PropertyValueSnak;
	}

	/**
	 * Get the index mapping of this field.
	 *
	 * @param SearchEngine $engine
	 *
	 * @return array The mapping configuration, or an empty array for engines other than CirrusSearch
	 */
	public function getMapping( SearchEngine $engine ) {
		// This field needs a specially tuned mapping, so the standard
		// search engine types can not be used.
		if ( !( $engine instanceof CirrusSearch ) ) {
			// For now only Cirrus/Elastic is supported
			return [];
		}

		return [
			'type' => 'text',
			'copy_to' => 'all',
			'analyzer' => 'lowercase_keyword',
			'norms' => false,
			'index_options' => 'docs',
			'fields' => [
				// Subfield indexing only the property names, so that matches like
				// "property exists" are possible without specifying the value.
				'property' => [
					'type' => 'text',
					'analyzer' => 'extract_wb_property',
					'search_analyzer' => 'keyword',
				],
			],
		];
	}

}