Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
66.23% |
51 / 77 |
|
37.50% |
3 / 8 |
CRAP | |
0.00% |
0 / 1 |
StatementsField | |
66.23% |
51 / 77 |
|
37.50% |
3 / 8 |
49.06 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
getMappingField | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
getFieldData | |
93.75% |
15 / 16 |
|
0.00% |
0 / 1 |
6.01 | |||
getSnakAsPropertyIdAndValue | |
82.35% |
14 / 17 |
|
0.00% |
0 / 1 |
5.14 | |||
getSnakAsString | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
getWhitelistedSnakAsString | |
71.43% |
10 / 14 |
|
0.00% |
0 / 1 |
6.84 | |||
snakHasKnownValue | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getMapping | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | |
3 | namespace Wikibase\Search\Elastic\Fields; |
4 | |
5 | use CirrusSearch\CirrusSearch; |
6 | use SearchEngine; |
7 | use SearchIndexField; |
8 | use SearchIndexFieldDefinition; |
9 | use UnexpectedValueException; |
10 | use Wikibase\DataModel\Entity\EntityDocument; |
11 | use Wikibase\DataModel\Services\Lookup\PropertyDataTypeLookup; |
12 | use Wikibase\DataModel\Services\Lookup\PropertyDataTypeLookupException; |
13 | use Wikibase\DataModel\Snak\PropertyValueSnak; |
14 | use Wikibase\DataModel\Snak\Snak; |
15 | use Wikibase\DataModel\Statement\Statement; |
16 | use Wikibase\DataModel\Statement\StatementListProvider; |
17 | use Wikibase\Repo\Search\Fields\WikibaseIndexField; |
18 | |
/**
 * Search index field holding the statements of an entity as keyword strings.
 *
 * Each indexed main snak is rendered as "<property id>=<value>". Every
 * qualifier of an indexed statement adds one more entry of the form
 * "<property id>=<value>[<qualifier property id>=<qualifier value>]".
 *
 * @license GPL-2.0-or-later
 * @author Stas Malyshev
 */
class StatementsField extends SearchIndexFieldDefinition implements WikibaseIndexField {

	/**
	 * Name under which this field is registered in the index.
	 */
	public const NAME = 'statement_keywords';

	/**
	 * Separator placed between the property ID and the value in the string
	 * form of a snak. It should be a string that is:
	 * - never part of a property ID serialization
	 * - safe to use inside a regular expression
	 */
	public const STATEMENT_SEPARATOR = '=';

	/**
	 * Delimiters wrapped around a qualifier when it is appended to its statement.
	 */
	private const QUALIFIER_START = '[';
	private const QUALIFIER_END = ']';

	/**
	 * @var array Properties to index. Flipped in the constructor, so the
	 *  property IDs are the array keys.
	 */
	protected $propertyIds;

	/**
	 * @var array Property data types to index. Flipped in the constructor, so
	 *  the type names are the array keys.
	 */
	private $indexedTypes;

	/**
	 * @var callable[] Formatter callbacks, looked up by the key 'VT:<data value type>'.
	 */
	protected $searchIndexDataFormatters;

	/**
	 * @var PropertyDataTypeLookup Resolves the data type of a property.
	 */
	private $propertyDataTypeLookup;

	/**
	 * @var array Properties that are never indexed. Flipped in the
	 *  constructor, so the property IDs are the array keys.
	 */
	private $excludedIds;

	/**
	 * @param PropertyDataTypeLookup $propertyDataTypeLookup Used to find the data type of
	 *  the property of each main snak.
	 * @param string[] $propertyIds Property IDs whose statements are indexed.
	 * @param string[] $indexedTypes Property data types to index. A property of one of
	 *  these types is indexed even when it is not listed in $propertyIds.
	 * @param string[] $excludedIds Property IDs that must not be indexed.
	 * @param callable[] $searchIndexDataFormatters Formatter callbacks; each receives a
	 *  data value and has to return a string.
	 */
	public function __construct(
		PropertyDataTypeLookup $propertyDataTypeLookup,
		array $propertyIds,
		array $indexedTypes,
		array $excludedIds,
		array $searchIndexDataFormatters
	) {
		parent::__construct( static::NAME, SearchIndexField::INDEX_TYPE_KEYWORD );

		$this->propertyDataTypeLookup = $propertyDataTypeLookup;
		$this->searchIndexDataFormatters = $searchIndexDataFormatters;

		// The lists are flipped so that membership tests become key lookups.
		$this->propertyIds = array_flip( $propertyIds );
		$this->indexedTypes = array_flip( $indexedTypes );
		$this->excludedIds = array_flip( $excludedIds );
	}

	/**
	 * Get the field object that provides the mapping for the given engine.
	 *
	 * @param SearchEngine $engine
	 * @param string $name
	 *
	 * @return SearchIndexField|null This field for CirrusSearch, null for any other engine
	 */
	public function getMappingField( SearchEngine $engine, $name ) {
		// For now only Cirrus/Elastic is supported
		return $engine instanceof CirrusSearch ? $this : null;
	}

	/**
	 * Build the list of keyword strings to index for an entity.
	 *
	 * @param EntityDocument $entity
	 *
	 * @return string[] One entry per indexed main snak, followed by one entry for each
	 *  of its qualifiers that can be formatted. Empty if the entity is not a
	 *  StatementListProvider.
	 */
	public function getFieldData( EntityDocument $entity ) {
		if ( !( $entity instanceof StatementListProvider ) ) {
			return [];
		}

		$keywords = [];

		/** @var Statement $statement */
		foreach ( $entity->getStatements() as $statement ) {
			$mainKeyword = $this->getWhitelistedSnakAsString(
				$statement->getMainSnak(),
				$statement->getGuid()
			);
			if ( $mainKeyword === null ) {
				// Main snak is not indexed, so its qualifiers are skipped as well.
				continue;
			}

			$keywords[] = $mainKeyword;

			foreach ( $statement->getQualifiers() as $qualifierSnak ) {
				$qualifierKeyword = $this->getSnakAsString( $qualifierSnak );
				if ( $qualifierKeyword === null ) {
					continue;
				}

				$keywords[] = $mainKeyword . self::QUALIFIER_START . $qualifierKeyword . self::QUALIFIER_END;
			}
		}

		return $keywords;
	}

	/**
	 * Split a snak into its property ID and its formatted value.
	 *
	 * e.g. [ 'propertyId' => 'P180', 'value' => 'Q999' ]
	 *
	 * @param Snak $snak
	 * @return array|null Array with the keys 'propertyId' and 'value', or null if the snak
	 *  has no known value, no formatter is registered for its value type, or the
	 *  formatted value is the empty string.
	 * @throws UnexpectedValueException If the formatter callback returns a non-string
	 */
	protected function getSnakAsPropertyIdAndValue( Snak $snak ) {
		if ( !$this->snakHasKnownValue( $snak ) ) {
			return null;
		}

		/**
		 * @var PropertyValueSnak $snak
		 */
		/* @phan-suppress-next-line PhanUndeclaredMethod */
		$dataValue = $snak->getDataValue();
		$definitionKey = 'VT:' . $dataValue->getType();

		$formatter = $this->searchIndexDataFormatters[$definitionKey] ?? null;
		if ( $formatter === null ) {
			// We do not know how to format these values
			return null;
		}

		$formatted = $formatter( $dataValue );
		if ( !is_string( $formatted ) ) {
			throw new UnexpectedValueException( 'Search index data formatter callback for "' . $definitionKey
				. '" didn\'t return a string' );
		}

		if ( $formatted === '' ) {
			return null;
		}

		return [
			'propertyId' => $snak->getPropertyId()->getSerialization(),
			'value' => $formatted,
		];
	}

	/**
	 * Render a snak as '<property id>=<value>' without applying any property or
	 * type filtering.
	 *
	 * @param Snak $snak
	 * @return string|null Null if the snak cannot be split into property ID and value
	 */
	protected function getSnakAsString( Snak $snak ) {
		$parts = $this->getSnakAsPropertyIdAndValue( $snak );

		return $parts === null
			? null
			: $parts[ 'propertyId' ] . self::STATEMENT_SEPARATOR . $parts[ 'value' ];
	}

	/**
	 * Render a snak as '<property id>=<value>', but only if its property or the data type
	 * of its property is configured for indexing and the property is not excluded.
	 *
	 * e.g. P180=Q537, P240=1234567
	 *
	 * @param Snak $snak
	 * @param string $guid GUID of the statement the snak belongs to; only used in the
	 *  warning logged when the data type lookup fails
	 * @return null|string
	 */
	protected function getWhitelistedSnakAsString( Snak $snak, $guid ) {
		if ( !$this->snakHasKnownValue( $snak ) ) {
			return null;
		}

		$property = $snak->getPropertyId();
		$propertyId = $property->getSerialization();
		if ( array_key_exists( $propertyId, $this->excludedIds ) ) {
			return null;
		}

		try {
			$propType = $this->propertyDataTypeLookup->getDataTypeIdForProperty( $property );
		} catch ( PropertyDataTypeLookupException $e ) {
			// T198091: looks like occasionally we get weird fails on indexing
			// Log them but do not break indexing other data
			wfLogWarning( __METHOD__ . ': Failed to look up property ' . $e->getPropertyId() .
				' for ' . $guid );
			return null;
		}

		// Either the data type or the property itself has to be configured for indexing.
		if ( array_key_exists( $propType, $this->indexedTypes ) ||
			array_key_exists( $propertyId, $this->propertyIds ) ) {
			return $this->getSnakAsString( $snak );
		}

		return null;
	}

	/**
	 * Check whether the snak carries an actual value, i.e. it is a PropertyValueSnak
	 * rather than a PropertyNoValueSnak or a PropertySomeValueSnak.
	 *
	 * @param Snak $snak
	 * @return bool
	 */
	protected function snakHasKnownValue( Snak $snak ) {
		return $snak instanceof PropertyValueSnak;
	}

	/**
	 * Get the index mapping configuration of this field.
	 *
	 * @param SearchEngine $engine
	 *
	 * @return array Mapping configuration, or an empty array for engines other than
	 *  CirrusSearch
	 */
	public function getMapping( SearchEngine $engine ) {
		// Since we need a specially tuned field, we can not use
		// standard search engine types.
		if ( !( $engine instanceof CirrusSearch ) ) {
			// For now only Cirrus/Elastic is supported
			return [];
		}

		return [
			'type' => 'text',
			'copy_to' => 'all',
			'analyzer' => 'lowercase_keyword',
			'norms' => false,
			'index_options' => 'docs',
			'fields' => [
				// Subfield indexing only property names, so we could do matches
				// like "property exists" without specifying the value.
				'property' => [
					'type' => 'text',
					'analyzer' => 'extract_wb_property',
					'search_analyzer' => 'keyword',
				],
			],
		];
	}

}