Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
60.78% |
62 / 102 |
|
44.44% |
4 / 9 |
CRAP | |
0.00% |
0 / 1 |
| StatementsField | |
60.78% |
62 / 102 |
|
44.44% |
4 / 9 |
84.28 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
| getMappingField | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
| getStatements | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
3.07 | |||
| getFieldData | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
5 | |||
| getSnakAsPropertyIdAndValue | |
61.54% |
16 / 26 |
|
0.00% |
0 / 1 |
11.64 | |||
| getSnakAsString | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
| getWhitelistedSnakAsString | |
50.00% |
10 / 20 |
|
0.00% |
0 / 1 |
10.50 | |||
| snakHasKnownValue | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getMapping | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
6 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace Wikibase\Search\Elastic\Fields; |
| 4 | |
| 5 | use CirrusSearch\CirrusSearch; |
| 6 | use MediaWiki\Search\SearchEngine; |
| 7 | use MediaWiki\Search\SearchIndexField; |
| 8 | use MediaWiki\Search\SearchIndexFieldDefinition; |
| 9 | use OutOfBoundsException; |
| 10 | use Psr\Log\LoggerInterface; |
| 11 | use Psr\Log\NullLogger; |
| 12 | use UnexpectedValueException; |
| 13 | use Wikibase\DataModel\Entity\EntityDocument; |
| 14 | use Wikibase\DataModel\Services\Lookup\PropertyDataTypeLookup; |
| 15 | use Wikibase\DataModel\Services\Lookup\PropertyDataTypeLookupException; |
| 16 | use Wikibase\DataModel\Snak\PropertyValueSnak; |
| 17 | use Wikibase\DataModel\Snak\Snak; |
| 18 | use Wikibase\DataModel\Statement\Statement; |
| 19 | use Wikibase\DataModel\Statement\StatementListProvider; |
| 20 | use Wikibase\Lib\DataTypeFactory; |
| 21 | use Wikibase\Repo\Search\Fields\WikibaseIndexField; |
| 22 | |
/**
 * Search index field that holds keyword representations of an entity's statements.
 *
 * Every indexed main snak is emitted as '<property id>=<value>'. Each formattable
 * qualifier of an indexed statement adds a further entry of the form
 * '<property id>=<value>[<qualifier property id>=<qualifier value>]'.
 *
 * @license GPL-2.0-or-later
 * @author Stas Malyshev
 */
class StatementsField extends SearchIndexFieldDefinition implements WikibaseIndexField {

	/**
	 * Name of the field.
	 */
	public const NAME = 'statement_keywords';

	/**
	 * Separator placed between the property ID and the value in a snak string.
	 * The chosen string must:
	 * - never occur in a property ID serialization
	 * - be regex-safe
	 */
	public const STATEMENT_SEPARATOR = '=';

	/**
	 * Delimiters wrapped around a qualifier that is appended to a statement string.
	 */
	private const QUALIFIER_START = '[';
	private const QUALIFIER_END = ']';

	/**
	 * @var array<string,int> Property IDs to index, flipped so that the IDs are the keys.
	 */
	protected array $propertyIds;

	/**
	 * @var array<string,int> Property data types to index, flipped so that the type names
	 *  are the keys.
	 */
	private array $indexedTypes;

	/**
	 * @var array<string,int> Property IDs that are never indexed, flipped so that the IDs
	 *  are the keys.
	 */
	private array $excludedIds;

	/**
	 * @var ?callable Receives an EntityDocument and returns an iterable of the
	 *  Statement instances to index. When null, the entity's own statements are used.
	 */
	private $statementProvider;

	private LoggerInterface $logger;

	/**
	 * @param DataTypeFactory $dataTypeFactory
	 * @param PropertyDataTypeLookup $propertyDataTypeLookup
	 * @param string[] $propertyIds Property IDs to index
	 * @param string[] $indexedTypes Property data types to index. A property of one of these
	 *        types is indexed even if it is not listed in $propertyIds.
	 * @param string[] $excludedIds Property IDs to exclude from indexing.
	 * @param callable[] $searchIndexDataFormatters Search formatters, keyed by data type name
	 * @param ?LoggerInterface $logger Falls back to a NullLogger when omitted.
	 * @param ?callable $statementProvider Callable taking an EntityDocument and returning
	 *        an iterable of the Statement instances to index.
	 */
	public function __construct(
		private readonly DataTypeFactory $dataTypeFactory,
		private readonly PropertyDataTypeLookup $propertyDataTypeLookup,
		array $propertyIds,
		array $indexedTypes,
		array $excludedIds,
		private readonly array $searchIndexDataFormatters,
		?LoggerInterface $logger = null,
		?callable $statementProvider = null
	) {
		parent::__construct( static::NAME, SearchIndexField::INDEX_TYPE_KEYWORD );

		// Flip the lists so membership checks become key lookups.
		$this->propertyIds = array_flip( $propertyIds );
		$this->indexedTypes = array_flip( $indexedTypes );
		$this->excludedIds = array_flip( $excludedIds );
		$this->statementProvider = $statementProvider;
		$this->logger = $logger ?? new NullLogger();
	}

	/**
	 * Produce the field mapping object for the given search engine.
	 *
	 * @param SearchEngine $engine
	 * @param string $name
	 *
	 * @return SearchIndexField|null This field for CirrusSearch, null for any other engine
	 */
	public function getMappingField( SearchEngine $engine, $name ) {
		// For now only Cirrus/Elastic is supported
		return $engine instanceof CirrusSearch ? $this : null;
	}

	/**
	 * Collect the statements to index for an entity.
	 *
	 * A configured statement provider takes precedence. Otherwise the entity's own
	 * statements are returned if it is a StatementListProvider, or an empty array if not.
	 *
	 * @param EntityDocument $entity
	 *
	 * @return iterable
	 */
	private function getStatements( EntityDocument $entity ): iterable {
		$provider = $this->statementProvider;
		if ( $provider !== null ) {
			return $provider( $entity );
		}

		return $entity instanceof StatementListProvider ? $entity->getStatements() : [];
	}

	/**
	 * Build the list of values indexed in this field for an entity.
	 *
	 * @param EntityDocument $entity
	 *
	 * @return mixed An array of strings: one '<property id>=<value>' entry per indexable
	 *  statement, one entry per formattable qualifier of such a statement, and finally the
	 *  bare property ID of every property whose statements were all skipped.
	 */
	public function getFieldData( EntityDocument $entity ) {
		$entries = [];
		$indexedProperties = [];
		$skippedProperties = [];

		/** @var Statement $statement */
		foreach ( $this->getStatements( $entity ) as $statement ) {
			$mainSnak = $statement->getMainSnak();
			$statementString = $this->getWhitelistedSnakAsString( $mainSnak, $statement->getGuid() );
			$propertyId = $mainSnak->getPropertyId()->getSerialization();

			if ( $statementString === null ) {
				$skippedProperties[$propertyId] = true;
				continue;
			}

			$indexedProperties[$propertyId] = true;
			$entries[] = $statementString;

			foreach ( $statement->getQualifiers() as $qualifier ) {
				$qualifierString = $this->getSnakAsString( $qualifier );
				if ( $qualifierString === null ) {
					continue;
				}
				$entries[] = $statementString . self::QUALIFIER_START . $qualifierString . self::QUALIFIER_END;
			}
		}

		// There are entities with thousands of properties, so compare by key rather
		// than scanning value lists.
		$neverIndexed = array_keys( array_diff_key( $skippedProperties, $indexedProperties ) );

		return array_merge( $entries, $neverIndexed );
	}

	/**
	 * Convert a snak into an array with the keys 'propertyId' and 'value'.
	 *
	 * e.g. [ 'propertyId' => 'P180', 'value' => 'Q999' ]
	 *
	 * Null is returned when the snak has no known value, when the property's data type
	 * cannot be looked up or is unknown to the data type factory, when the data value type
	 * does not match the property's data type, when no formatter is registered for the
	 * data type, or when the formatter yields an empty string.
	 *
	 * @param Snak $snak
	 * @param ?string $propType The property data type, if the caller already knows it.
	 * @return array|null
	 * @throws UnexpectedValueException If the formatter returns something other than a string
	 */
	protected function getSnakAsPropertyIdAndValue( Snak $snak, ?string $propType = null ) {
		if ( !$this->snakHasKnownValue( $snak ) ) {
			return null;
		}
		/**
		 * @var PropertyValueSnak $snak
		 */

		// Only look up the data type when the caller did not supply it.
		try {
			$propType ??= $this->propertyDataTypeLookup->getDataTypeIdForProperty( $snak->getPropertyId() );
		} catch ( PropertyDataTypeLookupException ) {
			return null;
		}

		try {
			$dataType = $this->dataTypeFactory->getType( $propType );
		} catch ( OutOfBoundsException ) {
			return null;
		}

		/* @phan-suppress-next-line PhanUndeclaredMethod */
		$dataValue = $snak->getDataValue();
		if ( $dataValue::getType() !== $dataType->getDataValueType() ) {
			// Property type and data value type do not match (T372993)
			return null;
		}

		if ( !isset( $this->searchIndexDataFormatters[$propType] ) ) {
			// We do not know how to format these values
			return null;
		}
		$format = $this->searchIndexDataFormatters[$propType];

		$formatted = $format( $dataValue );
		if ( !is_string( $formatted ) ) {
			throw new UnexpectedValueException(
				"Search index data formatter callback for data type '$propType' " .
				" didn't return a string"
			);
		}

		return $formatted === ''
			? null
			: [
				'propertyId' => $snak->getPropertyId()->getSerialization(),
				'value' => $formatted,
			];
	}

	/**
	 * Convert a snak into the string '<property id>=<value>'.
	 *
	 * @param Snak $snak
	 * @param ?string $propType The property data type, if the caller already knows it.
	 * @return ?string Null when the snak cannot be converted to a property ID and value
	 */
	protected function getSnakAsString( Snak $snak, ?string $propType = null ): ?string {
		$parts = $this->getSnakAsPropertyIdAndValue( $snak, $propType );
		if ( $parts === null ) {
			return null;
		}

		[ 'propertyId' => $propertyId, 'value' => $value ] = $parts;

		return $propertyId . self::STATEMENT_SEPARATOR . $value;
	}

	/**
	 * Convert a snak into the string '<property id>=<value>', but ONLY when the property
	 * or its data type has been whitelisted and the property is not explicitly excluded.
	 *
	 * e.g. P180=Q537, P240=1234567
	 *
	 * A failed data type lookup is logged as a warning and treated as "not indexable".
	 *
	 * @param Snak $snak
	 * @param string|null $guid GUID of the statement this snak belongs to
	 * @return null|string
	 */
	protected function getWhitelistedSnakAsString( Snak $snak, $guid ) {
		if ( !$this->snakHasKnownValue( $snak ) ) {
			return null;
		}

		$propertyId = $snak->getPropertyId()->getSerialization();
		if ( array_key_exists( $propertyId, $this->excludedIds ) ) {
			return null;
		}

		try {
			$propType = $this->propertyDataTypeLookup->getDataTypeIdForProperty( $snak->getPropertyId() );
		} catch ( PropertyDataTypeLookupException $e ) {
			// T198091: looks like occasionally we get weird fails on indexing
			// Log them but do not break indexing other data
			$context = [
				'propertyId' => $e->getPropertyId()->getSerialization(),
				'guid' => $guid,
				'exception' => $e,
			];
			$this->logger->warning(
				__METHOD__ . ': Failed to look up property {propertyId} for {guid}',
				$context
			);
			return null;
		}

		$whitelisted = array_key_exists( $propType, $this->indexedTypes )
			|| array_key_exists( $propertyId, $this->propertyIds );

		return $whitelisted ? $this->getSnakAsString( $snak, $propType ) : null;
	}

	/**
	 * Tell whether the snak has a known value, i.e. it is a PropertyValueSnak and NOT a
	 * PropertyNoValueSnak or a PropertySomeValueSnak.
	 *
	 * @param Snak $snak
	 * @return bool
	 */
	protected function snakHasKnownValue( Snak $snak ) {
		return $snak instanceof PropertyValueSnak;
	}

	/**
	 * Build the index mapping configuration for this field.
	 *
	 * @param SearchEngine $engine
	 *
	 * @return array Mapping configuration for CirrusSearch, an empty array for any other engine
	 */
	public function getMapping( SearchEngine $engine ) {
		// Since we need a specially tuned field, we can not use
		// standard search engine types.
		if ( !( $engine instanceof CirrusSearch ) ) {
			// For now only Cirrus/Elastic is supported
			return [];
		}

		return [
			'type' => 'keyword',
			'copy_to' => 'all',
			'normalizer' => 'lowercase_keyword',
			'norms' => false,
			'doc_values' => false,
			'index_options' => 'docs',
			'fields' => [
				// Subfield indexing only property names, so we could do matches
				// like "property exists" without specifying the value.
				'property' => [
					'type' => 'text',
					'analyzer' => 'extract_wb_property',
					'search_analyzer' => 'keyword',
				],
			],
		];
	}

}