Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
79.15% |
315 / 398 |
|
63.16% |
12 / 19 |
CRAP | |
0.00% |
0 / 1 |
SparqlHelper | |
79.15% |
315 / 398 |
|
63.16% |
12 / 19 |
138.05 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
29 / 29 |
|
100.00% |
1 / 1 |
3 | |||
getQueryPrefixes | |
100.00% |
42 / 42 |
|
100.00% |
1 / 1 |
3 | |||
hasType | |
85.71% |
24 / 28 |
|
0.00% |
0 / 1 |
4.05 | |||
nestedSeparatorFilter | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
1 | |||
findEntitiesWithSameStatement | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
2 | |||
findEntitiesWithSameQualifierOrReference | |
100.00% |
27 / 27 |
|
100.00% |
1 / 1 |
6 | |||
stringLiteral | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getOtherEntities | |
87.50% |
21 / 24 |
|
0.00% |
0 / 1 |
5.05 | |||
getRdfLiteral | |
90.00% |
27 / 30 |
|
0.00% |
0 / 1 |
16.26 | |||
matchesRegularExpression | |
0.00% |
0 / 60 |
|
0.00% |
0 / 1 |
90 | |||
serializeConstraintParameterException | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
deserializeConstraintParameterException | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
matchesRegularExpressionWithSparql | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
2 | |||
isTimeout | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
getCacheMaxAge | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
5 | |||
getThrottling | |
64.29% |
9 / 14 |
|
0.00% |
0 / 1 |
7.64 | |||
getTimestampInFuture | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
runQuery | |
90.00% |
54 / 60 |
|
0.00% |
0 / 1 |
9.08 | |||
guardAgainstTooManyRequestsError | |
88.24% |
15 / 17 |
|
0.00% |
0 / 1 |
4.03 |
1 | <?php |
2 | |
3 | declare( strict_types = 1 ); |
4 | |
5 | namespace WikibaseQuality\ConstraintReport\ConstraintCheck\Helper; |
6 | |
7 | use DataValues\DataValue; |
8 | use DataValues\MonolingualTextValue; |
9 | use DateInterval; |
10 | use InvalidArgumentException; |
11 | use MapCacheLRU; |
12 | use MediaWiki\Config\Config; |
13 | use MediaWiki\Http\HttpRequestFactory; |
14 | use MediaWiki\Json\FormatJson; |
15 | use MWHttpRequest; |
16 | use UnexpectedValueException; |
17 | use Wikibase\DataModel\Entity\EntityId; |
18 | use Wikibase\DataModel\Entity\EntityIdParser; |
19 | use Wikibase\DataModel\Entity\EntityIdParsingException; |
20 | use Wikibase\DataModel\Entity\EntityIdValue; |
21 | use Wikibase\DataModel\Entity\PropertyId; |
22 | use Wikibase\DataModel\Services\Lookup\PropertyDataTypeLookup; |
23 | use Wikibase\DataModel\Snak\PropertyValueSnak; |
24 | use Wikibase\DataModel\Statement\Statement; |
25 | use Wikibase\Repo\Rdf\RdfVocabulary; |
26 | use WikibaseQuality\ConstraintReport\Api\ExpiryLock; |
27 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Cache\CachedBool; |
28 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Cache\CachedEntityIds; |
29 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Cache\CachedQueryResults; |
30 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Cache\CachingMetadata; |
31 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Cache\Metadata; |
32 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Context\Context; |
33 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Message\ViolationMessage; |
34 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Message\ViolationMessageDeserializer; |
35 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Message\ViolationMessageSerializer; |
36 | use WikibaseQuality\ConstraintReport\Role; |
37 | use Wikimedia\ObjectCache\WANObjectCache; |
38 | use Wikimedia\Stats\IBufferingStatsdDataFactory; |
39 | use Wikimedia\Timestamp\ConvertibleTimestamp; |
40 | |
41 | /** |
42 | * Class for running a SPARQL query on some endpoint and getting the results. |
43 | * |
44 | * @author Lucas Werkmeister |
45 | * @license GPL-2.0-or-later |
46 | */ |
47 | class SparqlHelper { |
48 | |
/** Vocabulary used to look up namespace URIs and media / geo-shape / tabular-data URIs. */
private RdfVocabulary $rdfVocabulary;

/**
 * Namespace URIs of the vocabulary's entity namespaces, collected in the constructor;
 * getOtherEntities() strips them from result IRIs to recover entity IDs.
 *
 * @var string[]
 */
private array $entityPrefixes;

/** PREFIX declarations built by getQueryPrefixes(); runQuery() prepends them when needed. */
private string $prefixes;

/** Parses the ID part of entity IRIs found in query results. */
private EntityIdParser $entityIdParser;

/** Resolves the data type of a snak's property before building its RDF literal. */
private PropertyDataTypeLookup $propertyDataTypeLookup;

/** Object cache holding the per-regex result maps used by matchesRegularExpression(). */
private WANObjectCache $cache;

/** Serializes the violation message of a ConstraintParameterException for caching. */
private ViolationMessageSerializer $violationMessageSerializer;

/** Restores a violation message from its cached serialization. */
private ViolationMessageDeserializer $violationMessageDeserializer;

/** Receives the counters and timings emitted by this class. */
private IBufferingStatsdDataFactory $dataFactory;

/** Logs details about HTTP 429 responses. */
private LoggingHelper $loggingHelper;

/** Value of the 'userAgent' option of every outgoing request. */
private string $defaultUserAgent;

/** Lock that blocks further queries after an HTTP 429 response. */
private ExpiryLock $throttlingLock;

/**
 * @var int returned by getThrottling(): no Retry-After header field was sent back
 */
private const NO_RETRY_AFTER = -1;
/**
 * @var int returned by getThrottling(): an empty Retry-After header field was sent back
 */
private const EMPTY_RETRY_AFTER = -2;
/**
 * @var int returned by getThrottling(): an invalid Retry-After header field was sent back,
 * like a string
 */
private const INVALID_RETRY_AFTER = -3;
/**
 * @var string ID under which the throttling lock is held
 */
public const EXPIRY_LOCK_ID = 'SparqlHelper.runQuery';

/**
 * @var int HTTP response code for too many requests
 */
private const HTTP_TOO_MANY_REQUESTS = 429;

/** Creates the HTTP requests sent by runQuery(). */
private HttpRequestFactory $requestFactory;

/** Config 'WBQualityConstraintsSparqlEndpoint': the endpoint every query is sent to. */
private string $primaryEndpoint;

/**
 * Config 'WBQualityConstraintsAdditionalSparqlEndpoints' (empty array if unset):
 * endpoints that the findEntitiesWith…() queries are also sent to.
 *
 * @var string[]
 */
private array $additionalEndpoints;

/** Config 'WBQualityConstraintsSparqlMaxMillis': query time limit in milliseconds. */
private int $maxQueryTimeMillis;

/** Config 'WBQualityConstraintsSubclassOfId': property ID used in the hasType() property path. */
private string $subclassOfId;

/** Config 'WBQualityConstraintsFormatCacheMapSize': size passed to the regex result LRU map. */
private int $cacheMapSize;

/**
 * Config 'WBQualityConstraintsSparqlTimeoutExceptionClasses':
 * class names that isTimeout() searches for in error responses.
 *
 * @var string[]
 */
private array $timeoutExceptionClasses;

/**
 * Config 'WBQualityConstraintsSparqlHasWikibaseSupport': when true, runQuery() never
 * prepends prefixes and hasType() adds a gearing hint to its query.
 */
private bool $sparqlHasWikibaseSupport;

/**
 * Config 'WBQualityConstraintsSparqlThrottlingFallbackDuration': lock duration in seconds
 * used when a 429 response carries no usable Retry-After header.
 */
private int $sparqlThrottlingFallbackDuration;
/**
 * Store the injected services and read all SPARQL-related settings from $config.
 *
 * @param Config $config provides the WBQualityConstraints* settings read below
 * @param RdfVocabulary $rdfVocabulary source of the entity namespace URIs and query prefixes
 * @param EntityIdParser $entityIdParser
 * @param PropertyDataTypeLookup $propertyDataTypeLookup
 * @param WANObjectCache $cache
 * @param ViolationMessageSerializer $violationMessageSerializer
 * @param ViolationMessageDeserializer $violationMessageDeserializer
 * @param IBufferingStatsdDataFactory $dataFactory
 * @param ExpiryLock $throttlingLock
 * @param LoggingHelper $loggingHelper
 * @param string $defaultUserAgent user agent sent with every request
 * @param HttpRequestFactory $requestFactory
 */
public function __construct(
	Config $config,
	RdfVocabulary $rdfVocabulary,
	EntityIdParser $entityIdParser,
	PropertyDataTypeLookup $propertyDataTypeLookup,
	WANObjectCache $cache,
	ViolationMessageSerializer $violationMessageSerializer,
	ViolationMessageDeserializer $violationMessageDeserializer,
	IBufferingStatsdDataFactory $dataFactory,
	ExpiryLock $throttlingLock,
	LoggingHelper $loggingHelper,
	string $defaultUserAgent,
	HttpRequestFactory $requestFactory
) {
	$this->rdfVocabulary = $rdfVocabulary;
	$this->entityIdParser = $entityIdParser;
	$this->propertyDataTypeLookup = $propertyDataTypeLookup;
	$this->cache = $cache;
	$this->violationMessageSerializer = $violationMessageSerializer;
	$this->violationMessageDeserializer = $violationMessageDeserializer;
	$this->dataFactory = $dataFactory;
	$this->throttlingLock = $throttlingLock;
	$this->loggingHelper = $loggingHelper;
	$this->defaultUserAgent = $defaultUserAgent;
	$this->requestFactory = $requestFactory;

	// Remember the URI of every entity namespace so that result IRIs can be mapped back to IDs.
	$this->entityPrefixes = [];
	foreach ( $rdfVocabulary->entityNamespaceNames as $namespaceName ) {
		$this->entityPrefixes[] = $rdfVocabulary->getNamespaceURI( $namespaceName );
	}

	$this->primaryEndpoint = $config->get( 'WBQualityConstraintsSparqlEndpoint' );
	// A falsy setting (e.g. null) means "no additional endpoints".
	$this->additionalEndpoints = $config->get( 'WBQualityConstraintsAdditionalSparqlEndpoints' ) ?: [];
	$this->maxQueryTimeMillis = $config->get( 'WBQualityConstraintsSparqlMaxMillis' );
	$this->subclassOfId = $config->get( 'WBQualityConstraintsSubclassOfId' );
	$this->cacheMapSize = $config->get( 'WBQualityConstraintsFormatCacheMapSize' );
	$this->timeoutExceptionClasses = $config->get(
		'WBQualityConstraintsSparqlTimeoutExceptionClasses'
	);
	$this->sparqlHasWikibaseSupport = $config->get(
		'WBQualityConstraintsSparqlHasWikibaseSupport'
	);
	$this->sparqlThrottlingFallbackDuration = (int)$config->get(
		'WBQualityConstraintsSparqlThrottlingFallbackDuration'
	);

	$this->prefixes = $this->getQueryPrefixes( $rdfVocabulary );
}
170 | |
/**
 * Assemble the PREFIX declarations that make queries self-contained:
 * one line per entity namespace, then wds and wdv, then seven lines per
 * property source, and finally the wikibase ontology prefix.
 *
 * @param RdfVocabulary $rdfVocabulary
 * @return string newline-terminated PREFIX lines
 */
private function getQueryPrefixes( RdfVocabulary $rdfVocabulary ): string {
	// TODO: it would probably be smarter that RdfVocabulary exposed these prefixes somehow
	$declaration = static function ( $label, $namespaceName ) use ( $rdfVocabulary ) {
		return "PREFIX {$label}: <{$rdfVocabulary->getNamespaceURI( $namespaceName )}>\n";
	};

	// Property namespaces are emitted in exactly this order for every source.
	$propertyNamespaceKinds = [
		RdfVocabulary::NSP_DIRECT_CLAIM,
		RdfVocabulary::NSP_CLAIM,
		RdfVocabulary::NSP_CLAIM_STATEMENT,
		RdfVocabulary::NSP_QUALIFIER,
		RdfVocabulary::NSP_QUALIFIER_VALUE,
		RdfVocabulary::NSP_REFERENCE,
		RdfVocabulary::NSP_REFERENCE_VALUE,
	];

	$lines = [];
	foreach ( $rdfVocabulary->entityNamespaceNames as $entityNamespaceName ) {
		$lines[] = $declaration( $entityNamespaceName, $entityNamespaceName );
	}
	$lines[] = $declaration( 'wds', RdfVocabulary::NS_STATEMENT );
	$lines[] = $declaration( 'wdv', RdfVocabulary::NS_VALUE );

	foreach ( $rdfVocabulary->propertyNamespaceNames as $namespacesOfSource ) {
		foreach ( $propertyNamespaceKinds as $kind ) {
			$propertyNamespaceName = $namespacesOfSource[$kind];
			$lines[] = $declaration( $propertyNamespaceName, $propertyNamespaceName );
		}
	}
	$lines[] = $declaration( 'wikibase', RdfVocabulary::NS_ONTOLOGY );

	return implode( '', $lines );
}
219 | |
/**
 * Ask the primary endpoint whether the entity reaches any of the given classes
 * through zero or more steps of the configured "subclass of" property.
 * The classes are checked in chunks of 20; the first positive answer ends the search.
 *
 * @param string $id entity ID serialization of the entity to check
 * @param string[] $classes entity ID serializations of the expected types
 *
 * @return CachedBool the answer, with the metadata of all queries that were run
 * @throws SparqlHelperException if the query times out or some other error occurs
 */
public function hasType( string $id, array $classes ): CachedBool {
	// TODO hint:gearing is a workaround for T168973 and can hopefully be removed eventually
	$gearingHint = '';
	if ( $this->sparqlHasWikibaseSupport ) {
		$gearingHint = ' hint:Prior hint:gearing "forward".';
	}

	$collectedMetadata = [];
	$found = false;

	foreach ( array_chunk( $classes, 20 ) as $classesChunk ) {
		$prefixedClasses = array_map(
			static fn ( $class ) => 'wd:' . $class,
			$classesChunk
		);
		$classesValues = implode( ' ', $prefixedClasses );

		$query = <<<EOF
ASK {
BIND(wd:$id AS ?item)
VALUES ?class { $classesValues }
?item wdt:{$this->subclassOfId}* ?class.$gearingHint
}
EOF;

		$answer = $this->runQuery( $query, $this->primaryEndpoint );
		$collectedMetadata[] = $answer->getMetadata();
		if ( $answer->getArray()['boolean'] ) {
			$found = true;
			break;
		}
	}

	return new CachedBool( $found, Metadata::merge( $collectedMetadata ) );
}
266 | |
/**
 * Build the SPARQL fragment used by findEntitiesWithSameStatement for one separator:
 * four MINUS groups that discard pairs where only one of ?statement / ?otherStatement
 * has a given pq: qualifier value, or only one of them is typed wdno: for the separator.
 *
 * @param PropertyId $separator
 * @return string
 */
private function nestedSeparatorFilter( PropertyId $separator ): string {
	return <<<EOF
MINUS {
?statement pq:{$separator} ?qualifier.
FILTER NOT EXISTS {
?otherStatement pq:{$separator} ?qualifier.
}
}
MINUS {
?otherStatement pq:{$separator} ?qualifier.
FILTER NOT EXISTS {
?statement pq:{$separator} ?qualifier.
}
}
MINUS {
?statement a wdno:{$separator}.
FILTER NOT EXISTS {
?otherStatement a wdno:{$separator}.
}
}
MINUS {
?otherStatement a wdno:{$separator}.
FILTER NOT EXISTS {
?statement a wdno:{$separator}.
}
}
EOF;
}
300 | |
/**
 * Find up to ten other entities (per endpoint) that have a non-deprecated statement
 * with the same property and value as the given statement, after applying the
 * separator filters. The query runs on the primary and all additional endpoints.
 *
 * @param Statement $statement
 * @param PropertyId[] $separators
 *
 * @return CachedEntityIds
 * @throws SparqlHelperException if the query times out or some other error occurs
 */
public function findEntitiesWithSameStatement( Statement $statement, array $separators ): CachedEntityIds {
	$pid = $statement->getPropertyId()->getSerialization();
	$guid = $statement->getGuid();
	'@phan-var string $guid'; // statement must have a non-null GUID
	// the statement IRI is built from the GUID with "$" replaced by "-"
	$guidForRdf = str_replace( '$', '-', $guid );

	$finalSeparatorFilter = implode( "\n", array_map(
		fn ( $separator ) => $this->nestedSeparatorFilter( $separator ),
		$separators
	) );

	$query = <<<EOF
SELECT DISTINCT ?otherEntity WHERE {
BIND(wds:$guidForRdf AS ?statement)
BIND(p:$pid AS ?p)
BIND(ps:$pid AS ?ps)
?entity ?p ?statement.
?statement ?ps ?value.
?otherStatement ?ps ?value.
?otherEntity ?p ?otherStatement.
FILTER(?otherEntity != ?entity)
MINUS { ?otherStatement wikibase:rank wikibase:DeprecatedRank. }
$finalSeparatorFilter
}
LIMIT 10
EOF;

	// primary endpoint first, then the additional ones in configured order
	$endpoints = array_merge( [ $this->primaryEndpoint ], array_values( $this->additionalEndpoints ) );
	$results = [];
	foreach ( $endpoints as $endpoint ) {
		$results[] = $this->runQuery( $query, $endpoint );
	}

	return $this->getOtherEntities( $results );
}
340 | |
/**
 * Find up to ten other entities (per endpoint) that carry a statement whose
 * qualifiers or references contain the same property and value as the given snak.
 * The query runs on the primary and all additional endpoints.
 *
 * @param EntityId $entityId The entity ID on the containing entity
 * @param PropertyValueSnak $snak
 * @param string $type Context::TYPE_QUALIFIER or Context::TYPE_REFERENCE
 * @param bool $ignoreDeprecatedStatements Whether to ignore deprecated statements or not.
 *
 * @return CachedEntityIds
 * @throws SparqlHelperException if the query times out or some other error occurs
 */
public function findEntitiesWithSameQualifierOrReference(
	EntityId $entityId,
	PropertyValueSnak $snak,
	string $type,
	bool $ignoreDeprecatedStatements
): CachedEntityIds {
	$isQualifier = $type === Context::TYPE_QUALIFIER;

	$eid = $entityId->getSerialization();
	$pid = $snak->getPropertyId()->getSerialization();
	$dataValue = $snak->getDataValue();
	$dataType = $this->propertyDataTypeLookup->getDataTypeIdForProperty(
		$snak->getPropertyId()
	);
	[ $value, $isFullValue ] = $this->getRdfLiteral( $dataType, $dataValue );

	// pq / pr for simple values, pqv / prv when matching a full value node
	$prefix = ( $isQualifier ? 'pq' : 'pr' ) . ( $isFullValue ? 'v' : '' );
	$path = "$prefix:$pid";
	if ( !$isQualifier ) {
		// references hang off the statement via prov:wasDerivedFrom
		$path = 'prov:wasDerivedFrom/' . $path;
	}

	$deprecatedFilter = $ignoreDeprecatedStatements ?
		'MINUS { ?otherStatement wikibase:rank wikibase:DeprecatedRank. }' :
		'';

	$query = <<<EOF
SELECT DISTINCT ?otherEntity WHERE {
BIND(wd:$eid AS ?entity)
BIND($value AS ?value)
?entity ?p ?statement.
?statement $path ?value.
?otherStatement $path ?value.
?otherEntity ?otherP ?otherStatement.
FILTER(?otherEntity != ?entity)
$deprecatedFilter
}
LIMIT 10
EOF;

	// primary endpoint first, then the additional ones in configured order
	$endpoints = array_merge( [ $this->primaryEndpoint ], array_values( $this->additionalEndpoints ) );
	$results = [];
	foreach ( $endpoints as $endpoint ) {
		$results[] = $this->runQuery( $query, $endpoint );
	}

	return $this->getOtherEntities( $results );
}
399 | |
/**
 * Return SPARQL code for a double-quoted string literal with $text as content.
 *
 * Backslashes and double quotes are escaped, and so are line feeds and carriage
 * returns: the SPARQL grammar does not allow those two characters to appear
 * unescaped inside a "…" literal, so leaving them raw would produce an invalid query.
 *
 * @param string $text
 * @return string
 */
private function stringLiteral( string $text ): string {
	// strtr() with an array replaces in a single pass, so inserted backslashes are never re-escaped
	$escaped = strtr( $text, [
		'\\' => '\\\\',
		'"' => '\\"',
		"\n" => '\\n',
		"\r" => '\\r',
	] );
	return '"' . $escaped . '"';
}
406 | |
/**
 * Extract and parse entity IDs from the ?otherEntity column of SPARQL query results.
 *
 * Each IRI is matched against every known entity prefix in turn; IRIs that match
 * no prefix, or whose remainder cannot be parsed as an entity ID, are dropped.
 * Duplicate IDs are removed.
 *
 * @param CachedQueryResults[] $results
 *
 * @return CachedEntityIds the distinct entity IDs, with the merged metadata of all results
 */
private function getOtherEntities( array $results ): CachedEntityIds {
	$metadatas = [];
	$bindingLists = [];
	foreach ( $results as $result ) {
		$metadatas[] = $result->getMetadata();
		$bindingLists[] = $result->getArray()['results']['bindings'];
	}
	// merge once instead of re-copying the accumulated array for every result
	$allResultBindings = array_merge( [], ...$bindingLists );

	$parseEntityIri = function ( $entityIRI ) {
		foreach ( $this->entityPrefixes as $entityPrefix ) {
			if ( !str_starts_with( $entityIRI, $entityPrefix ) ) {
				continue;
			}
			try {
				return $this->entityIdParser->parse(
					substr( $entityIRI, strlen( $entityPrefix ) )
				);
			} catch ( EntityIdParsingException $e ) {
				// not a valid ID under this prefix; keep trying the remaining prefixes
			}
		}

		return null;
	};

	$entityIds = array_map(
		static fn ( $resultBindings ) => $parseEntityIri( $resultBindings['otherEntity']['value'] ),
		$allResultBindings
	);

	return new CachedEntityIds(
		// array_unique() compares string forms; array_filter() then drops the nulls
		array_values( array_filter( array_unique( $entityIds ) ) ),
		Metadata::merge( $metadatas )
	);
}
451 | |
// phpcs:disable Generic.Metrics.CyclomaticComplexity,Squiz.WhiteSpace.FunctionSpacing
/**
 * Turn a data value into the SPARQL term that matches it in a query.
 *
 * @param string $dataType data type ID of the property the value belongs to
 * @param DataValue $dataValue
 *
 * @return array two elements: the literal or IRI as a string in SPARQL syntax,
 * and a boolean that is true when the term refers to a full value node (wdv:)
 * @throws InvalidArgumentException for an unknown data type, or for a URL or
 * language tag that cannot be written in SPARQL syntax
 */
private function getRdfLiteral( string $dataType, DataValue $dataValue ): array {
	$iriRef = static fn ( $iri ) => '<' . $iri . '>';

	switch ( $dataType ) {
		case 'string':
		case 'external-id':
			return [ $this->stringLiteral( $dataValue->getValue() ), false ];

		case 'commonsMedia':
			return [ $iriRef( $this->rdfVocabulary->getMediaFileURI( $dataValue->getValue() ) ), false ];

		case 'geo-shape':
			return [ $iriRef( $this->rdfVocabulary->getGeoShapeURI( $dataValue->getValue() ) ), false ];

		case 'tabular-data':
			return [ $iriRef( $this->rdfVocabulary->getTabularDataURI( $dataValue->getValue() ) ), false ];

		case 'url':
			$rawUrl = $dataValue->getValue();
			if ( preg_match( '/^[^<>"{}\\\\|^`\\x00-\\x20]*$/D', $rawUrl ) !== 1 ) {
				// not a valid URL for SPARQL (see SPARQL spec, production 139 IRIREF)
				// such an URL should never reach us, so just throw
				throw new InvalidArgumentException( 'invalid URL: ' . $rawUrl );
			}
			return [ $iriRef( $rawUrl ), false ];

		case 'wikibase-item':
		case 'wikibase-property':
			/** @var EntityIdValue $dataValue */
			'@phan-var EntityIdValue $dataValue';
			return [ 'wd:' . $dataValue->getEntityId()->getSerialization(), false ];

		case 'monolingualtext':
			/** @var MonolingualTextValue $dataValue */
			'@phan-var MonolingualTextValue $dataValue';
			$languageCode = $dataValue->getLanguageCode();
			if ( preg_match( '/^[a-zA-Z]+(-[a-zA-Z0-9]+)*$/D', $languageCode ) !== 1 ) {
				// not a valid language tag for SPARQL (see SPARQL spec, production 145 LANGTAG)
				// such a language tag should never reach us, so just throw
				throw new InvalidArgumentException( 'invalid language tag: ' . $languageCode );
			}
			return [ $this->stringLiteral( $dataValue->getText() ) . '@' . $languageCode, false ];

		case 'globe-coordinate':
		case 'quantity':
		case 'time':
			// these types are matched through their full value node, identified by the value hash
			// @phan-suppress-next-line PhanUndeclaredMethod
			return [ 'wdv:' . $dataValue->getHash(), true ];

		default:
			throw new InvalidArgumentException( 'unknown data type: ' . $dataType );
	}
}
// phpcs:enable
506 | |
/**
 * Check whether $text matches $regex, caching results per regex.
 *
 * Results are kept in the object cache under a key derived from the regex hash.
 * The cached value is the array form of an LRU map from text hash to either a
 * boolean result or a serialized ConstraintParameterException. If the map has
 * no entry for this text after the cache lookup, the query is run directly.
 *
 * @param string $text
 * @param string $regex
 *
 * @return bool
 * @throws SparqlHelperException if the query times out or some other error occurs
 * @throws ConstraintParameterException if the $regex is invalid
 * @throws UnexpectedValueException if the cache holds a value of an unknown shape
 */
public function matchesRegularExpression( string $text, string $regex ): bool {
	// caching wrapper around matchesRegularExpressionWithSparql

	$textHash = hash( 'sha256', $text );
	$cacheKey = $this->cache->makeKey(
		'WikibaseQualityConstraints', // extension
		'regex', // action
		'WDQS-Java', // regex flavor
		hash( 'sha256', $regex )
	);

	$cacheMapArray = $this->cache->getWithSetCallback(
		$cacheKey,
		WANObjectCache::TTL_DAY,
		function ( $cacheMapArray ) use ( $text, $regex, $textHash ) {
			// Initialize the cache map if not set
			if ( $cacheMapArray === false ) {
				$key = 'wikibase.quality.constraints.regex.cache.refresh.init';
				$this->dataFactory->increment( $key );
				return [];
			}

			$key = 'wikibase.quality.constraints.regex.cache.refresh';
			$this->dataFactory->increment( $key );
			$cacheMap = MapCacheLRU::newFromArray( $cacheMapArray, $this->cacheMapSize );
			if ( $cacheMap->has( $textHash ) ) {
				$key = 'wikibase.quality.constraints.regex.cache.refresh.hit';
				$this->dataFactory->increment( $key );
				$cacheMap->get( $textHash ); // ping cache
			} else {
				$key = 'wikibase.quality.constraints.regex.cache.refresh.miss';
				$this->dataFactory->increment( $key );
				try {
					$matches = $this->matchesRegularExpressionWithSparql( $text, $regex );
				} catch ( ConstraintParameterException $e ) {
					// an invalid regex is a cacheable outcome too: store it in serialized form
					$matches = $this->serializeConstraintParameterException( $e );
				} catch ( SparqlHelperException $e ) {
					// don’t cache this
					return $cacheMap->toArray();
				}
				// NOTE(review): third argument is passed through to MapCacheLRU::set;
				// presumably the insertion rank within the LRU order, confirm against MapCacheLRU
				$cacheMap->set(
					$textHash,
					$matches,
					3 / 8
				);
			}

			return $cacheMap->toArray();
		},
		[
			// Once map is > 1 sec old, consider refreshing
			'ageNew' => 1,
			// Update 5 seconds after "ageNew" given a 1 query/sec cache check rate
			'hotTTR' => 5,
			// avoid querying cache servers multiple times in a request
			// (e. g. when checking format of a reference URL used multiple times on an entity)
			'pcTTL' => WANObjectCache::TTL_PROC_LONG,
		]
	);

	if ( isset( $cacheMapArray[$textHash] ) ) {
		$key = 'wikibase.quality.constraints.regex.cache.hit';
		$this->dataFactory->increment( $key );
		$matches = $cacheMapArray[$textHash];
		if ( is_bool( $matches ) ) {
			return $matches;
		} elseif ( is_array( $matches ) &&
			// strict check with a guarded lookup, so that a malformed array falls through
			// to the UnexpectedValueException below instead of raising a warning first
			( $matches['type'] ?? null ) === ConstraintParameterException::class ) {
			throw $this->deserializeConstraintParameterException( $matches );
		} else {
			throw new UnexpectedValueException(
				'Value of unknown type in object cache (' .
				'cache key: ' . $cacheKey . ', ' .
				'cache map key: ' . $textHash . ', ' .
				'value type: ' . get_debug_type( $matches ) . ')'
			);
		}
	} else {
		$key = 'wikibase.quality.constraints.regex.cache.miss';
		$this->dataFactory->increment( $key );
		return $this->matchesRegularExpressionWithSparql( $text, $regex );
	}
}
594 | |
/**
 * Convert the exception into a plain array that can be stored in the regex cache:
 * the exception class name under 'type' and its serialized violation message
 * under 'violationMessage'.
 *
 * @param ConstraintParameterException $cpe
 * @return array
 */
private function serializeConstraintParameterException( ConstraintParameterException $cpe ): array {
	$serializedMessage = $this->violationMessageSerializer->serialize( $cpe->getViolationMessage() );

	return [
		'type' => ConstraintParameterException::class,
		'violationMessage' => $serializedMessage,
	];
}
601 | |
/**
 * Rebuild a ConstraintParameterException from the array produced by
 * serializeConstraintParameterException; only 'violationMessage' is read.
 *
 * @param array $serialization
 * @return ConstraintParameterException
 */
private function deserializeConstraintParameterException( array $serialization ): ConstraintParameterException {
	return new ConstraintParameterException(
		$this->violationMessageDeserializer->deserialize( $serialization['violationMessage'] )
	);
}
608 | |
/**
 * Evaluate the regex on the primary endpoint, anchored so that it must match the whole text.
 *
 * This function is only public for testing purposes;
 * use matchesRegularExpression, which is equivalent but caches results.
 *
 * @param string $text
 * @param string $regex
 *
 * @return bool
 * @throws SparqlHelperException if the query times out or some other error occurs
 * @throws ConstraintParameterException if the $regex is invalid
 */
public function matchesRegularExpressionWithSparql( string $text, string $regex ): bool {
	$textStringLiteral = $this->stringLiteral( $text );
	$regexStringLiteral = $this->stringLiteral( sprintf( '^(?:%s)$', $regex ) );

	$query = <<<EOF
SELECT (REGEX($textStringLiteral, $regexStringLiteral) AS ?matches) {}
EOF;

	// the query uses no prefixed names, so prefixes are not needed
	$response = $this->runQuery( $query, $this->primaryEndpoint, false );
	$firstRow = $response->getArray()['results']['bindings'][0];

	if ( !array_key_exists( 'matches', $firstRow ) ) {
		// empty result: regex broken
		$message = new ViolationMessage( 'wbqc-violation-message-parameter-regex' );
		throw new ConstraintParameterException(
			$message->withInlineCode( $regex, Role::CONSTRAINT_PARAMETER_VALUE )
		);
	}

	// true or false ⇒ regex okay, text matches or not
	return $firstRow['matches']['value'] === 'true';
}
638 | |
/**
 * Check whether the text content of an error response indicates a query timeout,
 * i.e. whether it contains one of the configured timeout exception class names.
 *
 * With no class names configured, nothing is treated as a timeout.
 *
 * @param string $responseContent
 * @return bool
 */
public function isTimeout( string $responseContent ): bool {
	foreach ( $this->timeoutExceptionClasses as $fqn ) {
		if ( str_contains( $responseContent, $fqn ) ) {
			return true;
		}
	}

	return false;
}
651 | |
/**
 * Determine from the response headers whether the response came from a cache,
 * and if so, how long it may be cached.
 *
 * A response counts as cached when the first x-cache-status value is "hit"
 * or starts with "hit-".
 *
 * @param array $responseHeaders see MWHttpRequest::getResponseHeaders()
 *
 * @return int|bool the max-age (in seconds) from the first cache-control value,
 * true for a cached response without a max-age, or false for an uncached response
 */
public function getCacheMaxAge( array $responseHeaders ) {
	$isCacheHit = array_key_exists( 'x-cache-status', $responseHeaders ) &&
		preg_match( '/^hit(?:-.*)?$/', $responseHeaders['x-cache-status'][0] );
	if ( !$isCacheHit ) {
		return false;
	}

	if ( !array_key_exists( 'cache-control', $responseHeaders ) ) {
		return true;
	}

	$captures = [];
	if ( preg_match( '/\bmax-age=(\d+)\b/', $responseHeaders['cache-control'][0], $captures ) ) {
		return (int)$captures[1];
	}

	return true;
}
679 | |
/**
 * Get the point in time until which requests should be held back, as announced by the
 * Retry-After header of a 429 response caused by too many SPARQL requests.
 * The header format is defined in RFC 7231, see
 * https://tools.ietf.org/html/rfc7231#section-7.1.3
 *
 * A numeric value is read as a delay in seconds from now; any other value is
 * handed to strtotime().
 *
 * @param MWHttpRequest $request
 *
 * @return int|ConvertibleTimestamp the timestamp,
 * or SparqlHelper::NO_RETRY_AFTER if there is no Retry-After header
 * or SparqlHelper::EMPTY_RETRY_AFTER if there is an empty Retry-After
 * or SparqlHelper::INVALID_RETRY_AFTER if there is something wrong with the format
 */
public function getThrottling( MWHttpRequest $request ) {
	$retryAfterValue = $request->getResponseHeader( 'Retry-After' );
	if ( $retryAfterValue === null ) {
		return self::NO_RETRY_AFTER;
	}

	$headerText = trim( $retryAfterValue );
	if ( $headerText === '' ) {
		return self::EMPTY_RETRY_AFTER;
	}

	if ( !is_numeric( $headerText ) ) {
		// not a number of seconds: try to read it as a date
		$unixTime = strtotime( $headerText );
		return $unixTime === false ?
			self::INVALID_RETRY_AFTER :
			new ConvertibleTimestamp( $unixTime );
	}

	$delaySeconds = (int)$headerText;
	if ( $delaySeconds < 0 ) {
		return self::INVALID_RETRY_AFTER;
	}

	return $this->getTimestampInFuture( new DateInterval( 'PT' . $delaySeconds . 'S' ) );
}
716 | |
/**
 * Return a timestamp that lies $delta after the current time.
 *
 * @param DateInterval $delta
 * @return ConvertibleTimestamp
 */
private function getTimestampInFuture( DateInterval $delta ): ConvertibleTimestamp {
	$now = new ConvertibleTimestamp();
	return new ConvertibleTimestamp( $now->timestamp->add( $delta ) );
}
721 | |
/**
 * Runs a query against the given endpoint and returns the results.
 * TODO: See if Sparql Client in core can be used instead of rolling our own
 *
 * The query is refused without any request while the throttling lock is held.
 * Otherwise it is sent as a GET request; a 429 response sets the lock and
 * throws, a successful response with valid JSON is returned together with
 * its caching metadata, and every other outcome is counted and reported as
 * a SparqlHelperException.
 *
 * @param string $query The query, unencoded (plain string).
 * @param string $endpoint The endpoint to query.
 * @param bool $needsPrefixes Whether the query requires prefixes or they can be omitted.
 * Ignored (treated as false) when the endpoint is configured as having Wikibase support.
 *
 * @return CachedQueryResults
 *
 * @throws SparqlHelperException if the query times out or some other error occurs
 */
protected function runQuery( string $query, string $endpoint, bool $needsPrefixes = true ): CachedQueryResults {
	// a previous 429 response is still in effect: do not send anything
	if ( $this->throttlingLock->isLocked( self::EXPIRY_LOCK_ID ) ) {
		$this->dataFactory->increment( 'wikibase.quality.constraints.sparql.throttling' );
		throw new TooManySparqlRequestsException();
	}

	if ( $this->sparqlHasWikibaseSupport ) {
		$needsPrefixes = false;
	}

	if ( $needsPrefixes ) {
		$query = $this->prefixes . $query;
	}
	// every query starts with a "#wbqc" comment line
	$query = "#wbqc\n" . $query;

	$url = $endpoint . '?' . http_build_query(
		[
			'query' => $query,
			'format' => 'json',
			'maxQueryTimeMillis' => $this->maxQueryTimeMillis,
		],
		// second and third argument only fill the positions before the encoding type
		'', ini_get( 'arg_separator.output' ),
		// encode spaces with %20, not +
		PHP_QUERY_RFC3986
	);

	$options = [
		'method' => 'GET',
		// HTTP timeout in seconds: the query time limit plus one second
		'timeout' => (int)round( ( $this->maxQueryTimeMillis + 1000 ) / 1000 ),
		'connectTimeout' => 'default',
		'userAgent' => $this->defaultUserAgent,
	];
	$request = $this->requestFactory->create( $url, $options, __METHOD__ );
	// report the wall-clock duration of the request in milliseconds
	$startTime = microtime( true );
	$requestStatus = $request->execute();
	$endTime = microtime( true );
	$this->dataFactory->timing(
		'wikibase.quality.constraints.sparql.timing',
		( $endTime - $startTime ) * 1000
	);

	// throws (after setting the throttling lock) if the response status is 429
	$this->guardAgainstTooManyRequestsError( $request );

	// int max-age, true (cached, no max-age) or false (not cached)
	$maxAge = $this->getCacheMaxAge( $request->getResponseHeaders() );
	if ( $maxAge ) {
		$this->dataFactory->increment( 'wikibase.quality.constraints.sparql.cached' );
	}

	if ( $requestStatus->isOK() ) {
		$json = $request->getContent();
		$jsonStatus = FormatJson::parse( $json, FormatJson::FORCE_ASSOC );
		if ( $jsonStatus->isOK() ) {
			return new CachedQueryResults(
				$jsonStatus->getValue(),
				Metadata::ofCachingMetadata(
					$maxAge ?
						CachingMetadata::ofMaximumAgeInSeconds( $maxAge ) :
						CachingMetadata::fresh()
				)
			);
		} else {
			// count JSON errors per message of the first reported error
			$jsonErrorCode = $jsonStatus->getErrors()[0]['message'];
			$this->dataFactory->increment(
				"wikibase.quality.constraints.sparql.error.json.$jsonErrorCode"
			);
			// fall through to general error handling
		}
	} else {
		// count HTTP errors per status code
		$this->dataFactory->increment(
			"wikibase.quality.constraints.sparql.error.http.{$request->getStatus()}"
		);
		// fall through to general error handling
	}

	$this->dataFactory->increment( 'wikibase.quality.constraints.sparql.error' );

	// timeouts are counted separately, but reported to the caller like any other error
	if ( $this->isTimeout( $request->getContent() ) ) {
		$this->dataFactory->increment(
			'wikibase.quality.constraints.sparql.error.timeout'
		);
	}

	throw new SparqlHelperException();
}
818 | |
/**
 * Handle a potential “too many requests” error.
 *
 * Does nothing unless the response status is 429. Otherwise the throttling lock
 * is set, either until the time announced by the Retry-After header or, if that
 * header is unusable, for the configured fallback duration, and the request is
 * reported as throttled.
 *
 * @param MWHttpRequest $request
 * @throws TooManySparqlRequestsException if the response status is 429
 * @throws InvalidArgumentException if the configured fallback duration is negative
 */
private function guardAgainstTooManyRequestsError( MWHttpRequest $request ): void {
	if ( $request->getStatus() !== self::HTTP_TOO_MANY_REQUESTS ) {
		return;
	}

	$fallbackSeconds = $this->sparqlThrottlingFallbackDuration;
	if ( $fallbackSeconds < 0 ) {
		throw new InvalidArgumentException(
			"Fallback duration must be positive int but is: {$fallbackSeconds}"
		);
	}

	$this->dataFactory->increment( 'wikibase.quality.constraints.sparql.throttling' );

	$lockUntil = $this->getThrottling( $request );
	if ( $lockUntil instanceof ConvertibleTimestamp ) {
		$this->loggingHelper->logSparqlHelperTooManyRequestsRetryAfterPresent( $lockUntil, $request );
	} else {
		// missing, empty or invalid Retry-After header: fall back to the configured duration
		$this->loggingHelper->logSparqlHelperTooManyRequestsRetryAfterInvalid( $request );
		$lockUntil = $this->getTimestampInFuture( new DateInterval( 'PT' . $fallbackSeconds . 'S' ) );
	}
	$this->throttlingLock->lock( self::EXPIRY_LOCK_ID, $lockUntil );

	throw new TooManySparqlRequestsException();
}
851 | |
852 | } |