Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
78.44% |
302 / 385 |
|
63.16% |
12 / 19 |
CRAP | |
0.00% |
0 / 1 |
SparqlHelper | |
78.44% |
302 / 385 |
|
63.16% |
12 / 19 |
133.87 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
29 / 29 |
|
100.00% |
1 / 1 |
2 | |||
getQueryPrefixes | |
100.00% |
42 / 42 |
|
100.00% |
1 / 1 |
3 | |||
hasType | |
85.71% |
24 / 28 |
|
0.00% |
0 / 1 |
4.05 | |||
nestedSeparatorFilter | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
1 | |||
findEntitiesWithSameStatement | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
1 | |||
findEntitiesWithSameQualifierOrReference | |
100.00% |
25 / 25 |
|
100.00% |
1 / 1 |
5 | |||
stringLiteral | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getOtherEntities | |
80.00% |
12 / 15 |
|
0.00% |
0 / 1 |
4.13 | |||
getRdfLiteral | |
90.00% |
27 / 30 |
|
0.00% |
0 / 1 |
16.26 | |||
matchesRegularExpression | |
0.00% |
0 / 60 |
|
0.00% |
0 / 1 |
90 | |||
serializeConstraintParameterException | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
deserializeConstraintParameterException | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
matchesRegularExpressionWithSparql | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
2 | |||
isTimeout | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
getCacheMaxAge | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
5 | |||
getThrottling | |
64.29% |
9 / 14 |
|
0.00% |
0 / 1 |
7.64 | |||
getTimestampInFuture | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
runQuery | |
90.00% |
54 / 60 |
|
0.00% |
0 / 1 |
9.08 | |||
guardAgainstTooManyRequestsError | |
88.24% |
15 / 17 |
|
0.00% |
0 / 1 |
4.03 |
1 | <?php |
2 | |
3 | namespace WikibaseQuality\ConstraintReport\ConstraintCheck\Helper; |
4 | |
5 | use DataValues\DataValue; |
6 | use DataValues\MonolingualTextValue; |
7 | use DateInterval; |
8 | use FormatJson; |
9 | use IBufferingStatsdDataFactory; |
10 | use InvalidArgumentException; |
11 | use MapCacheLRU; |
12 | use MediaWiki\Config\Config; |
13 | use MediaWiki\Http\HttpRequestFactory; |
14 | use MWHttpRequest; |
15 | use UnexpectedValueException; |
16 | use WANObjectCache; |
17 | use Wikibase\DataModel\Entity\EntityId; |
18 | use Wikibase\DataModel\Entity\EntityIdParser; |
19 | use Wikibase\DataModel\Entity\EntityIdParsingException; |
20 | use Wikibase\DataModel\Entity\EntityIdValue; |
21 | use Wikibase\DataModel\Entity\PropertyId; |
22 | use Wikibase\DataModel\Services\Lookup\PropertyDataTypeLookup; |
23 | use Wikibase\DataModel\Snak\PropertyValueSnak; |
24 | use Wikibase\DataModel\Statement\Statement; |
25 | use Wikibase\Repo\Rdf\RdfVocabulary; |
26 | use WikibaseQuality\ConstraintReport\Api\ExpiryLock; |
27 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Cache\CachedBool; |
28 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Cache\CachedEntityIds; |
29 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Cache\CachedQueryResults; |
30 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Cache\CachingMetadata; |
31 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Cache\Metadata; |
32 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Context\Context; |
33 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Message\ViolationMessage; |
34 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Message\ViolationMessageDeserializer; |
35 | use WikibaseQuality\ConstraintReport\ConstraintCheck\Message\ViolationMessageSerializer; |
36 | use WikibaseQuality\ConstraintReport\Role; |
37 | use Wikimedia\Timestamp\ConvertibleTimestamp; |
38 | |
39 | /** |
40 | * Class for running a SPARQL query on some endpoint and getting the results. |
41 | * |
42 | * @author Lucas Werkmeister |
43 | * @license GPL-2.0-or-later |
44 | */ |
45 | class SparqlHelper { |
46 | |
47 | /** |
48 | * @var RdfVocabulary |
49 | */ |
50 | private $rdfVocabulary; |
51 | |
52 | /** |
53 | * @var string[] |
54 | */ |
55 | private $entityPrefixes; |
56 | |
57 | /** |
58 | * @var string |
59 | */ |
60 | private $prefixes; |
61 | |
62 | /** |
63 | * @var EntityIdParser |
64 | */ |
65 | private $entityIdParser; |
66 | |
67 | /** |
68 | * @var PropertyDataTypeLookup |
69 | */ |
70 | private $propertyDataTypeLookup; |
71 | |
72 | /** |
73 | * @var WANObjectCache |
74 | */ |
75 | private $cache; |
76 | |
77 | /** |
78 | * @var ViolationMessageSerializer |
79 | */ |
80 | private $violationMessageSerializer; |
81 | |
82 | /** |
83 | * @var ViolationMessageDeserializer |
84 | */ |
85 | private $violationMessageDeserializer; |
86 | |
87 | /** |
88 | * @var IBufferingStatsdDataFactory |
89 | */ |
90 | private $dataFactory; |
91 | |
92 | /** |
93 | * @var LoggingHelper |
94 | */ |
95 | private $loggingHelper; |
96 | |
97 | /** |
98 | * @var string |
99 | */ |
100 | private $defaultUserAgent; |
101 | |
102 | /** |
103 | * @var ExpiryLock |
104 | */ |
105 | private $throttlingLock; |
106 | |
107 | /** |
108 | * @var int stands for: No Retry-After header-field was sent back |
109 | */ |
110 | private const NO_RETRY_AFTER = -1; |
111 | /** |
112 | * @var int stands for: Empty Retry-After header-field was sent back |
113 | */ |
114 | private const EMPTY_RETRY_AFTER = -2; |
115 | /** |
116 | * @var int stands for: Invalid Retry-After header-field was sent back |
117 | * like a string
118 | */ |
119 | private const INVALID_RETRY_AFTER = -3; |
120 | /** |
121 | * @var string ID on which the lock is applied on |
122 | */ |
123 | public const EXPIRY_LOCK_ID = 'SparqlHelper.runQuery'; |
124 | |
125 | /** |
126 | * @var int HTTP response code for too many requests |
127 | */ |
128 | private const HTTP_TOO_MANY_REQUESTS = 429; |
129 | |
130 | /** |
131 | * @var HttpRequestFactory |
132 | */ |
133 | private $requestFactory; |
134 | |
135 | // config variables |
136 | |
137 | /** |
138 | * @var string |
139 | */ |
140 | private $endpoint; |
141 | |
142 | /** |
143 | * @var int |
144 | */ |
145 | private $maxQueryTimeMillis; |
146 | |
147 | /** |
148 | * @var string |
149 | */ |
150 | private $instanceOfId; |
151 | |
152 | /** |
153 | * @var string |
154 | */ |
155 | private $subclassOfId; |
156 | |
157 | /** |
158 | * @var int |
159 | */ |
160 | private $cacheMapSize; |
161 | |
162 | /** |
163 | * @var string[] |
164 | */ |
165 | private $timeoutExceptionClasses; |
166 | |
167 | /** |
168 | * @var bool |
169 | */ |
170 | private $sparqlHasWikibaseSupport; |
171 | |
172 | /** |
173 | * @var int |
174 | */ |
175 | private $sparqlThrottlingFallbackDuration; |
176 | |
/**
 * Stores the injected services, reads the settings this helper needs from $config,
 * and precomputes the entity IRI prefixes and the SPARQL PREFIX header.
 *
 * @param Config $config
 * @param RdfVocabulary $rdfVocabulary
 * @param EntityIdParser $entityIdParser
 * @param PropertyDataTypeLookup $propertyDataTypeLookup
 * @param WANObjectCache $cache
 * @param ViolationMessageSerializer $violationMessageSerializer
 * @param ViolationMessageDeserializer $violationMessageDeserializer
 * @param IBufferingStatsdDataFactory $dataFactory
 * @param ExpiryLock $throttlingLock
 * @param LoggingHelper $loggingHelper
 * @param string $defaultUserAgent sent as the user agent of every SPARQL request
 * @param HttpRequestFactory $requestFactory
 */
public function __construct(
	Config $config,
	RdfVocabulary $rdfVocabulary,
	EntityIdParser $entityIdParser,
	PropertyDataTypeLookup $propertyDataTypeLookup,
	WANObjectCache $cache,
	ViolationMessageSerializer $violationMessageSerializer,
	ViolationMessageDeserializer $violationMessageDeserializer,
	IBufferingStatsdDataFactory $dataFactory,
	ExpiryLock $throttlingLock,
	LoggingHelper $loggingHelper,
	$defaultUserAgent,
	HttpRequestFactory $requestFactory
) {
	// injected services
	$this->rdfVocabulary = $rdfVocabulary;
	$this->entityIdParser = $entityIdParser;
	$this->propertyDataTypeLookup = $propertyDataTypeLookup;
	$this->cache = $cache;
	$this->violationMessageSerializer = $violationMessageSerializer;
	$this->violationMessageDeserializer = $violationMessageDeserializer;
	$this->dataFactory = $dataFactory;
	$this->throttlingLock = $throttlingLock;
	$this->loggingHelper = $loggingHelper;
	$this->requestFactory = $requestFactory;
	$this->defaultUserAgent = $defaultUserAgent;

	// configuration
	$this->endpoint = $config->get( 'WBQualityConstraintsSparqlEndpoint' );
	$this->maxQueryTimeMillis = $config->get( 'WBQualityConstraintsSparqlMaxMillis' );
	$this->instanceOfId = $config->get( 'WBQualityConstraintsInstanceOfId' );
	$this->subclassOfId = $config->get( 'WBQualityConstraintsSubclassOfId' );
	$this->cacheMapSize = $config->get( 'WBQualityConstraintsFormatCacheMapSize' );
	$this->timeoutExceptionClasses = $config->get( 'WBQualityConstraintsSparqlTimeoutExceptionClasses' );
	$this->sparqlHasWikibaseSupport = $config->get( 'WBQualityConstraintsSparqlHasWikibaseSupport' );
	$this->sparqlThrottlingFallbackDuration = (int)$config->get(
		'WBQualityConstraintsSparqlThrottlingFallbackDuration'
	);

	// state derived from the RDF vocabulary
	$entityPrefixes = [];
	foreach ( $rdfVocabulary->entityNamespaceNames as $entityNamespaceName ) {
		$entityPrefixes[] = $rdfVocabulary->getNamespaceURI( $entityNamespaceName );
	}
	$this->entityPrefixes = $entityPrefixes;
	$this->prefixes = $this->getQueryPrefixes( $rdfVocabulary );
}
224 | |
/**
 * Builds the SPARQL PREFIX declarations for the entity namespaces, the statement
 * and value namespaces, the per-source property namespaces and the ontology namespace.
 *
 * @param RdfVocabulary $rdfVocabulary
 *
 * @return string one "PREFIX name: <IRI>" declaration per line, each ending in a newline
 */
private function getQueryPrefixes( RdfVocabulary $rdfVocabulary ) {
	// TODO: it would probably be smarter if RdfVocabulary exposed these prefixes somehow
	$declaration = static function ( $label, $namespaceName ) use ( $rdfVocabulary ) {
		return 'PREFIX ' . $label . ': <' . $rdfVocabulary->getNamespaceURI( $namespaceName ) . ">\n";
	};

	$declarations = [];

	foreach ( $rdfVocabulary->entityNamespaceNames as $entityNamespaceName ) {
		$declarations[] = $declaration( $entityNamespaceName, $entityNamespaceName );
	}

	$declarations[] = $declaration( 'wds', RdfVocabulary::NS_STATEMENT );
	$declarations[] = $declaration( 'wdv', RdfVocabulary::NS_VALUE );

	// the property namespaces declared for every source, in output order
	$propertyNamespaceKinds = [
		RdfVocabulary::NSP_DIRECT_CLAIM,
		RdfVocabulary::NSP_CLAIM,
		RdfVocabulary::NSP_CLAIM_STATEMENT,
		RdfVocabulary::NSP_QUALIFIER,
		RdfVocabulary::NSP_QUALIFIER_VALUE,
		RdfVocabulary::NSP_REFERENCE,
		RdfVocabulary::NSP_REFERENCE_VALUE,
	];
	foreach ( $rdfVocabulary->propertyNamespaceNames as $sourceNamespaces ) {
		foreach ( $propertyNamespaceKinds as $kind ) {
			$propertyNamespaceName = $sourceNamespaces[$kind];
			$declarations[] = $declaration( $propertyNamespaceName, $propertyNamespaceName );
		}
	}

	$declarations[] = $declaration( 'wikibase', RdfVocabulary::NS_ONTOLOGY );

	return implode( '', $declarations );
}
273 | |
/**
 * Asks the SPARQL endpoint whether the entity reaches one of the given classes
 * via zero or more "subclass of" steps. Classes are queried in chunks of 20;
 * the first positive answer ends the search.
 *
 * @param string $id entity ID serialization of the entity to check
 * @param string[] $classes entity ID serializations of the expected types
 *
 * @return CachedBool the answer, with the merged metadata of all queries that were run
 * @throws SparqlHelperException if the query times out or some other error occurs
 */
public function hasType( $id, array $classes ) {
	// TODO hint:gearing is a workaround for T168973 and can hopefully be removed eventually
	$gearingHint = '';
	if ( $this->sparqlHasWikibaseSupport ) {
		$gearingHint = ' hint:Prior hint:gearing "forward".';
	}

	$collectedMetadata = [];

	foreach ( array_chunk( $classes, 20 ) as $chunk ) {
		$prefixedClasses = [];
		foreach ( $chunk as $classId ) {
			$prefixedClasses[] = 'wd:' . $classId;
		}
		$classesValues = implode( ' ', $prefixedClasses );

		$askQuery = <<<EOF
ASK {
BIND(wd:$id AS ?item)
VALUES ?class { $classesValues }
?item wdt:{$this->subclassOfId}* ?class.$gearingHint
}
EOF;

		$response = $this->runQuery( $askQuery );
		$collectedMetadata[] = $response->getMetadata();
		if ( $response->getArray()['boolean'] ) {
			// no need to ask about the remaining chunks
			return new CachedBool( true, Metadata::merge( $collectedMetadata ) );
		}
	}

	return new CachedBool( false, Metadata::merge( $collectedMetadata ) );
}
320 | |
/**
 * Builds the SPARQL fragment that findEntitiesWithSameStatement uses for one separator:
 * four MINUS clauses that drop results where only one of ?statement / ?otherStatement
 * has a given qualifier value, or only one of them is typed wdno:<separator>.
 *
 * @param PropertyId $separator
 * @return string
 */
private function nestedSeparatorFilter( PropertyId $separator ) {
	// the two things that must agree between the statements
	$patterns = [
		"pq:$separator ?qualifier.",
		"a wdno:$separator.",
	];
	// each pattern is checked in both directions
	$subjectOrders = [
		[ '?statement', '?otherStatement' ],
		[ '?otherStatement', '?statement' ],
	];

	$clauses = [];
	foreach ( $patterns as $pattern ) {
		foreach ( $subjectOrders as [ $present, $missing ] ) {
			$clauses[] = "MINUS {\n$present $pattern\nFILTER NOT EXISTS {\n$missing $pattern\n}\n}";
		}
	}

	return implode( "\n", $clauses );
}
357 | |
/**
 * Finds up to 10 other entities that have a non-deprecated statement with the same
 * property and value as the given statement, after applying the separator filters.
 *
 * @param Statement $statement
 * @param PropertyId[] $separators
 *
 * @return CachedEntityIds
 * @throws SparqlHelperException if the query times out or some other error occurs
 */
public function findEntitiesWithSameStatement(
	Statement $statement,
	array $separators
) {
	$guid = $statement->getGuid();
	'@phan-var string $guid'; // statement must have a non-null GUID
	// the wds: local name uses "-" where the GUID has "$"
	$guidForRdf = str_replace( '$', '-', $guid );
	$pid = $statement->getPropertyId()->getSerialization();

	$filterParts = [];
	foreach ( $separators as $separator ) {
		$filterParts[] = $this->nestedSeparatorFilter( $separator );
	}
	$finalSeparatorFilter = implode( "\n", $filterParts );

	$selectQuery = <<<EOF
SELECT DISTINCT ?otherEntity WHERE {
BIND(wds:$guidForRdf AS ?statement)
BIND(p:$pid AS ?p)
BIND(ps:$pid AS ?ps)
?entity ?p ?statement.
?statement ?ps ?value.
?otherStatement ?ps ?value.
?otherEntity ?p ?otherStatement.
FILTER(?otherEntity != ?entity)
MINUS { ?otherStatement wikibase:rank wikibase:DeprecatedRank. }
$finalSeparatorFilter
}
LIMIT 10
EOF;

	return $this->getOtherEntities( $this->runQuery( $selectQuery ) );
}
397 | |
/**
 * Finds up to 10 other entities that have a statement whose qualifier or reference
 * carries the same property and value as the given snak.
 *
 * @param EntityId $entityId The entity ID on the containing entity
 * @param PropertyValueSnak $snak
 * @param string $type Context::TYPE_QUALIFIER or Context::TYPE_REFERENCE
 * @param bool $ignoreDeprecatedStatements Whether to ignore deprecated statements or not.
 *
 * @return CachedEntityIds
 * @throws SparqlHelperException if the query times out or some other error occurs
 */
public function findEntitiesWithSameQualifierOrReference(
	EntityId $entityId,
	PropertyValueSnak $snak,
	$type,
	$ignoreDeprecatedStatements
) {
	$isQualifier = $type === Context::TYPE_QUALIFIER;
	$propertyId = $snak->getPropertyId();

	$eid = $entityId->getSerialization();
	$pid = $propertyId->getSerialization();
	$dataType = $this->propertyDataTypeLookup->getDataTypeIdForProperty( $propertyId );
	[ $value, $isFullValue ] = $this->getRdfLiteral( $dataType, $snak->getDataValue() );

	// pq/pr for simple values, pqv/prv when matching a full value node
	$prefix = $isQualifier ? 'pq' : 'pr';
	if ( $isFullValue ) {
		$prefix .= 'v';
	}

	$path = "$prefix:$pid";
	if ( !$isQualifier ) {
		// references hang off the statement via prov:wasDerivedFrom
		$path = 'prov:wasDerivedFrom/' . $path;
	}

	$deprecatedFilter = $ignoreDeprecatedStatements ?
		'MINUS { ?otherStatement wikibase:rank wikibase:DeprecatedRank. }' :
		'';

	$selectQuery = <<<EOF
SELECT DISTINCT ?otherEntity WHERE {
BIND(wd:$eid AS ?entity)
BIND($value AS ?value)
?entity ?p ?statement.
?statement $path ?value.
?otherStatement $path ?value.
?otherEntity ?otherP ?otherStatement.
FILTER(?otherEntity != ?entity)
$deprecatedFilter
}
LIMIT 10
EOF;

	return $this->getOtherEntities( $this->runQuery( $selectQuery ) );
}
453 | |
/**
 * Return SPARQL code for a string literal with $text as content.
 *
 * Backslashes and double quotes are escaped, and so are line feeds and carriage
 * returns: the SPARQL grammar does not allow those raw inside a "..." literal,
 * so leaving them unescaped would yield a query the endpoint cannot parse.
 *
 * @param string $text
 *
 * @return string the double-quoted, escaped literal
 */
private function stringLiteral( $text ) {
	// strtr() with an array replaces in a single pass, so the backslashes
	// introduced by one replacement are never escaped a second time
	$escaped = strtr( $text, [
		'\\' => '\\\\',
		'"' => '\\"',
		"\n" => '\\n',
		"\r" => '\\r',
	] );

	return '"' . $escaped . '"';
}
464 | |
/**
 * Extract and parse entity IDs from the ?otherEntity column of a SPARQL query result.
 *
 * Each IRI is compared against every known entity prefix in turn. The first prefix
 * that matches and leaves a parseable entity ID wins. An IRI that matches no prefix,
 * or whose remainder cannot be parsed, is represented by null.
 *
 * @param CachedQueryResults $results
 *
 * @return CachedEntityIds one entry (entity ID or null) per result binding,
 * carrying the metadata of $results
 */
private function getOtherEntities( CachedQueryResults $results ) {
	return new CachedEntityIds( array_map(
		function ( $resultBindings ) {
			$entityIRI = $resultBindings['otherEntity']['value'];
			foreach ( $this->entityPrefixes as $entityPrefix ) {
				$entityPrefixLength = strlen( $entityPrefix );
				if ( substr( $entityIRI, 0, $entityPrefixLength ) !== $entityPrefix ) {
					// not this prefix; keep looking instead of giving up after the first one
					continue;
				}

				try {
					return $this->entityIdParser->parse(
						substr( $entityIRI, $entityPrefixLength )
					);
				} catch ( EntityIdParsingException $e ) {
					// not parseable under this prefix; try the remaining prefixes
				}
			}

			return null;
		},
		$results->getArray()['results']['bindings']
	), $results->getMetadata() );
}
496 | |
497 | // phpcs:disable Generic.Metrics.CyclomaticComplexity,Squiz.WhiteSpace.FunctionSpacing |
/**
 * Turn a data value into the SPARQL term (literal, IRI or prefixed name)
 * with which it can be matched in a query.
 *
 * @param string $dataType
 * @param DataValue $dataValue
 *
 * @return array the literal or IRI as a string in SPARQL syntax,
 * and a boolean indicating whether it refers to a full value node or not
 * @throws InvalidArgumentException for an unknown data type, or for a URL or language tag
 * that cannot be written in SPARQL syntax
 */
private function getRdfLiteral( $dataType, DataValue $dataValue ) {
	// IRI-valued terms are wrapped in angle brackets and are never full value nodes
	$iriTerm = static function ( $iri ) {
		return [ '<' . $iri . '>', false ];
	};

	switch ( $dataType ) {
		case 'string':
		case 'external-id':
			return [ $this->stringLiteral( $dataValue->getValue() ), false ];

		case 'commonsMedia':
			return $iriTerm( $this->rdfVocabulary->getMediaFileURI( $dataValue->getValue() ) );

		case 'geo-shape':
			return $iriTerm( $this->rdfVocabulary->getGeoShapeURI( $dataValue->getValue() ) );

		case 'tabular-data':
			return $iriTerm( $this->rdfVocabulary->getTabularDataURI( $dataValue->getValue() ) );

		case 'url':
			$url = $dataValue->getValue();
			if ( preg_match( '/^[^<>"{}\\\\|^`\\x00-\\x20]*$/D', $url ) ) {
				return $iriTerm( $url );
			}
			// not a valid URL for SPARQL (see SPARQL spec, production 139 IRIREF)
			// such an URL should never reach us, so just throw
			throw new InvalidArgumentException( 'invalid URL: ' . $url );

		case 'wikibase-item':
		case 'wikibase-property':
			/** @var EntityIdValue $dataValue */
			'@phan-var EntityIdValue $dataValue';
			return [ 'wd:' . $dataValue->getEntityId()->getSerialization(), false ];

		case 'monolingualtext':
			/** @var MonolingualTextValue $dataValue */
			'@phan-var MonolingualTextValue $dataValue';
			$lang = $dataValue->getLanguageCode();
			if ( preg_match( '/^[a-zA-Z]+(-[a-zA-Z0-9]+)*$/D', $lang ) ) {
				return [ $this->stringLiteral( $dataValue->getText() ) . '@' . $lang, false ];
			}
			// not a valid language tag for SPARQL (see SPARQL spec, production 145 LANGTAG)
			// such a language tag should never reach us, so just throw
			throw new InvalidArgumentException( 'invalid language tag: ' . $lang );

		case 'globe-coordinate':
		case 'quantity':
		case 'time':
			// these are matched through their full value node, identified by the value hash
			// @phan-suppress-next-line PhanUndeclaredMethod
			return [ 'wdv:' . $dataValue->getHash(), true ];

		default:
			throw new InvalidArgumentException( 'unknown data type: ' . $dataType );
	}
}
553 | // phpcs:enable |
554 | |
/**
 * Caching wrapper around matchesRegularExpressionWithSparql.
 *
 * Results are kept in one LRU map per regex (keyed by the hash of the text) in the
 * WAN object cache. A cached entry is either a boolean or a serialized
 * ConstraintParameterException, which is rethrown.
 *
 * @param string $text
 * @param string $regex
 *
 * @return bool
 * @throws SparqlHelperException if the query times out or some other error occurs
 * @throws ConstraintParameterException if the $regex is invalid
 * @throws UnexpectedValueException if the cache holds a value of an unknown type
 */
public function matchesRegularExpression( $text, $regex ) {
	$textHash = hash( 'sha256', $text );
	$cacheKey = $this->cache->makeKey(
		'WikibaseQualityConstraints', // extension
		'regex', // action
		'WDQS-Java', // regex flavor
		hash( 'sha256', $regex )
	);

	// Called by the cache with the previous map (or false) to produce the map to store.
	$refreshCacheMap = function ( $previousMapArray ) use ( $text, $regex, $textHash ) {
		if ( $previousMapArray === false ) {
			// Initialize the cache map if not set
			$this->dataFactory->increment( 'wikibase.quality.constraints.regex.cache.refresh.init' );
			return [];
		}

		$this->dataFactory->increment( 'wikibase.quality.constraints.regex.cache.refresh' );
		$lruMap = MapCacheLRU::newFromArray( $previousMapArray, $this->cacheMapSize );

		if ( $lruMap->has( $textHash ) ) {
			$this->dataFactory->increment( 'wikibase.quality.constraints.regex.cache.refresh.hit' );
			$lruMap->get( $textHash ); // ping cache
			return $lruMap->toArray();
		}

		$this->dataFactory->increment( 'wikibase.quality.constraints.regex.cache.refresh.miss' );
		try {
			$outcome = $this->matchesRegularExpressionWithSparql( $text, $regex );
		} catch ( ConstraintParameterException $e ) {
			// a broken regex is a stable result, so it is cached in serialized form
			$outcome = $this->serializeConstraintParameterException( $e );
		} catch ( SparqlHelperException $e ) {
			// don’t cache this
			return $lruMap->toArray();
		}
		$lruMap->set( $textHash, $outcome, 3 / 8 );

		return $lruMap->toArray();
	};

	$cacheOptions = [
		// Once map is > 1 sec old, consider refreshing
		'ageNew' => 1,
		// Update 5 seconds after "ageNew" given a 1 query/sec cache check rate
		'hotTTR' => 5,
		// avoid querying cache servers multiple times in a request
		// (e. g. when checking format of a reference URL used multiple times on an entity)
		'pcTTL' => WANObjectCache::TTL_PROC_LONG,
	];

	$cacheMapArray = $this->cache->getWithSetCallback(
		$cacheKey,
		WANObjectCache::TTL_DAY,
		$refreshCacheMap,
		$cacheOptions
	);

	if ( !isset( $cacheMapArray[$textHash] ) ) {
		// nothing cached for this text: ask the endpoint directly
		$this->dataFactory->increment( 'wikibase.quality.constraints.regex.cache.miss' );
		return $this->matchesRegularExpressionWithSparql( $text, $regex );
	}

	$this->dataFactory->increment( 'wikibase.quality.constraints.regex.cache.hit' );
	$cached = $cacheMapArray[$textHash];

	if ( is_bool( $cached ) ) {
		return $cached;
	}

	if ( is_array( $cached ) && $cached['type'] == ConstraintParameterException::class ) {
		throw $this->deserializeConstraintParameterException( $cached );
	}

	throw new UnexpectedValueException(
		'Value of unknown type in object cache (' .
		'cache key: ' . $cacheKey . ', ' .
		'cache map key: ' . $textHash . ', ' .
		'value type: ' . gettype( $cached ) . ')'
	);
}
646 | |
/**
 * Convert a ConstraintParameterException into an array with its class name
 * and its serialized violation message.
 *
 * @param ConstraintParameterException $cpe
 * @return array with the keys 'type' and 'violationMessage'
 */
private function serializeConstraintParameterException( ConstraintParameterException $cpe ) {
	$serializedMessage = $this->violationMessageSerializer->serialize(
		$cpe->getViolationMessage()
	);

	return [
		'type' => ConstraintParameterException::class,
		'violationMessage' => $serializedMessage,
	];
}
653 | |
/**
 * Rebuild a ConstraintParameterException from the array produced by
 * serializeConstraintParameterException.
 *
 * @param array $serialization must contain the key 'violationMessage'
 * @return ConstraintParameterException
 */
private function deserializeConstraintParameterException( array $serialization ) {
	return new ConstraintParameterException(
		$this->violationMessageDeserializer->deserialize( $serialization['violationMessage'] )
	);
}
660 | |
/**
 * Evaluate the regex against the text on the SPARQL endpoint, without caching.
 * The regex is anchored so that it has to match the whole text.
 *
 * This function is only public for testing purposes;
 * use matchesRegularExpression, which is equivalent but caches results.
 *
 * @param string $text
 * @param string $regex
 *
 * @return bool
 * @throws SparqlHelperException if the query times out or some other error occurs
 * @throws ConstraintParameterException if the $regex is invalid
 */
public function matchesRegularExpressionWithSparql( $text, $regex ) {
	$anchoredRegex = '^(?:' . $regex . ')$';
	$textStringLiteral = $this->stringLiteral( $text );
	$regexStringLiteral = $this->stringLiteral( $anchoredRegex );

	$selectQuery = "SELECT (REGEX($textStringLiteral, $regexStringLiteral) AS ?matches) {}";

	// the query uses no prefixed names, so the prefixes can be omitted
	$firstRow = $this->runQuery( $selectQuery, false )->getArray()['results']['bindings'][0];

	if ( !array_key_exists( 'matches', $firstRow ) ) {
		// empty result: regex broken
		throw new ConstraintParameterException(
			( new ViolationMessage( 'wbqc-violation-message-parameter-regex' ) )
				->withInlineCode( $regex, Role::CONSTRAINT_PARAMETER_VALUE )
		);
	}

	// true or false ⇒ regex okay, text matches or not
	return $firstRow['matches']['value'] === 'true';
}
694 | |
/**
 * Check whether the text content of an error response indicates a query timeout,
 * i.e. whether it contains one of the configured timeout exception class names.
 *
 * If no (non-empty) class names are configured, nothing counts as a timeout.
 * Without that guard the search pattern would be empty and match every response.
 *
 * @param string $responseContent
 *
 * @return bool
 */
public function isTimeout( $responseContent ) {
	$quotedClassNames = [];
	foreach ( (array)$this->timeoutExceptionClasses as $fqn ) {
		// an empty alternative would match any content, so skip it
		if ( $fqn !== '' ) {
			$quotedClassNames[] = preg_quote( $fqn, '/' );
		}
	}

	if ( $quotedClassNames === [] ) {
		return false;
	}

	$timeoutRegex = implode( '|', $quotedClassNames );
	return (bool)preg_match( '/' . $timeoutRegex . '/', $responseContent );
}
711 | |
/**
 * Determine from the response headers whether the response came from a cache
 * (x-cache-status starting with "hit") and, if so, for how long it may be cached.
 *
 * @param array $responseHeaders see MWHttpRequest::getResponseHeaders()
 *
 * @return int|bool the max-age (in seconds) taken from cache-control,
 * true if the response was cached but no max-age can be determined,
 * false if the response was not cached
 */
public function getCacheMaxAge( $responseHeaders ) {
	$servedFromCache = array_key_exists( 'x-cache-status', $responseHeaders ) &&
		preg_match( '/^hit(?:-.*)?$/', $responseHeaders['x-cache-status'][0] );
	if ( !$servedFromCache ) {
		return false;
	}

	if ( !array_key_exists( 'cache-control', $responseHeaders ) ) {
		return true;
	}

	$captures = [];
	if ( preg_match( '/\bmax-age=(\d+)\b/', $responseHeaders['cache-control'][0], $captures ) ) {
		return (int)$captures[1];
	}

	return true;
}
739 | |
/**
 * Get the point in time until which requests should be held back, as announced by
 * the Retry-After header of a 429 response (sent when there were too many SPARQL
 * requests). The header format is defined in RFC 7231, see
 * https://tools.ietf.org/html/rfc7231#section-7.1.3
 *
 * @param MWHttpRequest $request
 *
 * @return int|ConvertibleTimestamp the timestamp taken from the header,
 * or SparqlHelper::NO_RETRY_AFTER if there is no Retry-After header
 * or SparqlHelper::EMPTY_RETRY_AFTER if there is an empty Retry-After
 * or SparqlHelper::INVALID_RETRY_AFTER if there is something wrong with the format
 */
public function getThrottling( MWHttpRequest $request ) {
	$headerValue = $request->getResponseHeader( 'Retry-After' );
	if ( $headerValue === null ) {
		return self::NO_RETRY_AFTER;
	}

	$headerValue = trim( $headerValue );
	if ( $headerValue === '' ) {
		return self::EMPTY_RETRY_AFTER;
	}

	if ( !is_numeric( $headerValue ) ) {
		// not a number of seconds, so try to read it as a date
		$unixTime = strtotime( $headerValue );
		return $unixTime === false ?
			self::INVALID_RETRY_AFTER :
			new ConvertibleTimestamp( $unixTime );
	}

	$delaySeconds = (int)$headerValue;
	if ( $delaySeconds < 0 ) {
		return self::INVALID_RETRY_AFTER;
	}

	return $this->getTimestampInFuture( new DateInterval( 'PT' . $delaySeconds . 'S' ) );
}
776 | |
/**
 * Return the current time shifted forward by the given interval.
 *
 * @param DateInterval $delta
 * @return ConvertibleTimestamp
 */
private function getTimestampInFuture( DateInterval $delta ) {
	$shifted = ( new ConvertibleTimestamp() )->timestamp->add( $delta );
	return new ConvertibleTimestamp( $shifted );
}
781 | |
/**
 * Runs a query against the configured endpoint and returns the results.
 * TODO: See if Sparql Client in core can be used instead of rolling our own
 *
 * Nothing is sent while the throttling lock is held. The prefixes are only prepended
 * if the caller needs them and the endpoint has no built-in Wikibase support.
 * Timing, cache and error statistics are reported through the stats data factory.
 *
 * @param string $query The query, unencoded (plain string).
 * @param bool $needsPrefixes Whether the query requires prefixes or they can be omitted.
 *
 * @return CachedQueryResults
 *
 * @throws TooManySparqlRequestsException if requests are currently throttled,
 * or the endpoint answers with HTTP 429
 * @throws SparqlHelperException if the query times out or some other error occurs
 */
public function runQuery( $query, $needsPrefixes = true ) {
	if ( $this->throttlingLock->isLocked( self::EXPIRY_LOCK_ID ) ) {
		$this->dataFactory->increment( 'wikibase.quality.constraints.sparql.throttling' );
		throw new TooManySparqlRequestsException();
	}

	if ( $needsPrefixes && !$this->sparqlHasWikibaseSupport ) {
		$query = $this->prefixes . $query;
	}
	// marker comment at the top of every query sent by this helper
	$query = "#wbqc\n" . $query;

	$url = $this->endpoint . '?' . http_build_query(
		[
			'query' => $query,
			'format' => 'json',
			'maxQueryTimeMillis' => $this->maxQueryTimeMillis,
		],
		'',
		// This URL is sent over HTTP, not embedded in HTML, so the separator must be
		// a plain "&" even if the arg_separator.output ini setting is something else
		// (e.g. "&amp;"), which would corrupt the query string.
		'&',
		// encode spaces with %20, not +
		PHP_QUERY_RFC3986
	);

	$options = [
		'method' => 'GET',
		// allow one second more than the endpoint's own query time limit
		'timeout' => (int)round( ( $this->maxQueryTimeMillis + 1000 ) / 1000 ),
		'connectTimeout' => 'default',
		'userAgent' => $this->defaultUserAgent,
	];
	$request = $this->requestFactory->create( $url, $options, __METHOD__ );
	$startTime = microtime( true );
	$requestStatus = $request->execute();
	$endTime = microtime( true );
	$this->dataFactory->timing(
		'wikibase.quality.constraints.sparql.timing',
		( $endTime - $startTime ) * 1000
	);

	$this->guardAgainstTooManyRequestsError( $request );

	$maxAge = $this->getCacheMaxAge( $request->getResponseHeaders() );
	if ( $maxAge ) {
		$this->dataFactory->increment( 'wikibase.quality.constraints.sparql.cached' );
	}

	if ( $requestStatus->isOK() ) {
		$json = $request->getContent();
		$jsonStatus = FormatJson::parse( $json, FormatJson::FORCE_ASSOC );
		if ( $jsonStatus->isOK() ) {
			// NOTE(review): getCacheMaxAge() can return boolean true (cache hit without
			// a max-age); confirm that CachingMetadata::ofMaximumAgeInSeconds() accepts that.
			return new CachedQueryResults(
				$jsonStatus->getValue(),
				Metadata::ofCachingMetadata(
					$maxAge ?
						CachingMetadata::ofMaximumAgeInSeconds( $maxAge ) :
						CachingMetadata::fresh()
				)
			);
		} else {
			$jsonErrorCode = $jsonStatus->getErrors()[0]['message'];
			$this->dataFactory->increment(
				"wikibase.quality.constraints.sparql.error.json.$jsonErrorCode"
			);
			// fall through to general error handling
		}
	} else {
		$this->dataFactory->increment(
			"wikibase.quality.constraints.sparql.error.http.{$request->getStatus()}"
		);
		// fall through to general error handling
	}

	$this->dataFactory->increment( 'wikibase.quality.constraints.sparql.error' );

	if ( $this->isTimeout( $request->getContent() ) ) {
		$this->dataFactory->increment(
			'wikibase.quality.constraints.sparql.error.timeout'
		);
	}

	throw new SparqlHelperException();
}
878 | |
/**
 * Handle a potential “too many requests” error.
 *
 * Does nothing unless the response status is 429. Otherwise the throttling lock is set,
 * either until the time announced by the Retry-After header or, if that header is
 * missing or unusable, for the configured fallback duration. The error is then reported
 * to the caller.
 *
 * @param MWHttpRequest $request
 * @throws TooManySparqlRequestsException if the response status is 429
 * @throws InvalidArgumentException if the configured fallback duration is negative
 */
private function guardAgainstTooManyRequestsError( MWHttpRequest $request ): void {
	if ( $request->getStatus() !== self::HTTP_TOO_MANY_REQUESTS ) {
		return;
	}

	$fallbackBlockDuration = $this->sparqlThrottlingFallbackDuration;
	if ( $fallbackBlockDuration < 0 ) {
		throw new InvalidArgumentException(
			'Fallback duration must be positive int but is: ' . $fallbackBlockDuration
		);
	}

	$this->dataFactory->increment( 'wikibase.quality.constraints.sparql.throttling' );

	$throttlingUntil = $this->getThrottling( $request );
	if ( $throttlingUntil instanceof ConvertibleTimestamp ) {
		$this->loggingHelper->logSparqlHelperTooManyRequestsRetryAfterPresent( $throttlingUntil, $request );
		$this->throttlingLock->lock( self::EXPIRY_LOCK_ID, $throttlingUntil );
	} else {
		// no usable Retry-After header: block for the configured fallback duration
		$this->loggingHelper->logSparqlHelperTooManyRequestsRetryAfterInvalid( $request );
		$fallbackUntil = $this->getTimestampInFuture(
			new DateInterval( 'PT' . $fallbackBlockDuration . 'S' )
		);
		$this->throttlingLock->lock( self::EXPIRY_LOCK_ID, $fallbackUntil );
	}

	throw new TooManySparqlRequestsException();
}
911 | |
912 | } |