Code Coverage for /workspace/src/extensions/CirrusSearch/includes/Parser/QueryStringRegex/KeywordParser.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	100.00% covered (success)	100.00%	92 / 92	100.00% covered (success)	100.00%	4 / 4	CRAP	100.00% covered (success)	100.00%	1 / 1
KeywordParser	100.00% covered (success)	100.00%	92 / 92	100.00% covered (success)	100.00%	4 / 4	25	100.00% covered (success)	100.00%	1 / 1
parse	100.00% covered (success)	100.00%	70 / 70	100.00% covered (success)	100.00%	1 / 1	17
getValueRegex	100.00% covered (success)	100.00%	20 / 20	100.00% covered (success)	100.00%	1 / 1	6
getWarnings	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
addWarning	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1

1	<?php
2
3	namespace CirrusSearch\Parser\QueryStringRegex;
4
5	use CirrusSearch\Parser\AST\KeywordFeatureNode;
6	use CirrusSearch\Parser\AST\NegatedNode;
7	use CirrusSearch\Parser\AST\ParsedNode;
8	use CirrusSearch\Parser\AST\ParseWarning;
9	use CirrusSearch\Query\KeywordFeature;
10	use CirrusSearch\WarningCollector;
11	use Wikimedia\Assert\Assert;
12
13	/**
14	* Parser for KeywordFeature
15	*/
class KeywordParser implements WarningCollector {

	/**
	 * Offset attached to every warning recorded through addWarning().
	 * parse() sets it to the start of the value (right after the colon)
	 * just before handing the value to KeywordFeature::parseValue().
	 *
	 * @var int
	 */
	private $currentOffset;

	/**
	 * Warnings accumulated by addWarning(), in the order they were reported.
	 *
	 * @var ParseWarning[]
	 */
	private $warnings = [];

	/**
	 * Find all the occurrences of a keyword in the query string and build a node
	 * for each of them.
	 *
	 * The search starts at the offset returned by
	 * $tracker->getMinimalUnconsumedOffset( $startOffset ). Matches for which
	 * $tracker->overlap() returns true are dropped. A keyword prefixed with '-'
	 * is wrapped in a NegatedNode.
	 *
	 * @param string $query
	 * @param KeywordFeature $feature
	 * @param OffsetTracker $tracker
	 * @param int $startOffset start offset of the query in $query
	 * @return ParsedNode[] KeywordFeatureNode or NegatedNode instances, in match order
	 */
	public function parse( $query, KeywordFeature $feature, OffsetTracker $tracker, $startOffset = 0 ) {
		if ( $feature->greedy() ) {
			Assert::precondition( !$feature->allowEmptyValue(),
				"greedy keywords must not accept empty value" );
			// XXX: we ignore value delimiter for greedy keywords
			Assert::precondition( $feature->getValueDelimiters() === [ [ 'delimiter' => '"' ] ],
				"getValueDelimiters() must not be overridden with greedy keywords" );
		}
		$offset = $tracker->getMinimalUnconsumedOffset( $startOffset );
		// Alternation of all the (regex-quoted) prefixes this keyword answers to
		$keyListRegex = implode(
			'|',
			array_map(
				static function ( $kw ) {
					return preg_quote( $kw, '/' );
				},
				$feature->getKeywordPrefixes()
			)
		);
		// Hook to the beginning allowing optional spaces if we are a queryHeader
		// otherwise lookbehind allowing begin or space.
		// \G is similar to ^ but also works when an offset is set, as if we ran substr on it
		$begin = $feature->queryHeader() ? '(?:\G[\pZ\pC]*)' : '(?<=\G|[\pZ\pC])';
		$keywordRegex = '(?<key>-?(?:' . $keyListRegex . '))';
		$valueSideRegex = '';
		if ( $feature->hasValue() ) {
			$valueRegex = '(?<value>' . $this->getValueRegex( $feature ) . ')';
			// If we allow empty values we don't allow spaces between
			// the keyword and its value, a space would mean "empty value"
			$spacesAfterSep = $feature->allowEmptyValue() ? '' : '[\pZ\pC]*';
			$valueSideRegex = "{$spacesAfterSep}{$valueRegex}";
		}
		$matches = [];
		preg_match_all( "/{$begin}{$keywordRegex}(?<colon>:){$valueSideRegex}/u",
			$query, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE, $offset );
		$output = [];
		foreach ( $matches as $match ) {
			$key = $match['key'][0];
			Assert::invariant( $feature->hasValue() === isset( $match['value'] ),
				'a value must have matched if the keyword wants a value.' );
			$quotedValue = '';
			$value = '';
			$valueDelimiter = '';
			$valueSuffix = '';
			// The value starts right after the colon
			$valueStart = $match['colon'][1] + strlen( $match['colon'][0] );
			if ( $feature->hasValue() ) {
				$quotedValue = $match['value'][0];
				// With PREG_OFFSET_CAPTURE a group that did not participate in the
				// match is reported with offset -1
				if ( isset( $match['unquoted'] ) && $match['unquoted'][1] >= 0 ) {
					$value = $match['unquoted'][0];
				} else {
					$valueDelimiter = $match['delim'][0];
					// Unescape the delimiters that were escaped inside the quoted value
					$value = str_replace( "\\$valueDelimiter", $valueDelimiter, $match['quoted'][0] );
				}
				if ( isset( $match['suffixes'] ) && $match['suffixes'][1] >= 1 ) {
					$valueSuffix = $match['suffixes'][0];
					// The suffix group closes the value group, so the suffix is exactly
					// the tail of $quotedValue: cut it by length. rtrim() must not be
					// used here, its second argument is a character mask and could
					// strip more than the suffix itself.
					if ( $valueSuffix !== '' ) {
						$quotedValue = substr( $quotedValue, 0, -strlen( $valueSuffix ) );
					}
				}
			}

			$negationChar = '';
			if ( $key[0] === '-' ) {
				$negationChar = $key[0];
				$key = substr( $key, 1 );
			}
			// We take the key as start offset, the whole match can eat some spaces
			// at the beginning for query headers.
			$kwStart = $match['key'][1] + strlen( $negationChar );
			$wholeStart = $match['key'][1];
			// $end is the offset right after the last char of the whole match
			$end = $match[0][1] + strlen( $match[0][0] );
			$parsedValue = null;
			if ( $feature->hasValue() && $quotedValue !== '' ) {
				// Set the current offset so that warnings emitted by the feature
				// are collected at the offset where the value starts
				$this->currentOffset = $valueStart;
				$parsedValue = $feature->parseValue(
					$key, $value, $quotedValue, $valueDelimiter, $valueSuffix, $this );
				if ( $parsedValue === false ) {
					Assert::postcondition( $feature->allowEmptyValue(),
						'Only features accepting empty value can reject a value' );
					// The value was rejected: keep the keyword with an empty value
					// and end the node right after the colon
					$value = '';
					$quotedValue = '';
					$end = $valueStart;
					$parsedValue = null;
				}
			}
			if ( !$tracker->overlap( $wholeStart, $end ) ) {
				$node = new KeywordFeatureNode( $kwStart, $end, $feature, $key, $value, $quotedValue,
					$valueDelimiter, $valueSuffix, $parsedValue );
				if ( $negationChar !== '' ) {
					$node = new NegatedNode( $wholeStart, $end, $node, $negationChar );
				}
				$output[] = $node;
			}
		}
		return $output;
	}

	/**
	 * Build the regex fragment matching the value of the keyword.
	 *
	 * Greedy keywords capture everything up to the end of the line in the
	 * "unquoted" group. Other keywords accept either a quoted value (groups
	 * "delim", "quoted" and optionally "suffixes") or an unquoted one
	 * (group "unquoted").
	 *
	 * @param KeywordFeature $feature
	 * @return string regex fragment, without delimiters nor modifiers
	 */
	private function getValueRegex( KeywordFeature $feature ) {
		Assert::invariant( $feature->hasValue(), __METHOD__ . ' called but hasValue() is false' );
		if ( $feature->greedy() ) {
			// XXX: we send raw value to the keyword
			return '(?<unquoted>.+)';
		} else {
			$quantifier = $feature->allowEmptyValue() ? '*' : '+';
			// Collect all quoted value delimiters (usually only " but can be / for regexes)
			$allDelims = '';
			$optionalSuffixes = [];
			foreach ( $feature->getValueDelimiters() as $delimConfig ) {
				Assert::precondition( strlen( $delimConfig['delimiter'] ) === 1,
					"Value delimiter must be a single byte char" );
				$delim = preg_quote( $delimConfig['delimiter'], '/' );
				$allDelims .= $delim;
				if ( isset( $delimConfig['suffixes'] ) ) {
					// Use lookbehind to only match the suffix if it was used with the proper delimiter
					// i.e i should only be matched in /regex/i not "regex"i
					$optionalSuffixes[] = "(?<=$delim)" . preg_quote( $delimConfig['suffixes'], '/' );
				}
			}
			$quotedValue = "(?<delim>[$allDelims])" . // Capture the delimiter used to use in backreferences
				// use negative lookahead to consume any char that is not the captured delimiter
				// but also accept to escape the captured delimiter
				"(?<quoted>(?:\\\\\g{delim}|(?!\g{delim}).)*)" .
				"\g{delim}";
			if ( $optionalSuffixes ) {
				$quotedValue .= "(?<suffixes>" . implode( '|', $optionalSuffixes ) . ')?';
			}
			// XXX: we support only " to break the unquoted value
			$unquotedValue = "(?<unquoted>[^\"\pZ\pC]$quantifier)";
			return "(?:$quotedValue|$unquotedValue)";
		}
	}

	/**
	 * @return ParseWarning[] the warnings recorded so far
	 */
	public function getWarnings() {
		return $this->warnings;
	}

	/**
	 * Add a warning located at the current offset (see parse()).
	 *
	 * @param string $message i18n message key
	 * @param mixed ...$params
	 */
	public function addWarning( $message, ...$params ) {
		$this->warnings[] = new ParseWarning( $message, $this->currentOffset, [], null, $params );
	}
}