Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
92 / 92 |
|
100.00% |
4 / 4 |
CRAP | |
100.00% |
1 / 1 |
KeywordParser | |
100.00% |
92 / 92 |
|
100.00% |
4 / 4 |
25 | |
100.00% |
1 / 1 |
parse | |
100.00% |
70 / 70 |
|
100.00% |
1 / 1 |
17 | |||
getValueRegex | |
100.00% |
20 / 20 |
|
100.00% |
1 / 1 |
6 | |||
getWarnings | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
addWarning | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Parser\QueryStringRegex; |
4 | |
5 | use CirrusSearch\Parser\AST\KeywordFeatureNode; |
6 | use CirrusSearch\Parser\AST\NegatedNode; |
7 | use CirrusSearch\Parser\AST\ParsedNode; |
8 | use CirrusSearch\Parser\AST\ParseWarning; |
9 | use CirrusSearch\Query\KeywordFeature; |
10 | use CirrusSearch\WarningCollector; |
11 | use Wikimedia\Assert\Assert; |
12 | |
/**
 * Parser for KeywordFeature.
 *
 * Builds a regular expression from the keyword prefixes and value delimiters
 * declared by a KeywordFeature, runs it against the query string and turns every
 * non-overlapping match into a KeywordFeatureNode (wrapped in a NegatedNode when
 * the keyword is prefixed with "-").
 *
 * The parser is also the WarningCollector handed to KeywordFeature::parseValue():
 * warnings raised while a value is parsed are recorded at the offset of that value.
 */
class KeywordParser implements WarningCollector {

	/**
	 * Offset (in bytes, as reported by PREG_OFFSET_CAPTURE) attached to the
	 * warnings recorded through addWarning(). parse() sets it to the start of
	 * the value right before asking the feature to parse it.
	 *
	 * Initialised to 0 so that a warning added before any value has been parsed
	 * still carries an int offset.
	 *
	 * @var int
	 */
	private $currentOffset = 0;

	/**
	 * Warnings collected so far, in the order they were added.
	 *
	 * @var ParseWarning[]
	 */
	private $warnings = [];

	/**
	 * Find all the occurrences of $feature in $query.
	 *
	 * Matches whose range overlaps a range already known to $tracker are dropped.
	 * This method does not register the returned nodes in $tracker.
	 *
	 * @param string $query
	 * @param KeywordFeature $feature
	 * @param OffsetTracker $tracker
	 * @param int $startOffset start offset of the query in $query
	 * @return ParsedNode[]
	 */
	public function parse( $query, KeywordFeature $feature, OffsetTracker $tracker, $startOffset = 0 ) {
		if ( $feature->greedy() ) {
			Assert::precondition( !$feature->allowEmptyValue(),
				"greedy keywords must not accept empty value" );
			// XXX: we ignore value delimiter for greedy keywords
			Assert::precondition( $feature->getValueDelimiters() === [ [ 'delimiter' => '"' ] ],
				"getValueDelimiters() must not be overridden with greedy keywords" );
		}
		$offset = $tracker->getMinimalUnconsumedOffset( $startOffset );
		// Alternation of all the (regex-escaped) prefixes accepted by the feature
		$keyListRegex = implode(
			'|',
			array_map(
				static function ( $kw ) {
					return preg_quote( $kw, '/' );
				},
				$feature->getKeywordPrefixes()
			)
		);
		// Hook to the beginning allowing optional spaces if we are a queryHeader
		// otherwise lookbehind allowing begin or space.
		// \G is similar to ^ but anchors at the offset passed to preg_match_all,
		// as if we had run substr on the query first.
		$begin = $feature->queryHeader() ? '(?:\G[\pZ\pC]*)' : '(?<=\G|[\pZ\pC])';
		// The optional leading "-" (negation) is captured as part of the key
		$keywordRegex = '(?<key>-?(?:' . $keyListRegex . '))';
		$valueSideRegex = '';
		if ( $feature->hasValue() ) {
			$valueRegex = '(?<value>' . $this->getValueRegex( $feature ) . ')';
			// If we allow empty values we don't allow spaces between
			// the keyword and its value, a space would mean "empty value"
			$spacesAfterSep = $feature->allowEmptyValue() ? '' : '[\pZ\pC]*';
			$valueSideRegex = "{$spacesAfterSep}{$valueRegex}";
		}
		$matches = [];
		preg_match_all( "/{$begin}{$keywordRegex}(?<colon>:){$valueSideRegex}/u",
			$query, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE, $offset );
		$output = [];
		foreach ( $matches as $match ) {
			// With PREG_OFFSET_CAPTURE every group is a [ text, offset ] pair
			$key = $match['key'][0];
			Assert::invariant( $feature->hasValue() === isset( $match['value'] ),
				'a value must have matched if the keyword wants a value.' );
			$quotedValue = '';
			$value = '';
			$valueDelimiter = '';
			$valueSuffix = '';
			// The value starts right after the colon
			$valueStart = $match['colon'][1] + strlen( $match['colon'][0] );
			if ( $feature->hasValue() ) {
				$quotedValue = $match['value'][0];
				if ( isset( $match['unquoted'] ) && $match['unquoted'][1] >= 0 ) {
					$value = $match['unquoted'][0];
				} else {
					// Quoted form: unescape the delimiter inside the value
					$valueDelimiter = $match['delim'][0];
					$value = str_replace( "\\$valueDelimiter", $valueDelimiter, $match['quoted'][0] );
				}
				if ( isset( $match['suffixes'] ) && $match['suffixes'][1] >= 1 ) {
					$valueSuffix = $match['suffixes'][0];
					// The suffix is the tail of the "value" group: cut exactly its length.
					// rtrim() is not suited here, its second argument is a character
					// mask (with ".." ranges), not a suffix string.
					if ( $valueSuffix !== '' ) {
						$quotedValue = substr( $quotedValue, 0, -strlen( $valueSuffix ) );
					}
				}
			}

			$negationChar = '';
			if ( $key[0] === '-' ) {
				$negationChar = $key[0];
				$key = substr( $key, 1 );
			}
			// We take the key as start offset, the whole match can eat some spaces
			// at the beginning for query headers.
			$kwStart = $match['key'][1] + strlen( $negationChar );
			$wholeStart = $match['key'][1];
			// $end is the offset right after the whole match
			$end = $match[0][1] + strlen( $match[0][0] );
			$parsedValue = null;
			if ( $feature->hasValue() && $quotedValue !== '' ) {
				// Set the current offset so that warnings raised by the feature
				// are collected at the offset of the value
				$this->currentOffset = $valueStart;
				$parsedValue = $feature->parseValue(
					$key, $value, $quotedValue, $valueDelimiter, $valueSuffix, $this );
				if ( $parsedValue === false ) {
					Assert::postcondition( $feature->allowEmptyValue(),
						'Only features accepting empty value can reject a value' );
					// The value was rejected: keep the keyword with an empty value
					// and make the node stop right after the colon
					$value = '';
					$quotedValue = '';
					$end = $valueStart;
					$parsedValue = null;
				}
			}
			if ( !$tracker->overlap( $wholeStart, $end ) ) {
				$node = new KeywordFeatureNode( $kwStart, $end, $feature, $key, $value, $quotedValue,
					$valueDelimiter, $valueSuffix, $parsedValue );
				if ( $negationChar !== '' ) {
					// The negated node also covers the leading "-"
					$node = new NegatedNode( $wholeStart, $end, $node, $negationChar );
				}
				$output[] = $node;
			}
		}
		return $output;
	}

	/**
	 * Build the regex fragment matching the value of $feature.
	 *
	 * The fragment defines the named groups read by parse(): "unquoted" for a
	 * bare value, or "delim", "quoted" and optionally "suffixes" for a
	 * delimited value.
	 *
	 * @param KeywordFeature $feature
	 * @return string
	 */
	private function getValueRegex( KeywordFeature $feature ) {
		Assert::invariant( $feature->hasValue(), __METHOD__ . ' called but hasValue() is false' );
		if ( $feature->greedy() ) {
			// XXX: we send raw value to the keyword
			return '(?<unquoted>.+)';
		} else {
			$quantifier = $feature->allowEmptyValue() ? '*' : '+';
			// Collect all quoted value delimiters (usually only " but can be / for regexes)
			$allDelims = '';
			$optionalSuffixes = [];
			foreach ( $feature->getValueDelimiters() as $delimConfig ) {
				Assert::precondition( strlen( $delimConfig['delimiter'] ) === 1,
					"Value delimiter must be a single byte char" );
				$delim = preg_quote( $delimConfig['delimiter'], '/' );
				$allDelims .= $delim;
				if ( isset( $delimConfig['suffixes'] ) ) {
					// Use lookbehind to only match the suffix if it was used with the proper delimiter
					// i.e i should only be matched in /regex/i not "regex"i
					$optionalSuffixes[] = "(?<=$delim)" . preg_quote( $delimConfig['suffixes'], '/' );
				}
			}
			$quotedValue = "(?<delim>[$allDelims])" . // Capture the delimiter used to use in backreferences
				// use negative lookahead to consume any char that is not the captured delimiter
				// but also accept to escape the captured delimiter
				"(?<quoted>(?:\\\\\g{delim}|(?!\g{delim}).)*)" .
				"\g{delim}";
			if ( $optionalSuffixes ) {
				$quotedValue .= "(?<suffixes>" . implode( '|', $optionalSuffixes ) . ')?';
			}
			// XXX: we support only " to break the unquoted value
			$unquotedValue = "(?<unquoted>[^\"\pZ\pC]$quantifier)";
			return "(?:$quotedValue|$unquotedValue)";
		}
	}

	/**
	 * Warnings recorded through addWarning() since this parser was created.
	 *
	 * @return ParseWarning[]
	 */
	public function getWarnings() {
		return $this->warnings;
	}

	/**
	 * Add a warning, positioned at the offset of the value currently being parsed.
	 *
	 * @param string $message i18n message key
	 * @param mixed ...$params
	 */
	public function addWarning( $message, ...$params ) {
		$this->warnings[] = new ParseWarning( $message, $this->currentOffset, [], null, $params );
	}
}