Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
97.85% |
319 / 326 |
|
85.19% |
23 / 27 |
CRAP | |
0.00% |
0 / 1 |
QueryStringRegexParser | |
97.85% |
319 / 326 |
|
85.19% |
23 / 27 |
102 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
2 | |||
reInit | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
cleanup | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
6 | |||
parse | |
100.00% |
41 / 41 |
|
100.00% |
1 / 1 |
10 | |||
createClause | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
expression | |
94.44% |
68 / 72 |
|
0.00% |
0 / 1 |
20.07 | |||
createBoolNode | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
collapseWords | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
mergeWords | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
fallbackToWord | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
unexpectedEOF | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
negatedLeaf | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
leaf | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
explicitlyNegatedNode | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
3 | |||
isLeaf | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
parseKeywords | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
nextToken | |
96.15% |
25 / 26 |
|
0.00% |
0 / 1 |
9 | |||
consumeWS | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
consumeBoolOp | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
consumePhrase | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
consumeWord | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
consumeUnbalancedPhrase | |
92.86% |
13 / 14 |
|
0.00% |
0 / 1 |
6.01 | |||
advance | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
boolToOccur | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
extractRequiredNamespaces | |
93.75% |
15 / 16 |
|
0.00% |
0 / 1 |
3.00 | |||
parseNsHeader | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
4 | |||
checkQueryLen | |
100.00% |
18 / 18 |
|
100.00% |
1 / 1 |
6 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Parser\QueryStringRegex; |
4 | |
5 | use CirrusSearch\Parser\AST\BooleanClause; |
6 | use CirrusSearch\Parser\AST\EmptyQueryNode; |
7 | use CirrusSearch\Parser\AST\KeywordFeatureNode; |
8 | use CirrusSearch\Parser\AST\NamespaceHeaderNode; |
9 | use CirrusSearch\Parser\AST\NegatedNode; |
10 | use CirrusSearch\Parser\AST\ParsedBooleanNode; |
11 | use CirrusSearch\Parser\AST\ParsedNode; |
12 | use CirrusSearch\Parser\AST\ParsedQuery; |
13 | use CirrusSearch\Parser\AST\ParseWarning; |
14 | use CirrusSearch\Parser\AST\PhraseQueryNode; |
15 | use CirrusSearch\Parser\AST\Visitor\KeywordNodeVisitor; |
16 | use CirrusSearch\Parser\AST\WordsQueryNode; |
17 | use CirrusSearch\Parser\NamespacePrefixParser; |
18 | use CirrusSearch\Parser\ParsedQueryClassifiersRepository; |
19 | use CirrusSearch\Parser\QueryParser; |
20 | use CirrusSearch\Query\KeywordFeature; |
21 | use CirrusSearch\Query\PrefixFeature; |
22 | use CirrusSearch\Search\Escaper; |
23 | use CirrusSearch\Util; |
24 | use LogicException; |
25 | use Message; |
26 | use Wikimedia\Assert\Assert; |
27 | |
28 | /** |
29 | * Full text query parser that uses regex to parse its token. |
30 | * |
31 | * Far from being a state of the art parser it detects most of its |
32 | * tokens using regular expression. And make arbitrary decisions |
33 | * at tokenization. |
34 | * |
35 | * The tokenizer will understand few token types: |
36 | * - WHITESPACE: all unicode whitespace and control chars ([\pZ\pC]) |
37 | * the WHITESPACE token is ignored and never presented to the parser |
38 | * - EOF: dummy type used to mark end of string |
39 | * - BOOL_AND/BOOL_OR/BOOL_NOT: explicit boolean opeartors |
40 | * - PARSED_NODE: complex type (usually part of the query) |
41 | * |
42 | * PARSED_NODE is a type that groups: |
43 | * - Keywords |
44 | * - Phrase |
45 | * - Words |
46 | * - Wildcards/Prefix |
47 | * |
48 | * Phrase does not have its own token " and is part the tokenization and is never exposed |
49 | * to the parser. |
50 | * Same for negation prefix (! and -), they are parsed at tokenization time. |
51 | * |
52 | * NOTE that this parser is broken by design: |
53 | * - no lexical context support, we first parse keywords |
54 | * - no support for groupings (parenthesis) |
55 | */ |
56 | class QueryStringRegexParser implements QueryParser { |
57 | /** |
58 | * Whitespace regex including unicode and some control chars |
59 | */ |
60 | private const WHITESPACE_REGEX = '/\G[\pZ\pC]+/u'; |
61 | |
62 | public const QUERY_LEN_HARD_LIMIT = 4096; |
63 | |
64 | /** |
65 | * see T66350 |
66 | */ |
67 | private const GERSHAYIM_REGEX = '/(\p{L}{2,})(?:")(\p{L})(?=[^\p{L}]|$)/u'; |
68 | |
69 | /** |
70 | * Supported explicit boolean operator |
71 | * |
72 | */ |
73 | private const EXPLICIT_BOOLEAN_OPERATOR = '/\G(?:(?<AND>AND|&&)|(?<OR>OR|\|\|)|(?<NOT>NOT))(?![^\pZ\pC"])/u'; |
74 | |
75 | /** |
76 | * @var \CirrusSearch\Parser\KeywordRegistry |
77 | */ |
78 | private $keywordRegistry; |
79 | |
80 | /** |
81 | * @var Escaper |
82 | */ |
83 | private $escaper; |
84 | |
85 | /** |
86 | * @var ParsedQueryClassifiersRepository |
87 | */ |
88 | private $classifierRepository; |
89 | |
90 | /** |
91 | * @var string|null user query (null when not yet cleaned up) |
92 | */ |
93 | private $query; |
94 | |
95 | /** |
96 | * @var string Either "all", "break", or "final" |
97 | */ |
98 | private $questionMarkStripLevel; |
99 | |
100 | /** |
101 | * @var string the raw query as received by the search engine |
102 | */ |
103 | private $rawQuery; |
104 | |
105 | /** |
106 | * @var KeywordParser |
107 | */ |
108 | private $keywordParser; |
109 | |
110 | /** |
111 | * @var PhraseQueryParser |
112 | */ |
113 | private $phraseQueryParser; |
114 | |
115 | /** |
116 | * @var NonPhraseParser |
117 | */ |
118 | private $nonPhraseParser; |
119 | |
120 | /** |
121 | * @var OffsetTracker track offsets of parsed keywords |
122 | */ |
123 | private $keywordOffsetsTracker; |
124 | |
125 | /** |
126 | * @var ParsedNode[] |
127 | */ |
128 | private $preTaggedNodes = []; |
129 | |
130 | /** |
131 | * Token set after calling nextToken |
132 | * @var Token|null |
133 | */ |
134 | private $token; |
135 | |
136 | /** |
137 | * Last token seen (set within nextToken) |
138 | * @var Token|null |
139 | */ |
140 | private $lookBehind; |
141 | |
142 | /** |
143 | * Current offset |
144 | * NOTE: offset is moved after call advance |
145 | * @var int |
146 | */ |
147 | private $offset; |
148 | |
149 | /** |
150 | * @var bool[] indexed cleanups applied (indexed by the cleanup type) |
151 | * @see ParsedQuery::hasCleanup() |
152 | */ |
153 | private $queryCleanups = []; |
154 | |
155 | /** |
156 | * Errors detected while parsing the query |
157 | * @var ParseWarning[] |
158 | */ |
159 | private $warnings = []; |
160 | |
161 | /** |
162 | * @var NamespaceHeaderNode|null |
163 | */ |
164 | private $namespaceHeader; |
165 | |
166 | /** |
167 | * @var NamespacePrefixParser |
168 | */ |
169 | private $namespacePrefixParser; |
170 | |
171 | private const DEFAULT_OCCUR = BooleanClause::MUST; |
172 | |
173 | /** |
174 | * @var int |
175 | */ |
176 | private $maxQueryLen; |
177 | |
178 | /** |
179 | * @param \CirrusSearch\Parser\KeywordRegistry $keywordRegistry |
180 | * @param Escaper $escaper |
181 | * @param string $qmarkStripLevel Level of question mark stripping to apply, either "all", |
182 | * "break", or "final" |
183 | * @param ParsedQueryClassifiersRepository $classifierRepository |
184 | * @param NamespacePrefixParser $namespacePrefixParser |
185 | * @param int|null $maxQueryLen maximum length of the query in chars |
186 | * @see Util::stripQuestionMarks() for acceptable $qmarkStripLevel values |
187 | */ |
188 | public function __construct( |
189 | \CirrusSearch\Parser\KeywordRegistry $keywordRegistry, |
190 | Escaper $escaper, |
191 | $qmarkStripLevel, |
192 | ParsedQueryClassifiersRepository $classifierRepository, |
193 | NamespacePrefixParser $namespacePrefixParser, |
194 | ?int $maxQueryLen |
195 | ) { |
196 | $this->keywordRegistry = $keywordRegistry; |
197 | $this->escaper = $escaper; |
198 | $this->keywordParser = new KeywordParser(); |
199 | $this->phraseQueryParser = new PhraseQueryParser( $escaper ); |
200 | $this->nonPhraseParser = new NonPhraseParser( $escaper ); |
201 | $this->questionMarkStripLevel = $qmarkStripLevel; |
202 | $this->classifierRepository = $classifierRepository; |
203 | $this->namespacePrefixParser = $namespacePrefixParser; |
204 | $this->maxQueryLen = $maxQueryLen ?: 300; |
205 | } |
206 | |
207 | /** |
208 | * Reinit internal parser states |
209 | * @param string $rawQuery |
210 | */ |
211 | private function reInit( $rawQuery ) { |
212 | $this->rawQuery = $rawQuery; |
213 | $this->query = null; |
214 | $this->keywordOffsetsTracker = new OffsetTracker(); |
215 | $this->token = null; |
216 | $this->lookBehind = null; |
217 | $this->preTaggedNodes = []; |
218 | $this->warnings = []; |
219 | $this->queryCleanups = []; |
220 | $this->namespaceHeader = null; |
221 | $this->offset = 0; |
222 | } |
223 | |
224 | /** |
225 | * Apply some cleanups to the input query prior to parsing it |
226 | * Ideally the parser should be able to handle the query without modifying it |
227 | * but in some cases it simply way easier to handle this this way. |
228 | * Cleanups applied: |
229 | * - Question mark stripping depending on $this->questionMarkStripLevel |
230 | * - gershayim quirks if $this->escaper->getLanguage() is hebrew |
231 | */ |
232 | private function cleanup() { |
233 | $query = $this->rawQuery; |
234 | $nquery = Util::stripQuestionMarks( $query, $this->questionMarkStripLevel ); |
235 | if ( $nquery !== $query ) { |
236 | $this->queryCleanups[ParsedQuery::CLEANUP_QMARK_STRIPPING] = true; |
237 | $query = $nquery; |
238 | } |
239 | if ( $this->escaper->getLanguage() === 'he' ) { |
240 | $nquery = preg_replace( self::GERSHAYIM_REGEX, '$1\"$2', $query ); |
241 | if ( $nquery !== $query ) { |
242 | $this->queryCleanups[ParsedQuery::CLEANUP_GERSHAYIM_QUIRKS] = true; |
243 | $query = $nquery; |
244 | } |
245 | } |
246 | if ( strlen( $query ) > 0 && $query[0] === '~' ) { |
247 | $query = substr( $query, 1 ); |
248 | $this->queryCleanups[ParsedQuery::TILDE_HEADER] = true; |
249 | } |
250 | $this->query = $query; |
251 | } |
252 | |
253 | /** |
254 | * @param string $query |
255 | * @return \CirrusSearch\Parser\AST\ParsedQuery |
256 | * @throws SearchQueryParseException |
257 | */ |
258 | public function parse( string $query ): ParsedQuery { |
259 | $this->reInit( $query ); |
260 | $queryLen = mb_strlen( $query ); |
261 | if ( $queryLen > self::QUERY_LEN_HARD_LIMIT ) { |
262 | throw new SearchQueryParseException( 'cirrussearch-query-too-long', |
263 | $queryLen, self::QUERY_LEN_HARD_LIMIT ); |
264 | } |
265 | $this->cleanup(); |
266 | $this->parseNsHeader(); |
267 | $this->token = new Token( $this->query ); |
268 | $this->lookBehind = new Token( $this->query ); |
269 | |
270 | // First parse keywords |
271 | $nonGreedyHeaders = []; |
272 | $greedyHeaders = []; |
273 | $greedy = []; |
274 | $allowingEmpty = []; |
275 | $normalKeywords = []; |
276 | foreach ( $this->keywordRegistry->getKeywords() as $keyword ) { |
277 | // Parsing depends on the nature of the keyword |
278 | // 1. non greedy query headers |
279 | // 2. greedy query headers |
280 | // 3. greedy |
281 | // 4. allowed empty values (see prefer-recent) |
282 | // 5. normal |
283 | // FIXME: refactor this so that parsing is less dependent on keyword ordering |
284 | // we could try to identify all keyword prefixes in a single regex like /(?<=\G|[\pZ\pC])(intitle|prefer-recent|...):/ |
285 | // and iterate over there. Currently we workaround this issue by separating keywords into categories but that is not |
286 | // sufficient it's still dependent on ordering within a specific category. |
287 | if ( !$keyword->greedy() && $keyword->queryHeader() ) { |
288 | $nonGreedyHeaders[] = $keyword; |
289 | } elseif ( $keyword->greedy() && $keyword->queryHeader() ) { |
290 | $greedyHeaders[] = $keyword; |
291 | } elseif ( $keyword->greedy() ) { |
292 | $greedy[] = $keyword; |
293 | } elseif ( $keyword->allowEmptyValue() ) { |
294 | $allowingEmpty[] = $keyword; |
295 | } else { |
296 | $normalKeywords[] = $keyword; |
297 | } |
298 | } |
299 | $this->parseKeywords( $nonGreedyHeaders ); |
300 | $this->parseKeywords( $greedyHeaders ); |
301 | $this->parseKeywords( $greedy ); |
302 | $this->parseKeywords( $allowingEmpty ); |
303 | $this->parseKeywords( $normalKeywords ); |
304 | $this->warnings = array_merge( $this->warnings, $this->keywordParser->getWarnings() ); |
305 | // All parsed keywords have their offsets marked in $this->keywordOffsetsTracker |
306 | // We then reparse the query from the beginning finding holes between keyword offsets |
307 | uasort( $this->preTaggedNodes, static function ( ParsedNode $a, ParsedNode $b ) { |
308 | if ( $a->getStartOffset() < $b->getStartOffset() ) { |
309 | return -1; |
310 | } else { |
311 | // We cannot have equality here |
312 | return 1; |
313 | } |
314 | } ); |
315 | reset( $this->preTaggedNodes ); |
316 | |
317 | $this->checkQueryLen(); |
318 | $root = $this->expression(); |
319 | $additionalNamespaces = $this->extractRequiredNamespaces( $root ); |
320 | return new ParsedQuery( $root, $this->query, $this->rawQuery, $this->queryCleanups, |
321 | $this->namespaceHeader, $additionalNamespaces, $this->warnings, $this->classifierRepository ); |
322 | } |
323 | |
324 | private function createClause( ParsedNode $node, $explicit = false, $occur = null ) { |
325 | if ( $node instanceof NegatedNode ) { |
326 | // OR NOT is simply MUST_NOT, there's no SHOULD_NOT in lucene |
327 | // so simply do what lucene QueryString does: force MUST_NOT whenever |
328 | // we encounter a negated clause. |
329 | return new BooleanClause( $node->getChild(), BooleanClause::MUST_NOT, |
330 | $explicit || $node->getNegationType() === 'NOT', $node ); |
331 | } |
332 | return new BooleanClause( $node, $occur ?? self::DEFAULT_OCCUR, $explicit ); |
333 | } |
334 | |
335 | /** |
336 | * (([NOT] Node) (AND|OR)?)* |
337 | * Tries to follow behavior with backward order precedence: |
338 | * - A AND B OR C => MUST:A SHOULD:B SHOULD:C |
339 | * - A OR B AND C => SHOULD:A MUST:B MUST:C |
340 | * |
341 | * NOT is always MUST_NOT: |
342 | * - A OR NOT B: A -B |
343 | * |
344 | * Syntax errors fallback: |
345 | * - NOT: MUST:NOT |
346 | * - NOT NOT: MUST_NOT:NOT |
347 | * - NOT AND FOO: MUST_NOT:AND MUST:FOO |
348 | * - NOT !FOO: MUST:FOO |
349 | * @return ParsedNode |
350 | */ |
351 | private function expression() { |
352 | $clauses = []; |
353 | $left = null; |
354 | // Last boolean operator seen, -1 means none |
355 | $lastBoolType = -1; |
356 | $explicitNegation = false; |
357 | // TODO: simplify, this is a bit hairy |
358 | while ( $this->nextToken() ) { |
359 | if ( $left === null ) { |
360 | // First iteration |
361 | if ( !$this->isLeaf() ) { |
362 | $left = $this->fallbackToWord( [ Token::NOT, Token::PARSED_NODE ] ); |
363 | } else { |
364 | $left = $this->negatedLeaf(); |
365 | } |
366 | Assert::postcondition( $left !== null, '$left must not be null' ); |
367 | continue; |
368 | } |
369 | |
370 | $explicitNegation = false; |
371 | // The last boolean operator seen before the last one, -1 means none |
372 | // used to know if the node was attached explicitly by the user |
373 | $beforeLastBoolType = $lastBoolType; |
374 | $lastBoolType = -1; |
375 | switch ( $this->token->getType() ) { |
376 | case Token::NOT: |
377 | // NOT something |
378 | $this->advance(); |
379 | if ( !$this->nextToken() ) { |
380 | // NOT<EOF> |
381 | // strategy is to simply eat the NOT token as a word query |
382 | $node = $this->unexpectedEOF( [ Token::PARSED_NODE ] ); |
383 | if ( $left instanceof WordsQueryNode ) { |
384 | // Collapse it to previous words |
385 | $left = $this->mergeWords( $left, $node ); |
386 | } else { |
387 | // or add new boolean clause |
388 | $clauses[] = $this->createClause( $left, $beforeLastBoolType !== -1 ); |
389 | $left = $node; |
390 | } |
391 | } else { |
392 | $explicitNegation = true; |
393 | $clauses[] = $this->createClause( $left ); |
394 | $left = $this->explicitlyNegatedNode(); |
395 | } |
396 | Assert::postcondition( $left !== null, '$left must not be null' ); |
397 | break; |
398 | case Token::PARSED_NODE: |
399 | // A word or a keyword |
400 | // note: negation prefix is eaten by the tokenizer |
401 | if ( $left instanceof WordsQueryNode |
402 | && $this->token->getNode() instanceof WordsQueryNode |
403 | ) { |
404 | $lastBoolType = $beforeLastBoolType; |
405 | $left = $this->collapseWords( $left ); |
406 | |
407 | } else { |
408 | $clauses[] = $this->createClause( $left, $explicitNegation ); |
409 | $left = $this->leaf(); |
410 | } |
411 | Assert::postcondition( $left !== null, '$left must not be null' ); |
412 | break; |
413 | case Token::BOOL_AND: |
414 | case Token::BOOL_OR: |
415 | $lastBoolType = $this->token->getType(); |
416 | $this->advance(); |
417 | if ( !$this->nextToken() ) { |
418 | $lastBoolType = $beforeLastBoolType; |
419 | $node = $this->unexpectedEOF( [ Token::NOT, Token::PARSED_NODE ] ); |
420 | if ( $left instanceof WordsQueryNode ) { |
421 | // "catapult ||" |
422 | $left = $this->mergeWords( $left, $node ); |
423 | } else { |
424 | // "!catapult ||" |
425 | $clauses[] = $this->createClause( $left, $beforeLastBoolType !== -1 ); |
426 | $left = $node; |
427 | } |
428 | Assert::postcondition( $left !== null, '$left must not be null' ); |
429 | break; |
430 | } |
431 | $occur = $this->boolToOccur( $lastBoolType ); |
432 | if ( $this->isLeaf() ) { |
433 | $node = $this->negatedLeaf(); |
434 | $clauses[] = $this->createClause( $left, true, $occur ); |
435 | $left = $node; |
436 | } else { |
437 | $clauses[] = $this->createClause( $left ); |
438 | $left = $this->fallbackToWord( [ Token::NOT, Token::PARSED_NODE ] ); |
439 | } |
440 | Assert::postcondition( $left !== null, '$left must not be null' ); |
441 | break; |
442 | default: |
443 | throw new LogicException( "BUG: unexpected token type {$this->token->getType()}" ); |
444 | } |
445 | } |
446 | |
447 | if ( $left === null ) { |
448 | return new EmptyQueryNode( 0, strlen( $this->query ) ); |
449 | } |
450 | if ( $clauses !== [] ) { |
451 | if ( $lastBoolType !== -1 ) { |
452 | $occur = $this->boolToOccur( $lastBoolType ); |
453 | $clauses[] = $this->createClause( $left, true, $occur ); |
454 | } else { |
455 | $clauses[] = $this->createClause( $left, $explicitNegation ); |
456 | } |
457 | return $this->createBoolNode( $clauses ); |
458 | } elseif ( $left instanceof NegatedNode ) { |
459 | $clauses[] = $this->createClause( $left ); |
460 | return $this->createBoolNode( $clauses ); |
461 | } |
462 | return $left; |
463 | } |
464 | |
465 | /** |
466 | * @param BooleanClause[] $clauses |
467 | * @return ParsedBooleanNode |
468 | */ |
469 | private function createBoolNode( array $clauses ) { |
470 | $end = end( $clauses )->getNode()->getEndOffset(); |
471 | $start = reset( $clauses )->getNode()->getStartOffset(); |
472 | return new ParsedBooleanNode( $start, $end, $clauses ); |
473 | } |
474 | |
475 | /** |
476 | * Collapse the current token with $left. |
477 | * @param WordsQueryNode $left |
478 | * @return WordsQueryNode |
479 | */ |
480 | private function collapseWords( WordsQueryNode $left ) { |
481 | $start = $left->getStartOffset(); |
482 | $end = $this->token->getEnd(); |
483 | $word = new WordsQueryNode( $start, $end, |
484 | $this->escaper->unescape( substr( $this->query, $start, $end - $start ) ) ); |
485 | $this->advance(); |
486 | return $word; |
487 | } |
488 | |
489 | private function mergeWords( WordsQueryNode $left, WordsQueryNode $right ) { |
490 | $start = $left->getStartOffset(); |
491 | $end = $right->getEndOffset(); |
492 | return new WordsQueryNode( $start, $end, |
493 | $this->escaper->unescape( substr( $this->query, $start, $end - $start ) ) ); |
494 | } |
495 | |
496 | /** |
497 | * @param int[] $expected token types |
498 | * @return WordsQueryNode |
499 | */ |
500 | private function fallbackToWord( array $expected ) { |
501 | $this->warnings[] = new ParseWarning( |
502 | 'cirrussearch-parse-error-unexpected-token', |
503 | $this->token->getStart(), |
504 | Token::getTypeLabels( $expected ), |
505 | Token::getTypeLabel( $this->token->getType() ) |
506 | ); |
507 | $word = new WordsQueryNode( $this->token->getStart(), $this->token->getEnd(), |
508 | $this->token->getImage() ); |
509 | $this->advance(); |
510 | return $word; |
511 | } |
512 | |
513 | /** |
514 | * @param int[] $expected token types |
515 | * @return WordsQueryNode |
516 | */ |
517 | private function unexpectedEOF( array $expected ) { |
518 | $this->warnings[] = new ParseWarning( |
519 | 'cirrussearch-parse-error-unexpected-end', |
520 | strlen( $this->query ), |
521 | Token::getTypeLabels( $expected ), |
522 | Token::getTypeLabel( $this->token->getType() ) |
523 | ); |
524 | return new WordsQueryNode( $this->lookBehind->getStart(), $this->lookBehind->getEnd(), |
525 | $this->lookBehind->getImage() ); |
526 | } |
527 | |
528 | private function negatedLeaf() { |
529 | if ( $this->token->getType() === Token::NOT ) { |
530 | $this->advance(); |
531 | if ( !$this->nextToken() ) { |
532 | return $this->unexpectedEOF( [ Token::PARSED_NODE ] ); |
533 | } |
534 | return $this->explicitlyNegatedNode(); |
535 | } |
536 | return $this->leaf(); |
537 | } |
538 | |
539 | private function leaf() { |
540 | $node = $this->token->getNode(); |
541 | $this->advance(); |
542 | return $node; |
543 | } |
544 | |
545 | /** |
546 | * (word|phrase|keyword|wildcard|fuzzy) |
547 | * |
548 | * Warnings on : |
549 | * |
550 | * !(word|phrase|keyword|special_word) => double negation dropped |
551 | * (AND|OR) => NOT("AND|OR") |
552 | * EOF => "NOT" |
553 | * |
554 | * @return ParsedNode |
555 | */ |
556 | private function explicitlyNegatedNode() { |
557 | if ( $this->token->getType() === Token::PARSED_NODE ) { |
558 | $node = $this->leaf(); |
559 | if ( $node instanceof NegatedNode ) { |
560 | $this->warnings[] = new ParseWarning( 'cirrussearch-parse-error-double-negation', |
561 | $node->getStartOffset() ); |
562 | $node = $node->getChild(); |
563 | } else { |
564 | $node = new NegatedNode( $this->lookBehind->getStart(), $node->getEndOffset(), |
565 | // @phan-suppress-next-line PhanTypeMismatchArgumentNullable $node not null after getType |
566 | $node, $this->lookBehind->getImage() ); |
567 | } |
568 | } else { |
569 | $node = $this->fallbackToWord( [ Token::PARSED_NODE ] ); |
570 | } |
571 | return $node; |
572 | } |
573 | |
574 | /** |
575 | * @return bool true if it's a simple word token |
576 | */ |
577 | private function isLeaf() { |
578 | return $this->token->getType() === Token::PARSED_NODE |
579 | || $this->token->getType() === Token::NOT; |
580 | } |
581 | |
582 | /** |
583 | * @param KeywordFeature[] $keywords |
584 | */ |
585 | private function parseKeywords( array $keywords ) { |
586 | foreach ( $keywords as $kw ) { |
587 | $parsedKeywords = |
588 | $this->keywordParser->parse( $this->query, $kw, $this->keywordOffsetsTracker, $this->offset ); |
589 | $this->keywordOffsetsTracker->appendNodes( $parsedKeywords ); |
590 | foreach ( $parsedKeywords as $keyword ) { |
591 | $this->preTaggedNodes[] = $keyword; |
592 | } |
593 | } |
594 | } |
595 | |
596 | /** |
597 | * @return bool |
598 | */ |
599 | private function nextToken() { |
600 | Assert::precondition( $this->token->getStart() < $this->offset, |
601 | 'You should not call nextToken() twice on the same offsets' ); |
602 | $this->token->copyTo( $this->lookBehind ); |
603 | $this->token->reset(); |
604 | $nextPretagged = current( $this->preTaggedNodes ); |
605 | $queryLen = strlen( $this->query ); |
606 | $maxOffset = $nextPretagged !== false ? |
607 | $nextPretagged->getStartOffset() : $queryLen; |
608 | |
609 | $this->consumeWS(); |
610 | |
611 | if ( $this->offset >= $queryLen ) { |
612 | $this->token->eof(); |
613 | return false; |
614 | } |
615 | |
616 | if ( $nextPretagged !== false && $this->offset === $nextPretagged->getStartOffset() ) { |
617 | $this->token->node( $nextPretagged ); |
618 | return true; |
619 | } |
620 | |
621 | Assert::precondition( $this->offset < $maxOffset, '$this->offset < $maxOffset' ); |
622 | if ( $this->consumeBoolOp() ) { |
623 | return true; |
624 | } |
625 | |
626 | if ( $this->consumePhrase( $maxOffset ) ) { |
627 | return true; |
628 | } |
629 | |
630 | if ( $this->consumeWord( $maxOffset ) ) { |
631 | return true; |
632 | } |
633 | |
634 | if ( $this->consumeUnbalancedPhrase( $maxOffset ) ) { |
635 | $this->warnings[] = new ParseWarning( "cirrus-parse-error-unbalanced-phrase", $this->token->getStart() ); |
636 | // this is theorically the only remaining option (unbalanced phrase query) |
637 | // if not it means the above is broken and needs a fix |
638 | return true; |
639 | } |
640 | throw new \RuntimeException( "BUG: cannot consume query at offset $this->offset (need to go to $maxOffset)" ); |
641 | } |
642 | |
643 | /** |
644 | * @return bool |
645 | */ |
646 | private function consumeWS() { |
647 | $matches = []; |
648 | if ( preg_match( self::WHITESPACE_REGEX, $this->query, $matches, 0, $this->offset ) === 1 ) { |
649 | $this->offset += strlen( $matches[0] ); |
650 | return true; |
651 | } |
652 | return false; |
653 | } |
654 | |
655 | /** |
656 | * Consume an explicit boolean operator (and/or/not) |
657 | * @return bool true if consumed, false otherwise. |
658 | */ |
659 | private function consumeBoolOp() { |
660 | $match = []; |
661 | if ( preg_match( self::EXPLICIT_BOOLEAN_OPERATOR, $this->query, $match, 0, $this->offset ) ) { |
662 | // Check captured backward so no need to check that the group actually matched |
663 | if ( isset( $match['NOT'] ) ) { |
664 | $this->token->setType( Token::NOT, $this->offset, $this->offset + strlen( $match['NOT'] ) ); |
665 | } elseif ( isset( $match['OR'] ) ) { |
666 | $this->token->setType( Token::BOOL_OR, $this->offset, $this->offset + strlen( $match['OR'] ) ); |
667 | } else { |
668 | $this->token->setType( Token::BOOL_AND, $this->offset, |
669 | $this->offset + strlen( $match['AND'] ) ); |
670 | } |
671 | return true; |
672 | } |
673 | return false; |
674 | } |
675 | |
676 | /** |
677 | * @param int $maxOffset |
678 | * @return bool true if consumed, false otherwise. |
679 | */ |
680 | private function consumePhrase( $maxOffset ) { |
681 | $node = $this->phraseQueryParser->parse( $this->query, $this->offset, $maxOffset ); |
682 | if ( $node !== null ) { |
683 | $this->token->node( $node ); |
684 | return true; |
685 | } |
686 | return false; |
687 | } |
688 | |
689 | /** |
690 | * @param int $maxOffset |
691 | * @return bool |
692 | */ |
693 | private function consumeWord( int $maxOffset ) { |
694 | $node = $this->nonPhraseParser->parse( $this->query, $this->offset, $maxOffset ); |
695 | if ( $node !== null ) { |
696 | $this->token->node( $node ); |
697 | return true; |
698 | } |
699 | return false; |
700 | } |
701 | |
702 | /** |
703 | * @param int $maxOffset |
704 | * @return bool |
705 | */ |
706 | private function consumeUnbalancedPhrase( $maxOffset ) { |
707 | $matches = []; |
708 | if ( preg_match( PhraseQueryParser::PHRASE_START, $this->query, $matches, 0, $this->offset ) === 1 ) { |
709 | $negated = isset( $matches['negate'] ) && strlen( $matches['negate'] ) > 0 ? $matches['negate'] : ''; |
710 | $wholeStart = $this->offset; |
711 | $phraseStart = $wholeStart + strlen( $negated ); |
712 | $innerStart = $phraseStart + strlen( '"' ); |
713 | Assert::invariant( $wholeStart <= $maxOffset, '$start <= $to' ); |
714 | $phrase = $maxOffset > $innerStart ? substr( $this->query, $innerStart, $maxOffset - $innerStart ) : ""; |
715 | $node = PhraseQueryNode::unbalanced( $phraseStart, $maxOffset, $phrase ); |
716 | if ( $negated !== '' ) { |
717 | $node = new NegatedNode( $wholeStart, $maxOffset, $node, $negated ); |
718 | } |
719 | $this->token->node( $node ); |
720 | return true; |
721 | } |
722 | return false; |
723 | } |
724 | |
725 | /** |
726 | * advance current offset to the end of the current token |
727 | */ |
728 | private function advance() { |
729 | $pretagged = current( $this->preTaggedNodes ); |
730 | if ( $pretagged !== false && $this->token->getStart() === $pretagged->getStartOffset() ) { |
731 | next( $this->preTaggedNodes ); |
732 | } |
733 | $this->offset = $this->token->getEnd(); |
734 | } |
735 | |
736 | /** |
737 | * @param int $boolOperator |
738 | * @return string |
739 | */ |
740 | private function boolToOccur( $boolOperator ) { |
741 | return $boolOperator === Token::BOOL_OR ? BooleanClause::SHOULD : BooleanClause::MUST; |
742 | } |
743 | |
744 | /** |
745 | * @param ParsedNode $root |
746 | * @return array|string array of additional namespaces, 'all' for everything |
747 | */ |
748 | private function extractRequiredNamespaces( ParsedNode $root ) { |
749 | $visitor = new class( [], [ PrefixFeature::class ] ) extends KeywordNodeVisitor { |
750 | /** @var string|int[] */ |
751 | public $total = []; |
752 | |
753 | /** |
754 | * @param KeywordFeatureNode $node |
755 | */ |
756 | public function doVisitKeyword( KeywordFeatureNode $node ) { |
757 | if ( $this->total === 'all' ) { |
758 | return; |
759 | } |
760 | |
761 | Assert::parameter( $node->getKeyword() instanceof PrefixFeature, '$node', 'must be parsed from PrefixFeature' ); |
762 | // @phan-suppress-next-line PhanTypeArraySuspiciousNullable |
763 | $additional = $node->getParsedValue()[PrefixFeature::PARSED_NAMESPACES]; |
764 | if ( $additional === 'all' ) { |
765 | $this->total = 'all'; |
766 | return; |
767 | } |
768 | Assert::precondition( is_array( $additional ), |
769 | 'PrefixFeature::PARSED_NAMESPACES key must point to an array or "all"' ); |
770 | $this->total = array_merge( $this->total, array_filter( $additional, function ( $v ) { |
771 | return !in_array( $v, $this->total ); |
772 | } ) ); |
773 | } |
774 | }; |
775 | $root->accept( $visitor ); |
776 | return $visitor->total; |
777 | } |
778 | |
779 | /** |
780 | * Inspect $this->query to see if it mentions a namespace in the first few chars |
781 | * If yes $this->namespaceHeader will be set and this->offset will be advanced |
782 | * to the actual start of the query. |
783 | */ |
784 | private function parseNsHeader() { |
785 | Assert::precondition( $this->offset === 0, 'ns header must be the first parsed bits or ' . |
786 | 'you must properly handle offset in this method.' ); |
787 | $queryAndNs = $this->namespacePrefixParser->parse( $this->query ); |
788 | if ( $queryAndNs !== false ) { |
789 | Assert::postcondition( count( $queryAndNs ) === 2, |
790 | '\SearchEngine::parseNamespacePrefixes() must return false or a 2 elements array' ); |
791 | $queryOffset = strlen( $this->query ) - strlen( $queryAndNs[0] ); |
792 | if ( $queryAndNs[1] ) { |
793 | Assert::postcondition( is_array( $queryAndNs[1] ) && count( $queryAndNs[1] ) === 1, |
794 | '\SearchEngine::parseNamespacePrefixes() should return an array whose second ' . |
795 | 'element is falsy or an array of size 1' ); |
796 | $this->namespaceHeader = new NamespaceHeaderNode( $this->offset, $queryOffset, reset( $queryAndNs[1] ) ); |
797 | } else { |
798 | $this->namespaceHeader = new NamespaceHeaderNode( $this->offset, $queryOffset, 'all' ); |
799 | } |
800 | $this->offset = $queryOffset; |
801 | } |
802 | } |
803 | |
804 | /** |
805 | * Check the length of the query and throws SearchQueryParseException |
806 | * if it's more than what we allow. |
807 | * |
808 | * @throws SearchQueryParseException |
809 | */ |
810 | private function checkQueryLen(): void { |
811 | Assert::precondition( $this->query !== null, "Query must be set" ); |
812 | $maxLen = $this->maxQueryLen; |
813 | $exemptKeywords = []; |
814 | foreach ( $this->preTaggedNodes as $node ) { |
815 | $realNode = $node; |
816 | if ( $node instanceof NegatedNode ) { |
817 | $realNode = $node->getChild(); |
818 | } |
819 | if ( $realNode instanceof KeywordFeatureNode ) { |
820 | $maxLen += mb_strlen( substr( $this->query, $node->getStartOffset(), $node->getEndOffset() ) ); |
821 | $exemptKeywords[] = $realNode->getKey(); |
822 | } |
823 | } |
824 | $queryLen = mb_strlen( $this->query ); |
825 | if ( $queryLen > $maxLen ) { |
826 | if ( $exemptKeywords ) { |
827 | sort( $exemptKeywords ); |
828 | throw new SearchQueryParseException( 'cirrussearch-query-too-long-with-exemptions', |
829 | $queryLen, $this->maxQueryLen, Message::listParam( array_unique( $exemptKeywords ), 'comma' ) ); |
830 | } else { |
831 | throw new SearchQueryParseException( 'cirrussearch-query-too-long', $queryLen, |
832 | $this->maxQueryLen ); |
833 | } |
834 | } |
835 | } |
836 | } |