Code Coverage for /workspace/src/extensions/CirrusSearch/includes/Parser/QueryStringRegex/QueryStringRegexParser.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	97.85% covered (success)	97.85%	319 / 326	85.19% covered (warning)	85.19%	23 / 27	CRAP	0.00% covered (danger)	0.00%	0 / 1
QueryStringRegexParser	97.85% covered (success)	97.85%	319 / 326	85.19% covered (warning)	85.19%	23 / 27	102	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	2
reInit	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	1
cleanup	100.00% covered (success)	100.00%	14 / 14	100.00% covered (success)	100.00%	1 / 1	6
parse	100.00% covered (success)	100.00%	41 / 41	100.00% covered (success)	100.00%	1 / 1	10
createClause	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	3
expression	94.44% covered (success)	94.44%	68 / 72	0.00% covered (danger)	0.00%	0 / 1	20.07
createBoolNode	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
collapseWords	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	1
mergeWords	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	1
fallbackToWord	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	1
unexpectedEOF	100.00% covered (success)	100.00%	8 / 8	100.00% covered (success)	100.00%	1 / 1	1
negatedLeaf	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	3
leaf	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
explicitlyNegatedNode	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	3
isLeaf	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	2
parseKeywords	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	3
nextToken	96.15% covered (success)	96.15%	25 / 26	0.00% covered (danger)	0.00%	0 / 1	9
consumeWS	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	2
consumeBoolOp	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	4
consumePhrase	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	2
consumeWord	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	2
consumeUnbalancedPhrase	92.86% covered (success)	92.86%	13 / 14	0.00% covered (danger)	0.00%	0 / 1	6.01
advance	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	3
boolToOccur	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	2
extractRequiredNamespaces	93.75% covered (success)	93.75%	15 / 16	0.00% covered (danger)	0.00%	0 / 1	3.00
parseNsHeader	100.00% covered (success)	100.00%	14 / 14	100.00% covered (success)	100.00%	1 / 1	4
checkQueryLen	100.00% covered (success)	100.00%	18 / 18	100.00% covered (success)	100.00%	1 / 1	6

1	<?php
2
3	namespace CirrusSearch\Parser\QueryStringRegex;
4
5	use CirrusSearch\Parser\AST\BooleanClause;
6	use CirrusSearch\Parser\AST\EmptyQueryNode;
7	use CirrusSearch\Parser\AST\KeywordFeatureNode;
8	use CirrusSearch\Parser\AST\NamespaceHeaderNode;
9	use CirrusSearch\Parser\AST\NegatedNode;
10	use CirrusSearch\Parser\AST\ParsedBooleanNode;
11	use CirrusSearch\Parser\AST\ParsedNode;
12	use CirrusSearch\Parser\AST\ParsedQuery;
13	use CirrusSearch\Parser\AST\ParseWarning;
14	use CirrusSearch\Parser\AST\PhraseQueryNode;
15	use CirrusSearch\Parser\AST\Visitor\KeywordNodeVisitor;
16	use CirrusSearch\Parser\AST\WordsQueryNode;
17	use CirrusSearch\Parser\NamespacePrefixParser;
18	use CirrusSearch\Parser\ParsedQueryClassifiersRepository;
19	use CirrusSearch\Parser\QueryParser;
20	use CirrusSearch\Query\KeywordFeature;
21	use CirrusSearch\Query\PrefixFeature;
22	use CirrusSearch\Search\Escaper;
23	use CirrusSearch\Util;
24	use LogicException;
25	use MediaWiki\Message\Message;
26	use Wikimedia\Assert\Assert;
27
28	/**
29	* Full text query parser that uses regex to parse its token.
30	*
31	* Far from being a state of the art parser it detects most of its
32	* tokens using regular expression. And make arbitrary decisions
33	* at tokenization.
34	*
35	* The tokenizer will understand few token types:
36	* - WHITESPACE: all unicode whitespace and control chars ([\pZ\pC])
37	* the WHITESPACE token is ignored and never presented to the parser
38	* - EOF: dummy type used to mark end of string
39	* - BOOL_AND/BOOL_OR/BOOL_NOT: explicit boolean opeartors
40	* - PARSED_NODE: complex type (usually part of the query)
41	*
42	* PARSED_NODE is a type that groups:
43	* - Keywords
44	* - Phrase
45	* - Words
46	* - Wildcards/Prefix
47	*
48	* Phrase does not have its own token " and is part the tokenization and is never exposed
49	* to the parser.
50	* Same for negation prefix (! and -), they are parsed at tokenization time.
51	*
52	* NOTE that this parser is broken by design:
53	* - no lexical context support, we first parse keywords
54	* - no support for groupings (parenthesis)
55	*/
56	class QueryStringRegexParser implements QueryParser {
57	/**
58	* Whitespace regex including unicode and some control chars
59	*/
60	private const WHITESPACE_REGEX = '/\G[\pZ\pC]+/u';
61
62	public const QUERY_LEN_HARD_LIMIT = 4096;
63
64	/**
65	* see T66350
66	*/
67	private const GERSHAYIM_REGEX = '/(\p{L}{2,})(?:")(\p{L})(?=[^\p{L}]\|$)/u';
68
69	/**
70	* Supported explicit boolean operator
71	*
72	*/
73	private const EXPLICIT_BOOLEAN_OPERATOR = '/\G(?:(?<AND>AND\|&&)\|(?<OR>OR\|\\|\\|)\|(?<NOT>NOT))(?![^\pZ\pC"])/u';
74
75	/**
76	* @var \CirrusSearch\Parser\KeywordRegistry
77	*/
78	private $keywordRegistry;
79
80	/**
81	* @var Escaper
82	*/
83	private $escaper;
84
85	/**
86	* @var ParsedQueryClassifiersRepository
87	*/
88	private $classifierRepository;
89
90	/**
91	* @var string\|null user query (null when not yet cleaned up)
92	*/
93	private $query;
94
95	/**
96	* @var string Either "all", "break", or "final"
97	*/
98	private $questionMarkStripLevel;
99
100	/**
101	* @var string the raw query as received by the search engine
102	*/
103	private $rawQuery;
104
105	/**
106	* @var KeywordParser
107	*/
108	private $keywordParser;
109
110	/**
111	* @var PhraseQueryParser
112	*/
113	private $phraseQueryParser;
114
115	/**
116	* @var NonPhraseParser
117	*/
118	private $nonPhraseParser;
119
120	/**
121	* @var OffsetTracker track offsets of parsed keywords
122	*/
123	private $keywordOffsetsTracker;
124
125	/**
126	* @var ParsedNode[]
127	*/
128	private $preTaggedNodes = [];
129
130	/**
131	* Token set after calling nextToken
132	* @var Token\|null
133	*/
134	private $token;
135
136	/**
137	* Last token seen (set within nextToken)
138	* @var Token\|null
139	*/
140	private $lookBehind;
141
142	/**
143	* Current offset
144	* NOTE: offset is moved after call advance
145	* @var int
146	*/
147	private $offset;
148
149	/**
150	* @var bool[] indexed cleanups applied (indexed by the cleanup type)
151	* @see ParsedQuery::hasCleanup()
152	*/
153	private $queryCleanups = [];
154
155	/**
156	* Errors detected while parsing the query
157	* @var ParseWarning[]
158	*/
159	private $warnings = [];
160
161	/**
162	* @var NamespaceHeaderNode\|null
163	*/
164	private $namespaceHeader;
165
166	/**
167	* @var NamespacePrefixParser
168	*/
169	private $namespacePrefixParser;
170
171	private const DEFAULT_OCCUR = BooleanClause::MUST;
172
173	/**
174	* @var int
175	*/
176	private $maxQueryLen;
177
178	/**
179	* @param \CirrusSearch\Parser\KeywordRegistry $keywordRegistry
180	* @param Escaper $escaper
181	* @param string $qmarkStripLevel Level of question mark stripping to apply, either "all",
182	* "break", or "final"
183	* @param ParsedQueryClassifiersRepository $classifierRepository
184	* @param NamespacePrefixParser $namespacePrefixParser
185	* @param int\|null $maxQueryLen maximum length of the query in chars
186	* @see Util::stripQuestionMarks() for acceptable $qmarkStripLevel values
187	*/
188	public function __construct(
189	\CirrusSearch\Parser\KeywordRegistry $keywordRegistry,
190	Escaper $escaper,
191	$qmarkStripLevel,
192	ParsedQueryClassifiersRepository $classifierRepository,
193	NamespacePrefixParser $namespacePrefixParser,
194	?int $maxQueryLen
195	) {
196	$this->keywordRegistry = $keywordRegistry;
197	$this->escaper = $escaper;
198	$this->keywordParser = new KeywordParser();
199	$this->phraseQueryParser = new PhraseQueryParser( $escaper );
200	$this->nonPhraseParser = new NonPhraseParser( $escaper );
201	$this->questionMarkStripLevel = $qmarkStripLevel;
202	$this->classifierRepository = $classifierRepository;
203	$this->namespacePrefixParser = $namespacePrefixParser;
204	$this->maxQueryLen = $maxQueryLen ?: 300;
205	}
206
207	/**
208	* Reinit internal parser states
209	* @param string $rawQuery
210	*/
211	private function reInit( $rawQuery ) {
212	$this->rawQuery = $rawQuery;
213	$this->query = null;
214	$this->keywordOffsetsTracker = new OffsetTracker();
215	$this->token = null;
216	$this->lookBehind = null;
217	$this->preTaggedNodes = [];
218	$this->warnings = [];
219	$this->queryCleanups = [];
220	$this->namespaceHeader = null;
221	$this->offset = 0;
222	}
223
224	/**
225	* Apply some cleanups to the input query prior to parsing it
226	* Ideally the parser should be able to handle the query without modifying it
227	* but in some cases it simply way easier to handle this this way.
228	* Cleanups applied:
229	* - Question mark stripping depending on $this->questionMarkStripLevel
230	* - gershayim quirks if $this->escaper->getLanguage() is hebrew
231	*/
232	private function cleanup() {
233	$query = $this->rawQuery;
234	$nquery = Util::stripQuestionMarks( $query, $this->questionMarkStripLevel );
235	if ( $nquery !== $query ) {
236	$this->queryCleanups[ParsedQuery::CLEANUP_QMARK_STRIPPING] = true;
237	$query = $nquery;
238	}
239	if ( $this->escaper->getLanguage() === 'he' ) {
240	$nquery = preg_replace( self::GERSHAYIM_REGEX, '$1\"$2', $query );
241	if ( $nquery !== $query ) {
242	$this->queryCleanups[ParsedQuery::CLEANUP_GERSHAYIM_QUIRKS] = true;
243	$query = $nquery;
244	}
245	}
246	if ( strlen( $query ) > 0 && $query[0] === '~' ) {
247	$query = substr( $query, 1 );
248	$this->queryCleanups[ParsedQuery::TILDE_HEADER] = true;
249	}
250	$this->query = $query;
251	}
252
253	/**
254	* @param string $query
255	* @return \CirrusSearch\Parser\AST\ParsedQuery
256	* @throws SearchQueryParseException
257	*/
258	public function parse( string $query ): ParsedQuery {
259	$this->reInit( $query );
260	$queryLen = mb_strlen( $query );
261	if ( $queryLen > self::QUERY_LEN_HARD_LIMIT ) {
262	throw new SearchQueryParseException( 'cirrussearch-query-too-long',
263	$queryLen, self::QUERY_LEN_HARD_LIMIT );
264	}
265	$this->cleanup();
266	$this->parseNsHeader();
267	$this->token = new Token( $this->query );
268	$this->lookBehind = new Token( $this->query );
269
270	// First parse keywords
271	$nonGreedyHeaders = [];
272	$greedyHeaders = [];
273	$greedy = [];
274	$allowingEmpty = [];
275	$normalKeywords = [];
276	foreach ( $this->keywordRegistry->getKeywords() as $keyword ) {
277	// Parsing depends on the nature of the keyword
278	// 1. non greedy query headers
279	// 2. greedy query headers
280	// 3. greedy
281	// 4. allowed empty values (see prefer-recent)
282	// 5. normal
283	// FIXME: refactor this so that parsing is less dependent on keyword ordering
284	// we could try to identify all keyword prefixes in a single regex like /(?<=\G\|[\pZ\pC])(intitle\|prefer-recent\|...):/
285	// and iterate over there. Currently we workaround this issue by separating keywords into categories but that is not
286	// sufficient it's still dependent on ordering within a specific category.
287	if ( !$keyword->greedy() && $keyword->queryHeader() ) {
288	$nonGreedyHeaders[] = $keyword;
289	} elseif ( $keyword->greedy() && $keyword->queryHeader() ) {
290	$greedyHeaders[] = $keyword;
291	} elseif ( $keyword->greedy() ) {
292	$greedy[] = $keyword;
293	} elseif ( $keyword->allowEmptyValue() ) {
294	$allowingEmpty[] = $keyword;
295	} else {
296	$normalKeywords[] = $keyword;
297	}
298	}
299	$this->parseKeywords( $nonGreedyHeaders );
300	$this->parseKeywords( $greedyHeaders );
301	$this->parseKeywords( $greedy );
302	$this->parseKeywords( $allowingEmpty );
303	$this->parseKeywords( $normalKeywords );
304	$this->warnings = array_merge( $this->warnings, $this->keywordParser->getWarnings() );
305	// All parsed keywords have their offsets marked in $this->keywordOffsetsTracker
306	// We then reparse the query from the beginning finding holes between keyword offsets
307	uasort( $this->preTaggedNodes, static function ( ParsedNode $a, ParsedNode $b ) {
308	if ( $a->getStartOffset() < $b->getStartOffset() ) {
309	return -1;
310	} else {
311	// We cannot have equality here
312	return 1;
313	}
314	} );
315	reset( $this->preTaggedNodes );
316
317	$this->checkQueryLen();
318	$root = $this->expression();
319	$additionalNamespaces = $this->extractRequiredNamespaces( $root );
320	return new ParsedQuery( $root, $this->query, $this->rawQuery, $this->queryCleanups,
321	$this->namespaceHeader, $additionalNamespaces, $this->warnings, $this->classifierRepository );
322	}
323
324	private function createClause( ParsedNode $node, $explicit = false, $occur = null ) {
325	if ( $node instanceof NegatedNode ) {
326	// OR NOT is simply MUST_NOT, there's no SHOULD_NOT in lucene
327	// so simply do what lucene QueryString does: force MUST_NOT whenever
328	// we encounter a negated clause.
329	return new BooleanClause( $node->getChild(), BooleanClause::MUST_NOT,
330	$explicit \|\| $node->getNegationType() === 'NOT', $node );
331	}
332	return new BooleanClause( $node, $occur ?? self::DEFAULT_OCCUR, $explicit );
333	}
334
335	/**
336	* (([NOT] Node) (AND\|OR)?)*
337	* Tries to follow behavior with backward order precedence:
338	* - A AND B OR C => MUST:A SHOULD:B SHOULD:C
339	* - A OR B AND C => SHOULD:A MUST:B MUST:C
340	*
341	* NOT is always MUST_NOT:
342	* - A OR NOT B: A -B
343	*
344	* Syntax errors fallback:
345	* - NOT: MUST:NOT
346	* - NOT NOT: MUST_NOT:NOT
347	* - NOT AND FOO: MUST_NOT:AND MUST:FOO
348	* - NOT !FOO: MUST:FOO
349	* @return ParsedNode
350	*/
351	private function expression() {
352	$clauses = [];
353	$left = null;
354	// Last boolean operator seen, -1 means none
355	$lastBoolType = -1;
356	$explicitNegation = false;
357	// TODO: simplify, this is a bit hairy
358	while ( $this->nextToken() ) {
359	if ( $left === null ) {
360	// First iteration
361	if ( !$this->isLeaf() ) {
362	$left = $this->fallbackToWord( [ Token::NOT, Token::PARSED_NODE ] );
363	} else {
364	$left = $this->negatedLeaf();
365	}
366	Assert::postcondition( $left !== null, '$left must not be null' );
367	continue;
368	}
369
370	$explicitNegation = false;
371	// The last boolean operator seen before the last one, -1 means none
372	// used to know if the node was attached explicitly by the user
373	$beforeLastBoolType = $lastBoolType;
374	$lastBoolType = -1;
375	switch ( $this->token->getType() ) {
376	case Token::NOT:
377	// NOT something
378	$this->advance();
379	if ( !$this->nextToken() ) {
380	// NOT<EOF>
381	// strategy is to simply eat the NOT token as a word query
382	$node = $this->unexpectedEOF( [ Token::PARSED_NODE ] );
383	if ( $left instanceof WordsQueryNode ) {
384	// Collapse it to previous words
385	$left = $this->mergeWords( $left, $node );
386	} else {
387	// or add new boolean clause
388	$clauses[] = $this->createClause( $left, $beforeLastBoolType !== -1 );
389	$left = $node;
390	}
391	} else {
392	$explicitNegation = true;
393	$clauses[] = $this->createClause( $left );
394	$left = $this->explicitlyNegatedNode();
395	}
396	Assert::postcondition( $left !== null, '$left must not be null' );
397	break;
398	case Token::PARSED_NODE:
399	// A word or a keyword
400	// note: negation prefix is eaten by the tokenizer
401	if ( $left instanceof WordsQueryNode
402	&& $this->token->getNode() instanceof WordsQueryNode
403	) {
404	$lastBoolType = $beforeLastBoolType;
405	$left = $this->collapseWords( $left );
406
407	} else {
408	$clauses[] = $this->createClause( $left, $explicitNegation );
409	$left = $this->leaf();
410	}
411	Assert::postcondition( $left !== null, '$left must not be null' );
412	break;
413	case Token::BOOL_AND:
414	case Token::BOOL_OR:
415	$lastBoolType = $this->token->getType();
416	$this->advance();
417	if ( !$this->nextToken() ) {
418	$lastBoolType = $beforeLastBoolType;
419	$node = $this->unexpectedEOF( [ Token::NOT, Token::PARSED_NODE ] );
420	if ( $left instanceof WordsQueryNode ) {
421	// "catapult \|\|"
422	$left = $this->mergeWords( $left, $node );
423	} else {
424	// "!catapult \|\|"
425	$clauses[] = $this->createClause( $left, $beforeLastBoolType !== -1 );
426	$left = $node;
427	}
428	Assert::postcondition( $left !== null, '$left must not be null' );
429	break;
430	}
431	$occur = $this->boolToOccur( $lastBoolType );
432	if ( $this->isLeaf() ) {
433	$node = $this->negatedLeaf();
434	$clauses[] = $this->createClause( $left, true, $occur );
435	$left = $node;
436	} else {
437	$clauses[] = $this->createClause( $left );
438	$left = $this->fallbackToWord( [ Token::NOT, Token::PARSED_NODE ] );
439	}
440	Assert::postcondition( $left !== null, '$left must not be null' );
441	break;
442	default:
443	throw new LogicException( "BUG: unexpected token type {$this->token->getType()}" );
444	}
445	}
446
447	if ( $left === null ) {
448	return new EmptyQueryNode( 0, strlen( $this->query ) );
449	}
450	if ( $clauses !== [] ) {
451	if ( $lastBoolType !== -1 ) {
452	$occur = $this->boolToOccur( $lastBoolType );
453	$clauses[] = $this->createClause( $left, true, $occur );
454	} else {
455	$clauses[] = $this->createClause( $left, $explicitNegation );
456	}
457	return $this->createBoolNode( $clauses );
458	} elseif ( $left instanceof NegatedNode ) {
459	$clauses[] = $this->createClause( $left );
460	return $this->createBoolNode( $clauses );
461	}
462	return $left;
463	}
464
465	/**
466	* @param BooleanClause[] $clauses
467	* @return ParsedBooleanNode
468	*/
469	private function createBoolNode( array $clauses ) {
470	$end = end( $clauses )->getNode()->getEndOffset();
471	$start = reset( $clauses )->getNode()->getStartOffset();
472	return new ParsedBooleanNode( $start, $end, $clauses );
473	}
474
475	/**
476	* Collapse the current token with $left.
477	* @param WordsQueryNode $left
478	* @return WordsQueryNode
479	*/
480	private function collapseWords( WordsQueryNode $left ) {
481	$start = $left->getStartOffset();
482	$end = $this->token->getEnd();
483	$word = new WordsQueryNode( $start, $end,
484	$this->escaper->unescape( substr( $this->query, $start, $end - $start ) ) );
485	$this->advance();
486	return $word;
487	}
488
489	private function mergeWords( WordsQueryNode $left, WordsQueryNode $right ) {
490	$start = $left->getStartOffset();
491	$end = $right->getEndOffset();
492	return new WordsQueryNode( $start, $end,
493	$this->escaper->unescape( substr( $this->query, $start, $end - $start ) ) );
494	}
495
496	/**
497	* @param int[] $expected token types
498	* @return WordsQueryNode
499	*/
500	private function fallbackToWord( array $expected ) {
501	$this->warnings[] = new ParseWarning(
502	'cirrussearch-parse-error-unexpected-token',
503	$this->token->getStart(),
504	Token::getTypeLabels( $expected ),
505	Token::getTypeLabel( $this->token->getType() )
506	);
507	$word = new WordsQueryNode( $this->token->getStart(), $this->token->getEnd(),
508	$this->token->getImage() );
509	$this->advance();
510	return $word;
511	}
512
513	/**
514	* @param int[] $expected token types
515	* @return WordsQueryNode
516	*/
517	private function unexpectedEOF( array $expected ) {
518	$this->warnings[] = new ParseWarning(
519	'cirrussearch-parse-error-unexpected-end',
520	strlen( $this->query ),
521	Token::getTypeLabels( $expected ),
522	Token::getTypeLabel( $this->token->getType() )
523	);
524	return new WordsQueryNode( $this->lookBehind->getStart(), $this->lookBehind->getEnd(),
525	$this->lookBehind->getImage() );
526	}
527
528	private function negatedLeaf() {
529	if ( $this->token->getType() === Token::NOT ) {
530	$this->advance();
531	if ( !$this->nextToken() ) {
532	return $this->unexpectedEOF( [ Token::PARSED_NODE ] );
533	}
534	return $this->explicitlyNegatedNode();
535	}
536	return $this->leaf();
537	}
538
539	private function leaf() {
540	$node = $this->token->getNode();
541	$this->advance();
542	return $node;
543	}
544
545	/**
546	* (word\|phrase\|keyword\|wildcard\|fuzzy)
547	*
548	* Warnings on :
549	*
550	* !(word\|phrase\|keyword\|special_word) => double negation dropped
551	* (AND\|OR) => NOT("AND\|OR")
552	* EOF => "NOT"
553	*
554	* @return ParsedNode
555	*/
556	private function explicitlyNegatedNode() {
557	if ( $this->token->getType() === Token::PARSED_NODE ) {
558	$node = $this->leaf();
559	if ( $node instanceof NegatedNode ) {
560	$this->warnings[] = new ParseWarning( 'cirrussearch-parse-error-double-negation',
561	$node->getStartOffset() );
562	$node = $node->getChild();
563	} else {
564	$node = new NegatedNode( $this->lookBehind->getStart(), $node->getEndOffset(),
565	// @phan-suppress-next-line PhanTypeMismatchArgumentNullable $node not null after getType
566	$node, $this->lookBehind->getImage() );
567	}
568	} else {
569	$node = $this->fallbackToWord( [ Token::PARSED_NODE ] );
570	}
571	return $node;
572	}
573
574	/**
575	* @return bool true if it's a simple word token
576	*/
577	private function isLeaf() {
578	return $this->token->getType() === Token::PARSED_NODE
579	\|\| $this->token->getType() === Token::NOT;
580	}
581
582	/**
583	* @param KeywordFeature[] $keywords
584	*/
585	private function parseKeywords( array $keywords ) {
586	foreach ( $keywords as $kw ) {
587	$parsedKeywords =
588	$this->keywordParser->parse( $this->query, $kw, $this->keywordOffsetsTracker, $this->offset );
589	$this->keywordOffsetsTracker->appendNodes( $parsedKeywords );
590	foreach ( $parsedKeywords as $keyword ) {
591	$this->preTaggedNodes[] = $keyword;
592	}
593	}
594	}
595
596	/**
597	* @return bool
598	*/
599	private function nextToken() {
600	Assert::precondition( $this->token->getStart() < $this->offset,
601	'You should not call nextToken() twice on the same offsets' );
602	$this->token->copyTo( $this->lookBehind );
603	$this->token->reset();
604	$nextPretagged = current( $this->preTaggedNodes );
605	$queryLen = strlen( $this->query );
606	$maxOffset = $nextPretagged !== false ?
607	$nextPretagged->getStartOffset() : $queryLen;
608
609	$this->consumeWS();
610
611	if ( $this->offset >= $queryLen ) {
612	$this->token->eof();
613	return false;
614	}
615
616	if ( $nextPretagged !== false && $this->offset === $nextPretagged->getStartOffset() ) {
617	$this->token->node( $nextPretagged );
618	return true;
619	}
620
621	Assert::precondition( $this->offset < $maxOffset, '$this->offset < $maxOffset' );
622	if ( $this->consumeBoolOp() ) {
623	return true;
624	}
625
626	if ( $this->consumePhrase( $maxOffset ) ) {
627	return true;
628	}
629
630	if ( $this->consumeWord( $maxOffset ) ) {
631	return true;
632	}
633
634	if ( $this->consumeUnbalancedPhrase( $maxOffset ) ) {
635	$this->warnings[] = new ParseWarning( "cirrus-parse-error-unbalanced-phrase", $this->token->getStart() );
636	// this is theorically the only remaining option (unbalanced phrase query)
637	// if not it means the above is broken and needs a fix
638	return true;
639	}
640	throw new \RuntimeException( "BUG: cannot consume query at offset $this->offset (need to go to $maxOffset)" );
641	}
642
643	/**
644	* @return bool
645	*/
646	private function consumeWS() {
647	$matches = [];
648	if ( preg_match( self::WHITESPACE_REGEX, $this->query, $matches, 0, $this->offset ) === 1 ) {
649	$this->offset += strlen( $matches[0] );
650	return true;
651	}
652	return false;
653	}
654
655	/**
656	* Consume an explicit boolean operator (and/or/not)
657	* @return bool true if consumed, false otherwise.
658	*/
659	private function consumeBoolOp() {
660	$match = [];
661	if ( preg_match( self::EXPLICIT_BOOLEAN_OPERATOR, $this->query, $match, 0, $this->offset ) ) {
662	// Check captured backward so no need to check that the group actually matched
663	if ( isset( $match['NOT'] ) ) {
664	$this->token->setType( Token::NOT, $this->offset, $this->offset + strlen( $match['NOT'] ) );
665	} elseif ( isset( $match['OR'] ) ) {
666	$this->token->setType( Token::BOOL_OR, $this->offset, $this->offset + strlen( $match['OR'] ) );
667	} else {
668	$this->token->setType( Token::BOOL_AND, $this->offset,
669	$this->offset + strlen( $match['AND'] ) );
670	}
671	return true;
672	}
673	return false;
674	}
675
676	/**
677	* @param int $maxOffset
678	* @return bool true if consumed, false otherwise.
679	*/
680	private function consumePhrase( $maxOffset ) {
681	$node = $this->phraseQueryParser->parse( $this->query, $this->offset, $maxOffset );
682	if ( $node !== null ) {
683	$this->token->node( $node );
684	return true;
685	}
686	return false;
687	}
688
689	/**
690	* @param int $maxOffset
691	* @return bool
692	*/
693	private function consumeWord( int $maxOffset ) {
694	$node = $this->nonPhraseParser->parse( $this->query, $this->offset, $maxOffset );
695	if ( $node !== null ) {
696	$this->token->node( $node );
697	return true;
698	}
699	return false;
700	}
701
702	/**
703	* @param int $maxOffset
704	* @return bool
705	*/
706	private function consumeUnbalancedPhrase( $maxOffset ) {
707	$matches = [];
708	if ( preg_match( PhraseQueryParser::PHRASE_START, $this->query, $matches, 0, $this->offset ) === 1 ) {
709	$negated = isset( $matches['negate'] ) && strlen( $matches['negate'] ) > 0 ? $matches['negate'] : '';
710	$wholeStart = $this->offset;
711	$phraseStart = $wholeStart + strlen( $negated );
712	$innerStart = $phraseStart + strlen( '"' );
713	Assert::invariant( $wholeStart <= $maxOffset, '$start <= $to' );
714	$phrase = $maxOffset > $innerStart ? substr( $this->query, $innerStart, $maxOffset - $innerStart ) : "";
715	$node = PhraseQueryNode::unbalanced( $phraseStart, $maxOffset, $phrase );
716	if ( $negated !== '' ) {
717	$node = new NegatedNode( $wholeStart, $maxOffset, $node, $negated );
718	}
719	$this->token->node( $node );
720	return true;
721	}
722	return false;
723	}
724
725	/**
726	* advance current offset to the end of the current token
727	*/
728	private function advance() {
729	$pretagged = current( $this->preTaggedNodes );
730	if ( $pretagged !== false && $this->token->getStart() === $pretagged->getStartOffset() ) {
731	next( $this->preTaggedNodes );
732	}
733	$this->offset = $this->token->getEnd();
734	}
735
736	/**
737	* @param int $boolOperator
738	* @return string
739	*/
740	private function boolToOccur( $boolOperator ) {
741	return $boolOperator === Token::BOOL_OR ? BooleanClause::SHOULD : BooleanClause::MUST;
742	}
743
744	/**
745	* @param ParsedNode $root
746	* @return array\|string array of additional namespaces, 'all' for everything
747	*/
748	private function extractRequiredNamespaces( ParsedNode $root ) {
749	$visitor = new class( [], [ PrefixFeature::class ] ) extends KeywordNodeVisitor {
750	/** @var string\|int[] */
751	public $total = [];
752
753	/**
754	* @param KeywordFeatureNode $node
755	*/
756	public function doVisitKeyword( KeywordFeatureNode $node ) {
757	if ( $this->total === 'all' ) {
758	return;
759	}
760
761	Assert::parameter( $node->getKeyword() instanceof PrefixFeature, '$node', 'must be parsed from PrefixFeature' );
762	// @phan-suppress-next-line PhanTypeArraySuspiciousNullable
763	$additional = $node->getParsedValue()[PrefixFeature::PARSED_NAMESPACES];
764	if ( $additional === 'all' ) {
765	$this->total = 'all';
766	return;
767	}
768	Assert::precondition( is_array( $additional ),
769	'PrefixFeature::PARSED_NAMESPACES key must point to an array or "all"' );
770	$this->total = array_merge( $this->total, array_filter( $additional, function ( $v ) {
771	return !in_array( $v, $this->total );
772	} ) );
773	}
774	};
775	$root->accept( $visitor );
776	return $visitor->total;
777	}
778
779	/**
780	* Inspect $this->query to see if it mentions a namespace in the first few chars
781	* If yes $this->namespaceHeader will be set and this->offset will be advanced
782	* to the actual start of the query.
783	*/
784	private function parseNsHeader() {
785	Assert::precondition( $this->offset === 0, 'ns header must be the first parsed bits or ' .
786	'you must properly handle offset in this method.' );
787	$queryAndNs = $this->namespacePrefixParser->parse( $this->query );
788	if ( $queryAndNs !== false ) {
789	Assert::postcondition( count( $queryAndNs ) === 2,
790	'\SearchEngine::parseNamespacePrefixes() must return false or a 2 elements array' );
791	$queryOffset = strlen( $this->query ) - strlen( $queryAndNs[0] );
792	if ( $queryAndNs[1] ) {
793	Assert::postcondition( is_array( $queryAndNs[1] ) && count( $queryAndNs[1] ) === 1,
794	'\SearchEngine::parseNamespacePrefixes() should return an array whose second ' .
795	'element is falsy or an array of size 1' );
796	$this->namespaceHeader = new NamespaceHeaderNode( $this->offset, $queryOffset, reset( $queryAndNs[1] ) );
797	} else {
798	$this->namespaceHeader = new NamespaceHeaderNode( $this->offset, $queryOffset, 'all' );
799	}
800	$this->offset = $queryOffset;
801	}
802	}
803
804	/**
805	* Check the length of the query and throws SearchQueryParseException
806	* if it's more than what we allow.
807	*
808	* @throws SearchQueryParseException
809	*/
810	private function checkQueryLen(): void {
811	Assert::precondition( $this->query !== null, "Query must be set" );
812	$maxLen = $this->maxQueryLen;
813	$exemptKeywords = [];
814	foreach ( $this->preTaggedNodes as $node ) {
815	$realNode = $node;
816	if ( $node instanceof NegatedNode ) {
817	$realNode = $node->getChild();
818	}
819	if ( $realNode instanceof KeywordFeatureNode ) {
820	$maxLen += mb_strlen( substr( $this->query, $node->getStartOffset(), $node->getEndOffset() ) );
821	$exemptKeywords[] = $realNode->getKey();
822	}
823	}
824	$queryLen = mb_strlen( $this->query );
825	if ( $queryLen > $maxLen ) {
826	if ( $exemptKeywords ) {
827	sort( $exemptKeywords );
828	throw new SearchQueryParseException( 'cirrussearch-query-too-long-with-exemptions',
829	$queryLen, $this->maxQueryLen, Message::listParam( array_unique( $exemptKeywords ), 'comma' ) );
830	} else {
831	throw new SearchQueryParseException( 'cirrussearch-query-too-long', $queryLen,
832	$this->maxQueryLen );
833	}
834	}
835	}
836	}