Code Coverage for /workspace/src/extensions/CirrusSearch/includes/Parser/QueryStringRegex/NonPhraseParser.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	100.00% covered (success)	100.00%	50 / 50	100.00% covered (success)	100.00%	2 / 2	CRAP	100.00% covered (success)	100.00%	1 / 1
NonPhraseParser	100.00% covered (success)	100.00%	50 / 50	100.00% covered (success)	100.00%	2 / 2	19	100.00% covered (success)	100.00%	1 / 1
__construct	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	2
parse	100.00% covered (success)	100.00%	46 / 46	100.00% covered (success)	100.00%	1 / 1	17

1	<?php
2
3	namespace CirrusSearch\Parser\QueryStringRegex;
4
5	use CirrusSearch\Parser\AST\FuzzyNode;
6	use CirrusSearch\Parser\AST\NegatedNode;
7	use CirrusSearch\Parser\AST\ParsedNode;
8	use CirrusSearch\Parser\AST\PrefixNode;
9	use CirrusSearch\Parser\AST\WildcardNode;
10	use CirrusSearch\Parser\AST\WordsQueryNode;
11	use CirrusSearch\Search\Escaper;
12	use Wikimedia\Assert\Assert;
13
/**
 * Parse non-phrase query parts.
 *
 * Depending on the syntax detected in the consumed word, parse() emits:
 * - a FuzzyNode for "word~" or "word~N",
 * - a PrefixNode for "word*",
 * - a WildcardNode when the word matches the configured wildcard pattern,
 * - a WordsQueryNode otherwise.
 * The node is wrapped in a NegatedNode when the word is prefixed with - or !.
 */
class NonPhraseParser {

	/**
	 * Detects prefixed negation but ignores negation if not followed by a letter, a number or _
	 * -word: properly negated
	 * --word: eaten as "--word"
	 *
	 * few markups are added
	 *
	 * \G anchors the match at the offset given to preg_match, so a match
	 * always begins exactly at the requested start position.
	 */
	private const NEGATION = '/\G[-!](?=[\w])/u';

	/**
	 * Consume non quoted chars (negated phrase queries as well)
	 * allows:
	 * - all escaped sequences
	 * - !- only if they are not followed by " (the negative lookahead also
	 *   succeeds at the end of the string, so a trailing ! or - is consumed)
	 * - stops (named group "stop") at the first ", separator (\pZ) or
	 *   control/other (\pC) character, or at a ! or - directly followed by "
	 */
	private const NON_QUOTE = '/\\\\.|[!-](?!")|(?<stop>["!\pZ\pC-])/u';

	/**
	 * Detect simple prefix nodes
	 * only letters and number allowed
	 */
	private const PREFIX_QUERY = '/^(?<prefix>\w+)[*]+$/u';

	/**
	 * Wildcards disallowed at the beginning
	 * we arbitrarily allow 3 wildcards to avoid catching random garbage
	 * and too costly queries.
	 *
	 * One to three runs of word characters each followed by ? or *,
	 * then optional trailing word characters.
	 */
	private const DISALLOWED_LEADING_WILDCARD = '/^(?:\w+[?*]){1,3}\w*$/u';

	/**
	 * Wildcards allowed at the beginning
	 * but we still force the wildcards to be surrounded by letters
	 * we allow only 3 wildcards
	 */
	private const ALLOWED_LEADING_WILDCARD = '/^(?:(?:[?*](?=\w)(?:\w+[?*]|\w+){1,2}\w*)|(?:(?:\w+[?*]){1,3}\w*))$/u';

	/**
	 * We force fuzzy words to have letters in them
	 * NOTE that we disallow * or ? here so we can't
	 * match fuzzy and wildcard at the same time
	 */
	private const FUZZY_WORD = '/^(?<word>\w+)~(?<fuzzyness>[0-2])?$/u';

	/**
	 * @var Escaper used to unescape plain words and to read the leading wildcard setting
	 */
	private Escaper $escaper;

	/**
	 * @var string regex used to detect wildcards, one of the *_LEADING_WILDCARD constants
	 */
	private string $wildcardRegex;

	/**
	 * @param Escaper $escaper
	 */
	public function __construct( Escaper $escaper ) {
		$this->escaper = $escaper;
		$this->wildcardRegex = $escaper->getAllowLeadingWildcard()
			? self::ALLOWED_LEADING_WILDCARD
			: self::DISALLOWED_LEADING_WILDCARD;
	}

	/**
	 * Parse a single non-phrase word of $query starting at $start.
	 *
	 * All offsets are byte offsets (they come from and are fed to preg_match
	 * and substr), which is why strlen/substr are used rather than mb_*.
	 *
	 * @param string $query
	 * @param int $start byte offset where parsing begins, must be < $end
	 * @param int $end byte offset (exclusive) parsing must not go past, must be <= strlen( $query )
	 * @return ParsedNode|null the parsed node, or null when no character could be
	 *  consumed (the word would be empty)
	 */
	public function parse( string $query, int $start, int $end ) {
		Assert::precondition( $start < $end, '$start < $end' );
		Assert::precondition( $end <= strlen( $query ), '$end <= strlen( $query )' );

		$match = [];
		$ret = preg_match( self::NEGATION, $query, $match, PREG_OFFSET_CAPTURE, $start );
		Assert::postcondition( $ret !== false, 'Regex failed: ' . preg_last_error() );

		$wholeStart = $start;
		$wordStart = $start;
		$negationType = '';
		if ( $ret ) {
			// The negation marker is excluded from the word but stays part of
			// the NegatedNode span. $start is deliberately left untouched: the
			// scan below consumes the marker itself.
			$negationType = $match[0][0];
			$wordStart = $start + strlen( $negationType );
		}

		$wholeEnd = $this->findWordEnd( $query, $start, $end );
		if ( $wholeEnd === $wordStart ) {
			return null;
		}

		$word = substr( $query, $wordStart, $wholeEnd - $wordStart );
		$node = $this->detectSpecialNode( $word, $wordStart, $wholeEnd )
			?? new WordsQueryNode( $wordStart, $wholeEnd, $this->escaper->unescape( $word ) );

		if ( $negationType !== '' ) {
			$node = new NegatedNode( $wholeStart, $wholeEnd, $node, $negationType );
		}
		return $node;
	}

	/**
	 * Find the byte offset (exclusive) where the word starting at $start ends.
	 *
	 * Repeatedly applies NON_QUOTE: escaped sequences and !/- not followed by
	 * a quote are skipped over, the scan ends at the first "stop" character
	 * or at $end when nothing else is found.
	 *
	 * @param string $query
	 * @param int $start byte offset to start scanning from
	 * @param int $end byte offset the result is bounded by
	 * @return int
	 */
	private function findWordEnd( string $query, int $start, int $end ): int {
		$wholeEnd = -1;
		while ( $start < $end ) {
			$match = [];
			$ret = preg_match( self::NON_QUOTE, $query, $match, PREG_OFFSET_CAPTURE, $start );
			Assert::postcondition( $ret !== false, 'Regex failed: ' . preg_last_error() );
			if ( !$ret ) {
				// Nothing special up to the end of the string
				$wholeEnd = $end;
				break;
			}
			if ( isset( $match['stop'] ) && $match['stop'][1] >= 0 ) {
				$wholeEnd = $match['stop'][1];
				break;
			}
			// Skip over the consumed escape sequence or !/- and keep scanning
			$start = $match[0][1] + strlen( $match[0][0] );
			$wholeEnd = $start;
		}
		// NON_QUOTE is not anchored and searches up to the end of $query:
		// never report a position past the requested $end.
		return min( $wholeEnd, $end );
	}

	/**
	 * Build a fuzzy, prefix or wildcard node if $word uses such syntax.
	 *
	 * @param string $word the raw (still escaped) word
	 * @param int $wordStart byte offset of the word in the query
	 * @param int $wholeEnd byte offset (exclusive) of the end of the word
	 * @return ParsedNode|null null when the word carries no special syntax
	 */
	private function detectSpecialNode( string $word, int $wordStart, int $wholeEnd ) {
		$match = [];
		// str_contains is a cheap guard to avoid running the regexes on plain words
		if ( str_contains( $word, '~' ) && preg_match( self::FUZZY_WORD, $word, $match ) ) {
			// -1 is passed when no explicit fuzziness digit follows the ~
			$fuzzyness = ( $match['fuzzyness'] ?? '' ) !== ''
				? intval( $match['fuzzyness'] )
				: -1;
			// No need to unescape here, we don't match any punctuation except _
			return new FuzzyNode( $wordStart, $wholeEnd, $match['word'], $fuzzyness );
		}

		if ( str_contains( $word, '*' ) || str_contains( $word, '?' ) ) {
			if ( preg_match( self::PREFIX_QUERY, $word, $match ) ) {
				return new PrefixNode( $wordStart, $wholeEnd, $match['prefix'] );
			}
			if ( preg_match( $this->wildcardRegex, $word ) ) {
				return new WildcardNode( $wordStart, $wholeEnd, $word );
			}
		}

		return null;
	}
}