Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
50 / 50 |
|
100.00% |
2 / 2 |
CRAP | |
100.00% |
1 / 1 |
NonPhraseParser | |
100.00% |
50 / 50 |
|
100.00% |
2 / 2 |
19 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
parse | |
100.00% |
46 / 46 |
|
100.00% |
1 / 1 |
17 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Parser\QueryStringRegex; |
4 | |
5 | use CirrusSearch\Parser\AST\FuzzyNode; |
6 | use CirrusSearch\Parser\AST\NegatedNode; |
7 | use CirrusSearch\Parser\AST\ParsedNode; |
8 | use CirrusSearch\Parser\AST\PrefixNode; |
9 | use CirrusSearch\Parser\AST\WildcardNode; |
10 | use CirrusSearch\Parser\AST\WordsQueryNode; |
11 | use CirrusSearch\Search\Escaper; |
12 | use Wikimedia\Assert\Assert; |
13 | |
/**
 * Parse non-phrase query parts.
 *
 * Emits a FuzzyNode, a PrefixNode or a WildcardNode when the word uses the
 * corresponding syntax, and a WordsQueryNode otherwise. The resulting node is
 * wrapped in a NegatedNode when the word is prefixed with a negation marker
 * (- or !).
 */
class NonPhraseParser {

	/**
	 * Detects prefixed negation but ignores negation if not followed by a letter, a number or _
	 * -word: properly negated
	 * --word: eaten as "--word"
	 *
	 * The \G anchor forces the match to start exactly at the offset given to preg_match.
	 */
	private const NEGATION = '/\G[-!](?=[\w])/u';

	/**
	 * Consume non quoted chars (negated phrase queries as well)
	 * Alternatives, in order:
	 * - any escaped sequence (a backslash followed by any char) is consumed
	 * - ! or - is consumed only if it is not followed by " (the negative lookahead
	 *   also succeeds at the end of the string, so a trailing ! or - is consumed)
	 * - the named group "stop" matches the char that ends the word: ", a separator (\pZ),
	 *   a control/other char (\pC), or a ! or - that is followed by "
	 */
	private const NON_QUOTE = '/\\\\.|[!-](?!")|(?<stop>["!\pZ\pC-])/u';

	/**
	 * Detect simple prefix nodes
	 * only word chars (letters, numbers and _) followed by one or more *
	 */
	private const PREFIX_QUERY = '/^(?<prefix>\w+)[*]+$/u';

	/**
	 * Wildcards disallowed at the beginning
	 * we arbitrarily allow 3 wildcards to avoid catching random garbage
	 * and too costly queries.
	 */
	private const DISALLOWED_LEADING_WILDCARD = '/^(?:\w+[?*]){1,3}\w*$/u';

	/**
	 * Wildcards allowed at the beginning
	 * but we still force the wildcards to be surrounded by letters
	 * we allow only 3 wildcards
	 */
	private const ALLOWED_LEADING_WILDCARD = '/^(?:(?:[?*](?=\w)(?:\w+[?*]|\w+){1,2}\w*)|(?:(?:\w+[?*]){1,3}\w*))$/u';

	/**
	 * We force fuzzy words to have letters in them
	 * NOTE that we disallow * or ? here so we can't
	 * match fuzzy and wildcard at the same time
	 */
	private const FUZZY_WORD = '/^(?<word>\w+)~(?<fuzzyness>[0-2])?$/u';

	/**
	 * @var Escaper used to unescape plain words and to know if leading wildcards are allowed
	 */
	private Escaper $escaper;

	/**
	 * @var string regex used to detect wildcards
	 */
	private string $wildcardRegex;

	/**
	 * @param Escaper $escaper
	 */
	public function __construct( Escaper $escaper ) {
		$this->escaper = $escaper;
		// The wildcard syntax accepted depends on the escaper configuration.
		$this->wildcardRegex = $escaper->getAllowLeadingWildcard()
			? self::ALLOWED_LEADING_WILDCARD
			: self::DISALLOWED_LEADING_WILDCARD;
	}

	/**
	 * Parse a single non-phrase word located between $start and $end.
	 *
	 * Offsets are byte offsets (they are used with strlen, substr and preg_match).
	 * Both $start < $end and $end <= strlen( $query ) are checked with
	 * Assert::precondition.
	 *
	 * @param string $query
	 * @param int $start offset where the word (or its negation marker) starts
	 * @param int $end offset the word must not run past
	 * @return ParsedNode|null null when no char could be consumed at $start
	 */
	public function parse( string $query, int $start, int $end ) {
		Assert::precondition( $start < $end, '$start < $end' );
		Assert::precondition( $end <= strlen( $query ), '$end <= strlen( $query )' );

		$match = [];
		$ret = preg_match( self::NEGATION, $query, $match, PREG_OFFSET_CAPTURE, $start );
		Assert::postcondition( $ret !== false, 'Regex failed: ' . preg_last_error() );

		$wholeStart = $start;
		$wordStart = $start;
		$negationType = '';
		if ( $ret ) {
			$negationType = $match[0][0];
			$wordStart = $start + strlen( $negationType );
		}

		// The scan deliberately starts at $start and not at $wordStart: a negation marker
		// detected above is never followed by ", so NON_QUOTE consumes it as a regular char
		// and the scan ends at $wordStart or later.
		$wholeEnd = $this->findWordEnd( $query, $start, $end );
		if ( $wholeEnd === $wordStart ) {
			// Nothing but (possibly) the negation marker could be consumed.
			return null;
		}

		$word = substr( $query, $wordStart, $wholeEnd - $wordStart );
		$node = $this->buildNode( $word, $wordStart, $wholeEnd );
		if ( $negationType !== '' ) {
			$node = new NegatedNode( $wholeStart, $wholeEnd, $node, $negationType );
		}
		return $node;
	}

	/**
	 * Find the offset at which the word starting at $start ends.
	 *
	 * @param string $query
	 * @param int $start offset where the scan starts
	 * @param int $end upper bound of the returned offset
	 * @return int offset of the first stop char, capped to $end
	 */
	private function findWordEnd( string $query, int $start, int $end ): int {
		$offset = $start;
		while ( $offset < $end ) {
			$match = [];
			$ret = preg_match( self::NON_QUOTE, $query, $match, PREG_OFFSET_CAPTURE, $offset );
			Assert::postcondition( $ret !== false, 'Regex failed: ' . preg_last_error() );
			if ( !$ret ) {
				// Nothing special up to the end of the string: the word runs up to $end.
				return $end;
			}
			if ( isset( $match['stop'] ) && $match['stop'][1] >= 0 ) {
				// preg_match searches the whole string, the stop char may be located
				// after $end when $end < strlen( $query ).
				return min( $match['stop'][1], $end );
			}
			// A consumable sequence was found, resume the scan right after it.
			$offset = $match[0][1] + strlen( $match[0][0] );
		}
		// The last consumed sequence may run past $end (e.g. an escaped sequence
		// straddling $end), never report more than $end.
		return min( $offset, $end );
	}

	/**
	 * Build the node matching the syntax used by $word.
	 *
	 * @param string $word the word without its negation marker
	 * @param int $wordStart offset where $word starts in the query
	 * @param int $wholeEnd offset where $word ends in the query
	 * @return ParsedNode
	 */
	private function buildNode( string $word, int $wordStart, int $wholeEnd ) {
		$match = [];
		if ( str_contains( $word, '~' ) && preg_match( self::FUZZY_WORD, $word, $match ) ) {
			// -1 means that no explicit fuzziness was provided
			$fuzzyness = ( $match['fuzzyness'] ?? '' ) !== '' ? intval( $match['fuzzyness'] ) : -1;
			// No need to unescape here, we don't match any punctuation except _
			return new FuzzyNode( $wordStart, $wholeEnd, $match['word'], $fuzzyness );
		}

		if ( str_contains( $word, '*' ) || str_contains( $word, '?' ) ) {
			if ( preg_match( self::PREFIX_QUERY, $word, $match ) ) {
				return new PrefixNode( $wordStart, $wholeEnd, $match['prefix'] );
			}
			if ( preg_match( $this->wildcardRegex, $word ) ) {
				return new WildcardNode( $wordStart, $wholeEnd, $word );
			}
		}

		// No special syntax detected (or not a valid one): plain words.
		return new WordsQueryNode( $wordStart, $wholeEnd, $this->escaper->unescape( $word ) );
	}
}