Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
97.85% |
319 / 326 |
|
85.19% |
23 / 27 |
CRAP | |
0.00% |
0 / 1 |
QueryStringRegexParser | |
97.85% |
319 / 326 |
|
85.19% |
23 / 27 |
102 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
2 | |||
reInit | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
cleanup | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
6 | |||
parse | |
100.00% |
41 / 41 |
|
100.00% |
1 / 1 |
10 | |||
createClause | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
expression | |
94.44% |
68 / 72 |
|
0.00% |
0 / 1 |
20.07 | |||
createBoolNode | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
collapseWords | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
1 | |||
mergeWords | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
fallbackToWord | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
unexpectedEOF | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 | |||
negatedLeaf | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
leaf | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
explicitlyNegatedNode | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
3 | |||
isLeaf | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
parseKeywords | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
nextToken | |
96.15% |
25 / 26 |
|
0.00% |
0 / 1 |
9 | |||
consumeWS | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
consumeBoolOp | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
consumePhrase | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
consumeWord | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
consumeUnbalancedPhrase | |
92.86% |
13 / 14 |
|
0.00% |
0 / 1 |
6.01 | |||
advance | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
3 | |||
boolToOccur | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
extractRequiredNamespaces | |
93.75% |
15 / 16 |
|
0.00% |
0 / 1 |
3.00 | |||
parseNsHeader | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
4 | |||
checkQueryLen | |
100.00% |
18 / 18 |
|
100.00% |
1 / 1 |
6 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Parser\QueryStringRegex; |
4 | |
5 | use CirrusSearch\Parser\AST\BooleanClause; |
6 | use CirrusSearch\Parser\AST\EmptyQueryNode; |
7 | use CirrusSearch\Parser\AST\KeywordFeatureNode; |
8 | use CirrusSearch\Parser\AST\NamespaceHeaderNode; |
9 | use CirrusSearch\Parser\AST\NegatedNode; |
10 | use CirrusSearch\Parser\AST\ParsedBooleanNode; |
11 | use CirrusSearch\Parser\AST\ParsedNode; |
12 | use CirrusSearch\Parser\AST\ParsedQuery; |
13 | use CirrusSearch\Parser\AST\ParseWarning; |
14 | use CirrusSearch\Parser\AST\PhraseQueryNode; |
15 | use CirrusSearch\Parser\AST\Visitor\KeywordNodeVisitor; |
16 | use CirrusSearch\Parser\AST\WordsQueryNode; |
17 | use CirrusSearch\Parser\NamespacePrefixParser; |
18 | use CirrusSearch\Parser\ParsedQueryClassifiersRepository; |
19 | use CirrusSearch\Parser\QueryParser; |
20 | use CirrusSearch\Query\KeywordFeature; |
21 | use CirrusSearch\Query\PrefixFeature; |
22 | use CirrusSearch\Search\Escaper; |
23 | use CirrusSearch\Util; |
24 | use Message; |
25 | use Wikimedia\Assert\Assert; |
26 | |
27 | /** |
28 | * Full text query parser that uses regex to parse its tokens. |
29 | * |
30 | * Far from being a state-of-the-art parser, it detects most of its |
31 | * tokens using regular expressions and makes arbitrary decisions |
32 | * at tokenization time. |
33 | * |
34 | * The tokenizer understands a few token types: |
35 | * - WHITESPACE: all unicode whitespace and control chars ([\pZ\pC]) |
36 | * the WHITESPACE token is ignored and never presented to the parser |
37 | * - EOF: dummy type used to mark end of string |
38 | * - BOOL_AND/BOOL_OR/BOOL_NOT: explicit boolean operators |
39 | * - PARSED_NODE: complex type (usually part of the query) |
40 | * |
41 | * PARSED_NODE is a type that groups: |
42 | * - Keywords |
43 | * - Phrase |
44 | * - Words |
45 | * - Wildcards/Prefix |
46 | * |
47 | * A phrase does not have its own token: the double quote (") is handled during |
48 | * tokenization and is never exposed to the parser. |
49 | * Same for negation prefix (! and -), they are parsed at tokenization time. |
50 | * |
51 | * NOTE that this parser is broken by design: |
52 | * - no lexical context support, we first parse keywords |
53 | * - no support for groupings (parenthesis) |
54 | */ |
55 | class QueryStringRegexParser implements QueryParser { |
56 | /** |
57 | * Whitespace regex including unicode and some control chars |
58 | */ |
59 | private const WHITESPACE_REGEX = '/\G[\pZ\pC]+/u'; |
60 | |
61 | public const QUERY_LEN_HARD_LIMIT = 4096; |
62 | |
63 | /** |
64 | * see T66350 |
65 | */ |
66 | private const GERSHAYIM_REGEX = '/(\p{L}{2,})(?:")(\p{L})(?=[^\p{L}]|$)/u'; |
67 | |
68 | /** |
69 | * Supported explicit boolean operator |
70 | * |
71 | */ |
72 | private const EXPLICIT_BOOLEAN_OPERATOR = '/\G(?:(?<AND>AND|&&)|(?<OR>OR|\|\|)|(?<NOT>NOT))(?![^\pZ\pC"])/u'; |
73 | |
74 | /** |
75 | * @var \CirrusSearch\Parser\KeywordRegistry |
76 | */ |
77 | private $keywordRegistry; |
78 | |
79 | /** |
80 | * @var Escaper |
81 | */ |
82 | private $escaper; |
83 | |
84 | /** |
85 | * @var ParsedQueryClassifiersRepository |
86 | */ |
87 | private $classifierRepository; |
88 | |
89 | /** |
90 | * @var string|null user query (null when not yet cleaned up) |
91 | */ |
92 | private $query; |
93 | |
94 | /** |
95 | * @var string Either "all", "break", or "final" |
96 | */ |
97 | private $questionMarkStripLevel; |
98 | |
99 | /** |
100 | * @var string the raw query as received by the search engine |
101 | */ |
102 | private $rawQuery; |
103 | |
104 | /** |
105 | * @var KeywordParser |
106 | */ |
107 | private $keywordParser; |
108 | |
109 | /** |
110 | * @var PhraseQueryParser |
111 | */ |
112 | private $phraseQueryParser; |
113 | |
114 | /** |
115 | * @var NonPhraseParser |
116 | */ |
117 | private $nonPhraseParser; |
118 | |
119 | /** |
120 | * @var OffsetTracker track offsets of parsed keywords |
121 | */ |
122 | private $keywordOffsetsTracker; |
123 | |
124 | /** |
125 | * @var ParsedNode[] |
126 | */ |
127 | private $preTaggedNodes = []; |
128 | |
129 | /** |
130 | * Token set after calling nextToken |
131 | * @var Token|null |
132 | */ |
133 | private $token; |
134 | |
135 | /** |
136 | * Last token seen (set within nextToken) |
137 | * @var Token|null |
138 | */ |
139 | private $lookBehind; |
140 | |
141 | /** |
142 | * Current offset |
143 | * NOTE: offset is moved after calling advance() |
144 | * @var int |
145 | */ |
146 | private $offset; |
147 | |
148 | /** |
149 | * @var bool[] indexed cleanups applied (indexed by the cleanup type) |
150 | * @see ParsedQuery::hasCleanup() |
151 | */ |
152 | private $queryCleanups = []; |
153 | |
154 | /** |
155 | * Errors detected while parsing the query |
156 | * @var ParseWarning[] |
157 | */ |
158 | private $warnings = []; |
159 | |
160 | /** |
161 | * @var NamespaceHeaderNode|null |
162 | */ |
163 | private $namespaceHeader; |
164 | |
165 | /** |
166 | * @var NamespacePrefixParser |
167 | */ |
168 | private $namespacePrefixParser; |
169 | |
170 | private const DEFAULT_OCCUR = BooleanClause::MUST; |
171 | |
172 | /** |
173 | * @var int |
174 | */ |
175 | private $maxQueryLen; |
176 | |
177 | /** |
178 | * @param \CirrusSearch\Parser\KeywordRegistry $keywordRegistry |
179 | * @param Escaper $escaper |
180 | * @param string $qmarkStripLevel Level of question mark stripping to apply, either "all", |
181 | * "break", or "final" |
182 | * @param ParsedQueryClassifiersRepository $classifierRepository |
183 | * @param NamespacePrefixParser $namespacePrefixParser |
184 | * @param int|null $maxQueryLen maximum length of the query in chars (300 is used when null or 0) |
185 | * @see Util::stripQuestionMarks() for acceptable $qmarkStripLevel values |
186 | */ |
187 | public function __construct( |
188 | \CirrusSearch\Parser\KeywordRegistry $keywordRegistry, |
189 | Escaper $escaper, |
190 | $qmarkStripLevel, |
191 | ParsedQueryClassifiersRepository $classifierRepository, |
192 | NamespacePrefixParser $namespacePrefixParser, |
193 | ?int $maxQueryLen |
194 | ) { |
195 | $this->maxQueryLen = $maxQueryLen ? $maxQueryLen : 300; // null or 0 selects the default of 300 |
196 | $this->questionMarkStripLevel = $qmarkStripLevel; |
197 | $this->namespacePrefixParser = $namespacePrefixParser; |
198 | $this->classifierRepository = $classifierRepository; |
199 | $this->keywordRegistry = $keywordRegistry; |
200 | $this->escaper = $escaper; |
201 | $this->keywordParser = new KeywordParser(); // the three sub-parsers are created here, not injected |
202 | $this->phraseQueryParser = new PhraseQueryParser( $escaper ); |
203 | $this->nonPhraseParser = new NonPhraseParser( $escaper ); |
204 | } |
205 | |
206 | /** |
207 | * Reinit internal parser states |
208 | * @param string $rawQuery |
209 | */ |
210 | private function reInit( $rawQuery ) { |
211 | $this->offset = 0; // current offset goes back to the start |
212 | $this->token = null; |
213 | $this->lookBehind = null; |
214 | $this->namespaceHeader = null; |
215 | $this->query = null; // null until the cleaned-up query is assigned |
216 | $this->rawQuery = $rawQuery; |
217 | $this->preTaggedNodes = []; |
218 | $this->queryCleanups = []; |
219 | $this->warnings = []; |
220 | $this->keywordOffsetsTracker = new OffsetTracker(); // fresh tracker for every parsed query |
221 | } |
222 | |
223 | /** |
224 | * Apply some cleanups to the input query prior to parsing it |
225 | * Ideally the parser should be able to handle the query without modifying it, |
226 | * but in some cases it is simply way easier to handle it this way. |
227 | * Cleanups applied: |
228 | * - Question mark stripping depending on $this->questionMarkStripLevel |
229 | * - gershayim quirks if $this->escaper->getLanguage() is Hebrew |
230 | */ |
231 | private function cleanup() { |
232 | $query = $this->rawQuery; |
233 | $nquery = Util::stripQuestionMarks( $query, $this->questionMarkStripLevel ); |
234 | if ( $nquery !== $query ) { // a cleanup is only recorded when it actually changed the query |
235 | $this->queryCleanups[ParsedQuery::CLEANUP_QMARK_STRIPPING] = true; |
236 | $query = $nquery; |
237 | } |
238 | if ( $this->escaper->getLanguage() === 'he' ) { |
239 | $nquery = preg_replace( self::GERSHAYIM_REGEX, '$1\"$2', $query ); // backslash-escape a quote sitting between letters (T66350) |
240 | if ( $nquery !== null && $nquery !== $query ) { // null = PCRE failure (e.g. malformed UTF-8): keep $query untouched |
241 | $this->queryCleanups[ParsedQuery::CLEANUP_GERSHAYIM_QUIRKS] = true; |
242 | $query = $nquery; |
243 | } |
244 | } |
245 | if ( strlen( $query ) > 0 && $query[0] === '~' ) { // drop a single leading tilde and remember it was there |
246 | $query = substr( $query, 1 ); |
247 | $this->queryCleanups[ParsedQuery::TILDE_HEADER] = true; |
248 | } |
249 | $this->query = $query; // cleaned-up query is stored here; the raw query is left as received |
250 | } |
251 | |
252 | /** |
253 | * @param string $query |
254 | * @return \CirrusSearch\Parser\AST\ParsedQuery |
255 | * @throws SearchQueryParseException |
256 | */ |
257 | public function parse( string $query ): ParsedQuery { |
258 | $this->reInit( $query ); |
259 | $queryLen = mb_strlen( $query ); |
260 | if ( $queryLen > self::QUERY_LEN_HARD_LIMIT ) { |
261 | throw new SearchQueryParseException( 'cirrussearch-query-too-long', |
262 | $queryLen, self::QUERY_LEN_HARD_LIMIT ); |
263 | } |