Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
97.85% covered (success)
97.85%
319 / 326
85.19% covered (warning)
85.19%
23 / 27
CRAP
0.00% covered (danger)
0.00%
0 / 1
QueryStringRegexParser
97.85% covered (success)
97.85%
319 / 326
85.19% covered (warning)
85.19%
23 / 27
102
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
2
 reInit
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
1
 cleanup
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
6
 parse
100.00% covered (success)
100.00%
41 / 41
100.00% covered (success)
100.00%
1 / 1
10
 createClause
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
3
 expression
94.44% covered (success)
94.44%
68 / 72
0.00% covered (danger)
0.00%
0 / 1
20.07
 createBoolNode
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 collapseWords
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
1
 mergeWords
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 fallbackToWord
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
1
 unexpectedEOF
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
1
 negatedLeaf
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
3
 leaf
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 explicitlyNegatedNode
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
3
 isLeaf
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
2
 parseKeywords
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
3
 nextToken
96.15% covered (success)
96.15%
25 / 26
0.00% covered (danger)
0.00%
0 / 1
9
 consumeWS
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 consumeBoolOp
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
4
 consumePhrase
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 consumeWord
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 consumeUnbalancedPhrase
92.86% covered (success)
92.86%
13 / 14
0.00% covered (danger)
0.00%
0 / 1
6.01
 advance
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
3
 boolToOccur
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
2
 extractRequiredNamespaces
93.75% covered (success)
93.75%
15 / 16
0.00% covered (danger)
0.00%
0 / 1
3.00
 parseNsHeader
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
4
 checkQueryLen
100.00% covered (success)
100.00%
18 / 18
100.00% covered (success)
100.00%
1 / 1
6
1<?php
2
3namespace CirrusSearch\Parser\QueryStringRegex;
4
5use CirrusSearch\Parser\AST\BooleanClause;
6use CirrusSearch\Parser\AST\EmptyQueryNode;
7use CirrusSearch\Parser\AST\KeywordFeatureNode;
8use CirrusSearch\Parser\AST\NamespaceHeaderNode;
9use CirrusSearch\Parser\AST\NegatedNode;
10use CirrusSearch\Parser\AST\ParsedBooleanNode;
11use CirrusSearch\Parser\AST\ParsedNode;
12use CirrusSearch\Parser\AST\ParsedQuery;
13use CirrusSearch\Parser\AST\ParseWarning;
14use CirrusSearch\Parser\AST\PhraseQueryNode;
15use CirrusSearch\Parser\AST\Visitor\KeywordNodeVisitor;
16use CirrusSearch\Parser\AST\WordsQueryNode;
17use CirrusSearch\Parser\NamespacePrefixParser;
18use CirrusSearch\Parser\ParsedQueryClassifiersRepository;
19use CirrusSearch\Parser\QueryParser;
20use CirrusSearch\Query\KeywordFeature;
21use CirrusSearch\Query\PrefixFeature;
22use CirrusSearch\Search\Escaper;
23use CirrusSearch\Util;
24use LogicException;
25use Message;
26use Wikimedia\Assert\Assert;
27
28/**
29 * Full text query parser that uses regexes to parse its tokens.
30 *
31 * Far from being a state of the art parser, it detects most of its
32 * tokens using regular expressions and makes arbitrary decisions
33 * at tokenization.
34 *
35 * The tokenizer understands a few token types:
36 * - WHITESPACE: all unicode whitespace and control chars ([\pZ\pC])
37 *   the WHITESPACE token is ignored and never presented to the parser
38 * - EOF: dummy type used to mark end of string
39 * - BOOL_AND/BOOL_OR/BOOL_NOT: explicit boolean operators
40 * - PARSED_NODE: complex type (usually part of the query)
41 *
42 * PARSED_NODE is a type that groups:
43 * - Keywords
44 * - Phrase
45 * - Words
46 * - Wildcards/Prefix
47 *
48 * Phrase does not have its own token: the double quote (") is handled as part of the
49 * tokenization and is never exposed to the parser.
50 * The same goes for the negation prefixes (! and -): they are parsed at tokenization time.
51 *
52 * NOTE that this parser is broken by design:
53 * - no lexical context support, we first parse keywords
54 * - no support for groupings (parenthesis)
55 */
56class QueryStringRegexParser implements QueryParser {
57    /**
58     * Whitespace regex including unicode and some control chars
59     */
60    private const WHITESPACE_REGEX = '/\G[\pZ\pC]+/u';
61
62    public const QUERY_LEN_HARD_LIMIT = 4096;
63
64    /**
65     * see T66350
66     */
67    private const GERSHAYIM_REGEX = '/(\p{L}{2,})(?:")(\p{L})(?=[^\p{L}]|$)/u';
68
69    /**
70     * Supported explicit boolean operator
71     *
72     */
73    private const EXPLICIT_BOOLEAN_OPERATOR = '/\G(?:(?<AND>AND|&&)|(?<OR>OR|\|\|)|(?<NOT>NOT))(?![^\pZ\pC"])/u';
74
75    /**
76     * @var \CirrusSearch\Parser\KeywordRegistry
77     */
78    private $keywordRegistry;
79
80    /**
81     * @var Escaper
82     */
83    private $escaper;
84
85    /**
86     * @var ParsedQueryClassifiersRepository
87     */
88    private $classifierRepository;
89
90    /**
91     * @var string|null user query (null when not yet cleaned up)
92     */
93    private $query;
94
95    /**
96     * @var string Either "all", "break", or "final"
97     */
98    private $questionMarkStripLevel;
99
100    /**
101     * @var string the raw query as received by the search engine
102     */
103    private $rawQuery;
104
105    /**
106     * @var KeywordParser
107     */
108    private $keywordParser;
109
110    /**
111     * @var PhraseQueryParser
112     */
113    private $phraseQueryParser;
114
115    /**
116     * @var NonPhraseParser
117     */
118    private $nonPhraseParser;
119
120    /**
121     * @var OffsetTracker track offsets of parsed keywords
122     */
123    private $keywordOffsetsTracker;
124
125    /**
126     * @var ParsedNode[]
127     */
128    private $preTaggedNodes = [];
129
130    /**
131     * Token set after calling nextToken
132     * @var Token|null
133     */
134    private $token;
135
136    /**
137     * Last token seen (set within nextToken)
138     * @var Token|null
139     */
140    private $lookBehind;
141
142    /**
143     * Current offset
144     * NOTE: offset is moved after call advance
145     * @var int
146     */
147    private $offset;
148
149    /**
150     * @var bool[] indexed cleanups applied (indexed by the cleanup type)
151     * @see ParsedQuery::hasCleanup()
152     */
153    private $queryCleanups = [];
154
155    /**
156     * Errors detected while parsing the query
157     * @var ParseWarning[]
158     */
159    private $warnings = [];
160
161    /**
162     * @var NamespaceHeaderNode|null
163     */
164    private $namespaceHeader;
165
166    /**
167     * @var NamespacePrefixParser
168     */
169    private $namespacePrefixParser;
170
171    private const DEFAULT_OCCUR = BooleanClause::MUST;
172
173    /**
174     * @var int
175     */
176    private $maxQueryLen;
177
178    /**
179     * @param \CirrusSearch\Parser\KeywordRegistry $keywordRegistry
180     * @param Escaper $escaper
181     * @param string $qmarkStripLevel Level of question mark stripping to apply, either "all",
182     *  "break", or "final"
183     * @param ParsedQueryClassifiersRepository $classifierRepository
184     * @param NamespacePrefixParser $namespacePrefixParser
185     * @param int|null $maxQueryLen maximum length of the query in chars
186     * @see Util::stripQuestionMarks() for acceptable $qmarkStripLevel values
187     */
188    public function __construct(
189        \CirrusSearch\Parser\KeywordRegistry $keywordRegistry,
190        Escaper $escaper,
191        $qmarkStripLevel,
192        ParsedQueryClassifiersRepository $classifierRepository,
193        NamespacePrefixParser $namespacePrefixParser,
194        ?int $maxQueryLen
195    ) {
196        $this->keywordRegistry = $keywordRegistry;
197        $this->escaper = $escaper;
198        $this->keywordParser = new KeywordParser();
199        $this->phraseQueryParser = new PhraseQueryParser( $escaper );
200        $this->nonPhraseParser = new NonPhraseParser( $escaper );
201        $this->questionMarkStripLevel = $qmarkStripLevel;
202        $this->classifierRepository = $classifierRepository;
203        $this->namespacePrefixParser = $namespacePrefixParser;
204        $this->maxQueryLen = $maxQueryLen ?: 300;
205    }
206
207    /**
208     * Reinit internal parser states
209     * @param string $rawQuery
210     */
211    private function reInit( $rawQuery ) {
212        $this->rawQuery = $rawQuery;
213        $this->query = null;
214        $this->keywordOffsetsTracker = new OffsetTracker();
215        $this->token = null;
216        $this->lookBehind = null;
217        $this->preTaggedNodes = [];
218        $this->warnings = [];
219        $this->queryCleanups = [];
220        $this->namespaceHeader = null;
221        $this->offset = 0;
222    }
223
224    /**
225     * Apply some cleanups to the input query prior to parsing it
226     * Ideally the parser should be able to handle the query without modifying it
227     * but in some cases it simply way easier to handle this this way.
228     * Cleanups applied:
229     * - Question mark stripping depending on $this->questionMarkStripLevel
230     * - gershayim quirks if $this->escaper->getLanguage() is hebrew
231     */
232    private function cleanup() {
233        $query = $this->rawQuery;
234        $nquery = Util::stripQuestionMarks( $query, $this->questionMarkStripLevel );
235        if ( $nquery !== $query ) {
236            $this->queryCleanups[ParsedQuery::CLEANUP_QMARK_STRIPPING] = true;
237            $query = $nquery;
238        }
239        if ( $this->escaper->getLanguage() === 'he' ) {
240            $nquery = preg_replace( self::GERSHAYIM_REGEX, '$1\"$2', $query );
241            if ( $nquery !== $query ) {
242                $this->queryCleanups[ParsedQuery::CLEANUP_GERSHAYIM_QUIRKS] = true;
243                $query = $nquery;
244            }
245        }
246        if ( strlen( $query ) > 0 && $query[0] === '~' ) {
247            $query = substr( $query, 1 );
248            $this->queryCleanups[ParsedQuery::TILDE_HEADER] = true;
249        }
250        $this->query = $query;
251    }
252
253    /**
254     * @param string $query
255     * @return \CirrusSearch\Parser\AST\ParsedQuery
256     * @throws SearchQueryParseException
257     */
258    public function parse( string $query ): ParsedQuery {
259        $this->reInit( $query );
260        $queryLen = mb_strlen( $query );
261        if ( $queryLen > self::QUERY_LEN_HARD_LIMIT ) {
262            throw new SearchQueryParseException( 'cirrussearch-query-too-long',
263                $queryLen, self::QUERY_LEN_HARD_LIMIT );
264        }