Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
97.85% covered (success)
97.85%
319 / 326
85.19% covered (warning)
85.19%
23 / 27
CRAP
0.00% covered (danger)
0.00%
0 / 1
QueryStringRegexParser
97.85% covered (success)
97.85%
319 / 326
85.19% covered (warning)
85.19%
23 / 27
102
0.00% covered (danger)
0.00%
0 / 1
 __construct
100.00% covered (success)
100.00%
9 / 9
100.00% covered (success)
100.00%
1 / 1
2
 reInit
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
1
 cleanup
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
6
 parse
100.00% covered (success)
100.00%
41 / 41
100.00% covered (success)
100.00%
1 / 1
10
 createClause
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
3
 expression
94.44% covered (success)
94.44%
68 / 72
0.00% covered (danger)
0.00%
0 / 1
20.07
 createBoolNode
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 collapseWords
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
1
 mergeWords
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
1
 fallbackToWord
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
1
 unexpectedEOF
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
1
 negatedLeaf
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
3
 leaf
100.00% covered (success)
100.00%
3 / 3
100.00% covered (success)
100.00%
1 / 1
1
 explicitlyNegatedNode
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
3
 isLeaf
100.00% covered (success)
100.00%
2 / 2
100.00% covered (success)
100.00%
1 / 1
2
 parseKeywords
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
3
 nextToken
96.15% covered (success)
96.15%
25 / 26
0.00% covered (danger)
0.00%
0 / 1
9
 consumeWS
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 consumeBoolOp
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
4
 consumePhrase
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 consumeWord
100.00% covered (success)
100.00%
5 / 5
100.00% covered (success)
100.00%
1 / 1
2
 consumeUnbalancedPhrase
92.86% covered (success)
92.86%
13 / 14
0.00% covered (danger)
0.00%
0 / 1
6.01
 advance
100.00% covered (success)
100.00%
4 / 4
100.00% covered (success)
100.00%
1 / 1
3
 boolToOccur
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
2
 extractRequiredNamespaces
93.75% covered (success)
93.75%
15 / 16
0.00% covered (danger)
0.00%
0 / 1
3.00
 parseNsHeader
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
4
 checkQueryLen
100.00% covered (success)
100.00%
18 / 18
100.00% covered (success)
100.00%
1 / 1
6
1<?php
2
3namespace CirrusSearch\Parser\QueryStringRegex;
4
5use CirrusSearch\Parser\AST\BooleanClause;
6use CirrusSearch\Parser\AST\EmptyQueryNode;
7use CirrusSearch\Parser\AST\KeywordFeatureNode;
8use CirrusSearch\Parser\AST\NamespaceHeaderNode;
9use CirrusSearch\Parser\AST\NegatedNode;
10use CirrusSearch\Parser\AST\ParsedBooleanNode;
11use CirrusSearch\Parser\AST\ParsedNode;
12use CirrusSearch\Parser\AST\ParsedQuery;
13use CirrusSearch\Parser\AST\ParseWarning;
14use CirrusSearch\Parser\AST\PhraseQueryNode;
15use CirrusSearch\Parser\AST\Visitor\KeywordNodeVisitor;
16use CirrusSearch\Parser\AST\WordsQueryNode;
17use CirrusSearch\Parser\NamespacePrefixParser;
18use CirrusSearch\Parser\ParsedQueryClassifiersRepository;
19use CirrusSearch\Parser\QueryParser;
20use CirrusSearch\Query\KeywordFeature;
21use CirrusSearch\Query\PrefixFeature;
22use CirrusSearch\Search\Escaper;
23use CirrusSearch\Util;
24use Message;
25use Wikimedia\Assert\Assert;
26
27/**
28 * Full text query parser that uses regex to parse its token.
29 *
30 * Far from being a state-of-the-art parser, it detects most of its
31 * tokens using regular expressions and makes arbitrary decisions
32 * at tokenization.
33 *
34 * The tokenizer understands a few token types:
35 * - WHITESPACE: all unicode whitespace and control chars ([\pZ\pC])
36 *   the WHITESPACE token is ignored and never presented to the parser
37 * - EOF: dummy type used to mark end of string
38 * - BOOL_AND/BOOL_OR/BOOL_NOT: explicit boolean operators
39 * - PARSED_NODE: complex type (usually part of the query)
40 *
41 * PARSED_NODE is a type that groups:
42 * - Keywords
43 * - Phrase
44 * - Words
45 * - Wildcards/Prefix
46 *
47 * The phrase delimiter (") does not have its own token: it is handled as part of
48 * the tokenization and is never exposed to the parser.
49 * Same for negation prefix (! and -), they are parsed at tokenization time.
50 *
51 * NOTE that this parser is broken by design:
52 * - no lexical context support, we first parse keywords
53 * - no support for groupings (parenthesis)
54 */
55class QueryStringRegexParser implements QueryParser {
56    /**
57     * Whitespace regex including unicode and some control chars
58     */
59    private const WHITESPACE_REGEX = '/\G[\pZ\pC]+/u';
60
61    public const QUERY_LEN_HARD_LIMIT = 4096;
62
63    /**
64     * see T66350
65     */
66    private const GERSHAYIM_REGEX = '/(\p{L}{2,})(?:")(\p{L})(?=[^\p{L}]|$)/u';
67
68    /**
69     * Supported explicit boolean operator
70     *
71     */
72    private const EXPLICIT_BOOLEAN_OPERATOR = '/\G(?:(?<AND>AND|&&)|(?<OR>OR|\|\|)|(?<NOT>NOT))(?![^\pZ\pC"])/u';
73
74    /**
75     * @var \CirrusSearch\Parser\KeywordRegistry
76     */
77    private $keywordRegistry;
78
79    /**
80     * @var Escaper
81     */
82    private $escaper;
83
84    /**
85     * @var ParsedQueryClassifiersRepository
86     */
87    private $classifierRepository;
88
89    /**
90     * @var string|null user query (null when not yet cleaned up)
91     */
92    private $query;
93
94    /**
95     * @var string Either "all", "break", or "final"
96     */
97    private $questionMarkStripLevel;
98
99    /**
100     * @var string the raw query as received by the search engine
101     */
102    private $rawQuery;
103
104    /**
105     * @var KeywordParser
106     */
107    private $keywordParser;
108
109    /**
110     * @var PhraseQueryParser
111     */
112    private $phraseQueryParser;
113
114    /**
115     * @var NonPhraseParser
116     */
117    private $nonPhraseParser;
118
119    /**
120     * @var OffsetTracker track offsets of parsed keywords
121     */
122    private $keywordOffsetsTracker;
123
124    /**
125     * @var ParsedNode[]
126     */
127    private $preTaggedNodes = [];
128
129    /**
130     * Token set after calling nextToken
131     * @var Token|null
132     */
133    private $token;
134
135    /**
136     * Last token seen (set within nextToken)
137     * @var Token|null
138     */
139    private $lookBehind;
140
141    /**
142     * Current offset
143     * NOTE: the offset is moved after calling advance()
144     * @var int
145     */
146    private $offset;
147
148    /**
149     * @var bool[] cleanups applied (indexed by the cleanup type)
150     * @see ParsedQuery::hasCleanup()
151     */
152    private $queryCleanups = [];
153
154    /**
155     * Errors detected while parsing the query
156     * @var ParseWarning[]
157     */
158    private $warnings = [];
159
160    /**
161     * @var NamespaceHeaderNode|null
162     */
163    private $namespaceHeader;
164
165    /**
166     * @var NamespacePrefixParser
167     */
168    private $namespacePrefixParser;
169
170    private const DEFAULT_OCCUR = BooleanClause::MUST;
171
172    /**
173     * @var int
174     */
175    private $maxQueryLen;
176
177    /**
178     * @param \CirrusSearch\Parser\KeywordRegistry $keywordRegistry
179     * @param Escaper $escaper
180     * @param string $qmarkStripLevel Level of question mark stripping to apply, either "all",
181     *  "break", or "final"
182     * @param ParsedQueryClassifiersRepository $classifierRepository
183     * @param NamespacePrefixParser $namespacePrefixParser
184     * @param int|null $maxQueryLen maximum length of the query in chars
185     * @see Util::stripQuestionMarks() for acceptable $qmarkStripLevel values
186     */
187    public function __construct(
188        \CirrusSearch\Parser\KeywordRegistry $keywordRegistry,
189        Escaper $escaper,
190        $qmarkStripLevel,
191        ParsedQueryClassifiersRepository $classifierRepository,
192        NamespacePrefixParser $namespacePrefixParser,
193        ?int $maxQueryLen
194    ) {
195        $this->keywordRegistry = $keywordRegistry;
196        $this->escaper = $escaper;
197        $this->keywordParser = new KeywordParser();
198        $this->phraseQueryParser = new PhraseQueryParser( $escaper );
199        $this->nonPhraseParser = new NonPhraseParser( $escaper );
200        $this->questionMarkStripLevel = $qmarkStripLevel;
201        $this->classifierRepository = $classifierRepository;
202        $this->namespacePrefixParser = $namespacePrefixParser;
203        $this->maxQueryLen = $maxQueryLen ?: 300;
204    }
205
206    /**
207     * Reinit internal parser states
208     * @param string $rawQuery
209     */
210    private function reInit( $rawQuery ) {
211        $this->rawQuery = $rawQuery;
212        $this->query = null;
213        $this->keywordOffsetsTracker = new OffsetTracker();
214        $this->token = null;
215        $this->lookBehind = null;
216        $this->preTaggedNodes = [];
217        $this->warnings = [];
218        $this->queryCleanups = [];
219        $this->namespaceHeader = null;
220        $this->offset = 0;
221    }
222
223    /**
224     * Apply some cleanups to the input query prior to parsing it
225     * Ideally the parser should be able to handle the query without modifying it
226     * but in some cases it simply way easier to handle this this way.
227     * Cleanups applied:
228     * - Question mark stripping depending on $this->questionMarkStripLevel
229     * - gershayim quirks if $this->escaper->getLanguage() is hebrew
230     */
231    private function cleanup() {
232        $query = $this->rawQuery;
233        $nquery = Util::stripQuestionMarks( $query, $this->questionMarkStripLevel );
234        if ( $nquery !== $query ) {
235            $this->queryCleanups[ParsedQuery::CLEANUP_QMARK_STRIPPING] = true;
236            $query = $nquery;
237        }
238        if ( $this->escaper->getLanguage() === 'he' ) {
239            $nquery = preg_replace( self::GERSHAYIM_REGEX, '$1\"$2', $query );
240            if ( $nquery !== $query ) {
241                $this->queryCleanups[ParsedQuery::CLEANUP_GERSHAYIM_QUIRKS] = true;
242                $query = $nquery;
243            }
244        }
245        if ( strlen( $query ) > 0 && $query[0] === '~' ) {
246            $query = substr( $query, 1 );
247            $this->queryCleanups[ParsedQuery::TILDE_HEADER] = true;
248        }
249        $this->query = $query;
250    }
251
252    /**
253     * @param string $query
254     * @return \CirrusSearch\Parser\AST\ParsedQuery
255     * @throws SearchQueryParseException
256     */
257    public function parse( string $query ): ParsedQuery {
258        $this->reInit( $query );
259        $queryLen = mb_strlen( $query );
260        if ( $queryLen > self::QUERY_LEN_HARD_LIMIT ) {
261            throw new SearchQueryParseException( 'cirrussearch-query-too-long',
262                $queryLen, self::QUERY_LEN_HARD_LIMIT );
263        }