Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
29 / 29 |
|
100.00% |
2 / 2 |
CRAP | |
100.00% |
1 / 1 |
PhraseQueryParser | |
100.00% |
29 / 29 |
|
100.00% |
2 / 2 |
10 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
parse | |
100.00% |
28 / 28 |
|
100.00% |
1 / 1 |
9 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Parser\QueryStringRegex; |
4 | |
5 | use CirrusSearch\Parser\AST\NegatedNode; |
6 | use CirrusSearch\Parser\AST\PhrasePrefixNode; |
7 | use CirrusSearch\Parser\AST\PhraseQueryNode; |
8 | use CirrusSearch\Search\Escaper; |
9 | use Wikimedia\Assert\Assert; |
10 | |
/**
 * Regex-based recogniser for quoted phrase queries.
 *
 * Recognised forms:
 *   "simple phrase"     : phrase searched on the plain fields
 *   "simple phrase"~    : phrase searched on the stem fields
 *   "simple phrase"~2   : phrase with the slop forced to 2
 *   "simple phrase"~2~  : slop forced to 2 and searched on the stem fields
 *
 * A leading ! or - negates the phrase, and a quote inside the phrase can be
 * escaped with a backslash.
 *
 * A phrase ending with a wildcard, e.g. "simple phras*", is reported as a
 * phrase prefix, but only when neither a slop nor the stem marker is present;
 * otherwise a regular phrase node is produced.
 */
class PhraseQueryParser {

	/**
	 * Matches the opening of a phrase: an optional negation marker (- or !)
	 * immediately followed by a double quote, anchored at the search offset.
	 */
	public const PHRASE_START = '/\G(?<negate>-|!)?"/';

	/**
	 * Matches a complete phrase anchored at the search offset.
	 *
	 * Named groups: "negate" (optional - or !), "value" (text between the
	 * quotes, backslash escapes allowed), "slop"/"slopvalue" (optional ~N)
	 * and "fuzzy" (optional trailing ~).
	 */
	private const PHRASE_REGEX = '/\G(?<negate>-|!)?"(?<value>(?:\\\\.|[^"])*)"(?<slop>~(?<slopvalue>\d+))?(?<fuzzy>~)?/';

	/**
	 * Used to remove escape sequences from the quoted text.
	 *
	 * @var Escaper
	 */
	private $escaper;

	/**
	 * @param Escaper $escaper escaper applied to the text found between the quotes
	 */
	public function __construct( Escaper $escaper ) {
		$this->escaper = $escaper;
	}

	/**
	 * Try to read a phrase query located exactly at offset $start of $query.
	 *
	 * Nothing is returned when no phrase begins at $start or when the phrase
	 * found would extend past $end. A negated phrase is returned wrapped in a
	 * NegatedNode (not listed in the @return annotation below, which is kept
	 * as in the original).
	 *
	 * @param string $query
	 * @param int $start offset where the phrase must begin; must be < $end
	 * @param int $end upper bound of the region to parse; must be <= strlen( $query )
	 * @return PhraseQueryNode|PhrasePrefixNode|null
	 */
	public function parse( $query, $start, $end ) {
		Assert::precondition( $start < $end, '$start < $end' );
		Assert::precondition( $end <= strlen( $query ), '$end <= strlen( $query )' );

		$groups = [];
		if ( preg_match( self::PHRASE_REGEX, $query, $groups, 0, $start ) !== 1 ) {
			return null;
		}

		// Offset just past the whole match (quotes, slop and stem marker included).
		$matchEnd = $start + strlen( $groups[0] );
		if ( $matchEnd > $end ) {
			// The phrase runs beyond the region we were asked to parse.
			return null;
		}

		$rawPhrase = $groups['value'];

		// A phrase made of non-star (or escaped) characters followed by one
		// final star is a phrase-prefix candidate.
		// NOTE(review): [^*] also accepts a backslash, so an escaped trailing
		// star appears to qualify as well - confirm this is intended.
		$endsWithWildcard = preg_match( '/^(?:\\\\.|[^*])+[*]$/', $rawPhrase ) === 1;

		// preg_match() reports an unmatched group as '' when a later group
		// matched, and omits trailing unmatched groups entirely, hence the
		// combination of isset() and a length check.
		$hasSlop = isset( $groups['slopvalue'] ) && strlen( $groups['slopvalue'] ) > 0;
		$slop = $hasSlop ? intval( $groups['slopvalue'] ) : -1;
		$stem = isset( $groups['fuzzy'] );

		// An explicit slop or the stem marker disables phrase-prefix handling.
		$isPhrasePrefix = $endsWithWildcard && !$hasSlop && !$stem;

		$negation = $groups['negate'];
		// The phrase node itself starts after the negation marker, if any.
		$phraseStart = $start + strlen( $negation );
		$text = $this->escaper->unescape( $rawPhrase );

		$node = $isPhrasePrefix
			? new PhrasePrefixNode( $phraseStart, $matchEnd, rtrim( $text, '*' ) )
			: new PhraseQueryNode( $phraseStart, $matchEnd, $text, $slop, $stem );

		if ( $negation !== '' ) {
			// The negated node spans from the marker to the end of the phrase.
			$node = new NegatedNode( $start, $node->getEndOffset(), $node, $negation );
		}

		return $node;
	}
}