Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
88.10% |
74 / 84 |
|
47.06% |
8 / 17 |
CRAP | |
0.00% |
0 / 1 |
QueryFixer | |
88.10% |
74 / 84 |
|
47.06% |
8 / 17 |
54.22 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
build | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
4.05 | |||
getFixablePart | |
86.67% |
13 / 15 |
|
0.00% |
0 / 1 |
8.15 | |||
fix | |
95.65% |
22 / 23 |
|
0.00% |
0 / 1 |
7 | |||
visitWordsQueryNode | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
4 | |||
acceptableString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
visitPhraseQueryNode | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
visitPhrasePrefixNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
visitFuzzyNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
visitPrefixNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
visitWildcardNode | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
visitEmptyQueryNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
visitKeywordFeatureNode | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
6 | |||
visitParsedBooleanNode | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
visitBooleanClause | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
8 | |||
visitNegatedNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
visitNamespaceHeader | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Parser\AST\Visitor; |
4 | |
5 | use CirrusSearch\Parser\AST\BooleanClause; |
6 | use CirrusSearch\Parser\AST\EmptyQueryNode; |
7 | use CirrusSearch\Parser\AST\FuzzyNode; |
8 | use CirrusSearch\Parser\AST\KeywordFeatureNode; |
9 | use CirrusSearch\Parser\AST\NamespaceHeaderNode; |
10 | use CirrusSearch\Parser\AST\NegatedNode; |
11 | use CirrusSearch\Parser\AST\ParsedBooleanNode; |
12 | use CirrusSearch\Parser\AST\ParsedNode; |
13 | use CirrusSearch\Parser\AST\ParsedQuery; |
14 | use CirrusSearch\Parser\AST\PhrasePrefixNode; |
15 | use CirrusSearch\Parser\AST\PhraseQueryNode; |
16 | use CirrusSearch\Parser\AST\PrefixNode; |
17 | use CirrusSearch\Parser\AST\WildcardNode; |
18 | use CirrusSearch\Parser\AST\WordsQueryNode; |
19 | use HtmlArmor; |
20 | use Wikimedia\Assert\Assert; |
21 | |
/**
 * Inspect a query and determine what parts of it can be sent to a typo correction mechanism and
 * provide a method to fix the query once the corrected substring is known.
 *
 * Typical usage: call getFixablePart() first (it walks the AST once and remembers the selected
 * node), then call fix() with the corrected text to rebuild the complete query string.
 */
class QueryFixer implements Visitor {
	/**
	 * @var \SplObjectStorage|null fixers already created by build(), keyed by their ParsedQuery
	 */
	private static $cache;

	/**
	 * @var ParsedQuery the query being inspected
	 */
	private $parsedQuery;

	/**
	 * @var bool true once getFixablePart() has walked the AST
	 */
	private $visited = false;

	/**
	 * @var ParsedNode|null the best candidate found so far (a WordsQueryNode or an intitle
	 *  KeywordFeatureNode), null when nothing can be fixed
	 */
	private $node;

	/**
	 * @var bool true when a visited wildcard query contains a question mark
	 */
	private $hasQMarkInWildcard = false;

	/**
	 * @var int length (in characters) of the text held by the current candidate node
	 */
	private $currentSize = 0;

	/**
	 * @var bool true when this branch is "negated".
	 */
	private $inNegation = false;

	/**
	 * @var bool true when the query contains a construct that inhibits the fixer
	 */
	private $isComplex = false;

	/**
	 * @param ParsedQuery $query the query to inspect
	 */
	public function __construct( ParsedQuery $query ) {
		$this->parsedQuery = $query;
	}

	/**
	 * Get the fixer attached to $query, creating (and caching) it if needed.
	 *
	 * @param ParsedQuery $query
	 * @return self
	 */
	public static function build( ParsedQuery $query ) {
		if ( self::$cache === null || count( self::$cache ) > 100 ) {
			// Build the cache for the first time or drop it for a new empty one just in case this class
			// is used from a maint script that treats/parses millions of queries
			self::$cache = new \SplObjectStorage();
		}

		$fixer = self::$cache[$query] ?? null;
		if ( $fixer === null ) {
			$fixer = new self( $query );
			self::$cache[$query] = $fixer;
		}
		return $fixer;
	}

	/**
	 * Get the longest phrase that is subject to typo correction.
	 * It's generally a set of consecutive words.
	 *
	 * The AST is visited only on the first call; later calls reuse the recorded state.
	 *
	 * @return string|null the text to correct, or null when the query cannot be fixed
	 */
	public function getFixablePart() {
		if ( !$this->visited ) {
			$this->visited = true;
			$this->parsedQuery->getRoot()->accept( $this );
		}

		if ( $this->isComplex ) {
			$this->node = null;
		}

		if ( $this->hasQMarkInWildcard && $this->parsedQuery->hasCleanup( ParsedQuery::CLEANUP_QMARK_STRIPPING ) ) {
			// We may not be able to reconstruct this kind of queries properly
			// If a question mark is legitimately removed we agree that it's OK to present the user
			// with its original query minus the question marks.
			// But if the user explicitly escaped the question mark so that it generates a valid
			// wildcard query we don't attempt to re-escape the resulting query.
			$this->node = null;
		}

		// @phan-suppress-next-line PhanSuspiciousValueComparison
		if ( $this->node === null ) {
			return null;
		}

		if ( $this->node instanceof KeywordFeatureNode ) {
			return $this->node->getValue();
		} elseif ( $this->node instanceof WordsQueryNode ) {
			return $this->node->getWords();
		} else {
			/** @phan-suppress-next-line PhanImpossibleCondition I agree, this is impossible. */
			Assert::invariant( false, "Unsupported node type " . get_class( $this->node ) );
			return null;
		}
	}

	/**
	 * Replace the fixable part of the visited query with the provided replacement
	 *
	 * @param HtmlArmor|string $replacement If HtmlArmor is provided all modifications will be
	 *  html safe and HtmlArmor will be returned. If a string is provided no escaping will occur.
	 * @return HtmlArmor|string|null the rebuilt query, or null when there is nothing to fix
	 * @throws \InvalidArgumentException when the HtmlArmor replacement wraps a null value
	 */
	public function fix( $replacement ) {
		Assert::precondition( $this->visited, "getFixablePart must be called before trying to fix the query" );
		if ( $this->node === null ) {
			return null;
		}

		$escapeBoundaries = false;
		if ( $replacement instanceof HtmlArmor ) {
			$escapeBoundaries = true;
			$replacement = HtmlArmor::getHtml( $replacement );
			if ( $replacement === null ) {
				throw new \InvalidArgumentException( '$replacement cannot be null nor wrap a null value' );
			}
		}
		// Backslash-escape the characters ~ ? * " and \ found in the replacement
		$replacement = preg_replace( '/[~?*"\\\\]/', '\\\\$0', $replacement );

		$prefix = "";
		if ( $this->parsedQuery->hasCleanup( ParsedQuery::TILDE_HEADER ) ) {
			$prefix .= "~";
		}
		$prefix .= substr( $this->parsedQuery->getQuery(), 0, $this->node->getStartOffset() );
		if ( $this->node instanceof KeywordFeatureNode ) {
			// The replacement only stands for the keyword value: put the "key:" part back
			$prefix .= $this->node->getKey() . ':';
		}

		$suffix = substr( $this->parsedQuery->getQuery(), $this->node->getEndOffset() );

		if ( $escapeBoundaries ) {
			// The replacement is already html, only the parts copied from the query need escaping
			$prefix = htmlspecialchars( $prefix );
			$suffix = htmlspecialchars( $suffix );
			$fixed = $prefix . $replacement . $suffix;
			return new HtmlArmor( $fixed );
		}

		return $prefix . $replacement . $suffix;
	}

	/**
	 * Keep this node as the candidate when it is not negated, longer than the current
	 * candidate and made of acceptable characters.
	 *
	 * @param WordsQueryNode $node
	 */
	public function visitWordsQueryNode( WordsQueryNode $node ) {
		if ( $this->inNegation ) {
			return;
		}
		$words = $node->getWords();
		$siz = mb_strlen( $words );
		if ( $siz > $this->currentSize ) {
			if ( !$this->acceptableString( $words ) ) {
				return;
			}
			$this->node = $node;
			$this->currentSize = $siz;
		}
	}

	/**
	 * Determine if this substring of the query is suitable for being fixed.
	 * Excludes string with chars that may require escaping (*, ?, " and \)
	 * @param string $str
	 * @return bool
	 */
	private function acceptableString( $str ) {
		// We ignore word parts that we may have to escape
		// when presenting the query back to the user
		return preg_match( '/[*?"\\\\]/', $str ) !== 1;
	}

	/**
	 * Whether the keyword is an "intitle" keyword whose delimiter is the empty string, the only
	 * kind of keyword this fixer considers.
	 *
	 * @param KeywordFeatureNode $node
	 * @return bool
	 */
	private function isUndelimitedIntitle( KeywordFeatureNode $node ) {
		return $node->getKey() === 'intitle' && $node->getDelimiter() === '';
	}

	/**
	 * @param PhraseQueryNode $node
	 */
	public function visitPhraseQueryNode( PhraseQueryNode $node ) {
		$this->isComplex = true;
	}

	/**
	 * @param PhrasePrefixNode $node
	 */
	public function visitPhrasePrefixNode( PhrasePrefixNode $node ) {
		$this->isComplex = true;
	}

	/**
	 * @param FuzzyNode $node
	 */
	public function visitFuzzyNode( FuzzyNode $node ) {
		$this->isComplex = true;
	}

	/**
	 * @param PrefixNode $node
	 */
	public function visitPrefixNode( PrefixNode $node ) {
		$this->isComplex = true;
	}

	/**
	 * Flag the query as complex and remember whether the wildcard holds a question mark.
	 *
	 * @param WildcardNode $node
	 */
	public function visitWildcardNode( WildcardNode $node ) {
		if ( str_contains( $node->getWildcardQuery(), '?' ) ) {
			$this->hasQMarkInWildcard = true;
		}
		$this->isComplex = true;
	}

	/**
	 * @param EmptyQueryNode $node
	 */
	public function visitEmptyQueryNode( EmptyQueryNode $node ) {
		// Nothing to inspect in an empty query
	}

	/**
	 * Keep an undelimited intitle keyword as the candidate when it is not negated, longer than
	 * the current candidate and made of acceptable characters.
	 *
	 * @param KeywordFeatureNode $node
	 */
	public function visitKeywordFeatureNode( KeywordFeatureNode $node ) {
		// FIXME: fixing intitle is perhaps a side effect of the original cirrus query parser
		if ( $this->inNegation || !$this->isUndelimitedIntitle( $node ) ) {
			return;
		}
		$value = $node->getValue();
		// Measured in characters, like visitWordsQueryNode(), so that both kinds of candidates
		// are compared on the same scale through $currentSize.
		$siz = mb_strlen( $value );
		if ( $siz > $this->currentSize && $this->acceptableString( $value ) ) {
			$this->node = $node;
			$this->currentSize = $siz;
		}
	}

	/**
	 * @param ParsedBooleanNode $node
	 */
	public function visitParsedBooleanNode( ParsedBooleanNode $node ) {
		foreach ( $node->getClauses() as $clause ) {
			$this->visitBooleanClause( $clause );
		}
	}

	/**
	 * Visit the node held by the clause, tracking negation and the constructs that inhibit
	 * the fixer.
	 *
	 * @param BooleanClause $clause
	 */
	public function visitBooleanClause( BooleanClause $clause ) {
		if ( $clause->isExplicit() ) {
			$this->isComplex = true;
		}
		$oldNegated = $this->inNegation;
		$node = $clause->getNode();
		if ( $node instanceof KeywordFeatureNode && $this->isUndelimitedIntitle( $node ) ) {
			// Inhibits the fixer when it sees an un-acceptable value inside a keyword (legacy browsertest_176)
			$this->isComplex = $this->isComplex || !$this->acceptableString( $node->getValue() );
		}
		if ( $clause->getOccur() === BooleanClause::MUST_NOT ) {
			if ( !$node instanceof KeywordFeatureNode ) {
				// FIXME: (legacy) only negated keywords were accepted
				$this->isComplex = true;
			}
			$this->inNegation = !$this->inNegation;
		}

		$clause->getNode()->accept( $this );
		// Restore the negation state of the enclosing branch
		$this->inNegation = $oldNegated;
	}

	/**
	 * @param NegatedNode $node
	 */
	final public function visitNegatedNode( NegatedNode $node ) {
		/** @phan-suppress-next-line PhanImpossibleCondition I agree, this is impossible. */
		Assert::invariant( false, 'NegatedNode should be optimized at parse time' );
	}

	/**
	 * @param NamespaceHeaderNode $node
	 */
	final public function visitNamespaceHeader( NamespaceHeaderNode $node ) {
		/** @phan-suppress-next-line PhanImpossibleCondition I agree, this is impossible. */
		Assert::invariant( false, 'Not yet part of the AST, should not be visited.' );
	}
}