Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
88.10% |
74 / 84 |
|
47.06% |
8 / 17 |
CRAP | |
0.00% |
0 / 1 |
QueryFixer | |
88.10% |
74 / 84 |
|
47.06% |
8 / 17 |
54.22 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
build | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
4.05 | |||
getFixablePart | |
86.67% |
13 / 15 |
|
0.00% |
0 / 1 |
8.15 | |||
fix | |
95.65% |
22 / 23 |
|
0.00% |
0 / 1 |
7 | |||
visitWordsQueryNode | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
4 | |||
acceptableString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
visitPhraseQueryNode | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
visitPhrasePrefixNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
visitFuzzyNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
visitPrefixNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
visitWildcardNode | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
visitEmptyQueryNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
visitKeywordFeatureNode | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
6 | |||
visitParsedBooleanNode | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
visitBooleanClause | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
8 | |||
visitNegatedNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
visitNamespaceHeader | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Parser\AST\Visitor; |
4 | |
5 | use CirrusSearch\Parser\AST\BooleanClause; |
6 | use CirrusSearch\Parser\AST\EmptyQueryNode; |
7 | use CirrusSearch\Parser\AST\FuzzyNode; |
8 | use CirrusSearch\Parser\AST\KeywordFeatureNode; |
9 | use CirrusSearch\Parser\AST\NamespaceHeaderNode; |
10 | use CirrusSearch\Parser\AST\NegatedNode; |
11 | use CirrusSearch\Parser\AST\ParsedBooleanNode; |
12 | use CirrusSearch\Parser\AST\ParsedNode; |
13 | use CirrusSearch\Parser\AST\ParsedQuery; |
14 | use CirrusSearch\Parser\AST\PhrasePrefixNode; |
15 | use CirrusSearch\Parser\AST\PhraseQueryNode; |
16 | use CirrusSearch\Parser\AST\PrefixNode; |
17 | use CirrusSearch\Parser\AST\WildcardNode; |
18 | use CirrusSearch\Parser\AST\WordsQueryNode; |
19 | use HtmlArmor; |
20 | use Wikimedia\Assert\Assert; |
21 | |
/**
 * Inspect a query and determine what parts of it can be sent to a typo correction mechanism and
 * provide a method to fix the query once the corrected substring is known.
 *
 * Typical usage: obtain an instance (see build()), call getFixablePart() to walk the parsed
 * query and get the candidate substring, then call fix() with the corrected text to rebuild
 * the full query string.
 */
class QueryFixer implements Visitor {
	/**
	 * Cache of fixers keyed by the ParsedQuery instance they were built for.
	 * Lazily created, and replaced by a fresh empty storage, in build().
	 *
	 * @var \SplObjectStorage|null
	 */
	private static $cache;

	/**
	 * The query being inspected.
	 *
	 * @var ParsedQuery
	 */
	private $parsedQuery;

	/**
	 * @var bool true once getFixablePart() has walked the query.
	 */
	private $visited = false;

	/**
	 * Best candidate found so far: a WordsQueryNode or an undelimited "intitle"
	 * KeywordFeatureNode. Null when nothing can be fixed.
	 *
	 * @var ParsedNode|null
	 */
	private $node;

	/**
	 * @var bool true when a visited wildcard node contains a question mark.
	 */
	private $hasQMarkInWildcard = false;

	/**
	 * @var int length, in characters, of the text held by the current candidate node.
	 */
	private $currentSize = 0;

	/**
	 * @var bool true when this branch is "negated".
	 */
	private $inNegation = false;

	/**
	 * @var bool true when the query contains a construct for which no fix is attempted.
	 */
	private $isComplex = false;

	/**
	 * @param ParsedQuery $query
	 */
	public function __construct( ParsedQuery $query ) {
		$this->parsedQuery = $query;
	}

	/**
	 * Get the fixer attached to $query, creating and caching it if needed.
	 *
	 * @param ParsedQuery $query
	 * @return QueryFixer
	 */
	public static function build( ParsedQuery $query ) {
		if ( self::$cache === null || count( self::$cache ) > 100 ) {
			// Build the cache for the first time or drop it for a new empty one just in case this class
			// is used from a maint script that treats/parses millions of queries
			self::$cache = new \SplObjectStorage();
		}

		$fixer = self::$cache[$query] ?? null;
		if ( $fixer === null ) {
			$fixer = new self( $query );
			self::$cache[$query] = $fixer;
		}
		return $fixer;
	}

	/**
	 * Get the longest phrase that is subject to typo correction.
	 * It's generally a set of consecutive words.
	 *
	 * The query is visited only on the first call; later calls reuse the recorded state.
	 *
	 * @return string|null the text to send to the typo corrector, null if nothing is fixable
	 */
	public function getFixablePart() {
		if ( !$this->visited ) {
			$this->visited = true;
			$this->parsedQuery->getRoot()->accept( $this );
		}

		if ( $this->isComplex ) {
			$this->node = null;
		}

		if ( $this->hasQMarkInWildcard && $this->parsedQuery->hasCleanup( ParsedQuery::CLEANUP_QMARK_STRIPPING ) ) {
			// We may not be able to reconstruct this kind of queries properly.
			// If a question mark is legitimately removed we agree that it's OK to present the user
			// with its original query minus the question marks.
			// But if the user explicitly escaped the question mark so that it generates a valid
			// wildcard query we don't attempt to re-escape the resulting query.
			$this->node = null;
		}

		// @phan-suppress-next-line PhanSuspiciousValueComparison
		if ( $this->node === null ) {
			return null;
		}

		if ( $this->node instanceof KeywordFeatureNode ) {
			return $this->node->getValue();
		} elseif ( $this->node instanceof WordsQueryNode ) {
			return $this->node->getWords();
		} else {
			/** @phan-suppress-next-line PhanImpossibleCondition I agree, this is impossible. */
			Assert::invariant( false, "Unsupported node type " . get_class( $this->node ) );
			return null;
		}
	}

	/**
	 * Replace the fixable part of the visited query with the provided replacement
	 *
	 * The result is built as: text of the query before the candidate node, the replacement
	 * (with ~ ? * " and \ backslash-escaped), then the text after the candidate node.
	 *
	 * @param HtmlArmor|string $replacement If HtmlArmor is provided all modifications will be
	 *  html safe and HtmlArmor will be returned. If a string is provided no escaping will occur.
	 * @return HtmlArmor|string|null null when there is nothing to fix
	 * @throws \InvalidArgumentException when the HtmlArmor wraps a null value
	 */
	public function fix( $replacement ) {
		Assert::precondition( $this->visited, "getFixablePart must be called before trying to fix the query" );
		if ( $this->node === null ) {
			return null;
		}

		$escapeBoundaries = false;
		if ( $replacement instanceof HtmlArmor ) {
			$escapeBoundaries = true;
			$replacement = HtmlArmor::getHtml( $replacement );
			if ( $replacement === null ) {
				throw new \InvalidArgumentException( '$replacement cannot be null nor wrap a null value' );
			}
		}
		// Escape the query syntax characters that the replacement may contain
		$replacement = preg_replace( '/[~?*"\\\\]/', '\\\\$0', $replacement );

		$prefix = "";
		if ( $this->parsedQuery->hasCleanup( ParsedQuery::TILDE_HEADER ) ) {
			$prefix .= "~";
		}
		$prefix .= substr( $this->parsedQuery->getQuery(), 0, $this->node->getStartOffset() );
		if ( $this->node instanceof KeywordFeatureNode ) {
			// The node span covers the keyword itself, put it back in front of the replaced value
			$prefix .= $this->node->getKey() . ':';
		}

		$suffix = substr( $this->parsedQuery->getQuery(), $this->node->getEndOffset() );

		if ( $escapeBoundaries ) {
			// The replacement is already html, only the parts taken from the raw query need escaping
			$prefix = htmlspecialchars( $prefix );
			$suffix = htmlspecialchars( $suffix );
			$fixed = $prefix . $replacement . $suffix;
			return new HtmlArmor( $fixed );
		}

		return $prefix . $replacement . $suffix;
	}

	/**
	 * Record the node as the new candidate when it is not negated, strictly longer than the
	 * current candidate and made of an acceptable string.
	 *
	 * @param WordsQueryNode $node
	 */
	public function visitWordsQueryNode( WordsQueryNode $node ) {
		if ( $this->inNegation ) {
			return;
		}
		$siz = mb_strlen( $node->getWords() );
		if ( $siz > $this->currentSize ) {
			if ( !$this->acceptableString( $node->getWords() ) ) {
				return;
			}
			$this->node = $node;
			$this->currentSize = $siz;
		}
	}

	/**
	 * Determine if this substring of the query is suitable for being fixed.
	 * Excludes string with chars that may require escaping (*, ?, " and \)
	 * @param string $str
	 * @return bool
	 */
	private function acceptableString( $str ) {
		// We ignore word parts that we may have to escape
		// when presenting the query back to the user
		return preg_match( '/[*?"\\\\]/', $str ) !== 1;
	}

	/**
	 * A phrase query marks the whole query as complex.
	 *
	 * @param PhraseQueryNode $node
	 */
	public function visitPhraseQueryNode( PhraseQueryNode $node ) {
		$this->isComplex = true;
	}

	/**
	 * A phrase prefix query marks the whole query as complex.
	 *
	 * @param PhrasePrefixNode $node
	 */
	public function visitPhrasePrefixNode( PhrasePrefixNode $node ) {
		$this->isComplex = true;
	}

	/**
	 * A fuzzy query marks the whole query as complex.
	 *
	 * @param FuzzyNode $node
	 */
	public function visitFuzzyNode( FuzzyNode $node ) {
		$this->isComplex = true;
	}

	/**
	 * A prefix query marks the whole query as complex.
	 *
	 * @param PrefixNode $node
	 */
	public function visitPrefixNode( PrefixNode $node ) {
		$this->isComplex = true;
	}

	/**
	 * A wildcard query marks the whole query as complex; the presence of a question mark
	 * in it is also remembered for getFixablePart().
	 *
	 * @param WildcardNode $node
	 */
	public function visitWildcardNode( WildcardNode $node ) {
		if ( str_contains( $node->getWildcardQuery(), '?' ) ) {
			$this->hasQMarkInWildcard = true;
		}
		$this->isComplex = true;
	}

	/**
	 * Nothing to inspect in an empty query.
	 *
	 * @param EmptyQueryNode $node
	 */
	public function visitEmptyQueryNode( EmptyQueryNode $node ) {
	}

	/**
	 * Record an undelimited, non negated "intitle" keyword as the new candidate when its value
	 * is strictly longer than the current candidate and made of an acceptable string.
	 *
	 * @param KeywordFeatureNode $node
	 */
	public function visitKeywordFeatureNode( KeywordFeatureNode $node ) {
		// FIXME: fixing intitle is perhaps a side effect of the original cirrus query parser
		if ( !$this->inNegation && $node->getKey() === 'intitle' && $node->getDelimiter() === '' ) {
			// Measure in characters, like visitWordsQueryNode(), so that both kinds of
			// candidates are compared on the same scale through $currentSize.
			$siz = mb_strlen( $node->getValue() );
			if ( $siz > $this->currentSize && $this->acceptableString( $node->getValue() ) ) {
				$this->node = $node;
				$this->currentSize = $siz;
			}
		}
	}

	/**
	 * Visit every clause of the boolean node.
	 *
	 * @param ParsedBooleanNode $node
	 */
	public function visitParsedBooleanNode( ParsedBooleanNode $node ) {
		foreach ( $node->getClauses() as $clause ) {
			$this->visitBooleanClause( $clause );
		}
	}

	/**
	 * Visit the node held by the clause, tracking the negation state while doing so.
	 *
	 * The query is flagged as complex when the clause is explicit, when it negates anything
	 * but a keyword, or when it holds an undelimited "intitle" keyword with an unacceptable value.
	 *
	 * @param BooleanClause $clause
	 */
	public function visitBooleanClause( BooleanClause $clause ) {
		if ( $clause->isExplicit() ) {
			$this->isComplex = true;
		}
		// Saved so that the negation state can be restored once the sub-tree is visited
		$oldNegated = $this->inNegation;
		$node = $clause->getNode();
		if ( $node instanceof KeywordFeatureNode && $node->getKey() === 'intitle' && $node->getDelimiter() === '' ) {
			// Inhibits the fixer when it sees an un-acceptable value inside a keyword (legacy browsertest_176)
			$this->isComplex = $this->isComplex || !$this->acceptableString( $node->getValue() );
		}
		if ( $clause->getOccur() === BooleanClause::MUST_NOT ) {
			if ( !$node instanceof KeywordFeatureNode ) {
				// FIXME: (legacy) only negated keywords were accepted
				$this->isComplex = true;
			}
			$this->inNegation = !$this->inNegation;
		}

		$clause->getNode()->accept( $this );
		$this->inNegation = $oldNegated;
	}

	/**
	 * Not expected to be called, always fails the invariant.
	 *
	 * @param NegatedNode $node
	 */
	final public function visitNegatedNode( NegatedNode $node ) {
		/** @phan-suppress-next-line PhanImpossibleCondition I agree, this is impossible. */
		Assert::invariant( false, 'NegatedNode should be optimized at parse time' );
	}

	/**
	 * Not expected to be called, always fails the invariant.
	 *
	 * @param NamespaceHeaderNode $node
	 */
	final public function visitNamespaceHeader( NamespaceHeaderNode $node ) {
		/** @phan-suppress-next-line PhanImpossibleCondition I agree, this is impossible. */
		Assert::invariant( false, 'Not yet part of the AST, should not be visited.' );
	}
}