Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
88.10% |
74 / 84 |
|
47.06% |
8 / 17 |
CRAP | |
0.00% |
0 / 1 |
| QueryFixer | |
88.10% |
74 / 84 |
|
47.06% |
8 / 17 |
54.22 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| build | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
4.05 | |||
| getFixablePart | |
86.67% |
13 / 15 |
|
0.00% |
0 / 1 |
8.15 | |||
| fix | |
95.65% |
22 / 23 |
|
0.00% |
0 / 1 |
7 | |||
| visitWordsQueryNode | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
4 | |||
| acceptableString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| visitPhraseQueryNode | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| visitPhrasePrefixNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| visitFuzzyNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| visitPrefixNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| visitWildcardNode | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
| visitEmptyQueryNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| visitKeywordFeatureNode | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
6 | |||
| visitParsedBooleanNode | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
| visitBooleanClause | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
8 | |||
| visitNegatedNode | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| visitNamespaceHeader | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| 1 | <?php |
| 2 | |
| 3 | namespace CirrusSearch\Parser\AST\Visitor; |
| 4 | |
| 5 | use CirrusSearch\Parser\AST\BooleanClause; |
| 6 | use CirrusSearch\Parser\AST\EmptyQueryNode; |
| 7 | use CirrusSearch\Parser\AST\FuzzyNode; |
| 8 | use CirrusSearch\Parser\AST\KeywordFeatureNode; |
| 9 | use CirrusSearch\Parser\AST\NamespaceHeaderNode; |
| 10 | use CirrusSearch\Parser\AST\NegatedNode; |
| 11 | use CirrusSearch\Parser\AST\ParsedBooleanNode; |
| 12 | use CirrusSearch\Parser\AST\ParsedNode; |
| 13 | use CirrusSearch\Parser\AST\ParsedQuery; |
| 14 | use CirrusSearch\Parser\AST\PhrasePrefixNode; |
| 15 | use CirrusSearch\Parser\AST\PhraseQueryNode; |
| 16 | use CirrusSearch\Parser\AST\PrefixNode; |
| 17 | use CirrusSearch\Parser\AST\WildcardNode; |
| 18 | use CirrusSearch\Parser\AST\WordsQueryNode; |
| 19 | use HtmlArmor; |
| 20 | use Wikimedia\Assert\Assert; |
| 21 | |
| 22 | /**
| 23 |  * Inspect a query and determine what parts of it can be sent to a typo correction mechanism and
| 24 |  * provide a method to fix the query once the corrected substring is known.
| 25 |  *
| 26 |  * Intended call order: build() (or the constructor), then getFixablePart(), then fix().
| 27 |  */
| 28 | class QueryFixer implements Visitor {
| 29 | 	/**
| 30 | 	 * Cache of fixers keyed by the ParsedQuery they were built for, see build().
| 31 | 	 *
| 32 | 	 * @var \SplObjectStorage|null
| 33 | 	 */
| 34 | 	private static $cache;
| 35 | 
| 36 | 	/**
| 37 | 	 * @var ParsedQuery the query being inspected
| 38 | 	 */
| 39 | 	private $parsedQuery;
| 40 | 
| 41 | 	/**
| 42 | 	 * @var bool true once the query has been visited by getFixablePart()
| 43 | 	 */
| 44 | 	private $visited = false;
| 45 | 
| 46 | 	/**
| 47 | 	 * @var ParsedNode|null the node currently selected as the fixable part
| 48 | 	 */
| 49 | 	private $node;
| 50 | 
| 51 | 	/**
| 52 | 	 * @var bool true when a visited wildcard node contains a question mark
| 53 | 	 */
| 54 | 	private $hasQMarkInWildcard = false;
| 55 | 
| 56 | 	/**
| 57 | 	 * @var int length in characters of the text held by the selected node
| 58 | 	 */
| 59 | 	private $currentSize = 0;
| 60 | 
| 61 | 	/**
| 62 | 	 * @var bool true when this branch is "negated".
| 63 | 	 */
| 64 | 	private $inNegation = false;
| 65 | 
| 66 | 	/**
| 67 | 	 * @var bool true when the query has parts that prevent it from being fixed
| 68 | 	 */
| 69 | 	private $isComplex = false;
| 70 | 
| 71 | 	/**
| 72 | 	 * @param ParsedQuery $query the query to inspect
| 73 | 	 */
| 74 | 	public function __construct( ParsedQuery $query ) {
| 75 | 		$this->parsedQuery = $query;
| 76 | 	}
| 77 | 
| 78 | 	/**
| 79 | 	 * Get the fixer attached to this query, creating and caching it when needed.
| 80 | 	 *
| 81 | 	 * @param ParsedQuery $query
| 82 | 	 * @return self
| 83 | 	 */
| 84 | 	public static function build( ParsedQuery $query ) {
| 85 | 		if ( self::$cache === null || count( self::$cache ) > 100 ) {
| 86 | 			// Build the cache for the first time or drop it for a new empty one just in case this class
| 87 | 			// is used from a maint script that treats/parses millions of queries
| 88 | 			self::$cache = new \SplObjectStorage();
| 89 | 		}
| 90 | 
| 91 | 		$fixer = self::$cache[$query] ?? null;
| 92 | 		if ( $fixer === null ) {
| 93 | 			$fixer = new self( $query );
| 94 | 			self::$cache[$query] = $fixer;
| 95 | 		}
| 96 | 		return $fixer;
| 97 | 	}
| 98 | 
| 99 | 	/**
| 100 | 	 * Get the longest phrase that is subject to typo correction.
| 101 | 	 * It's generally a set of consecutive words.
| 102 | 	 *
| 103 | 	 * The query is visited on the first call only; later calls reuse the state collected then.
| 104 | 	 *
| 105 | 	 * @return string|null the text to correct, or null when no part of the query can be fixed
| 106 | 	 */
| 107 | 	public function getFixablePart() {
| 108 | 		if ( !$this->visited ) {
| 109 | 			$this->visited = true;
| 110 | 			$this->parsedQuery->getRoot()->accept( $this );
| 111 | 		}
| 112 | 
| 113 | 		if ( $this->isComplex ) {
| 114 | 			$this->node = null;
| 115 | 		}
| 116 | 
| 117 | 		if ( $this->hasQMarkInWildcard && $this->parsedQuery->hasCleanup( ParsedQuery::CLEANUP_QMARK_STRIPPING ) ) {
| 118 | 			// We may not be able to reconstruct this kind of query properly.
| 119 | 			// If a question mark is legitimately removed we agree that it's OK to present the user
| 120 | 			// with their original query minus the question marks.
| 121 | 			// But if the user explicitly escaped the question mark so that it generates a valid
| 122 | 			// wildcard query we don't attempt to re-escape the resulting query.
| 123 | 			$this->node = null;
| 124 | 		}
| 125 | 
| 126 | 		// @phan-suppress-next-line PhanSuspiciousValueComparison
| 127 | 		if ( $this->node === null ) {
| 128 | 			return null;
| 129 | 		}
| 130 | 
| 131 | 		if ( $this->node instanceof KeywordFeatureNode ) {
| 132 | 			return $this->node->getValue();
| 133 | 		} elseif ( $this->node instanceof WordsQueryNode ) {
| 134 | 			return $this->node->getWords();
| 135 | 		} else {
| 136 | 			/** @phan-suppress-next-line PhanImpossibleCondition I agree, this is impossible. */
| 137 | 			Assert::invariant( false, "Unsupported node type " . get_class( $this->node ) );
| 138 | 			return null;
| 139 | 		}
| 140 | 	}
| 141 | 
| 142 | 	/**
| 143 | 	 * Replace the fixable part of the visited query with the provided replacement
| 144 | 	 *
| 145 | 	 * The characters ~ ? * " and \ found in the replacement are backslash-escaped before it is
| 146 | 	 * inserted between the untouched leading and trailing parts of the original query.
| 147 | 	 *
| 148 | 	 * @param HtmlArmor|string $replacement If HtmlArmor is provided all modifications will be
| 149 | 	 *  html safe and HtmlArmor will be returned. If a string is provided no HTML escaping will occur.
| 150 | 	 * @return HtmlArmor|string|null null when the query has no fixable part
| 151 | 	 * @throws \InvalidArgumentException when the provided HtmlArmor wraps a null value
| 152 | 	 */
| 153 | 	public function fix( $replacement ) {
| 154 | 		Assert::precondition( $this->visited, "getFixablePart must be called before trying to fix the query" );
| 155 | 		if ( $this->node === null ) {
| 156 | 			return null;
| 157 | 		}
| 158 | 
| 159 | 		$escapeBoundaries = false;
| 160 | 		if ( $replacement instanceof HtmlArmor ) {
| 161 | 			$escapeBoundaries = true;
| 162 | 			$replacement = HtmlArmor::getHtml( $replacement );
| 163 | 			if ( $replacement === null ) {
| 164 | 				throw new \InvalidArgumentException( '$replacement cannot be null nor wrap a null value' );
| 165 | 			}
| 166 | 		}
| 167 | 		// Prepend a backslash to every ~ ? * " and \ of the replacement
| 168 | 		$replacement = preg_replace( '/[~?*"\\\\]/', '\\\\$0', $replacement );
| 169 | 
| 170 | 		$prefix = "";
| 171 | 		if ( $this->parsedQuery->hasCleanup( ParsedQuery::TILDE_HEADER ) ) {
| 172 | 			$prefix .= "~";
| 173 | 		}
| 174 | 		$prefix .= substr( $this->parsedQuery->getQuery(), 0, $this->node->getStartOffset() );
| 175 | 		if ( $this->node instanceof KeywordFeatureNode ) {
| 176 | 			// The replacement only stands for the keyword value, put the keyword itself back
| 177 | 			$prefix .= $this->node->getKey() . ':';
| 178 | 		}
| 179 | 
| 180 | 		$suffix = substr( $this->parsedQuery->getQuery(), $this->node->getEndOffset() );
| 181 | 
| 182 | 		if ( $escapeBoundaries ) {
| 183 | 			// The replacement is already HTML, only the parts taken from the raw query are escaped
| 184 | 			$prefix = htmlspecialchars( $prefix );
| 185 | 			$suffix = htmlspecialchars( $suffix );
| 186 | 			$fixed = $prefix . $replacement . $suffix;
| 187 | 			return new HtmlArmor( $fixed );
| 188 | 		}
| 189 | 
| 190 | 		return $prefix . $replacement . $suffix;
| 191 | 	}
| 192 | 
| 193 | 	/**
| 194 | 	 * Select this node when it is not negated and holds the longest acceptable text seen so far.
| 195 | 	 *
| 196 | 	 * @param WordsQueryNode $node
| 197 | 	 */
| 198 | 	public function visitWordsQueryNode( WordsQueryNode $node ) {
| 199 | 		if ( $this->inNegation ) {
| 200 | 			return;
| 201 | 		}
| 202 | 		$siz = mb_strlen( $node->getWords() );
| 203 | 		if ( $siz > $this->currentSize ) {
| 204 | 			if ( !$this->acceptableString( $node->getWords() ) ) {
| 205 | 				return;
| 206 | 			}
| 207 | 			$this->node = $node;
| 208 | 			$this->currentSize = $siz;
| 209 | 		}
| 210 | 	}
| 211 | 
| 212 | 	/**
| 213 | 	 * Determine if this substring of the query is suitable for being fixed.
| 214 | 	 * Excludes strings with chars that may require escaping (*, ?, " and \)
| 215 | 	 * @param string $str
| 216 | 	 * @return bool
| 217 | 	 */
| 218 | 	private function acceptableString( $str ) {
| 219 | 		// We ignore word parts that we may have to escape
| 220 | 		// when presenting the query back to the user
| 221 | 		return preg_match( '/[*?"\\\\]/', $str ) !== 1;
| 222 | 	}
| 223 | 
| 224 | 	/**
| 225 | 	 * Flag the query as complex, which makes getFixablePart() return null.
| 226 | 	 *
| 227 | 	 * @param PhraseQueryNode $node
| 228 | 	 */
| 229 | 	public function visitPhraseQueryNode( PhraseQueryNode $node ) {
| 230 | 		$this->isComplex = true;
| 231 | 	}
| 232 | 
| 233 | 	/**
| 234 | 	 * Flag the query as complex, which makes getFixablePart() return null.
| 235 | 	 *
| 236 | 	 * @param PhrasePrefixNode $node
| 237 | 	 */
| 238 | 	public function visitPhrasePrefixNode( PhrasePrefixNode $node ) {
| 239 | 		$this->isComplex = true;
| 240 | 	}
| 241 | 
| 242 | 	/**
| 243 | 	 * Flag the query as complex, which makes getFixablePart() return null.
| 244 | 	 *
| 245 | 	 * @param FuzzyNode $node
| 246 | 	 */
| 247 | 	public function visitFuzzyNode( FuzzyNode $node ) {
| 248 | 		$this->isComplex = true;
| 249 | 	}
| 250 | 
| 251 | 	/**
| 252 | 	 * Flag the query as complex, which makes getFixablePart() return null.
| 253 | 	 *
| 254 | 	 * @param PrefixNode $node
| 255 | 	 */
| 256 | 	public function visitPrefixNode( PrefixNode $node ) {
| 257 | 		$this->isComplex = true;
| 258 | 	}
| 259 | 
| 260 | 	/**
| 261 | 	 * Flag the query as complex and remember whether the wildcard holds a question mark.
| 262 | 	 *
| 263 | 	 * @param WildcardNode $node
| 264 | 	 */
| 265 | 	public function visitWildcardNode( WildcardNode $node ) {
| 266 | 		if ( str_contains( $node->getWildcardQuery(), '?' ) ) {
| 267 | 			$this->hasQMarkInWildcard = true;
| 268 | 		}
| 269 | 		$this->isComplex = true;
| 270 | 	}
| 271 | 
| 272 | 	/**
| 273 | 	 * Nothing is collected from an empty query.
| 274 | 	 *
| 275 | 	 * @param EmptyQueryNode $node
| 276 | 	 */
| 277 | 	public function visitEmptyQueryNode( EmptyQueryNode $node ) {
| 278 | 	}
| 279 | 
| 280 | 	/**
| 281 | 	 * Select a non-negated intitle keyword with an empty delimiter when its value is acceptable
| 282 | 	 * and longer than the text selected so far.
| 283 | 	 *
| 284 | 	 * @param KeywordFeatureNode $node
| 285 | 	 */
| 286 | 	public function visitKeywordFeatureNode( KeywordFeatureNode $node ) {
| 287 | 		// FIXME: fixing intitle is perhaps a side effect of the original cirrus query parser
| 288 | 		if ( !$this->inNegation && $node->getKey() === 'intitle' && $node->getDelimiter() === '' ) {
| 289 | 			// Count characters like visitWordsQueryNode() does: both candidates compete on
| 290 | 			// $currentSize, a byte count would favor keyword values with multibyte chars.
| 291 | 			$siz = mb_strlen( $node->getValue() );
| 292 | 			if ( $siz > $this->currentSize && $this->acceptableString( $node->getValue() ) ) {
| 293 | 				$this->node = $node;
| 294 | 				$this->currentSize = $siz;
| 295 | 			}
| 296 | 		}
| 297 | 	}
| 298 | 
| 299 | 	/**
| 300 | 	 * Visit every clause of the boolean node.
| 301 | 	 *
| 302 | 	 * @param ParsedBooleanNode $node
| 303 | 	 */
| 304 | 	public function visitParsedBooleanNode( ParsedBooleanNode $node ) {
| 305 | 		foreach ( $node->getClauses() as $clause ) {
| 306 | 			$this->visitBooleanClause( $clause );
| 307 | 		}
| 308 | 	}
| 309 | 
| 310 | 	/**
| 311 | 	 * Update the complexity and negation state for this clause, then visit the node it holds.
| 312 | 	 *
| 313 | 	 * @param BooleanClause $clause
| 314 | 	 */
| 315 | 	public function visitBooleanClause( BooleanClause $clause ) {
| 316 | 		if ( $clause->isExplicit() ) {
| 317 | 			$this->isComplex = true;
| 318 | 		}
| 319 | 		$oldNegated = $this->inNegation;
| 320 | 		$node = $clause->getNode();
| 321 | 		if ( $node instanceof KeywordFeatureNode && $node->getKey() === 'intitle' && $node->getDelimiter() === '' ) {
| 322 | 			// Inhibits the fixer when it sees an un-acceptable value inside a keyword (legacy browsertest_176)
| 323 | 			$this->isComplex = $this->isComplex || !$this->acceptableString( $node->getValue() );
| 324 | 		}
| 325 | 		if ( $clause->getOccur() === BooleanClause::MUST_NOT ) {
| 326 | 			if ( !$node instanceof KeywordFeatureNode ) {
| 327 | 				// FIXME: (legacy) only negated keywords were accepted
| 328 | 				$this->isComplex = true;
| 329 | 			}
| 330 | 			$this->inNegation = !$this->inNegation;
| 331 | 		}
| 332 | 
| 333 | 		$clause->getNode()->accept( $this );
| 334 | 		// Restore the negation state of the enclosing branch
| 335 | 		$this->inNegation = $oldNegated;
| 336 | 	}
| 337 | 
| 338 | 	/**
| 339 | 	 * Not expected to be called: always fails the invariant check.
| 340 | 	 *
| 341 | 	 * @param NegatedNode $node
| 342 | 	 */
| 343 | 	final public function visitNegatedNode( NegatedNode $node ) {
| 344 | 		/** @phan-suppress-next-line PhanImpossibleCondition I agree, this is impossible. */
| 345 | 		Assert::invariant( false, 'NegatedNode should be optimized at parse time' );
| 346 | 	}
| 347 | 
| 348 | 	/**
| 349 | 	 * Not expected to be called: always fails the invariant check.
| 350 | 	 *
| 351 | 	 * @param NamespaceHeaderNode $node
| 352 | 	 */
| 353 | 	final public function visitNamespaceHeader( NamespaceHeaderNode $node ) {
| 354 | 		/** @phan-suppress-next-line PhanImpossibleCondition I agree, this is impossible. */
| 355 | 		Assert::invariant( false, 'Not yet part of the AST, should not be visited.' );
| 356 | 	}
| 357 | }