Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
80.70% |
92 / 114 |
|
73.33% |
11 / 15 |
CRAP | |
0.00% |
0 / 1 |
BaseRegexFeature | |
80.70% |
92 / 114 |
|
73.33% |
11 / 15 |
56.29 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
getValueDelimiters | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
1 | |||
parseValue | |
100.00% |
12 / 12 |
|
100.00% |
1 / 1 |
4 | |||
getFeatureName | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
getCrossSearchStrategy | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
doApplyExtended | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
5 | |||
getFilterQuery | |
87.50% |
7 / 8 |
|
0.00% |
0 / 1 |
3.02 | |||
buildHighlightFields | |
87.50% |
7 / 8 |
|
0.00% |
0 / 1 |
3.02 | |||
getNonRegexFilterQuery | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
buildRegexQuery | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
3 | |||
configureHighlighting | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
doGetRegexHLFields | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
3 | |||
buildRegexWithPlugin | |
83.33% |
10 / 12 |
|
0.00% |
0 / 1 |
6.17 | |||
buildRegexWithGroovy | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
6 | |||
buildNonRegexHLFields | n/a |
0 / 0 |
n/a |
0 / 0 |
0 | |||||
isRegexQuery | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
3 | |||
trimFirstOccurrenceOfSlash | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Query; |
4 | |
5 | use CirrusSearch\CrossSearchStrategy; |
6 | use CirrusSearch\Extra\Query\SourceRegex; |
7 | use CirrusSearch\Parser\AST\KeywordFeatureNode; |
8 | use CirrusSearch\Query\Builder\QueryBuildingContext; |
9 | use CirrusSearch\Search\Fetch\FetchPhaseConfigBuilder; |
10 | use CirrusSearch\Search\Fetch\HighlightedField; |
11 | use CirrusSearch\Search\Fetch\HighlightFieldGenerator; |
12 | use CirrusSearch\Search\Filters; |
13 | use CirrusSearch\Search\SearchContext; |
14 | use CirrusSearch\SearchConfig; |
15 | use CirrusSearch\WarningCollector; |
16 | use Elastica\Query\AbstractQuery; |
17 | use Wikimedia\Assert\Assert; |
18 | |
19 | /** |
20 | * Base class supporting regex searches. Works best when combined with the |
21 | * wikimedia-extra plugin for elasticsearch, but can also fallback to a groovy |
22 | * based implementation. Can be really expensive, but mostly ok if you have the |
23 | * extra plugin enabled. |
24 | * |
25 | * Examples: |
26 | * insource:/abc?/ |
27 | * |
28 | * @see SourceRegex |
29 | */ |
30 | abstract class BaseRegexFeature extends SimpleKeywordFeature implements FilterQueryFeature, HighlightingFeature { |
31 | /** |
32 | * @var string[] Elasticsearch field(s) to search against, keyed by field name with the highlight target as value
33 | */ |
34 | private $fields; |
35 | |
36 | /** |
37 | * @var bool Is this feature enabled? |
38 | */ |
39 | private $enabled; |
40 | |
41 | /** |
42 | * @var string Locale used for case conversions. It's important that this |
43 | * matches the locale used for lowercasing in the ngram index. |
44 | */ |
45 | private $languageCode; |
46 | |
47 | /** |
48 | * @var array Configuration of the regex plugin: flag values such as 'use', plus optional 'max_ngrams_extracted' and 'max_ngram_clauses' entries
49 | */ |
50 | private $regexPlugin; |
51 | |
52 | /** |
53 | * @var int The maximum number of automaton states that Lucene's regex |
54 | * compilation can expand to (even temporarily). Provides protection |
55 | * against overloading the search cluster. Only works when using the |
56 | * extra plugin, groovy based execution is unbounded. |
57 | */ |
58 | private $maxDeterminizedStates; |
59 | |
60 | /** |
61 | * @var string timeout for regex queries |
62 | * with the extra plugin |
63 | */ |
64 | private $shardTimeout; |
65 | |
/**
 * Captures the regex related settings from the search configuration and
 * remembers the fields this keyword operates on.
 *
 * @param SearchConfig $config Configuration the CirrusSearch* settings are read from
 * @param string[] $fields Non-empty array of fields; the rest of this class
 *  iterates it as field name => highlight target
 */
public function __construct( SearchConfig $config, array $fields ) {
	// Global switches and locale.
	$this->enabled = $config->get( 'CirrusSearchEnableRegex' );
	$this->languageCode = $config->get( 'LanguageCode' );

	// Settings specific to the regex implementation.
	$this->regexPlugin = $config->getElement( 'CirrusSearchWikimediaExtraPlugin', 'regex' );
	$this->maxDeterminizedStates = $config->get( 'CirrusSearchRegexMaxDeterminizedStates' );

	// A regex feature without any field to search makes no sense.
	Assert::precondition( count( $fields ) > 0, 'must have at least one field' );
	$this->fields = $fields;

	$this->shardTimeout = $config->getElement( 'CirrusSearchSearchShardTimeout', 'regex' );
}
79 | |
/**
 * Declares the delimiters accepted around the value of this keyword.
 *
 * @return string[][] One entry per delimiter: '"' for simple searches and
 *  '/' for regex searches, the latter accepting an optional 'i' suffix.
 */
public function getValueDelimiters() {
	// simple search
	$simple = [ 'delimiter' => '"' ];

	// regex searches, with the optional case insensitive suffix
	$regex = [
		'delimiter' => '/',
		'suffixes' => 'i',
	];

	return [ $simple, $regex ];
}
97 | |
/**
 * Parses the keyword value. Values delimited with "/" are handled here as
 * regular expressions, anything else is delegated to the parent class.
 *
 * For regexes a warning is recorded when the feature is disabled and another
 * one when the expression is empty; the parsed value is returned regardless.
 *
 * @param string $key
 * @param string $value
 * @param string $quotedValue
 * @param string $valueDelimiter
 * @param string $suffix
 * @param WarningCollector $warningCollector
 * @return array|false|null For regexes an array with the keys 'type' (always
 *  'regex'), 'pattern' and 'insensitive'; otherwise the parent's result.
 */
public function parseValue( $key, $value, $quotedValue, $valueDelimiter, $suffix, WarningCollector $warningCollector ) {
	if ( $valueDelimiter !== '/' ) {
		return parent::parseValue( $key, $value, $quotedValue, $valueDelimiter, $suffix, $warningCollector );
	}

	if ( !$this->enabled ) {
		$warningCollector->addWarning( 'cirrussearch-feature-not-available', "$key regex" );
	}

	// The quoted value still carries its surrounding slashes.
	$regex = $this->trimFirstOccurrenceOfSlash( $quotedValue );
	if ( $regex === '' ) {
		$warningCollector->addWarning( 'cirrussearch-regex-empty-expression', $key );
	}

	$caseInsensitive = ( $suffix === 'i' );

	return [
		'type' => 'regex',
		'pattern' => $regex,
		'insensitive' => $caseInsensitive,
	];
}
127 | |
/**
 * Names the feature: 'regex' when the value is delimited with "/", the
 * parent's name for every other delimiter.
 *
 * @param string $key
 * @param string $valueDelimiter
 * @return string
 */
public function getFeatureName( $key, $valueDelimiter ) {
	return $valueDelimiter === '/'
		? 'regex'
		: parent::getFeatureName( $key, $valueDelimiter );
}
139 | |
/**
 * Picks the cross search strategy: the host-wiki-only strategy for values
 * delimited with "/" (regexes), the all-wikis strategy otherwise.
 *
 * @param KeywordFeatureNode $node
 * @return CrossSearchStrategy
 */
public function getCrossSearchStrategy( KeywordFeatureNode $node ) {
	$isRegex = $node->getDelimiter() === '/';
	if ( $isRegex ) {
		return CrossSearchStrategy::hostWikiOnlyStrategy();
	}

	return CrossSearchStrategy::allWikisStrategy();
}
151 | |
/**
 * Applies the keyword to the search context. Regex values are turned into a
 * regex filter (and, unless negated, regex highlighting is configured);
 * non regex values are handed over to doApply().
 *
 * @param SearchContext $context
 * @param string $key
 * @param string $value
 * @param string $quotedValue
 * @param bool $negated
 * @param string $delimiter
 * @param string $suffix
 * @return array For regexes a pair [ filter or null, false ]; otherwise the
 *  result of doApply().
 */
public function doApplyExtended( SearchContext $context, $key, $value, $quotedValue, $negated, $delimiter, $suffix ) {
	// The context doubles as the warning collector here.
	$parsedValue = $this->parseValue( $key, $value, $quotedValue, $delimiter, $suffix, $context );

	if ( !$this->isRegexQuery( $parsedValue ) ) {
		return $this->doApply( $context, $key, $value, $quotedValue, $negated );
	}

	if ( !$this->enabled ) {
		return [ null, false ];
	}

	'@phan-var array $parsedValue';
	$regex = $parsedValue['pattern'];
	$caseInsensitive = $parsedValue['insensitive'];

	if ( $regex === '' ) {
		// An empty expression cannot match anything.
		$context->setResultsPossible( false );

		return [ null, false ];
	}

	$query = $this->buildRegexQuery( $regex, $caseInsensitive );
	if ( !$negated ) {
		$this->configureHighlighting( $regex, $caseInsensitive, $context->getFetchPhaseBuilder() );
	}

	return [ $query, false ];
}
187 | |
/**
 * Builds the filter for the parsed node: a regex query for regex values
 * (null when regexes are disabled), the non regex filter otherwise.
 *
 * @inheritDoc
 */
public function getFilterQuery( KeywordFeatureNode $node, QueryBuildingContext $context ) {
	$parsedValue = $node->getParsedValue();

	if ( !$this->isRegexQuery( $parsedValue ) ) {
		return $this->getNonRegexFilterQuery( $node, $context );
	}

	if ( !$this->enabled ) {
		return null;
	}

	'@phan-var array $parsedValue';
	return $this->buildRegexQuery( $parsedValue['pattern'], $parsedValue['insensitive'] );
}
205 | |
/**
 * Builds the highlighted fields for the parsed node: regex highlighting for
 * regex values (nothing when regexes are disabled), the non regex fields
 * otherwise.
 *
 * @inheritDoc
 */
public function buildHighlightFields( KeywordFeatureNode $node, QueryBuildingContext $context ) {
	$parsedValue = $node->getParsedValue();

	if ( !$this->isRegexQuery( $parsedValue ) ) {
		return $this->buildNonRegexHLFields( $node, $context );
	}

	if ( !$this->enabled ) {
		return [];
	}

	'@phan-var array $parsedValue';
	$regex = $parsedValue['pattern'];
	$caseInsensitive = $parsedValue['insensitive'];

	return $this->doGetRegexHLFields( $context->getHighlightFieldGenerator(), $regex, $caseInsensitive );
}
222 | |
/**
 * Builds the filter used when the keyword is not in regex mode, i.e. for
 * syntax such as keyword:word or keyword:"phrase". getFilterQuery() calls
 * this for every parsed value that is not a regex.
 *
 * @param KeywordFeatureNode $node
 * @param QueryBuildingContext $context
 * @return AbstractQuery|null
 */
abstract protected function getNonRegexFilterQuery( KeywordFeatureNode $node, QueryBuildingContext $context );
231 | |
/**
 * Builds the regex query, using the wikimedia-extra plugin when the plugin
 * configuration contains the 'use' flag and the groovy implementation
 * otherwise.
 *
 * @param string $pattern The regular expression to match
 * @param bool $insensitive Should the match be case insensitive?
 * @return AbstractQuery
 */
private function buildRegexQuery( $pattern, $insensitive ) {
	// The plugin configuration mixes flag values ('use', ...) with numeric
	// settings. The lookup must be strict: with a loose comparison a value of
	// true (or 0 on PHP < 8) equals 'use' and would wrongly select the plugin.
	$usePlugin = is_array( $this->regexPlugin )
		&& in_array( 'use', $this->regexPlugin, true );

	return $usePlugin
		? $this->buildRegexWithPlugin( $pattern, $insensitive )
		: $this->buildRegexWithGroovy( $pattern, $insensitive );
}
242 | |
/**
 * Registers the regex highlighted fields on the fetch phase builder. The
 * builder is also used as the generator producing those fields.
 *
 * @param string $pattern The regular expression to highlight
 * @param bool $insensitive Should the match be case insensitive?
 * @param FetchPhaseConfigBuilder $fetchPhaseConfigBuilder
 */
private function configureHighlighting( $pattern, $insensitive, FetchPhaseConfigBuilder $fetchPhaseConfigBuilder ) {
	$hlFields = $this->doGetRegexHLFields( $fetchPhaseConfigBuilder, $pattern, $insensitive );
	foreach ( $hlFields as $hlField ) {
		$fetchPhaseConfigBuilder->addHLField( $hlField );
	}
}
253 | |
/**
 * Creates one regex highlighted field per configured field, targeting the
 * "<field>.plain" subfield. Nothing is created when the generator does not
 * support regex fields.
 *
 * @param HighlightFieldGenerator $generator
 * @param string $pattern The regular expression to highlight
 * @param bool $insensitive Should the match be case insensitive?
 * @return HighlightedField[]
 */
private function doGetRegexHLFields( HighlightFieldGenerator $generator, $pattern, $insensitive ) {
	if ( !$generator->supportsRegexFields() ) {
		return [];
	}

	$hlFields = [];
	foreach ( $this->fields as $fieldName => $target ) {
		$hlFields[] = $generator->newRegexField(
			$fieldName . '.plain',
			$target,
			$pattern,
			$insensitive,
			HighlightedField::COSTLY_EXPERT_SYNTAX_PRIORITY
		);
	}

	return $hlFields;
}
271 | |
/**
 * Builds a regular expression query backed by the wikimedia-extra plugin:
 * one SourceRegex filter per configured field, combined with a boolean OR.
 *
 * @param string $pattern The regular expression to match
 * @param bool $insensitive Should the match be case insensitive?
 * @return AbstractQuery Regular expression query
 */
private function buildRegexWithPlugin( $pattern, $insensitive ) {
	// Optional numeric limits from the plugin configuration; they are the
	// same for every field so they are resolved once up front.
	$ngramsExtracted = $this->regexPlugin['max_ngrams_extracted'] ?? null;
	$ngramClauses = $this->regexPlugin['max_ngram_clauses'] ?? null;
	$caseSensitive = !$insensitive;

	$perFieldFilters = [];
	// TODO: Update plugin to accept multiple values for the field property
	// so that at index time we can create a single trigram index with
	// copy_to instead of creating multiple queries.
	foreach ( $this->fields as $fieldName => $hlTarget ) {
		$regexFilter = new SourceRegex( $pattern, $fieldName, $fieldName . '.trigram' );
		// set some defaults
		$regexFilter->setMaxDeterminizedStates( $this->maxDeterminizedStates );
		if ( is_numeric( $ngramsExtracted ) ) {
			$regexFilter->setMaxNgramsExtracted( (int)$ngramsExtracted );
		}
		if ( is_numeric( $ngramClauses ) ) {
			$regexFilter->setMaxNgramClauses( (int)$ngramClauses );
		}
		$regexFilter->setCaseSensitive( $caseSensitive );
		$regexFilter->setLocale( $this->languageCode );

		$perFieldFilters[] = $regexFilter;
	}

	return Filters::booleanOr( $perFieldFilters );
}
302 | |
/**
 * Builds a regular expression query using groovy. It's significantly less
 * good than the wikimedia-extra plugin, but it's something.
 *
 * One script query is created per configured field and the queries are
 * combined with a boolean OR.
 *
 * @param string $pattern The regular expression to match
 * @param bool $insensitive Should the match be case insensitive?
 * @return AbstractQuery Regular expression query
 */
private function buildRegexWithGroovy( $pattern, $insensitive ) {
	$filters = [];
	// $this->fields is keyed by field name (the values are highlight targets,
	// see buildRegexWithPlugin() and doGetRegexHLFields()), so the source
	// field to read is the key, not the value.
	foreach ( array_keys( $this->fields ) as $field ) {
		$script = <<<GROOVY
import org.apache.lucene.util.automaton.*;
sourceText = _source.get("{$field}");
if (sourceText == null) {
false;
} else {
if (automaton == null) {
if (insensitive) {
locale = new Locale(language);
pattern = pattern.toLowerCase(locale);
}
regexp = new RegExp(pattern, RegExp.ALL ^ RegExp.AUTOMATON);
automaton = new CharacterRunAutomaton(regexp.toAutomaton());
}
if (insensitive) {
sourceText = sourceText.toLowerCase(locale);
}
automaton.run(sourceText);
}

GROOVY;

		$filters[] = new \Elastica\Query\Script( new \Elastica\Script\Script(
			$script,
			[
				// The automaton is run against the whole source text, so
				// the pattern is wrapped to match anywhere inside it.
				'pattern' => '.*(' . $pattern . ').*',
				'insensitive' => $insensitive,
				'language' => $this->languageCode,
				// The null here creates a slot in which the script will shove
				// an automaton while executing.
				'automaton' => null,
				'locale' => null,
			],
			'groovy'
		) );
	}

	return Filters::booleanOr( $filters );
}
353 | |
/**
 * Builds the highlighted fields used when the keyword is not in regex mode.
 * buildHighlightFields() returns the result of this method as is for every
 * parsed value that is not a regex.
 *
 * @param KeywordFeatureNode $node
 * @param QueryBuildingContext $context
 * @return array presumably HighlightedField[] like the regex branch of
 *  buildHighlightFields() - TODO confirm against the implementations
 */
abstract public function buildNonRegexHLFields( KeywordFeatureNode $node, QueryBuildingContext $context );
355 | |
/**
 * Tells whether a parsed value was produced by the regex branch of
 * parseValue(), i.e. is an array whose 'type' entry is 'regex'.
 *
 * The parameter is intentionally not declared as ?array: parseValue() is
 * documented to return array|false|null and a native array type would raise
 * a TypeError for false instead of answering "not a regex".
 *
 * @param array|false|null $parsedValue
 * @return bool
 */
private function isRegexQuery( $parsedValue = null ) {
	return is_array( $parsedValue )
		&& isset( $parsedValue['type'] )
		&& $parsedValue['type'] === 'regex';
}
364 | |
/**
 * Removes one leading and one trailing slash from the quoted value, when
 * present, leaving the bare regular expression.
 *
 * Empty input and input consisting only of slashes are handled without
 * reading past the end of the string, and the result is always a string.
 *
 * @param string $quotedValue The value including its "/" delimiters
 * @return string The value without its surrounding slashes
 */
private function trimFirstOccurrenceOfSlash( string $quotedValue ) {
	$pattern = $quotedValue;

	if ( $pattern !== '' && $pattern[0] === '/' ) {
		// substr() may return false instead of '' on older PHP versions.
		$pattern = (string)substr( $pattern, 1 );
	}

	// Re-check for emptiness: stripping the leading slash of "/" leaves
	// nothing to index into.
	if ( $pattern !== '' && $pattern[strlen( $pattern ) - 1] === '/' ) {
		$pattern = (string)substr( $pattern, 0, -1 );
	}

	return $pattern;
}
380 | } |