Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
10.53% |
2 / 19 |
|
7.69% |
1 / 13 |
CRAP | |
0.00% |
0 / 1 |
| SourceRegex | |
10.53% |
2 / 19 |
|
7.69% |
1 / 13 |
199.37 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
20 | |||
| setRegex | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| setField | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| setNGramField | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| setGramSize | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| setMaxExpand | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| setMaxStatesTraced | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| setMaxInspect | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| setMaxDeterminizedStates | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| setCaseSensitive | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| setLocale | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| setMaxNgramsExtracted | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| setMaxNgramClauses | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * @license GPL-2.0-or-later |
| 4 | */ |
| 5 | |
| 6 | namespace CirrusSearch\Extra\Query; |
| 7 | |
| 8 | use Elastica\Query\AbstractQuery; |
| 9 | |
| 10 | /** |
| 11 | * Source regex filter for trigram accelerated regex matching. |
| 12 | * |
| 13 | * @link https://github.com/wikimedia/search-extra/blob/master/docs/source_regex.md |
| 14 | */ |
| 15 | class SourceRegex extends AbstractQuery { |
| 16 | /** |
| 17 | * List of language codes that may be used by WMF wikis but are not valid Java locales. |
| 18 | * Use of the empty string falls back to the ROOT locale. |
| 19 | * Note that the locale here is only used for transforming strings to lowercase. |
| 20 | */ |
| 21 | private const MW_LANG_CODE_TO_JAVA_LOCALE = [ |
| 22 | 'be-tarask' => 'be', |
| 23 | 'map-bms' => '', |
| 24 | 'roa-tara' => '', |
| 25 | 'sh' => 'hbs', |
| 26 | 'shy-latn' => '', |
| 27 | ]; |
| 28 | |
| 29 | /** |
| 30 | * @param null|string $regex optional regex to match against field |
| 31 | * @param null|string $field optional field whose source to check with the regex |
| 32 | * @param null|string $ngramField optional field that is indexed with ngrams to |
| 33 | * accelerate regex matching |
| 34 | */ |
| 35 | public function __construct( $regex = null, $field = null, $ngramField = null ) { |
| 36 | if ( $regex ) { |
| 37 | $this->setRegex( $regex ); |
| 38 | } |
| 39 | if ( $field ) { |
| 40 | $this->setField( $field ); |
| 41 | } |
| 42 | if ( $ngramField ) { |
| 43 | $this->setNGramField( $ngramField ); |
| 44 | } |
| 45 | } |
| 46 | |
| 47 | /** |
| 48 | * @param string $regex regex to match against field |
| 49 | * @return self |
| 50 | */ |
| 51 | public function setRegex( $regex ) { |
| 52 | return $this->setParam( 'regex', $regex ); |
| 53 | } |
| 54 | |
| 55 | /** |
| 56 | * @param string $field field whose source to check with the regex |
| 57 | * @return self |
| 58 | */ |
| 59 | public function setField( $field ) { |
| 60 | return $this->setParam( 'field', $field ); |
| 61 | } |
| 62 | |
| 63 | /** |
| 64 | * @param string $ngramField field that is indexed with ngrams to |
| 65 | * accelerate regex matching |
| 66 | * @return self |
| 67 | */ |
| 68 | public function setNGramField( $ngramField ) { |
| 69 | return $this->setParam( 'ngram_field', $ngramField ); |
| 70 | } |
| 71 | |
| 72 | /** |
| 73 | * @param int $gramSize size of the ngrams extracted for accelerating |
| 74 | * the regex. Defaults to 3 if not set. That gram size must have been |
| 75 | * produced by analyzing the ngramField. |
| 76 | * @return self |
| 77 | */ |
| 78 | public function setGramSize( $gramSize ) { |
| 79 | return $this->setParam( 'gram_size', $gramSize ); |
| 80 | } |
| 81 | |
| 82 | /** |
| 83 | * @param int $maxExpand maximum range before outgoing automaton arcs are |
| 84 | * ignored. Roughly corresponds to the maximum number of characters in a |
| 85 | * character class ([abcd]) before it is treated as . for purposes of |
| 86 | * acceleration. Defaults to 4. |
| 87 | * @return self |
| 88 | */ |
| 89 | public function setMaxExpand( $maxExpand ) { |
| 90 | return $this->setParam( 'max_expand', $maxExpand ); |
| 91 | } |
| 92 | |
| 93 | /** |
| 94 | * @param int $maxStatesTraced maximum number of automaton states that can |
| 95 | * be traced before the algorithm gives up and assumes the regex is too |
| 96 | * complex and throws an error back to the user. Defaults to 10000 which |
| 97 | * handily covers all regexes I cared to test. |
| 98 | * @return self |
| 99 | */ |
| 100 | public function setMaxStatesTraced( $maxStatesTraced ) { |
| 101 | return $this->setParam( 'max_states_traced', $maxStatesTraced ); |
| 102 | } |
| 103 | |
| 104 | /** |
| 105 | * @param int $maxInspect maximum number of source fields to run the regex |
| 106 | * against before giving up and just declaring all remaining fields not |
| 107 | * matching by fiat. Defaults to MAX_INT. Set this to 10000 or something |
| 108 | * nice and low to prevent regular expressions that cannot be sped up from |
| 109 | * taking up too many resources. |
| 110 | * @return self |
| 111 | */ |
| 112 | public function setMaxInspect( $maxInspect ) { |
| 113 | return $this->setParam( 'max_inspect', $maxInspect ); |
| 114 | } |
| 115 | |
| 116 | /** |
| 117 | * @param int $maxDeterminizedStates maximum number of automaton states |
| 118 | * that Lucene's regex compilation can expand to (even temporarily) |
| 119 | * @return self |
| 120 | */ |
| 121 | public function setMaxDeterminizedStates( $maxDeterminizedStates ) { |
| 122 | return $this->setParam( 'max_determinized_states', $maxDeterminizedStates ); |
| 123 | } |
| 124 | |
| 125 | /** |
| 126 | * @param bool $caseSensitive is the regex case sensitive? Defaults to |
| 127 | * case insensitive if not set. |
| 128 | * @return self |
| 129 | */ |
| 130 | public function setCaseSensitive( $caseSensitive ) { |
| 131 | return $this->setParam( 'case_sensitive', $caseSensitive ); |
| 132 | } |
| 133 | |
| 134 | /** |
| 135 | * @param string $locale locale used for case conversions. It's important that |
| 136 | * this matches the locale used for lowercasing in the ngram index. |
| 137 | * @return self |
| 138 | */ |
| 139 | public function setLocale( $locale ) { |
| 140 | $locale = self::MW_LANG_CODE_TO_JAVA_LOCALE[$locale ?? ''] ?? $locale; |
| 141 | return $this->setParam( 'locale', $locale ); |
| 142 | } |
| 143 | |
| 144 | /** |
| 145 | * @param int $maxNgrams The maximum number of ngrams to be extracted from the |
| 146 | * regex. If more could be extracted from the regex they are ignored. |
| 147 | * @return self |
| 148 | */ |
| 149 | public function setMaxNgramsExtracted( $maxNgrams ) { |
| 150 | return $this->setParam( 'max_ngrams_extracted', $maxNgrams ); |
| 151 | } |
| 152 | |
| 153 | /** |
| 154 | * @param int $maxNgramClauses The maximum number of boolean clauses |
| 155 | * generated from extracted ngrams. |
| 156 | * @return self |
| 157 | */ |
| 158 | public function setMaxNgramClauses( $maxNgramClauses ) { |
| 159 | return $this->setParam( 'max_ngram_clauses', $maxNgramClauses ); |
| 160 | } |
| 161 | } |