Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
10.53% |
2 / 19 |
|
7.69% |
1 / 13 |
CRAP | |
0.00% |
0 / 1 |
SourceRegex | |
10.53% |
2 / 19 |
|
7.69% |
1 / 13 |
199.37 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
20 | |||
setRegex | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setField | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setNGramField | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setGramSize | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setMaxExpand | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setMaxStatesTraced | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setMaxInspect | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setMaxDeterminizedStates | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setCaseSensitive | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setLocale | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
setMaxNgramsExtracted | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setMaxNgramClauses | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace CirrusSearch\Extra\Query; |
4 | |
5 | use Elastica\Query\AbstractQuery; |
6 | |
7 | /** |
8 | * Source regex filter for trigram accelerated regex matching. |
9 | * |
10 | * @link https://github.com/wikimedia/search-extra/blob/master/docs/source_regex.md |
11 | * |
12 | * This program is free software; you can redistribute it and/or modify |
13 | * it under the terms of the GNU General Public License as published by |
14 | * the Free Software Foundation; either version 2 of the License, or |
15 | * (at your option) any later version. |
16 | * |
17 | * This program is distributed in the hope that it will be useful, |
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
20 | * GNU General Public License for more details. |
21 | * |
22 | * You should have received a copy of the GNU General Public License along |
23 | * with this program; if not, write to the Free Software Foundation, Inc., |
24 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
25 | * http://www.gnu.org/copyleft/gpl.html |
26 | */ |
27 | |
28 | class SourceRegex extends AbstractQuery { |
29 | /** |
30 | * List of language codes that may be used by WMF wikis which are not valid Java locales. |
31 | * Use of the empty string falls back to the ROOT locale. |
32 | * Note that the locale here is only used for transforming strings to lowercase. |
33 | */ |
34 | private const MW_LANG_CODE_TO_JAVA_LOCALE = [ |
35 | 'be-tarask' => 'be', |
36 | 'map-bms' => '', |
37 | 'roa-tara' => '', |
38 | 'sh' => 'hbs', |
39 | 'shy-latn' => '', |
40 | ]; |
41 | |
42 | /** |
43 | * @param null|string $regex optional regex to match against field |
44 | * @param null|string $field optional field whose source to check with the regex |
45 | * @param null|string $ngramField optional field that is indexed with ngrams to |
46 | * accelerate regex matching. Any argument that is falsy (e.g. null or '') is skipped and its parameter is left unset. |
47 | */ |
48 | public function __construct( $regex = null, $field = null, $ngramField = null ) { |
49 | if ( $regex ) { |
50 | $this->setRegex( $regex ); |
51 | } |
52 | if ( $field ) { |
53 | $this->setField( $field ); |
54 | } |
55 | if ( $ngramField ) { |
56 | $this->setNGramField( $ngramField ); |
57 | } |
58 | } |
59 | |
60 | /** |
61 | * @param string $regex regex to match against field |
62 | * @return self |
63 | */ |
64 | public function setRegex( $regex ) { |
65 | return $this->setParam( 'regex', $regex ); |
66 | } |
67 | |
68 | /** |
69 | * @param string $field field whose source to check with the regex |
70 | * @return self |
71 | */ |
72 | public function setField( $field ) { |
73 | return $this->setParam( 'field', $field ); |
74 | } |
75 | |
76 | /** |
77 | * @param string $ngramField field that is indexed with ngrams to |
78 | * accelerate regex matching |
79 | * @return self |
80 | */ |
81 | public function setNGramField( $ngramField ) { |
82 | return $this->setParam( 'ngram_field', $ngramField ); |
83 | } |
84 | |
85 | /** |
86 | * @param int $gramSize size of the ngrams extracted for accelerating |
87 | * the regex. Defaults to 3 if not set. That gram size must have been |
88 | * produced by analyzing the ngramField. |
89 | * @return self |
90 | */ |
91 | public function setGramSize( $gramSize ) { |
92 | return $this->setParam( 'gram_size', $gramSize ); |
93 | } |
94 | |
95 | /** |
96 | * @param int $maxExpand maximum range before outgoing automaton arcs are |
97 | * ignored. Roughly corresponds to the maximum number of characters in a |
98 | * character class ([abcd]) before it is treated as . for purposes of |
99 | * acceleration. Defaults to 4. |
100 | * @return self |
101 | */ |
102 | public function setMaxExpand( $maxExpand ) { |
103 | return $this->setParam( 'max_expand', $maxExpand ); |
104 | } |
105 | |
106 | /** |
107 | * @param int $maxStatesTraced maximum number of automaton states that can |
108 | * be traced before the algorithm gives up and assumes the regex is too |
109 | * complex and throws an error back to the user. Defaults to 10000 which |
110 | * handily covers all regexes I cared to test. |
111 | * @return self |
112 | */ |
113 | public function setMaxStatesTraced( $maxStatesTraced ) { |
114 | return $this->setParam( 'max_states_traced', $maxStatesTraced ); |
115 | } |
116 | |
117 | /** |
118 | * @param int $maxInspect maximum number of source fields to run the regex |
119 | * against before giving up and just declaring all remaining fields not |
120 | * matching by fiat. Defaults to MAX_INT. Set this to 10000 or something |
121 | * nice and low to prevent regular expressions that cannot be sped up from |
122 | * taking up too many resources. |
123 | * @return self |
124 | */ |
125 | public function setMaxInspect( $maxInspect ) { |
126 | return $this->setParam( 'max_inspect', $maxInspect ); |
127 | } |
128 | |
129 | /** |
130 | * @param int $maxDeterminizedStates maximum number of automaton states |
131 | * that Lucene's regex compilation can expand to (even temporarily) |
132 | * @return self |
133 | */ |
134 | public function setMaxDeterminizedStates( $maxDeterminizedStates ) { |
135 | return $this->setParam( 'max_determinized_states', $maxDeterminizedStates ); |
136 | } |
137 | |
138 | /** |
139 | * @param bool $caseSensitive is the regex case sensitive? Defaults to |
140 | * case insensitive if not set. |
141 | * @return self |
142 | */ |
143 | public function setCaseSensitive( $caseSensitive ) { |
144 | return $this->setParam( 'case_sensitive', $caseSensitive ); |
145 | } |
146 | |
147 | /** |
148 | * @param string $locale locale used for case conversions. It's important that |
149 | * this matches the locale used for lowercasing in the ngram index. Codes listed in MW_LANG_CODE_TO_JAVA_LOCALE are translated before being set. |
150 | * @return self |
151 | */ |
152 | public function setLocale( $locale ) { |
153 | $locale = self::MW_LANG_CODE_TO_JAVA_LOCALE[$locale] ?? $locale; |
154 | return $this->setParam( 'locale', $locale ); |
155 | } |
156 | |
157 | /** |
158 | * @param int $maxNgrams The maximum number of ngrams to extract from the |
159 | * regex. If more could be extracted from the regex they are ignored. |
160 | * @return self |
161 | */ |
162 | public function setMaxNgramsExtracted( $maxNgrams ) { |
163 | return $this->setParam( 'max_ngrams_extracted', $maxNgrams ); |
164 | } |
165 | |
166 | /** |
167 | * @param int $maxNgramClauses The maximum number of boolean clauses |
168 | * generated from extracted ngrams. |
169 | * @return self |
170 | */ |
171 | public function setMaxNgramClauses( $maxNgramClauses ) { |
172 | return $this->setParam( 'max_ngram_clauses', $maxNgramClauses ); |
173 | } |
174 | } |