Code Coverage |
||||||||||
Classes and Traits |
Functions and Methods |
Lines |
||||||||
Total | |
0.00% |
0 / 1 |
|
7.69% |
1 / 13 |
CRAP | |
10.00% |
2 / 20 |
SourceRegex | |
0.00% |
0 / 1 |
|
7.69% |
1 / 13 |
202.62 | |
10.00% |
2 / 20 |
__construct | |
0.00% |
0 / 1 |
20 | |
0.00% |
0 / 7 |
|||
setRegex | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 1 |
|||
setField | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 1 |
|||
setNGramField | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 1 |
|||
setGramSize | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 1 |
|||
setMaxExpand | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 1 |
|||
setMaxStatesTraced | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 1 |
|||
setMaxInspect | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 1 |
|||
setMaxDeterminizedStates | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 1 |
|||
setCaseSensitive | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 1 |
|||
setLocale | |
100.00% |
1 / 1 |
1 | |
100.00% |
2 / 2 |
|||
setMaxNgramsExtracted | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 1 |
|||
setMaxNgramClauses | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 1 |
<?php | |
namespace CirrusSearch\Extra\Query; | |
use Elastica\Query\AbstractQuery; | |
/**
 * Source regex filter for trigram accelerated regex matching.
 *
 * @link https://github.com/wikimedia/search-extra/blob/master/docs/source_regex.md
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */
class SourceRegex extends AbstractQuery {
	/**
	 * Language codes that may be used by WMF wikis but are not valid Java
	 * locales, mapped to the locale that is sent instead.
	 * The empty string falls back to the ROOT locale.
	 * Note that the locale here is only used for transforming strings to lowercase.
	 */
	private const MW_LANG_CODE_TO_JAVA_LOCALE = [
		'be-tarask' => 'be',
		'map-bms' => '',
		'roa-tara' => '',
		'sh' => 'hbs',
		'shy-latn' => '',
	];

	/**
	 * Each argument is optional; null and the empty string mean "not provided"
	 * and leave the corresponding parameter unset.
	 *
	 * @param null|string $regex optional regex to match against field
	 * @param null|string $field optional field whose source to check with the regex
	 * @param null|string $ngramField optional field that is indexed with ngrams to
	 *  accelerate regex matching
	 */
	public function __construct( $regex = null, $field = null, $ngramField = null ) {
		// Compare explicitly instead of relying on truthiness: the string '0' is
		// falsy in PHP but is a valid regex (and a possible field name), so a
		// plain `if ( $regex )` would silently drop it.
		if ( $regex !== null && $regex !== '' ) {
			$this->setRegex( $regex );
		}
		if ( $field !== null && $field !== '' ) {
			$this->setField( $field );
		}
		if ( $ngramField !== null && $ngramField !== '' ) {
			$this->setNGramField( $ngramField );
		}
	}

	/**
	 * @param string $regex regex to match against field
	 * @return self
	 */
	public function setRegex( $regex ) {
		return $this->setParam( 'regex', $regex );
	}

	/**
	 * @param string $field field whose source to check with the regex
	 * @return self
	 */
	public function setField( $field ) {
		return $this->setParam( 'field', $field );
	}

	/**
	 * @param string $ngramField field that is indexed with ngrams to
	 *  accelerate regex matching
	 * @return self
	 */
	public function setNGramField( $ngramField ) {
		return $this->setParam( 'ngram_field', $ngramField );
	}

	/**
	 * @param int $gramSize size of the ngrams extracted for accelerating
	 *  the regex. Defaults to 3 if not set. That gram size must have been
	 *  produced by analyzing the ngramField.
	 * @return self
	 */
	public function setGramSize( $gramSize ) {
		return $this->setParam( 'gram_size', $gramSize );
	}

	/**
	 * @param int $maxExpand maximum range before outgoing automaton arcs are
	 *  ignored. Roughly corresponds to the maximum number of characters in a
	 *  character class ([abcd]) before it is treated as . for purposes of
	 *  acceleration. Defaults to 4.
	 * @return self
	 */
	public function setMaxExpand( $maxExpand ) {
		return $this->setParam( 'max_expand', $maxExpand );
	}

	/**
	 * @param int $maxStatesTraced maximum number of automaton states that can
	 *  be traced before the algorithm gives up, assumes the regex is too
	 *  complex and throws an error back to the user. Defaults to 10000 which
	 *  handily covers all regexes I cared to test.
	 * @return self
	 */
	public function setMaxStatesTraced( $maxStatesTraced ) {
		return $this->setParam( 'max_states_traced', $maxStatesTraced );
	}

	/**
	 * @param int $maxInspect maximum number of source fields to run the regex
	 *  against before giving up and just declaring all remaining fields not
	 *  matching by fiat. Defaults to MAX_INT. Set this to 10000 or something
	 *  nice and low to prevent regular expressions that cannot be sped up from
	 *  taking up too many resources.
	 * @return self
	 */
	public function setMaxInspect( $maxInspect ) {
		return $this->setParam( 'max_inspect', $maxInspect );
	}

	/**
	 * @param int $maxDeterminizedStates maximum number of automaton states
	 *  that Lucene's regex compilation can expand to (even temporarily)
	 * @return self
	 */
	public function setMaxDeterminizedStates( $maxDeterminizedStates ) {
		return $this->setParam( 'max_determinized_states', $maxDeterminizedStates );
	}

	/**
	 * @param bool $caseSensitive is the regex case sensitive? Defaults to
	 *  case insensitive if not set.
	 * @return self
	 */
	public function setCaseSensitive( $caseSensitive ) {
		return $this->setParam( 'case_sensitive', $caseSensitive );
	}

	/**
	 * Codes listed in MW_LANG_CODE_TO_JAVA_LOCALE are replaced by their mapped
	 * value before being sent; any other value is passed through unchanged.
	 *
	 * @param string $locale locale used for case conversions. It's important that
	 *  this matches the locale used for lowercasing in the ngram index.
	 * @return self
	 */
	public function setLocale( $locale ) {
		$locale = self::MW_LANG_CODE_TO_JAVA_LOCALE[$locale] ?? $locale;
		return $this->setParam( 'locale', $locale );
	}

	/**
	 * @param int $maxNgrams The maximum number of ngrams to extract from the
	 *  regex. If more could be extracted from the regex they are ignored.
	 * @return self
	 */
	public function setMaxNgramsExtracted( $maxNgrams ) {
		return $this->setParam( 'max_ngrams_extracted', $maxNgrams );
	}

	/**
	 * @param int $maxNgramClauses The maximum number of boolean clauses
	 *  generated from extracted ngrams.
	 * @return self
	 */
	public function setMaxNgramClauses( $maxNgramClauses ) {
		return $this->setParam( 'max_ngram_clauses', $maxNgramClauses );
	}
}