Code Coverage |
||||||||||
Classes and Traits |
Functions and Methods |
Lines |
||||||||
Total | |
0.00% |
0 / 1 |
|
60.00% |
6 / 10 |
CRAP | |
84.21% |
48 / 57 |
Escaper | |
0.00% |
0 / 1 |
|
60.00% |
6 / 10 |
21.57 | |
84.21% |
48 / 57 |
__construct | |
100.00% |
1 / 1 |
1 | |
100.00% |
3 / 3 |
|||
escapeQuotes | |
100.00% |
1 / 1 |
2 | |
100.00% |
3 / 3 |
|||
fixupQueryStringPart | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 4 |
|||
fixupWholeQueryString | |
0.00% |
0 / 1 |
4.03 | |
88.00% |
22 / 25 |
|||
lowercaseMatched | |
100.00% |
1 / 1 |
1 | |
100.00% |
1 / 1 |
|||
balanceQuotes | |
100.00% |
1 / 1 |
2 | |
100.00% |
3 / 3 |
|||
unbalancedQuotes | |
100.00% |
1 / 1 |
6 | |
100.00% |
14 / 14 |
|||
unescape | |
100.00% |
1 / 1 |
1 | |
100.00% |
2 / 2 |
|||
getAllowLeadingWildcard | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 1 |
|||
getLanguage | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 1 |
<?php | |
namespace CirrusSearch\Search; | |
/** | |
* Escapes queries. | |
* | |
* This program is free software; you can redistribute it and/or modify | |
* it under the terms of the GNU General Public License as published by | |
* the Free Software Foundation; either version 2 of the License, or | |
* (at your option) any later version. | |
* | |
* This program is distributed in the hope that it will be useful, | |
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
* GNU General Public License for more details. | |
* | |
* You should have received a copy of the GNU General Public License along | |
* with this program; if not, write to the Free Software Foundation, Inc., | |
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | |
* http://www.gnu.org/copyleft/gpl.html | |
*/ | |
class Escaper {

	/**
	 * @var string MediaWiki language code
	 */
	private $language;

	/**
	 * Allow leading wildcards?
	 * @var bool
	 */
	private $allowLeadingWildcard;

	/**
	 * @param string $language MediaWiki language code
	 * @param bool $allowLeadingWildcard When false, fixupWholeQueryString() escapes
	 *  ? and * that do not directly follow a word character.
	 */
	public function __construct( $language, $allowLeadingWildcard = true ) {
		$this->language = $language;
		$this->allowLeadingWildcard = $allowLeadingWildcard;
	}

	/**
	 * Escape double quotes that look like acronym markers rather than phrase
	 * delimiters. Only applied when the language is Hebrew ('he'); any other
	 * language gets the text back untouched.
	 *
	 * @param string $text
	 * @return string
	 */
	public function escapeQuotes( $text ) {
		if ( $this->language === 'he' ) {
			// Hebrew uses the double quote (") character as a stand-in for quotation marks (“”)
			// which delineate phrases. It also uses double quotes as a stand-in for another
			// character (״), called a Gershayim, which marks acronyms. Here we guess if the
			// intent was to mark a phrase, in which case we leave the quotes alone, or to mark
			// an acronym, in which case we escape them.
			// A quote counts as "acronym" when it has a non-space, non-backslash character
			// directly before it and a non-space character directly after it.
			return preg_replace( '/(?<=[^\s\\\\])"(?=\S)/u', '\\"', $text );
		}
		return $text;
	}

	/**
	 * Make sure the query string part is well formed by escaping some syntax that we don't
	 * want users to get direct access to and making sure quotes are balanced.
	 * These special characters _aren't_ escaped:
	 * * and ?: Do a wildcard search against the stemmed text which isn't strictly a good
	 * idea but this is so rarely used that adding extra code to flip prefix searches into
	 * real prefix searches isn't really worth it.
	 * ~: Do a fuzzy match against the stemmed text which isn't strictly a good idea but it
	 * gets the job done and fuzzy matches are a really rarely used feature to be creating an
	 * extra index for.
	 * ": Perform a phrase search for the quoted term. If the "s aren't balanced we insert one
	 * at the end of the term to make sure elasticsearch doesn't barf at us.
	 *
	 * @param string $string
	 * @return string
	 */
	public function fixupQueryStringPart( $string ) {
		// Escape characters that can be escaped with \\
		// The pattern uses the x modifier, so the whitespace inside it is layout only.
		$string = preg_replace( '/(
				\(| (?# no user supplied groupings)
				\)|
				\{| (?# no exclusive range queries)
				}|
				\[| (?# no inclusive range queries either)
				]|
				\^| (?# no user supplied boosts at this point, though I cant think why)
				:| (?# no specifying your own fields)
				\\\(?!") (?# the only acceptable escaping is for quotes)
			)/x', '\\\$1', $string );
		// Forward slash escaping doesn't work properly in all environments so we just eat them. Nom.
		$string = str_replace( '/', ' ', $string );
		// Elasticsearch's query strings can't abide unbalanced quotes
		return $this->balanceQuotes( $string );
	}

	/**
	 * Make sure that all operators and lucene syntax is used correctly in the query string.
	 * Where it isn't, the syntax is escaped (or, for AND/OR/NOT, lowercased) so it becomes
	 * part of the query text.
	 *
	 * @param string $string
	 * @return string fixed up query string
	 */
	public function fixupWholeQueryString( $string ) {
		// Prefix every character of the match that is neither whitespace nor a word
		// character with a backslash. The u modifier is required: the patterns feeding
		// this callback run in UTF-8 mode where \s can match multibyte whitespace
		// (e.g. U+00A0, U+3000); a byte-wise pattern would insert backslashes between
		// the bytes of such a character and corrupt the string.
		$escapeBadSyntax = static function ( $matches ) {
			return preg_replace( '/(?=[^\s\w])/u', '\\', $matches[0] );
		};

		// Be careful when editing this method because the ordering of the replacements matters.

		// Escape ~ that don't follow a term or a quote
		$string = preg_replace_callback( '/(?<![\w"])~/u', $escapeBadSyntax, $string );

		// When allow leading wildcard is disabled elasticsearch will report an
		// error if these are unescaped. Escape ? and * that don't follow a term.
		if ( !$this->allowLeadingWildcard ) {
			$string = preg_replace_callback( '/(?<!\w)[?*]/u', $escapeBadSyntax, $string );
		}

		// Reduce token ranges to bare tokens without the < or >
		$string = preg_replace( '/[<>]+(\S)/u', '$1', $string );

		// Turn bad fuzzy searches into searches that contain a ~. A fuzzy suffix is
		// accepted when nothing, or a single digit from 0 to 2, follows the ~.
		$string = preg_replace_callback( '/(?<leading>\w)~(?<trailing>\S*)/u',
			static function ( $matches ) {
				if ( preg_match( '/^[0-2]?$/', $matches[ 'trailing' ] ) ) {
					return $matches[ 0 ];
				}
				// Escape this ~ and any not-yet-escaped ~ in the rest of the token.
				return $matches[ 'leading' ] . '\\~' .
					preg_replace( '/(?<!\\\\)~/', '\~', $matches[ 'trailing' ] );
			}, $string );

		// Turn bad proximity searches into searches that contain a ~. The suffix is
		// accepted as soon as it contains at least one digit.
		$string = preg_replace_callback( '/"~(?<trailing>\S*)/u', static function ( $matches ) {
			if ( preg_match( '/\d+/', $matches[ 'trailing' ] ) ) {
				return $matches[ 0 ];
			}
			return '"\\~' . $matches[ 'trailing' ];
		}, $string );

		// Escape +, -, and ! when not immediately followed by a term or when immediately
		// prefixed with a term. Catches "foo-bar", "foo- bar", "foo - bar". The only
		// acceptable use is "foo -bar" and "-bar foo".
		$string = preg_replace_callback( '/[+\-!]+(?!\w)/u', $escapeBadSyntax, $string );
		$string = preg_replace_callback( '/(?<!^|[ \\\\])[+\-!]+/u', $escapeBadSyntax, $string );

		// Escape || when not between terms
		$string = preg_replace_callback( '/^\s*\|\|/u', $escapeBadSyntax, $string );
		$string = preg_replace_callback( '/\|\|\s*$/u', $escapeBadSyntax, $string );

		// Lowercase AND and OR when not surrounded on both sides by a term.
		// Lowercase NOT when it doesn't have a term after it.
		$string = preg_replace_callback( '/^\s*(?:AND|OR)\b|\b(?:AND|OR|NOT)\s*$/u',
			[ self::class, 'lowercaseMatched' ], $string );
		$string = preg_replace_callback( '/\b(?:AND|OR|NOT)\s+(?=AND\b|OR\b|NOT\b)/u',
			[ self::class, 'lowercaseMatched' ], $string );

		return $string;
	}

	/**
	 * preg_replace_callback() callback that lowercases the whole match.
	 *
	 * @param string[] $matches
	 * @return string
	 */
	private static function lowercaseMatched( $matches ) {
		return strtolower( $matches[ 0 ] );
	}

	/**
	 * Append a closing double quote when the text has an unbalanced one.
	 *
	 * @param string $text
	 * @return string
	 */
	public function balanceQuotes( $text ) {
		if ( $this->unbalancedQuotes( $text ) ) {
			$text .= '"';
		}
		return $text;
	}

	/**
	 * Scan the text byte by byte, toggling the "in quote" state on every double quote
	 * that is not preceded by an escaping backslash.
	 *
	 * @param string $text
	 * @param int $from offset of the first byte to inspect
	 * @param int $to offset one past the last byte to inspect; a negative value, or a
	 *  value beyond the end of the text, means "up to the end of the text"
	 * @return bool true if there are unbalanced quotes in the [$from, $to) range.
	 */
	public function unbalancedQuotes( $text, $from = 0, $to = -1 ) {
		$length = strlen( $text );
		// Clamp the upper bound so we never index past the end of the string.
		if ( $to < 0 || $to > $length ) {
			$to = $length;
		}
		$inQuote = false;
		$inEscape = false;
		for ( $i = $from; $i < $to; $i++ ) {
			if ( $inEscape ) {
				// The previous byte was a backslash: this byte is escaped, skip it.
				$inEscape = false;
				continue;
			}
			switch ( $text[ $i ] ) {
				case '"':
					$inQuote = !$inQuote;
					break;
				case '\\':
					$inEscape = true;
			}
		}
		return $inQuote;
	}

	/**
	 * Unescape a given string by dropping the escape sequence in front of every
	 * escaped character.
	 *
	 * @param string $query string to unescape
	 * @param string $escapeChar escape sequence
	 * @return string
	 */
	public function unescape( $query, $escapeChar = '\\' ) {
		// Quote the escape sequence so it is matched literally inside the pattern.
		$escapeChar = preg_quote( $escapeChar, '/' );
		return preg_replace( "/$escapeChar(.)/u", '$1', $query );
	}

	/**
	 * Is leading wildcard allowed?
	 *
	 * @return bool
	 */
	public function getAllowLeadingWildcard() {
		return $this->allowLeadingWildcard;
	}

	/**
	 * @return string MediaWiki language code given to the constructor
	 */
	public function getLanguage() {
		return $this->language;
	}
}