Code Coverage for /workspace/src/extensions/CirrusSearch/includes/Search/Escaper.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	84.75% covered (warning)	84.75%	50 / 59	60.00% covered (warning)	60.00%	6 / 10	CRAP	0.00% covered (danger)	0.00%	0 / 1
Escaper	84.75% covered (warning)	84.75%	50 / 59	60.00% covered (warning)	60.00%	6 / 10	21.42	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
escapeQuotes	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	2
fixupQueryStringPart	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	2
fixupWholeQueryString	89.29% covered (warning)	89.29%	25 / 28	0.00% covered (danger)	0.00%	0 / 1	4.02
lowercaseMatched	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
balanceQuotes	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	2
unbalancedQuotes	100.00% covered (success)	100.00%	14 / 14	100.00% covered (success)	100.00%	1 / 1	6
unescape	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
getAllowLeadingWildcard	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
getLanguage	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2

1	<?php
2
3	namespace CirrusSearch\Search;
4
5	/**
6	* Escapes queries.
7	*
8	* This program is free software; you can redistribute it and/or modify
9	* it under the terms of the GNU General Public License as published by
10	* the Free Software Foundation; either version 2 of the License, or
11	* (at your option) any later version.
12	*
13	* This program is distributed in the hope that it will be useful,
14	* but WITHOUT ANY WARRANTY; without even the implied warranty of
15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	* GNU General Public License for more details.
17	*
18	* You should have received a copy of the GNU General Public License along
19	* with this program; if not, write to the Free Software Foundation, Inc.,
20	* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21	* http://www.gnu.org/copyleft/gpl.html
22	*/
class Escaper {

	/**
	 * @var string MediaWiki language code
	 */
	private $language;

	/**
	 * Allow leading wildcards?
	 * @var bool
	 */
	private $allowLeadingWildcard;

	/**
	 * @param string $language MediaWiki language code
	 * @param bool $allowLeadingWildcard When false, fixupWholeQueryString() escapes
	 *  ? and * that do not follow a word character.
	 */
	public function __construct( $language, $allowLeadingWildcard = true ) {
		$this->language = $language;
		$this->allowLeadingWildcard = $allowLeadingWildcard;
	}

	/**
	 * Escape double quotes that look like Hebrew acronym marks rather than phrase
	 * delimiters. Text is returned untouched for every language other than 'he'.
	 *
	 * @param string $text
	 * @return string
	 */
	public function escapeQuotes( $text ) {
		if ( $this->language === 'he' ) {
			// Hebrew uses the double quote (") character as a stand-in for quotation marks (“”)
			// which delineate phrases. It also uses double quotes as a stand-in for another
			// character (״), called a Gershayim, which marks acronyms. Here we guess if the
			// intent was to mark a phrase, in which case we leave the quotes alone, or to mark
			// an acronym, in which case we escape them.
			// A quote counts as an acronym mark when it is glued to non-space characters on
			// both sides and is not already escaped.
			return preg_replace( '/(?<=[^\s\\\\])"(?=\S)/u', '\\"', $text );
		}
		return $text;
	}

	/**
	 * Make sure the query string part is well formed by escaping some syntax that we don't
	 * want users to get direct access to and making sure quotes are balanced.
	 * These special characters _aren't_ escaped:
	 * * and ?: Do a wildcard search against the stemmed text which isn't strictly a good
	 * idea but this is so rarely used that adding extra code to flip prefix searches into
	 * real prefix searches isn't really worth it.
	 * ~: Do a fuzzy match against the stemmed text which isn't strictly a good idea but it
	 * gets the job done and fuzzy matches are a really rarely used feature to be creating an
	 * extra index for.
	 * ": Perform a phrase search for the quoted term. If the "s aren't balanced we insert one
	 * at the end of the term to make sure elasticsearch doesn't barf at us.
	 *
	 * @param string $string
	 * @return string
	 */
	public function fixupQueryStringPart( $string ) {
		// Escape characters that can be escaped with \\
		// The pattern uses the x flag, so whitespace inside it is ignored and each
		// alternative can carry an inline (?# ...) comment.
		$string = preg_replace( '/(
				\(|     (?# no user supplied groupings)
				\)|
				\{|     (?# no exclusive range queries)
				}|
				\[|     (?# no inclusive range queries either)
				]|
				\^|     (?# no user supplied boosts at this point, though I cant think why)
				:|      (?# no specifying your own fields)
				\\\(?!") (?# the only acceptable escaping is for quotes)
			)/x', '\\\$1', $string );
		// Forward slash escaping doesn't work properly in all environments so we just eat them. Nom.
		$string = str_replace( '/', ' ', $string );

		// Elasticsearch's query strings can't abide unbalanced quotes
		return $this->balanceQuotes( $string );
	}

	/**
	 * Make sure that all operators and lucene syntax is used correctly in the query string.
	 * Where it isn't, the syntax is escaped (or, for AND/OR/NOT, lowercased) so it becomes
	 * part of the query text.
	 *
	 * @param string $string
	 * @return string fixed up query string
	 */
	public function fixupWholeQueryString( $string ) {
		// Prefixes every character of the match that is neither whitespace nor a word
		// character with a backslash.
		$escapeBadSyntax = static function ( $matches ) {
			return preg_replace( '/(?=[^\s\w])/', '\\', $matches[0] );
		};

		// Be careful when editing this method because the ordering of the replacements matters.

		// Escape ~ that don't follow a term or a quote
		$string = preg_replace_callback( '/(?<![\w"])~/u', $escapeBadSyntax, $string );

		// When allow leading wildcard is disabled elasticsearch will report an
		// error if these are unescaped. Escape ? and * that don't follow a term.
		if ( !$this->allowLeadingWildcard ) {
			$string = preg_replace_callback( '/(?<!\w)[?*]/u', $escapeBadSyntax, $string );
		}

		// Reduce token ranges to bare tokens without the < or >
		$string = preg_replace( '/[<>]+(\S)/u', '$1', $string );

		// Turn bad fuzzy searches into searches that contain a literal ~. A fuzzy suffix is
		// only kept when it is empty or a single digit from 0 to 2.
		$string = preg_replace_callback( '/(?<leading>\w)~(?<trailing>\S*)/u',
			static function ( $matches ) {
				if ( preg_match( '/^[0-2]?$/', $matches[ 'trailing' ] ) ) {
					return $matches[ 0 ];
				}
				// Escape the offending ~ and any further unescaped ~ in the rest of the token.
				return $matches[ 'leading' ] . '\\~' .
					preg_replace( '/(?<!\\\\)~/', '\~', $matches[ 'trailing' ] );
			}, $string );

		// Turn bad proximity searches into searches that contain a ~
		// NOTE(review): the digit check is unanchored, so any token containing a digit after
		// "~ is left alone - confirm whether that leniency is intended.
		$string = preg_replace_callback( '/"~(?<trailing>\S*)/u', static function ( $matches ) {
			if ( preg_match( '/\d+/', $matches[ 'trailing' ] ) ) {
				return $matches[ 0 ];
			}
			return '"\\~' . $matches[ 'trailing' ];
		}, $string );

		// Escape +, -, and ! when not immediately followed by a term or when immediately
		// prefixed with a term. Catches "foo-bar", "foo- bar", "foo - bar". The only
		// acceptable use is "foo -bar" and "-bar foo".
		$string = preg_replace_callback( '/[+\-!]+(?!\w)/u', $escapeBadSyntax, $string );
		$string = preg_replace_callback( '/(?<!^|[ \\\\])[+\-!]+/u', $escapeBadSyntax, $string );

		// Escape || when not between terms
		$string = preg_replace_callback( '/^\s*\|\|/u', $escapeBadSyntax, $string );
		$string = preg_replace_callback( '/\|\|\s*$/u', $escapeBadSyntax, $string );

		// Lowercase AND and OR when not surrounded on both sides by a term.
		// Lowercase NOT when it doesn't have a term after it.
		$string = preg_replace_callback( '/^\s*(?:AND|OR)\b|\b(?:AND|OR|NOT)\s*$/u',
			[ self::class, 'lowercaseMatched' ], $string );
		$string = preg_replace_callback( '/\b(?:AND|OR|NOT)\s+(?=AND\b|OR\b|NOT\b)/u',
			[ self::class, 'lowercaseMatched' ], $string );

		return $string;
	}

	/**
	 * preg_replace_callback() handler that lowercases the whole match.
	 *
	 * @param string[] $matches
	 * @return string
	 */
	private static function lowercaseMatched( $matches ) {
		return strtolower( $matches[ 0 ] );
	}

	/**
	 * Append a closing double quote when the text has an unbalanced one.
	 *
	 * @param string $text
	 * @return string
	 */
	public function balanceQuotes( $text ) {
		if ( $this->unbalancedQuotes( $text ) ) {
			$text .= '"';
		}
		return $text;
	}

	/**
	 * Scan the bytes of $text from offset $from up to, but not including, offset $to and
	 * report whether a double quote is left open. A backslash makes the scan skip the
	 * byte that follows it, so \" does not toggle the quote state.
	 *
	 * @param string $text
	 * @param int $from first byte offset to inspect
	 * @param int $to byte offset to stop before; a negative value, or one past the end of
	 *  $text, means the end of $text
	 * @return bool true if there are unbalanced quotes in the [$from, $to) range.
	 */
	public function unbalancedQuotes( $text, $from = 0, $to = -1 ) {
		$length = strlen( $text );
		// Clamp the upper bound so an oversized $to never reads past the end of the string.
		if ( $to < 0 || $to > $length ) {
			$to = $length;
		}
		$inQuote = false;
		$inEscape = false;
		for ( $i = $from; $i < $to; $i++ ) {
			if ( $inEscape ) {
				// The previous byte was a backslash: this byte is escaped, ignore it.
				$inEscape = false;
				continue;
			}
			switch ( $text[ $i ] ) {
				case '"':
					$inQuote = !$inQuote;
					break;
				case '\\':
					$inEscape = true;
					break;
			}
		}
		return $inQuote;
	}

	/**
	 * Unescape a given string by dropping every occurrence of the escape sequence that
	 * is followed by another character, keeping that character.
	 *
	 * @param string $query string to unescape
	 * @param string $escapeChar escape sequence
	 * @return string
	 */
	public function unescape( $query, $escapeChar = '\\' ) {
		$escapeChar = preg_quote( $escapeChar, '/' );
		return preg_replace( "/$escapeChar(.)/u", '$1', $query );
	}

	/**
	 * Is leading wildcard allowed?
	 *
	 * @return bool
	 */
	public function getAllowLeadingWildcard() {
		return $this->allowLeadingWildcard;
	}

	/**
	 * @return string MediaWiki language code given to the constructor
	 */
	public function getLanguage() {
		return $this->language;
	}
}