Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 159 |
|
0.00% |
0 / 19 |
CRAP | |
0.00% |
0 / 1 |
SearchMySQL | |
0.00% |
0 / 159 |
|
0.00% |
0 / 19 |
2352 | |
0.00% |
0 / 1 |
parseQuery | |
0.00% |
0 / 50 |
|
0.00% |
0 / 1 |
182 | |||
regexTerm | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
legalSearchChars | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
doSearchTextInDB | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
doSearchTitleInDB | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
searchInternal | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
12 | |||
supports | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
queryFeatures | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
20 | |||
queryNamespaces | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
getQueryBuilder | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
getIndexField | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
queryMain | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
getCountQueryBuilder | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
update | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
updateTitle | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
delete | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
normalizeText | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
6 | |||
stripForSearchCallback | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
minSearchLength | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 |
1 | <?php |
2 | /** |
3 | * MySQL search engine |
4 | * |
5 | * Copyright (C) 2004 Brooke Vibber <bvibber@wikimedia.org> |
6 | * https://www.mediawiki.org/ |
7 | * |
8 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or |
11 | * (at your option) any later version. |
12 | * |
13 | * This program is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | * GNU General Public License for more details. |
17 | * |
18 | * You should have received a copy of the GNU General Public License along |
19 | * with this program; if not, write to the Free Software Foundation, Inc., |
20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
21 | * http://www.gnu.org/copyleft/gpl.html |
22 | * |
23 | * @file |
24 | * @ingroup Search |
25 | */ |
26 | |
27 | use MediaWiki\MediaWikiServices; |
28 | use Wikimedia\AtEase\AtEase; |
29 | use Wikimedia\Rdbms\IDatabase; |
30 | use Wikimedia\Rdbms\IExpression; |
31 | use Wikimedia\Rdbms\LikeValue; |
32 | use Wikimedia\Rdbms\SelectQueryBuilder; |
33 | |
/**
 * Search engine hook for MySQL
 *
 * Builds MATCH ... AGAINST queries against the `searchindex` table (joined to
 * `page`) and maintains the rows of that table.
 *
 * @ingroup Search
 */
class SearchMySQL extends SearchDatabase {
	/** @var bool When true, terms given without a modifier are made mandatory ("+") */
	protected $strictMatching = true;

	/** @var int|null Cached result of minSearchLength(); null until first looked up */
	private static $mMinSearchLength;

	/**
	 * Parse the user's query and transform it into two SQL fragments:
	 * a WHERE condition and an ORDER BY expression
	 *
	 * As a side effect, $this->searchTerms is reset and refilled with one
	 * highlighting regex fragment per parsed term.
	 *
	 * @param string $filteredText
	 * @param bool $fulltext
	 *
	 * @return array Two strings: [ 0 => WHERE condition, 1 => ORDER BY expression ]
	 */
	private function parseQuery( $filteredText, $fulltext ) {
		$lc = $this->legalSearchChars( self::CHARS_NO_SYNTAX ); // Minus syntax chars (" and *)
		$searchon = '';
		$this->searchTerms = [];

		# @todo FIXME: This doesn't handle parenthetical expressions.
		$m = [];
		if ( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
			$filteredText, $m, PREG_SET_ORDER )
		) {
			$services = MediaWikiServices::getInstance();
			$contLang = $services->getContentLanguage();
			$langConverter = $services->getLanguageConverterFactory()->getLanguageConverter( $contLang );
			foreach ( $m as $bits ) {
				// For a quoted phrase the trailing capture groups do not take part
				// in the match, so they are absent from $bits; silence the
				// resulting undefined-offset warnings.
				AtEase::suppressWarnings();
				[ /* all */, $modifier, $term, $nonQuoted, $wildcard ] = $bits;
				AtEase::restoreWarnings();

				// Absent groups come back as null; normalise them to strings so the
				// checks below can be strict (a bare "0" must not count as empty).
				$nonQuoted = (string)$nonQuoted;
				$wildcard = (string)$wildcard;

				if ( $nonQuoted !== '' ) {
					$term = $nonQuoted;
					$quote = '';
				} else {
					$term = str_replace( '"', '', $term );
					$quote = '"';
				}

				if ( $searchon !== '' ) {
					$searchon .= ' ';
				}
				if ( $this->strictMatching && $modifier === '' ) {
					// If we leave this out, boolean op defaults to OR which is rarely helpful.
					$modifier = '+';
				}

				// Some languages such as Serbian store the input form in the search index,
				// so we may need to search for matches in multiple writing system variants.
				$convertedVariants = $langConverter->autoConvertToAllVariants( $term );
				if ( is_array( $convertedVariants ) ) {
					$variants = array_unique( array_values( $convertedVariants ) );
				} else {
					$variants = [ $term ];
				}

				// The low-level search index does some processing on input to work
				// around problems with minimum lengths and encoding in MySQL's
				// fulltext engine.
				// For Chinese this also inserts spaces between adjacent Han characters.
				$strippedVariants = array_map( [ $contLang, 'normalizeForSearch' ], $variants );

				// Some languages such as Chinese force all variants to a canonical
				// form when stripping to the low-level search index, so to be sure
				// let's check our variants list for unique items after stripping.
				$strippedVariants = array_unique( $strippedVariants );

				// Several variants are grouped in parentheses behind one modifier.
				$grouped = count( $strippedVariants ) > 1;

				$searchon .= $modifier;
				if ( $grouped ) {
					$searchon .= '(';
				}
				foreach ( $strippedVariants as $stripped ) {
					$stripped = $this->normalizeText( $stripped );
					if ( $nonQuoted !== '' && strpos( $stripped, ' ' ) !== false ) {
						// Hack for Chinese: we need to toss in quotes for
						// multiple-character phrases since normalizeForSearch()
						// added spaces between them to make word breaks.
						$stripped = '"' . trim( $stripped ) . '"';
					}
					$searchon .= "$quote$stripped$quote$wildcard ";
				}
				if ( $grouped ) {
					$searchon .= ')';
				}

				// Match individual terms or quoted phrase in result highlighting...
				// Note that variants will be introduced in a later stage for highlighting!
				$regexp = $this->regexTerm( $term, $wildcard );
				$this->searchTerms[] = $regexp;
			}
			wfDebug( __METHOD__ . ": Would search with '$searchon'" );
			wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/" );
		} else {
			wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'" );
		}

		$dbr = $this->dbProvider->getReplicaDatabase();
		$searchon = $dbr->addQuotes( $searchon );
		$field = $this->getIndexField( $fulltext );
		return [
			" MATCH($field) AGAINST($searchon IN BOOLEAN MODE) ",
			" MATCH($field) AGAINST($searchon IN NATURAL LANGUAGE MODE) DESC "
		];
	}

	/**
	 * Build the regex fragment used to match one search term when
	 * highlighting results.
	 *
	 * @param string $string Literal term; it is escaped for use inside /.../
	 * @param string|null $wildcard Truthy when the term carried a trailing "*"
	 * @return string
	 */
	private function regexTerm( $string, $wildcard ) {
		$regex = preg_quote( $string, '/' );
		if ( !MediaWikiServices::getInstance()->getContentLanguage()->hasWordBreaks() ) {
			// For Chinese, words may legitimately abut other words in the text literal.
			// Don't add \b boundary checks... note this could cause false positives
			// for Latin chars.
			return $regex;
		}
		if ( $wildcard ) {
			// Don't cut off the final bit!
			return "\b$regex";
		}
		return "\b$regex\b";
	}

	/**
	 * Get the characters allowed in a search term, adding this engine's
	 * syntax characters when the full set is requested.
	 *
	 * @param int $type One of the self::CHARS_* constants
	 * @return string
	 */
	public function legalSearchChars( $type = self::CHARS_ALL ) {
		$searchChars = parent::legalSearchChars( $type );
		if ( $type === self::CHARS_ALL ) {
			// " for phrase, * for wildcard
			$searchChars = "\"*" . $searchChars;
		}
		return $searchChars;
	}

	/**
	 * Perform a full text search query and return a result set.
	 *
	 * @param string $term Raw search term
	 * @return SqlSearchResultSet|null
	 */
	protected function doSearchTextInDB( $term ) {
		return $this->searchInternal( $term, true );
	}

	/**
	 * Perform a title-only search query and return a result set.
	 *
	 * @param string $term Raw search term
	 * @return SqlSearchResultSet|null
	 */
	protected function doSearchTitleInDB( $term ) {
		return $this->searchInternal( $term, false );
	}

	/**
	 * Run the search query and the matching COUNT(*) query.
	 *
	 * @param string $term Raw search term
	 * @param bool $fulltext True to search page text, false to search titles
	 * @return SqlSearchResultSet|null Null when the term is empty after trimming
	 */
	protected function searchInternal( $term, $fulltext ) {
		// This seems out of place, why is this called with empty term?
		if ( trim( $term ) === '' ) {
			return null;
		}

		$filteredTerm = $this->filter( $term );
		$queryBuilder = $this->getQueryBuilder( $filteredTerm, $fulltext );
		$resultSet = $queryBuilder->caller( __METHOD__ )->fetchResultSet();

		$total = null;
		$queryBuilder = $this->getCountQueryBuilder( $filteredTerm, $fulltext );
		$totalResult = $queryBuilder->caller( __METHOD__ )->fetchResultSet();

		$row = $totalResult->fetchObject();
		if ( $row ) {
			$total = intval( $row->c );
		}
		$totalResult->free();

		return new SqlSearchResultSet( $resultSet, $this->searchTerms, $total );
	}

	/**
	 * Report whether a feature is supported; this engine adds
	 * 'title-suffix-filter' on top of whatever the parent supports.
	 *
	 * @param string $feature
	 * @return bool
	 */
	public function supports( $feature ) {
		if ( $feature === 'title-suffix-filter' ) {
			return true;
		}
		return parent::supports( $feature );
	}

	/**
	 * Add special conditions
	 * @param SelectQueryBuilder $queryBuilder
	 * @since 1.18
	 */
	protected function queryFeatures( SelectQueryBuilder $queryBuilder ) {
		foreach ( $this->features as $feature => $value ) {
			if ( $feature === 'title-suffix-filter' && $value ) {
				// Restrict to titles ending in $value: page_title LIKE '%<value>'
				$dbr = $this->dbProvider->getReplicaDatabase();
				$queryBuilder->andWhere(
					$dbr->expr( 'page_title', IExpression::LIKE, new LikeValue( $dbr->anyString(), $value ) )
				);
			}
		}
	}

	/**
	 * Add namespace conditions
	 *
	 * Nothing is added unless $this->namespaces is an array. Note that an
	 * empty array is replaced, on the object itself, by [ NS_MAIN ].
	 *
	 * @param SelectQueryBuilder $queryBuilder
	 * @since 1.18 (changed)
	 */
	private function queryNamespaces( $queryBuilder ) {
		if ( is_array( $this->namespaces ) ) {
			if ( count( $this->namespaces ) === 0 ) {
				$this->namespaces[] = NS_MAIN;
			}
			$queryBuilder->andWhere( [ 'page_namespace' => $this->namespaces ] );
		}
	}

	/**
	 * Construct the SQL query builder to do the search.
	 * @param string $filteredTerm
	 * @param bool $fulltext
	 * @return SelectQueryBuilder
	 * @since 1.41
	 */
	private function getQueryBuilder( $filteredTerm, $fulltext ): SelectQueryBuilder {
		$queryBuilder = $this->dbProvider->getReplicaDatabase()->newSelectQueryBuilder();

		$this->queryMain( $queryBuilder, $filteredTerm, $fulltext );
		$this->queryFeatures( $queryBuilder );
		$this->queryNamespaces( $queryBuilder );
		$queryBuilder->limit( $this->limit )
			->offset( $this->offset );

		return $queryBuilder;
	}

	/**
	 * Picks which field to index on, depending on what type of query.
	 * @param bool $fulltext
	 * @return string
	 */
	private function getIndexField( $fulltext ) {
		return $fulltext ? 'si_text' : 'si_title';
	}

	/**
	 * Get the base part of the search query.
	 *
	 * @param SelectQueryBuilder $queryBuilder Search query builder
	 * @param string $filteredTerm
	 * @param bool $fulltext
	 * @since 1.18 (changed)
	 */
	private function queryMain( SelectQueryBuilder $queryBuilder, $filteredTerm, $fulltext ) {
		$match = $this->parseQuery( $filteredTerm, $fulltext );
		$queryBuilder->select( [ 'page_id', 'page_namespace', 'page_title' ] )
			->from( 'page' )
			->join( 'searchindex', null, 'page_id=si_page' )
			->where( $match[0] )
			->orderBy( $match[1] );
	}

	/**
	 * Construct the query builder that counts all matches. It applies the same
	 * match, feature and namespace conditions as the search query, without
	 * ordering, limit or offset.
	 *
	 * @since 1.41 (changed)
	 * @param string $filteredTerm
	 * @param bool $fulltext
	 * @return SelectQueryBuilder
	 */
	private function getCountQueryBuilder( $filteredTerm, $fulltext ): SelectQueryBuilder {
		$match = $this->parseQuery( $filteredTerm, $fulltext );
		$queryBuilder = $this->dbProvider->getReplicaDatabase()->newSelectQueryBuilder()
			->select( [ 'c' => 'COUNT(*)' ] )
			->from( 'page' )
			->join( 'searchindex', null, 'page_id=si_page' )
			->where( $match[0] );

		$this->queryFeatures( $queryBuilder );
		$this->queryNamespaces( $queryBuilder );

		return $queryBuilder;
	}

	/**
	 * Create or update the search index record for the given page.
	 * Title and text should be pre-processed.
	 *
	 * @param int $id
	 * @param string $title
	 * @param string $text
	 */
	public function update( $id, $title, $text ) {
		$this->dbProvider->getPrimaryDatabase()->newReplaceQueryBuilder()
			->replaceInto( 'searchindex' )
			->uniqueIndexFields( [ 'si_page' ] )
			->rows( [
				'si_page' => $id,
				'si_title' => $this->normalizeText( $title ),
				'si_text' => $this->normalizeText( $text )
			] )
			->caller( __METHOD__ )->execute();
	}

	/**
	 * Update a search index record's title only.
	 * Title should be pre-processed.
	 *
	 * @param int $id
	 * @param string $title
	 */
	public function updateTitle( $id, $title ) {
		$this->dbProvider->getPrimaryDatabase()->newUpdateQueryBuilder()
			->update( 'searchindex' )
			->set( [ 'si_title' => $this->normalizeText( $title ) ] )
			->where( [ 'si_page' => $id ] )
			->caller( __METHOD__ )->execute();
	}

	/**
	 * Delete an indexed page
	 *
	 * The row is located by page id alone; $title is not used by this
	 * implementation.
	 *
	 * @param int $id Page id that was deleted
	 * @param string $title Title of page that was deleted
	 */
	public function delete( $id, $title ) {
		$this->dbProvider->getPrimaryDatabase()->newDeleteQueryBuilder()
			->deleteFrom( 'searchindex' )
			->where( [ 'si_page' => $id ] )
			->caller( __METHOD__ )->execute();
	}

	/**
	 * Converts some characters for MySQL's indexing to grok it correctly,
	 * and pads short words to overcome limitations.
	 *
	 * Each regex step falls back to its input if PCRE reports an error
	 * (preg_replace() returns null then, e.g. for malformed UTF-8 under /u),
	 * so a string is always returned.
	 *
	 * @param string $string
	 * @return string
	 */
	public function normalizeText( $string ) {
		$out = parent::normalizeText( $string );

		// MySQL fulltext index doesn't grok utf-8, so we
		// need to fold cases and convert to hex
		$lowered = MediaWikiServices::getInstance()->getContentLanguage()->lc( $out );
		$armored = preg_replace_callback(
			"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
			[ $this, 'stripForSearchCallback' ],
			$lowered );
		$out = $armored ?? $lowered;

		// And to add insult to injury, the default indexing
		// ignores short words... Pad them so we can pass them
		// through without reconfiguring the server...
		$minLength = $this->minSearchLength();
		if ( $minLength > 1 ) {
			$n = $minLength - 1;
			$padded = preg_replace(
				"/\b(\w{1,$n})\b/",
				"$1u800",
				$out );
			if ( $padded !== null ) {
				$out = $padded;
			}
		}

		// Periods within things like hostnames and IP addresses
		// are also important -- we want a search for "example.com"
		// or "192.168.1.1" to work sensibly.
		// MySQL's search seems to ignore them, so you'd match on
		// "example.wikipedia.com" and "192.168.83.1" as well.
		$dotted = preg_replace(
			"/(\w)\.(\w|\*)/u",
			"$1u82e$2",
			$out
		);

		return $dotted ?? $out;
	}

	/**
	 * Armor a case-folded UTF-8 string to get through MySQL's
	 * fulltext search without being mucked up by funny charset
	 * settings or anything else of the sort.
	 * @param array $matches preg match array; $matches[1] is the byte sequence to encode
	 * @return string "u8" followed by the hex encoding of the matched bytes
	 */
	protected function stripForSearchCallback( $matches ) {
		return 'u8' . bin2hex( $matches[1] );
	}

	/**
	 * Check MySQL server's ft_min_word_len setting so we know
	 * if we need to pad short words...
	 *
	 * The value is looked up once and then cached in a static property;
	 * 0 is used when the server does not report the variable.
	 *
	 * @return int
	 */
	protected function minSearchLength() {
		if ( self::$mMinSearchLength === null ) {
			$sql = "SHOW GLOBAL VARIABLES LIKE 'ft\\_min\\_word\\_len'";

			$dbr = $this->dbProvider->getReplicaDatabase();
			// The real type is still IDatabase, but IReplicaDatabase is used for safety.
			'@phan-var IDatabase $dbr';
			// phpcs:ignore MediaWiki.Usage.DbrQueryUsage.DbrQueryFound
			$result = $dbr->query( $sql, __METHOD__ );
			$row = $result->fetchObject();
			$result->free();

			if ( $row && $row->Variable_name === 'ft_min_word_len' ) {
				self::$mMinSearchLength = intval( $row->Value );
			} else {
				self::$mMinSearchLength = 0;
			}
		}
		return self::$mMinSearchLength;
	}
}