Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 159 |
|
0.00% |
0 / 19 |
CRAP | |
0.00% |
0 / 1 |
SearchMySQL | |
0.00% |
0 / 159 |
|
0.00% |
0 / 19 |
2352 | |
0.00% |
0 / 1 |
parseQuery | |
0.00% |
0 / 50 |
|
0.00% |
0 / 1 |
182 | |||
regexTerm | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
legalSearchChars | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
doSearchTextInDB | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
doSearchTitleInDB | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
searchInternal | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
12 | |||
supports | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
queryFeatures | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
20 | |||
queryNamespaces | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
getQueryBuilder | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
getIndexField | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
queryMain | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
getCountQueryBuilder | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
update | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
2 | |||
updateTitle | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
delete | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
normalizeText | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
6 | |||
stripForSearchCallback | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
minSearchLength | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
20 |
1 | <?php |
2 | /** |
3 | * MySQL search engine |
4 | * |
5 | * Copyright (C) 2004 Brooke Vibber <bvibber@wikimedia.org> |
6 | * https://www.mediawiki.org/ |
7 | * |
8 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or |
11 | * (at your option) any later version. |
12 | * |
13 | * This program is distributed in the hope that it will be useful, |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | * GNU General Public License for more details. |
17 | * |
18 | * You should have received a copy of the GNU General Public License along |
19 | * with this program; if not, write to the Free Software Foundation, Inc., |
20 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
21 | * http://www.gnu.org/copyleft/gpl.html |
22 | * |
23 | * @file |
24 | * @ingroup Search |
25 | */ |
26 | |
27 | use MediaWiki\MediaWikiServices; |
28 | use Wikimedia\AtEase\AtEase; |
29 | use Wikimedia\Rdbms\IDatabase; |
30 | use Wikimedia\Rdbms\IExpression; |
31 | use Wikimedia\Rdbms\LikeValue; |
32 | use Wikimedia\Rdbms\SelectQueryBuilder; |
33 | |
34 | /** |
35 | * Search engine hook for MySQL |
36 | * @ingroup Search |
37 | */ |
38 | class SearchMySQL extends SearchDatabase { |
39 | /** @var bool */ |
40 | protected $strictMatching = true; |
41 | |
42 | /** @var int|null */ |
43 | private static $mMinSearchLength; |
44 | |
/**
 * Parse the user's query and transform it into two SQL fragments:
 * a WHERE condition and an ORDER BY expression.
 *
 * Also resets $this->searchTerms and fills it with one highlighting
 * regex fragment (built by regexTerm()) per term found in the query.
 *
 * @param string $filteredText Query text to parse
 * @param bool $fulltext Passed to getIndexField() to choose the index column
 *
 * @return string[] Two SQL fragments: [ 0 => MATCH ... IN BOOLEAN MODE condition,
 *  1 => MATCH ... IN NATURAL LANGUAGE MODE expression followed by DESC ]
 */
private function parseQuery( $filteredText, $fulltext ) {
	$lc = $this->legalSearchChars( self::CHARS_NO_SYNTAX ); // Minus syntax chars (" and *)
	$searchon = '';
	$this->searchTerms = [];

	# @todo FIXME: This doesn't handle parenthetical expressions.
	$m = [];
	if ( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
			$filteredText, $m, PREG_SET_ORDER )
	) {
		$services = MediaWikiServices::getInstance();
		$contLang = $services->getContentLanguage();
		$langConverter = $services->getLanguageConverterFactory()->getLanguageConverter( $contLang );
		foreach ( $m as $bits ) {
			// For a quoted phrase the trailing groups (unquoted term, wildcard)
			// do not take part in the match and are missing from $bits. Pad
			// them with empty strings instead of suppressing the warnings.
			[ /* all */, $modifier, $term, $nonQuoted, $wildcard ] = array_pad( $bits, 5, '' );

			// The unquoted alternative needs at least one character, so an
			// empty $nonQuoted means the quoted-phrase alternative matched.
			$isQuoted = $nonQuoted === '';
			if ( $isQuoted ) {
				$term = str_replace( '"', '', $term );
				$quote = '"';
			} else {
				$term = $nonQuoted;
				$quote = '';
			}

			if ( $searchon !== '' ) {
				$searchon .= ' ';
			}
			if ( $this->strictMatching && $modifier === '' ) {
				// If we leave this out, boolean op defaults to OR which is rarely helpful.
				$modifier = '+';
			}

			// Some languages such as Serbian store the input form in the search index,
			// so we may need to search for matches in multiple writing system variants.
			$convertedVariants = $langConverter->autoConvertToAllVariants( $term );
			if ( is_array( $convertedVariants ) ) {
				$variants = array_unique( array_values( $convertedVariants ) );
			} else {
				$variants = [ $term ];
			}

			// The low-level search index does some processing on input to work
			// around problems with minimum lengths and encoding in MySQL's
			// fulltext engine.
			// For Chinese this also inserts spaces between adjacent Han characters.
			$strippedVariants = array_map( [ $contLang, 'normalizeForSearch' ], $variants );

			// Some languages such as Chinese force all variants to a canonical
			// form when stripping to the low-level search index, so to be sure
			// let's check our variants list for unique items after stripping.
			$strippedVariants = array_unique( $strippedVariants );

			// Several distinct variants are grouped in parentheses after the modifier.
			$hasMultipleVariants = count( $strippedVariants ) > 1;

			$searchon .= $modifier;
			if ( $hasMultipleVariants ) {
				$searchon .= '(';
			}
			foreach ( $strippedVariants as $stripped ) {
				$stripped = $this->normalizeText( $stripped );
				if ( !$isQuoted && strpos( $stripped, ' ' ) !== false ) {
					// Hack for Chinese: we need to toss in quotes for
					// multiple-character phrases since normalizeForSearch()
					// added spaces between them to make word breaks.
					$stripped = '"' . trim( $stripped ) . '"';
				}
				$searchon .= "$quote$stripped$quote$wildcard ";
			}
			if ( $hasMultipleVariants ) {
				$searchon .= ')';
			}

			// Match individual terms or quoted phrase in result highlighting...
			// Note that variants will be introduced in a later stage for highlighting!
			$regexp = $this->regexTerm( $term, $wildcard );
			$this->searchTerms[] = $regexp;
		}
		wfDebug( __METHOD__ . ": Would search with '$searchon'" );
		wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/" );
	} else {
		wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'" );
	}

	$dbr = $this->dbProvider->getReplicaDatabase();
	$searchon = $dbr->addQuotes( $searchon );
	$field = $this->getIndexField( $fulltext );
	return [
		" MATCH($field) AGAINST($searchon IN BOOLEAN MODE) ",
		" MATCH($field) AGAINST($searchon IN NATURAL LANGUAGE MODE) DESC "
	];
}
145 | |
/**
 * Build the regex fragment used to highlight one search term.
 *
 * @param string $string Literal term; it is escaped with preg_quote()
 * @param string $wildcard Non-empty when the term ended with a wildcard
 * @return string Regex fragment without delimiters
 */
private function regexTerm( $string, $wildcard ) {
	$quoted = preg_quote( $string, '/' );

	if ( !MediaWikiServices::getInstance()->getContentLanguage()->hasWordBreaks() ) {
		// For Chinese, words may legitimately abut other words in the text literal.
		// Don't add \b boundary checks... note this could cause false positives
		// for Latin chars.
		return $quoted;
	}

	// With a wildcard, leave the end open so the final bit is not cut off.
	return $wildcard ? '\b' . $quoted : '\b' . $quoted . '\b';
}
162 | |
/**
 * Get the characters allowed in a search query.
 *
 * For self::CHARS_ALL the parent's set is prefixed with the two syntax
 * characters used here: " for phrases and * for wildcards.
 *
 * @param int $type Character set selector, passed on to the parent
 * @return string
 */
public function legalSearchChars( $type = self::CHARS_ALL ) {
	$inherited = parent::legalSearchChars( $type );

	return $type === self::CHARS_ALL ? '"*' . $inherited : $inherited;
}
171 | |
/**
 * Run a full text search; delegates to searchInternal() with
 * $fulltext set to true.
 *
 * @param string $term Raw search term
 * @return SqlSearchResultSet|null
 */
protected function doSearchTextInDB( $term ) {
	$fulltext = true;

	return $this->searchInternal( $term, $fulltext );
}
181 | |
/**
 * Run a title-only search; delegates to searchInternal() with
 * $fulltext set to false.
 *
 * @param string $term Raw search term
 * @return SqlSearchResultSet|null
 */
protected function doSearchTitleInDB( $term ) {
	$fulltext = false;

	return $this->searchInternal( $term, $fulltext );
}
191 | |
/**
 * Run the search query followed by the matching count query and wrap
 * both results in a result set.
 *
 * @param string $term Raw search term
 * @param bool $fulltext True to search page text, false to search titles
 * @return SqlSearchResultSet|null Null when the trimmed term is empty
 */
protected function searchInternal( $term, $fulltext ) {
	// This seems out of place, why is this called with empty term?
	if ( trim( $term ) === '' ) {
		return null;
	}

	$filteredTerm = $this->filter( $term );

	$rows = $this->getQueryBuilder( $filteredTerm, $fulltext )
		->caller( __METHOD__ )
		->fetchResultSet();

	$countResult = $this->getCountQueryBuilder( $filteredTerm, $fulltext )
		->caller( __METHOD__ )
		->fetchResultSet();
	$countRow = $countResult->fetchObject();
	// Stays null when the count query yields no row.
	$total = $countRow ? intval( $countRow->c ) : null;
	$countResult->free();

	return new SqlSearchResultSet( $rows, $this->searchTerms, $total );
}
214 | |
/**
 * Report whether this engine supports a feature: 'title-suffix-filter'
 * is handled here, anything else is left to the parent.
 *
 * @param string $feature
 * @return bool
 */
public function supports( $feature ) {
	// Loose comparison on purpose: it mirrors the semantics of a switch/case.
	if ( $feature == 'title-suffix-filter' ) {
		return true;
	}

	return parent::supports( $feature );
}
223 | |
/**
 * Add the conditions for the enabled search features to the query.
 *
 * Only 'title-suffix-filter' is handled: when its value is truthy,
 * page_title must end with that value.
 *
 * @param SelectQueryBuilder $queryBuilder
 * @since 1.18
 */
protected function queryFeatures( SelectQueryBuilder $queryBuilder ) {
	foreach ( $this->features as $name => $setting ) {
		if ( $name !== 'title-suffix-filter' || !$setting ) {
			continue;
		}

		$dbr = $this->dbProvider->getReplicaDatabase();
		$endsWithSuffix = new LikeValue( $dbr->anyString(), $setting );
		$queryBuilder->andWhere(
			$dbr->expr( 'page_title', IExpression::LIKE, $endsWithSuffix )
		);
	}
}
239 | |
/**
 * Restrict the query to the configured namespaces.
 *
 * Does nothing unless $this->namespaces is an array. An empty array is
 * first filled with NS_MAIN, so the property itself is modified.
 *
 * @param SelectQueryBuilder $queryBuilder
 * @since 1.18 (changed)
 */
private function queryNamespaces( $queryBuilder ) {
	if ( !is_array( $this->namespaces ) ) {
		return;
	}

	if ( $this->namespaces === [] ) {
		$this->namespaces[] = NS_MAIN;
	}

	$queryBuilder->andWhere( [ 'page_namespace' => $this->namespaces ] );
}
253 | |
/**
 * Build the query that fetches the matching pages: base match, feature
 * and namespace conditions, then the limit and offset.
 *
 * @param string $filteredTerm
 * @param bool $fulltext
 * @return SelectQueryBuilder
 * @since 1.41
 */
private function getQueryBuilder( $filteredTerm, $fulltext ): SelectQueryBuilder {
	$dbr = $this->dbProvider->getReplicaDatabase();
	$builder = $dbr->newSelectQueryBuilder();

	$this->queryMain( $builder, $filteredTerm, $fulltext );
	$this->queryFeatures( $builder );
	$this->queryNamespaces( $builder );

	$builder
		->limit( $this->limit )
		->offset( $this->offset );

	return $builder;
}
272 | |
/**
 * Pick the index column to match against for the given kind of query.
 *
 * @param bool $fulltext
 * @return string 'si_text' when $fulltext is truthy, otherwise 'si_title'
 */
private function getIndexField( $fulltext ) {
	if ( $fulltext ) {
		return 'si_text';
	}

	return 'si_title';
}
281 | |
/**
 * Fill in the base part of the search query: the selected page fields,
 * the join of page with searchindex, the match condition and the ordering.
 *
 * @param SelectQueryBuilder $queryBuilder Search query builder
 * @param string $filteredTerm
 * @param bool $fulltext
 * @since 1.18 (changed)
 */
private function queryMain( SelectQueryBuilder $queryBuilder, $filteredTerm, $fulltext ) {
	[ $condition, $ordering ] = $this->parseQuery( $filteredTerm, $fulltext );

	$queryBuilder
		->select( [ 'page_id', 'page_namespace', 'page_title' ] )
		->from( 'page' )
		->join( 'searchindex', null, 'page_id=si_page' )
		->where( $condition )
		->orderBy( $ordering );
}
298 | |
/**
 * Build the query that counts the matching pages. It uses the same match,
 * feature and namespace conditions as the search query, and selects
 * COUNT(*) under the alias 'c'.
 *
 * @since 1.41 (changed)
 * @param string $filteredTerm
 * @param bool $fulltext
 * @return SelectQueryBuilder
 */
private function getCountQueryBuilder( $filteredTerm, $fulltext ): SelectQueryBuilder {
	// Only the WHERE fragment is needed; a count has no ordering.
	$condition = $this->parseQuery( $filteredTerm, $fulltext )[0];

	$dbr = $this->dbProvider->getReplicaDatabase();
	$builder = $dbr->newSelectQueryBuilder()
		->select( [ 'c' => 'COUNT(*)' ] )
		->from( 'page' )
		->join( 'searchindex', null, 'page_id=si_page' )
		->where( $condition );

	$this->queryFeatures( $builder );
	$this->queryNamespaces( $builder );

	return $builder;
}
318 | |
/**
 * Create or update the search index record for the given page.
 * Title and text should be pre-processed; both are passed through
 * normalizeText() before being stored.
 *
 * @param int $id
 * @param string $title
 * @param string $text
 */
public function update( $id, $title, $text ) {
	$replace = $this->dbProvider->getPrimaryDatabase()->newReplaceQueryBuilder()
		->replaceInto( 'searchindex' )
		->uniqueIndexFields( [ 'si_page' ] );

	$indexRow = [
		'si_page' => $id,
		'si_title' => $this->normalizeText( $title ),
		'si_text' => $this->normalizeText( $text )
	];

	$replace->row( $indexRow )
		->caller( __METHOD__ )
		->execute();
}
338 | |
/**
 * Update only the title of a search index record.
 * Title should be pre-processed; it is passed through normalizeText()
 * before being stored.
 *
 * @param int $id
 * @param string $title
 */
public function updateTitle( $id, $title ) {
	$update = $this->dbProvider->getPrimaryDatabase()->newUpdateQueryBuilder()
		->update( 'searchindex' );

	$normalizedTitle = $this->normalizeText( $title );

	$update->set( [ 'si_title' => $normalizedTitle ] )
		->where( [ 'si_page' => $id ] )
		->caller( __METHOD__ )
		->execute();
}
353 | |
/**
 * Remove the search index record of a page. The record is found by
 * page id alone; $title is not used here.
 *
 * @param int $id Page id that was deleted
 * @param string $title Title of page that was deleted
 */
public function delete( $id, $title ) {
	$dbw = $this->dbProvider->getPrimaryDatabase();

	$dbw->newDeleteQueryBuilder()
		->deleteFrom( 'searchindex' )
		->where( [ 'si_page' => $id ] )
		->caller( __METHOD__ )
		->execute();
}
367 | |
/**
 * Rewrite text so that MySQL's fulltext index handles it correctly:
 * lower-case it, hex-armor multi-byte sequences, pad short words and
 * encode periods that sit between word characters.
 *
 * @param string $string
 * @return string
 */
public function normalizeText( $string ) {
	$text = parent::normalizeText( $string );

	// MySQL fulltext index doesn't grok utf-8, so we
	// need to fold cases and convert to hex
	$armored = preg_replace_callback(
		"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
		[ $this, 'stripForSearchCallback' ],
		MediaWikiServices::getInstance()->getContentLanguage()->lc( $text )
	);

	// And to add insult to injury, the default indexing
	// ignores short words... Pad them so we can pass them
	// through without reconfiguring the server...
	$minLength = $this->minSearchLength();
	if ( $minLength > 1 ) {
		// Words shorter than the minimum, i.e. 1 to ($minLength - 1) characters.
		$n = $minLength - 1;
		$armored = preg_replace( "/\b(\w{1,$n})\b/", "$1u800", $armored );
	}

	// Periods within things like hostnames and IP addresses
	// are also important -- we want a search for "example.com"
	// or "192.168.1.1" to work sensibly.
	// MySQL's search seems to ignore them, so you'd match on
	// "example.wikipedia.com" and "192.168.83.1" as well.
	$withEncodedPeriods = preg_replace( "/(\w)\.(\w|\*)/u", "$1u82e$2", $armored );

	return $withEncodedPeriods;
}
407 | |
/**
 * preg_replace_callback() handler used by normalizeText(): armor a
 * case-folded UTF-8 sequence as 'u8' followed by its hex dump, so that it
 * passes through MySQL's fulltext search without being mucked up by
 * funny charset settings or anything else of the sort.
 *
 * @param array $matches Regex match; element 1 holds the captured bytes
 * @return string
 */
protected function stripForSearchCallback( $matches ) {
	$bytes = $matches[1];

	return 'u8' . bin2hex( $bytes );
}
418 | |
/**
 * Look up the MySQL server's ft_min_word_len setting so we know
 * if we need to pad short words...
 *
 * The value is queried once and then kept in a static property.
 *
 * @return int The server setting, or 0 when it could not be read
 */
protected function minSearchLength() {
	if ( self::$mMinSearchLength !== null ) {
		return self::$mMinSearchLength;
	}

	$dbr = $this->dbProvider->getReplicaDatabase();
	// The real type is still IDatabase, but IReplicaDatabase is used for safety.
	'@phan-var IDatabase $dbr';
	// phpcs:ignore MediaWiki.Usage.DbrQueryUsage.DbrQueryFound
	$result = $dbr->query( "SHOW GLOBAL VARIABLES LIKE 'ft\\_min\\_word\\_len'", __METHOD__ );
	$row = $result->fetchObject();
	$result->free();

	$found = $row && $row->Variable_name == 'ft_min_word_len';
	self::$mMinSearchLength = $found ? intval( $row->Value ) : 0;

	return self::$mMinSearchLength;
}
445 | } |