Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
11.76% |
8 / 68 |
|
40.00% |
2 / 5 |
CRAP | |
0.00% |
0 / 1 |
CargoSearchMySQL | |
11.76% |
8 / 68 |
|
40.00% |
2 / 5 |
354.49 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
getSearchTerms | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
parseQuery | |
0.00% |
0 / 52 |
|
0.00% |
0 / 1 |
210 | |||
regexTerm | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
3 | |||
getIndexField | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | |
3 | use MediaWiki\MediaWikiServices; |
4 | use Wikimedia\AtEase\AtEase; |
5 | |
6 | /** |
7 | * We need to create subclasses, instead of just calling the functionality, |
8 | * because both filter() and, more importantly, $searchTerms are currently |
9 | * "protected". |
10 | * |
11 | * Unfortunately, the SearchMySQL methods parseQuery(), regexTerm() and |
12 | * getIndexField() are private, which means that they need to be |
13 | * copied over here (but declared as public). |
14 | */ |
class CargoSearchMySQL extends SearchMySQL {

	/**
	 * Hand the parent search engine the database service that matches the
	 * running MediaWiki version.
	 */
	public function __construct() {
		$services = MediaWikiServices::getInstance();
		if ( property_exists( $this, 'dbProvider' ) ) {
			// MW 1.41+
			parent::__construct( $services->getDBLoadBalancerFactory() );
		} else {
			// MW < 1.41
			parent::__construct( $services->getDBLoadBalancer() );
		}
	}

	/**
	 * Filter a raw search string and return the highlighting regex
	 * fragments that parseQuery() derives from it.
	 *
	 * @param string $searchString
	 * @return string[] One regex fragment per recognized search term
	 */
	public function getSearchTerms( $searchString ) {
		$filteredTerm = $this->filter( $searchString );
		// Called only for its side effect of filling $this->searchTerms;
		// the SQL fragments it returns are not needed here.
		$this->parseQuery( $filteredTerm, false );
		return $this->searchTerms;
	}

	/**
	 * Parse the user's query and transform it into two SQL fragments:
	 * a WHERE condition and an ORDER BY expression.
	 *
	 * As a side effect, $this->searchTerms is reset and then filled with one
	 * regex fragment (see regexTerm()) per term found in the query.
	 *
	 * @param string $filteredText Search text, already passed through filter()
	 * @param bool $fulltext Whether to match the text index instead of the title index
	 *
	 * @return string[] Two elements: the boolean-mode MATCH condition, then the
	 *  natural-language-mode MATCH expression used for ordering
	 */
	public function parseQuery( $filteredText, $fulltext ) {
		$lc = $this->legalSearchChars( self::CHARS_NO_SYNTAX ); // Minus syntax chars (" and *)
		$searchon = '';
		$this->searchTerms = [];

		# @todo FIXME: This doesn't handle parenthetical expressions.
		$m = [];
		if ( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
				$filteredText, $m, PREG_SET_ORDER ) ) {
			$contLang = CargoUtils::getContentLang();
			$langConverter = MediaWikiServices::getInstance()->getLanguageConverterFactory()
				->getLanguageConverter( $contLang );
			foreach ( $m as $bits ) {
				$modifier = $bits[1];
				$term = $bits[2];
				// For a quoted phrase the regex's second alternative matches, and
				// preg_match_all() omits the trailing unmatched groups 3 and 4,
				// so default them instead of suppressing warnings.
				$nonQuoted = $bits[3] ?? '';
				$wildcard = $bits[4] ?? '';

				if ( $nonQuoted !== '' ) {
					$term = $nonQuoted;
					$quote = '';
				} else {
					$term = str_replace( '"', '', $term );
					$quote = '"';
				}

				if ( $searchon !== '' ) {
					$searchon .= ' ';
				}
				if ( $this->strictMatching && $modifier === '' ) {
					// If we leave this out, boolean op defaults to OR which is rarely helpful.
					$modifier = '+';
				}

				// Some languages such as Serbian store the input form in the search index,
				// so we may need to search for matches in multiple writing system variants.
				$convertedVariants = $langConverter->autoConvertToAllVariants( $term );
				if ( is_array( $convertedVariants ) ) {
					$variants = array_unique( array_values( $convertedVariants ) );
				} else {
					$variants = [ $term ];
				}

				// The low-level search index does some processing on input to work
				// around problems with minimum lengths and encoding in MySQL's
				// fulltext engine.
				// For Chinese this also inserts spaces between adjacent Han characters.
				$strippedVariants = array_map( [ $contLang, 'normalizeForSearch' ], $variants );

				// Some languages such as Chinese force all variants to a canonical
				// form when stripping to the low-level search index, so to be sure
				// let's check our variants list for unique items after stripping.
				$strippedVariants = array_unique( $strippedVariants );

				// Several distinct variants are grouped in parentheses after the modifier.
				$needsGroup = count( $strippedVariants ) > 1;

				$searchon .= $modifier;
				if ( $needsGroup ) {
					$searchon .= '(';
				}
				foreach ( $strippedVariants as $stripped ) {
					$stripped = $this->normalizeText( $stripped );
					if ( $nonQuoted !== '' && strpos( $stripped, ' ' ) !== false ) {
						// Hack for Chinese: we need to toss in quotes for
						// multiple-character phrases since normalizeForSearch()
						// added spaces between them to make word breaks.
						$stripped = '"' . trim( $stripped ) . '"';
					}
					$searchon .= "$quote$stripped$quote$wildcard ";
				}
				if ( $needsGroup ) {
					$searchon .= ')';
				}

				// Match individual terms or quoted phrase in result highlighting...
				// Note that variants will be introduced in a later stage for highlighting!
				$this->searchTerms[] = $this->regexTerm( $term, $wildcard );
			}
			wfDebug( __METHOD__ . ": Would search with '$searchon'\n" );
			wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/\n" );
		} else {
			wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" );
		}

		if ( property_exists( $this, 'db' ) ) {
			// MW < 1.41
			// @phan-suppress-next-line PhanUndeclaredProperty
			$searchon = $this->db->addQuotes( $searchon );
		} else {
			$cdb = CargoUtils::getDB();
			$searchon = $cdb->addQuotes( $searchon );
		}

		$field = $this->getIndexField( $fulltext );
		return [
			" MATCH($field) AGAINST($searchon IN BOOLEAN MODE) ",
			" MATCH($field) AGAINST($searchon IN NATURAL LANGUAGE MODE) DESC "
		];
	}

	/**
	 * Build the regex fragment used to highlight one search term.
	 *
	 * @param string $string Literal term text; it is escaped for use inside /.../
	 * @param bool|string $wildcard Truthy when the term ended in "*", in which
	 *  case no trailing word boundary is added
	 * @return string
	 */
	public function regexTerm( $string, $wildcard ) {
		$regex = preg_quote( $string, '/' );
		$contLang = CargoUtils::getContentLang();
		if ( !$contLang->hasWordBreaks() ) {
			// For Chinese, words may legitimately abut other words in the text literal.
			// Don't add \b boundary checks... note this could cause false positives
			// for Latin chars.
			return $regex;
		}
		if ( $wildcard ) {
			// Don't cut off the final bit!
			return "\b$regex";
		}
		return "\b$regex\b";
	}

	/**
	 * Picks which field to index on, depending on what type of query.
	 * @param bool $fulltext
	 * @return string 'si_text' for full-text queries, 'si_title' otherwise
	 */
	public function getIndexField( $fulltext ) {
		return $fulltext ? 'si_text' : 'si_title';
	}

}