Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
12.90% |
8 / 62 |
|
40.00% |
2 / 5 |
CRAP | |
0.00% |
0 / 1 |
CargoSearchMySQL | |
12.90% |
8 / 62 |
|
40.00% |
2 / 5 |
284.28 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
getSearchTerms | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
parseQuery | |
0.00% |
0 / 49 |
|
0.00% |
0 / 1 |
182 | |||
regexTerm | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
3 | |||
getIndexField | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | |
3 | use MediaWiki\MediaWikiServices; |
4 | |
5 | /** |
6 | * We need to create subclasses, instead of just calling the functionality, |
7 | * because both filter() and, more importantly, $searchTerms are currently |
8 | * "protected". |
9 | * |
10 | * Unfortunately, the SearchMySQL methods parseQuery(), regexTerm() and |
11 | * getIndexField() are private, which means that they need to be |
12 | * copied over here (but declared as public). |
13 | */ |
class CargoSearchMySQL extends SearchMySQL {

	/**
	 * Build the search engine on top of MediaWiki's database load balancer.
	 */
	public function __construct() {
		$lb = MediaWikiServices::getInstance()->getDBLoadBalancer();
		parent::__construct( $lb );
	}

	/**
	 * Filter and parse a raw search string, and return the highlighting
	 * regex fragments that parseQuery() collected for it.
	 *
	 * @param string $searchString
	 * @return string[] One regex fragment per parsed term
	 */
	public function getSearchTerms( $searchString ) {
		$filteredTerm = $this->filter( $searchString );
		// Only the side effect on $this->searchTerms is needed here; the
		// returned SQL fragments are discarded.
		$this->parseQuery( $filteredTerm, false );
		return $this->searchTerms;
	}

	/**
	 * Parse the user's query and transform it into two SQL fragments:
	 * a WHERE condition and an ORDER BY expression.
	 *
	 * As a side effect, $this->searchTerms is reset and then filled with one
	 * regex fragment per parsed term.
	 *
	 * @param string $filteredText
	 * @param bool $fulltext Passed on to getIndexField() to pick the index column
	 *
	 * @return string[] Two elements: the boolean-mode MATCH condition, then
	 *  the natural-language-mode MATCH expression followed by DESC
	 */
	public function parseQuery( $filteredText, $fulltext ) {
		$lc = $this->legalSearchChars( self::CHARS_NO_SYNTAX ); // Minus syntax chars (" and *)
		$searchon = '';
		$this->searchTerms = [];

		# @todo FIXME: This doesn't handle parenthetical expressions.
		$m = [];
		if ( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
				$filteredText, $m, PREG_SET_ORDER ) ) {
			$contLang = CargoUtils::getContentLang();
			$langConverter = MediaWikiServices::getInstance()->getLanguageConverterFactory()
				->getLanguageConverter( $contLang );
			foreach ( $m as $bits ) {
				// A quoted phrase leaves the two trailing capture groups
				// (unquoted term and wildcard) out of the match array, so pad
				// it to five entries before destructuring. This avoids
				// undefined-offset warnings without silencing error reporting.
				[ /* all */, $modifier, $term, $nonQuoted, $wildcard ] = array_pad( $bits, 5, '' );

				if ( $nonQuoted != '' ) {
					$term = $nonQuoted;
					$quote = '';
				} else {
					$term = str_replace( '"', '', $term );
					$quote = '"';
				}

				if ( $searchon !== '' ) {
					$searchon .= ' ';
				}
				if ( $this->strictMatching && ( $modifier == '' ) ) {
					// If we leave this out, boolean op defaults to OR which is rarely helpful.
					$modifier = '+';
				}

				// Some languages such as Serbian store the input form in the search index,
				// so we may need to search for matches in multiple writing system variants.
				$convertedVariants = $langConverter->autoConvertToAllVariants( $term );
				if ( is_array( $convertedVariants ) ) {
					$variants = array_unique( array_values( $convertedVariants ) );
				} else {
					$variants = [ $term ];
				}

				// The low-level search index does some processing on input to work
				// around problems with minimum lengths and encoding in MySQL's
				// fulltext engine.
				// For Chinese this also inserts spaces between adjacent Han characters.
				$strippedVariants = array_map( [ $contLang, 'normalizeForSearch' ], $variants );

				// Some languages such as Chinese force all variants to a canonical
				// form when stripping to the low-level search index, so to be sure
				// let's check our variants list for unique items after stripping.
				$strippedVariants = array_unique( $strippedVariants );

				// Several distinct variants are grouped in parentheses so the
				// modifier applies to the group as a whole.
				$hasMultipleVariants = count( $strippedVariants ) > 1;

				$searchon .= $modifier;
				if ( $hasMultipleVariants ) {
					$searchon .= '(';
				}
				foreach ( $strippedVariants as $stripped ) {
					$stripped = $this->normalizeText( $stripped );
					if ( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
						// Hack for Chinese: we need to toss in quotes for
						// multiple-character phrases since normalizeForSearch()
						// added spaces between them to make word breaks.
						$stripped = '"' . trim( $stripped ) . '"';
					}
					$searchon .= "$quote$stripped$quote$wildcard ";
				}
				if ( $hasMultipleVariants ) {
					$searchon .= ')';
				}

				// Match individual terms or quoted phrase in result highlighting...
				// Note that variants will be introduced in a later stage for highlighting!
				$regexp = $this->regexTerm( $term, $wildcard );
				$this->searchTerms[] = $regexp;
			}
			wfDebug( __METHOD__ . ": Would search with '$searchon'\n" );
			wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/\n" );
		} else {
			wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" );
		}

		// The assembled boolean expression is quoted by the database layer
		// before being placed into the SQL fragments.
		$searchon = $this->db->addQuotes( $searchon );
		$field = $this->getIndexField( $fulltext );
		return [
			" MATCH($field) AGAINST($searchon IN BOOLEAN MODE) ",
			" MATCH($field) AGAINST($searchon IN NATURAL LANGUAGE MODE) DESC "
		];
	}

	/**
	 * Turn a search term into a regex fragment for result highlighting.
	 *
	 * The term is escaped for use inside a "/"-delimited pattern. When the
	 * content language has word breaks, the fragment is anchored with \b at
	 * the start, and also at the end unless a wildcard was given.
	 *
	 * @param string $string
	 * @param bool $wildcard
	 * @return string
	 */
	public function regexTerm( $string, $wildcard ) {
		$regex = preg_quote( $string, '/' );

		if ( !CargoUtils::getContentLang()->hasWordBreaks() ) {
			// For Chinese, words may legitimately abut other words in the text literal.
			// Don't add \b boundary checks... note this could cause false positives
			// for Latin chars.
			return $regex;
		}

		// With a wildcard, don't cut off the final bit!
		return $wildcard ? "\b$regex" : "\b$regex\b";
	}

	/**
	 * Picks which field to index on, depending on what type of query.
	 *
	 * @param bool $fulltext
	 * @return string 'si_text' when $fulltext is truthy, otherwise 'si_title'
	 */
	public function getIndexField( $fulltext ) {
		return $fulltext ? 'si_text' : 'si_title';
	}

}