Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
13.11% |
8 / 61 |
|
40.00% |
2 / 5 |
CRAP | |
0.00% |
0 / 1 |
| CargoSearchMySQL | |
13.11% |
8 / 61 |
|
40.00% |
2 / 5 |
282.36 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
| getSearchTerms | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
| parseQuery | |
0.00% |
0 / 48 |
|
0.00% |
0 / 1 |
182 | |||
| regexTerm | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
3 | |||
| getIndexField | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| 1 | <?php |
| 2 | |
| 3 | use MediaWiki\MediaWikiServices; |
| 4 | |
| 5 | /** |
| 6 | * We need to create subclasses, instead of just calling the functionality, |
| 7 | * because both filter() and, more importantly, $searchTerms are currently |
| 8 | * "protected". |
| 9 | * |
| 10 | * Unfortunately, the SearchMySQL methods parseQuery(), regexTerm() and |
| 11 | * getIndexField() are private, which means that they need to be |
| 12 | * copied over here (but declared as public). |
| 13 | */ |
| 14 | class CargoSearchMySQL extends SearchMySQL { |
| 15 | |
| 16 | /** |
| 17 | * Hands MediaWiki's DB load balancer factory to the SearchMySQL constructor. |
| 18 | */ |
| 19 | public function __construct() { |
| 20 | $dbProvider = MediaWikiServices::getInstance()->getDBLoadBalancerFactory(); |
| 21 | parent::__construct( $dbProvider ); |
| 22 | } |
| 23 | |
| 24 | /** |
| 25 | * Filters and parses a search string, then returns the highlighting |
| 26 | * regexes that parseQuery() stored in $this->searchTerms. |
| 27 | * |
| 28 | * @param string $searchString |
| 29 | * @return string[] One regex fragment per parsed term |
| 30 | */ |
| 31 | public function getSearchTerms( $searchString ) { |
| 32 | $filteredTerm = $this->filter( $searchString ); |
| 33 | // Called only for its side effect on $this->searchTerms; the SQL is discarded. |
| 34 | $this->parseQuery( $filteredTerm, false ); |
| 35 | return $this->searchTerms; |
| 36 | } |
| 37 | |
| 38 | /** |
| 39 | * Parse the user's query and transform it into two SQL fragments: |
| 40 | * a WHERE condition and an ORDER BY expression. |
| 41 | * |
| 42 | * As a side effect, $this->searchTerms is reset and refilled with one |
| 43 | * highlighting regex per parsed term. |
| 44 | * |
| 45 | * @param string $filteredText |
| 46 | * @param bool $fulltext True to match against si_text, false for si_title |
| 47 | * |
| 48 | * @return string[] The boolean-mode MATCH condition, then the |
| 49 | * natural-language-mode MATCH expression with DESC ordering |
| 50 | */ |
| 51 | public function parseQuery( $filteredText, $fulltext ) { |
| 52 | $lc = $this->legalSearchChars( self::CHARS_NO_SYNTAX ); // Minus syntax chars (" and *) |
| 53 | $searchon = ''; |
| 54 | $this->searchTerms = []; |
| 55 | |
| 56 | # @todo FIXME: This doesn't handle parenthetical expressions. |
| 57 | $m = []; |
| 58 | if ( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/', |
| 59 | $filteredText, $m, PREG_SET_ORDER ) ) { |
| 60 | $contLang = CargoUtils::getContentLang(); |
| 61 | $langConverter = MediaWikiServices::getInstance()->getLanguageConverterFactory() |
| 62 | ->getLanguageConverter( $contLang ); |
| 63 | foreach ( $m as $bits ) { |
| 64 | // For a quoted phrase the non-quoted and wildcard groups take no part |
| 65 | // in the match, and PCRE leaves trailing unmatched groups out of the |
| 66 | // result, so default them rather than reading missing array keys. |
| 67 | $modifier = $bits[1]; |
| 68 | $term = $bits[2]; |
| 69 | $nonQuoted = $bits[3] ?? ''; |
| 70 | $wildcard = $bits[4] ?? ''; |
| 71 | $isQuoted = ( $nonQuoted === '' ); |
| 72 | |
| 73 | if ( $isQuoted ) { |
| 74 | $term = str_replace( '"', '', $term ); |
| 75 | $quote = '"'; |
| 76 | } else { |
| 77 | $term = $nonQuoted; |
| 78 | $quote = ''; |
| 79 | } |
| 80 | |
| 81 | if ( $searchon !== '' ) { |
| 82 | $searchon .= ' '; |
| 83 | } |
| 84 | if ( $this->strictMatching && ( $modifier == '' ) ) { |
| 85 | // If we leave this out, boolean op defaults to OR which is rarely helpful. |
| 86 | $modifier = '+'; |
| 87 | } |
| 88 | |
| 89 | // Some languages such as Serbian store the input form in the search index, |
| 90 | // so we may need to search for matches in multiple writing system variants. |
| 91 | $convertedVariants = $langConverter->autoConvertToAllVariants( $term ); |
| 92 | if ( is_array( $convertedVariants ) ) { |
| 93 | $variants = array_unique( array_values( $convertedVariants ) ); |
| 94 | } else { |
| 95 | $variants = [ $term ]; |
| 96 | } |
| 97 | |
| 98 | // The low-level search index does some processing on input to work |
| 99 | // around problems with minimum lengths and encoding in MySQL's |
| 100 | // fulltext engine. |
| 101 | // For Chinese this also inserts spaces between adjacent Han characters. |
| 102 | $strippedVariants = array_map( [ $contLang, 'normalizeForSearch' ], $variants ); |
| 103 | |
| 104 | // Some languages such as Chinese force all variants to a canonical |
| 105 | // form when stripping to the low-level search index, so to be sure |
| 106 | // let's check our variants list for unique items after stripping. |
| 107 | $strippedVariants = array_unique( $strippedVariants ); |
| 108 | |
| 109 | $searchon .= $modifier; |
| 110 | $hasAlternatives = count( $strippedVariants ) > 1; |
| 111 | if ( $hasAlternatives ) { |
| 112 | $searchon .= '('; |
| 113 | } |
| 114 | foreach ( $strippedVariants as $stripped ) { |
| 115 | $stripped = $this->normalizeText( $stripped ); |
| 116 | if ( !$isQuoted && strpos( $stripped, ' ' ) !== false ) { |
| 117 | // Hack for Chinese: we need to toss in quotes for |
| 118 | // multiple-character phrases since normalizeForSearch() |
| 119 | // added spaces between them to make word breaks. |
| 120 | $stripped = '"' . trim( $stripped ) . '"'; |
| 121 | } |
| 122 | $searchon .= "$quote$stripped$quote$wildcard "; |
| 123 | } |
| 124 | if ( $hasAlternatives ) { |
| 125 | $searchon .= ')'; |
| 126 | } |
| 127 | |
| 128 | // Match individual terms or quoted phrase in result highlighting... |
| 129 | // Note that variants will be introduced in a later stage for highlighting! |
| 130 | $this->searchTerms[] = $this->regexTerm( $term, $wildcard ); |
| 131 | } |
| 132 | wfDebug( __METHOD__ . ": Would search with '$searchon'\n" ); |
| 133 | wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/\n" ); |
| 134 | } else { |
| 135 | wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" ); |
| 136 | } |
| 137 | |
| 138 | $cdb = CargoUtils::getDB(); |
| 139 | $searchon = $cdb->addQuotes( $searchon ); |
| 140 | |
| 141 | $field = $this->getIndexField( $fulltext ); |
| 142 | return [ |
| 143 | " MATCH($field) AGAINST($searchon IN BOOLEAN MODE) ", |
| 144 | " MATCH($field) AGAINST($searchon IN NATURAL LANGUAGE MODE) DESC " |
| 145 | ]; |
| 146 | } |
| 147 | |
| 148 | /** |
| 149 | * Builds the regex fragment used to highlight one search term. |
| 150 | * |
| 151 | * @param string $string Term to match literally |
| 152 | * @param string|null $wildcard Non-empty when the term carried a trailing wildcard |
| 153 | * @return string Fragment escaped for use with the "/" delimiter |
| 154 | */ |
| 155 | public function regexTerm( string $string, ?string $wildcard = null ): string { |
| 156 | $regex = preg_quote( $string, '/' ); |
| 157 | if ( !CargoUtils::getContentLang()->hasWordBreaks() ) { |
| 158 | // For Chinese, words may legitimately abut other words in the text literal. |
| 159 | // Don't add \b boundary checks... note this could cause false positives |
| 160 | // for Latin chars. |
| 161 | return $regex; |
| 162 | } |
| 163 | // With a wildcard, leave the end unanchored so the final bit isn't cut off. |
| 164 | return $wildcard ? "\b$regex" : "\b$regex\b"; |
| 165 | } |
| 166 | |
| 167 | /** |
| 168 | * Picks which field to index on, depending on what type of query. |
| 169 | * @param bool $fulltext |
| 170 | * @return string |
| 171 | */ |
| 172 | public function getIndexField( $fulltext ) { |
| 173 | return $fulltext ? 'si_text' : 'si_title'; |
| 174 | } |
| 175 | |
| 176 | } |