Code Coverage for /workspace/src/extensions/Cargo/includes/search/CargoSearchMySQL.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	11.76% covered (danger)	11.76%	8 / 68	40.00% covered (danger)	40.00%	2 / 5	CRAP	0.00% covered (danger)	0.00%	0 / 1
CargoSearchMySQL	11.76% covered (danger)	11.76%	8 / 68	40.00% covered (danger)	40.00%	2 / 5	354.49	0.00% covered (danger)	0.00%	0 / 1
__construct	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	6
getSearchTerms	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	2
parseQuery	0.00% covered (danger)	0.00%	0 / 52	0.00% covered (danger)	0.00%	0 / 1	210
regexTerm	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	3
getIndexField	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	2

1	<?php
2
3	use MediaWiki\MediaWikiServices;
4	use Wikimedia\AtEase\AtEase;
5
/**
 * MySQL search backend subclass that lets Cargo obtain the highlighting
 * terms for a search string.
 *
 * We need to create subclasses, instead of just calling the functionality,
 * because both filter() and, more importantly, $searchTerms are currently
 * "protected".
 *
 * Unfortunately, the SearchMySQL methods parseQuery(), regexTerm() and
 * getIndexField() are private, which means that they need to be
 * copied over here (but declared as public).
 */
class CargoSearchMySQL extends SearchMySQL {

	/**
	 * Calls the parent constructor with the database service that matches
	 * the running MediaWiki version.
	 */
	public function __construct() {
		$services = MediaWikiServices::getInstance();
		if ( property_exists( $this, 'dbProvider' ) ) {
			// MW 1.41+
			parent::__construct( $services->getDBLoadBalancerFactory() );
		} else {
			// MW < 1.41
			parent::__construct( $services->getDBLoadBalancer() );
		}
	}

	/**
	 * Returns the highlighting regex fragments for a raw search string.
	 *
	 * The string is passed through filter() and then parseQuery(), which
	 * fills $this->searchTerms as a side effect; the SQL fragments returned
	 * by parseQuery() are discarded.
	 *
	 * @param string $searchString
	 * @return string[] One regex fragment per parsed term
	 */
	public function getSearchTerms( $searchString ) {
		$filteredTerm = $this->filter( $searchString );
		$this->parseQuery( $filteredTerm, false );
		return $this->searchTerms;
	}

	/**
	 * Parse the user's query and transform it into two SQL fragments:
	 * a WHERE condition and an ORDER BY expression.
	 *
	 * As a side effect, $this->searchTerms is reset and then filled with one
	 * regex fragment (see regexTerm()) per term found in the query.
	 *
	 * @param string $filteredText Query text, already passed through filter()
	 * @param bool $fulltext Whether to match the text field rather than the title field
	 *
	 * @return array Two strings: the boolean-mode MATCH condition and the
	 *  natural-language-mode MATCH expression used for ordering
	 */
	public function parseQuery( $filteredText, $fulltext ) {
		$lc = $this->legalSearchChars( self::CHARS_NO_SYNTAX ); // Minus syntax chars (" and *)
		$searchon = '';
		$this->searchTerms = [];

		# @todo FIXME: This doesn't handle parenthetical expressions.
		$m = [];
		// Each token is an optional boolean modifier followed by either a bare
		// term with an optional trailing "*" wildcard, or a double-quoted phrase.
		if ( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
			$filteredText, $m, PREG_SET_ORDER ) ) {
			$contLang = CargoUtils::getContentLang();
			$langConverter = MediaWikiServices::getInstance()->getLanguageConverterFactory()
				->getLanguageConverter( $contLang );
			foreach ( $m as $bits ) {
				// For quoted phrases the last two capture groups are missing from
				// $bits, so the destructuring below would otherwise emit warnings.
				AtEase::suppressWarnings();
				[ /* all */, $modifier, $term, $nonQuoted, $wildcard ] = $bits;
				AtEase::restoreWarnings();

				if ( $nonQuoted != '' ) {
					$term = $nonQuoted;
					$quote = '';
				} else {
					$term = str_replace( '"', '', $term );
					$quote = '"';
				}

				if ( $searchon !== '' ) {
					$searchon .= ' ';
				}
				if ( $this->strictMatching && ( $modifier == '' ) ) {
					// If we leave this out, boolean op defaults to OR which is rarely helpful.
					$modifier = '+';
				}

				// Some languages such as Serbian store the input form in the search index,
				// so we may need to search for matches in multiple writing system variants.
				$convertedVariants = $langConverter->autoConvertToAllVariants( $term );
				if ( is_array( $convertedVariants ) ) {
					$variants = array_unique( array_values( $convertedVariants ) );
				} else {
					$variants = [ $term ];
				}

				// The low-level search index does some processing on input to work
				// around problems with minimum lengths and encoding in MySQL's
				// fulltext engine.
				// For Chinese this also inserts spaces between adjacent Han characters.
				$strippedVariants = array_map( [ $contLang, 'normalizeForSearch' ], $variants );

				// Some languages such as Chinese force all variants to a canonical
				// form when stripping to the low-level search index, so to be sure
				// let's check our variants list for unique items after stripping.
				$strippedVariants = array_unique( $strippedVariants );

				// Several distinct variants are grouped in parentheses after the modifier.
				$hasMultipleVariants = count( $strippedVariants ) > 1;

				$searchon .= $modifier;
				if ( $hasMultipleVariants ) {
					$searchon .= '(';
				}
				foreach ( $strippedVariants as $stripped ) {
					$stripped = $this->normalizeText( $stripped );
					if ( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
						// Hack for Chinese: we need to toss in quotes for
						// multiple-character phrases since normalizeForSearch()
						// added spaces between them to make word breaks.
						$stripped = '"' . trim( $stripped ) . '"';
					}
					$searchon .= "$quote$stripped$quote$wildcard ";
				}
				if ( $hasMultipleVariants ) {
					$searchon .= ')';
				}

				// Match individual terms or quoted phrase in result highlighting...
				// Note that variants will be introduced in a later stage for highlighting!
				$regexp = $this->regexTerm( $term, $wildcard );
				$this->searchTerms[] = $regexp;
			}
			wfDebug( __METHOD__ . ": Would search with '$searchon'\n" );
			wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/\n" );
		} else {
			wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'\n" );
		}

		if ( property_exists( $this, 'db' ) ) {
			// MW < 1.41
			// @phan-suppress-next-line PhanUndeclaredProperty
			$searchon = $this->db->addQuotes( $searchon );
		} else {
			$cdb = CargoUtils::getDB();
			$searchon = $cdb->addQuotes( $searchon );
		}

		$field = $this->getIndexField( $fulltext );
		return [
			" MATCH($field) AGAINST($searchon IN BOOLEAN MODE) ",
			" MATCH($field) AGAINST($searchon IN NATURAL LANGUAGE MODE) DESC "
		];
	}

	/**
	 * Builds the regex fragment used to highlight one search term.
	 *
	 * The term is escaped for use inside a "/"-delimited pattern. When the
	 * content language reports word breaks, a leading \b is added, plus a
	 * trailing \b unless the term ends in a wildcard.
	 *
	 * @param string $string
	 * @param bool $wildcard
	 * @return string
	 */
	public function regexTerm( $string, $wildcard ) {
		$regex = preg_quote( $string, '/' );
		$contLang = CargoUtils::getContentLang();
		if ( !$contLang->hasWordBreaks() ) {
			// For Chinese, words may legitimately abut other words in the text literal.
			// Don't add \b boundary checks... note this could cause false positives
			// for Latin chars.
			return $regex;
		}
		if ( $wildcard ) {
			// Don't cut off the final bit!
			return "\b$regex";
		}
		return "\b$regex\b";
	}

	/**
	 * Picks which field to index on, depending on what type of query.
	 * @param bool $fulltext
	 * @return string 'si_text' when $fulltext is truthy, otherwise 'si_title'
	 */
	public function getIndexField( $fulltext ) {
		return $fulltext ? 'si_text' : 'si_title';
	}

}