Code Coverage for /workspace/src/includes/search/SearchHighlighter.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	7.98% covered (danger)	7.98%	21 / 263	0.00% covered (danger)	0.00%	0 / 11	CRAP	0.00% covered (danger)	0.00%	0 / 1
SearchHighlighter	7.98% covered (danger)	7.98%	21 / 263	0.00% covered (danger)	0.00%	0 / 11	5320.50	0.00% covered (danger)	0.00%	0 / 1
__construct	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
highlightText	0.00% covered (danger)	0.00%	0 / 149	0.00% covered (danger)	0.00%	0 / 1	1980
splitAndAdd	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	20
caseCallback	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	6
extract	0.00% covered (danger)	0.00%	0 / 12	0.00% covered (danger)	0.00%	0 / 1	42
position	0.00% covered (danger)	0.00%	0 / 18	0.00% covered (danger)	0.00%	0 / 1	30
process	0.00% covered (danger)	0.00%	0 / 22	0.00% covered (danger)	0.00%	0 / 1	72
removeWiki	0.00% covered (danger)	0.00%	0 / 14	0.00% covered (danger)	0.00%	0 / 1	2
linkReplace	0.00% covered (danger)	0.00%	0 / 8	0.00% covered (danger)	0.00%	0 / 1	30
highlightSimple	91.30% covered (success)	91.30%	21 / 23	0.00% covered (danger)	0.00%	0 / 1	5.02
highlightNone	0.00% covered (danger)	0.00%	0 / 6	0.00% covered (danger)	0.00%	0 / 1	2

1	<?php
2	/**
3	* Basic search engine highlighting
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License along
16	* with this program; if not, write to the Free Software Foundation, Inc.,
17	* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18	* http://www.gnu.org/copyleft/gpl.html
19	*
20	* @file
21	* @ingroup Search
22	*/
23
24	use MediaWiki\MainConfigNames;
25	use MediaWiki\MediaWikiServices;
26	use MediaWiki\Parser\Sanitizer;
27
/**
 * Highlight bits of wikitext
 *
 * @newable
 * @note marked as newable in 1.35 for lack of a better alternative,
 *       but should use a factory in the future.
 * @ingroup Search
 */
class SearchHighlighter {
	public const DEFAULT_CONTEXT_LINES = 2;
	public const DEFAULT_CONTEXT_CHARS = 75;

	/** @var bool Whether extracts are run through removeWiki() (which also HTML-escapes them) */
	protected $mCleanWikitext = true;

	/**
	 * @stable to call
	 * @warning If you pass false to this constructor, then
	 *  the caller is responsible for HTML escaping.
	 * @param bool $cleanupWikitext
	 */
	public function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Wikitext highlighting when $wgAdvancedSearchHighlighting = true
	 *
	 * The text is split into plain-text lines and "other" extracts
	 * (templates, file links, tables, references). Snippets are then taken
	 * around matches of the whole phrase first and of any single term
	 * second, padded to a roughly constant size, joined with ellipses, and
	 * finally the matched terms are wrapped in searchmatch spans.
	 *
	 * @param string $text
	 * @param string[] $terms Terms to highlight (not html escaped but
	 *   regex escaped via SearchDatabase::regexTerm())
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string
	 */
	public function highlightText(
		$text,
		$terms,
		$contextlines = self::DEFAULT_CONTEXT_LINES,
		$contextchars = self::DEFAULT_CONTEXT_CHARS
	) {
		// Nothing to highlight; bail out before touching any services.
		if ( $text == '' ) {
			return '';
		}

		$searchHighlightBoundaries = MediaWikiServices::getInstance()
			->getMainConfig()->get( MainConfigNames::SearchHighlightBoundaries );

		[ $textExt, $otherExt ] = $this->splitExtracts( $text );
		'@phan-var string[] $textExt';

		$all = $textExt + $otherExt; // these have disjunct key sets

		// prepare regexps
		foreach ( $terms as $index => $term ) {
			// manually do upper/lowercase stuff for utf-8 since PHP won't do it
			if ( preg_match( '/[\x80-\xff]/', $term ) ) {
				$terms[$index] = preg_replace_callback(
					'/./us',
					[ $this, 'caseCallback' ],
					$term
				);
			}
		}
		$anyterm = implode( '|', $terms );
		$phrase = implode( "{$searchHighlightBoundaries}+", $terms );
		// @todo FIXME: A hack to scale contextchars, a correct solution
		// would be to have contextchars actually be char and not byte
		// length, and do proper utf-8 substrings and lengths everywhere,
		// but PHP is making that very hard and unclean to implement :(
		// An empty term list yields an empty $anyterm; avoid dividing by zero.
		$anytermChars = mb_strlen( $anyterm );
		$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
		$contextchars = intval( $contextchars * $scale );

		$patPre = "(^|{$searchHighlightBoundaries})";
		$patPost = "({$searchHighlightBoundaries}|$)";

		$pat1 = "/(" . $phrase . ")/ui";
		$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

		$left = $contextlines;

		$snippets = [];
		$offsets = [];

		// show beginning only if it contains all words
		$first = 0;
		$firstText = '';
		foreach ( $textExt as $index => $line ) {
			if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
				$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
				$first = $index;
				break;
			}
		}
		// Explicit comparison: the string "0" is falsy but is still text.
		if ( $firstText !== '' ) {
			$succ = true;
			// check if first text contains all terms
			foreach ( $terms as $term ) {
				if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
					$succ = false;
					break;
				}
			}
			if ( $succ ) {
				$snippets[$first] = $firstText;
				$offsets[$first] = 0;
			}
		}
		if ( !$snippets ) {
			// match whole query on text
			$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
			// match whole query on templates/tables/images
			$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
			// match any words on text
			$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
			// match any words on templates/tables/images
			$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

			ksort( $snippets );
		}

		// add extra chars to each snippet to make snippets constant size
		$extended = [];
		// Only read inside the loop below, which runs only when $snippets is
		// non-empty; both branches that fill $snippets also set this.
		$targetchars = 0;
		if ( count( $snippets ) == 0 ) {
			// couldn't find the target words, just show beginning of article
			if ( array_key_exists( $first, $all ) ) {
				$targetchars = $contextchars * $contextlines;
				$snippets[$first] = '';
				$offsets[$first] = 0;
			}
		} else {
			// if begin of the article contains the whole phrase, show only that !!
			if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
				&& $offsets[$first] < $contextchars * 2 ) {
				$snippets = [ $first => $snippets[$first] ];
			}

			// calc by how much to extend existing snippets
			$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
		}

		foreach ( $snippets as $index => $line ) {
			$extended[$index] = $line;
			$len = strlen( $line );
			if ( $len < $targetchars - 20 ) {
				// complete this line
				if ( $len < strlen( $all[$index] ) ) {
					$extended[$index] = $this->extract(
						$all[$index],
						$offsets[$index],
						$offsets[$index] + $targetchars,
						$offsets[$index]
					);
					$len = strlen( $extended[$index] );
				}

				// add more lines
				$add = $index + 1;
				while ( $len < $targetchars - 20
					&& array_key_exists( $add, $all )
					&& !array_key_exists( $add, $snippets ) ) {
					$offsets[$add] = 0;
					$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
					$extended[$add] = $tt;
					$len += strlen( $tt );
					$add++;
				}
			}
		}

		$snippets = $extended;
		$last = -1;
		$extract = '';
		foreach ( $snippets as $index => $line ) {
			if ( $last == -1 ) {
				$extract .= $line; // first line
			} elseif ( $last + 1 == $index
				&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
			) {
				$extract .= " " . $line; // continuous lines
			} else {
				$extract .= '<b> ... </b>' . $line;
			}

			$last = $index;
		}
		if ( $extract !== '' ) {
			$extract .= '<b> ... </b>';
		}

		$processed = [];
		foreach ( $terms as $term ) {
			if ( !isset( $processed[$term] ) ) {
				$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
				$highlighted = preg_replace( $pat3,
					"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
				// preg_replace() returns null on a PCRE error; keep the
				// un-highlighted extract rather than discarding it.
				if ( $highlighted !== null ) {
					$extract = $highlighted;
				}
				$processed[$term] = true;
			}
		}

		return $extract;
	}

	/**
	 * Split wikitext into plain-text lines and "other" lines (templates,
	 * file links, tables and, when the Cite extension is loaded, references).
	 *
	 * Both result arrays share one running sequence number as key, so their
	 * key sets are disjunct and the original ordering can be restored.
	 *
	 * @param string $text
	 * @return array[] Two-element list: [ text extracts, other extracts ]
	 */
	private function splitExtracts( $text ) {
		// split text into text + templates/links/tables
		$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
		// first capture group is for detecting nested templates/links/tables/references
		$endPatterns = [
			1 => '/(\{\{)|(\}\})/', // template
			2 => '/(\[\[)|(\]\])/', // image
			3 => "/(\n\\{\\|)|(\n\\|\\})/", // table
		];

		// @todo FIXME: This should prolly be a hook or something
		// instead of hardcoding the name of the Cite extension
		if ( \ExtensionRegistry::getInstance()->isLoaded( 'Cite' ) ) {
			$spat .= '|(<ref>)'; // references via cite extension
			$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
		}
		$spat .= '/';

		$textExt = []; // text extracts
		$otherExt = []; // other extracts
		$start = 0;
		$textLen = strlen( $text );
		$count = 0; // sequence number to maintain ordering
		while ( $start < $textLen ) {
			// find start of template/image/table
			if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
				$epat = '';
				foreach ( $matches as $key => $val ) {
					if ( $key > 0 && $val[1] != -1 ) {
						if ( $key == 2 ) {
							// see if this is an image link
							$ns = substr( $val[0], 2, -1 );
							if (
								MediaWikiServices::getInstance()->getContentLanguage()->
								getNsIndex( $ns ) !== NS_FILE
							) {
								break;
							}
						}
						$epat = $endPatterns[$key];
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
						$start = $val[1];
						break;
					}
				}
				if ( $epat ) {
					// find end (and detect any nested elements)
					$level = 0;
					$offset = $start + 1;
					$found = false;
					while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
						if ( array_key_exists( 2, $endMatches ) ) {
							// found end
							if ( $level == 0 ) {
								$len = strlen( $endMatches[2][0] );
								$off = $endMatches[2][1];
								$this->splitAndAdd( $otherExt, $count,
									substr( $text, $start, $off + $len - $start ) );
								$start = $off + $len;
								$found = true;
								break;
							} else {
								// end of nested element
								$level -= 1;
							}
						} else {
							// nested
							$level += 1;
						}
						$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
					}
					if ( !$found ) {
						// couldn't find appropriate closing tag, skip
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
						$start += strlen( $matches[0][0] );
					}
					continue;
				}
			}
			// else: add as text extract
			$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
			break;
		}

		return [ $textExt, $otherExt ];
	}

	/**
	 * Split text into lines and add it to extracts array
	 *
	 * Lines are trimmed and empty lines are dropped. When wikitext cleanup
	 * is enabled the text is passed through removeWiki() first.
	 *
	 * @param string[] &$extracts Index -> $line
	 * @param int &$count Next free index; incremented for every line added
	 * @param string $text
	 */
	private function splitAndAdd( &$extracts, &$count, $text ) {
		$cleaned = $this->mCleanWikitext ? $this->removeWiki( $text ) : $text;
		foreach ( explode( "\n", $cleaned ) as $line ) {
			$trimmed = trim( $line );
			// Compare with '' explicitly: a line consisting of "0" is falsy
			// in PHP but is real content and must be kept.
			if ( $trimmed !== '' ) {
				$extracts[$count++] = $trimmed;
			}
		}
	}

	/**
	 * Do manual case conversion for non-ascii chars
	 *
	 * Multi-byte characters are turned into a character class holding the
	 * lower- and upper-case form; single-byte characters pass through.
	 *
	 * @param array $matches
	 * @return string
	 */
	private function caseCallback( $matches ) {
		if ( strlen( $matches[0] ) > 1 ) {
			$contLang = MediaWikiServices::getInstance()->getContentLanguage();
			return '[' . $contLang->lc( $matches[0] ) .
				$contLang->uc( $matches[0] ) . ']';
		}

		return $matches[0];
	}

	/**
	 * Extract part of the text from start to end, but by
	 * not chopping up words
	 *
	 * The out parameters are only written when the caller passes a
	 * non-null value in.
	 *
	 * @param string $text
	 * @param int $start
	 * @param int $end
	 * @param int|null &$posStart (out) actual start position
	 * @param int|null &$posEnd (out) actual end position
	 * @return string
	 */
	private function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
		if ( $start != 0 ) {
			$start = $this->position( $text, $start, 1 );
		}
		if ( $end >= strlen( $text ) ) {
			$end = strlen( $text );
		} else {
			$end = $this->position( $text, $end );
		}

		if ( $posStart !== null ) {
			$posStart = $start;
		}
		if ( $posEnd !== null ) {
			$posEnd = $end;
		}

		if ( $end > $start ) {
			return substr( $text, $start, $end - $start );
		}

		return '';
	}

	/**
	 * Find a nonletter near a point (index) in the text
	 *
	 * @param string $text
	 * @param int $point
	 * @param int $offset Offset to found index
	 * @return int Nearest nonletter index, or beginning of utf8 char if none
	 */
	private function position( $text, $point, $offset = 0 ) {
		$textLen = strlen( $text );
		$tolerance = 10;
		$s = max( 0, $point - $tolerance );
		$l = min( $textLen, $point + $tolerance ) - $s;
		$m = [];

		if ( $l > 0 && preg_match(
			'/[ ,.!?~!@#$%^&*+=\-\\\\|\[\]"\'<>]/',
			substr( $text, $s, $l ),
			$m,
			PREG_OFFSET_CAPTURE
		) ) {
			return $m[0][1] + $s + $offset;
		}

		// Past the end of the text there is no byte to inspect.
		if ( $point >= $textLen ) {
			return $textLen;
		}

		// check if point is on a valid first UTF8 char
		$char = ord( $text[$point] );
		while ( $char >= 0x80 && $char < 0xc0 ) {
			// skip trailing bytes
			$point++;
			if ( $point >= $textLen ) {
				return $textLen;
			}
			$char = ord( $text[$point] );
		}

		return $point;
	}

	/**
	 * Search extracts for a pattern, and return snippets
	 *
	 * @param string $pattern Regexp for matching lines
	 * @param array $extracts Extracts to search
	 * @param int &$linesleft Number of extracts to make
	 * @param int &$contextchars Length of snippet
	 * @param array &$out Map for highlighted snippets
	 * @param array &$offsets Map of starting points of snippets
	 */
	private function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
		if ( $linesleft == 0 ) {
			return; // nothing to do
		}
		foreach ( $extracts as $index => $line ) {
			if ( array_key_exists( $index, $out ) ) {
				continue; // this line already highlighted
			}

			$m = [];
			if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
				continue;
			}

			$offset = $m[0][1];
			$len = strlen( $m[0][0] );
			if ( $offset + $len < $contextchars ) {
				$begin = 0;
			} elseif ( $len > $contextchars ) {
				$begin = $offset;
			} else {
				$begin = $offset + intval( ( $len - $contextchars ) / 2 );
			}

			$end = $begin + $contextchars;

			$posBegin = $begin;
			// basic snippet from this line
			$out[$index] = $this->extract( $line, $begin, $end, $posBegin );
			$offsets[$index] = $posBegin;
			$linesleft--;
			if ( $linesleft == 0 ) {
				return;
			}
		}
	}

	/**
	 * Basic wikitext removal
	 *
	 * Strips simple templates, link brackets, tags and quote markup, then
	 * HTML-escapes the result.
	 *
	 * @param string $text
	 * @return string
	 */
	private function removeWiki( $text ) {
		$text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
		$text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
		$text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
		$text = preg_replace_callback(
			"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
			[ $this, 'linkReplace' ],
			$text
		);
		$text = preg_replace( "/<\/?[^>]+>/", "", $text );
		$text = preg_replace( "/'''''/", "", $text );
		$text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
		$text = preg_replace( "/''/", "", $text );

		// Note, the previous /<\/?[^>]+>/ is insufficient
		// for XSS safety as the HTML tag can span multiple
		// search results (T144845).
		$text = Sanitizer::escapeHtmlAllowEntities( $text );
		return $text;
	}

	/**
	 * callback to replace [[target|caption]] kind of links, if
	 * the target is category or image, leave it
	 *
	 * @param array $matches
	 * @return string
	 */
	private function linkReplace( $matches ) {
		$colon = strpos( $matches[1], ':' );
		if ( $colon === false ) {
			return $matches[2]; // replace with caption
		}
		$ns = substr( $matches[1], 0, $colon );
		$index = MediaWikiServices::getInstance()->getContentLanguage()->getNsIndex( $ns );
		if ( $index === NS_FILE || $index === NS_CATEGORY ) {
			return $matches[0]; // return the whole thing
		}

		return $matches[2];
	}

	/**
	 * Simple & fast snippet extraction, but gives completely irrelevant
	 * snippets
	 *
	 * Used when $wgAdvancedSearchHighlighting is false.
	 *
	 * @param string $text
	 * @param string[] $terms Escaped for regex by SearchDatabase::regexTerm()
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string
	 */
	public function highlightSimple(
		$text,
		$terms,
		$contextlines = self::DEFAULT_CONTEXT_LINES,
		$contextchars = self::DEFAULT_CONTEXT_CHARS
	) {
		$lines = explode( "\n", $text );

		$terms = implode( '|', $terms );
		$max = intval( $contextchars ) + 1;
		$pat1 = "/(.*)($terms)(.{0,$max})/ui";
		$pat2 = '/(' . $terms . ')/ui';

		$extract = '';
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();
		foreach ( $lines as $line ) {
			if ( $contextlines == 0 ) {
				break;
			}
			$m = [];
			if ( !preg_match( $pat1, $line, $m ) ) {
				continue;
			}
			--$contextlines;
			// truncate function changes ... to relevant i18n message.
			$pre = $contLang->truncateForVisual( $m[1], -$contextchars, '...', false );
			$post = isset( $m[3] )
				? $contLang->truncateForVisual( $m[3], $contextchars, '...', false )
				: '';
			$found = $m[2];

			// Match on the raw text and escape piecewise, so that a term such
			// as "amp" or "quot" cannot match inside an HTML entity produced
			// by the escaping itself.
			$extract .= $this->markMatches( $pre . $found . $post, $pat2 ) . "\n";
		}

		return $extract;
	}

	/**
	 * HTML-escape a raw string and wrap every non-empty match of $pattern
	 * in a searchmatch span.
	 *
	 * @param string $raw Unescaped text
	 * @param string $pattern Complete regexp, including delimiters and flags
	 * @return string HTML
	 */
	private function markMatches( $raw, $pattern ) {
		$hits = [];
		if ( !preg_match_all( $pattern, $raw, $hits, PREG_OFFSET_CAPTURE ) ) {
			// No match, or a PCRE error: still return escaped text.
			return htmlspecialchars( $raw );
		}

		$html = '';
		$pos = 0;
		foreach ( $hits[0] as [ $hit, $at ] ) {
			if ( $hit === '' ) {
				continue;
			}
			$html .= htmlspecialchars( substr( $raw, $pos, $at - $pos ) )
				. '<span class="searchmatch">' . htmlspecialchars( $hit ) . '</span>';
			$pos = $at + strlen( $hit );
		}

		return $html . htmlspecialchars( substr( $raw, $pos ) );
	}

	/**
	 * Returns the first few lines of the text
	 *
	 * @param string $text
	 * @param int $contextlines Max number of returned lines
	 * @param int $contextchars Average number of characters per line
	 * @return string
	 */
	public function highlightNone(
		$text,
		$contextlines = self::DEFAULT_CONTEXT_LINES,
		$contextchars = self::DEFAULT_CONTEXT_CHARS
	) {
		// The value is interpolated into a regex quantifier below, so it
		// must be a non-negative integer.
		$contextlines = max( 0, (int)$contextlines );
		$limit = max( 0, $contextlines * (int)$contextchars );

		$match = [];
		$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
		$text = preg_replace( "/\n{2,}/", "\n", $text ); // remove empty lines
		preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

		// Trim and limit to max number of bytes; mb_strcut() never ends the
		// cut in the middle of a multibyte character.
		$text = htmlspecialchars( mb_strcut( trim( $match[0] ?? '' ), 0, $limit, 'UTF-8' ) );
		return str_replace( "\n", '<br>', $text );
	}
}