MediaWiki master
SearchHighlighter.php
Go to the documentation of this file.
1<?php
10namespace MediaWiki\Search;
11
16
26 public const DEFAULT_CONTEXT_LINES = 2;
27 public const DEFAULT_CONTEXT_CHARS = 75;
28
30 protected $mCleanWikitext = true;
31
/**
 * Remember whether collected text pieces should have their wikitext
 * markup stripped (see splitAndAdd(), which consults this flag before
 * calling removeWiki()).
 *
 * @param bool $cleanupWikitext
 */
public function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}
41
/**
 * Build a highlighted snippet of wikitext.
 *
 * The text is first cut into "text" extracts and "other" extracts
 * (templates, file links, tables and, when the Cite extension is loaded,
 * <ref> blocks). The beginning of the text is used when it contains every
 * term; otherwise lines matching the whole phrase and then lines matching
 * any term are collected. Snippets are padded towards a common size,
 * joined with '<b> ... </b>' markers, and every term is wrapped in a
 * <span class='searchmatch'> element.
 *
 * @param string $text
 * @param string[] $terms Terms to highlight; they are inserted into the
 *   regular expressions as-is, so they must already be regex fragments
 * @param int $contextlines
 * @param int $contextchars
 * @return string
 */
public function highlightText(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$searchHighlightBoundaries = MediaWikiServices::getInstance()
		->getMainConfig()->get( MainConfigNames::SearchHighlightBoundaries );

	if ( $text == '' ) {
		return '';
	}

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = [
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

	// @todo FIXME: This should prolly be a hook or something
	// instead of hardcoding the name of the Cite extension
	if ( ExtensionRegistry::getInstance()->isLoaded( 'Cite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = []; // text extracts
	$otherExt = []; // other extracts
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				if ( $key > 0 && $val[1] != -1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, -1 );
						if (
							MediaWikiServices::getInstance()->getContentLanguage()->
							getNsIndex( $ns ) !== NS_FILE
						) {
							break;
						}

					}
					$epat = $endPatterns[$key];
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level--;
						}
					} else {
						// nested
						$level++;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( !$found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}
	'@phan-var string[] $textExt';

	$all = $textExt + $otherExt; // these have disjunct key sets

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback(
				'/./us',
				$this->caseCallback( ... ),
				$terms[$index]
			);
		} else {
			$terms[$index] = $term;
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "{$searchHighlightBoundaries}+", $terms );
	// @todo FIXME: A hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	$anytermChars = mb_strlen( $anyterm );
	// With no terms (or only empty ones) $anyterm is '' and dividing by its
	// zero length would throw a DivisionByZeroError; use a neutral scale.
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|{$searchHighlightBoundaries})";
	$patPost = "({$searchHighlightBoundaries}|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	$left = $contextlines;

	$snippets = [];
	$offsets = [];

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		if ( $line !== '' && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if ( $firstText ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( !$snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = [];
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = [ $first => $snippets[$first] ];
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
		// $targetchars is set when $snippets contains anything
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract(
					$all[$index],
					$offsets[$index],
					// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
					// $targetchars is set when $snippets contains anything
					$offsets[$index] + $targetchars,
					$offsets[$index]
				);
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
			// $targetchars is set when $snippets contains anything
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
				// $targetchars is set when $snippets contains anything
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	// $snippets = array_map( 'htmlspecialchars', $extended );
	$snippets = $extended;
	$last = -1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == -1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index
			&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
		) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	if ( $extract ) {
		$extract .= '<b> ... </b>';
	}

	// wrap each distinct term once; $processed avoids highlighting duplicates twice
	$processed = [];
	foreach ( $terms as $term ) {
		if ( !isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	return $extract;
}
310
/**
 * Split text into lines and append every non-empty, trimmed line to
 * $extracts, keyed by the running sequence number $count.
 *
 * When the cleanup flag given to the constructor is set, the text is passed
 * through removeWiki() before it is split.
 *
 * @param string[] &$extracts Collected lines, index => text
 * @param int &$count Next free index; incremented once per line added
 * @param string $text
 */
private function splitAndAdd( &$extracts, &$count, $text ) {
	$source = $this->mCleanWikitext ? $this->removeWiki( $text ) : $text;
	foreach ( explode( "\n", $source ) as $line ) {
		$trimmed = trim( $line );
		// Compare with '' explicitly: a bare truthiness test would also
		// throw away a line whose entire content is "0".
		if ( $trimmed !== '' ) {
			$extracts[$count++] = $trimmed;
		}
	}
}
327
/**
 * preg_replace_callback() handler used while preparing search terms.
 *
 * A match longer than one byte is turned into a two-member character class
 * holding its lower-case and upper-case forms as produced by the content
 * language; a single-byte match is returned unchanged.
 *
 * @param array $matches Match array; only element 0 is read
 * @return string
 */
private function caseCallback( $matches ) {
	$char = $matches[0];
	if ( strlen( $char ) <= 1 ) {
		// single byte: nothing to expand
		return $char;
	}

	$lang = MediaWikiServices::getInstance()->getContentLanguage();
	return '[' . $lang->lc( $char ) . $lang->uc( $char ) . ']';
}
343
/**
 * Cut the part of $text between $start and $end, after moving both ends
 * to nearby cut positions via position().
 *
 * A start of 0 is used as-is; an end at or past the text length is clamped
 * to the text length. An empty string is returned when the adjusted end
 * does not lie after the adjusted start.
 *
 * @param string $text
 * @param int $start
 * @param int $end
 * @param int|null &$posStart Receives the adjusted start, but only when the
 *   variable passed in is not null
 * @param int|null &$posEnd Receives the adjusted end, but only when the
 *   variable passed in is not null
 * @return string
 */
private function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	$from = $start != 0 ? $this->position( $text, $start, 1 ) : $start;

	$textLen = strlen( $text );
	$to = $end >= $textLen ? $textLen : $this->position( $text, $end );

	// report the adjusted bounds back to callers that asked for them
	if ( $posStart !== null ) {
		$posStart = $from;
	}
	if ( $posEnd !== null ) {
		$posEnd = $to;
	}

	return $to > $from ? substr( $text, $from, $to - $from ) : '';
}
377
/**
 * Find a byte position close to $point at which the text can be cut.
 *
 * A window of 10 bytes on either side of $point is searched for a boundary
 * character (space, punctuation, bracket, quote, ...). If one is found, the
 * position of the first such character in the window plus $offset is
 * returned. Otherwise $point is advanced past any UTF-8 continuation bytes
 * so that the cut never lands inside a multi-byte character.
 *
 * @param string $text
 * @param int $point
 * @param int $offset Added to the result when a boundary character is found
 * @return int
 */
private function position( $text, $point, $offset = 0 ) {
	$tolerance = 10;
	$textLen = strlen( $text );
	$s = max( 0, $point - $tolerance );
	$l = min( $textLen, $point + $tolerance ) - $s;
	$m = [];

	// The class must contain ASCII characters only: the pattern runs without
	// the /u flag, so any multi-byte character listed here would be matched
	// byte by byte and could report a "boundary" in the middle of a
	// multi-byte character of the text.
	if ( preg_match(
		'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
		substr( $text, $s, $l ),
		$m,
		PREG_OFFSET_CAPTURE
	) ) {
		return $m[0][1] + $s + $offset;
	}

	// Nothing to inspect at or beyond the end of the text.
	if ( $point >= $textLen ) {
		return $textLen;
	}

	// check if point is on a valid first UTF8 char
	$char = ord( $text[$point] );
	while ( $char >= 0x80 && $char < 0xc0 ) {
		// skip trailing bytes
		$point++;
		if ( $point >= $textLen ) {
			return $textLen;
		}
		$char = ord( $text[$point] );
	}

	return $point;
}
415
/**
 * Look for $pattern in each extract and record a basic snippet for every
 * line that matches, until the line budget is used up.
 *
 * Lines whose index is already present in $out are skipped. The snippet
 * window is $contextchars bytes wide; it starts at the beginning of the
 * line when the match ends within the first $contextchars bytes, at the
 * match itself when the match is longer than the window, and is otherwise
 * positioned so the match sits in the middle.
 *
 * @param string $pattern Regular expression
 * @param string[] $extracts Index => line
 * @param int &$linesleft Remaining number of snippets; decremented per hit
 * @param int &$contextchars Window width (only read here)
 * @param string[] &$out Index => snippet
 * @param int[] &$offsets Index => start position of the snippet in its line
 */
private function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		return; // nothing to do
	}

	foreach ( $extracts as $key => $candidate ) {
		$hit = [];
		if ( array_key_exists( $key, $out )
			|| !preg_match( $pattern, $candidate, $hit, PREG_OFFSET_CAPTURE )
		) {
			// already highlighted, or no match on this line
			continue;
		}

		[ $matched, $matchPos ] = $hit[0];
		$matchLen = strlen( $matched );

		if ( $matchPos + $matchLen < $contextchars ) {
			$from = 0;
		} elseif ( $matchLen > $contextchars ) {
			$from = $matchPos;
		} else {
			$from = $matchPos + intval( ( $matchLen - $contextchars ) / 2 );
		}

		// extract() overwrites this with the position it actually cut at
		$actualStart = $from;
		$out[$key] = $this->extract( $candidate, $from, $from + $contextchars, $actualStart );
		$offsets[$key] = $actualStart;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
462
/**
 * Strip basic wikitext markup from a piece of text and HTML-escape it.
 *
 * In order: {{...}} without a pipe is dropped, {{...|...}} keeps what
 * follows the first pipe, [[...]] without a pipe keeps its content, piped
 * links go through linkReplace(), then tags and runs of apostrophes are
 * removed.
 *
 * @param string $text
 * @return string
 */
private function removeWiki( $text ) {
	$stripped = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
	$stripped = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $stripped );
	$stripped = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $stripped );
	$stripped = preg_replace_callback(
		"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
		$this->linkReplace( ... ),
		$stripped
	);

	// markup that is simply deleted, applied one pattern after the other
	$deletions = [
		"/<\/?[^>]+>/",
		"/'''''/",
		"/('''|<\/?[iIuUbB]>)/",
		"/''/",
	];
	foreach ( $deletions as $markup ) {
		$stripped = preg_replace( $markup, "", $stripped );
	}

	// Note, the /<\/?[^>]+>/ removal above is insufficient
	// for XSS safety as the HTML tag can span multiple
	// search results (T144845).
	return Sanitizer::escapeHtmlAllowEntities( $stripped );
}
488
/**
 * preg_replace_callback() handler for piped links, used by removeWiki().
 *
 * Links whose target prefix resolves to the file or category namespace are
 * returned untouched; every other link is reduced to its caption.
 *
 * @param array $matches 0 => whole link, 1 => target including the pipe,
 *   2 => caption
 * @return string
 */
private function linkReplace( $matches ) {
	$colonPos = strpos( $matches[1], ':' );
	if ( $colonPos !== false ) {
		$prefix = substr( $matches[1], 0, $colonPos );
		$nsIndex = MediaWikiServices::getInstance()->getContentLanguage()->getNsIndex( $prefix );
		if ( $nsIndex === NS_FILE || $nsIndex === NS_CATEGORY ) {
			return $matches[0]; // return the whole thing
		}
	}

	return $matches[2]; // replace with caption
}
509
/**
 * Line-based snippet extraction: returns up to $contextlines lines that
 * contain one of the terms, each truncated around the match, HTML-escaped
 * and with the terms wrapped in <span class="searchmatch">.
 *
 * @param string $text
 * @param string[] $terms Inserted into the regular expressions as-is
 * @param int $contextlines
 * @param int $contextchars
 * @return string Matching lines, each terminated by a newline
 */
public function highlightSimple(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$rows = explode( "\n", $text );

	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	$linePattern = "/(.*)($terms)(.{0,$max})/ui";
	// the highlight pattern does not depend on the line, so build it once
	$markPattern = '/(' . $terms . ')/ui';

	$lang = MediaWikiServices::getInstance()->getContentLanguage();
	$result = '';
	foreach ( $rows as $row ) {
		if ( $contextlines == 0 ) {
			break;
		}
		$m = [];
		if ( !preg_match( $linePattern, $row, $m ) ) {
			continue;
		}
		--$contextlines;

		// truncate function changes ... to relevant i18n message.
		$before = $lang->truncateForVisual( $m[1], -$contextchars, '...', false );
		$after = count( $m ) < 3
			? ''
			: $lang->truncateForVisual( $m[3], $contextchars, '...', false );

		$escaped = htmlspecialchars( $before . $m[2] . $after );
		$marked = preg_replace( $markPattern, '<span class="searchmatch">\1</span>', $escaped );

		$result .= "{$marked}\n";
	}

	return $result;
}
565
/**
 * Return the first few lines of the text, HTML-escaped, with line breaks
 * turned into <br>.
 *
 * Leading whitespace and empty lines are dropped, at most $contextlines
 * lines are kept, and the result is limited to
 * $contextlines * $contextchars bytes without splitting a UTF-8 character.
 *
 * @param string $text
 * @param int $contextlines Maximum number of lines
 * @param int $contextchars Average number of bytes per line
 * @return string
 */
public function highlightNone(
	$text,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$match = [];
	$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
	// Collapse every run of newlines. A single str_replace( "\n\n", "\n" )
	// pass would still leave an empty line behind for three or more in a row.
	$text = preg_replace( "/\n{2,}/", "\n", $text ) ?? $text;
	preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

	// Trim and limit to max number of bytes. mb_strcut() keeps the byte-based
	// limit but never cuts through a multi-byte character, which a plain
	// substr() could do and thereby hand invalid UTF-8 to htmlspecialchars().
	$snippet = mb_strcut( trim( $match[0] ), 0, $contextlines * $contextchars, 'UTF-8' );
	return str_replace( "\n", '<br>', htmlspecialchars( $snippet ) );
}
588}
589
591class_alias( SearchHighlighter::class, 'SearchHighlighter' );
const NS_FILE
Definition Defines.php:57
const NS_CATEGORY
Definition Defines.php:65
A class containing constants representing the names of configuration variables.
const SearchHighlightBoundaries
Name constant for the SearchHighlightBoundaries setting, for use with Config::get()
Service locator for MediaWiki core services.
static getInstance()
Returns the global default instance of the top level service locator.
HTML sanitizer for MediaWiki.
Definition Sanitizer.php:34
Load JSON files, and uses a Processor to extract information.
highlightNone( $text, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Returns the first few lines of the text.
highlightSimple( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
highlightText( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
Definition of a mapping for the search index field.