MediaWiki master
SearchHighlighter.php
Go to the documentation of this file.
1<?php
27
37 public const DEFAULT_CONTEXT_LINES = 2;
38 public const DEFAULT_CONTEXT_CHARS = 75;
39
40 protected $mCleanWikitext = true;
41
/**
 * Create a highlighter.
 *
 * @param bool $cleanupWikitext Stored in $mCleanWikitext. When true, text extracts
 *   are passed through removeWiki() before being split into lines.
 */
public function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}
51
/**
 * Build a highlighted snippet of wikitext for a search result.
 *
 * The text is split into plain-text lines and "other" lines (templates, file links,
 * tables and, when the Cite extension is loaded, <ref> blocks). Lines matching the
 * terms are selected, padded with context up to roughly
 * $contextlines * $contextchars bytes, joined with "<b> ... </b>" separators, and
 * every term occurrence is wrapped in <span class='searchmatch'>.
 *
 * @param string $text Wikitext to build the snippet from
 * @param string[] $terms Search terms. They are interpolated as-is into PCRE
 *   patterns, so they must already be valid regex fragments.
 * @param int $contextlines Maximum number of matching lines to use
 * @param int $contextchars Approximate amount of context per line (scaled to bytes)
 * @return string Snippet with highlight markup, or '' when $text is empty
 */
public function highlightText(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$searchHighlightBoundaries = MediaWikiServices::getInstance()
		->getMainConfig()->get( MainConfigNames::SearchHighlightBoundaries );

	if ( $text == '' ) {
		return '';
	}

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = [
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

	// @todo FIXME: This should prolly be a hook or something
	// instead of hardcoding the name of the Cite extension
	if ( \ExtensionRegistry::getInstance()->isLoaded( 'Cite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = []; // text extracts
	$otherExt = []; // other extracts
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				// With PREG_OFFSET_CAPTURE, groups that did not participate have offset -1
				if ( $key > 0 && $val[1] != -1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, -1 );
						if (
							MediaWikiServices::getInstance()->getContentLanguage()->
							getNsIndex( $ns ) !== NS_FILE
						) {
							break;
						}

					}
					$epat = $endPatterns[$key];
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					// Group 2 is only present when the closing delimiter matched
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level -= 1;
						}
					} else {
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( !$found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}
	'@phan-var string[] $textExt';

	$all = $textExt + $otherExt; // these have disjunct key sets

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback(
				'/./us',
				[ $this, 'caseCallback' ],
				$terms[$index]
			);
		} else {
			$terms[$index] = $term;
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "{$searchHighlightBoundaries}+", $terms );
	// @todo FIXME: A hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	// With no terms (or only empty ones) $anyterm is '' and the ratio would be a
	// division by zero, so fall back to a neutral scale of 1.
	$anytermChars = mb_strlen( $anyterm );
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|{$searchHighlightBoundaries})";
	$patPost = "({$searchHighlightBoundaries}|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	$left = $contextlines;

	$snippets = [];
	$offsets = [];

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if ( $firstText ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( !$snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = [];
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = [ $first => $snippets[$first] ];
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
		// $targetchars is set when $snippets contains anything
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract(
					$all[$index],
					$offsets[$index],
					// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
					// $targetchars is set when $snippets contains anything
					$offsets[$index] + $targetchars,
					$offsets[$index]
				);
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
			// $targetchars is set when $snippets contains anything
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
				// $targetchars is set when $snippets contains anything
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	// $snippets = array_map( 'htmlspecialchars', $extended );
	$snippets = $extended;
	$last = -1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == -1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index
			&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
		) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	if ( $extract ) {
		$extract .= '<b> ... </b>';
	}

	// Wrap each distinct term once; $processed prevents double-wrapping duplicates
	$processed = [];
	foreach ( $terms as $term ) {
		if ( !isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	return $extract;
}
320
/**
 * Split text into lines and append every non-empty trimmed line to $extracts.
 *
 * When $mCleanWikitext is set, the text is first passed through removeWiki().
 *
 * @param string[] &$extracts Lines are added here, keyed by sequence number
 * @param int &$count Next sequence number; incremented for every line added
 * @param string $text
 */
private function splitAndAdd( &$extracts, &$count, $text ) {
	$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
	foreach ( $split as $line ) {
		$tt = trim( $line );
		// Compare against '' explicitly: a bare truthiness test would also
		// discard a line consisting only of "0".
		if ( $tt !== '' ) {
			$extracts[$count++] = $tt;
		}
	}
}
337
/**
 * preg_replace_callback handler that makes a multi-byte character case-insensitive
 * by hand: it is replaced with a character class holding its lower- and upper-case
 * forms. Single-byte matches are returned unchanged.
 *
 * @param array $matches Match array; only $matches[0] is used
 * @return string
 */
private function caseCallback( $matches ) {
	$char = $matches[0];
	if ( strlen( $char ) <= 1 ) {
		return $char;
	}

	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	return '[' . $contLang->lc( $char ) . $contLang->uc( $char ) . ']';
}
353
/**
 * Cut the part of $text between two approximate byte offsets, after moving both
 * offsets to positions chosen by position().
 *
 * @param string $text
 * @param int $start Approximate start offset; 0 is used as-is
 * @param int $end Approximate end offset; clamped to the length of $text
 * @param int|null &$posStart Receives the actual start, only if passed in non-null
 * @param int|null &$posEnd Receives the actual end, only if passed in non-null
 * @return string The extracted part, or '' if the adjusted end is not after the start
 */
private function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	$textLength = strlen( $text );

	if ( $start != 0 ) {
		// offset 1: begin just after the boundary character that was found
		$start = $this->position( $text, $start, 1 );
	}
	$end = $end >= $textLength ? $textLength : $this->position( $text, $end );

	// Report the adjusted offsets back only to callers that asked for them
	if ( $posStart !== null ) {
		$posStart = $start;
	}
	if ( $posEnd !== null ) {
		$posEnd = $end;
	}

	return $end > $start ? substr( $text, $start, $end - $start ) : '';
}
387
/**
 * Find a suitable byte offset near $point at which to cut $text.
 *
 * A window of 10 bytes either side of $point is searched for the first ASCII
 * boundary character (space, punctuation, bracket, quote, ...). If one is found,
 * its offset plus $offset is returned. Otherwise $point is moved forward past any
 * UTF-8 continuation bytes so the cut does not land inside a multi-byte character.
 *
 * @param string $text
 * @param int $point Approximate byte offset
 * @param int $offset Added to the offset of the boundary character, when one is found
 * @return int Byte offset into $text
 */
private function position( $text, $point, $offset = 0 ) {
	$tolerance = 10;
	$textLen = strlen( $text );
	$s = max( 0, $point - $tolerance );
	$l = min( $textLen, $point + $tolerance ) - $s;
	$m = [];

	// The class must contain ASCII only: the pattern has no /u modifier, so any
	// multi-byte character in it would be matched byte by byte and could yield
	// a position in the middle of a UTF-8 sequence.
	if ( preg_match(
		'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
		substr( $text, $s, $l ),
		$m,
		PREG_OFFSET_CAPTURE
	) ) {
		return $m[0][1] + $s + $offset;
	}

	// Nothing to inspect at or beyond the end of the string
	if ( $point >= $textLen ) {
		return $textLen;
	}

	// check if point is on a valid first UTF8 char
	$char = ord( $text[$point] );
	while ( $char >= 0x80 && $char < 0xc0 ) {
		// skip trailing bytes
		$point++;
		if ( $point >= $textLen ) {
			return $textLen;
		}
		$char = ord( $text[$point] );
	}

	return $point;
}
425
/**
 * Search the given lines for $pattern and store a basic snippet for each line that
 * matches, until $linesleft reaches zero.
 *
 * @param string $pattern PCRE pattern
 * @param string[] $extracts Lines to search, keyed by sequence number
 * @param int &$linesleft Remaining number of lines to collect; decremented per hit
 * @param int &$contextchars Width of the snippet window, in bytes
 * @param string[] &$out Receives the snippets; lines already present are skipped
 * @param int[] &$offsets Receives the start offset of each snippet within its line
 */
private function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		// quota already used up
		return;
	}

	foreach ( $extracts as $index => $line ) {
		$hit = [];
		if (
			// line already has a snippet
			array_key_exists( $index, $out )
			|| !preg_match( $pattern, $line, $hit, PREG_OFFSET_CAPTURE )
		) {
			continue;
		}

		[ $matched, $matchStart ] = $hit[0];
		$matchLen = strlen( $matched );

		if ( $matchStart + $matchLen < $contextchars ) {
			// match ends inside the first window: start at the beginning of the line
			$from = 0;
		} elseif ( $matchLen > $contextchars ) {
			// match is wider than the window: start at the match
			$from = $matchStart;
		} else {
			// centre the window on the match
			$from = $matchStart + intval( ( $matchLen - $contextchars ) / 2 );
		}

		// extract() overwrites this with the offset it actually used
		$snippetStart = $from;
		$out[$index] = $this->extract( $line, $from, $from + $contextchars, $snippetStart );
		$offsets[$index] = $snippetStart;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
472
/**
 * Strip common wikitext and HTML markup from a piece of text, then HTML-escape it.
 *
 * @param string $text
 * @return string
 */
private function removeWiki( $text ) {
	// Templates and plain links. preg_replace() applies array patterns one after
	// another, each working on the result of the previous one.
	$text = preg_replace(
		[
			"/\\{\\{([^|]+?)\\}\\}/",
			"/\\{\\{([^|]+\\|)(.*?)\\}\\}/",
			"/\\[\\[([^|]+?)\\]\\]/",
		],
		[
			"",
			"\\2",
			"\\1",
		],
		$text
	);

	// Piped links are handled by linkReplace()
	$text = preg_replace_callback(
		"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
		[ $this, 'linkReplace' ],
		$text
	);

	// HTML-like tags and bold/italic quote markup
	$text = preg_replace(
		[
			"/<\/?[^>]+>/",
			"/'''''/",
			"/('''|<\/?[iIuUbB]>)/",
			"/''/",
		],
		"",
		$text
	);

	// Note, the previous /<\/?[^>]+>/ is insufficient
	// for XSS safety as the HTML tag can span multiple
	// search results (T144845).
	return Sanitizer::escapeHtmlAllowEntities( $text );
}
498
/**
 * preg_replace_callback handler for piped links: file and category links are kept
 * whole, every other link is replaced by its caption.
 *
 * @param array $matches [0] whole link, [1] target including the pipe, [2] caption
 * @return string
 */
private function linkReplace( $matches ) {
	$target = $matches[1];
	$caption = $matches[2];

	$colon = strpos( $target, ':' );
	if ( $colon === false ) {
		// no namespace prefix: replace with caption
		return $caption;
	}

	$index = MediaWikiServices::getInstance()->getContentLanguage()
		->getNsIndex( substr( $target, 0, $colon ) );

	// The strict comparisons can never be true for a failed lookup (false)
	if ( $index === NS_FILE || $index === NS_CATEGORY ) {
		return $matches[0]; // return the whole thing
	}
	return $caption;
}
519
/**
 * Line-based highlighting: for each line that contains one of the terms, emit the
 * match with truncated context on both sides, HTML-escaped, with every term
 * occurrence wrapped in <span class="searchmatch">.
 *
 * @param string $text
 * @param string[] $terms Interpolated as-is into PCRE patterns
 * @param int $contextlines Maximum number of matching lines to output
 * @param int $contextchars Context length passed to truncateForVisual() on each side
 * @return string One line of output per matching input line, each ending in "\n"
 */
public function highlightSimple(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$alternation = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	$linePattern = "/(.*)($alternation)(.{0,$max})/ui";
	$termPattern = '/(' . $alternation . ')/ui';

	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	$extract = '';

	foreach ( explode( "\n", $text ) as $line ) {
		if ( $contextlines == 0 ) {
			break;
		}

		$m = [];
		if ( !preg_match( $linePattern, $line, $m ) ) {
			continue;
		}
		--$contextlines;

		// truncate function changes ... to relevant i18n message.
		$pre = $contLang->truncateForVisual( $m[1], -$contextchars, '...', false );
		$post = count( $m ) < 3
			? ''
			: $contLang->truncateForVisual( $m[3], $contextchars, '...', false );

		$escaped = htmlspecialchars( $pre . $m[2] . $post );
		$highlighted = preg_replace( $termPattern, '<span class="searchmatch">\1</span>', $escaped );

		$extract .= "{$highlighted}\n";
	}

	return $extract;
}
575
/**
 * Return the first few non-empty lines of the text, HTML-escaped, without any
 * term highlighting.
 *
 * @param string $text
 * @param int $contextlines Maximum number of lines to return
 * @param int $contextchars Per-line budget; the result is limited to
 *   $contextlines * $contextchars bytes in total
 * @return string Escaped text with lines separated by <br>
 */
public function highlightNone(
	$text,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$match = [];
	$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
	// Remove empty lines. A whole run of newlines is collapsed at once; a single
	// "\n\n" => "\n" pass would leave empty lines behind in runs of three or more,
	// and those would then count towards $contextlines.
	$text = preg_replace( "/\n{2,}/", "\n", $text );
	preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

	// Trim and limit to max number of bytes. mb_strcut() never cuts through a
	// multi-byte character, so the result stays valid UTF-8 for htmlspecialchars().
	$snippet = mb_strcut( trim( $match[0] ), 0, $contextlines * $contextchars, 'UTF-8' );
	$text = htmlspecialchars( $snippet );
	return str_replace( "\n", '<br>', $text );
}
598}
const NS_FILE
Definition Defines.php:70
const NS_CATEGORY
Definition Defines.php:78
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
HTML sanitizer for MediaWiki.
Definition Sanitizer.php:46
Highlight bits of wikitext.
highlightText( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
__construct( $cleanupWikitext=true)
highlightSimple( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
highlightNone( $text, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Returns the first few lines of the text.
if(!file_exists( $CREDITS)) $lines