MediaWiki  master
SearchHighlighter.php
Go to the documentation of this file.
1 <?php
27 
/** Default value of the $contextlines parameter of the highlight* methods. */
37  public const DEFAULT_CONTEXT_LINES = 2;
/** Default value of the $contextchars parameter of the highlight* methods. */
38  public const DEFAULT_CONTEXT_CHARS = 75;
39 
/** @var bool When true, splitAndAdd() passes text through removeWiki() before splitting it into lines. */
40  protected $mCleanWikitext = true;
41 
/**
 * Create a highlighter.
 *
 * @param bool $cleanupWikitext Stored in $mCleanWikitext; when true,
 *   extracted text is run through removeWiki() before being split into lines.
 */
public function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}
51 
/**
 * Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
 *
 * The text is first split into plain-text lines and "other" extracts
 * (templates, File links, tables and, when the Cite extension is loaded,
 * references). Snippets are then picked in this order: the beginning of the
 * text if it contains every term, otherwise lines matching the whole phrase,
 * otherwise lines matching any term. Snippets are padded towards a constant
 * size, joined, and every term occurrence is wrapped in
 * <span class='searchmatch'>.
 *
 * @param string $text Wikitext to build the snippet from
 * @param string[] $terms Terms to highlight. They are interpolated unescaped
 *   into PCRE patterns, so they are presumably already regex fragments -
 *   verify against callers.
 * @param int $contextlines Maximum number of snippet lines
 * @param int $contextchars Approximate snippet width per line (scaled to bytes below)
 * @return string Snippet, or '' when $text is empty
 */
public function highlightText(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$searchHighlightBoundaries = MediaWikiServices::getInstance()
		->getMainConfig()->get( MainConfigNames::SearchHighlightBoundaries );

	if ( $text == '' ) {
		return '';
	}

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = [
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

	// @todo FIXME: This should prolly be a hook or something
	// instead of hardcoding the name of the Cite extension
	if ( \ExtensionRegistry::getInstance()->isLoaded( 'Cite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = []; // text extracts
	$otherExt = []; // other extracts
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				if ( $key > 0 && $val[1] != -1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, -1 );
						if (
							MediaWikiServices::getInstance()->getContentLanguage()->
							getNsIndex( $ns ) !== NS_FILE
						) {
							// NOTE(review): $epat stays empty here, so the whole
							// remainder of $text is added as plain text below and
							// splitting stops - confirm this is intended.
							break;
						}

					}
					$epat = $endPatterns[$key];
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level -= 1;
						}
					} else {
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( !$found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}
	'@phan-var string[] $textExt';

	$all = $textExt + $otherExt; // these have disjunct key sets

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback(
				'/./us',
				[ $this, 'caseCallback' ],
				$terms[$index]
			);
		} else {
			$terms[$index] = $term;
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "{$searchHighlightBoundaries}+", $terms );
	// @todo FIXME: A hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	// With no terms (or only empty ones) $anyterm is '' and the character
	// count is 0; fall back to a neutral scale instead of dividing by zero.
	$anytermChars = mb_strlen( $anyterm );
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|{$searchHighlightBoundaries})";
	$patPost = "({$searchHighlightBoundaries}|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	$left = $contextlines;

	$snippets = [];
	$offsets = [];

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if ( $firstText ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( !$snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = [];
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = [ $first => $snippets[$first] ];
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
		// $targetchars is set when $snippets contains anything
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract(
					$all[$index],
					$offsets[$index],
					// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
					// $targetchars is set when $snippets contains anything
					$offsets[$index] + $targetchars,
					$offsets[$index]
				);
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
			// $targetchars is set when $snippets contains anything
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
				// $targetchars is set when $snippets contains anything
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	// $snippets = array_map( 'htmlspecialchars', $extended );
	$snippets = $extended;
	$last = -1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == -1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index
			&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
		) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	if ( $extract ) {
		$extract .= '<b> ... </b>';
	}

	$processed = [];
	foreach ( $terms as $term ) {
		if ( !isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	return $extract;
}
320 
/**
 * Split text into lines and append every non-empty, trimmed line to $extracts.
 *
 * When $mCleanWikitext is set the text is passed through removeWiki() first.
 *
 * @param string[] &$extracts Receives the lines, keyed by sequence number
 * @param int &$count Next sequence number; incremented once per added line
 * @param string $text Text to split on "\n"
 */
private function splitAndAdd( &$extracts, &$count, $text ) {
	$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
	foreach ( $split as $line ) {
		$tt = trim( $line );
		// Compare against '' explicitly: a plain truthiness test would also
		// discard a line whose whole content is "0".
		if ( $tt !== '' ) {
			$extracts[$count++] = $tt;
		}
	}
}
337 
/**
 * preg_replace_callback handler used by highlightText() on every character
 * of a term that contains non-ASCII bytes.
 *
 * @param string[] $matches $matches[0] is the matched character
 * @return string The character unchanged when it is a single byte, otherwise
 *   a regex character class holding its lower- and upper-case forms
 */
private function caseCallback( $matches ) {
	$char = $matches[0];
	if ( strlen( $char ) <= 1 ) {
		// single-byte character: nothing to expand
		return $char;
	}

	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	return '[' . $contLang->lc( $char ) .
		$contLang->uc( $char ) . ']';
}
353 
/**
 * Cut the byte range [$start, $end) out of $text, after moving both ends
 * with position().
 *
 * @param string $text
 * @param int $start Requested start; left untouched when it is 0
 * @param int $end Requested end; capped at the length of $text
 * @param int|null &$posStart Receives the adjusted start, but only when the
 *   caller passed a non-null variable
 * @param int|null &$posEnd Receives the adjusted end, but only when the
 *   caller passed a non-null variable
 * @return string The extract, or '' when the adjusted range is empty
 */
private function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	$textLen = strlen( $text );

	if ( $start != 0 ) {
		$start = $this->position( $text, $start, 1 );
	}
	$end = $end >= $textLen ? $textLen : $this->position( $text, $end );

	// report the adjusted boundaries back to callers that asked for them
	if ( $posStart !== null ) {
		$posStart = $start;
	}
	if ( $posEnd !== null ) {
		$posEnd = $end;
	}

	return $end > $start ? substr( $text, $start, $end - $start ) : '';
}
387 
/**
 * Find a byte position near $point at which $text can be cut.
 *
 * Looks for the first punctuation/space character in a window of
 * $tolerance bytes on either side of $point. When none is found, $point is
 * moved forward past any UTF-8 continuation bytes so that the cut does not
 * land inside a multi-byte character.
 *
 * @param string $text
 * @param int $point Desired byte position; clamped to [0, strlen( $text )]
 * @param int $offset Added to the position of a boundary character when one
 *   is found (extract() passes 1 for start positions)
 * @return int Byte position in $text
 */
private function position( $text, $point, $offset = 0 ) {
	$tolerance = 10;
	$textLen = strlen( $text );
	// Keep $point inside the string: reading $text[$point] at or beyond the
	// end raises an "uninitialized string offset" warning and would return a
	// position past the end of the text.
	$point = max( 0, min( $point, $textLen ) );
	$s = max( 0, $point - $tolerance );
	$l = min( $textLen, $point + $tolerance ) - $s;
	$m = [];

	if ( preg_match(
		'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
		substr( $text, $s, $l ),
		$m,
		PREG_OFFSET_CAPTURE
	) ) {
		return $m[0][1] + $s + $offset;
	}

	// no boundary character nearby: make sure point is on a valid first UTF8 char
	while ( $point < $textLen ) {
		$char = ord( $text[$point] );
		if ( $char < 0x80 || $char >= 0xc0 ) {
			break;
		}
		// skip trailing bytes
		$point++;
	}

	return $point;
}
425 
/**
 * Collect snippets from the lines of $extracts that match $pattern.
 *
 * Lines that already have an entry in $out are skipped. Stops as soon as
 * $linesleft reaches 0.
 *
 * @param string $pattern PCRE pattern to look for
 * @param string[] $extracts Lines to search, keyed by sequence number
 * @param int &$linesleft Remaining number of snippets; decremented per snippet
 * @param int &$contextchars Width of the window cut around a match (only read here)
 * @param string[] &$out Receives the snippets, keyed like $extracts
 * @param int[] &$offsets Receives the start position of each snippet in its line
 */
private function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		return; // budget already used up
	}
	foreach ( $extracts as $index => $line ) {
		$hit = [];
		// skip lines that already have a snippet and lines without a match
		if ( array_key_exists( $index, $out )
			|| !preg_match( $pattern, $line, $hit, PREG_OFFSET_CAPTURE )
		) {
			continue;
		}

		[ $matched, $matchPos ] = $hit[0];
		$matchLen = strlen( $matched );
		if ( $matchPos + $matchLen < $contextchars ) {
			// the match fits into a window starting at the beginning of the line
			$from = 0;
		} elseif ( $matchLen > $contextchars ) {
			// the match is wider than the window: start at the match
			$from = $matchPos;
		} else {
			// centre the window on the match
			$from = $matchPos + intval( ( $matchLen - $contextchars ) / 2 );
		}

		// basic snippet from this line
		$snippetStart = $from;
		$out[$index] = $this->extract( $line, $from, $from + $contextchars, $snippetStart );
		$offsets[$index] = $snippetStart;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
472 
/**
 * Strip common wikitext markup from $text and HTML-escape the result.
 *
 * Applied in order: drop templates without parameters, keep the part after
 * the first pipe of templates, unwrap unpiped links, reduce piped links via
 * linkReplace(), remove HTML-like tags and quote-based bold/italic markup.
 *
 * @param string $text
 * @return string Escaped text as returned by Sanitizer::escapeHtmlAllowEntities()
 */
private function removeWiki( $text ) {
	// templates and unpiped links
	$bracketRules = [
		"/\\{\\{([^|]+?)\\}\\}/" => "",
		"/\\{\\{([^|]+\\|)(.*?)\\}\\}/" => "\\2",
		"/\\[\\[([^|]+?)\\]\\]/" => "\\1",
	];
	foreach ( $bracketRules as $pattern => $replacement ) {
		$text = preg_replace( $pattern, $replacement, $text );
	}

	// piped links need a namespace lookup, hence the callback
	$text = preg_replace_callback(
		"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
		[ $this, 'linkReplace' ],
		$text
	);

	// tags and quote markup, all simply removed
	$stripPatterns = [
		"/<\/?[^>]+>/",
		"/'''''/",
		"/('''|<\/?[iIuUbB]>)/",
		"/''/",
	];
	foreach ( $stripPatterns as $pattern ) {
		$text = preg_replace( $pattern, "", $text );
	}

	// Note, the previous /<\/?[^>]+>/ is insufficient
	// for XSS safety as the HTML tag can span multiple
	// search results (T144845).
	return Sanitizer::escapeHtmlAllowEntities( $text );
}
498 
/**
 * preg_replace_callback handler for piped links in removeWiki().
 *
 * Links whose prefix resolves to the File or Category namespace are kept
 * as they are; every other link is reduced to its caption.
 *
 * @param string[] $matches 0: whole link, 1: target including the pipe, 2: caption
 * @return string
 */
private function linkReplace( $matches ) {
	[ $whole, $target, $caption ] = $matches;

	$colon = strpos( $target, ':' );
	if ( $colon !== false ) {
		$nsIndex = MediaWikiServices::getInstance()->getContentLanguage()
			->getNsIndex( substr( $target, 0, $colon ) );
		if ( in_array( $nsIndex, [ NS_FILE, NS_CATEGORY ], true ) ) {
			return $whole; // return the whole thing
		}
	}

	return $caption; // replace with caption
}
519 
/**
 * Simple & fast snippet extraction, but gives completely irrelevant snippets.
 *
 * Takes the first $contextlines lines of $text that contain any term,
 * truncates the text on both sides of the match and wraps every term
 * occurrence in <span class="searchmatch">.
 *
 * @param string $text
 * @param string[] $terms Joined with "|" and interpolated unescaped into
 *   PCRE patterns, so presumably already regex fragments - verify against callers
 * @param int $contextlines Maximum number of lines to return
 * @param int $contextchars Characters of context kept on each side of a match
 * @return string HTML, one "\n"-terminated line per snippet
 */
public function highlightSimple(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$lines = explode( "\n", $text );

	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	$pat1 = "/(.*)($terms)(.{0,$max})/ui";
	$pat2 = '/(' . $terms . ')/ui'; // loop invariant

	$extract = '';
	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	foreach ( $lines as $line ) {
		if ( $contextlines == 0 ) {
			break;
		}
		$m = [];
		if ( !preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		--$contextlines;
		// truncate function changes ... to relevant i18n message.
		$pre = $contLang->truncateForVisual( $m[1], -$contextchars, '...', false );
		$post = isset( $m[3] )
			? $contLang->truncateForVisual( $m[3], $contextchars, '...', false )
			: '';
		$found = $m[2];

		// Locate the terms in the raw text and escape each piece on its own.
		// Matching after htmlspecialchars() would let a term such as "lt" or
		// "amp" hit the inside of an entity and break it.
		$snippet = $pre . $found . $post;
		$html = '';
		$cursor = 0;
		$hits = [];
		if ( preg_match_all( $pat2, $snippet, $hits, PREG_OFFSET_CAPTURE ) ) {
			foreach ( $hits[0] as [ $hit, $pos ] ) {
				if ( $hit === '' || $pos < $cursor ) {
					continue; // nothing to highlight
				}
				$html .= htmlspecialchars( substr( $snippet, $cursor, $pos - $cursor ) )
					. '<span class="searchmatch">' . htmlspecialchars( $hit ) . '</span>';
				$cursor = $pos + strlen( $hit );
			}
		}
		$html .= htmlspecialchars( substr( $snippet, $cursor ) );

		$extract .= "{$html}\n";
	}

	return $extract;
}
575 
/**
 * Returns the first few lines of the text.
 *
 * @param string $text
 * @param int $contextlines Maximum number of lines to return
 * @param int $contextchars Average number of bytes per line; the result is
 *   limited to $contextlines * $contextchars bytes before escaping
 * @return string HTML-escaped text with lines separated by <br>
 */
public function highlightNone(
	$text,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	// the line count is interpolated into a regex quantifier below
	$contextlines = max( 0, (int)$contextlines );
	$limit = max( 0, $contextlines * (int)$contextchars );

	$match = [];
	$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
	// remove empty lines; a single str_replace( "\n\n", "\n" ) pass would
	// leave blank lines behind for runs of three or more newlines
	$text = preg_replace( "/\n{2,}/", "\n", $text ) ?? $text;
	preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

	// Trim and limit to max number of bytes. mb_strcut() never splits a
	// multi-byte character; a half character would make htmlspecialchars()
	// return an empty string for the whole snippet.
	$snippet = mb_strcut( trim( $match[0] ?? '' ), 0, $limit, 'UTF-8' );
	$text = htmlspecialchars( $snippet );
	return str_replace( "\n", '<br>', $text );
}
598 }
const NS_FILE
Definition: Defines.php:70
const NS_CATEGORY
Definition: Defines.php:78
$matches
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
HTML sanitizer for MediaWiki.
Definition: Sanitizer.php:46
Highlight bits of wikitext.
highlightText( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
__construct( $cleanupWikitext=true)
highlightSimple( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
highlightNone( $text, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Returns the first few lines of the text.
if(!file_exists( $CREDITS)) $lines