MediaWiki  master
SearchHighlighter.php
Go to the documentation of this file.
1 <?php
26 
/** Number of context lines used when a caller does not specify one. */
public const DEFAULT_CONTEXT_LINES = 2;

/** Number of context characters per line used when a caller does not specify one. */
public const DEFAULT_CONTEXT_CHARS = 75;

/**
 * Whether wiki markup is stripped (and the text HTML-escaped) by
 * removeWiki() before lines are added to the extract lists.
 *
 * @var bool
 */
protected $mCleanWikitext = true;

/**
 * @param bool $cleanupWikitext Value stored in $mCleanWikitext; when true,
 *  splitAndAdd() passes text through removeWiki() before splitting it.
 */
public function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}
50 
/**
 * Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
 *
 * Splits the text into plain-text lines and "other" lines (templates,
 * file links, tables and, when the Cite extension is loaded, references),
 * picks the lines that match the search terms and returns them as one
 * snippet with matches wrapped in <span class='searchmatch'>.
 *
 * @param string $text Wikitext to build the snippet from
 * @param string[] $terms Search terms. They are interpolated into regular
 *  expressions without escaping, so each one must already be a valid
 *  regex fragment.
 * @param int $contextlines Maximum number of lines to pick matches from
 * @param int $contextchars Approximate number of characters per line
 * @return string HTML snippet; empty string when $text is empty
 */
public function highlightText(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	if ( $text == '' ) {
		return '';
	}

	// Only look the configuration up once we know there is work to do.
	$searchHighlightBoundaries = MediaWikiServices::getInstance()
		->getMainConfig()->get( MainConfigNames::SearchHighlightBoundaries );

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = [
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

	// @todo FIXME: This should prolly be a hook or something
	// instead of hardcoding the name of the Cite extension
	if ( \ExtensionRegistry::getInstance()->isLoaded( 'Cite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = []; // text extracts
	$otherExt = []; // other extracts
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				// With PREG_OFFSET_CAPTURE a group that did not take part
				// in the match is reported with offset -1.
				if ( $key > 0 && $val[1] != -1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, -1 );
						if (
							MediaWikiServices::getInstance()->getContentLanguage()->
							getNsIndex( $ns ) !== NS_FILE
						) {
							// Not a file link: leave $epat empty so the rest of
							// the text is added as a plain text extract below.
							break;
						}

					}
					$epat = $endPatterns[$key];
					// Everything before the construct is ordinary text.
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					// Group 2 is the closing token; it is only present in
					// $endMatches when the closing alternative matched.
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level -= 1;
						}
					} else {
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( !$found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}
	'@phan-var string[] $textExt';

	$all = $textExt + $otherExt; // these have disjunct key sets

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback(
				'/./us',
				[ $this, 'caseCallback' ],
				$terms[$index]
			);
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "{$searchHighlightBoundaries}+", $terms );
	// @todo FIXME: A hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	// With no terms (or only empty ones) $anyterm is empty; leave the
	// scale at 1 instead of dividing by zero.
	$anytermChars = mb_strlen( $anyterm );
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|{$searchHighlightBoundaries})";
	$patPost = "({$searchHighlightBoundaries}|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	$left = $contextlines;

	$snippets = [];
	$offsets = [];

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		// Skip lines that start with ';' or ':'.
		if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if ( $firstText ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( !$snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = [];
	// Always defined; it is only read when $snippets is non-empty, and every
	// path that leaves $snippets non-empty assigns the real value below.
	$targetchars = 0;
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = [ $first => $snippets[$first] ];
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract(
					$all[$index],
					$offsets[$index],
					$offsets[$index] + $targetchars,
					$offsets[$index]
				);
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	// $snippets = array_map( 'htmlspecialchars', $extended );
	$snippets = $extended;
	$last = -1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == -1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index
			&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
		) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	if ( $extract ) {
		$extract .= '<b> ... </b>';
	}

	// Wrap each distinct term once; $processed avoids re-running a term
	// that appears more than once in $terms.
	$processed = [];
	foreach ( $terms as $term ) {
		if ( !isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	return $extract;
}
319 
/**
 * Split text into lines and append the non-empty ones to the extracts.
 *
 * When $mCleanWikitext is set the text is first passed through
 * removeWiki(). Each line is trimmed; lines that are empty after trimming
 * are dropped.
 *
 * @param string[] &$extracts Extract list to append to, keyed by sequence number
 * @param int &$count Next sequence number; incremented for every line added
 * @param string $text Text to split
 */
private function splitAndAdd( &$extracts, &$count, $text ) {
	$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
	foreach ( $split as $line ) {
		$tt = trim( $line );
		// Compare against '' explicitly: a plain truthiness test would also
		// discard a line consisting only of "0".
		if ( $tt !== '' ) {
			$extracts[$count++] = $tt;
		}
	}
}
336 
/**
 * preg_replace_callback() handler used to make a term case-insensitive
 * for multibyte characters.
 *
 * A match longer than one byte is replaced by a character class holding
 * its lower-case and upper-case forms in the content language; a
 * single-byte match is returned unchanged.
 *
 * @param array $matches Match array; only $matches[0] is used
 * @return string
 */
private function caseCallback( $matches ) {
	$char = $matches[0];
	if ( strlen( $char ) <= 1 ) {
		return $char;
	}

	$lang = MediaWikiServices::getInstance()->getContentLanguage();
	return '[' . $lang->lc( $char ) . $lang->uc( $char ) . ']';
}
352 
/**
 * Cut the part of $text between two byte offsets, after moving both
 * offsets with position().
 *
 * A start of 0 and an end at or beyond the end of the text are used as
 * they are; any other offset is adjusted by position().
 *
 * @param string $text
 * @param int $start Requested start offset
 * @param int $end Requested end offset
 * @param int|null &$posStart Receives the adjusted start offset, but only
 *  when the caller passed a non-null value
 * @param int|null &$posEnd Receives the adjusted end offset, but only
 *  when the caller passed a non-null value
 * @return string The cut text, or '' when the adjusted end is not after
 *  the adjusted start
 */
private function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	$byteLen = strlen( $text );

	$from = $start != 0 ? $this->position( $text, $start, 1 ) : $start;
	$to = $end >= $byteLen ? $byteLen : $this->position( $text, $end );

	if ( $posStart !== null ) {
		$posStart = $from;
	}
	if ( $posEnd !== null ) {
		$posEnd = $to;
	}

	return $to > $from ? substr( $text, $from, $to - $from ) : '';
}
386 
/**
 * Find a byte offset near $point at which the text can be cut.
 *
 * Looks for the first space or punctuation character in the window of
 * $tolerance bytes on either side of $point. When there is none, $point
 * is moved forward past any UTF-8 continuation bytes so that a multibyte
 * character is not split.
 *
 * @param string $text
 * @param int $point Byte offset to start from
 * @param int $offset Added to the result when a boundary character is found
 * @return int Byte offset into $text
 */
private function position( $text, $point, $offset = 0 ) {
	$tolerance = 10;
	$textLen = strlen( $text );
	$s = max( 0, $point - $tolerance );
	$l = min( $textLen, $point + $tolerance ) - $s;
	$m = [];

	// The pattern is matched byte-wise (no /u), so the class must contain
	// ASCII characters only; any multibyte character in it would add its
	// individual bytes and let the match land inside a UTF-8 sequence.
	if ( preg_match(
		'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
		substr( $text, $s, $l ),
		$m,
		PREG_OFFSET_CAPTURE
	) ) {
		return $m[0][1] + $s + $offset;
	}

	// Nothing to inspect at or beyond the end of the text.
	if ( $point >= $textLen ) {
		return $textLen;
	}

	// check if point is on a valid first UTF8 char
	$char = ord( $text[$point] );
	while ( $char >= 0x80 && $char < 0xc0 ) {
		// skip trailing bytes
		$point++;
		if ( $point >= $textLen ) {
			return $textLen;
		}
		$char = ord( $text[$point] );
	}

	return $point;
}
424 
/**
 * Search the extracts for a pattern and record a snippet for every line
 * that matches, until the line budget is used up.
 *
 * Lines that already have an entry in $out are skipped. The snippet
 * window is $contextchars wide and is placed around the match before
 * being handed to extract().
 *
 * @param string $pattern Regular expression to look for
 * @param string[] $extracts Lines to search, keyed by sequence number
 * @param int &$linesleft Remaining number of lines to add; decremented per hit
 * @param int &$contextchars Width of the snippet window
 * @param string[] &$out Receives the snippets, keyed like $extracts
 * @param int[] &$offsets Receives the start offset of each snippet
 */
private function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		// budget already exhausted
		return;
	}

	foreach ( $extracts as $key => $candidate ) {
		$hit = [];
		if ( array_key_exists( $key, $out )
			|| !preg_match( $pattern, $candidate, $hit, PREG_OFFSET_CAPTURE )
		) {
			// already highlighted, or no match on this line
			continue;
		}

		[ $matched, $matchPos ] = $hit[0];
		$matchLen = strlen( $matched );

		if ( $matchPos + $matchLen < $contextchars ) {
			// the match ends inside the first window: start at the beginning
			$from = 0;
		} elseif ( $matchLen > $contextchars ) {
			// the match is wider than the window: start at the match
			$from = $matchPos;
		} else {
			// centre the window on the match
			$from = $matchPos + intval( ( $matchLen - $contextchars ) / 2 );
		}

		// basic snippet from this line; extract() reports where it really starts
		$actualStart = $from;
		$out[$key] = $this->extract( $candidate, $from, $from + $contextchars, $actualStart );
		$offsets[$key] = $actualStart;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
471 
/**
 * Strip wiki markup from a piece of text and HTML-escape the result.
 *
 * The substitutions run in a fixed order: templates, links, HTML-like
 * tags, then bold/italic quote markup.
 *
 * @param string $text
 * @return string
 */
private function removeWiki( $text ) {
	// Templates and plain links, applied one after another in this order.
	$markupRules = [
		"/\\{\\{([^|]+?)\\}\\}/" => "",
		"/\\{\\{([^|]+\\|)(.*?)\\}\\}/" => "\\2",
		"/\\[\\[([^|]+?)\\]\\]/" => "\\1",
	];
	foreach ( $markupRules as $pattern => $replacement ) {
		$text = preg_replace( $pattern, $replacement, $text );
	}

	// Piped links are handled by linkReplace().
	$text = preg_replace_callback(
		"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
		[ $this, 'linkReplace' ],
		$text
	);

	// Tags and quote markup are simply removed.
	$strippedPatterns = [
		"/<\/?[^>]+>/",
		"/'''''/",
		"/('''|<\/?[iIuUbB]>)/",
		"/''/",
	];
	foreach ( $strippedPatterns as $pattern ) {
		$text = preg_replace( $pattern, "", $text );
	}

	// Note, the /<\/?[^>]+>/ removal above is insufficient
	// for XSS safety as the HTML tag can span multiple
	// search results (T144845).
	return Sanitizer::escapeHtmlAllowEntities( $text );
}
497 
/**
 * preg_replace_callback() handler for piped links.
 *
 * A link whose prefix resolves to the File or Category namespace is
 * returned unchanged; every other link is replaced by its caption.
 *
 * @param array $matches [ 0 => whole link, 1 => target including the pipe, 2 => caption ]
 * @return string
 */
private function linkReplace( $matches ) {
	[ $wholeLink, $target, $caption ] = $matches;

	$colonPos = strpos( $target, ':' );
	if ( $colonPos !== false ) {
		$nsIndex = MediaWikiServices::getInstance()->getContentLanguage()
			->getNsIndex( substr( $target, 0, $colonPos ) );
		if ( $nsIndex === NS_FILE || $nsIndex === NS_CATEGORY ) {
			// return the whole thing
			return $wholeLink;
		}
	}

	// replace with caption
	return $caption;
}
518 
/**
 * Simple & fast snippet extraction, but gives completely irrelevant snippets.
 *
 * Goes through the text line by line and, for each line that contains one
 * of the terms, emits the match with some context before and after it,
 * HTML-escaped and with every term occurrence wrapped in
 * <span class='searchmatch'>.
 *
 * @param string $text
 * @param string[] $terms Search terms. They are interpolated into regular
 *  expressions without escaping, so each one must already be a valid
 *  regex fragment.
 * @param int $contextlines Maximum number of matching lines to return
 * @param int $contextchars Number of characters of context on each side
 * @return string Snippet lines, each terminated by "\n"
 */
public function highlightSimple(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$lines = explode( "\n", $text );

	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	// The "u" modifier makes "." and the {0,$max} quantifier work on
	// characters rather than bytes, so a multibyte character is never cut
	// in half, and lets "i" fold case for non-ASCII letters.
	$pat1 = "/(.*)($terms)(.{0,$max})/ui";
	$pat2 = '/(' . $terms . ")/ui";

	$extract = "";
	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	foreach ( $lines as $line ) {
		if ( $contextlines == 0 ) {
			break;
		}
		$m = [];
		if ( !preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		--$contextlines;
		// truncate function changes ... to relevant i18n message.
		$pre = $contLang->truncateForVisual( $m[1], -$contextchars, '...', false );
		// Group 3 can match the empty string, so it is always present
		// once the pattern has matched.
		$post = $contLang->truncateForVisual( $m[3], $contextchars, '...', false );

		$found = $m[2];

		$line = htmlspecialchars( $pre . $found . $post );
		// NOTE(review): the terms are matched against the already escaped
		// line, so a term can also match inside an entity such as "&amp;".
		// Keep the escaped line if the replacement fails (null result).
		$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line ) ?? $line;

		$extract .= "{$line}\n";
	}

	return $extract;
}
574 
/**
 * Returns the first few lines of the text.
 *
 * Empty lines are removed, the first $contextlines lines are kept, the
 * result is limited to $contextlines * $contextchars bytes, HTML-escaped,
 * and line breaks are turned into <br>.
 *
 * @param string $text
 * @param int $contextlines Maximum number of lines to return
 * @param int $contextchars Average number of bytes per line
 * @return string HTML
 */
public function highlightNone(
	$text,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	// The line count goes into a regex quantifier, so it must be a
	// non-negative integer.
	$contextlines = max( 0, (int)$contextlines );

	$match = [];
	$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
	// remove empty lines; a run of any length collapses to a single newline
	$text = preg_replace( '/\n{2,}/', "\n", $text ) ?? $text;
	preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

	// Trim and limit to max number of bytes. mb_strcut() never cuts inside
	// a multibyte character, unlike substr().
	$firstLines = trim( $match[0] ?? '' );
	$text = htmlspecialchars(
		mb_strcut( $firstLines, 0, $contextlines * $contextchars, 'UTF-8' )
	);
	return str_replace( "\n", '<br>', $text );
}
597 }
const NS_FILE
Definition: Defines.php:70
const NS_CATEGORY
Definition: Defines.php:78
$matches
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1122
Highlight bits of wikitext.
highlightText( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
__construct( $cleanupWikitext=true)
highlightSimple( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
highlightNone( $text, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Returns the first few lines of the text.
foreach( $mmfl['setupFiles'] as $fileName) if( $queue) if(empty( $mmfl['quiet'])) $s
if(!file_exists( $CREDITS)) $lines