MediaWiki  master
SearchHighlighter.php
Go to the documentation of this file.
1 <?php
25 
34 
/**
 * @var bool When true, splitAndAdd() passes every extract through
 *  removeWiki() before storing it.
 */
protected $mCleanWikitext = true;

/**
 * @param bool $cleanupWikitext Whether extracts should be stripped of
 *  wikitext markup (see removeWiki()) before they are searched. Defaults to true.
 */
public function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}
45 
/**
 * Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
 *
 * The text is split into plain-text lines and "other" lines (templates,
 * image links, tables and, when the Cite extension is loaded, references).
 * Snippets are then chosen around the matched terms, padded to a roughly
 * constant size, joined with '<b> ... </b>' and the terms are wrapped in
 * <span class='searchmatch'>.
 *
 * @param string $text Wikitext to build the snippet from
 * @param string[] $terms Terms to highlight. Each term is inserted verbatim
 *  into regular expressions, so callers must pass regex-safe fragments.
 * @param int $contextlines Maximum number of lines to pick snippets from
 * @param int $contextchars Amount of context per line; measured in bytes
 *  after being scaled by the byte/character ratio of the terms
 * @return string Snippet markup. Extracts are HTML-escaped only when
 *  wikitext cleanup is enabled (see removeWiki()).
 */
public function highlightText(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	// Regex fragment matching word boundaries; it is interpolated into every
	// pattern built below, so it must be in scope here.
	global $wgSearchHighlightBoundaries;

	if ( $text == '' ) {
		return '';
	}

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = [
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

	// @todo FIXME: This should prolly be a hook or something
	// instead of hardcoding the name of the Cite extension
	if ( \ExtensionRegistry::getInstance()->isLoaded( 'Cite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = []; // text extracts
	$otherExt = []; // other extracts
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				// key 0 is the whole match; an offset of -1 marks a group that did not take part
				if ( $key > 0 && $val[1] != -1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, -1 );
						if (
							MediaWikiServices::getInstance()->getContentLanguage()->
							getNsIndex( $ns ) != NS_FILE
						) {
							// not an image: leave $epat empty so the rest is taken as plain text
							break;
						}

					}
					$epat = $endPatterns[$key];
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level -= 1;
						}
					} else {
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( !$found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}
	'@phan-var string[] $textExt';

	$all = $textExt + $otherExt; // these have disjunct key sets

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it;
		// pure-ASCII terms are left untouched
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback(
				'/./us',
				[ $this, 'caseCallback' ],
				$terms[$index]
			);
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
	// @todo FIXME: A hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	// An empty term list yields an empty $anyterm; keep the scale at 1
	// instead of dividing by zero.
	$anytermChars = mb_strlen( $anyterm );
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|$wgSearchHighlightBoundaries)";
	$patPost = "($wgSearchHighlightBoundaries|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	$left = $contextlines;

	$snippets = [];
	$offsets = [];

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if ( $firstText ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( !$snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = [];
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = [ $first => $snippets[$first] ];
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract(
					$all[$index],
					$offsets[$index],
					$offsets[$index] + $targetchars,
					$offsets[$index]
				);
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	// $snippets = array_map( 'htmlspecialchars', $extended );
	$snippets = $extended;
	$last = -1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == -1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index
			&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
		) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	if ( $extract ) {
		$extract .= '<b> ... </b>';
	}

	// highlight each distinct term once
	$processed = [];
	foreach ( $terms as $term ) {
		if ( !isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	return $extract;
}
305 
/**
 * Split text into lines and add it to extracts array.
 *
 * Each line is trimmed; lines that are empty after trimming are skipped.
 * When wikitext cleanup is enabled the text is run through removeWiki()
 * before it is split.
 *
 * @param string[] &$extracts Extracts collected so far, keyed by sequence number
 * @param int &$count Next sequence number; incremented for every line added
 * @param string $text Text to split on "\n"
 */
function splitAndAdd( &$extracts, &$count, $text ) {
	$cleaned = $this->mCleanWikitext ? $this->removeWiki( $text ) : $text;
	foreach ( explode( "\n", $cleaned ) as $line ) {
		$trimmed = trim( $line );
		// Compare against '' explicitly: a plain truthiness test would also
		// discard a line consisting only of "0".
		if ( $trimmed !== '' ) {
			$extracts[$count++] = $trimmed;
		}
	}
}
322 
/**
 * Do manual case conversion for non-ascii chars.
 *
 * Used as a preg_replace_callback() handler: a multi-byte match is replaced
 * by a character class holding its lower- and upper-case forms, while a
 * single-byte match is returned unchanged.
 *
 * @param array $matches Match array; only element 0 is read
 * @return string
 */
function caseCallback( $matches ) {
	$char = $matches[0];
	if ( strlen( $char ) <= 1 ) {
		return $char;
	}

	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	$lower = $contLang->lc( $char );
	$upper = $contLang->uc( $char );
	return '[' . $lower . $upper . ']';
}
338 
/**
 * Extract part of the text from start to end, but by not chopping up words.
 *
 * Both ends are moved by position(); a start of 0 and an end at or past
 * the end of the text are used as they are.
 *
 * @param string $text
 * @param int $start Requested start offset
 * @param int $end Requested end offset
 * @param int|null &$posStart Receives the adjusted start, but only when the
 *  caller passed a non-null value
 * @param int|null &$posEnd Receives the adjusted end, but only when the
 *  caller passed a non-null value
 * @return string The extracted part, or '' if the adjusted range is empty
 */
function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	if ( $start != 0 ) {
		// offset 1: begin just after the non-letter that position() finds
		$start = $this->position( $text, $start, 1 );
	}

	$textLength = strlen( $text );
	$end = $end >= $textLength ? $textLength : $this->position( $text, $end );

	if ( $posStart !== null ) {
		$posStart = $start;
	}
	if ( $posEnd !== null ) {
		$posEnd = $end;
	}

	return $end > $start ? substr( $text, $start, $end - $start ) : '';
}
372 
/**
 * Find a nonletter near a point (index) in the text.
 *
 * A window of 10 bytes either side of $point is searched for the first
 * space or punctuation character. If none is found, $point itself is used,
 * advanced past any UTF-8 continuation bytes so a character is never split.
 *
 * @param string $text
 * @param int $point Byte offset to search around
 * @param int $offset Added to the result when a nonletter was found
 *  (ignored in the fallback case)
 * @return int Byte offset into $text
 */
function position( $text, $point, $offset = 0 ) {
	$tolerance = 10;
	$textLength = strlen( $text );
	$s = max( 0, $point - $tolerance );
	$l = min( $textLength, $point + $tolerance ) - $s;
	$m = [];

	if ( preg_match(
		'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
		substr( $text, $s, $l ),
		$m,
		PREG_OFFSET_CAPTURE
	) ) {
		return $m[0][1] + $s + $offset;
	}

	// Nothing to read at or beyond the end of the text (this includes the
	// empty string); indexing there would raise an uninitialized-offset
	// warning and hand back an out-of-range position.
	if ( $point >= $textLength ) {
		return $textLength;
	}

	// check if point is on a valid first UTF8 char
	$char = ord( $text[$point] );
	while ( $char >= 0x80 && $char < 0xc0 ) {
		// skip trailing bytes
		$point++;
		if ( $point >= $textLength ) {
			return $textLength;
		}
		$char = ord( $text[$point] );
	}

	return $point;
}
410 
/**
 * Search extracts for a pattern, and return snippets.
 *
 * For each extract that matches and has no snippet yet, a snippet of about
 * $contextchars bytes around the match is stored in $out and its start
 * offset in $offsets. Stops once $linesleft reaches zero.
 *
 * @param string $pattern Regexp for matching lines
 * @param array $extracts Extracts to search, keyed by sequence number
 * @param int &$linesleft Number of extracts still to make; decremented per snippet
 * @param int &$contextchars Length of snippet
 * @param array &$out Map for highlighted snippets
 * @param array &$offsets Map of starting points of snippets
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		return; // nothing to do
	}

	foreach ( $extracts as $index => $line ) {
		// this line already highlighted
		if ( array_key_exists( $index, $out ) ) {
			continue;
		}

		$hit = [];
		if ( !preg_match( $pattern, $line, $hit, PREG_OFFSET_CAPTURE ) ) {
			continue;
		}

		// with PREG_OFFSET_CAPTURE every entry is [ matched text, byte offset ]
		list( $matched, $offset ) = $hit[0];
		$len = strlen( $matched );

		if ( $offset + $len < $contextchars ) {
			// match ends inside the first window: start from the beginning
			$begin = 0;
		} elseif ( $len > $contextchars ) {
			// match is longer than the window: start at the match
			$begin = $offset;
		} else {
			// centre the window on the match
			$begin = $offset + intval( ( $len - $contextchars ) / 2 );
		}

		// basic snippet from this line
		$posBegin = $begin;
		$out[$index] = $this->extract( $line, $begin, $begin + $contextchars, $posBegin );
		$offsets[$index] = $posBegin;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
458 
/**
 * Basic wikitext removal.
 *
 * Strips templates, unwraps links, removes HTML-like tags and
 * bold/italic quote markup, then HTML-escapes the result.
 *
 * @param string $text
 * @return string
 */
function removeWiki( $text ) {
	// Patterns in a list are applied one after another, in the order given.
	$text = preg_replace(
		[
			"/\\{\\{([^|]+?)\\}\\}/",
			"/\\{\\{([^|]+\\|)(.*?)\\}\\}/",
			"/\\[\\[([^|]+?)\\]\\]/",
		],
		[
			"",
			"\\2",
			"\\1",
		],
		$text
	);

	// [[target|caption]] links need a namespace lookup, hence the callback
	$text = preg_replace_callback(
		"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
		[ $this, 'linkReplace' ],
		$text
	);

	$text = preg_replace(
		[
			"/<\/?[^>]+>/",
			"/'''''/",
			"/('''|<\/?[iIuUbB]>)/",
			"/''/",
		],
		"",
		$text
	);

	// Note, the previous /<\/?[^>]+>/ is insufficient
	// for XSS safety as the HTML tag can span multiple
	// search results (T144845).
	return Sanitizer::escapeHtmlAllowEntities( $text );
}
485 
/**
 * callback to replace [[target|caption]] kind of links, if
 * the target is category or image, leave it
 *
 * @param array $matches Element 0 is the whole link, 1 the target part
 *  and 2 the caption
 * @return string
 */
function linkReplace( $matches ) {
	list( $whole, $target, $caption ) = $matches;

	$colon = strpos( $target, ':' );
	if ( $colon !== false ) {
		$ns = substr( $target, 0, $colon );
		$index = MediaWikiServices::getInstance()->getContentLanguage()->getNsIndex( $ns );
		$keepLink = $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY );
		if ( $keepLink ) {
			return $whole; // return the whole thing
		}
	}

	return $caption; // replace with caption
}
506 
/**
 * Simple & fast snippet extraction, but gives completely irrelevant
 * snippets.
 *
 * Scans the text line by line and, for up to $contextlines matching lines,
 * emits the matched term with truncated context on both sides. Each line is
 * HTML-escaped before the terms are wrapped in <span class='searchmatch'>.
 *
 * @param string $text
 * @param string[] $terms Terms to look for. They are joined with '|' and
 *  inserted verbatim into regular expressions, so callers must pass
 *  regex-safe fragments.
 * @param int $contextlines Maximum number of matching lines to return
 * @param int $contextchars Characters of context to keep on each side
 * @return string Matching lines, each terminated by "\n"
 */
public function highlightSimple(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$lines = explode( "\n", $text );

	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	$pat1 = "/(.*)($terms)(.{0,$max})/i";
	$pat2 = '/(' . $terms . ")/i";

	$extract = "";
	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	foreach ( $lines as $line ) {
		if ( $contextlines == 0 ) {
			break;
		}
		$m = [];
		if ( !preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		--$contextlines;
		// truncate function changes ... to relevant i18n message.
		$pre = $contLang->truncateForVisual( $m[1], - $contextchars, '...', false );

		// guard the group that is actually read
		if ( isset( $m[3] ) ) {
			$post = $contLang->truncateForVisual( $m[3], $contextchars, '...', false );
		} else {
			$post = '';
		}

		$found = $m[2];

		$line = htmlspecialchars( $pre . $found . $post );
		$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

		// "{$var}" form: the "${var}" interpolation syntax is deprecated as of PHP 8.2
		$extract .= "{$line}\n";
	}

	return $extract;
}
565 
/**
 * Returns the first few lines of the text.
 *
 * Empty lines are dropped, the first $contextlines lines are kept, and the
 * result is limited to $contextlines * $contextchars bytes without cutting
 * a UTF-8 character in half.
 *
 * @param string $text
 * @param int $contextlines Max number of returned lines
 * @param int $contextchars Average number of characters per line
 * @return string HTML-escaped text with lines separated by <br>
 */
public function highlightNone(
	$text,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$match = [];
	$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
	// remove empty lines; collapse whole runs, since a single "\n\n" -> "\n"
	// pass leaves blank lines behind when three or more newlines are adjacent
	$text = preg_replace( "/\n{2,}/", "\n", $text );
	preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

	// Trim and limit to max number of chars. mb_strcut() takes a byte length
	// like substr() but never splits a multi-byte character, which would
	// otherwise give htmlspecialchars() invalid UTF-8.
	$lead = trim( $match[0] ?? '' );
	$text = htmlspecialchars( mb_strcut( $lead, 0, $contextlines * $contextchars, 'UTF-8' ) );
	return str_replace( "\n", '<br>', $text );
}
588 }
splitAndAdd(&$extracts, &$count, $text)
Split text into lines and add it to extracts array.
extract( $text, $start, $end, &$posStart=null, &$posEnd=null)
Extract part of the text from start to end, but by not chopping up words.
Highlight bits of wikitext.
$wgSearchHighlightBoundaries
Regexp to match word boundaries; the default suits non-CJK languages and should be empty for CJK, since the words are not separate.
removeWiki( $text)
Basic wikitext removal.
const NS_CATEGORY
Definition: Defines.php:74
const NS_FILE
Definition: Defines.php:66
highlightSimple( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
highlightText( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
$lines
Definition: router.php:61
position( $text, $point, $offset=0)
Find a nonletter near a point (index) in the text.
process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets)
Search extracts for a pattern, and return snippets.
linkReplace( $matches)
callback to replace [[target|caption]] kind of links, if the target is category or image...
highlightNone( $text, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Returns the first few lines of the text.
caseCallback( $matches)
Do manual case conversion for non-ascii chars.
__construct( $cleanupWikitext=true)
$line
Definition: mcc.php:119
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1395
$matches