MediaWiki  master
SearchHighlighter.php
Go to the documentation of this file.
1 <?php
25 
/** Default value of the $contextlines parameter of highlightText(), highlightSimple() and highlightNone(). */
35  public const DEFAULT_CONTEXT_LINES = 2;
/** Default value of the $contextchars parameter of highlightText(), highlightSimple() and highlightNone(). */
36  public const DEFAULT_CONTEXT_CHARS = 75;
37 
/** When true, splitAndAdd() passes text through removeWiki() before splitting it into lines. */
38  protected $mCleanWikitext = true;
39 
/**
 * Create a highlighter.
 *
 * @param bool $cleanupWikitext Stored in $mCleanWikitext; when true,
 *  splitAndAdd() strips basic wikitext markup via removeWiki() before
 *  splitting text into lines.
 */
public function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}
49 
/**
 * Wikitext highlighting: build an HTML snippet from $text with the search
 * terms marked up.
 *
 * The text is split into plain-text lines and "other" extracts (templates,
 * file links, tables and, when the Cite extension is loaded, references).
 * If the beginning of the text contains every term, it becomes the snippet.
 * Otherwise lines matching the whole phrase, then lines matching any single
 * term, are collected (plain text before other extracts) until
 * $contextlines lines have been found. Snippets are then extended towards a
 * common size, joined with "<b> ... </b>" separators, and every term is
 * wrapped in <span class='searchmatch'>.
 *
 * @param string $text Wikitext to build the snippet from
 * @param string[] $terms Search terms; they are interpolated into regular
 *  expressions unescaped, so they must already be valid regex fragments
 * @param int $contextlines Maximum number of matching lines to collect
 * @param int $contextchars Size of the context around each match; scaled by
 *  the bytes-per-character ratio of the terms before use
 * @return string HTML snippet, or '' when $text is empty
 */
public function highlightText(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	// The boundary regexp is a global setting; without this declaration the
	// interpolations below would read an undefined local variable.
	global $wgSearchHighlightBoundaries;

	if ( $text == '' ) {
		return '';
	}

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = [
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

	// @todo FIXME: This should prolly be a hook or something
	// instead of hardcoding the name of the Cite extension
	if ( \ExtensionRegistry::getInstance()->isLoaded( 'Cite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = []; // text extracts
	$otherExt = []; // other extracts
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				// groups that did not participate in the match have offset -1
				if ( $key > 0 && $val[1] != -1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, -1 );
						if (
							MediaWikiServices::getInstance()->getContentLanguage()->
							getNsIndex( $ns ) !== NS_FILE
						) {
							// not a file link: $epat stays empty, so the rest
							// of the text is added as a plain text extract below
							break;
						}

					}
					$epat = $endPatterns[$key];
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level -= 1;
						}
					} else {
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( !$found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}
	'@phan-var string[] $textExt';

	$all = $textExt + $otherExt; // these have disjunct key sets

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback(
				'/./us',
				[ $this, 'caseCallback' ],
				$terms[$index]
			);
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
	// @todo FIXME: A hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	// An empty term list gives an empty $anyterm; fall back to a scale of 1
	// instead of dividing by zero.
	$anytermChars = mb_strlen( $anyterm );
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|$wgSearchHighlightBoundaries)";
	$patPost = "($wgSearchHighlightBoundaries|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	$left = $contextlines;

	$snippets = [];
	$offsets = [];

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	// explicit comparison so that a first text of "0" is not treated as empty
	if ( $firstText !== '' ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( !$snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = [];
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = [ $first => $snippets[$first] ];
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
	}

	// $targetchars is always set when this loop runs: $snippets is only
	// non-empty on the paths above that assign it.
	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract(
					$all[$index],
					$offsets[$index],
					$offsets[$index] + $targetchars,
					$offsets[$index]
				);
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	// $snippets = array_map( 'htmlspecialchars', $extended );
	$snippets = $extended;
	$last = -1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == -1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index
			&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
		) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	// explicit comparison so that an extract of "0" still gets its ellipsis
	if ( $extract !== '' ) {
		$extract .= '<b> ... </b>';
	}

	$processed = [];
	foreach ( $terms as $term ) {
		if ( !isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	return $extract;
}
309 
/**
 * Split text into lines and add the non-empty, trimmed lines to an extracts array.
 *
 * When $mCleanWikitext is set, the text is first passed through removeWiki().
 *
 * @param string[] &$extracts Array the lines are appended to, keyed by sequence number
 * @param int &$count Next sequence number; incremented once per line added
 * @param string $text Text to split on "\n"
 */
private function splitAndAdd( &$extracts, &$count, $text ) {
	$cleaned = $this->mCleanWikitext ? $this->removeWiki( $text ) : $text;
	foreach ( explode( "\n", $cleaned ) as $line ) {
		$trimmed = trim( $line );
		// Compare against '' explicitly: a bare truthiness test would also
		// discard a line whose whole content is "0".
		if ( $trimmed !== '' ) {
			$extracts[$count++] = $trimmed;
		}
	}
}
326 
/**
 * Regex callback doing manual case conversion for multibyte characters.
 *
 * A matched character longer than one byte is replaced by a character class
 * holding its lower-case and upper-case forms as produced by the content
 * language; a single-byte character is returned unchanged.
 *
 * @param array $matches Match array; only $matches[0] is used
 * @return string
 */
private function caseCallback( $matches ) {
	$char = $matches[0];
	if ( strlen( $char ) <= 1 ) {
		return $char;
	}

	$lang = MediaWikiServices::getInstance()->getContentLanguage();
	return '[' . $lang->lc( $char ) . $lang->uc( $char ) . ']';
}
342 
/**
 * Extract the part of $text between two byte offsets without chopping up
 * words: both offsets are first adjusted with position().
 *
 * @param string $text
 * @param int $start Start offset; 0 is used as-is, anything else is adjusted
 * @param int $end End offset; values at or past the end of $text are clamped to its length
 * @param int|null &$posStart Receives the adjusted start, but only if the
 *  caller passed a non-null value
 * @param int|null &$posEnd Receives the adjusted end, but only if the
 *  caller passed a non-null value
 * @return string The extracted text, or '' when the adjusted range is empty
 */
private function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	$textLength = strlen( $text );

	if ( $start != 0 ) {
		$start = $this->position( $text, $start, 1 );
	}
	$end = $end >= $textLength ? $textLength : $this->position( $text, $end );

	// report the adjusted offsets back only to callers that asked for them
	if ( $posStart !== null ) {
		$posStart = $start;
	}
	if ( $posEnd !== null ) {
		$posEnd = $end;
	}

	return $end > $start ? substr( $text, $start, $end - $start ) : '';
}
376 
/**
 * Find a non-letter near a point (byte index) in the text.
 *
 * Looks for the first space/punctuation byte in a window of 10 bytes on
 * either side of $point. If none is found, $point is moved forward past any
 * UTF-8 continuation bytes so that it does not land inside a character.
 *
 * @param string $text
 * @param int $point Byte index to search around
 * @param int $offset Added to the result when a boundary byte was found
 * @return int Byte index into $text
 */
private function position( $text, $point, $offset = 0 ) {
	$textLen = strlen( $text );
	$tolerance = 10;
	$s = max( 0, $point - $tolerance );
	$l = min( $textLen, $point + $tolerance ) - $s;
	$m = [];

	// The class must contain ASCII punctuation only. The pattern runs in
	// byte mode (no /u), so any stray multibyte character in it would turn
	// UTF-8 lead/continuation bytes into "boundaries" and allow cuts in the
	// middle of a character.
	if ( preg_match(
		'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
		substr( $text, $s, $l ),
		$m,
		PREG_OFFSET_CAPTURE
	) ) {
		return $m[0][1] + $s + $offset;
	}

	// No boundary nearby. A point at or past the end has no byte to inspect.
	if ( $point >= $textLen ) {
		return $textLen;
	}

	// check if point is on a valid first UTF8 char
	$char = ord( $text[$point] );
	while ( $char >= 0x80 && $char < 0xc0 ) {
		// skip trailing bytes
		$point++;
		if ( $point >= $textLen ) {
			return $textLen;
		}
		$char = ord( $text[$point] );
	}

	return $point;
}
414 
/**
 * Search extracts for a pattern and collect one snippet per matching line.
 *
 * Lines whose key is already present in $out are skipped. Collection stops
 * as soon as $linesleft reaches zero.
 *
 * @param string $pattern Regular expression to look for
 * @param string[] $extracts Lines to search, keyed by sequence number
 * @param int &$linesleft Number of snippets still wanted; decremented per snippet
 * @param int &$contextchars Snippet length in bytes (read only here)
 * @param string[] &$out Receives the snippets, keyed like $extracts
 * @param int[] &$offsets Receives the start offset of each snippet within its line
 */
private function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		return; // nothing to do
	}

	foreach ( $extracts as $key => $candidate ) {
		$hit = [];
		if (
			array_key_exists( $key, $out ) // this line already highlighted
			|| !preg_match( $pattern, $candidate, $hit, PREG_OFFSET_CAPTURE )
		) {
			continue;
		}

		[ $matched, $matchPos ] = $hit[0];
		$matchLen = strlen( $matched );

		// Pick the window start: the line start when the match ends within
		// the first $contextchars bytes, the match itself when it is longer
		// than the window, otherwise a window centred on the match.
		$from = 0;
		if ( $matchPos + $matchLen >= $contextchars ) {
			$from = $matchLen > $contextchars
				? $matchPos
				: $matchPos + intval( ( $matchLen - $contextchars ) / 2 );
		}

		// basic snippet from this line; extract() writes the adjusted start back
		$snippetStart = $from;
		$out[$key] = $this->extract( $candidate, $from, $from + $contextchars, $snippetStart );
		$offsets[$key] = $snippetStart;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
461 
/**
 * Basic wikitext removal.
 *
 * Drops or unwraps templates and links, strips tags and quote-based
 * emphasis markup, then HTML-escapes the result.
 *
 * @param string $text
 * @return string
 */
private function removeWiki( $text ) {
	// templates and links without a caption
	$markupRules = [
		"/\\{\\{([^|]+?)\\}\\}/" => "",
		"/\\{\\{([^|]+\\|)(.*?)\\}\\}/" => "\\2",
		"/\\[\\[([^|]+?)\\]\\]/" => "\\1",
	];
	foreach ( $markupRules as $pattern => $replacement ) {
		$text = preg_replace( $pattern, $replacement, $text );
	}

	// [[target|caption]] links are decided one by one in linkReplace()
	$text = preg_replace_callback(
		"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
		[ $this, 'linkReplace' ],
		$text
	);

	// tags, then bold+italic, bold, and italic quote markup
	$strippedPatterns = [
		"/<\/?[^>]+>/",
		"/'''''/",
		"/('''|<\/?[iIuUbB]>)/",
		"/''/",
	];
	foreach ( $strippedPatterns as $pattern ) {
		$text = preg_replace( $pattern, "", $text );
	}

	// Note, the previous /<\/?[^>]+>/ is insufficient
	// for XSS safety as the HTML tag can span multiple
	// search results (T144845).
	return Sanitizer::escapeHtmlAllowEntities( $text );
}
487 
/**
 * Callback for [[target|caption]] links: keep the whole link when the
 * target is in the file or category namespace, otherwise keep only the caption.
 *
 * @param array $matches 0 => whole link, 1 => target including the
 *  trailing "|", 2 => caption
 * @return string
 */
private function linkReplace( $matches ) {
	[ $wholeLink, $target, $caption ] = $matches;

	$colonPos = strpos( $target, ':' );
	if ( $colonPos !== false ) {
		$prefix = substr( $target, 0, $colonPos );
		$nsIndex = MediaWikiServices::getInstance()->getContentLanguage()->getNsIndex( $prefix );
		if ( $nsIndex !== false && ( $nsIndex === NS_FILE || $nsIndex === NS_CATEGORY ) ) {
			return $wholeLink; // return the whole thing
		}
	}

	return $caption; // replace with caption
}
508 
/**
 * Simple & fast snippet extraction: scan the text line by line and emit
 * every line that matches any term, truncated around the match, until
 * $contextlines lines have been emitted.
 *
 * @param string $text
 * @param string[] $terms Search terms; joined with "|" and interpolated
 *  into regular expressions unescaped
 * @param int $contextlines Maximum number of matching lines to return
 * @param int $contextchars Maximum length of the context kept on each side of the match
 * @return string HTML; one "\n"-terminated line per match, with matches
 *  wrapped in <span class='searchmatch'>
 */
public function highlightSimple(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$lines = explode( "\n", $text );

	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	// (.*) is greedy, so group 2 captures the last term occurrence on a line
	$pat1 = "/(.*)($terms)(.{0,$max})/i";

	$extract = "";
	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	foreach ( $lines as $line ) {
		if ( $contextlines == 0 ) {
			break;
		}
		$m = [];
		if ( !preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		--$contextlines;
		// truncate function changes ... to relevant i18n message.
		$pre = $contLang->truncateForVisual( $m[1], - $contextchars, '...', false );

		if ( count( $m ) < 3 ) {
			$post = '';
		} else {
			$post = $contLang->truncateForVisual( $m[3], $contextchars, '...', false );
		}

		$found = $m[2];

		$line = htmlspecialchars( $pre . $found . $post );
		// NOTE(review): the terms are matched against already-escaped HTML
		// here, so a term could also match inside an entity such as "&amp;"
		// — confirm whether that matters for callers.
		$pat2 = '/(' . $terms . ")/i";
		$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

		// plain concatenation; the "${var}" interpolation form is deprecated
		$extract .= $line . "\n";
	}

	return $extract;
}
567 
/**
 * Return the first few lines of the text, HTML-escaped, with line breaks
 * turned into <br>.
 *
 * @param string $text
 * @param int $contextlines Maximum number of lines to return
 * @param int $contextchars Average number of bytes per line; the result is
 *  limited to $contextlines * $contextchars bytes before escaping
 * @return string
 */
public function highlightNone(
	$text,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	// The value is interpolated into a regex quantifier, so it must be a
	// non-negative integer.
	$contextlines = max( 0, (int)$contextlines );

	$match = [];
	$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
	// remove empty lines; a single str_replace( "\n\n", "\n" ) pass would
	// leave blank lines behind wherever three or more newlines are adjacent
	$text = preg_replace( "/\n{2,}/", "\n", $text );
	preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

	// Trim and limit to max number of bytes. mb_strcut() keeps the byte
	// limit but never cuts inside a multibyte character; a half character
	// would make htmlspecialchars() return an empty string.
	$limit = $contextlines * $contextchars;
	$text = htmlspecialchars( mb_strcut( trim( $match[0] ), 0, $limit, 'UTF-8' ) );
	return str_replace( "\n", '<br>', $text );
}
590 }
SearchHighlighter\splitAndAdd
splitAndAdd(&$extracts, &$count, $text)
Split text into lines and add it to extracts array.
Definition: SearchHighlighter.php:317
SearchHighlighter\DEFAULT_CONTEXT_CHARS
const DEFAULT_CONTEXT_CHARS
Definition: SearchHighlighter.php:36
MediaWiki\MediaWikiServices
MediaWikiServices is the service locator for the application scope of MediaWiki.
Definition: MediaWikiServices.php:154
SearchHighlighter\$mCleanWikitext
$mCleanWikitext
Definition: SearchHighlighter.php:38
NS_FILE
const NS_FILE
Definition: Defines.php:75
$s
$s
Definition: mergeMessageFileList.php:185
SearchHighlighter\extract
extract( $text, $start, $end, &$posStart=null, &$posEnd=null)
Extract part of the text from start to end, but by not chopping up words.
Definition: SearchHighlighter.php:353
$wgSearchHighlightBoundaries
$wgSearchHighlightBoundaries
Regexp to match word boundaries, defaults for non-CJK languages should be empty for CJK since the wor...
Definition: DefaultSettings.php:6940
ExtensionRegistry\getInstance
static getInstance()
Definition: ExtensionRegistry.php:136
$matches
$matches
Definition: NoLocalSettings.php:24
NS_CATEGORY
const NS_CATEGORY
Definition: Defines.php:83
SearchHighlighter\removeWiki
removeWiki( $text)
Basic wikitext removal.
Definition: SearchHighlighter.php:467
SearchHighlighter\DEFAULT_CONTEXT_LINES
const DEFAULT_CONTEXT_LINES
Definition: SearchHighlighter.php:35
SearchHighlighter\highlightNone
highlightNone( $text, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Returns the first few lines of the text.
Definition: SearchHighlighter.php:576
$line
$line
Definition: mcc.php:119
SearchHighlighter\__construct
__construct( $cleanupWikitext=true)
Stable to call.
Definition: SearchHighlighter.php:46
SearchHighlighter\highlightSimple
highlightSimple( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
Definition: SearchHighlighter.php:521
SearchHighlighter
Highlight bits of wikitext.
Definition: SearchHighlighter.php:34
$lines
if(!file_exists( $CREDITS)) $lines
Definition: updateCredits.php:49
SearchHighlighter\caseCallback
caseCallback( $matches)
Do manual case conversion for non-ascii chars.
Definition: SearchHighlighter.php:333
SearchHighlighter\position
position( $text, $point, $offset=0)
Find a nonletter near a point (index) in the text.
Definition: SearchHighlighter.php:385
SearchHighlighter\highlightText
highlightText( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
Definition: SearchHighlighter.php:60
SearchHighlighter\process
process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets)
Search extracts for a pattern, and return snippets.
Definition: SearchHighlighter.php:425
SearchHighlighter\linkReplace
linkReplace( $matches)
callback to replace [[target|caption]] kind of links, if the target is category or image,...
Definition: SearchHighlighter.php:495
Sanitizer\escapeHtmlAllowEntities
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1231