1 <?php
/**
 * Whether text is run through removeWiki() before splitAndAdd() breaks it
 * into lines.
 *
 * @var bool
 */
protected $mCleanWikitext = true;

/**
 * @param bool $cleanupWikitext Value to store in $mCleanWikitext; pass false
 *  to have splitAndAdd() use the text it is given unchanged
 */
function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}
/**
 * Build a snippet of $text with the search terms highlighted.
 *
 * The text is first split into plain-text lines and "other" lines
 * (templates, image links, tables and, when the Cite class exists,
 * references). If the first usable text line contains every term it is used
 * on its own; otherwise process() picks lines matching the whole phrase and
 * then lines matching any single term, looking at text lines before the
 * other lines each time. The chosen snippets are padded towards a common
 * size, joined with "<b> ... </b>" separators and every term occurrence is
 * wrapped in <span class='searchmatch'>.
 *
 * @param string $text Text to build the snippet from
 * @param string[] $terms Search terms. They are inserted into regular
 *  expressions as-is, so presumably the caller supplies regex-safe
 *  fragments (TODO confirm against callers)
 * @param int $contextlines Number of snippet lines wanted
 * @param int $contextchars Approximate size of one snippet line; scaled from
 *  characters to bytes using the terms as a sample
 * @return string Snippet with highlighting markup, or '' for empty text
 */
public function highlightText( $text, $terms, $contextlines, $contextchars ) {
	global $wgContLang, $wgSearchHighlightBoundaries;

	if ( $text == '' ) {
		return '';
	}

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = [
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

	// @todo FIXME: This should probably be a hook or something
	// instead of hardcoding a class name from the Cite extension
	if ( class_exists( 'Cite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = []; // text extracts
	$otherExt = []; // other extracts
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				// with PREG_OFFSET_CAPTURE a group that did not take part has offset -1
				if ( $key > 0 && $val[1] != - 1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, - 1 );
						if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
							break;
						}
					}
					$epat = $endPatterns[$key];
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					// group 2 is only present when the closing marker matched
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level -= 1;
						}
					} else {
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( !$found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}

	$all = $textExt + $otherExt; // these have disjunct key sets

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback(
				'/./us',
				[ $this, 'caseCallback' ],
				$terms[$index]
			);
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

	// @todo FIXME: A hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	// An empty term list gives an empty $anyterm; fall back to a scale of 1
	// rather than dividing by zero.
	$anytermChars = mb_strlen( $anyterm );
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|$wgSearchHighlightBoundaries)";
	$patPost = "($wgSearchHighlightBoundaries|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	$left = $contextlines;

	$snippets = [];
	$offsets = [];

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		// skip lines starting with ';' or ':'
		if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if ( $firstText ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( !$snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = [];
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = [ $first => $snippets[$first] ];
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract(
					$all[$index],
					$offsets[$index],
					$offsets[$index] + $targetchars,
					$offsets[$index]
				);
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	// $snippets = array_map( 'htmlspecialchars', $extended );
	$snippets = $extended;
	$last = - 1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == - 1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index
			&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
		) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	if ( $extract ) {
		$extract .= '<b> ... </b>';
	}

	// highlight each distinct term once
	$processed = [];
	foreach ( $terms as $term ) {
		if ( !isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	return $extract;
}
/**
 * Split text into lines and append every non-empty trimmed line to an
 * extract list. The text is passed through removeWiki() first when
 * $mCleanWikitext is set.
 *
 * @param string[] &$extracts List to append to, keyed by sequence number
 * @param int &$count Next sequence number; incremented for each line added
 * @param string $text Text to split on "\n"
 */
function splitAndAdd( &$extracts, &$count, $text ) {
	$source = $this->mCleanWikitext ? $this->removeWiki( $text ) : $text;
	foreach ( explode( "\n", $source ) as $line ) {
		$tt = trim( $line );
		// Compare with '' explicitly: a plain truthiness test would also
		// throw away a line consisting only of "0".
		if ( $tt !== '' ) {
			$extracts[$count++] = $tt;
		}
	}
}
/**
 * preg_replace_callback() callback that turns a multibyte character into a
 * character class holding its lower-case and upper-case forms, so the term
 * matches regardless of case. Single-byte characters are returned unchanged.
 *
 * @param array $matches Match array; $matches[0] is the matched character
 * @return string Replacement text for the regular expression
 */
function caseCallback( $matches ) {
	// The content language object lives in the global scope and has to be
	// imported before lc()/uc() can be called on it.
	global $wgContLang;

	if ( strlen( $matches[0] ) > 1 ) {
		return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
	}

	return $matches[0];
}
/**
 * Cut the part of $text between two approximate byte positions, letting
 * position() adjust both ends.
 *
 * A $start of 0 is used as-is, and an $end at or past the end of the text is
 * clamped to the text length; position() is consulted in the other cases.
 *
 * @param string $text
 * @param int $start Approximate start position
 * @param int $end Approximate end position
 * @param int|null &$posStart Receives the adjusted start, but only when the
 *  variable passed in is not null
 * @param int|null &$posEnd Receives the adjusted end, but only when the
 *  variable passed in is not null
 * @return string The cut text, or '' when the adjusted end is not after the
 *  adjusted start
 */
function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	$textLen = strlen( $text );

	$from = ( $start != 0 ) ? $this->position( $text, $start, 1 ) : $start;
	$to = ( $end >= $textLen ) ? $textLen : $this->position( $text, $end );

	// report the adjusted positions only to callers that asked for them
	if ( $posStart !== null ) {
		$posStart = $from;
	}
	if ( $posEnd !== null ) {
		$posEnd = $to;
	}

	return ( $to > $from ) ? substr( $text, $from, $to - $from ) : '';
}
/**
 * Find a good place to cut the text near a byte position.
 *
 * Looks for the first space or punctuation character inside a window of 10
 * bytes either side of $point. If there is none, $point itself is used,
 * moved forward past any UTF-8 continuation bytes so a character is never
 * split.
 *
 * @param string $text
 * @param int $point Approximate byte position
 * @param int $offset Added to the position of a boundary character when one
 *  is found (1 places the cut just after it)
 * @return int Byte position to cut at
 */
function position( $text, $point, $offset = 0 ) {
	$tolerance = 10;
	$textLen = strlen( $text );
	$s = max( 0, $point - $tolerance );
	$l = min( $textLen, $point + $tolerance ) - $s;
	$m = [];

	// PREG_OFFSET_CAPTURE is needed so that $m[0][1] holds the offset of the
	// match inside the window.
	if ( preg_match(
		'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
		substr( $text, $s, $l ),
		$m,
		PREG_OFFSET_CAPTURE
	) ) {
		return $m[0][1] + $s + $offset;
	}

	// nothing to inspect at or past the end of the text
	if ( $point >= $textLen ) {
		return $textLen;
	}

	// check if point is on a valid first UTF8 char
	$char = ord( $text[$point] );
	while ( $char >= 0x80 && $char < 0xc0 ) {
		// skip trailing bytes
		$point++;
		if ( $point >= $textLen ) {
			return $textLen;
		}
		$char = ord( $text[$point] );
	}

	return $point;
}
/**
 * Look for lines matching a pattern and store a basic snippet for each one,
 * until the line budget is used up.
 *
 * @param string $pattern Regular expression to search for
 * @param string[] $extracts Candidate lines, keyed by sequence number
 * @param int &$linesleft Number of snippets still wanted; decremented for
 *  every snippet stored
 * @param int &$contextchars Size of one snippet (read only here)
 * @param string[] &$out Snippets found so far, keyed like $extracts; lines
 *  already present are skipped
 * @param int[] &$offsets Start position of each snippet within its line
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		return; // nothing to do
	}

	foreach ( $extracts as $index => $line ) {
		$hit = [];
		// skip lines that already have a snippet and lines that do not match
		if ( array_key_exists( $index, $out )
			|| !preg_match( $pattern, $line, $hit, PREG_OFFSET_CAPTURE )
		) {
			continue;
		}

		$matchStart = $hit[0][1];
		$matchLen = strlen( $hit[0][0] );

		// default: the match is longer than a snippet, start right at it
		$begin = $matchStart;
		if ( $matchStart + $matchLen < $contextchars ) {
			// the match ends within the first snippet-length of the line
			$begin = 0;
		} elseif ( $matchLen <= $contextchars ) {
			// centre the match inside the snippet
			$begin = $matchStart + intval( ( $matchLen - $contextchars ) / 2 );
		}

		// basic snippet from this line; extract() reports where it really starts
		$snippetStart = $begin;
		$out[$index] = $this->extract( $line, $begin, $begin + $contextchars, $snippetStart );
		$offsets[$index] = $snippetStart;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
/**
 * Strip basic wiki markup from text and HTML-escape the result.
 *
 * The replacements run in a fixed order: templates, plain links, piped
 * links (through linkReplace()), tags, then bold/italic quote markup.
 *
 * @param string $text
 * @return string
 */
function removeWiki( $text ) {
	// pattern/replacement pairs applied before piped links are handled
	$beforeLinks = [
		[ "/\\{\\{([^|]+?)\\}\\}/", "" ],
		[ "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2" ],
		[ "/\\[\\[([^|]+?)\\]\\]/", "\\1" ],
	];
	foreach ( $beforeLinks as $rule ) {
		$text = preg_replace( $rule[0], $rule[1], $text );
	}

	$text = preg_replace_callback(
		"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
		[ $this, 'linkReplace' ],
		$text
	);

	// patterns removed outright after links are handled
	$strip = [
		"/<\/?[^>]+>/",
		"/'''''/",
		"/('''|<\/?[iIuUbB]>)/",
		"/''/",
	];
	foreach ( $strip as $pattern ) {
		$text = preg_replace( $pattern, "", $text );
	}

	// Note, the previous /<\/?[^>]+>/ is insufficient
	// for XSS safety as the HTML tag can span multiple
	// search results (T144845).
	return Sanitizer::escapeHtmlAllowEntities( $text );
}
/**
 * preg_replace_callback() callback for piped links: links into the file or
 * category namespace are kept whole, every other link is replaced by its
 * caption.
 *
 * @param array $matches Match array: [0] whole link, [1] target including
 *  the trailing pipe, [2] caption
 * @return string
 */
function linkReplace( $matches ) {
	// The content language object lives in the global scope and has to be
	// imported before getNsIndex() can be called on it.
	global $wgContLang;

	$colon = strpos( $matches[1], ':' );
	if ( $colon === false ) {
		return $matches[2]; // replace with caption
	}

	$ns = substr( $matches[1], 0, $colon );
	$index = $wgContLang->getNsIndex( $ns );
	if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
		return $matches[0]; // return the whole thing
	}

	return $matches[2];
}
/**
 * Simple line-based highlighting: return up to $contextlines lines of $text
 * that contain one of the terms, each truncated around the match, HTML-escaped
 * and with the terms wrapped in <span class='searchmatch'>.
 *
 * @param string $text
 * @param string[] $terms Search terms. They are joined with '|' and inserted
 *  into regular expressions as-is, so presumably the caller supplies
 *  regex-safe fragments (TODO confirm against callers)
 * @param int $contextlines Maximum number of lines to return
 * @param int $contextchars Amount of context kept before and after a match
 * @return string Matching lines, each followed by "\n"
 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
	// The content language object lives in the global scope and has to be
	// imported before truncate() can be called on it.
	global $wgContLang;

	$lines = explode( "\n", $text );

	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	$pat1 = "/(.*)($terms)(.{0,$max})/i";
	// the highlighting pattern is the same for every line
	$pat2 = '/(' . $terms . ")/i";

	$extract = "";
	foreach ( $lines as $line ) {
		if ( $contextlines == 0 ) {
			break;
		}
		$m = [];
		if ( !preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		--$contextlines;
		// truncate function changes ... to relevant i18n message.
		$pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );

		if ( count( $m ) < 3 ) {
			$post = '';
		} else {
			$post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
		}

		$found = $m[2];

		$line = htmlspecialchars( $pre . $found . $post );
		$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

		$extract .= "{$line}\n";
	}

	return $extract;
}
/**
 * Return the first lines of the text without any highlighting.
 *
 * Empty lines are dropped, at most $contextlines lines are kept, the result
 * is limited to $contextlines * $contextchars bytes, HTML-escaped and its
 * line breaks are turned into <br>.
 *
 * @param string $text
 * @param int $contextlines Maximum number of lines to return
 * @param int $contextchars Size budget per line, in bytes
 * @return string
 */
public function highlightNone( $text, $contextlines, $contextchars ) {
	$match = [];
	$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
	// Remove empty lines. A run of three or more newlines must collapse to a
	// single one, which a plain "\n\n" => "\n" replacement does not achieve.
	$text = preg_replace( "/\n{2,}/", "\n", $text );
	preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );
	$lead = isset( $match[0] ) ? trim( $match[0] ) : '';

	// Limit to max number of bytes. mb_strcut() never cuts through the middle
	// of a UTF-8 character, so htmlspecialchars() always receives valid input.
	$text = htmlspecialchars( mb_strcut( $lead, 0, $contextlines * $contextchars, 'UTF-8' ) );
	return str_replace( "\n", '<br>', $text );
}
563 }
