1 <?php
/**
 * Whether text passed to splitAndAdd() is run through removeWiki()
 * before being split into lines.
 */
public $mCleanWikitext = true;

/**
 * @param bool $cleanupWikitext Stored in $mCleanWikitext; when true,
 *  splitAndAdd() strips wiki markup from extracts via removeWiki().
 */
public function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}
/**
 * Build a highlighted extract of $text for the given search terms.
 *
 * The text is first split into plain-text lines and "other" lines
 * (templates, image links, tables and, when wfCite exists, <ref> blocks).
 * Snippets are then picked from those lines, padded towards a constant
 * size, joined with '<b> ... </b>' separators and the matched terms are
 * wrapped in <span class='searchmatch'>.
 *
 * Note that the snippets are not passed through htmlspecialchars() here.
 *
 * @param string $text Wikitext to extract snippets from
 * @param array $terms Search terms; they are inserted into regular
 *  expressions as-is, so they must already be valid regex fragments
 * @param int $contextlines Maximum number of lines to take snippets from
 * @param int $contextchars Approximate snippet size per line (scaled from
 *  characters to bytes based on the terms)
 * @return string The highlighted extract, or '' when $text is empty
 */
public function highlightText( $text, $terms, $contextlines, $contextchars ) {
	// $wgContLang resolves the namespace of [[Prefix:...]] links below.
	global $wgContLang;
	global $wgSearchHighlightBoundaries;
	$fname = __METHOD__;

	if ( $text == '' ) {
		return '';
	}

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = array(
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table

	// @todo FIXME: This should probably be a hook or something
	if ( function_exists( 'wfCite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = array(); // text extracts
	$otherExt = array(); // other extracts
	wfProfileIn( "$fname-split" );
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				// key 0 is the whole match; an offset of -1 marks a group that did not match
				if ( $key > 0 && $val[1] != - 1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, - 1 );
						if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
							// not a file link: leave $epat empty so the rest is added as text
							break;
						}
					}
					$epat = $endPatterns[$key];
					// everything before the element is plain text
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level -= 1;
						}
					} else {
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( ! $found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}

	$all = $textExt + $otherExt; // these have disjunct key sets

	wfProfileOut( "$fname-split" );

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

	// @todo FIXME: A hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	// An empty term list would otherwise divide by zero, so fall back to 1.
	$anytermChars = mb_strlen( $anyterm );
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|$wgSearchHighlightBoundaries)";
	$patPost = "($wgSearchHighlightBoundaries|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	wfProfileIn( "$fname-extract" );

	$left = $contextlines;

	$snippets = array();
	$offsets = array();

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		// the first non-empty text line that does not start with ';' or ':'
		if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if ( $firstText ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( ! preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( ! $snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = array();
	$targetchars = 0; // always defined, even when there is nothing to show
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = array( $first => $snippets[$first] );
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	// $snippets = array_map( 'htmlspecialchars', $extended );
	$snippets = $extended;
	$last = - 1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == - 1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) ) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	if ( $extract ) {
		$extract .= '<b> ... </b>';
	}

	$processed = array();
	foreach ( $terms as $term ) {
		// highlight each distinct term only once
		if ( ! isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	wfProfileOut( "$fname-extract" );

	return $extract;
}
/**
 * Split text into lines and append every non-empty (after trimming)
 * line to the extracts array.
 *
 * When $mCleanWikitext is set, the text is first passed through
 * removeWiki().
 *
 * @param array &$extracts Array the trimmed lines are appended to, keyed by $count
 * @param int &$count Running sequence number; incremented for every line added
 * @param string $text Text to split on "\n"
 */
function splitAndAdd( &$extracts, &$count, $text ) {
	$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
	foreach ( $split as $line ) {
		$tt = trim( $line );
		// Compare against '' explicitly: a plain truthiness test would
		// also discard a line consisting only of "0".
		if ( $tt !== '' ) {
			$extracts[$count++] = $tt;
		}
	}
}
/**
 * preg_replace_callback() helper that makes a multi-byte character
 * case-insensitive by turning it into a "[<lower><upper>]" character class.
 * Single-byte matches are returned unchanged.
 *
 * @param array $matches Match array; only $matches[0] is used
 * @return string
 */
function caseCallback( $matches ) {
	// The content language object supplies lc()/uc(); without this
	// declaration $wgContLang would be an undefined local variable.
	global $wgContLang;

	if ( strlen( $matches[0] ) > 1 ) {
		return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
	}

	return $matches[0];
}
/**
 * Cut the byte range [$start, $end) out of $text, after moving both ends
 * with position().
 *
 * A $start of 0 is used as-is; an $end at or past the end of the text is
 * set to the text length.
 *
 * @param string $text
 * @param int $start Requested start offset
 * @param int $end Requested end offset
 * @param int|null &$posStart Receives the start actually used, but only
 *  when a non-null value was passed in
 * @param int|null &$posEnd Receives the end actually used, but only when
 *  a non-null value was passed in
 * @return string The extracted substring, or '' when the range is empty
 */
function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	$length = strlen( $text );

	$from = ( $start != 0 ) ? $this->position( $text, $start, 1 ) : $start;
	$to = ( $end >= $length ) ? $length : $this->position( $text, $end );

	// Report the adjusted bounds back only to callers that supplied a value.
	if ( $posStart !== null ) {
		$posStart = $from;
	}
	if ( $posEnd !== null ) {
		$posEnd = $to;
	}

	return ( $to > $from ) ? substr( $text, $from, $to - $from ) : '';
}
/**
 * Find a suitable cut position near $point.
 *
 * Looks for the first space or punctuation character within a window of
 * 10 bytes on either side of $point and returns its offset plus $offset.
 * If there is none, $point is advanced past any UTF-8 continuation bytes
 * (0x80-0xBF) so that a multi-byte character is not split.
 *
 * @param string $text
 * @param int $point Byte offset to start from
 * @param int $offset Added to the position of a matched break character
 * @return int Byte offset; never read past the end of $text in the
 *  fallback branch (a $point at or beyond the end yields strlen( $text ))
 */
function position( $text, $point, $offset = 0 ) {
	$tolerance = 10;
	$textLen = strlen( $text );
	$s = max( 0, $point - $tolerance );
	$l = min( $textLen, $point + $tolerance ) - $s;
	$m = array();
	if ( preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
		return $m[0][1] + $s + $offset;
	}

	// Nothing to inspect at or past the end of the text (this also covers
	// an empty string); reading $text[$point] there would be out of range.
	if ( $point >= $textLen ) {
		return $textLen;
	}

	// check if point is on a valid first UTF8 char
	$char = ord( $text[$point] );
	while ( $char >= 0x80 && $char < 0xc0 ) {
		// skip trailing bytes
		$point++;
		if ( $point >= $textLen ) {
			return $textLen;
		}
		$char = ord( $text[$point] );
	}

	return $point;
}
/**
 * Search the given extracts for $pattern and record one snippet per
 * matching line until the line budget is used up.
 *
 * Lines whose index is already present in $out are skipped.
 *
 * @param string $pattern Regular expression to look for
 * @param array $extracts Lines to search, keyed by sequence number
 * @param int &$linesleft Remaining number of lines that may be added;
 *  decremented for every snippet recorded
 * @param int &$contextchars Size of the snippet window around the match
 * @param array &$out Receives the snippet for each matching line index
 * @param array &$offsets Receives the start offset of each snippet
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		return; // budget already exhausted
	}

	foreach ( $extracts as $key => $content ) {
		$hit = array();
		if ( array_key_exists( $key, $out )
			|| !preg_match( $pattern, $content, $hit, PREG_OFFSET_CAPTURE )
		) {
			// already has a snippet, or does not match
			continue;
		}

		list( $matched, $matchPos ) = $hit[0];
		$matchLen = strlen( $matched );

		// Choose the window start: the line start when the match ends
		// within the first window, the match itself when it is longer than
		// the window, otherwise a window centred on the match.
		if ( $matchPos + $matchLen < $contextchars ) {
			$from = 0;
		} elseif ( $matchLen > $contextchars ) {
			$from = $matchPos;
		} else {
			$from = $matchPos + intval( ( $matchLen - $contextchars ) / 2 );
		}

		// extract() writes the start position it really used back into this.
		$snippetStart = $from;
		$out[$key] = $this->extract( $content, $from, $from + $contextchars, $snippetStart );
		$offsets[$key] = $snippetStart;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
/**
 * Strip common wiki markup from a piece of text using a fixed sequence
 * of regular-expression replacements: templates, links, HTML-like tags
 * and bold/italic quote markup.
 *
 * @param string $text
 * @return string
 */
function removeWiki( $text ) {
	$fname = __METHOD__;
	wfProfileIn( $fname );

	// Templates and unpiped links; applied in this order.
	$markupRules = array(
		"/\\{\\{([^|]+?)\\}\\}/" => "",
		"/\\{\\{([^|]+\\|)(.*?)\\}\\}/" => "\\2",
		"/\\[\\[([^|]+?)\\]\\]/" => "\\1",
	);
	foreach ( $markupRules as $regex => $replacement ) {
		$text = preg_replace( $regex, $replacement, $text );
	}

	// Piped links are handled by linkReplace().
	$text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );

	// Tags and quote-based formatting are simply removed; applied in this order.
	$strippedPatterns = array(
		"/<\/?[^>]+>/",
		"/'''''/",
		"/('''|<\/?[iIuUbB]>)/",
		"/''/",
	);
	foreach ( $strippedPatterns as $regex ) {
		$text = preg_replace( $regex, "", $text );
	}

	wfProfileOut( $fname );
	return $text;
}
/**
 * preg_replace_callback() helper for piped links ([[target|caption]]).
 *
 * Links whose target prefix resolves to NS_FILE or NS_CATEGORY are
 * returned untouched; every other link is replaced by its caption.
 *
 * @param array $matches $matches[0] is the whole link, $matches[1] the
 *  part up to and including the pipe, $matches[2] the caption
 * @return string
 */
function linkReplace( $matches ) {
	// Needed for getNsIndex(); without this declaration $wgContLang would
	// be an undefined local variable.
	global $wgContLang;

	$colon = strpos( $matches[1], ':' );
	if ( $colon === false ) {
		return $matches[2]; // replace with caption
	}

	$ns = substr( $matches[1], 0, $colon );
	$index = $wgContLang->getNsIndex( $ns );
	if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
		return $matches[0]; // return the whole thing
	}

	return $matches[2];
}
/**
 * Simple line-based highlighting: for each line that matches one of the
 * terms, emit the match with truncated context before and after it.
 *
 * Each emitted line is HTML-escaped and the terms are then wrapped in
 * <span class='searchmatch'>.
 *
 * @param string $text Text to search, split on "\n"
 * @param array $terms Search terms; joined with '|' and inserted into the
 *  regular expressions as-is
 * @param int $contextlines Maximum number of matching lines to output
 * @param int $contextchars Context length passed to the language
 *  object's truncate() on either side of the match
 * @return string Matching lines, each terminated by "\n"
 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
	// Needed for truncate(); without this declaration $wgContLang would
	// be an undefined local variable.
	global $wgContLang;
	$fname = __METHOD__;

	$lines = explode( "\n", $text );

	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	$pat1 = "/(.*)($terms)(.{0,$max})/i";
	// highlight pattern is the same for every line
	$pat2 = '/(' . $terms . ")/i";

	$extract = "";
	wfProfileIn( "$fname-extract" );
	foreach ( $lines as $line ) {
		if ( 0 == $contextlines ) {
			break;
		}
		$m = array();
		if ( ! preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		--$contextlines;
		// truncate function changes ... to relevant i18n message.
		$pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );

		if ( isset( $m[3] ) ) {
			$post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
		} else {
			$post = '';
		}

		$found = $m[2];

		$line = htmlspecialchars( $pre . $found . $post );
		$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

		$extract .= "{$line}\n";
	}
	wfProfileOut( "$fname-extract" );

	return $extract;
}
534 }
