MediaWiki  1.28.1
SearchHighlighter.php
Go to the documentation of this file.
1 <?php
/**
 * Whether text handed to splitAndAdd() is run through removeWiki()
 * before it is stored as an extract.
 *
 * @var bool
 */
protected $mCleanWikitext = true;

/**
 * Create a highlighter.
 *
 * @param bool $cleanupWikitext Value stored in $mCleanWikitext; when true,
 *  splitAndAdd() strips wiki markup from every piece of text it receives.
 */
public function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}
39 
/**
 * Build a highlighted snippet from wikitext.
 *
 * The text is first split into plain-text lines and "other" lines
 * (templates, file links, tables and, when the Cite class exists, <ref>
 * blocks). Snippets are then chosen in this order: the beginning of the
 * text if it contains every term, lines matching the whole phrase, lines
 * matching any single term. Chosen snippets are padded towards a common
 * size, joined, and every term occurrence is wrapped in
 * <span class='searchmatch'>.
 *
 * @param string $text Wikitext to take the snippet from
 * @param array $terms Search terms. They are interpolated into regular
 *  expressions without escaping, so they must already be valid regex fragments.
 * @param int $contextlines Maximum number of lines to pick snippets from
 * @param int $contextchars Context size per line; scaled to a byte count
 *  using the byte/character ratio of the terms
 * @return string Snippet containing HTML markup, or '' for empty $text
 */
public function highlightText( $text, $terms, $contextlines, $contextchars ) {
	global $wgContLang, $wgSearchHighlightBoundaries;

	if ( $text == '' ) {
		return '';
	}

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = [
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

	// @todo FIXME: This should prolly be a hook or something
	// instead of hardcoding a class name from the Cite extension
	if ( class_exists( 'Cite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = []; // text extracts
	$otherExt = []; // other extracts
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				if ( $key > 0 && $val[1] != -1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, -1 );
						if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
							break;
						}

					}
					$epat = $endPatterns[$key];
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level -= 1;
						}
					} else {
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( !$found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}

	$all = $textExt + $otherExt; // these have disjunct key sets

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback(
				'/./us',
				[ $this, 'caseCallback' ],
				$terms[$index]
			);
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
	// @todo FIXME: A hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	$anytermChars = mb_strlen( $anyterm );
	// An empty term list gives an empty $anyterm; leave contextchars
	// unscaled in that case instead of dividing by zero.
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|$wgSearchHighlightBoundaries)";
	$patPost = "($wgSearchHighlightBoundaries|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	$left = $contextlines;

	$snippets = [];
	$offsets = [];

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if ( $firstText ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( !$snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = [];
	// Only read inside the loop below, which does not run when $snippets
	// stays empty; initialised so it is never an undefined variable.
	$targetchars = 0;
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = [ $first => $snippets[$first] ];
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract(
					$all[$index],
					$offsets[$index],
					$offsets[$index] + $targetchars,
					$offsets[$index]
				);
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	// $snippets = array_map( 'htmlspecialchars', $extended );
	$snippets = $extended;
	$last = -1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == -1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index
			&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
		) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	if ( $extract ) {
		$extract .= '<b> ... </b>';
	}

	$processed = [];
	foreach ( $terms as $term ) {
		if ( !isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	return $extract;
}
290 
/**
 * Split text into lines and append every non-empty line to an extracts array.
 *
 * When $mCleanWikitext is set the text is passed through removeWiki()
 * before it is split. Each line is trimmed; lines that are empty after
 * trimming are skipped.
 *
 * @param array &$extracts Array the lines are appended to, keyed by $count
 * @param int &$count Running sequence number; incremented for every line added
 * @param string $text Text to split on "\n"
 */
function splitAndAdd( &$extracts, &$count, $text ) {
	$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
	foreach ( $split as $line ) {
		$tt = trim( $line );
		// Compare against '' explicitly: a plain truthiness test would
		// also discard a line consisting solely of "0".
		if ( $tt !== '' ) {
			$extracts[$count++] = $tt;
		}
	}
}
307 
/**
 * Do manual case conversion for multi-byte characters.
 *
 * Used as a preg_replace_callback() callback on single characters: a
 * character longer than one byte is replaced by a regex character class
 * holding its lower-case and upper-case forms; single-byte characters are
 * returned unchanged.
 *
 * @param array $matches Match array; only $matches[0] is used
 * @return string Replacement text
 */
function caseCallback( $matches ) {
	// Without this declaration $wgContLang would be an undefined local.
	global $wgContLang;

	if ( strlen( $matches[0] ) > 1 ) {
		return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
	}

	return $matches[0];
}
322 
/**
 * Cut the part of $text between two offsets, moving each offset with
 * position() so that the cut does not fall in the middle of a word.
 *
 * @param string $text Text to cut from
 * @param int $start Requested start offset; 0 is used as is
 * @param int $end Requested end offset; values at or past the end of the
 *  text are clamped to the text length
 * @param int|null &$posStart Receives the adjusted start offset, but only
 *  if the caller passed a non-null value
 * @param int|null &$posEnd Receives the adjusted end offset, but only if
 *  the caller passed a non-null value
 * @return string The cut text, or '' when the adjusted end is not after
 *  the adjusted start
 */
function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	$textLen = strlen( $text );

	if ( $start != 0 ) {
		$start = $this->position( $text, $start, 1 );
	}
	$end = ( $end >= $textLen ) ? $textLen : $this->position( $text, $end );

	if ( $posStart !== null ) {
		$posStart = $start;
	}
	if ( $posEnd !== null ) {
		$posEnd = $end;
	}

	return ( $end > $start ) ? substr( $text, $start, $end - $start ) : '';
}
356 
/**
 * Find a non-letter near a point (byte index) in the text.
 *
 * A window of up to 10 bytes on either side of $point is searched for the
 * first space or punctuation character. If one is found its offset plus
 * $offset is returned. Otherwise $point itself is used, moved forward past
 * any UTF-8 continuation bytes so it does not land inside a character.
 *
 * @param string $text Text to search
 * @param int $point Byte offset to search around
 * @param int $offset Added to the result when a boundary character is found
 * @return int Byte offset; when no boundary character is found it is
 *  always within 0..strlen( $text )
 */
function position( $text, $point, $offset = 0 ) {
	$tolerance = 10;
	$textLen = strlen( $text );
	$s = max( 0, $point - $tolerance );
	$l = min( $textLen, $point + $tolerance ) - $s;
	$m = [];

	if ( $l > 0 && preg_match(
		'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
		substr( $text, $s, $l ),
		$m,
		PREG_OFFSET_CAPTURE
	) ) {
		return $m[0][1] + $s + $offset;
	}

	// No boundary nearby. Clamp out-of-range points rather than reading
	// outside the string.
	if ( $point < 0 ) {
		return 0;
	}
	if ( $point >= $textLen ) {
		return $textLen;
	}

	// check if point is on a valid first UTF8 char
	$char = ord( $text[$point] );
	while ( $char >= 0x80 && $char < 0xc0 ) {
		// skip trailing bytes
		$point++;
		if ( $point >= $textLen ) {
			return $textLen;
		}
		$char = ord( $text[$point] );
	}

	return $point;
}
394 
/**
 * Search extracts for a pattern and record a snippet for each matching line.
 *
 * Lines that already have an entry in $out are skipped. For each new match
 * a window of $contextchars bytes is placed around the match, cut with
 * extract(), and stored in $out; the adjusted start offset goes into
 * $offsets. Processing stops once $linesleft reaches zero.
 *
 * @param string $pattern Regular expression to look for
 * @param array $extracts Lines to search, keyed by sequence number
 * @param int &$linesleft Number of snippets still allowed; decremented per hit
 * @param int &$contextchars Size of the window around a match (not modified here)
 * @param array &$out Snippets found so far, keyed like $extracts
 * @param array &$offsets Start offset of each snippet within its line
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		// nothing to do
		return;
	}

	foreach ( $extracts as $index => $line ) {
		$hit = [];
		// skip lines that already have a snippet or do not match
		if ( array_key_exists( $index, $out )
			|| !preg_match( $pattern, $line, $hit, PREG_OFFSET_CAPTURE )
		) {
			continue;
		}

		$matchStart = $hit[0][1];
		$matchLen = strlen( $hit[0][0] );

		if ( $matchStart + $matchLen < $contextchars ) {
			// the match ends inside the first window: start at the line start
			$begin = 0;
		} elseif ( $matchLen > $contextchars ) {
			// the match is longer than the window: start at the match
			$begin = $matchStart;
		} else {
			// centre the window on the match
			$begin = $matchStart + intval( ( $matchLen - $contextchars ) / 2 );
		}

		// basic snippet from this line
		$posBegin = $begin;
		$out[$index] = $this->extract( $line, $begin, $begin + $contextchars, $posBegin );
		$offsets[$index] = $posBegin;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
442 
/**
 * Basic wikitext removal.
 *
 * The replacements are applied strictly in the order listed:
 * templates without a pipe are removed, templates with a pipe are reduced
 * to what follows the first pipe, links without a pipe are reduced to
 * their target, piped links are handled by linkReplace(), then tags and
 * quote-based bold/italic markup are removed. The result is finally
 * HTML-escaped.
 *
 * @param string $text Wikitext
 * @return string Escaped text
 */
function removeWiki( $text ) {
	// preg_replace() applies array patterns one after another, each on
	// the result of the previous one, so the order below is significant.
	$text = preg_replace(
		[
			"/\\{\\{([^|]+?)\\}\\}/",
			"/\\{\\{([^|]+\\|)(.*?)\\}\\}/",
			"/\\[\\[([^|]+?)\\]\\]/",
		],
		[
			"",
			"\\2",
			"\\1",
		],
		$text
	);

	$text = preg_replace_callback(
		"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
		[ $this, 'linkReplace' ],
		$text
	);

	$text = preg_replace(
		[
			"/<\/?[^>]+>/",
			"/'''''/",
			"/('''|<\/?[iIuUbB]>)/",
			"/''/",
		],
		"",
		$text
	);

	// Note, the /<\/?[^>]+>/ replacement above is insufficient
	// for XSS safety as the HTML tag can span multiple
	// search results (T144845).
	return Sanitizer::escapeHtmlAllowEntities( $text );
}
469 
/**
 * Callback to replace [[target|caption]] links.
 *
 * Links whose target has no namespace prefix, or a prefix that is neither
 * the file nor the category namespace, are replaced by their caption.
 * File and category links are returned unchanged.
 *
 * @param array $matches Match array: [0] whole link, [1] target including
 *  the trailing pipe, [2] caption
 * @return string Replacement text
 */
function linkReplace( $matches ) {
	$colon = strpos( $matches[1], ':' );
	if ( $colon === false ) {
		return $matches[2]; // replace with caption
	}

	// Without this declaration $wgContLang would be an undefined local.
	global $wgContLang;

	$ns = substr( $matches[1], 0, $colon );
	$index = $wgContLang->getNsIndex( $ns );
	if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
		return $matches[0]; // return the whole thing
	}

	return $matches[2];
}
491 
/**
 * Simple and fast snippet extraction that works line by line.
 *
 * Each line of $text is tested against the terms; for a matching line the
 * text before and after the match is truncated to $contextchars, the
 * result is HTML-escaped and every term occurrence is wrapped in
 * <span class='searchmatch'>. At most $contextlines lines are used.
 *
 * @param string $text Text to search
 * @param array $terms Search terms. They are joined with "|" and placed
 *  into a regular expression without escaping.
 * @param int $contextlines Maximum number of matching lines to return
 * @param int $contextchars Number of characters of context on each side
 * @return string Matching lines, each terminated by "\n"
 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
	// Without this declaration $wgContLang would be an undefined local.
	global $wgContLang;

	$lines = explode( "\n", $text );

	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	$pat1 = "/(.*)($terms)(.{0,$max})/i";
	// highlighting pattern; does not change between lines
	$pat2 = '/(' . $terms . ")/i";

	$extract = "";
	foreach ( $lines as $line ) {
		if ( 0 == $contextlines ) {
			break;
		}
		$m = [];
		if ( !preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		--$contextlines;
		// truncate function changes ... to relevant i18n message.
		$pre = $wgContLang->truncate( $m[1], -$contextchars, '...', false );

		if ( count( $m ) < 3 ) {
			$post = '';
		} else {
			$post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
		}

		$found = $m[2];

		$line = htmlspecialchars( $pre . $found . $post );
		$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

		$extract .= $line . "\n";
	}

	return $extract;
}
546 
/**
 * Return the first few lines of the text, without any highlighting.
 *
 * Leading whitespace and empty lines are removed, the first
 * $contextlines lines are taken, the result is limited to
 * $contextlines * $contextchars bytes, HTML-escaped, and newlines are
 * turned into <br>.
 *
 * @param string $text Text to take the lines from
 * @param int $contextlines Maximum number of lines to return
 * @param int $contextchars Average number of bytes per line
 * @return string HTML-escaped text with <br> as line separator
 */
public function highlightNone( $text, $contextlines, $contextchars ) {
	// The line count is placed into a regex quantifier, so make sure it
	// is a non-negative integer.
	$maxLines = max( 0, intval( $contextlines ) );
	$maxBytes = max( 0, $maxLines * intval( $contextchars ) );

	$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
	// Collapse every run of newlines; replacing "\n\n" once would leave
	// empty lines behind wherever three or more newlines follow each other.
	$text = preg_replace( "/\n{2,}/", "\n", $text );

	$match = [];
	preg_match( "/^(.*\n){0,$maxLines}/", $text, $match );

	// Trim and limit to max number of bytes. mb_strcut() never cuts inside
	// a multi-byte character; a byte-wise cut could leave invalid UTF-8,
	// for which htmlspecialchars() returns an empty string.
	$snippet = mb_strcut( trim( $match[0] ), 0, $maxBytes, 'UTF-8' );

	return str_replace( "\n", '<br>', htmlspecialchars( $snippet ) );
}
565 }
process($pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets)
Search extracts for a pattern, and return snippets.
external whereas SearchGetNearMatch runs after $term
Definition: hooks.txt:2713
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that probably a stub it is not rendered in wiki pages or galleries in category pages allow injecting custom HTML after the section Any uses of the hook need to handle escaping see BaseTemplate::getToolbox and BaseTemplate::makeListItem for details on the format of individual items inside of this array or by returning and letting standard HTTP rendering take place modifiable or by returning false and taking over the output $out
Definition: hooks.txt:802
highlightSimple($text, $terms, $contextlines, $contextchars)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
splitAndAdd(&$extracts, &$count, $text)
Split text into lines and add it to extracts array.
__construct($cleanupWikitext=true)
when a variable name is used in a function, it is silently declared as a new local, masking the global
Definition: design.txt:93
$last
Highlight bits of wikitext.
position($text, $point, $offset=0)
Find a nonletter near a point (index) in the text.
static escapeHtmlAllowEntities($html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1262
highlightText($text, $terms, $contextlines, $contextchars)
Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
const NS_CATEGORY
Definition: Defines.php:70
extract($text, $start, $end, &$posStart=null, &$posEnd=null)
Extract part of the text from start to end, but by not chopping up words.
linkReplace($matches)
callback to replace [[target|caption]] kind of links, if the target is category or image...
const NS_FILE
Definition: Defines.php:62
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
$lines
Definition: router.php:67
removeWiki($text)
Basic wikitext removal.
caseCallback($matches)
Do manual case conversion for non-ascii chars.
$line
Definition: cdb.php:59
this class mediates it Skin Encapsulates a look and feel for the wiki All of the functions that render HTML and make choices about how to render it are here and are called from various other places when and is meant to be subclassed with other skins that may override some of its functions The User object contains a reference to a and so rather than having a global skin object we just rely on the global User and get the skin with $wgUser and also has some character encoding functions and other locale stuff The current user interface language is instantiated as and the local content language as $wgContLang
Definition: design.txt:56
$count
$processed
return true to allow those checks to and false if checking is done remove or add to the links of a group of changes in EnhancedChangesList Hook subscribers can return false to omit this line from recentchanges use this to change the tables headers temp or archived zone change it to an object instance and return false override the list derivative used the name of the old file when set the default code will be skipped $pre
Definition: hooks.txt:1442
highlightNone($text, $contextlines, $contextchars)
Returns the first few lines of the text.
$matches