MediaWiki  master
SearchHighlighter.php
Go to the documentation of this file.
1 <?php
25 
34 
/**
 * @var bool Whether splitAndAdd() passes text through removeWiki()
 *  before storing it as an extract
 */
protected $mCleanWikitext = true;

/**
 * @param bool $cleanupWikitext Value stored in $mCleanWikitext; when true,
 *  extracted text is cleaned with removeWiki() before being used
 */
public function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}
45 
/**
 * Build a highlighted snippet from wikitext.
 *
 * The text is split into plain-text lines and "other" lines (templates,
 * file links, tables and, when the Cite extension is loaded, references).
 * Lines matching the search terms are selected, padded to a roughly constant
 * size, joined, and every term occurrence is wrapped in
 * <span class='searchmatch'>.
 *
 * @param string $text Wikitext to build the snippet from
 * @param string[] $terms Search terms; they are embedded verbatim into
 *  regular expressions, so they must already be valid regex fragments
 * @param int $contextlines Maximum number of matching lines to use
 * @param int $contextchars Approximate size of one snippet line; scaled by
 *  the bytes-per-character ratio of the terms because the code works on bytes
 * @return string Snippet, or '' when $text is empty
 */
public function highlightText(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	// Word-boundary regex fragment that is interpolated into the patterns below
	global $wgSearchHighlightBoundaries;

	if ( $text == '' ) {
		return '';
	}

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = [
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

	// @todo FIXME: This should prolly be a hook or something
	// instead of hardcoding the name of the Cite extension
	if ( \ExtensionRegistry::getInstance()->isLoaded( 'Cite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = []; // text extracts
	$otherExt = []; // other extracts
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				// the first group with a real offset tells which construct started
				if ( $key > 0 && $val[1] != -1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, -1 );
						if (
							MediaWikiServices::getInstance()->getContentLanguage()->
							getNsIndex( $ns ) != NS_FILE
						) {
							// not a file link: $epat stays empty, so the remainder
							// of the text is added as plain text below
							break;
						}

					}
					$epat = $endPatterns[$key];
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level -= 1;
						}
					} else {
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( !$found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}

	$all = $textExt + $otherExt; // these have disjunct key sets

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback(
				'/./us',
				[ $this, 'caseCallback' ],
				$terms[$index]
			);
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
	// @todo FIXME: A hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	// An empty term list gives an empty $anyterm; keep the scale at 1
	// instead of dividing by zero.
	$anytermChars = mb_strlen( $anyterm );
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|$wgSearchHighlightBoundaries)";
	$patPost = "($wgSearchHighlightBoundaries|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	$left = $contextlines;

	$snippets = [];
	$offsets = [];

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		// skip lines starting with ';' or ':'
		if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if ( $firstText ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( !$snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = [];
	// always defined, even when no snippet line exists at all
	$targetchars = $contextchars * $contextlines;
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = [ $first => $snippets[$first] ];
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract(
					$all[$index],
					$offsets[$index],
					$offsets[$index] + $targetchars,
					$offsets[$index]
				);
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	$snippets = $extended;
	$last = -1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == -1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index
			&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
		) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	if ( $extract ) {
		$extract .= '<b> ... </b>';
	}

	// highlight every distinct term once
	$processed = [];
	foreach ( $terms as $term ) {
		if ( !isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	return $extract;
}
304 
/**
 * Split text into lines and add the non-empty ones to the extracts array.
 *
 * @param string[] &$extracts Extracts keyed by sequence number; appended to
 * @param int &$count Next sequence number; incremented for every line added
 * @param string $text Text to split; cleaned with removeWiki() first when
 *  $mCleanWikitext is true
 */
function splitAndAdd( &$extracts, &$count, $text ) {
	$cleaned = $this->mCleanWikitext ? $this->removeWiki( $text ) : $text;
	foreach ( explode( "\n", $cleaned ) as $line ) {
		$tt = trim( $line );
		// Compare with '' instead of relying on truthiness, otherwise a
		// line consisting only of "0" would be dropped.
		if ( $tt !== '' ) {
			$extracts[$count++] = $tt;
		}
	}
}
321 
/**
 * Do manual case conversion for non-ascii chars.
 *
 * A single-byte match is returned unchanged. A multibyte match is turned
 * into a character class holding its lower-case and upper-case forms.
 *
 * @param array $matches Regex match; only element 0 is used
 * @return string
 */
function caseCallback( $matches ) {
	$char = $matches[0];
	if ( strlen( $char ) <= 1 ) {
		return $char;
	}

	$lang = MediaWikiServices::getInstance()->getContentLanguage();
	return '[' . $lang->lc( $char ) . $lang->uc( $char ) . ']';
}
337 
/**
 * Extract part of the text from start to end, but by not chopping up words.
 *
 * Both ends are moved with position(), except that a start of 0 is kept
 * as is and an end at or past the end of the text becomes the text length.
 *
 * @param string $text
 * @param int $start
 * @param int $end
 * @param int|null &$posStart Receives the adjusted start, but only when the
 *  caller passed a non-null value
 * @param int|null &$posEnd Receives the adjusted end, but only when the
 *  caller passed a non-null value
 * @return string The extracted part, or '' when the adjusted end is not
 *  after the adjusted start
 */
function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	if ( $start != 0 ) {
		$start = $this->position( $text, $start, 1 );
	}

	$textLength = strlen( $text );
	$end = $end >= $textLength ? $textLength : $this->position( $text, $end );

	if ( $posStart !== null ) {
		$posStart = $start;
	}
	if ( $posEnd !== null ) {
		$posEnd = $end;
	}

	return $end > $start ? substr( $text, $start, $end - $start ) : '';
}
371 
/**
 * Find a nonletter near a point (index) in the text.
 *
 * A window of 10 bytes on each side of $point is searched for the first
 * separator character. When none is found, $point is moved forward past any
 * UTF-8 continuation bytes so that it does not land inside a character.
 *
 * @param string $text
 * @param int $point Byte index into $text
 * @param int $offset Added to the result when a separator was found
 * @return int Byte index; never larger than strlen( $text ) when no
 *  separator was found
 */
function position( $text, $point, $offset = 0 ) {
	$tolerance = 10;
	$length = strlen( $text );
	$s = max( 0, $point - $tolerance );
	$l = min( $length, $point + $tolerance ) - $s;
	$m = [];

	if ( preg_match(
		'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
		substr( $text, $s, $l ),
		$m,
		PREG_OFFSET_CAPTURE
	) ) {
		// offset is relative to the window, so shift it back by $s
		return $m[0][1] + $s + $offset;
	}

	// No separator nearby: make sure the point is on a valid first UTF-8
	// byte. The bound check also covers a $point at or past the end of the
	// text, which would otherwise read an uninitialized string offset.
	while ( $point < $length ) {
		$char = ord( $text[$point] );
		if ( $char < 0x80 || $char >= 0xc0 ) {
			return $point;
		}
		// skip trailing bytes
		$point++;
	}

	return $length;
}
409 
/**
 * Search extracts for a pattern, and return snippets.
 *
 * @param string $pattern Regexp for matching lines
 * @param string[] $extracts Lines to search, keyed by sequence number
 * @param int &$linesleft Number of extracts still wanted; decremented for
 *  every snippet produced, and the search stops when it reaches 0
 * @param int &$contextchars Length of one snippet
 * @param string[] &$out Receives the snippet for each matching line; lines
 *  whose key is already present are skipped
 * @param int[] &$offsets Receives the start offset of each snippet
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		return; // nothing to do
	}

	foreach ( $extracts as $index => $line ) {
		$m = [];
		// skip lines that are already highlighted or do not match
		if ( array_key_exists( $index, $out )
			|| !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE )
		) {
			continue;
		}

		list( $matched, $offset ) = $m[0];
		$len = strlen( $matched );

		// choose where the snippet starts relative to the match
		if ( $offset + $len < $contextchars ) {
			$begin = 0;
		} elseif ( $len > $contextchars ) {
			$begin = $offset;
		} else {
			$begin = $offset + intval( ( $len - $contextchars ) / 2 );
		}

		// basic snippet from this line; extract() reports the real start
		$posBegin = $begin;
		$out[$index] = $this->extract( $line, $begin, $begin + $contextchars, $posBegin );
		$offsets[$index] = $posBegin;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
457 
/**
 * Basic wikitext removal.
 *
 * Applies a fixed sequence of regex replacements (templates, links, tags,
 * quote markup) and then HTML-escapes the result.
 *
 * @param string $text
 * @return string
 */
function removeWiki( $text ) {
	// templates and unpiped links
	$leading = [
		"/\\{\\{([^|]+?)\\}\\}/" => "",
		"/\\{\\{([^|]+\\|)(.*?)\\}\\}/" => "\\2",
		"/\\[\\[([^|]+?)\\]\\]/" => "\\1",
	];
	foreach ( $leading as $pattern => $replacement ) {
		$text = preg_replace( $pattern, $replacement, $text );
	}

	// piped links are decided one by one in linkReplace()
	$text = preg_replace_callback(
		"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
		[ $this, 'linkReplace' ],
		$text
	);

	// tags and quote markup, removed in this order
	$stripped = [
		"/<\/?[^>]+>/",
		"/'''''/",
		"/('''|<\/?[iIuUbB]>)/",
		"/''/",
	];
	foreach ( $stripped as $pattern ) {
		$text = preg_replace( $pattern, "", $text );
	}

	// Note, the previous /<\/?[^>]+>/ is insufficient
	// for XSS safety as the HTML tag can span multiple
	// search results (T144845).
	return Sanitizer::escapeHtmlAllowEntities( $text );
}
484 
/**
 * Callback to replace [[target|caption]] kind of links: the caption is
 * returned, unless the target is in the file or category namespace, in
 * which case the whole link is returned unchanged.
 *
 * @param array $matches Whole link at 0, "target|" at 1, caption at 2
 * @return string
 */
function linkReplace( $matches ) {
	$caption = $matches[2];

	$colon = strpos( $matches[1], ':' );
	if ( $colon === false ) {
		return $caption; // no namespace prefix, replace with caption
	}

	$ns = substr( $matches[1], 0, $colon );
	$index = MediaWikiServices::getInstance()->getContentLanguage()->getNsIndex( $ns );
	$keepWhole = $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY );

	return $keepWhole ? $matches[0] : $caption;
}
505 
/**
 * Simple & fast snippet extraction: every line containing one of the terms
 * is cut down around its last match and the terms are highlighted.
 *
 * @param string $text
 * @param string[] $terms Search terms; embedded verbatim into regular
 *  expressions
 * @param int $contextlines Maximum number of matching lines to return
 * @param int $contextchars Amount of context kept on each side of the match
 * @return string Highlighted lines, each terminated by a newline
 */
public function highlightSimple(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$lines = explode( "\n", $text );

	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	$pat1 = "/(.*)($terms)(.{0,$max})/i";
	// loop-invariant, so built once
	$pat2 = '/(' . $terms . ")/i";

	$extract = "";
	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	foreach ( $lines as $line ) {
		if ( $contextlines == 0 ) {
			break;
		}
		$m = [];
		if ( !preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		--$contextlines;
		// truncate function changes ... to relevant i18n message.
		$pre = $contLang->truncateForVisual( $m[1], - $contextchars, '...', false );
		// the last group can match the empty string, so it is always
		// present after a successful match
		$post = $contLang->truncateForVisual( $m[3], $contextchars, '...', false );

		$found = $m[2];

		$line = htmlspecialchars( $pre . $found . $post );
		// NOTE(review): highlighting runs on the already-escaped line, so a
		// term can also match inside an entity such as &amp; - confirm
		// whether that is acceptable.
		$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

		$extract .= "{$line}\n";
	}

	return $extract;
}
564 
/**
 * Returns the first few lines of the text, HTML-escaped and with line
 * breaks turned into <br>.
 *
 * @param string $text
 * @param int $contextlines Maximum number of lines to return
 * @param int $contextchars Per-line budget; the result is limited to
 *  $contextlines * $contextchars bytes without splitting a UTF-8 character
 * @return string
 */
public function highlightNone(
	$text,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$match = [];
	$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
	// remove empty lines; collapse whole runs of newlines, a single
	// str_replace pass would leave some behind for three or more in a row
	$text = preg_replace( "/\n{2,}/", "\n", $text );
	preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

	// Trim and limit to max number of bytes. mb_strcut() keeps the byte
	// budget but never cuts inside a multibyte character, which would make
	// the string invalid UTF-8 for htmlspecialchars().
	$text = htmlspecialchars(
		mb_strcut( trim( $match[0] ), 0, $contextlines * $contextchars, 'UTF-8' )
	);
	return str_replace( "\n", '<br>', $text );
}
587 }
whereas SearchGetNearMatch runs after $term
Definition: hooks.txt:2869
splitAndAdd(&$extracts, &$count, $text)
Split text into lines and add it to extracts array.
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for use
extract( $text, $start, $end, &$posStart=null, &$posEnd=null)
Extract part of the text from start to end, but by not chopping up words.
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency MediaWikiServices
Definition: injection.txt:23
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not it can be in the form of< username >< more info > e g for bot passwords intended to be added to log contexts Fields it might only if the login was with a bot password it is not rendered in wiki pages or galleries in category pages allow injecting custom HTML after the section Any uses of the hook need to handle escaping see BaseTemplate::getToolbox and BaseTemplate::makeListItem for details on the format of individual items inside of this array or by returning and letting standard HTTP rendering take place modifiable or by returning false and taking over the output $out
Definition: hooks.txt:773
$last
Highlight bits of wikitext.
$wgSearchHighlightBoundaries
Regexp to match word boundaries, defaults for non-CJK languages should be empty for CJK since the wor...
removeWiki( $text)
Basic wikitext removal.
const NS_CATEGORY
Definition: Defines.php:74
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not null
Definition: hooks.txt:773
const NS_FILE
Definition: Defines.php:66
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
highlightSimple( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
highlightText( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
$lines
Definition: router.php:61
position( $text, $point, $offset=0)
Find a nonletter near a point (index) in the text.
process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets)
Search extracts for a pattern, and return snippets.
linkReplace( $matches)
callback to replace [[target|caption]] kind of links, if the target is category or image...
$line
Definition: cdb.php:59
highlightNone( $text, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Returns the first few lines of the text.
caseCallback( $matches)
Do manual case conversion for non-ascii chars.
__construct( $cleanupWikitext=true)
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1433
$matches