MediaWiki  1.27.2
SearchHighlighter.php
Go to the documentation of this file.
1 <?php
30  protected $mCleanWikitext = true;
31 
/**
 * Create a highlighter.
 *
 * @param bool $cleanupWikitext Stored in $this->mCleanWikitext; when true,
 *  splitAndAdd() passes text through removeWiki() before splitting it into lines.
 */
function __construct( $cleanupWikitext = true ) {
	// Remember the caller's choice for later splitAndAdd() calls.
	$this->mCleanWikitext = $cleanupWikitext;
}
39 
/**
 * Default implementation of wikitext highlighting.
 *
 * The text is split into plain-text lines and "other" lines (templates,
 * image links, tables and, when the Cite class exists, <ref> blocks).
 * Lines matching the search terms are turned into snippets, the snippets
 * are padded towards a constant size, joined, and every term occurrence is
 * wrapped in <span class='searchmatch'>.
 *
 * @param string $text Text to build the extract from
 * @param array $terms Terms to highlight. They are interpolated into regular
 *  expressions without escaping, so presumably they are already regex
 *  fragments -- verify against callers.
 * @param int $contextlines Maximum number of snippet lines
 * @param int $contextchars Approximate size of one snippet line (scaled to
 *  bytes internally, see the FIXME below)
 * @return string Extract with highlighted terms; '' when $text is empty
 */
public function highlightText( $text, $terms, $contextlines, $contextchars ) {
	global $wgContLang, $wgSearchHighlightBoundaries;

	if ( $text == '' ) {
		return '';
	}

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = [
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

	// @todo FIXME: This should prolly be a hook or something
	// instead of hardcoding a class name from the Cite extension
	if ( class_exists( 'Cite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = []; // text extracts
	$otherExt = []; // other extracts
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				if ( $key > 0 && $val[1] != -1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, -1 );
						if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
							break;
						}

					}
					$epat = $endPatterns[$key];
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level -= 1;
						}
					} else {
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( !$found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}

	$all = $textExt + $otherExt; // these have disjunct key sets

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback(
				'/./us',
				[ $this, 'caseCallback' ],
				$terms[$index]
			);
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

	// @todo FIXME: A hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	// An empty term list gives an empty $anyterm; fall back to a neutral
	// scale instead of dividing by zero.
	$anytermChars = mb_strlen( $anyterm );
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|$wgSearchHighlightBoundaries)";
	$patPost = "($wgSearchHighlightBoundaries|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	$left = $contextlines;

	$snippets = [];
	$offsets = [];

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if ( $firstText ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( !$snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = [];
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = [ $first => $snippets[$first] ];
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract(
					$all[$index],
					$offsets[$index],
					$offsets[$index] + $targetchars,
					$offsets[$index]
				);
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	// $snippets = array_map( 'htmlspecialchars', $extended );
	$snippets = $extended;
	$last = -1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == -1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index
			&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
		) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	if ( $extract ) {
		$extract .= '<b> ... </b>';
	}

	// Highlight each distinct term once
	$processed = [];
	foreach ( $terms as $term ) {
		if ( !isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	return $extract;
}
290 
/**
 * Split text into lines and add it to extracts array.
 *
 * When $this->mCleanWikitext is true the text is first passed through
 * removeWiki(). Each line is trimmed; lines that are empty after trimming
 * are skipped.
 *
 * @param array &$extracts Array the non-empty lines are appended to, keyed by $count
 * @param int &$count Running sequence number; incremented for every line added
 * @param string $text Text to split
 */
function splitAndAdd( &$extracts, &$count, $text ) {
	$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
	foreach ( $split as $line ) {
		$tt = trim( $line );
		// Compare against '' explicitly: a line consisting of "0" is falsy in
		// PHP and would otherwise be dropped.
		if ( $tt !== '' ) {
			$extracts[$count++] = $tt;
		}
	}
}
307 
/**
 * Do manual case conversion for non-ascii chars.
 *
 * Used as a preg_replace_callback() callback by highlightText(). A match
 * longer than one byte is replaced by a character class holding its
 * lower- and upper-case forms; a single-byte match is returned as is.
 *
 * @param array $matches Match array; only $matches[0] is read
 * @return string
 */
function caseCallback( $matches ) {
	// Without this declaration $wgContLang would be an undefined local.
	global $wgContLang;

	if ( strlen( $matches[0] ) > 1 ) {
		return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
	}

	return $matches[0];
}
322 
/**
 * Extract part of the text from start to end, but by not chopping up words.
 *
 * A non-zero $start and an $end inside the text are both moved by
 * position(); an $end at or beyond the text length is clamped to it.
 *
 * @param string $text
 * @param int $start
 * @param int $end
 * @param int|null &$posStart Overwritten with the adjusted start, but only
 *  when the caller passed a non-null value
 * @param int|null &$posEnd Overwritten with the adjusted end, but only
 *  when the caller passed a non-null value
 * @return string The extracted part, or '' when the adjusted range is empty
 */
function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	$textLength = strlen( $text );

	// An offset of zero is already a clean boundary.
	if ( $start != 0 ) {
		$start = $this->position( $text, $start, 1 );
	}
	$end = $end >= $textLength ? $textLength : $this->position( $text, $end );

	// Report the adjusted bounds back only to callers that asked for them.
	if ( $posStart !== null ) {
		$posStart = $start;
	}
	if ( $posEnd !== null ) {
		$posEnd = $end;
	}

	return $end > $start ? substr( $text, $start, $end - $start ) : '';
}
356 
/**
 * Find a nonletter near a point (index) in the text.
 *
 * Looks for the first space/punctuation byte in a window of 10 bytes on
 * either side of $point. If none is found, $point is moved forward past
 * any UTF-8 continuation bytes (0x80-0xBF) so that it does not land inside
 * a multi-byte character.
 *
 * @param string $text
 * @param int $point Byte index into $text
 * @param int $offset Added to the result when a nonletter was found
 * @return int Byte index; the length of $text when $point is at or beyond
 *  the end and no nonletter was found in the window
 */
function position( $text, $point, $offset = 0 ) {
	$tolerance = 10;
	$length = strlen( $text );
	$s = max( 0, $point - $tolerance );
	$l = min( $length, $point + $tolerance ) - $s;
	$m = [];

	if ( preg_match(
		'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
		substr( $text, $s, $l ),
		$m,
		PREG_OFFSET_CAPTURE
	) ) {
		return $m[0][1] + $s + $offset;
	}

	// No boundary nearby: make sure the point is on a valid first UTF-8 byte.
	// The bounds check comes before every read so that a point at or past
	// the end of the text never indexes outside the string.
	while ( $point < $length ) {
		$char = ord( $text[$point] );
		if ( $char < 0x80 || $char >= 0xc0 ) {
			return $point;
		}
		// skip trailing bytes
		$point++;
	}

	return $length;
}
394 
/**
 * Search extracts for a pattern, and return snippets.
 *
 * Every line that matches and has no snippet yet gets one of roughly
 * $contextchars bytes around the match, until $linesleft reaches zero.
 *
 * @param string $pattern Regexp to match
 * @param array $extracts Lines to search, keyed by sequence number
 * @param int &$linesleft Number of snippets still wanted; decremented per snippet
 * @param int &$contextchars Snippet length; read but not modified here
 * @param array &$out Snippets found so far, keyed like $extracts
 * @param array &$offsets Start offset of each snippet within its line
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		return; // nothing to do
	}

	foreach ( $extracts as $key => $candidate ) {
		$hit = [];
		// Skip lines that already have a snippet, and lines without a match.
		$usable = !array_key_exists( $key, $out )
			&& preg_match( $pattern, $candidate, $hit, PREG_OFFSET_CAPTURE );
		if ( !$usable ) {
			continue;
		}

		list( $matched, $matchPos ) = $hit[0];
		$matchLen = strlen( $matched );

		// Default: centre the window on the match ...
		$from = $matchPos + intval( ( $matchLen - $contextchars ) / 2 );
		if ( $matchPos + $matchLen < $contextchars ) {
			// ... unless the match fits into a window starting at the line start
			$from = 0;
		} elseif ( $matchLen > $contextchars ) {
			// ... or the match itself is longer than the window
			$from = $matchPos;
		}

		// basic snippet from this line; extract() reports the adjusted start
		$snippetStart = $from;
		$out[$key] = $this->extract( $candidate, $from, $from + $contextchars, $snippetStart );
		$offsets[$key] = $snippetStart;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
442 
/**
 * Basic wikitext removal.
 *
 * Applies a fixed sequence of regexp substitutions (templates, links,
 * tags, quote markup) and finally escapes the result with
 * Sanitizer::escapeHtmlAllowEntities().
 *
 * @param string $text
 * @return string
 */
function removeWiki( $text ) {
	// Rules applied, in this order, before piped links are handled.
	$templateAndLinkRules = [
		"/\\{\\{([^|]+?)\\}\\}/" => "",
		"/\\{\\{([^|]+\\|)(.*?)\\}\\}/" => "\\2",
		"/\\[\\[([^|]+?)\\]\\]/" => "\\1",
	];
	foreach ( $templateAndLinkRules as $regex => $substitute ) {
		$text = preg_replace( $regex, $substitute, $text );
	}

	// Piped links are resolved by linkReplace().
	$text = preg_replace_callback(
		"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
		[ $this, 'linkReplace' ],
		$text
	);

	// Markup that is simply deleted, again in this order.
	$markupToStrip = [
		"/<\/?[^>]+>/",
		"/'''''/",
		"/('''|<\/?[iIuUbB]>)/",
		"/''/",
	];
	foreach ( $markupToStrip as $regex ) {
		$text = preg_replace( $regex, "", $text );
	}

	// Note, the tag-stripping regexp above is insufficient
	// for XSS safety as the HTML tag can span multiple
	// search results (T144845).
	return Sanitizer::escapeHtmlAllowEntities( $text );
}
469 
/**
 * Callback to replace [[target|caption]] kind of links. If the target is a
 * category or image link the whole link is kept, otherwise only the
 * caption is returned.
 *
 * @param array $matches Match array: [0] whole link, [1] target including
 *  the trailing pipe, [2] caption
 * @return string
 */
function linkReplace( $matches ) {
	$colon = strpos( $matches[1], ':' );
	if ( $colon === false ) {
		return $matches[2]; // replace with caption
	}

	// Without this declaration $wgContLang would be an undefined local.
	global $wgContLang;

	$ns = substr( $matches[1], 0, $colon );
	$index = $wgContLang->getNsIndex( $ns );
	if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
		return $matches[0]; // return the whole thing
	}

	return $matches[2];
}
491 
/**
 * Simple & fast snippet extraction, but gives completely irrelevant snippets.
 *
 * Each line of $text is matched against the terms; for every matching
 * line (up to $contextlines of them) the text before and after the match
 * is truncated to $contextchars, HTML-escaped, and the terms are wrapped in
 * <span class='searchmatch'>.
 *
 * @param string $text
 * @param array $terms Terms to highlight. They are interpolated into regular
 *  expressions without escaping.
 * @param int $contextlines Maximum number of lines in the extract
 * @param int $contextchars Number of characters kept on either side of a match
 * @return string One line per match, each terminated by "\n"
 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
	// Without this declaration $wgContLang would be an undefined local.
	global $wgContLang;

	$lines = explode( "\n", $text );

	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	$pat1 = "/(.*)($terms)(.{0,$max})/i";

	$extract = "";
	foreach ( $lines as $line ) {
		if ( 0 == $contextlines ) {
			break;
		}
		$m = [];
		if ( !preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		--$contextlines;
		// truncate function changes ... to relevant i18n message.
		$pre = $wgContLang->truncate( $m[1], -$contextchars, '...', false );

		// Guard on the group actually read below.
		$post = isset( $m[3] )
			? $wgContLang->truncate( $m[3], $contextchars, '...', false )
			: '';

		$found = $m[2];

		$line = htmlspecialchars( $pre . $found . $post );
		$pat2 = '/(' . $terms . ")/i";
		$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

		$extract .= "{$line}\n";
	}

	return $extract;
}
544 
/**
 * Returns the first few lines of the text.
 *
 * @param string $text
 * @param int $contextlines Maximum number of lines included
 * @param int $contextchars The result is limited to
 *  $contextlines * $contextchars bytes before HTML escaping
 * @return string HTML-escaped text with line breaks turned into <br>
 */
public function highlightNone( $text, $contextlines, $contextchars ) {
	$match = [];
	$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
	// Remove empty lines. A regexp is needed because a single str_replace
	// pass over "\n\n" leaves blank lines behind for three or more newlines.
	$text = preg_replace( "/\n{2,}/", "\n", $text );
	preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

	// Trim and limit to max number of bytes
	$snippet = substr( trim( $match[0] ), 0, $contextlines * $contextchars );
	// substr() works on bytes and may cut a multi-byte UTF-8 character in
	// half; htmlspecialchars() returns '' for invalid UTF-8, which would
	// blank the whole snippet. Drop the dangling partial character.
	if ( !preg_match( '//u', $snippet ) ) {
		$snippet = preg_replace( '/[\xC0-\xFF][\x80-\xBF]*$/', '', $snippet );
	}

	return str_replace( "\n", '<br>', htmlspecialchars( $snippet ) );
}
563 }
process($pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets)
Search extracts for a pattern, and return snippets.
external whereas SearchGetNearMatch runs after $term
Definition: hooks.txt:2558
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that probably a stub it is not rendered in wiki pages or galleries in category pages allow injecting custom HTML after the section Any uses of the hook need to handle escaping see BaseTemplate::getToolbox and BaseTemplate::makeListItem for details on the format of individual items inside of this array or by returning and letting standard HTTP rendering take place modifiable or by returning false and taking over the output $out
Definition: hooks.txt:762
magic word the default is to use $key to get the and $key value or $key value text $key value html to format the value $key
Definition: hooks.txt:2321
highlightSimple($text, $terms, $contextlines, $contextchars)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
splitAndAdd(&$extracts, &$count, $text)
Split text into lines and add it to extracts array.
__construct($cleanupWikitext=true)
When a variable name is used in a function, it is silently declared as a new local, masking the global.
Definition: design.txt:93
$last
Highlight bits of wikitext.
position($text, $point, $offset=0)
Find a nonletter near a point (index) in the text.
static escapeHtmlAllowEntities($html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1224
highlightText($text, $terms, $contextlines, $contextchars)
Default implementation of wikitext highlighting.
const NS_CATEGORY
Definition: Defines.php:83
extract($text, $start, $end, &$posStart=null, &$posEnd=null)
Extract part of the text from start to end, but by not chopping up words.
linkReplace($matches)
callback to replace [[target|caption]] kind of links, if the target is category or image...
const NS_FILE
Definition: Defines.php:75
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
$lines
Definition: router.php:66
removeWiki($text)
Basic wikitext removal.
caseCallback($matches)
Do manual case conversion for non-ascii chars.
$line
Definition: cdb.php:59
this class mediates it Skin Encapsulates a look and feel for the wiki All of the functions that render HTML and make choices about how to render it are here and are called from various other places when and is meant to be subclassed with other skins that may override some of its functions The User object contains a reference to a and so rather than having a global skin object we just rely on the global User and get the skin with $wgUser and also has some character encoding functions and other locale stuff The current user interface language is instantiated as and the local content language as $wgContLang
Definition: design.txt:56
$count
$processed
return true to allow those checks to and false if checking is done remove or add to the links of a group of changes in EnhancedChangesList Hook subscribers can return false to omit this line from recentchanges use this to change the tables headers temp or archived zone change it to an object instance and return false override the list derivative used the name of the old file when set the default code will be skipped $pre
Definition: hooks.txt:1306
highlightNone($text, $contextlines, $contextchars)
Returns the first few lines of the text.
$matches