MediaWiki  master
SearchHighlighter.php
Go to the documentation of this file.
1 <?php
25 
32  protected $mCleanWikitext = true;
33 
/**
 * @param bool $cleanupWikitext Stored in $this->mCleanWikitext; when true,
 *  splitAndAdd() runs text through removeWiki() before storing it as an extract.
 */
public function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}
42 
/**
 * Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
 *
 * The text is first partitioned into "text" extracts and "other" extracts
 * (templates, file links, tables and, when the Cite class exists, <ref>
 * blocks). Snippets are then picked from those extracts, padded towards a
 * common size, joined together and finally the terms are wrapped in
 * <span class='searchmatch'>.
 *
 * @param string $text Wikitext to build the snippet from
 * @param string[] $terms Search terms; they are inserted into regular
 *  expressions as-is (no quoting is applied here)
 * @param int $contextlines Maximum number of snippet lines
 * @param int $contextchars Approximate number of characters per snippet line
 * @return string HTML snippet, or '' when $text is empty
 */
public function highlightText( $text, $terms, $contextlines, $contextchars ) {
	// Regexp fragment for word boundaries; interpolated into the patterns below.
	global $wgSearchHighlightBoundaries;

	if ( $text == '' ) {
		return '';
	}

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = [
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

	// @todo FIXME: This should prolly be a hook or something
	// instead of hardcoding a class name from the Cite extension
	if ( class_exists( 'Cite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = []; // text extracts
	$otherExt = []; // other extracts
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				// key 0 is the whole match; an offset of -1 marks a group that did not take part
				if ( $key > 0 && $val[1] != -1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, -1 );
						if (
							MediaWikiServices::getInstance()->getContentLanguage()->
							getNsIndex( $ns ) != NS_FILE
						) {
							// not a file link: $epat stays empty, so the rest of
							// the text is added as one text extract below
							break;
						}

					}
					$epat = $endPatterns[$key];
					// everything before the opening marker is ordinary text
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					// group 2 of every end pattern is the closing marker
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level -= 1;
						}
					} else {
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( !$found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}

	$all = $textExt + $otherExt; // these have disjunct key sets

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback(
				'/./us',
				[ $this, 'caseCallback' ],
				$terms[$index]
			);
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
	// @todo FIXME: A hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	// An empty term list gives an empty $anyterm; use a neutral scale
	// instead of dividing by zero.
	$anytermChars = mb_strlen( $anyterm );
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|$wgSearchHighlightBoundaries)";
	$patPost = "($wgSearchHighlightBoundaries|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	$left = $contextlines;

	$snippets = [];
	$offsets = [];

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		// the first text line that does not start with ';' or ':'
		if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if ( $firstText ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( !$snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = [];
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = [ $first => $snippets[$first] ];
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract(
					$all[$index],
					$offsets[$index],
					$offsets[$index] + $targetchars,
					$offsets[$index]
				);
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	// $snippets = array_map( 'htmlspecialchars', $extended );
	$snippets = $extended;
	$last = -1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == -1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index
			&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
		) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	if ( $extract ) {
		$extract .= '<b> ... </b>';
	}

	// wrap each distinct term once
	$processed = [];
	foreach ( $terms as $term ) {
		if ( !isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	return $extract;
}
296 
/**
 * Split text into lines and add it to extracts array.
 *
 * When $this->mCleanWikitext is set, the text is passed through
 * removeWiki() before being split.
 *
 * @param string[] &$extracts Array the trimmed, non-empty lines are appended to
 * @param int &$count Key used for the next stored line; incremented per stored line
 * @param string $text Text to split on "\n"
 */
function splitAndAdd( &$extracts, &$count, $text ) {
	$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
	foreach ( $split as $line ) {
		$tt = trim( $line );
		// Compare against '' explicitly: a plain truthiness check would also
		// throw away a line whose entire content is "0".
		if ( $tt !== '' ) {
			$extracts[$count++] = $tt;
		}
	}
}
313 
/**
 * Do manual case conversion for non-ascii chars.
 *
 * Used as a preg_replace_callback() callback: a multi-byte match is turned
 * into a two-member character class holding the content language's
 * lower-case and upper-case forms; a single-byte match is left alone.
 *
 * @param array $matches Callback match array; only element 0 is used
 * @return string
 */
function caseCallback( $matches ) {
	$char = $matches[0];
	if ( strlen( $char ) <= 1 ) {
		// single byte: nothing to convert
		return $char;
	}

	$lang = MediaWikiServices::getInstance()->getContentLanguage();
	$lower = $lang->lc( $char );
	$upper = $lang->uc( $char );

	return '[' . $lower . $upper . ']';
}
329 
/**
 * Extract part of the text from start to end, but by not chopping up words.
 *
 * A non-zero $start and an $end that lies inside the text are both moved
 * with position() before cutting.
 *
 * @param string $text
 * @param int $start
 * @param int $end
 * @param int|null &$posStart Receives the adjusted start, but only when the
 *  caller passed a non-null value
 * @param int|null &$posEnd Receives the adjusted end, but only when the
 *  caller passed a non-null value
 * @return string The cut-out text, or '' when the adjusted range is empty
 */
function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	$length = strlen( $text );

	$from = ( $start != 0 ) ? $this->position( $text, $start, 1 ) : $start;
	$to = ( $end >= $length ) ? $length : $this->position( $text, $end );

	if ( $posStart !== null ) {
		$posStart = $from;
	}
	if ( $posEnd !== null ) {
		$posEnd = $to;
	}

	return ( $to > $from ) ? substr( $text, $from, $to - $from ) : '';
}
363 
/**
 * Find a nonletter near a point (index) in the text.
 *
 * Looks at the window of up to 10 bytes on either side of $point and returns
 * the position of the first space/punctuation byte found in it, plus
 * $offset. When the window holds none, $point itself is returned, moved
 * forward past any UTF-8 continuation bytes so it never lands inside a
 * multi-byte character.
 *
 * @param string $text
 * @param int $point Byte index into $text
 * @param int $offset Added to the result only when a nonletter was found
 * @return int Byte position
 */
function position( $text, $point, $offset = 0 ) {
	$tolerance = 10;
	$length = strlen( $text );
	$s = max( 0, $point - $tolerance );
	$l = min( $length, $point + $tolerance ) - $s;
	$m = [];

	if ( preg_match(
		'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
		substr( $text, $s, $l ),
		$m,
		PREG_OFFSET_CAPTURE
	) ) {
		return $m[0][1] + $s + $offset;
	}

	// $point at or past the end of the text: there is no byte to inspect.
	// Hand the point back unchanged instead of reading an undefined offset.
	if ( $point >= $length ) {
		return $point;
	}

	// check if point is on a valid first UTF8 char
	$char = ord( $text[$point] );
	while ( $char >= 0x80 && $char < 0xc0 ) {
		// skip trailing bytes
		$point++;
		if ( $point >= $length ) {
			return $length;
		}
		$char = ord( $text[$point] );
	}

	return $point;
}
401 
/**
 * Search extracts for a pattern, and return snippets.
 *
 * Every extract that matches $pattern and has no entry in $out yet
 * contributes one snippet of roughly $contextchars bytes around the match.
 * Processing stops once $linesleft reaches zero.
 *
 * @param string $pattern Regexp to look for
 * @param string[] $extracts Lines to search, keyed by sequence number
 * @param int &$linesleft Remaining snippet budget; decremented per snippet
 * @param int &$contextchars Snippet length; only read here
 * @param string[] &$out Snippets, keyed like $extracts
 * @param int[] &$offsets Start offset of each snippet within its line
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		// budget already used up
		return;
	}

	foreach ( $extracts as $index => $line ) {
		$hit = [];
		// skip lines that already have a snippet, and lines without a match
		$usable = !array_key_exists( $index, $out )
			&& preg_match( $pattern, $line, $hit, PREG_OFFSET_CAPTURE );
		if ( !$usable ) {
			continue;
		}

		$matchStart = $hit[0][1];
		$matchLen = strlen( $hit[0][0] );

		if ( $matchStart + $matchLen < $contextchars ) {
			// match ends inside the first $contextchars bytes: start at the line start
			$from = 0;
		} elseif ( $matchLen > $contextchars ) {
			// match is longer than the window: start at the match
			$from = $matchStart;
		} else {
			// centre the window on the match
			$from = $matchStart + intval( ( $matchLen - $contextchars ) / 2 );
		}
		$to = $from + $contextchars;

		// basic snippet from this line; extract() reports the adjusted start
		$adjustedFrom = $from;
		$out[$index] = $this->extract( $line, $from, $to, $adjustedFrom );
		$offsets[$index] = $adjustedFrom;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
449 
/**
 * Basic wikitext removal.
 *
 * Applies a fixed sequence of regexp replacements and then escapes the
 * result with Sanitizer::escapeHtmlAllowEntities().
 *
 * @param string $text
 * @return string
 */
function removeWiki( $text ) {
	// Applied one after another, in this order:
	//  - {{...}} without a pipe is dropped
	//  - {{name|rest}} keeps only "rest"
	//  - [[target]] without a pipe keeps "target"
	$beforeLinks = [
		"/\\{\\{([^|]+?)\\}\\}/" => "",
		"/\\{\\{([^|]+\\|)(.*?)\\}\\}/" => "\\2",
		"/\\[\\[([^|]+?)\\]\\]/" => "\\1",
	];
	foreach ( $beforeLinks as $regex => $replacement ) {
		$text = preg_replace( $regex, $replacement, $text );
	}

	// [[target|caption]] links are decided by linkReplace()
	$text = preg_replace_callback(
		"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
		[ $this, 'linkReplace' ],
		$text
	);

	// tags first, then quote markup from longest to shortest
	$stripped = [
		"/<\/?[^>]+>/",
		"/'''''/",
		"/('''|<\/?[iIuUbB]>)/",
		"/''/",
	];
	foreach ( $stripped as $regex ) {
		$text = preg_replace( $regex, "", $text );
	}

	// Note, the tag-stripping regexp above is insufficient
	// for XSS safety as the HTML tag can span multiple
	// search results (T144845).
	return Sanitizer::escapeHtmlAllowEntities( $text );
}
476 
/**
 * Callback to replace [[target|caption]] kind of links: the link is kept
 * whole when the target's namespace is NS_FILE or NS_CATEGORY, otherwise it
 * is replaced by its caption.
 *
 * @param array $matches 0 => whole link, 1 => target including the pipe,
 *  2 => caption
 * @return string
 */
function linkReplace( $matches ) {
	$colonPos = strpos( $matches[1], ':' );

	if ( $colonPos !== false ) {
		// text before the first colon is looked up as a namespace name
		$nsName = substr( $matches[1], 0, $colonPos );
		$nsIndex = MediaWikiServices::getInstance()->getContentLanguage()->getNsIndex( $nsName );
		if ( $nsIndex !== false && ( $nsIndex == NS_FILE || $nsIndex == NS_CATEGORY ) ) {
			return $matches[0]; // return the whole thing
		}
	}

	return $matches[2]; // replace with caption
}
497 
/**
 * Simple & fast snippet extraction, but gives completely irrelevant snippets.
 *
 * Each line of $text that matches any of the terms yields one output line:
 * the text before the match, the match and the text after it, with both
 * sides truncated to $contextchars by the content language's
 * truncateForVisual().
 *
 * @param string $text
 * @param string[] $terms Joined with '|' and inserted into the regexps as-is
 * @param int $contextlines Maximum number of lines to return
 * @param int $contextchars Context length on each side of the match
 * @return string HTML; one "\n"-terminated line per matching input line
 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
	$lines = explode( "\n", $text );

	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	// (.*) is greedy, so group 2 is the last term occurrence on the line
	$pat1 = "/(.*)($terms)(.{0,$max})/i";

	$extract = "";
	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	foreach ( $lines as $line ) {
		if ( $contextlines == 0 ) {
			break;
		}
		$m = [];
		if ( !preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		--$contextlines;
		// truncate function changes ... to relevant i18n message.
		$pre = $contLang->truncateForVisual( $m[1], - $contextchars, '...', false );

		if ( count( $m ) < 3 ) {
			$post = '';
		} else {
			$post = $contLang->truncateForVisual( $m[3], $contextchars, '...', false );
		}

		$found = $m[2];

		$line = htmlspecialchars( $pre . $found . $post );
		// NOTE(review): terms are matched against the already-escaped text here,
		// so a term can also match inside an entity such as "&amp;" — confirm
		// whether that matters for callers.
		$pat2 = '/(' . $terms . ")/i";
		$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

		// plain concatenation instead of the deprecated "${var}" interpolation
		$extract .= $line . "\n";
	}

	return $extract;
}
551 
/**
 * Returns the first few lines of the text.
 *
 * @param string $text
 * @param int $contextlines Maximum number of lines to return
 * @param int $contextchars Average line length; the result is limited to
 *  $contextlines * $contextchars bytes before escaping
 * @return string Escaped HTML with lines separated by <br>
 */
public function highlightNone( $text, $contextlines, $contextchars ) {
	$match = [];
	$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
	// Remove empty lines. Collapse whole runs of newlines: a single
	// "\n\n" => "\n" pass would leave a blank line behind whenever three or
	// more newlines are adjacent.
	$text = preg_replace( '/\n{2,}/', "\n", $text );
	preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

	// Trim and limit to max number of chars. mb_strcut() keeps the byte limit
	// but never cuts inside a UTF-8 character; a half character would make
	// htmlspecialchars() reject the string and return ''.
	$limit = $contextlines * $contextchars;
	$text = htmlspecialchars( mb_strcut( trim( $match[0] ), 0, $limit, 'UTF-8' ) );
	return str_replace( "\n", '<br>', $text );
}
570 }
highlightSimple( $text, $terms, $contextlines, $contextchars)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
whereas SearchGetNearMatch runs after $term
Definition: hooks.txt:2875
splitAndAdd(&$extracts, &$count, $text)
Split text into lines and add it to extracts array.
Apache License January AND DISTRIBUTION Definitions License shall mean the terms and conditions for use
extract( $text, $start, $end, &$posStart=null, &$posEnd=null)
Extract part of the text from start to end, without chopping up words.
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency MediaWikiServices
Definition: injection.txt:23
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not it can be in the form of< username >< more info > e g for bot passwords intended to be added to log contexts Fields it might only if the login was with a bot password it is not rendered in wiki pages or galleries in category pages allow injecting custom HTML after the section Any uses of the hook need to handle escaping see BaseTemplate::getToolbox and BaseTemplate::makeListItem for details on the format of individual items inside of this array or by returning and letting standard HTTP rendering take place modifiable or by returning false and taking over the output $out
Definition: hooks.txt:780
highlightNone( $text, $contextlines, $contextchars)
Returns the first few lines of the text.
$last
Highlight bits of wikitext.
$wgSearchHighlightBoundaries
Regexp to match word boundaries, defaults for non-CJK languages should be empty for CJK since the wor...
removeWiki( $text)
Basic wikitext removal.
const NS_CATEGORY
Definition: Defines.php:78
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that When $user is not null
Definition: hooks.txt:780
const NS_FILE
Definition: Defines.php:70
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
highlightText( $text, $terms, $contextlines, $contextchars)
Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
$lines
Definition: router.php:61
position( $text, $point, $offset=0)
Find a nonletter near a point (index) in the text.
process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets)
Search extracts for a pattern, and return snippets.
linkReplace( $matches)
callback to replace [[target|caption]] kind of links, if the target is category or image...
$line
Definition: cdb.php:59
caseCallback( $matches)
Do manual case conversion for non-ascii chars.
__construct( $cleanupWikitext=true)
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1425
$matches
return true to allow those checks to and false if checking is done remove or add to the links of a group of changes in EnhancedChangesList Hook subscribers can return false to omit this line from recentchanges use this to change the tables headers change it to an object instance and return false override the list derivative used the name of the old file when set the default code will be skipped $pre
Definition: hooks.txt:1473