MediaWiki  1.23.16
SearchHighlighter.php
Go to the documentation of this file.
1 <?php
/**
 * Whether wikitext markup is stripped (via removeWiki()) from every
 * piece of text before it is stored as an extract.
 */
public $mCleanWikitext = true;

/**
 * Create a highlighter.
 *
 * @param bool $cleanupWikitext Value stored in $mCleanWikitext; when true,
 *   splitAndAdd() runs each piece of text through removeWiki() first
 */
function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}
39 
/**
 * Default implementation of wikitext highlighting.
 *
 * Works in three phases:
 *  1. Split $text into plain-text lines and "other" lines (templates,
 *     File-namespace links, tables and, when wfCite() exists, <ref>
 *     blocks), keeping one shared sequence number so both sets can be
 *     merged back in document order.
 *  2. Pick snippets: the beginning of the text if it contains every term,
 *     otherwise phrase matches and then single-term matches, text lines
 *     first and "other" lines second.
 *  3. Pad the snippets towards a common size, join them with
 *     '<b> ... </b>' separators and wrap every matched term in
 *     <span class='searchmatch'>.
 *
 * @param string $text Wikitext to build the snippet from
 * @param array $terms Search terms; each one is inserted into regular
 *   expressions without escaping, so it has to be a valid pattern fragment
 * @param int $contextlines Maximum number of snippet lines
 * @param int $contextchars Amount of context per line, in characters
 *   (scaled to a byte count internally)
 * @return string Snippet with highlighting markup, or '' for empty $text
 */
public function highlightText( $text, $terms, $contextlines, $contextchars ) {
	// $wgContLang is needed for the namespace lookup on [[Foo:...]] links below.
	global $wgContLang, $wgSearchHighlightBoundaries;
	$fname = __METHOD__;

	if ( $text == '' ) {
		return '';
	}

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = array(
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table

	// @todo FIXME: This should probably be a hook or something
	if ( function_exists( 'wfCite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = array(); // text extracts
	$otherExt = array(); // other extracts
	wfProfileIn( "$fname-split" );
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				// groups that did not take part in the match carry offset -1
				if ( $key > 0 && $val[1] != - 1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, - 1 );
						if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
							break;
						}

					}
					$epat = $endPatterns[$key];
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					// group 2 is only present when the closing alternative matched
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level -= 1;
						}
					} else {
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( ! $found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}

	$all = $textExt + $otherExt; // these have disjunct key sets

	wfProfileOut( "$fname-split" );

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
		} else {
			$terms[$index] = $term;
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

	// @todo FIXME: A hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	// An empty term list gives an empty $anyterm; fall back to a scale
	// of 1 instead of dividing by zero.
	$anytermChars = mb_strlen( $anyterm );
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|$wgSearchHighlightBoundaries)";
	$patPost = "($wgSearchHighlightBoundaries|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	wfProfileIn( "$fname-extract" );

	$left = $contextlines;

	$snippets = array();
	$offsets = array();

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		// skip lines starting with ';' or ':'
		if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if ( $firstText ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( ! preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( ! $snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = array();
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = array( $first => $snippets[$first] );
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	// $snippets = array_map( 'htmlspecialchars', $extended );
	$snippets = $extended;
	$last = - 1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == - 1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) ) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	if ( $extract ) {
		$extract .= '<b> ... </b>';
	}

	// highlight each distinct term once
	$processed = array();
	foreach ( $terms as $term ) {
		if ( ! isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	wfProfileOut( "$fname-extract" );

	return $extract;
}
287 
/**
 * Split text into lines and add it to extracts array.
 *
 * When $mCleanWikitext is set the text is passed through removeWiki()
 * before it is split. Each line is trimmed; lines that are empty after
 * trimming are skipped.
 *
 * @param array &$extracts Extracts collected so far; new lines are stored
 *   under consecutive $count keys
 * @param int &$count Next sequence number; incremented once per added line
 * @param string $text Text to split on "\n"
 */
function splitAndAdd( &$extracts, &$count, $text ) {
	$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
	foreach ( $split as $line ) {
		$tt = trim( $line );
		// Compare against '' explicitly: a plain truthiness test would
		// also discard a line whose whole content is "0".
		if ( $tt !== '' ) {
			$extracts[$count++] = $tt;
		}
	}
}
304 
/**
 * Do manual case conversion for non-ascii chars.
 *
 * Used as a preg_replace_callback() callback on single characters: a
 * multi-byte character is turned into a character class holding its
 * lower- and upper-case forms, a single-byte character is returned
 * unchanged.
 *
 * @param array $matches Match array; only $matches[0] is read
 * @return string Pattern fragment for the matched character
 */
function caseCallback( $matches ) {
	// The content language object lives in the global scope; without this
	// declaration $wgContLang would be an undefined local.
	global $wgContLang;

	if ( strlen( $matches[0] ) > 1 ) {
		return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
	}

	return $matches[0];
}
319 
/**
 * Extract part of the text from start to end, but by not chopping up words.
 *
 * Both boundaries are adjusted with position(): the start only when it is
 * non-zero, the end only when it lies before the end of the text.
 *
 * @param string $text
 * @param int $start Requested start offset
 * @param int $end Requested end offset
 * @param int|null &$posStart If not null, receives the adjusted start offset
 * @param int|null &$posEnd If not null, receives the adjusted end offset
 * @return string The extracted piece, or '' when the adjusted end is not
 *   after the adjusted start
 */
function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	$textLength = strlen( $text );

	if ( $start != 0 ) {
		// step one past the boundary character that position() finds
		$start = $this->position( $text, $start, 1 );
	}
	$end = ( $end >= $textLength ) ? $textLength : $this->position( $text, $end );

	// report the adjusted offsets only to callers that passed a non-null variable
	if ( $posStart !== null ) {
		$posStart = $start;
	}
	if ( $posEnd !== null ) {
		$posEnd = $end;
	}

	return ( $end > $start ) ? substr( $text, $start, $end - $start ) : '';
}
353 
/**
 * Find a nonletter near a point (index) in the text.
 *
 * Looks at the bytes from 10 before to 10 after $point and returns the
 * position of the first space or punctuation character in that window,
 * plus $offset. When the window holds no such character, $point itself is
 * returned after moving it forward past any UTF-8 continuation bytes
 * ($offset is not applied in that case).
 *
 * @param string $text
 * @param int $point Byte index to search around
 * @param int $offset Added to the position of a boundary character
 * @return int Byte index into $text
 */
function position( $text, $point, $offset = 0 ) {
	$window = 10;
	$textLength = strlen( $text );
	$from = max( 0, $point - $window );
	$span = min( $textLength, $point + $window ) - $from;

	$hit = array();
	$found = preg_match(
		'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
		substr( $text, $from, $span ),
		$hit,
		PREG_OFFSET_CAPTURE
	);
	if ( $found ) {
		// offset is relative to the window, so shift it back into $text
		return $hit[0][1] + $from + $offset;
	}

	// no boundary character nearby: make sure we land on the first byte
	// of a UTF-8 character
	while ( true ) {
		$byte = ord( $text[$point] );
		if ( $byte < 0x80 || $byte >= 0xc0 ) {
			return $point;
		}
		// 0x80-0xbf is a continuation byte, move on
		$point++;
		if ( $point >= $textLength ) {
			return $textLength;
		}
	}
}
384 
/**
 * Search extracts for a pattern, and return snippets.
 *
 * For every extract that matches $pattern and has no snippet yet, a piece
 * of about $contextchars bytes around the match is stored in $out, and the
 * offset at which that piece starts is stored in $offsets. Stops as soon
 * as $linesleft reaches zero.
 *
 * @param string $pattern Regular expression to search for
 * @param array $extracts Lines to search, keyed by sequence number
 * @param int &$linesleft Number of snippets still wanted; decremented per hit
 * @param int &$contextchars Snippet length in bytes
 * @param array &$out Snippets found so far, keyed like $extracts
 * @param array &$offsets Start offset of each snippet within its line
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		return; // nothing to do
	}

	foreach ( $extracts as $index => $line ) {
		$hit = array();
		// skip lines that already have a snippet, and lines without a match
		if ( array_key_exists( $index, $out )
			|| !preg_match( $pattern, $line, $hit, PREG_OFFSET_CAPTURE )
		) {
			continue;
		}

		$matchStart = $hit[0][1];
		$matchLen = strlen( $hit[0][0] );

		// Start at the line beginning when the match ends within the first
		// $contextchars bytes; otherwise start at the match if it is longer
		// than the context, or centre the context on the match.
		$begin = 0;
		if ( $matchStart + $matchLen >= $contextchars ) {
			$begin = ( $matchLen > $contextchars )
				? $matchStart
				: $matchStart + intval( ( $matchLen - $contextchars ) / 2 );
		}

		// basic snippet from this line; extract() writes the adjusted
		// start offset back into $snippetStart
		$snippetStart = $begin;
		$out[$index] = $this->extract( $line, $begin, $begin + $contextchars, $snippetStart );
		$offsets[$index] = $snippetStart;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
432 
/**
 * Basic wikitext removal.
 *
 * Applies a fixed sequence of regular-expression replacements (templates,
 * links, tags, quote markup) and finally escapes the result with
 * Sanitizer::escapeHtmlAllowEntities().
 *
 * @param string $text
 * @return string
 */
function removeWiki( $text ) {
	$fname = __METHOD__;
	wfProfileIn( $fname );

	// pattern => replacement steps applied, in this order, before piped
	// links are handled
	$templateAndLinkSteps = array(
		// {{...}} without a pipe: drop entirely
		array( "/\\{\\{([^|]+?)\\}\\}/", "" ),
		// {{name|rest}}: keep what follows the first pipe
		array( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2" ),
		// [[target]] without a pipe: keep the target
		array( "/\\[\\[([^|]+?)\\]\\]/", "\\1" ),
	);
	foreach ( $templateAndLinkSteps as $step ) {
		$text = preg_replace( $step[0], $step[1], $text );
	}

	// [[target|caption]]: decided per link by linkReplace()
	$text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );

	// tags and quote markup, removed in this order
	$strippedPatterns = array(
		"/<\/?[^>]+>/",
		"/'''''/",
		"/('''|<\/?[iIuUbB]>)/",
		"/''/",
	);
	foreach ( $strippedPatterns as $pattern ) {
		$text = preg_replace( $pattern, "", $text );
	}

	// Note, the previous /<\/?[^>]+>/ is insufficient
	// for XSS safety as the HTML tag can span multiple
	// search results (T144845).
	$text = Sanitizer::escapeHtmlAllowEntities( $text );

	wfProfileOut( $fname );
	return $text;
}
466 
/**
 * Callback to replace [[target|caption]] kind of links: if the target is
 * in the File or Category namespace the whole link is returned unchanged,
 * otherwise only the caption is returned.
 *
 * @param array $matches Match array: [0] is the whole link, [1] the target
 *   including the trailing pipe, [2] the caption
 * @return string
 */
function linkReplace( $matches ) {
	// The content language object lives in the global scope; without this
	// declaration $wgContLang would be an undefined local.
	global $wgContLang;

	$colon = strpos( $matches[1], ':' );
	if ( $colon === false ) {
		return $matches[2]; // replace with caption
	}

	$ns = substr( $matches[1], 0, $colon );
	$index = $wgContLang->getNsIndex( $ns );
	if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
		return $matches[0]; // return the whole thing
	}

	return $matches[2];
}
487 
/**
 * Simple & fast snippet extraction, but gives completely irrelevant snippets.
 *
 * Goes through $text line by line; every line that contains one of the
 * terms contributes one output line made of up to $contextchars of text
 * before the match, the match, and up to $contextchars after it. The
 * line is HTML-escaped and every term occurrence in it is wrapped in
 * <span class='searchmatch'>.
 *
 * @param string $text
 * @param array $terms Search terms; joined with '|' and inserted into the
 *   regular expressions without escaping
 * @param int $contextlines Maximum number of lines to output
 * @param int $contextchars Context kept on each side of the match
 * @return string Snippet lines, each terminated by "\n"
 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
	// The content language object lives in the global scope; without this
	// declaration $wgContLang would be an undefined local.
	global $wgContLang;
	$fname = __METHOD__;

	$lines = explode( "\n", $text );

	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	$pat1 = "/(.*)($terms)(.{0,$max})/i";
	// same for every line, so build it once
	$pat2 = '/(' . $terms . ")/i";

	$extract = "";
	wfProfileIn( "$fname-extract" );
	foreach ( $lines as $line ) {
		if ( 0 == $contextlines ) {
			break;
		}
		$m = array();
		if ( ! preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		--$contextlines;
		// truncate function changes ... to relevant i18n message.
		$pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );

		// only read the trailing group when it is actually present
		if ( isset( $m[3] ) ) {
			$post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
		} else {
			$post = '';
		}

		$found = $m[2];

		$line = htmlspecialchars( $pre . $found . $post );
		$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

		$extract .= $line . "\n";
	}
	wfProfileOut( "$fname-extract" );

	return $extract;
}
543 }
SearchHighlighter\splitAndAdd
splitAndAdd(&$extracts, &$count, $text)
Split text into lines and add it to extracts array.
Definition: SearchHighlighter.php:295
php
skin txt MediaWiki includes four core it has been set as the default in MediaWiki since the replacing Monobook it had been been the default skin since before being replaced by Vector largely rewritten in while keeping its appearance Several legacy skins were removed in the as the burden of supporting them became too heavy to bear Those in etc for skin dependent CSS etc for skin dependent JavaScript These can also be customised on a per user by etc This feature has led to a wide variety of user styles becoming that gallery is a good place to ending in php
Definition: skin.txt:62
$last
$last
Definition: profileinfo.php:365
SearchHighlighter\$mCleanWikitext
$mCleanWikitext
Definition: SearchHighlighter.php:30
wfProfileIn
wfProfileIn( $functionname)
Begin profiling of a function.
Definition: Profiler.php:33
SearchHighlighter\highlightSimple
highlightSimple( $text, $terms, $contextlines, $contextchars)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
Definition: SearchHighlighter.php:498
$fname
if(!defined( 'MEDIAWIKI')) $fname
This file is not a valid entry point, perform no further processing unless MEDIAWIKI is defined.
Definition: Setup.php:35
NS_FILE
const NS_FILE
Definition: Defines.php:85
$s
$s
Definition: mergeMessageFileList.php:156
$wgContLang
this class mediates it Skin Encapsulates a look and feel for the wiki All of the functions that render HTML and make choices about how to render it are here and are called from various other places when and is meant to be subclassed with other skins that may override some of its functions The User object contains a reference to a and so rather than having a global skin object we just rely on the global User and get the skin with $wgUser and also has some character encoding functions and other locale stuff The current user interface language is instantiated as and the content language as $wgContLang
Definition: design.txt:56
SearchHighlighter\extract
extract( $text, $start, $end, &$posStart=null, &$posEnd=null)
Extract part of the text from start to end, but by not chopping up words.
Definition: SearchHighlighter.php:330
$processed
$processed
Definition: importImages.php:40
$out
$out
Definition: UtfNormalGenerate.php:167
$pre
return true to allow those checks to and false if checking is done use this to change the tables headers temp or archived zone change it to an object instance and return false override the list derivative used the name of the old file when set the default code will be skipped $pre
Definition: hooks.txt:1105
wfProfileOut
wfProfileOut( $functionname='missing')
Stop profiling of a function.
Definition: Profiler.php:46
SearchHighlighter\highlightText
highlightText( $text, $terms, $contextlines, $contextchars)
Default implementation of wikitext highlighting.
Definition: SearchHighlighter.php:49
$lines
$lines
Definition: router.php:65
array
the array() calling protocol came about after MediaWiki 1.4rc1.
List of Api Query prop modules.
global
when a variable name is used in a function, it is silently declared as a new local masking the global
Definition: design.txt:93
NS_CATEGORY
const NS_CATEGORY
Definition: Defines.php:93
$line
$line
Definition: cdb.php:57
SearchHighlighter\removeWiki
removeWiki( $text)
Basic wikitext removal.
Definition: SearchHighlighter.php:438
$matches
if(!defined( 'MEDIAWIKI')) if(!isset( $wgVersion)) $matches
Definition: NoLocalSettings.php:33
SearchHighlighter\__construct
__construct( $cleanupWikitext=true)
Definition: SearchHighlighter.php:36
SearchHighlighter
Highlight bits of wikitext.
Definition: SearchHighlighter.php:29
$count
$count
Definition: UtfNormalTest2.php:96
SearchHighlighter\caseCallback
caseCallback( $matches)
Do manual case conversion for non-ascii chars.
Definition: SearchHighlighter.php:311
SearchHighlighter\position
position( $text, $point, $offset=0)
Find a nonletter near a point (index) in the text.
Definition: SearchHighlighter.php:362
$term
the value to return A Title object or null whereas SearchGetNearMatch runs after $term
Definition: hooks.txt:2136
as
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
SearchHighlighter\process
process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets)
Search extracts for a pattern, and return snippets.
Definition: SearchHighlighter.php:396
SearchHighlighter\linkReplace
linkReplace( $matches)
callback to replace [[target|caption]] kind of links, if the target is category or image,...
Definition: SearchHighlighter.php:473
Sanitizer\escapeHtmlAllowEntities
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Definition: Sanitizer.php:1159