MediaWiki  1.23.2
SearchHighlighter.php
Go to the documentation of this file.
1 <?php
/**
 * When true, splitAndAdd() runs text through removeWiki() before
 * splitting it into lines.
 */
public $mCleanWikitext = true;

/**
 * @param bool $cleanupWikitext Value stored in $mCleanWikitext
 */
public function __construct( $cleanupWikitext = true ) {
    $this->mCleanWikitext = $cleanupWikitext;
}
35 
/**
 * Default implementation of wikitext highlighting.
 *
 * The text is first split into plain-text lines and "other" lines
 * (templates, image links, tables and, when wfCite() exists, <ref> blocks).
 * Snippets are then chosen around term matches, padded towards a common
 * size, joined with '<b> ... </b>' separators, and every matched term is
 * wrapped in <span class='searchmatch'>.
 *
 * No HTML escaping is applied to the extracted text in this method.
 *
 * @param string $text Wikitext to build the snippet from
 * @param array $terms Terms to highlight; each one is embedded into regular
 *   expressions as-is (no quoting is applied here)
 * @param int $contextlines Maximum number of snippet lines
 * @param int $contextchars Target snippet size per line (scaled to bytes below)
 * @return string Highlighted extract, or '' when $text is empty
 */
public function highlightText( $text, $terms, $contextlines, $contextchars ) {
    // $wgContLang is needed for namespace lookups in the splitter below.
    global $wgContLang, $wgSearchHighlightBoundaries;
    $fname = __METHOD__;

    if ( $text == '' ) {
        return '';
    }

    // split text into text + templates/links/tables
    $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
    // first capture group is for detecting nested templates/links/tables/references
    $endPatterns = array(
        1 => '/(\{\{)|(\}\})/', // template
        2 => '/(\[\[)|(\]\])/', // image
        3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table

    // @todo FIXME: This should prolly be a hook or something
    if ( function_exists( 'wfCite' ) ) {
        $spat .= '|(<ref>)'; // references via cite extension
        $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
    }
    $spat .= '/';
    $textExt = array(); // text extracts
    $otherExt = array(); // other extracts
    wfProfileIn( "$fname-split" );
    $start = 0;
    $textLen = strlen( $text );
    $count = 0; // sequence number to maintain ordering
    while ( $start < $textLen ) {
        // find start of template/image/table
        if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
            $epat = '';
            foreach ( $matches as $key => $val ) {
                // with PREG_OFFSET_CAPTURE an unmatched group has offset -1
                if ( $key > 0 && $val[1] != - 1 ) {
                    if ( $key == 2 ) {
                        // see if this is an image link
                        $ns = substr( $val[0], 2, - 1 );
                        if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
                            // not an image: $epat stays empty, so the rest of
                            // the text is added as plain text below
                            break;
                        }

                    }
                    $epat = $endPatterns[$key];
                    $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
                    $start = $val[1];
                    break;
                }
            }
            if ( $epat ) {
                // find end (and detect any nested elements)
                $level = 0;
                $offset = $start + 1;
                $found = false;
                while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
                    // group 2 is the closing token; trailing unmatched groups
                    // are omitted from the result, so its presence means "end"
                    if ( array_key_exists( 2, $endMatches ) ) {
                        // found end
                        if ( $level == 0 ) {
                            $len = strlen( $endMatches[2][0] );
                            $off = $endMatches[2][1];
                            $this->splitAndAdd( $otherExt, $count,
                                substr( $text, $start, $off + $len - $start ) );
                            $start = $off + $len;
                            $found = true;
                            break;
                        } else {
                            // end of nested element
                            $level -= 1;
                        }
                    } else {
                        // nested
                        $level += 1;
                    }
                    $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
                }
                if ( ! $found ) {
                    // couldn't find appropriate closing tag, skip
                    $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
                    $start += strlen( $matches[0][0] );
                }
                continue;
            }
        }
        // else: add as text extract
        $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
        break;
    }

    $all = $textExt + $otherExt; // these have disjunct key sets

    wfProfileOut( "$fname-split" );

    // prepare regexps
    foreach ( $terms as $index => $term ) {
        // manually do upper/lowercase stuff for utf-8 since PHP won't do it
        if ( preg_match( '/[\x80-\xff]/', $term ) ) {
            $terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
        }
    }
    $anyterm = implode( '|', $terms );
    $phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

    // @todo FIXME: A hack to scale contextchars, a correct solution
    // would be to have contextchars actually be char and not byte
    // length, and do proper utf-8 substrings and lengths everywhere,
    // but PHP is making that very hard and unclean to implement :(
    $anytermChars = mb_strlen( $anyterm );
    // an empty term list yields an empty $anyterm; avoid dividing by zero
    $scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
    $contextchars = intval( $contextchars * $scale );

    $patPre = "(^|$wgSearchHighlightBoundaries)";
    $patPost = "($wgSearchHighlightBoundaries|$)";

    $pat1 = "/(" . $phrase . ")/ui";
    $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

    wfProfileIn( "$fname-extract" );

    $left = $contextlines;

    $snippets = array();
    $offsets = array();

    // show beginning only if it contains all words
    $first = 0;
    $firstText = '';
    foreach ( $textExt as $index => $line ) {
        if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
            $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
            $first = $index;
            break;
        }
    }
    if ( $firstText ) {
        $succ = true;
        // check if first text contains all terms
        foreach ( $terms as $term ) {
            if ( ! preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
                $succ = false;
                break;
            }
        }
        if ( $succ ) {
            $snippets[$first] = $firstText;
            $offsets[$first] = 0;
        }
    }
    if ( ! $snippets ) {
        // match whole query on text
        $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
        // match whole query on templates/tables/images
        $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
        // match any words on text
        $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
        // match any words on templates/tables/images
        $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

        ksort( $snippets );
    }

    // add extra chars to each snippet to make snippets constant size
    $extended = array();
    if ( count( $snippets ) == 0 ) {
        // couldn't find the target words, just show beginning of article
        if ( array_key_exists( $first, $all ) ) {
            $targetchars = $contextchars * $contextlines;
            $snippets[$first] = '';
            $offsets[$first] = 0;
        }
    } else {
        // if begin of the article contains the whole phrase, show only that !!
        if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
            && $offsets[$first] < $contextchars * 2 ) {
            $snippets = array( $first => $snippets[$first] );
        }

        // calc by how much to extend existing snippets
        $targetchars = intval( ( $contextchars * $contextlines ) / count ( $snippets ) );
    }

    foreach ( $snippets as $index => $line ) {
        $extended[$index] = $line;
        $len = strlen( $line );
        if ( $len < $targetchars - 20 ) {
            // complete this line
            if ( $len < strlen( $all[$index] ) ) {
                $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
                $len = strlen( $extended[$index] );
            }

            // add more lines
            $add = $index + 1;
            while ( $len < $targetchars - 20
                && array_key_exists( $add, $all )
                && !array_key_exists( $add, $snippets ) ) {
                $offsets[$add] = 0;
                $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
                $extended[$add] = $tt;
                $len += strlen( $tt );
                $add++;
            }
        }
    }

    // $snippets = array_map( 'htmlspecialchars', $extended );
    $snippets = $extended;
    $last = - 1;
    $extract = '';
    foreach ( $snippets as $index => $line ) {
        if ( $last == - 1 ) {
            $extract .= $line; // first line
        } elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) ) {
            $extract .= " " . $line; // continuous lines
        } else {
            $extract .= '<b> ... </b>' . $line;
        }

        $last = $index;
    }
    if ( $extract ) {
        $extract .= '<b> ... </b>';
    }

    // highlight each distinct term once
    $processed = array();
    foreach ( $terms as $term ) {
        if ( ! isset( $processed[$term] ) ) {
            $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
            $extract = preg_replace( $pat3,
                "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
            $processed[$term] = true;
        }
    }

    wfProfileOut( "$fname-extract" );

    return $extract;
}
283 
/**
 * Split text into lines and add it to extracts array.
 *
 * The text is passed through removeWiki() first when $mCleanWikitext is
 * set. Each line is trimmed; lines that are empty after trimming are
 * skipped.
 *
 * @param array &$extracts Array the lines are appended to, keyed by $count
 * @param int &$count Running sequence number; incremented per added line
 * @param string $text Text to split on "\n"
 */
function splitAndAdd( &$extracts, &$count, $text ) {
    $split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
    foreach ( $split as $line ) {
        $tt = trim( $line );
        // compare against '' explicitly: a bare truthiness test would also
        // discard a line consisting only of "0"
        if ( $tt !== '' ) {
            $extracts[$count++] = $tt;
        }
    }
}
300 
/**
 * Do manual case conversion for non-ascii chars.
 *
 * Used as a preg_replace_callback() callback: a multi-byte match is turned
 * into a character class holding its lower- and upper-case forms, a
 * single-byte match is returned unchanged.
 *
 * @param array $matches Match array; only $matches[0] is used
 * @return string
 */
function caseCallback( $matches ) {
    // the content language object supplies lc()/uc()
    global $wgContLang;

    if ( strlen( $matches[0] ) > 1 ) {
        return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
    }

    return $matches[0];
}
315 
/**
 * Extract part of the text from start to end, but by not chopping up words.
 *
 * Both boundaries are adjusted through position(); an $end at or beyond
 * the end of the text is clamped to the text length instead.
 *
 * @param string $text
 * @param int $start Requested start offset (0 is used as-is)
 * @param int $end Requested end offset
 * @param int|null &$posStart Receives the adjusted start, but only when the
 *   passed variable is not null on entry
 * @param int|null &$posEnd Receives the adjusted end, under the same condition
 * @return string The extracted substring, or '' when the adjusted end does
 *   not lie after the adjusted start
 */
function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
    $textLength = strlen( $text );

    if ( $start != 0 ) {
        $start = $this->position( $text, $start, 1 );
    }
    $end = ( $end >= $textLength ) ? $textLength : $this->position( $text, $end );

    // report the adjusted boundaries only to callers that supplied a value
    if ( $posStart !== null ) {
        $posStart = $start;
    }
    if ( $posEnd !== null ) {
        $posEnd = $end;
    }

    return ( $end > $start ) ? substr( $text, $start, $end - $start ) : '';
}
349 
/**
 * Find a nonletter near a point (index) in the text.
 *
 * A window of up to 10 bytes on either side of $point is searched for the
 * first space or punctuation byte. If none is found, $point itself is used
 * after being advanced past any UTF-8 continuation bytes (0x80-0xBF).
 *
 * @param string $text
 * @param int $point Byte offset to search around
 * @param int $offset Added to the result when a nonletter was found
 * @return int Byte offset into $text; never larger than strlen( $text ) when
 *   no nonletter was found
 */
function position( $text, $point, $offset = 0 ) {
    $tolerance = 10;
    $length = strlen( $text );
    $s = max( 0, $point - $tolerance );
    $l = min( $length, $point + $tolerance ) - $s;
    $m = array();
    if ( preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
        return $m[0][1] + $s + $offset;
    }

    // nothing to inspect at or past the end of the text; reading
    // $text[$point] there would be an uninitialized string offset
    if ( $point >= $length ) {
        return $length;
    }

    // check if point is on a valid first UTF8 char
    $char = ord( $text[$point] );
    while ( $char >= 0x80 && $char < 0xc0 ) {
        // skip trailing bytes
        $point++;
        if ( $point >= $length ) {
            return $length;
        }
        $char = ord( $text[$point] );
    }

    return $point;
}
380 
/**
 * Search extracts for a pattern, and return snippets.
 *
 * For every extract line that matches $pattern and has no snippet yet, a
 * window of $contextchars bytes around the match is cut out with extract().
 * Processing stops once $linesleft reaches zero.
 *
 * @param string $pattern Regular expression to search for
 * @param array $extracts Lines to search, keyed by sequence number
 * @param int &$linesleft Remaining snippet budget; decremented per snippet
 * @param int &$contextchars Snippet window size (only read here)
 * @param array &$out Receives the snippets, keyed like $extracts
 * @param array &$offsets Receives each snippet's start offset in its line
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
    if ( $linesleft == 0 ) {
        return; // budget already used up
    }

    foreach ( $extracts as $key => $candidate ) {
        $hit = array();
        // skip lines that already have a snippet or do not match at all
        if ( array_key_exists( $key, $out )
            || !preg_match( $pattern, $candidate, $hit, PREG_OFFSET_CAPTURE )
        ) {
            continue;
        }

        $matchStart = $hit[0][1];
        $matchLength = strlen( $hit[0][0] );

        if ( $matchStart + $matchLength < $contextchars ) {
            // the match fits inside a window anchored at the line start
            $from = 0;
        } elseif ( $matchLength > $contextchars ) {
            // the match is longer than the window: start at the match
            $from = $matchStart;
        } else {
            // centre the window on the match
            $from = $matchStart + intval( ( $matchLength - $contextchars ) / 2 );
        }

        // seeded with a non-null value so extract() writes the adjusted start back
        $snippetStart = $from;
        $out[$key] = $this->extract( $candidate, $from, $from + $contextchars, $snippetStart );
        $offsets[$key] = $snippetStart;

        if ( --$linesleft == 0 ) {
            return;
        }
    }
}
428 
/**
 * Basic wikitext removal.
 *
 * Applies a fixed sequence of regular-expression replacements; each step
 * works on the result of the previous one.
 *
 * @param string $text
 * @return string
 */
function removeWiki( $text ) {
    wfProfileIn( __METHOD__ );

    // pattern => replacement, applied in this order before link handling
    $braceAndLinkRules = array(
        // {{...}} without a pipe: dropped entirely
        array( "/\\{\\{([^|]+?)\\}\\}/", "" ),
        // {{name|rest}}: keep only what follows the first pipe
        array( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2" ),
        // [[target]] without a pipe: keep the target text
        array( "/\\[\\[([^|]+?)\\]\\]/", "\\1" ),
    );
    foreach ( $braceAndLinkRules as $rule ) {
        $text = preg_replace( $rule[0], $rule[1], $text );
    }

    // [[target|caption]]: linkReplace() decides what to keep
    $text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );

    // tags and quote-based markup, all replaced with nothing
    $strippedPatterns = array(
        "/<\/?[^>]+>/",
        "/'''''/",
        "/('''|<\/?[iIuUbB]>)/",
        "/''/",
    );
    foreach ( $strippedPatterns as $strippedPattern ) {
        $text = preg_replace( $strippedPattern, "", $text );
    }

    wfProfileOut( __METHOD__ );

    return $text;
}
457 
/**
 * Callback to replace [[target|caption]] kind of links.
 *
 * If the part before the first colon of the target resolves to the file or
 * category namespace, the whole link is returned untouched; otherwise only
 * the caption is returned.
 *
 * @param array $matches Match array: [0] whole link, [1] target including
 *   the trailing pipe, [2] caption
 * @return string
 */
function linkReplace( $matches ) {
    // the content language object resolves namespace names
    global $wgContLang;

    $colon = strpos( $matches[1], ':' );
    if ( $colon === false ) {
        return $matches[2]; // replace with caption
    }

    $ns = substr( $matches[1], 0, $colon );
    $index = $wgContLang->getNsIndex( $ns );
    if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
        return $matches[0]; // return the whole thing
    }

    return $matches[2];
}
478 
/**
 * Simple & fast snippet extraction, but gives completely irrelevant snippets.
 *
 * Each line of $text is matched against the terms; matching lines are
 * truncated around the match, HTML-escaped, and the terms are wrapped in
 * <span class='searchmatch'>.
 *
 * @param string $text
 * @param array $terms Terms; joined with '|' and embedded into the regular
 *   expressions as-is (no quoting is applied here)
 * @param int $contextlines Maximum number of matching lines to return
 * @param int $contextchars Context length passed to truncate() on each side
 * @return string Matching lines, each terminated by "\n"
 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
    // the content language object supplies truncate()
    global $wgContLang;
    $fname = __METHOD__;

    $lines = explode( "\n", $text );

    $terms = implode( '|', $terms );
    $max = intval( $contextchars ) + 1;
    $pat1 = "/(.*)($terms)(.{0,$max})/i";

    $extract = "";
    wfProfileIn( "$fname-extract" );
    foreach ( $lines as $line ) {
        if ( 0 == $contextlines ) {
            break;
        }
        $m = array();
        if ( ! preg_match( $pat1, $line, $m ) ) {
            continue;
        }
        --$contextlines;
        // truncate function changes ... to relevant i18n message.
        $pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );

        if ( count( $m ) < 3 ) {
            $post = '';
        } else {
            $post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
        }

        $found = $m[2];

        // escape first, then add the highlighting markup
        $line = htmlspecialchars( $pre . $found . $post );
        $pat2 = '/(' . $terms . ")/i";
        $line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

        $extract .= $line . "\n";
    }
    wfProfileOut( "$fname-extract" );

    return $extract;
}
534 }
SearchHighlighter\splitAndAdd
splitAndAdd(&$extracts, &$count, $text)
Split text into lines and add it to extracts array.
Definition: SearchHighlighter.php:291
php
skin txt MediaWiki includes four core it has been set as the default in MediaWiki since the replacing Monobook it had been been the default skin since before being replaced by Vector largely rewritten in while keeping its appearance Several legacy skins were removed in the as the burden of supporting them became too heavy to bear Those in etc for skin dependent CSS etc for skin dependent JavaScript These can also be customised on a per user by etc This feature has led to a wide variety of user styles becoming that gallery is a good place to ending in php
Definition: skin.txt:62
$last
$last
Definition: profileinfo.php:365
SearchHighlighter\$mCleanWikitext
$mCleanWikitext
Definition: SearchHighlighter.php:30
wfProfileIn
wfProfileIn( $functionname)
Begin profiling of a function.
Definition: Profiler.php:33
SearchHighlighter\highlightSimple
highlightSimple( $text, $terms, $contextlines, $contextchars)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
Definition: SearchHighlighter.php:489
$fname
if(!defined( 'MEDIAWIKI')) $fname
This file is not a valid entry point, perform no further processing unless MEDIAWIKI is defined.
Definition: Setup.php:35
NS_FILE
const NS_FILE
Definition: Defines.php:85
$s
$s
Definition: mergeMessageFileList.php:156
$wgContLang
this class mediates it Skin Encapsulates a look and feel for the wiki All of the functions that render HTML and make choices about how to render it are here and are called from various other places when and is meant to be subclassed with other skins that may override some of its functions The User object contains a reference to a and so rather than having a global skin object we just rely on the global User and get the skin with $wgUser and also has some character encoding functions and other locale stuff The current user interface language is instantiated as and the content language as $wgContLang
Definition: design.txt:56
SearchHighlighter\extract
extract( $text, $start, $end, &$posStart=null, &$posEnd=null)
Extract part of the text from start to end, but by not chopping up words.
Definition: SearchHighlighter.php:326
$processed
$processed
Definition: importImages.php:40
$out
$out
Definition: UtfNormalGenerate.php:167
$pre
return true to allow those checks to and false if checking is done use this to change the tables headers temp or archived zone change it to an object instance and return false override the list derivative used the name of the old file when set the default code will be skipped $pre
Definition: hooks.txt:1105
wfProfileOut
wfProfileOut( $functionname='missing')
Stop profiling of a function.
Definition: Profiler.php:46
SearchHighlighter\highlightText
highlightText( $text, $terms, $contextlines, $contextchars)
Default implementation of wikitext highlighting.
Definition: SearchHighlighter.php:45
$lines
$lines
Definition: router.php:65
array
the array() calling protocol came about after MediaWiki 1.4rc1.
List of Api Query prop modules.
global
when a variable name is used in a it is silently declared as a new masking the global
Definition: design.txt:93
NS_CATEGORY
const NS_CATEGORY
Definition: Defines.php:93
$line
$line
Definition: cdb.php:57
SearchHighlighter\removeWiki
removeWiki( $text)
Basic wikitext removal.
Definition: SearchHighlighter.php:434
$matches
if(!defined( 'MEDIAWIKI')) if(!isset( $wgVersion)) $matches
Definition: NoLocalSettings.php:33
SearchHighlighter\__construct
__construct( $cleanupWikitext=true)
Definition: SearchHighlighter.php:32
SearchHighlighter
Highlight bits of wikitext.
Definition: SearchHighlighter.php:29
$count
$count
Definition: UtfNormalTest2.php:96
SearchHighlighter\caseCallback
caseCallback( $matches)
Do manual case conversion for non-ascii chars.
Definition: SearchHighlighter.php:307
SearchHighlighter\position
position( $text, $point, $offset=0)
Find a nonletter near a point (index) in the text.
Definition: SearchHighlighter.php:358
$term
the value to return A Title object or null whereas SearchGetNearMatch runs after $term
Definition: hooks.txt:2125
as
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
SearchHighlighter\process
process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets)
Search extracts for a pattern, and return snippets.
Definition: SearchHighlighter.php:392
SearchHighlighter\linkReplace
linkReplace( $matches)
callback to replace [[target|caption]] kind of links, if the target is category or image,...
Definition: SearchHighlighter.php:464