MediaWiki REL1_30
SearchHighlighter.php
Go to the documentation of this file.
1<?php
30 protected $mCleanWikitext = true;
31
/**
 * Create a highlighter.
 *
 * @param bool $cleanupWikitext Stored in $this->mCleanWikitext; when true,
 *  splitAndAdd() passes text through removeWiki() before splitting it
 *  into lines.
 */
function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}
40
/**
 * Build a highlighted snippet from wikitext.
 *
 * The text is first partitioned into plain-text lines and "other" lines
 * (templates, file links, tables and, when the Cite class exists, <ref>
 * blocks). Snippets are then picked from those lines, padded towards a
 * constant size, joined with "<b> ... </b>" separators, and every term is
 * wrapped in <span class='searchmatch'>.
 *
 * No HTML escaping is done in this method itself; lines are only escaped
 * through removeWiki() when $this->mCleanWikitext is true.
 *
 * @param string $text Wikitext to extract snippets from
 * @param array $terms Regex fragments to highlight; they are interpolated
 *  into PCRE patterns as-is (not quoted here)
 * @param int $contextlines Maximum number of snippet lines
 * @param int $contextchars Approximate number of characters per line
 *  (scaled to bytes internally, see the FIXME below)
 * @return string Snippet with highlight markup, '' for empty $text
 */
public function highlightText( $text, $terms, $contextlines, $contextchars ) {
	// Both globals are used below; without this declaration they would be
	// undefined local variables.
	global $wgContLang, $wgSearchHighlightBoundaries;

	if ( $text == '' ) {
		return '';
	}

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = [
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

	// @todo FIXME: This should probably be a hook or something
	// instead of hardcoding a class name from the Cite extension
	if ( class_exists( 'Cite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = []; // text extracts
	$otherExt = []; // other extracts
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				// with PREG_OFFSET_CAPTURE, unmatched groups have offset -1
				if ( $key > 0 && $val[1] != -1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, -1 );
						if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
							break;
						}

					}
					$epat = $endPatterns[$key];
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level -= 1;
						}
					} else {
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( !$found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}

	$all = $textExt + $otherExt; // these have disjunct key sets

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback(
				'/./us',
				[ $this, 'caseCallback' ],
				$terms[$index]
			);
		} else {
			$terms[$index] = $term;
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
	// @todo FIXME: A hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	// An empty term list gives an empty $anyterm; keep the scale at 1
	// then instead of dividing by zero.
	$anytermChars = mb_strlen( $anyterm );
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|$wgSearchHighlightBoundaries)";
	$patPost = "($wgSearchHighlightBoundaries|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	$left = $contextlines;

	$snippets = [];
	$offsets = [];

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if ( $firstText ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( !$snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = [];
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = [ $first => $snippets[$first] ];
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract(
					$all[$index],
					$offsets[$index],
					$offsets[$index] + $targetchars,
					$offsets[$index]
				);
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	// $snippets = array_map( 'htmlspecialchars', $extended );
	$snippets = $extended;
	$last = -1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == -1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index
			&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
		) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	if ( $extract ) {
		$extract .= '<b> ... </b>';
	}

	// highlight each distinct term once
	$processed = [];
	foreach ( $terms as $term ) {
		if ( !isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	return $extract;
}
291
/**
 * Split text into lines and append the non-empty ones to an extracts array.
 *
 * When $this->mCleanWikitext is true the text is passed through
 * removeWiki() before being split on "\n". Each line is trimmed; lines
 * that are empty after trimming are skipped.
 *
 * @param string[] &$extracts Array the lines are appended to, keyed by
 *  sequence number
 * @param int &$count Next sequence number; incremented for every line added
 * @param string $text Text to split
 */
function splitAndAdd( &$extracts, &$count, $text ) {
	$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
	foreach ( $split as $line ) {
		$tt = trim( $line );
		// Compare against '' explicitly: a plain truthiness test would
		// also drop a line whose whole content is "0".
		if ( $tt !== '' ) {
			$extracts[$count++] = $tt;
		}
	}
}
308
/**
 * preg_replace_callback() handler doing manual case folding.
 *
 * A match longer than one byte is turned into a regex character class
 * holding its lower-case and upper-case forms (as produced by
 * $wgContLang); a single-byte match is returned untouched.
 *
 * @param array $matches Match array; only $matches[0] is used
 * @return string
 */
function caseCallback( $matches ) {
	global $wgContLang;
	$char = $matches[0];

	return strlen( $char ) > 1
		? '[' . $wgContLang->lc( $char ) . $wgContLang->uc( $char ) . ']'
		: $char;
}
323
/**
 * Cut the span [$start, $end) out of $text, moving both ends with
 * position() so that words are not chopped.
 *
 * A $start of 0 is used as-is; an $end at or beyond the end of the text
 * is clamped to the text length.
 *
 * @param string $text
 * @param int $start Requested start (byte offset)
 * @param int $end Requested end (byte offset)
 * @param int|null &$posStart If passed as non-null, receives the actual start
 * @param int|null &$posEnd If passed as non-null, receives the actual end
 * @return string The extracted span, or '' when the adjusted end is not
 *  after the adjusted start
 */
function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	$textLen = strlen( $text );

	if ( $start != 0 ) {
		$start = $this->position( $text, $start, 1 );
	}
	$end = ( $end >= $textLen ) ? $textLen : $this->position( $text, $end );

	// only report positions back when the caller supplied a non-null variable
	if ( $posStart !== null ) {
		$posStart = $start;
	}
	if ( $posEnd !== null ) {
		$posEnd = $end;
	}

	return ( $end > $start ) ? substr( $text, $start, $end - $start ) : '';
}
357
/**
 * Find a non-letter near a point (byte index) in the text.
 *
 * Looks for the first space or ASCII punctuation character within 10
 * bytes on either side of $point. If there is none, $point is moved
 * forward past any UTF-8 continuation bytes so that the result never
 * lands inside a multi-byte character.
 *
 * @param string $text
 * @param int $point Byte offset to start from
 * @param int $offset Added to the result when a non-letter was found
 * @return int Byte offset, at most strlen( $text ) on the fallback path
 */
function position( $text, $point, $offset = 0 ) {
	$tolerance = 10;
	$textLen = strlen( $text );
	$s = max( 0, $point - $tolerance );
	$l = min( $textLen, $point + $tolerance ) - $s;
	$m = [];

	// The pattern must contain only ASCII: it is matched in byte mode, so
	// any stray multi-byte character in the class would make individual
	// UTF-8 bytes count as "non-letters".
	if ( preg_match(
		'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
		substr( $text, $s, $l ),
		$m,
		PREG_OFFSET_CAPTURE
	) ) {
		return $m[0][1] + $s + $offset;
	}

	// nothing to inspect at or past the end of the text
	if ( $point >= $textLen ) {
		return $textLen;
	}

	// check if point is on a valid first UTF8 char
	$char = ord( $text[$point] );
	while ( $char >= 0x80 && $char < 0xc0 ) {
		// skip trailing bytes
		$point++;
		if ( $point >= $textLen ) {
			return $textLen;
		}
		$char = ord( $text[$point] );
	}

	return $point;
}
395
/**
 * Search extracts for a pattern and collect a basic snippet per matching line.
 *
 * Lines already present in $out are skipped. For each matching line a
 * window of $contextchars bytes is placed around the match, extracted with
 * extract(), and stored in $out; the actual start offset goes to $offsets.
 * Stops as soon as $linesleft reaches zero.
 *
 * @param string $pattern PCRE pattern
 * @param array $extracts Lines to search, keyed by sequence number
 * @param int &$linesleft Number of snippets still wanted; decremented per hit
 * @param int &$contextchars Window size in bytes (only read here)
 * @param array &$out Snippets found so far, keyed like $extracts
 * @param array &$offsets Start offset of each snippet within its line
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		return; // nothing to do
	}

	foreach ( $extracts as $index => $line ) {
		$found = [];
		// skip lines that are already highlighted or do not match
		if ( array_key_exists( $index, $out )
			|| !preg_match( $pattern, $line, $found, PREG_OFFSET_CAPTURE )
		) {
			continue;
		}

		$matchPos = $found[0][1];
		$matchLen = strlen( $found[0][0] );

		// choose where the context window starts
		if ( $matchPos + $matchLen < $contextchars ) {
			$from = 0;
		} elseif ( $matchLen > $contextchars ) {
			$from = $matchPos;
		} else {
			$from = $matchPos + intval( ( $matchLen - $contextchars ) / 2 );
		}

		// basic snippet from this line; extract() reports the real start back
		$snippetStart = $from;
		$out[$index] = $this->extract( $line, $from, $from + $contextchars, $snippetStart );
		$offsets[$index] = $snippetStart;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
443
/**
 * Basic wikitext removal.
 *
 * Applies, in order: template and plain-link unwrapping, piped-link
 * replacement through linkReplace(), removal of tag-like markup and
 * quote-based bold/italic markup, and finally HTML escaping.
 *
 * @param string $text
 * @return string
 */
function removeWiki( $text ) {
	// {{name}} is dropped, {{name|rest}} keeps "rest", [[target]] keeps "target"
	$text = preg_replace(
		[
			"/\\{\\{([^|]+?)\\}\\}/",
			"/\\{\\{([^|]+\\|)(.*?)\\}\\}/",
			"/\\[\\[([^|]+?)\\]\\]/",
		],
		[ "", "\\2", "\\1" ],
		$text
	);

	// [[target|caption]] links are decided by linkReplace()
	$text = preg_replace_callback(
		"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
		[ $this, 'linkReplace' ],
		$text
	);

	// strip tags and quote markup; patterns are applied one after another
	$text = preg_replace(
		[
			"/<\/?[^>]+>/",
			"/'''''/",
			"/('''|<\/?[iIuUbB]>)/",
			"/''/",
		],
		"",
		$text
	);

	// Note, the previous /<\/?[^>]+>/ is insufficient
	// for XSS safety as the HTML tag can span multiple
	// search results (T144845).
	return Sanitizer::escapeHtmlAllowEntities( $text );
}
470
/**
 * preg_replace_callback() handler for [[target|caption]] links.
 *
 * The link is kept whole when the text before the first colon of the
 * target resolves (via $wgContLang->getNsIndex()) to NS_FILE or
 * NS_CATEGORY; in every other case only the caption is returned.
 *
 * @param array $matches [0] whole link, [1] "target|", [2] caption
 * @return string
 */
function linkReplace( $matches ) {
	$colonPos = strpos( $matches[1], ':' );

	if ( $colonPos !== false ) {
		global $wgContLang;
		$nsIndex = $wgContLang->getNsIndex( substr( $matches[1], 0, $colonPos ) );
		if ( $nsIndex !== false && ( $nsIndex == NS_FILE || $nsIndex == NS_CATEGORY ) ) {
			return $matches[0]; // return the whole thing
		}
	}

	return $matches[2]; // replace with caption
}
492
/**
 * Simple and fast line-based snippet extraction.
 *
 * Walks the text line by line; for every line matching one of the terms
 * it keeps the match plus up to $contextchars of context on each side
 * (trimmed with $wgContLang->truncate()), HTML-escapes the result and
 * wraps the terms in <span class='searchmatch'>. Stops after
 * $contextlines matching lines.
 *
 * @param string $text
 * @param array $terms Regex fragments; joined with '|' and interpolated
 *  into the patterns as-is
 * @param int $contextlines Maximum number of lines to return
 * @param int $contextchars Context size on each side of the match
 * @return string Matching lines, each terminated by "\n"
 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
	global $wgContLang;

	$lines = explode( "\n", $text );

	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	$pat1 = "/(.*)($terms)(.{0,$max})/i";

	$extract = "";
	foreach ( $lines as $line ) {
		if ( $contextlines == 0 ) {
			break;
		}
		$m = [];
		if ( !preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		--$contextlines;
		// truncate function changes ... to relevant i18n message.
		$pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );

		if ( count( $m ) < 3 ) {
			$post = '';
		} else {
			$post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
		}

		$found = $m[2];

		// escape first, then add the highlight markup
		$line = htmlspecialchars( $pre . $found . $post );
		$pat2 = '/(' . $terms . ")/i";
		$line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

		$extract .= "{$line}\n";
	}

	return $extract;
}
547
/**
 * Return the first few lines of the text, without any highlighting.
 *
 * @param string $text
 * @param int $contextlines Maximum number of lines to return
 * @param int $contextchars Average number of bytes per line; the result is
 *  limited to $contextlines * $contextchars bytes before escaping
 * @return string HTML-escaped text with newlines replaced by <br>
 */
public function highlightNone( $text, $contextlines, $contextchars ) {
	$match = [];
	$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
	$text = str_replace( "\n\n", "\n", $text ); // remove empty lines
	preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

	// Trim and limit to max number of bytes. mb_strcut() keeps the byte
	// limit but never cuts inside a multi-byte character; a plain substr()
	// could leave a broken UTF-8 sequence for htmlspecialchars() to choke on.
	$text = htmlspecialchars(
		mb_strcut( trim( $match[0] ), 0, $contextlines * $contextchars, 'UTF-8' )
	);
	return str_replace( "\n", '<br>', $text );
}
566}
$wgSearchHighlightBoundaries
Regexp to match word boundaries, defaults for non-CJK languages should be empty for CJK since the wor...
$line
Definition cdb.php:58
Highlight bits of wikitext.
process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets)
Search extracts for a pattern, and return snippets.
removeWiki( $text)
Basic wikitext removal.
highlightSimple( $text, $terms, $contextlines, $contextchars)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
highlightNone( $text, $contextlines, $contextchars)
Returns the first few lines of the text.
__construct( $cleanupWikitext=true)
splitAndAdd(&$extracts, &$count, $text)
Split text into lines and add it to extracts array.
caseCallback( $matches)
Do manual case conversion for non-ascii chars.
highlightText( $text, $terms, $contextlines, $contextchars)
Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
position( $text, $point, $offset=0)
Find a nonletter near a point (index) in the text.
linkReplace( $matches)
callback to replace [[target|caption]] kind of links, if the target is category or image,...
extract( $text, $start, $end, &$posStart=null, &$posEnd=null)
Extract part of the text from start to end, but by not chopping up words.
this class mediates it Skin Encapsulates a look and feel for the wiki All of the functions that render HTML and make choices about how to render it are here and are called from various other places when and is meant to be subclassed with other skins that may override some of its functions The User object contains a reference to a and so rather than having a global skin object we just rely on the global User and get the skin with $wgUser and also has some character encoding functions and other locale stuff The current user interface language is instantiated as and the local content language as $wgContLang
Definition design.txt:57
external whereas SearchGetNearMatch runs after $term
Definition hooks.txt:2814
return true to allow those checks to and false if checking is done remove or add to the links of a group of changes in EnhancedChangesList Hook subscribers can return false to omit this line from recentchanges use this to change the tables headers change it to an object instance and return false override the list derivative used the name of the old file when set the default code will be skipped $pre
Definition hooks.txt:1575
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that probably a stub it is not rendered in wiki pages or galleries in category pages allow injecting custom HTML after the section Any uses of the hook need to handle escaping see BaseTemplate::getToolbox and BaseTemplate::makeListItem for details on the format of individual items inside of this array or by returning and letting standard HTTP rendering take place modifiable or by returning false and taking over the output $out
Definition hooks.txt:862
const NS_FILE
Definition Defines.php:71
const NS_CATEGORY
Definition Defines.php:79
$last
$lines
Definition router.php:61