MediaWiki REL1_40
SearchHighlighter.php
Go to the documentation of this file.
1<?php
26
// Default value of the $contextlines argument of highlightText(),
// highlightSimple() and highlightNone().
36 public const DEFAULT_CONTEXT_LINES = 2;
// Default value of the $contextchars argument of highlightText(),
// highlightSimple() and highlightNone().
37 public const DEFAULT_CONTEXT_CHARS = 75;
38
// When true, splitAndAdd() passes each text fragment through removeWiki()
// before storing it; set from the constructor argument.
39 protected $mCleanWikitext = true;
40
/**
 * Create a highlighter.
 *
 * @param bool $cleanupWikitext Stored in $mCleanWikitext. When true,
 *  splitAndAdd() runs every fragment through removeWiki() before it is
 *  used for snippets; when false the fragments are used as given.
 */
public function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}
50
/**
 * Build a snippet of $text with the search terms wrapped in
 * <span class='searchmatch'> elements.
 *
 * The text is first split into plain-text lines and "other" lines
 * (templates, file links, tables and, if the Cite extension is loaded,
 * references). Lines are then selected in this order: the beginning of
 * the text if it contains every term, otherwise lines matching the whole
 * phrase, then lines matching any single term. Selected lines are padded
 * towards a constant size, joined with '<b> ... </b>' separators and
 * finally highlighted.
 *
 * @param string $text Wikitext to extract the snippet from
 * @param string[] $terms Search terms. They are concatenated into PCRE
 *  patterns without quoting, so they must already be valid regex fragments.
 * @param int $contextlines Maximum number of lines to pick matches from
 * @param int $contextchars Target amount of context per line; scaled
 *  internally from characters to bytes (see the FIXME below)
 * @return string Snippet containing HTML markup, or '' for empty $text.
 *  The text itself is only HTML-escaped when wikitext cleanup is enabled
 *  (escaping happens in removeWiki()).
 */
public function highlightText(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$searchHighlightBoundaries = MediaWikiServices::getInstance()
		->getMainConfig()->get( MainConfigNames::SearchHighlightBoundaries );

	if ( $text == '' ) {
		return '';
	}

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = [
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

	// @todo FIXME: This should prolly be a hook or something
	// instead of hardcoding the name of the Cite extension
	if ( \ExtensionRegistry::getInstance()->isLoaded( 'Cite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = []; // text extracts
	$otherExt = []; // other extracts
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				// key 0 is the whole match; an offset of -1 marks a group that did not participate
				if ( $key > 0 && $val[1] != -1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, -1 );
						if (
							MediaWikiServices::getInstance()->getContentLanguage()->
							getNsIndex( $ns ) !== NS_FILE
						) {
							// not a file link: leave $epat empty so the remainder is handled as text
							break;
						}

					}
					$epat = $endPatterns[$key];
					// everything before the construct is plain text
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level -= 1;
						}
					} else {
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( !$found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}
	'@phan-var string[] $textExt';

	$all = $textExt + $otherExt; // these have disjunct key sets

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback(
				'/./us',
				[ $this, 'caseCallback' ],
				$terms[$index]
			);
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "{$searchHighlightBoundaries}+", $terms );
	// @todo FIXME: A hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	// An empty term list gives an empty $anyterm; fall back to a scale of 1
	// instead of dividing by zero.
	$anytermChars = mb_strlen( $anyterm );
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|{$searchHighlightBoundaries})";
	$patPost = "({$searchHighlightBoundaries}|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	$left = $contextlines;

	$snippets = [];
	$offsets = [];

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		// skip lines starting with ';' or ':'
		if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if ( $firstText ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( !$snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = [];
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = [ $first => $snippets[$first] ];
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
		// $targetchars is set when $snippets contains anything
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract(
					$all[$index],
					$offsets[$index],
					// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
					// $targetchars is set when $snippets contains anything
					$offsets[$index] + $targetchars,
					$offsets[$index]
				);
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
			// $targetchars is set when $snippets contains anything
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
				// $targetchars is set when $snippets contains anything
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	// $snippets = array_map( 'htmlspecialchars', $extended );
	$snippets = $extended;
	$last = -1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == -1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index
			&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
		) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	if ( $extract ) {
		$extract .= '<b> ... </b>';
	}

	// highlight each distinct term once
	$processed = [];
	foreach ( $terms as $term ) {
		if ( !isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	return $extract;
}
319
/**
 * Split $text into lines and append every non-empty, trimmed line to
 * $extracts, keyed by the running sequence number $count.
 *
 * When $mCleanWikitext is true the text is passed through removeWiki()
 * before it is split.
 *
 * @param string[] &$extracts Lines collected so far; new lines are added in place
 * @param int &$count Next key to use; incremented once per stored line
 * @param string $text Text to split on "\n"
 */
private function splitAndAdd( &$extracts, &$count, $text ) {
	$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
	foreach ( $split as $line ) {
		$tt = trim( $line );
		// Compare against '' explicitly: a plain truthiness test would also
		// discard a line that consists solely of "0".
		if ( $tt !== '' ) {
			$extracts[$count++] = $tt;
		}
	}
}
336
/**
 * preg_replace_callback() handler that makes a single character match
 * case-insensitively.
 *
 * A match longer than one byte is replaced by a character class holding
 * its lower-case and upper-case forms as produced by the content
 * language; a one-byte match is returned unchanged.
 *
 * @param string[] $matches Match array; only element 0 is used
 * @return string
 */
private function caseCallback( $matches ) {
	$char = $matches[0];
	if ( strlen( $char ) <= 1 ) {
		return $char;
	}

	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	$lower = $contLang->lc( $char );
	$upper = $contLang->uc( $char );

	return '[' . $lower . $upper . ']';
}
352
/**
 * Return the part of $text between two byte offsets after both offsets
 * have been adjusted by position().
 *
 * A $start of 0 and an $end at or beyond the end of the text are used
 * as they are (the latter clamped to the text length); anything else is
 * passed through position().
 *
 * @param string $text
 * @param int $start Requested start offset
 * @param int $end Requested end offset
 * @param int|null &$posStart Receives the adjusted start, but only when
 *  the caller passed a non-null variable
 * @param int|null &$posEnd Receives the adjusted end, but only when
 *  the caller passed a non-null variable
 * @return string The extracted text, or '' if the adjusted range is empty
 */
private function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	$textLen = strlen( $text );

	if ( $start != 0 ) {
		$start = $this->position( $text, $start, 1 );
	}
	$end = $end >= $textLen ? $textLen : $this->position( $text, $end );

	if ( $posStart !== null ) {
		$posStart = $start;
	}
	if ( $posEnd !== null ) {
		$posEnd = $end;
	}

	return $end > $start ? substr( $text, $start, $end - $start ) : '';
}
386
/**
 * Find a good place to cut $text near the byte offset $point.
 *
 * A window of 10 bytes on either side of $point is searched for the
 * first space or ASCII punctuation character; if one is found its
 * offset (plus $offset) is returned. Otherwise $point is advanced past
 * any UTF-8 continuation bytes so that the result does not fall inside
 * a multibyte character.
 *
 * @param string $text
 * @param int $point Preferred byte offset
 * @param int $offset Added to the result when a boundary character was
 *  found (callers pass 1 to land just after the boundary)
 * @return int Byte offset into $text, at most strlen( $text ) in the
 *  fallback branch
 */
private function position( $text, $point, $offset = 0 ) {
	$tolerance = 10;
	$s = max( 0, $point - $tolerance );
	$l = min( strlen( $text ), $point + $tolerance ) - $s;
	$m = [];

	// The character class must contain ASCII characters only: the pattern
	// has no "u" modifier, so any stray multibyte character in it would be
	// matched byte by byte and split unrelated UTF-8 sequences in the text.
	if ( preg_match(
		'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
		substr( $text, $s, $l ),
		$m,
		PREG_OFFSET_CAPTURE
	) ) {
		return $m[0][1] + $s + $offset;
	}

	$textLen = strlen( $text );
	if ( $point >= $textLen ) {
		// nothing to inspect at or beyond the end of the text
		return $textLen;
	}

	// check if point is on a valid first UTF8 char
	$char = ord( $text[$point] );
	while ( $char >= 0x80 && $char < 0xc0 ) {
		// skip trailing bytes
		$point++;
		if ( $point >= $textLen ) {
			return $textLen;
		}
		$char = ord( $text[$point] );
	}

	return $point;
}
424
/**
 * Search the given lines for $pattern and store a basic snippet for each
 * line that matches, until the line budget is used up.
 *
 * Lines that already have an entry in $out are skipped. For each new
 * match a window of $contextchars bytes is chosen around the match and
 * cut out with extract().
 *
 * @param string $pattern PCRE pattern to look for
 * @param string[] $extracts Candidate lines, keyed by sequence number
 * @param int &$linesleft Remaining number of lines that may be added;
 *  decremented for every snippet stored
 * @param int &$contextchars Size of the snippet window (not modified here)
 * @param string[] &$out Snippets found so far, keyed like $extracts
 * @param int[] &$offsets Start offset of each snippet within its line
 */
private function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		// budget already exhausted
		return;
	}

	foreach ( $extracts as $index => $line ) {
		// a snippet for this line exists already
		if ( array_key_exists( $index, $out ) ) {
			continue;
		}

		$hit = [];
		if ( !preg_match( $pattern, $line, $hit, PREG_OFFSET_CAPTURE ) ) {
			continue;
		}

		$matchStart = $hit[0][1];
		$matchLen = strlen( $hit[0][0] );

		if ( $matchStart + $matchLen < $contextchars ) {
			// match ends inside the first window: start at the line start
			$begin = 0;
		} elseif ( $matchLen > $contextchars ) {
			// match is larger than the window: start at the match
			$begin = $matchStart;
		} else {
			// centre the window on the match
			$begin = $matchStart + intval( ( $matchLen - $contextchars ) / 2 );
		}

		// extract() reports the adjusted start through $posBegin
		$posBegin = $begin;
		$out[$index] = $this->extract( $line, $begin, $begin + $contextchars, $posBegin );
		$offsets[$index] = $posBegin;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
471
/**
 * Strip basic wikitext markup from $text and HTML-escape the result.
 *
 * In order: templates without parameters are removed, templates with
 * parameters are reduced to what follows the first pipe, unpiped links
 * are reduced to their target, piped links are handled by linkReplace(),
 * anything that looks like an HTML tag is removed, and bold/italic quote
 * markup is removed. The result is escaped with
 * Sanitizer::escapeHtmlAllowEntities().
 *
 * @param string $text
 * @return string
 */
private function removeWiki( $text ) {
	// applied one after another, in this order
	$beforeLinks = [
		"/\\{\\{([^|]+?)\\}\\}/" => "",
		"/\\{\\{([^|]+\\|)(.*?)\\}\\}/" => "\\2",
		"/\\[\\[([^|]+?)\\]\\]/" => "\\1",
	];
	foreach ( $beforeLinks as $pattern => $replacement ) {
		$text = preg_replace( $pattern, $replacement, $text );
	}

	$text = preg_replace_callback(
		"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
		[ $this, 'linkReplace' ],
		$text
	);

	// all of these are simply deleted, again in this order
	$afterLinks = [
		"/<\/?[^>]+>/",
		"/'''''/",
		"/('''|<\/?[iIuUbB]>)/",
		"/''/",
	];
	foreach ( $afterLinks as $pattern ) {
		$text = preg_replace( $pattern, "", $text );
	}

	// Note, the previous /<\/?[^>]+>/ is insufficient
	// for XSS safety as the HTML tag can span multiple
	// search results (T144845).
	return Sanitizer::escapeHtmlAllowEntities( $text );
}
497
/**
 * preg_replace_callback() handler for piped links, used by removeWiki().
 *
 * Links whose prefix resolves to the file or category namespace are kept
 * verbatim; every other link is replaced by its caption.
 *
 * @param string[] $matches 0: whole link, 1: target including the
 *  trailing pipe, 2: caption
 * @return string
 */
private function linkReplace( $matches ) {
	$colon = strpos( $matches[1], ':' );
	if ( $colon !== false ) {
		$ns = substr( $matches[1], 0, $colon );
		$index = MediaWikiServices::getInstance()->getContentLanguage()->getNsIndex( $ns );
		// strict comparison: a failed lookup (false) never equals a namespace constant
		if ( $index === NS_FILE || $index === NS_CATEGORY ) {
			return $matches[0]; // return the whole thing
		}
	}

	return $matches[2]; // replace with caption
}
518
/**
 * Simple line-based snippet extraction.
 *
 * Each line of $text is tested against the terms; for up to
 * $contextlines matching lines, the text before and after the last
 * match on the line is truncated to $contextchars by the content
 * language and every term occurrence is wrapped in
 * <span class='searchmatch'>.
 *
 * @param string $text
 * @param string[] $terms Search terms. They are concatenated into PCRE
 *  patterns without quoting, so they must already be valid regex fragments.
 * @param int $contextlines Maximum number of lines to return
 * @param int $contextchars Amount of context kept on each side of the match
 * @return string HTML; one "\n"-terminated line per matching input line
 */
public function highlightSimple(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$lines = explode( "\n", $text );

	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	$pat1 = "/(.*)($terms)(.{0,$max})/i";
	$pat2 = '/(' . $terms . ")/i";

	$extract = "";
	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	foreach ( $lines as $line ) {
		if ( $contextlines == 0 ) {
			break;
		}
		$m = [];
		if ( !preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		--$contextlines;
		// truncate function changes ... to relevant i18n message.
		$pre = $contLang->truncateForVisual( $m[1], -$contextchars, '...', false );
		$post = isset( $m[3] )
			? $contLang->truncateForVisual( $m[3], $contextchars, '...', false )
			: '';

		$snippet = $pre . $m[2] . $post;

		// Locate the terms in the raw text and escape every fragment on its
		// own. Matching against already-escaped text would also hit the
		// insides of entities such as &amp; or &quot; and break the markup.
		$hits = [];
		preg_match_all( $pat2, $snippet, $hits, PREG_OFFSET_CAPTURE );
		$html = '';
		$pos = 0;
		foreach ( $hits[0] ?? [] as [ $hit, $at ] ) {
			if ( $hit === '' ) {
				// an empty match has nothing to highlight
				continue;
			}
			$html .= htmlspecialchars( substr( $snippet, $pos, $at - $pos ) )
				. "<span class='searchmatch'>" . htmlspecialchars( $hit ) . '</span>';
			$pos = $at + strlen( $hit );
		}
		$html .= htmlspecialchars( substr( $snippet, $pos ) );

		$extract .= "{$html}\n";
	}

	return $extract;
}
574
/**
 * Return the first few lines of $text, HTML-escaped, without any
 * highlighting.
 *
 * Leading whitespace and blank lines are removed, the first
 * $contextlines lines are taken, and the result is limited to
 * $contextlines * $contextchars bytes.
 *
 * @param string $text
 * @param int $contextlines Maximum number of lines to return
 * @param int $contextchars Per-line size used to compute the overall limit
 * @return string HTML with lines separated by '<br>'
 */
public function highlightNone(
	$text,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$match = [];
	$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
	// remove empty lines; collapsing whole runs also covers three or more
	// consecutive newlines, which a plain "\n\n" replacement leaves behind
	$text = preg_replace( '/\n{2,}/', "\n", $text );
	preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

	// Trim and limit to max number of bytes. mb_strcut() never cuts inside a
	// multibyte character, so htmlspecialchars() always gets valid UTF-8
	// (given valid input) instead of a truncated sequence.
	$text = htmlspecialchars(
		mb_strcut( trim( $match[0] ?? '' ), 0, $contextlines * $contextchars, 'UTF-8' )
	);
	return str_replace( "\n", '<br>', $text );
}
597}
const NS_FILE
Definition Defines.php:70
const NS_CATEGORY
Definition Defines.php:78
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Highlight bits of wikitext.
highlightText( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
__construct( $cleanupWikitext=true)
highlightSimple( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
highlightNone( $text, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Returns the first few lines of the text.
if(!file_exists( $CREDITS)) $lines