MediaWiki REL1_37
SearchHighlighter.php
Go to the documentation of this file.
1<?php
25
/** Default number of context lines used by the highlight* methods. */
public const DEFAULT_CONTEXT_LINES = 2;

/** Default per-line context size used by the highlight* methods. */
public const DEFAULT_CONTEXT_CHARS = 75;

/**
 * Whether splitAndAdd() passes text through removeWiki() before storing it.
 * Set from the constructor argument.
 */
protected $mCleanWikitext = true;
39
/**
 * Store the wikitext clean-up preference for later use by splitAndAdd().
 *
 * @param bool $cleanupWikitext When true (the default), extracted text is
 *  run through removeWiki() before being used for snippets.
 */
public function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}
49
/**
 * Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
 *
 * The text is split into plain-text extracts and "other" extracts
 * (templates, file links, tables and, when the Cite extension is loaded,
 * references). Snippets are then chosen around matches of the terms,
 * padded towards a constant size, joined, and every term occurrence is
 * wrapped in <span class='searchmatch'>.
 *
 * @param string $text Wikitext to build the snippet from
 * @param string[] $terms Regular-expression fragments to highlight; they are
 *  inserted into the patterns as-is (no quoting is applied here)
 * @param int $contextlines Maximum number of matching lines to use
 * @param int $contextchars Context size per line; scaled by the
 *  bytes-per-character ratio of the terms because offsets are byte based
 * @return string Snippet with highlighted terms, or '' when $text is empty
 */
public function highlightText(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	// The boundary regexp is interpolated into several patterns below, so it
	// has to be imported into the local scope first.
	global $wgSearchHighlightBoundaries;

	if ( $text == '' ) {
		return '';
	}

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = [
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

	// @todo FIXME: This should prolly be a hook or something
	// instead of hardcoding the name of the Cite extension
	if ( \ExtensionRegistry::getInstance()->isLoaded( 'Cite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = []; // text extracts
	$otherExt = []; // other extracts
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				if ( $key > 0 && $val[1] != -1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, -1 );
						if (
							MediaWikiServices::getInstance()->getContentLanguage()->
							getNsIndex( $ns ) !== NS_FILE
						) {
							break;
						}

					}
					$epat = $endPatterns[$key];
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level -= 1;
						}
					} else {
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( !$found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}
	'@phan-var string[] $textExt';

	$all = $textExt + $otherExt; // these have disjunct key sets

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback(
				'/./us',
				[ $this, 'caseCallback' ],
				$terms[$index]
			);
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
	// @todo FIXME: A hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	// An empty term list gives an empty $anyterm; fall back to a neutral
	// scale instead of dividing by zero.
	$anytermChars = mb_strlen( $anyterm );
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|$wgSearchHighlightBoundaries)";
	$patPost = "($wgSearchHighlightBoundaries|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	$left = $contextlines;

	$snippets = [];
	$offsets = [];

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	// Compare against '' explicitly: the string "0" is valid text but falsy.
	if ( $firstText !== '' ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( !$snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = [];
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = [ $first => $snippets[$first] ];
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract(
					$all[$index],
					$offsets[$index],
					$offsets[$index] + $targetchars,
					$offsets[$index]
				);
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	// $snippets = array_map( 'htmlspecialchars', $extended );
	$snippets = $extended;
	$last = -1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == -1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index
			&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
		) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	if ( $extract !== '' ) {
		$extract .= '<b> ... </b>';
	}

	$processed = [];
	foreach ( $terms as $term ) {
		if ( !isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	return $extract;
}
309
/**
 * Split text into lines and add it to extracts array.
 *
 * When wikitext clean-up is enabled the text is passed through
 * removeWiki() first. Each non-empty trimmed line is stored under the
 * next sequence number.
 *
 * @param string[] &$extracts Extracts array the lines are appended to
 * @param int &$count Running sequence number; incremented per stored line
 * @param string $text Text to split on newlines
 */
private function splitAndAdd( &$extracts, &$count, $text ) {
	$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
	foreach ( $split as $line ) {
		$tt = trim( $line );
		// Compare against '' explicitly: a line consisting of "0" is falsy
		// in PHP but is real content that must be kept.
		if ( $tt !== '' ) {
			$extracts[$count++] = $tt;
		}
	}
}
326
/**
 * Do manual case conversion for non-ascii chars.
 *
 * Used as a preg_replace_callback() callback on single characters.
 *
 * @param array $matches Match array; element 0 is the matched character
 * @return string The character unchanged when it is a single byte, otherwise
 *  a character class holding its lower- and upper-case forms
 */
private function caseCallback( $matches ) {
	$char = $matches[0];

	// Single-byte characters are left alone.
	if ( strlen( $char ) <= 1 ) {
		return $char;
	}

	$contentLanguage = MediaWikiServices::getInstance()->getContentLanguage();
	$lower = $contentLanguage->lc( $char );
	$upper = $contentLanguage->uc( $char );

	return '[' . $lower . $upper . ']';
}
342
/**
 * Extract part of the text from start to end, but by not chopping up words.
 *
 * Both ends are adjusted with position(); a start of 0 and an end at or past
 * the end of the text are used as they are.
 *
 * @param string $text
 * @param int $start Requested start offset
 * @param int $end Requested end offset
 * @param int|null &$posStart Receives the adjusted start, but only when the
 *  caller passes a non-null value
 * @param int|null &$posEnd Receives the adjusted end, but only when the
 *  caller passes a non-null value
 * @return string The extracted text, or '' when the adjusted range is empty
 */
private function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	$from = $start != 0
		? $this->position( $text, $start, 1 )
		: $start;

	$to = $end >= strlen( $text )
		? strlen( $text )
		: $this->position( $text, $end );

	if ( $posStart !== null ) {
		$posStart = $from;
	}
	if ( $posEnd !== null ) {
		$posEnd = $to;
	}

	return $to > $from ? substr( $text, $from, $to - $from ) : '';
}
376
/**
 * Find a nonletter near a point (index) in the text.
 *
 * Looks for the first break character within 10 bytes either side of
 * $point. If none is found, $point is moved forward past any UTF-8
 * continuation bytes so it does not land inside a multibyte character.
 *
 * @param string $text
 * @param int $point Byte offset to search around
 * @param int $offset Added to the result when a break character is found
 * @return int Byte offset; never inside a multibyte character when no break
 *  character is found, and at most strlen( $text ) in that case
 */
private function position( $text, $point, $offset = 0 ) {
	$tolerance = 10;
	$textLen = strlen( $text );
	$s = max( 0, $point - $tolerance );
	$l = min( $textLen, $point + $tolerance ) - $s;
	$m = [];

	// The class must contain only ASCII break characters: the pattern runs
	// without /u, so any stray non-ASCII byte in it would match in the
	// middle of multibyte characters.
	if ( $l > 0 && preg_match(
		'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
		substr( $text, $s, $l ),
		$m,
		PREG_OFFSET_CAPTURE
	) ) {
		return $m[0][1] + $s + $offset;
	}

	// Nothing to inspect at or beyond the end of the text.
	if ( $point >= $textLen ) {
		return $textLen;
	}

	// check if point is on a valid first UTF8 char
	$char = ord( $text[$point] );
	while ( $char >= 0x80 && $char < 0xc0 ) {
		// skip trailing bytes
		$point++;
		if ( $point >= $textLen ) {
			return $textLen;
		}
		$char = ord( $text[$point] );
	}

	return $point;
}
414
/**
 * Search extracts for a pattern, and return snippets.
 *
 * Each extract that matches and has no snippet yet gets a basic snippet of
 * about $contextchars bytes around the match. Stops as soon as the line
 * budget is used up.
 *
 * @param string $pattern Regexp for matching lines
 * @param string[] $extracts Extracts to search, keyed by sequence number
 * @param int &$linesleft Number of extracts still to make; decremented per
 *  snippet produced
 * @param int &$contextchars Length of snippet
 * @param string[] &$out Map for highlighted snippets
 * @param int[] &$offsets Map of starting points of snippets
 */
private function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		// budget already exhausted
		return;
	}

	foreach ( $extracts as $index => $line ) {
		// skip lines that already have a snippet
		if ( array_key_exists( $index, $out ) ) {
			continue;
		}

		$hit = [];
		if ( !preg_match( $pattern, $line, $hit, PREG_OFFSET_CAPTURE ) ) {
			continue;
		}

		[ $matched, $offset ] = $hit[0];
		$len = strlen( $matched );

		if ( $offset + $len < $contextchars ) {
			// the match ends inside the first window: start at the beginning
			$begin = 0;
		} elseif ( $len > $contextchars ) {
			// the match is longer than the window: start at the match
			$begin = $offset;
		} else {
			// otherwise place the window around the match
			$begin = $offset + intval( ( $len - $contextchars ) / 2 );
		}

		// extract() reports the adjusted start through this variable
		$posBegin = $begin;
		$out[$index] = $this->extract( $line, $begin, $begin + $contextchars, $posBegin );
		$offsets[$index] = $posBegin;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
461
/**
 * Basic wikitext removal.
 *
 * Applies a fixed sequence of regexp replacements and then escapes the
 * result with Sanitizer::escapeHtmlAllowEntities().
 *
 * @param string $text
 * @return string
 */
private function removeWiki( $text ) {
	// Replacements applied before piped links are handled, in this order.
	$beforeLinks = [
		[ "/\\{\\{([^|]+?)\\}\\}/", "" ],
		[ "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2" ],
		[ "/\\[\\[([^|]+?)\\]\\]/", "\\1" ],
	];
	foreach ( $beforeLinks as [ $pattern, $replacement ] ) {
		$text = preg_replace( $pattern, $replacement, $text );
	}

	// [[target|caption]] links are resolved by linkReplace().
	$text = preg_replace_callback(
		"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
		[ $this, 'linkReplace' ],
		$text
	);

	// Tags and quote-based markup are removed, in this order.
	$afterLinks = [
		"/<\/?[^>]+>/",
		"/'''''/",
		"/('''|<\/?[iIuUbB]>)/",
		"/''/",
	];
	foreach ( $afterLinks as $pattern ) {
		$text = preg_replace( $pattern, "", $text );
	}

	// Note, the previous /<\/?[^>]+>/ is insufficient
	// for XSS safety as the HTML tag can span multiple
	// search results (T144845).
	return Sanitizer::escapeHtmlAllowEntities( $text );
}
487
/**
 * Callback to replace [[target|caption]] kind of links.
 *
 * Links whose namespace prefix resolves to NS_FILE or NS_CATEGORY are kept
 * whole; every other link is replaced by its caption.
 *
 * @param array $matches Match array: 0 is the whole link, 1 the target
 *  part up to and including the pipe, 2 the caption
 * @return string
 */
private function linkReplace( $matches ) {
	[ $whole, $target, $caption ] = $matches;

	$colon = strpos( $target, ':' );
	if ( $colon === false ) {
		// no namespace prefix: replace with caption
		return $caption;
	}

	$prefix = substr( $target, 0, $colon );
	$nsIndex = MediaWikiServices::getInstance()->getContentLanguage()->getNsIndex( $prefix );

	// keep file and category links whole
	if ( $nsIndex === NS_FILE || $nsIndex === NS_CATEGORY ) {
		return $whole;
	}

	return $caption;
}
508
/**
 * Simple & fast snippet extraction, but gives completely irrelevant snippets.
 *
 * Walks the text line by line and, for up to $contextlines matching lines,
 * emits the match with truncated context on both sides, HTML-escaped and
 * with every term occurrence wrapped in <span class='searchmatch'>.
 *
 * @param string $text
 * @param string[] $terms Regular-expression fragments, joined with '|'
 * @param int $contextlines Maximum number of matching lines to emit
 * @param int $contextchars Context size on each side of the match
 * @return string One line per emitted match, each terminated by "\n"
 */
public function highlightSimple(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$alternation = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;

	// Both patterns depend only on the terms, so build them once.
	$linePattern = "/(.*)($alternation)(.{0,$max})/i";
	$highlightPattern = '/(' . $alternation . ")/i";

	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	$extract = "";

	foreach ( explode( "\n", $text ) as $line ) {
		if ( $contextlines == 0 ) {
			break;
		}

		$m = [];
		if ( !preg_match( $linePattern, $line, $m ) ) {
			continue;
		}
		--$contextlines;

		// truncate function changes ... to relevant i18n message.
		$pre = $contLang->truncateForVisual( $m[1], -$contextchars, '...', false );
		$post = count( $m ) < 3
			? ''
			: $contLang->truncateForVisual( $m[3], $contextchars, '...', false );

		// @phan-suppress-next-line SecurityCheck-DoubleEscaped Triggered by Language::truncateForVisual
		$escaped = htmlspecialchars( $pre . $m[2] . $post );
		$highlighted = preg_replace( $highlightPattern, "<span class='searchmatch'>\\1</span>", $escaped );

		$extract .= $highlighted . "\n";
	}

	return $extract;
}
568
/**
 * Returns the first few lines of the text.
 *
 * @param string $text
 * @param int $contextlines Max number of lines to return
 * @param int $contextchars Average number of bytes per line; the result is
 *  limited to $contextlines * $contextchars bytes before escaping
 * @return string HTML-escaped text with newlines replaced by <br>
 */
public function highlightNone(
	$text,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$match = [];
	$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
	// Collapse every run of newlines. A single str_replace() pass leaves
	// empty lines behind for runs of three or more, and those would then
	// use up the line budget.
	$text = preg_replace( "/\n{2,}/", "\n", $text );
	preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

	// Trim and limit to max number of bytes. mb_strcut() never splits a
	// multibyte character; a split character would make htmlspecialchars()
	// return an empty string for the whole snippet.
	$text = htmlspecialchars(
		mb_strcut( trim( $match[0] ?? '' ), 0, $contextlines * $contextchars, 'UTF-8' )
	);
	return str_replace( "\n", '<br>', $text );
}
591}
$wgSearchHighlightBoundaries
Regexp to match word boundaries. The default suits non-CJK languages; it should be empty for CJK, since the words are not separated.
const NS_FILE
Definition Defines.php:70
const NS_CATEGORY
Definition Defines.php:78
MediaWikiServices is the service locator for the application scope of MediaWiki.
Highlight bits of wikitext.
process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets)
Search extracts for a pattern, and return snippets.
highlightText( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
removeWiki( $text)
Basic wikitext removal.
__construct( $cleanupWikitext=true)
highlightSimple( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
splitAndAdd(&$extracts, &$count, $text)
Split text into lines and add it to extracts array.
caseCallback( $matches)
Do manual case conversion for non-ascii chars.
position( $text, $point, $offset=0)
Find a nonletter near a point (index) in the text.
linkReplace( $matches)
callback to replace [[target|caption]] kind of links, if the target is category or image,...
extract( $text, $start, $end, &$posStart=null, &$posEnd=null)
Extract part of the text from start to end, but by not chopping up words.
highlightNone( $text, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Returns the first few lines of the text.
$line
Definition mcc.php:119
foreach( $mmfl['setupFiles'] as $fileName) if($queue) if(empty( $mmfl['quiet'])) $s
if(!file_exists( $CREDITS)) $lines