MediaWiki master
SearchHighlighter.php
Go to the documentation of this file.
1<?php
14
/** Default value of the $contextlines parameter of the highlight*() methods. */
public const DEFAULT_CONTEXT_LINES = 2;
/** Default value of the $contextchars parameter of the highlight*() methods. */
public const DEFAULT_CONTEXT_CHARS = 75;

/**
 * Flag read by splitAndAdd(): when truthy, text is passed through
 * removeWiki() before being split into lines. Defaults to true.
 */
protected $mCleanWikitext = true;

/**
 * Store the wikitext clean-up flag.
 *
 * @param bool $cleanupWikitext Value assigned to $this->mCleanWikitext
 *  (presumably a boolean, judging by the default; the code does not enforce it)
 */
public function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}
39
/**
 * Build a snippet from wikitext with the search terms wrapped in
 * <span class='searchmatch'> elements.
 *
 * The text is first split into plain-text lines ($textExt) and lines that
 * come from templates, file links, tables and (when the Cite extension is
 * loaded) <ref> blocks ($otherExt). Snippets are then picked, in order of
 * preference: the beginning of the text if it contains every term, lines
 * matching the whole phrase, lines matching any single term. Picked
 * snippets are padded towards a common size, joined with '<b> ... </b>'
 * separators, and finally the terms are highlighted.
 *
 * @param string $text Wikitext to extract a snippet from
 * @param string[] $terms Search terms; they are interpolated into regular
 *  expressions without quoting, so each must be a valid regex fragment
 * @param int $contextlines Number of snippet lines to collect
 * @param int $contextchars Context size per line (scaled by the average
 *  bytes-per-character of the terms, see below)
 * @return string Snippet markup, or '' when $text is empty
 */
public function highlightText(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$searchHighlightBoundaries = MediaWikiServices::getInstance()
		->getMainConfig()->get( MainConfigNames::SearchHighlightBoundaries );

	if ( $text == '' ) {
		return '';
	}

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = [
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

	// @todo FIXME: This should probably be a hook or something
	// instead of hardcoding the name of the Cite extension
	if ( ExtensionRegistry::getInstance()->isLoaded( 'Cite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = []; // text extracts
	$otherExt = []; // other extracts
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				// with PREG_OFFSET_CAPTURE an unmatched group has offset -1
				if ( $key > 0 && $val[1] != -1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, -1 );
						if (
							MediaWikiServices::getInstance()->getContentLanguage()->
							getNsIndex( $ns ) !== NS_FILE
						) {
							break;
						}

					}
					$epat = $endPatterns[$key];
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level--;
						}
					} else {
						// nested
						$level++;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( !$found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}
	'@phan-var string[] $textExt';

	$all = $textExt + $otherExt; // these have disjunct key sets

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it;
		// pure-ASCII terms are left untouched
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback(
				'/./us',
				$this->caseCallback( ... ),
				$terms[$index]
			);
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "{$searchHighlightBoundaries}+", $terms );
	// @todo FIXME: A hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	//
	// With an empty $terms list $anyterm is '' and mb_strlen() is 0;
	// dividing by it would throw DivisionByZeroError, so fall back to
	// a scale of 1 (no scaling) in that case.
	$anytermChars = mb_strlen( $anyterm );
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|{$searchHighlightBoundaries})";
	$patPost = "({$searchHighlightBoundaries}|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	$left = $contextlines;

	$snippets = [];
	$offsets = [];

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		if ( $line !== '' && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if ( $firstText ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( !$snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = [];
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = [ $first => $snippets[$first] ];
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
		// $targetchars is set when $snippets contains anything
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract(
					$all[$index],
					$offsets[$index],
					// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
					// $targetchars is set when $snippets contains anything
					$offsets[$index] + $targetchars,
					$offsets[$index]
				);
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
			// $targetchars is set when $snippets contains anything
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				// @phan-suppress-next-next-line PhanPossiblyUndeclaredVariable
				// $targetchars is set when $snippets contains anything
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	// $snippets = array_map( 'htmlspecialchars', $extended );
	$snippets = $extended;
	$last = -1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == -1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index
			&& $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
		) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	if ( $extract ) {
		$extract .= '<b> ... </b>';
	}

	// highlight each distinct term once
	$processed = [];
	foreach ( $terms as $term ) {
		if ( !isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	return $extract;
}
308
/**
 * Split text into lines, trim each one, and append the non-empty lines to
 * $extracts under consecutive sequence numbers.
 *
 * When $this->mCleanWikitext is set, the text is passed through
 * removeWiki() before splitting.
 *
 * @param string[] &$extracts Array the lines are appended to, keyed by sequence number
 * @param int &$count Next sequence number; incremented once per stored line
 * @param string $text Text to split on "\n"
 */
private function splitAndAdd( &$extracts, &$count, $text ) {
	$prepared = $this->mCleanWikitext ? $this->removeWiki( $text ) : $text;
	foreach ( explode( "\n", $prepared ) as $line ) {
		$trimmed = trim( $line );
		// Compare against '' explicitly: a plain truthiness check would
		// also discard a line whose whole content is "0".
		if ( $trimmed !== '' ) {
			$extracts[$count++] = $trimmed;
		}
	}
}
325
/**
 * preg_replace_callback() callback that makes a single character match
 * case-insensitively.
 *
 * A match longer than one byte is replaced by a character class holding
 * its lower-case and upper-case forms as produced by the content
 * language; a one-byte match is returned unchanged.
 *
 * @param array $matches Match array; only element 0 is used
 * @return string
 */
private function caseCallback( $matches ) {
	$char = $matches[0];
	if ( strlen( $char ) <= 1 ) {
		return $char;
	}

	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	return '[' . $contLang->lc( $char ) .
		$contLang->uc( $char ) . ']';
}
341
/**
 * Cut the part of $text between two byte offsets, after moving both
 * offsets with position().
 *
 * A $start of 0 is used as is; any other $start is adjusted with
 * position( $text, $start, 1 ). An $end at or beyond the end of the text
 * is clamped to the text length; otherwise it is adjusted with
 * position( $text, $end ).
 *
 * @param string $text
 * @param int $start Requested start offset in bytes
 * @param int $end Requested end offset in bytes
 * @param int|null &$posStart Receives the adjusted start, but only if the
 *  caller passed a non-null value
 * @param int|null &$posEnd Receives the adjusted end, but only if the
 *  caller passed a non-null value
 * @return string The extracted part, or '' if the adjusted end is not after the adjusted start
 */
private function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	$textLen = strlen( $text );

	if ( $start != 0 ) {
		$start = $this->position( $text, $start, 1 );
	}
	$end = $end >= $textLen ? $textLen : $this->position( $text, $end );

	if ( $posStart !== null ) {
		$posStart = $start;
	}
	if ( $posEnd !== null ) {
		$posEnd = $end;
	}

	return $end > $start ? substr( $text, $start, $end - $start ) : '';
}
375
/**
 * Find a good place to break the text near a byte offset.
 *
 * Looks at the window of up to 10 bytes on either side of $point. If the
 * window contains a space or one of the listed punctuation characters,
 * the offset of the first such character (plus $offset) is returned.
 * Otherwise $point is advanced past any UTF-8 continuation bytes
 * (0x80-0xBF) so that the result does not fall inside a multi-byte
 * character.
 *
 * @param string $text
 * @param int $point Byte offset to start from
 * @param int $offset Added to the result when a break character is found
 * @return int Byte offset into $text
 */
private function position( $text, $point, $offset = 0 ) {
	$tolerance = 10;
	$textLen = strlen( $text );
	$s = max( 0, $point - $tolerance );
	$l = min( $textLen, $point + $tolerance ) - $s;
	$m = [];

	// The class must contain ASCII characters only: the pattern has no /u
	// flag, so any multi-byte character in it would be matched byte by
	// byte and could report a position inside a UTF-8 sequence.
	if ( preg_match(
		'/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
		substr( $text, $s, $l ),
		$m,
		PREG_OFFSET_CAPTURE
	) ) {
		return $m[0][1] + $s + $offset;
	}

	// check if point is on a valid first UTF8 char
	while ( $point < $textLen ) {
		$char = ord( $text[$point] );
		if ( $char < 0x80 || $char >= 0xc0 ) {
			return $point;
		}
		// skip trailing bytes
		$point++;
	}

	// $point was at or beyond the end of the text (or only continuation
	// bytes followed it); never index past the last byte
	return $textLen;
}
413
/**
 * Search the given lines for a pattern and store a basic snippet for
 * each matching line, until the line budget is used up.
 *
 * Lines whose index already exists in $out are skipped. For a match, the
 * snippet window is $contextchars bytes wide: it starts at the beginning
 * of the line if the match ends within the first $contextchars bytes, at
 * the match itself if the match is longer than the window, and is
 * otherwise centred on the match.
 *
 * @param string $pattern Regular expression to search for
 * @param string[] $extracts Lines to search, keyed by sequence number
 * @param int &$linesleft Remaining number of snippets; decremented per snippet stored
 * @param int &$contextchars Width of the snippet window in bytes (only read here)
 * @param string[] &$out Receives the snippets, keyed like $extracts
 * @param int[] &$offsets Receives each snippet's start offset as adjusted by extract()
 */
private function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		return; // nothing to do
	}

	foreach ( $extracts as $index => $line ) {
		$m = [];
		// skip lines that are already highlighted or do not match
		if (
			array_key_exists( $index, $out )
			|| !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE )
		) {
			continue;
		}

		$matchStart = $m[0][1];
		$matchLen = strlen( $m[0][0] );
		$begin = match ( true ) {
			$matchStart + $matchLen < $contextchars => 0,
			$matchLen > $contextchars => $matchStart,
			default => $matchStart + intval( ( $matchLen - $contextchars ) / 2 ),
		};

		// basic snippet from this line
		$posBegin = $begin;
		$out[$index] = $this->extract( $line, $begin, $begin + $contextchars, $posBegin );
		$offsets[$index] = $posBegin;

		$linesleft--;
		if ( $linesleft == 0 ) {
			return;
		}
	}
}
460
/**
 * Strip basic wiki markup from text and escape the result.
 *
 * The replacements run one after another, in this order: templates
 * without parameters are removed, templates with parameters are reduced
 * to what follows the first pipe, unpiped links are reduced to their
 * target, piped links are handled by linkReplace(), tags are removed,
 * and finally bold/italic quote markup is removed.
 *
 * @param string $text
 * @return string Text as returned by Sanitizer::escapeHtmlAllowEntities()
 */
private function removeWiki( $text ) {
	// applied before the piped-link callback
	$templateAndLinkRules = [
		"/\\{\\{([^|]+?)\\}\\}/" => "",
		"/\\{\\{([^|]+\\|)(.*?)\\}\\}/" => "\\2",
		"/\\[\\[([^|]+?)\\]\\]/" => "\\1",
	];
	foreach ( $templateAndLinkRules as $pattern => $replacement ) {
		$text = preg_replace( $pattern, $replacement, $text );
	}

	$text = preg_replace_callback(
		"/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
		$this->linkReplace( ... ),
		$text
	);

	// applied after the piped-link callback; every match is deleted
	$strippedPatterns = [
		"/<\/?[^>]+>/",
		"/'''''/",
		"/('''|<\/?[iIuUbB]>)/",
		"/''/",
	];
	foreach ( $strippedPatterns as $pattern ) {
		$text = preg_replace( $pattern, "", $text );
	}

	// Note, the previous /<\/?[^>]+>/ is insufficient
	// for XSS safety as the HTML tag can span multiple
	// search results (T144845).
	return Sanitizer::escapeHtmlAllowEntities( $text );
}
486
/**
 * preg_replace_callback() callback for piped links.
 *
 * A link whose target has a prefix that resolves to the File or Category
 * namespace is returned unchanged; every other link is replaced by its
 * caption.
 *
 * @param array $matches Element 0 is the whole link, 1 the target including
 *  the trailing pipe, 2 the caption
 * @return string
 */
private function linkReplace( $matches ) {
	$colonPos = strpos( $matches[1], ':' );
	if ( $colonPos !== false ) {
		$nsIndex = MediaWikiServices::getInstance()->getContentLanguage()
			->getNsIndex( substr( $matches[1], 0, $colonPos ) );
		if ( $nsIndex === NS_FILE || $nsIndex === NS_CATEGORY ) {
			return $matches[0]; // return the whole thing
		}
	}

	return $matches[2]; // replace with caption
}
507
/**
 * Simple line-based snippet extraction.
 *
 * Every line of the text is tested against the terms; for each matching
 * line (up to $contextlines of them) the match is output together with
 * truncated context before and after it. The line is HTML-escaped and
 * the terms are then wrapped in <span class="searchmatch">.
 *
 * @param string $text
 * @param string[] $terms Search terms; joined with '|' and interpolated
 *  into regular expressions without quoting
 * @param int $contextlines Maximum number of matching lines to output
 * @param int $contextchars Length passed to truncateForVisual() for the
 *  context on each side of the match
 * @return string One "\n"-terminated line per match, '' if nothing matched
 */
public function highlightSimple(
	$text,
	$terms,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	$lines = explode( "\n", $text );

	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	$pat1 = "/(.*)($terms)(.{0,$max})/ui";
	// the highlight pattern does not depend on the line, build it once
	$pat2 = '/(' . $terms . ')/ui';

	$extract = '';
	$contLang = MediaWikiServices::getInstance()->getContentLanguage();
	foreach ( $lines as $line ) {
		if ( $contextlines == 0 ) {
			break;
		}
		$m = [];
		if ( !preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		--$contextlines;
		// truncate function changes ... to relevant i18n message.
		$pre = $contLang->truncateForVisual( $m[1], -$contextchars, '...', false );

		// Test for the element that is actually read; checking
		// count( $m ) < 3 would only guarantee that $m[2] exists.
		$post = isset( $m[3] )
			? $contLang->truncateForVisual( $m[3], $contextchars, '...', false )
			: '';

		$found = $m[2];

		$line = htmlspecialchars( $pre . $found . $post );
		$line = preg_replace( $pat2, '<span class="searchmatch">\1</span>', $line );

		$extract .= "{$line}\n";
	}

	return $extract;
}
563
/**
 * Return the first few lines of the text, HTML-escaped, without any
 * highlighting.
 *
 * Leading whitespace and empty lines are dropped, at most $contextlines
 * lines are kept, and the result is limited to
 * $contextlines * $contextchars bytes. Line breaks are output as <br>.
 *
 * @param string $text
 * @param int $contextlines Maximum number of lines; negative values are treated as 0
 * @param int $contextchars Byte budget per line
 * @return string
 */
public function highlightNone(
	$text,
	$contextlines = self::DEFAULT_CONTEXT_LINES,
	$contextchars = self::DEFAULT_CONTEXT_CHARS
) {
	// the value goes into a regex quantifier, which must not be negative
	$maxLines = max( 0, (int)$contextlines );

	$match = [];
	$text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
	// remove empty lines; a single-pass replacement of "\n\n" would leave
	// one behind for every run of three or more newlines
	$text = preg_replace( '/\n{2,}/', "\n", $text );
	preg_match( "/^(.*\n){0,$maxLines}/", $text, $match );

	// Trim and limit to max number of bytes. mb_strcut() keeps the byte
	// limit but never cuts a UTF-8 character in half, which would hand
	// invalid UTF-8 to htmlspecialchars().
	$text = htmlspecialchars(
		mb_strcut( trim( $match[0] ), 0, $maxLines * $contextchars, 'UTF-8' )
	);
	return str_replace( "\n", '<br>', $text );
}
586}
const NS_FILE
Definition Defines.php:57
const NS_CATEGORY
Definition Defines.php:65
A class containing constants representing the names of configuration variables.
Service locator for MediaWiki core services.
HTML sanitizer for MediaWiki.
Definition Sanitizer.php:32
Loads JSON files and uses a Processor to extract information.
Highlight bits of wikitext.
highlightText( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
__construct( $cleanupWikitext=true)
highlightSimple( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
highlightNone( $text, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Returns the first few lines of the text.