49 $this->mCleanWikitext = $cleanupWikitext;
65 $contextlines = self::DEFAULT_CONTEXT_LINES,
66 $contextchars = self::DEFAULT_CONTEXT_CHARS
68 $searchHighlightBoundaries = MediaWikiServices::getInstance()
69 ->getMainConfig()->get( MainConfigNames::SearchHighlightBoundaries );
76 $spat =
"/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
79 1 =>
'/(\{\{)|(\}\})/',
80 2 =>
'/(\[\[)|(\]\])/',
81 3 =>
"/(\n\\{\\|)|(\n\\|\\})/" ];
87 $endPatterns[4] =
'/(<ref>)|(<\/ref>)/';
93 $textLen = strlen( $text );
95 while ( $start < $textLen ) {
97 if ( preg_match( $spat, $text,
$matches, PREG_OFFSET_CAPTURE, $start ) ) {
99 foreach (
$matches as $key => $val ) {
100 if ( $key > 0 && $val[1] != -1 ) {
103 $ns = substr( $val[0], 2, -1 );
105 MediaWikiServices::getInstance()->getContentLanguage()->
112 $epat = $endPatterns[$key];
113 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
121 $offset = $start + 1;
123 while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
124 if ( array_key_exists( 2, $endMatches ) ) {
127 $len = strlen( $endMatches[2][0] );
128 $off = $endMatches[2][1];
129 $this->splitAndAdd( $otherExt, $count,
130 substr( $text, $start, $off + $len - $start ) );
131 $start = $off + $len;
142 $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
146 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen(
$matches[0][0] ) ) );
153 $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
156 '@phan-var string[] $textExt';
158 $all = $textExt + $otherExt;
161 foreach ( $terms as $index => $term ) {
163 if ( preg_match(
'/[\x80-\xff]/', $term ) ) {
164 $terms[$index] = preg_replace_callback(
166 [ $this,
'caseCallback' ],
170 $terms[$index] = $term;
173 $anyterm = implode(
'|', $terms );
174 $phrase = implode(
"{$searchHighlightBoundaries}+", $terms );
179 $scale = strlen( $anyterm ) / mb_strlen( $anyterm );
180 $contextchars = intval( $contextchars * $scale );
182 $patPre =
"(^|{$searchHighlightBoundaries})";
183 $patPost =
"({$searchHighlightBoundaries}|$)";
185 $pat1 =
"/(" . $phrase .
")/ui";
186 $pat2 =
"/$patPre(" . $anyterm .
")$patPost/ui";
188 $left = $contextlines;
196 foreach ( $textExt as $index => $line ) {
197 if ( strlen( $line ) > 0 && $line[0] !=
';' && $line[0] !=
':' ) {
198 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
206 foreach ( $terms as $term ) {
207 if ( !preg_match(
"/$patPre" . $term .
"$patPost/ui", $firstText ) ) {
213 $snippets[$first] = $firstText;
214 $offsets[$first] = 0;
219 $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
221 $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
223 $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
225 $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );
232 if ( count( $snippets ) == 0 ) {
234 if ( array_key_exists( $first, $all ) ) {
235 $targetchars = $contextchars * $contextlines;
236 $snippets[$first] =
'';
237 $offsets[$first] = 0;
241 if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
242 && $offsets[$first] < $contextchars * 2 ) {
243 $snippets = [ $first => $snippets[$first] ];
247 $targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
250 foreach ( $snippets as $index => $line ) {
251 $extended[$index] = $line;
252 $len = strlen( $line );
255 if ( $len < $targetchars - 20 ) {
257 if ( $len < strlen( $all[$index] ) ) {
258 $extended[$index] = $this->extract(
263 $offsets[$index] + $targetchars,
266 $len = strlen( $extended[$index] );
273 while ( $len < $targetchars - 20
274 && array_key_exists( $add, $all )
275 && !array_key_exists( $add, $snippets ) ) {
279 $tt =
"\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
280 $extended[$add] = $tt;
281 $len += strlen( $tt );
288 $snippets = $extended;
291 foreach ( $snippets as $index => $line ) {
294 } elseif ( $last + 1 == $index
295 && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
297 $extract .=
" " . $line;
299 $extract .=
'<b> ... </b>' . $line;
305 $extract .=
'<b> ... </b>';
309 foreach ( $terms as $term ) {
310 if ( !isset( $processed[$term] ) ) {
311 $pat3 =
"/$patPre(" . $term .
")$patPost/ui";
312 $extract = preg_replace( $pat3,
313 "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
314 $processed[$term] =
true;
328 private function splitAndAdd( &$extracts, &$count, $text ) {
329 $split = explode(
"\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
330 foreach ( $split as $line ) {
333 $extracts[$count++] = $tt;
344 private function caseCallback(
$matches ) {
346 $contLang = MediaWikiServices::getInstance()->getContentLanguage();
347 return '[' . $contLang->lc(
$matches[0] ) .
364 private function extract( $text, $start, $end, &$posStart =
null, &$posEnd =
null ) {
366 $start = $this->position( $text, $start, 1 );
368 if ( $end >= strlen( $text ) ) {
369 $end = strlen( $text );
371 $end = $this->position( $text, $end );
374 if ( $posStart !==
null ) {
377 if ( $posEnd !==
null ) {
381 if ( $end > $start ) {
382 return substr( $text, $start, $end - $start );
396 private function position( $text, $point, $offset = 0 ) {
398 $s = max( 0, $point - $tolerance );
399 $l = min( strlen( $text ), $point + $tolerance ) - $s;
403 '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
404 substr( $text, $s, $l ),
408 return $m[0][1] + $s + $offset;
411 $char = ord( $text[$point] );
412 while ( $char >= 0x80 && $char < 0xc0 ) {
415 if ( $point >= strlen( $text ) ) {
416 return strlen( $text );
418 $char = ord( $text[$point] );
436 private function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
437 if ( $linesleft == 0 ) {
440 foreach ( $extracts as $index => $line ) {
441 if ( array_key_exists( $index, $out ) ) {
446 if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
451 $len = strlen( $m[0][0] );
452 if ( $offset + $len < $contextchars ) {
454 } elseif ( $len > $contextchars ) {
457 $begin = $offset + intval( ( $len - $contextchars ) / 2 );
460 $end = $begin + $contextchars;
464 $out[$index] = $this->extract( $line, $begin, $end, $posBegin );
465 $offsets[$index] = $posBegin;
467 if ( $linesleft == 0 ) {
478 private function removeWiki( $text ) {
479 $text = preg_replace(
"/\\{\\{([^|]+?)\\}\\}/",
"", $text );
480 $text = preg_replace(
"/\\{\\{([^|]+\\|)(.*?)\\}\\}/",
"\\2", $text );
481 $text = preg_replace(
"/\\[\\[([^|]+?)\\]\\]/",
"\\1", $text );
482 $text = preg_replace_callback(
483 "/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
484 [ $this,
'linkReplace' ],
487 $text = preg_replace(
"/<\/?[^>]+>/",
"", $text );
488 $text = preg_replace(
"/'''''/",
"", $text );
489 $text = preg_replace(
"/('''|<\/?[iIuUbB]>)/",
"", $text );
490 $text = preg_replace(
"/''/",
"", $text );
495 $text = Sanitizer::escapeHtmlAllowEntities( $text );
506 private function linkReplace(
$matches ) {
507 $colon = strpos(
$matches[1],
':' );
508 if ( $colon ===
false ) {
511 $ns = substr(
$matches[1], 0, $colon );
512 $index = MediaWikiServices::getInstance()->getContentLanguage()->getNsIndex( $ns );
535 $contextlines = self::DEFAULT_CONTEXT_LINES,
536 $contextchars = self::DEFAULT_CONTEXT_CHARS
538 $lines = explode(
"\n", $text );
540 $terms = implode(
'|', $terms );
541 $max = intval( $contextchars ) + 1;
542 $pat1 =
"/(.*)($terms)(.{0,$max})/ui";
545 $contLang = MediaWikiServices::getInstance()->getContentLanguage();
546 foreach (
$lines as $line ) {
547 if ( $contextlines == 0 ) {
551 if ( !preg_match( $pat1, $line, $m ) ) {
556 $pre = $contLang->truncateForVisual( $m[1], -$contextchars,
'...',
false );
558 if ( count( $m ) < 3 ) {
561 $post = $contLang->truncateForVisual( $m[3], $contextchars,
'...',
false );
566 $line = htmlspecialchars( $pre . $found . $post );
567 $pat2 =
'/(' . $terms .
')/ui';
568 $line = preg_replace( $pat2,
'<span class="searchmatch">\1</span>', $line );
570 $extract .=
"{$line}\n";
586 $contextlines = self::DEFAULT_CONTEXT_LINES,
587 $contextchars = self::DEFAULT_CONTEXT_CHARS
590 $text = ltrim( $text ) .
"\n";
591 $text = str_replace(
"\n\n",
"\n", $text );
592 preg_match(
"/^(.*\n){0,$contextlines}/", $text, $match );
595 $text = htmlspecialchars( substr( trim( $match[0] ), 0, $contextlines * $contextchars ) );
596 return str_replace(
"\n",
'<br>', $text );
A class containing constants representing the names of configuration variables.
Highlight bits of wikitext.
const DEFAULT_CONTEXT_LINES
highlightText( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Wikitext highlighting, used when $wgAdvancedSearchHighlighting is true.
__construct( $cleanupWikitext=true)
highlightSimple( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Simple and fast snippet extraction, but it gives completely irrelevant snippets.
const DEFAULT_CONTEXT_CHARS
highlightNone( $text, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Returns the first few lines of the text.
if(!file_exists( $CREDITS)) $lines