48 $this->mCleanWikitext = $cleanupWikitext;
64 $contextlines = self::DEFAULT_CONTEXT_LINES,
65 $contextchars = self::DEFAULT_CONTEXT_CHARS
67 $searchHighlightBoundaries = MediaWikiServices::getInstance()
68 ->getMainConfig()->get( MainConfigNames::SearchHighlightBoundaries );
75 $spat =
"/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
78 1 =>
'/(\{\{)|(\}\})/',
79 2 =>
'/(\[\[)|(\]\])/',
80 3 =>
"/(\n\\{\\|)|(\n\\|\\})/" ];
86 $endPatterns[4] =
'/(<ref>)|(<\/ref>)/';
92 $textLen = strlen( $text );
94 while ( $start < $textLen ) {
96 if ( preg_match( $spat, $text,
$matches, PREG_OFFSET_CAPTURE, $start ) ) {
98 foreach (
$matches as $key => $val ) {
99 if ( $key > 0 && $val[1] != -1 ) {
102 $ns = substr( $val[0], 2, -1 );
104 MediaWikiServices::getInstance()->getContentLanguage()->
111 $epat = $endPatterns[$key];
112 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
120 $offset = $start + 1;
122 while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
123 if ( array_key_exists( 2, $endMatches ) ) {
126 $len = strlen( $endMatches[2][0] );
127 $off = $endMatches[2][1];
128 $this->splitAndAdd( $otherExt, $count,
129 substr( $text, $start, $off + $len - $start ) );
130 $start = $off + $len;
141 $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
145 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen(
$matches[0][0] ) ) );
152 $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
155 '@phan-var string[] $textExt';
157 $all = $textExt + $otherExt;
160 foreach ( $terms as $index => $term ) {
162 if ( preg_match(
'/[\x80-\xff]/', $term ) ) {
163 $terms[$index] = preg_replace_callback(
165 [ $this,
'caseCallback' ],
169 $terms[$index] = $term;
172 $anyterm = implode(
'|', $terms );
173 $phrase = implode(
"{$searchHighlightBoundaries}+", $terms );
178 $scale = strlen( $anyterm ) / mb_strlen( $anyterm );
179 $contextchars = intval( $contextchars * $scale );
181 $patPre =
"(^|{$searchHighlightBoundaries})";
182 $patPost =
"({$searchHighlightBoundaries}|$)";
184 $pat1 =
"/(" . $phrase .
")/ui";
185 $pat2 =
"/$patPre(" . $anyterm .
")$patPost/ui";
187 $left = $contextlines;
195 foreach ( $textExt as $index => $line ) {
196 if ( strlen( $line ) > 0 && $line[0] !=
';' && $line[0] !=
':' ) {
197 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
205 foreach ( $terms as $term ) {
206 if ( !preg_match(
"/$patPre" . $term .
"$patPost/ui", $firstText ) ) {
212 $snippets[$first] = $firstText;
213 $offsets[$first] = 0;
218 $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
220 $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
222 $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
224 $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );
231 if ( count( $snippets ) == 0 ) {
233 if ( array_key_exists( $first, $all ) ) {
234 $targetchars = $contextchars * $contextlines;
235 $snippets[$first] =
'';
236 $offsets[$first] = 0;
240 if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
241 && $offsets[$first] < $contextchars * 2 ) {
242 $snippets = [ $first => $snippets[$first] ];
246 $targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
249 foreach ( $snippets as $index => $line ) {
250 $extended[$index] = $line;
251 $len = strlen( $line );
254 if ( $len < $targetchars - 20 ) {
256 if ( $len < strlen( $all[$index] ) ) {
257 $extended[$index] = $this->extract(
262 $offsets[$index] + $targetchars,
265 $len = strlen( $extended[$index] );
272 while ( $len < $targetchars - 20
273 && array_key_exists( $add, $all )
274 && !array_key_exists( $add, $snippets ) ) {
278 $tt =
"\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
279 $extended[$add] = $tt;
280 $len += strlen( $tt );
287 $snippets = $extended;
290 foreach ( $snippets as $index => $line ) {
293 } elseif ( $last + 1 == $index
294 && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
296 $extract .=
" " . $line;
298 $extract .=
'<b> ... </b>' . $line;
304 $extract .=
'<b> ... </b>';
308 foreach ( $terms as $term ) {
309 if ( !isset( $processed[$term] ) ) {
310 $pat3 =
"/$patPre(" . $term .
")$patPost/ui";
311 $extract = preg_replace( $pat3,
312 "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
313 $processed[$term] =
true;
327 private function splitAndAdd( &$extracts, &$count, $text ) {
328 $split = explode(
"\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
329 foreach ( $split as $line ) {
332 $extracts[$count++] = $tt;
343 private function caseCallback(
$matches ) {
345 $contLang = MediaWikiServices::getInstance()->getContentLanguage();
346 return '[' . $contLang->lc(
$matches[0] ) .
363 private function extract( $text, $start, $end, &$posStart =
null, &$posEnd =
null ) {
365 $start = $this->position( $text, $start, 1 );
367 if ( $end >= strlen( $text ) ) {
368 $end = strlen( $text );
370 $end = $this->position( $text, $end );
373 if ( $posStart !==
null ) {
376 if ( $posEnd !==
null ) {
380 if ( $end > $start ) {
381 return substr( $text, $start, $end - $start );
395 private function position( $text, $point, $offset = 0 ) {
397 $s = max( 0, $point - $tolerance );
398 $l = min( strlen( $text ), $point + $tolerance ) -
$s;
402 '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
403 substr( $text,
$s, $l ),
407 return $m[0][1] +
$s + $offset;
410 $char = ord( $text[$point] );
411 while ( $char >= 0x80 && $char < 0xc0 ) {
414 if ( $point >= strlen( $text ) ) {
415 return strlen( $text );
417 $char = ord( $text[$point] );
435 private function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
436 if ( $linesleft == 0 ) {
439 foreach ( $extracts as $index => $line ) {
440 if ( array_key_exists( $index, $out ) ) {
445 if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
450 $len = strlen( $m[0][0] );
451 if ( $offset + $len < $contextchars ) {
453 } elseif ( $len > $contextchars ) {
456 $begin = $offset + intval( ( $len - $contextchars ) / 2 );
459 $end = $begin + $contextchars;
463 $out[$index] = $this->extract( $line, $begin, $end, $posBegin );
464 $offsets[$index] = $posBegin;
466 if ( $linesleft == 0 ) {
477 private function removeWiki( $text ) {
478 $text = preg_replace(
"/\\{\\{([^|]+?)\\}\\}/",
"", $text );
479 $text = preg_replace(
"/\\{\\{([^|]+\\|)(.*?)\\}\\}/",
"\\2", $text );
480 $text = preg_replace(
"/\\[\\[([^|]+?)\\]\\]/",
"\\1", $text );
481 $text = preg_replace_callback(
482 "/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
483 [ $this,
'linkReplace' ],
486 $text = preg_replace(
"/<\/?[^>]+>/",
"", $text );
487 $text = preg_replace(
"/'''''/",
"", $text );
488 $text = preg_replace(
"/('''|<\/?[iIuUbB]>)/",
"", $text );
489 $text = preg_replace(
"/''/",
"", $text );
505 private function linkReplace(
$matches ) {
506 $colon = strpos(
$matches[1],
':' );
507 if ( $colon ===
false ) {
510 $ns = substr(
$matches[1], 0, $colon );
511 $index = MediaWikiServices::getInstance()->getContentLanguage()->getNsIndex( $ns );
534 $contextlines = self::DEFAULT_CONTEXT_LINES,
535 $contextchars = self::DEFAULT_CONTEXT_CHARS
537 $lines = explode(
"\n", $text );
539 $terms = implode(
'|', $terms );
540 $max = intval( $contextchars ) + 1;
541 $pat1 =
"/(.*)($terms)(.{0,$max})/i";
544 $contLang = MediaWikiServices::getInstance()->getContentLanguage();
545 foreach (
$lines as $line ) {
546 if ( $contextlines == 0 ) {
550 if ( !preg_match( $pat1, $line, $m ) ) {
555 $pre = $contLang->truncateForVisual( $m[1], -$contextchars,
'...',
false );
557 if ( count( $m ) < 3 ) {
560 $post = $contLang->truncateForVisual( $m[3], $contextchars,
'...',
false );
565 $line = htmlspecialchars( $pre . $found . $post );
566 $pat2 =
'/(' . $terms .
")/i";
567 $line = preg_replace( $pat2,
"<span class='searchmatch'>\\1</span>", $line );
569 $extract .=
"{$line}\n";
585 $contextlines = self::DEFAULT_CONTEXT_LINES,
586 $contextchars = self::DEFAULT_CONTEXT_CHARS
589 $text = ltrim( $text ) .
"\n";
590 $text = str_replace(
"\n\n",
"\n", $text );
591 preg_match(
"/^(.*\n){0,$contextlines}/", $text, $match );
594 $text = htmlspecialchars( substr( trim( $match[0] ), 0, $contextlines * $contextchars ) );
595 return str_replace(
"\n",
'<br>', $text );
A class containing constants representing the names of configuration variables.
static escapeHtmlAllowEntities( $html)
Given HTML input, escape with htmlspecialchars but un-escape entities.
Highlight bits of wikitext.
const DEFAULT_CONTEXT_LINES
highlightText( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
__construct( $cleanupWikitext=true)
highlightSimple( $text, $terms, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
const DEFAULT_CONTEXT_CHARS
highlightNone( $text, $contextlines=self::DEFAULT_CONTEXT_LINES, $contextchars=self::DEFAULT_CONTEXT_CHARS)
Returns the first few lines of the text.
foreach( $mmfl['setupFiles'] as $fileName) if( $queue) if(empty( $mmfl['quiet'])) $s
if(!file_exists( $CREDITS)) $lines