REL1_31/php/SearchHighlighter_8php_source.html

<?php


class SearchHighlighter {

    protected $mCleanWikitext = true;


    /**
     * Set up the highlighter.
     *
     * @param bool $cleanupWikitext Value stored in $mCleanWikitext. When it is
     *  true, splitAndAdd() runs every chunk through removeWiki() before
     *  splitting it into lines.
     */
    public function __construct( $cleanupWikitext = true ) {
        $this->mCleanWikitext = $cleanupWikitext;
    }


    /**
     * Build a highlighted snippet out of wikitext.
     *
     * The text is first split into "text" lines and "other" lines (templates,
     * file links, tables and, when the Cite class exists, <ref> blocks). Lines
     * matching the search terms are then picked, padded up to a roughly
     * constant size, joined, and every term occurrence is wrapped in
     * <span class='searchmatch'>.
     *
     * @param string $text Wikitext to build the snippet from
     * @param string[] $terms Search terms. They are interpolated into regular
     *  expressions WITHOUT quoting, so each entry has to be a valid regex
     *  fragment already.
     * @param int $contextlines Maximum number of matching lines to pick
     * @param int $contextchars Approximate snippet width per line; scaled by
     *  the bytes-per-character ratio of the terms because all offsets below
     *  are byte offsets
     * @return string Snippet containing <span class='searchmatch'> and
     *  '<b> ... </b>' markup; '' when $text is empty
     */
    public function highlightText( $text, $terms, $contextlines, $contextchars ) {
        global $wgContLang, $wgSearchHighlightBoundaries;

        if ( $text == '' ) {
            return '';
        }

        // split text into text + templates/links/tables
        $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
        // first capture group is for detecting nested templates/links/tables/references
        $endPatterns = [
            1 => '/(\{\{)|(\}\})/', // template
            2 => '/(\[\[)|(\]\])/', // image
            3 => "/(\n\\{\\|)|(\n\\|\\})/" ]; // table

        // @todo FIXME: This should prolly be a hook or something
        // instead of hardcoding a class name from the Cite extension
        if ( class_exists( 'Cite' ) ) {
            $spat .= '|(<ref>)'; // references via cite extension
            $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
        }
        $spat .= '/';
        $textExt = []; // text extracts
        $otherExt = []; // other extracts
        $start = 0;
        $textLen = strlen( $text );
        $count = 0; // sequence number to maintain ordering
        while ( $start < $textLen ) {
            // find start of template/image/table
            if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
                $epat = '';
                foreach ( $matches as $key => $val ) {
                    // with PREG_OFFSET_CAPTURE a group that did not take part has offset -1
                    if ( $key > 0 && $val[1] != -1 ) {
                        if ( $key == 2 ) {
                            // see if this is an image link
                            $ns = substr( $val[0], 2, -1 );
                            if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
                                // $epat stays empty, so the remainder is added as plain text below
                                break;
                            }
                        }
                        $epat = $endPatterns[$key];
                        $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
                        $start = $val[1];
                        break;
                    }
                }
                if ( $epat ) {
                    // find end (and detect any nested elements)
                    $level = 0;
                    $offset = $start + 1;
                    $found = false;
                    while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
                        // group 2 is the closing token; it is only present when it matched
                        if ( array_key_exists( 2, $endMatches ) ) {
                            // found end
                            if ( $level == 0 ) {
                                $len = strlen( $endMatches[2][0] );
                                $off = $endMatches[2][1];
                                $this->splitAndAdd( $otherExt, $count,
                                    substr( $text, $start, $off + $len - $start ) );
                                $start = $off + $len;
                                $found = true;
                                break;
                            } else {
                                // end of nested element
                                $level -= 1;
                            }
                        } else {
                            // nested
                            $level += 1;
                        }
                        $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
                    }
                    if ( !$found ) {
                        // couldn't find appropriate closing tag, skip
                        $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
                        $start += strlen( $matches[0][0] );
                    }
                    continue;
                }
            }
            // else: add as text extract
            $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
            break;
        }

        $all = $textExt + $otherExt; // these have disjunct key sets

        // prepare regexps
        foreach ( $terms as $index => $term ) {
            // manually do upper/lowercase stuff for utf-8 since PHP won't do it
            if ( preg_match( '/[\x80-\xff]/', $term ) ) {
                $terms[$index] = preg_replace_callback(
                    '/./us',
                    [ $this, 'caseCallback' ],
                    $terms[$index]
                );
            } else {
                $terms[$index] = $term;
            }
        }
        $anyterm = implode( '|', $terms );
        $phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
        // @todo FIXME: A hack to scale contextchars, a correct solution
        // would be to have contextchars actually be char and not byte
        // length, and do proper utf-8 substrings and lengths everywhere,
        // but PHP is making that very hard and unclean to implement :(
        // An empty term list gives an empty $anyterm; avoid dividing by zero then.
        $anytermChars = mb_strlen( $anyterm );
        $scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
        $contextchars = intval( $contextchars * $scale );

        $patPre = "(^|$wgSearchHighlightBoundaries)";
        $patPost = "($wgSearchHighlightBoundaries|$)";

        $pat1 = "/(" . $phrase . ")/ui";
        $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

        $left = $contextlines;

        $snippets = [];
        $offsets = [];

        // show beginning only if it contains all words
        $first = 0;
        $firstText = '';
        foreach ( $textExt as $index => $line ) {
            // skip lines starting with ';' or ':'
            if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
                $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
                $first = $index;
                break;
            }
        }
        if ( $firstText ) {
            $succ = true;
            // check if first text contains all terms
            foreach ( $terms as $term ) {
                if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
                    $succ = false;
                    break;
                }
            }
            if ( $succ ) {
                $snippets[$first] = $firstText;
                $offsets[$first] = 0;
            }
        }
        if ( !$snippets ) {
            // match whole query on text
            $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
            // match whole query on templates/tables/images
            $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
            // match any words on text
            $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
            // match any words on templates/tables/images
            $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

            ksort( $snippets );
        }

        // add extra chars to each snippet to make snippets constant size
        $extended = [];
        // only read while iterating $snippets; initialised so it is defined on every path
        $targetchars = 0;
        if ( count( $snippets ) == 0 ) {
            // couldn't find the target words, just show beginning of article
            if ( array_key_exists( $first, $all ) ) {
                $targetchars = $contextchars * $contextlines;
                $snippets[$first] = '';
                $offsets[$first] = 0;
            }
        } else {
            // if begin of the article contains the whole phrase, show only that !!
            if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
                && $offsets[$first] < $contextchars * 2 ) {
                $snippets = [ $first => $snippets[$first] ];
            }

            // calc by how much to extend existing snippets
            $targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
        }

        foreach ( $snippets as $index => $line ) {
            $extended[$index] = $line;
            $len = strlen( $line );
            if ( $len < $targetchars - 20 ) {
                // complete this line
                if ( $len < strlen( $all[$index] ) ) {
                    $extended[$index] = $this->extract(
                        $all[$index],
                        $offsets[$index],
                        $offsets[$index] + $targetchars,
                        $offsets[$index]
                    );
                    $len = strlen( $extended[$index] );
                }

                // add more lines
                $add = $index + 1;
                while ( $len < $targetchars - 20
                        && array_key_exists( $add, $all )
                        && !array_key_exists( $add, $snippets ) ) {
                    // extract() only writes back the start position into a non-null variable
                    $offsets[$add] = 0;
                    $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
                    $extended[$add] = $tt;
                    $len += strlen( $tt );
                    $add++;
                }
            }
        }

        // $snippets = array_map( 'htmlspecialchars', $extended );
        $snippets = $extended;
        $last = -1;
        $extract = '';
        foreach ( $snippets as $index => $line ) {
            if ( $last == -1 ) {
                $extract .= $line; // first line
            } elseif ( $last + 1 == $index
                && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] )
            ) {
                $extract .= " " . $line; // continuous lines
            } else {
                $extract .= '<b> ... </b>' . $line;
            }

            $last = $index;
        }
        if ( $extract ) {
            $extract .= '<b> ... </b>';
        }

        // wrap every distinct term once
        $processed = [];
        foreach ( $terms as $term ) {
            if ( !isset( $processed[$term] ) ) {
                $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
                $extract = preg_replace( $pat3,
                    "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
                $processed[$term] = true;
            }
        }

        return $extract;
    }


    /**
     * Split text into lines and append the non-empty ones to $extracts.
     *
     * When $mCleanWikitext is set the text goes through removeWiki() first.
     * Every line is trimmed; lines that are empty after trimming are skipped.
     *
     * @param string[] &$extracts Destination array, keyed by sequence number
     * @param int &$count Next sequence number; incremented once per added line
     * @param string $text Text to split on "\n"
     */
    function splitAndAdd( &$extracts, &$count, $text ) {
        $prepared = $this->mCleanWikitext ? $this->removeWiki( $text ) : $text;
        foreach ( explode( "\n", $prepared ) as $line ) {
            $tt = trim( $line );
            // Compare with '' explicitly: a plain truthiness test would also
            // throw away a line that consists of the single character "0".
            if ( $tt !== '' ) {
                $extracts[$count++] = $tt;
            }
        }
    }


    /**
     * Manual case folding for one matched character.
     *
     * Used as the preg_replace_callback() handler in highlightText(). A match
     * longer than one byte is turned into a two-alternative character class
     * built from its lower- and upper-case forms; a one-byte match is returned
     * as is.
     *
     * @param array $matches Match array; only element 0 is used
     * @return string
     */
    function caseCallback( $matches ) {
        global $wgContLang;

        $char = $matches[0];
        if ( strlen( $char ) <= 1 ) {
            return $char;
        }

        return '[' . $wgContLang->lc( $char ) . $wgContLang->uc( $char ) . ']';
    }


    /**
     * Cut a piece out of $text between two byte positions that are first
     * adjusted with position().
     *
     * A non-zero $start is moved with position( $text, $start, 1 ). An $end at
     * or past the end of the text is clamped to the text length, otherwise it
     * is moved with position( $text, $end ).
     *
     * @param string $text
     * @param int $start Requested start (byte offset)
     * @param int $end Requested end (byte offset)
     * @param int|null &$posStart Receives the adjusted start, but only when
     *  the caller passed in a non-null value
     * @param int|null &$posEnd Receives the adjusted end, same rule
     * @return string The extracted piece, '' when the adjusted range is empty
     */
    function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
        $textLen = strlen( $text );

        if ( $start != 0 ) {
            $start = $this->position( $text, $start, 1 );
        }
        $end = ( $end >= $textLen ) ? $textLen : $this->position( $text, $end );

        // report the adjusted bounds back only to callers that asked for them
        if ( $posStart !== null ) {
            $posStart = $start;
        }
        if ( $posEnd !== null ) {
            $posEnd = $end;
        }

        return ( $end > $start ) ? substr( $text, $start, $end - $start ) : '';
    }


    /**
     * Find a position near $point where the text can be cut.
     *
     * The window [$point - 10, $point + 10) is searched for the FIRST
     * punctuation/space byte from a fixed list; if one is found its offset
     * plus $offset is returned. Otherwise $point itself is used, moved forward
     * past any UTF-8 continuation bytes (0x80-0xBF) so that the cut never
     * lands inside a multibyte character.
     *
     * @param string $text
     * @param int $point Byte offset to look around
     * @param int $offset Added to the result only when a boundary byte was found
     * @return int Byte offset, at most strlen( $text ) when no boundary was found
     */
    function position( $text, $point, $offset = 0 ) {
        $tolerance = 10;
        $textLen = strlen( $text );
        $s = max( 0, $point - $tolerance );
        $l = min( $textLen, $point + $tolerance ) - $s;
        $m = [];

        // The class must contain ASCII bytes only. This pattern has no /u
        // modifier, so any stray multibyte character in it (such as an
        // invisible zero-width joiner) would turn its individual bytes into
        // "boundaries" and allow cuts inside UTF-8 sequences.
        if ( preg_match(
            '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/',
            substr( $text, $s, $l ),
            $m,
            PREG_OFFSET_CAPTURE
        ) ) {
            return $m[0][1] + $s + $offset;
        }

        // No boundary nearby. Never index past the end of the string.
        if ( $point >= $textLen ) {
            return $textLen;
        }

        // check if point is on a valid first UTF8 char
        $char = ord( $text[$point] );
        while ( $char >= 0x80 && $char < 0xc0 ) {
            // skip trailing bytes
            $point++;
            if ( $point >= $textLen ) {
                return $textLen;
            }
            $char = ord( $text[$point] );
        }

        return $point;
    }


    /**
     * Look for $pattern in each extract and store a basic snippet for every
     * line that matches, until $linesleft reaches zero.
     *
     * Lines whose index is already present in $out are skipped. For a match
     * the snippet window is $contextchars bytes wide: it starts at 0 when the
     * match ends before $contextchars, at the match itself when the match is
     * longer than $contextchars, and is centred on the match otherwise.
     *
     * @param string $pattern Regular expression
     * @param string[] $extracts Lines to search, keyed by sequence number
     * @param int &$linesleft Remaining number of snippets; decremented per hit
     * @param int &$contextchars Snippet width in bytes (only read here)
     * @param string[] &$out Receives snippets, keyed like $extracts
     * @param int[] &$offsets Receives the start offset of each snippet
     */
    function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
        if ( $linesleft == 0 ) {
            return; // nothing to do
        }

        foreach ( $extracts as $index => $line ) {
            $hit = [];
            // already highlighted, or no match on this line
            if ( array_key_exists( $index, $out )
                || !preg_match( $pattern, $line, $hit, PREG_OFFSET_CAPTURE )
            ) {
                continue;
            }

            $matchStart = $hit[0][1];
            $matchLen = strlen( $hit[0][0] );
            if ( $matchStart + $matchLen < $contextchars ) {
                $begin = 0;
            } elseif ( $matchLen > $contextchars ) {
                $begin = $matchStart;
            } else {
                $begin = $matchStart + intval( ( $matchLen - $contextchars ) / 2 );
            }

            // extract() overwrites this with the start it actually used
            $snippetStart = $begin;
            $out[$index] = $this->extract( $line, $begin, $begin + $contextchars, $snippetStart );
            $offsets[$index] = $snippetStart;

            if ( --$linesleft == 0 ) {
                return;
            }
        }
    }


    /**
     * Basic wikitext removal followed by HTML escaping.
     *
     * The replacements run strictly in the order listed: templates, links
     * without caption, links with caption (via linkReplace()), tags, then
     * quote-based bold/italic markup.
     *
     * @param string $text
     * @return string Result of Sanitizer::escapeHtmlAllowEntities()
     */
    function removeWiki( $text ) {
        // templates and caption-less links
        $beforeLinks = [
            [ "/\\{\\{([^|]+?)\\}\\}/", "" ],
            [ "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2" ],
            [ "/\\[\\[([^|]+?)\\]\\]/", "\\1" ],
        ];
        foreach ( $beforeLinks as $step ) {
            $text = preg_replace( $step[0], $step[1], $text );
        }

        // [[target|caption]] links
        $text = preg_replace_callback(
            "/\\[\\[([^|]+\\|)(.*?)\\]\\]/",
            [ $this, 'linkReplace' ],
            $text
        );

        // tags and quote markup, all replaced by nothing
        $stripped = [
            "/<\/?[^>]+>/",
            "/'''''/",
            "/('''|<\/?[iIuUbB]>)/",
            "/''/",
        ];
        foreach ( $stripped as $pattern ) {
            $text = preg_replace( $pattern, "", $text );
        }

        // Note, the previous /<\/?[^>]+>/ is insufficient
        // for XSS safety as the HTML tag can span multiple
        // search results (T144845).
        return Sanitizer::escapeHtmlAllowEntities( $text );
    }


    /**
     * preg_replace_callback() handler for [[target|caption]] links.
     *
     * The whole link is kept when the part of the target before the first
     * colon resolves to NS_FILE or NS_CATEGORY; in every other case the link
     * is replaced by its caption.
     *
     * @param array $matches 0: whole link, 1: target including the trailing
     *  '|', 2: caption
     * @return string
     */
    function linkReplace( $matches ) {
        global $wgContLang;

        $colonPos = strpos( $matches[1], ':' );
        if ( $colonPos !== false ) {
            $nsIndex = $wgContLang->getNsIndex( substr( $matches[1], 0, $colonPos ) );
            if ( $nsIndex !== false && ( $nsIndex == NS_FILE || $nsIndex == NS_CATEGORY ) ) {
                return $matches[0]; // return the whole thing
            }
        }

        return $matches[2]; // replace with caption
    }


    /**
     * Simple line-based snippet extraction.
     *
     * Each line of $text is matched against the alternation of all terms; for
     * a matching line the text before and after the match is shortened with
     * $wgContLang->truncate(), the line is HTML-escaped and the terms are
     * wrapped in <span class='searchmatch'>. Stops after $contextlines
     * matching lines.
     *
     * @param string $text
     * @param string[] $terms Regex fragments; joined with '|' and interpolated
     *  unquoted into the patterns
     * @param int $contextlines Maximum number of lines to return
     * @param int $contextchars Context length passed to truncate() on each side
     * @return string Matching lines, each terminated by "\n"
     */
    public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
        global $wgContLang;

        $lines = explode( "\n", $text );

        $terms = implode( '|', $terms );
        // one byte more than needed so truncate() can tell that text was cut off
        // NOTE(review): presumed reason for the +1 — confirm against Language::truncate()
        $max = intval( $contextchars ) + 1;
        $pat1 = "/(.*)($terms)(.{0,$max})/i";
        // same for every line, so built once
        $pat2 = '/(' . $terms . ")/i";

        $extract = "";
        foreach ( $lines as $line ) {
            if ( 0 == $contextlines ) {
                break;
            }
            $m = [];
            if ( !preg_match( $pat1, $line, $m ) ) {
                continue;
            }
            --$contextlines;
            // truncate function changes ... to relevant i18n message.
            $pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );

            if ( count( $m ) < 3 ) {
                $post = '';
            } else {
                $post = $wgContLang->truncate( $m[3], $contextchars, '...', false );
            }

            $found = $m[2];

            $line = htmlspecialchars( $pre . $found . $post );
            $line = preg_replace( $pat2, "<span class='searchmatch'>\\1</span>", $line );

            // plain concatenation; the "${var}" interpolation form is deprecated in PHP 8.2
            $extract .= $line . "\n";
        }

        return $extract;
    }


    /**
     * Return the first few lines of the text, HTML-escaped, without any
     * highlighting.
     *
     * @param string $text
     * @param int $contextlines Maximum number of lines to return
     * @param int $contextchars Per-line budget; the result is limited to
     *  $contextlines * $contextchars bytes before escaping
     * @return string Escaped text with line breaks turned into '<br>'
     */
    public function highlightNone( $text, $contextlines, $contextchars ) {
        // the value is interpolated into a regex quantifier, so keep it a non-negative int
        $contextlines = max( 0, intval( $contextlines ) );

        $match = [];
        $text = ltrim( $text ) . "\n"; // make sure the preg_match may find the last line
        // Collapse every run of newlines. A single str_replace( "\n\n", "\n" )
        // pass leaves an empty line behind for runs of three or more.
        $text = preg_replace( "/\n{2,}/", "\n", $text );
        preg_match( "/^(.*\n){0,$contextlines}/", $text, $match );

        // Trim and limit to max number of bytes. mb_strcut() never splits a
        // UTF-8 character; a split character would make htmlspecialchars()
        // reject or mangle the snippet.
        $limit = max( 0, intval( $contextlines * $contextchars ) );
        $text = htmlspecialchars( mb_strcut( trim( $match[0] ), 0, $limit, 'UTF-8' ) );
        return str_replace( "\n", '<br>', $text );
    }


}


$wgSearchHighlightBoundaries
$wgSearchHighlightBoundaries
Regexp to match word boundaries. The default suits non-CJK languages; it should be empty for CJK, since the words are not separated there.
Definition DefaultSettings.php:6468

$matches
$matches
Definition NoLocalSettings.php:24

$line
$line
Definition cdb.php:59

SearchHighlighter
Highlight bits of wikitext.
Definition SearchHighlighter.php:29

SearchHighlighter\process
process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets)
Search extracts for a pattern, and return snippets.
Definition SearchHighlighter.php:407

SearchHighlighter\removeWiki
removeWiki( $text)
Basic wikitext removal.
Definition SearchHighlighter.php:450

SearchHighlighter\highlightSimple
highlightSimple( $text, $terms, $contextlines, $contextchars)
Simple & fast snippet extraction, but gives completely irrelevant snippets.
Definition SearchHighlighter.php:505

SearchHighlighter\highlightNone
highlightNone( $text, $contextlines, $contextchars)
Returns the first few lines of the text.
Definition SearchHighlighter.php:556

SearchHighlighter\__construct
__construct( $cleanupWikitext=true)
Definition SearchHighlighter.php:37

SearchHighlighter\splitAndAdd
splitAndAdd(&$extracts, &$count, $text)
Split text into lines and add it to extracts array.
Definition SearchHighlighter.php:299

SearchHighlighter\caseCallback
caseCallback( $matches)
Do manual case conversion for non-ascii chars.
Definition SearchHighlighter.php:315

SearchHighlighter\highlightText
highlightText( $text, $terms, $contextlines, $contextchars)
Wikitext highlighting when $wgAdvancedSearchHighlighting = true.
Definition SearchHighlighter.php:51

SearchHighlighter\position
position( $text, $point, $offset=0)
Find a nonletter near a point (index) in the text.
Definition SearchHighlighter.php:366

SearchHighlighter\$mCleanWikitext
$mCleanWikitext
Definition SearchHighlighter.php:30

SearchHighlighter\linkReplace
linkReplace( $matches)
callback to replace [[target|caption]] kind of links, if the target is category or image,...
Definition SearchHighlighter.php:478

SearchHighlighter\extract
extract( $text, $start, $end, &$posStart=null, &$posEnd=null)
Extract part of the text from start to end, but by not chopping up words.
Definition SearchHighlighter.php:334

$wgContLang
this class mediates it Skin Encapsulates a look and feel for the wiki All of the functions that render HTML and make choices about how to render it are here and are called from various other places when and is meant to be subclassed with other skins that may override some of its functions The User object contains a reference to a and so rather than having a global skin object we just rely on the global User and get the skin with $wgUser and also has some character encoding functions and other locale stuff The current user interface language is instantiated as and the local content language as $wgContLang
Definition design.txt:57

$pre
return true to allow those checks to and false if checking is done remove or add to the links of a group of changes in EnhancedChangesList Hook subscribers can return false to omit this line from recentchanges use this to change the tables headers change it to an object instance and return false override the list derivative used the name of the old file when set the default code will be skipped $pre
Definition hooks.txt:1585

$term
For QUnit the mediawiki tests qunit testrunner dependency will be added to any module whereas SearchGetNearMatch runs after $term
Definition hooks.txt:2845

$out
this hook is for auditing only or null if authentication failed before getting that far or null if we can t even determine that probably a stub it is not rendered in wiki pages or galleries in category pages allow injecting custom HTML after the section Any uses of the hook need to handle escaping see BaseTemplate::getToolbox and BaseTemplate::makeListItem for details on the format of individual items inside of this array or by returning and letting standard HTTP rendering take place modifiable or by returning false and taking over the output $out
Definition hooks.txt:864

NS_FILE
const NS_FILE
Definition Defines.php:80

NS_CATEGORY
const NS_CATEGORY
Definition Defines.php:88

$s
$s
Definition mergeMessageFileList.php:187

$last
$last
Definition profileinfo.php:408

$lines
$lines
Definition router.php:61