REL1_34/php/WikiTextStructure_8php_source.html

<?php


use HtmlFormatter\HtmlFormatter;


/**
 * Class for exploring the structure of parsed wikitext.
 *
 * Wraps a ParserOutput and exposes the pieces of it that are useful for
 * search indexing: the list of headings, the text before the first heading
 * ("opening text"), the main text and the auxiliary text. The text parts are
 * extracted lazily, the first time one of the text getters is called.
 */
class WikiTextStructure {
    /**
     * @var string|null Filtered text found before the first heading, or null
     *  when no such text was extracted.
     */
    private $openingText;

    /**
     * @var string|null Main text of the page with excluded and auxiliary
     *  elements removed; null until extractWikitextParts() has run.
     */
    private $allText;

    /**
     * @var string[] Text of each auxiliary element that was split off from the main text.
     */
    private $auxText = [];

    /**
     * @var ParserOutput Parser output this structure is built from.
     */
    private $parserOutput;

    /**
     * @var string[] Selectors of elements that are excluded entirely from search.
     */
    private $excludedElementSelectors = [
        // "it looks like you don't have javascript enabled..." – do not need to index
        'audio', 'video',
        // CSS stylesheets aren't content
        'style',
        // The [1] for references from Cite
        'sup.reference',
        // The ↑ next to references in the references section from Cite
        '.mw-cite-backlink',
        // Headings are already indexed in their own field.
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        // Collapsed fields are hidden by default so we don't want them showing up.
        '.autocollapse',
        // Content explicitly decided to be not searchable by editors such
        // as custom navigation templates.
        '.navigation-not-searchable',
        // User-facing interface code prompting the user to act from WikibaseMediaInfo
        '.wbmi-entityview-emptyCaption',
    ];

    /**
     * @var string[] Selectors of elements that are considered auxiliary to
     *  the article text for search.
     */
    private $auxiliaryElementSelectors = [
        // Thumbnail captions aren't really part of the text proper
        '.thumbcaption',
        // Neither are tables
        'table',
        // Common style for "See also:".
        '.rellink',
        // Common style for calling out helpful links at the top of the article.
        '.dablink',
        // New class users can use to mark stuff as auxiliary to searches.
        '.searchaux',
    ];

    /**
     * @param ParserOutput $parserOutput Parser output to explore.
     */
    public function __construct( ParserOutput $parserOutput ) {
        $this->parserOutput = $parserOutput;
    }

    /**
     * Get headings on the page.
     *
     * Each section's 'line' is cleaned of reference markers and of all tags,
     * then trimmed. Headings listed by getIgnoredHeadings() are left out.
     *
     * @return string[] Cleaned heading texts, in section order.
     */
    public function headings() {
        $headings = [];
        $ignoredHeadings = $this->getIgnoredHeadings();

        foreach ( $this->parserOutput->getSections() as $section ) {
            $heading = $section[ 'line' ];

            // Some wikis wrap the brackets in a span:
            // https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
            $heading = preg_replace( '/<\/?span>/', '', $heading );
            // Normalize [] so the following regexp would work.
            $heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
            $heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );

            // Strip tags from the heading or else we'll display them (escaped) in search results
            $heading = trim( Sanitizer::stripAllTags( $heading ) );

            // Note that we don't take the level of the heading into account - all headings
            // are equal. Except the ones we ignore.
            // The comparison is strict: both sides are strings, and a loose comparison
            // would treat numeric-looking headings such as "1000" and "1e3" as the same.
            if ( !in_array( $heading, $ignoredHeadings, true ) ) {
                $headings[] = $heading;
            }
        }

        return $headings;
    }

    /**
     * Parse a message content into an array.
     *
     * The message is split into lines; everything from a '#' to the end of
     * the line is dropped as a comment, each line is trimmed, and lines that
     * end up empty are removed.
     *
     * @param string $message Raw message text.
     * @return string[] Remaining lines. Keys are the original line indexes,
     *  so the array may have gaps.
     */
    public static function parseSettingsInMessage( $message ) {
        $lines = explode( "\n", $message );
        // Remove comments
        $lines = preg_replace( '/#.*$/', '', $lines );
        // Remove extra spaces
        $lines = array_map( 'trim', $lines );
        // Remove empty lines
        return array_filter( $lines );
    }

    /**
     * Get list of headings to ignore.
     *
     * Reads the 'search-ignored-headings' message in the content language,
     * falling back to 'cirrussearch-ignored-headings' when the former is
     * blank. The parsed list is kept in a function-static variable, so the
     * messages are only consulted once per process.
     *
     * @return string[] Headings to ignore; empty when the message is disabled.
     */
    private function getIgnoredHeadings() {
        static $ignoredHeadings = null;

        if ( $ignoredHeadings !== null ) {
            return $ignoredHeadings;
        }

        $ignoredHeadings = [];
        $source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
        if ( $source->isBlank() ) {
            // Try old version too, just in case
            $source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
        }
        if ( !$source->isDisabled() ) {
            // Once comments and blank lines are gone we just have headings!
            $ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
        }

        return $ignoredHeadings;
    }

    /**
     * Extract parts of the text - opening, main and auxiliary.
     *
     * Does nothing if the parts were already extracted (main text is set).
     */
    private function extractWikitextParts() {
        if ( $this->allText !== null ) {
            return;
        }

        $text = $this->parserOutput->getText( [
            'enableSectionEditTokens' => false,
            'allowTOC' => false,
        ] );
        if ( $text === '' ) {
            // empty text - nothing to seek here
            $this->allText = "";
            return;
        }

        $this->openingText = $this->extractHeadingBeforeFirstHeading( $text );

        $formatter = new HtmlFormatter( $text );

        // Strip elements from the page that we never want in the search text.
        $formatter->remove( $this->excludedElementSelectors );
        $formatter->filterContent();

        // Strip elements from the page that are auxiliary text.  These will still be
        // searched but matches will be ranked lower and non-auxiliary matches will be
        // preferred in highlighting.
        $formatter->remove( $this->auxiliaryElementSelectors );
        $auxiliaryElements = $formatter->filterContent();

        $this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
        foreach ( $auxiliaryElements as $auxiliaryElement ) {
            $this->auxText[] =
                trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
        }
    }

    /**
     * Get text before first heading.
     *
     * @param string $text Rendered HTML of the page.
     * @return string|null Filtered, tag-free text preceding the first heading,
     *  or null when there is no heading or no text before it.
     */
    private function extractHeadingBeforeFirstHeading( $text ) {
        $matches = [];
        // \b instead of a literal '>' so that heading tags carrying attributes
        // (e.g. <h2 class="...">) are recognised as headings too.
        if ( !preg_match( '/<h[123456]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
            // There isn't a first heading so we interpret this as the article
            // being entirely without heading.
            return null;
        }

        $text = substr( $text, 0, $matches[ 0 ][ 1 ] );
        // Compare explicitly rather than by truthiness: the string "0" is falsy in PHP
        // but is still real content.
        if ( $text === '' || $text === false ) {
            // There isn't any text before the first heading so we declare there isn't
            // a first heading.
            return null;
        }

        $formatter = new HtmlFormatter( $text );
        $formatter->remove( $this->excludedElementSelectors );
        $formatter->remove( $this->auxiliaryElementSelectors );
        $formatter->filterContent();
        $text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

        if ( $text === '' ) {
            // There isn't any text after filtering before the first heading so we declare
            // that there isn't a first heading.
            return null;
        }

        return $text;
    }

    /**
     * Get opening text.
     *
     * @return string|null Text before the first heading, or null if there is none.
     */
    public function getOpeningText() {
        $this->extractWikitextParts();
        return $this->openingText;
    }

    /**
     * Get main text.
     *
     * @return string Page text without the excluded and auxiliary elements.
     */
    public function getMainText() {
        $this->extractWikitextParts();
        return $this->allText;
    }

    /**
     * Get auxiliary text.
     *
     * @return string[] Text of each auxiliary element.
     */
    public function getAuxiliaryText() {
        $this->extractWikitextParts();
        return $this->auxText;
    }

    /**
     * Get the defaultsort property.
     *
     * @return mixed Whatever ParserOutput::getProperty() returns for 'defaultsort'.
     */
    public function getDefaultSort() {
        return $this->parserOutput->getProperty( 'defaultsort' );
    }
}


wfMessage
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Definition GlobalFunctions.php:1263

$matches
$matches
Definition NoLocalSettings.php:24

ParserOutput
Definition ParserOutput.php:25

WikiTextStructure
Class for exploring the structure of parsed wikitext.
Definition WikiTextStructure.php:8

WikiTextStructure\$allText
string $allText
Definition WikiTextStructure.php:16

WikiTextStructure\getDefaultSort
getDefaultSort()
Get the defaultsort property.
Definition WikiTextStructure.php:249

WikiTextStructure\$auxText
string[] $auxText
Definition WikiTextStructure.php:20

WikiTextStructure\extractHeadingBeforeFirstHeading
extractHeadingBeforeFirstHeading( $text)
Get text before first heading.
Definition WikiTextStructure.php:189

WikiTextStructure\$auxiliaryElementSelectors
string[] $auxiliaryElementSelectors
Selectors for elements that are considered auxiliary to the article text for search.
Definition WikiTextStructure.php:52

WikiTextStructure\$parserOutput
ParserOutput $parserOutput
Definition WikiTextStructure.php:24

WikiTextStructure\parseSettingsInMessage
static parseSettingsInMessage( $message)
Parse a message content into an array.
Definition WikiTextStructure.php:117

WikiTextStructure\extractWikitextParts
extractWikitextParts()
Extract parts of the text - opening, main and auxiliary.
Definition WikiTextStructure.php:149

WikiTextStructure\getOpeningText
getOpeningText()
Get opening text.
Definition WikiTextStructure.php:222

WikiTextStructure\getMainText
getMainText()
Get main text.
Definition WikiTextStructure.php:231

WikiTextStructure\$openingText
string $openingText
Definition WikiTextStructure.php:12

WikiTextStructure\$excludedElementSelectors
string[] $excludedElementSelectors
Selectors for elements that are excluded entirely from search.
Definition WikiTextStructure.php:29

WikiTextStructure\headings
headings()
Get headings on the page.
Definition WikiTextStructure.php:85

WikiTextStructure\getIgnoredHeadings
getIgnoredHeadings()
Get list of headings to ignore.
Definition WikiTextStructure.php:129

WikiTextStructure\getAuxiliaryText
getAuxiliaryText()
Get auxiliary text.
Definition WikiTextStructure.php:240

WikiTextStructure\__construct
__construct(ParserOutput $parserOutput)
Definition WikiTextStructure.php:68

$source
$source
Definition mwdoc-filter.php:34

$lines
$lines
Definition router.php:61