master/php/WikiTextStructure_8php_source.html

<?php


use HtmlFormatter\HtmlFormatter;

use MediaWiki\Parser\ParserOutput;

use MediaWiki\Parser\Sanitizer;


/**
 * Class for exploring the structure of parsed wikitext.
 *
 * Wraps a ParserOutput and lazily splits its rendered HTML into the pieces
 * exposed by the getters below: headings, opening text, main text and
 * auxiliary text.
 */
class WikiTextStructure {

    /** Plain text found before the first heading, or null when there is none. */
    private ?string $openingText = null;

    /** Main plain text of the page; null until extractWikitextParts() has run. */
    private ?string $allText = null;

    /** @var string[] Plain text of every element matched by AUXILIARY_ELEMENT_SELECTORS. */
    private array $auxText = [];

    /** The parser output being inspected. */
    private ParserOutput $parserOutput;

    /**
     * Selectors of elements that are removed from the text entirely.
     */
    private const EXCLUDED_ELEMENT_SELECTORS = [
        // "it looks like you don't have javascript enabled..." – do not need to index
        'audio', 'video',
        // CSS stylesheets aren't content
        'style',
        // The [1] for references from Cite
        'sup.reference',
        // The ↑ next to references in the references section from Cite
        '.mw-cite-backlink',
        // Headings are already indexed in their own field.
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        // Collapsed fields are hidden by default, so we don't want them showing up.
        '.autocollapse',
        // Content explicitly decided to be not searchable by editors such
        // as custom navigation templates.
        '.navigation-not-searchable',
        // User-facing interface code prompting the user to act from WikibaseMediaInfo
        '.wbmi-entityview-emptyCaption',
    ];

    /**
     * Selectors of elements that are moved out of the main text into the
     * auxiliary text.
     */
    private const AUXILIARY_ELEMENT_SELECTORS = [
        // Thumbnail captions aren't really part of the text proper
        '.thumbcaption',
        'figcaption',
        // Neither are tables
        'table',
        // Common style for "See also:".
        '.rellink',
        // Common style for calling out helpful links at the top of the article.
        '.dablink',
        // New class users can use to mark stuff as auxiliary to searches.
        '.searchaux',
    ];

    /**
     * @param ParserOutput $parserOutput Parser output whose text and TOC data are inspected
     */
    public function __construct( ParserOutput $parserOutput ) {
        $this->parserOutput = $parserOutput;
    }

    /**
     * Gets headings from the page.
     *
     * Each table-of-contents section line is cleaned of bare <span> tags,
     * <sup>[n]</sup> reference markers and any remaining markup. Headings
     * listed in the ignored-headings message are left out.
     *
     * @return string[] Cleaned headings in TOC order; empty when there is no TOC data
     */
    public function headings() {
        $headings = [];
        $tocData = $this->parserOutput->getTOCData();
        if ( $tocData === null ) {
            return $headings;
        }
        $ignoredHeadings = $this->getIgnoredHeadings();
        foreach ( $tocData->getSections() as $section ) {
            $heading = $section->line;

            // Some wikis wrap the brackets in a span:
            // https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
            $heading = preg_replace( '/<\/?span>/', '', $heading );
            // Normalize [] so the following regexp would work.
            $heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
            $heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/i', '', $heading );

            // Strip tags from the heading or else we'll display them (escaped) in search results
            $heading = trim( Sanitizer::stripAllTags( $heading ) );

            // Note that we don't take the level of the heading into account - all headings are equal.
            // Except the ones we ignore. The comparison is strict so that numeric-looking
            // headings (e.g. "1000" vs. "1e3") are never treated as equal.
            if ( !in_array( $heading, $ignoredHeadings, true ) ) {
                $headings[] = $heading;
            }
        }
        return $headings;
    }

    /**
     * Parse a message content into an array.
     *
     * One entry per line; everything from a "#" to the end of the line is
     * treated as a comment and dropped, surrounding whitespace is trimmed and
     * lines that end up empty are removed. A line such as "0" is kept.
     *
     * @param string $message Raw message text
     * @return string[] Remaining lines, keyed by their original line index
     */
    public static function parseSettingsInMessage( $message ) {
        $lines = explode( "\n", $message );
        // Remove comments
        $lines = preg_replace( '/#.*$/', '', $lines );
        // Remove extra spaces
        $lines = array_map( 'trim', $lines );
        // Remove empty lines. An explicit comparison is used because the default
        // array_filter() callback would also discard the valid entry "0".
        return array_filter(
            $lines,
            static fn ( $line ) => $line !== ''
        );
    }

    /**
     * Get the list of headings that must not be returned by headings().
     *
     * Reads the 'search-ignored-headings' message in the content language,
     * falling back to 'cirrussearch-ignored-headings' when the former is blank.
     * The result is kept in a function-level static, so the messages are only
     * read on the first call.
     *
     * @return string[]
     */
    private function getIgnoredHeadings(): array {
        static $ignoredHeadings = null;
        if ( $ignoredHeadings === null ) {
            $ignoredHeadings = [];
            $source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
            if ( $source->isBlank() ) {
                // Try the old version too, just in case
                $source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
            }
            if ( !$source->isDisabled() ) {
                // Once comments and blank lines are gone, only headings remain.
                $ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
            }
        }
        return $ignoredHeadings;
    }

    /**
     * Populate $openingText, $allText and $auxText from the parser output.
     *
     * Does nothing when $allText has already been set by an earlier call.
     */
    private function extractWikitextParts(): void {
        if ( $this->allText !== null ) {
            return;
        }
        $text = $this->parserOutput->getRawText();
        if ( $text === '' ) {
            $this->allText = "";
            // empty text - nothing to seek here
            return;
        }

        $this->openingText = $this->extractTextBeforeFirstHeading( $text );

        $formatter = new HtmlFormatter( $text );

        // Strip elements from the page that we never want in the search text.
        $formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
        $formatter->filterContent();

        // Strip elements from the page that are auxiliary text.  These will still be
        // searched, but matches will be ranked lower and non-auxiliary matches will be
        // preferred in highlighting.
        $formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
        $auxiliaryElements = $formatter->filterContent();
        $this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
        foreach ( $auxiliaryElements as $auxiliaryElement ) {
            $this->auxText[] =
                trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
        }
    }

    /**
     * Get the plain text that precedes the first <h1>..<h6> tag.
     *
     * @param string $text Rendered HTML of the page
     * @return string|null Filtered, tag-free text, or null when the HTML has no
     *  heading, nothing precedes the first heading, or nothing is left after
     *  removing the excluded and auxiliary elements
     */
    private function extractTextBeforeFirstHeading( $text ): ?string {
        $matches = [];
        if ( !preg_match( '/<h[123456]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
            // There isn't a first heading, so we interpret this as the article
            // being entirely without heading.
            return null;
        }
        // $matches[0][1] is the byte offset of the first heading tag.
        $leading = substr( $text, 0, $matches[ 0 ][ 1 ] );
        // Compare against '' explicitly: a plain truthiness test would also
        // reject the legitimate content "0".
        if ( $leading === '' ) {
            // There isn't any text before the first heading, so we declare there isn't
            // a first heading.
            return null;
        }

        $formatter = new HtmlFormatter( $leading );
        $formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
        $formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
        $formatter->filterContent();
        $opening = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

        if ( $opening === '' ) {
            // There isn't any text after filtering before the first heading, so we declare
            // that there isn't a first heading.
            return null;
        }

        return $opening;
    }

    /**
     * @return string|null Text before the first heading, or null when there is none
     */
    public function getOpeningText() {
        $this->extractWikitextParts();
        return $this->openingText;
    }

    /**
     * @return string Page text with excluded and auxiliary elements removed
     */
    public function getMainText() {
        $this->extractWikitextParts();
        return $this->allText;
    }

    /**
     * @return string[] Text of each auxiliary element that was removed from the main text
     */
    public function getAuxiliaryText() {
        $this->extractWikitextParts();
        return $this->auxText;
    }

    /**
     * Get the "defaultsort" property.
     *
     * @return string|null The page property value; a value of false is reported as null
     */
    public function getDefaultSort() {
        $sort = $this->parserOutput->getPageProperty( 'defaultsort' );
        if ( $sort === false ) {
            return null;
        }
        return $sort;
    }

}


wfMessage
wfMessage( $key,... $params)
This is the function for getting translated interface messages.
Definition GlobalFunctions.php:906

$matches
$matches
Definition NoLocalSettings.php:27

MediaWiki\Parser\ParserOutput
ParserOutput is a rendering of a Content object or a message.
Definition ParserOutput.php:97

MediaWiki\Parser\Sanitizer
HTML sanitizer for MediaWiki.
Definition Sanitizer.php:46

WikiTextStructure
Class for exploring the structure of parsed wikitext.
Definition WikiTextStructure.php:10

WikiTextStructure\getDefaultSort
getDefaultSort()
Get the "defaultsort" property.
Definition WikiTextStructure.php:245

WikiTextStructure\parseSettingsInMessage
static parseSettingsInMessage( $message)
Parse a message content into an array.
Definition WikiTextStructure.php:117

WikiTextStructure\getOpeningText
getOpeningText()
Definition WikiTextStructure.php:220

WikiTextStructure\getMainText
getMainText()
Definition WikiTextStructure.php:228

WikiTextStructure\headings
headings()
Gets headings from the page.
Definition WikiTextStructure.php:81

WikiTextStructure\getAuxiliaryText
getAuxiliaryText()
Definition WikiTextStructure.php:236

WikiTextStructure\__construct
__construct(ParserOutput $parserOutput)
Definition WikiTextStructure.php:61

$source
$source
Definition mwdoc-filter.php:34

$lines
if(!file_exists( $CREDITS)) $lines
Definition updateCredits.php:45