Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
100.00% covered (success)
100.00%
31 / 31
100.00% covered (success)
100.00%
4 / 4
CRAP
100.00% covered (success)
100.00%
1 / 1
ExtractFormatter
100.00% covered (success)
100.00%
31 / 31
100.00% covered (success)
100.00%
4 / 4
10
100.00% covered (success)
100.00%
1 / 1
 __construct
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
 getText
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
2
 onHtmlReady
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
 filterContent
100.00% covered (success)
100.00%
11 / 11
100.00% covered (success)
100.00%
1 / 1
4
1<?php
2
3namespace TextExtracts;
4
5use DOMElement;
6use HtmlFormatter\HtmlFormatter;
7use Wikimedia\Parsoid\Utils\DOMCompat;
8
9/**
10 * Provides text-only or limited-HTML extracts of page HTML
11 *
12 * @license GPL-2.0-or-later
13 */
class ExtractFormatter extends HtmlFormatter {
    /**
     * Byte sequence emitted immediately before the heading level digit that
     * onHtmlReady() inserts in front of each heading in plaintext mode.
     */
    public const SECTION_MARKER_START = "\1\2";

    /**
     * Byte sequence emitted immediately after the heading level digit that
     * onHtmlReady() inserts in front of each heading in plaintext mode.
     */
    public const SECTION_MARKER_END = "\2\1";

    /**
     * Whether the extract is produced as plaintext (true) or limited HTML (false).
     *
     * @var bool
     */
    private $plainText;

    /**
     * Wraps the given text into a full HTML document, enables media removal and
     * configures tag flattening: every tag in plaintext mode, only links otherwise.
     *
     * @param string $text Text to convert
     * @param bool $plainText Whether extract should be plaintext
     */
    public function __construct( $text, $plainText ) {
        parent::__construct( HtmlFormatter::wrapHTML( $text ) );
        $this->plainText = $plainText;

        $this->setRemoveMedia( true );

        if ( $plainText ) {
            $this->flattenAllTags();
        } else {
            $this->flatten( [ 'a' ] );
        }
    }

    /**
     * Performs final transformations (such as newline replacement for plaintext
     * option) and returns resulting HTML.
     *
     * Content filtering is always run first. In plaintext mode HTML entities are
     * decoded, non-breaking spaces become regular spaces, carriage returns become
     * newlines and runs of three or more newlines are collapsed to two. The result
     * is trimmed in both modes.
     *
     * @param DOMElement|string|null $element ID of element to get HTML from.
     * Ignored
     * @return string Processed HTML
     */
    public function getText( $element = null ): string {
        $this->filterContent();
        $text = parent::getText();
        if ( $this->plainText ) {
            $text = html_entity_decode( $text );
            // replace nbsp with space
            $text = str_replace( "\u{00A0}", ' ', $text );
            // for Windows
            $text = str_replace( "\r", "\n", $text );
            // normalise newlines; preg_replace() returns null on a PCRE error, in
            // which case the un-normalised text is kept instead of being lost
            $text = preg_replace( "/\n{3,}/", "\n\n", $text ) ?? $text;
        }
        return trim( $text );
    }

    /**
     * In plaintext mode, replaces any whitespace in front of each opening
     * h1-h6 tag with two newlines followed by the heading level wrapped in
     * SECTION_MARKER_START / SECTION_MARKER_END. The heading tag itself is kept.
     * In HTML mode the input is returned unchanged.
     *
     * @param string $html HTML string to process
     * @return string Processed HTML
     */
    public function onHtmlReady( string $html ): string {
        if ( $this->plainText ) {
            // $1 is the opening of the heading tag, $2 its level digit.
            // preg_replace() returns null on a PCRE error (e.g. backtrack limit);
            // fall back to the unmarked HTML so the declared string return type
            // is honoured instead of raising a TypeError.
            $html = preg_replace( '/\s*(<h([1-6])\b)/i',
                "\n\n" . self::SECTION_MARKER_START . '$2' . self::SECTION_MARKER_END . '$1',
                $html
            ) ?? $html;
        }
        return $html;
    }

    /**
     * Removes content we've chosen to remove then removes class and style
     * attributes from the remaining span elements.
     *
     * @return array Array of removed DOMElements
     */
    public function filterContent(): array {
        $doc = $this->getDoc();

        // Headings in a DIV wrapper may get removed by $wgExtractsRemoveClasses,
        // move it outside the header to rescue it (T363445)
        // https://www.mediawiki.org/wiki/Heading_HTML_changes
        $headings = DOMCompat::querySelectorAll( $doc->documentElement, 'h1, h2, h3, h4, h5, h6' );
        foreach ( $headings as $heading ) {
            $wrapper = $heading->parentNode;
            // Only element parents can carry a class list
            if ( $wrapper instanceof DOMElement
                && DOMCompat::getClassList( $wrapper )->contains( 'mw-heading' )
            ) {
                // Re-insert the heading as the wrapper's preceding sibling
                $wrapper->parentNode->insertBefore( $heading, $wrapper );
            }
        }

        $removed = parent::filterContent();

        $spans = $doc->getElementsByTagName( 'span' );

        /** @var DOMElement $span */
        foreach ( $spans as $span ) {
            $span->removeAttribute( 'class' );
            $span->removeAttribute( 'style' );
        }

        return $removed;
    }
}