Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
100.00% covered (success)
100.00%
27 / 27
100.00% covered (success)
100.00%
4 / 4
CRAP
100.00% covered (success)
100.00%
1 / 1
ExtractFormatter
100.00% covered (success)
100.00%
27 / 27
100.00% covered (success)
100.00%
4 / 4
8
100.00% covered (success)
100.00%
1 / 1
 __construct
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
 getText
100.00% covered (success)
100.00%
8 / 8
100.00% covered (success)
100.00%
1 / 1
2
 onHtmlReady
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
2
 filterContent
100.00% covered (success)
100.00%
7 / 7
100.00% covered (success)
100.00%
1 / 1
2
1<?php
2
3namespace TextExtracts;
4
5use DOMElement;
6use HtmlFormatter\HtmlFormatter;
7
8/**
9 * Provides text-only or limited-HTML extracts of page HTML
10 *
11 * @license GPL-2.0-or-later
12 */
class ExtractFormatter extends HtmlFormatter {
    /** Byte sequence written immediately before the heading level digit in plain-text output. */
    public const SECTION_MARKER_START = "\1\2";
    /** Byte sequence written immediately after the heading level digit in plain-text output. */
    public const SECTION_MARKER_END = "\2\1";

    /**
     * Whether the extract is produced as plain text instead of limited HTML.
     *
     * @var bool
     */
    private $plainText;

    /**
     * Wraps the input for the parent formatter and configures which tags get
     * flattened: every tag for plain-text extracts, only `a` otherwise.
     * Media removal is always switched on.
     *
     * @param string $text Text to convert
     * @param bool $plainText Whether extract should be plaintext
     */
    public function __construct( $text, $plainText ) {
        parent::__construct( HtmlFormatter::wrapHTML( $text ) );
        $this->plainText = $plainText;

        $this->setRemoveMedia( true );

        if ( $plainText ) {
            $this->flattenAllTags();
        } else {
            $this->flatten( [ 'a' ] );
        }
    }

    /**
     * Filters the document, fetches the parent's text and, for plain-text
     * extracts, decodes entities and normalises whitespace before trimming.
     *
     * @param DOMElement|string|null $element Accepted for signature
     * compatibility with the parent only; the value is ignored.
     * @return string Processed HTML, or plain text when the plain-text option is set
     */
    public function getText( $element = null ): string {
        $this->filterContent();
        $text = parent::getText();
        if ( $this->plainText ) {
            $text = html_entity_decode( $text );
            // replace nbsp with space
            $text = str_replace( "\u{00A0}", ' ', $text );
            // Windows line endings: "\r\n" is listed first so the pair collapses to a
            // single "\n" instead of being turned into a blank line; any remaining
            // lone "\r" is then converted as well.
            $text = str_replace( [ "\r\n", "\r" ], "\n", $text );
            // normalise newlines; preg_replace() yields null on a PCRE error, in
            // which case the un-normalised text is kept rather than discarded
            $text = preg_replace( "/\n{3,}/", "\n\n", $text ) ?? $text;
        }
        return trim( $text );
    }

    /**
     * For plain-text extracts, replaces any whitespace in front of each
     * `<h1>`..`<h6>` opening tag with two newlines followed by the heading
     * level wrapped in the section markers. HTML extracts pass through
     * untouched.
     *
     * @param string $html HTML string to process
     * @return string Processed HTML
     */
    public function onHtmlReady( string $html ): string {
        if ( !$this->plainText ) {
            return $html;
        }

        $marked = preg_replace( '/\s*(<h([1-6])\b)/i',
            "\n\n" . self::SECTION_MARKER_START . '$2' . self::SECTION_MARKER_END . '$1',
            $html
        );

        // preg_replace() returns null on a PCRE error; fall back to the input so
        // the declared string return type is always honoured.
        return $marked ?? $html;
    }

    /**
     * Runs the parent's content filtering, then strips the class and style
     * attributes from every span element left in the document.
     *
     * @return array Array of removed DOMElements
     */
    public function filterContent(): array {
        $removed = parent::filterContent();

        $doc = $this->getDoc();
        $spans = $doc->getElementsByTagName( 'span' );

        /** @var DOMElement $span */
        foreach ( $spans as $span ) {
            $span->removeAttribute( 'class' );
            $span->removeAttribute( 'style' );
        }

        return $removed;
    }
}