Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
31 / 31 |
|
100.00% |
4 / 4 |
CRAP | |
100.00% |
1 / 1 |
ExtractFormatter | |
100.00% |
31 / 31 |
|
100.00% |
4 / 4 |
10 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
getText | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
2 | |||
onHtmlReady | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
filterContent | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
4 |
1 | <?php |
2 | |
3 | namespace TextExtracts; |
4 | |
5 | use DOMElement; |
6 | use HtmlFormatter\HtmlFormatter; |
7 | use Wikimedia\Parsoid\Utils\DOMCompat; |
8 | |
9 | /**
10 | * HtmlFormatter subclass that provides text-only or limited-HTML extracts of page HTML
11 | *
12 | * @license GPL-2.0-or-later
13 | */
14 | class ExtractFormatter extends HtmlFormatter {
15 | public const SECTION_MARKER_START = "\1\2"; // bytes 0x01 0x02; opens the heading-level marker written by onHtmlReady()
16 | public const SECTION_MARKER_END = "\2\1"; // bytes 0x02 0x01; closes that marker
17 | 
18 | /**
19 | * @var bool Whether the extract is produced as plain text (true) or limited HTML (false)
20 | */
21 | private $plainText;
22 | 
23 | /**
24 | * @param string $text Text to convert; wrapped with HtmlFormatter::wrapHTML() before reaching the parent constructor
25 | * @param bool $plainText Whether extract should be plaintext: true flattens all tags, false flattens only <a>
26 | */
27 | public function __construct( $text, $plainText ) {
28 | parent::__construct( HtmlFormatter::wrapHTML( $text ) );
29 | $this->plainText = $plainText;
30 | 
31 | $this->setRemoveMedia( true );
32 | 
33 | if ( $plainText ) {
34 | $this->flattenAllTags();
35 | } else {
36 | $this->flatten( [ 'a' ] );
37 | }
38 | }
39 | 
40 | /**
41 | * Runs filterContent(), then performs final transformations (such as newline
42 | * replacement for the plaintext option) and returns the resulting HTML.
43 | *
44 | * @param DOMElement|string|null $element ID of element to get HTML from.
45 | * Ignored: parent::getText() is always called without arguments
46 | * @return string Processed HTML, or entity-decoded text in plaintext mode; always trimmed
47 | */
48 | public function getText( $element = null ): string {
49 | $this->filterContent();
50 | $text = parent::getText();
51 | if ( $this->plainText ) {
52 | $text = html_entity_decode( $text );
53 | // replace nbsp with space
54 | $text = str_replace( "\u{00A0}", ' ', $text );
55 | // for Windows: every CR becomes LF, so a CRLF pair yields two LFs
56 | $text = str_replace( "\r", "\n", $text );
57 | // normalise newlines: collapse runs of 3+ LFs down to exactly two
58 | $text = preg_replace( "/\n{3,}/", "\n\n", $text );
59 | }
60 | return trim( $text );
61 | }
62 | 
63 | /**
64 | * @param string $html HTML string to process
65 | * @return string Processed HTML; in plaintext mode whitespace before each <h1>-<h6> open tag becomes a blank line plus a marker holding the heading level
66 | */
67 | public function onHtmlReady( string $html ): string {
68 | if ( $this->plainText ) {
69 | $html = preg_replace( '/\s*(<h([1-6])\b)/i',
70 | "\n\n" . self::SECTION_MARKER_START . '$2' . self::SECTION_MARKER_END . '$1',
71 | $html
72 | );
73 | }
74 | return $html;
75 | }
76 | 
77 | /**
78 | * Removes content we've chosen to remove then removes class and style
79 | * attributes from the remaining span elements.
80 | * Headings whose parent has the "mw-heading" class are first moved out in front of that parent.
81 | * @return array Array of removed DOMElements
82 | */
83 | public function filterContent(): array {
84 | $doc = $this->getDoc();
85 | 
86 | // Headings in a DIV wrapper may get removed by $wgExtractsRemoveClasses,
87 | // so move each heading out in front of its wrapper to rescue it (T363445)
88 | // https://www.mediawiki.org/wiki/Heading_HTML_changes
89 | $headings = DOMCompat::querySelectorAll( $doc->documentElement, 'h1, h2, h3, h4, h5, h6' );
90 | foreach ( $headings as $heading ) {
91 | // @phan-suppress-next-line PhanTypeMismatchArgumentSuperType
92 | if ( DOMCompat::getClassList( $heading->parentNode )->contains( 'mw-heading' ) ) {
93 | $heading->parentNode->parentNode->insertBefore( $heading, $heading->parentNode );
94 | }
95 | }
96 | 
97 | $removed = parent::filterContent();
98 | 
99 | $spans = $doc->getElementsByTagName( 'span' );
100 | 
101 | /** @var DOMElement $span */
102 | foreach ( $spans as $span ) {
103 | $span->removeAttribute( 'class' );
104 | $span->removeAttribute( 'style' );
105 | }
106 | 
107 | return $removed;
108 | }
109 | }