Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
100.00% |
27 / 27 |
|
100.00% |
4 / 4 |
CRAP | |
100.00% |
1 / 1 |
ExtractFormatter | |
100.00% |
27 / 27 |
|
100.00% |
4 / 4 |
8 | |
100.00% |
1 / 1 |
__construct | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
getText | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
2 | |||
onHtmlReady | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
filterContent | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
2 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Extension\TextExtracts; |
4 | |
5 | use DOMElement; |
6 | use HtmlFormatter\HtmlFormatter; |
7 | |
/**
 * Provides text-only or limited-HTML extracts of page HTML
 *
 * In plain-text mode all tags are flattened and every heading start tag gets
 * a section marker placed in front of it (see onHtmlReady()). In limited-HTML
 * mode only <a> elements are flattened. Media removal is enabled in both modes.
 *
 * @license GPL-2.0-or-later
 */
class ExtractFormatter extends HtmlFormatter {
	/** Opens a section marker; the heading level digit follows it. */
	public const SECTION_MARKER_START = "\1\2";
	/** Closes a section marker; the heading start tag follows it. */
	public const SECTION_MARKER_END = "\2\1";

	/**
	 * @var bool Whether the extract is produced as plain text
	 */
	private $plainText;

	/**
	 * Wraps the input with HtmlFormatter::wrapHTML(), hands it to the parent
	 * constructor and configures which tags get flattened.
	 *
	 * @param string $text Text to convert
	 * @param bool $plainText Whether extract should be plaintext
	 */
	public function __construct( $text, $plainText ) {
		parent::__construct( HtmlFormatter::wrapHTML( $text ) );
		$this->plainText = $plainText;

		$this->setRemoveMedia( true );

		if ( $plainText ) {
			// Plain text: no markup should survive
			$this->flattenAllTags();
		} else {
			// Limited HTML: only links are flattened
			$this->flatten( [ 'a' ] );
		}
	}

	/**
	 * Performs final transformations (such as newline replacement for plaintext
	 * option) and returns resulting HTML.
	 *
	 * Always runs filterContent() first. In plain-text mode the result is then
	 * entity-decoded, non-breaking spaces become regular spaces, carriage
	 * returns become newlines and runs of three or more newlines collapse to
	 * two. The result is trimmed in both modes.
	 *
	 * @param DOMElement|string|null $element ID of element to get HTML from.
	 *  Ignored: the parent is always asked for the whole text.
	 * @return string Processed HTML
	 */
	public function getText( $element = null ): string {
		$this->filterContent();
		$text = parent::getText();
		if ( $this->plainText ) {
			$text = html_entity_decode( $text );
			// replace nbsp with space
			$text = str_replace( "\u{00A0}", ' ', $text );
			// for Windows; note a "\r\n" pair turns into two newlines here
			$text = str_replace( "\r", "\n", $text );
			// normalise newlines; preg_replace() returns null on a PCRE error,
			// in which case the un-collapsed text is kept instead of passing
			// null on to trim()
			$text = preg_replace( "/\n{3,}/", "\n\n", $text ) ?? $text;
		}
		return trim( $text );
	}

	/**
	 * In plain-text mode, replaces any whitespace in front of each heading
	 * start tag (h1-h6) with two newlines followed by SECTION_MARKER_START,
	 * the heading level and SECTION_MARKER_END. In limited-HTML mode the input
	 * is returned unchanged.
	 *
	 * NOTE(review): presumably invoked by the parent HtmlFormatter once the
	 * HTML has been serialised; the caller is not visible from this file.
	 *
	 * @param string $html HTML string to process
	 * @return string Processed HTML
	 */
	public function onHtmlReady( string $html ): string {
		if ( $this->plainText ) {
			// preg_replace() returns null on a PCRE error, which would violate
			// the string return type; fall back to the unmarked HTML instead
			$html = preg_replace( '/\s*(<h([1-6])\b)/i',
				"\n\n" . self::SECTION_MARKER_START . '$2' . self::SECTION_MARKER_END . '$1',
				$html
			) ?? $html;
		}
		return $html;
	}

	/**
	 * Removes content we've chosen to remove then removes class and style
	 * attributes from the remaining span elements.
	 *
	 * The removal itself is delegated to the parent implementation; its return
	 * value is passed through untouched.
	 *
	 * @return array Array of removed DOMElements
	 */
	public function filterContent(): array {
		$removed = parent::filterContent();

		$doc = $this->getDoc();
		$spans = $doc->getElementsByTagName( 'span' );

		// Only attributes are dropped, never nodes, so iterating the node
		// list while modifying the elements is safe
		/** @var DOMElement $span */
		foreach ( $spans as $span ) {
			$span->removeAttribute( 'class' );
			$span->removeAttribute( 'style' );
		}

		return $removed;
	}
}