Code Coverage for /workspace/src/extensions/TextExtracts/includes/ExtractFormatter.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	100.00% covered (success)	100.00%	31 / 31	100.00% covered (success)	100.00%	4 / 4	CRAP	100.00% covered (success)	100.00%	1 / 1
ExtractFormatter	100.00% covered (success)	100.00%	31 / 31	100.00% covered (success)	100.00%	4 / 4	10	100.00% covered (success)	100.00%	1 / 1
__construct	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	2
getText	100.00% covered (success)	100.00%	8 / 8	100.00% covered (success)	100.00%	1 / 1	2
onHtmlReady	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	2
filterContent	100.00% covered (success)	100.00%	11 / 11	100.00% covered (success)	100.00%	1 / 1	4

1	<?php
2
3	namespace TextExtracts;
4
5	use DOMElement;
6	use HtmlFormatter\HtmlFormatter;
7	use Wikimedia\Parsoid\Utils\DOMCompat;
8
/**
 * Provides text-only or limited-HTML extracts of page HTML
 *
 * @license GPL-2.0-or-later
 */
class ExtractFormatter extends HtmlFormatter {
	/**
	 * Control-character sequence that onHtmlReady() places before the heading
	 * level digit when producing plaintext output.
	 */
	public const SECTION_MARKER_START = "\1\2";

	/**
	 * Control-character sequence that onHtmlReady() places after the heading
	 * level digit when producing plaintext output.
	 */
	public const SECTION_MARKER_END = "\2\1";

	/**
	 * Whether the extract is produced as plaintext rather than limited HTML.
	 *
	 * @var bool
	 */
	private $plainText;

	/**
	 * Wraps the given text via HtmlFormatter::wrapHTML(), hands it to the parent
	 * formatter and configures which tags get flattened.
	 *
	 * @param string $text Text to convert
	 * @param bool $plainText Whether extract should be plaintext
	 */
	public function __construct( $text, $plainText ) {
		parent::__construct( HtmlFormatter::wrapHTML( $text ) );
		$this->plainText = $plainText;

		$this->setRemoveMedia( true );

		if ( $plainText ) {
			// Plaintext extracts keep no markup at all
			$this->flattenAllTags();
		} else {
			// Limited-HTML extracts only flatten links
			$this->flatten( [ 'a' ] );
		}
	}

	/**
	 * Performs final transformations (such as newline replacement for plaintext
	 * option) and returns resulting HTML.
	 *
	 * In plaintext mode HTML entities are decoded, non-breaking spaces become
	 * regular spaces, carriage returns become newlines and runs of three or
	 * more newlines are collapsed to two. The result is always trimmed.
	 *
	 * @param DOMElement|string|null $element ID of element to get HTML from.
	 *  Ignored
	 * @return string Processed HTML
	 */
	public function getText( $element = null ): string {
		$this->filterContent();
		$text = parent::getText();
		if ( $this->plainText ) {
			$text = html_entity_decode( $text );
			// replace nbsp with space
			$text = str_replace( "\u{00A0}", ' ', $text );
			// for Windows
			$text = str_replace( "\r", "\n", $text );
			// normalise newlines; preg_replace() returns null on a PCRE error, in
			// which case the un-normalised text is kept rather than lost
			$text = preg_replace( "/\n{3,}/", "\n\n", $text ) ?? $text;
		}
		return trim( $text );
	}

	/**
	 * In plaintext mode, replaces any whitespace in front of each opening
	 * h1-h6 tag with two newlines followed by the heading level wrapped in
	 * SECTION_MARKER_START / SECTION_MARKER_END. HTML mode returns the input
	 * untouched.
	 *
	 * @param string $html HTML string to process
	 * @return string Processed HTML
	 */
	public function onHtmlReady( string $html ): string {
		if ( !$this->plainText ) {
			return $html;
		}

		$marked = preg_replace( '/\s*(<h([1-6])\b)/i',
			"\n\n" . self::SECTION_MARKER_START . '$2' . self::SECTION_MARKER_END . '$1',
			$html
		);

		// preg_replace() returns null on a PCRE error; returning it would violate
		// the declared string return type, so fall back to the unmarked HTML
		return $marked ?? $html;
	}

	/**
	 * Removes content we've chosen to remove then removes class and style
	 * attributes from the remaining span elements.
	 *
	 * @return array Array of removed DOMElements
	 */
	public function filterContent(): array {
		$doc = $this->getDoc();

		// Headings in a DIV wrapper may get removed by $wgExtractsRemoveClasses,
		// move it outside the header to rescue it (T363445)
		// https://www.mediawiki.org/wiki/Heading_HTML_changes
		$headings = DOMCompat::querySelectorAll( $doc->documentElement, 'h1, h2, h3, h4, h5, h6' );
		foreach ( $headings as $heading ) {
			$wrapper = $heading->parentNode;
			// @phan-suppress-next-line PhanTypeMismatchArgumentSuperType
			if ( DOMCompat::getClassList( $wrapper )->contains( 'mw-heading' ) ) {
				// Re-insert the heading as the wrapper's preceding sibling
				$wrapper->parentNode->insertBefore( $heading, $wrapper );
			}
		}

		$removed = parent::filterContent();

		/** @var DOMElement $span */
		foreach ( $doc->getElementsByTagName( 'span' ) as $span ) {
			$span->removeAttribute( 'class' );
			$span->removeAttribute( 'style' );
		}

		return $removed;
	}
}