Code Coverage for /src/src/HtmlFormatter.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	98.58% covered (success)	98.58%	139 / 141	88.24% covered (warning)	88.24%	15 / 17	CRAP	0.00% covered (danger)	0.00%	0 / 1
HtmlFormatter	98.58% covered (success)	98.58%	139 / 141	88.24% covered (warning)	88.24%	15 / 17	57	0.00% covered (danger)	0.00%	0 / 1
__construct	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
wrapHTML	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
onHtmlReady	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
getDoc	100.00% covered (success)	100.00%	14 / 14	100.00% covered (success)	100.00%	1 / 1	5
setRemoveComments	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
setRemoveMedia	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
remove	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
flatten	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
flattenAllTags	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	1
filterContent	100.00% covered (success)	100.00%	39 / 39	100.00% covered (success)	100.00%	1 / 1	13
removeElements	100.00% covered (success)	100.00%	9 / 9	100.00% covered (success)	100.00%	1 / 1	5
getText	94.44% covered (success)	94.44%	17 / 18	0.00% covered (danger)	0.00%	0 / 1	8.01
removeBeforeIncluding	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	2
removeAfterIncluding	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	2
removeBetweenIncluding	92.31% covered (success)	92.31%	12 / 13	0.00% covered (danger)	0.00%	0 / 1	4.01
parseSelector	100.00% covered (success)	100.00%	15 / 15	100.00% covered (success)	100.00%	1 / 1	6
parseItemsToRemove	100.00% covered (success)	100.00%	16 / 16	100.00% covered (success)	100.00%	1 / 1	4

1	<?php
2	/**
3	* Performs transformations of HTML by wrapping around libxml2 and working
4	* around its countless bugs.
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License along
17	* with this program; if not, write to the Free Software Foundation, Inc.,
18	* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19	* http://www.gnu.org/copyleft/gpl.html
20	*
21	* @file
22	*/
23
24	namespace HtmlFormatter;
25
26	use DOMDocument;
27	use DOMElement;
28	use DOMNodeList;
29	use DOMXPath;
30	use InvalidArgumentException;
31
class HtmlFormatter {
	/**
	 * Lazily built DOM of the wrapped HTML; null until getDoc() is first called.
	 * @var ?DOMDocument
	 */
	private ?DOMDocument $doc = null;

	/**
	 * The HTML passed to the constructor, unmodified.
	 * @var string
	 */
	private string $html;

	/**
	 * Selectors queued through remove().
	 * @var string[]
	 */
	private array $itemsToRemove = [];

	/**
	 * Tag names / non-delimited regex fragments queued through flatten().
	 * @var string[]
	 */
	private array $elementsToFlatten = [];

	/**
	 * Whether a libxml_disable_entity_loader() call is needed
	 */
	private const DISABLE_LOADER = LIBXML_VERSION < 20900;

	/**
	 * @var bool
	 */
	protected bool $removeMedia = false;

	/**
	 * @var bool
	 */
	protected bool $removeComments = false;

	/**
	 * @param string $html Text to process
	 */
	public function __construct( string $html ) {
		$this->html = $html;
	}

	/**
	 * Turns a chunk of HTML into a proper document
	 * @param string $html HTML to wrap
	 * @return string
	 */
	public static function wrapHTML( string $html ): string {
		return '<!doctype html><html><head><meta charset="UTF-8"/></head><body>' . $html . '</body></html>';
	}

	/**
	 * Override this in descendant class to modify HTML after it has been converted from DOM tree
	 * @param string $html HTML to process
	 * @return string Processed HTML
	 */
	#[\ReturnTypeWillChange]
	protected function onHtmlReady( string $html ): string {
		return $html;
	}

	/**
	 * Parses the HTML on first use and returns the cached document afterwards.
	 * @return DOMDocument DOM to manipulate
	 */
	#[\ReturnTypeWillChange]
	public function getDoc(): DOMDocument {
		if ( !$this->doc ) {
			$html = $this->html;
			if ( !str_starts_with( $html, '<!doctype html>' ) ) {
				// DOMDocument::loadHTML defaults to ASCII for partial html
				// Parse as full html with encoding
				$html = self::wrapHTML( $html );
			}

			// Workaround for bug that caused spaces after references
			// to disappear during processing (T55086, T348402): encode the
			// lone space between two tags as a character reference so the
			// parser keeps it.
			$html = str_replace( '> <', '>&#32;<', $html );

			// Remember the caller's setting so it can be restored afterwards
			// instead of being unconditionally switched off.
			$previousErrorSetting = \libxml_use_internal_errors( true );
			$loader = false;
			if ( self::DISABLE_LOADER ) {
				// @codeCoverageIgnoreStart
				$loader = \libxml_disable_entity_loader();
				// @codeCoverageIgnoreEnd
			}
			$this->doc = new DOMDocument();
			$this->doc->strictErrorChecking = false;
			$this->doc->loadHTML( $html );
			if ( self::DISABLE_LOADER ) {
				// @codeCoverageIgnoreStart
				\libxml_disable_entity_loader( $loader );
				// @codeCoverageIgnoreEnd
			}
			\libxml_use_internal_errors( $previousErrorSetting );
		}
		return $this->doc;
	}

	/**
	 * Sets whether comments should be removed from output
	 * @param bool $flag Whether to remove or not
	 */
	public function setRemoveComments( bool $flag = true ): void {
		$this->removeComments = $flag;
	}

	/**
	 * Sets whether images/videos/sounds should be removed from output
	 * @param bool $flag Whether to remove or not
	 */
	public function setRemoveMedia( bool $flag = true ): void {
		$this->removeMedia = $flag;
	}

	/**
	 * Adds one or more selector of content to remove. A subset of CSS selector
	 * syntax is supported:
	 *
	 *   <tag>
	 *   <tag>.class
	 *   .<class>
	 *   #<id>
	 *
	 * @param string[]|string $selectors Selector(s) of stuff to remove
	 */
	public function remove( $selectors ): void {
		$this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
	}

	/**
	 * Adds one or more element name to the list to flatten (remove tag, but not its content)
	 * Can accept non-delimited regexes
	 *
	 * Note this interface may fail in surprising unexpected ways due to usage of regexes,
	 * so should not be relied on for HTML markup security measures.
	 *
	 * @param string[]|string $elements Name(s) of tag(s) to flatten
	 */
	public function flatten( $elements ): void {
		$this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
	}

	/**
	 * Instructs the formatter to flatten all tags, and remove comments
	 */
	public function flattenAllTags(): void {
		$this->flatten( '[?!]?[a-z0-9]+' );
		$this->setRemoveComments( true );
	}

	/**
	 * Removes content we've chosen to remove. The text of the removed elements can be
	 * extracted with the getText method.
	 * @return DOMElement[] Array of removed DOMElements
	 * @throws InvalidArgumentException If a queued selector is not recognised
	 */
	#[\ReturnTypeWillChange]
	public function filterContent(): array {
		$removals = $this->parseItemsToRemove();

		// Bail out early if nothing to do
		if ( \array_reduce( $removals,
			static function ( $carry, $item ) {
				return $carry && !$item;
			},
			true
		) ) {
			return [];
		}

		$doc = $this->getDoc();

		// Remove tags

		// You can't remove DOMNodes from a DOMNodeList as you're iterating
		// over them in a foreach loop. It will seemingly leave the internal
		// iterator on the foreach out of wack and results will be quite
		// strange. Though, making a queue of items to remove seems to work.
		$domElemsToRemove = [];
		foreach ( $removals['TAG'] as $tagToRemove ) {
			$tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove );
			foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
				if ( $tagToRemoveNode ) {
					$domElemsToRemove[] = $tagToRemoveNode;
				}
			}
		}
		$removed = $this->removeElements( $domElemsToRemove );

		// Elements with named IDs
		$domElemsToRemove = [];
		foreach ( $removals['ID'] as $itemToRemove ) {
			$itemToRemoveNode = $doc->getElementById( $itemToRemove );
			if ( $itemToRemoveNode ) {
				$domElemsToRemove[] = $itemToRemoveNode;
			}
		}
		$removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );

		// CSS Classes
		$domElemsToRemove = [];
		$xpath = new DOMXPath( $doc );
		foreach ( $removals['CLASS'] as $classToRemove ) {
			// Use spaces to avoid matching for unrelated classnames (T231160)
			// https://stackoverflow.com/a/1604480/319266
			$elements = $xpath->query( '//*[contains(concat(" ", @class, " "), " ' . $classToRemove . ' ")]' );

			// Quote the class name so characters such as "/" or "." cannot
			// break or alter the pattern.
			$classPattern = '/\b' . \preg_quote( $classToRemove, '/' ) . '\b/';

			/** @var $element DOMElement */
			foreach ( $elements as $element ) {
				$classes = $element->getAttribute( 'class' );
				if ( \preg_match( $classPattern, $classes ) && $element->parentNode ) {
					$domElemsToRemove[] = $element;
				}
			}
		}
		$removed = \array_merge( $removed, $this->removeElements( $domElemsToRemove ) );

		$return = [];
		// Tags with CSS Classes
		foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
			$parts = explode( '.', $classToRemove );

			$elements = $xpath->query(
				'//' . $parts[0] . '[@class="' . $parts[1] . '"]'
			);
			$return[] = $this->removeElements( $elements );
		}

		return array_merge( array_merge( ...$return ), $removed );
	}

	/**
	 * Removes a list of elements from DOMDocument
	 * @param DOMElement[]|DOMNodeList $elements
	 * @return DOMElement[] Array of removed elements
	 */
	private function removeElements( $elements ): array {
		$list = $elements;
		if ( $elements instanceof DOMNodeList ) {
			// Copy into a plain array first: the node list must not be
			// modified while it is being iterated.
			$list = [];
			foreach ( $elements as $element ) {
				$list[] = $element;
			}
		}
		/** @var $element DOMElement */
		foreach ( $list as $element ) {
			if ( $element->parentNode ) {
				$element->parentNode->removeChild( $element );
			}
		}
		return $list;
	}

	/**
	 * Performs final transformations and returns resulting HTML. Note that if you want to call this
	 * both without an element and with an element you should call it without an element first. If you
	 * specify the $element in the method it'll change the underlying dom and you won't be able to get
	 * it back.
	 *
	 * If the DOM was never built (neither getDoc() nor an effective filterContent() was
	 * called), the original HTML string is post-processed directly.
	 *
	 * @param DOMElement|string|null $element Element, or ID of the element, to get HTML from;
	 *   null (or any falsy value) to get it from the whole tree
	 * @return string Processed HTML
	 */
	#[\ReturnTypeWillChange]
	public function getText( $element = null ): string {
		if ( $this->doc ) {
			if ( $element !== null && !( $element instanceof DOMElement ) ) {
				$element = $this->doc->getElementById( $element );
			}
			if ( !$element ) {
				$element = $this->doc->getElementsByTagName( 'body' )->item( 0 );
			}
			$html = $this->doc->saveHTML( $element );
			if ( PHP_EOL === "\r\n" ) {
				// Cleanup for CRLF mis-processing of unknown origin on Windows:
				// drop the carriage-return character references, not real text.
				$html = str_replace( '&#13;', '', $html );
			}
		} else {
			$html = $this->html;
		}
		// Remove stuff added by wrapHTML()
		$html = self::removeBeforeIncluding( $html, '<body>' );
		$html = self::removeAfterIncluding( $html, '</body>' );
		$html = $this->onHtmlReady( $html );

		if ( $this->removeComments ) {
			$html = self::removeBetweenIncluding( $html, '<!--', '-->' );
		}
		if ( $this->elementsToFlatten ) {
			// Join with a bare "|" so the entries form a regex alternation.
			$elements = \implode( '|', $this->elementsToFlatten );
			$html = \preg_replace( "#</?(?:$elements)\\b[^>]*>#is", '', $html );
		}

		return $html;
	}

	/**
	 * Removes everything from beginning of string to last occurrence of $needle, including $needle.
	 * Returns $haystack unchanged when $needle is not found.
	 *
	 * Equivalent to the (greedy) regex /^.*<body>/s when $needle = '<body>'
	 *
	 * NOTE(review): this comment previously cited the lazy regex /^.*?<body>/s, which would
	 * stop at the FIRST occurrence; the code uses strrpos(), i.e. the LAST one. Confirm which
	 * behaviour is intended before changing either.
	 *
	 * @param string $haystack String to trim
	 * @param string $needle Marker to search for
	 * @return string
	 */
	public static function removeBeforeIncluding( string $haystack, string $needle ): string {
		$pos = strrpos( $haystack, $needle );
		if ( $pos === false ) {
			return $haystack;
		}
		return substr( $haystack, $pos + strlen( $needle ) );
	}

	/**
	 * Removes everything from the first occurrence of $needle to the end of the string, including $needle.
	 * Returns $haystack unchanged when $needle is not found.
	 *
	 * Equivalent to the regex /<\/body>.*$/s when $needle = '</body>'
	 *
	 * @param string $haystack String to trim
	 * @param string $needle Marker to search for
	 * @return string
	 */
	public static function removeAfterIncluding( string $haystack, string $needle ): string {
		$pos = strpos( $haystack, $needle );
		if ( $pos === false ) {
			return $haystack;
		}
		return substr( $haystack, 0, $pos );
	}

	/**
	 * Removes everything between $open and $close, including $open and $close.
	 * An $open without a following $close is left in place together with the rest of the string.
	 *
	 * @param string $haystack String to process
	 * @param string $open Opening marker
	 * @param string $close Closing marker
	 * @return string
	 */
	public static function removeBetweenIncluding( string $haystack, string $open, string $close ): string {
		$pieces = [];
		$offset = 0;
		while ( true ) {
			$openPos = strpos( $haystack, $open, $offset );
			// Strict comparison: a match at offset 0 is a valid position, not "not found".
			if ( $openPos === false ) {
				break;
			}

			$closePos = strpos( $haystack, $close, $openPos );
			if ( $closePos === false ) {
				break;
			}

			$pieces[] = substr( $haystack, $offset, $openPos - $offset );
			$offset = $closePos + strlen( $close );
		}
		$pieces[] = substr( $haystack, $offset );
		return implode( '', $pieces );
	}

	/**
	 * Helper function for parseItemsToRemove(). This function extracts the selector type
	 * and the raw name of a selector from a CSS-style selector string and assigns those
	 * values to parameters passed by reference. For example, if given '#toc' as the
	 * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName.
	 * @param string $selector CSS selector to parse
	 * @param string &$type The type of selector (ID, CLASS, TAG_CLASS, or TAG)
	 * @param string &$rawName The raw name of the selector
	 * @return bool Whether the selector was successfully recognised
	 * @throws InvalidArgumentException If the selector is not one of the supported forms
	 */
	protected function parseSelector( string $selector, string &$type, string &$rawName ): bool {
		$firstChar = substr( $selector, 0, 1 );
		if ( $firstChar === '.' ) {
			$type = 'CLASS';
			$rawName = substr( $selector, 1 );
		} elseif ( $firstChar === '#' ) {
			$type = 'ID';
			$rawName = substr( $selector, 1 );
		} elseif ( strpos( $selector, '.' ) > 0 ) {
			$type = 'TAG_CLASS';
			$rawName = $selector;
		} elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) {
			$type = 'TAG';
			$rawName = $selector;
		} else {
			throw new InvalidArgumentException( __METHOD__ . "(): unrecognized selector '$selector'" );
		}

		return true;
	}

	/**
	 * Transforms CSS-style selectors into an internal representation suitable for
	 * processing by filterContent()
	 * @return array Map with the keys 'ID', 'TAG', 'CLASS' and 'TAG_CLASS', each a list of raw names
	 */
	#[\ReturnTypeWillChange]
	protected function parseItemsToRemove(): array {
		$removals = [
			'ID' => [],
			'TAG' => [],
			'CLASS' => [],
			'TAG_CLASS' => [],
		];

		foreach ( $this->itemsToRemove as $itemToRemove ) {
			$type = '';
			$rawName = '';
			if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
				$removals[$type][] = $rawName;
			}
		}

		if ( $this->removeMedia ) {
			$removals['TAG'][] = 'img';
			$removals['TAG'][] = 'audio';
			$removals['TAG'][] = 'video';
		}

		return $removals;
	}
}