Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
98.58% |
139 / 141 |
|
88.24% |
15 / 17 |
CRAP | |
0.00% |
0 / 1 |
HtmlFormatter | |
98.58% |
139 / 141 |
|
88.24% |
15 / 17 |
57 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
wrapHTML | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
onHtmlReady | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getDoc | |
100.00% |
14 / 14 |
|
100.00% |
1 / 1 |
5 | |||
setRemoveComments | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
setRemoveMedia | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
remove | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
flatten | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
flattenAllTags | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
filterContent | |
100.00% |
39 / 39 |
|
100.00% |
1 / 1 |
13 | |||
removeElements | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
5 | |||
getText | |
94.44% |
17 / 18 |
|
0.00% |
0 / 1 |
8.01 | |||
removeBeforeIncluding | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
removeAfterIncluding | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
removeBetweenIncluding | |
92.31% |
12 / 13 |
|
0.00% |
0 / 1 |
4.01 | |||
parseSelector | |
100.00% |
15 / 15 |
|
100.00% |
1 / 1 |
6 | |||
parseItemsToRemove | |
100.00% |
16 / 16 |
|
100.00% |
1 / 1 |
4 |
1 | <?php |
2 | /** |
3 | * Performs transformations of HTML by wrapping around libxml2 and working |
4 | * around its countless bugs. |
5 | * |
6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by |
8 | * the Free Software Foundation; either version 2 of the License, or |
9 | * (at your option) any later version. |
10 | * |
11 | * This program is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | * GNU General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU General Public License along |
17 | * with this program; if not, write to the Free Software Foundation, Inc., |
18 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
19 | * http://www.gnu.org/copyleft/gpl.html |
20 | * |
21 | * @file |
22 | */ |
23 | |
24 | namespace HtmlFormatter; |
25 | |
26 | use DOMDocument; |
27 | use DOMElement; |
28 | use DOMNodeList; |
29 | use DOMXPath; |
30 | use InvalidArgumentException; |
31 | |
class HtmlFormatter {
	/**
	 * DOM built from $html; stays null until getDoc() is first called.
	 * @var ?DOMDocument
	 */
	private ?DOMDocument $doc = null;

	/**
	 * Raw HTML handed to the constructor.
	 * @var string
	 */
	private string $html;

	/**
	 * Selectors queued through remove(), parsed later by parseItemsToRemove().
	 * @var string[]
	 */
	private array $itemsToRemove = [];

	/**
	 * Tag names (or non-delimited regex fragments) queued through flatten().
	 * @var string[]
	 */
	private array $elementsToFlatten = [];

	/**
	 * Whether a libxml_disable_entity_loader() call is needed
	 */
	private const DISABLE_LOADER = LIBXML_VERSION < 20900;

	/**
	 * When true, img/audio/video tags are added to the removal list.
	 * @var bool
	 */
	protected bool $removeMedia = false;

	/**
	 * When true, getText() strips <!-- ... --> sequences from its output.
	 * @var bool
	 */
	protected bool $removeComments = false;

	/**
	 * @param string $html Text to process
	 */
	public function __construct( string $html ) {
		$this->html = $html;
	}

	/**
	 * Turns a chunk of HTML into a proper document
	 * @param string $html HTML to wrap
	 * @return string The fragment placed inside a doctype/html/head/body skeleton
	 *  that declares UTF-8
	 */
	public static function wrapHTML( string $html ): string {
		return '<!doctype html><html><head><meta charset="UTF-8"/></head><body>' . $html . '</body></html>';
	}

	/**
	 * Override this in descendant class to modify HTML after it has been converted from DOM tree
	 * @param string $html HTML to process
	 * @return string Processed HTML
	 */
	#[\ReturnTypeWillChange]
	protected function onHtmlReady( string $html ): string {
		return $html;
	}

	/**
	 * Returns the DOM for the HTML given to the constructor, parsing it on the
	 * first call and reusing the same document afterwards.
	 *
	 * @return DOMDocument DOM to manipulate
	 */
	#[\ReturnTypeWillChange]
	public function getDoc(): DOMDocument {
		if ( !$this->doc ) {
			$html = $this->html;
			if ( !str_starts_with( $html, '<!doctype html>' ) ) {
				// DOMDocument::loadHTML defaults to ASCII for partial html
				// Parse as full html with encoding
				$html = self::wrapHTML( $html );
			}

			// Workaround for bug that caused spaces after references
			// to disappear during processing (T55086, T348402).
			// The lone space between two tags is written as a numeric character
			// reference; replacing '> <' with itself would be a no-op.
			$html = str_replace( '> <', '>&#32;<', $html );

			// Parse warnings are collected internally rather than emitted.
			\libxml_use_internal_errors( true );
			$loader = false;
			if ( self::DISABLE_LOADER ) {
				// @codeCoverageIgnoreStart
				$loader = \libxml_disable_entity_loader();
				// @codeCoverageIgnoreEnd
			}
			$this->doc = new DOMDocument();
			$this->doc->strictErrorChecking = false;
			$this->doc->loadHTML( $html );
			if ( self::DISABLE_LOADER ) {
				// @codeCoverageIgnoreStart
				// Put the entity loader back the way it was found.
				\libxml_disable_entity_loader( $loader );
				// @codeCoverageIgnoreEnd
			}
			\libxml_use_internal_errors( false );
		}
		return $this->doc;
	}

	/**
	 * Sets whether comments should be removed from output
	 * @param bool $flag Whether to remove or not
	 */
	public function setRemoveComments( bool $flag = true ): void {
		$this->removeComments = $flag;
	}

	/**
	 * Sets whether images/videos/sounds should be removed from output
	 * @param bool $flag Whether to remove or not
	 */
	public function setRemoveMedia( bool $flag = true ): void {
		$this->removeMedia = $flag;
	}

	/**
	 * Adds one or more selector of content to remove. A subset of CSS selector
	 * syntax is supported:
	 *
	 *   <tag>
	 *   <tag>.class
	 *   .<class>
	 *   #<id>
	 *
	 * Selectors are only queued here; they are parsed and applied by filterContent().
	 *
	 * @param string[]|string $selectors Selector(s) of stuff to remove
	 */
	public function remove( $selectors ): void {
		$this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
	}

	/**
	 * Adds one or more element name to the list to flatten (remove tag, but not its content)
	 * Can accept non-delimited regexes
	 *
	 * Note this interface may fail in surprising unexpected ways due to usage of regexes,
	 * so should not be relied on for HTML markup security measures.
	 *
	 * @param string[]|string $elements Name(s) of tag(s) to flatten
	 */
	public function flatten( $elements ): void {
		$this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
	}

	/**
	 * Instructs the formatter to flatten all tags, and remove comments
	 */
	public function flattenAllTags(): void {
		$this->flatten( '[?!]?[a-z0-9]+' );
		$this->setRemoveComments( true );
	}

	/**
	 * Removes content we've chosen to remove. The text of the removed elements can be
	 * extracted with the getText method.
	 *
	 * Removal runs in four passes: tag names, IDs, class names, then tag.class pairs.
	 * The returned list holds the tag.class removals first, followed by the others
	 * in pass order.
	 *
	 * @return DOMElement[] Array of removed DOMElements
	 */
	#[\ReturnTypeWillChange]
	public function filterContent(): array {
		$removals = $this->parseItemsToRemove();

		// Bail out early if nothing to do (every selector list is empty);
		// this also avoids building the DOM needlessly.
		if ( !\array_filter( $removals ) ) {
			return [];
		}

		$doc = $this->getDoc();

		// Remove tags

		// You can't remove DOMNodes from a DOMNodeList as you're iterating
		// over them in a foreach loop. It will seemingly leave the internal
		// iterator on the foreach out of wack and results will be quite
		// strange. Though, making a queue of items to remove seems to work.
		$domElemsToRemove = [];
		foreach ( $removals['TAG'] as $tagToRemove ) {
			foreach ( $doc->getElementsByTagName( $tagToRemove ) as $tagToRemoveNode ) {
				$domElemsToRemove[] = $tagToRemoveNode;
			}
		}
		$removed = $this->removeElements( $domElemsToRemove );

		// Elements with named IDs
		$domElemsToRemove = [];
		foreach ( $removals['ID'] as $itemToRemove ) {
			$itemToRemoveNode = $doc->getElementById( $itemToRemove );
			if ( $itemToRemoveNode ) {
				$domElemsToRemove[] = $itemToRemoveNode;
			}
		}
		$removed = \array_merge( $removed, $this->removeElements( $domElemsToRemove ) );

		// CSS Classes
		$domElemsToRemove = [];
		$xpath = new DOMXPath( $doc );
		foreach ( $removals['CLASS'] as $classToRemove ) {
			// Use spaces to avoid matching for unrelated classnames (T231160)
			// https://stackoverflow.com/a/1604480/319266
			$elements = $xpath->query( '//*[contains(concat(" ", @class, " "), " ' . $classToRemove . ' ")]' );

			// The class name is quoted so that characters such as '/', '.' or '+'
			// are matched literally instead of being read as pattern syntax.
			$classPattern = '/\b' . \preg_quote( $classToRemove, '/' ) . '\b/';

			/** @var $element DOMElement */
			foreach ( $elements as $element ) {
				$classes = $element->getAttribute( 'class' );
				// Skip nodes already detached from the tree.
				if ( \preg_match( $classPattern, $classes ) && $element->parentNode ) {
					$domElemsToRemove[] = $element;
				}
			}
		}
		$removed = \array_merge( $removed, $this->removeElements( $domElemsToRemove ) );

		$return = [];
		// Tags with CSS Classes
		foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
			$parts = explode( '.', $classToRemove );

			// Only matches elements whose class attribute is exactly the given name.
			$elements = $xpath->query(
				'//' . $parts[0] . '[@class="' . $parts[1] . '"]'
			);
			$return[] = $this->removeElements( $elements );
		}

		return \array_merge( \array_merge( ...$return ), $removed );
	}

	/**
	 * Removes a list of elements from DOMDocument
	 *
	 * A DOMNodeList is copied into a plain array first, so that detaching nodes
	 * does not disturb the list being iterated.
	 *
	 * @param DOMElement[]|DOMNodeList $elements
	 * @return DOMElement[] Array of removed elements
	 */
	private function removeElements( $elements ): array {
		$list = $elements instanceof DOMNodeList
			? \iterator_to_array( $elements, false )
			: $elements;

		/** @var $element DOMElement */
		foreach ( $list as $element ) {
			if ( $element->parentNode ) {
				$element->parentNode->removeChild( $element );
			}
		}
		return $list;
	}

	/**
	 * Performs final transformations and returns resulting HTML. Note that if you want to call this
	 * both without an element and with an element you should call it without an element first. If you
	 * specify the $element in the method it'll change the underlying dom and you won't be able to get
	 * it back.
	 *
	 * If the DOM was never built (getDoc() not called), the original HTML string is
	 * used as the starting point. Comment removal and tag flattening are applied to
	 * the string after onHtmlReady() has run.
	 *
	 * @param DOMElement|string|null $element Element, or ID of element, to get HTML from;
	 *  null to get it from the whole tree. An ID that is not found falls back to the body.
	 * @return string Processed HTML
	 */
	#[\ReturnTypeWillChange]
	public function getText( $element = null ): string {
		if ( $this->doc ) {
			if ( $element !== null && !( $element instanceof DOMElement ) ) {
				$element = $this->doc->getElementById( $element );
			}
			if ( !$element ) {
				$element = $this->doc->getElementsByTagName( 'body' )->item( 0 );
			}
			$html = $this->doc->saveHTML( $element );
			if ( PHP_EOL === "\r\n" ) {
				// Cleanup for CRLF mis-processing of unknown origin on Windows.
				// Only the serialized carriage-return reference is dropped;
				// ordinary spaces must be left alone.
				$html = str_replace( '&#13;', '', $html );
			}
		} else {
			$html = $this->html;
		}
		// Remove stuff added by wrapHTML()
		$html = self::removeBeforeIncluding( $html, '<body>' );
		$html = self::removeAfterIncluding( $html, '</body>' );
		$html = $this->onHtmlReady( $html );

		if ( $this->removeComments ) {
			$html = self::removeBetweenIncluding( $html, '<!--', '-->' );
		}
		if ( $this->elementsToFlatten ) {
			// Strips opening and closing tags of the listed elements, keeping their content.
			$elements = \implode( '|', $this->elementsToFlatten );
			$html = \preg_replace( "#</?(?:$elements)\\b[^>]*>#is", '', $html );
		}

		return $html;
	}

	/**
	 * Removes everything from beginning of string to last occurrence of $needle, including $needle.
	 *
	 * Matches the regex /^.*?<body>/s when $needle = '<body>' and it occurs once;
	 * with several occurrences the last one is used. Returns $haystack unchanged
	 * if $needle is absent.
	 *
	 * @param string $haystack Text to trim
	 * @param string $needle Marker to cut at
	 * @return string Text following the last $needle
	 */
	public static function removeBeforeIncluding( string $haystack, string $needle ): string {
		$pos = strrpos( $haystack, $needle );
		if ( $pos === false ) {
			return $haystack;
		}
		return substr( $haystack, $pos + strlen( $needle ) );
	}

	/**
	 * Removes everything from the first occurrence of $needle to the end of the string, including $needle
	 *
	 * Equivalent to the regex /<\/body>.*$/s when $needle = '</body>'. Returns
	 * $haystack unchanged if $needle is absent.
	 *
	 * @param string $haystack Text to trim
	 * @param string $needle Marker to cut at
	 * @return string Text preceding the first $needle
	 */
	public static function removeAfterIncluding( string $haystack, string $needle ): string {
		$pos = strpos( $haystack, $needle );
		if ( $pos === false ) {
			return $haystack;
		}
		return substr( $haystack, 0, $pos );
	}

	/**
	 * Removes everything between $open and $close, including $open and $close.
	 *
	 * Every $open ... $close pair is removed, scanning left to right. An $open
	 * without a following $close is left in place together with the rest of the string.
	 *
	 * @param string $haystack Text to process
	 * @param string $open Opening delimiter
	 * @param string $close Closing delimiter
	 * @return string Text with all delimited spans removed
	 */
	public static function removeBetweenIncluding( string $haystack, string $open, string $close ): string {
		$pieces = [];
		$offset = 0;
		while ( true ) {
			$openPos = strpos( $haystack, $open, $offset );
			// Strict comparison: position 0 is a valid match, not "not found".
			if ( $openPos === false ) {
				break;
			}

			$closePos = strpos( $haystack, $close, $openPos );
			if ( $closePos === false ) {
				break;
			}

			// Keep the text that sits before this span, then skip past its end.
			$pieces[] = substr( $haystack, $offset, $openPos - $offset );
			$offset = $closePos + strlen( $close );
		}
		$pieces[] = substr( $haystack, $offset );
		return implode( '', $pieces );
	}

	/**
	 * Helper function for parseItemsToRemove(). This function extracts the selector type
	 * and the raw name of a selector from a CSS-style selector string and assigns those
	 * values to parameters passed by reference. For example, if given '#toc' as the
	 * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName.
	 * @param string $selector CSS selector to parse
	 * @param string &$type The type of selector (ID, CLASS, TAG_CLASS, or TAG)
	 * @param string &$rawName The raw name of the selector
	 * @return bool Whether the selector was successfully recognised
	 * @throws InvalidArgumentException If the selector matches none of the supported
	 *  forms (for example when it contains square brackets)
	 */
	protected function parseSelector( string $selector, string &$type, string &$rawName ): bool {
		$firstChar = substr( $selector, 0, 1 );
		if ( $firstChar === '.' ) {
			$type = 'CLASS';
			$rawName = substr( $selector, 1 );
		} elseif ( $firstChar === '#' ) {
			$type = 'ID';
			$rawName = substr( $selector, 1 );
		} elseif ( strpos( $selector, '.' ) > 0 ) {
			// A dot after the first character: "tag.class". The full selector is
			// kept; filterContent() splits it.
			$type = 'TAG_CLASS';
			$rawName = $selector;
		} elseif ( !str_contains( $selector, '[' ) && !str_contains( $selector, ']' ) ) {
			$type = 'TAG';
			$rawName = $selector;
		} else {
			throw new InvalidArgumentException( __METHOD__ . "(): unrecognized selector '$selector'" );
		}

		return true;
	}

	/**
	 * Transforms CSS-style selectors into an internal representation suitable for
	 * processing by filterContent()
	 * @return array Map with the keys 'ID', 'TAG', 'CLASS' and 'TAG_CLASS', each holding
	 *  a list of raw selector names
	 */
	#[\ReturnTypeWillChange]
	protected function parseItemsToRemove(): array {
		$removals = [
			'ID' => [],
			'TAG' => [],
			'CLASS' => [],
			'TAG_CLASS' => [],
		];

		foreach ( $this->itemsToRemove as $itemToRemove ) {
			$type = '';
			$rawName = '';
			if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
				$removals[$type][] = $rawName;
			}
		}

		if ( $this->removeMedia ) {
			$removals['TAG'][] = 'img';
			$removals['TAG'][] = 'audio';
			$removals['TAG'][] = 'video';
		}

		return $removals;
	}
}