Code Coverage |
||||||||||
Classes and Traits |
Functions and Methods |
Lines |
||||||||
Total | |
100.00% |
1 / 1 |
|
100.00% |
4 / 4 |
CRAP | |
100.00% |
26 / 26 |
ExtractFormatter | |
100.00% |
1 / 1 |
|
100.00% |
4 / 4 |
8 | |
100.00% |
26 / 26 |
__construct | |
100.00% |
1 / 1 |
2 | |
100.00% |
7 / 7 |
|||
getText | |
100.00% |
1 / 1 |
2 | |
100.00% |
8 / 8 |
|||
onHtmlReady | |
100.00% |
1 / 1 |
2 | |
100.00% |
4 / 4 |
|||
filterContent | |
100.00% |
1 / 1 |
2 | |
100.00% |
7 / 7 |
<?php | |
namespace TextExtracts; | |
use DOMElement; | |
use HtmlFormatter\HtmlFormatter; | |
/**
 * Provides text-only or limited-HTML extracts of page HTML
 *
 * @license GPL-2.0-or-later
 */
class ExtractFormatter extends HtmlFormatter {
	/**
	 * Control-character sequence written immediately before the heading level
	 * digit that onHtmlReady() inserts ahead of each heading in plaintext mode.
	 */
	public const SECTION_MARKER_START = "\1\2";

	/**
	 * Control-character sequence written immediately after that heading level digit.
	 */
	public const SECTION_MARKER_END = "\2\1";

	/**
	 * True when the extract is produced as plain text, false for limited HTML.
	 *
	 * @var bool
	 */
	private $plainText;

	/**
	 * Wraps the given text via HtmlFormatter::wrapHTML(), hands it to the parent
	 * formatter, turns on the parent's remove-media option and picks the
	 * flattening mode that matches the requested output.
	 *
	 * @param string $text Text to convert
	 * @param bool $plainText Whether extract should be plaintext
	 */
	public function __construct( $text, $plainText ) {
		parent::__construct( HtmlFormatter::wrapHTML( $text ) );
		$this->plainText = $plainText;
		$this->setRemoveMedia( true );

		if ( $plainText ) {
			// Plaintext output: flatten every tag.
			$this->flattenAllTags();
			return;
		}

		// Limited-HTML output: only anchors are flattened.
		$this->flatten( [ 'a' ] );
	}

	/**
	 * Filters the document, fetches the parent's text and, in plaintext mode,
	 * applies the final clean-up (entity decoding, nbsp and carriage-return
	 * replacement, collapsing of long newline runs). The result is trimmed in
	 * both modes.
	 *
	 * @param DOMElement|string|null $element Ignored; it is not passed on to
	 *  the parent implementation.
	 * @return string Processed HTML (or plain text in plaintext mode)
	 */
	public function getText( $element = null ) {
		$this->filterContent();
		$extract = parent::getText();

		if ( !$this->plainText ) {
			return trim( $extract );
		}

		$extract = html_entity_decode( $extract );
		// Pairs are applied in order over the whole string: first nbsp becomes
		// a regular space, then every carriage return (Windows) becomes a newline.
		$extract = str_replace( [ "\u{00A0}", "\r" ], [ ' ', "\n" ], $extract );
		// Normalise newlines: three or more in a row become exactly two.
		$extract = preg_replace( "/\n{3,}/", "\n\n", $extract );

		return trim( $extract );
	}

	/**
	 * In plaintext mode, replaces any whitespace preceding an opening h1-h6 tag
	 * with two newlines followed by the heading level wrapped in the section
	 * markers, keeping the matched tag start in place. In HTML mode the input is
	 * returned untouched.
	 *
	 * @param string $html HTML string to process
	 * @return string Processed HTML
	 */
	public function onHtmlReady( $html ) {
		if ( !$this->plainText ) {
			return $html;
		}

		// $2 is the heading level digit, $1 is the matched "<hN" tag start.
		$replacement = "\n\n" . self::SECTION_MARKER_START . '$2' . self::SECTION_MARKER_END . '$1';

		return preg_replace( '/\s*(<h([1-6])\b)/i', $replacement, $html );
	}

	/**
	 * Runs the parent's content filtering, then strips the class and style
	 * attributes from every span element left in the document.
	 *
	 * @return array Value returned by the parent filterContent() (documented
	 *  there as the removed DOMElements)
	 */
	public function filterContent() {
		$removedElements = parent::filterContent();

		/** @var DOMElement $spanElement */
		foreach ( $this->getDoc()->getElementsByTagName( 'span' ) as $spanElement ) {
			foreach ( [ 'class', 'style' ] as $attributeName ) {
				$spanElement->removeAttribute( $attributeName );
			}
		}

		return $removedElements;
	}
}