// Embed the markup held in $html inside a minimal HTML5 document skeleton
// (doctype, empty <head>, <body>) and return the resulting string.
49 return '<!doctype html><html><head></head><body>' .
$html .
'</body></html>';
// Convert the UTF-8 markup in $this->html so that non-ASCII characters are
// expressed as HTML entities before it is handed to the DOM parser.
// NOTE(review): the 'HTML-ENTITIES' pseudo-encoding of mb_convert_encoding()
// is deprecated as of PHP 8.2 — confirm against the supported PHP versions.
66 $html = mb_convert_encoding( $this->html,
'HTML-ENTITIES',
'UTF-8' );
// Have libxml collect parse errors internally instead of raising PHP warnings
// while the markup is loaded.
75 libxml_use_internal_errors(
true );
// Called without an argument this disables external entity loading; the
// previous setting is kept in $loader so it can be restored after parsing.
// NOTE(review): libxml_disable_entity_loader() is deprecated as of PHP 8.0 —
// confirm against the supported PHP versions.
76 $loader = libxml_disable_entity_loader();
77 $this->doc =
new DOMDocument();
// Turn off strict error checking on the document before loading.
78 $this->doc->strictErrorChecking =
false;
79 $this->doc->loadHTML(
$html );
// Put the entity-loader setting back to the value saved in $loader.
80 libxml_disable_entity_loader( $loader );
// Internal error collection is switched off unconditionally here; the value
// it had before this code ran is not restored.
81 libxml_use_internal_errors(
false );
// Mark the parsed document as UTF-8.
82 $this->doc->encoding =
'UTF-8';
// Remember the caller-supplied flag. When it is truthy, the removal-list
// construction code adds the 'img', 'audio' and 'video' tags to the TAG removals.
92 $this->removeMedia = $flag;
/**
 * Queue one or more selectors for removal.
 *
 * @param array|string $selectors A single selector or an array of selectors;
 *  the (array) cast turns a scalar into a one-element array before it is
 *  appended to $this->itemsToRemove.
 */
106 public function remove( $selectors ) {
107 $this->itemsToRemove = array_merge( $this->itemsToRemove, (
array)$selectors );
/**
 * Queue one or more tag-name patterns for flattening.
 *
 * Entries of $this->elementsToFlatten are later joined with '|' and placed
 * inside a regular expression, so each entry is used as a regex fragment
 * rather than as a literal tag name.
 *
 * @param array|string $elements A single pattern or an array of patterns;
 *  the (array) cast turns a scalar into a one-element array before it is
 *  appended to $this->elementsToFlatten.
 */
119 public function flatten( $elements ) {
120 $this->elementsToFlatten = array_merge( $this->elementsToFlatten, (
array)$elements );
// Register a catch-all pattern with flatten(): an optional leading '?' or '!'
// followed by one or more letters or digits. The pattern is applied with the
// case-insensitive flag where it is consumed, so upper-case tag names match too.
127 $this->
flatten(
'[?!]?[a-z0-9]+' );
// Gather every element whose tag name appears in $removals['TAG'].
150 $domElemsToRemove =
array();
151 foreach ( $removals[
'TAG']
as $tagToRemove ) {
152 $tagToRemoveNodes =
$doc->getElementsByTagName( $tagToRemove );
// Matches are copied into a plain array instead of being acted on in place;
// presumably because the DOMNodeList is live and would shift if nodes were
// removed during iteration — verify against the removal code.
153 foreach ( $tagToRemoveNodes
as $tagToRemoveNode ) {
154 if ( $tagToRemoveNode ) {
155 $domElemsToRemove[] = $tagToRemoveNode;
// Gather the elements whose id appears in $removals['ID'].
163 $domElemsToRemove =
array();
164 foreach ( $removals[
'ID']
as $itemToRemove ) {
165 $itemToRemoveNode =
$doc->getElementById( $itemToRemove );
// Ids with no matching element yield a falsy result and are skipped.
166 if ( $itemToRemoveNode ) {
167 $domElemsToRemove[] = $itemToRemoveNode;
// Gather the elements carrying one of the class names in $removals['CLASS'].
173 $domElemsToRemove =
array();
174 $xpath =
new DOMXpath(
$doc );
175 foreach ( $removals[
'CLASS']
as $classToRemove ) {
// First pass: XPath contains() is a plain substring test on the class
// attribute, so it also selects elements whose class merely contains the name
// as part of a longer word.
// NOTE(review): $classToRemove is concatenated into the XPath string
// unescaped; a name containing a double quote would break the expression.
176 $elements = $xpath->query(
'//*[contains(@class, "' . $classToRemove .
'")]' );
179 foreach ( $elements
as $element ) {
180 $classes = $element->getAttribute(
'class' );
// Second pass: keep only elements where the name occurs between word
// boundaries and that are still attached to a parent.
// NOTE(review): $classToRemove is interpolated into the regex without
// preg_quote(), so '/' or regex metacharacters in a class name change the
// pattern; \b also treats '-' as a boundary, so "foo" matches "foo-bar".
181 if ( preg_match(
"/\b$classToRemove\b/", $classes ) && $element->parentNode ) {
182 $domElemsToRemove[] = $element;
// Handle the entries of $removals['TAG_CLASS'].
189 foreach ( $removals[
'TAG_CLASS']
as $classToRemove ) {
// Split the entry on '.'; index 0 is used as the tag name and index 1 as the
// class value in the XPath below.
190 $parts = explode(
'.', $classToRemove );
// The predicate compares the whole class attribute for equality, so an element
// with additional classes in its attribute is not selected.
192 $elements = $xpath->query(
193 '//' . $parts[0] .
'[@class="' . $parts[1] .
'"]'
// A DOMNodeList argument is handled by iterating over its nodes.
// NOTE(review): the loop body and the construction of $list are not visible
// here — presumably the nodes are copied into $list; confirm.
208 if ( $elements instanceof DOMNodeList ) {
210 foreach ( $elements
as $element ) {
// Detach each collected node from its parent.
215 foreach ( $list
as $element ) {
// Nodes without a parent are skipped, so nothing is attempted on a node that
// is not attached to a parent.
216 if ( $element->parentNode ) {
217 $element->parentNode->removeChild( $element );
// Static local: the table persists between calls and is only populated while
// it is still empty.
230 static $replacements;
231 if ( ! $replacements ) {
// Mapping from the four HTML-special characters to their entity forms.
// NOTE(review): how the table is applied is not visible in this fragment.
235 '"' =>
'&quot;',
236 '&' =>
'&amp;',
237 '<' =>
'&lt;',
238 '>' =>
'&gt;',
// Convert HTML entities in $html back into UTF-8 characters.
// NOTE(review): the 'HTML-ENTITIES' pseudo-encoding of mb_convert_encoding()
// is deprecated as of PHP 8.2 — confirm against the supported PHP versions.
242 $html = mb_convert_encoding(
$html,
'UTF-8',
'HTML-ENTITIES' );
/**
 * Serialise the document, or a single element of it, to markup.
 *
 * NOTE(review): only part of this method is visible; the conditions guarding
 * the individual steps and the return statement are not shown.
 *
 * @param DOMElement|string|null $element A DOMElement, or any other non-null
 *  value, which is passed to getElementById() as an element id. Defaults to null.
 */
253 public function getText( $element =
null ) {
// A non-null value that is not a DOMElement is resolved through an id lookup.
257 if ( $element !==
null && !( $element instanceof DOMElement ) ) {
258 $element = $this->doc->getElementById( $element );
261 $body = $this->doc->getElementsByTagName(
'body' )->item( 0 );
// Snapshot the body's children into an array first, then remove them, so the
// removal loop does not iterate over the list it is modifying.
262 $nodesArray =
array();
263 foreach ( $body->childNodes
as $node ) {
264 $nodesArray[] = $node;
266 foreach ( $nodesArray
as $nodeArray ) {
267 $body->removeChild( $nodeArray );
// The emptied body now receives $element as its only child.
269 $body->appendChild( $element );
271 $html = $this->doc->saveHTML();
// Strip HTML comments, everything up to and including the opening <body> tag,
// and everything from the closing </body> tag onwards. The 's' flag lets '.'
// span newlines.
284 $html = preg_replace(
'/<!--.*?-->|^.*?<body>|<\/body>.*$/s',
'',
$html );
// Flattening: delete the opening and closing tags whose names match any of the
// registered patterns (joined into one alternation, matched case-insensitively);
// only the tags themselves are matched, so the text between them stays.
287 if ( $this->elementsToFlatten ) {
288 $elements = implode(
'|', $this->elementsToFlatten );
289 $html = preg_replace(
"#</?($elements)\\b[^>]*>#is",
'',
$html );
// Branch taken when the selector begins with '#'.
// NOTE(review): presumably the id-selector case — the branch body is not visible.
306 } elseif ( strpos(
$selector,
'#' ) === 0 ) {
// Branch taken when the selector contains neither '[' nor ']'.
312 } elseif ( strpos(
$selector,
'[' ) ===
false && strpos(
$selector,
']' ) ===
false ) {
// Report an unrecognized selector with an MWException, naming the method and
// the offending selector.
316 throw new MWException( __METHOD__ .
"(): unrecognized selector '$selector'" );
// Bucket for TAG_CLASS entries, initially empty.
332 'TAG_CLASS' =>
array(),
// Sort each queued removal item into the bucket for its type.
// NOTE(review): the code that derives $type and $rawName from $itemToRemove is
// not visible in this fragment.
335 foreach ( $this->itemsToRemove
as $itemToRemove ) {
339 $removals[
$type][] = $rawName;
// With media removal enabled, the img, audio and video tags are added to the
// TAG bucket.
343 if ( $this->removeMedia ) {
344 $removals[
'TAG'][] =
'img';
345 $removals[
'TAG'][] =
'audio';
346 $removals[
'TAG'][] =
'video';