Code Coverage |
||||||||||
Classes and Traits |
Functions and Methods |
Lines |
||||||||
Total | |
0.00% |
0 / 1 |
|
65.38% |
17 / 26 |
CRAP | |
69.12% |
94 / 136 |
DOMCompat | |
0.00% |
0 / 1 |
|
65.38% |
17 / 26 |
179.90 | |
69.12% |
94 / 136 |
newDocument | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 1 |
|||
nodeName | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 1 |
|||
getBody | |
0.00% |
0 / 1 |
5.07 | |
85.71% |
6 / 7 |
|||
getHead | |
0.00% |
0 / 1 |
4.05 | |
85.71% |
6 / 7 |
|||
getTitle | |
100.00% |
1 / 1 |
2 | |
100.00% |
2 / 2 |
|||
setTitle | |
100.00% |
1 / 1 |
4 | |
100.00% |
8 / 8 |
|||
getParentElement | |
100.00% |
1 / 1 |
3 | |
100.00% |
4 / 4 |
|||
getElementById | |
100.00% |
1 / 1 |
2 | |
100.00% |
7 / 7 |
|||
setIdAttribute | |
100.00% |
1 / 1 |
1 | |
100.00% |
3 / 3 |
|||
getElementsByTagName | |
0.00% |
0 / 1 |
6 | |
0.00% |
0 / 7 |
|||
getLastElementChild | |
100.00% |
1 / 1 |
4 | |
100.00% |
9 / 9 |
|||
querySelector | |
100.00% |
1 / 1 |
2 | |
100.00% |
3 / 3 |
|||
querySelectorAll | |
100.00% |
1 / 1 |
2 | |
100.00% |
6 / 6 |
|||
getPreviousElementSibling | |
100.00% |
1 / 1 |
4 | |
100.00% |
9 / 9 |
|||
getNextElementSibling | |
100.00% |
1 / 1 |
4 | |
100.00% |
9 / 9 |
|||
remove | |
0.00% |
0 / 1 |
12 | |
0.00% |
0 / 8 |
|||
getInnerHTML | |
100.00% |
1 / 1 |
1 | |
100.00% |
1 / 1 |
|||
setInnerHTML | |
100.00% |
1 / 1 |
1 | |
100.00% |
13 / 13 |
|||
createDocument | |
100.00% |
1 / 1 |
1 | |
100.00% |
1 / 1 |
|||
getOuterHTML | |
100.00% |
1 / 1 |
1 | |
100.00% |
1 / 1 |
|||
getClassList | |
100.00% |
1 / 1 |
1 | |
100.00% |
1 / 1 |
|||
stripAndCollapseASCIIWhitespace | |
100.00% |
1 / 1 |
1 | |
100.00% |
2 / 2 |
|||
stripEmptyTextNodes | |
0.00% |
0 / 1 |
30 | |
0.00% |
0 / 10 |
|||
normalize | |
100.00% |
1 / 1 |
1 | |
100.00% |
3 / 3 |
|||
replaceChildren | |
0.00% |
0 / 1 |
30 | |
0.00% |
0 / 12 |
|||
or | |
0.00% |
0 / 1 |
6 | |
0.00% |
0 / 1 |
<?php | |
declare( strict_types = 1 ); | |
namespace Wikimedia\Parsoid\Utils; | |
use Wikimedia\Assert\Assert; | |
use Wikimedia\Parsoid\DOM\CharacterData; | |
use Wikimedia\Parsoid\DOM\Document; | |
use Wikimedia\Parsoid\DOM\DocumentFragment; | |
use Wikimedia\Parsoid\DOM\Element; | |
use Wikimedia\Parsoid\DOM\Node; | |
use Wikimedia\Parsoid\DOM\Text; | |
use Wikimedia\Parsoid\Utils\DOMCompat\TokenList; | |
use Wikimedia\Parsoid\Wt2Html\XMLSerializer; | |
use Wikimedia\RemexHtml\DOM\DOMBuilder; | |
use Wikimedia\RemexHtml\HTMLData; | |
use Wikimedia\RemexHtml\Tokenizer\Tokenizer; | |
use Wikimedia\RemexHtml\TreeBuilder\Dispatcher; | |
use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder; | |
use Wikimedia\Zest\Zest; | |
/** | |
* Helper class that provides missing DOM level 3 methods for the PHP DOM classes. | |
* For a DOM method $node->foo( $bar) the equivalent helper is DOMCompat::foo( $node, $bar ). | |
* For a DOM property $node->foo there is a DOMCompat::getFoo( $node ) and | |
* DOMCompat::setFoo( $node, $value ). | |
* | |
* Only implements the methods that are actually used by Parsoid. | |
* | |
* Because this class may be used by code outside Parsoid it tries to | |
* be relatively tolerant of object types: you can call it either with | |
* PHP's DOM* types or with a "proper" DOM implementation, and it will | |
* attempt to Do The Right Thing regardless. As a result there are | |
* generally not parameter type hints for DOM object types, and the | |
* return types will be broad enough to accommodate the value a "real" | |
* DOM implementation would return, as well as the values our | |
* thunk will return. (For instance, we can't create a "real" NodeList | |
* in our compatibility thunk.) | |
*/ | |
class DOMCompat { | |
/** | |
* Tab, LF, FF, CR, space | |
* @see https://infra.spec.whatwg.org/#ascii-whitespace | |
*/ | |
private static $ASCII_WHITESPACE = "\t\r\f\n "; | |
/**
 * Build a fresh, empty document.
 *
 * Construction is funnelled through this helper because it differs
 * slightly between Dodo and DOMDocument, and phan gets confused by it.
 *
 * @param bool $isHtml Part of the signature, but not consulted by the
 *   current implementation.
 * @return Document
 */
public static function newDocument( bool $isHtml ) {
	// @phan-suppress-next-line PhanParamTooMany,PhanTypeInstantiateInterface
	$emptyDoc = new Document( "1.0", "UTF-8" );
	return $emptyDoc;
}
/**
 * Return the node's name folded to lower case (HTML says the name
 * should be capitalized, so callers comparing against lower-case tag
 * names should go through this helper).
 *
 * @param Node $node
 * @return string Lower-cased value of $node->nodeName
 */
public static function nodeName( Node $node ): string {
	$rawName = $node->nodeName;
	return strtolower( $rawName );
}
/**
 * Get document body.
 * Unlike the spec we return it as a native PHP DOM object.
 *
 * The result is the first child of the document element whose
 * lower-cased node name is 'body' or 'frameset'. It is cached on the
 * document in a dynamic `body` property.
 *
 * @param Document $document
 * @return Element|null The body (or frameset) element, or null if the
 *   document has no document element or no such child.
 * @see https://html.spec.whatwg.org/multipage/dom.html#dom-document-body
 */
public static function getBody( $document ) {
	// Use an undeclared dynamic property as a cache.
	// WARNING: this will not be updated if (for some reason) the
	// document body changes.
	if ( isset( $document->body ) ) {
		return $document->body;
	}
	$root = $document->documentElement;
	if ( !$root ) {
		// An empty document has no document element, hence no body;
		// bail out rather than dereferencing null below.
		return null;
	}
	foreach ( $root->childNodes as $element ) {
		/** @var Element $element */
		$name = self::nodeName( $element );
		if ( $name === 'body' || $name === 'frameset' ) {
			$document->body = $element; // Caching!
			// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
			return $element;
		}
	}
	return null;
}
/**
 * Get document head.
 * Unlike the spec we return it as a native PHP DOM object.
 *
 * The result is the first child of the document element whose
 * lower-cased node name is 'head'. It is cached on the document in a
 * dynamic `head` property.
 *
 * @param Document $document
 * @return Element|null The head element, or null if the document has no
 *   document element or no such child.
 * @see https://html.spec.whatwg.org/multipage/dom.html#dom-document-head
 */
public static function getHead( $document ) {
	// Use an undeclared dynamic property as a cache.
	// WARNING: this will not be updated if (for some reason) the
	// document head changes.
	if ( isset( $document->head ) ) {
		return $document->head;
	}
	$root = $document->documentElement;
	if ( !$root ) {
		// An empty document has no document element, hence no head;
		// bail out rather than dereferencing null below.
		return null;
	}
	foreach ( $root->childNodes as $element ) {
		/** @var Element $element */
		if ( self::nodeName( $element ) === 'head' ) {
			$document->head = $element; // Caching!
			// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
			return $element;
		}
	}
	return null;
}
/**
 * Get document title.
 *
 * Looks up the first element matching the `title` selector and returns
 * its text content with ASCII whitespace stripped and collapsed.
 *
 * @param Document $document
 * @return string The normalized title, or '' when no title element exists
 * @see https://html.spec.whatwg.org/multipage/dom.html#document.title
 */
public static function getTitle( $document ): string {
	$found = self::querySelector( $document, 'title' );
	if ( !$found ) {
		return '';
	}
	return self::stripAndCollapseASCIIWhitespace( $found->textContent );
}
/**
 * Set document title.
 *
 * Reuses the existing title element if there is one; otherwise a new
 * title element is appended to the head, provided the document has a
 * head. If neither exists, nothing is changed.
 *
 * @param Document $document
 * @param string $title
 * @see https://html.spec.whatwg.org/multipage/dom.html#document.title
 */
public static function setTitle( $document, string $title ): void {
	$target = self::querySelector( $document, 'title' );
	// Only look for the head when there is no title to reuse.
	if ( !$target && self::getHead( $document ) ) {
		$target = DOMUtils::appendToHead( $document, 'title' );
	}
	if ( !$target ) {
		return;
	}
	$target->textContent = $title;
}
/**
 * Return the node's parent when that parent is an element.
 *
 * @param Node $node
 * @return Element|null The parent element, or null if there is no
 *   parent or the parent is not an element node.
 * @see https://dom.spec.whatwg.org/#dom-node-parentelement
 */
public static function getParentElement( $node ) {
	$candidate = $node->parentNode;
	if ( !$candidate || $candidate->nodeType !== XML_ELEMENT_NODE ) {
		return null;
	}
	/** @var Element $candidate */
	// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
	return $candidate;
}
/**
 * Return the descendant with the specified ID.
 * Workaround for https://bugs.php.net/bug.php?id=77686 and other issues related to
 * inconsistent indexing behavior.
 *
 * The lookup is delegated to Zest and the first match is returned.
 *
 * @param Document|DocumentFragment $node
 * @param string $id
 * @return Element|null The first matching element, or null if none
 * @see https://dom.spec.whatwg.org/#dom-nonelementparentnode-getelementbyid
 */
public static function getElementById( $node, string $id ) {
	$accepted = self::or(
		Document::class, DocumentFragment::class,
		// For compatibility with code which might call this from
		// outside Parsoid.
		\DOMDocument::class, \DOMDocumentFragment::class
	);
	Assert::parameterType( $accepted, $node, '$node' );
	// @phan-suppress-next-line PhanTypeMismatchArgument Zest is declared to take DOMDocument\DOMElement
	$matches = Zest::getElementsById( $node, $id );
	// @phan-suppress-next-line PhanTypeMismatchReturn
	return $matches[0] ?? null;
}
/**
 * Set the `id` attribute on an element and register it as an ID attribute.
 *
 * Works around a bug in PHP's Document::getElementById(), which doesn't
 * actually index the 'id' attribute unless the non-standard
 * `Element::setIdAttribute` method is called after the attribute is set;
 * see https://www.php.net/manual/en/domdocument.getelementbyid.php
 * for more details.
 *
 * @param Element $element
 * @param string $id The desired value for the `id` attribute on $element.
 * @see https://phabricator.wikimedia.org/T232390
 */
public static function setIdAttribute( $element, string $id ): void {
	$attrName = 'id';
	// The attribute has to exist before it can be flagged as an ID.
	$element->setAttribute( $attrName, $id );
	// phab:T232390
	$element->setIdAttribute( $attrName, true );
}
/**
 * Return all descendants with the specified tag name.
 * Workaround for PHP's getElementsByTagName being inexplicably slow in some situations
 * and the lack of Element::getElementsByTagName().
 *
 * The lookup is delegated to Zest.
 *
 * @param Document|Element $node
 * @param string $tagName
 * @return iterable<Element> Either an array or an HTMLCollection object
 * @see https://dom.spec.whatwg.org/#dom-document-getelementsbytagname
 * @see https://dom.spec.whatwg.org/#dom-element-getelementsbytagname
 * @note Note that unlike the spec this method is not guaranteed to return a NodeList
 *  (which cannot be freely constructed in PHP), just a traversable containing Elements.
 */
public static function getElementsByTagName( $node, string $tagName ): iterable {
	$accepted = self::or(
		Document::class, Element::class,
		// For compatibility with code which might call this from
		// outside Parsoid.
		\DOMDocument::class, \DOMElement::class
	);
	Assert::parameterType( $accepted, $node, '$node' );
	// @phan-suppress-next-line PhanTypeMismatchArgument Zest is declared to take DOMDocument\DOMElement
	$result = Zest::getElementsByTagName( $node, $tagName );
	'@phan-var array<Element> $result'; // @var array<Element> $result
	return $result;
}
/**
 * Return the last child of the node that is an Element, or null otherwise.
 *
 * Walks backwards from the last child until an element node is found.
 *
 * @param Document|DocumentFragment|Element $node
 * @return Element|null
 * @see https://dom.spec.whatwg.org/#dom-parentnode-lastelementchild
 */
public static function getLastElementChild( $node ) {
	$accepted = self::or(
		Document::class, DocumentFragment::class, Element::class,
		// For compatibility with code which might call this from
		// outside Parsoid.
		\DOMDocument::class, \DOMDocumentFragment::class, \DOMElement::class
	);
	Assert::parameterType( $accepted, $node, '$node' );
	$candidate = $node->lastChild;
	while ( $candidate ) {
		if ( $candidate->nodeType === XML_ELEMENT_NODE ) {
			break;
		}
		$candidate = $candidate->previousSibling;
	}
	// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
	return $candidate;
}
/**
 * Return the first element matching the selector, as produced by
 * querySelectorAll(), or null if nothing matches.
 *
 * @param Document|DocumentFragment|Element $node
 * @param string $selector
 * @return Element|null
 * @see https://dom.spec.whatwg.org/#dom-parentnode-queryselector
 */
public static function querySelector( $node, string $selector ) {
	$matches = self::querySelectorAll( $node, $selector );
	// The result is only guaranteed to be iterable, so take the first
	// item by iterating rather than by index.
	foreach ( $matches as $firstMatch ) {
		return $firstMatch;
	}
	return null;
}
/**
 * Return every element under $node matching the selector; the search
 * is delegated to Zest.
 *
 * @param Document|DocumentFragment|Element $node
 * @param string $selector
 * @return iterable<Element> Either a NodeList or an array
 * @see https://dom.spec.whatwg.org/#dom-parentnode-queryselectorall
 * @note Note that unlike the spec this method is not guaranteed to return a NodeList
 *  (which cannot be freely constructed in PHP), just a traversable containing Elements.
 */
public static function querySelectorAll( $node, string $selector ): iterable {
	$accepted = self::or(
		Document::class, DocumentFragment::class, Element::class,
		// For compatibility with code which might call this from
		// outside Parsoid.
		\DOMDocument::class, \DOMDocumentFragment::class, \DOMElement::class
	);
	Assert::parameterType( $accepted, $node, '$node' );
	// @phan-suppress-next-line PhanTypeMismatchArgument DOMNode
	$found = Zest::find( $selector, $node );
	return $found;
}
/**
 * Return the last preceding sibling of the node that is an element, or null otherwise.
 *
 * The parameter assertion only accepts Element or CharacterData nodes
 * (or their PHP DOM* equivalents).
 *
 * @param Node $node
 * @return Element|null
 * @see https://dom.spec.whatwg.org/#dom-nondocumenttypechildnode-previouselementsibling
 */
public static function getPreviousElementSibling( $node ) {
	$accepted = self::or(
		Element::class, CharacterData::class,
		// For compatibility with code which might call this from
		// outside Parsoid.
		\DOMElement::class, \DOMCharacterData::class
	);
	Assert::parameterType( $accepted, $node, '$node' );
	$candidate = $node->previousSibling;
	while ( $candidate ) {
		if ( $candidate->nodeType === XML_ELEMENT_NODE ) {
			break;
		}
		$candidate = $candidate->previousSibling;
	}
	// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
	return $candidate;
}
/**
 * Return the first following sibling of the node that is an element, or null otherwise.
 *
 * The parameter assertion only accepts Element or CharacterData nodes
 * (or their PHP DOM* equivalents).
 *
 * @param Node $node
 * @return Element|null
 * @see https://dom.spec.whatwg.org/#dom-nondocumenttypechildnode-nextelementsibling
 */
public static function getNextElementSibling( $node ) {
	$accepted = self::or(
		Element::class, CharacterData::class,
		// For compatibility with code which might call this from
		// outside Parsoid.
		\DOMElement::class, \DOMCharacterData::class
	);
	Assert::parameterType( $accepted, $node, '$node' );
	$candidate = $node->nextSibling;
	while ( $candidate ) {
		if ( $candidate->nodeType === XML_ELEMENT_NODE ) {
			break;
		}
		$candidate = $candidate->nextSibling;
	}
	// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
	return $candidate;
}
/**
 * Removes the node from the document.
 *
 * Does nothing when the node has no parent.
 *
 * @param Element|CharacterData $node
 * @see https://dom.spec.whatwg.org/#dom-childnode-remove
 */
public static function remove( $node ): void {
	$accepted = self::or(
		Element::class, CharacterData::class,
		// For compatibility with code which might call this from
		// outside Parsoid.
		\DOMElement::class, \DOMCharacterData::class
	);
	Assert::parameterType( $accepted, $node, '$node' );
	$parent = $node->parentNode;
	if ( !$parent ) {
		return;
	}
	$parent->removeChild( $node );
}
/**
 * Get innerHTML.
 *
 * Serializes the element through XMLSerializer with the 'innerXML'
 * option enabled and returns the 'html' entry of the result.
 *
 * @param Element $element
 * @return string
 * @see https://w3c.github.io/DOM-Parsing/#dom-innerhtml-innerhtml
 */
public static function getInnerHTML( $element ): string {
	$options = [ 'innerXML' => true ];
	$serialized = XMLSerializer::serialize( $element, $options );
	return $serialized['html'];
}
/**
 * Set innerHTML.
 *
 * Parses $html with RemexHtml as a fragment whose context element is
 * $element, empties $element, and then moves the parsed nodes into it.
 *
 * @see https://w3c.github.io/DOM-Parsing/#dom-innerhtml-innerhtml
 * @param Element $element
 * @param string $html
 */
public static function setInnerHTML( $element, string $html ): void {
	// Subclass DOMBuilder so that the document backing the parsed
	// fragment is created through DOMCompat::newDocument().
	$domBuilder = new class( [
		'suppressHtmlNamespace' => true,
	] ) extends DOMBuilder {
		/** @inheritDoc */
		protected function createDocument(
			// Explicitly nullable: implicit nullability via a null
			// default is deprecated as of PHP 8.4.
			?string $doctypeName = null,
			?string $public = null,
			?string $system = null
		) {
			// @phan-suppress-next-line PhanTypeMismatchReturn
			return DOMCompat::newDocument( $doctypeName === 'html' );
		}
	};
	$treeBuilder = new TreeBuilder( $domBuilder );
	$dispatcher = new Dispatcher( $treeBuilder );
	$tokenizer = new Tokenizer( $dispatcher, $html, [ 'ignoreErrors' => true ] );
	$tokenizer->execute( [
		'fragmentNamespace' => HTMLData::NS_HTML,
		'fragmentName' => self::nodeName( $element ),
	] );
	// Empty the element
	self::replaceChildren( $element );
	$frag = $domBuilder->getFragment();
	'@phan-var Node $frag'; // @var Node $frag
	DOMUtils::migrateChildrenBetweenDocs(
		$frag, $element
	);
}
/**
 * Get outerHTML.
 *
 * Serializes the element through XMLSerializer with 'addDoctype'
 * disabled and returns the 'html' entry of the result.
 *
 * @param Element $element
 * @return string
 * @see https://w3c.github.io/DOM-Parsing/#dom-element-outerhtml
 */
public static function getOuterHTML( $element ): string {
	$options = [ 'addDoctype' => false ];
	$serialized = XMLSerializer::serialize( $element, $options );
	return $serialized['html'];
}
/**
 * Return the class list of this element, as a new TokenList wrapping it.
 *
 * @param Element $node
 * @return TokenList
 * @see https://dom.spec.whatwg.org/#dom-element-classlist
 */
public static function getClassList( $node ): TokenList {
	$classList = new TokenList( $node );
	return $classList;
}
/**
 * Trim ASCII whitespace from both ends of the text, then replace every
 * remaining run of ASCII whitespace with a single space.
 *
 * @param string $text
 * @return string
 * @see https://infra.spec.whatwg.org/#strip-and-collapse-ascii-whitespace
 */
private static function stripAndCollapseASCIIWhitespace( string $text ): string {
	$ws = self::$ASCII_WHITESPACE;
	// Strip first, so that only interior runs are left to collapse.
	$stripped = trim( $text, $ws );
	$pattern = "/[$ws]+/";
	return preg_replace( $pattern, ' ', $stripped );
}
/**
 * Recursively remove every text node whose value is the empty string
 * from the tree rooted at $e.
 *
 * @param Element|DocumentFragment $e
 */
private static function stripEmptyTextNodes( $e ): void {
	// The next sibling is captured before the current child is
	// (possibly) removed, so that the walk can continue.
	for ( $child = $e->firstChild; $child; $child = $following ) {
		$following = $child->nextSibling;
		if ( $child instanceof Text ) {
			if ( $child->nodeValue === '' ) {
				$e->removeChild( $child );
			}
			continue;
		}
		if ( $child instanceof Element ) {
			self::stripEmptyTextNodes( $child );
		}
	}
}
/**
 * Normalize the DOM tree rooted at $elt and then strip any empty text
 * nodes that are left behind.
 *
 * @param Element|DocumentFragment $elt root of the DOM tree that
 *   needs to be normalized
 */
public static function normalize( $elt ): void {
	// First let the DOM implementation do its own normalization.
	$elt->normalize();

	// Then sweep the tree for stray empty text nodes. Unlike what
	// https://www.w3.org/TR/DOM-Level-2-Core/core.html#ID-normalize says,
	// the PHP DOM's normalization leaves behind up to 1 empty text node.
	// See https://bugs.php.net/bug.php?id=78221
	self::stripEmptyTextNodes( $elt );
}
/**
 * ParentNode.replaceChildren()
 * https://developer.mozilla.org/en-US/docs/Web/API/ParentNode/replaceChildren
 *
 * Removes every existing child of $parentNode and then appends the
 * given nodes in order. String arguments are first converted to text
 * nodes. Calling this with no nodes simply empties $parentNode.
 *
 * @param Document|DocumentFragment|Element $parentNode
 * @param array<string|Node> ...$nodes
 */
public static function replaceChildren(
	$parentNode, ...$nodes
): void {
	Assert::parameterType(
		self::or(
			Document::class, DocumentFragment::class, Element::class,
			// For compatibility with code which might call this from
			// outside Parsoid.
			\DOMDocument::class, \DOMDocumentFragment::class, \DOMElement::class
		),
		$parentNode, '$parentNode'
	);
	while ( $parentNode->firstChild ) {
		$parentNode->removeChild( $parentNode->firstChild );
	}
	// A Document's ownerDocument is null, so when $parentNode is itself
	// a document it must serve as the factory for text nodes; otherwise
	// a string argument would call createTextNode() on null.
	$factory = $parentNode->ownerDocument ?? $parentNode;
	foreach ( $nodes as $node ) {
		if ( is_string( $node ) ) {
			$node = $factory->createTextNode( $node );
		}
		// A null reference node makes insertBefore() append at the end.
		$parentNode->insertBefore( $node, null );
	}
}
/**
 * Combine class names into a single '|'-separated string, the form
 * expected by Assert::parameterType.
 *
 * @param class-string ...$args
 * @return string
 */
private static function or( ...$args ) {
	$separator = '|';
	$joined = implode( $separator, $args );
	return $joined;
}
} |