/**
 * Parse a component source string into its script, template and style parts.
 *
 * The markup is parsed into a DOM, the direct children of <head> are searched for
 * <script>, <template> and <style> tags, their attributes are validated, and the
 * contents of each tag are collected.
 *
 * @param string $html Source markup to parse
 * @param array $options Only the 'minifyTemplate' key is read here (defaults to false);
 *  it is forwarded to getTemplateHtml()
 * @return array Array with the keys 'script', 'template', 'style' and 'styleLang';
 *  'style' and 'styleLang' are null when there is no <style> tag
 * @throws InvalidArgumentException For a missing <head>, a missing <script> or
 *  <template> tag, or when a helper rejects duplicate tags, attributes or the style lang
 */
64 public function parse(
string $html, array $options = [] ): array {
// Build a DOM from the raw markup.
65 $dom = $this->parseHTML( $html );
// Take the first <head> element found in the parsed document.
67 $head = Zest::getElementsByTagName( $dom,
'head' )[ 0 ];
// NOTE(review): the condition guarding this throw is not visible in this view;
// presumably it fires when no <head> element was found. Confirm.
72 throw new InvalidArgumentException(
'Parsed DOM did not contain a <head> tag' );
// Collect the direct children of <head> with these names; findUniqueTags()
// throws if any of them occurs more than once.
74 $nodes = $this->findUniqueTags( $head, [
'script',
'template',
'style' ] );
// <script> and <template> are mandatory; <style> is optional.
77 foreach ( [
'script',
'template' ] as $requiredTag ) {
78 if ( !isset( $nodes[ $requiredTag ] ) ) {
79 throw new InvalidArgumentException(
"No <$requiredTag> tag found" );
// An empty allow-list means the tag may not carry any attributes at all.
83 $this->validateAttributes( $nodes[
'script'], [] );
84 $this->validateAttributes( $nodes[
'template'], [] );
// When <style> is present, lang is the only attribute it may carry.
85 if ( isset( $nodes[
'style'] ) ) {
86 $this->validateAttributes( $nodes[
'style'], [
'lang' ] );
// Style contents and language, or null when there is no <style> tag.
89 $styleData = isset( $nodes[
'style'] ) ? $this->getStyleAndLang( $nodes[
'style'] ) : null;
// The template markup is extracted from the original source string, not from
// the DOM node found above.
90 $template = $this->getTemplateHtml( $html, $options[
'minifyTemplate'] ??
false );
// Script text is trimmed; a null nodeValue falls back to ''.
93 'script' => trim( $nodes[
'script']->nodeValue ??
'' ),
94 'template' => $template,
95 'style' => $styleData ? $styleData[
'style'] :
null,
96 'styleLang' => $styleData ? $styleData[
'lang'] : null
/**
 * Parse a markup string into a DOM.
 *
 * Wires a Tokenizer to a Dispatcher, TreeBuilder and DOMBuilder, runs the
 * tokenizer over $html, and returns whatever the DOMBuilder produced.
 *
 * @param string $html Markup to parse (untyped in the signature; parse() passes a string)
 * @return DOMDocument Result of DOMBuilder::getFragment()
 */
105 private function parseHTML( $html ): DOMDocument {
// The builder is created with the 'suppressHtmlNamespace' option switched on.
106 $domBuilder = new DOMBuilder( [
'suppressHtmlNamespace' => true ] );
// Both the tree builder and the tokenizer are created with 'ignoreErrors' set
// (presumably so malformed input does not abort parsing; confirm against the library).
107 $treeBuilder =
new TreeBuilder( $domBuilder, [
'ignoreErrors' =>
true ] );
108 $tokenizer =
new Tokenizer(
new Dispatcher( $treeBuilder ), $html, [
'ignoreErrors' =>
true ] );
// Run the tokenizer, which feeds the tree builder and, through it, the DOM builder.
109 $tokenizer->execute();
111 return $domBuilder->getFragment();
/**
 * Find the direct children of a node that have one of the given tag names,
 * insisting that each tag name occurs at most once.
 *
 * Only $rootNode->childNodes is inspected; descendants further down are not searched.
 *
 * @param DOMNode $rootNode Node whose direct children are inspected
 * @param array $tagNames Tag names to look for; they are compared against the
 *  lowercased node name
 * @return array Matching nodes keyed by lowercased tag name (the initialisation and
 *  return of $nodes are not visible in this view; inferred from how parse() uses the result)
 * @throws InvalidArgumentException If a listed tag name occurs more than once
 */
122 private function findUniqueTags( DOMNode $rootNode, array $tagNames ): array {
124 foreach ( $rootNode->childNodes as $node ) {
// Lowercase the node name before comparing it with the requested names.
125 $tagName = strtolower( $node->nodeName );
// in_array() is called without its strict flag here, so the comparison is loose.
126 if ( in_array( $tagName, $tagNames ) ) {
// A second occurrence of the same tag is an error.
127 if ( isset( $nodes[ $tagName ] ) ) {
128 throw new InvalidArgumentException(
"More than one <$tagName> tag found" );
130 $nodes[ $tagName ] = $node;
/**
 * Check a node's attributes against an allow-list.
 *
 * With a non-empty allow-list, every attribute on the node must be named in it.
 * With an empty allow-list, the node may not have any attributes.
 *
 * @param DOMNode $node Node whose attributes are checked
 * @param array $allowedAttributes Permitted attribute names; empty means none are permitted
 * @throws InvalidArgumentException If the node carries an attribute that is not permitted
 */
142 private function validateAttributes( DOMNode $node, array $allowedAttributes ): void {
// Non-empty allow-list: reject the first attribute that is not listed.
143 if ( $allowedAttributes ) {
144 foreach ( $node->attributes as $attr ) {
// in_array() is called without its strict flag here, so the comparison is loose.
145 if ( !in_array( $attr->name, $allowedAttributes ) ) {
146 throw new InvalidArgumentException(
"<{$node->nodeName}> may not have the " .
147 "{$attr->name} attribute" );
// Empty allow-list: any attribute at all is an error.
150 } elseif ( $node->attributes->length > 0 ) {
151 throw new InvalidArgumentException(
"<{$node->nodeName}> may not have any attributes" );
/**
 * Read the contents and language of a <style> element.
 *
 * @param DOMElement $styleNode The <style> element
 * @return array Contains 'lang' => the style language; parse() also reads a 'style'
 *  key from this array, presumably the trimmed contents (that entry is not visible
 *  in this view; confirm)
 * @throws InvalidArgumentException If the lang attribute is neither "css" nor "less"
 */
161 private function getStyleAndLang( DOMElement $styleNode ): array {
// Trimmed text of the element; a null nodeValue falls back to ''.
162 $style = trim( $styleNode->nodeValue ??
'' );
// The language defaults to 'css' when no lang attribute is present.
163 $styleLang = $styleNode->hasAttribute(
'lang' ) ?
164 $styleNode->getAttribute(
'lang' ) :
'css';
// Only these two values are accepted; the comparison is exact and case-sensitive.
165 if ( $styleLang !==
'css' && $styleLang !==
'less' ) {
166 throw new InvalidArgumentException(
"<style lang=\"$styleLang\"> is invalid," .
167 " lang must be \"css\" or \"less\"" );
171 'lang' => $styleLang,
/**
 * Extract the template markup from the component source as a serialized string.
 *
 * Tokenizes $html again, routing the tokens through the dispatcher from
 * newFilteringDispatcher() into a TreeBuilder that writes to a Serializer, and
 * returns the trimmed serializer output.
 *
 * @param string $html Full component source (untyped in the signature; parse() passes a string)
 * @param bool $minify Passed to newTemplateFormatter() (untyped; parse() defaults it to false)
 * @return string Trimmed result of the serializer
 */
187 private function getTemplateHtml( $html, $minify ) {
// The serializer formats its output with the formatter built for this $minify setting.
188 $serializer =
new Serializer( $this->newTemplateFormatter( $minify ) );
189 $tokenizer =
new Tokenizer(
// NOTE(review): the second argument passed to newFilteringDispatcher() is not
// visible in this view; presumably it is the tag name to keep. Confirm.
190 $this->newFilteringDispatcher(
191 new TreeBuilder( $serializer, [
'ignoreErrors' =>
true ] ),
194 $html, [
'ignoreErrors' =>
true ]
// Run the tokenizer in fragment mode, with an HTML-namespace <template> as the fragment context.
196 $tokenizer->execute( [
'fragmentNamespace' => HTMLData::NS_HTML,
'fragmentName' =>
'template' ] );
197 return trim( $serializer->getResult() );
/**
 * Build the formatter used when serializing the template.
 *
 * Returns an instance of an anonymous HtmlFormatter subclass that stores $minify
 * and overrides startDocument(), comment(), characters() and element().
 *
 * @param bool $minify Minification flag (untyped; parse() defaults it to false)
 * @return HtmlFormatter Instance of the anonymous subclass
 */
208 private function newTemplateFormatter( $minify ) {
209 return new class( $minify ) extends HtmlFormatter {
// Keep the flag for the overrides below.
213 $this->minify = $minify;
// NOTE(review): the body of this override is not visible in this view.
216 public function startDocument( $fragmentNamespace, $fragmentName ) {
221 public function comment( SerializerNode $parent, $text ) {
// What the minify branch does is not visible here; the call below defers to
// the parent formatter.
222 if ( $this->minify ) {
226 return parent::comment( $parent, $text );
229 public function characters( SerializerNode $parent, $text, $start, $length ) {
// These two lines belong to a condition whose opening is not visible here: the
// parent is not an HTML-namespace element listed in prefixLfElements.
233 $parent->namespace !== HTMLData::NS_HTML ||
234 !isset( $this->prefixLfElements[ $parent->name ] )
// Work on just the referenced slice of the text.
237 $text = substr( $text, $start, $length );
// Collapse every run of spaces, tabs, CRs and LFs into a single space.
239 $text = preg_replace(
'/[ \r\n\t]+/',
' ', $text );
// Recompute the length so it matches the rewritten text.
241 $length = strlen( $text );
243 return parent::characters( $parent, $text, $start, $length );
246 public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
// Same test as in characters(), applied to the element itself; the opening of
// this condition is also not visible here.
250 $node->namespace !== HTMLData::NS_HTML ||
251 !isset( $this->prefixLfElements[ $node->name ] )
// Strip leading and trailing whitespace from the serialized contents.
256 $contents = preg_replace(
'/(^[ \r\n\t]+)|([\r\n\t ]+$)/',
'', $contents );
258 return parent::element( $parent, $node, $contents );
/**
 * Build a dispatcher that forwards tokens to the tree builder only while the
 * depth counter for the named tag is non-zero.
 *
 * Returns an instance of an anonymous Dispatcher subclass. Its startTag(), endTag(),
 * characters() and comment() overrides forward to the parent implementation only
 * when $nodeDepth is non-zero. A start tag for the named tag that arrives at depth
 * zero after one has already been seen raises an error.
 *
 * NOTE(review): the statements that change $nodeDepth are not visible in this view;
 * presumably it is incremented on a matching start tag and decremented on a matching
 * end tag. Confirm.
 *
 * @param TreeBuilder $treeBuilder Tree builder that receives the forwarded tokens
 * @param string $nodeName Tag name compared with === against incoming tag names
 *  (untyped in the signature)
 * @return Dispatcher Instance of the anonymous subclass
 */
271 private function newFilteringDispatcher( TreeBuilder $treeBuilder, $nodeName ) {
272 return new class( $treeBuilder, $nodeName ) extends Dispatcher {
// Depth counter tested by every override below; starts at zero.
274 private $nodeDepth = 0;
// Set once a start tag with the watched name has been handled.
275 private $seenTag =
false;
277 public function __construct( TreeBuilder $treeBuilder, $nodeName ) {
278 $this->nodeName = $nodeName;
279 parent::__construct( $treeBuilder );
282 public function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
// Forward the tag only while the depth counter is non-zero.
283 if ( $this->nodeDepth ) {
284 parent::startTag( $name, $attrs, $selfClose, $sourceStart, $sourceLength );
287 if ( $name === $this->nodeName ) {
// A second watched tag that starts at depth zero is a duplicate.
288 if ( $this->nodeDepth === 0 && $this->seenTag ) {
290 throw new InvalidArgumentException(
"More than one <{$this->nodeName}> tag found" );
293 $this->seenTag =
true;
297 public function endTag( $name, $sourceStart, $sourceLength ) {
// NOTE(review): what happens inside this branch is not visible in this view.
298 if ( $name === $this->nodeName ) {
// Forward the end tag only while the depth counter is non-zero.
301 if ( $this->nodeDepth ) {
302 parent::endTag( $name, $sourceStart, $sourceLength );
306 public function characters( $text, $start, $length, $sourceStart, $sourceLength ) {
// Text is forwarded only while the depth counter is non-zero.
307 if ( $this->nodeDepth ) {
308 parent::characters( $text, $start, $length, $sourceStart, $sourceLength );
312 public function comment( $text, $sourceStart, $sourceLength ) {
// Comments are forwarded only while the depth counter is non-zero.
313 if ( $this->nodeDepth ) {
314 parent::comment( $text, $sourceStart, $sourceLength );