/**
 * Parse a Vue single-file component and extract its script, template and style parts.
 *
 * <script> and <template> must each be present; <style> is optional. The tags are
 * searched for among the direct children of the parsed document's <head> element, and
 * each may appear at most once. <script> and <template> may not carry attributes;
 * <style> may only carry a lang attribute.
 *
 * @param string $html Source of the single-file component
 * @param array $options Associative array of options:
 *  - 'minifyTemplate' (default false): forwarded to getTemplateHtml() as its $minify flag
 * @return array Associative array with the keys:
 *  - 'script': trimmed text content of the <script> tag
 *  - 'template': template HTML as returned by getTemplateHtml()
 *  - 'style': trimmed text content of the <style> tag, or null if there is none
 *  - 'styleLang': 'css' or 'less', or null if there is no <style> tag
 * @throws Exception If the <head> or a required tag is missing, a tag is duplicated, or a
 *  tag has a disallowed attribute or an unsupported style language
 */
public function parse( string $html, array $options = [] ): array {
	$dom = $this->parseHTML( $html );

	// Fall back to null instead of indexing a possibly empty result, so that a missing
	// <head> reaches the explicit exception below rather than raising a warning first.
	$head = Zest::getElementsByTagName( $dom, 'head' )[ 0 ] ?? null;
	if ( !$head ) {
		throw new Exception( 'Parsed DOM did not contain a <head> tag' );
	}

	$nodes = $this->findUniqueTags( $head, [ 'script', 'template', 'style' ] );

	// <script> and <template> are mandatory; <style> is not.
	foreach ( [ 'script', 'template' ] as $requiredTag ) {
		if ( !isset( $nodes[ $requiredTag ] ) ) {
			throw new Exception( "No <$requiredTag> tag found" );
		}
	}

	$this->validateAttributes( $nodes['script'], [] );
	$this->validateAttributes( $nodes['template'], [] );

	$styleData = null;
	if ( isset( $nodes['style'] ) ) {
		$this->validateAttributes( $nodes['style'], [ 'lang' ] );
		$styleData = $this->getStyleAndLang( $nodes['style'] );
	}

	$template = $this->getTemplateHtml( $html, $options['minifyTemplate'] ?? false );

	return [
		'script' => trim( $nodes['script']->nodeValue ?? '' ),
		'template' => $template,
		'style' => $styleData['style'] ?? null,
		'styleLang' => $styleData['lang'] ?? null
	];
}
/**
 * Parse an HTML string into a DOM tree using the Remex tokenizer and tree builder.
 *
 * Both the tokenizer and the tree builder are created with 'ignoreErrors' enabled, and
 * the DOM builder with 'suppressHtmlNamespace' enabled.
 *
 * @param string $html HTML source to parse
 * @return DOMDocument The fragment produced by the DOM builder
 */
private function parseHTML( $html ): DOMDocument {
	$lenient = [ 'ignoreErrors' => true ];

	$domBuilder = new DOMBuilder( [ 'suppressHtmlNamespace' => true ] );
	$dispatcher = new Dispatcher( new TreeBuilder( $domBuilder, $lenient ) );

	$tokenizer = new Tokenizer( $dispatcher, $html, $lenient );
	$tokenizer->execute();

	return $domBuilder->getFragment();
}
/**
 * Collect the direct children of a node whose tag name is in a given list, requiring
 * that each tag name occurs at most once.
 *
 * Node names are lowercased before being compared, so $tagNames should be lowercase.
 *
 * @param DOMNode $rootNode Node whose direct children are inspected
 * @param string[] $tagNames Tag names to look for
 * @return DOMNode[] Matching nodes, keyed by lowercased tag name; tags that were not
 *  found have no entry
 * @throws Exception If one of the tags occurs more than once
 */
private function findUniqueTags( DOMNode $rootNode, array $tagNames ): array {
	$nodes = [];
	foreach ( $rootNode->childNodes as $node ) {
		$tagName = strtolower( $node->nodeName );
		// Strict comparison: tag names are strings and must match exactly.
		if ( !in_array( $tagName, $tagNames, true ) ) {
			continue;
		}
		if ( isset( $nodes[ $tagName ] ) ) {
			throw new Exception( "More than one <$tagName> tag found" );
		}
		$nodes[ $tagName ] = $node;
	}
	return $nodes;
}
/**
 * Check that a node only carries attributes from an allow-list.
 *
 * @param DOMNode $node Node whose attributes are checked
 * @param string[] $allowedAttributes Permitted attribute names; an empty array means
 *  the node may not have any attributes at all
 * @throws Exception If the node has an attribute that is not allowed
 */
private function validateAttributes( DOMNode $node, array $allowedAttributes ): void {
	if ( !$allowedAttributes ) {
		// Nothing is allowed, so the presence of any attribute is an error.
		if ( $node->attributes->length > 0 ) {
			throw new Exception( "<{$node->nodeName}> may not have any attributes" );
		}
		return;
	}

	foreach ( $node->attributes as $attr ) {
		// Strict comparison: attribute names are strings and must match exactly.
		if ( !in_array( $attr->name, $allowedAttributes, true ) ) {
			throw new Exception( "<{$node->nodeName}> may not have the " .
				"{$attr->name} attribute" );
		}
	}
}
/**
 * Extract the stylesheet text and its language from a <style> element.
 *
 * The language is read from the lang attribute and defaults to 'css' when the
 * attribute is absent. Only 'css' and 'less' are accepted.
 *
 * @param DOMElement $styleNode The <style> element
 * @return array Associative array with the keys:
 *  - 'style': trimmed text content of the element
 *  - 'lang': 'css' or 'less'
 * @throws Exception If the lang attribute has any other value
 */
private function getStyleAndLang( DOMElement $styleNode ): array {
	$lang = 'css';
	if ( $styleNode->hasAttribute( 'lang' ) ) {
		$lang = $styleNode->getAttribute( 'lang' );
	}

	$isSupported = $lang === 'css' || $lang === 'less';
	if ( !$isSupported ) {
		throw new Exception( "<style lang=\"$lang\"> is invalid," .
			" lang must be \"css\" or \"less\"" );
	}

	return [
		'style' => trim( $styleNode->nodeValue ?? '' ),
		'lang' => $lang,
	];
}
/**
 * Extract the HTML of the <template> tag by re-tokenizing the component source.
 *
 * The source is tokenized as a fragment in a 'template' context. Token events pass
 * through the dispatcher from newFilteringDispatcher() and are serialized back to HTML
 * with the formatter from newTemplateFormatter().
 *
 * @param string $html Source of the single-file component
 * @param bool $minify Passed to newTemplateFormatter()
 * @return string Trimmed serializer output
 */
private function getTemplateHtml( $html, $minify ) {
	$lenient = [ 'ignoreErrors' => true ];

	$serializer = new Serializer( $this->newTemplateFormatter( $minify ) );
	$dispatcher = $this->newFilteringDispatcher(
		new TreeBuilder( $serializer, $lenient ),
		'template'
	);

	$tokenizer = new Tokenizer( $dispatcher, $html, $lenient );
	$tokenizer->execute( [
		'fragmentNamespace' => HTMLData::NS_HTML,
		'fragmentName' => 'template',
	] );

	return trim( $serializer->getResult() );
}
/**
 * Build the HtmlFormatter used to serialize the template, optionally minifying it.
 *
 * When $minify is truthy the formatter drops comments, collapses runs of whitespace in
 * text to a single space, and strips leading and trailing whitespace from element
 * contents. Text and contents belonging to HTML elements listed in the formatter's
 * $prefixLfElements are left alone (presumably whitespace-sensitive elements such as
 * <pre> - confirm against HtmlFormatter).
 *
 * @param bool $minify Whether to minify the serialized HTML
 * @return HtmlFormatter
 */
private function newTemplateFormatter( $minify ) {
	return new class( $minify ) extends HtmlFormatter {
		private $minify;

		public function __construct( $minify ) {
			$this->minify = $minify;
		}

		public function startDocument( $fragmentNamespace, $fragmentName ) {
			// Emit nothing at the start of the document
			return '';
		}

		public function comment( SerializerNode $parent, $text ) {
			// Comments are dropped entirely when minifying
			return $this->minify ? '' : parent::comment( $parent, $text );
		}

		public function characters( SerializerNode $parent, $text, $start, $length ) {
			$keepWhitespace = $parent->namespace === HTMLData::NS_HTML &&
				isset( $this->prefixLfElements[ $parent->name ] );
			if ( $this->minify && !$keepWhitespace ) {
				// Collapse each run of whitespace into a single space. The text is replaced
				// by the processed substring, so the offsets are reset to cover all of it.
				$text = preg_replace( '/[ \r\n\t]+/', ' ', substr( $text, $start, $length ) );
				$start = 0;
				$length = strlen( $text );
			}
			return parent::characters( $parent, $text, $start, $length );
		}

		public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
			$keepWhitespace = $node->namespace === HTMLData::NS_HTML &&
				isset( $this->prefixLfElements[ $node->name ] );
			if ( $this->minify && !$keepWhitespace ) {
				// Strip whitespace at the very start and very end of the contents
				$contents = preg_replace( '/(^[ \r\n\t]+)|([\r\n\t ]+$)/', '', $contents );
			}
			return parent::element( $parent, $node, $contents );
		}
	};
}
/**
 * Build a Dispatcher that only forwards token events occurring inside a given tag.
 *
 * The dispatcher tracks how deeply it is nested inside tags named $nodeName and passes
 * start tags, end tags, text and comments on to the tree builder only while that depth
 * is non-zero. The outermost opening and closing tag themselves are not forwarded.
 * A second, non-nested occurrence of the tag causes an Exception to be thrown.
 *
 * @param TreeBuilder $treeBuilder Tree builder that receives the forwarded events
 * @param string $nodeName Name of the tag whose contents should be let through
 * @return Dispatcher
 */
private function newFilteringDispatcher( TreeBuilder $treeBuilder, $nodeName ) {
	return new class( $treeBuilder, $nodeName ) extends Dispatcher {
		private $nodeName;
		private $nodeDepth = 0;
		private $seenTag = false;

		public function __construct( TreeBuilder $treeBuilder, $nodeName ) {
			$this->nodeName = $nodeName;
			parent::__construct( $treeBuilder );
		}

		public function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
			// Forward first, using the depth from before this tag is counted, so that the
			// outermost opening tag is not passed on.
			if ( $this->nodeDepth ) {
				parent::startTag( $name, $attrs, $selfClose, $sourceStart, $sourceLength );
			}

			if ( $name !== $this->nodeName ) {
				return;
			}
			if ( $this->seenTag && $this->nodeDepth === 0 ) {
				// A second opening tag that is not nested inside the first one
				throw new Exception( "More than one <{$this->nodeName}> tag found" );
			}
			$this->nodeDepth++;
			$this->seenTag = true;
		}

		public function endTag( $name, $sourceStart, $sourceLength ) {
			// Leave the tag before deciding whether to forward, so that the outermost
			// closing tag is not passed on.
			if ( $name === $this->nodeName ) {
				$this->nodeDepth--;
			}
			if ( $this->nodeDepth ) {
				parent::endTag( $name, $sourceStart, $sourceLength );
			}
		}

		public function characters( $text, $start, $length, $sourceStart, $sourceLength ) {
			if ( !$this->nodeDepth ) {
				return;
			}
			parent::characters( $text, $start, $length, $sourceStart, $sourceLength );
		}

		public function comment( $text, $sourceStart, $sourceLength ) {
			if ( !$this->nodeDepth ) {
				return;
			}
			parent::comment( $text, $sourceStart, $sourceLength );
		}
	};
}