MediaWiki  master
VueComponentParser.php
Go to the documentation of this file.
1 <?php
22 use Wikimedia\RemexHtml\DOM\DOMBuilder;
23 use Wikimedia\RemexHtml\HTMLData;
24 use Wikimedia\RemexHtml\Serializer\HtmlFormatter;
25 use Wikimedia\RemexHtml\Serializer\Serializer;
26 use Wikimedia\RemexHtml\Serializer\SerializerNode;
27 use Wikimedia\RemexHtml\Tokenizer\Attributes;
28 use Wikimedia\RemexHtml\Tokenizer\Tokenizer;
29 use Wikimedia\RemexHtml\TreeBuilder\Dispatcher;
30 use Wikimedia\RemexHtml\TreeBuilder\Element;
31 use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder;
32 
/**
 * Parse a Vue single file component, and extract the script, template and style parts.
 *
 * @param string $html Contents of the single file component
 * @param array $options Recognized key: 'minifyTemplate', forwarded to getTemplateHtml()
 *  (treated as false when not set)
 * @return array Associative array with the keys 'script', 'template', 'style' and 'styleLang'.
 *  'style' and 'styleLang' are null when the component has no <style> tag.
 * @throws Exception If a required tag is missing, or if any of the validation steps fail
 */
public function parse( string $html, array $options = [] ): array {
	// Remex wraps everything in <html><head>, so the component's tags are looked up under <head>
	$headNode = $this->parseHTML( $html )->getElementsByTagName( 'head' )->item( 0 );
	if ( !$headNode ) {
		throw new Exception( 'Parsed DOM did not contain a <head> tag' );
	}

	// The <script>, <template> and <style> tags may come in any order, but each one must be
	// a direct child of <head> and may occur at most once.
	$tags = $this->findUniqueTags( $headNode, [ 'script', 'template', 'style' ] );

	// <script> and <template> are mandatory; <style> is optional.
	foreach ( [ 'script', 'template' ] as $requiredTag ) {
		if ( isset( $tags[ $requiredTag ] ) ) {
			continue;
		}
		throw new Exception( "No <$requiredTag> tag found" );
	}
	$styleNode = $tags['style'] ?? null;

	$this->validateAttributes( $tags['script'], [] );
	$this->validateAttributes( $tags['template'], [] );
	if ( $styleNode !== null ) {
		$this->validateAttributes( $styleNode, [ 'lang' ] );
	}
	$this->validateTemplateTag( $tags['template'] );

	$styleInfo = $styleNode !== null ? $this->getStyleAndLang( $styleNode ) : null;
	$templateHtml = $this->getTemplateHtml( $html, $options['minifyTemplate'] ?? false );

	return [
		'script' => trim( $tags['script']->nodeValue ),
		'template' => $templateHtml,
		'style' => $styleInfo['style'] ?? null,
		'styleLang' => $styleInfo['lang'] ?? null
	];
}
94 
/**
 * Parse HTML to DOM using RemexHtml.
 *
 * Both the tree builder and the tokenizer are created with 'ignoreErrors' enabled, and the
 * DOM builder with 'suppressHtmlNamespace' enabled.
 *
 * @param string $html
 * @return DOMDocument
 */
private function parseHTML( $html ): DOMDocument {
	$builder = new DOMBuilder( [ 'suppressHtmlNamespace' => true ] );
	$dispatcher = new Dispatcher( new TreeBuilder( $builder, [ 'ignoreErrors' => true ] ) );
	( new Tokenizer( $dispatcher, $html, [ 'ignoreErrors' => true ] ) )->execute();
	// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
	return $builder->getFragment();
}
108 
/**
 * Find occurrences of specified tags in a DOM node, expecting at most one occurrence of each.
 *
 * Only direct children of $rootNode are inspected, and their node names are lowercased
 * before being compared against $tagNames.
 *
 * @param DOMNode $rootNode Node whose children are searched
 * @param string[] $tagNames Tag names to look for
 * @return DOMNode[] Matching child nodes, keyed by lowercased tag name. Tags that were not
 *  found have no entry.
 * @throws Exception If one of the tags occurs more than once
 */
private function findUniqueTags( DOMNode $rootNode, array $tagNames ): array {
	$found = [];
	foreach ( $rootNode->childNodes as $child ) {
		$lowerName = strtolower( $child->nodeName );
		if ( !in_array( $lowerName, $tagNames ) ) {
			// Not one of the tags we were asked to collect
			continue;
		}
		if ( isset( $found[ $lowerName ] ) ) {
			throw new Exception( "More than one <$lowerName> tag found" );
		}
		$found[ $lowerName ] = $child;
	}
	return $found;
}
130 
/**
 * Verify that a given node only has a given set of attributes, and no others.
 *
 * @param DOMNode $node Node to check
 * @param string[] $allowedAttributes Attribute names the node may have. When empty, the
 *  node may not have any attributes at all.
 * @throws Exception If the node has an attribute that is not allowed
 */
private function validateAttributes( DOMNode $node, array $allowedAttributes ): void {
	if ( !$allowedAttributes ) {
		// Nothing is allowed, so the mere presence of an attribute is an error
		if ( $node->attributes->length > 0 ) {
			throw new Exception( "<{$node->nodeName}> may not have any attributes" );
		}
		return;
	}

	foreach ( $node->attributes as $attribute ) {
		if ( in_array( $attribute->name, $allowedAttributes ) ) {
			continue;
		}
		throw new Exception( "<{$node->nodeName}> may not have the " .
			"{$attribute->name} attribute" );
	}
}
149 
/**
 * Check that the <template> tag has exactly one element child.
 *
 * Whitespace-only text nodes and comment nodes are tolerated next to the single element;
 * any other kind of child node is rejected.
 *
 * @param DOMNode $templateNode The <template> node
 * @throws Exception If the contents of the <template> tag are not acceptable
 */
private function validateTemplateTag( DOMNode $templateNode ): void {
	// Counting children with ->childNodes->length doesn't work here, because whitespace
	// shows up as text nodes and comments are allowed too. Count the element children instead.
	$elementCount = 0;
	foreach ( $templateNode->childNodes as $child ) {
		switch ( $child->nodeType ) {
			case XML_ELEMENT_NODE:
				$elementCount++;
				if ( $elementCount > 1 ) {
					throw new Exception( '<template> tag may not have multiple child tags' );
				}
				break;

			case XML_TEXT_NODE:
				// Text is only acceptable when it consists entirely of whitespace
				if ( trim( $child->nodeValue ) !== '' ) {
					throw new Exception( '<template> tag may not contain text' );
				}
				break;

			case XML_COMMENT_NODE:
				// Comments are fine
				break;

			default:
				throw new Exception( "<template> tag may only contain element and comment nodes, " .
					" found node of type {$child->nodeType}" );
		}
	}

	if ( $elementCount === 0 ) {
		throw new Exception( '<template> tag may not be empty' );
	}
}
185 
/**
 * Get the contents and language of the <style> tag.
 *
 * The language is taken from the lang attribute and falls back to 'css' when the
 * attribute is absent. Only 'css' and 'less' are accepted.
 *
 * @param DOMElement $styleNode The <style> element
 * @return array Associative array with the keys 'style' (trimmed contents of the tag)
 *  and 'lang' ('css' or 'less')
 * @throws Exception If the lang attribute has any other value
 */
private function getStyleAndLang( DOMElement $styleNode ): array {
	$lang = 'css';
	if ( $styleNode->hasAttribute( 'lang' ) ) {
		$lang = $styleNode->getAttribute( 'lang' );
	}

	if ( !in_array( $lang, [ 'css', 'less' ], true ) ) {
		throw new Exception( "<style lang=\"$lang\"> is invalid," .
			" lang must be \"css\" or \"less\"" );
	}

	return [
		'style' => trim( $styleNode->nodeValue ),
		'lang' => $lang,
	];
}
205 
/**
 * Get the HTML contents of the <template> tag, optionally minified.
 *
 * The source is tokenized again rather than read from a DOM: the tokens are passed through
 * a filtering dispatcher for the 'template' tag and serialized with the formatter returned
 * by newTemplateFormatter().
 *
 * @param string $html Contents of the single file component
 * @param bool $minify Passed on to newTemplateFormatter()
 * @return string Serialized result, with leading and trailing whitespace trimmed
 */
private function getTemplateHtml( $html, $minify ) {
	$formatter = $this->newTemplateFormatter( $minify );
	$serializer = new Serializer( $formatter );
	$treeBuilder = new TreeBuilder( $serializer, [ 'ignoreErrors' => true ] );
	$dispatcher = $this->newFilteringDispatcher( $treeBuilder, 'template' );

	$tokenizer = new Tokenizer( $dispatcher, $html, [ 'ignoreErrors' => true ] );
	$tokenizer->execute( [
		'fragmentNamespace' => HTMLData::NS_HTML,
		'fragmentName' => 'template'
	] );

	return trim( $serializer->getResult() );
}
230 
/**
 * Custom HtmlFormatter subclass that optionally removes comments and strips whitespace.
 *
 * The returned formatter never emits a doctype. When $minify is true it additionally:
 * - drops all comments;
 * - collapses every run of spaces, tabs, CRs and LFs in text into a single space;
 * - removes leading and trailing whitespace from the contents of each element.
 * The two whitespace transformations are skipped for HTML elements listed in
 * HtmlFormatter's $prefixLfElements (<pre>/<listing>/<textarea>).
 *
 * @param bool $minify Whether the formatter should minify its output
 * @return HtmlFormatter
 */
private function newTemplateFormatter( $minify ) {
	return new class( $minify ) extends HtmlFormatter {
		private $minify;

		public function __construct( $minify ) {
			$this->minify = $minify;
		}

		/**
		 * Whether whitespace belonging to the given node may be altered.
		 *
		 * @param SerializerNode $node
		 * @return bool False when not minifying, or for <pre>/<listing>/<textarea> nodes
		 */
		private function mayAlterWhitespace( SerializerNode $node ): bool {
			return $this->minify && (
				$node->namespace !== HTMLData::NS_HTML ||
				!isset( $this->prefixLfElements[ $node->name ] )
			);
		}

		public function startDocument( $fragmentNamespace, $fragmentName ) {
			// Remove <!doctype html>
			return '';
		}

		public function comment( SerializerNode $parent, $text ) {
			if ( $this->minify ) {
				// Remove all comments
				return '';
			}
			return parent::comment( $parent, $text );
		}

		public function characters( SerializerNode $parent, $text, $start, $length ) {
			if ( $this->mayAlterWhitespace( $parent ) ) {
				$text = substr( $text, $start, $length );
				// Collapse runs of adjacent whitespace, and convert all whitespace to spaces
				$text = preg_replace( '/[ \r\n\t]+/', ' ', $text );
				// $text now holds only the relevant substring, so re-base the offsets
				$start = 0;
				$length = strlen( $text );
			}
			return parent::characters( $parent, $text, $start, $length );
		}

		public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
			// RemexHtml documents $contents as null for void elements. There is nothing to trim
			// in that case, and passing null to preg_replace() is deprecated since PHP 8.1,
			// so leave null untouched.
			if ( $contents !== null && $this->mayAlterWhitespace( $node ) ) {
				// Remove leading and trailing whitespace
				$contents = preg_replace( '/(^[ \r\n\t]+)|([\r\n\t ]+$)/', '', $contents );
			}
			return parent::element( $parent, $node, $contents );
		}
	};
}
292 
/**
 * Custom Dispatcher subclass that only dispatches tree events inside a tag with a certain name.
 *
 * Start tags, end tags, character data and comments are forwarded to the parent Dispatcher
 * only while the tokenizer is inside a $nodeName tag. The outermost opening and closing
 * $nodeName tags themselves are not forwarded; nested $nodeName tags are.
 *
 * @param TreeBuilder $treeBuilder Tree builder that receives the forwarded events
 * @param string $nodeName Tag name to filter on
 * @return Dispatcher
 */
private function newFilteringDispatcher( TreeBuilder $treeBuilder, $nodeName ) {
	return new class( $treeBuilder, $nodeName ) extends Dispatcher {
		/** @var string Name of the tag whose contents are forwarded */
		private $nodeName;
		/** @var int Number of currently open $nodeName tags; 0 means "outside" */
		private $nodeDepth = 0;
		/** @var bool Whether an opening $nodeName tag has been encountered yet */
		private $seenTag = false;

		public function __construct( TreeBuilder $treeBuilder, $nodeName ) {
			$this->nodeName = $nodeName;
			parent::__construct( $treeBuilder );
		}

		public function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
			// Forward before updating the depth, so that the outermost opening tag is skipped
			if ( $this->nodeDepth ) {
				parent::startTag( $name, $attrs, $selfClose, $sourceStart, $sourceLength );
			}

			if ( $name === $this->nodeName ) {
				if ( $this->nodeDepth === 0 && $this->seenTag ) {
					// This is the second opening tag, not nested in the first one
					throw new Exception( "More than one <{$this->nodeName}> tag found" );
				}
				$this->nodeDepth++;
				$this->seenTag = true;
			}
		}

		public function endTag( $name, $sourceStart, $sourceLength ) {
			// Only count closing tags that match an open tag. A stray closing tag at depth 0
			// must not push the depth below zero: a negative depth is truthy, which would
			// cause content outside the tag to be forwarded, and the next real opening tag
			// would merely bring the depth back to 0 so that its contents would be dropped.
			if ( $name === $this->nodeName && $this->nodeDepth > 0 ) {
				$this->nodeDepth--;
			}
			// Forward after updating the depth, so that the outermost closing tag is skipped
			if ( $this->nodeDepth ) {
				parent::endTag( $name, $sourceStart, $sourceLength );
			}
		}

		public function characters( $text, $start, $length, $sourceStart, $sourceLength ) {
			if ( $this->nodeDepth ) {
				parent::characters( $text, $start, $length, $sourceStart, $sourceLength );
			}
		}

		public function comment( $text, $sourceStart, $sourceLength ) {
			if ( $this->nodeDepth ) {
				parent::comment( $text, $sourceStart, $sourceLength );
			}
		}
	};
}
349 }
VueComponentParser\validateTemplateTag
validateTemplateTag(DOMNode $templateNode)
Check that the <template> tag has exactly one element child.
Definition: VueComponentParser.php:159
VueComponentParser\validateAttributes
validateAttributes(DOMNode $node, array $allowedAttributes)
Verify that a given node only has a given set of attributes, and no others.
Definition: VueComponentParser.php:137
VueComponentParser\getTemplateHtml
getTemplateHtml( $html, $minify)
Get the HTML contents of the <template> tag, optionally minified.
Definition: VueComponentParser.php:218
VueComponentParser\newTemplateFormatter
newTemplateFormatter( $minify)
Custom HtmlFormatter subclass that optionally removes comments and strips whitespace.
Definition: VueComponentParser.php:239
VueComponentParser\parse
parse(string $html, array $options=[])
Parse a Vue single file component, and extract the script, template and style parts.
Definition: VueComponentParser.php:58
VueComponentParser\findUniqueTags
findUniqueTags(DOMNode $rootNode, array $tagNames)
Find occurrences of specified tags in a DOM node, expecting at most one occurrence of each.
Definition: VueComponentParser.php:117
VueComponentParser\newFilteringDispatcher
newFilteringDispatcher(TreeBuilder $treeBuilder, $nodeName)
Custom Dispatcher subclass that only dispatches tree events inside a tag with a certain name.
Definition: VueComponentParser.php:301
VueComponentParser\parseHTML
parseHTML( $html)
Parse HTML to DOM using RemexHtml.
Definition: VueComponentParser.php:100
VueComponentParser
Parser for Vue single file components (.vue files).
Definition: VueComponentParser.php:39
VueComponentParser\getStyleAndLang
getStyleAndLang(DOMElement $styleNode)
Get the contents and language of the <style> tag.
Definition: VueComponentParser.php:192