Code Coverage for /workspace/src/includes/ResourceLoader/VueComponentParser.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	98.00% covered (success)	98.00%	98 / 100	75.00% covered (warning)	75.00%	6 / 8	CRAP	0.00% covered (danger)	0.00%	0 / 1
VueComponentParser	98.00% covered (success)	98.00%	98 / 100	75.00% covered (warning)	75.00%	6 / 8	41	0.00% covered (danger)	0.00%	0 / 1
parse	95.00% covered (success)	95.00%	19 / 20	0.00% covered (danger)	0.00%	0 / 1	8
parseHTML	100.00% covered (success)	100.00%	5 / 5	100.00% covered (success)	100.00%	1 / 1	1
findUniqueTags	100.00% covered (success)	100.00%	8 / 8	100.00% covered (success)	100.00%	1 / 1	4
validateAttributes	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	5
getStyleAndLang	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	4
getTemplateHtml	100.00% covered (success)	100.00%	10 / 10	100.00% covered (success)	100.00%	1 / 1	1
newTemplateFormatter	100.00% covered (success)	100.00%	21 / 21	100.00% covered (success)	100.00%	1 / 1	9
newFilteringDispatcher	94.74% covered (success)	94.74%	18 / 19	0.00% covered (danger)	0.00%	0 / 1	9.01

1	<?php
2	/**
3	* This program is free software; you can redistribute it and/or modify
4	* it under the terms of the GNU General Public License as published by
5	* the Free Software Foundation; either version 2 of the License, or
6	* (at your option) any later version.
7	*
8	* This program is distributed in the hope that it will be useful,
9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11	* GNU General Public License for more details.
12	*
13	* You should have received a copy of the GNU General Public License along
14	* with this program; if not, write to the Free Software Foundation, Inc.,
15	* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16	* http://www.gnu.org/copyleft/gpl.html
17	*
18	* @file
19	* @author Roan Kattouw
20	*/
21
22	namespace MediaWiki\ResourceLoader;
23
24	use DOMDocument;
25	use DOMElement;
26	use DOMNode;
27	use InvalidArgumentException;
28	use Wikimedia\RemexHtml\DOM\DOMBuilder;
29	use Wikimedia\RemexHtml\HTMLData;
30	use Wikimedia\RemexHtml\Serializer\HtmlFormatter;
31	use Wikimedia\RemexHtml\Serializer\Serializer;
32	use Wikimedia\RemexHtml\Serializer\SerializerNode;
33	use Wikimedia\RemexHtml\Tokenizer\Attributes;
34	use Wikimedia\RemexHtml\Tokenizer\Tokenizer;
35	use Wikimedia\RemexHtml\TreeBuilder\Dispatcher;
36	use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder;
37	use Wikimedia\Zest\Zest;
38
/**
 * Parser for Vue single file components (.vue files). See parse() for usage.
 *
 * The file is parsed twice: once into a DOM (to locate and validate the top-level
 * <script>, <template> and <style> tags), and once through a Remex tokenize/serialize
 * pipeline that extracts the template HTML without going through DOMDocument.
 *
 * @ingroup ResourceLoader
 * @internal For use within FileModule.
 */
class VueComponentParser {
	/**
	 * Parse a Vue single file component, and extract the script, template and style parts.
	 *
	 * Returns an associative array with the following keys:
	 * - 'script': The JS code in the <script> tag
	 * - 'template': The HTML in the <template> tag
	 * - 'style': The CSS/LESS styles in the <style> tag, or null if the <style> tag was missing
	 * - 'styleLang': The language used for 'style'; either 'css' or 'less', or null if no <style> tag
	 *
	 * The following options can be passed in the $options parameter:
	 * - 'minifyTemplate': Whether to minify the HTML in the template tag. This removes
	 *   HTML comments and strips whitespace. Default: false
	 *
	 * @param string $html HTML with <script>, <template> and <style> tags at the top level
	 * @param array $options Associative array of options
	 * @return array
	 * @throws InvalidArgumentException If the input is invalid
	 */
	public function parse( string $html, array $options = [] ): array {
		$dom = $this->parseHTML( $html );
		// Remex wraps everything in <html><head>, unwrap that. The ?? guard makes a missing
		// <head> reach the exception below instead of raising an undefined-offset warning first.
		$head = Zest::getElementsByTagName( $dom, 'head' )[ 0 ] ?? null;
		if ( !$head ) {
			throw new InvalidArgumentException( 'Parsed DOM did not contain a <head> tag' );
		}

		// Find the <script>, <template> and <style> tags. They can appear in any order, but they
		// must be at the top level, and there can only be one of each.
		$nodes = $this->findUniqueTags( $head, [ 'script', 'template', 'style' ] );

		// Throw an error if we didn't find a <script> or <template> tag. <style> is optional.
		foreach ( [ 'script', 'template' ] as $requiredTag ) {
			if ( !isset( $nodes[ $requiredTag ] ) ) {
				throw new InvalidArgumentException( "No <$requiredTag> tag found" );
			}
		}

		// <script> and <template> may not carry attributes; <style> may only carry lang.
		$this->validateAttributes( $nodes['script'], [] );
		$this->validateAttributes( $nodes['template'], [] );
		$styleData = null;
		if ( isset( $nodes['style'] ) ) {
			$this->validateAttributes( $nodes['style'], [ 'lang' ] );
			$styleData = $this->getStyleAndLang( $nodes['style'] );
		}

		// The template is extracted from the raw input rather than from the DOM node,
		// see getTemplateHtml() for why.
		$template = $this->getTemplateHtml( $html, $options['minifyTemplate'] ?? false );

		return [
			'script' => trim( $nodes['script']->nodeValue ?? '' ),
			'template' => $template,
			'style' => $styleData ? $styleData['style'] : null,
			'styleLang' => $styleData ? $styleData['lang'] : null
		];
	}

	/**
	 * Parse HTML to DOM using RemexHtml
	 * @param string $html
	 * @return DOMDocument
	 */
	private function parseHTML( $html ): DOMDocument {
		$domBuilder = new DOMBuilder( [ 'suppressHtmlNamespace' => true ] );
		$treeBuilder = new TreeBuilder( $domBuilder, [ 'ignoreErrors' => true ] );
		$tokenizer = new Tokenizer( new Dispatcher( $treeBuilder ), $html, [ 'ignoreErrors' => true ] );
		$tokenizer->execute();
		// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
		return $domBuilder->getFragment();
	}

	/**
	 * Find occurrences of specified tags in a DOM node, expecting at most one occurrence of each.
	 * This method only looks at the top-level children of $rootNode, it doesn't descend into them.
	 *
	 * @param DOMNode $rootNode Node whose children to look at
	 * @param string[] $tagNames Tag names to look for (must be all lowercase)
	 * @return DOMElement[] Associative arrays whose keys are tag names and values are DOM nodes
	 * @throws InvalidArgumentException If one of the tags occurs more than once
	 */
	private function findUniqueTags( DOMNode $rootNode, array $tagNames ): array {
		$nodes = [];
		foreach ( $rootNode->childNodes as $node ) {
			// Node names are lowercased so the comparison against $tagNames is case-insensitive
			$tagName = strtolower( $node->nodeName );
			if ( !in_array( $tagName, $tagNames, true ) ) {
				continue;
			}
			if ( isset( $nodes[ $tagName ] ) ) {
				throw new InvalidArgumentException( "More than one <$tagName> tag found" );
			}
			$nodes[ $tagName ] = $node;
		}
		return $nodes;
	}

	/**
	 * Verify that a given node only has a given set of attributes, and no others.
	 * @param DOMNode $node Node to check
	 * @param array $allowedAttributes Attributes the node is allowed to have
	 * @throws InvalidArgumentException If the node has an attribute it's not allowed to have
	 */
	private function validateAttributes( DOMNode $node, array $allowedAttributes ): void {
		$attributes = $node->attributes;
		// DOMNode::$attributes is null for non-element nodes; those have nothing to validate
		if ( $attributes === null || $attributes->length === 0 ) {
			return;
		}

		if ( !$allowedAttributes ) {
			throw new InvalidArgumentException( "<{$node->nodeName}> may not have any attributes" );
		}

		foreach ( $attributes as $attr ) {
			if ( !in_array( $attr->name, $allowedAttributes, true ) ) {
				throw new InvalidArgumentException( "<{$node->nodeName}> may not have the " .
					"{$attr->name} attribute" );
			}
		}
	}

	/**
	 * Get the contents and language of the <style> tag. The language can be 'css' or 'less',
	 * and defaults to 'css' when the tag has no lang attribute.
	 * @param DOMElement $styleNode The <style> tag.
	 * @return array [ 'style' => string, 'lang' => string ]
	 * @throws InvalidArgumentException If an invalid language is used
	 */
	private function getStyleAndLang( DOMElement $styleNode ): array {
		$style = trim( $styleNode->nodeValue ?? '' );
		$styleLang = $styleNode->hasAttribute( 'lang' ) ?
			$styleNode->getAttribute( 'lang' ) : 'css';
		if ( $styleLang !== 'css' && $styleLang !== 'less' ) {
			throw new InvalidArgumentException( "<style lang=\"$styleLang\"> is invalid," .
				" lang must be \"css\" or \"less\"" );
		}
		return [
			'style' => $style,
			'lang' => $styleLang,
		];
	}

	/**
	 * Get the HTML contents of the <template> tag, optionally minified.
	 *
	 * To work around a bug in PHP's DOMDocument where attributes like @click get mangled,
	 * we re-parse the entire file using a Remex parse+serialize pipeline, with a custom dispatcher
	 * to zoom in on just the contents of the <template> tag, and a custom formatter for minification.
	 * Keeping everything in Remex and never converting it to DOM avoids the attribute mangling issue.
	 *
	 * @param string $html HTML that contains a <template> tag somewhere
	 * @param bool $minify Whether to minify the output (remove comments, strip whitespace)
	 * @return string HTML contents of the template tag
	 * @throws InvalidArgumentException If a second, non-nested <template> tag is encountered
	 */
	private function getTemplateHtml( $html, $minify ): string {
		$serializer = new Serializer( $this->newTemplateFormatter( $minify ) );
		$tokenizer = new Tokenizer(
			$this->newFilteringDispatcher(
				new TreeBuilder( $serializer, [ 'ignoreErrors' => true ] ),
				'template'
			),
			$html, [ 'ignoreErrors' => true ]
		);
		$tokenizer->execute( [ 'fragmentNamespace' => HTMLData::NS_HTML, 'fragmentName' => 'template' ] );
		return trim( $serializer->getResult() );
	}

	/**
	 * Custom HtmlFormatter subclass that optionally removes comments and strips whitespace.
	 * If $minify=false, this formatter falls through to HtmlFormatter for everything (except that
	 * it strips the <!doctype html> tag).
	 *
	 * @param bool $minify If true, remove comments and strip whitespace
	 * @return HtmlFormatter
	 */
	private function newTemplateFormatter( $minify ): HtmlFormatter {
		return new class( $minify ) extends HtmlFormatter {
			/** @var bool Whether comments are dropped and whitespace is collapsed */
			private $minify;

			public function __construct( $minify ) {
				// NOTE(review): the parent constructor is deliberately left uncalled here, as in
				// the original; confirm HtmlFormatter needs no option-based initialisation.
				$this->minify = $minify;
			}

			public function startDocument( $fragmentNamespace, $fragmentName ) {
				// Remove <!doctype html>
				return '';
			}

			public function comment( SerializerNode $parent, $text ) {
				if ( $this->minify ) {
					// Remove all comments
					return '';
				}
				return parent::comment( $parent, $text );
			}

			public function characters( SerializerNode $parent, $text, $start, $length ) {
				if (
					$this->minify && (
						// Don't touch <pre>/<listing>/<textarea> nodes
						$parent->namespace !== HTMLData::NS_HTML ||
						!isset( $this->prefixLfElements[ $parent->name ] )
					)
				) {
					$text = substr( $text, $start, $length );
					// Collapse runs of adjacent whitespace, and convert all whitespace to spaces
					$text = preg_replace( '/[ \r\n\t]+/', ' ', $text );
					// $text now holds only the relevant slice, so the offsets must be reset
					$start = 0;
					$length = strlen( $text );
				}
				return parent::characters( $parent, $text, $start, $length );
			}

			public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
				if (
					$this->minify && (
						// Don't touch <pre>/<listing>/<textarea> nodes
						$node->namespace !== HTMLData::NS_HTML ||
						!isset( $this->prefixLfElements[ $node->name ] )
					) &&
					$contents !== null
				) {
					// Remove leading and trailing whitespace
					$contents = preg_replace( '/(^[ \r\n\t]+)|([\r\n\t ]+$)/', '', $contents );
				}
				return parent::element( $parent, $node, $contents );
			}
		};
	}

	/**
	 * Custom Dispatcher subclass that only dispatches tree events inside a tag with a certain name.
	 * This effectively filters the tree to only the contents of that tag.
	 *
	 * @param TreeBuilder $treeBuilder
	 * @param string $nodeName Tag name to filter for
	 * @return Dispatcher
	 */
	private function newFilteringDispatcher( TreeBuilder $treeBuilder, $nodeName ): Dispatcher {
		return new class( $treeBuilder, $nodeName ) extends Dispatcher {
			/** @var string Tag name whose contents are passed through */
			private $nodeName;
			/** @var int How many open $nodeName tags we are currently nested in */
			private $nodeDepth = 0;
			/** @var bool Whether an opening $nodeName tag has been seen so far */
			private $seenTag = false;

			public function __construct( TreeBuilder $treeBuilder, $nodeName ) {
				$this->nodeName = $nodeName;
				parent::__construct( $treeBuilder );
			}

			public function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
				// Forward before updating the depth, so the outermost opening tag itself is
				// not dispatched but nested tags of the same name are.
				if ( $this->nodeDepth ) {
					parent::startTag( $name, $attrs, $selfClose, $sourceStart, $sourceLength );
				}

				if ( $name === $this->nodeName ) {
					if ( $this->nodeDepth === 0 && $this->seenTag ) {
						// This is the second opening tag, not nested in the first one
						throw new InvalidArgumentException( "More than one <{$this->nodeName}> tag found" );
					}
					$this->nodeDepth++;
					$this->seenTag = true;
				}
			}

			public function endTag( $name, $sourceStart, $sourceLength ) {
				// Only count closing tags that match an opening tag we have seen. A stray
				// closing tag at depth 0 must not push the depth negative, otherwise the
				// following opening tag would only bring it back to 0 and its contents
				// would be dropped.
				if ( $name === $this->nodeName && $this->nodeDepth > 0 ) {
					$this->nodeDepth--;
				}
				// Forward after updating the depth, so the outermost closing tag is not dispatched
				if ( $this->nodeDepth ) {
					parent::endTag( $name, $sourceStart, $sourceLength );
				}
			}

			public function characters( $text, $start, $length, $sourceStart, $sourceLength ) {
				if ( $this->nodeDepth ) {
					parent::characters( $text, $start, $length, $sourceStart, $sourceLength );
				}
			}

			public function comment( $text, $sourceStart, $sourceLength ) {
				if ( $this->nodeDepth ) {
					parent::comment( $text, $sourceStart, $sourceLength );
				}
			}
		};
	}
}