Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
98.00% |
98 / 100 |
|
75.00% |
6 / 8 |
CRAP | |
0.00% |
0 / 1 |
VueComponentParser | |
98.00% |
98 / 100 |
|
75.00% |
6 / 8 |
41 | |
0.00% |
0 / 1 |
parse | |
95.00% |
19 / 20 |
|
0.00% |
0 / 1 |
8 | |||
parseHTML | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
1 | |||
findUniqueTags | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
4 | |||
validateAttributes | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
5 | |||
getStyleAndLang | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
4 | |||
getTemplateHtml | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
newTemplateFormatter | |
100.00% |
21 / 21 |
|
100.00% |
1 / 1 |
9 | |||
newFilteringDispatcher | |
94.74% |
18 / 19 |
|
0.00% |
0 / 1 |
9.01 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | * @file |
19 | * @author Roan Kattouw |
20 | */ |
21 | |
22 | namespace MediaWiki\ResourceLoader; |
23 | |
24 | use DOMDocument; |
25 | use DOMElement; |
26 | use DOMNode; |
27 | use InvalidArgumentException; |
28 | use Wikimedia\RemexHtml\DOM\DOMBuilder; |
29 | use Wikimedia\RemexHtml\HTMLData; |
30 | use Wikimedia\RemexHtml\Serializer\HtmlFormatter; |
31 | use Wikimedia\RemexHtml\Serializer\Serializer; |
32 | use Wikimedia\RemexHtml\Serializer\SerializerNode; |
33 | use Wikimedia\RemexHtml\Tokenizer\Attributes; |
34 | use Wikimedia\RemexHtml\Tokenizer\Tokenizer; |
35 | use Wikimedia\RemexHtml\TreeBuilder\Dispatcher; |
36 | use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder; |
37 | use Wikimedia\Zest\Zest; |
38 | |
/**
 * Parser for Vue single file components (.vue files). See parse() for usage.
 *
 * @ingroup ResourceLoader
 * @internal For use within FileModule.
 */
class VueComponentParser {
	/**
	 * Parse a Vue single file component, and extract the script, template and style parts.
	 *
	 * Returns an associative array with the following keys:
	 * - 'script': The JS code in the <script> tag
	 * - 'template': The HTML in the <template> tag
	 * - 'style': The CSS/LESS styles in the <style> tag, or null if the <style> tag was missing
	 * - 'styleLang': The language used for 'style'; either 'css' or 'less', or null if no <style> tag
	 *
	 * The following options can be passed in the $options parameter:
	 * - 'minifyTemplate': Whether to minify the HTML in the template tag. This removes
	 *   HTML comments and strips whitespace. Default: false
	 *
	 * @param string $html HTML with <script>, <template> and <style> tags at the top level
	 * @param array $options Associative array of options
	 * @return array
	 * @throws InvalidArgumentException If the input is invalid
	 */
	public function parse( string $html, array $options = [] ): array {
		$dom = $this->parseHTML( $html );
		// Remex wraps everything in <html><head>, unwrap that. The null coalescing makes a
		// missing <head> reach the exception below instead of raising an undefined-offset warning.
		$head = Zest::getElementsByTagName( $dom, 'head' )[ 0 ] ?? null;
		if ( !$head ) {
			throw new InvalidArgumentException( 'Parsed DOM did not contain a <head> tag' );
		}

		// Find the <script>, <template> and <style> tags. They can appear in any order, but they
		// must be at the top level, and there can only be one of each.
		$nodes = $this->findUniqueTags( $head, [ 'script', 'template', 'style' ] );

		// Throw an error if we didn't find a <script> or <template> tag. <style> is optional.
		foreach ( [ 'script', 'template' ] as $requiredTag ) {
			if ( !isset( $nodes[ $requiredTag ] ) ) {
				throw new InvalidArgumentException( "No <$requiredTag> tag found" );
			}
		}

		// <script> and <template> may not carry attributes; <style> may only carry 'lang'.
		$this->validateAttributes( $nodes['script'], [] );
		$this->validateAttributes( $nodes['template'], [] );
		$styleData = null;
		if ( isset( $nodes['style'] ) ) {
			$this->validateAttributes( $nodes['style'], [ 'lang' ] );
			$styleData = $this->getStyleAndLang( $nodes['style'] );
		}

		// The option was historically evaluated for truthiness only, so normalize it to a bool
		// before handing it to the typed helpers.
		$minify = (bool)( $options['minifyTemplate'] ?? false );
		// The template is extracted from the raw input rather than from $nodes['template'];
		// see getTemplateHtml() for why.
		$template = $this->getTemplateHtml( $html, $minify );

		return [
			'script' => trim( $nodes['script']->nodeValue ?? '' ),
			'template' => $template,
			'style' => $styleData ? $styleData['style'] : null,
			'styleLang' => $styleData ? $styleData['lang'] : null
		];
	}

	/**
	 * Parse HTML to DOM using RemexHtml
	 * @param string $html
	 * @return DOMDocument
	 */
	private function parseHTML( string $html ): DOMDocument {
		$domBuilder = new DOMBuilder( [ 'suppressHtmlNamespace' => true ] );
		$treeBuilder = new TreeBuilder( $domBuilder, [ 'ignoreErrors' => true ] );
		$tokenizer = new Tokenizer( new Dispatcher( $treeBuilder ), $html, [ 'ignoreErrors' => true ] );
		$tokenizer->execute();
		// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
		return $domBuilder->getFragment();
	}

	/**
	 * Find occurrences of specified tags in a DOM node, expecting at most one occurrence of each.
	 * This method only looks at the top-level children of $rootNode, it doesn't descend into them.
	 *
	 * @param DOMNode $rootNode Node whose children to look at
	 * @param string[] $tagNames Tag names to look for (must be all lowercase)
	 * @return DOMElement[] Associative arrays whose keys are tag names and values are DOM nodes
	 * @throws InvalidArgumentException If one of the tags occurs more than once
	 */
	private function findUniqueTags( DOMNode $rootNode, array $tagNames ): array {
		$nodes = [];
		foreach ( $rootNode->childNodes as $node ) {
			// Node names are lowercased before matching, hence the lowercase requirement on $tagNames
			$tagName = strtolower( $node->nodeName );
			if ( !in_array( $tagName, $tagNames, true ) ) {
				continue;
			}
			if ( isset( $nodes[ $tagName ] ) ) {
				throw new InvalidArgumentException( "More than one <$tagName> tag found" );
			}
			$nodes[ $tagName ] = $node;
		}
		return $nodes;
	}

	/**
	 * Verify that a given node only has a given set of attributes, and no others.
	 * @param DOMNode $node Node to check
	 * @param array $allowedAttributes Attributes the node is allowed to have
	 * @throws InvalidArgumentException If the node has an attribute it's not allowed to have
	 */
	private function validateAttributes( DOMNode $node, array $allowedAttributes ): void {
		if ( !$allowedAttributes ) {
			// No attributes are allowed at all; report that with a dedicated message
			if ( $node->attributes->length > 0 ) {
				throw new InvalidArgumentException( "<{$node->nodeName}> may not have any attributes" );
			}
			return;
		}

		foreach ( $node->attributes as $attr ) {
			if ( !in_array( $attr->name, $allowedAttributes, true ) ) {
				throw new InvalidArgumentException(
					"<{$node->nodeName}> may not have the {$attr->name} attribute"
				);
			}
		}
	}

	/**
	 * Get the contents and language of the <style> tag. The language can be 'css' or 'less'.
	 *
	 * This method only checks the value of the 'lang' attribute, which defaults to 'css' when
	 * absent. It does not look at any other attribute; rejecting disallowed attributes is
	 * the job of validateAttributes().
	 *
	 * @param DOMElement $styleNode The <style> tag.
	 * @return array [ 'style' => string, 'lang' => string ]
	 * @throws InvalidArgumentException If an invalid language is used
	 */
	private function getStyleAndLang( DOMElement $styleNode ): array {
		$style = trim( $styleNode->nodeValue ?? '' );
		$styleLang = $styleNode->hasAttribute( 'lang' ) ?
			$styleNode->getAttribute( 'lang' ) : 'css';
		if ( $styleLang !== 'css' && $styleLang !== 'less' ) {
			throw new InvalidArgumentException( "<style lang=\"$styleLang\"> is invalid," .
				" lang must be \"css\" or \"less\"" );
		}
		return [
			'style' => $style,
			'lang' => $styleLang,
		];
	}

	/**
	 * Get the HTML contents of the <template> tag, optionally minified.
	 *
	 * To work around a bug in PHP's DOMDocument where attributes like @click get mangled,
	 * we re-parse the entire file using a Remex parse+serialize pipeline, with a custom dispatcher
	 * to zoom in on just the contents of the <template> tag, and a custom formatter for minification.
	 * Keeping everything in Remex and never converting it to DOM avoids the attribute mangling issue.
	 *
	 * @param string $html HTML that contains a <template> tag somewhere
	 * @param bool $minify Whether to minify the output (remove comments, strip whitespace)
	 * @return string HTML contents of the template tag
	 * @throws InvalidArgumentException If a second, non-nested <template> tag is encountered
	 */
	private function getTemplateHtml( string $html, bool $minify ): string {
		$serializer = new Serializer( $this->newTemplateFormatter( $minify ) );
		$tokenizer = new Tokenizer(
			$this->newFilteringDispatcher(
				new TreeBuilder( $serializer, [ 'ignoreErrors' => true ] ),
				'template'
			),
			$html, [ 'ignoreErrors' => true ]
		);
		$tokenizer->execute( [ 'fragmentNamespace' => HTMLData::NS_HTML, 'fragmentName' => 'template' ] );
		return trim( $serializer->getResult() );
	}

	/**
	 * Custom HtmlFormatter subclass that optionally removes comments and strips whitespace.
	 * If $minify=false, this formatter falls through to HtmlFormatter for everything (except that
	 * it strips the <!doctype html> tag).
	 *
	 * @param bool $minify If true, remove comments and strip whitespace
	 * @return HtmlFormatter
	 */
	private function newTemplateFormatter( bool $minify ): HtmlFormatter {
		return new class( $minify ) extends HtmlFormatter {
			/** @var bool Whether comments are dropped and whitespace is collapsed */
			private $minify;

			// NOTE(review): the parent constructor is intentionally left uncalled here, exactly as
			// before; confirm against HtmlFormatter that none of its defaults are needed.
			public function __construct( $minify ) {
				$this->minify = $minify;
			}

			public function startDocument( $fragmentNamespace, $fragmentName ) {
				// Remove <!doctype html>
				return '';
			}

			public function comment( SerializerNode $parent, $text ) {
				if ( $this->minify ) {
					// Remove all comments
					return '';
				}
				return parent::comment( $parent, $text );
			}

			public function characters( SerializerNode $parent, $text, $start, $length ) {
				if ( $this->minify && !$this->shouldPreserveWhitespace( $parent ) ) {
					$text = substr( $text, $start, $length );
					// Collapse runs of adjacent whitespace, and convert all whitespace to spaces
					$text = preg_replace( '/[ \r\n\t]+/', ' ', $text );
					// $text is now exactly the slice to output, so reset the window to cover it
					$start = 0;
					$length = strlen( $text );
				}
				return parent::characters( $parent, $text, $start, $length );
			}

			public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
				if (
					$this->minify &&
					!$this->shouldPreserveWhitespace( $node ) &&
					$contents !== null
				) {
					// Remove leading and trailing whitespace
					$contents = preg_replace( '/(^[ \r\n\t]+)|([\r\n\t ]+$)/', '', $contents );
				}
				return parent::element( $parent, $node, $contents );
			}

			/**
			 * Whether whitespace inside the given node must be left untouched when minifying.
			 * This is the case for the HTML elements listed in $this->prefixLfElements
			 * (<pre>/<listing>/<textarea> nodes).
			 *
			 * @param SerializerNode $node
			 * @return bool
			 */
			private function shouldPreserveWhitespace( SerializerNode $node ) {
				return $node->namespace === HTMLData::NS_HTML &&
					isset( $this->prefixLfElements[ $node->name ] );
			}
		};
	}

	/**
	 * Custom Dispatcher subclass that only dispatches tree events inside a tag with a certain name.
	 * This effectively filters the tree to only the contents of that tag.
	 *
	 * @param TreeBuilder $treeBuilder
	 * @param string $nodeName Tag name to filter for
	 * @return Dispatcher
	 */
	private function newFilteringDispatcher( TreeBuilder $treeBuilder, string $nodeName ): Dispatcher {
		return new class( $treeBuilder, $nodeName ) extends Dispatcher {
			/** @var string Name of the tag whose contents are let through */
			private $nodeName;
			/** @var int Number of currently open tags named $nodeName; 0 means "outside" */
			private $nodeDepth = 0;
			/** @var bool Whether an opening tag named $nodeName has been encountered yet */
			private $seenTag = false;

			public function __construct( TreeBuilder $treeBuilder, $nodeName ) {
				$this->nodeName = $nodeName;
				parent::__construct( $treeBuilder );
			}

			public function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
				// Forward before updating the depth, so that the outermost opening tag itself
				// is not dispatched, while nested tags with the same name are.
				if ( $this->nodeDepth ) {
					parent::startTag( $name, $attrs, $selfClose, $sourceStart, $sourceLength );
				}

				if ( $name === $this->nodeName ) {
					if ( $this->nodeDepth === 0 && $this->seenTag ) {
						// This is the second opening tag, not nested in the first one
						throw new InvalidArgumentException( "More than one <{$this->nodeName}> tag found" );
					}
					$this->nodeDepth++;
					$this->seenTag = true;
				}
			}

			public function endTag( $name, $sourceStart, $sourceLength ) {
				// Only close a tag that is actually open. Without this guard, a stray closing tag
				// outside of the filtered tag would push the depth below zero, which is truthy and
				// would cause unrelated content to be dispatched.
				if ( $name === $this->nodeName && $this->nodeDepth > 0 ) {
					$this->nodeDepth--;
				}
				// Checked after updating the depth, so that the outermost closing tag itself
				// is not dispatched.
				if ( $this->nodeDepth ) {
					parent::endTag( $name, $sourceStart, $sourceLength );
				}
			}

			public function characters( $text, $start, $length, $sourceStart, $sourceLength ) {
				if ( $this->nodeDepth ) {
					parent::characters( $text, $start, $length, $sourceStart, $sourceLength );
				}
			}

			public function comment( $text, $sourceStart, $sourceLength ) {
				if ( $this->nodeDepth ) {
					parent::comment( $text, $sourceStart, $sourceLength );
				}
			}
		};
	}
}