Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 110 |
|
0.00% |
0 / 8 |
CRAP | |
0.00% |
0 / 1 |
HtmlFormatter | |
0.00% |
0 / 110 |
|
0.00% |
0 / 8 |
2550 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 | |||
startDocument | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
characters | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
element | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
56 | |||
comment | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
doctype | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
formatDOMNode | |
0.00% |
0 / 40 |
|
0.00% |
0 / 1 |
306 | |||
formatDOMElement | |
0.00% |
0 / 38 |
|
0.00% |
0 / 1 |
342 |
1 | <?php |
2 | |
3 | namespace Wikimedia\RemexHtml\Serializer; |
4 | |
5 | use Wikimedia\RemexHtml\DOM\DOMFormatter; |
6 | use Wikimedia\RemexHtml\DOM\DOMUtils; |
7 | use Wikimedia\RemexHtml\HTMLData; |
8 | |
/**
 * A formatter which follows the HTML 5 fragment serialization algorithm.
 *
 * Two families of methods are provided: the ones taking SerializerNode
 * objects (characters(), element(), comment(), ...) and the ones taking
 * \DOMNode objects (formatDOMNode(), formatDOMElement()). Both share the
 * same element tables and escape tables defined below.
 */
class HtmlFormatter implements Formatter, DOMFormatter {
	/**
	 * The elements for which a closing tag is omitted.
	 *
	 * @var array<string,bool>
	 */
	protected $voidElements = [
		'area' => true,
		'base' => true,
		'basefont' => true,
		'bgsound' => true,
		'br' => true,
		'col' => true,
		'embed' => true,
		'frame' => true,
		'hr' => true,
		'img' => true,
		'input' => true,
		'keygen' => true,
		'link' => true,
		'menuitem' => true,
		'meta' => true,
		'param' => true,
		'source' => true,
		'track' => true,
		'wbr' => true,
	];

	/**
	 * The elements which need a leading newline in their contents to be
	 * duplicated, since the parser strips a leading newline.
	 *
	 * @var array<string,bool>
	 */
	protected $prefixLfElements = [
		'pre' => true,
		'textarea' => true,
		'listing' => true
	];

	/**
	 * The elements which have unescaped contents. The constructor adds
	 * 'noscript' to this list when the scriptingFlag option is enabled.
	 *
	 * @var array<string,bool>
	 */
	protected $rawTextElements = [
		'style' => true,
		'script' => true,
		'xmp' => true,
		'iframe' => true,
		'noembed' => true,
		'noframes' => true,
		'plaintext' => true,
	];

	/**
	 * The escape table for attribute values, applied with strtr().
	 *
	 * The replacements must be entity references: an identity mapping would
	 * let a double quote terminate the attribute value early.
	 *
	 * @var array<string,string>
	 */
	protected $attributeEscapes = [
		'&' => '&amp;',
		// U+00A0 NO-BREAK SPACE, given as its UTF-8 byte sequence
		"\xc2\xa0" => '&nbsp;',
		'"' => '&quot;',
	];

	/**
	 * The escape table for text nodes, applied with strtr().
	 *
	 * @var array<string,string>
	 */
	protected $textEscapes = [
		'&' => '&amp;',
		// U+00A0 NO-BREAK SPACE, given as its UTF-8 byte sequence
		"\xc2\xa0" => '&nbsp;',
		'<' => '&lt;',
		'>' => '&gt;',
	];

	/**
	 * Attribute namespaces which have unqualified local names
	 *
	 * @var array<string,bool>
	 */
	protected $unqualifiedNamespaces = [
		HTMLData::NS_HTML => true,
		HTMLData::NS_MATHML => true,
		HTMLData::NS_SVG => true,
	];

	/**
	 * Value of the useSourceDoctype constructor option. Only consulted by
	 * formatDOMNode().
	 *
	 * @var bool
	 */
	protected $useSourceDoctype;

	/**
	 * Value of the reverseCoercion constructor option. Only consulted by
	 * formatDOMElement().
	 *
	 * @var bool
	 */
	protected $reverseCoercion;

	/**
	 * Constructor.
	 *
	 * @param array $options An associative array of options:
	 *   - scriptingFlag : Set this to false to disable scripting. True by default.
	 *     When true, the contents of noscript elements are emitted unescaped.
	 *   - useSourceDoctype : Emit the doctype used in the source. If this is
	 *     false or absent, an HTML doctype will be used.
	 *   - reverseCoercion : When formatting a DOM node, reverse the encoding
	 *     of invalid names. False by default.
	 */
	public function __construct( $options = [] ) {
		// Array union keeps caller-supplied keys and fills in the missing ones
		$options += [
			'scriptingFlag' => true,
			'useSourceDoctype' => false,
			'reverseCoercion' => false,
		];
		if ( $options['scriptingFlag'] ) {
			$this->rawTextElements['noscript'] = true;
		}
		$this->useSourceDoctype = $options['useSourceDoctype'];
		$this->reverseCoercion = $options['reverseCoercion'];
	}

	/**
	 * Get the string which starts the document. This is always the HTML
	 * doctype; neither parameter nor the useSourceDoctype option is used here.
	 *
	 * @param string|null $fragmentNamespace Unused
	 * @param string|null $fragmentName Unused
	 * @return string
	 */
	public function startDocument( $fragmentNamespace, $fragmentName ) {
		return "<!DOCTYPE html>";
	}

	/**
	 * Serialize a range of a text string which is a child of $parent.
	 *
	 * @param SerializerNode $parent The parent of the text
	 * @param string $text The string containing the text to be emitted
	 * @param int $start The byte offset of the range within $text
	 * @param int $length The byte length of the range
	 * @return string The text, escaped unless $parent is an HTML raw text
	 *   element
	 */
	public function characters( SerializerNode $parent, $text, $start, $length ) {
		$text = substr( $text, $start, $length );
		if ( $parent->namespace !== HTMLData::NS_HTML
			|| !isset( $this->rawTextElements[$parent->name] )
		) {
			$text = strtr( $text, $this->textEscapes );
		}
		return $text;
	}

	/**
	 * Serialize an element together with its already-serialized contents.
	 *
	 * @param SerializerNode $parent The parent of the element (unused)
	 * @param SerializerNode $node The element to serialize
	 * @param string $contents The serialized children of the element
	 * @return string
	 */
	public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
		$name = $node->name;
		$s = "<$name";
		foreach ( $node->attrs->getValues() as $attrName => $attrValue ) {
			$encValue = strtr( $attrValue, $this->attributeEscapes );
			$s .= " $attrName=\"$encValue\"";
		}
		$s .= '>';
		return $s . $this->closeElement(
			$name, $node->namespace === HTMLData::NS_HTML, $contents );
	}

	/**
	 * Serialize a comment.
	 *
	 * @param SerializerNode $parent The parent of the comment (unused)
	 * @param string $text The comment text, emitted without escaping
	 * @return string
	 */
	public function comment( SerializerNode $parent, $text ) {
		return "<!--$text-->";
	}

	/**
	 * Serialize a doctype token. Nothing is emitted here since
	 * startDocument() has already produced the HTML doctype.
	 *
	 * @param string $name Unused
	 * @param string $public Unused
	 * @param string $system Unused
	 * @return string Always the empty string
	 */
	public function doctype( $name, $public, $system ) {
		return '';
	}

	/**
	 * Recursively serialize a DOM node and its descendants.
	 *
	 * @param \DOMNode $node
	 * @return string The serialization, or an empty string for node types
	 *   which are not handled
	 */
	public function formatDOMNode( $node ) {
		// Children are serialized first so that the element case can decide
		// how to wrap them.
		$contents = '';
		if ( $node->firstChild ) {
			foreach ( $node->childNodes as $child ) {
				$contents .= $this->formatDOMNode( $child );
			}
		}

		switch ( $node->nodeType ) {
			case XML_ELEMENT_NODE:
				'@phan-var \DOMElement $node'; /** @var \DOMElement $node */
				return $this->formatDOMElement( $node, $contents );

			case XML_DOCUMENT_NODE:
				if ( !$this->useSourceDoctype ) {
					return "<!DOCTYPE html>" . $contents;
				} else {
					// Any doctype child has already been added to $contents
					return $contents;
				}

			case XML_DOCUMENT_FRAG_NODE:
				return $contents;

			case XML_TEXT_NODE:
				'@phan-var \DOMCharacterData $node'; /** @var \DOMCharacterData $node */
				$text = $node->data;
				$parent = $node->parentNode;
				// A detached text node has no parent and is always escaped
				if ( $parent === null
					|| $parent->namespaceURI !== HTMLData::NS_HTML
					|| !isset( $this->rawTextElements[$parent->nodeName] )
				) {
					$text = strtr( $text, $this->textEscapes );
				}
				return $text;

			case XML_CDATA_SECTION_NODE:
				'@phan-var \DOMCdataSection $node'; /** @var \DOMCdataSection $node */
				$parent = $node->parentNode;
				if ( $parent !== null && $parent->namespaceURI === HTMLData::NS_HTML ) {
					// CDATA is not allowed in HTML nodes
					return $node->data;
				} else {
					return "<![CDATA[{$node->data}]]>";
				}

			case XML_PI_NODE:
				'@phan-var \DOMProcessingInstruction $node'; /** @var \DOMProcessingInstruction $node */
				// Terminated by a bare ">" rather than the XML-style "?>"
				return "<?{$node->target} {$node->data}>";

			case XML_COMMENT_NODE:
				'@phan-var \DOMComment $node'; /** @var \DOMComment $node */
				return "<!--{$node->data}-->";

			case XML_DOCUMENT_TYPE_NODE:
				'@phan-var \DOMDocumentType $node'; /** @var \DOMDocumentType $node */
				if ( $this->useSourceDoctype ) {
					return "<!DOCTYPE {$node->name}>";
				} else {
					// The document node case emits the HTML doctype instead
					return '';
				}

			default:
				return '';
		}
	}

	/**
	 * Serialize a DOM element together with its already-serialized contents.
	 *
	 * @param \DOMElement $node
	 * @param string $contents The serialized children of the element
	 * @return string
	 */
	public function formatDOMElement( $node, $contents ) {
		$ns = $node->namespaceURI;
		// The prefix is only emitted for elements which have one and which
		// are outside the HTML, MathML and SVG namespaces.
		if ( $ns === null
			|| isset( $this->unqualifiedNamespaces[$ns] )
			|| !( $node->prefix )
		) {
			$name = (string)$node->localName;
		} else {
			$name = $node->prefix . ':' . $node->localName;
		}
		if ( $this->reverseCoercion ) {
			$name = DOMUtils::uncoerceName( $name );
		}

		$s = '<' . $name;
		foreach ( $node->attributes as $attr ) {
			// Attributes in the well-known namespaces get a fixed prefix
			// regardless of the prefix recorded in the DOM.
			switch ( $attr->namespaceURI ) {
				case HTMLData::NS_XML:
					$attrName = 'xml:' . $attr->localName;
					break;
				case HTMLData::NS_XMLNS:
					if ( $attr->localName === 'xmlns' ) {
						$attrName = 'xmlns';
					} else {
						$attrName = 'xmlns:' . $attr->localName;
					}
					break;
				case HTMLData::NS_XLINK:
					$attrName = 'xlink:' . $attr->localName;
					break;
				default:
					if ( (string)$attr->prefix !== '' ) {
						$attrName = $attr->prefix . ':' . $attr->localName;
					} else {
						$attrName = $attr->localName;
					}
			}
			if ( $this->reverseCoercion ) {
				$attrName = DOMUtils::uncoerceName( $attrName );
			}
			$encValue = strtr( $attr->value, $this->attributeEscapes );
			$s .= " $attrName=\"$encValue\"";
		}
		$s .= '>';
		return $s . $this->closeElement( $name, $ns === HTMLData::NS_HTML, $contents );
	}

	/**
	 * Get the part of an element's serialization which follows the start
	 * tag: the contents and the end tag, or nothing for an HTML void element.
	 *
	 * @param string $name The tag name as written in the start tag
	 * @param bool $isHtml Whether the element is in the HTML namespace
	 * @param string $contents The serialized children of the element
	 * @return string
	 */
	private function closeElement( $name, $isHtml, $contents ) {
		if ( $isHtml ) {
			if ( isset( $contents[0] ) && $contents[0] === "\n"
				&& isset( $this->prefixLfElements[$name] )
			) {
				// Double the leading newline, since the parser will strip one
				return "\n$contents</$name>";
			}
			if ( isset( $this->voidElements[$name] ) ) {
				// Void elements have neither contents nor an end tag
				return '';
			}
		}
		return "$contents</$name>";
	}
}