Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
22.56% |
30 / 133 |
|
40.00% |
2 / 5 |
CRAP | |
0.00% |
0 / 1 |
XMLSerializer | |
22.56% |
30 / 133 |
|
40.00% |
2 / 5 |
1460.02 | |
0.00% |
0 / 1 |
encodeHtmlEntities | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
dumpDataAttribs | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
56 | |||
serializeToString | |
0.00% |
0 / 51 |
|
0.00% |
0 / 1 |
506 | |||
accumOffsets | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
272 | |||
serialize | |
100.00% |
29 / 29 |
|
100.00% |
1 / 1 |
9 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Parsoid\DOM\Comment; |
8 | use Wikimedia\Parsoid\DOM\Document; |
9 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
10 | use Wikimedia\Parsoid\DOM\Element; |
11 | use Wikimedia\Parsoid\DOM\Node; |
12 | use Wikimedia\Parsoid\DOM\Text; |
13 | use Wikimedia\Parsoid\NodeData\DataParsoidDiff; |
14 | use Wikimedia\Parsoid\Utils\DOMCompat; |
15 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
16 | use Wikimedia\Parsoid\Utils\DOMUtils; |
17 | use Wikimedia\Parsoid\Utils\WTUtils; |
18 | use Wikimedia\Parsoid\Wikitext\Consts; |
19 | |
/**
 * Stand-alone XMLSerializer for DOM3 documents.
 *
 * The output is identical to standard XHTML5 DOM serialization, as given by
 * http://www.w3.org/TR/html-polyglot/
 * and
 * https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
 * except that we may quote attributes with single quotes, *only* where that would
 * result in more compact output than the standard double-quoted serialization.
 */
class XMLSerializer {

	/**
	 * Elements which, when they have no children, are emitted in self-closing
	 * form in addition to the tags listed in Consts::$HTML['VoidTags'].
	 * https://html.spec.whatwg.org/#serialising-html-fragments
	 */
	private const ALSO_SERIALIZE_AS_VOID = [
		'basefont' => true,
		'bgsound' => true,
		'frame' => true,
		'keygen' => true,
	];

	/**
	 * Elements that strip leading newlines
	 * http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#html-fragment-serialization-algorithm
	 */
	private const NEWLINE_STRIPPING_ELEMENTS = [
		'pre' => true,
		'textarea' => true,
		'listing' => true,
	];

	/**
	 * Replacement tables for encodeHtmlEntities(), keyed by quoting context:
	 * - single: value goes inside a single-quoted attribute
	 * - double: value goes inside a double-quoted attribute
	 * - xml: value is text content (quotes need no escaping)
	 */
	private const ENTITY_ENCODINGS = [
		'single' => [ '<' => '&lt;', '&' => '&amp;', "'" => '&apos;' ],
		'double' => [ '<' => '&lt;', '&' => '&amp;', '"' => '&quot;' ],
		'xml' => [ '<' => '&lt;', '&' => '&amp;' ],
	];

	/**
	 * HTML entity encoder helper.
	 * Only supports the few entities we'll actually need: <&'"
	 *
	 * strtr() with an array never re-scans its own output, so the '&' that
	 * starts a freshly inserted entity is not encoded a second time.
	 *
	 * @param string $raw Input string
	 * @param string $encodeChars Set of characters to encode, "single", "double", or "xml"
	 * @return string
	 */
	private static function encodeHtmlEntities( string $raw, string $encodeChars ): string {
		return strtr( $raw, self::ENTITY_ENCODINGS[$encodeChars] );
	}

	/**
	 * Modify the attribute array, replacing data-object-id with JSON
	 * encoded data. This is just a debugging hack, not to be confused with
	 * DOMDataUtils::storeDataAttribs()
	 *
	 * Does nothing when $attrs has no DOMDataUtils::DATA_OBJECT_ATTR_NAME entry.
	 *
	 * @param Element $node Element whose node data is dumped
	 * @param array &$attrs Attribute name => value map, modified in place
	 * @param bool $keepTmp Whether to keep the 'tmp' property of data-parsoid
	 * @param bool $storeDiffMark Whether to also emit a data-parsoid-diff attribute
	 */
	private static function dumpDataAttribs(
		Element $node, array &$attrs, bool $keepTmp, bool $storeDiffMark
	): void {
		if ( !isset( $attrs[DOMDataUtils::DATA_OBJECT_ATTR_NAME] ) ) {
			return;
		}
		$codec = DOMDataUtils::getCodec( $node->ownerDocument );
		$dpd = DOMDataUtils::getDataParsoidDiff( $node );
		if ( $storeDiffMark && $dpd !== null ) {
			$attrs['data-parsoid-diff'] = $codec->toJsonString(
				$dpd, DataParsoidDiff::hint()
			);
		}
		$nd = DOMDataUtils::getNodeData( $node );
		$dp = $nd->parsoid;
		if ( $dp ) {
			if ( !$keepTmp ) {
				// Work on a copy so that the live node data keeps its tmp property.
				$dp = clone $dp;
				// @phan-suppress-next-line PhanTypeObjectUnsetDeclaredProperty
				unset( $dp->tmp );
			}
			$attrs['data-parsoid'] = $codec->toJsonString(
				$dp, DOMDataUtils::getCodecHints()['data-parsoid']
			);
		}
		$dmw = $nd->mw;
		if ( $dmw ) {
			$attrs['data-mw'] = $codec->toJsonString(
				$dmw, DOMDataUtils::getCodecHints()['data-mw']
			);
		}
		unset( $attrs[DOMDataUtils::DATA_OBJECT_ATTR_NAME] );
	}

	/**
	 * Serialize an HTML DOM3 node to XHTML. The XHTML and associated information will be fed
	 * step-by-step to the callback given in $accum.
	 * @param Node $node
	 * @param array $options See {@link XMLSerializer::serialize()}; all option keys
	 *   read here must be present (serialize() fills in the defaults).
	 * @param callable $accum function( $bit, $node, $flag )
	 *   - $bit: (string) piece of HTML code
	 *   - $node: (Node) the node that $bit was generated for
	 *   - $flag: (string|null) 'start' with the '>' that ends an element's opening tag,
	 *     'end' with an element's closing tag or with the '/>' of a self-closing
	 *     element; omitted for every other bit
	 */
	private static function serializeToString( Node $node, array $options, callable $accum ): void {
		$smartQuote = $options['smartQuote'];
		$saveData = $options['saveData'];
		switch ( $node->nodeType ) {
			case XML_ELEMENT_NODE:
				DOMUtils::assertElt( $node );
				$child = $node->firstChild;
				$nodeName = DOMCompat::nodeName( $node );
				$localName = $node->localName;
				$accum( '<' . $localName, $node );
				$attrs = DOMUtils::attributes( $node );
				if ( $saveData ) {
					self::dumpDataAttribs( $node, $attrs, $options['keepTmp'], $options['storeDiffMark'] );
				}
				foreach ( $attrs as $an => $av ) {
					if ( $smartQuote
						// More double quotes than single quotes in value?
						&& substr_count( $av, '"' ) > substr_count( $av, "'" )
					) {
						// use single quotes
						$accum( ' ' . $an . "='"
							. self::encodeHtmlEntities( $av, 'single' ) . "'",
							$node );
					} else {
						// use double quotes
						$accum( ' ' . $an . '="'
							. self::encodeHtmlEntities( $av, 'double' ) . '"',
							$node );
					}
				}
				// Only childless void elements get the self-closing form.
				if ( $child || (
					!isset( Consts::$HTML['VoidTags'][$nodeName] ) &&
					!isset( self::ALSO_SERIALIZE_AS_VOID[$nodeName] )
				) ) {
					$accum( '>', $node, 'start' );
					// if is cdata child node
					if ( DOMUtils::isRawTextElement( $node ) ) {
						// TODO: perform context-sensitive escaping?
						// Currently this content is not normally part of our DOM, so
						// no problem. If it was, we'd probably have to do some
						// tag-specific escaping. Examples:
						// * < to \u003c in <script>
						// * < to \3c in <style>
						// ...
						// Note that only the first child is emitted, unescaped.
						if ( $child ) {
							$accum( $child->nodeValue, $node );
						}
					} else {
						if ( $child && isset( self::NEWLINE_STRIPPING_ELEMENTS[$localName] )
							&& $child->nodeType === XML_TEXT_NODE && str_starts_with( $child->nodeValue, "\n" )
						) {
							/* If current node is a pre, textarea, or listing element,
							 * and the first child node of the element, if any, is a
							 * Text node whose character data has as its first
							 * character a U+000A LINE FEED (LF) character, then
							 * append a U+000A LINE FEED (LF) character. */
							$accum( "\n", $node );
						}
						while ( $child ) {
							self::serializeToString( $child, $options, $accum );
							$child = $child->nextSibling;
						}
					}
					$accum( '</' . $localName . '>', $node, 'end' );
				} else {
					$accum( '/>', $node, 'end' );
				}
				return;

			case XML_DOCUMENT_NODE:
			case XML_DOCUMENT_FRAG_NODE:
				'@phan-var Document|DocumentFragment $node';
				// @var Document|DocumentFragment $node
				// Containers produce no markup of their own; just emit the children.
				$child = $node->firstChild;
				while ( $child ) {
					self::serializeToString( $child, $options, $accum );
					$child = $child->nextSibling;
				}
				return;

			case XML_TEXT_NODE:
				'@phan-var Text $node'; // @var Text $node
				$accum( self::encodeHtmlEntities( $node->nodeValue, 'xml' ), $node );
				return;

			case XML_COMMENT_NODE:
				// According to
				// http://www.w3.org/TR/DOM-Parsing/#dfn-concept-serialize-xml
				// we could throw an exception here if node.data would not create
				// a "well-formed" XML comment.  But we use entity encoding when
				// we create the comment node to ensure that node.data will always
				// be okay; see DOMUtils.encodeComment().
				'@phan-var Comment $node'; // @var Comment $node
				$accum( '<!--' . $node->nodeValue . '-->', $node );
				return;

			default:
				// Unsupported node type: emit a visible marker rather than failing.
				$accum( '??' . DOMCompat::nodeName( $node ), $node );
		}
	}

	/**
	 * Add data to an output/memory array (used when serialize() was called with the
	 * captureOffsets flag).
	 * @param array &$out Output array, see {@link self::serialize()} for details on the
	 *   'html' and 'offsets' fields. The other fields (positions are 0-based
	 *   and refer to UTF-8 byte indices):
	 *   - start: position in the HTML of the end of the opening tag of <body>
	 *   - last: (Node) last "about sibling" of the currently processed element
	 *     (see {@link WTUtils::getAboutSiblings()})
	 *   - uid: the ID of the element
	 * @param string $bit A piece of the HTML string
	 * @param Node $node The DOM node $bit is a part of
	 * @param ?string $flag 'start' when receiving the final part of the opening tag
	 *   of an element, 'end' when receiving the final part of the closing tag of an element
	 *   or the final part of a self-closing element.
	 */
	private static function accumOffsets(
		array &$out, string $bit, Node $node, ?string $flag = null
	): void {
		if ( DOMUtils::atTheTop( $node ) ) {
			$out['html'] .= $bit;
			if ( $flag === 'start' ) {
				// Offsets are measured from just after this opening tag.
				$out['start'] = strlen( $out['html'] );
			} elseif ( $flag === 'end' ) {
				$out['start'] = null;
				$out['uid'] = null;
			}
		} elseif (
			!( $node instanceof Element ) || $out['start'] === null ||
			!DOMUtils::atTheTop( $node->parentNode )
		) {
			// In case you're wondering, out.start may never be set if body
			// isn't a child of the node passed to serializeToString, or if it
			// is the node itself but options.innerXML is true.
			$out['html'] .= $bit;
			if ( $out['uid'] !== null ) {
				// Extend the range of the element currently being tracked.
				$out['offsets'][$out['uid']]['html'][1] += strlen( $bit );
			}
		} else {
			$newUid = DOMCompat::getAttribute( $node, 'id' );
			// Encapsulated siblings don't have generated ids (but may have an id),
			// so associate them with preceding content.
			if ( $newUid && $newUid !== $out['uid'] && !$out['last'] ) {
				if ( !WTUtils::isEncapsulationWrapper( $node ) ) {
					$out['uid'] = $newUid;
				} elseif ( WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
					$about = DOMCompat::getAttribute( $node, 'about' );
					$aboutSiblings = WTUtils::getAboutSiblings( $node, $about );
					$out['last'] = end( $aboutSiblings );
					$out['uid'] = $newUid;
				}
			}
			if ( $out['last'] === $node && $flag === 'end' ) {
				$out['last'] = null;
			}
			Assert::invariant( $out['uid'] !== null, 'uid cannot be null' );
			if ( !isset( $out['offsets'][$out['uid']] ) ) {
				// First bit seen for this uid: open an empty range at the current position.
				$dt = strlen( $out['html'] ) - $out['start'];
				$out['offsets'][$out['uid']] = [ 'html' => [ $dt, $dt ] ];
			}
			$out['html'] .= $bit;
			$out['offsets'][$out['uid']]['html'][1] += strlen( $bit );
		}
	}

	/**
	 * Serialize an HTML DOM3 node to an XHTML string.
	 *
	 * @param Node $node When a Document is passed, its documentElement is serialized.
	 * @param array $options
	 *   - smartQuote (bool, default true): use single quotes for attributes when that's less escaping
	 *   - innerXML (bool, default false): only serialize the contents of $node, exclude $node itself
	 *   - captureOffsets (bool, default false): return tag position data (see below)
	 *   - addDoctype (bool, default true): prepend a DOCTYPE when a full HTML document is serialized
	 *   - saveData (bool, default false): Copy the NodeData into JSON attributes. This is for
	 *     debugging purposes only, the normal code path is to use DOMDataUtils::storeDataAttribs().
	 *   - keepTmp (bool, default false): When saving data, include DataParsoid::$tmp.
	 *   - storeDiffMark (bool, default false): When saving data, also write the node's
	 *     DataParsoidDiff (if it has one) into a data-parsoid-diff attribute.
	 * @return array An array with the following data:
	 *   - html: the serialized HTML
	 *   - offsets: the start and end position of each element in the HTML, in a
	 *     [ $uid => [ 'html' => [ $start, $end ] ], ... ] format where $uid is the element's
	 *     Parsoid ID, $start is the 0-based index of the first character of the element and
	 *     $end is the index of the first character of the opening tag of the next sibling element,
	 *     or the index of the last character of the element's closing tag if there is no next
	 *     sibling. The positions are relative to the end of the opening <body> tag
	 *     (the DOCTYPE header is not counted), and only present when the captureOffsets flag is set.
	 */
	public static function serialize( Node $node, array $options = [] ): array {
		// Array union keeps caller-supplied values and only fills in missing keys.
		$options += [
			'smartQuote' => true,
			'innerXML' => false,
			'captureOffsets' => false,
			'addDoctype' => true,
			'saveData' => false,
			'keepTmp' => false,
			'storeDiffMark' => false,
		];
		if ( $node instanceof Document ) {
			$node = $node->documentElement;
		}
		$out = [ 'html' => '', 'offsets' => [], 'start' => null, 'uid' => null, 'last' => null ];
		$accum = $options['captureOffsets']
			? static function ( string $bit, Node $node, ?string $flag = null ) use ( &$out ): void {
				self::accumOffsets( $out, $bit, $node, $flag );
			}
			: static function ( string $bit ) use ( &$out ): void {
				$out['html'] .= $bit;
			};

		if ( $options['innerXML'] ) {
			for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
				self::serializeToString( $child, $options, $accum );
			}
		} else {
			self::serializeToString( $node, $options, $accum );
		}
		// Ensure there's a doctype for documents.
		if ( !$options['innerXML'] && DOMCompat::nodeName( $node ) === 'html' && $options['addDoctype'] ) {
			$out['html'] = "<!DOCTYPE html>\n" . $out['html'];
		}
		// Drop the bookkeeping
		unset( $out['start'], $out['uid'], $out['last'] );
		if ( !$options['captureOffsets'] ) {
			unset( $out['offsets'] );
		}
		return $out;
	}

}