Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
27.27% |
30 / 110 |
|
50.00% |
2 / 4 |
CRAP | |
0.00% |
0 / 1 |
XMLSerializer | |
27.27% |
30 / 110 |
|
50.00% |
2 / 4 |
934.29 | |
0.00% |
0 / 1 |
encodeHtmlEntities | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
serializeToString | |
0.00% |
0 / 51 |
|
0.00% |
0 / 1 |
506 | |||
accumOffsets | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
272 | |||
serialize | |
100.00% |
29 / 29 |
|
100.00% |
1 / 1 |
9 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Parsoid\DOM\Comment; |
8 | use Wikimedia\Parsoid\DOM\Document; |
9 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
10 | use Wikimedia\Parsoid\DOM\Element; |
11 | use Wikimedia\Parsoid\DOM\Node; |
12 | use Wikimedia\Parsoid\DOM\Text; |
13 | use Wikimedia\Parsoid\Utils\DOMCompat; |
14 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
15 | use Wikimedia\Parsoid\Utils\DOMUtils; |
16 | use Wikimedia\Parsoid\Utils\WTUtils; |
17 | use Wikimedia\Parsoid\Wikitext\Consts; |
18 | |
19 | /** |
20 | * Stand-alone XMLSerializer for DOM3 documents. |
21 | * |
22 | * The output is identical to standard XHTML5 DOM serialization, as given by |
23 | * http://www.w3.org/TR/html-polyglot/ |
24 | * and |
25 | * https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments |
26 | * except that we may quote attributes with single quotes, *only* where that would |
27 | * result in more compact output than the standard double-quoted serialization. |
28 | */ |
class XMLSerializer {

	/**
	 * Extra element names that are serialized as void (self-closing) elements
	 * when they have no children, in addition to Consts::$HTML['VoidTags'].
	 * https://html.spec.whatwg.org/#serialising-html-fragments
	 * @var array<string,true>
	 */
	private static $alsoSerializeAsVoid = [
		'basefont' => true,
		'bgsound' => true,
		'frame' => true,
		'keygen' => true
	];

	/**
	 * Elements that strip leading newlines
	 * http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#html-fragment-serialization-algorithm
	 */
	private const NEWLINE_STRIPPING_ELEMENTS = [
		'pre' => true,
		'textarea' => true,
		'listing' => true
	];

	/**
	 * Replacement tables for encodeHtmlEntities(), keyed by context:
	 * - 'single': value of an attribute delimited by single quotes
	 * - 'double': value of an attribute delimited by double quotes
	 * - 'xml': text node content
	 *
	 * The values must be entity references; mapping a character to itself
	 * would silently disable escaping.
	 */
	private const ENTITY_ENCODINGS = [
		'single' => [ '<' => '&lt;', '&' => '&amp;', "'" => '&apos;' ],
		'double' => [ '<' => '&lt;', '&' => '&amp;', '"' => '&quot;' ],
		'xml' => [ '<' => '&lt;', '&' => '&amp;', "\u{0338}" => '&#x338;' ],
	];

	/**
	 * HTML entity encoder helper.
	 * Only supports the few entities we'll actually need: <&'"
	 * (plus U+0338 in the "xml" set).
	 * @param string $raw Input string
	 * @param string $encodeChars Set of characters to encode, "single", "double", or "xml"
	 * @return string
	 */
	private static function encodeHtmlEntities( string $raw, string $encodeChars ): string {
		// strtr() with an array never re-scans its own output, so the '&' of a
		// freshly inserted entity is not encoded a second time.
		return strtr( $raw, self::ENTITY_ENCODINGS[$encodeChars] );
	}

	/**
	 * Serialize an HTML DOM3 node to XHTML. The XHTML and associated information will be fed
	 * step-by-step to the callback given in $accum.
	 * @param Node $node
	 * @param array $options See {@link XMLSerializer::serialize()}
	 * @param callable $accum function( $bit, $node, $flag )
	 *   - $bit: (string) piece of HTML code
	 *   - $node: (Node) the node currently being serialized, which $bit belongs to
	 *   - $flag: (string|null) 'start' is passed with the '>' that closes an element's
	 *     opening tag, 'end' with its closing tag or with the '/>' of a void element;
	 *     omitted for every other piece.
	 */
	private static function serializeToString( Node $node, array $options, callable $accum ): void {
		$smartQuote = $options['smartQuote'];
		$saveData = $options['saveData'];
		switch ( $node->nodeType ) {
			case XML_ELEMENT_NODE:
				DOMUtils::assertElt( $node );
				$child = $node->firstChild;
				$nodeName = DOMCompat::nodeName( $node );
				$localName = $node->localName;
				$accum( '<' . $localName, $node );
				$attrs = DOMUtils::attributes( $node );
				if ( $saveData ) {
					DOMDataUtils::dumpRichAttribs( $node, $attrs, $options['keepTmp'], $options['storeDiffMark'] );
				}
				foreach ( $attrs as $an => $av ) {
					if ( $smartQuote
						// More double quotes than single quotes in value?
						&& substr_count( $av, '"' ) > substr_count( $av, "'" )
					) {
						// use single quotes
						$accum( ' ' . $an . "='"
							. self::encodeHtmlEntities( $av, 'single' ) . "'",
							$node );
					} else {
						// use double quotes
						$accum( ' ' . $an . '="'
							. self::encodeHtmlEntities( $av, 'double' ) . '"',
							$node );
					}
				}
				// Only childless void elements get the self-closing form.
				if ( $child || (
					!isset( Consts::$HTML['VoidTags'][$nodeName] ) &&
					!isset( self::$alsoSerializeAsVoid[$nodeName] )
				) ) {
					$accum( '>', $node, 'start' );
					// if is cdata child node
					if ( DOMUtils::isRawTextElement( $node ) ) {
						// TODO: perform context-sensitive escaping?
						// Currently this content is not normally part of our DOM, so
						// no problem. If it was, we'd probably have to do some
						// tag-specific escaping. Examples:
						// * < to \u003c in <script>
						// * < to \3c in <style>
						// ...
						if ( $child ) {
							// Only the first child is emitted, and it is emitted unescaped.
							$accum( $child->nodeValue, $node );
						}
					} else {
						if ( $child && isset( self::NEWLINE_STRIPPING_ELEMENTS[$localName] )
							&& $child->nodeType === XML_TEXT_NODE && str_starts_with( $child->nodeValue, "\n" )
						) {
							/* If current node is a pre, textarea, or listing element,
							 * and the first child node of the element, if any, is a
							 * Text node whose character data has as its first
							 * character a U+000A LINE FEED (LF) character, then
							 * append a U+000A LINE FEED (LF) character. */
							$accum( "\n", $node );
						}
						while ( $child ) {
							self::serializeToString( $child, $options, $accum );
							$child = $child->nextSibling;
						}
					}
					$accum( '</' . $localName . '>', $node, 'end' );
				} else {
					$accum( '/>', $node, 'end' );
				}
				return;

			case XML_DOCUMENT_NODE:
			case XML_DOCUMENT_FRAG_NODE:
				'@phan-var Document|DocumentFragment $node';
				// @var Document|DocumentFragment $node
				$child = $node->firstChild;
				while ( $child ) {
					self::serializeToString( $child, $options, $accum );
					$child = $child->nextSibling;
				}
				return;

			case XML_TEXT_NODE:
				'@phan-var Text $node'; // @var Text $node
				$accum( self::encodeHtmlEntities( $node->nodeValue, 'xml' ), $node );
				return;

			case XML_COMMENT_NODE:
				// According to
				// http://www.w3.org/TR/DOM-Parsing/#dfn-concept-serialize-xml
				// we could throw an exception here if node.data would not create
				// a "well-formed" XML comment.  But we use entity encoding when
				// we create the comment node to ensure that node.data will always
				// be okay; see DOMUtils.encodeComment().
				'@phan-var Comment $node'; // @var Comment $node
				$accum( '<!--' . $node->nodeValue . '-->', $node );
				return;

			default:
				// Unsupported node type: emit a visible marker instead of failing.
				$accum( '??' . DOMCompat::nodeName( $node ), $node );
		}
	}

	/**
	 * Add data to an output/memory array (used when serialize() was called with the
	 * captureOffsets flag).
	 * @param array &$out Output array, see {@link self::serialize()} for details on the
	 *   'html' and 'offset' fields. The other fields (positions are 0-based
	 *   and refer to UTF-8 byte indices):
	 *   - start: position in the HTML of the end of the opening tag of <body>
	 *   - last: (Node) last "about sibling" of the currently processed element
	 *     (see {@link WTUtils::getAboutSiblings()}
	 *   - uid: the ID of the element
	 * @param string $bit A piece of the HTML string
	 * @param Node $node The DOM node $bit is a part of
	 * @param ?string $flag 'start' when receiving the final part of the opening tag
	 *   of an element, 'end' when receiving the final part of the closing tag of an element
	 *   or the final part of a self-closing element.
	 */
	private static function accumOffsets(
		array &$out, string $bit, Node $node, ?string $flag = null
	): void {
		if ( DOMUtils::atTheTop( $node ) ) {
			$out['html'] .= $bit;
			if ( $flag === 'start' ) {
				// Offsets are measured from just after this opening tag.
				$out['start'] = strlen( $out['html'] );
			} elseif ( $flag === 'end' ) {
				$out['start'] = null;
				$out['uid'] = null;
			}
		} elseif (
			!( $node instanceof Element ) || $out['start'] === null ||
			!DOMUtils::atTheTop( $node->parentNode )
		) {
			// In case you're wondering, out.start may never be set if body
			// isn't a child of the node passed to serializeToString, or if it
			// is the node itself but options.innerXML is true.
			$out['html'] .= $bit;
			if ( $out['uid'] !== null ) {
				// Extend the range of the element currently being tracked.
				$out['offsets'][$out['uid']]['html'][1] += strlen( $bit );
			}
		} else {
			// $node is an element whose parent is "at the top".
			$newUid = DOMCompat::getAttribute( $node, 'id' );
			// Encapsulated siblings don't have generated ids (but may have an id),
			// so associate them with preceding content.
			if ( $newUid && $newUid !== $out['uid'] && !$out['last'] ) {
				if ( !WTUtils::isEncapsulationWrapper( $node ) ) {
					$out['uid'] = $newUid;
				} elseif ( WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
					$about = DOMCompat::getAttribute( $node, 'about' );
					$aboutSiblings = WTUtils::getAboutSiblings( $node, $about );
					$out['last'] = end( $aboutSiblings );
					$out['uid'] = $newUid;
				}
			}
			if ( $out['last'] === $node && $flag === 'end' ) {
				$out['last'] = null;
			}
			Assert::invariant( $out['uid'] !== null, 'uid cannot be null' );
			if ( !isset( $out['offsets'][$out['uid']] ) ) {
				// First piece seen for this uid: open an empty range at the current position.
				$dt = strlen( $out['html'] ) - $out['start'];
				$out['offsets'][$out['uid']] = [ 'html' => [ $dt, $dt ] ];
			}
			$out['html'] .= $bit;
			$out['offsets'][$out['uid']]['html'][1] += strlen( $bit );
		}
	}

	/**
	 * Serialize an HTML DOM3 node to an XHTML string.
	 *
	 * @param Node $node If a Document is given, its documentElement is serialized.
	 * @param array $options
	 *   - smartQuote (bool, default true): use single quotes for attributes when that's less escaping
	 *   - innerXML (bool, default false): only serialize the contents of $node, exclude $node itself
	 *   - captureOffsets (bool, default false): return tag position data (see below)
	 *   - addDoctype (bool, default true): prepend a DOCTYPE when a full HTML document is serialized
	 *   - saveData (bool, default false): Copy the NodeData into JSON attributes. This is for
	 *     debugging purposes only, the normal code path is to use DOMDataUtils::storeDataAttribs().
	 *   - keepTmp (bool, default false): When saving data, include DataParsoid::$tmp.
	 *   - storeDiffMark (bool, default false): When saving data, passed through to
	 *     DOMDataUtils::dumpRichAttribs().
	 * @return array An array with the following data:
	 *   - html: the serialized HTML
	 *   - offsets: the start and end position of each element in the HTML, in a
	 *     [ $uid => [ 'html' => [ $start, $end ] ], ... ] format where $uid is the element's
	 *     Parsoid ID, $start is the 0-based index of the first character of the element and
	 *     $end is the index of the first character of the opening tag of the next sibling element,
	 *     or the index of the last character of the element's closing tag if there is no next
	 *     sibling. The positions are relative to the end of the opening <body> tag
	 *     (the DOCTYPE header is not counted), and only present when the captureOffsets flag is set.
	 */
	public static function serialize( Node $node, array $options = [] ): array {
		// Array union: caller-supplied options win, defaults fill the gaps.
		$options += [
			'smartQuote' => true,
			'innerXML' => false,
			'captureOffsets' => false,
			'addDoctype' => true,
			'saveData' => false,
			'keepTmp' => false,
			'storeDiffMark' => false,
		];
		if ( $node instanceof Document ) {
			$node = $node->documentElement;
		}
		$out = [ 'html' => '', 'offsets' => [], 'start' => null, 'uid' => null, 'last' => null ];
		$accum = $options['captureOffsets']
			? static function ( string $bit, Node $node, ?string $flag = null ) use ( &$out ): void {
				self::accumOffsets( $out, $bit, $node, $flag );
			}
			: static function ( string $bit ) use ( &$out ): void {
				$out['html'] .= $bit;
			};

		if ( $options['innerXML'] ) {
			for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
				self::serializeToString( $child, $options, $accum );
			}
		} else {
			self::serializeToString( $node, $options, $accum );
		}
		// Ensure there's a doctype for documents.
		if ( !$options['innerXML'] && DOMCompat::nodeName( $node ) === 'html' && $options['addDoctype'] ) {
			$out['html'] = "<!DOCTYPE html>\n" . $out['html'];
		}
		// Drop the bookkeeping
		unset( $out['start'], $out['uid'], $out['last'] );
		if ( !$options['captureOffsets'] ) {
			unset( $out['offsets'] );
		}
		return $out;
	}

}