Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
61.67% |
74 / 120 |
|
33.33% |
1 / 3 |
CRAP | |
0.00% |
0 / 1 |
| XHtmlSerializer | |
62.18% |
74 / 119 |
|
33.33% |
1 / 3 |
204.90 | |
0.00% |
0 / 1 |
| serializeToString | |
73.33% |
44 / 60 |
|
0.00% |
0 / 1 |
42.87 | |||
| accumOffsets | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
272 | |||
| serialize | |
100.00% |
30 / 30 |
|
100.00% |
1 / 1 |
9 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Wt2Html; |
| 5 | |
| 6 | use Wikimedia\Assert\Assert; |
| 7 | use Wikimedia\Parsoid\Core\DOMCompat; |
| 8 | use Wikimedia\Parsoid\DOM\Comment; |
| 9 | use Wikimedia\Parsoid\DOM\Document; |
| 10 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
| 11 | use Wikimedia\Parsoid\DOM\Element; |
| 12 | use Wikimedia\Parsoid\DOM\Node; |
| 13 | use Wikimedia\Parsoid\DOM\Text; |
| 14 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
| 15 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 16 | use Wikimedia\Parsoid\Utils\WTUtils; |
| 17 | use Wikimedia\Parsoid\Wikitext\Consts; |
| 18 | |
| 19 | /** |
| 20 | * Stand-alone XHtmlSerializer for DOM3 documents. |
| 21 | * |
| 22 | * The output is identical to standard XHTML5 DOM serialization, as given by |
| 23 | * http://www.w3.org/TR/html-polyglot/ |
| 24 | * and |
| 25 | * https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments |
| 26 | * except that we may quote attributes with single quotes, *only* where that would |
| 27 | * result in more compact output than the standard double-quoted serialization. |
| 28 | */ |
| 29 | class XHtmlSerializer { |
| 30 | |
| 31 | // Also self-closed when childless: https://html.spec.whatwg.org/#serialising-html-fragments
| 32 | private const ALSO_SERIALIZE_AS_VOID = [
| 33 |     'basefont' => true,
| 34 |     'bgsound' => true,
| 35 |     'frame' => true,
| 36 |     'keygen' => true
| 37 | ];
| 38 | 
| 39 | /**
| 40 |  * Elements that strip leading newlines
| 41 |  * http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#html-fragment-serialization-algorithm
| 42 |  */
| 43 | private const NEWLINE_STRIPPING_ELEMENTS = [
| 44 |     'pre' => true,
| 45 |     'textarea' => true,
| 46 |     'listing' => true
| 47 | ];
| 48 | 
| 49 | private const ENTITY_ENCODINGS_SINGLE = [ '<' => '&lt;', '&' => '&amp;', "'" => '&apos;' ]; // strtr() map for single-quoted attribute values
| 50 | private const ENTITY_ENCODINGS_DOUBLE = [ '<' => '&lt;', '&' => '&amp;', '"' => '&quot;' ]; // strtr() map for double-quoted attribute values
| 51 | private const ENTITY_ENCODINGS_XML = [ '<' => '&lt;', '&' => '&amp;', "\u{0338}" => '&#x338;' ]; // strtr() map for text nodes
| 52 | |
| 53 | /**
| 54 | * Serialize an HTML DOM3 node to XHTML. The XHTML and associated information will be fed
| 55 | * step-by-step to the callback given in $accum; nothing is returned.
| 56 | * @param Node $node
| 57 | * @param array $options See {@link XHtmlSerializer::serialize()}
| 58 | * @param callable $accum function( $bit, $node, $flag )
| 59 | * - $bit: (string) piece of HTML code
| 60 | * - $node: (Node) the node being serialized when $bit is emitted
| 61 | * - $flag: (string|null) 'start' with the '>' ending an opening tag, 'end' with a closing tag or '/>', else omitted
| 62 | */
| 63 | private static function serializeToString( Node $node, array $options, callable $accum ): void {
| 64 | $smartQuote = $options['smartQuote'];
| 65 | $noSideEffects = $options['noSideEffects'] &&
| 66 | // Don't bother dumping rich attributes if this isn't a prepared document
| 67 | DOMDataUtils::isPreparedAndLoaded( $node->ownerDocument );
| 68 | switch ( $node->nodeType ) {
| 69 | case XML_ELEMENT_NODE:
| 70 | '@phan-var Element $node'; // @var Element $node
| 71 | $nodeName = DOMUtils::nodeName( $node );
| 72 | $child = $nodeName === 'template' ?
| 73 | DOMCompat::getTemplateElementContent( $node )->firstChild :
| 74 | $node->firstChild;
| 75 | $localName = $node->localName;
| 76 | $accum( '<' . $localName, $node );
| 77 | if ( $noSideEffects ) {
| 78 | // Ensure that embedded HTML is properly dumped --
| 79 | // note that this isn't *exactly* "no side effects" (sigh)
| 80 | DOMDataUtils::eagerlyLoadRichAttributes( $node );
| 81 | }
| 82 | $attrs = DOMCompat::attributes( $node );
| 83 | if ( $noSideEffects ) {
| 84 | DOMDataUtils::dumpRichAttribs( $node, $attrs, $options['keepTmp'], $options['storeDiffMark'] );
| 85 | }
| 86 | if ( $options['sortAttrs'] ) {
| 87 | # sort on attribute name to reduce test case dependency on
| 88 | # attribute order
| 89 | ksort( $attrs );
| 90 | }
| 91 | foreach ( $attrs as $an => $av ) {
| 92 | if ( $smartQuote
| 93 | && str_contains( $av, '"' )
| 94 | && ( !str_contains( $av, "'" )
| 95 | // More double quotes than single quotes in value?
| 96 | || substr_count( $av, '"' ) > substr_count( $av, "'" )
| 97 | )
| 98 | ) {
| 99 | // use single quotes (fewer characters need escaping that way)
| 100 | $accum( ' ' . $an . "='"
| 101 | . strtr( $av, self::ENTITY_ENCODINGS_SINGLE ) . "'",
| 102 | $node );
| 103 | } else {
| 104 | // use double quotes
| 105 | $accum( ' ' . $an . '="'
| 106 | . strtr( $av, self::ENTITY_ENCODINGS_DOUBLE ) . '"',
| 107 | $node );
| 108 | }
| 109 | }
| 110 | if ( $child || (
| 111 | !isset( Consts::$HTML['VoidTags'][$nodeName] ) &&
| 112 | !isset( self::ALSO_SERIALIZE_AS_VOID[$nodeName] )
| 113 | ) ) {
| 114 | $accum( '>', $node, 'start' );
| 115 | // Raw text element: emit the first child's value as-is, without entity escaping
| 116 | if ( DOMUtils::isRawTextElement( $node ) ) {
| 117 | // TODO: perform context-sensitive escaping?
| 118 | // Currently this content is not normally part of our DOM, so
| 119 | // no problem. If it was, we'd probably have to do some
| 120 | // tag-specific escaping. Examples:
| 121 | // * < to \u003c in <script>
| 122 | // * < to \3c in <style>
| 123 | // ...
| 124 | if ( $child ) {
| 125 | $accum( $child->nodeValue, $node );
| 126 | }
| 127 | } else {
| 128 | if ( $child && isset( self::NEWLINE_STRIPPING_ELEMENTS[$localName] )
| 129 | && $child->nodeType === XML_TEXT_NODE && str_starts_with( $child->nodeValue, "\n" )
| 130 | ) {
| 131 | /* If current node is a pre, textarea, or listing element,
| 132 | * and the first child node of the element, if any, is a
| 133 | * Text node whose character data has as its first
| 134 | * character a U+000A LINE FEED (LF) character, then
| 135 | * append a U+000A LINE FEED (LF) character. */
| 136 | $accum( "\n", $node );
| 137 | }
| 138 | while ( $child ) {
| 139 | self::serializeToString( $child, $options, $accum );
| 140 | $child = $child->nextSibling;
| 141 | }
| 142 | }
| 143 | $accum( '</' . $localName . '>', $node, 'end' );
| 144 | } else {
| 145 | $accum( '/>', $node, 'end' );
| 146 | }
| 147 | return;
| 148 | 
| 149 | case XML_DOCUMENT_NODE:
| 150 | case XML_DOCUMENT_FRAG_NODE:
| 151 | '@phan-var Document|DocumentFragment $node';
| 152 | // @var Document|DocumentFragment $node -- no markup of its own, only the children are serialized
| 153 | $child = $node->firstChild;
| 154 | while ( $child ) {
| 155 | self::serializeToString( $child, $options, $accum );
| 156 | $child = $child->nextSibling;
| 157 | }
| 158 | return;
| 159 | 
| 160 | case XML_TEXT_NODE:
| 161 | '@phan-var Text $node'; // @var Text $node
| 162 | $accum( strtr( $node->nodeValue, self::ENTITY_ENCODINGS_XML ), $node );
| 163 | return;
| 164 | 
| 165 | case XML_COMMENT_NODE:
| 166 | // According to
| 167 | // http://www.w3.org/TR/DOM-Parsing/#dfn-concept-serialize-xml
| 168 | // we could throw an exception here if node.data would not create
| 169 | // a "well-formed" XML comment. But we use entity encoding when
| 170 | // we create the comment node to ensure that node.data will always
| 171 | // be okay; see DOMUtils.encodeComment().
| 172 | '@phan-var Comment $node'; // @var Comment $node
| 173 | $accum( '<!--' . $node->nodeValue . '-->', $node );
| 174 | return;
| 175 | 
| 176 | default:
| 177 | $accum( '??' . DOMUtils::nodeName( $node ), $node ); // any other node type: emit a '??' marker followed by the node name
| 178 | }
| 179 | }
| 180 | |
| 181 | /**
| 182 | * Add data to an output/memory array (used when serialize() was called with the
| 183 | * captureOffsets flag).
| 184 | * @param array &$out Output array, see {@link self::serialize()} for details on the
| 185 | * 'html' and 'offsets' fields. The other fields (positions are 0-based
| 186 | * and refer to UTF-8 byte indices):
| 187 | * - start: position in the HTML of the end of the opening tag of <body>
| 188 | * - last: (Node) last "about sibling" of the currently processed element
| 189 | * (see {@link WTUtils::getAboutSiblings()})
| 190 | * - uid: the id attribute value currently used as the key into 'offsets'
| 191 | * @param string $bit A piece of the HTML string
| 192 | * @param Node $node The DOM node $bit is a part of
| 193 | * @param ?string $flag 'start' when receiving the final part of the opening tag
| 194 | * of an element, 'end' when receiving the final part of the closing tag of an element
| 195 | * or the final part of a self-closing element.
| 196 | */
| 197 | private static function accumOffsets(
| 198 | array &$out, string $bit, Node $node, ?string $flag = null
| 199 | ): void {
| 200 | if ( DOMUtils::atTheTop( $node ) ) {
| 201 | $out['html'] .= $bit;
| 202 | if ( $flag === 'start' ) {
| 203 | $out['start'] = strlen( $out['html'] );
| 204 | } elseif ( $flag === 'end' ) {
| 205 | $out['start'] = null;
| 206 | $out['uid'] = null;
| 207 | }
| 208 | } elseif (
| 209 | !( $node instanceof Element ) || $out['start'] === null ||
| 210 | !DOMUtils::atTheTop( $node->parentNode )
| 211 | ) {
| 212 | // In case you're wondering, $out['start'] may never be set if body
| 213 | // isn't a child of the node passed to serializeToString, or if it
| 214 | // is the node itself but $options['innerXML'] is true.
| 215 | $out['html'] .= $bit;
| 216 | if ( $out['uid'] !== null ) {
| 217 | $out['offsets'][$out['uid']]['html'][1] += strlen( $bit );
| 218 | }
| 219 | } else {
| 220 | $newUid = DOMCompat::getAttribute( $node, 'id' );
| 221 | // Encapsulated siblings don't have generated ids (but may have an id),
| 222 | // so associate them with preceding content.
| 223 | if ( $newUid && $newUid !== $out['uid'] && !$out['last'] ) {
| 224 | if ( !WTUtils::isEncapsulationWrapper( $node ) ) {
| 225 | $out['uid'] = $newUid;
| 226 | } elseif ( WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
| 227 | $about = DOMCompat::getAttribute( $node, 'about' );
| 228 | $aboutSiblings = WTUtils::getAboutSiblings( $node, $about );
| 229 | $out['last'] = end( $aboutSiblings );
| 230 | $out['uid'] = $newUid;
| 231 | }
| 232 | }
| 233 | if ( $out['last'] === $node && $flag === 'end' ) {
| 234 | $out['last'] = null;
| 235 | }
| 236 | Assert::invariant( $out['uid'] !== null, 'uid cannot be null' );
| 237 | if ( !isset( $out['offsets'][$out['uid']] ) ) {
| 238 | $dt = strlen( $out['html'] ) - $out['start']; // current output length measured from $out['start']
| 239 | $out['offsets'][$out['uid']] = [ 'html' => [ $dt, $dt ] ];
| 240 | }
| 241 | $out['html'] .= $bit;
| 242 | $out['offsets'][$out['uid']]['html'][1] += strlen( $bit );
| 243 | }
| 244 | }
| 245 | |
| 246 | /**
| 247 |  * Produce the XHTML serialization of a DOM3 node as a string.
| 248 |  *
| 249 |  * @param Node $node Node to serialize; a Document is replaced by its documentElement.
| 250 |  * @param array $options Optional settings; any key left out takes the default shown:
| 251 |  *  - smartQuote (bool, default true): use single quotes for attributes when that's less escaping
| 252 |  *  - innerXML (bool, default false): emit only the children of $node, not $node itself
| 253 |  *  - captureOffsets (bool, default false): also collect tag position data (see the return value)
| 254 |  *  - addDoctype (bool, default true): put "<!DOCTYPE html>\n" in front of the output when
| 255 |  *    the serialized node is an <html> element and innerXML is off
| 256 |  *  - noSideEffects (bool, default false): copy the NodeData into JSON attributes; meant for
| 257 |  *    debugging only, the normal code path is DOMDataUtils::storeDataAttribs()
| 258 |  *  - keepTmp (bool, default false): include DataParsoid::$tmp when saving data
| 259 |  *  - storeDiffMark (bool, default false): passed with keepTmp to DOMDataUtils::dumpRichAttribs()
| 260 |  *  - sortAttrs (bool, default false): order attributes by name so that non-semantic
| 261 |  *    differences in test cases don't cause spurious failures
| 262 |  * @return array Result with these keys:
| 263 |  *  - html: the serialized HTML
| 264 |  *  - offsets: only when captureOffsets is set; [ $uid => [ 'html' => [ $start, $end ] ], ... ]
| 265 |  *    where $uid is the element's Parsoid ID, $start is the 0-based index of the element's
| 266 |  *    first character and $end is the index of the first character of the opening tag of
| 267 |  *    the next sibling element, or the index of the last character of the element's closing
| 268 |  *    tag if there is no next sibling. Positions are relative to the end of the opening
| 269 |  *    <body> tag (the DOCTYPE header is not counted).
| 270 |  */
| 271 | public static function serialize( Node $node, array $options = [] ): array {
| 272 |     $options += [
| 273 |         'smartQuote' => true, 'innerXML' => false,
| 274 |         'captureOffsets' => false, 'addDoctype' => true,
| 275 |         'noSideEffects' => false, 'keepTmp' => false,
| 276 |         'storeDiffMark' => false, 'sortAttrs' => false,
| 277 |     ];
| 278 |     $wantOffsets = $options['captureOffsets'];
| 279 |     $root = $node instanceof Document ? $node->documentElement : $node;
| 280 |     // 'start', 'uid' and 'last' are scratch state used by accumOffsets() only.
| 281 |     $out = [ 'html' => '', 'offsets' => [], 'start' => null, 'uid' => null, 'last' => null ];
| 282 |     if ( $wantOffsets ) {
| 283 |         $accum = function ( string $bit, Node $node, ?string $flag = null ) use ( &$out ): void {
| 284 |             self::accumOffsets( $out, $bit, $node, $flag );
| 285 |         };
| 286 |     } else {
| 287 |         $accum = static function ( string $bit ) use ( &$out ): void {
| 288 |             $out['html'] .= $bit;
| 289 |         };
| 290 |     }
| 291 | 
| 292 |     if ( !$options['innerXML'] ) {
| 293 |         self::serializeToString( $root, $options, $accum );
| 294 |         // Ensure there's a doctype for documents.
| 295 |         if ( DOMUtils::nodeName( $root ) === 'html' && $options['addDoctype'] ) {
| 296 |             $out['html'] = "<!DOCTYPE html>\n" . $out['html'];
| 297 |         }
| 298 |     } else {
| 299 |         $child = $root->firstChild;
| 300 |         while ( $child ) {
| 301 |             self::serializeToString( $child, $options, $accum );
| 302 |             $child = $child->nextSibling;
| 303 |         }
| 304 |     }
| 305 |     // Strip the scratch state before handing the result back.
| 306 |     unset( $out['start'], $out['uid'], $out['last'] );
| 307 |     if ( !$wantOffsets ) {
| 308 |         unset( $out['offsets'] );
| 309 |     }
| 310 |     return $out;
| 311 | }
| 312 | |
| 313 | } |
| 314 | |
| 315 | // This class was renamed in Parsoid v0.21.0; register XMLSerializer (same namespace) as an alias of it.
| 316 | class_alias( XHtmlSerializer::class, XMLSerializer::class );