Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
39.18% |
67 / 171 |
|
14.29% |
1 / 7 |
CRAP | |
0.00% |
0 / 1 |
| CleanUp | |
39.18% |
67 / 171 |
|
14.29% |
1 / 7 |
1632.77 | |
0.00% |
0 / 1 |
| stripMarkerMetas | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
90 | |||
| isEmptyNode | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
110 | |||
| handleEmptyElements | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
132 | |||
| inNativeContent | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
| trimWhiteSpace | |
100.00% |
38 / 38 |
|
100.00% |
1 / 1 |
17 | |||
| finalCleanup | |
78.38% |
29 / 37 |
|
0.00% |
0 / 1 |
29.82 | |||
| markDiscardableDataParsoid | |
0.00% |
0 / 35 |
|
0.00% |
0 / 1 |
90 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Wt2Html\DOM\Handlers; |
| 5 | |
| 6 | use Wikimedia\Assert\Assert; |
| 7 | use Wikimedia\Parsoid\Config\Env; |
| 8 | use Wikimedia\Parsoid\Core\DomSourceRange; |
| 9 | use Wikimedia\Parsoid\DOM\Comment; |
| 10 | use Wikimedia\Parsoid\DOM\Element; |
| 11 | use Wikimedia\Parsoid\DOM\Node; |
| 12 | use Wikimedia\Parsoid\DOM\Text; |
| 13 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
| 14 | use Wikimedia\Parsoid\NodeData\TempData; |
| 15 | use Wikimedia\Parsoid\Utils\DOMCompat; |
| 16 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
| 17 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 18 | use Wikimedia\Parsoid\Utils\DTState; |
| 19 | use Wikimedia\Parsoid\Utils\Utils; |
| 20 | use Wikimedia\Parsoid\Utils\WTUtils; |
| 21 | use Wikimedia\Parsoid\Wikitext\Consts; |
| 22 | use Wikimedia\Parsoid\Wt2Html\TT\PreHandler; |
| 23 | |
| 24 | class CleanUp { |
/**
 * Drop marker <meta> nodes that are no longer needed in the DOM.
 *
 * Indent-pre whitespace markers are only recorded on the parent's DSR
 * here; they stay in the DOM and are removed later by ::finalCleanup().
 *
 * @param Element $node
 * @return bool|Element true to let the traversal continue normally, or
 *   the removed node's former next sibling when $node was removed.
 */
public static function stripMarkerMetas( Element $node ) {
	// This meta tag can never have data-mw associated with it.
	// If it were produced by a template, it would always have a <pre>
	// wrapper around which carries any relevant data-mw & typeof properties.
	if ( PreHandler::isIndentPreWS( $node ) ) {
		$parentDsr = DOMDataUtils::getDataParsoid( $node->parentNode )->dsr ?? null;
		if ( $parentDsr ) {
			// @see explanation in PreHandler::newIndentPreWS()
			$parentDsr->openWidth = 1;
		}
		// Keep the marker for now: DOM passes that run until the end
		// may still need DSR info from this tag.
		return true;
	}

	$removable = DOMUtils::hasTypeOf( $node, "mw:Placeholder/UnclosedComment" );

	// Sometimes a non-tpl meta node might get the mw:Transclusion typeof
	// attached to it. A node that carries data-mw has to be kept, so the
	// remaining typeof checks only apply when data-mw is empty.
	if ( !$removable && DOMDataUtils::getDataMw( $node )->isEmpty() ) {
		$removable =
			(
				DOMUtils::hasTypeOf( $node, 'mw:Placeholder/StrippedTag' ) &&
				// NOTE: In ComputeDSR, we don't zero out the width of these
				// markers because they're staying in the DOM and serializeDOMNode
				// only handles a few cases of zero width nodes.
				!DOMUtils::isNestedInListItem( $node )
			) ||
			DOMUtils::hasTypeOf( $node, 'mw:Transclusion' );
	}

	if ( !$removable ) {
		return true;
	}

	// Hand back the following sibling, since this node is no longer in the DOM.
	$following = $node->nextSibling;
	$node->parentNode->removeChild( $node );
	return $following;
}
| 70 | |
/**
 * Decide whether $node has only "empty" children. A child counts as empty
 * when it is one of:
 * - a comment,
 * - a text node made up solely of spaces, tabs, CR or LF,
 * - a rendering transparent element,
 * - a mw:Nowiki element or a Parsoid-added wrapper (TempData::WRAPPER)
 *   whose own children are all empty by these same rules.
 *
 * @param Node $node
 * @param bool &$hasRTNodes Set to true if a rendering transparent node was
 *   encountered; it is never reset to false here. Note this value is only
 *   reliable if ::isEmptyNode() returns true.
 * @return bool
 */
private static function isEmptyNode( Node $node, bool &$hasRTNodes ): bool {
	for ( $child = $node->firstChild; $child !== null; $child = $child->nextSibling ) {
		if ( $child instanceof Comment ) {
			continue;
		}

		if ( $child instanceof Text ) {
			if ( preg_match( '/^[ \t\r\n]*$/D', $child->nodeValue ) ) {
				continue;
			}
			return false;
		}

		if ( !( $child instanceof Element ) ) {
			// Any other kind of node makes the parent non-empty.
			return false;
		}

		if ( WTUtils::isRenderingTransparentNode( $child ) ) {
			$hasRTNodes = true;
			continue;
		}

		// Only nowiki spans and Parsoid-added wrappers are looked into.
		$mayRecurse = DOMUtils::hasTypeOf( $child, 'mw:Nowiki' ) ||
			DOMDataUtils::getDataParsoid( $child )->getTempFlag( TempData::WRAPPER );
		if ( !$mayRecurse || !self::isEmptyNode( $child, $hasRTNodes ) ) {
			return false;
		}
	}

	return true;
}
| 112 | |
/**
 * Template-wrapping attributes that can be ignored while looking for
 * empty elements. ::handleEmptyElements() only honours this list while
 * $state->tplInfo is set, and looks names up with isset(), so only the
 * keys matter.
 *
 * Note that data-mw & data-parsoid are unlikely to exist at this stage of
 * DOM processing. This is conservative but safe. In this case, it is also
 * sufficient since only p, li, tr can be deleted.
 */
public const ALLOWED_TPL_WRAPPER_ATTRS = [
	'about' => 1,
	'typeof' => 1,
];
| 121 | |
/**
 * Delete, or flag with the mw-empty-elt class, elements that are listed in
 * Consts::$Output['FlaggedEmptyElts'], are empty per ::isEmptyNode(), and
 * carry no attributes other than the ignorable ones.
 *
 * @param Node $node
 * @param DTState $state
 * @return bool|Node true to continue with $node->nextSibling, or the
 *   removed node's former next sibling when $node was deleted.
 */
public static function handleEmptyElements( Node $node, DTState $state ) {
	if ( !( $node instanceof Element ) ) {
		return true;
	}
	if ( !isset( Consts::$Output['FlaggedEmptyElts'][DOMCompat::nodeName( $node )] ) ) {
		return true;
	}

	// Set by isEmptyNode() to indicate whether a node which is "empty"
	// contained invisible "rendering transparent" nodes.
	$hasRTNodes = false;
	if ( !self::isEmptyNode( $node, $hasRTNodes ) ) {
		return true;
	}

	// Any attribute besides the Parsoid-added data attribute and (inside
	// template content) the template-wrapping attributes leaves the node alone.
	foreach ( DOMUtils::attributes( $node ) as $attrName => $unusedValue ) {
		$ignorable = $attrName === DOMDataUtils::DATA_OBJECT_ATTR_NAME ||
			( ( $state->tplInfo ?? null ) && isset( self::ALLOWED_TPL_WRAPPER_ATTRS[$attrName] ) );
		if ( !$ignorable ) {
			return true;
		}
	}

	// The node is known to be empty and a deletion candidate.
	// - If node is part of template content and is not the first
	//   encapsulation wrapper node, and doesn't contain any rendering
	//   transparent nodes, it can be deleted.
	// - If not, we add the mw-empty-elt class so that wikis can decide
	//   what to do with them.
	$deletable = $state->tplInfo &&
		$state->tplInfo->first !== $node &&
		!$hasRTNodes;

	if ( !$deletable ) {
		DOMCompat::getClassList( $node )->add( 'mw-empty-elt' );
		return true;
	}

	$following = $node->nextSibling;
	$node->parentNode->removeChild( $node );
	return $following;
}
| 170 | |
/**
 * Report whether $node, or any ancestor visited before DOMUtils::atTheTop()
 * holds, is recognised by WTUtils::getNativeExt().
 *
 * FIXME: Worry about "about" siblings
 *
 * @param Env $env
 * @param Element $node
 * @return bool
 */
private static function inNativeContent( Env $env, Element $node ): bool {
	for (
		$ancestor = $node;
		!DOMUtils::atTheTop( $ancestor );
		$ancestor = $ancestor->parentNode
	) {
		if ( WTUtils::getNativeExt( $env, $ancestor ) !== null ) {
			return true;
		}
	}
	return false;
}
| 187 | |
/**
 * Strip whitespace from the start and end of $node's content and record
 * how much was removed on $dsr (when one is given).
 *
 * Whitespace in this function refers to [ \t] only. Rendering transparent
 * children are stepped over; once whitespace has been trimmed past one of
 * them the count cannot be trusted and -1 is recorded instead.
 *
 * @param Element $node
 * @param ?DomSourceRange $dsr Receives leadingWS / trailingWS, measured
 *   with strlen() (i.e. bytes), or -1 when the amount is unreliable.
 */
private static function trimWhiteSpace( Element $node, ?DomSourceRange $dsr ): void {
	// ---- Leading whitespace (on the first line) ----
	$removedLen = 0;
	$lenIsReliable = true;
	$sawTransparent = false;

	$cur = $node->firstChild;
	while ( $cur ) {
		$following = $cur->nextSibling;
		if ( $cur instanceof Text && preg_match( '/^[ \t]*$/D', $cur->nodeValue ) ) {
			// Whitespace-only (or empty) text node: drop it entirely.
			$removedLen += strlen( $cur->nodeValue );
			$node->removeChild( $cur );
			$lenIsReliable = !$sawTransparent;
		} elseif ( WTUtils::isRenderingTransparentNode( $cur ) ) {
			// We are now skipping over a rendering transparent node
			// and will trim additional whitespace => we cannot reliably
			// maintain info about trimmed whitespace.
			$sawTransparent = true;
		} else {
			break;
		}
		$cur = $following;
	}

	// $cur is the first node the scan stopped on (or null). If it is text,
	// peel the whitespace prefix off it.
	if ( $cur instanceof Text &&
		preg_match( '/^([ \t]+)([\s\S]*)$/D', $cur->nodeValue, $parts )
	) {
		$lenIsReliable = !$sawTransparent;
		$cur->nodeValue = $parts[2];
		$removedLen += strlen( $parts[1] );
	}

	if ( $dsr ) {
		$dsr->leadingWS = $lenIsReliable ? $removedLen : -1;
	}

	// ---- Trailing whitespace (on the last line) ----
	$removedLen = 0;
	$lenIsReliable = true;
	$sawTransparent = false;

	$cur = $node->lastChild;
	while ( $cur ) {
		$preceding = $cur->previousSibling;
		if ( $cur instanceof Text && preg_match( '/^[ \t]*$/D', $cur->nodeValue ) ) {
			$removedLen += strlen( $cur->nodeValue );
			$node->removeChild( $cur );
			$lenIsReliable = !$sawTransparent;
		} elseif ( WTUtils::isRenderingTransparentNode( $cur ) ) {
			// Same caveat as above: the trimmed length stops being
			// trustworthy once a transparent node has been skipped.
			$sawTransparent = true;
		} else {
			break;
		}
		$cur = $preceding;
	}

	// Only whitespace that directly follows a non-whitespace character is
	// peeled off the text node the backwards scan stopped on.
	if ( $cur instanceof Text &&
		preg_match( '/^([\s\S]*\S)([ \t]+)$/D', $cur->nodeValue, $parts )
	) {
		$lenIsReliable = !$sawTransparent;
		$cur->nodeValue = $parts[1];
		$removedLen += strlen( $parts[2] );
	}

	if ( $dsr ) {
		$dsr->trailingWS = $lenIsReliable ? $removedLen : -1;
	}
}
| 258 | |
/**
 * Last-stage tidy-up of an element and its data-parsoid:
 * prunes autoInsertedEnd / src / tsr / null dsr, zeroes the DSR range of
 * fostered content, unwraps nowiki spans inside template content, strips
 * indent-pre marker metas and trims whitespace in trimmable wikitext tags.
 *
 * @param Node $node
 * @param DTState $state
 * @return bool|Node The next node or true to continue with $node->nextSibling
 */
public static function finalCleanup( Node $node, DTState $state ) {
	if ( !( $node instanceof Element ) ) {
		return true;
	}

	Assert::invariant( $state->atTopLevel, 'This pass should only be run on the top-level' );

	$dp = DOMDataUtils::getDataParsoid( $node );
	$tagName = DOMCompat::nodeName( $node );

	// Delete from data parsoid, wikitext originating autoInsertedEnd info
	if ( !empty( $dp->autoInsertedEnd ) &&
		!WTUtils::hasLiteralHTMLMarker( $dp ) &&
		isset( Consts::$WTTagsWithNoClosingTags[$tagName] )
	) {
		unset( $dp->autoInsertedEnd );
	}

	// Traversal isn't done with tplInfo for section tags, but we should
	// still clean them up as if they are the head of encapsulation.
	$headsEncapsulation = ( $state->tplInfo->first ?? null ) === $node ||
		WTUtils::isParsoidSectionTag( $node );

	// Remove dp.src from elements that have non-empty data-mw and dsr.
	// This should reduce data-parsoid bloat.
	//
	// Presence of data-mw is a proxy for us knowing how to serialize
	// this content from HTML. Token handlers should strip src for
	// content where data-mw isn't necessary and html2wt knows how to
	// handle the HTML markup.
	$hasDsrAndDataMw = Utils::isValidDSR( $dp->dsr ?? null ) &&
		!DOMDataUtils::getDataMw( $node )->isEmpty();
	$isPageProp = $tagName === 'meta' &&
		str_starts_with( DOMCompat::getAttribute( $node, 'property' ) ?? '', 'mw:PageProp/' );

	// Transcluded nodes will not have dp.tsr set and don't need dp.src either.
	$srcIsRedundant = ( $hasDsrAndDataMw && !$isPageProp ) ||
		( $headsEncapsulation && empty( $dp->tsr ) );
	if ( $srcIsRedundant ) {
		unset( $dp->src );
	}

	// Remove tsr
	if ( property_exists( $dp, 'tsr' ) ) {
		unset( $dp->tsr );
	}

	// Various places, like ContentUtils::shiftDSR, can set this to `null`
	if ( property_exists( $dp, 'dsr' ) && $dp->dsr === null ) {
		unset( $dp->dsr );
	}

	// Make dsr zero-range for fostered content
	// to prevent selser from duplicating this content
	// outside the table from where this came.
	//
	// But, do not zero it out if the node has template encapsulation
	// information. That will be disastrous (see T54638, T54488).
	$collapseDsr = !empty( $dp->fostered ) && !empty( $dp->dsr ) && !$headsEncapsulation;
	if ( $collapseDsr ) {
		$dp->dsr->start = $dp->dsr->end;
	}

	// Strip nowiki spans from encapsulated content but leave behind
	// wrappers on root nodes since they have valid about ids and we
	// don't want to break the about-chain by stripping the wrapper
	// and associated ids (we cannot add an about id on the nowiki-ed
	// content since that would be a text node).
	if ( ( $state->tplInfo ?? null ) &&
		!WTUtils::isEncapsulatedDOMForestRoot( $node ) &&
		DOMUtils::hasTypeOf( $node, 'mw:Nowiki' )
	) {
		$container = $node->parentNode;
		DOMUtils::migrateChildren( $node, $container, $node->nextSibling );
		// Read nextSibling only after the migration.
		// NOTE(review): presumably this is then the first migrated child,
		// so the traversal visits the unwrapped content - confirm against
		// DOMUtils::migrateChildren().
		$resumeAt = $node->nextSibling;
		$container->removeChild( $node );
		return $resumeAt;
	}

	// Strip IndentPre marker metas
	if ( PreHandler::isIndentPreWS( $node ) ) {
		$resumeAt = $node->nextSibling;
		$node->parentNode->removeChild( $node );
		return $resumeAt;
	}

	// Trim whitespace from some wikitext markup
	// not involving explicit HTML tags (T157481)
	$trimmable = !WTUtils::hasLiteralHTMLMarker( $dp ) &&
		isset( Consts::$WikitextTagsWithTrimmableWS[$tagName] );
	if ( $trimmable ) {
		self::trimWhiteSpace( $node, $dp->dsr ?? null );
	}

	return true;
}
| 356 | |
/**
 * Mark which data-parsoid attributes can be discarded.
 *
 * For elements inside templated content that do not need their
 * data-parsoid, the stored object is replaced with a fresh DataParsoid
 * flagged TempData::IS_NEW.
 *
 * @param Node $node
 * @param DTState $state
 * @return bool|Node The next node or true to continue with $node->nextSibling
 *   (this handler always returns true).
 */
public static function markDiscardableDataParsoid( Node $node, DTState $state ) {
	if ( !( $node instanceof Element ) ) {
		return true;
	}
	Assert::invariant( $state->atTopLevel, 'This pass should only be run on the top-level' );

	$env = $state->env;
	$dp = DOMDataUtils::getDataParsoid( $node );

	// Traversal isn't done with tplInfo for section tags, but we should
	// still clean them up as if they are the head of encapsulation.
	$headsEncapsulation = ( $state->tplInfo->first ?? null ) === $node ||
		WTUtils::isParsoidSectionTag( $node );

	// Only templated content is a candidate, and the first node always
	// keeps its info.
	if ( !( $state->tplInfo ?? null ) || $headsEncapsulation ) {
		return true;
	}

	// We can't remove data-parsoid from inside <references> text,
	// as that's the only HTML representation we have left for it.
	if ( self::inNativeContent( $env, $node ) ) {
		return true;
	}

	// FIXME(T100856): stx is semantic info and should probably be
	// moved out of data-parsoid. We can't remove dp from nodes
	// with stx information for two scenarios.
	//
	// 1. The serializer uses stx information in some cases to
	//    emit the right newline separators.
	//
	//    For example, "a\n\nb" and "<p>a</p><p>b</p>" both generate
	//    identical html but serialize to different wikitext.
	//
	//    This is only needed for the last top-level node.
	//
	// 2. We omit heading wrapping for html literals in core's
	//    OutputTransform stages and need a way to distinguish them.
	if ( !empty( $dp->stx ) && (
		( $state->tplInfo->last ?? null ) === $node ||
		DOMUtils::isHeading( $node )
	) ) {
		return true;
	}

	// Mark this as an empty AND new data-parsoid.
	//
	// We cannot unset data-parsoid because any code that runs after
	// this that calls DOMDataUtils::getDataParsoid will reinitialize
	// it to an empty object. So, we do that re-init here and set the
	// IS_NEW flag to ensure DOMDataUtils::storeDataAttribs discards this
	// if unmodified. The empty data-parsoid blob is considered unmodified.
	$blankDp = new DataParsoid;
	$blankDp->setTempFlag( TempData::IS_NEW );
	DOMDataUtils::setDataParsoid( $node, $blankDp );

	return true;
}
| 419 | |
| 420 | } |