Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 85 |
|
0.00% |
0 / 4 |
CRAP | |
0.00% |
0 / 1 |
| SelectiveSerializer | |
0.00% |
0 / 85 |
|
0.00% |
0 / 4 |
992 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
| wrapTextChildrenOfNode | |
0.00% |
0 / 55 |
|
0.00% |
0 / 1 |
552 | |||
| preprocessDOMForSelser | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
| serializeDOM | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
30 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Html2Wt; |
| 5 | |
| 6 | use Composer\Semver\Semver; |
| 7 | use Wikimedia\Parsoid\Config\Env; |
| 8 | use Wikimedia\Parsoid\Core\DomSourceRange; |
| 9 | use Wikimedia\Parsoid\Core\SelectiveUpdateData; |
| 10 | use Wikimedia\Parsoid\DOM\Comment; |
| 11 | use Wikimedia\Parsoid\DOM\Document; |
| 12 | use Wikimedia\Parsoid\DOM\Element; |
| 13 | use Wikimedia\Parsoid\DOM\Text; |
| 14 | use Wikimedia\Parsoid\Utils\ContentUtils; |
| 15 | use Wikimedia\Parsoid\Utils\DOMCompat; |
| 16 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
| 17 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 18 | use Wikimedia\Parsoid\Utils\Timing; |
| 19 | use Wikimedia\Parsoid\Utils\Utils; |
| 20 | use Wikimedia\Parsoid\Utils\WTUtils; |
| 21 | use Wikimedia\Parsoid\Wikitext\Consts; |
| 22 | |
/**
 * This is a Serializer class that will compare two versions of a DOM
 * and re-use the original wikitext for unmodified regions of the DOM.
 * Originally this relied on special change markers inserted by the
 * editor, but we now generate these ourselves using DOMDiff.
 */
class SelectiveSerializer {

	/** Environment consulted for trace/dump flags, input content version and DOMDiff. */
	private Env $env;

	/** Serializer that does the actual html -> wikitext conversion. */
	private WikitextSerializer $wts;

	/** Original revision data; this class reads its revDOM and revText. */
	private SelectiveUpdateData $selserData;

	/** Whether the 'selser' trace flag is set on the environment. */
	private bool $trace;

	/**
	 * @param Env $env
	 * @param array $options Passed through unchanged to WikitextSerializer.
	 *   Must contain a 'selserData' entry holding a SelectiveUpdateData.
	 */
	public function __construct( Env $env, array $options ) {
		$this->env = $env;
		$this->wts = new WikitextSerializer( $env, $options );
		$this->selserData = $options['selserData'];
		$this->trace = $this->env->hasTraceFlag( 'selser' );
	}

	/**
	 * Wrap text node children of nodes with name $nodeName in <span> tags and
	 * compute the DSR values for those span tags.
	 *
	 * This helps DOMDiff mark up diffs of content in these nodes at a more fine-grained level.
	 *
	 * These DSR values rely on availability of information about trimmed leading
	 * and trailing WS in these nodes in the wt->html direction. Given this info,
	 * on the original unedited DOM, the computed DSR values for span tags wrapping
	 * text nodes will be accurate.
	 *
	 * However, for the edited DOM with modified nodes, the computation is necessarily
	 * speculative and as such, the computed DSR values may be bogus. Given this,
	 * we rely on DOMDiff to diff the data-parsoid attribute and mark these nodes as
	 * modified because of the mismatched dsr values. If so, these span tags will never
	 * have selser reuse apply to them and the speculatively computed DSR values will
	 * be discarded.
	 *
	 * @param Element $body
	 * @param string $nodeName
	 */
	private function wrapTextChildrenOfNode( Element $body, string $nodeName ): void {
		// Note that while it might seem that only the first and last child need to be
		// wrapped, when nested list items are added, the previously last child of
		// a list item becomes an intermediate child in the new DOM. Without the span
		// wrapper, trailing trimmed whitespace gets dropped.
		$inListItem = isset( Consts::$HTML['ListItemTags'][$nodeName] );

		// Loop-invariant: all wrapper spans / text nodes are created from this document.
		$doc = $body->ownerDocument;

		foreach ( DOMCompat::querySelectorAll( $body, $nodeName ) as $elt ) {
			if ( WTUtils::isLiteralHTMLNode( $elt ) ) {
				continue;
			}

			// Skip items with about id => part of templates / extensions like Cite
			// CAVEAT: In some cases, this might be bailing out a little too early.
			// For example, where certain extensions might actually support nested DSR
			// values inside and where <li> items in them might benefit. But, given that
			// so far, such extensions are more the exception than the norm, we will take
			// the easy way out here and revisit this if dirty diffs for those <li> items
			// merit further action in the future.
			if ( $elt->hasAttribute( 'about' ) ) {
				continue;
			}

			// No point wrapping text nodes if there is no usable DSR
			$eltDSR = DOMDataUtils::getDataParsoid( $elt )->dsr ?? null;
			if ( !Utils::isValidDSR( $eltDSR ) ) {
				continue;
			}

			// $start tracks the source offset of the child currently being visited.
			$start = $eltDSR->innerStart();
			$c = $elt->firstChild;

			// The trimmed leading WS only affects the offset of the first child,
			// so account for it once, before walking the children.
			if ( $c !== null ) {
				if ( !$eltDSR->hasValidLeadingWS() ) {
					// We don't have accurate information about the length of trimmed WS.
					// So, we cannot wrap any text node of this element with a <span>.
					continue;
				}
				$start += $eltDSR->leadingWS;
			}

			while ( $c ) {
				// Capture the sibling up front: wrapping $c below moves it in the DOM.
				$next = $c->nextSibling;

				if ( $c instanceof Text ) {
					$text = $c->nodeValue;
					$len = strlen( $text );
					$nl = null;
					$numOfNls = 0;

					// Don't wrap newlines since single-line-context handling will convert these
					// newlines into spaces and introduce dirty-diffs. Leaving nls outside the
					// wrapped text lets it be handled as separator text and emitted appropriately.
					if ( $len > 0 && $text[$len - 1] === "\n" ) {
						$text = rtrim( $text, "\n" );
						$numOfNls = $len - strlen( $text );
						$nl = str_repeat( "\n", $numOfNls );
						$len -= $numOfNls;
					} elseif (
						!$next || (
							$inListItem && DOMUtils::isList( $next ) && WTUtils::isNewElt( $next )
						)
					) {
						// Detect last child of "original" item and tack on trailingWS width
						// to the contents of this text node. If this is a list item and
						// we added a nested list, that nested list will be the last item.
						//
						// Note that trailingWS is only captured for the last line, so if
						// the text ends in a newline (the "if" condition), we shouldn't need
						// to do this.
						if ( !$eltDSR->hasValidTrailingWS() ) {
							break;
						}
						$len += $eltDSR->trailingWS;
					}

					$span = $doc->createElement( 'span' );
					$span->setAttribute( 'data-mw-selser-wrapper', '' );
					$dp = DOMDataUtils::getDataParsoid( $span );
					$dp->dsr = new DomSourceRange( $start, $start + $len, 0, 0 );
					$start += $len;

					if ( $nl !== null ) {
						// Wrap only the non-newline prefix in a new text node; the
						// original node stays after the span and keeps the newlines.
						$elt->insertBefore( $span, $c );
						$span->appendChild( $doc->createTextNode( $text ) );
						$c->nodeValue = $nl;
						$start += $numOfNls;
					} else {
						// Move the original text node, unchanged, inside the span.
						$elt->replaceChild( $span, $c );
						$span->appendChild( $c );
					}
				} elseif ( $c instanceof Comment ) {
					$start += WTUtils::decodedCommentLength( $c );
				} elseif ( $c instanceof Element ) {
					// No point wrapping following text nodes if there won't be any usable DSR
					$cDSR = DOMDataUtils::getDataParsoid( $c )->dsr ?? null;
					if ( !Utils::isValidDSR( $cDSR ) ) {
						break;
					}
					$start = $cDSR->end;
					if ( $c->hasAttribute( 'about' ) ) {
						$next = WTUtils::skipOverEncapsulatedContent( $c );
					}
				}

				$c = $next;
			}
		}
	}

	/**
	 * Apply selser-specific DOM preprocessing to $body.
	 *
	 * Only acts when the input content version satisfies '>=2.1.2'.
	 *
	 * @param Element $body
	 */
	private function preprocessDOMForSelser( Element $body ): void {
		if ( Semver::satisfies( $this->env->getInputContentVersion(), '>=2.1.2' ) ) {
			// Wrap text node children of <li> and <dd> elements in dummy spans
			$this->wrapTextChildrenOfNode( $body, 'li' );
			$this->wrapTextChildrenOfNode( $body, 'dd' );
		}
	}

	/**
	 * Selectively serialize an HTML DOM.
	 *
	 * WARNING: You probably want to use WikitextContentModelHandler::fromDOM instead.
	 *
	 * @param Document $doc Document to serialize; its body is diffed against
	 *   the body of the selser data's revDOM.
	 * @return string The selser data's revText when the diff is empty, otherwise
	 *   the output of WikitextSerializer::serializeDOM().
	 */
	public function serializeDOM( Document $doc ): string {
		$body = DOMCompat::getBody( $doc );
		$oldBody = DOMCompat::getBody( $this->selserData->revDOM );

		// Preprocess DOMs - this is specific to selser
		$this->preprocessDOMForSelser( $oldBody );
		$this->preprocessDOMForSelser( $body );

		// Use provided diff-marked DOM (used during testing)
		// or generate one (used in production)
		$providedDiffDOM = $this->env->getDOMDiff();
		if ( $providedDiffDOM ) {
			$diff = [ 'isEmpty' => false ];
			// NOTE(review): from here on $body is only used for the debug dump below;
			// the serializer call still receives $doc. Confirm this is intended.
			$body = DOMCompat::getBody( $providedDiffDOM );
		} else {
			$domDiffTiming = Timing::start( $this->env->getSiteConfig() );
			$diff = ( new DOMDiff( $this->env ) )->diff( $oldBody, $body );
			$domDiffTiming->end( 'html2wt.selser.domDiff', 'html2wt_domDiff_seconds', [ 'wts' => 'selser' ] );
		}

		if ( $diff['isEmpty'] ) {
			// Nothing was modified, just re-use the original source
			return $this->selserData->revText;
		}

		if ( $this->trace || $this->env->hasDumpFlag( 'dom:post-dom-diff' ) ) {
			$options = [ 'storeDiffMark' => true ];
			$this->env->writeDump(
				ContentUtils::dumpDOM( $oldBody, 'OLD DOM ', $options ) . "\n" .
				ContentUtils::dumpDOM( $body, 'DOM after running DOMDiff', $options )
			);
		}

		// Call the WikitextSerializer to do our bidding
		return $this->wts->serializeDOM( $doc, true );
	}
}