Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 85 |
|
0.00% |
0 / 4 |
CRAP | |
0.00% |
0 / 1 |
SelectiveSerializer | |
0.00% |
0 / 85 |
|
0.00% |
0 / 4 |
992 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
wrapTextChildrenOfNode | |
0.00% |
0 / 55 |
|
0.00% |
0 / 1 |
552 | |||
preprocessDOMForSelser | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
serializeDOM | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
30 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use Composer\Semver\Semver; |
7 | use Wikimedia\Parsoid\Config\Env; |
8 | use Wikimedia\Parsoid\Core\DomSourceRange; |
9 | use Wikimedia\Parsoid\Core\SelectiveUpdateData; |
10 | use Wikimedia\Parsoid\DOM\Comment; |
11 | use Wikimedia\Parsoid\DOM\Document; |
12 | use Wikimedia\Parsoid\DOM\Element; |
13 | use Wikimedia\Parsoid\DOM\Text; |
14 | use Wikimedia\Parsoid\Utils\ContentUtils; |
15 | use Wikimedia\Parsoid\Utils\DOMCompat; |
16 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
17 | use Wikimedia\Parsoid\Utils\DOMUtils; |
18 | use Wikimedia\Parsoid\Utils\Timing; |
19 | use Wikimedia\Parsoid\Utils\Utils; |
20 | use Wikimedia\Parsoid\Utils\WTUtils; |
21 | use Wikimedia\Parsoid\Wikitext\Consts; |
22 | |
/**
 * Serializer that compares two versions of a DOM and re-uses the original
 * wikitext for the regions of the DOM that were not modified.
 *
 * Originally this relied on special change markers inserted by the
 * editor, but those markers are now generated here using DOMDiff.
 */
class SelectiveSerializer {

	private Env $env;
	private WikitextSerializer $wts;
	private SelectiveUpdateData $selserData;
	/** Whether the 'selser' trace flag is set on the environment. */
	private bool $trace;

	/**
	 * @param Env $env
	 * @param array $options Handed unchanged to the WikitextSerializer
	 *   constructor. The 'selserData' entry is read here and must hold a
	 *   SelectiveUpdateData instance.
	 */
	public function __construct( Env $env, array $options ) {
		$this->env = $env;
		$this->wts = new WikitextSerializer( $env, $options );
		$this->selserData = $options['selserData'];
		$this->trace = $env->hasTraceFlag( 'selser' );
	}

	/**
	 * Wrap text node children of nodes with name $nodeName in <span> tags and
	 * compute the DSR values for those span tags.
	 *
	 * This helps DOMDiff mark up diffs of content in these nodes at a more
	 * fine-grained level.
	 *
	 * These DSR values rely on availability of information about trimmed leading
	 * and trailing WS in these nodes in the wt->html direction. Given this info,
	 * on the original unedited DOM, the computed DSR values for span tags wrapping
	 * text nodes will be accurate.
	 *
	 * However, for the edited DOM with modified nodes, the computation is
	 * necessarily speculative and as such, the computed DSR values may be bogus.
	 * Given this, we rely on DOMDiff to diff the data-parsoid attribute and mark
	 * these nodes as modified because of the mismatched dsr values. If so, these
	 * span tags will never have selser reuse apply to them and the speculatively
	 * computed DSR values will be discarded.
	 *
	 * Processing of an element stops at the first child for which no reliable
	 * offset can be computed; children wrapped before that point stay wrapped.
	 *
	 * @param Element $body
	 * @param string $nodeName
	 */
	private function wrapTextChildrenOfNode( Element $body, string $nodeName ): void {
		// Note that while it might seem that only the first and last child need to be
		// wrapped, when nested list items are added, the previously last child of
		// a list item becomes an intermediate child in the new DOM. Without the span
		// wrapper, trailing trimmed whitespace gets dropped.
		$inListItem = isset( Consts::$HTML['ListItemTags'][$nodeName] );
		$doc = $body->ownerDocument;

		foreach ( DOMCompat::querySelectorAll( $body, $nodeName ) as $elt ) {
			if ( WTUtils::isLiteralHTMLNode( $elt ) ) {
				continue;
			}

			// Skip items with about id => part of templates / extensions like Cite
			// CAVEAT: In some cases, this might be bailing out a little too early.
			// For example, where certain extensions might actually support nested DSR
			// values inside and where <li> items in them might benefit. But, given that
			// so far, such extensions are more the exception than the norm, we will take
			// the easy way out here and revisit this if dirty diffs for those <li> items
			// merit further action in the future.
			if ( $elt->hasAttribute( 'about' ) ) {
				continue;
			}

			// No point wrapping text nodes if there is no usable DSR
			$eltDSR = DOMDataUtils::getDataParsoid( $elt )->dsr ?? null;
			if ( !Utils::isValidDSR( $eltDSR ) ) {
				continue;
			}

			$c = $elt->firstChild;
			$start = $eltDSR->innerStart();
			if ( !$c ) {
				// Nothing to wrap
				continue;
			}
			if ( !$eltDSR->hasValidLeadingWS() ) {
				// We don't have accurate information about the length of trimmed WS.
				// So, we cannot compute the offset of the first child and therefore
				// cannot wrap any text node of this element with a <span>.
				continue;
			}
			// The first child starts after the trimmed leading whitespace
			$start += $eltDSR->leadingWS;

			while ( $c ) {
				// Captured up front: wrapping $c below moves it in the tree
				$next = $c->nextSibling;

				if ( $c instanceof Text ) {
					$text = $c->nodeValue;
					// Byte length, used directly as the source offset delta
					$len = strlen( $text );
					$numOfNls = 0;

					if ( $len > 0 && $text[$len - 1] === "\n" ) {
						// Don't wrap newlines since single-line-context handling will
						// convert these newlines into spaces and introduce dirty-diffs.
						// Leaving nls outside the wrapped text lets it be handled as
						// separator text and emitted appropriately.
						$text = rtrim( $text, "\n" );
						$numOfNls = $len - strlen( $text );
						$len -= $numOfNls;
					} elseif (
						!$next ||
						( $inListItem && DOMUtils::isList( $next ) && WTUtils::isNewElt( $next ) )
					) {
						// Detect last child of "original" item and tack on trailingWS width
						// to the contents of this text node. If this is a list item and
						// we added a nested list, that nested list will be the last item.
						//
						// Note that trailingWS is only captured for the last line, so if
						// the text ends in a newline (the branch above), we shouldn't
						// need to do this.
						if ( !$eltDSR->hasValidTrailingWS() ) {
							break;
						}
						$len += $eltDSR->trailingWS;
					}

					$span = $doc->createElement( 'span' );
					$span->setAttribute( 'data-mw-selser-wrapper', '' );
					$dp = DOMDataUtils::getDataParsoid( $span );
					$dp->dsr = new DomSourceRange( $start, $start + $len, 0, 0 );
					$start += $len;

					if ( $numOfNls > 0 ) {
						// The span receives the text without its trailing newlines;
						// the original text node stays behind it holding only the newlines.
						$elt->insertBefore( $span, $c );
						$span->appendChild( $doc->createTextNode( $text ) );
						$c->nodeValue = str_repeat( "\n", $numOfNls );
						$start += $numOfNls;
					} else {
						// Move the whole text node into the span
						$elt->replaceChild( $span, $c );
						$span->appendChild( $c );
					}
				} elseif ( $c instanceof Comment ) {
					$start += WTUtils::decodedCommentLength( $c );
				} elseif ( $c instanceof Element ) {
					// No point wrapping following text nodes if there won't be any usable DSR
					$cDSR = DOMDataUtils::getDataParsoid( $c )->dsr ?? null;
					if ( !Utils::isValidDSR( $cDSR ) ) {
						break;
					}
					$start = $cDSR->end;
					if ( $c->hasAttribute( 'about' ) ) {
						$next = WTUtils::skipOverEncapsulatedContent( $c );
					}
				}

				$c = $next;
			}
		}
	}

	/**
	 * Selser-specific preprocessing of a DOM before it is diffed.
	 *
	 * Only applied when the input content version satisfies '>=2.1.2'.
	 *
	 * @param Element $body
	 */
	private function preprocessDOMForSelser( Element $body ): void {
		if ( Semver::satisfies( $this->env->getInputContentVersion(), '>=2.1.2' ) ) {
			// Wrap text node children of <li> and <dd> elements in dummy spans
			$this->wrapTextChildrenOfNode( $body, 'li' );
			$this->wrapTextChildrenOfNode( $body, 'dd' );
		}
	}

	/**
	 * Selectively serialize an HTML DOM.
	 *
	 * WARNING: You probably want to use WikitextContentModelHandler::fromDOM instead.
	 *
	 * Both the passed document and the original revision DOM held in the
	 * selser data are preprocessed in place. If the diff between them is
	 * empty, the original revision text is returned as is.
	 *
	 * @param Document $doc
	 * @return string
	 */
	public function serializeDOM( Document $doc ): string {
		$body = DOMCompat::getBody( $doc );
		$oldBody = DOMCompat::getBody( $this->selserData->revDOM );

		// Preprocess DOMs - this is specific to selser
		$this->preprocessDOMForSelser( $oldBody );
		$this->preprocessDOMForSelser( $body );

		// Use provided diff-marked DOM (used during testing)
		// or generate one (used in production)
		$providedDiffDOM = $this->env->getDOMDiff();
		if ( $providedDiffDOM ) {
			$diff = [ 'isEmpty' => false ];
			$body = DOMCompat::getBody( $providedDiffDOM );
		} else {
			$domDiffTiming = Timing::start( $this->env->getSiteConfig() );
			$diff = ( new DOMDiff( $this->env ) )->diff( $oldBody, $body );
			$domDiffTiming->end( 'html2wt.selser.domDiff', 'html2wt_domDiff_seconds', [ 'wts' => 'selser' ] );
		}

		if ( $diff['isEmpty'] ) {
			// Nothing was modified, just re-use the original source
			return $this->selserData->revText;
		}

		if ( $this->trace || $this->env->hasDumpFlag( 'dom:post-dom-diff' ) ) {
			$options = [ 'storeDiffMark' => true ];
			$this->env->writeDump(
				ContentUtils::dumpDOM( $oldBody, 'OLD DOM ', $options ) . "\n" .
				ContentUtils::dumpDOM( $body, 'DOM after running DOMDiff', $options )
			);
		}

		// Call the WikitextSerializer to do our bidding
		return $this->wts->serializeDOM( $doc, true );
	}
}