Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
35.05% |
68 / 194 |
|
0.00% |
0 / 9 |
CRAP | |
0.00% |
0 / 1 |
DOMDiff | |
35.05% |
68 / 194 |
|
0.00% |
0 / 9 |
1533.00 | |
0.00% |
0 / 1 |
nextAnalyzableSibling | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
debug | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
__construct | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
2 | |||
diff | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
treeEquals | |
0.00% |
0 / 37 |
|
0.00% |
0 / 1 |
380 | |||
doDOMDiff | |
78.16% |
68 / 87 |
|
0.00% |
0 / 1 |
36.17 | |||
subtreeDiffers | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
110 | |||
markNode | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
56 | |||
debugOut | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Assert\UnreachableException; |
8 | use Wikimedia\Parsoid\Config\Env; |
9 | use Wikimedia\Parsoid\DOM\Comment; |
10 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
11 | use Wikimedia\Parsoid\DOM\Element; |
12 | use Wikimedia\Parsoid\DOM\Node; |
13 | use Wikimedia\Parsoid\DOM\Text; |
14 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
15 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
16 | use Wikimedia\Parsoid\Utils\DOMCompat; |
17 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
18 | use Wikimedia\Parsoid\Utils\DOMUtils; |
19 | use Wikimedia\Parsoid\Utils\PHPUtils; |
20 | use Wikimedia\Parsoid\Utils\WTUtils; |
21 | |
22 | /** |
23 | * A DOM diff helper class. |
24 | * |
25 | * Compares two DOMs and annotates a copy of the passed-in DOM with change |
26 | * information for the selective serializer. |
27 | */ |
28 | class DOMDiff { |
29 | |
30 | // These attributes are ignored for equality purposes if they are added to a node. |
31 | private const IGNORE_ATTRIBUTES = [ |
32 | // Note that we are explicitly not ignoring data-parsoid even though clients |
33 | // would never modify data-parsoid because SelectiveSerializer is wrapping text |
34 | // nodes in spans and speculatively computes DSR offsets for these span tags |
35 | // which are accurate for original DOM and may be inaccurate for the edited DOM. |
36 | // By diffing data-parsoid which diffs the DSR as well, we ensure we mark such |
37 | // nodes as modified and prevent use of those speculatively computed incorrect |
38 | // DSR values. |
39 | 'data-parsoid-diff', |
40 | 'about', |
41 | DOMDataUtils::DATA_OBJECT_ATTR_NAME, |
42 | ]; |
43 | |
44 | /** |
45 | * @var Env |
46 | */ |
47 | public $env; |
48 | |
49 | /** @var ParsoidExtensionAPI */ |
50 | public $extApi; |
51 | |
52 | /** |
53 | * @var array |
54 | */ |
55 | public $specializedAttribHandlers; |
56 | |
57 | public bool $skipEncapsulatedContent = true; |
58 | |
59 | private function nextAnalyzableSibling( Node $node ): ?Node { |
60 | if ( WTUtils::isEncapsulationWrapper( $node ) && $this->skipEncapsulatedContent ) { |
61 | return WTUtils::skipOverEncapsulatedContent( $node ); |
62 | } |
63 | return $node->nextSibling; |
64 | } |
65 | |
66 | /** |
67 | * @param mixed ...$args |
68 | */ |
69 | private function debug( ...$args ): void { |
70 | $this->env->log( 'trace/domdiff', ...$args ); |
71 | } |
72 | |
73 | public function __construct( Env $env ) { |
74 | $this->env = $env; |
75 | $this->extApi = new ParsoidExtensionAPI( $env ); |
76 | $this->specializedAttribHandlers = [ |
77 | 'data-mw' => static function ( $nodeA, $dmwA, $nodeB, $dmwB ) { |
78 | return $dmwA == $dmwB; |
79 | }, |
80 | 'data-parsoid' => static function ( $nodeA, $dpA, $nodeB, $dpB ) { |
81 | return $dpA == $dpB; |
82 | }, |
83 | ]; |
84 | } |
85 | |
86 | /** |
87 | * Diff two HTML documents, and add / update data-parsoid-diff attributes with |
88 | * change information. |
89 | * |
90 | * @param Element $nodeA |
91 | * @param Element $nodeB |
92 | * @return array |
93 | */ |
94 | public function diff( Element $nodeA, Element $nodeB ): array { |
95 | Assert::invariant( |
96 | $nodeA->ownerDocument !== $nodeB->ownerDocument, |
97 | 'Expected to be diff\'ing different documents.' |
98 | ); |
99 | |
100 | $this->debug( static function () use( $nodeA, $nodeB ) { |
101 | return "ORIG:\n" . |
102 | DOMCompat::getOuterHTML( $nodeA ) . |
103 | "\nNEW :\n" . |
104 | DOMCompat::getOuterHTML( $nodeB ); |
105 | } ); |
106 | |
107 | // The root nodes are equal, call recursive differ |
108 | $foundChange = $this->doDOMDiff( $nodeA, $nodeB ); |
109 | return [ 'isEmpty' => !$foundChange ]; |
110 | } |
111 | |
112 | /** |
113 | * Test if two DOM nodes are equal. |
114 | * |
115 | * @param Node $nodeA |
116 | * @param Node $nodeB |
117 | * @param bool $deep |
118 | * @return bool |
119 | */ |
120 | public function treeEquals( Node $nodeA, Node $nodeB, bool $deep ): bool { |
121 | if ( $nodeA->nodeType !== $nodeB->nodeType ) { |
122 | return false; |
123 | } elseif ( $nodeA instanceof Text ) { |
124 | // In the past we've had bugs where we let non-primitive strings |
125 | // leak into our DOM. Safety first: |
126 | Assert::invariant( $nodeA->nodeValue === (string)$nodeA->nodeValue, '' ); |
127 | Assert::invariant( $nodeB->nodeValue === (string)$nodeB->nodeValue, '' ); |
128 | // ok, now do the comparison. |
129 | return $nodeA->nodeValue === $nodeB->nodeValue; |
130 | } elseif ( $nodeA instanceof Comment ) { |
131 | return WTUtils::decodeComment( $nodeA->nodeValue ) === |
132 | WTUtils::decodeComment( $nodeB->nodeValue ); |
133 | } elseif ( $nodeA instanceof Element || $nodeA instanceof DocumentFragment ) { |
134 | if ( $nodeA instanceof DocumentFragment ) { |
135 | if ( !( $nodeB instanceof DocumentFragment ) ) { |
136 | return false; |
137 | } |
138 | } else { // $nodeA instanceof Element |
139 | // Compare node name and attribute length |
140 | if ( |
141 | !( $nodeB instanceof Element ) || |
142 | DOMCompat::nodeName( $nodeA ) !== DOMCompat::nodeName( $nodeB ) || |
143 | !DiffUtils::attribsEquals( |
144 | $nodeA, |
145 | $nodeB, |
146 | self::IGNORE_ATTRIBUTES, |
147 | $this->specializedAttribHandlers |
148 | ) |
149 | ) { |
150 | return false; |
151 | } |
152 | } |
153 | |
154 | // Passed all tests, node itself is equal. |
155 | if ( $deep ) { |
156 | $childA = null; |
157 | $childB = null; |
158 | // Compare # of children, since that's fast. |
159 | // (Avoid de-optimizing DOM by using node#childNodes) |
160 | for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild; |
161 | $childA && $childB; |
162 | $childA = $childA->nextSibling, $childB = $childB->nextSibling |
163 | ) { |
164 | /* don't look inside children yet, just look at # of children */ |
165 | } |
166 | |
167 | if ( $childA || $childB ) { |
168 | return false; /* nodes have different # of children */ |
169 | } |
170 | // Now actually compare the child subtrees |
171 | for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild; |
172 | $childA && $childB; |
173 | $childA = $childA->nextSibling, $childB = $childB->nextSibling |
174 | ) { |
175 | if ( !$this->treeEquals( $childA, $childB, $deep ) ) { |
176 | return false; |
177 | } |
178 | } |
179 | } |
180 | |
181 | // Did not find a diff yet, so the trees must be equal. |
182 | return true; |
183 | } |
184 | throw new UnreachableException( 'we shouldn\'t get here' ); |
185 | } |
186 | |
187 | /** |
188 | * Diff two DOM trees by comparing them node-by-node. |
189 | * |
190 | * TODO: Implement something more intelligent like |
191 | * http://gregory.cobena.free.fr/www/Publications/%5BICDE2002%5D%20XyDiff%20-%20published%20version.pdf, |
192 | * which uses hash signatures of subtrees to efficiently detect moves / |
193 | * wrapping. |
194 | * |
195 | * Adds / updates a data-parsoid-diff structure with change information. |
196 | * |
197 | * Returns true if subtree is changed, false otherwise. |
198 | * |
199 | * TODO: |
200 | * Assume typical CSS white-space, so ignore ws diffs in non-pre content. |
201 | * |
202 | * @param Node $baseParentNode |
203 | * @param Node $newParentNode |
204 | * @return bool |
205 | */ |
206 | public function doDOMDiff( Node $baseParentNode, Node $newParentNode ): bool { |
207 | // Perform a relaxed version of the recursive treeEquals algorithm that |
208 | // allows for some minor differences and tries to produce a sensible diff |
209 | // marking using heuristics like look-ahead on siblings. |
210 | $baseNode = $baseParentNode->firstChild; |
211 | $newNode = $newParentNode->firstChild; |
212 | $lookaheadNode = null; |
213 | $foundDiffOverall = false; |
214 | |
215 | while ( $baseNode && $newNode ) { |
216 | $dontAdvanceNewNode = false; |
217 | $this->debugOut( $baseNode, $newNode ); |
218 | // shallow check first |
219 | if ( !$this->treeEquals( $baseNode, $newNode, false ) ) { |
220 | $this->debug( '-- not equal --' ); |
221 | $savedNewNode = $newNode; |
222 | $foundDiff = false; |
223 | |
224 | // Some simplistic look-ahead, currently limited to a single level |
225 | // in the DOM. |
226 | |
227 | // look-ahead in *new* DOM to detect insertions |
228 | if ( DiffDOMUtils::isContentNode( $baseNode ) ) { |
229 | $this->debug( '--lookahead in new dom--' ); |
230 | $lookaheadNode = $newNode->nextSibling; |
231 | while ( $lookaheadNode ) { |
232 | $this->debugOut( $baseNode, $lookaheadNode, 'new' ); |
233 | if ( DiffDOMUtils::isContentNode( $lookaheadNode ) && |
234 | $this->treeEquals( $baseNode, $lookaheadNode, true ) |
235 | ) { |
236 | // mark skipped-over nodes as inserted |
237 | $markNode = $newNode; |
238 | while ( $markNode !== $lookaheadNode ) { |
239 | $this->debug( '--found diff: inserted--' ); |
240 | $this->markNode( $markNode, DiffMarkers::INSERTED ); |
241 | $markNode = $markNode->nextSibling; |
242 | } |
243 | $foundDiff = true; |
244 | $newNode = $lookaheadNode; |
245 | break; |
246 | } |
247 | $lookaheadNode = self::nextAnalyzableSibling( $lookaheadNode ); |
248 | } |
249 | } |
250 | |
251 | // look-ahead in *base* DOM to detect deletions |
252 | if ( !$foundDiff && DiffDOMUtils::isContentNode( $newNode ) ) { |
253 | $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $baseNode ); |
254 | $this->debug( '--lookahead in old dom--' ); |
255 | $lookaheadNode = $baseNode->nextSibling; |
256 | while ( $lookaheadNode ) { |
257 | $this->debugOut( $lookaheadNode, $newNode, 'old' ); |
258 | if ( DiffDOMUtils::isContentNode( $lookaheadNode ) && |
259 | $this->treeEquals( $lookaheadNode, $newNode, true ) |
260 | ) { |
261 | $this->debug( '--found diff: deleted--' ); |
262 | // mark skipped-over nodes as deleted |
263 | $this->markNode( $newNode, DiffMarkers::DELETED, $isBlockNode ); |
264 | $baseNode = $lookaheadNode; |
265 | $foundDiff = true; |
266 | break; |
267 | } elseif ( !WTUtils::emitsSolTransparentSingleLineWT( $lookaheadNode ) ) { |
268 | // We only care about the deleted node prior to the one that |
269 | // gets a tree match (but, ignore nodes that show up in wikitext |
270 | // but don't affect sol-state or HTML rendering -- note that |
271 | // whitespace is being ignored, but that whitespace occurs |
272 | // between block nodes). |
273 | $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $lookaheadNode ); |
274 | } |
275 | $lookaheadNode = self::nextAnalyzableSibling( $lookaheadNode ); |
276 | } |
277 | } |
278 | |
279 | if ( !$foundDiff ) { |
280 | if ( !( $savedNewNode instanceof Element ) ) { |
281 | $this->debug( '--found diff: modified text/comment--' ); |
282 | $this->markNode( |
283 | $savedNewNode, DiffMarkers::DELETED, |
284 | WTUtils::isBlockNodeWithVisibleWT( $baseNode ) |
285 | ); |
286 | } elseif ( $baseNode instanceof Element && |
287 | DOMCompat::nodeName( $savedNewNode ) === DOMCompat::nodeName( $baseNode ) && |
288 | ( DOMDataUtils::getDataParsoid( $savedNewNode )->stx ?? null ) === |
289 | ( DOMDataUtils::getDataParsoid( $baseNode )->stx ?? null ) |
290 | ) { |
291 | // Identical wrapper-type, but modified. |
292 | // Mark modified-wrapper, and recurse. |
293 | $this->debug( '--found diff: modified-wrapper--' ); |
294 | $this->markNode( $savedNewNode, DiffMarkers::MODIFIED_WRAPPER ); |
295 | $this->subtreeDiffers( $baseNode, $savedNewNode ); |
296 | } else { |
297 | // We now want to compare current newNode with the next baseNode. |
298 | $dontAdvanceNewNode = true; |
299 | |
300 | // Since we are advancing in an old DOM without advancing |
301 | // in the new DOM, there were deletions. Add a deletion marker |
302 | // since this is important for accurate separator handling in selser. |
303 | $this->markNode( |
304 | $savedNewNode, DiffMarkers::DELETED, |
305 | WTUtils::isBlockNodeWithVisibleWT( $baseNode ) |
306 | ); |
307 | } |
308 | } |
309 | |
310 | // Record the fact that direct children changed in the parent node |
311 | $this->debug( '--found diff: children-changed--' ); |
312 | $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED ); |
313 | |
314 | $foundDiffOverall = true; |
315 | } elseif ( $this->subtreeDiffers( $baseNode, $newNode ) ) { |
316 | $foundDiffOverall = true; |
317 | } |
318 | |
319 | // And move on to the next pair (skipping over template HTML) |
320 | if ( $baseNode && $newNode ) { |
321 | $baseNode = self::nextAnalyzableSibling( $baseNode ); |
322 | if ( !$dontAdvanceNewNode ) { |
323 | $newNode = self::nextAnalyzableSibling( $newNode ); |
324 | } |
325 | } |
326 | } |
327 | |
328 | // mark extra new nodes as inserted |
329 | while ( $newNode ) { |
330 | $this->debug( '--found trailing new node: inserted--' ); |
331 | $this->markNode( $newNode, DiffMarkers::INSERTED ); |
332 | $foundDiffOverall = true; |
333 | $newNode = self::nextAnalyzableSibling( $newNode ); |
334 | } |
335 | |
336 | // If there are extra base nodes, something was deleted. Mark the parent as |
337 | // having lost children for now. |
338 | if ( $baseNode ) { |
339 | $this->debug( '--found trailing base nodes: deleted--' ); |
340 | $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED ); |
341 | // SSS FIXME: WTS checks for zero children in a few places |
342 | // That code would have to be upgraded if we emit mw:DiffMarker |
343 | // in this scenario. So, bailing out in this one case for now. |
344 | if ( $newParentNode->hasChildNodes() ) { |
345 | $meta = $newParentNode->ownerDocument->createElement( 'meta' ); |
346 | DOMUtils::addTypeOf( $meta, 'mw:DiffMarker/deleted' ); |
347 | if ( WTUtils::isBlockNodeWithVisibleWT( $baseNode ) ) { |
348 | $meta->setAttribute( 'data-is-block', 'true' ); |
349 | } |
350 | $newParentNode->appendChild( $meta ); |
351 | } |
352 | $foundDiffOverall = true; |
353 | } |
354 | |
355 | return $foundDiffOverall; |
356 | } |
357 | |
358 | /* *************************************************** |
359 | * Helpers |
360 | * ***************************************************/ |
361 | |
362 | /** |
363 | * @param Node $baseNode |
364 | * @param Node $newNode |
365 | * @return bool |
366 | */ |
367 | private function subtreeDiffers( Node $baseNode, Node $newNode ): bool { |
368 | $baseEncapsulated = WTUtils::isEncapsulationWrapper( $baseNode ); |
369 | $newEncapsulated = WTUtils::isEncapsulationWrapper( $newNode ); |
370 | |
371 | if ( !$baseEncapsulated && !$newEncapsulated ) { |
372 | $this->debug( '--shallow equal: recursing--' ); |
373 | // Recursively diff subtrees if not template-like content |
374 | $subtreeDiffers = $this->doDOMDiff( $baseNode, $newNode ); |
375 | } elseif ( $baseEncapsulated && $newEncapsulated ) { |
376 | '@phan-var Element $baseNode'; // @var Element $baseNode |
377 | '@phan-var Element $newNode'; // @var Element $newNode |
378 | |
379 | $ext = null; |
380 | |
381 | $baseExtTagName = WTUtils::getExtTagName( $baseNode ); |
382 | if ( $baseExtTagName ) { |
383 | $ext = $this->env->getSiteConfig()->getExtTagImpl( $baseExtTagName ); |
384 | } |
385 | |
386 | if ( $ext && ( $baseExtTagName === WTUtils::getExtTagName( $newNode ) ) ) { |
387 | $this->debug( '--diffing extension content--' ); |
388 | $subtreeDiffers = $ext->diffHandler( |
389 | $this->extApi, [ $this, 'doDOMDiff' ], $baseNode, $newNode |
390 | ); |
391 | } elseif ( $this->skipEncapsulatedContent ) { |
392 | // Otherwise, for encapsulated content, we don't know about the subtree. |
393 | $subtreeDiffers = false; |
394 | } else { |
395 | $this->debug( '--shallow equal (encapsulated): recursing--' ); |
396 | // Recursively diff subtrees if not template-like content |
397 | $subtreeDiffers = $this->doDOMDiff( $baseNode, $newNode ); |
398 | } |
399 | } else { |
400 | // FIXME: Maybe $editNode should be marked as inserted to avoid |
401 | // losing any edits, at the cost of more normalization. |
402 | // $state->inInsertedContent is only set when we're in inserted |
403 | // content, so not sure this is currently doing all that much. |
404 | $subtreeDiffers = true; |
405 | } |
406 | |
407 | if ( $subtreeDiffers ) { |
408 | $this->debug( '--found diff: subtree-changed--' ); |
409 | $this->markNode( $newNode, DiffMarkers::SUBTREE_CHANGED ); |
410 | } |
411 | return $subtreeDiffers; |
412 | } |
413 | |
414 | private function markNode( Node $node, string $mark, bool $blockNodeDeleted = false ): void { |
415 | $meta = DiffUtils::addDiffMark( $node, $this->env, $mark ); |
416 | |
417 | if ( $meta && $blockNodeDeleted ) { |
418 | $meta->setAttribute( 'data-is-block', 'true' ); |
419 | } |
420 | |
421 | if ( $mark === DiffMarkers::DELETED || $mark === DiffMarkers::INSERTED ) { |
422 | $this->markNode( $node->parentNode, DiffMarkers::CHILDREN_CHANGED ); |
423 | } |
424 | |
425 | // Clear out speculatively computed DSR values for data-mw-selser-wrapper nodes |
426 | // since they may be incorrect. This eliminates any inadvertent use of |
427 | // these incorrect values. |
428 | if ( $node instanceof Element && $node->hasAttribute( 'data-mw-selser-wrapper' ) ) { |
429 | DOMDataUtils::getDataParsoid( $node )->dsr = null; |
430 | } |
431 | } |
432 | |
433 | private function debugOut( Node $nodeA, Node $nodeB, string $laPrefix = '' ): void { |
434 | $prefix = 'trace/domdiff'; |
435 | $this->env->log( $prefix, |
436 | static fn () => '--> A' . $laPrefix . ':' . |
437 | ( $nodeA instanceof Element |
438 | ? DOMCompat::getOuterHTML( $nodeA ) |
439 | : PHPUtils::jsonEncode( $nodeA->nodeValue ) ) |
440 | ); |
441 | $this->env->log( $prefix, |
442 | static fn () => '--> B' . $laPrefix . ':' . |
443 | ( $nodeB instanceof Element |
444 | ? DOMCompat::getOuterHTML( $nodeB ) |
445 | : PHPUtils::jsonEncode( $nodeB->nodeValue ) ) |
446 | ); |
447 | } |
448 | } |