Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
34.00% covered (danger)
34.00%
68 / 200
0.00% covered (danger)
0.00%
0 / 9
CRAP
0.00% covered (danger)
0.00%
0 / 1
DOMDiff
34.00% covered (danger)
34.00%
68 / 200
0.00% covered (danger)
0.00%
0 / 9
1648.33
0.00% covered (danger)
0.00%
0 / 1
 nextNonTemplateSibling
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
6
 debug
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 __construct
0.00% covered (danger)
0.00%
0 / 19
0.00% covered (danger)
0.00%
0 / 1
20
 diff
0.00% covered (danger)
0.00%
0 / 12
0.00% covered (danger)
0.00%
0 / 1
2
 treeEquals
0.00% covered (danger)
0.00%
0 / 37
0.00% covered (danger)
0.00%
0 / 1
380
 doDOMDiff
78.16% covered (warning)
78.16%
68 / 87
0.00% covered (danger)
0.00%
0 / 1
36.17
 subtreeDiffers
0.00% covered (danger)
0.00%
0 / 21
0.00% covered (danger)
0.00%
0 / 1
90
 markNode
0.00% covered (danger)
0.00%
0 / 7
0.00% covered (danger)
0.00%
0 / 1
56
 debugOut
0.00% covered (danger)
0.00%
0 / 13
0.00% covered (danger)
0.00%
0 / 1
12
1<?php
2declare( strict_types = 1 );
3
4namespace Wikimedia\Parsoid\Html2Wt;
5
6use Wikimedia\Assert\Assert;
7use Wikimedia\Assert\UnreachableException;
8use Wikimedia\Parsoid\Config\Env;
9use Wikimedia\Parsoid\DOM\Comment;
10use Wikimedia\Parsoid\DOM\DocumentFragment;
11use Wikimedia\Parsoid\DOM\Element;
12use Wikimedia\Parsoid\DOM\Node;
13use Wikimedia\Parsoid\DOM\Text;
14use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI;
15use Wikimedia\Parsoid\Utils\DiffDOMUtils;
16use Wikimedia\Parsoid\Utils\DOMCompat;
17use Wikimedia\Parsoid\Utils\DOMDataUtils;
18use Wikimedia\Parsoid\Utils\DOMUtils;
19use Wikimedia\Parsoid\Utils\PHPUtils;
20use Wikimedia\Parsoid\Utils\WTUtils;
21
22/**
23 * A DOM diff helper class.
24 *
25 * Compares two DOMs and annotates a copy of the passed-in DOM with change
26 * information for the selective serializer.
27 */
28class DOMDiff {
29
30    // These attributes are ignored for equality purposes if they are added to a node.
31    private const IGNORE_ATTRIBUTES = [
32        // Note that we are explicitly not ignoring data-parsoid even though clients
33        // would never modify data-parsoid because SelectiveSerializer is wrapping text
34        // nodes in spans and speculatively computes DSR offsets for these span tags
35        // which are accurate for original DOM and may be inaccurate for the edited DOM.
36        // By diffing data-parsoid which diffs the DSR as well, we ensure we mark such
37        // nodes as modified and prevent use of those speculatively computed incorrect
38        // DSR values.
39        'data-parsoid-diff',
40        'about',
41        DOMDataUtils::DATA_OBJECT_ATTR_NAME,
42    ];
43
44    /**
45     * @var Env
46     */
47    public $env;
48
49    /** @var ParsoidExtensionAPI */
50    public $extApi;
51
52    /**
53     * @var array
54     */
55    public $specializedAttribHandlers;
56
57    private function nextNonTemplateSibling( Node $node ): ?Node {
58        if ( WTUtils::isEncapsulationWrapper( $node ) ) {
59            return WTUtils::skipOverEncapsulatedContent( $node );
60        }
61        return $node->nextSibling;
62    }
63
64    /**
65     * @param mixed ...$args
66     */
67    private function debug( ...$args ): void {
68        $this->env->log( 'trace/domdiff', ...$args );
69    }
70
71    public function __construct( Env $env ) {
72        $this->env = $env;
73        $this->extApi = new ParsoidExtensionAPI( $env );
74        $this->specializedAttribHandlers = [
75            'data-mw' => static function ( $nodeA, $dmwA, $nodeB, $dmwB ) {
76                return $dmwA == $dmwB;
77            },
78            'data-parsoid' => static function ( $nodeA, $dpA, $nodeB, $dpB ) {
79                return $dpA == $dpB;
80            },
81            // TODO(T254502): This is added temporarily for backwards
82            // compatibility and can be removed when versions up to 2.1.0
83            // are no longer stored
84            'typeof' => static function ( $nodeA, $valA, $nodeB, $valB ) {
85                if ( $valA === $valB ) {
86                    return true;
87                } elseif ( $valA === 'mw:DisplaySpace' ) {
88                    return $valB === 'mw:DisplaySpace mw:Placeholder';
89                } elseif ( $valB === 'mw:DisplaySpace' ) {
90                    return $valA === 'mw:DisplaySpace mw:Placeholder';
91                } else {
92                    return false;
93                }
94            }
95        ];
96    }
97
98    /**
99     * Diff two HTML documents, and add / update data-parsoid-diff attributes with
100     * change information.
101     *
102     * @param Element $nodeA
103     * @param Element $nodeB
104     * @return array
105     */
106    public function diff( Element $nodeA, Element $nodeB ): array {
107        Assert::invariant(
108            $nodeA->ownerDocument !== $nodeB->ownerDocument,
109            'Expected to be diff\'ing different documents.'
110        );
111
112        $this->debug( static function () use( $nodeA, $nodeB ) {
113            return "ORIG:\n" .
114                DOMCompat::getOuterHTML( $nodeA ) .
115                "\nNEW :\n" .
116                DOMCompat::getOuterHTML( $nodeB );
117        } );
118
119        // The root nodes are equal, call recursive differ
120        $foundChange = $this->doDOMDiff( $nodeA, $nodeB );
121        return [ 'isEmpty' => !$foundChange ];
122    }
123
124    /**
125     * Test if two DOM nodes are equal.
126     *
127     * @param Node $nodeA
128     * @param Node $nodeB
129     * @param bool $deep
130     * @return bool
131     */
132    public function treeEquals( Node $nodeA, Node $nodeB, bool $deep ): bool {
133        if ( $nodeA->nodeType !== $nodeB->nodeType ) {
134            return false;
135        } elseif ( $nodeA instanceof Text ) {
136            // In the past we've had bugs where we let non-primitive strings
137            // leak into our DOM.  Safety first:
138            Assert::invariant( $nodeA->nodeValue === (string)$nodeA->nodeValue, '' );
139            Assert::invariant( $nodeB->nodeValue === (string)$nodeB->nodeValue, '' );
140            // ok, now do the comparison.
141            return $nodeA->nodeValue === $nodeB->nodeValue;
142        } elseif ( $nodeA instanceof Comment ) {
143            return WTUtils::decodeComment( $nodeA->nodeValue ) ===
144                WTUtils::decodeComment( $nodeB->nodeValue );
145        } elseif ( $nodeA instanceof Element || $nodeA instanceof DocumentFragment ) {
146            if ( $nodeA instanceof DocumentFragment ) {
147                if ( !( $nodeB instanceof DocumentFragment ) ) {
148                    return false;
149                }
150            } else {  // $nodeA instanceof Element
151                // Compare node name and attribute length
152                if (
153                    !( $nodeB instanceof Element ) ||
154                    DOMCompat::nodeName( $nodeA ) !== DOMCompat::nodeName( $nodeB ) ||
155                    !DiffUtils::attribsEquals(
156                        $nodeA,
157                        $nodeB,
158                        self::IGNORE_ATTRIBUTES,
159                        $this->specializedAttribHandlers
160                    )
161                ) {
162                    return false;
163                }
164            }
165
166            // Passed all tests, node itself is equal.
167            if ( $deep ) {
168                $childA = null;
169                $childB = null;
170                // Compare # of children, since that's fast.
171                // (Avoid de-optimizing DOM by using node#childNodes)
172                for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild;
173                    $childA && $childB;
174                    $childA = $childA->nextSibling, $childB = $childB->nextSibling
175                ) {
176                    /* don't look inside children yet, just look at # of children */
177                }
178
179                if ( $childA || $childB ) {
180                    return false; /* nodes have different # of children */
181                }
182                // Now actually compare the child subtrees
183                for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild;
184                    $childA && $childB;
185                    $childA = $childA->nextSibling, $childB = $childB->nextSibling
186                ) {
187                    if ( !$this->treeEquals( $childA, $childB, $deep ) ) {
188                        return false;
189                    }
190                }
191            }
192
193            // Did not find a diff yet, so the trees must be equal.
194            return true;
195        }
196        throw new UnreachableException( 'we shouldn\'t get here' );
197    }
198
199    /**
200     * Diff two DOM trees by comparing them node-by-node.
201     *
202     * TODO: Implement something more intelligent like
203     * http://gregory.cobena.free.fr/www/Publications/%5BICDE2002%5D%20XyDiff%20-%20published%20version.pdf,
204     * which uses hash signatures of subtrees to efficiently detect moves /
205     * wrapping.
206     *
207     * Adds / updates a data-parsoid-diff structure with change information.
208     *
209     * Returns true if subtree is changed, false otherwise.
210     *
211     * TODO:
212     * Assume typical CSS white-space, so ignore ws diffs in non-pre content.
213     *
214     * @param Node $baseParentNode
215     * @param Node $newParentNode
216     * @return bool
217     */
218    public function doDOMDiff( Node $baseParentNode, Node $newParentNode ): bool {
219        // Perform a relaxed version of the recursive treeEquals algorithm that
220        // allows for some minor differences and tries to produce a sensible diff
221        // marking using heuristics like look-ahead on siblings.
222        $baseNode = $baseParentNode->firstChild;
223        $newNode = $newParentNode->firstChild;
224        $lookaheadNode = null;
225        $foundDiffOverall = false;
226
227        while ( $baseNode && $newNode ) {
228            $dontAdvanceNewNode = false;
229            $this->debugOut( $baseNode, $newNode );
230            // shallow check first
231            if ( !$this->treeEquals( $baseNode, $newNode, false ) ) {
232                $this->debug( '-- not equal --' );
233                $savedNewNode = $newNode;
234                $foundDiff = false;
235
236                // Some simplistic look-ahead, currently limited to a single level
237                // in the DOM.
238
239                // look-ahead in *new* DOM to detect insertions
240                if ( DiffDOMUtils::isContentNode( $baseNode ) ) {
241                    $this->debug( '--lookahead in new dom--' );
242                    $lookaheadNode = $newNode->nextSibling;
243                    while ( $lookaheadNode ) {
244                        $this->debugOut( $baseNode, $lookaheadNode, 'new' );
245                        if ( DiffDOMUtils::isContentNode( $lookaheadNode ) &&
246                            $this->treeEquals( $baseNode, $lookaheadNode, true )
247                        ) {
248                            // mark skipped-over nodes as inserted
249                            $markNode = $newNode;
250                            while ( $markNode !== $lookaheadNode ) {
251                                $this->debug( '--found diff: inserted--' );
252                                $this->markNode( $markNode, DiffMarkers::INSERTED );
253                                $markNode = $markNode->nextSibling;
254                            }
255                            $foundDiff = true;
256                            $newNode = $lookaheadNode;
257                            break;
258                        }
259                        $lookaheadNode = self::nextNonTemplateSibling( $lookaheadNode );
260                    }
261                }
262
263                // look-ahead in *base* DOM to detect deletions
264                if ( !$foundDiff && DiffDOMUtils::isContentNode( $newNode ) ) {
265                    $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $baseNode );
266                    $this->debug( '--lookahead in old dom--' );
267                    $lookaheadNode = $baseNode->nextSibling;
268                    while ( $lookaheadNode ) {
269                        $this->debugOut( $lookaheadNode, $newNode, 'old' );
270                        if ( DiffDOMUtils::isContentNode( $lookaheadNode ) &&
271                            $this->treeEquals( $lookaheadNode, $newNode, true )
272                        ) {
273                            $this->debug( '--found diff: deleted--' );
274                            // mark skipped-over nodes as deleted
275                            $this->markNode( $newNode, DiffMarkers::DELETED, $isBlockNode );
276                            $baseNode = $lookaheadNode;
277                            $foundDiff = true;
278                            break;
279                        } elseif ( !WTUtils::emitsSolTransparentSingleLineWT( $lookaheadNode ) ) {
280                            // We only care about the deleted node prior to the one that
281                            // gets a tree match (but, ignore nodes that show up in wikitext
282                            // but don't affect sol-state or HTML rendering -- note that
283                            // whitespace is being ignored, but that whitespace occurs
284                            // between block nodes).
285                            $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $lookaheadNode );
286                        }
287                        $lookaheadNode = self::nextNonTemplateSibling( $lookaheadNode );
288                    }
289                }
290
291                if ( !$foundDiff ) {
292                    if ( !( $savedNewNode instanceof Element ) ) {
293                        $this->debug( '--found diff: modified text/comment--' );
294                        $this->markNode(
295                            $savedNewNode, DiffMarkers::DELETED,
296                            WTUtils::isBlockNodeWithVisibleWT( $baseNode )
297                        );
298                    } elseif ( $baseNode instanceof Element &&
299                        DOMCompat::nodeName( $savedNewNode ) === DOMCompat::nodeName( $baseNode ) &&
300                        ( DOMDataUtils::getDataParsoid( $savedNewNode )->stx ?? null ) ===
301                        ( DOMDataUtils::getDataParsoid( $baseNode )->stx ?? null )
302                    ) {
303                        // Identical wrapper-type, but modified.
304                        // Mark modified-wrapper, and recurse.
305                        $this->debug( '--found diff: modified-wrapper--' );
306                        $this->markNode( $savedNewNode, DiffMarkers::MODIFIED_WRAPPER );
307                        $this->subtreeDiffers( $baseNode, $savedNewNode );
308                    } else {
309                        // We now want to compare current newNode with the next baseNode.
310                        $dontAdvanceNewNode = true;
311
312                        // Since we are advancing in an old DOM without advancing
313                        // in the new DOM, there were deletions. Add a deletion marker
314                        // since this is important for accurate separator handling in selser.
315                        $this->markNode(
316                            $savedNewNode, DiffMarkers::DELETED,
317                            WTUtils::isBlockNodeWithVisibleWT( $baseNode )
318                        );
319                    }
320                }
321
322                // Record the fact that direct children changed in the parent node
323                $this->debug( '--found diff: children-changed--' );
324                $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED );
325
326                $foundDiffOverall = true;
327            } elseif ( $this->subtreeDiffers( $baseNode, $newNode ) ) {
328                $foundDiffOverall = true;
329            }
330
331            // And move on to the next pair (skipping over template HTML)
332            if ( $baseNode && $newNode ) {
333                $baseNode = self::nextNonTemplateSibling( $baseNode );
334                if ( !$dontAdvanceNewNode ) {
335                    $newNode = self::nextNonTemplateSibling( $newNode );
336                }
337            }
338        }
339
340        // mark extra new nodes as inserted
341        while ( $newNode ) {
342            $this->debug( '--found trailing new node: inserted--' );
343            $this->markNode( $newNode, DiffMarkers::INSERTED );
344            $foundDiffOverall = true;
345            $newNode = self::nextNonTemplateSibling( $newNode );
346        }
347
348        // If there are extra base nodes, something was deleted. Mark the parent as
349        // having lost children for now.
350        if ( $baseNode ) {
351            $this->debug( '--found trailing base nodes: deleted--' );
352            $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED );
353            // SSS FIXME: WTS checks for zero children in a few places
354            // That code would have to be upgraded if we emit mw:DiffMarker
355            // in this scenario. So, bailing out in this one case for now.
356            if ( $newParentNode->hasChildNodes() ) {
357                $meta = $newParentNode->ownerDocument->createElement( 'meta' );
358                DOMUtils::addTypeOf( $meta, 'mw:DiffMarker/deleted' );
359                if ( WTUtils::isBlockNodeWithVisibleWT( $baseNode ) ) {
360                    $meta->setAttribute( 'data-is-block', 'true' );
361                }
362                $newParentNode->appendChild( $meta );
363            }
364            $foundDiffOverall = true;
365        }
366
367        return $foundDiffOverall;
368    }
369
370    /* ***************************************************
371     * Helpers
372     * ***************************************************/
373
374    /**
375     * @param Node $baseNode
376     * @param Node $newNode
377     * @return bool
378     */
379    private function subtreeDiffers( Node $baseNode, Node $newNode ): bool {
380        $baseEncapsulated = WTUtils::isEncapsulationWrapper( $baseNode );
381        $newEncapsulated = WTUtils::isEncapsulationWrapper( $newNode );
382
383        if ( !$baseEncapsulated && !$newEncapsulated ) {
384            $this->debug( '--shallow equal: recursing--' );
385            // Recursively diff subtrees if not template-like content
386            $subtreeDiffers = $this->doDOMDiff( $baseNode, $newNode );
387        } elseif ( $baseEncapsulated && $newEncapsulated ) {
388            '@phan-var Element $baseNode';  // @var Element $baseNode
389            '@phan-var Element $newNode';  // @var Element $newNode
390
391            $ext = null;
392
393            $baseExtTagName = WTUtils::getExtTagName( $baseNode );
394            if ( $baseExtTagName ) {
395                $ext = $this->env->getSiteConfig()->getExtTagImpl( $baseExtTagName );
396            }
397
398            if ( $ext && ( $baseExtTagName === WTUtils::getExtTagName( $newNode ) ) ) {
399                $this->debug( '--diffing extension content--' );
400                $subtreeDiffers = $ext->diffHandler(
401                    $this->extApi, [ $this, 'doDOMDiff' ], $baseNode, $newNode
402                );
403            } else {
404                // Otherwise, for encapsulated content, we don't know about the subtree.
405                $subtreeDiffers = false;
406            }
407        } else {
408            // FIXME: Maybe $editNode should be marked as inserted to avoid
409            // losing any edits, at the cost of more normalization.
410            // $state->inInsertedContent is only set when we're in inserted
411            // content, so not sure this is currently doing all that much.
412            $subtreeDiffers = true;
413        }
414
415        if ( $subtreeDiffers ) {
416            $this->debug( '--found diff: subtree-changed--' );
417            $this->markNode( $newNode, DiffMarkers::SUBTREE_CHANGED );
418        }
419        return $subtreeDiffers;
420    }
421
422    private function markNode( Node $node, string $mark, bool $blockNodeDeleted = false ): void {
423        $meta = DiffUtils::addDiffMark( $node, $this->env, $mark );
424
425        if ( $meta && $blockNodeDeleted ) {
426            $meta->setAttribute( 'data-is-block', 'true' );
427        }
428
429        if ( $mark === DiffMarkers::DELETED || $mark === DiffMarkers::INSERTED ) {
430            $this->markNode( $node->parentNode, DiffMarkers::CHILDREN_CHANGED );
431        }
432
433        // Clear out speculatively computed DSR values for data-mw-selser-wrapper nodes
434        // since they may be incorrect. This eliminates any inadvertent use of
435        // these incorrect values.
436        if ( $node instanceof Element && $node->hasAttribute( 'data-mw-selser-wrapper' ) ) {
437            DOMDataUtils::getDataParsoid( $node )->dsr = null;
438        }
439    }
440
441    private function debugOut( Node $nodeA, Node $nodeB, string $laPrefix = '' ): void {
442        $prefix = 'trace/domdiff';
443        $this->env->log( $prefix,
444            static fn () => '--> A' . $laPrefix . ':' .
445                ( $nodeA instanceof Element
446                    ? DOMCompat::getOuterHTML( $nodeA )
447                    : PHPUtils::jsonEncode( $nodeA->nodeValue ) )
448        );
449        $this->env->log( $prefix,
450            static fn () => '--> B' . $laPrefix . ':' .
451                ( $nodeB instanceof Element
452                    ? DOMCompat::getOuterHTML( $nodeB )
453                    : PHPUtils::jsonEncode( $nodeB->nodeValue ) )
454        );
455    }
456}