Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
35.05% covered (danger)
35.05%
68 / 194
0.00% covered (danger)
0.00%
0 / 9
CRAP
0.00% covered (danger)
0.00%
0 / 1
DOMDiff
35.05% covered (danger)
35.05%
68 / 194
0.00% covered (danger)
0.00%
0 / 9
1533.00
0.00% covered (danger)
0.00%
0 / 1
 nextAnalyzableSibling
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
12
 debug
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 __construct
0.00% covered (danger)
0.00%
0 / 10
0.00% covered (danger)
0.00%
0 / 1
2
 diff
0.00% covered (danger)
0.00%
0 / 12
0.00% covered (danger)
0.00%
0 / 1
2
 treeEquals
0.00% covered (danger)
0.00%
0 / 37
0.00% covered (danger)
0.00%
0 / 1
380
 doDOMDiff
78.16% covered (warning)
78.16%
68 / 87
0.00% covered (danger)
0.00%
0 / 1
36.17
 subtreeDiffers
0.00% covered (danger)
0.00%
0 / 24
0.00% covered (danger)
0.00%
0 / 1
110
 markNode
0.00% covered (danger)
0.00%
0 / 7
0.00% covered (danger)
0.00%
0 / 1
56
 debugOut
0.00% covered (danger)
0.00%
0 / 13
0.00% covered (danger)
0.00%
0 / 1
12
1<?php
2declare( strict_types = 1 );
3
4namespace Wikimedia\Parsoid\Html2Wt;
5
6use Wikimedia\Assert\Assert;
7use Wikimedia\Assert\UnreachableException;
8use Wikimedia\Parsoid\Config\Env;
9use Wikimedia\Parsoid\DOM\Comment;
10use Wikimedia\Parsoid\DOM\DocumentFragment;
11use Wikimedia\Parsoid\DOM\Element;
12use Wikimedia\Parsoid\DOM\Node;
13use Wikimedia\Parsoid\DOM\Text;
14use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI;
15use Wikimedia\Parsoid\Utils\DiffDOMUtils;
16use Wikimedia\Parsoid\Utils\DOMCompat;
17use Wikimedia\Parsoid\Utils\DOMDataUtils;
18use Wikimedia\Parsoid\Utils\DOMUtils;
19use Wikimedia\Parsoid\Utils\PHPUtils;
20use Wikimedia\Parsoid\Utils\WTUtils;
21
22/**
23 * A DOM diff helper class.
24 *
25 * Compares two DOMs and annotates a copy of the passed-in DOM with change
26 * information for the selective serializer.
27 */
28class DOMDiff {
29
30    // These attributes are ignored for equality purposes if they are added to a node.
31    private const IGNORE_ATTRIBUTES = [
32        // Note that we are explicitly not ignoring data-parsoid even though clients
33        // would never modify data-parsoid because SelectiveSerializer is wrapping text
34        // nodes in spans and speculatively computes DSR offsets for these span tags
35        // which are accurate for original DOM and may be inaccurate for the edited DOM.
36        // By diffing data-parsoid which diffs the DSR as well, we ensure we mark such
37        // nodes as modified and prevent use of those speculatively computed incorrect
38        // DSR values.
39        'data-parsoid-diff',
40        'about',
41        DOMDataUtils::DATA_OBJECT_ATTR_NAME,
42    ];
43
44    /**
45     * @var Env
46     */
47    public $env;
48
49    /** @var ParsoidExtensionAPI */
50    public $extApi;
51
52    /**
53     * @var array
54     */
55    public $specializedAttribHandlers;
56
57    public bool $skipEncapsulatedContent = true;
58
59    private function nextAnalyzableSibling( Node $node ): ?Node {
60        if ( WTUtils::isEncapsulationWrapper( $node ) && $this->skipEncapsulatedContent ) {
61            return WTUtils::skipOverEncapsulatedContent( $node );
62        }
63        return $node->nextSibling;
64    }
65
66    /**
67     * @param mixed ...$args
68     */
69    private function debug( ...$args ): void {
70        $this->env->log( 'trace/domdiff', ...$args );
71    }
72
73    public function __construct( Env $env ) {
74        $this->env = $env;
75        $this->extApi = new ParsoidExtensionAPI( $env );
76        $this->specializedAttribHandlers = [
77            'data-mw' => static function ( $nodeA, $dmwA, $nodeB, $dmwB ) {
78                return $dmwA == $dmwB;
79            },
80            'data-parsoid' => static function ( $nodeA, $dpA, $nodeB, $dpB ) {
81                return $dpA == $dpB;
82            },
83        ];
84    }
85
86    /**
87     * Diff two HTML documents, and add / update data-parsoid-diff attributes with
88     * change information.
89     *
90     * @param Element $nodeA
91     * @param Element $nodeB
92     * @return array
93     */
94    public function diff( Element $nodeA, Element $nodeB ): array {
95        Assert::invariant(
96            $nodeA->ownerDocument !== $nodeB->ownerDocument,
97            'Expected to be diff\'ing different documents.'
98        );
99
100        $this->debug( static function () use( $nodeA, $nodeB ) {
101            return "ORIG:\n" .
102                DOMCompat::getOuterHTML( $nodeA ) .
103                "\nNEW :\n" .
104                DOMCompat::getOuterHTML( $nodeB );
105        } );
106
107        // The root nodes are equal, call recursive differ
108        $foundChange = $this->doDOMDiff( $nodeA, $nodeB );
109        return [ 'isEmpty' => !$foundChange ];
110    }
111
112    /**
113     * Test if two DOM nodes are equal.
114     *
115     * @param Node $nodeA
116     * @param Node $nodeB
117     * @param bool $deep
118     * @return bool
119     */
120    public function treeEquals( Node $nodeA, Node $nodeB, bool $deep ): bool {
121        if ( $nodeA->nodeType !== $nodeB->nodeType ) {
122            return false;
123        } elseif ( $nodeA instanceof Text ) {
124            // In the past we've had bugs where we let non-primitive strings
125            // leak into our DOM.  Safety first:
126            Assert::invariant( $nodeA->nodeValue === (string)$nodeA->nodeValue, '' );
127            Assert::invariant( $nodeB->nodeValue === (string)$nodeB->nodeValue, '' );
128            // ok, now do the comparison.
129            return $nodeA->nodeValue === $nodeB->nodeValue;
130        } elseif ( $nodeA instanceof Comment ) {
131            return WTUtils::decodeComment( $nodeA->nodeValue ) ===
132                WTUtils::decodeComment( $nodeB->nodeValue );
133        } elseif ( $nodeA instanceof Element || $nodeA instanceof DocumentFragment ) {
134            if ( $nodeA instanceof DocumentFragment ) {
135                if ( !( $nodeB instanceof DocumentFragment ) ) {
136                    return false;
137                }
138            } else {  // $nodeA instanceof Element
139                // Compare node name and attribute length
140                if (
141                    !( $nodeB instanceof Element ) ||
142                    DOMCompat::nodeName( $nodeA ) !== DOMCompat::nodeName( $nodeB ) ||
143                    !DiffUtils::attribsEquals(
144                        $nodeA,
145                        $nodeB,
146                        self::IGNORE_ATTRIBUTES,
147                        $this->specializedAttribHandlers
148                    )
149                ) {
150                    return false;
151                }
152            }
153
154            // Passed all tests, node itself is equal.
155            if ( $deep ) {
156                $childA = null;
157                $childB = null;
158                // Compare # of children, since that's fast.
159                // (Avoid de-optimizing DOM by using node#childNodes)
160                for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild;
161                    $childA && $childB;
162                    $childA = $childA->nextSibling, $childB = $childB->nextSibling
163                ) {
164                    /* don't look inside children yet, just look at # of children */
165                }
166
167                if ( $childA || $childB ) {
168                    return false; /* nodes have different # of children */
169                }
170                // Now actually compare the child subtrees
171                for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild;
172                    $childA && $childB;
173                    $childA = $childA->nextSibling, $childB = $childB->nextSibling
174                ) {
175                    if ( !$this->treeEquals( $childA, $childB, $deep ) ) {
176                        return false;
177                    }
178                }
179            }
180
181            // Did not find a diff yet, so the trees must be equal.
182            return true;
183        }
184        throw new UnreachableException( 'we shouldn\'t get here' );
185    }
186
187    /**
188     * Diff two DOM trees by comparing them node-by-node.
189     *
190     * TODO: Implement something more intelligent like
191     * http://gregory.cobena.free.fr/www/Publications/%5BICDE2002%5D%20XyDiff%20-%20published%20version.pdf,
192     * which uses hash signatures of subtrees to efficiently detect moves /
193     * wrapping.
194     *
195     * Adds / updates a data-parsoid-diff structure with change information.
196     *
197     * Returns true if subtree is changed, false otherwise.
198     *
199     * TODO:
200     * Assume typical CSS white-space, so ignore ws diffs in non-pre content.
201     *
202     * @param Node $baseParentNode
203     * @param Node $newParentNode
204     * @return bool
205     */
206    public function doDOMDiff( Node $baseParentNode, Node $newParentNode ): bool {
207        // Perform a relaxed version of the recursive treeEquals algorithm that
208        // allows for some minor differences and tries to produce a sensible diff
209        // marking using heuristics like look-ahead on siblings.
210        $baseNode = $baseParentNode->firstChild;
211        $newNode = $newParentNode->firstChild;
212        $lookaheadNode = null;
213        $foundDiffOverall = false;
214
215        while ( $baseNode && $newNode ) {
216            $dontAdvanceNewNode = false;
217            $this->debugOut( $baseNode, $newNode );
218            // shallow check first
219            if ( !$this->treeEquals( $baseNode, $newNode, false ) ) {
220                $this->debug( '-- not equal --' );
221                $savedNewNode = $newNode;
222                $foundDiff = false;
223
224                // Some simplistic look-ahead, currently limited to a single level
225                // in the DOM.
226
227                // look-ahead in *new* DOM to detect insertions
228                if ( DiffDOMUtils::isContentNode( $baseNode ) ) {
229                    $this->debug( '--lookahead in new dom--' );
230                    $lookaheadNode = $newNode->nextSibling;
231                    while ( $lookaheadNode ) {
232                        $this->debugOut( $baseNode, $lookaheadNode, 'new' );
233                        if ( DiffDOMUtils::isContentNode( $lookaheadNode ) &&
234                            $this->treeEquals( $baseNode, $lookaheadNode, true )
235                        ) {
236                            // mark skipped-over nodes as inserted
237                            $markNode = $newNode;
238                            while ( $markNode !== $lookaheadNode ) {
239                                $this->debug( '--found diff: inserted--' );
240                                $this->markNode( $markNode, DiffMarkers::INSERTED );
241                                $markNode = $markNode->nextSibling;
242                            }
243                            $foundDiff = true;
244                            $newNode = $lookaheadNode;
245                            break;
246                        }
247                        $lookaheadNode = self::nextAnalyzableSibling( $lookaheadNode );
248                    }
249                }
250
251                // look-ahead in *base* DOM to detect deletions
252                if ( !$foundDiff && DiffDOMUtils::isContentNode( $newNode ) ) {
253                    $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $baseNode );
254                    $this->debug( '--lookahead in old dom--' );
255                    $lookaheadNode = $baseNode->nextSibling;
256                    while ( $lookaheadNode ) {
257                        $this->debugOut( $lookaheadNode, $newNode, 'old' );
258                        if ( DiffDOMUtils::isContentNode( $lookaheadNode ) &&
259                            $this->treeEquals( $lookaheadNode, $newNode, true )
260                        ) {
261                            $this->debug( '--found diff: deleted--' );
262                            // mark skipped-over nodes as deleted
263                            $this->markNode( $newNode, DiffMarkers::DELETED, $isBlockNode );
264                            $baseNode = $lookaheadNode;
265                            $foundDiff = true;
266                            break;
267                        } elseif ( !WTUtils::emitsSolTransparentSingleLineWT( $lookaheadNode ) ) {
268                            // We only care about the deleted node prior to the one that
269                            // gets a tree match (but, ignore nodes that show up in wikitext
270                            // but don't affect sol-state or HTML rendering -- note that
271                            // whitespace is being ignored, but that whitespace occurs
272                            // between block nodes).
273                            $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $lookaheadNode );
274                        }
275                        $lookaheadNode = self::nextAnalyzableSibling( $lookaheadNode );
276                    }
277                }
278
279                if ( !$foundDiff ) {
280                    if ( !( $savedNewNode instanceof Element ) ) {
281                        $this->debug( '--found diff: modified text/comment--' );
282                        $this->markNode(
283                            $savedNewNode, DiffMarkers::DELETED,
284                            WTUtils::isBlockNodeWithVisibleWT( $baseNode )
285                        );
286                    } elseif ( $baseNode instanceof Element &&
287                        DOMCompat::nodeName( $savedNewNode ) === DOMCompat::nodeName( $baseNode ) &&
288                        ( DOMDataUtils::getDataParsoid( $savedNewNode )->stx ?? null ) ===
289                        ( DOMDataUtils::getDataParsoid( $baseNode )->stx ?? null )
290                    ) {
291                        // Identical wrapper-type, but modified.
292                        // Mark modified-wrapper, and recurse.
293                        $this->debug( '--found diff: modified-wrapper--' );
294                        $this->markNode( $savedNewNode, DiffMarkers::MODIFIED_WRAPPER );
295                        $this->subtreeDiffers( $baseNode, $savedNewNode );
296                    } else {
297                        // We now want to compare current newNode with the next baseNode.
298                        $dontAdvanceNewNode = true;
299
300                        // Since we are advancing in an old DOM without advancing
301                        // in the new DOM, there were deletions. Add a deletion marker
302                        // since this is important for accurate separator handling in selser.
303                        $this->markNode(
304                            $savedNewNode, DiffMarkers::DELETED,
305                            WTUtils::isBlockNodeWithVisibleWT( $baseNode )
306                        );
307                    }
308                }
309
310                // Record the fact that direct children changed in the parent node
311                $this->debug( '--found diff: children-changed--' );
312                $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED );
313
314                $foundDiffOverall = true;
315            } elseif ( $this->subtreeDiffers( $baseNode, $newNode ) ) {
316                $foundDiffOverall = true;
317            }
318
319            // And move on to the next pair (skipping over template HTML)
320            if ( $baseNode && $newNode ) {
321                $baseNode = self::nextAnalyzableSibling( $baseNode );
322                if ( !$dontAdvanceNewNode ) {
323                    $newNode = self::nextAnalyzableSibling( $newNode );
324                }
325            }
326        }
327
328        // mark extra new nodes as inserted
329        while ( $newNode ) {
330            $this->debug( '--found trailing new node: inserted--' );
331            $this->markNode( $newNode, DiffMarkers::INSERTED );
332            $foundDiffOverall = true;
333            $newNode = self::nextAnalyzableSibling( $newNode );
334        }
335
336        // If there are extra base nodes, something was deleted. Mark the parent as
337        // having lost children for now.
338        if ( $baseNode ) {
339            $this->debug( '--found trailing base nodes: deleted--' );
340            $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED );
341            // SSS FIXME: WTS checks for zero children in a few places
342            // That code would have to be upgraded if we emit mw:DiffMarker
343            // in this scenario. So, bailing out in this one case for now.
344            if ( $newParentNode->hasChildNodes() ) {
345                $meta = $newParentNode->ownerDocument->createElement( 'meta' );
346                DOMUtils::addTypeOf( $meta, 'mw:DiffMarker/deleted' );
347                if ( WTUtils::isBlockNodeWithVisibleWT( $baseNode ) ) {
348                    $meta->setAttribute( 'data-is-block', 'true' );
349                }
350                $newParentNode->appendChild( $meta );
351            }
352            $foundDiffOverall = true;
353        }
354
355        return $foundDiffOverall;
356    }
357
358    /* ***************************************************
359     * Helpers
360     * ***************************************************/
361
362    /**
363     * @param Node $baseNode
364     * @param Node $newNode
365     * @return bool
366     */
367    private function subtreeDiffers( Node $baseNode, Node $newNode ): bool {
368        $baseEncapsulated = WTUtils::isEncapsulationWrapper( $baseNode );
369        $newEncapsulated = WTUtils::isEncapsulationWrapper( $newNode );
370
371        if ( !$baseEncapsulated && !$newEncapsulated ) {
372            $this->debug( '--shallow equal: recursing--' );
373            // Recursively diff subtrees if not template-like content
374            $subtreeDiffers = $this->doDOMDiff( $baseNode, $newNode );
375        } elseif ( $baseEncapsulated && $newEncapsulated ) {
376            '@phan-var Element $baseNode';  // @var Element $baseNode
377            '@phan-var Element $newNode';  // @var Element $newNode
378
379            $ext = null;
380
381            $baseExtTagName = WTUtils::getExtTagName( $baseNode );
382            if ( $baseExtTagName ) {
383                $ext = $this->env->getSiteConfig()->getExtTagImpl( $baseExtTagName );
384            }
385
386            if ( $ext && ( $baseExtTagName === WTUtils::getExtTagName( $newNode ) ) ) {
387                $this->debug( '--diffing extension content--' );
388                $subtreeDiffers = $ext->diffHandler(
389                    $this->extApi, [ $this, 'doDOMDiff' ], $baseNode, $newNode
390                );
391            } elseif ( $this->skipEncapsulatedContent ) {
392                // Otherwise, for encapsulated content, we don't know about the subtree.
393                $subtreeDiffers = false;
394            } else {
395                $this->debug( '--shallow equal (encapsulated): recursing--' );
396                // Recursively diff subtrees if not template-like content
397                $subtreeDiffers = $this->doDOMDiff( $baseNode, $newNode );
398            }
399        } else {
400            // FIXME: Maybe $editNode should be marked as inserted to avoid
401            // losing any edits, at the cost of more normalization.
402            // $state->inInsertedContent is only set when we're in inserted
403            // content, so not sure this is currently doing all that much.
404            $subtreeDiffers = true;
405        }
406
407        if ( $subtreeDiffers ) {
408            $this->debug( '--found diff: subtree-changed--' );
409            $this->markNode( $newNode, DiffMarkers::SUBTREE_CHANGED );
410        }
411        return $subtreeDiffers;
412    }
413
414    private function markNode( Node $node, string $mark, bool $blockNodeDeleted = false ): void {
415        $meta = DiffUtils::addDiffMark( $node, $this->env, $mark );
416
417        if ( $meta && $blockNodeDeleted ) {
418            $meta->setAttribute( 'data-is-block', 'true' );
419        }
420
421        if ( $mark === DiffMarkers::DELETED || $mark === DiffMarkers::INSERTED ) {
422            $this->markNode( $node->parentNode, DiffMarkers::CHILDREN_CHANGED );
423        }
424
425        // Clear out speculatively computed DSR values for data-mw-selser-wrapper nodes
426        // since they may be incorrect. This eliminates any inadvertent use of
427        // these incorrect values.
428        if ( $node instanceof Element && $node->hasAttribute( 'data-mw-selser-wrapper' ) ) {
429            DOMDataUtils::getDataParsoid( $node )->dsr = null;
430        }
431    }
432
433    private function debugOut( Node $nodeA, Node $nodeB, string $laPrefix = '' ): void {
434        $prefix = 'trace/domdiff';
435        $this->env->log( $prefix,
436            static fn () => '--> A' . $laPrefix . ':' .
437                ( $nodeA instanceof Element
438                    ? DOMCompat::getOuterHTML( $nodeA )
439                    : PHPUtils::jsonEncode( $nodeA->nodeValue ) )
440        );
441        $this->env->log( $prefix,
442            static fn () => '--> B' . $laPrefix . ':' .
443                ( $nodeB instanceof Element
444                    ? DOMCompat::getOuterHTML( $nodeB )
445                    : PHPUtils::jsonEncode( $nodeB->nodeValue ) )
446        );
447    }
448}