Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
33.50% covered (danger)
33.50%
68 / 203
0.00% covered (danger)
0.00%
0 / 9
CRAP
0.00% covered (danger)
0.00%
0 / 1
DOMDiff
33.50% covered (danger)
33.50%
68 / 203
0.00% covered (danger)
0.00%
0 / 9
1774.79
0.00% covered (danger)
0.00%
0 / 1
 nextAnalyzableSibling
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
12
 debug
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 __construct
0.00% covered (danger)
0.00%
0 / 19
0.00% covered (danger)
0.00%
0 / 1
20
 diff
0.00% covered (danger)
0.00%
0 / 12
0.00% covered (danger)
0.00%
0 / 1
2
 treeEquals
0.00% covered (danger)
0.00%
0 / 37
0.00% covered (danger)
0.00%
0 / 1
380
 doDOMDiff
78.16% covered (warning)
78.16%
68 / 87
0.00% covered (danger)
0.00%
0 / 1
36.17
 subtreeDiffers
0.00% covered (danger)
0.00%
0 / 24
0.00% covered (danger)
0.00%
0 / 1
110
 markNode
0.00% covered (danger)
0.00%
0 / 7
0.00% covered (danger)
0.00%
0 / 1
56
 debugOut
0.00% covered (danger)
0.00%
0 / 13
0.00% covered (danger)
0.00%
0 / 1
12
1<?php
2declare( strict_types = 1 );
3
4namespace Wikimedia\Parsoid\Html2Wt;
5
6use Wikimedia\Assert\Assert;
7use Wikimedia\Assert\UnreachableException;
8use Wikimedia\Parsoid\Config\Env;
9use Wikimedia\Parsoid\DOM\Comment;
10use Wikimedia\Parsoid\DOM\DocumentFragment;
11use Wikimedia\Parsoid\DOM\Element;
12use Wikimedia\Parsoid\DOM\Node;
13use Wikimedia\Parsoid\DOM\Text;
14use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI;
15use Wikimedia\Parsoid\Utils\DiffDOMUtils;
16use Wikimedia\Parsoid\Utils\DOMCompat;
17use Wikimedia\Parsoid\Utils\DOMDataUtils;
18use Wikimedia\Parsoid\Utils\DOMUtils;
19use Wikimedia\Parsoid\Utils\PHPUtils;
20use Wikimedia\Parsoid\Utils\WTUtils;
21
22/**
23 * A DOM diff helper class.
24 *
25 * Compares two DOMs and annotates a copy of the passed-in DOM with change
26 * information for the selective serializer.
27 */
28class DOMDiff {
29
30    // These attributes are ignored for equality purposes if they are added to a node.
31    private const IGNORE_ATTRIBUTES = [
32        // Note that we are explicitly not ignoring data-parsoid even though clients
33        // would never modify data-parsoid because SelectiveSerializer is wrapping text
34        // nodes in spans and speculatively computes DSR offsets for these span tags
35        // which are accurate for original DOM and may be inaccurate for the edited DOM.
36        // By diffing data-parsoid which diffs the DSR as well, we ensure we mark such
37        // nodes as modified and prevent use of those speculatively computed incorrect
38        // DSR values.
39        'data-parsoid-diff',
40        'about',
41        DOMDataUtils::DATA_OBJECT_ATTR_NAME,
42    ];
43
44    /**
45     * @var Env
46     */
47    public $env;
48
49    /** @var ParsoidExtensionAPI */
50    public $extApi;
51
52    /**
53     * @var array
54     */
55    public $specializedAttribHandlers;
56
57    public bool $skipEncapsulatedContent = true;
58
59    private function nextAnalyzableSibling( Node $node ): ?Node {
60        if ( WTUtils::isEncapsulationWrapper( $node ) && $this->skipEncapsulatedContent ) {
61            return WTUtils::skipOverEncapsulatedContent( $node );
62        }
63        return $node->nextSibling;
64    }
65
66    /**
67     * @param mixed ...$args
68     */
69    private function debug( ...$args ): void {
70        $this->env->log( 'trace/domdiff', ...$args );
71    }
72
73    public function __construct( Env $env ) {
74        $this->env = $env;
75        $this->extApi = new ParsoidExtensionAPI( $env );
76        $this->specializedAttribHandlers = [
77            'data-mw' => static function ( $nodeA, $dmwA, $nodeB, $dmwB ) {
78                return $dmwA == $dmwB;
79            },
80            'data-parsoid' => static function ( $nodeA, $dpA, $nodeB, $dpB ) {
81                return $dpA == $dpB;
82            },
83            // TODO(T254502): This is added temporarily for backwards
84            // compatibility and can be removed when versions up to 2.1.0
85            // are no longer stored
86            'typeof' => static function ( $nodeA, $valA, $nodeB, $valB ) {
87                if ( $valA === $valB ) {
88                    return true;
89                } elseif ( $valA === 'mw:DisplaySpace' ) {
90                    return $valB === 'mw:DisplaySpace mw:Placeholder';
91                } elseif ( $valB === 'mw:DisplaySpace' ) {
92                    return $valA === 'mw:DisplaySpace mw:Placeholder';
93                } else {
94                    return false;
95                }
96            }
97        ];
98    }
99
100    /**
101     * Diff two HTML documents, and add / update data-parsoid-diff attributes with
102     * change information.
103     *
104     * @param Element $nodeA
105     * @param Element $nodeB
106     * @return array
107     */
108    public function diff( Element $nodeA, Element $nodeB ): array {
109        Assert::invariant(
110            $nodeA->ownerDocument !== $nodeB->ownerDocument,
111            'Expected to be diff\'ing different documents.'
112        );
113
114        $this->debug( static function () use( $nodeA, $nodeB ) {
115            return "ORIG:\n" .
116                DOMCompat::getOuterHTML( $nodeA ) .
117                "\nNEW :\n" .
118                DOMCompat::getOuterHTML( $nodeB );
119        } );
120
121        // The root nodes are equal, call recursive differ
122        $foundChange = $this->doDOMDiff( $nodeA, $nodeB );
123        return [ 'isEmpty' => !$foundChange ];
124    }
125
126    /**
127     * Test if two DOM nodes are equal.
128     *
129     * @param Node $nodeA
130     * @param Node $nodeB
131     * @param bool $deep
132     * @return bool
133     */
134    public function treeEquals( Node $nodeA, Node $nodeB, bool $deep ): bool {
135        if ( $nodeA->nodeType !== $nodeB->nodeType ) {
136            return false;
137        } elseif ( $nodeA instanceof Text ) {
138            // In the past we've had bugs where we let non-primitive strings
139            // leak into our DOM.  Safety first:
140            Assert::invariant( $nodeA->nodeValue === (string)$nodeA->nodeValue, '' );
141            Assert::invariant( $nodeB->nodeValue === (string)$nodeB->nodeValue, '' );
142            // ok, now do the comparison.
143            return $nodeA->nodeValue === $nodeB->nodeValue;
144        } elseif ( $nodeA instanceof Comment ) {
145            return WTUtils::decodeComment( $nodeA->nodeValue ) ===
146                WTUtils::decodeComment( $nodeB->nodeValue );
147        } elseif ( $nodeA instanceof Element || $nodeA instanceof DocumentFragment ) {
148            if ( $nodeA instanceof DocumentFragment ) {
149                if ( !( $nodeB instanceof DocumentFragment ) ) {
150                    return false;
151                }
152            } else {  // $nodeA instanceof Element
153                // Compare node name and attribute length
154                if (
155                    !( $nodeB instanceof Element ) ||
156                    DOMCompat::nodeName( $nodeA ) !== DOMCompat::nodeName( $nodeB ) ||
157                    !DiffUtils::attribsEquals(
158                        $nodeA,
159                        $nodeB,
160                        self::IGNORE_ATTRIBUTES,
161                        $this->specializedAttribHandlers
162                    )
163                ) {
164                    return false;
165                }
166            }
167
168            // Passed all tests, node itself is equal.
169            if ( $deep ) {
170                $childA = null;
171                $childB = null;
172                // Compare # of children, since that's fast.
173                // (Avoid de-optimizing DOM by using node#childNodes)
174                for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild;
175                    $childA && $childB;
176                    $childA = $childA->nextSibling, $childB = $childB->nextSibling
177                ) {
178                    /* don't look inside children yet, just look at # of children */
179                }
180
181                if ( $childA || $childB ) {
182                    return false; /* nodes have different # of children */
183                }
184                // Now actually compare the child subtrees
185                for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild;
186                    $childA && $childB;
187                    $childA = $childA->nextSibling, $childB = $childB->nextSibling
188                ) {
189                    if ( !$this->treeEquals( $childA, $childB, $deep ) ) {
190                        return false;
191                    }
192                }
193            }
194
195            // Did not find a diff yet, so the trees must be equal.
196            return true;
197        }
198        throw new UnreachableException( 'we shouldn\'t get here' );
199    }
200
201    /**
202     * Diff two DOM trees by comparing them node-by-node.
203     *
204     * TODO: Implement something more intelligent like
205     * http://gregory.cobena.free.fr/www/Publications/%5BICDE2002%5D%20XyDiff%20-%20published%20version.pdf,
206     * which uses hash signatures of subtrees to efficiently detect moves /
207     * wrapping.
208     *
209     * Adds / updates a data-parsoid-diff structure with change information.
210     *
211     * Returns true if subtree is changed, false otherwise.
212     *
213     * TODO:
214     * Assume typical CSS white-space, so ignore ws diffs in non-pre content.
215     *
216     * @param Node $baseParentNode
217     * @param Node $newParentNode
218     * @return bool
219     */
220    public function doDOMDiff( Node $baseParentNode, Node $newParentNode ): bool {
221        // Perform a relaxed version of the recursive treeEquals algorithm that
222        // allows for some minor differences and tries to produce a sensible diff
223        // marking using heuristics like look-ahead on siblings.
224        $baseNode = $baseParentNode->firstChild;
225        $newNode = $newParentNode->firstChild;
226        $lookaheadNode = null;
227        $foundDiffOverall = false;
228
229        while ( $baseNode && $newNode ) {
230            $dontAdvanceNewNode = false;
231            $this->debugOut( $baseNode, $newNode );
232            // shallow check first
233            if ( !$this->treeEquals( $baseNode, $newNode, false ) ) {
234                $this->debug( '-- not equal --' );
235                $savedNewNode = $newNode;
236                $foundDiff = false;
237
238                // Some simplistic look-ahead, currently limited to a single level
239                // in the DOM.
240
241                // look-ahead in *new* DOM to detect insertions
242                if ( DiffDOMUtils::isContentNode( $baseNode ) ) {
243                    $this->debug( '--lookahead in new dom--' );
244                    $lookaheadNode = $newNode->nextSibling;
245                    while ( $lookaheadNode ) {
246                        $this->debugOut( $baseNode, $lookaheadNode, 'new' );
247                        if ( DiffDOMUtils::isContentNode( $lookaheadNode ) &&
248                            $this->treeEquals( $baseNode, $lookaheadNode, true )
249                        ) {
250                            // mark skipped-over nodes as inserted
251                            $markNode = $newNode;
252                            while ( $markNode !== $lookaheadNode ) {
253                                $this->debug( '--found diff: inserted--' );
254                                $this->markNode( $markNode, DiffMarkers::INSERTED );
255                                $markNode = $markNode->nextSibling;
256                            }
257                            $foundDiff = true;
258                            $newNode = $lookaheadNode;
259                            break;
260                        }
261                        $lookaheadNode = self::nextAnalyzableSibling( $lookaheadNode );
262                    }
263                }
264
265                // look-ahead in *base* DOM to detect deletions
266                if ( !$foundDiff && DiffDOMUtils::isContentNode( $newNode ) ) {
267                    $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $baseNode );
268                    $this->debug( '--lookahead in old dom--' );
269                    $lookaheadNode = $baseNode->nextSibling;
270                    while ( $lookaheadNode ) {
271                        $this->debugOut( $lookaheadNode, $newNode, 'old' );
272                        if ( DiffDOMUtils::isContentNode( $lookaheadNode ) &&
273                            $this->treeEquals( $lookaheadNode, $newNode, true )
274                        ) {
275                            $this->debug( '--found diff: deleted--' );
276                            // mark skipped-over nodes as deleted
277                            $this->markNode( $newNode, DiffMarkers::DELETED, $isBlockNode );
278                            $baseNode = $lookaheadNode;
279                            $foundDiff = true;
280                            break;
281                        } elseif ( !WTUtils::emitsSolTransparentSingleLineWT( $lookaheadNode ) ) {
282                            // We only care about the deleted node prior to the one that
283                            // gets a tree match (but, ignore nodes that show up in wikitext
284                            // but don't affect sol-state or HTML rendering -- note that
285                            // whitespace is being ignored, but that whitespace occurs
286                            // between block nodes).
287                            $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $lookaheadNode );
288                        }
289                        $lookaheadNode = self::nextAnalyzableSibling( $lookaheadNode );
290                    }
291                }
292
293                if ( !$foundDiff ) {
294                    if ( !( $savedNewNode instanceof Element ) ) {
295                        $this->debug( '--found diff: modified text/comment--' );
296                        $this->markNode(
297                            $savedNewNode, DiffMarkers::DELETED,
298                            WTUtils::isBlockNodeWithVisibleWT( $baseNode )
299                        );
300                    } elseif ( $baseNode instanceof Element &&
301                        DOMCompat::nodeName( $savedNewNode ) === DOMCompat::nodeName( $baseNode ) &&
302                        ( DOMDataUtils::getDataParsoid( $savedNewNode )->stx ?? null ) ===
303                        ( DOMDataUtils::getDataParsoid( $baseNode )->stx ?? null )
304                    ) {
305                        // Identical wrapper-type, but modified.
306                        // Mark modified-wrapper, and recurse.
307                        $this->debug( '--found diff: modified-wrapper--' );
308                        $this->markNode( $savedNewNode, DiffMarkers::MODIFIED_WRAPPER );
309                        $this->subtreeDiffers( $baseNode, $savedNewNode );
310                    } else {
311                        // We now want to compare current newNode with the next baseNode.
312                        $dontAdvanceNewNode = true;
313
314                        // Since we are advancing in an old DOM without advancing
315                        // in the new DOM, there were deletions. Add a deletion marker
316                        // since this is important for accurate separator handling in selser.
317                        $this->markNode(
318                            $savedNewNode, DiffMarkers::DELETED,
319                            WTUtils::isBlockNodeWithVisibleWT( $baseNode )
320                        );
321                    }
322                }
323
324                // Record the fact that direct children changed in the parent node
325                $this->debug( '--found diff: children-changed--' );
326                $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED );
327
328                $foundDiffOverall = true;
329            } elseif ( $this->subtreeDiffers( $baseNode, $newNode ) ) {
330                $foundDiffOverall = true;
331            }
332
333            // And move on to the next pair (skipping over template HTML)
334            if ( $baseNode && $newNode ) {
335                $baseNode = self::nextAnalyzableSibling( $baseNode );
336                if ( !$dontAdvanceNewNode ) {
337                    $newNode = self::nextAnalyzableSibling( $newNode );
338                }
339            }
340        }
341
342        // mark extra new nodes as inserted
343        while ( $newNode ) {
344            $this->debug( '--found trailing new node: inserted--' );
345            $this->markNode( $newNode, DiffMarkers::INSERTED );
346            $foundDiffOverall = true;
347            $newNode = self::nextAnalyzableSibling( $newNode );
348        }
349
350        // If there are extra base nodes, something was deleted. Mark the parent as
351        // having lost children for now.
352        if ( $baseNode ) {
353            $this->debug( '--found trailing base nodes: deleted--' );
354            $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED );
355            // SSS FIXME: WTS checks for zero children in a few places
356            // That code would have to be upgraded if we emit mw:DiffMarker
357            // in this scenario. So, bailing out in this one case for now.
358            if ( $newParentNode->hasChildNodes() ) {
359                $meta = $newParentNode->ownerDocument->createElement( 'meta' );
360                DOMUtils::addTypeOf( $meta, 'mw:DiffMarker/deleted' );
361                if ( WTUtils::isBlockNodeWithVisibleWT( $baseNode ) ) {
362                    $meta->setAttribute( 'data-is-block', 'true' );
363                }
364                $newParentNode->appendChild( $meta );
365            }
366            $foundDiffOverall = true;
367        }
368
369        return $foundDiffOverall;
370    }
371
372    /* ***************************************************
373     * Helpers
374     * ***************************************************/
375
376    /**
377     * @param Node $baseNode
378     * @param Node $newNode
379     * @return bool
380     */
381    private function subtreeDiffers( Node $baseNode, Node $newNode ): bool {
382        $baseEncapsulated = WTUtils::isEncapsulationWrapper( $baseNode );
383        $newEncapsulated = WTUtils::isEncapsulationWrapper( $newNode );
384
385        if ( !$baseEncapsulated && !$newEncapsulated ) {
386            $this->debug( '--shallow equal: recursing--' );
387            // Recursively diff subtrees if not template-like content
388            $subtreeDiffers = $this->doDOMDiff( $baseNode, $newNode );
389        } elseif ( $baseEncapsulated && $newEncapsulated ) {
390            '@phan-var Element $baseNode';  // @var Element $baseNode
391            '@phan-var Element $newNode';  // @var Element $newNode
392
393            $ext = null;
394
395            $baseExtTagName = WTUtils::getExtTagName( $baseNode );
396            if ( $baseExtTagName ) {
397                $ext = $this->env->getSiteConfig()->getExtTagImpl( $baseExtTagName );
398            }
399
400            if ( $ext && ( $baseExtTagName === WTUtils::getExtTagName( $newNode ) ) ) {
401                $this->debug( '--diffing extension content--' );
402                $subtreeDiffers = $ext->diffHandler(
403                    $this->extApi, [ $this, 'doDOMDiff' ], $baseNode, $newNode
404                );
405            } elseif ( $this->skipEncapsulatedContent ) {
406                // Otherwise, for encapsulated content, we don't know about the subtree.
407                $subtreeDiffers = false;
408            } else {
409                $this->debug( '--shallow equal (encapsulated): recursing--' );
410                // Recursively diff subtrees if not template-like content
411                $subtreeDiffers = $this->doDOMDiff( $baseNode, $newNode );
412            }
413        } else {
414            // FIXME: Maybe $editNode should be marked as inserted to avoid
415            // losing any edits, at the cost of more normalization.
416            // $state->inInsertedContent is only set when we're in inserted
417            // content, so not sure this is currently doing all that much.
418            $subtreeDiffers = true;
419        }
420
421        if ( $subtreeDiffers ) {
422            $this->debug( '--found diff: subtree-changed--' );
423            $this->markNode( $newNode, DiffMarkers::SUBTREE_CHANGED );
424        }
425        return $subtreeDiffers;
426    }
427
428    private function markNode( Node $node, string $mark, bool $blockNodeDeleted = false ): void {
429        $meta = DiffUtils::addDiffMark( $node, $this->env, $mark );
430
431        if ( $meta && $blockNodeDeleted ) {
432            $meta->setAttribute( 'data-is-block', 'true' );
433        }
434
435        if ( $mark === DiffMarkers::DELETED || $mark === DiffMarkers::INSERTED ) {
436            $this->markNode( $node->parentNode, DiffMarkers::CHILDREN_CHANGED );
437        }
438
439        // Clear out speculatively computed DSR values for data-mw-selser-wrapper nodes
440        // since they may be incorrect. This eliminates any inadvertent use of
441        // these incorrect values.
442        if ( $node instanceof Element && $node->hasAttribute( 'data-mw-selser-wrapper' ) ) {
443            DOMDataUtils::getDataParsoid( $node )->dsr = null;
444        }
445    }
446
447    private function debugOut( Node $nodeA, Node $nodeB, string $laPrefix = '' ): void {
448        $prefix = 'trace/domdiff';
449        $this->env->log( $prefix,
450            static fn () => '--> A' . $laPrefix . ':' .
451                ( $nodeA instanceof Element
452                    ? DOMCompat::getOuterHTML( $nodeA )
453                    : PHPUtils::jsonEncode( $nodeA->nodeValue ) )
454        );
455        $this->env->log( $prefix,
456            static fn () => '--> B' . $laPrefix . ':' .
457                ( $nodeB instanceof Element
458                    ? DOMCompat::getOuterHTML( $nodeB )
459                    : PHPUtils::jsonEncode( $nodeB->nodeValue ) )
460        );
461    }
462}