Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
33.66% covered (danger)
33.66%
68 / 202
0.00% covered (danger)
0.00%
0 / 9
CRAP
0.00% covered (danger)
0.00%
0 / 1
DOMDiff
33.66% covered (danger)
33.66%
68 / 202
0.00% covered (danger)
0.00%
0 / 9
1672.54
0.00% covered (danger)
0.00%
0 / 1
 nextNonTemplateSibling
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
6
 debug
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 __construct
0.00% covered (danger)
0.00%
0 / 19
0.00% covered (danger)
0.00%
0 / 1
20
 diff
0.00% covered (danger)
0.00%
0 / 12
0.00% covered (danger)
0.00%
0 / 1
2
 treeEquals
0.00% covered (danger)
0.00%
0 / 37
0.00% covered (danger)
0.00%
0 / 1
380
 doDOMDiff
78.16% covered (warning)
78.16%
68 / 87
0.00% covered (danger)
0.00%
0 / 1
36.17
 subtreeDiffers
0.00% covered (danger)
0.00%
0 / 22
0.00% covered (danger)
0.00%
0 / 1
90
 markNode
0.00% covered (danger)
0.00%
0 / 7
0.00% covered (danger)
0.00%
0 / 1
56
 debugOut
0.00% covered (danger)
0.00%
0 / 14
0.00% covered (danger)
0.00%
0 / 1
12
1<?php
2declare( strict_types = 1 );
3
4namespace Wikimedia\Parsoid\Html2Wt;
5
6use Wikimedia\Assert\Assert;
7use Wikimedia\Assert\UnreachableException;
8use Wikimedia\Parsoid\Config\Env;
9use Wikimedia\Parsoid\DOM\Comment;
10use Wikimedia\Parsoid\DOM\DocumentFragment;
11use Wikimedia\Parsoid\DOM\Element;
12use Wikimedia\Parsoid\DOM\Node;
13use Wikimedia\Parsoid\DOM\Text;
14use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI;
15use Wikimedia\Parsoid\Utils\DiffDOMUtils;
16use Wikimedia\Parsoid\Utils\DOMCompat;
17use Wikimedia\Parsoid\Utils\DOMDataUtils;
18use Wikimedia\Parsoid\Utils\DOMUtils;
19use Wikimedia\Parsoid\Utils\PHPUtils;
20use Wikimedia\Parsoid\Utils\WTUtils;
21
22/**
23 * A DOM diff helper class.
24 *
25 * Compares two DOMs and annotates a copy of the passed-in DOM with change
26 * information for the selective serializer.
27 */
28class DOMDiff {
29
30    // These attributes are ignored for equality purposes if they are added to a node.
31    private const IGNORE_ATTRIBUTES = [
32        // Note that we are explicitly not ignoring data-parsoid even though clients
33        // would never modify data-parsoid because SelectiveSerializer is wrapping text
34        // nodes in spans and speculatively computes DSR offsets for these span tags
35        // which are accurate for original DOM and may be inaccurate for the edited DOM.
36        // By diffing data-parsoid which diffs the DSR as well, we ensure we mark such
37        // nodes as modified and prevent use of those speculatively computed incorrect
38        // DSR values.
39        'data-parsoid-diff',
40        'about',
41        DOMDataUtils::DATA_OBJECT_ATTR_NAME,
42    ];
43
44    /**
45     * @var Env
46     */
47    public $env;
48
49    /** @var ParsoidExtensionAPI */
50    public $extApi;
51
52    /**
53     * @var array
54     */
55    public $specializedAttribHandlers;
56
57    /**
58     * @param Node $node
59     * @return Node|null
60     */
61    private function nextNonTemplateSibling( Node $node ): ?Node {
62        if ( WTUtils::isEncapsulationWrapper( $node ) ) {
63            return WTUtils::skipOverEncapsulatedContent( $node );
64        }
65        return $node->nextSibling;
66    }
67
68    /**
69     * @param mixed ...$args
70     */
71    private function debug( ...$args ): void {
72        $this->env->log( 'trace/domdiff', ...$args );
73    }
74
75    /**
76     * @param Env $env
77     */
78    public function __construct( Env $env ) {
79        $this->env = $env;
80        $this->extApi = new ParsoidExtensionAPI( $env );
81        $this->specializedAttribHandlers = [
82            'data-mw' => static function ( $nodeA, $dmwA, $nodeB, $dmwB ) {
83                return $dmwA == $dmwB;
84            },
85            'data-parsoid' => static function ( $nodeA, $dpA, $nodeB, $dpB ) {
86                return $dpA == $dpB;
87            },
88            // TODO(T254502): This is added temporarily for backwards
89            // compatibility and can be removed when versions up to 2.1.0
90            // are no longer stored
91            'typeof' => static function ( $nodeA, $valA, $nodeB, $valB ) {
92                if ( $valA === $valB ) {
93                    return true;
94                } elseif ( $valA === 'mw:DisplaySpace' ) {
95                    return $valB === 'mw:DisplaySpace mw:Placeholder';
96                } elseif ( $valB === 'mw:DisplaySpace' ) {
97                    return $valA === 'mw:DisplaySpace mw:Placeholder';
98                } else {
99                    return false;
100                }
101            }
102        ];
103    }
104
105    /**
106     * Diff two HTML documents, and add / update data-parsoid-diff attributes with
107     * change information.
108     *
109     * @param Element $nodeA
110     * @param Element $nodeB
111     * @return array
112     */
113    public function diff( Element $nodeA, Element $nodeB ): array {
114        Assert::invariant(
115            $nodeA->ownerDocument !== $nodeB->ownerDocument,
116            'Expected to be diff\'ing different documents.'
117        );
118
119        $this->debug( static function () use( $nodeA, $nodeB ) {
120            return "ORIG:\n" .
121                DOMCompat::getOuterHTML( $nodeA ) .
122                "\nNEW :\n" .
123                DOMCompat::getOuterHTML( $nodeB );
124        } );
125
126        // The root nodes are equal, call recursive differ
127        $foundChange = $this->doDOMDiff( $nodeA, $nodeB );
128        return [ 'isEmpty' => !$foundChange ];
129    }
130
131    /**
132     * Test if two DOM nodes are equal.
133     *
134     * @param Node $nodeA
135     * @param Node $nodeB
136     * @param bool $deep
137     * @return bool
138     */
139    public function treeEquals( Node $nodeA, Node $nodeB, bool $deep ): bool {
140        if ( $nodeA->nodeType !== $nodeB->nodeType ) {
141            return false;
142        } elseif ( $nodeA instanceof Text ) {
143            // In the past we've had bugs where we let non-primitive strings
144            // leak into our DOM.  Safety first:
145            Assert::invariant( $nodeA->nodeValue === (string)$nodeA->nodeValue, '' );
146            Assert::invariant( $nodeB->nodeValue === (string)$nodeB->nodeValue, '' );
147            // ok, now do the comparison.
148            return $nodeA->nodeValue === $nodeB->nodeValue;
149        } elseif ( $nodeA instanceof Comment ) {
150            return WTUtils::decodeComment( $nodeA->nodeValue ) ===
151                WTUtils::decodeComment( $nodeB->nodeValue );
152        } elseif ( $nodeA instanceof Element || $nodeA instanceof DocumentFragment ) {
153            if ( $nodeA instanceof DocumentFragment ) {
154                if ( !( $nodeB instanceof DocumentFragment ) ) {
155                    return false;
156                }
157            } else {  // $nodeA instanceof Element
158                // Compare node name and attribute length
159                if (
160                    !( $nodeB instanceof Element ) ||
161                    DOMCompat::nodeName( $nodeA ) !== DOMCompat::nodeName( $nodeB ) ||
162                    !DiffUtils::attribsEquals(
163                        $nodeA,
164                        $nodeB,
165                        self::IGNORE_ATTRIBUTES,
166                        $this->specializedAttribHandlers
167                    )
168                ) {
169                    return false;
170                }
171            }
172
173            // Passed all tests, node itself is equal.
174            if ( $deep ) {
175                $childA = null;
176                $childB = null;
177                // Compare # of children, since that's fast.
178                // (Avoid de-optimizing DOM by using node#childNodes)
179                for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild;
180                    $childA && $childB;
181                    $childA = $childA->nextSibling, $childB = $childB->nextSibling
182                ) {
183                    /* don't look inside children yet, just look at # of children */
184                }
185
186                if ( $childA || $childB ) {
187                    return false; /* nodes have different # of children */
188                }
189                // Now actually compare the child subtrees
190                for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild;
191                    $childA && $childB;
192                    $childA = $childA->nextSibling, $childB = $childB->nextSibling
193                ) {
194                    if ( !$this->treeEquals( $childA, $childB, $deep ) ) {
195                        return false;
196                    }
197                }
198            }
199
200            // Did not find a diff yet, so the trees must be equal.
201            return true;
202        }
203        throw new UnreachableException( 'we shouldn\'t get here' );
204    }
205
206    /**
207     * Diff two DOM trees by comparing them node-by-node.
208     *
209     * TODO: Implement something more intelligent like
210     * http://gregory.cobena.free.fr/www/Publications/%5BICDE2002%5D%20XyDiff%20-%20published%20version.pdf,
211     * which uses hash signatures of subtrees to efficiently detect moves /
212     * wrapping.
213     *
214     * Adds / updates a data-parsoid-diff structure with change information.
215     *
216     * Returns true if subtree is changed, false otherwise.
217     *
218     * TODO:
219     * Assume typical CSS white-space, so ignore ws diffs in non-pre content.
220     *
221     * @param Node $baseParentNode
222     * @param Node $newParentNode
223     * @return bool
224     */
225    public function doDOMDiff( Node $baseParentNode, Node $newParentNode ): bool {
226        // Perform a relaxed version of the recursive treeEquals algorithm that
227        // allows for some minor differences and tries to produce a sensible diff
228        // marking using heuristics like look-ahead on siblings.
229        $baseNode = $baseParentNode->firstChild;
230        $newNode = $newParentNode->firstChild;
231        $lookaheadNode = null;
232        $foundDiffOverall = false;
233
234        while ( $baseNode && $newNode ) {
235            $dontAdvanceNewNode = false;
236            $this->debugOut( $baseNode, $newNode );
237            // shallow check first
238            if ( !$this->treeEquals( $baseNode, $newNode, false ) ) {
239                $this->debug( '-- not equal --' );
240                $savedNewNode = $newNode;
241                $foundDiff = false;
242
243                // Some simplistic look-ahead, currently limited to a single level
244                // in the DOM.
245
246                // look-ahead in *new* DOM to detect insertions
247                if ( DiffDOMUtils::isContentNode( $baseNode ) ) {
248                    $this->debug( '--lookahead in new dom--' );
249                    $lookaheadNode = $newNode->nextSibling;
250                    while ( $lookaheadNode ) {
251                        $this->debugOut( $baseNode, $lookaheadNode, 'new' );
252                        if ( DiffDOMUtils::isContentNode( $lookaheadNode ) &&
253                            $this->treeEquals( $baseNode, $lookaheadNode, true )
254                        ) {
255                            // mark skipped-over nodes as inserted
256                            $markNode = $newNode;
257                            while ( $markNode !== $lookaheadNode ) {
258                                $this->debug( '--found diff: inserted--' );
259                                $this->markNode( $markNode, DiffMarkers::INSERTED );
260                                $markNode = $markNode->nextSibling;
261                            }
262                            $foundDiff = true;
263                            $newNode = $lookaheadNode;
264                            break;
265                        }
266                        $lookaheadNode = self::nextNonTemplateSibling( $lookaheadNode );
267                    }
268                }
269
270                // look-ahead in *base* DOM to detect deletions
271                if ( !$foundDiff && DiffDOMUtils::isContentNode( $newNode ) ) {
272                    $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $baseNode );
273                    $this->debug( '--lookahead in old dom--' );
274                    $lookaheadNode = $baseNode->nextSibling;
275                    while ( $lookaheadNode ) {
276                        $this->debugOut( $lookaheadNode, $newNode, 'old' );
277                        if ( DiffDOMUtils::isContentNode( $lookaheadNode ) &&
278                            $this->treeEquals( $lookaheadNode, $newNode, true )
279                        ) {
280                            $this->debug( '--found diff: deleted--' );
281                            // mark skipped-over nodes as deleted
282                            $this->markNode( $newNode, DiffMarkers::DELETED, $isBlockNode );
283                            $baseNode = $lookaheadNode;
284                            $foundDiff = true;
285                            break;
286                        } elseif ( !WTUtils::emitsSolTransparentSingleLineWT( $lookaheadNode ) ) {
287                            // We only care about the deleted node prior to the one that
288                            // gets a tree match (but, ignore nodes that show up in wikitext
289                            // but don't affect sol-state or HTML rendering -- note that
290                            // whitespace is being ignored, but that whitespace occurs
291                            // between block nodes).
292                            $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $lookaheadNode );
293                        }
294                        $lookaheadNode = self::nextNonTemplateSibling( $lookaheadNode );
295                    }
296                }
297
298                if ( !$foundDiff ) {
299                    if ( !( $savedNewNode instanceof Element ) ) {
300                        $this->debug( '--found diff: modified text/comment--' );
301                        $this->markNode(
302                            $savedNewNode, DiffMarkers::DELETED,
303                            WTUtils::isBlockNodeWithVisibleWT( $baseNode )
304                        );
305                    } elseif ( $baseNode instanceof Element &&
306                        DOMCompat::nodeName( $savedNewNode ) === DOMCompat::nodeName( $baseNode ) &&
307                        ( DOMDataUtils::getDataParsoid( $savedNewNode )->stx ?? null ) ===
308                        ( DOMDataUtils::getDataParsoid( $baseNode )->stx ?? null )
309                    ) {
310                        // Identical wrapper-type, but modified.
311                        // Mark modified-wrapper, and recurse.
312                        $this->debug( '--found diff: modified-wrapper--' );
313                        $this->markNode( $savedNewNode, DiffMarkers::MODIFIED_WRAPPER );
314                        $this->subtreeDiffers( $baseNode, $savedNewNode );
315                    } else {
316                        // We now want to compare current newNode with the next baseNode.
317                        $dontAdvanceNewNode = true;
318
319                        // Since we are advancing in an old DOM without advancing
320                        // in the new DOM, there were deletions. Add a deletion marker
321                        // since this is important for accurate separator handling in selser.
322                        $this->markNode(
323                            $savedNewNode, DiffMarkers::DELETED,
324                            WTUtils::isBlockNodeWithVisibleWT( $baseNode )
325                        );
326                    }
327                }
328
329                // Record the fact that direct children changed in the parent node
330                $this->debug( '--found diff: children-changed--' );
331                $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED );
332
333                $foundDiffOverall = true;
334            } elseif ( $this->subtreeDiffers( $baseNode, $newNode ) ) {
335                $foundDiffOverall = true;
336            }
337
338            // And move on to the next pair (skipping over template HTML)
339            if ( $baseNode && $newNode ) {
340                $baseNode = self::nextNonTemplateSibling( $baseNode );
341                if ( !$dontAdvanceNewNode ) {
342                    $newNode = self::nextNonTemplateSibling( $newNode );
343                }
344            }
345        }
346
347        // mark extra new nodes as inserted
348        while ( $newNode ) {
349            $this->debug( '--found trailing new node: inserted--' );
350            $this->markNode( $newNode, DiffMarkers::INSERTED );
351            $foundDiffOverall = true;
352            $newNode = self::nextNonTemplateSibling( $newNode );
353        }
354
355        // If there are extra base nodes, something was deleted. Mark the parent as
356        // having lost children for now.
357        if ( $baseNode ) {
358            $this->debug( '--found trailing base nodes: deleted--' );
359            $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED );
360            // SSS FIXME: WTS checks for zero children in a few places
361            // That code would have to be upgraded if we emit mw:DiffMarker
362            // in this scenario. So, bailing out in this one case for now.
363            if ( $newParentNode->hasChildNodes() ) {
364                $meta = $newParentNode->ownerDocument->createElement( 'meta' );
365                DOMUtils::addTypeOf( $meta, 'mw:DiffMarker/deleted' );
366                if ( WTUtils::isBlockNodeWithVisibleWT( $baseNode ) ) {
367                    $meta->setAttribute( 'data-is-block', 'true' );
368                }
369                $newParentNode->appendChild( $meta );
370            }
371            $foundDiffOverall = true;
372        }
373
374        return $foundDiffOverall;
375    }
376
377    /* ***************************************************
378     * Helpers
379     * ***************************************************/
380
381    /**
382     * @param Node $baseNode
383     * @param Node $newNode
384     * @return bool
385     */
386    private function subtreeDiffers( Node $baseNode, Node $newNode ): bool {
387        $baseEncapsulated = WTUtils::isEncapsulationWrapper( $baseNode );
388        $newEncapsulated = WTUtils::isEncapsulationWrapper( $newNode );
389
390        if ( !$baseEncapsulated && !$newEncapsulated ) {
391            $this->debug( '--shallow equal: recursing--' );
392            // Recursively diff subtrees if not template-like content
393            $subtreeDiffers = $this->doDOMDiff( $baseNode, $newNode );
394        } elseif ( $baseEncapsulated && $newEncapsulated ) {
395            '@phan-var Element $baseNode';  // @var Element $baseNode
396            '@phan-var Element $newNode';  // @var Element $newNode
397
398            $extType = DOMUtils::matchTypeOf( $baseNode, '!^mw:Extension/!' );
399            $ext = null;
400
401            if ( $extType ) {
402                $dataMw = DOMDataUtils::getDataMw( $baseNode );
403                // FIXME: The EncapsulatedContentHandler tries to get the name from
404                // the typeOf if it isn't in dataMw ...
405                $ext = $this->env->getSiteConfig()->getExtTagImpl( $dataMw->name ?? '' );
406            }
407
408            if (
409                $ext && ( DOMUtils::matchTypeOf( $newNode, '!^mw:Extension/!' ) === $extType )
410            ) {
411                $this->debug( '--diffing extension content--' );
412                $subtreeDiffers = $ext->diffHandler(
413                    $this->extApi, [ $this, 'doDOMDiff' ], $baseNode, $newNode
414                );
415            } else {
416                // Otherwise, for encapsulated content, we don't know about the subtree.
417                $subtreeDiffers = false;
418            }
419        } else {
420            // FIXME: Maybe $editNode should be marked as inserted to avoid
421            // losing any edits, at the cost of more normalization.
422            // $state->inModifiedContent is only set when we're in inserted
423            // content, so not sure this is currently doing all that much.
424            $subtreeDiffers = true;
425        }
426
427        if ( $subtreeDiffers ) {
428            $this->debug( '--found diff: subtree-changed--' );
429            $this->markNode( $newNode, DiffMarkers::SUBTREE_CHANGED );
430        }
431        return $subtreeDiffers;
432    }
433
434    /**
435     * @param Node $node
436     * @param string $mark
437     * @param bool $blockNodeDeleted
438     */
439    private function markNode( Node $node, string $mark, bool $blockNodeDeleted = false ): void {
440        $meta = DiffUtils::addDiffMark( $node, $this->env, $mark );
441
442        if ( $meta && $blockNodeDeleted ) {
443            $meta->setAttribute( 'data-is-block', 'true' );
444        }
445
446        if ( $mark === DiffMarkers::DELETED || $mark === DiffMarkers::INSERTED ) {
447            $this->markNode( $node->parentNode, DiffMarkers::CHILDREN_CHANGED );
448        }
449
450        // Clear out speculatively computed DSR values for data-mw-selser-wrapper nodes
451        // since they may be incorrect. This eliminates any inadvertent use of
452        // these incorrect values.
453        if ( $node instanceof Element && $node->hasAttribute( 'data-mw-selser-wrapper' ) ) {
454            DOMDataUtils::getDataParsoid( $node )->dsr = null;
455        }
456    }
457
458    /**
459     * @param Node $nodeA
460     * @param Node $nodeB
461     * @param string $laPrefix
462     */
463    private function debugOut( Node $nodeA, Node $nodeB, string $laPrefix = '' ): void {
464        $this->env->log(
465            'trace/domdiff',
466            '--> A' . $laPrefix . ':' .
467                ( $nodeA instanceof Element
468                    ? DOMCompat::getOuterHTML( $nodeA )
469                    : PHPUtils::jsonEncode( $nodeA->nodeValue ) )
470        );
471
472        $this->env->log(
473            'trace/domdiff',
474            '--> B' . $laPrefix . ':' .
475                ( $nodeB instanceof Element
476                    ? DOMCompat::getOuterHTML( $nodeB )
477                    : PHPUtils::jsonEncode( $nodeB->nodeValue ) )
478        );
479    }
480}