Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
33.17% covered (danger)
33.17%
67 / 202
0.00% covered (danger)
0.00%
0 / 8
CRAP
0.00% covered (danger)
0.00%
0 / 1
DOMDiff
33.17% covered (danger)
33.17%
67 / 202
0.00% covered (danger)
0.00%
0 / 8
1619.43
0.00% covered (danger)
0.00%
0 / 1
 nextAnalyzableSibling
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
12
 __construct
0.00% covered (danger)
0.00%
0 / 29
0.00% covered (danger)
0.00%
0 / 1
2
 diff
0.00% covered (danger)
0.00%
0 / 8
0.00% covered (danger)
0.00%
0 / 1
2
 treeEquals
0.00% covered (danger)
0.00%
0 / 35
0.00% covered (danger)
0.00%
0 / 1
380
 doDOMDiff
77.91% covered (warning)
77.91%
67 / 86
0.00% covered (danger)
0.00%
0 / 1
36.45
 subtreeDiffers
0.00% covered (danger)
0.00%
0 / 24
0.00% covered (danger)
0.00%
0 / 1
110
 markNode
0.00% covered (danger)
0.00%
0 / 7
0.00% covered (danger)
0.00%
0 / 1
56
 debugOut
0.00% covered (danger)
0.00%
0 / 10
0.00% covered (danger)
0.00%
0 / 1
12
1<?php
2declare( strict_types = 1 );
3
4namespace Wikimedia\Parsoid\Html2Wt;
5
6use Wikimedia\Assert\Assert;
7use Wikimedia\Assert\UnreachableException;
8use Wikimedia\Parsoid\Config\Env;
9use Wikimedia\Parsoid\DOM\Comment;
10use Wikimedia\Parsoid\DOM\DocumentFragment;
11use Wikimedia\Parsoid\DOM\Element;
12use Wikimedia\Parsoid\DOM\Node;
13use Wikimedia\Parsoid\DOM\Text;
14use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI;
15use Wikimedia\Parsoid\NodeData\DataMw;
16use Wikimedia\Parsoid\NodeData\DataMwI18n;
17use Wikimedia\Parsoid\NodeData\DataMwVariant;
18use Wikimedia\Parsoid\NodeData\DataParsoid;
19use Wikimedia\Parsoid\Utils\DiffDOMUtils;
20use Wikimedia\Parsoid\Utils\DOMDataUtils;
21use Wikimedia\Parsoid\Utils\DOMUtils;
22use Wikimedia\Parsoid\Utils\PHPUtils;
23use Wikimedia\Parsoid\Utils\WTUtils;
24
25/**
26 * A DOM diff helper class.
27 *
28 * Compares two DOMs and annotates a copy of the passed-in DOM with change
29 * information for the selective serializer.
30 */
31class DOMDiff {
32
33    // These attributes are ignored for equality purposes if they are added to a node.
34    private const IGNORE_ATTRIBUTES = [
35        // Note that we are explicitly not ignoring data-parsoid even though clients
36        // would never modify data-parsoid because SelectiveSerializer is wrapping text
37        // nodes in spans and speculatively computes DSR offsets for these span tags
38        // which are accurate for original DOM and may be inaccurate for the edited DOM.
39        // By diffing data-parsoid which diffs the DSR as well, we ensure we mark such
40        // nodes as modified and prevent use of those speculatively computed incorrect
41        // DSR values.
42        'data-parsoid-diff',
43        'about',
44        DOMDataUtils::DATA_OBJECT_ATTR_NAME,
45    ];
46
47    /**
48     * @var Env
49     */
50    public $env;
51
52    /** @var ParsoidExtensionAPI */
53    public $extApi;
54
55    /**
56     * @var array<string,callable(Element,mixed,Element,mixed):bool>
57     */
58    public $specializedAttribHandlers;
59
60    public bool $skipEncapsulatedContent = true;
61
62    private function nextAnalyzableSibling( Node $node ): ?Node {
63        if ( WTUtils::isEncapsulationWrapper( $node ) && $this->skipEncapsulatedContent ) {
64            return WTUtils::skipOverEncapsulatedContent( $node );
65        }
66        return $node->nextSibling;
67    }
68
69    public function __construct( Env $env ) {
70        $this->env = $env;
71        $this->extApi = new ParsoidExtensionAPI( $env );
72        $this->specializedAttribHandlers = [
73            'data-mw' => function (
74                Element $nodeA, DataMw $dmwA, Element $nodeB, DataMw $dmwB
75            ): bool {
76                return $dmwA->equalsWithComparator(
77                    $dmwB, fn ( $a, $b ) => $this->treeEquals( $a, $b, true )
78                );
79            },
80            'data-parsoid' => static function (
81                Element $nodeA, DataParsoid $dpA, Element $nodeB, DataParsoid $dpB
82            ): bool {
83                // @phan-suppress-next-line PhanPluginComparisonObjectEqualityNotStrict
84                return $dpA == $dpB;
85            },
86            'data-mw-i18n' => static function (
87                Element $nodeA, DataMwI18n $dmwi18nA, Element $nodeB, DataMwI18n $dmwi18nB
88            ): bool {
89                // @phan-suppress-next-line PhanPluginComparisonObjectEqualityNotStrict
90                return $dmwi18nA == $dmwi18nB;
91            },
92            'data-mw-variant' => function (
93                Element $nodeA, DataMwVariant $dmwvA, Element $nodeB, DataMwVariant $dmwvB
94            ): bool {
95                return $dmwvA->equalsWithComparator(
96                    $dmwvB,
97                    fn ( $dfA, $dfB ) => $this->treeEquals( $dfA, $dfB, true )
98                );
99            },
100        ];
101    }
102
103    /**
104     * Diff two HTML documents, and add / update data-parsoid-diff attributes with
105     * change information.
106     *
107     * @param Element $nodeA
108     * @param Element $nodeB
109     *
110     * @return array{isEmpty: bool}
111     */
112    public function diff( Element $nodeA, Element $nodeB ): array {
113        Assert::invariant(
114            $nodeA->ownerDocument !== $nodeB->ownerDocument,
115            'Expected to be diff\'ing different documents.'
116        );
117
118        $this->env->trace( 'domdiff', "ORIG:\n", $nodeA );
119        $this->env->trace( 'domdiff', "NEW :\n", $nodeB );
120
121        // The root nodes are equal, call recursive differ
122        $foundChange = $this->doDOMDiff( $nodeA, $nodeB );
123        return [ 'isEmpty' => !$foundChange ];
124    }
125
126    /**
127     * Test if two DOM nodes are equal.
128     *
129     * @param Node $nodeA
130     * @param Node $nodeB
131     * @param bool $deep
132     * @return bool
133     */
134    public function treeEquals( Node $nodeA, Node $nodeB, bool $deep ): bool {
135        if ( $nodeA->nodeType !== $nodeB->nodeType ) {
136            return false;
137        } elseif ( $nodeA instanceof Text ) {
138            // In the past we've had bugs where we let non-primitive strings
139            // leak into our DOM.  Safety first:
140            Assert::invariant( $nodeA->nodeValue === (string)$nodeA->nodeValue, '' );
141            Assert::invariant( $nodeB->nodeValue === (string)$nodeB->nodeValue, '' );
142            // ok, now do the comparison.
143            return $nodeA->nodeValue === $nodeB->nodeValue;
144        } elseif ( $nodeA instanceof Comment ) {
145            return WTUtils::decodeComment( $nodeA->nodeValue ) ===
146                WTUtils::decodeComment( $nodeB->nodeValue );
147        } elseif ( $nodeA instanceof Element || $nodeA instanceof DocumentFragment ) {
148            if ( $nodeA instanceof DocumentFragment ) {
149                if ( !( $nodeB instanceof DocumentFragment ) ) {
150                    return false;
151                }
152            } else {  // $nodeA instanceof Element
153                // Compare node name and attribute length
154                if (
155                    !( $nodeB instanceof Element ) ||
156                    DOMUtils::nodeName( $nodeA ) !== DOMUtils::nodeName( $nodeB ) ||
157                    !DiffUtils::attribsEquals(
158                        $nodeA,
159                        $nodeB,
160                        self::IGNORE_ATTRIBUTES,
161                        $this->specializedAttribHandlers
162                    )
163                ) {
164                    return false;
165                }
166            }
167
168            // Passed all tests, node itself is equal.
169            if ( $deep ) {
170                // Compare # of children, since that's fast.
171                // (Avoid de-optimizing DOM by using node#childNodes)
172                for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild;
173                    $childA && $childB;
174                    $childA = $childA->nextSibling, $childB = $childB->nextSibling
175                ) {
176                    /* don't look inside children yet, just look at # of children */
177                }
178
179                if ( $childA || $childB ) {
180                    return false; /* nodes have different # of children */
181                }
182                // Now actually compare the child subtrees
183                for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild;
184                    $childA && $childB;
185                    $childA = $childA->nextSibling, $childB = $childB->nextSibling
186                ) {
187                    if ( !$this->treeEquals( $childA, $childB, $deep ) ) {
188                        return false;
189                    }
190                }
191            }
192
193            // Did not find a diff yet, so the trees must be equal.
194            return true;
195        }
196        throw new UnreachableException( 'we shouldn\'t get here' );
197    }
198
199    /**
200     * Diff two DOM trees by comparing them node-by-node.
201     *
202     * TODO: Implement something more intelligent like
203     * http://gregory.cobena.free.fr/www/Publications/%5BICDE2002%5D%20XyDiff%20-%20published%20version.pdf,
204     * which uses hash signatures of subtrees to efficiently detect moves /
205     * wrapping.
206     *
207     * Adds / updates a data-parsoid-diff structure with change information.
208     *
209     * Returns true if subtree is changed, false otherwise.
210     *
211     * TODO:
212     * Assume typical CSS white-space, so ignore ws diffs in non-pre content.
213     *
214     * @param Node $baseParentNode
215     * @param Node $newParentNode
216     * @return bool
217     */
218    public function doDOMDiff( Node $baseParentNode, Node $newParentNode ): bool {
219        // Perform a relaxed version of the recursive treeEquals algorithm that
220        // allows for some minor differences and tries to produce a sensible diff
221        // marking using heuristics like look-ahead on siblings.
222        $baseNode = $baseParentNode->firstChild;
223        $newNode = $newParentNode->firstChild;
224        $foundDiffOverall = false;
225
226        while ( $baseNode && $newNode ) {
227            $dontAdvanceNewNode = false;
228            $this->debugOut( $baseNode, $newNode );
229            // shallow check first
230            if ( !$this->treeEquals( $baseNode, $newNode, false ) ) {
231                $this->env->trace( 'domdiff', '-- not equal --' );
232                $savedNewNode = $newNode;
233                $foundDiff = false;
234
235                // Some simplistic look-ahead, currently limited to a single level
236                // in the DOM.
237
238                // look-ahead in *new* DOM to detect insertions
239                if ( DiffDOMUtils::isContentNode( $baseNode ) ) {
240                    $this->env->trace( 'domdiff', '--lookahead in new dom--' );
241                    $lookaheadNode = $newNode->nextSibling;
242                    while ( $lookaheadNode ) {
243                        $this->debugOut( $baseNode, $lookaheadNode, 'new' );
244                        if ( DiffDOMUtils::isContentNode( $lookaheadNode ) &&
245                            $this->treeEquals( $baseNode, $lookaheadNode, true )
246                        ) {
247                            // mark skipped-over nodes as inserted
248                            $markNode = $newNode;
249                            while ( $markNode !== $lookaheadNode ) {
250                                $this->env->trace( 'domdiff', '--found diff: inserted--' );
251                                $this->markNode( $markNode, DiffMarkers::INSERTED );
252                                $markNode = $markNode->nextSibling;
253                            }
254                            $foundDiff = true;
255                            $newNode = $lookaheadNode;
256                            break;
257                        }
258                        $lookaheadNode = self::nextAnalyzableSibling( $lookaheadNode );
259                    }
260                }
261
262                // look-ahead in *base* DOM to detect deletions
263                if ( !$foundDiff && DiffDOMUtils::isContentNode( $newNode ) ) {
264                    $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $baseNode );
265                    $this->env->trace( 'domdiff', '--lookahead in old dom--' );
266                    $lookaheadNode = $baseNode->nextSibling;
267                    while ( $lookaheadNode ) {
268                        $this->debugOut( $lookaheadNode, $newNode, 'old' );
269                        if ( DiffDOMUtils::isContentNode( $lookaheadNode ) &&
270                            $this->treeEquals( $lookaheadNode, $newNode, true )
271                        ) {
272                            $this->env->trace( 'domdiff', '--found diff: deleted--' );
273                            // mark skipped-over nodes as deleted
274                            $this->markNode( $newNode, DiffMarkers::DELETED, $isBlockNode );
275                            $baseNode = $lookaheadNode;
276                            $foundDiff = true;
277                            break;
278                        } elseif ( !WTUtils::emitsSolTransparentSingleLineWT( $lookaheadNode ) ) {
279                            // We only care about the deleted node prior to the one that
280                            // gets a tree match (but, ignore nodes that show up in wikitext
281                            // but don't affect sol-state or HTML rendering -- note that
282                            // whitespace is being ignored, but that whitespace occurs
283                            // between block nodes).
284                            $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $lookaheadNode );
285                        }
286                        $lookaheadNode = self::nextAnalyzableSibling( $lookaheadNode );
287                    }
288                }
289
290                if ( !$foundDiff ) {
291                    if ( !( $savedNewNode instanceof Element ) ) {
292                        $this->env->trace( 'domdiff', '--found diff: modified text/comment--' );
293                        $this->markNode(
294                            $savedNewNode, DiffMarkers::DELETED,
295                            WTUtils::isBlockNodeWithVisibleWT( $baseNode )
296                        );
297                    } elseif ( $baseNode instanceof Element &&
298                        DOMUtils::nodeName( $savedNewNode ) === DOMUtils::nodeName( $baseNode ) &&
299                        ( DOMDataUtils::getDataParsoid( $savedNewNode )->stx ?? null ) ===
300                        ( DOMDataUtils::getDataParsoid( $baseNode )->stx ?? null )
301                    ) {
302                        // Identical wrapper-type, but modified.
303                        // Mark modified-wrapper, and recurse.
304                        $this->env->trace( 'domdiff', '--found diff: modified-wrapper--' );
305                        $this->markNode( $savedNewNode, DiffMarkers::MODIFIED_WRAPPER );
306                        $this->subtreeDiffers( $baseNode, $savedNewNode );
307                    } else {
308                        // We now want to compare current newNode with the next baseNode.
309                        $dontAdvanceNewNode = true;
310
311                        // Since we are advancing in an old DOM without advancing
312                        // in the new DOM, there were deletions. Add a deletion marker
313                        // since this is important for accurate separator handling in selser.
314                        $this->markNode(
315                            $savedNewNode, DiffMarkers::DELETED,
316                            WTUtils::isBlockNodeWithVisibleWT( $baseNode )
317                        );
318                    }
319                }
320
321                // Record the fact that direct children changed in the parent node
322                $this->env->trace( 'domdiff', '--found diff: children-changed--' );
323                $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED );
324
325                $foundDiffOverall = true;
326            } elseif ( $this->subtreeDiffers( $baseNode, $newNode ) ) {
327                $foundDiffOverall = true;
328            }
329
330            // And move on to the next pair (skipping over template HTML)
331            if ( $baseNode && $newNode ) {
332                $baseNode = self::nextAnalyzableSibling( $baseNode );
333                if ( !$dontAdvanceNewNode ) {
334                    $newNode = self::nextAnalyzableSibling( $newNode );
335                }
336            }
337        }
338
339        // mark extra new nodes as inserted
340        while ( $newNode ) {
341            $this->env->trace( 'domdiff', '--found trailing new node: inserted--' );
342            $this->markNode( $newNode, DiffMarkers::INSERTED );
343            $foundDiffOverall = true;
344            $newNode = self::nextAnalyzableSibling( $newNode );
345        }
346
347        // If there are extra base nodes, something was deleted. Mark the parent as
348        // having lost children for now.
349        if ( $baseNode ) {
350            $this->env->trace( 'domdiff', '--found trailing base nodes: deleted--' );
351            $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED );
352            // SSS FIXME: WTS checks for zero children in a few places
353            // That code would have to be upgraded if we emit mw:DiffMarker
354            // in this scenario. So, bailing out in this one case for now.
355            if ( $newParentNode->hasChildNodes() ) {
356                $meta = $newParentNode->ownerDocument->createElement( 'meta' );
357                DOMUtils::addTypeOf( $meta, 'mw:DiffMarker/deleted' );
358                if ( WTUtils::isBlockNodeWithVisibleWT( $baseNode ) ) {
359                    $meta->setAttribute( 'data-is-block', 'true' );
360                }
361                $newParentNode->appendChild( $meta );
362            }
363            $foundDiffOverall = true;
364        }
365
366        return $foundDiffOverall;
367    }
368
369    /* ***************************************************
370     * Helpers
371     * ***************************************************/
372
373    /**
374     * @param Node $baseNode
375     * @param Node $newNode
376     * @return bool
377     */
378    private function subtreeDiffers( Node $baseNode, Node $newNode ): bool {
379        $baseEncapsulated = WTUtils::isEncapsulationWrapper( $baseNode );
380        $newEncapsulated = WTUtils::isEncapsulationWrapper( $newNode );
381
382        if ( !$baseEncapsulated && !$newEncapsulated ) {
383            $this->env->trace( 'domdiff', '--shallow equal: recursing--' );
384            // Recursively diff subtrees if not template-like content
385            $subtreeDiffers = $this->doDOMDiff( $baseNode, $newNode );
386        } elseif ( $baseEncapsulated && $newEncapsulated ) {
387            '@phan-var Element $baseNode';  // @var Element $baseNode
388            '@phan-var Element $newNode';  // @var Element $newNode
389
390            $ext = null;
391
392            $baseExtTagName = WTUtils::getExtTagName( $baseNode );
393            if ( $baseExtTagName ) {
394                $ext = $this->env->getSiteConfig()->getExtTagImpl( $baseExtTagName );
395            }
396
397            if ( $ext && ( $baseExtTagName === WTUtils::getExtTagName( $newNode ) ) ) {
398                $this->env->trace( 'domdiff', '--diffing extension content--' );
399                $subtreeDiffers = $ext->diffHandler(
400                    $this->extApi, [ $this, 'doDOMDiff' ], $baseNode, $newNode
401                );
402            } elseif ( $this->skipEncapsulatedContent ) {
403                // Otherwise, for encapsulated content, we don't know about the subtree.
404                $subtreeDiffers = false;
405            } else {
406                $this->env->trace( 'domdiff', '--shallow equal (encapsulated): recursing--' );
407                // Recursively diff subtrees if not template-like content
408                $subtreeDiffers = $this->doDOMDiff( $baseNode, $newNode );
409            }
410        } else {
411            // FIXME: Maybe $editNode should be marked as inserted to avoid
412            // losing any edits, at the cost of more normalization.
413            // $state->inInsertedContent is only set when we're in inserted
414            // content, so not sure this is currently doing all that much.
415            $subtreeDiffers = true;
416        }
417
418        if ( $subtreeDiffers ) {
419            $this->env->trace( 'domdiff', '--found diff: subtree-changed--' );
420            $this->markNode( $newNode, DiffMarkers::SUBTREE_CHANGED );
421        }
422        return $subtreeDiffers;
423    }
424
425    private function markNode( Node $node, DiffMarkers $mark, bool $blockNodeDeleted = false ): void {
426        $meta = DiffUtils::addDiffMark( $node, $this->env, $mark );
427
428        if ( $meta && $blockNodeDeleted ) {
429            $meta->setAttribute( 'data-is-block', 'true' );
430        }
431
432        if ( $mark === DiffMarkers::DELETED || $mark === DiffMarkers::INSERTED ) {
433            $this->markNode( $node->parentNode, DiffMarkers::CHILDREN_CHANGED );
434        }
435
436        // Clear out speculatively computed DSR values for data-mw-selser-wrapper nodes
437        // since they may be incorrect. This eliminates any inadvertent use of
438        // these incorrect values.
439        if ( $node instanceof Element && $node->hasAttribute( 'data-mw-selser-wrapper' ) ) {
440            DOMDataUtils::getDataParsoid( $node )->dsr = null;
441        }
442    }
443
444    private function debugOut( Node $nodeA, Node $nodeB, string $laPrefix = '' ): void {
445        $this->env->trace(
446            'domdiff',
447            '--> A', $laPrefix, ':',
448            static fn () => ( $nodeA instanceof Element ? $nodeA : PHPUtils::jsonEncode( $nodeA->nodeValue ) )
449        );
450        $this->env->trace(
451            'domdiff',
452            '--> B', $laPrefix, ':',
453            static fn () => ( $nodeB instanceof Element ? $nodeB : PHPUtils::jsonEncode( $nodeB->nodeValue ) )
454        );
455    }
456}