Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
33.50% |
68 / 203 |
|
0.00% |
0 / 9 |
CRAP | |
0.00% |
0 / 1 |
DOMDiff | |
33.50% |
68 / 203 |
|
0.00% |
0 / 9 |
1774.79 | |
0.00% |
0 / 1 |
nextAnalyzableSibling | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
debug | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
__construct | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
20 | |||
diff | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
treeEquals | |
0.00% |
0 / 37 |
|
0.00% |
0 / 1 |
380 | |||
doDOMDiff | |
78.16% |
68 / 87 |
|
0.00% |
0 / 1 |
36.17 | |||
subtreeDiffers | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
110 | |||
markNode | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
56 | |||
debugOut | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Assert\UnreachableException; |
8 | use Wikimedia\Parsoid\Config\Env; |
9 | use Wikimedia\Parsoid\DOM\Comment; |
10 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
11 | use Wikimedia\Parsoid\DOM\Element; |
12 | use Wikimedia\Parsoid\DOM\Node; |
13 | use Wikimedia\Parsoid\DOM\Text; |
14 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
15 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
16 | use Wikimedia\Parsoid\Utils\DOMCompat; |
17 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
18 | use Wikimedia\Parsoid\Utils\DOMUtils; |
19 | use Wikimedia\Parsoid\Utils\PHPUtils; |
20 | use Wikimedia\Parsoid\Utils\WTUtils; |
21 | |
22 | /** |
23 | * A DOM diff helper class. |
24 | * |
25 | * Compares two DOMs and annotates a copy of the passed-in DOM with change |
26 | * information for the selective serializer. |
27 | */ |
28 | class DOMDiff { |
29 | |
30 | // These attributes are ignored for equality purposes if they are added to a node. |
31 | private const IGNORE_ATTRIBUTES = [ |
32 | // Note that we are explicitly not ignoring data-parsoid even though clients |
33 | // would never modify data-parsoid because SelectiveSerializer is wrapping text |
34 | // nodes in spans and speculatively computes DSR offsets for these span tags |
35 | // which are accurate for original DOM and may be inaccurate for the edited DOM. |
36 | // By diffing data-parsoid which diffs the DSR as well, we ensure we mark such |
37 | // nodes as modified and prevent use of those speculatively computed incorrect |
38 | // DSR values. |
39 | 'data-parsoid-diff', |
40 | 'about', |
41 | DOMDataUtils::DATA_OBJECT_ATTR_NAME, |
42 | ]; |
43 | |
44 | /** |
45 | * @var Env |
46 | */ |
47 | public $env; |
48 | |
49 | /** @var ParsoidExtensionAPI */ |
50 | public $extApi; |
51 | |
52 | /** |
53 | * @var array |
54 | */ |
55 | public $specializedAttribHandlers; |
56 | |
57 | public bool $skipEncapsulatedContent = true; |
58 | |
59 | private function nextAnalyzableSibling( Node $node ): ?Node { |
60 | if ( WTUtils::isEncapsulationWrapper( $node ) && $this->skipEncapsulatedContent ) { |
61 | return WTUtils::skipOverEncapsulatedContent( $node ); |
62 | } |
63 | return $node->nextSibling; |
64 | } |
65 | |
66 | /** |
67 | * @param mixed ...$args |
68 | */ |
69 | private function debug( ...$args ): void { |
70 | $this->env->log( 'trace/domdiff', ...$args ); |
71 | } |
72 | |
73 | public function __construct( Env $env ) { |
74 | $this->env = $env; |
75 | $this->extApi = new ParsoidExtensionAPI( $env ); |
76 | $this->specializedAttribHandlers = [ |
77 | 'data-mw' => static function ( $nodeA, $dmwA, $nodeB, $dmwB ) { |
78 | return $dmwA == $dmwB; |
79 | }, |
80 | 'data-parsoid' => static function ( $nodeA, $dpA, $nodeB, $dpB ) { |
81 | return $dpA == $dpB; |
82 | }, |
83 | // TODO(T254502): This is added temporarily for backwards |
84 | // compatibility and can be removed when versions up to 2.1.0 |
85 | // are no longer stored |
86 | 'typeof' => static function ( $nodeA, $valA, $nodeB, $valB ) { |
87 | if ( $valA === $valB ) { |
88 | return true; |
89 | } elseif ( $valA === 'mw:DisplaySpace' ) { |
90 | return $valB === 'mw:DisplaySpace mw:Placeholder'; |
91 | } elseif ( $valB === 'mw:DisplaySpace' ) { |
92 | return $valA === 'mw:DisplaySpace mw:Placeholder'; |
93 | } else { |
94 | return false; |
95 | } |
96 | } |
97 | ]; |
98 | } |
99 | |
100 | /** |
101 | * Diff two HTML documents, and add / update data-parsoid-diff attributes with |
102 | * change information. |
103 | * |
104 | * @param Element $nodeA |
105 | * @param Element $nodeB |
106 | * @return array |
107 | */ |
108 | public function diff( Element $nodeA, Element $nodeB ): array { |
109 | Assert::invariant( |
110 | $nodeA->ownerDocument !== $nodeB->ownerDocument, |
111 | 'Expected to be diff\'ing different documents.' |
112 | ); |
113 | |
114 | $this->debug( static function () use( $nodeA, $nodeB ) { |
115 | return "ORIG:\n" . |
116 | DOMCompat::getOuterHTML( $nodeA ) . |
117 | "\nNEW :\n" . |
118 | DOMCompat::getOuterHTML( $nodeB ); |
119 | } ); |
120 | |
121 | // The root nodes are equal, call recursive differ |
122 | $foundChange = $this->doDOMDiff( $nodeA, $nodeB ); |
123 | return [ 'isEmpty' => !$foundChange ]; |
124 | } |
125 | |
126 | /** |
127 | * Test if two DOM nodes are equal. |
128 | * |
129 | * @param Node $nodeA |
130 | * @param Node $nodeB |
131 | * @param bool $deep |
132 | * @return bool |
133 | */ |
134 | public function treeEquals( Node $nodeA, Node $nodeB, bool $deep ): bool { |
135 | if ( $nodeA->nodeType !== $nodeB->nodeType ) { |
136 | return false; |
137 | } elseif ( $nodeA instanceof Text ) { |
138 | // In the past we've had bugs where we let non-primitive strings |
139 | // leak into our DOM. Safety first: |
140 | Assert::invariant( $nodeA->nodeValue === (string)$nodeA->nodeValue, '' ); |
141 | Assert::invariant( $nodeB->nodeValue === (string)$nodeB->nodeValue, '' ); |
142 | // ok, now do the comparison. |
143 | return $nodeA->nodeValue === $nodeB->nodeValue; |
144 | } elseif ( $nodeA instanceof Comment ) { |
145 | return WTUtils::decodeComment( $nodeA->nodeValue ) === |
146 | WTUtils::decodeComment( $nodeB->nodeValue ); |
147 | } elseif ( $nodeA instanceof Element || $nodeA instanceof DocumentFragment ) { |
148 | if ( $nodeA instanceof DocumentFragment ) { |
149 | if ( !( $nodeB instanceof DocumentFragment ) ) { |
150 | return false; |
151 | } |
152 | } else { // $nodeA instanceof Element |
153 | // Compare node name and attribute length |
154 | if ( |
155 | !( $nodeB instanceof Element ) || |
156 | DOMCompat::nodeName( $nodeA ) !== DOMCompat::nodeName( $nodeB ) || |
157 | !DiffUtils::attribsEquals( |
158 | $nodeA, |
159 | $nodeB, |
160 | self::IGNORE_ATTRIBUTES, |
161 | $this->specializedAttribHandlers |
162 | ) |
163 | ) { |
164 | return false; |
165 | } |
166 | } |
167 | |
168 | // Passed all tests, node itself is equal. |
169 | if ( $deep ) { |
170 | $childA = null; |
171 | $childB = null; |
172 | // Compare # of children, since that's fast. |
173 | // (Avoid de-optimizing DOM by using node#childNodes) |
174 | for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild; |
175 | $childA && $childB; |
176 | $childA = $childA->nextSibling, $childB = $childB->nextSibling |
177 | ) { |
178 | /* don't look inside children yet, just look at # of children */ |
179 | } |
180 | |
181 | if ( $childA || $childB ) { |
182 | return false; /* nodes have different # of children */ |
183 | } |
184 | // Now actually compare the child subtrees |
185 | for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild; |
186 | $childA && $childB; |
187 | $childA = $childA->nextSibling, $childB = $childB->nextSibling |
188 | ) { |
189 | if ( !$this->treeEquals( $childA, $childB, $deep ) ) { |
190 | return false; |
191 | } |
192 | } |
193 | } |
194 | |
195 | // Did not find a diff yet, so the trees must be equal. |
196 | return true; |
197 | } |
198 | throw new UnreachableException( 'we shouldn\'t get here' ); |
199 | } |
200 | |
201 | /** |
202 | * Diff two DOM trees by comparing them node-by-node. |
203 | * |
204 | * TODO: Implement something more intelligent like |
205 | * http://gregory.cobena.free.fr/www/Publications/%5BICDE2002%5D%20XyDiff%20-%20published%20version.pdf, |
206 | * which uses hash signatures of subtrees to efficiently detect moves / |
207 | * wrapping. |
208 | * |
209 | * Adds / updates a data-parsoid-diff structure with change information. |
210 | * |
211 | * Returns true if subtree is changed, false otherwise. |
212 | * |
213 | * TODO: |
214 | * Assume typical CSS white-space, so ignore ws diffs in non-pre content. |
215 | * |
216 | * @param Node $baseParentNode |
217 | * @param Node $newParentNode |
218 | * @return bool |
219 | */ |
220 | public function doDOMDiff( Node $baseParentNode, Node $newParentNode ): bool { |
221 | // Perform a relaxed version of the recursive treeEquals algorithm that |
222 | // allows for some minor differences and tries to produce a sensible diff |
223 | // marking using heuristics like look-ahead on siblings. |
224 | $baseNode = $baseParentNode->firstChild; |
225 | $newNode = $newParentNode->firstChild; |
226 | $lookaheadNode = null; |
227 | $foundDiffOverall = false; |
228 | |
229 | while ( $baseNode && $newNode ) { |
230 | $dontAdvanceNewNode = false; |
231 | $this->debugOut( $baseNode, $newNode ); |
232 | // shallow check first |
233 | if ( !$this->treeEquals( $baseNode, $newNode, false ) ) { |
234 | $this->debug( '-- not equal --' ); |
235 | $savedNewNode = $newNode; |
236 | $foundDiff = false; |
237 | |
238 | // Some simplistic look-ahead, currently limited to a single level |
239 | // in the DOM. |
240 | |
241 | // look-ahead in *new* DOM to detect insertions |
242 | if ( DiffDOMUtils::isContentNode( $baseNode ) ) { |
243 | $this->debug( '--lookahead in new dom--' ); |
244 | $lookaheadNode = $newNode->nextSibling; |
245 | while ( $lookaheadNode ) { |
246 | $this->debugOut( $baseNode, $lookaheadNode, 'new' ); |
247 | if ( DiffDOMUtils::isContentNode( $lookaheadNode ) && |
248 | $this->treeEquals( $baseNode, $lookaheadNode, true ) |
249 | ) { |
250 | // mark skipped-over nodes as inserted |
251 | $markNode = $newNode; |
252 | while ( $markNode !== $lookaheadNode ) { |
253 | $this->debug( '--found diff: inserted--' ); |
254 | $this->markNode( $markNode, DiffMarkers::INSERTED ); |
255 | $markNode = $markNode->nextSibling; |
256 | } |
257 | $foundDiff = true; |
258 | $newNode = $lookaheadNode; |
259 | break; |
260 | } |
261 | $lookaheadNode = self::nextAnalyzableSibling( $lookaheadNode ); |
262 | } |
263 | } |
264 | |
265 | // look-ahead in *base* DOM to detect deletions |
266 | if ( !$foundDiff && DiffDOMUtils::isContentNode( $newNode ) ) { |
267 | $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $baseNode ); |
268 | $this->debug( '--lookahead in old dom--' ); |
269 | $lookaheadNode = $baseNode->nextSibling; |
270 | while ( $lookaheadNode ) { |
271 | $this->debugOut( $lookaheadNode, $newNode, 'old' ); |
272 | if ( DiffDOMUtils::isContentNode( $lookaheadNode ) && |
273 | $this->treeEquals( $lookaheadNode, $newNode, true ) |
274 | ) { |
275 | $this->debug( '--found diff: deleted--' ); |
276 | // mark skipped-over nodes as deleted |
277 | $this->markNode( $newNode, DiffMarkers::DELETED, $isBlockNode ); |
278 | $baseNode = $lookaheadNode; |
279 | $foundDiff = true; |
280 | break; |
281 | } elseif ( !WTUtils::emitsSolTransparentSingleLineWT( $lookaheadNode ) ) { |
282 | // We only care about the deleted node prior to the one that |
283 | // gets a tree match (but, ignore nodes that show up in wikitext |
284 | // but don't affect sol-state or HTML rendering -- note that |
285 | // whitespace is being ignored, but that whitespace occurs |
286 | // between block nodes). |
287 | $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $lookaheadNode ); |
288 | } |
289 | $lookaheadNode = self::nextAnalyzableSibling( $lookaheadNode ); |
290 | } |
291 | } |
292 | |
293 | if ( !$foundDiff ) { |
294 | if ( !( $savedNewNode instanceof Element ) ) { |
295 | $this->debug( '--found diff: modified text/comment--' ); |
296 | $this->markNode( |
297 | $savedNewNode, DiffMarkers::DELETED, |
298 | WTUtils::isBlockNodeWithVisibleWT( $baseNode ) |
299 | ); |
300 | } elseif ( $baseNode instanceof Element && |
301 | DOMCompat::nodeName( $savedNewNode ) === DOMCompat::nodeName( $baseNode ) && |
302 | ( DOMDataUtils::getDataParsoid( $savedNewNode )->stx ?? null ) === |
303 | ( DOMDataUtils::getDataParsoid( $baseNode )->stx ?? null ) |
304 | ) { |
305 | // Identical wrapper-type, but modified. |
306 | // Mark modified-wrapper, and recurse. |
307 | $this->debug( '--found diff: modified-wrapper--' ); |
308 | $this->markNode( $savedNewNode, DiffMarkers::MODIFIED_WRAPPER ); |
309 | $this->subtreeDiffers( $baseNode, $savedNewNode ); |
310 | } else { |
311 | // We now want to compare current newNode with the next baseNode. |
312 | $dontAdvanceNewNode = true; |
313 | |
314 | // Since we are advancing in an old DOM without advancing |
315 | // in the new DOM, there were deletions. Add a deletion marker |
316 | // since this is important for accurate separator handling in selser. |
317 | $this->markNode( |
318 | $savedNewNode, DiffMarkers::DELETED, |
319 | WTUtils::isBlockNodeWithVisibleWT( $baseNode ) |
320 | ); |
321 | } |
322 | } |
323 | |
324 | // Record the fact that direct children changed in the parent node |
325 | $this->debug( '--found diff: children-changed--' ); |
326 | $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED ); |
327 | |
328 | $foundDiffOverall = true; |
329 | } elseif ( $this->subtreeDiffers( $baseNode, $newNode ) ) { |
330 | $foundDiffOverall = true; |
331 | } |
332 | |
333 | // And move on to the next pair (skipping over template HTML) |
334 | if ( $baseNode && $newNode ) { |
335 | $baseNode = self::nextAnalyzableSibling( $baseNode ); |
336 | if ( !$dontAdvanceNewNode ) { |
337 | $newNode = self::nextAnalyzableSibling( $newNode ); |
338 | } |
339 | } |
340 | } |
341 | |
342 | // mark extra new nodes as inserted |
343 | while ( $newNode ) { |
344 | $this->debug( '--found trailing new node: inserted--' ); |
345 | $this->markNode( $newNode, DiffMarkers::INSERTED ); |
346 | $foundDiffOverall = true; |
347 | $newNode = self::nextAnalyzableSibling( $newNode ); |
348 | } |
349 | |
350 | // If there are extra base nodes, something was deleted. Mark the parent as |
351 | // having lost children for now. |
352 | if ( $baseNode ) { |
353 | $this->debug( '--found trailing base nodes: deleted--' ); |
354 | $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED ); |
355 | // SSS FIXME: WTS checks for zero children in a few places |
356 | // That code would have to be upgraded if we emit mw:DiffMarker |
357 | // in this scenario. So, bailing out in this one case for now. |
358 | if ( $newParentNode->hasChildNodes() ) { |
359 | $meta = $newParentNode->ownerDocument->createElement( 'meta' ); |
360 | DOMUtils::addTypeOf( $meta, 'mw:DiffMarker/deleted' ); |
361 | if ( WTUtils::isBlockNodeWithVisibleWT( $baseNode ) ) { |
362 | $meta->setAttribute( 'data-is-block', 'true' ); |
363 | } |
364 | $newParentNode->appendChild( $meta ); |
365 | } |
366 | $foundDiffOverall = true; |
367 | } |
368 | |
369 | return $foundDiffOverall; |
370 | } |
371 | |
372 | /* *************************************************** |
373 | * Helpers |
374 | * ***************************************************/ |
375 | |
376 | /** |
377 | * @param Node $baseNode |
378 | * @param Node $newNode |
379 | * @return bool |
380 | */ |
381 | private function subtreeDiffers( Node $baseNode, Node $newNode ): bool { |
382 | $baseEncapsulated = WTUtils::isEncapsulationWrapper( $baseNode ); |
383 | $newEncapsulated = WTUtils::isEncapsulationWrapper( $newNode ); |
384 | |
385 | if ( !$baseEncapsulated && !$newEncapsulated ) { |
386 | $this->debug( '--shallow equal: recursing--' ); |
387 | // Recursively diff subtrees if not template-like content |
388 | $subtreeDiffers = $this->doDOMDiff( $baseNode, $newNode ); |
389 | } elseif ( $baseEncapsulated && $newEncapsulated ) { |
390 | '@phan-var Element $baseNode'; // @var Element $baseNode |
391 | '@phan-var Element $newNode'; // @var Element $newNode |
392 | |
393 | $ext = null; |
394 | |
395 | $baseExtTagName = WTUtils::getExtTagName( $baseNode ); |
396 | if ( $baseExtTagName ) { |
397 | $ext = $this->env->getSiteConfig()->getExtTagImpl( $baseExtTagName ); |
398 | } |
399 | |
400 | if ( $ext && ( $baseExtTagName === WTUtils::getExtTagName( $newNode ) ) ) { |
401 | $this->debug( '--diffing extension content--' ); |
402 | $subtreeDiffers = $ext->diffHandler( |
403 | $this->extApi, [ $this, 'doDOMDiff' ], $baseNode, $newNode |
404 | ); |
405 | } elseif ( $this->skipEncapsulatedContent ) { |
406 | // Otherwise, for encapsulated content, we don't know about the subtree. |
407 | $subtreeDiffers = false; |
408 | } else { |
409 | $this->debug( '--shallow equal (encapsulated): recursing--' ); |
410 | // Recursively diff subtrees if not template-like content |
411 | $subtreeDiffers = $this->doDOMDiff( $baseNode, $newNode ); |
412 | } |
413 | } else { |
414 | // FIXME: Maybe $editNode should be marked as inserted to avoid |
415 | // losing any edits, at the cost of more normalization. |
416 | // $state->inInsertedContent is only set when we're in inserted |
417 | // content, so not sure this is currently doing all that much. |
418 | $subtreeDiffers = true; |
419 | } |
420 | |
421 | if ( $subtreeDiffers ) { |
422 | $this->debug( '--found diff: subtree-changed--' ); |
423 | $this->markNode( $newNode, DiffMarkers::SUBTREE_CHANGED ); |
424 | } |
425 | return $subtreeDiffers; |
426 | } |
427 | |
428 | private function markNode( Node $node, string $mark, bool $blockNodeDeleted = false ): void { |
429 | $meta = DiffUtils::addDiffMark( $node, $this->env, $mark ); |
430 | |
431 | if ( $meta && $blockNodeDeleted ) { |
432 | $meta->setAttribute( 'data-is-block', 'true' ); |
433 | } |
434 | |
435 | if ( $mark === DiffMarkers::DELETED || $mark === DiffMarkers::INSERTED ) { |
436 | $this->markNode( $node->parentNode, DiffMarkers::CHILDREN_CHANGED ); |
437 | } |
438 | |
439 | // Clear out speculatively computed DSR values for data-mw-selser-wrapper nodes |
440 | // since they may be incorrect. This eliminates any inadvertent use of |
441 | // these incorrect values. |
442 | if ( $node instanceof Element && $node->hasAttribute( 'data-mw-selser-wrapper' ) ) { |
443 | DOMDataUtils::getDataParsoid( $node )->dsr = null; |
444 | } |
445 | } |
446 | |
447 | private function debugOut( Node $nodeA, Node $nodeB, string $laPrefix = '' ): void { |
448 | $prefix = 'trace/domdiff'; |
449 | $this->env->log( $prefix, |
450 | static fn () => '--> A' . $laPrefix . ':' . |
451 | ( $nodeA instanceof Element |
452 | ? DOMCompat::getOuterHTML( $nodeA ) |
453 | : PHPUtils::jsonEncode( $nodeA->nodeValue ) ) |
454 | ); |
455 | $this->env->log( $prefix, |
456 | static fn () => '--> B' . $laPrefix . ':' . |
457 | ( $nodeB instanceof Element |
458 | ? DOMCompat::getOuterHTML( $nodeB ) |
459 | : PHPUtils::jsonEncode( $nodeB->nodeValue ) ) |
460 | ); |
461 | } |
462 | } |