Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
33.66% |
68 / 202 |
|
0.00% |
0 / 9 |
CRAP | |
0.00% |
0 / 1 |
DOMDiff | |
33.66% |
68 / 202 |
|
0.00% |
0 / 9 |
1672.54 | |
0.00% |
0 / 1 |
nextNonTemplateSibling | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
debug | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
__construct | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
20 | |||
diff | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
treeEquals | |
0.00% |
0 / 37 |
|
0.00% |
0 / 1 |
380 | |||
doDOMDiff | |
78.16% |
68 / 87 |
|
0.00% |
0 / 1 |
36.17 | |||
subtreeDiffers | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
90 | |||
markNode | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
56 | |||
debugOut | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Assert\UnreachableException; |
8 | use Wikimedia\Parsoid\Config\Env; |
9 | use Wikimedia\Parsoid\DOM\Comment; |
10 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
11 | use Wikimedia\Parsoid\DOM\Element; |
12 | use Wikimedia\Parsoid\DOM\Node; |
13 | use Wikimedia\Parsoid\DOM\Text; |
14 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
15 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
16 | use Wikimedia\Parsoid\Utils\DOMCompat; |
17 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
18 | use Wikimedia\Parsoid\Utils\DOMUtils; |
19 | use Wikimedia\Parsoid\Utils\PHPUtils; |
20 | use Wikimedia\Parsoid\Utils\WTUtils; |
21 | |
22 | /** |
23 | * A DOM diff helper class. |
24 | * |
25 | * Compares two DOMs and annotates a copy of the passed-in DOM with change |
26 | * information for the selective serializer. |
27 | */ |
28 | class DOMDiff { |
29 | |
30 | // These attributes are ignored for equality purposes if they are added to a node. |
31 | private const IGNORE_ATTRIBUTES = [ |
32 | // Note that we are explicitly not ignoring data-parsoid even though clients |
33 | // would never modify data-parsoid because SelectiveSerializer is wrapping text |
34 | // nodes in spans and speculatively computes DSR offsets for these span tags |
35 | // which are accurate for original DOM and may be inaccurate for the edited DOM. |
36 | // By diffing data-parsoid which diffs the DSR as well, we ensure we mark such |
37 | // nodes as modified and prevent use of those speculatively computed incorrect |
38 | // DSR values. |
39 | 'data-parsoid-diff', |
40 | 'about', |
41 | DOMDataUtils::DATA_OBJECT_ATTR_NAME, |
42 | ]; |
43 | |
44 | /** |
45 | * @var Env |
46 | */ |
47 | public $env; |
48 | |
49 | /** @var ParsoidExtensionAPI */ |
50 | public $extApi; |
51 | |
52 | /** |
53 | * @var array |
54 | */ |
55 | public $specializedAttribHandlers; |
56 | |
57 | /** |
58 | * @param Node $node |
59 | * @return Node|null |
60 | */ |
61 | private function nextNonTemplateSibling( Node $node ): ?Node { |
62 | if ( WTUtils::isEncapsulationWrapper( $node ) ) { |
63 | return WTUtils::skipOverEncapsulatedContent( $node ); |
64 | } |
65 | return $node->nextSibling; |
66 | } |
67 | |
68 | /** |
69 | * @param mixed ...$args |
70 | */ |
71 | private function debug( ...$args ): void { |
72 | $this->env->log( 'trace/domdiff', ...$args ); |
73 | } |
74 | |
75 | /** |
76 | * @param Env $env |
77 | */ |
78 | public function __construct( Env $env ) { |
79 | $this->env = $env; |
80 | $this->extApi = new ParsoidExtensionAPI( $env ); |
81 | $this->specializedAttribHandlers = [ |
82 | 'data-mw' => static function ( $nodeA, $dmwA, $nodeB, $dmwB ) { |
83 | return $dmwA == $dmwB; |
84 | }, |
85 | 'data-parsoid' => static function ( $nodeA, $dpA, $nodeB, $dpB ) { |
86 | return $dpA == $dpB; |
87 | }, |
88 | // TODO(T254502): This is added temporarily for backwards |
89 | // compatibility and can be removed when versions up to 2.1.0 |
90 | // are no longer stored |
91 | 'typeof' => static function ( $nodeA, $valA, $nodeB, $valB ) { |
92 | if ( $valA === $valB ) { |
93 | return true; |
94 | } elseif ( $valA === 'mw:DisplaySpace' ) { |
95 | return $valB === 'mw:DisplaySpace mw:Placeholder'; |
96 | } elseif ( $valB === 'mw:DisplaySpace' ) { |
97 | return $valA === 'mw:DisplaySpace mw:Placeholder'; |
98 | } else { |
99 | return false; |
100 | } |
101 | } |
102 | ]; |
103 | } |
104 | |
105 | /** |
106 | * Diff two HTML documents, and add / update data-parsoid-diff attributes with |
107 | * change information. |
108 | * |
109 | * @param Element $nodeA |
110 | * @param Element $nodeB |
111 | * @return array |
112 | */ |
113 | public function diff( Element $nodeA, Element $nodeB ): array { |
114 | Assert::invariant( |
115 | $nodeA->ownerDocument !== $nodeB->ownerDocument, |
116 | 'Expected to be diff\'ing different documents.' |
117 | ); |
118 | |
119 | $this->debug( static function () use( $nodeA, $nodeB ) { |
120 | return "ORIG:\n" . |
121 | DOMCompat::getOuterHTML( $nodeA ) . |
122 | "\nNEW :\n" . |
123 | DOMCompat::getOuterHTML( $nodeB ); |
124 | } ); |
125 | |
126 | // The root nodes are equal, call recursive differ |
127 | $foundChange = $this->doDOMDiff( $nodeA, $nodeB ); |
128 | return [ 'isEmpty' => !$foundChange ]; |
129 | } |
130 | |
131 | /** |
132 | * Test if two DOM nodes are equal. |
133 | * |
134 | * @param Node $nodeA |
135 | * @param Node $nodeB |
136 | * @param bool $deep |
137 | * @return bool |
138 | */ |
139 | public function treeEquals( Node $nodeA, Node $nodeB, bool $deep ): bool { |
140 | if ( $nodeA->nodeType !== $nodeB->nodeType ) { |
141 | return false; |
142 | } elseif ( $nodeA instanceof Text ) { |
143 | // In the past we've had bugs where we let non-primitive strings |
144 | // leak into our DOM. Safety first: |
145 | Assert::invariant( $nodeA->nodeValue === (string)$nodeA->nodeValue, '' ); |
146 | Assert::invariant( $nodeB->nodeValue === (string)$nodeB->nodeValue, '' ); |
147 | // ok, now do the comparison. |
148 | return $nodeA->nodeValue === $nodeB->nodeValue; |
149 | } elseif ( $nodeA instanceof Comment ) { |
150 | return WTUtils::decodeComment( $nodeA->nodeValue ) === |
151 | WTUtils::decodeComment( $nodeB->nodeValue ); |
152 | } elseif ( $nodeA instanceof Element || $nodeA instanceof DocumentFragment ) { |
153 | if ( $nodeA instanceof DocumentFragment ) { |
154 | if ( !( $nodeB instanceof DocumentFragment ) ) { |
155 | return false; |
156 | } |
157 | } else { // $nodeA instanceof Element |
158 | // Compare node name and attribute length |
159 | if ( |
160 | !( $nodeB instanceof Element ) || |
161 | DOMCompat::nodeName( $nodeA ) !== DOMCompat::nodeName( $nodeB ) || |
162 | !DiffUtils::attribsEquals( |
163 | $nodeA, |
164 | $nodeB, |
165 | self::IGNORE_ATTRIBUTES, |
166 | $this->specializedAttribHandlers |
167 | ) |
168 | ) { |
169 | return false; |
170 | } |
171 | } |
172 | |
173 | // Passed all tests, node itself is equal. |
174 | if ( $deep ) { |
175 | $childA = null; |
176 | $childB = null; |
177 | // Compare # of children, since that's fast. |
178 | // (Avoid de-optimizing DOM by using node#childNodes) |
179 | for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild; |
180 | $childA && $childB; |
181 | $childA = $childA->nextSibling, $childB = $childB->nextSibling |
182 | ) { |
183 | /* don't look inside children yet, just look at # of children */ |
184 | } |
185 | |
186 | if ( $childA || $childB ) { |
187 | return false; /* nodes have different # of children */ |
188 | } |
189 | // Now actually compare the child subtrees |
190 | for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild; |
191 | $childA && $childB; |
192 | $childA = $childA->nextSibling, $childB = $childB->nextSibling |
193 | ) { |
194 | if ( !$this->treeEquals( $childA, $childB, $deep ) ) { |
195 | return false; |
196 | } |
197 | } |
198 | } |
199 | |
200 | // Did not find a diff yet, so the trees must be equal. |
201 | return true; |
202 | } |
203 | throw new UnreachableException( 'we shouldn\'t get here' ); |
204 | } |
205 | |
206 | /** |
207 | * Diff two DOM trees by comparing them node-by-node. |
208 | * |
209 | * TODO: Implement something more intelligent like |
210 | * http://gregory.cobena.free.fr/www/Publications/%5BICDE2002%5D%20XyDiff%20-%20published%20version.pdf, |
211 | * which uses hash signatures of subtrees to efficiently detect moves / |
212 | * wrapping. |
213 | * |
214 | * Adds / updates a data-parsoid-diff structure with change information. |
215 | * |
216 | * Returns true if subtree is changed, false otherwise. |
217 | * |
218 | * TODO: |
219 | * Assume typical CSS white-space, so ignore ws diffs in non-pre content. |
220 | * |
221 | * @param Node $baseParentNode |
222 | * @param Node $newParentNode |
223 | * @return bool |
224 | */ |
225 | public function doDOMDiff( Node $baseParentNode, Node $newParentNode ): bool { |
226 | // Perform a relaxed version of the recursive treeEquals algorithm that |
227 | // allows for some minor differences and tries to produce a sensible diff |
228 | // marking using heuristics like look-ahead on siblings. |
229 | $baseNode = $baseParentNode->firstChild; |
230 | $newNode = $newParentNode->firstChild; |
231 | $lookaheadNode = null; |
232 | $foundDiffOverall = false; |
233 | |
234 | while ( $baseNode && $newNode ) { |
235 | $dontAdvanceNewNode = false; |
236 | $this->debugOut( $baseNode, $newNode ); |
237 | // shallow check first |
238 | if ( !$this->treeEquals( $baseNode, $newNode, false ) ) { |
239 | $this->debug( '-- not equal --' ); |
240 | $savedNewNode = $newNode; |
241 | $foundDiff = false; |
242 | |
243 | // Some simplistic look-ahead, currently limited to a single level |
244 | // in the DOM. |
245 | |
246 | // look-ahead in *new* DOM to detect insertions |
247 | if ( DiffDOMUtils::isContentNode( $baseNode ) ) { |
248 | $this->debug( '--lookahead in new dom--' ); |
249 | $lookaheadNode = $newNode->nextSibling; |
250 | while ( $lookaheadNode ) { |
251 | $this->debugOut( $baseNode, $lookaheadNode, 'new' ); |
252 | if ( DiffDOMUtils::isContentNode( $lookaheadNode ) && |
253 | $this->treeEquals( $baseNode, $lookaheadNode, true ) |
254 | ) { |
255 | // mark skipped-over nodes as inserted |
256 | $markNode = $newNode; |
257 | while ( $markNode !== $lookaheadNode ) { |
258 | $this->debug( '--found diff: inserted--' ); |
259 | $this->markNode( $markNode, DiffMarkers::INSERTED ); |
260 | $markNode = $markNode->nextSibling; |
261 | } |
262 | $foundDiff = true; |
263 | $newNode = $lookaheadNode; |
264 | break; |
265 | } |
266 | $lookaheadNode = self::nextNonTemplateSibling( $lookaheadNode ); |
267 | } |
268 | } |
269 | |
270 | // look-ahead in *base* DOM to detect deletions |
271 | if ( !$foundDiff && DiffDOMUtils::isContentNode( $newNode ) ) { |
272 | $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $baseNode ); |
273 | $this->debug( '--lookahead in old dom--' ); |
274 | $lookaheadNode = $baseNode->nextSibling; |
275 | while ( $lookaheadNode ) { |
276 | $this->debugOut( $lookaheadNode, $newNode, 'old' ); |
277 | if ( DiffDOMUtils::isContentNode( $lookaheadNode ) && |
278 | $this->treeEquals( $lookaheadNode, $newNode, true ) |
279 | ) { |
280 | $this->debug( '--found diff: deleted--' ); |
281 | // mark skipped-over nodes as deleted |
282 | $this->markNode( $newNode, DiffMarkers::DELETED, $isBlockNode ); |
283 | $baseNode = $lookaheadNode; |
284 | $foundDiff = true; |
285 | break; |
286 | } elseif ( !WTUtils::emitsSolTransparentSingleLineWT( $lookaheadNode ) ) { |
287 | // We only care about the deleted node prior to the one that |
288 | // gets a tree match (but, ignore nodes that show up in wikitext |
289 | // but don't affect sol-state or HTML rendering -- note that |
290 | // whitespace is being ignored, but that whitespace occurs |
291 | // between block nodes). |
292 | $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $lookaheadNode ); |
293 | } |
294 | $lookaheadNode = self::nextNonTemplateSibling( $lookaheadNode ); |
295 | } |
296 | } |
297 | |
298 | if ( !$foundDiff ) { |
299 | if ( !( $savedNewNode instanceof Element ) ) { |
300 | $this->debug( '--found diff: modified text/comment--' ); |
301 | $this->markNode( |
302 | $savedNewNode, DiffMarkers::DELETED, |
303 | WTUtils::isBlockNodeWithVisibleWT( $baseNode ) |
304 | ); |
305 | } elseif ( $baseNode instanceof Element && |
306 | DOMCompat::nodeName( $savedNewNode ) === DOMCompat::nodeName( $baseNode ) && |
307 | ( DOMDataUtils::getDataParsoid( $savedNewNode )->stx ?? null ) === |
308 | ( DOMDataUtils::getDataParsoid( $baseNode )->stx ?? null ) |
309 | ) { |
310 | // Identical wrapper-type, but modified. |
311 | // Mark modified-wrapper, and recurse. |
312 | $this->debug( '--found diff: modified-wrapper--' ); |
313 | $this->markNode( $savedNewNode, DiffMarkers::MODIFIED_WRAPPER ); |
314 | $this->subtreeDiffers( $baseNode, $savedNewNode ); |
315 | } else { |
316 | // We now want to compare current newNode with the next baseNode. |
317 | $dontAdvanceNewNode = true; |
318 | |
319 | // Since we are advancing in an old DOM without advancing |
320 | // in the new DOM, there were deletions. Add a deletion marker |
321 | // since this is important for accurate separator handling in selser. |
322 | $this->markNode( |
323 | $savedNewNode, DiffMarkers::DELETED, |
324 | WTUtils::isBlockNodeWithVisibleWT( $baseNode ) |
325 | ); |
326 | } |
327 | } |
328 | |
329 | // Record the fact that direct children changed in the parent node |
330 | $this->debug( '--found diff: children-changed--' ); |
331 | $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED ); |
332 | |
333 | $foundDiffOverall = true; |
334 | } elseif ( $this->subtreeDiffers( $baseNode, $newNode ) ) { |
335 | $foundDiffOverall = true; |
336 | } |
337 | |
338 | // And move on to the next pair (skipping over template HTML) |
339 | if ( $baseNode && $newNode ) { |
340 | $baseNode = self::nextNonTemplateSibling( $baseNode ); |
341 | if ( !$dontAdvanceNewNode ) { |
342 | $newNode = self::nextNonTemplateSibling( $newNode ); |
343 | } |
344 | } |
345 | } |
346 | |
347 | // mark extra new nodes as inserted |
348 | while ( $newNode ) { |
349 | $this->debug( '--found trailing new node: inserted--' ); |
350 | $this->markNode( $newNode, DiffMarkers::INSERTED ); |
351 | $foundDiffOverall = true; |
352 | $newNode = self::nextNonTemplateSibling( $newNode ); |
353 | } |
354 | |
355 | // If there are extra base nodes, something was deleted. Mark the parent as |
356 | // having lost children for now. |
357 | if ( $baseNode ) { |
358 | $this->debug( '--found trailing base nodes: deleted--' ); |
359 | $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED ); |
360 | // SSS FIXME: WTS checks for zero children in a few places |
361 | // That code would have to be upgraded if we emit mw:DiffMarker |
362 | // in this scenario. So, bailing out in this one case for now. |
363 | if ( $newParentNode->hasChildNodes() ) { |
364 | $meta = $newParentNode->ownerDocument->createElement( 'meta' ); |
365 | DOMUtils::addTypeOf( $meta, 'mw:DiffMarker/deleted' ); |
366 | if ( WTUtils::isBlockNodeWithVisibleWT( $baseNode ) ) { |
367 | $meta->setAttribute( 'data-is-block', 'true' ); |
368 | } |
369 | $newParentNode->appendChild( $meta ); |
370 | } |
371 | $foundDiffOverall = true; |
372 | } |
373 | |
374 | return $foundDiffOverall; |
375 | } |
376 | |
377 | /* *************************************************** |
378 | * Helpers |
379 | * ***************************************************/ |
380 | |
381 | /** |
382 | * @param Node $baseNode |
383 | * @param Node $newNode |
384 | * @return bool |
385 | */ |
386 | private function subtreeDiffers( Node $baseNode, Node $newNode ): bool { |
387 | $baseEncapsulated = WTUtils::isEncapsulationWrapper( $baseNode ); |
388 | $newEncapsulated = WTUtils::isEncapsulationWrapper( $newNode ); |
389 | |
390 | if ( !$baseEncapsulated && !$newEncapsulated ) { |
391 | $this->debug( '--shallow equal: recursing--' ); |
392 | // Recursively diff subtrees if not template-like content |
393 | $subtreeDiffers = $this->doDOMDiff( $baseNode, $newNode ); |
394 | } elseif ( $baseEncapsulated && $newEncapsulated ) { |
395 | '@phan-var Element $baseNode'; // @var Element $baseNode |
396 | '@phan-var Element $newNode'; // @var Element $newNode |
397 | |
398 | $extType = DOMUtils::matchTypeOf( $baseNode, '!^mw:Extension/!' ); |
399 | $ext = null; |
400 | |
401 | if ( $extType ) { |
402 | $dataMw = DOMDataUtils::getDataMw( $baseNode ); |
403 | // FIXME: The EncapsulatedContentHandler tries to get the name from |
404 | // the typeOf if it isn't in dataMw ... |
405 | $ext = $this->env->getSiteConfig()->getExtTagImpl( $dataMw->name ?? '' ); |
406 | } |
407 | |
408 | if ( |
409 | $ext && ( DOMUtils::matchTypeOf( $newNode, '!^mw:Extension/!' ) === $extType ) |
410 | ) { |
411 | $this->debug( '--diffing extension content--' ); |
412 | $subtreeDiffers = $ext->diffHandler( |
413 | $this->extApi, [ $this, 'doDOMDiff' ], $baseNode, $newNode |
414 | ); |
415 | } else { |
416 | // Otherwise, for encapsulated content, we don't know about the subtree. |
417 | $subtreeDiffers = false; |
418 | } |
419 | } else { |
420 | // FIXME: Maybe $editNode should be marked as inserted to avoid |
421 | // losing any edits, at the cost of more normalization. |
422 | // $state->inModifiedContent is only set when we're in inserted |
423 | // content, so not sure this is currently doing all that much. |
424 | $subtreeDiffers = true; |
425 | } |
426 | |
427 | if ( $subtreeDiffers ) { |
428 | $this->debug( '--found diff: subtree-changed--' ); |
429 | $this->markNode( $newNode, DiffMarkers::SUBTREE_CHANGED ); |
430 | } |
431 | return $subtreeDiffers; |
432 | } |
433 | |
434 | /** |
435 | * @param Node $node |
436 | * @param string $mark |
437 | * @param bool $blockNodeDeleted |
438 | */ |
439 | private function markNode( Node $node, string $mark, bool $blockNodeDeleted = false ): void { |
440 | $meta = DiffUtils::addDiffMark( $node, $this->env, $mark ); |
441 | |
442 | if ( $meta && $blockNodeDeleted ) { |
443 | $meta->setAttribute( 'data-is-block', 'true' ); |
444 | } |
445 | |
446 | if ( $mark === DiffMarkers::DELETED || $mark === DiffMarkers::INSERTED ) { |
447 | $this->markNode( $node->parentNode, DiffMarkers::CHILDREN_CHANGED ); |
448 | } |
449 | |
450 | // Clear out speculatively computed DSR values for data-mw-selser-wrapper nodes |
451 | // since they may be incorrect. This eliminates any inadvertent use of |
452 | // these incorrect values. |
453 | if ( $node instanceof Element && $node->hasAttribute( 'data-mw-selser-wrapper' ) ) { |
454 | DOMDataUtils::getDataParsoid( $node )->dsr = null; |
455 | } |
456 | } |
457 | |
458 | /** |
459 | * @param Node $nodeA |
460 | * @param Node $nodeB |
461 | * @param string $laPrefix |
462 | */ |
463 | private function debugOut( Node $nodeA, Node $nodeB, string $laPrefix = '' ): void { |
464 | $this->env->log( |
465 | 'trace/domdiff', |
466 | '--> A' . $laPrefix . ':' . |
467 | ( $nodeA instanceof Element |
468 | ? DOMCompat::getOuterHTML( $nodeA ) |
469 | : PHPUtils::jsonEncode( $nodeA->nodeValue ) ) |
470 | ); |
471 | |
472 | $this->env->log( |
473 | 'trace/domdiff', |
474 | '--> B' . $laPrefix . ':' . |
475 | ( $nodeB instanceof Element |
476 | ? DOMCompat::getOuterHTML( $nodeB ) |
477 | : PHPUtils::jsonEncode( $nodeB->nodeValue ) ) |
478 | ); |
479 | } |
480 | } |