Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
34.00% |
68 / 200 |
|
0.00% |
0 / 9 |
CRAP | |
0.00% |
0 / 1 |
DOMDiff | |
34.00% |
68 / 200 |
|
0.00% |
0 / 9 |
1648.33 | |
0.00% |
0 / 1 |
nextNonTemplateSibling | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
debug | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
__construct | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
20 | |||
diff | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
treeEquals | |
0.00% |
0 / 37 |
|
0.00% |
0 / 1 |
380 | |||
doDOMDiff | |
78.16% |
68 / 87 |
|
0.00% |
0 / 1 |
36.17 | |||
subtreeDiffers | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
90 | |||
markNode | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
56 | |||
debugOut | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Assert\UnreachableException; |
8 | use Wikimedia\Parsoid\Config\Env; |
9 | use Wikimedia\Parsoid\DOM\Comment; |
10 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
11 | use Wikimedia\Parsoid\DOM\Element; |
12 | use Wikimedia\Parsoid\DOM\Node; |
13 | use Wikimedia\Parsoid\DOM\Text; |
14 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
15 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
16 | use Wikimedia\Parsoid\Utils\DOMCompat; |
17 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
18 | use Wikimedia\Parsoid\Utils\DOMUtils; |
19 | use Wikimedia\Parsoid\Utils\PHPUtils; |
20 | use Wikimedia\Parsoid\Utils\WTUtils; |
21 | |
22 | /** |
23 | * A DOM diff helper class. |
24 | * |
25 | * Compares two DOMs and annotates a copy of the passed-in DOM with change |
26 | * information for the selective serializer. |
27 | */ |
28 | class DOMDiff { |
29 | |
30 | // These attributes are ignored for equality purposes if they are added to a node. |
31 | private const IGNORE_ATTRIBUTES = [ |
32 | // Note that we are explicitly not ignoring data-parsoid even though clients |
33 | // would never modify data-parsoid because SelectiveSerializer is wrapping text |
34 | // nodes in spans and speculatively computes DSR offsets for these span tags |
35 | // which are accurate for original DOM and may be inaccurate for the edited DOM. |
36 | // By diffing data-parsoid which diffs the DSR as well, we ensure we mark such |
37 | // nodes as modified and prevent use of those speculatively computed incorrect |
38 | // DSR values. |
39 | 'data-parsoid-diff', |
40 | 'about', |
41 | DOMDataUtils::DATA_OBJECT_ATTR_NAME, |
42 | ]; |
43 | |
44 | /** |
45 | * @var Env |
46 | */ |
47 | public $env; |
48 | |
49 | /** @var ParsoidExtensionAPI */ |
50 | public $extApi; |
51 | |
52 | /** |
53 | * @var array |
54 | */ |
55 | public $specializedAttribHandlers; |
56 | |
57 | private function nextNonTemplateSibling( Node $node ): ?Node { |
58 | if ( WTUtils::isEncapsulationWrapper( $node ) ) { |
59 | return WTUtils::skipOverEncapsulatedContent( $node ); |
60 | } |
61 | return $node->nextSibling; |
62 | } |
63 | |
64 | /** |
65 | * @param mixed ...$args |
66 | */ |
67 | private function debug( ...$args ): void { |
68 | $this->env->log( 'trace/domdiff', ...$args ); |
69 | } |
70 | |
71 | public function __construct( Env $env ) { |
72 | $this->env = $env; |
73 | $this->extApi = new ParsoidExtensionAPI( $env ); |
74 | $this->specializedAttribHandlers = [ |
75 | 'data-mw' => static function ( $nodeA, $dmwA, $nodeB, $dmwB ) { |
76 | return $dmwA == $dmwB; |
77 | }, |
78 | 'data-parsoid' => static function ( $nodeA, $dpA, $nodeB, $dpB ) { |
79 | return $dpA == $dpB; |
80 | }, |
81 | // TODO(T254502): This is added temporarily for backwards |
82 | // compatibility and can be removed when versions up to 2.1.0 |
83 | // are no longer stored |
84 | 'typeof' => static function ( $nodeA, $valA, $nodeB, $valB ) { |
85 | if ( $valA === $valB ) { |
86 | return true; |
87 | } elseif ( $valA === 'mw:DisplaySpace' ) { |
88 | return $valB === 'mw:DisplaySpace mw:Placeholder'; |
89 | } elseif ( $valB === 'mw:DisplaySpace' ) { |
90 | return $valA === 'mw:DisplaySpace mw:Placeholder'; |
91 | } else { |
92 | return false; |
93 | } |
94 | } |
95 | ]; |
96 | } |
97 | |
98 | /** |
99 | * Diff two HTML documents, and add / update data-parsoid-diff attributes with |
100 | * change information. |
101 | * |
102 | * @param Element $nodeA |
103 | * @param Element $nodeB |
104 | * @return array |
105 | */ |
106 | public function diff( Element $nodeA, Element $nodeB ): array { |
107 | Assert::invariant( |
108 | $nodeA->ownerDocument !== $nodeB->ownerDocument, |
109 | 'Expected to be diff\'ing different documents.' |
110 | ); |
111 | |
112 | $this->debug( static function () use( $nodeA, $nodeB ) { |
113 | return "ORIG:\n" . |
114 | DOMCompat::getOuterHTML( $nodeA ) . |
115 | "\nNEW :\n" . |
116 | DOMCompat::getOuterHTML( $nodeB ); |
117 | } ); |
118 | |
119 | // The root nodes are equal, call recursive differ |
120 | $foundChange = $this->doDOMDiff( $nodeA, $nodeB ); |
121 | return [ 'isEmpty' => !$foundChange ]; |
122 | } |
123 | |
124 | /** |
125 | * Test if two DOM nodes are equal. |
126 | * |
127 | * @param Node $nodeA |
128 | * @param Node $nodeB |
129 | * @param bool $deep |
130 | * @return bool |
131 | */ |
132 | public function treeEquals( Node $nodeA, Node $nodeB, bool $deep ): bool { |
133 | if ( $nodeA->nodeType !== $nodeB->nodeType ) { |
134 | return false; |
135 | } elseif ( $nodeA instanceof Text ) { |
136 | // In the past we've had bugs where we let non-primitive strings |
137 | // leak into our DOM. Safety first: |
138 | Assert::invariant( $nodeA->nodeValue === (string)$nodeA->nodeValue, '' ); |
139 | Assert::invariant( $nodeB->nodeValue === (string)$nodeB->nodeValue, '' ); |
140 | // ok, now do the comparison. |
141 | return $nodeA->nodeValue === $nodeB->nodeValue; |
142 | } elseif ( $nodeA instanceof Comment ) { |
143 | return WTUtils::decodeComment( $nodeA->nodeValue ) === |
144 | WTUtils::decodeComment( $nodeB->nodeValue ); |
145 | } elseif ( $nodeA instanceof Element || $nodeA instanceof DocumentFragment ) { |
146 | if ( $nodeA instanceof DocumentFragment ) { |
147 | if ( !( $nodeB instanceof DocumentFragment ) ) { |
148 | return false; |
149 | } |
150 | } else { // $nodeA instanceof Element |
151 | // Compare node name and attribute length |
152 | if ( |
153 | !( $nodeB instanceof Element ) || |
154 | DOMCompat::nodeName( $nodeA ) !== DOMCompat::nodeName( $nodeB ) || |
155 | !DiffUtils::attribsEquals( |
156 | $nodeA, |
157 | $nodeB, |
158 | self::IGNORE_ATTRIBUTES, |
159 | $this->specializedAttribHandlers |
160 | ) |
161 | ) { |
162 | return false; |
163 | } |
164 | } |
165 | |
166 | // Passed all tests, node itself is equal. |
167 | if ( $deep ) { |
168 | $childA = null; |
169 | $childB = null; |
170 | // Compare # of children, since that's fast. |
171 | // (Avoid de-optimizing DOM by using node#childNodes) |
172 | for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild; |
173 | $childA && $childB; |
174 | $childA = $childA->nextSibling, $childB = $childB->nextSibling |
175 | ) { |
176 | /* don't look inside children yet, just look at # of children */ |
177 | } |
178 | |
179 | if ( $childA || $childB ) { |
180 | return false; /* nodes have different # of children */ |
181 | } |
182 | // Now actually compare the child subtrees |
183 | for ( $childA = $nodeA->firstChild, $childB = $nodeB->firstChild; |
184 | $childA && $childB; |
185 | $childA = $childA->nextSibling, $childB = $childB->nextSibling |
186 | ) { |
187 | if ( !$this->treeEquals( $childA, $childB, $deep ) ) { |
188 | return false; |
189 | } |
190 | } |
191 | } |
192 | |
193 | // Did not find a diff yet, so the trees must be equal. |
194 | return true; |
195 | } |
196 | throw new UnreachableException( 'we shouldn\'t get here' ); |
197 | } |
198 | |
199 | /** |
200 | * Diff two DOM trees by comparing them node-by-node. |
201 | * |
202 | * TODO: Implement something more intelligent like |
203 | * http://gregory.cobena.free.fr/www/Publications/%5BICDE2002%5D%20XyDiff%20-%20published%20version.pdf, |
204 | * which uses hash signatures of subtrees to efficiently detect moves / |
205 | * wrapping. |
206 | * |
207 | * Adds / updates a data-parsoid-diff structure with change information. |
208 | * |
209 | * Returns true if subtree is changed, false otherwise. |
210 | * |
211 | * TODO: |
212 | * Assume typical CSS white-space, so ignore ws diffs in non-pre content. |
213 | * |
214 | * @param Node $baseParentNode |
215 | * @param Node $newParentNode |
216 | * @return bool |
217 | */ |
218 | public function doDOMDiff( Node $baseParentNode, Node $newParentNode ): bool { |
219 | // Perform a relaxed version of the recursive treeEquals algorithm that |
220 | // allows for some minor differences and tries to produce a sensible diff |
221 | // marking using heuristics like look-ahead on siblings. |
222 | $baseNode = $baseParentNode->firstChild; |
223 | $newNode = $newParentNode->firstChild; |
224 | $lookaheadNode = null; |
225 | $foundDiffOverall = false; |
226 | |
227 | while ( $baseNode && $newNode ) { |
228 | $dontAdvanceNewNode = false; |
229 | $this->debugOut( $baseNode, $newNode ); |
230 | // shallow check first |
231 | if ( !$this->treeEquals( $baseNode, $newNode, false ) ) { |
232 | $this->debug( '-- not equal --' ); |
233 | $savedNewNode = $newNode; |
234 | $foundDiff = false; |
235 | |
236 | // Some simplistic look-ahead, currently limited to a single level |
237 | // in the DOM. |
238 | |
239 | // look-ahead in *new* DOM to detect insertions |
240 | if ( DiffDOMUtils::isContentNode( $baseNode ) ) { |
241 | $this->debug( '--lookahead in new dom--' ); |
242 | $lookaheadNode = $newNode->nextSibling; |
243 | while ( $lookaheadNode ) { |
244 | $this->debugOut( $baseNode, $lookaheadNode, 'new' ); |
245 | if ( DiffDOMUtils::isContentNode( $lookaheadNode ) && |
246 | $this->treeEquals( $baseNode, $lookaheadNode, true ) |
247 | ) { |
248 | // mark skipped-over nodes as inserted |
249 | $markNode = $newNode; |
250 | while ( $markNode !== $lookaheadNode ) { |
251 | $this->debug( '--found diff: inserted--' ); |
252 | $this->markNode( $markNode, DiffMarkers::INSERTED ); |
253 | $markNode = $markNode->nextSibling; |
254 | } |
255 | $foundDiff = true; |
256 | $newNode = $lookaheadNode; |
257 | break; |
258 | } |
259 | $lookaheadNode = self::nextNonTemplateSibling( $lookaheadNode ); |
260 | } |
261 | } |
262 | |
263 | // look-ahead in *base* DOM to detect deletions |
264 | if ( !$foundDiff && DiffDOMUtils::isContentNode( $newNode ) ) { |
265 | $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $baseNode ); |
266 | $this->debug( '--lookahead in old dom--' ); |
267 | $lookaheadNode = $baseNode->nextSibling; |
268 | while ( $lookaheadNode ) { |
269 | $this->debugOut( $lookaheadNode, $newNode, 'old' ); |
270 | if ( DiffDOMUtils::isContentNode( $lookaheadNode ) && |
271 | $this->treeEquals( $lookaheadNode, $newNode, true ) |
272 | ) { |
273 | $this->debug( '--found diff: deleted--' ); |
274 | // mark skipped-over nodes as deleted |
275 | $this->markNode( $newNode, DiffMarkers::DELETED, $isBlockNode ); |
276 | $baseNode = $lookaheadNode; |
277 | $foundDiff = true; |
278 | break; |
279 | } elseif ( !WTUtils::emitsSolTransparentSingleLineWT( $lookaheadNode ) ) { |
280 | // We only care about the deleted node prior to the one that |
281 | // gets a tree match (but, ignore nodes that show up in wikitext |
282 | // but don't affect sol-state or HTML rendering -- note that |
283 | // whitespace is being ignored, but that whitespace occurs |
284 | // between block nodes). |
285 | $isBlockNode = WTUtils::isBlockNodeWithVisibleWT( $lookaheadNode ); |
286 | } |
287 | $lookaheadNode = self::nextNonTemplateSibling( $lookaheadNode ); |
288 | } |
289 | } |
290 | |
291 | if ( !$foundDiff ) { |
292 | if ( !( $savedNewNode instanceof Element ) ) { |
293 | $this->debug( '--found diff: modified text/comment--' ); |
294 | $this->markNode( |
295 | $savedNewNode, DiffMarkers::DELETED, |
296 | WTUtils::isBlockNodeWithVisibleWT( $baseNode ) |
297 | ); |
298 | } elseif ( $baseNode instanceof Element && |
299 | DOMCompat::nodeName( $savedNewNode ) === DOMCompat::nodeName( $baseNode ) && |
300 | ( DOMDataUtils::getDataParsoid( $savedNewNode )->stx ?? null ) === |
301 | ( DOMDataUtils::getDataParsoid( $baseNode )->stx ?? null ) |
302 | ) { |
303 | // Identical wrapper-type, but modified. |
304 | // Mark modified-wrapper, and recurse. |
305 | $this->debug( '--found diff: modified-wrapper--' ); |
306 | $this->markNode( $savedNewNode, DiffMarkers::MODIFIED_WRAPPER ); |
307 | $this->subtreeDiffers( $baseNode, $savedNewNode ); |
308 | } else { |
309 | // We now want to compare current newNode with the next baseNode. |
310 | $dontAdvanceNewNode = true; |
311 | |
312 | // Since we are advancing in an old DOM without advancing |
313 | // in the new DOM, there were deletions. Add a deletion marker |
314 | // since this is important for accurate separator handling in selser. |
315 | $this->markNode( |
316 | $savedNewNode, DiffMarkers::DELETED, |
317 | WTUtils::isBlockNodeWithVisibleWT( $baseNode ) |
318 | ); |
319 | } |
320 | } |
321 | |
322 | // Record the fact that direct children changed in the parent node |
323 | $this->debug( '--found diff: children-changed--' ); |
324 | $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED ); |
325 | |
326 | $foundDiffOverall = true; |
327 | } elseif ( $this->subtreeDiffers( $baseNode, $newNode ) ) { |
328 | $foundDiffOverall = true; |
329 | } |
330 | |
331 | // And move on to the next pair (skipping over template HTML) |
332 | if ( $baseNode && $newNode ) { |
333 | $baseNode = self::nextNonTemplateSibling( $baseNode ); |
334 | if ( !$dontAdvanceNewNode ) { |
335 | $newNode = self::nextNonTemplateSibling( $newNode ); |
336 | } |
337 | } |
338 | } |
339 | |
340 | // mark extra new nodes as inserted |
341 | while ( $newNode ) { |
342 | $this->debug( '--found trailing new node: inserted--' ); |
343 | $this->markNode( $newNode, DiffMarkers::INSERTED ); |
344 | $foundDiffOverall = true; |
345 | $newNode = self::nextNonTemplateSibling( $newNode ); |
346 | } |
347 | |
348 | // If there are extra base nodes, something was deleted. Mark the parent as |
349 | // having lost children for now. |
350 | if ( $baseNode ) { |
351 | $this->debug( '--found trailing base nodes: deleted--' ); |
352 | $this->markNode( $newParentNode, DiffMarkers::CHILDREN_CHANGED ); |
353 | // SSS FIXME: WTS checks for zero children in a few places |
354 | // That code would have to be upgraded if we emit mw:DiffMarker |
355 | // in this scenario. So, bailing out in this one case for now. |
356 | if ( $newParentNode->hasChildNodes() ) { |
357 | $meta = $newParentNode->ownerDocument->createElement( 'meta' ); |
358 | DOMUtils::addTypeOf( $meta, 'mw:DiffMarker/deleted' ); |
359 | if ( WTUtils::isBlockNodeWithVisibleWT( $baseNode ) ) { |
360 | $meta->setAttribute( 'data-is-block', 'true' ); |
361 | } |
362 | $newParentNode->appendChild( $meta ); |
363 | } |
364 | $foundDiffOverall = true; |
365 | } |
366 | |
367 | return $foundDiffOverall; |
368 | } |
369 | |
370 | /* *************************************************** |
371 | * Helpers |
372 | * ***************************************************/ |
373 | |
374 | /** |
375 | * @param Node $baseNode |
376 | * @param Node $newNode |
377 | * @return bool |
378 | */ |
379 | private function subtreeDiffers( Node $baseNode, Node $newNode ): bool { |
380 | $baseEncapsulated = WTUtils::isEncapsulationWrapper( $baseNode ); |
381 | $newEncapsulated = WTUtils::isEncapsulationWrapper( $newNode ); |
382 | |
383 | if ( !$baseEncapsulated && !$newEncapsulated ) { |
384 | $this->debug( '--shallow equal: recursing--' ); |
385 | // Recursively diff subtrees if not template-like content |
386 | $subtreeDiffers = $this->doDOMDiff( $baseNode, $newNode ); |
387 | } elseif ( $baseEncapsulated && $newEncapsulated ) { |
388 | '@phan-var Element $baseNode'; // @var Element $baseNode |
389 | '@phan-var Element $newNode'; // @var Element $newNode |
390 | |
391 | $ext = null; |
392 | |
393 | $baseExtTagName = WTUtils::getExtTagName( $baseNode ); |
394 | if ( $baseExtTagName ) { |
395 | $ext = $this->env->getSiteConfig()->getExtTagImpl( $baseExtTagName ); |
396 | } |
397 | |
398 | if ( $ext && ( $baseExtTagName === WTUtils::getExtTagName( $newNode ) ) ) { |
399 | $this->debug( '--diffing extension content--' ); |
400 | $subtreeDiffers = $ext->diffHandler( |
401 | $this->extApi, [ $this, 'doDOMDiff' ], $baseNode, $newNode |
402 | ); |
403 | } else { |
404 | // Otherwise, for encapsulated content, we don't know about the subtree. |
405 | $subtreeDiffers = false; |
406 | } |
407 | } else { |
408 | // FIXME: Maybe $editNode should be marked as inserted to avoid |
409 | // losing any edits, at the cost of more normalization. |
410 | // $state->inInsertedContent is only set when we're in inserted |
411 | // content, so not sure this is currently doing all that much. |
412 | $subtreeDiffers = true; |
413 | } |
414 | |
415 | if ( $subtreeDiffers ) { |
416 | $this->debug( '--found diff: subtree-changed--' ); |
417 | $this->markNode( $newNode, DiffMarkers::SUBTREE_CHANGED ); |
418 | } |
419 | return $subtreeDiffers; |
420 | } |
421 | |
422 | private function markNode( Node $node, string $mark, bool $blockNodeDeleted = false ): void { |
423 | $meta = DiffUtils::addDiffMark( $node, $this->env, $mark ); |
424 | |
425 | if ( $meta && $blockNodeDeleted ) { |
426 | $meta->setAttribute( 'data-is-block', 'true' ); |
427 | } |
428 | |
429 | if ( $mark === DiffMarkers::DELETED || $mark === DiffMarkers::INSERTED ) { |
430 | $this->markNode( $node->parentNode, DiffMarkers::CHILDREN_CHANGED ); |
431 | } |
432 | |
433 | // Clear out speculatively computed DSR values for data-mw-selser-wrapper nodes |
434 | // since they may be incorrect. This eliminates any inadvertent use of |
435 | // these incorrect values. |
436 | if ( $node instanceof Element && $node->hasAttribute( 'data-mw-selser-wrapper' ) ) { |
437 | DOMDataUtils::getDataParsoid( $node )->dsr = null; |
438 | } |
439 | } |
440 | |
441 | private function debugOut( Node $nodeA, Node $nodeB, string $laPrefix = '' ): void { |
442 | $prefix = 'trace/domdiff'; |
443 | $this->env->log( $prefix, |
444 | static fn () => '--> A' . $laPrefix . ':' . |
445 | ( $nodeA instanceof Element |
446 | ? DOMCompat::getOuterHTML( $nodeA ) |
447 | : PHPUtils::jsonEncode( $nodeA->nodeValue ) ) |
448 | ); |
449 | $this->env->log( $prefix, |
450 | static fn () => '--> B' . $laPrefix . ':' . |
451 | ( $nodeB instanceof Element |
452 | ? DOMCompat::getOuterHTML( $nodeB ) |
453 | : PHPUtils::jsonEncode( $nodeB->nodeValue ) ) |
454 | ); |
455 | } |
456 | } |