Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.32% |
1 / 313 |
|
4.76% |
1 / 21 |
CRAP | |
0.00% |
0 / 1 |
DOMNormalizer | |
0.32% |
1 / 313 |
|
4.76% |
1 / 21 |
23035.26 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
similar | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
132 | |||
mergable | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
swappable | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
firstChild | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
isInsertedContent | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
20 | |||
rewriteablePair | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
30 | |||
addDiffMarks | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
182 | |||
merge | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
30 | |||
swap | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
6 | |||
hoistLinks | |
0.00% |
0 / 26 |
|
0.00% |
0 / 1 |
156 | |||
stripIfEmpty | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
12 | |||
moveTrailingSpacesOut | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
56 | |||
stripBRs | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
20 | |||
stripBidiCharsAroundCategories | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
72 | |||
moveFormatTagOutsideATag | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
156 | |||
normalizeNode | |
0.00% |
0 / 62 |
|
0.00% |
0 / 1 |
1122 | |||
normalizeSiblingPair | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
30 | |||
processSubtree | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
42 | |||
processNode | |
0.00% |
0 / 34 |
|
0.00% |
0 / 1 |
156 | |||
normalize | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Assert\UnreachableException; |
8 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
9 | use Wikimedia\Parsoid\DOM\Element; |
10 | use Wikimedia\Parsoid\DOM\Node; |
11 | use Wikimedia\Parsoid\DOM\Text; |
12 | use Wikimedia\Parsoid\Utils\ContentUtils; |
13 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
14 | use Wikimedia\Parsoid\Utils\DOMCompat; |
15 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
16 | use Wikimedia\Parsoid\Utils\DOMUtils; |
17 | use Wikimedia\Parsoid\Utils\PHPUtils; |
18 | use Wikimedia\Parsoid\Utils\WTUtils; |
19 | use Wikimedia\Parsoid\Wikitext\Consts; |
20 | |
21 | /* |
22 | * Tag minimization |
23 | * ---------------- |
24 | * Minimize a pair of tags in the dom tree rooted at node. |
25 | * |
26 | * This function merges adjacent nodes of the same type |
27 | * and swaps nodes where possible to enable further merging. |
28 | * |
29 | * See examples below: |
30 | * |
31 | * 1. <b>X</b><b>Y</b> |
32 | * ==> <b>XY</b> |
33 | * |
34 | * 2. <i>A</i><b><i>X</i></b><b><i>Y</i></b><i>Z</i> |
35 | * ==> <i>A<b>XY</b>Z</i> |
36 | * |
37 | * 3. <a href="Football">Foot</a><a href="Football">ball</a> |
38 | * ==> <a href="Football">Football</a> |
39 | */ |
40 | |
41 | /** |
42 | * DOM normalization. |
43 | * |
44 | * DOM normalizations are performed after DOMDiff is run. |
45 | * So, normalization routines should update diff markers appropriately. |
46 | */ |
47 | class DOMNormalizer { |
48 | |
/**
 * Attributes disregarded when comparing two nodes' attributes in similar().
 */
private const IGNORABLE_ATTRS = [
	'data-parsoid',
	'id',
	'title',
	DOMDataUtils::DATA_OBJECT_ATTR_NAME,
];

/**
 * Attributes disregarded when comparing two literal HTML nodes in similar().
 */
private const HTML_IGNORABLE_ATTRS = [
	'data-parsoid',
	DOMDataUtils::DATA_OBJECT_ATTR_NAME,
];

/**
 * Attribute name => comparison callback; populated on first construction.
 */
private static $specializedAttribHandlers;

/** @var bool True while processing a subtree that carries an inserted diff mark */
private $inInsertedContent;

/** @var SerializerState */
private $state;
61 | |
/**
 * @param SerializerState $state Serializer state consulted for selser mode and the Env
 */
public function __construct( SerializerState $state ) {
	$this->state = $state;
	$this->inInsertedContent = false;

	// The handler table is static, so it only needs to be built once.
	if ( self::$specializedAttribHandlers ) {
		return;
	}

	self::$specializedAttribHandlers = [
		// data-mw values are compared with loose equality
		'data-mw' => static function ( $nodeA, $dmwA, $nodeB, $dmwB ) {
			return $dmwA == $dmwB;
		},
	];
}
75 | |
/**
 * Are the attributes of a and b equivalent as far as tag minimization
 * is concerned?
 *
 * @param Node $a
 * @param Node $b
 * @return bool
 */
private static function similar( Node $a, Node $b ): bool {
	if ( DOMCompat::nodeName( $a ) === 'a' ) {
		// FIXME: Similar to 1ce6a98, DiffDOMUtils::nextNonDeletedSibling is being
		// used in this file where maybe DiffDOMUtils::nextNonSepSibling belongs.
		return $a instanceof Element && $b instanceof Element &&
			DiffUtils::attribsEquals( $a, $b, self::IGNORABLE_ATTRS, self::$specializedAttribHandlers );
	}

	$aIsHtml = WTUtils::isLiteralHTMLNode( $a );
	$bIsHtml = WTUtils::isLiteralHTMLNode( $b );

	// FIXME: For non-HTML I/B tags, we seem to be dropping all attributes
	// in our tag handlers (which seems like a bug). Till that is fixed,
	// we'll preserve existing functionality here.
	if ( !$aIsHtml && !$bIsHtml ) {
		return true;
	}

	// A literal HTML node is never similar to a non-HTML one.
	if ( !$aIsHtml || !$bIsHtml ) {
		return false;
	}

	// TODO (Anomie)
	// Attributes are only ever compared when both nodes are literal HTML,
	// so HTML_IGNORABLE_ATTRS is the only ignore-list that is reachable here.
	return $a instanceof Element && $b instanceof Element &&
		DiffUtils::attribsEquals( $a, $b, self::HTML_IGNORABLE_ATTRS, self::$specializedAttribHandlers );
}
99 | |
/**
 * Can a and b be merged into a single node?
 *
 * They can when they have the same node name and are similar().
 *
 * @param Node $a
 * @param Node $b
 * @return bool
 */
private static function mergable( Node $a, Node $b ): bool {
	if ( DOMCompat::nodeName( $a ) !== DOMCompat::nodeName( $b ) ) {
		return false;
	}
	return self::similar( $a, $b );
}
109 | |
/**
 * Can a and b be combined into a single node
 * if we swap a and a.firstChild?
 *
 * For example: A='<b><i>x</i></b>' b='<i>y</i>' => '<i><b>x</b>y</i>'.
 *
 * @param Node $a
 * @param Node $b
 * @return bool
 */
private static function swappable( Node $a, Node $b ): bool {
	// Swapping only makes sense when a wraps exactly one non-deleted child
	if ( DiffDOMUtils::numNonDeletedChildNodes( $a ) !== 1 ) {
		return false;
	}

	$onlyChild = DiffDOMUtils::firstNonDeletedChild( $a );
	return self::similar( $a, $onlyChild ) && self::mergable( $onlyChild, $b );
}
124 | |
/**
 * First non-deleted child of $node in the requested direction.
 *
 * @param Node $node
 * @param bool $rtl When true, start from the end (last non-deleted child)
 * @return Node|null
 */
private static function firstChild( Node $node, bool $rtl ): ?Node {
	if ( $rtl ) {
		return DiffDOMUtils::lastNonDeletedChild( $node );
	}
	return DiffDOMUtils::firstNonDeletedChild( $node );
}
128 | |
/**
 * Does $node, or any ancestor up to the top of the tree, carry an
 * inserted diff mark?
 *
 * @param Node $node
 * @return bool
 */
private function isInsertedContent( Node $node ): bool {
	$cursor = $node;
	while ( !DiffUtils::hasInsertedDiffMark( $cursor ) ) {
		if ( DOMUtils::atTheTop( $cursor ) ) {
			// Reached the top without finding an inserted mark
			return false;
		}
		$cursor = $cursor->parentNode;
	}
	return true;
}
140 | |
/**
 * Is (a, b) a pair of tags that tag minimization is allowed to rewrite?
 *
 * @param Node $a
 * @param Node $b
 * @return bool
 */
private function rewriteablePair( Node $a, Node $b ): bool {
	$nameA = DOMCompat::nodeName( $a );
	$nameB = DOMCompat::nodeName( $b );

	if ( isset( Consts::$WTQuoteTags[$nameA] ) ) {
		// For <i>/<b> pair, we need not check whether the node being transformed
		// are new / edited, etc. since these minimization scenarios can
		// never show up in HTML that came from parsed wikitext.
		//
		// <i>..</i><i>..</i> can never show up without a <nowiki/> in between.
		// Similarly for <b>..</b><b>..</b> and <b><i>..</i></b><i>..</i>.
		//
		// This is because a sequence of 4 quotes is not parsed as ..</i><i>..
		// Neither is a sequence of 7 quotes parsed as ..</i></b><i>..
		//
		// So, if we see a minimizable pair of nodes, it is because the HTML
		// didn't originate from wikitext OR the HTML has been subsequently edited.
		// In both cases, we want to transform the DOM.
		return isset( Consts::$WTQuoteTags[$nameB] );
	}

	if ( $nameA !== 'a' ) {
		return false;
	}

	// For <a> tags, we require at least one of the two tags
	// to be a newly created element.
	return $nameB === 'a' && ( WTUtils::isNewElt( $a ) || WTUtils::isNewElt( $b ) );
}
165 | |
/**
 * Record a diff mark on $node and, unless told otherwise, flag its
 * ancestors as having a changed subtree. Does nothing outside selser mode.
 *
 * @param Node $node
 * @param string $mark One of the DiffMarkers constants
 * @param bool $dontRecurse When true, ancestors are not marked subtree-changed
 */
public function addDiffMarks( Node $node, string $mark, bool $dontRecurse = false ): void {
	// Nothing to do outside selser mode, or if the mark is already present
	if ( !$this->state->selserMode || DiffUtils::hasDiffMark( $node, $mark ) ) {
		return;
	}

	// Don't introduce nested inserted markers
	if ( $this->inInsertedContent && $mark === DiffMarkers::INSERTED ) {
		return;
	}

	$env = $this->state->getEnv();

	// Newly added elements don't need diff marks
	if ( !WTUtils::isNewElt( $node ) ) {
		DiffUtils::addDiffMark( $node, $env, $mark );
		// An insertion or deletion also changes the parent's child list
		if ( in_array( $mark, [ DiffMarkers::INSERTED, DiffMarkers::DELETED ], true ) ) {
			DiffUtils::addDiffMark( $node->parentNode, $env, DiffMarkers::CHILDREN_CHANGED );
		}
	}

	if ( $dontRecurse ) {
		return;
	}

	// Walk up the tree and add 'subtree-changed' markers, stopping at the
	// first ancestor that already has one.
	for (
		$ancestor = $node->parentNode;
		$ancestor instanceof Element && !DOMUtils::atTheTop( $ancestor );
		$ancestor = $ancestor->parentNode
	) {
		if ( DiffUtils::hasDiffMark( $ancestor, DiffMarkers::SUBTREE_CHANGED ) ) {
			return;
		}
		if ( !WTUtils::isNewElt( $ancestor ) ) {
			DiffUtils::addDiffMark( $ancestor, $env, DiffMarkers::SUBTREE_CHANGED );
		}
	}
}
202 | |
/**
 * Transfer all of b's children to a and delete b.
 *
 * Any nodes sitting between a and b (usually diff markers) are moved
 * into a first, in document order, so that they stay ahead of b's content.
 *
 * @param Element $a
 * @param Element $b A later sibling of $a
 * @return Element $a, now holding b's former children
 */
public function merge( Element $a, Element $b ): Element {
	$sentinel = $b->firstChild;

	// Collect every intermediate node (usually 0 / 1 diff markers)
	// present between a and b.
	$between = [];
	$cursor = $a->nextSibling;
	while ( $cursor !== null && $cursor !== $b ) {
		$between[] = $cursor;
		$cursor = $cursor->nextSibling;
	}
	// Only migrate them if b really was reached; otherwise b is not a later
	// sibling of a and the nodes after a are unrelated to this merge.
	if ( $cursor === $b ) {
		foreach ( $between as $intermediate ) {
			$a->appendChild( $intermediate );
		}
	}

	// The real work of merging
	DOMUtils::migrateChildren( $b, $a );
	$b->parentNode->removeChild( $b );

	// Normalize the node to merge any adjacent text nodes
	DOMCompat::normalize( $a );

	// Update diff markers
	$this->addDiffMarks( $a->parentNode, DiffMarkers::CHILDREN_CHANGED ); // $b was removed
	$this->addDiffMarks( $a, DiffMarkers::CHILDREN_CHANGED ); // $a got more children
	if ( !DOMUtils::isRemoved( $sentinel ) ) {
		// Nodes starting at 'sentinel' were inserted into 'a'
		// b, which was a's sibling was deleted
		// Only addDiffMarks to sentinel, if it is still part of the dom
		// (and hasn't been deleted by the call to a.normalize() )
		if ( $sentinel->parentNode ) {
			$this->addDiffMarks( $sentinel, DiffMarkers::MOVED, true );
		}
	}
	if ( $a->nextSibling ) {
		// FIXME: Hmm .. there is an API hole here
		// about ability to add markers after last child
		$this->addDiffMarks( $a->nextSibling, DiffMarkers::MOVED, true );
	}

	return $a;
}
246 | |
/**
 * b is a's sole non-deleted child. Switch them around, so that b takes
 * a's place in the tree and a becomes b's child holding b's former children.
 *
 * @param Element $a
 * @param Element $b
 * @return Element $b, the new outer element
 */
public function swap( Element $a, Element $b ): Element {
	// Order matters: empty b into a, lift b out in front of a, then nest a in b
	DOMUtils::migrateChildren( $b, $a );
	$a->parentNode->insertBefore( $b, $a );
	$b->appendChild( $a );

	// Mark a's subtree, a, and b as all having moved
	$firstGrandchild = $a->firstChild;
	if ( $firstGrandchild !== null ) {
		$this->addDiffMarks( $firstGrandchild, DiffMarkers::MOVED, true );
	}
	foreach ( [ DiffMarkers::MOVED, DiffMarkers::CHILDREN_CHANGED ] as $mark ) {
		$this->addDiffMarks( $a, $mark, true );
		$this->addDiffMarks( $b, $mark, true );
	}
	$this->addDiffMarks( $b->parentNode, DiffMarkers::CHILDREN_CHANGED );

	return $b;
}
270 | |
/**
 * Move the run of rendering-transparent, non-encapsulated nodes at the
 * start (or, with $rtl, at the end) of $node out of $node and next to it
 * in its parent.
 *
 * @param Element $node
 * @param bool $rtl Scan from the last child backwards instead of from the first
 */
public function hoistLinks( Element $node, bool $rtl ): void {
	$hasHoistableContent = false;

	// Find the first node (in scan direction) that must stay inside $node.
	for ( $sibling = self::firstChild( $node, $rtl ); $sibling; $sibling = $next ) {
		$next = $rtl
			? DiffDOMUtils::previousNonDeletedSibling( $sibling )
			: DiffDOMUtils::nextNonDeletedSibling( $sibling );

		if ( !DiffDOMUtils::isContentNode( $sibling ) ) {
			// Nothing to do, continue.
			continue;
		}
		if ( !WTUtils::isRenderingTransparentNode( $sibling ) ||
			WTUtils::isEncapsulationWrapper( $sibling )
		) {
			// Don't venture into templated content
			break;
		}
		$hasHoistableContent = true;
	}

	if ( !$hasHoistableContent ) {
		return;
	}

	// soak up all the non-content nodes (exclude sibling)
	$firstNode = self::firstChild( $node, $rtl );
	for ( $move = $firstNode; $move !== $sibling; $move = self::firstChild( $node, $rtl ) ) {
		$refnode = $rtl ? DiffDOMUtils::nextNonDeletedSibling( $node ) : $node;
		$node->parentNode->insertBefore( $move, $refnode );
	}

	// and drop any leading whitespace
	if ( $sibling instanceof Text ) {
		$sibling->nodeValue = $rtl ? rtrim( $sibling->nodeValue ) : ltrim( $sibling->nodeValue );
	}

	// Update diff markers
	$this->addDiffMarks( $firstNode, DiffMarkers::MOVED, true );
	if ( $sibling ) {
		$this->addDiffMarks( $sibling, DiffMarkers::MOVED, true );
	}
	$this->addDiffMarks( $node, DiffMarkers::CHILDREN_CHANGED, true );
	$this->addDiffMarks( $node->parentNode, DiffMarkers::CHILDREN_CHANGED );
}
316 | |
/**
 * Remove $node from the DOM if it is essentially empty.
 *
 * @param Element $node
 * @return Node|null $node itself when it was kept; otherwise the next
 *   non-deleted sibling it had before removal (possibly null)
 */
public function stripIfEmpty( Element $node ): ?Node {
	$next = DiffDOMUtils::nextNonDeletedSibling( $node );

	// Currently unused; retained alongside the disabled check noted below.
	$dp = DOMDataUtils::getDataParsoid( $node );
	$autoInserted = isset( $dp->autoInsertedStart ) || isset( $dp->autoInsertedEnd );

	// Ex: "<a..>..</a><b></b>bar"
	// From [[Foo]]<b/>bar usage found on some dewiki pages.
	// FIXME: Should we enable this?
	// !( false /* used to be rt-test mode */ && ( $dp->stx ?? null ) === 'html' );
	if ( !DiffDOMUtils::nodeEssentiallyEmpty( $node, false ) ) {
		return $node;
	}

	// Update diff markers (before the deletion)
	$this->addDiffMarks( $node, DiffMarkers::DELETED, true );
	$node->parentNode->removeChild( $node );
	return $next;
}
338 | |
/**
 * Strip trailing whitespace from the last non-deleted text child of $node
 * and, where it would not be redundant, re-add it just after $node.
 *
 * @param Node $node
 */
public function moveTrailingSpacesOut( Node $node ): void {
	$next = DiffDOMUtils::nextNonDeletedSibling( $node );
	$last = DiffDOMUtils::lastNonDeletedChild( $node );
	if ( !( $last instanceof Text ) ) {
		return;
	}

	$matches = null;
	if ( !( preg_match( '/\s+$/D', $last->nodeValue, $matches ) > 0 ) ) {
		// No trailing whitespace
		return;
	}

	$trailing = $matches[0];
	$last->nodeValue = substr( $last->nodeValue, 0, -strlen( $trailing ) );

	// Try to be a little smarter and drop the spaces if possible:
	// they are dropped when there is no following sibling, or when the
	// following text already starts with whitespace.
	$nextIsText = $next instanceof Text;
	if ( $next && ( !$nextIsText || !preg_match( '/^\s+/', $next->nodeValue ) ) ) {
		if ( !$nextIsText ) {
			// Need a text node after $node to hold the whitespace
			$txt = $node->ownerDocument->createTextNode( '' );
			$node->parentNode->insertBefore( $txt, $next );
			$next = $txt;
		}
		$next->nodeValue = $trailing . $next->nodeValue;
		// next (a text node) is new / had new content added to it
		$this->addDiffMarks( $next, DiffMarkers::INSERTED, true );
	}

	$this->addDiffMarks( $last, DiffMarkers::INSERTED, true );
	$this->addDiffMarks( $node->parentNode, DiffMarkers::CHILDREN_CHANGED );
}
363 | |
/**
 * Recursively replace every <br/> under $node with a single-space text node.
 *
 * @param Element $node
 */
public function stripBRs( Element $node ): void {
	for ( $child = $node->firstChild; $child; $child = $next ) {
		// Grab the successor first: $child may be detached below
		$next = $child->nextSibling;

		if ( DOMCompat::nodeName( $child ) === 'br' ) {
			// replace <br/> with a single space
			$node->removeChild( $child );
			$node->insertBefore( $node->ownerDocument->createTextNode( ' ' ), $next );
			continue;
		}

		if ( $child instanceof Element ) {
			$this->stripBRs( $child );
		}
	}
}
378 | |
/**
 * Strip trailing LRM/RLM characters from a text node that is the last
 * sibling or is immediately followed by a category link. A text node
 * left empty by this is removed from the DOM.
 *
 * FIXME see
 * https://gerrit.wikimedia.org/r/#/c/mediawiki/services/parsoid/+/500975/7/src/Html2Wt/DOMNormalizer.php@423
 *
 * @param Node $node
 * @return Node|null $node itself if it is still in the DOM; otherwise the
 *   next non-deleted sibling it had before removal (possibly null)
 */
public function stripBidiCharsAroundCategories( Node $node ): ?Node {
	if ( !( $node instanceof Text ) ||
		( !WTUtils::isCategoryLink( $node->previousSibling ) &&
			!WTUtils::isCategoryLink( $node->nextSibling ) )
	) {
		// Not a text node, or not adjacent to a category link
		return $node;
	}

	$next = $node->nextSibling;
	if ( !$next || WTUtils::isCategoryLink( $next ) ) {
		// The following can leave behind an empty text node.
		$oldLength = strlen( $node->nodeValue );
		$node->nodeValue = preg_replace(
			'/([\x{200e}\x{200f}]+\n)?[\x{200e}\x{200f}]+$/uD',
			'',
			$node->nodeValue
		);
		$newLength = strlen( $node->nodeValue );

		if ( $oldLength !== $newLength ) {
			// Log changes for editors benefit
			$this->state->getEnv()->log( 'warn/html2wt/bidi',
				'LRM/RLM unicode chars stripped around categories'
			);
		}

		if ( $newLength === 0 ) {
			// Remove empty text nodes to keep DOM in normalized form
			$ret = DiffDOMUtils::nextNonDeletedSibling( $node );
			// Update diff markers before the deletion: addDiffMarks() reaches
			// the parent through $node->parentNode, which is null once the
			// node has been detached.
			$this->addDiffMarks( $node, DiffMarkers::DELETED );
			$node->parentNode->removeChild( $node );
			return $ret;
		}

		// Treat modified node as having been newly inserted
		// NOTE(review): this also runs when nothing was stripped
		// ($oldLength === $newLength) — confirm whether that is intended.
		$this->addDiffMarks( $node, DiffMarkers::INSERTED );
	}
	return $node;
}
425 | |
/**
 * When an A tag is encountered, if there are format tags inside, move them outside
 * Also merge a single sibling A tag that is mergable
 * The link href and text must match for this normalization to take effect
 *
 * @param Element $node
 * @return Node|null $node when no format tag was moved; otherwise the
 *   element that was $node's first non-deleted child before the swaps
 */
public function moveFormatTagOutsideATag( Element $node ): ?Node {
	if ( DOMCompat::nodeName( $node ) !== 'a' ) {
		return $node;
	}

	// Give the following sibling a chance to be merged into this link first
	$sibling = DiffDOMUtils::nextNonDeletedSibling( $node );
	if ( $sibling ) {
		$this->normalizeSiblingPair( $node, $sibling );
	}

	$firstChild = DiffDOMUtils::firstNonDeletedChild( $node );
	$fcNextSibling = $firstChild ? DiffDOMUtils::nextNonDeletedSibling( $firstChild ) : null;

	if ( !$node->hasAttribute( 'href' ) ) {
		return $node;
	}
	$nodeHref = DOMCompat::getAttribute( $node, 'href' ) ?? '';

	$canHoist = $firstChild instanceof Element &&
		// No reordering possible with multiple children
		$fcNextSibling === null &&
		// Do not normalize WikiLinks with these attributes
		!$firstChild->hasAttribute( 'color' ) &&
		!$firstChild->hasAttribute( 'style' ) &&
		!$firstChild->hasAttribute( 'class' ) &&
		// Compare textContent to the href, noting that this matching doesn't handle all
		// possible simple-wiki-link scenarios that isSimpleWikiLink in link handler tackles
		$node->textContent === PHPUtils::stripPrefix( $nodeHref, './' );

	// If there are no tags to swap, we are done
	if ( !$canHoist ) {
		return $node;
	}

	// Keep swapping the link with its first child for as long as that
	// child is a formatting element.
	$child = DiffDOMUtils::firstNonDeletedChild( $node );
	while ( DOMUtils::isFormattingElt( $child ) ) {
		'@phan-var Element $child'; // @var Element $child
		$this->swap( $node, $child );
		$child = DiffDOMUtils::firstNonDeletedChild( $node );
	}

	return $firstChild;
}
479 | |
/**
 * Wikitext normalizations implemented right now:
 *
 * 1. Tag minimization (I/B tags) in normalizeSiblingPair
 * 2. Strip empty headings and style tags
 * 3. Force SOL transparent links to serialize before/after heading
 * 4. Trailing spaces are migrated out of links
 * 5. Space is added before escapable prefixes in table cells
 * 6. Strip <br/> from headings
 * 7. Strip bidi chars around categories
 * 8. When an A tag is encountered, if there are format tags inside, move them outside
 * 9. Unwrap <font> tags without any attributes
 * 10. Unwrap <p> around a lone marker annotation, and fold an empty <p></p>
 *     into the following <p> as a leading <br/>
 *
 * The return value from this function should respect the
 * following contract:
 * - if input node is unmodified, return it.
 * - if input node is modified, return the new node
 *   that it transforms into.
 * If you return a node other than this, normalizations may not
 * apply cleanly and may be skipped.
 *
 * @param Node $node
 * @return Node|null the normalized node
 */
public function normalizeNode( Node $node ): ?Node {
	$nodeName = DOMCompat::nodeName( $node );

	if ( $this->state->getEnv()->getSiteConfig()->scrubBidiChars() ) {
		// Strip bidirectional chars around categories
		// Note that this is being done everywhere,
		// not just in selser mode
		$next = $this->stripBidiCharsAroundCategories( $node );
		if ( $next !== $node ) {
			return $next;
		}
	}

	// Skip unmodified content
	if ( $this->state->selserMode && !DOMUtils::atTheTop( $node ) &&
		!$this->inInsertedContent &&
		!DiffUtils::hasDiffMarkers( $node ) &&
		// If orig-src is not valid, this in effect becomes
		// an edited node and needs normalizations applied to it.
		WTSUtils::origSrcValidInEditedContext( $this->state, $node )
	) {
		return $node;
	}

	// Headings
	if ( DOMUtils::isHeading( $node ) ) {
		'@phan-var Element $node'; // @var Element $node
		$this->hoistLinks( $node, false );
		$this->hoistLinks( $node, true );
		$this->stripBRs( $node );

		return $this->stripIfEmpty( $node );

	// Quote tags
	} elseif ( isset( Consts::$WTQuoteTags[$nodeName] ) ) {
		'@phan-var Element $node'; // @var Element $node
		return $this->stripIfEmpty( $node );

	// Anchors
	} elseif ( $nodeName === 'a' ) {
		'@phan-var Element $node'; // @var Element $node
		$next = DiffDOMUtils::nextNonDeletedSibling( $node );
		// We could have checked for !mw:ExtLink but in
		// the case of links without any annotations,
		// the positive test is semantically safer than the
		// negative test.
		if ( DOMUtils::hasRel( $node, 'mw:WikiLink' ) &&
			$this->stripIfEmpty( $node ) !== $node
		) {
			return $next;
		}
		$this->moveTrailingSpacesOut( $node );

		return $this->moveFormatTagOutsideATag( $node );

	// Table cells
	} elseif ( $nodeName === 'td' ) {
		'@phan-var Element $node'; // @var Element $node
		$dp = DOMDataUtils::getDataParsoid( $node );
		// * HTML <td>s won't have escapable prefixes
		// * First cell should always be checked for escapable prefixes
		// * Second and later cells in a wikitext td row (with stx='row' flag)
		//   won't have escapable prefixes.
		$stx = $dp->stx ?? null;
		if ( $stx === 'html' ||
			( DiffDOMUtils::firstNonSepChild( $node->parentNode ) !== $node && $stx === 'row' ) ) {
			return $node;
		}

		$first = DiffDOMUtils::firstNonDeletedChild( $node );
		// Emit a space before escapable prefix
		// This is preferable to serializing with a nowiki.
		if ( $first instanceof Text && strspn( $first->nodeValue, '-+}', 0, 1 ) ) {
			$first->nodeValue = ' ' . $first->nodeValue;
			$this->addDiffMarks( $first, DiffMarkers::INSERTED, true );
		}

		return $node;

	// Font tags without any attributes
	} elseif (
		$node instanceof Element && $nodeName === 'font' &&
		DOMDataUtils::noAttrs( $node )
	) {
		$next = DiffDOMUtils::nextNonDeletedSibling( $node );
		DOMUtils::migrateChildren( $node, $node->parentNode, $node );
		$node->parentNode->removeChild( $node );

		return $next;

	// Wikitext paragraphs
	} elseif ( $node instanceof Element && $nodeName === 'p'
		&& !WTUtils::isLiteralHTMLNode( $node ) ) {
		$next = DiffDOMUtils::nextNonSepSibling( $node );
		// Normalization of <p></p>, <p><br/></p>, <p><meta/></p> and the like to avoid
		// extraneous new lines
		if ( DiffDOMUtils::hasNChildren( $node, 1 ) &&
			WTUtils::isMarkerAnnotation( $node->firstChild )
		) {
			// Converts <p><meta /></p> (where meta is an annotation tag) to <meta /> without
			// the wrapping <p> (that would typically be added by VE) to avoid getting too many
			// newlines.
			$ann = $node->firstChild;
			DOMUtils::migrateChildren( $node, $node->parentNode, $node );
			$node->parentNode->removeChild( $node );
			return $ann;
		}

		if (
			// Don't apply normalization to <p></p> nodes that
			// were generated through deletions or other normalizations.
			// FIXME: This trick fails for non-selser mode since
			// diff markers are only added in selser mode.
			DiffDOMUtils::hasNChildren( $node, 0, true ) &&
			// FIXME: Also, skip if this is the only child.
			// Eliminates spurious test failures in non-selser mode.
			!DiffDOMUtils::hasNChildren( $node->parentNode, 1 ) &&
			// We cannot merge the <br/> with 'next' unless
			// it is a (non-literal-HTML) <p>..</p>.
			$next && DOMCompat::nodeName( $next ) === 'p' &&
			!WTUtils::isLiteralHTMLNode( $next )
		) {
			// T184755: Convert sequences of <p></p> nodes to sequences of
			// <br/>, <p><br/>..other content..</p>, <p><br/><p/> to ensure
			// they serialize to as many newlines as the count of <p></p> nodes.
			// Also handles <p><meta/></p> case for annotations.
			//
			// Replace 'node' (<p></p>) with a <br/> and make it the
			// first child of 'next' (<p>..</p>). If 'next' was actually
			// a <p></p> (i.e. empty), 'next' becomes <p><br/></p>
			// which will serialize to 2 newlines.
			$br = $node->ownerDocument->createElement( 'br' );
			$next->insertBefore( $br, $next->firstChild );

			// Avoid nested insertion markers
			if ( !$this->isInsertedContent( $next ) ) {
				$this->addDiffMarks( $br, DiffMarkers::INSERTED );
			}

			// Delete node
			// NOTE(review): the DELETED mark is placed on $node's parent rather
			// than on $node itself — confirm this is intended.
			$this->addDiffMarks( $node->parentNode, DiffMarkers::DELETED );
			$node->parentNode->removeChild( $node );

			// 'node' is gone; 'next' is what it was folded into.
			return $next;
		}

		// The paragraph was left untouched, so per the contract above it is
		// returned as-is rather than reporting its next sibling.
		return $node;
	}

	// Default
	return $node;
}
648 | |
/**
 * Try to minimize the adjacent sibling pair (a, b) by merging them,
 * swapping one of them with its sole child first if that enables a merge.
 *
 * @param Node $a
 * @param Node $b
 * @return Node The merged node if the pair was rewritten, otherwise $b
 */
public function normalizeSiblingPair( Node $a, Node $b ): Node {
	if ( !$this->rewriteablePair( $a, $b ) ) {
		return $b;
	}

	// Since 'a' and 'b' make a rewriteable tag-pair, we are good to go.
	if ( self::mergable( $a, $b ) ) {
		'@phan-var Element $a'; // @var Element $a
		'@phan-var Element $b'; // @var Element $b
		$merged = $this->merge( $a, $b );
	} elseif ( self::swappable( $a, $b ) ) {
		// a's sole child matches b: swap a with that child, then merge with b
		'@phan-var Element $a'; // @var Element $a
		'@phan-var Element $b'; // @var Element $b
		$firstNonDeletedChild = DiffDOMUtils::firstNonDeletedChild( $a );
		'@phan-var Element $firstNonDeletedChild'; // @var Element $firstNonDeletedChild
		$merged = $this->merge( $this->swap( $a, $firstNonDeletedChild ), $b );
	} elseif ( self::swappable( $b, $a ) ) {
		// b's sole child matches a: swap b with that child, then merge into a
		'@phan-var Element $a'; // @var Element $a
		'@phan-var Element $b'; // @var Element $b
		$firstNonDeletedChild = DiffDOMUtils::firstNonDeletedChild( $b );
		'@phan-var Element $firstNonDeletedChild'; // @var Element $firstNonDeletedChild
		$merged = $this->merge( $a, $this->swap( $b, $firstNonDeletedChild ) );
	} else {
		return $b;
	}

	// The merged node's children have new siblings. So let's look
	// at it again. But their grandkids haven't changed (they have
	// already been minimized), so we don't need to recurse further.
	$this->processSubtree( $merged, false );
	return $merged;
}
692 | |
/**
 * Normalize the non-deleted children of $node, and minimize each pair of
 * adjacent siblings among them.
 *
 * @param Node $node
 * @param bool $recurse Passed through to processNode() for every child
 */
public function processSubtree( Node $node, bool $recurse ): void {
	// Process the first child outside the loop.
	$first = DiffDOMUtils::firstNonDeletedChild( $node );
	if ( !$first ) {
		return;
	}

	$a = $this->processNode( $first, $recurse );
	// We need a pair of adjacent siblings for tag minimization.
	while ( $a && ( $b = DiffDOMUtils::nextNonDeletedSibling( $a ) ) ) {
		// Process subtree rooted at 'b'.
		$b = $this->processNode( $b, $recurse );

		// If we skipped over a bunch of nodes in the middle,
		// we no longer have a pair of adjacent siblings.
		$stillAdjacent = $b && DiffDOMUtils::previousNonDeletedSibling( $b ) === $a;

		// Process the pair if there is one, else just move on to 'b'.
		$a = $stillAdjacent ? $this->normalizeSiblingPair( $a, $b ) : $b;
	}
}
721 | |
/**
 * Normalize 'node' and the subtree rooted at 'node', repeating until
 * normalizeNode() hands back the same node it was given.
 *
 * @param Node $node
 * @param bool $recurse true => recurse and normalize subtree;
 *   false => assume the subtree is already normalized
 * @return Node|null The stabilized node, or null if there was nothing left to process
 */
public function processNode( Node $node, bool $recurse ): ?Node {
	// Normalize node till it stabilizes
	for ( ; ; ) {
		// Skip templated content
		while ( $node && WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
			$node = WTUtils::skipOverEncapsulatedContent( $node );
		}

		if ( !$node ) {
			return null;
		}

		// Set insertion marker
		$insertedSubtree = DiffUtils::hasInsertedDiffMark( $node );
		if ( $insertedSubtree ) {
			if ( $this->inInsertedContent ) {
				// Dump debugging info
				$options = [ 'storeDiffMark' => true, 'saveData' => true ];
				$dump = ContentUtils::dumpDOM(
					DOMCompat::getBody( $node->ownerDocument ),
					'-- DOM triggering nested inserted dom-diff flags --',
					$options
				);
				$nodeDump = $node instanceof Element
					? ContentUtils::toXML( $node, $options )
					: $node->textContent;
				$this->state->getEnv()->log( 'error/html2wt/dom',
					"--- Nested inserted dom-diff flags ---\n",
					'Node:',
					$nodeDump,
					"\nNode's parent:",
					ContentUtils::toXML( $node->parentNode, $options ),
					$dump
				);
			}
			// FIXME: If this assert is removed, the above dumping code should
			// either be removed OR fixed up to remove uses of ContentUtils.ppToXML
			Assert::invariant( !$this->inInsertedContent, 'Found nested inserted dom-diff flags!' );
			$this->inInsertedContent = true;
		}

		// Post-order traversal: Process subtree first, and current node after.
		// This lets multiple normalizations take effect cleanly.
		if ( $recurse && $node instanceof Element ) {
			$this->processSubtree( $node, true );
		}

		$normalized = $this->normalizeNode( $node );

		// Clear insertion marker
		if ( $insertedSubtree ) {
			$this->inInsertedContent = false;
		}

		if ( $normalized === $node ) {
			// Stable: nothing changed on this pass
			return $node;
		}

		// The node turned into something else; normalize that next.
		$node = $normalized;
	}

	// @phan-suppress-next-line PhanPluginUnreachableCode
	throw new UnreachableException( 'Control should never get here!' );
}
788 | |
/**
 * Entry point: normalize $node and everything below it.
 *
 * @param Element|DocumentFragment $node Root of the tree to normalize
 */
public function normalize( Node $node ) {
	// Always recurse from the root
	$this->processNode( $node, true );
}
795 | } |