Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.31% |
1 / 318 |
|
4.76% |
1 / 21 |
CRAP | |
0.00% |
0 / 1 |
DOMNormalizer | |
0.31% |
1 / 318 |
|
4.76% |
1 / 21 |
23646.97 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
similar | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
132 | |||
mergable | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
swappable | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
firstChild | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
isInsertedContent | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
20 | |||
rewriteablePair | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
30 | |||
addDiffMarks | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
182 | |||
merge | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
30 | |||
swap | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
6 | |||
hoistLinks | |
0.00% |
0 / 26 |
|
0.00% |
0 / 1 |
156 | |||
stripIfEmpty | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
12 | |||
moveTrailingSpacesOut | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
56 | |||
stripBRs | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
20 | |||
stripBidiCharsAroundCategories | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
72 | |||
moveFormatTagOutsideATag | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
156 | |||
normalizeNode | |
0.00% |
0 / 67 |
|
0.00% |
0 / 1 |
1260 | |||
normalizeSiblingPair | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
30 | |||
processSubtree | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
42 | |||
processNode | |
0.00% |
0 / 34 |
|
0.00% |
0 / 1 |
156 | |||
normalize | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Assert\UnreachableException; |
8 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
9 | use Wikimedia\Parsoid\DOM\Element; |
10 | use Wikimedia\Parsoid\DOM\Node; |
11 | use Wikimedia\Parsoid\DOM\Text; |
12 | use Wikimedia\Parsoid\Utils\ContentUtils; |
13 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
14 | use Wikimedia\Parsoid\Utils\DOMCompat; |
15 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
16 | use Wikimedia\Parsoid\Utils\DOMUtils; |
17 | use Wikimedia\Parsoid\Utils\PHPUtils; |
18 | use Wikimedia\Parsoid\Utils\WTUtils; |
19 | use Wikimedia\Parsoid\Wikitext\Consts; |
20 | |
21 | /* |
22 | * Tag minimization |
23 | * ---------------- |
24 | * Minimize a pair of tags in the dom tree rooted at node. |
25 | * |
26 | * This function merges adjacent nodes of the same type |
27 | * and swaps nodes where possible to enable further merging. |
28 | * |
29 | * See examples below: |
30 | * |
31 | * 1. <b>X</b><b>Y</b> |
32 | * ==> <b>XY</b> |
33 | * |
34 | * 2. <i>A</i><b><i>X</i></b><b><i>Y</i></b><i>Z</i> |
35 | * ==> <i>A<b>XY</b>Z</i> |
36 | * |
37 | * 3. <a href="Football">Foot</a><a href="Football">ball</a> |
38 | * ==> <a href="Football">Football</a> |
39 | */ |
40 | |
/**
 * DOM normalization.
 *
 * DOM normalizations are performed after DOMDiff is run.
 * So, normalization routines should update diff markers appropriately.
 *
 * Entry point is normalize(), which drives a post-order traversal
 * (processNode() / processSubtree()) and applies per-node rewrites
 * (normalizeNode()) and adjacent-sibling rewrites (normalizeSiblingPair()).
 */
class DOMNormalizer {

	/** Attributes ignored when comparing <a> tags for similarity (see similar()). */
	private const IGNORABLE_ATTRS = [
		'data-parsoid', 'id', 'title', DOMDataUtils::DATA_OBJECT_ATTR_NAME
	];
	/** Attributes ignored when comparing two literal-HTML tags for similarity (see similar()). */
	private const HTML_IGNORABLE_ATTRS = [ 'data-parsoid', DOMDataUtils::DATA_OBJECT_ATTR_NAME ];

	/**
	 * Attribute name => comparison callback, passed to DiffUtils::attribsEquals().
	 * Lazily initialised by the first constructor call.
	 */
	private static $specializedAttribHandlers;

	/** @var bool True while traversing a subtree whose root carries an inserted diff mark */
	private $inInsertedContent;

	/** @var SerializerState */
	private $state;

	/**
	 * @param SerializerState $state Serializer state; read for selserMode and the Env.
	 */
	public function __construct( SerializerState $state ) {
		if ( !self::$specializedAttribHandlers ) {
			self::$specializedAttribHandlers = [
				// data-mw values are compared loosely (==), not by identity
				'data-mw' => static function ( $nodeA, $dmwA, $nodeB, $dmwB ) {
					return $dmwA == $dmwB;
				}
			];
		}

		$this->state = $state;

		$this->inInsertedContent = false;
	}

	/**
	 * Are a and b "similar enough" to be merged / swapped?
	 *
	 * - If a is an <a> tag: both must be elements whose attributes match,
	 *   ignoring IGNORABLE_ATTRS.
	 * - Otherwise: either neither node is a literal HTML node, or both are
	 *   and their attributes match, ignoring HTML_IGNORABLE_ATTRS.
	 *
	 * Note that this does not compare node names; see mergable() for that.
	 *
	 * @param Node $a
	 * @param Node $b
	 * @return bool
	 */
	private static function similar( Node $a, Node $b ): bool {
		if ( DOMCompat::nodeName( $a ) === 'a' ) {
			// FIXME: Similar to 1ce6a98, DiffDOMUtils::nextNonDeletedSibling is being
			// used in this file where maybe DiffDOMUtils::nextNonSepSibling belongs.
			return $a instanceof Element && $b instanceof Element &&
				DiffUtils::attribsEquals( $a, $b, self::IGNORABLE_ATTRS, self::$specializedAttribHandlers );
		} else {
			$aIsHtml = WTUtils::isLiteralHTMLNode( $a );
			$bIsHtml = WTUtils::isLiteralHTMLNode( $b );
			// TODO (Anomie)
			// It looks like $ignorableAttrs is only used when $aIsHtml is true.
			// Or is that the fixme referred to in the comment below?
			$ignorableAttrs = $aIsHtml ? self::HTML_IGNORABLE_ATTRS : self::IGNORABLE_ATTRS;

			// FIXME: For non-HTML I/B tags, we seem to be dropping all attributes
			// in our tag handlers (which seems like a bug). Till that is fixed,
			// we'll preserve existing functionality here.
			return ( !$aIsHtml && !$bIsHtml ) ||
				( $aIsHtml && $bIsHtml &&
					$a instanceof Element && $b instanceof Element &&
					DiffUtils::attribsEquals( $a, $b, $ignorableAttrs, self::$specializedAttribHandlers ) );
		}
	}

	/**
	 * Can a and b be merged into a single node?
	 * True when both have the same node name and are similar().
	 * @param Node $a
	 * @param Node $b
	 * @return bool
	 */
	private static function mergable( Node $a, Node $b ): bool {
		return DOMCompat::nodeName( $a ) === DOMCompat::nodeName( $b ) && self::similar( $a, $b );
	}

	/**
	 * Can a and b be combined into a single node
	 * if we swap a and a.firstChild?
	 *
	 * For example: A='<b><i>x</i></b>' b='<i>y</i>' => '<i><b>x</b>y</i>'.
	 * @param Node $a
	 * @param Node $b
	 * @return bool
	 */
	private static function swappable( Node $a, Node $b ): bool {
		if ( DiffDOMUtils::numNonDeletedChildNodes( $a ) !== 1 ) {
			return false;
		}
		// Exactly one non-deleted child exists; look it up once.
		$onlyChild = DiffDOMUtils::firstNonDeletedChild( $a );
		return self::similar( $a, $onlyChild ) && self::mergable( $onlyChild, $b );
	}

	/**
	 * First non-deleted child of $node in the given traversal direction.
	 * @param Node $node
	 * @param bool $rtl If true, return the last non-deleted child instead of the first.
	 * @return Node|null
	 */
	private static function firstChild( Node $node, bool $rtl ): ?Node {
		return $rtl ? DiffDOMUtils::lastNonDeletedChild( $node ) : DiffDOMUtils::firstNonDeletedChild( $node );
	}

	/**
	 * Does $node, or any ancestor up to the top of the tree, carry an
	 * inserted diff mark?
	 * @param Node $node
	 * @return bool
	 */
	private function isInsertedContent( Node $node ): bool {
		// Stop at the top node, or when we run out of ancestors (detached subtree).
		for ( $cur = $node; $cur !== null; $cur = $cur->parentNode ) {
			if ( DiffUtils::hasInsertedDiffMark( $cur ) ) {
				return true;
			}
			if ( DOMUtils::atTheTop( $cur ) ) {
				return false;
			}
		}
		return false;
	}

	/**
	 * Is (a, b) a pair of tags that tag minimization is allowed to rewrite?
	 * True for two wikitext quote tags, or for two <a> tags at least one
	 * of which is a new element.
	 * @param Node $a
	 * @param Node $b
	 * @return bool
	 */
	private function rewriteablePair( Node $a, Node $b ): bool {
		if ( isset( Consts::$WTQuoteTags[DOMCompat::nodeName( $a )] ) ) {
			// For <i>/<b> pair, we need not check whether the node being transformed
			// are new / edited, etc. since these minimization scenarios can
			// never show up in HTML that came from parsed wikitext.
			//
			// <i>..</i><i>..</i> can never show up without a <nowiki/> in between.
			// Similarly for <b>..</b><b>..</b> and <b><i>..</i></b><i>..</i>.
			//
			// This is because a sequence of 4 quotes is not parsed as ..</i><i>..
			// Neither is a sequence of 7 quotes parsed as ..</i></b><i>..
			//
			// So, if we see a minimizable pair of nodes, it is because the HTML
			// didn't originate from wikitext OR the HTML has been subsequently edited.
			// In both cases, we want to transform the DOM.

			return isset( Consts::$WTQuoteTags[DOMCompat::nodeName( $b )] );
		} elseif ( DOMCompat::nodeName( $a ) === 'a' ) {
			// For <a> tags, we require at least one of the two tags
			// to be a newly created element.
			return DOMCompat::nodeName( $b ) === 'a' && ( WTUtils::isNewElt( $a ) || WTUtils::isNewElt( $b ) );
		}
		return false;
	}

	/**
	 * Add a diff mark to $node and, unless $dontRecurse is set, add
	 * subtree-changed marks to its ancestors.
	 *
	 * Does nothing outside selser mode, if $node already has $mark, or if
	 * $mark is INSERTED while we are already inside inserted content.
	 *
	 * @param Node $node Node to mark; expected to be attached, since its
	 *   parentNode is marked too for INSERTED / DELETED.
	 * @param string $mark One of the DiffMarkers constants
	 * @param bool $dontRecurse If true, do not walk up and mark ancestors
	 */
	public function addDiffMarks( Node $node, string $mark, bool $dontRecurse = false ): void {
		if ( !$this->state->selserMode || DiffUtils::hasDiffMark( $node, $mark ) ) {
			return;
		}

		// Don't introduce nested inserted markers
		if ( $this->inInsertedContent && $mark === DiffMarkers::INSERTED ) {
			return;
		}

		$env = $this->state->getEnv();

		// Newly added elements don't need diff marks
		if ( !WTUtils::isNewElt( $node ) ) {
			DiffUtils::addDiffMark( $node, $env, $mark );
			if ( $mark === DiffMarkers::INSERTED || $mark === DiffMarkers::DELETED ) {
				DiffUtils::addDiffMark( $node->parentNode, $env, DiffMarkers::CHILDREN_CHANGED );
			}
		}

		if ( $dontRecurse ) {
			return;
		}

		// Walk up the subtree and add 'subtree-changed' markers
		$node = $node->parentNode;
		while ( $node instanceof Element && !DOMUtils::atTheTop( $node ) ) {
			if ( DiffUtils::hasDiffMark( $node, DiffMarkers::SUBTREE_CHANGED ) ) {
				// Ancestors above this one are assumed to be marked already
				return;
			}
			if ( !WTUtils::isNewElt( $node ) ) {
				DiffUtils::addDiffMark( $node, $env, DiffMarkers::SUBTREE_CHANGED );
			}
			$node = $node->parentNode;
		}
	}

	/**
	 * Transfer all of b's children to a and delete b.
	 *
	 * Any nodes sitting between a and b (usually diff markers) are moved
	 * into a first, so that document order is preserved.
	 *
	 * @param Element $a
	 * @param Element $b A following sibling of $a
	 * @return Element $a, now holding b's former children
	 */
	public function merge( Element $a, Element $b ): Element {
		$sentinel = $b->firstChild;

		// Migrate any intermediate nodes (usually 0 / 1 diff markers)
		// present between a and b to a. Collect them first and only move
		// them if b really is a following sibling of a, so that we never
		// swallow unrelated siblings.
		$between = [];
		for ( $n = $a->nextSibling; $n !== null && $n !== $b; $n = $n->nextSibling ) {
			$between[] = $n;
		}
		if ( $n === $b ) {
			foreach ( $between as $intermediate ) {
				$a->appendChild( $intermediate );
			}
		}

		// The real work of merging
		DOMUtils::migrateChildren( $b, $a );
		$b->parentNode->removeChild( $b );

		// Normalize the node to merge any adjacent text nodes
		DOMCompat::normalize( $a );

		// Update diff markers
		$this->addDiffMarks( $a->parentNode, DiffMarkers::CHILDREN_CHANGED ); // $b was removed
		$this->addDiffMarks( $a, DiffMarkers::CHILDREN_CHANGED ); // $a got more children
		if ( !DOMUtils::isRemoved( $sentinel ) ) {
			// Nodes starting at 'sentinel' were inserted into 'a'
			// b, which was a's sibling was deleted
			// Only addDiffMarks to sentinel, if it is still part of the dom
			// (and hasn't been deleted by the call to a.normalize() )
			if ( $sentinel->parentNode ) {
				$this->addDiffMarks( $sentinel, DiffMarkers::MOVED, true );
			}
		}
		if ( $a->nextSibling ) {
			// FIXME: Hmm .. there is an API hole here
			// about ability to add markers after last child
			$this->addDiffMarks( $a->nextSibling, DiffMarkers::MOVED, true );
		}

		return $a;
	}

	/**
	 * b is a's sole non-deleted child. Switch them around.
	 *
	 * b's children are moved into a, b takes a's place in the tree and
	 * a becomes a child of b.
	 *
	 * @param Element $a
	 * @param Element $b
	 * @return Element $b, the new outer element
	 */
	public function swap( Element $a, Element $b ): Element {
		DOMUtils::migrateChildren( $b, $a );
		$a->parentNode->insertBefore( $b, $a );
		$b->appendChild( $a );

		// Mark a's subtree, a, and b as all having moved
		if ( $a->firstChild !== null ) {
			$this->addDiffMarks( $a->firstChild, DiffMarkers::MOVED, true );
		}
		$this->addDiffMarks( $a, DiffMarkers::MOVED, true );
		$this->addDiffMarks( $b, DiffMarkers::MOVED, true );
		$this->addDiffMarks( $a, DiffMarkers::CHILDREN_CHANGED, true );
		$this->addDiffMarks( $b, DiffMarkers::CHILDREN_CHANGED, true );
		$this->addDiffMarks( $b->parentNode, DiffMarkers::CHILDREN_CHANGED );

		return $b;
	}

	/**
	 * Move leading (or, with $rtl, trailing) rendering-transparent content
	 * out of $node so that it becomes a sibling before (or after) $node.
	 *
	 * The scan stops at the first content node that is not rendering
	 * transparent or that is an encapsulation wrapper. Whitespace on the
	 * hoisting edge of that node is trimmed if it is a text node.
	 *
	 * @param Element $node
	 * @param bool $rtl False: hoist from the start; true: hoist from the end
	 */
	public function hoistLinks( Element $node, bool $rtl ): void {
		$sibling = self::firstChild( $node, $rtl );
		$hasHoistableContent = false;

		while ( $sibling ) {
			$next = $rtl
				? DiffDOMUtils::previousNonDeletedSibling( $sibling )
				: DiffDOMUtils::nextNonDeletedSibling( $sibling );
			if ( !DiffDOMUtils::isContentNode( $sibling ) ) {
				// Nothing to do, continue.
			} elseif ( !WTUtils::isRenderingTransparentNode( $sibling ) ||
				WTUtils::isEncapsulationWrapper( $sibling )
			) {
				// Don't venture into templated content
				break;
			} else {
				$hasHoistableContent = true;
			}
			$sibling = $next;
		}

		if ( $hasHoistableContent ) {
			// soak up all the non-content nodes (exclude sibling)
			$move = self::firstChild( $node, $rtl );
			$firstNode = $move;
			while ( $move !== $sibling ) {
				$refnode = $rtl ? DiffDOMUtils::nextNonDeletedSibling( $node ) : $node;
				$node->parentNode->insertBefore( $move, $refnode );
				$move = self::firstChild( $node, $rtl );
			}

			// and drop any leading whitespace
			if ( $sibling instanceof Text ) {
				$sibling->nodeValue = $rtl ? rtrim( $sibling->nodeValue ) : ltrim( $sibling->nodeValue );
			}

			// Update diff markers
			$this->addDiffMarks( $firstNode, DiffMarkers::MOVED, true );
			if ( $sibling ) {
				$this->addDiffMarks( $sibling, DiffMarkers::MOVED, true );
			}
			$this->addDiffMarks( $node, DiffMarkers::CHILDREN_CHANGED, true );
			$this->addDiffMarks( $node->parentNode, DiffMarkers::CHILDREN_CHANGED );
		}
	}

	/**
	 * Remove $node from the DOM if it is essentially empty.
	 *
	 * @param Element $node
	 * @return Node|null $node itself if it was kept; otherwise its next
	 *   non-deleted sibling (null if there is none).
	 */
	public function stripIfEmpty( Element $node ): ?Node {
		$next = DiffDOMUtils::nextNonDeletedSibling( $node );
		// $dp is only referenced by the disabled condition below.
		$dp = DOMDataUtils::getDataParsoid( $node );

		$strippable =
			DiffDOMUtils::nodeEssentiallyEmpty( $node, false );
			// Ex: "<a..>..</a><b></b>bar"
			// From [[Foo]]<b/>bar usage found on some dewiki pages.
			// FIXME: Should we enable this?
			// !( false /* used to be rt-test mode */ && ( $dp->stx ?? null ) === 'html' );

		if ( $strippable ) {
			// Update diff markers (before the deletion)
			$this->addDiffMarks( $node, DiffMarkers::DELETED, true );
			$node->parentNode->removeChild( $node );
			return $next;
		} else {
			return $node;
		}
	}

	/**
	 * Strip trailing whitespace from $node's last non-deleted child (if it
	 * is a text node) and, where a following sibling exists that does not
	 * already start with whitespace, re-add that whitespace just after $node.
	 *
	 * If there is no following sibling, or it is a text node that already
	 * starts with whitespace, the stripped whitespace is dropped.
	 *
	 * @param Node $node
	 */
	public function moveTrailingSpacesOut( Node $node ): void {
		$next = DiffDOMUtils::nextNonDeletedSibling( $node );
		$last = DiffDOMUtils::lastNonDeletedChild( $node );
		$matches = null;
		if ( $last instanceof Text &&
			preg_match( '/\s+$/D', $last->nodeValue, $matches ) > 0
		) {
			$trailing = $matches[0];
			$last->nodeValue = substr( $last->nodeValue, 0, -strlen( $trailing ) );
			// Try to be a little smarter and drop the spaces if possible.
			if ( $next && ( !( $next instanceof Text ) || !preg_match( '/^\s+/', $next->nodeValue ) ) ) {
				if ( !( $next instanceof Text ) ) {
					// No text node to prepend to: create one right after $node
					$txt = $node->ownerDocument->createTextNode( '' );
					$node->parentNode->insertBefore( $txt, $next );
					$next = $txt;
				}
				$next->nodeValue = $trailing . $next->nodeValue;
				// next (a text node) is new / had new content added to it
				$this->addDiffMarks( $next, DiffMarkers::INSERTED, true );
			}
			$this->addDiffMarks( $last, DiffMarkers::INSERTED, true );
			$this->addDiffMarks( $node->parentNode, DiffMarkers::CHILDREN_CHANGED );
		}
	}

	/**
	 * Recursively replace every <br> in the subtree under $node with a
	 * text node containing a single space.
	 *
	 * @param Element $node
	 */
	public function stripBRs( Element $node ): void {
		$child = $node->firstChild;
		while ( $child ) {
			// Grab the successor first: $child may be removed below
			$next = $child->nextSibling;
			if ( DOMCompat::nodeName( $child ) === 'br' ) {
				// replace <br/> with a single space
				$node->removeChild( $child );
				$node->insertBefore( $node->ownerDocument->createTextNode( ' ' ), $next );
			} elseif ( $child instanceof Element ) {
				$this->stripBRs( $child );
			}
			$child = $next;
		}
	}

	/**
	 * Strip trailing LRM / RLM characters (U+200E / U+200F) from a text
	 * node that is adjacent to a category link, when the text node is the
	 * last sibling or is immediately followed by a category link.
	 *
	 * FIXME see
	 * https://gerrit.wikimedia.org/r/#/c/mediawiki/services/parsoid/+/500975/7/src/Html2Wt/DOMNormalizer.php@423
	 * @param Node $node
	 * @return Node|null $node if it is still in the DOM; if the text node
	 *   became empty and was removed, its next non-deleted sibling (or null).
	 */
	public function stripBidiCharsAroundCategories( Node $node ): ?Node {
		if ( !( $node instanceof Text ) ||
			( !WTUtils::isCategoryLink( $node->previousSibling ) &&
				!WTUtils::isCategoryLink( $node->nextSibling ) )
		) {
			// Not a text node or not adjacent to a category link
			return $node;
		}

		$next = $node->nextSibling;
		if ( !$next || WTUtils::isCategoryLink( $next ) ) {
			// The following can leave behind an empty text node.
			$oldLength = strlen( $node->nodeValue );
			$node->nodeValue = preg_replace(
				'/([\x{200e}\x{200f}]+\n)?[\x{200e}\x{200f}]+$/uD',
				'',
				$node->nodeValue
			);
			$newLength = strlen( $node->nodeValue );

			if ( $oldLength !== $newLength ) {
				// Log changes for editors benefit
				$this->state->getEnv()->log( 'warn/html2wt/bidi',
					'LRM/RLM unicode chars stripped around categories'
				);
			}

			if ( $newLength === 0 ) {
				// Remove empty text nodes to keep DOM in normalized form
				$ret = DiffDOMUtils::nextNonDeletedSibling( $node );
				// Update diff markers before the deletion: addDiffMarks()
				// needs $node->parentNode, which is gone once $node is detached.
				$this->addDiffMarks( $node, DiffMarkers::DELETED );
				$node->parentNode->removeChild( $node );
				return $ret;
			}

			// Treat modified node as having been newly inserted
			$this->addDiffMarks( $node, DiffMarkers::INSERTED );
		}
		return $node;
	}

	/**
	 * When an A tag is encountered, if there are format tags inside, move them outside
	 * Also merge a single sibling A tag that is mergable
	 * The link href and text must match for this normalization to take effect
	 *
	 * @param Element $node
	 * @return Node|null $node if no format tag was hoisted; otherwise the
	 *   element that was $node's first non-deleted child before the swaps.
	 */
	public function moveFormatTagOutsideATag( Element $node ): ?Node {
		if ( DOMCompat::nodeName( $node ) !== 'a' ) {
			return $node;
		}
		$sibling = DiffDOMUtils::nextNonDeletedSibling( $node );
		if ( $sibling ) {
			$this->normalizeSiblingPair( $node, $sibling );
		}

		$firstChild = DiffDOMUtils::firstNonDeletedChild( $node );
		$fcNextSibling = null;
		if ( $firstChild ) {
			$fcNextSibling = DiffDOMUtils::nextNonDeletedSibling( $firstChild );
		}

		if ( !$node->hasAttribute( 'href' ) ) {
			return $node;
		}
		$nodeHref = DOMCompat::getAttribute( $node, 'href' ) ?? '';

		// If there are no tags to swap, we are done
		if ( $firstChild instanceof Element &&
			// No reordering possible with multiple children
			$fcNextSibling === null &&
			// Do not normalize WikiLinks with these attributes
			!$firstChild->hasAttribute( 'color' ) &&
			!$firstChild->hasAttribute( 'style' ) &&
			!$firstChild->hasAttribute( 'class' ) &&
			// Compare textContent to the href, noting that this matching doesn't handle all
			// possible simple-wiki-link scenarios that isSimpleWikiLink in link handler tackles
			$node->textContent === PHPUtils::stripPrefix( $nodeHref, './' )
		) {
			// Keep swapping while $node's first child is a formatting element
			for (
				$child = DiffDOMUtils::firstNonDeletedChild( $node );
				DOMUtils::isFormattingElt( $child );
				$child = DiffDOMUtils::firstNonDeletedChild( $node )
			) {
				'@phan-var Element $child'; // @var Element $child
				$this->swap( $node, $child );
			}
			return $firstChild;
		}

		return $node;
	}

	/**
	 * Wikitext normalizations implemented right now:
	 *
	 * 1. Tag minimization (I/B tags) in normalizeSiblingPair
	 * 2. Strip empty headings and style tags
	 * 3. Force SOL transparent links to serialize before/after heading
	 * 4. Trailing spaces are migrated out of links
	 * 5. Space is added before escapable prefixes in table cells
	 * 6. Strip <br/> from headings
	 * 7. Strip bidi chars around categories
	 * 8. When an A tag is encountered, if there are format tags inside, move them outside
	 *
	 * The return value from this function should respect the
	 * following contract:
	 * - if input node is unmodified, return it.
	 * - if input node is modified, return the new node
	 *   that it transforms into.
	 * If you return a node other than this, normalizations may not
	 * apply cleanly and may be skipped.
	 *
	 * @param Node $node
	 * @return Node|null the normalized node
	 */
	public function normalizeNode( Node $node ): ?Node {
		$dp = null;
		$nodeName = DOMCompat::nodeName( $node );
		if ( $nodeName === 'th' || $nodeName === 'td' ) {
			'@phan-var Element $node'; // @var Element $node
			$dp = DOMDataUtils::getDataParsoid( $node );
			// Table cells (td/th) previously used the stx_v flag for single-row syntax.
			// Newer code uses stx flag since that is used everywhere else.
			// While we still have old HTML in cache / storage, accept
			// the stx_v flag as well.
			// TODO: We are at html version 1.5.0 now. Once storage
			// no longer has version 1.5.0 content, we can get rid of
			// this b/c code.
			if ( isset( $dp->stx_v ) ) {
				// HTML (stx='html') elements will not have the stx_v flag set
				// since the single-row syntax only applies to native-wikitext.
				// So, we can safely override it here.
				$dp->stx = $dp->stx_v;
			}
		}

		$next = null;

		if ( $this->state->getEnv()->getSiteConfig()->scrubBidiChars() ) {
			// Strip bidirectional chars around categories
			// Note that this is being done everywhere,
			// not just in selser mode
			$next = $this->stripBidiCharsAroundCategories( $node );
			if ( $next !== $node ) {
				return $next;
			}
		}

		// Skip unmodified content
		if ( $this->state->selserMode && !DOMUtils::atTheTop( $node ) &&
			!$this->inInsertedContent &&
			!DiffUtils::hasDiffMarkers( $node ) &&
			// If orig-src is not valid, this in effect becomes
			// an edited node and needs normalizations applied to it.
			WTSUtils::origSrcValidInEditedContext( $this->state, $node )
		) {
			return $node;
		}

		// Headings
		if ( DOMUtils::isHeading( $node ) ) {
			'@phan-var Element $node'; // @var Element $node
			$this->hoistLinks( $node, false );
			$this->hoistLinks( $node, true );
			$this->stripBRs( $node );

			return $this->stripIfEmpty( $node );

			// Quote tags
		} elseif ( isset( Consts::$WTQuoteTags[$nodeName] ) ) {
			return $this->stripIfEmpty( $node );

			// Anchors
		} elseif ( $nodeName === 'a' ) {
			'@phan-var Element $node'; // @var Element $node
			$next = DiffDOMUtils::nextNonDeletedSibling( $node );
			// We could have checked for !mw:ExtLink but in
			// the case of links without any annotations,
			// the positive test is semantically safer than the
			// negative test.
			if ( DOMUtils::hasRel( $node, 'mw:WikiLink' ) &&
				$this->stripIfEmpty( $node ) !== $node
			) {
				return $next;
			}
			$this->moveTrailingSpacesOut( $node );

			return $this->moveFormatTagOutsideATag( $node );

			// Table cells
		} elseif ( $nodeName === 'td' ) {
			'@phan-var Element $node'; // @var Element $node
			$dp = DOMDataUtils::getDataParsoid( $node );
			// * HTML <td>s won't have escapable prefixes
			// * First cell should always be checked for escapable prefixes
			// * Second and later cells in a wikitext td row (with stx='row' flag)
			//   won't have escapable prefixes.
			$stx = $dp->stx ?? null;
			if ( $stx === 'html' ||
				( DiffDOMUtils::firstNonSepChild( $node->parentNode ) !== $node && $stx === 'row' ) ) {
				return $node;
			}

			$first = DiffDOMUtils::firstNonDeletedChild( $node );
			// Emit a space before escapable prefix
			// This is preferable to serializing with a nowiki.
			if ( $first instanceof Text && strspn( $first->nodeValue, '-+}', 0, 1 ) ) {
				$first->nodeValue = ' ' . $first->nodeValue;
				$this->addDiffMarks( $first, DiffMarkers::INSERTED, true );
			}

			return $node;

			// Font tags without any attributes
		} elseif ( $nodeName === 'font' && DOMDataUtils::noAttrs( $node ) ) {
			// Unwrap: hoist the children in place of the <font> and drop it
			$next = DiffDOMUtils::nextNonDeletedSibling( $node );
			DOMUtils::migrateChildren( $node, $node->parentNode, $node );
			$node->parentNode->removeChild( $node );

			return $next;
		} elseif ( $node instanceof Element && $nodeName === 'p'
			&& !WTUtils::isLiteralHTMLNode( $node ) ) {
			$next = DiffDOMUtils::nextNonSepSibling( $node );
			// Normalization of <p></p>, <p><br/></p>, <p><meta/></p> and the like to avoid
			// extraneous new lines
			if ( DiffDOMUtils::hasNChildren( $node, 1 ) &&
				WTUtils::isMarkerAnnotation( $node->firstChild )
			) {
				// Converts <p><meta /></p> (where meta is an annotation tag) to <meta /> without
				// the wrapping <p> (that would typically be added by VE) to avoid getting too many
				// newlines.
				$ann = $node->firstChild;
				DOMUtils::migrateChildren( $node, $node->parentNode, $node );
				$node->parentNode->removeChild( $node );
				return $ann;
			}

			if (
				// Don't apply normalization to <p></p> nodes that
				// were generated through deletions or other normalizations.
				// FIXME: This trick fails for non-selser mode since
				// diff markers are only added in selser mode.
				DiffDOMUtils::hasNChildren( $node, 0, true ) &&
				// FIXME: Also, skip if this is the only child.
				// Eliminates spurious test failures in non-selser mode.
				!DiffDOMUtils::hasNChildren( $node->parentNode, 1 ) &&
				// T184755: Convert sequences of <p></p> nodes to sequences of
				// <br/>, <p><br/>..other content..</p>, <p><br/><p/> to ensure
				// they serialize to as many newlines as the count of <p></p> nodes.
				// Also handles <p><meta/></p> case for annotations.
				$next && DOMCompat::nodeName( $next ) === 'p' &&
				!WTUtils::isLiteralHTMLNode( $next )
			) {
				// Replace 'node' (<p></p>) with a <br/> and make it the
				// first child of 'next' (<p>..</p>). If 'next' was actually
				// a <p></p> (i.e. empty), 'next' becomes <p><br/></p>
				// which will serialize to 2 newlines.
				$br = $node->ownerDocument->createElement( 'br' );
				$next->insertBefore( $br, $next->firstChild );

				// Avoid nested insertion markers
				if ( !$this->isInsertedContent( $next ) ) {
					$this->addDiffMarks( $br, DiffMarkers::INSERTED );
				}

				// Delete node
				// NOTE(review): the DELETED mark is added to the parent rather
				// than to $node itself -- confirm this is intended.
				$this->addDiffMarks( $node->parentNode, DiffMarkers::DELETED );
				$node->parentNode->removeChild( $node );

				// 'node' is gone; 'next' is what it was folded into.
				return $next;
			}

			// Either this <p> is not an empty paragraph we should touch, or
			// we cannot merge the <br/> with 'next' because it is not a
			// <p>..</p>. In both cases 'node' is unmodified, so per the
			// contract above we fall through and return it.
		}
		// Default
		return $node;
	}

	/**
	 * Tag minimization on a pair of adjacent siblings: merge them, or
	 * swap one of them with its sole child and then merge, if possible.
	 *
	 * @param Node $a
	 * @param Node $b The next non-deleted sibling of $a
	 * @return Node The merged node if a rewrite happened, otherwise $b
	 */
	public function normalizeSiblingPair( Node $a, Node $b ): Node {
		if ( !$this->rewriteablePair( $a, $b ) ) {
			return $b;
		}

		// Since 'a' and 'b' make a rewriteable tag-pair, we are good to go.
		if ( self::mergable( $a, $b ) ) {
			'@phan-var Element $a'; // @var Element $a
			'@phan-var Element $b'; // @var Element $b
			$a = $this->merge( $a, $b );
			// The new a's children have new siblings. So let's look
			// at a again. But their grandkids haven't changed,
			// so we don't need to recurse further.
			$this->processSubtree( $a, false );
			return $a;
		}

		if ( self::swappable( $a, $b ) ) {
			'@phan-var Element $a'; // @var Element $a
			'@phan-var Element $b'; // @var Element $b
			$firstNonDeletedChild = DiffDOMUtils::firstNonDeletedChild( $a );
			'@phan-var Element $firstNonDeletedChild'; // @var Element $firstNonDeletedChild
			$a = $this->merge( $this->swap( $a, $firstNonDeletedChild ), $b );
			// Again, a has new children, but the grandkids have already
			// been minimized.
			$this->processSubtree( $a, false );
			return $a;
		}

		if ( self::swappable( $b, $a ) ) {
			'@phan-var Element $a'; // @var Element $a
			'@phan-var Element $b'; // @var Element $b
			$firstNonDeletedChild = DiffDOMUtils::firstNonDeletedChild( $b );
			'@phan-var Element $firstNonDeletedChild'; // @var Element $firstNonDeletedChild
			$a = $this->merge( $a, $this->swap( $b, $firstNonDeletedChild ) );
			// Again, a has new children, but the grandkids have already
			// been minimized.
			$this->processSubtree( $a, false );
			return $a;
		}

		return $b;
	}

	/**
	 * Normalize the non-deleted children of $node, one by one and as
	 * adjacent pairs.
	 *
	 * @param Node $node
	 * @param bool $recurse Passed through to processNode() for each child
	 */
	public function processSubtree( Node $node, bool $recurse ): void {
		// Process the first child outside the loop.
		$a = DiffDOMUtils::firstNonDeletedChild( $node );
		if ( !$a ) {
			return;
		}

		$a = $this->processNode( $a, $recurse );
		while ( $a ) {
			// We need a pair of adjacent siblings for tag minimization.
			$b = DiffDOMUtils::nextNonDeletedSibling( $a );
			if ( !$b ) {
				return;
			}

			// Process subtree rooted at 'b'.
			$b = $this->processNode( $b, $recurse );

			// If we skipped over a bunch of nodes in the middle,
			// we no longer have a pair of adjacent siblings.
			if ( $b && DiffDOMUtils::previousNonDeletedSibling( $b ) === $a ) {
				// Process the pair.
				$a = $this->normalizeSiblingPair( $a, $b );
			} else {
				$a = $b;
			}
		}
	}

	/**
	 * Normalize $node (and, if $recurse, its subtree) repeatedly until
	 * normalizeNode() returns the node it was given.
	 *
	 * @param Node $node
	 * @param bool $recurse True: normalize the subtree first (post-order);
	 *   false: assume the subtree is already normalized.
	 * @return Node|null The stabilized node, or null if normalization ran
	 *   off the end of the sibling list.
	 */
	public function processNode( Node $node, bool $recurse ): ?Node {
		// Normalize 'node' and the subtree rooted at 'node'
		// recurse = true  => recurse and normalize subtree
		// recurse = false => assume the subtree is already normalized

		// Normalize node till it stabilizes
		$next = null;
		while ( true ) {
			// Skip templated content
			while ( $node && WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
				$node = WTUtils::skipOverEncapsulatedContent( $node );
			}

			if ( !$node ) {
				return null;
			}

			// Set insertion marker
			$insertedSubtree = DiffUtils::hasInsertedDiffMark( $node );
			if ( $insertedSubtree ) {
				if ( $this->inInsertedContent ) {
					// Dump debugging info
					$options = [ 'storeDiffMark' => true, 'saveData' => true ];
					$dump = ContentUtils::dumpDOM(
						DOMCompat::getBody( $node->ownerDocument ),
						'-- DOM triggering nested inserted dom-diff flags --',
						$options
					);
					$this->state->getEnv()->log( 'error/html2wt/dom',
						"--- Nested inserted dom-diff flags ---\n",
						'Node:',
						$node instanceof Element ? ContentUtils::toXML( $node, $options ) : $node->textContent,
						"\nNode's parent:",
						ContentUtils::toXML( $node->parentNode, $options ),
						$dump
					);
				}
				// FIXME: If this assert is removed, the above dumping code should
				// either be removed OR fixed up to remove uses of ContentUtils.ppToXML
				Assert::invariant( !$this->inInsertedContent, 'Found nested inserted dom-diff flags!' );
				$this->inInsertedContent = true;
			}

			// Post-order traversal: Process subtree first, and current node after.
			// This lets multiple normalizations take effect cleanly.
			if ( $recurse && $node instanceof Element ) {
				$this->processSubtree( $node, true );
			}

			$next = $this->normalizeNode( $node );

			// Clear insertion marker
			if ( $insertedSubtree ) {
				$this->inInsertedContent = false;
			}

			if ( $next === $node ) {
				return $node;
			} else {
				$node = $next;
			}
		}

		// @phan-suppress-next-line PhanPluginUnreachableCode
		throw new UnreachableException( 'Control should never get here!' );
	}

	/**
	 * Normalize the DOM rooted at $node, in place.
	 *
	 * @param Element|DocumentFragment $node
	 */
	public function normalize( Node $node ) {
		$this->processNode( $node, true );
	}
}