Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.32% |
1 / 317 |
|
4.76% |
1 / 21 |
CRAP | |
0.00% |
0 / 1 |
DOMNormalizer | |
0.32% |
1 / 317 |
|
4.76% |
1 / 21 |
23646.27 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
similar | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
132 | |||
mergable | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
swappable | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
firstChild | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
6 | |||
isInsertedContent | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
20 | |||
rewriteablePair | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
30 | |||
addDiffMarks | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
182 | |||
merge | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
30 | |||
swap | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
6 | |||
hoistLinks | |
0.00% |
0 / 26 |
|
0.00% |
0 / 1 |
156 | |||
stripIfEmpty | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
12 | |||
moveTrailingSpacesOut | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
56 | |||
stripBRs | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
20 | |||
stripBidiCharsAroundCategories | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
72 | |||
moveFormatTagOutsideATag | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
156 | |||
normalizeNode | |
0.00% |
0 / 66 |
|
0.00% |
0 / 1 |
1260 | |||
normalizeSiblingPair | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
30 | |||
processSubtree | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
42 | |||
processNode | |
0.00% |
0 / 34 |
|
0.00% |
0 / 1 |
156 | |||
normalize | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Assert\UnreachableException; |
8 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
9 | use Wikimedia\Parsoid\DOM\Element; |
10 | use Wikimedia\Parsoid\DOM\Node; |
11 | use Wikimedia\Parsoid\DOM\Text; |
12 | use Wikimedia\Parsoid\Utils\ContentUtils; |
13 | use Wikimedia\Parsoid\Utils\DOMCompat; |
14 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
15 | use Wikimedia\Parsoid\Utils\DOMUtils; |
16 | use Wikimedia\Parsoid\Utils\PHPUtils; |
17 | use Wikimedia\Parsoid\Utils\WTUtils; |
18 | use Wikimedia\Parsoid\Wikitext\Consts; |
19 | |
20 | /* |
21 | * Tag minimization |
22 | * ---------------- |
23 | * Minimize a pair of tags in the dom tree rooted at node. |
24 | * |
25 | * This function merges adjacent nodes of the same type |
26 | * and swaps nodes where possible to enable further merging. |
27 | * |
28 | * See examples below: |
29 | * |
30 | * 1. <b>X</b><b>Y</b> |
31 | * ==> <b>XY</b> |
32 | * |
33 | * 2. <i>A</i><b><i>X</i></b><b><i>Y</i></b><i>Z</i> |
34 | * ==> <i>A<b>XY</b>Z</i> |
35 | * |
36 | * 3. <a href="Football">Foot</a><a href="Football">ball</a> |
37 | * ==> <a href="Football">Football</a> |
38 | */ |
39 | |
40 | /** |
41 | * DOM normalization. |
42 | * |
43 | * DOM normalizations are performed after DOMDiff is run. |
44 | * So, normalization routines should update diff markers appropriately. |
45 | */ |
46 | class DOMNormalizer { |
47 | |
48 | private const IGNORABLE_ATTRS = [ |
49 | 'data-parsoid', 'id', 'title', DOMDataUtils::DATA_OBJECT_ATTR_NAME |
50 | ]; |
51 | private const HTML_IGNORABLE_ATTRS = [ 'data-parsoid', DOMDataUtils::DATA_OBJECT_ATTR_NAME ]; |
52 | |
53 | private static $specializedAttribHandlers; |
54 | |
55 | /** @var bool */ |
56 | private $inInsertedContent; |
57 | |
58 | /** @var SerializerState */ |
59 | private $state; |
60 | |
/**
 * Create a normalizer bound to the given serializer state.
 *
 * The shared table of specialized attribute comparators is built the
 * first time any instance is constructed and reused afterwards.
 *
 * @param SerializerState $state
 */
public function __construct( SerializerState $state ) {
	$this->state = $state;

	// We start outside of any subtree carrying an "inserted" diff marker.
	$this->inInsertedContent = false;

	if ( self::$specializedAttribHandlers ) {
		// Already initialized by an earlier instance.
		return;
	}

	self::$specializedAttribHandlers = [
		// data-mw values are compared by loose equality instead of
		// going through the default attribute comparison.
		'data-mw' => static function ( $nodeA, $dmwA, $nodeB, $dmwB ) {
			return $dmwA == $dmwB;
		}
	];
}
77 | |
/**
 * Do $a and $b look alike enough (attribute-wise) to be candidates for
 * tag minimization? Node names are NOT compared here; see mergable().
 *
 * @param Node $a
 * @param Node $b
 * @return bool
 */
private static function similar( Node $a, Node $b ): bool {
	if ( DOMCompat::nodeName( $a ) === 'a' ) {
		// FIXME: Similar to 1ce6a98, DOMUtils.nextNonDeletedSibling is being
		// used in this file where maybe DOMUtils.nextNonSepSibling belongs.
		return $a instanceof Element && $b instanceof Element &&
			DiffUtils::attribsEquals( $a, $b, self::IGNORABLE_ATTRS, self::$specializedAttribHandlers );
	}

	$aIsHtml = WTUtils::isLiteralHTMLNode( $a );
	$bIsHtml = WTUtils::isLiteralHTMLNode( $b );

	// FIXME: For non-HTML I/B tags, we seem to be dropping all attributes
	// in our tag handlers (which seems like a bug). Till that is fixed,
	// we'll preserve existing functionality here.
	if ( !$aIsHtml && !$bIsHtml ) {
		return true;
	}

	if ( !$aIsHtml || !$bIsHtml ) {
		// Exactly one of the two is a literal HTML node.
		return false;
	}

	// TODO (Anomie): attributes are only ever compared when both nodes are
	// literal HTML, so the HTML-specific ignore list is the only one that
	// can apply on this path.
	return $a instanceof Element && $b instanceof Element &&
		DiffUtils::attribsEquals( $a, $b, self::HTML_IGNORABLE_ATTRS, self::$specializedAttribHandlers );
}
106 | |
/**
 * Can $a and $b be merged into a single node? They must share a node
 * name and pass the similar() check.
 *
 * @param Node $a
 * @param Node $b
 * @return bool
 */
private static function mergable( Node $a, Node $b ): bool {
	if ( DOMCompat::nodeName( $a ) !== DOMCompat::nodeName( $b ) ) {
		return false;
	}
	return self::similar( $a, $b );
}
116 | |
/**
 * Would swapping $a with its only non-deleted child make the result
 * mergable with $b?
 *
 * For example: A='<b><i>x</i></b>' b='<i>y</i>' => '<i><b>x</b>y</i>'.
 *
 * @param Node $a
 * @param Node $b
 * @return bool
 */
private static function swappable( Node $a, Node $b ): bool {
	if ( DOMUtils::numNonDeletedChildNodes( $a ) !== 1 ) {
		return false;
	}

	$onlyChild = DOMUtils::firstNonDeletedChild( $a );

	return self::similar( $a, $onlyChild )
		&& self::mergable( $onlyChild, $b );
}
131 | |
/**
 * Direction-aware "first" non-deleted child of $node: the last
 * non-deleted child when walking right-to-left, the first otherwise.
 *
 * @param Node $node
 * @param bool $rtl
 * @return Node|null
 */
private static function firstChild( Node $node, bool $rtl ): ?Node {
	if ( $rtl ) {
		return DOMUtils::lastNonDeletedChild( $node );
	}
	return DOMUtils::firstNonDeletedChild( $node );
}
140 | |
/**
 * Is $node, or any of its ancestors up to the top of the tree, carrying
 * an "inserted" diff mark?
 *
 * @param Node $node
 * @return bool
 */
private function isInsertedContent( Node $node ): bool {
	for (
		$cur = $node;
		!DiffUtils::hasInsertedDiffMark( $cur, $this->state->getEnv() );
		$cur = $cur->parentNode
	) {
		if ( DOMUtils::atTheTop( $cur ) ) {
			// Ran out of ancestors without finding an inserted marker.
			return false;
		}
	}
	return true;
}
156 | |
/**
 * Is ($a, $b) a pair of tags that tag minimization is allowed to rewrite?
 *
 * @param Node $a
 * @param Node $b
 * @return bool
 */
private function rewriteablePair( Node $a, Node $b ): bool {
	$nameA = DOMCompat::nodeName( $a );
	$nameB = DOMCompat::nodeName( $b );

	if ( isset( Consts::$WTQuoteTags[$nameA] ) ) {
		// For <i>/<b> pair, we need not check whether the node being transformed
		// are new / edited, etc. since these minimization scenarios can
		// never show up in HTML that came from parsed wikitext.
		//
		// <i>..</i><i>..</i> can never show up without a <nowiki/> in between.
		// Similarly for <b>..</b><b>..</b> and <b><i>..</i></b><i>..</i>.
		//
		// This is because a sequence of 4 quotes is not parsed as ..</i><i>..
		// Neither is a sequence of 7 quotes parsed as ..</i></b><i>..
		//
		// So, if we see a minimizable pair of nodes, it is because the HTML
		// didn't originate from wikitext OR the HTML has been subsequently edited.
		// In both cases, we want to transform the DOM.
		return isset( Consts::$WTQuoteTags[$nameB] );
	}

	if ( $nameA !== 'a' || $nameB !== 'a' ) {
		return false;
	}

	// For <a> tags, we require at least one of the two tags
	// to be a newly created element.
	return WTUtils::isNewElt( $a ) || WTUtils::isNewElt( $b );
}
186 | |
/**
 * Record a diff mark on $node and, unless told otherwise, flag its
 * ancestors as having a changed subtree. Does nothing outside of
 * selser mode or when $node already carries $mark.
 *
 * @param Node $node
 * @param string $mark
 * @param bool $dontRecurse If true, ancestors are not marked 'subtree-changed'
 */
public function addDiffMarks( Node $node, string $mark, bool $dontRecurse = false ): void {
	$env = $this->state->getEnv();

	if ( !$this->state->selserMode ) {
		return;
	}
	if ( DiffUtils::hasDiffMark( $node, $env, $mark ) ) {
		return;
	}

	// Don't introduce nested inserted markers
	if ( $mark === 'inserted' && $this->inInsertedContent ) {
		return;
	}

	// Newly added elements don't need diff marks
	if ( !WTUtils::isNewElt( $node ) ) {
		DiffUtils::addDiffMark( $node, $env, $mark );
		if ( in_array( $mark, [ 'inserted', 'deleted' ], true ) ) {
			// An insertion / deletion changes the parent's child list.
			DiffUtils::addDiffMark( $node->parentNode, $env, 'children-changed' );
		}
	}

	if ( $dontRecurse ) {
		return;
	}

	// Walk up the subtree and add 'subtree-changed' markers
	for (
		$ancestor = $node->parentNode;
		$ancestor instanceof Element && !DOMUtils::atTheTop( $ancestor );
		$ancestor = $ancestor->parentNode
	) {
		if ( DiffUtils::hasDiffMark( $ancestor, $env, 'subtree-changed' ) ) {
			// Stop at the first ancestor that is already marked.
			return;
		}
		if ( !WTUtils::isNewElt( $ancestor ) ) {
			DiffUtils::setDiffMark( $ancestor, $env, 'subtree-changed' );
		}
	}
}
227 | |
/**
 * Transfer all of b's children to a and delete b.
 *
 * @param Element $a
 * @param Element $b
 * @return Element $a, now holding b's former children
 */
public function merge( Element $a, Element $b ): Element {
	// First node that will move from b into a (null if b is empty).
	$firstMoved = $b->firstChild;

	// Migrate any intermediate nodes (usually 0 / 1 diff markers)
	// present between a and b to a
	$between = $a->nextSibling;
	if ( $between !== $b ) {
		$a->appendChild( $between );
	}

	// The real work of merging
	DOMUtils::migrateChildren( $b, $a );
	$b->parentNode->removeChild( $b );

	// Normalize the node to merge any adjacent text nodes
	DOMCompat::normalize( $a );

	// Update diff markers
	$this->addDiffMarks( $a->parentNode, 'children-changed' ); // $b was removed
	$this->addDiffMarks( $a, 'children-changed' ); // $a got more children

	// Nodes starting at $firstMoved were inserted into 'a'.
	// Only mark $firstMoved if it is still part of the DOM, i.e. it
	// hasn't been deleted by the normalize() call above.
	if ( !DOMUtils::isRemoved( $firstMoved ) && $firstMoved->parentNode ) {
		$this->addDiffMarks( $firstMoved, 'moved', true );
	}

	$following = $a->nextSibling;
	if ( $following ) {
		// FIXME: Hmm .. there is an API hole here
		// about ability to add markers after last child
		$this->addDiffMarks( $following, 'moved', true );
	}

	return $a;
}
271 | |
/**
 * b is a's sole non-deleted child. Switch them around, so that b takes
 * a's place in the tree and a becomes b's child holding b's old children.
 *
 * @param Element $a
 * @param Element $b
 * @return Element $b, the new outer element
 */
public function swap( Element $a, Element $b ): Element {
	DOMUtils::migrateChildren( $b, $a );
	$a->parentNode->insertBefore( $b, $a );
	$b->appendChild( $a );

	// Mark a's subtree, a, and b as all having moved
	$inner = $a->firstChild;
	if ( $inner !== null ) {
		$this->addDiffMarks( $inner, 'moved', true );
	}
	foreach ( [ 'moved', 'children-changed' ] as $mark ) {
		foreach ( [ $a, $b ] as $elt ) {
			$this->addDiffMarks( $elt, $mark, true );
		}
	}
	$this->addDiffMarks( $b->parentNode, 'children-changed' );

	return $b;
}
295 | |
/**
 * Move the leading (or, with $rtl, trailing) run of rendering-transparent
 * content nodes out of $node so that they become siblings of $node.
 *
 * @param Element $node
 * @param bool $rtl If true, work from the last child backwards
 */
public function hoistLinks( Element $node, bool $rtl ): void {
	$hasHoistableContent = false;

	// Find the first child (in walk direction) that must stay put.
	$sibling = self::firstChild( $node, $rtl );
	while ( $sibling ) {
		if ( DOMUtils::isContentNode( $sibling ) ) {
			if ( !WTUtils::isRenderingTransparentNode( $sibling ) ||
				WTUtils::isEncapsulationWrapper( $sibling )
			) {
				// Don't venture into templated content
				break;
			}
			$hasHoistableContent = true;
		}
		$sibling = $rtl
			? DOMUtils::previousNonDeletedSibling( $sibling )
			: DOMUtils::nextNonDeletedSibling( $sibling );
	}

	if ( !$hasHoistableContent ) {
		return;
	}

	// soak up all the non-content nodes (exclude sibling)
	$firstNode = self::firstChild( $node, $rtl );
	for ( $move = $firstNode; $move !== $sibling; $move = self::firstChild( $node, $rtl ) ) {
		$refnode = $rtl ? DOMUtils::nextNonDeletedSibling( $node ) : $node;
		$node->parentNode->insertBefore( $move, $refnode );
	}

	// and drop any leading whitespace
	if ( $sibling instanceof Text ) {
		$sibling->nodeValue = $rtl ? rtrim( $sibling->nodeValue ) : ltrim( $sibling->nodeValue );
	}

	// Update diff markers
	$this->addDiffMarks( $firstNode, 'moved', true );
	if ( $sibling ) {
		$this->addDiffMarks( $sibling, 'moved', true );
	}
	$this->addDiffMarks( $node, 'children-changed', true );
	$this->addDiffMarks( $node->parentNode, 'children-changed' );
}
345 | |
/**
 * Remove $node from the DOM if it is essentially empty.
 *
 * @param Element $node
 * @return Node|null $node itself if it was kept; otherwise its next
 *   non-deleted sibling (possibly null)
 */
public function stripIfEmpty( Element $node ): ?Node {
	// Capture the successor first: once $node is detached it has no siblings.
	$successor = DOMUtils::nextNonDeletedSibling( $node );

	// NOTE(review): the data-parsoid lookup is only needed by the disabled
	// check quoted below; the call is retained in case it has side effects.
	$dp = DOMDataUtils::getDataParsoid( $node );

	// Ex: "<a..>..</a><b></b>bar"
	// From [[Foo]]<b/>bar usage found on some dewiki pages.
	// FIXME: Should we enable this?
	// !( false /* used to be rt-test mode */ && ( $dp->stx ?? null ) === 'html' );
	if ( !DOMUtils::nodeEssentiallyEmpty( $node, false ) ) {
		return $node;
	}

	// Update diff markers (before the deletion)
	$this->addDiffMarks( $node, 'deleted', true );
	$node->parentNode->removeChild( $node );

	return $successor;
}
371 | |
/**
 * Strip trailing whitespace from $node's last non-deleted text child and,
 * where it is still needed, re-add it just after $node.
 *
 * @param Node $node
 */
public function moveTrailingSpacesOut( Node $node ): void {
	$next = DOMUtils::nextNonDeletedSibling( $node );
	$last = DOMUtils::lastNonDeletedChild( $node );

	if ( !( $last instanceof Text ) ) {
		return;
	}

	$matches = null;
	if ( !( preg_match( '/\s+$/D', $last->nodeValue, $matches ) > 0 ) ) {
		// No trailing whitespace to move.
		return;
	}

	$trailing = $matches[0];
	$last->nodeValue = substr( $last->nodeValue, 0, -strlen( $trailing ) );

	// Try to be a little smarter and drop the spaces if possible:
	// they are dropped when there is no following sibling, or when the
	// following text already starts with whitespace.
	if ( $next ) {
		$nextIsText = $next instanceof Text;
		if ( !$nextIsText || !preg_match( '/^\s+/', $next->nodeValue ) ) {
			if ( !$nextIsText ) {
				// Need a text node to carry the whitespace.
				$txt = $node->ownerDocument->createTextNode( '' );
				$node->parentNode->insertBefore( $txt, $next );
				$next = $txt;
			}
			$next->nodeValue = $trailing . $next->nodeValue;
			// next (a text node) is new / had new content added to it
			$this->addDiffMarks( $next, 'inserted', true );
		}
	}

	$this->addDiffMarks( $last, 'inserted', true );
	$this->addDiffMarks( $node->parentNode, 'children-changed' );
}
399 | |
/**
 * Recursively replace every <br> below $node with a single-space text node.
 *
 * @param Element $node
 */
public function stripBRs( Element $node ): void {
	for ( $child = $node->firstChild; $child; $child = $following ) {
		// Grab the successor up front; $child may be detached below.
		$following = $child->nextSibling;

		if ( DOMCompat::nodeName( $child ) === 'br' ) {
			// replace <br/> with a single space
			$node->removeChild( $child );
			$space = $node->ownerDocument->createTextNode( ' ' );
			$node->insertBefore( $space, $following );
			continue;
		}

		if ( $child instanceof Element ) {
			$this->stripBRs( $child );
		}
	}
}
417 | |
/**
 * Strip trailing LRM / RLM characters from a text node that is adjacent
 * to a category link, removing the text node if nothing is left.
 *
 * FIXME see
 * https://gerrit.wikimedia.org/r/#/c/mediawiki/services/parsoid/+/500975/7/src/Html2Wt/DOMNormalizer.php@423
 *
 * @param Node $node
 * @return Node|null $node if it is still in the DOM; otherwise the
 *   next non-deleted sibling of the removed text node (possibly null)
 */
public function stripBidiCharsAroundCategories( Node $node ): ?Node {
	if ( !( $node instanceof Text ) ||
		( !WTUtils::isCategoryLink( $node->previousSibling ) &&
			!WTUtils::isCategoryLink( $node->nextSibling ) )
	) {
		// Not a text node and not adjacent to a category link
		return $node;
	}

	$next = $node->nextSibling;
	if ( !$next || WTUtils::isCategoryLink( $next ) ) {
		// The following can leave behind an empty text node.
		$oldLength = strlen( $node->nodeValue );
		$node->nodeValue = preg_replace(
			'/([\x{200e}\x{200f}]+\n)?[\x{200e}\x{200f}]+$/uD',
			'',
			$node->nodeValue
		);
		$newLength = strlen( $node->nodeValue );

		if ( $oldLength !== $newLength ) {
			// Log changes for editors benefit
			$this->state->getEnv()->log( 'warn/html2wt/bidi',
				'LRM/RLM unicode chars stripped around categories'
			);
		}

		if ( $newLength === 0 ) {
			// Remove empty text nodes to keep DOM in normalized form
			$ret = DOMUtils::nextNonDeletedSibling( $node );
			// Update diff markers before the deletion: addDiffMarks()
			// reaches the parent and ancestors through $node->parentNode,
			// which is null once the node has been detached.
			$this->addDiffMarks( $node, 'deleted' );
			$node->parentNode->removeChild( $node );
			return $ret;
		}

		// Treat modified node as having been newly inserted
		// NOTE(review): this mark is added even when no characters were
		// stripped ($oldLength === $newLength) -- confirm that is intended.
		$this->addDiffMarks( $node, 'inserted' );
	}
	return $node;
}
464 | |
/**
 * When an A tag is encountered, if there are format tags inside, move them outside
 * Also merge a single sibling A tag that is mergable
 * The link href and text must match for this normalization to take effect
 *
 * @param Element $node
 * @return Node|null $node when no format tag was hoisted; otherwise the
 *   former first child of $node, which now wraps the link
 */
public function moveFormatTagOutsideATag( Element $node ): ?Node {
	if ( DOMCompat::nodeName( $node ) !== 'a' ) {
		return $node;
	}

	// Give tag minimization a chance on ($node, following sibling) first.
	$sibling = DOMUtils::nextNonDeletedSibling( $node );
	if ( $sibling ) {
		$this->normalizeSiblingPair( $node, $sibling );
	}

	$firstChild = DOMUtils::firstNonDeletedChild( $node );
	$fcNextSibling = $firstChild ? DOMUtils::nextNonDeletedSibling( $firstChild ) : null;

	if ( !$node->hasAttribute( 'href' ) ) {
		return $node;
	}
	$nodeHref = $node->getAttribute( 'href' );

	// If there are no tags to swap, we are done.
	// No reordering possible with multiple children either.
	if ( !( $firstChild instanceof Element ) || $fcNextSibling !== null ) {
		return $node;
	}

	// Do not normalize WikiLinks with these attributes
	foreach ( [ 'color', 'style', 'class' ] as $attr ) {
		if ( $firstChild->hasAttribute( $attr ) ) {
			return $node;
		}
	}

	// Compare textContent to the href, noting that this matching doesn't handle all
	// possible simple-wiki-link scenarios that isSimpleWikiLink in link handler tackles
	if ( $node->textContent !== PHPUtils::stripPrefix( $nodeHref, './' ) ) {
		return $node;
	}

	// Keep swapping the link with its first child for as long as that
	// child is a formatting element.
	$child = DOMUtils::firstNonDeletedChild( $node );
	while ( DOMUtils::isFormattingElt( $child ) ) {
		'@phan-var Element $child'; // @var Element $child
		$this->swap( $node, $child );
		$child = DOMUtils::firstNonDeletedChild( $node );
	}

	return $firstChild;
}
517 | |
/**
 * Wikitext normalizations implemented right now:
 *
 * 1. Tag minimization (I/B tags) in normalizeSiblingPair
 * 2. Strip empty headings and style tags
 * 3. Force SOL transparent links to serialize before/after heading
 * 4. Trailing spaces are migrated out of links
 * 5. Space is added before escapable prefixes in table cells
 * 6. Strip <br/> from headings
 * 7. Strip bidi chars around categories
 * 8. When an A tag is encountered, if there are format tags inside, move them outside
 *
 * The return value from this function should respect the
 * following contract:
 * - if input node is unmodified, return it.
 * - if input node is modified, return the new node
 *   that it transforms into.
 * If you return a node other than this, normalizations may not
 * apply cleanly and may be skipped.
 *
 * @param Node $node
 * @return Node|null the normalized node
 */
public function normalizeNode( Node $node ): ?Node {
	$dp = null;
	if ( DOMCompat::nodeName( $node ) === 'th' || DOMCompat::nodeName( $node ) === 'td' ) {
		'@phan-var Element $node'; // @var Element $node
		$dp = DOMDataUtils::getDataParsoid( $node );
		// Table cells (td/th) previously used the stx_v flag for single-row syntax.
		// Newer code uses stx flag since that is used everywhere else.
		// While we still have old HTML in cache / storage, accept
		// the stx_v flag as well.
		// TODO: We are at html version 1.5.0 now. Once storage
		// no longer has version 1.5.0 content, we can get rid of
		// this b/c code.
		if ( isset( $dp->stx_v ) ) {
			// HTML (stx='html') elements will not have the stx_v flag set
			// since the single-row syntax only applies to native-wikitext.
			// So, we can safely override it here.
			$dp->stx = $dp->stx_v;
		}
	}

	$next = null;

	if ( $this->state->getEnv()->getSiteConfig()->scrubBidiChars() ) {
		// Strip bidirectional chars around categories
		// Note that this is being done everywhere,
		// not just in selser mode
		$next = $this->stripBidiCharsAroundCategories( $node );
		if ( $next !== $node ) {
			return $next;
		}
	}

	// Skip unmodified content
	if ( $this->state->selserMode && !DOMUtils::atTheTop( $node ) &&
		!$this->inInsertedContent &&
		!DiffUtils::hasDiffMarkers( $node, $this->state->getEnv() ) &&
		// If orig-src is not valid, this in effect becomes
		// an edited node and needs normalizations applied to it.
		WTSUtils::origSrcValidInEditedContext( $this->state, $node )
	) {
		return $node;
	}

	// Headings
	if ( preg_match( '/^h[1-6]$/D', DOMCompat::nodeName( $node ) ) ) {
		'@phan-var Element $node'; // @var Element $node
		$this->hoistLinks( $node, false );
		$this->hoistLinks( $node, true );
		$this->stripBRs( $node );

		return $this->stripIfEmpty( $node );

		// Quote tags
	} elseif ( isset( Consts::$WTQuoteTags[DOMCompat::nodeName( $node )] ) ) {
		return $this->stripIfEmpty( $node );

		// Anchors
	} elseif ( DOMCompat::nodeName( $node ) === 'a' ) {
		'@phan-var Element $node'; // @var Element $node
		$next = DOMUtils::nextNonDeletedSibling( $node );
		// We could have checked for !mw:ExtLink but in
		// the case of links without any annotations,
		// the positive test is semantically safer than the
		// negative test.
		if ( DOMUtils::hasRel( $node, 'mw:WikiLink' ) &&
			$this->stripIfEmpty( $node ) !== $node
		) {
			return $next;
		}
		$this->moveTrailingSpacesOut( $node );

		return $this->moveFormatTagOutsideATag( $node );

		// Table cells
	} elseif ( DOMCompat::nodeName( $node ) === 'td' ) {
		'@phan-var Element $node'; // @var Element $node
		$dp = DOMDataUtils::getDataParsoid( $node );
		// * HTML <td>s won't have escapable prefixes
		// * First cell should always be checked for escapable prefixes
		// * Second and later cells in a wikitext td row (with stx='row' flag)
		// won't have escapable prefixes.
		$stx = $dp->stx ?? null;
		if ( $stx === 'html' ||
			( DOMUtils::firstNonSepChild( $node->parentNode ) !== $node && $stx === 'row' ) ) {
			return $node;
		}

		$first = DOMUtils::firstNonDeletedChild( $node );
		// Emit a space before escapable prefix
		// This is preferable to serializing with a nowiki.
		if ( $first instanceof Text && strspn( $first->nodeValue, '-+}', 0, 1 ) ) {
			$first->nodeValue = ' ' . $first->nodeValue;
			$this->addDiffMarks( $first, 'inserted', true );
		}

		return $node;

		// Font tags without any attributes
	} elseif ( DOMCompat::nodeName( $node ) === 'font' && DOMDataUtils::noAttrs( $node ) ) {
		$next = DOMUtils::nextNonDeletedSibling( $node );
		DOMUtils::migrateChildren( $node, $node->parentNode, $node );
		$node->parentNode->removeChild( $node );

		return $next;

		// Wikitext paragraphs
	} elseif ( $node instanceof Element && DOMCompat::nodeName( $node ) === 'p'
		&& !WTUtils::isLiteralHTMLNode( $node ) ) {
		$next = DOMUtils::nextNonSepSibling( $node );
		// Normalization of <p></p>, <p><br/></p>, <p><meta/></p> and the like to avoid
		// extraneous new lines
		if ( DOMUtils::hasNChildren( $node, 1 ) &&
			WTUtils::isMarkerAnnotation( $node->firstChild )
		) {
			// Converts <p><meta /></p> (where meta is an annotation tag) to <meta /> without
			// the wrapping <p> (that would typically be added by VE) to avoid getting too many
			// newlines.
			$ann = $node->firstChild;
			DOMUtils::migrateChildren( $node, $node->parentNode, $node );
			$node->parentNode->removeChild( $node );
			return $ann;
		} elseif (
			// Don't apply normalization to <p></p> nodes that
			// were generated through deletions or other normalizations.
			// FIXME: This trick fails for non-selser mode since
			// diff markers are only added in selser mode.
			DOMUtils::hasNChildren( $node, 0, true ) &&
			// FIXME: Also, skip if this is the only child.
			// Eliminates spurious test failures in non-selser mode.
			!DOMUtils::hasNChildren( $node->parentNode, 1 )
		) {
			// T184755: Convert sequences of <p></p> nodes to sequences of
			// <br/>, <p><br/>..other content..</p>, <p><br/><p/> to ensure
			// they serialize to as many newlines as the count of <p></p> nodes.
			// Also handles <p><meta/></p> case for annotations.
			if ( $next && DOMCompat::nodeName( $next ) === 'p' &&
				!WTUtils::isLiteralHTMLNode( $next ) ) {
				// Replace 'node' (<p></p>) with a <br/> and make it the
				// first child of 'next' (<p>..</p>). If 'next' was actually
				// a <p></p> (i.e. empty), 'next' becomes <p><br/></p>
				// which will serialize to 2 newlines.
				$br = $node->ownerDocument->createElement( 'br' );
				$next->insertBefore( $br, $next->firstChild );

				// Avoid nested insertion markers
				if ( !$this->isInsertedContent( $next ) ) {
					$this->addDiffMarks( $br, 'inserted' );
				}

				// Delete node
				// NOTE(review): the 'deleted' mark is applied to the parent
				// rather than to 'node' itself -- confirm this is intended.
				$this->addDiffMarks( $node->parentNode, 'deleted' );
				$node->parentNode->removeChild( $node );

				// 'node' is gone; 'next' is the node it was folded into.
				return $next;
			}
			// We cannot merge the <br/> with 'next' because
			// it is not a <p>..</p>. 'node' is left untouched.
		}
		// An untouched paragraph falls through to the default below, so
		// that the "return the input node if unmodified" contract holds.
	}
	// Default
	return $node;
}
701 | |
/**
 * Try to minimize the adjacent sibling pair ($a, $b), either by merging
 * them directly or by first swapping one of them with its only child.
 *
 * @param Node $a
 * @param Node $b
 * @return Node The merged node when the pair was rewritten, else $b
 */
public function normalizeSiblingPair( Node $a, Node $b ): Node {
	if ( !$this->rewriteablePair( $a, $b ) ) {
		return $b;
	}

	// Since 'a' and 'b' make a rewriteable tag-pair, we are good to go.
	'@phan-var Element $a'; // @var Element $a
	'@phan-var Element $b'; // @var Element $b
	if ( self::mergable( $a, $b ) ) {
		$merged = $this->merge( $a, $b );
	} elseif ( self::swappable( $a, $b ) ) {
		// Pull a's only child outwards, then fold b into it.
		$inner = DOMUtils::firstNonDeletedChild( $a );
		'@phan-var Element $inner'; // @var Element $inner
		$merged = $this->merge( $this->swap( $a, $inner ), $b );
	} elseif ( self::swappable( $b, $a ) ) {
		// Pull b's only child outwards, then fold it into a.
		$inner = DOMUtils::firstNonDeletedChild( $b );
		'@phan-var Element $inner'; // @var Element $inner
		$merged = $this->merge( $a, $this->swap( $b, $inner ) );
	} else {
		return $b;
	}

	// The merged node's children have new siblings. So let's look
	// at it again. But their grandkids haven't changed / have already
	// been minimized, so we don't need to recurse further.
	$this->processSubtree( $merged, false );

	return $merged;
}
750 | |
/**
 * Normalize the non-deleted children of $node one by one and run tag
 * minimization on each pair of adjacent siblings.
 *
 * @param Node $node
 * @param bool $recurse Passed through to processNode()
 */
public function processSubtree( Node $node, bool $recurse ): void {
	// Process the first child outside the loop.
	$first = DOMUtils::firstNonDeletedChild( $node );
	if ( !$first ) {
		return;
	}

	for ( $a = $this->processNode( $first, $recurse ); $a; ) {
		// We need a pair of adjacent siblings for tag minimization.
		$b = DOMUtils::nextNonDeletedSibling( $a );
		if ( !$b ) {
			return;
		}

		// Process subtree rooted at 'b'.
		$b = $this->processNode( $b, $recurse );

		// If we skipped over a bunch of nodes in the middle,
		// we no longer have a pair of adjacent siblings.
		$stillAdjacent = $b && DOMUtils::previousNonDeletedSibling( $b ) === $a;

		// Process the pair, or simply move on to 'b'.
		$a = $stillAdjacent ? $this->normalizeSiblingPair( $a, $b ) : $b;
	}
}
783 | |
/**
 * Normalize 'node' and the subtree rooted at 'node', repeating until
 * normalizeNode() hands back the very node it was given.
 *
 * @param Node $node
 * @param bool $recurse
 *   true  => recurse and normalize subtree
 *   false => assume the subtree is already normalized
 * @return Node|null The stabilized node, or null if we ran out of nodes
 */
public function processNode( Node $node, bool $recurse ): ?Node {
	// Normalize node till it stabilizes
	for ( ; ; ) {
		// Skip templated content
		while ( $node && WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
			$node = WTUtils::skipOverEncapsulatedContent( $node );
		}

		if ( !$node ) {
			return null;
		}

		// Set insertion marker
		$insertedSubtree = DiffUtils::hasInsertedDiffMark( $node, $this->state->getEnv() );
		if ( $insertedSubtree ) {
			if ( $this->inInsertedContent ) {
				// Dump debugging info
				$dump = ContentUtils::dumpDOM(
					DOMCompat::getBody( $node->ownerDocument ),
					'-- DOM triggering nested inserted dom-diff flags --',
					[ 'storeDiffMark' => true ]
				);
				$nodeRepr = $node instanceof Element
					? ContentUtils::ppToXML( $node )
					: $node->textContent;
				$this->state->getEnv()->log( 'error/html2wt/dom',
					"--- Nested inserted dom-diff flags ---\n",
					'Node:',
					$nodeRepr,
					"\nNode's parent:",
					ContentUtils::ppToXML( $node->parentNode ),
					$dump
				);
			}
			// FIXME: If this assert is removed, the above dumping code should
			// either be removed OR fixed up to remove uses of ContentUtils.ppToXML
			Assert::invariant( !$this->inInsertedContent, 'Found nested inserted dom-diff flags!' );
			$this->inInsertedContent = true;
		}

		// Post-order traversal: Process subtree first, and current node after.
		// This lets multiple normalizations take effect cleanly.
		if ( $recurse && $node instanceof Element ) {
			$this->processSubtree( $node, true );
		}

		$result = $this->normalizeNode( $node );

		// Clear insertion marker
		if ( $insertedSubtree ) {
			$this->inInsertedContent = false;
		}

		if ( $result === $node ) {
			// Stable: nothing more to normalize here.
			return $node;
		}

		// The node turned into something else; go around again with that.
		$node = $result;
	}

	// @phan-suppress-next-line PhanPluginUnreachableCode
	throw new UnreachableException( 'Control should never get here!' );
}
855 | |
/**
 * Entry point: normalize $node together with its entire subtree.
 *
 * @param Element|DocumentFragment $node
 */
public function normalize( Node $node ) {
	// recurse = true: descendants are normalized before $node itself.
	$this->processNode( $node, true );
}
862 | } |