Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
58.20% |
71 / 122 |
|
20.00% |
1 / 5 |
CRAP | |
0.00% |
0 / 1 |
UnpackDOMFragments | |
58.20% |
71 / 122 |
|
20.00% |
1 / 5 |
137.01 | |
0.00% |
0 / 1 |
hasBadNesting | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
fixAbouts | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
30 | |||
makeChildrenEncapWrappers | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
markMisnested | |
50.00% |
4 / 8 |
|
0.00% |
0 / 1 |
6.00 | |||
handler | |
68.42% |
65 / 95 |
|
0.00% |
0 / 1 |
42.14 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html\DOM\Handlers; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Parsoid\Config\Env; |
8 | use Wikimedia\Parsoid\Core\DomSourceRange; |
9 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
10 | use Wikimedia\Parsoid\DOM\Element; |
11 | use Wikimedia\Parsoid\DOM\Node; |
12 | use Wikimedia\Parsoid\NodeData\TempData; |
13 | use Wikimedia\Parsoid\Utils\ContentUtils; |
14 | use Wikimedia\Parsoid\Utils\DOMCompat; |
15 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
16 | use Wikimedia\Parsoid\Utils\DOMTraverser; |
17 | use Wikimedia\Parsoid\Utils\DOMUtils; |
18 | use Wikimedia\Parsoid\Utils\DTState; |
19 | use Wikimedia\Parsoid\Utils\PipelineUtils; |
20 | |
/**
 * DOM pass handler that replaces `mw:DOMFragment` placeholder elements with
 * the contents of the DOM fragment they reference, transferring transclusion
 * annotations, DSR and about ids from the placeholder onto the unpacked nodes.
 */
class UnpackDOMFragments {

	/**
	 * Detect whether unpacking $fragment directly under $targetNode would
	 * nest an A-tag inside another A-tag.
	 *
	 * @param Node $targetNode Node the fragment contents would be inserted into
	 * @param DocumentFragment $fragment Fragment about to be unpacked
	 * @return bool
	 */
	private static function hasBadNesting(
		Node $targetNode, DocumentFragment $fragment
	): bool {
		// T165098: This is not entirely correct. This is only
		// looking for nesting of identical tags. But, HTML tree building
		// has lot more restrictions on nesting. It seems the simplest way
		// to get all the rules right is to (serialize + reparse).

		// A-tags cannot ever be nested inside each other at any level.
		// This is the one scenario we definitely have to handle right now.
		// We need a generic robust solution for other nesting scenarios.
		//
		// In the general case, we need to be walking up the ancestor chain
		// of $targetNode to see if there is any 'A' tag there. But, since
		// all link text is handled as DOM fragments, if there is any instance
		// where that fragment generates A-tags, we'll always catch it.
		//
		// The only scenario we would miss would be if there were an A-tag whose
		// link text wasn't a fragment but which had an embedded dom-fragment
		// that generated an A-tag. Consider this example below:
		// "<ext-X>...<div><ext-Y>..</ext-Y></div>..</ext-X>"
		// If ext-X generates an A-tag and ext-Y also generates an A-tag, then
		// when we unpack ext-Y's dom fragment, the simple check below would
		// miss the misnesting.
		return DOMCompat::nodeName( $targetNode ) === 'a' &&
			DOMUtils::treeHasElement( $fragment, 'a' );
	}

	/**
	 * Recursively replace the about id of every element below $node with a
	 * freshly generated one. Elements that shared an old about id end up
	 * sharing the same new about id.
	 *
	 * @param Env $env Source of new about ids
	 * @param Node $node Root whose descendants are rewritten ($node itself is not)
	 * @param array<string,string> &$aboutIdMap Old about id => new about id,
	 *   shared across the recursion
	 */
	private static function fixAbouts( Env $env, Node $node, array &$aboutIdMap = [] ): void {
		$c = $node->firstChild;
		while ( $c ) {
			if ( $c instanceof Element ) {
				$cAbout = DOMCompat::getAttribute( $c, 'about' );
				if ( $cAbout !== null ) {
					// Update about
					$newAbout = $aboutIdMap[$cAbout] ?? null;
					if ( !$newAbout ) {
						$newAbout = $env->newAboutId();
						$aboutIdMap[$cAbout] = $newAbout;
					}
					$c->setAttribute( 'about', $newAbout );
				}
				self::fixAbouts( $env, $c, $aboutIdMap );
			}
			$c = $c->nextSibling;
		}
	}

	/**
	 * Span-wrap the top-level nodes of $domFragment where necessary and set
	 * $about on every resulting top-level element.
	 *
	 * @param DocumentFragment $domFragment
	 * @param string $about About id to assign
	 */
	private static function makeChildrenEncapWrappers(
		DocumentFragment $domFragment, string $about
	): void {
		PipelineUtils::addSpanWrappers( $domFragment->childNodes );

		$c = $domFragment->firstChild;
		while ( $c ) {
			// We just span wrapped the child nodes, so they are all expected
			// to be Elements. Assert it (this also informs static analysis).
			DOMUtils::assertElt( $c );
			// FIXME: This unconditionally sets about on children
			// This is currently safe since all of them are nested
			// inside a transclusion, but do we need future-proofing?
			$c->setAttribute( 'about', $about );
			$c = $c->nextSibling;
		}
	}

	/**
	 * Flag $n as misnested and give it a zero-width DSR at $newOffset.
	 *
	 * @param Env $env
	 * @param Element $n
	 * @param ?int &$newOffset Offset to use; when null it is initialized here
	 *   (from $n's DSR start if available, else one past the end of the
	 *   top-level source) and the chosen value is written back to the caller.
	 */
	private static function markMisnested( Env $env, Element $n, ?int &$newOffset ): void {
		$dp = DOMDataUtils::getDataParsoid( $n );
		if ( $newOffset === null ) {
			// We end up here when $placeholderParent is part of encapsulated content.
			// Till we add logic to prevent that from happening, we need this fallback.
			if ( isset( $dp->dsr->start ) ) {
				$newOffset = $dp->dsr->start;
			}

			// If still null, set to some dummy value that is larger
			// than page size to avoid pointing to something in source.
			// Trying to fetch outside page source returns "".
			if ( $newOffset === null ) {
				$newOffset = strlen( $env->topFrame->getSrcText() ) + 1;
			}
		}
		$dp->dsr = new DomSourceRange( $newOffset, $newOffset, null, null );
		$dp->misnested = true;
	}

	/**
	 * Unpack $fragmentDOM into an A-tag parent when the fragment itself
	 * contains an A-tag, and repair the resulting invalid A-in-A nesting.
	 *
	 * The fragment contents replace $placeholder, then $placeholderParent is
	 * serialized and reparsed so that the HTML5 tree builder splits the nested
	 * links. The reparsed nodes are inserted before $placeholderParent, nodes
	 * hoisted out of the link are marked as misnested, and $placeholderParent
	 * is removed from the DOM.
	 *
	 * @param Env $env
	 * @param Element $placeholder The mw:DOMFragment placeholder
	 * @param Node $placeholderParent Parent of $placeholder; asserted to be an A element
	 * @param DocumentFragment $fragmentDOM Fragment to unpack; emptied by this call
	 * @return ?Node Node from which the caller's traversal should continue
	 */
	private static function unpackWithNestingFixup(
		Env $env, Element $placeholder, Node $placeholderParent, DocumentFragment $fragmentDOM
	): ?Node {
		$nodeName = DOMCompat::nodeName( $placeholderParent );
		Assert::invariant( $nodeName === 'a', "Unsupported Bad Nesting scenario for $nodeName" );

		// If placeholderParent has an about, it presumably is nested inside a template
		// Post fixup, its children will surface to the encapsulation wrapper level.
		// So, we have to fix them up so they don't break the encapsulation.
		//
		// Ex: {{1x|[http://foo.com This is [[bad]], very bad]}}
		//
		// In this example, the <a> corresponding to Foo is placeholderParent and has an about.
		// dummyNode is the DOM corresponding to "This is [[bad]], very bad". Post-fixup
		// "[[bad]], very bad" are at encapsulation level and need about ids.
		DOMUtils::assertElt( $placeholderParent ); // satisfy phan
		$about = DOMCompat::getAttribute( $placeholderParent, 'about' );
		if ( $about !== null ) {
			self::makeChildrenEncapWrappers( $fragmentDOM, $about );
		}

		while ( $fragmentDOM->firstChild ) {
			$placeholderParent->insertBefore( $fragmentDOM->firstChild, $placeholder );
		}
		$placeholderParent->removeChild( $placeholder );

		// Remember what precedes the link so the reparsed link can be located below
		$markerNode = $placeholderParent->previousSibling;

		// We rely on HTML5 parser to fixup the bad nesting
		// (see the big comment in hasBadNesting)
		$placeholderParentHTML = ContentUtils::ppToXML( $placeholderParent, [
			// We just added some span wrappers and we need to keep
			// that tmp info so the unnecessary ones get stripped.
			// Should be fine since tmp was stripped before packing.
			'keepTmp' => true
		] );

		$unpackedFragment = DOMUtils::parseHTMLToFragment(
			$placeholderParent->ownerDocument, $placeholderParentHTML
		);

		DOMUtils::migrateChildren(
			$unpackedFragment, $placeholderParent->parentNode, $placeholderParent
		);

		// Identify the new link node. All following siblings till placeholderParent
		// are nodes that have been hoisted out of the link.
		// - Add span wrappers where necessary
		// - Load data-attribs
		// - Zero-out DSR

		if ( $markerNode ) {
			$linkNode = $markerNode->nextSibling;
		} else {
			$linkNode = $placeholderParent->parentNode->firstChild;
		}
		PipelineUtils::addSpanWrappers(
			$linkNode->parentNode->childNodes, $linkNode->nextSibling, $placeholderParent );

		$newOffset = null;

		// The traverser does not depend on the node being visited, so build it
		// once. The closure captures $newOffset by reference and therefore
		// sees the value assigned when the link node is processed below.
		$dsrFixer = new DOMTraverser();
		$dsrFixer->addHandler( null, static function ( Node $n ) use( $env, &$newOffset ) {
			if ( $n instanceof Element ) {
				self::markMisnested( $env, $n, $newOffset );
			}
			return true;
		} );

		$node = $linkNode;
		while ( $node !== $placeholderParent ) {
			DOMDataUtils::visitAndLoadDataAttribs( $node );

			if ( $node === $linkNode ) {
				// Hoisted nodes get a zero-width DSR at the end of the link
				$newOffset = DOMDataUtils::getDataParsoid( $linkNode )->dsr->end ?? null;
			} else {
				$dsrFixer->traverse( null, $node );
			}

			$node = $node->nextSibling;
		}

		// Continue from the previous-sibling of former placeholderParent (which will get
		// deleted). This will ensure that all nodes will get handled
		$nextNode = $placeholderParent->previousSibling;

		// placeholderParent itself is useless now
		$placeholderParent->parentNode->removeChild( $placeholderParent );

		return $nextNode;
	}

	/**
	 * DOMTraverser handler that unpacks DOM fragments which were injected in the
	 * token pipeline.
	 *
	 * @param Node $placeholder
	 * @param DTState $state
	 * @return bool|Node|null true when $placeholder is not a DOM fragment
	 *   placeholder; otherwise the node to continue the traversal from, which
	 *   is null when there is none.
	 */
	public static function handler( Node $placeholder, DTState $state ) {
		if ( !$placeholder instanceof Element ) {
			return true;
		}

		// Sealed fragments shouldn't make it past this point
		if ( !DOMUtils::hasTypeOf( $placeholder, 'mw:DOMFragment' ) ) {
			return true;
		}

		$env = $state->env;
		$placeholderDP = DOMDataUtils::getDataParsoid( $placeholder );
		Assert::invariant(
			str_starts_with( $placeholderDP->html, 'mwf' ),
			'Expected the DOM fragment placeholder to reference a fragment id starting with "mwf"'
		);
		$fragmentDOM = $env->getDOMFragment( $placeholderDP->html );
		$fragmentContent = $fragmentDOM->firstChild;
		$placeholderParent = $placeholder->parentNode;

		// FIXME: What about mw:Param?
		$isTransclusion = DOMUtils::hasTypeOf( $placeholder, 'mw:Transclusion' );
		if ( $isTransclusion ) {
			// Ensure our `firstChild` is an element to add annotation. At present,
			// we're unlikely to end up with transclusion annotations on fragments
			// where span wrapping hasn't occurred (ie. link contents, since that's
			// placed on the anchor itself) but in the future, nowiki spans may be
			// omitted or new uses for dom fragments found. For now, the test case
			// necessitating this is an edgy link-in-link scenario:
			// [[Test|{{1x|[[Hmm|Something <sup>strange</sup>]]}}]]
			// A new use of dom fragments is for parser functions returning html
			// (special page transclusions) which don't do span wrapping.
			PipelineUtils::addSpanWrappers( $fragmentDOM->childNodes );
			// Reset `fragmentContent`, since the `firstChild` may have changed in
			// span wrapping.
			$fragmentContent = $fragmentDOM->firstChild;
			DOMUtils::assertElt( $fragmentContent );
			// Transfer typeof, data-mw, and param info
			// about attributes are transferred below.
			DOMDataUtils::setDataMw( $fragmentContent, clone DOMDataUtils::getDataMw( $placeholder ) );
			DOMUtils::addTypeOf( $fragmentContent, 'mw:Transclusion' );
			DOMDataUtils::getDataParsoid( $fragmentContent )->pi = $placeholderDP->pi ?? null;
		}

		$isFostered = !empty( $placeholderDP->fostered );

		// Update DSR:
		//
		// - Only update DSR for content that came from cache.
		// - For new DOM fragments from this pipeline,
		//   previously-computed DSR is valid.
		// - EXCEPTION: fostered content from tables get their DSR reset
		//   to zero-width.
		// - EXCEPTION: if we just transferred a transclusion marker,
		//   bring along the associated DSR.
		// - FIXME: We seem to also be doing this for new extension content,
		//   which is the only place still using `setDSR`.
		//
		// There is currently no DSR for DOMFragments nested inside
		// transclusion / extension content (extension inside template
		// content etc).
		// FIXME: Is that always the case?  TSR info is stripped from tokens
		// in transclusion but DSR computation happens before template wrapping
		// and seems to sometimes assign DSR to DOMFragments regardless of
		// not having TSR set.
		// TODO: Make sure that is the only reason for not having a DSR here.
		$placeholderDSR = $placeholderDP->dsr ?? null;
		if ( $placeholderDSR && (
			$placeholderDP->getTempFlag( TempData::SET_DSR ) ||
			$placeholderDP->getTempFlag( TempData::FROM_CACHE ) ||
			$isFostered ||
			$isTransclusion
		) ) {
			DOMUtils::assertElt( $fragmentContent );
			$fragmentDP = DOMDataUtils::getDataParsoid( $fragmentContent );
			if ( $isTransclusion ) {
				// FIXME: An old comment from c28f137 said we just use dsr->start and
				// dsr->end since tag-widths will be incorrect for reuse of template
				// expansions.  The comment was removed in ca9e760.
				$fragmentDP->dsr = new DomSourceRange(
					$placeholderDSR->start, $placeholderDSR->end, null, null
				);
			} elseif (
				DOMUtils::matchTypeOf( $fragmentContent, '/^mw:(Nowiki|Extension(\/\S+))$/' ) !== null
			) {
				$fragmentDP->dsr = $placeholderDSR;
			} else { // non-transcluded images
				$fragmentDP->dsr = new DomSourceRange(
					$placeholderDSR->start, $placeholderDSR->end, 2, 2
				);
			}
		}

		if ( $placeholderDP->getTempFlag( TempData::FROM_CACHE ) ) {
			// Replace old about-id with new about-id that is
			// unique to the global page environment object.
			//
			// <figure>s are reused from cache. Note that figure captions
			// can contain multiple independent transclusions. Each one
			// of those individual transclusions should get a new unique
			// about id. Hence a need for an aboutIdMap and the need to
			// walk the entire tree.
			self::fixAbouts( $env, $fragmentDOM );
		}

		// If the fragment wrapper has an about id, it came from template
		// annotating (the wrapper was an about sibling) and should be transferred
		// to top-level nodes after span wrapping.  This should happen regardless
		// of whether we're coming `fromCache` or not.
		// FIXME: Presumably we have a nesting issue here if this is a cached
		// transclusion.
		$about = DOMCompat::getAttribute( $placeholder, 'about' );
		if ( $about !== null ) {
			// Span wrapping may not have happened for the transclusion above if
			// the fragment is not the first encapsulation wrapper node, so the
			// helper span-wraps again before assigning the about id.
			self::makeChildrenEncapWrappers( $fragmentDOM, $about );
		}

		// Must be read before the placeholder is detached from the DOM below
		$nextNode = $placeholder->nextSibling;

		if ( self::hasBadNesting( $placeholderParent, $fragmentDOM ) ) {
			// placeholderParent is an A element and fragmentDOM contains another
			// A element: we have an invalid nesting of A elements that needs
			// fixing up.
			$nextNode = self::unpackWithNestingFixup(
				$env, $placeholder, $placeholderParent, $fragmentDOM
			);
		} else {
			// Preserve fostered flag from DOM fragment
			if ( $isFostered && $fragmentContent instanceof Element ) {
				$n = $fragmentContent;
				while ( $n ) {
					$dp = DOMDataUtils::getDataParsoid( $n );
					$dp->fostered = true;
					$n = $n->nextSibling;
				}
			}

			// Move the content nodes over and delete the placeholder node
			DOMUtils::migrateChildren( $fragmentDOM, $placeholderParent, $placeholder );
			$placeholderParent->removeChild( $placeholder );
		}

		// Empty out $fragmentDOM since the call below asserts it
		DOMCompat::replaceChildren( $fragmentDOM );
		$env->removeDOMFragment( $placeholderDP->html );

		return $nextNode;
	}
}