Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
58.87% |
73 / 124 |
|
20.00% |
1 / 5 |
CRAP | |
0.00% |
0 / 1 |
UnpackDOMFragments | |
58.87% |
73 / 124 |
|
20.00% |
1 / 5 |
126.17 | |
0.00% |
0 / 1 |
hasBadNesting | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
fixAbouts | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
30 | |||
makeChildrenEncapWrappers | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 | |||
markMisnested | |
50.00% |
4 / 8 |
|
0.00% |
0 / 1 |
6.00 | |||
handler | |
69.07% |
67 / 97 |
|
0.00% |
0 / 1 |
38.65 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html\PP\Handlers; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Parsoid\Config\Env; |
8 | use Wikimedia\Parsoid\Core\DomSourceRange; |
9 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
10 | use Wikimedia\Parsoid\DOM\Element; |
11 | use Wikimedia\Parsoid\DOM\Node; |
12 | use Wikimedia\Parsoid\NodeData\TempData; |
13 | use Wikimedia\Parsoid\Utils\ContentUtils; |
14 | use Wikimedia\Parsoid\Utils\DOMCompat; |
15 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
16 | use Wikimedia\Parsoid\Utils\DOMTraverser; |
17 | use Wikimedia\Parsoid\Utils\DOMUtils; |
18 | use Wikimedia\Parsoid\Utils\PipelineUtils; |
19 | use Wikimedia\Parsoid\Utils\Utils; |
20 | |
21 | class UnpackDOMFragments { |
22 | |
/**
 * Report whether unpacking $fragment as a child of $targetNode would
 * nest an A-tag inside another A-tag.
 *
 * Known limitations (T165098):
 * - Only same-tag nesting of A-tags is detected. HTML tree building has
 *   many more nesting restrictions; the simplest way to honour all of
 *   them would probably be a serialize + reparse round trip.
 * - Only $targetNode itself is inspected, not its ancestor chain. Since
 *   all link text is handled as DOM fragments, any fragment that
 *   generates A-tags directly inside a link is still caught.
 * - The missed case is an A-tag whose link text was not a fragment but
 *   which contains an embedded fragment generating an A-tag, e.g.
 *   "<ext-X>...<div><ext-Y>..</ext-Y></div>..</ext-X>"
 *   where both ext-X and ext-Y generate an A-tag: when ext-Y's fragment
 *   is unpacked its parent is the <div>, so this check does not fire.
 *
 * @param Node $targetNode Node the fragment contents will be placed under
 * @param DocumentFragment $fragment Fragment about to be unpacked
 * @return bool
 */
private static function hasBadNesting(
	Node $targetNode, DocumentFragment $fragment
): bool {
	// A-tags can never be nested inside each other at any level; this is
	// the one scenario that definitely has to be handled right now.
	$targetIsLink = DOMCompat::nodeName( $targetNode ) === 'a';

	// Short-circuit: the fragment is only scanned when the target is a link.
	return $targetIsLink && DOMUtils::treeHasElement( $fragment, 'a' );
}
50 | |
/**
 * Recursively walk the element descendants of $node and replace every
 * `about` attribute with a fresh about id obtained from $env.
 *
 * Elements that shared an about id before the walk share the same
 * replacement id afterwards; $aboutIdMap records the old => new mapping
 * and is threaded by reference through the recursion.
 *
 * @param Env $env Source of new about ids
 * @param Node $node Root of the subtree; $node itself is not modified
 * @param array &$aboutIdMap Map from old about id to its replacement
 */
private static function fixAbouts( Env $env, Node $node, array &$aboutIdMap = [] ): void {
	for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
		// Only elements can carry an about attribute or need recursion
		if ( !( $child instanceof Element ) ) {
			continue;
		}

		$oldAbout = DOMCompat::getAttribute( $child, 'about' );
		if ( $oldAbout !== null ) {
			// Allocate a replacement the first time an old id is seen
			if ( empty( $aboutIdMap[$oldAbout] ) ) {
				$aboutIdMap[$oldAbout] = $env->newAboutId();
			}
			$child->setAttribute( 'about', $aboutIdMap[$oldAbout] );
		}

		self::fixAbouts( $env, $child, $aboutIdMap );
	}
}
70 | |
/**
 * Span-wrap the top-level children of $domFragment and stamp each of
 * them with the given about id.
 *
 * @param DocumentFragment $domFragment Fragment whose children are updated in place
 * @param string $about About id to set on every top-level child
 */
private static function makeChildrenEncapWrappers(
	DocumentFragment $domFragment, string $about
): void {
	PipelineUtils::addSpanWrappers( $domFragment->childNodes );

	for ( $c = $domFragment->firstChild; $c; $c = $c->nextSibling ) {
		/**
		 * The children were span wrapped just above, so every one of
		 * them is assumed to be an Element here.
		 *
		 * @var Element $c
		 */
		'@phan-var Element $c';
		// FIXME: about is set unconditionally on every child. That is
		// currently safe because all of them are nested inside a
		// transclusion, but do we need future-proofing?
		$c->setAttribute( 'about', $about );
	}
}
92 | |
/**
 * Flag $n as misnested and collapse its DSR to a zero-width range at
 * $newOffset.
 *
 * When $newOffset is null on entry it is initialised here (and, being a
 * reference, stays initialised for the caller's subsequent calls).
 *
 * @param Env $env Used to read the top frame's source text length
 * @param Element $n Element whose data-parsoid is updated
 * @param ?int &$newOffset Offset to use for both DSR start and end
 */
private static function markMisnested( Env $env, Element $n, ?int &$newOffset ): void {
	$dataParsoid = DOMDataUtils::getDataParsoid( $n );

	if ( $newOffset === null ) {
		// We end up here when $placeholderParent is part of encapsulated
		// content. Till we add logic to prevent that from happening, we
		// need a fallback: prefer the node's own DSR start; failing that,
		// use a dummy value larger than the page size so that it cannot
		// point at anything in source (fetching outside page source
		// returns "").
		$newOffset = $dataParsoid->dsr->start
			?? ( strlen( $env->topFrame->getSrcText() ) + 1 );
	}

	$dataParsoid->dsr = new DomSourceRange( $newOffset, $newOffset, null, null );
	$dataParsoid->misnested = true;
}
112 | |
/**
 * DOMTraverser handler that unpacks DOM fragments which were injected in the
 * token pipeline.
 *
 * Nodes that are not `mw:DOMFragment` placeholder elements are left alone.
 * For a placeholder, the stored fragment is fetched from $env, its
 * transclusion info / DSR / about ids are fixed up, and its contents
 * replace the placeholder in the tree. If the placeholder sits directly
 * inside an A-tag and the fragment itself contains an A-tag, the parent is
 * serialized and reparsed so that the HTML5 parser repairs the nesting.
 *
 * @param Node $placeholder
 * @param Env $env
 * @return bool|Node|null True when $placeholder is not a DOM fragment
 *   placeholder; otherwise the node at which traversal should resume, which
 *   is null when there is no such sibling.
 */
public static function handler( Node $placeholder, Env $env ) {
	if ( !$placeholder instanceof Element ) {
		return true;
	}

	// Sealed fragments shouldn't make it past this point
	if ( !DOMUtils::hasTypeOf( $placeholder, 'mw:DOMFragment' ) ) {
		return true;
	}

	$placeholderDP = DOMDataUtils::getDataParsoid( $placeholder );
	Assert::invariant( str_starts_with( $placeholderDP->html, 'mwf' ), '' );
	$fragmentDOM = $env->getDOMFragment( $placeholderDP->html );
	$fragmentContent = $fragmentDOM->firstChild;
	$placeholderParent = $placeholder->parentNode;

	// FIXME: What about mw:Param?
	$isTransclusion = DOMUtils::hasTypeOf( $placeholder, 'mw:Transclusion' );
	if ( $isTransclusion ) {
		// Ensure our `firstChild` is an element to add annotation. At present,
		// we're unlikely to end up with transclusion annotations on fragments
		// where span wrapping hasn't occurred (ie. link contents, since that's
		// placed on the anchor itself) but in the future, nowiki spans may be
		// omitted or new uses for dom fragments found. For now, the test case
		// necessitating this is an edgy link-in-link scenario:
		//   [[Test|{{1x|[[Hmm|Something <sup>strange</sup>]]}}]]
		// A new use of dom fragments is for parser functions returning html
		// (special page transclusions) which don't do span wrapping.
		PipelineUtils::addSpanWrappers( $fragmentDOM->childNodes );
		// Reset `fragmentContent`, since the `firstChild` may have changed in
		// span wrapping.
		$fragmentContent = $fragmentDOM->firstChild;
		DOMUtils::assertElt( $fragmentContent );
		// Transfer typeof, data-mw, and param info
		// about attributes are transferred below.
		DOMDataUtils::setDataMw( $fragmentContent, Utils::clone( DOMDataUtils::getDataMw( $placeholder ) ) );
		DOMUtils::addTypeOf( $fragmentContent, 'mw:Transclusion' );
		DOMDataUtils::getDataParsoid( $fragmentContent )->pi = $placeholderDP->pi ?? null;
	}

	// Update DSR:
	//
	// - Only update DSR for content that came from cache.
	// - For new DOM fragments from this pipeline,
	//   previously-computed DSR is valid.
	// - EXCEPTION: fostered content from tables get their DSR reset
	//   to zero-width.
	// - EXCEPTION: if we just transferred a transclusion marker,
	//   bring along the associated DSR.
	// - FIXME: We seem to also be doing this for new extension content,
	//   which is the only place still using `setDSR`.
	//
	// There is currently no DSR for DOMFragments nested inside
	// transclusion / extension content (extension inside template
	// content etc).
	// FIXME: Is that always the case?  TSR info is stripped from tokens
	// in transclusion but DSR computation happens before template wrapping
	// and seems to sometimes assign DSR to DOMFragments regardless of
	// not having TSR set.
	// TODO: Make sure that is the only reason for not having a DSR here.
	$placeholderDSR = $placeholderDP->dsr ?? null;
	if ( $placeholderDSR && (
		$placeholderDP->getTempFlag( TempData::SET_DSR ) ||
		$placeholderDP->getTempFlag( TempData::FROM_CACHE ) ||
		!empty( $placeholderDP->fostered ) ||
		$isTransclusion
	) ) {
		DOMUtils::assertElt( $fragmentContent );
		$fragmentDP = DOMDataUtils::getDataParsoid( $fragmentContent );
		if ( $isTransclusion ) {
			// FIXME: An old comment from c28f137 said we just use dsr->start and
			// dsr->end since tag-widths will be incorrect for reuse of template
			// expansions.  The comment was removed in ca9e760.
			$fragmentDP->dsr = new DomSourceRange( $placeholderDSR->start, $placeholderDSR->end, null, null );
		} elseif (
			DOMUtils::matchTypeOf( $fragmentContent, '/^mw:(Nowiki|Extension(\/\S+))$/' ) !== null
		) {
			$fragmentDP->dsr = $placeholderDSR;
		} else { // non-transcluded images
			$fragmentDP->dsr = new DomSourceRange( $placeholderDSR->start, $placeholderDSR->end, 2, 2 );
		}
	}

	if ( $placeholderDP->getTempFlag( TempData::FROM_CACHE ) ) {
		// Replace old about-id with new about-id that is
		// unique to the global page environment object.
		//
		// <figure>s are reused from cache. Note that figure captions
		// can contain multiple independent transclusions. Each one
		// of those individual transclusions should get a new unique
		// about id. Hence a need for an aboutIdMap and the need to
		// walk the entire tree.
		self::fixAbouts( $env, $fragmentDOM );
	}

	// If the fragment wrapper has an about id, it came from template
	// annotating (the wrapper was an about sibling) and should be transferred
	// to top-level nodes after span wrapping.  This should happen regardless
	// of whether we're coming `fromCache` or not.
	// FIXME: Presumably we have a nesting issue here if this is a cached
	// transclusion.
	$about = DOMCompat::getAttribute( $placeholder, 'about' );
	if ( $about !== null ) {
		// Span wrapping may not have happened for the transclusion above if
		// the fragment is not the first encapsulation wrapper node.
		PipelineUtils::addSpanWrappers( $fragmentDOM->childNodes );
		$c = $fragmentDOM->firstChild;
		while ( $c ) {
			DOMUtils::assertElt( $c );
			$c->setAttribute( 'about', $about );
			$c = $c->nextSibling;
		}
	}

	// Default resume point; overridden below in the bad-nesting path
	$nextNode = $placeholder->nextSibling;

	if ( self::hasBadNesting( $placeholderParent, $fragmentDOM ) ) {
		$nodeName = DOMCompat::nodeName( $placeholderParent );
		Assert::invariant( $nodeName === 'a', "Unsupported Bad Nesting scenario for $nodeName" );
		/* -----------------------------------------------------------------------
		 * If placeholderParent is an A element and fragmentDOM contains another
		 * A element, we have an invalid nesting of A elements and needs fixing up.
		 *
		 * $doc1: ... $placeholderParent -> [... $placeholder=mw:DOMFragment, ...] ...
		 *
		 * 1. Change doc1:$placeholderParent -> [... "#unique-hash-code", ...] by replacing
		 *    $placeholder with the "#unique-hash-code" text string
		 *
		 * 2. $str = $placeholderParent->str_replace(#unique-hash-code, $placeholderHTML)
		 *    We now have a HTML string with the bad nesting. We will now use the HTML5
		 *    parser to parse this HTML string and give us the fixed up DOM
		 *
		 * 3. ParseHTML(str) to get
		 *    $doc2: [BODY -> [[placeholderParent -> [...], nested-A-tag-from-placeholder, ...]]]
		 *
		 * 4. Replace $placeholderParent (in $doc1) with $doc2->body->childNodes
		 * ----------------------------------------------------------------------- */
		// The marker must not occur anywhere else in the serialized parent,
		// because str_replace() below substitutes *every* occurrence. A bare
		// timestamp is short, predictable and can plausibly show up in page
		// content or attribute values (and a parser function can emit the
		// current time), so use 128 random bits instead. The marker only
		// contains [a-z0-9-] and therefore survives serialization unescaped.
		$hashCode = 'mw-unpack-' . bin2hex( random_bytes( 16 ) );
		$placeholderParent->replaceChild( $placeholder->ownerDocument->createTextNode( $hashCode ), $placeholder );

		// If placeholderParent has an about, it presumably is nested inside a template
		// Post fixup, its children will surface to the encapsulation wrapper level.
		// So, we have to fix them up so they dont break the encapsulation.
		//
		// Ex: {{1x|[http://foo.com This is [[bad]], very bad]}}
		//
		// In this example, the <a> corresponding to Foo is placeholderParent and has an about.
		// $fragmentDOM is the DOM corresponding to "This is [[bad]], very bad". Post-fixup
		// "[[bad]], very bad" are at encapsulation level and need about ids.
		DOMUtils::assertElt( $placeholderParent ); // satisfy phan
		$about = DOMCompat::getAttribute( $placeholderParent, 'about' );
		if ( $about !== null ) {
			self::makeChildrenEncapWrappers( $fragmentDOM, $about );
		}

		$fragmentHTML = ContentUtils::ppToXML( $fragmentDOM, [
				'innerXML' => true,
				// We just added some span wrappers and we need to keep
				// that tmp info so the unnecessary ones get stripped.
				// Should be fine since tmp was stripped before packing.
				'keepTmp' => true
			]
		);

		// Remember the node before the link so that the reparsed link can
		// be located again after the new nodes are inserted.
		$markerNode = $placeholderParent->previousSibling;

		// We rely on HTML5 parser to fixup the bad nesting (see big comment above)
		$placeholderParentHTML = ContentUtils::ppToXML( $placeholderParent );
		$unpackedMisnestedHTML = str_replace( $hashCode, $fragmentHTML, $placeholderParentHTML );
		$unpackedFragment = DOMUtils::parseHTMLToFragment(
			$placeholderParent->ownerDocument, $unpackedMisnestedHTML
		);

		DOMUtils::migrateChildren(
			$unpackedFragment, $placeholderParent->parentNode, $placeholderParent
		);

		// Identify the new link node. All following siblings till placeholderParent
		// are nodes that have been hoisted out of the link.
		// - Add span wrappers where necessary
		// - Load data-attribs
		// - Zero-out DSR

		if ( $markerNode ) {
			$linkNode = $markerNode->nextSibling;
		} else {
			$linkNode = $placeholderParent->parentNode->firstChild;
		}
		PipelineUtils::addSpanWrappers(
			$linkNode->parentNode->childNodes, $linkNode->nextSibling, $placeholderParent );

		$newOffset = null;
		$node = $linkNode;
		while ( $node !== $placeholderParent ) {
			DOMDataUtils::visitAndLoadDataAttribs( $node );

			if ( $node === $linkNode ) {
				// Hoisted nodes get a zero-width DSR at the end of the link
				$newOffset = DOMDataUtils::getDataParsoid( $linkNode )->dsr->end ?? null;
			} else {
				$dsrFixer = new DOMTraverser();
				$dsrFixer->addHandler( null, static function ( Node $n ) use( $env, &$newOffset ) {
					if ( $n instanceof Element ) {
						self::markMisnested( $env, $n, $newOffset );
					}
					return true;
				} );
				$dsrFixer->traverse( null, $node );
			}

			$node = $node->nextSibling;
		}

		// Set nextNode to the previous-sibling of former placeholderParent (which will get deleted)
		// This will ensure that all nodes will get handled
		$nextNode = $placeholderParent->previousSibling;

		// placeholderParent itself is useless now
		$placeholderParent->parentNode->removeChild( $placeholderParent );
	} else {
		// Preserve fostered flag from DOM fragment
		if ( $fragmentContent instanceof Element ) {
			if ( !empty( $placeholderDP->fostered ) ) {
				$n = $fragmentContent;
				while ( $n ) {
					$dp = DOMDataUtils::getDataParsoid( $n );
					$dp->fostered = true;
					$n = $n->nextSibling;
				}
			}
		}

		// Move the content nodes over and delete the placeholder node
		DOMUtils::migrateChildren( $fragmentDOM, $placeholderParent, $placeholder );
		$placeholderParent->removeChild( $placeholder );

	}

	// Empty out $fragmentDOM since the call below asserts it
	DOMCompat::replaceChildren( $fragmentDOM );
	$env->removeDOMFragment( $placeholderDP->html );

	return $nextNode;
}
368 | } |