Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
43.23% |
67 / 155 |
|
14.29% |
1 / 7 |
CRAP | |
0.00% |
0 / 1 |
CleanUp | |
43.23% |
67 / 155 |
|
14.29% |
1 / 7 |
1343.69 | |
0.00% |
0 / 1 |
stripMarkerMetas | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
90 | |||
isEmptyNode | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
110 | |||
handleEmptyElements | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
132 | |||
inNativeContent | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
trimWhiteSpace | |
100.00% |
38 / 38 |
|
100.00% |
1 / 1 |
17 | |||
finalCleanup | |
78.38% |
29 / 37 |
|
0.00% |
0 / 1 |
31.32 | |||
saveDataParsoid | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
72 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html\DOM\Handlers; |
5 | |
6 | use Wikimedia\Parsoid\Config\Env; |
7 | use Wikimedia\Parsoid\Core\DomSourceRange; |
8 | use Wikimedia\Parsoid\DOM\Comment; |
9 | use Wikimedia\Parsoid\DOM\Element; |
10 | use Wikimedia\Parsoid\DOM\Node; |
11 | use Wikimedia\Parsoid\DOM\Text; |
12 | use Wikimedia\Parsoid\NodeData\TempData; |
13 | use Wikimedia\Parsoid\Utils\DOMCompat; |
14 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
15 | use Wikimedia\Parsoid\Utils\DOMUtils; |
16 | use Wikimedia\Parsoid\Utils\DTState; |
17 | use Wikimedia\Parsoid\Utils\Utils; |
18 | use Wikimedia\Parsoid\Utils\WTUtils; |
19 | use Wikimedia\Parsoid\Wikitext\Consts; |
20 | use Wikimedia\Parsoid\Wt2Html\TT\PreHandler; |
21 | |
22 | class CleanUp { |
/**
 * Remove marker <meta> nodes that are not needed in the final DOM.
 *
 * Indent-pre whitespace markers are left in place here because DOM passes
 * that run until the end may still need DSR info from them; for those, only
 * the parent's DSR open width is adjusted (when the parent has DSR).
 *
 * @param Element $node
 * @return bool|Node|null true if $node was left in the DOM; otherwise the
 *   node that followed the removed $node (null if it had no next sibling)
 */
public static function stripMarkerMetas( Element $node ) {
	// This meta tag can never have data-mw associated with it.
	// If it were produced by a template, it would always have a <pre>
	// wrapper around which carries any relevant data-mw & typeof properties.
	if ( PreHandler::isIndentPreWS( $node ) ) {
		$parentDsr = DOMDataUtils::getDataParsoid( $node->parentNode )->dsr ?? null;
		if ( $parentDsr ) {
			// @see explanation in PreHandler::newIndentPreWS()
			$parentDsr->openWidth = 1;
		}
		// The marker itself is stripped later, in the cleanup handler.
		return true;
	}

	$strippable = DOMUtils::hasTypeOf( $node, "mw:Placeholder/UnclosedComment" );
	if ( !$strippable && !DOMDataUtils::validDataMw( $node ) ) {
		// Sometimes a non-tpl meta node might get the mw:Transclusion typeof
		// attached to it. Nodes with valid data-mw never reach this branch,
		// so they are always kept.
		$strippable = (
				DOMUtils::hasTypeOf( $node, 'mw:Placeholder/StrippedTag' ) &&
				// NOTE: In ComputeDSR, we don't zero out the width of these
				// markers because they're staying in the DOM and serializeDOMNode
				// only handles a few cases of zero width nodes.
				!DOMUtils::isNestedInListItem( $node )
			) ||
			DOMUtils::hasTypeOf( $node, 'mw:Transclusion' );
	}

	if ( !$strippable ) {
		return true;
	}

	// Hand back the following sibling so the traversal does not continue
	// from a node that is no longer in the DOM.
	$following = $node->nextSibling;
	$node->parentNode->removeChild( $node );
	return $following;
}
68 | |
/**
 * Decide whether $node has only "empty" children. A child counts as empty
 * when it is one of:
 * - a comment,
 * - a text node made up solely of spaces, tabs, CR and LF,
 * - a rendering transparent element,
 * - a nowiki span or Parsoid-added wrapper whose own children are all
 *   empty by these same rules.
 * (Comments, rendering transparent nodes and content-less nowiki spans
 * are all stripped by the core parser.)
 *
 * @param Node $node
 * @param bool &$hasRTNodes Set to true if the node contained rendering transparent nodes.
 *   Note this value is only reliable if ::isEmptyNode() returns true.
 * @return bool
 */
private static function isEmptyNode( Node $node, bool &$hasRTNodes ): bool {
	for ( $child = $node->firstChild; $child !== null; $child = $child->nextSibling ) {
		if ( $child instanceof Comment ) {
			continue;
		}

		if ( $child instanceof Text ) {
			if ( preg_match( '/^[ \t\r\n]*$/D', $child->nodeValue ) ) {
				continue;
			}
			// Text with anything other than whitespace is real content.
			return false;
		}

		if ( !( $child instanceof Element ) ) {
			return false;
		}

		if ( WTUtils::isRenderingTransparentNode( $child ) ) {
			$hasRTNodes = true;
			continue;
		}

		// Only nowiki spans and Parsoid-added wrappers may be looked through.
		$isSeeThrough = DOMUtils::hasTypeOf( $child, 'mw:Nowiki' ) ||
			DOMDataUtils::getDataParsoid( $child )->getTempFlag( TempData::WRAPPER );
		if ( !$isSeeThrough || !self::isEmptyNode( $child, $hasRTNodes ) ) {
			return false;
		}
	}

	return true;
}
110 | |
/**
 * Template-wrapping attributes that can be ignored while looking for
 * empty elements. The names are the array keys so that membership can be
 * tested with isset(); the values (1) carry no meaning of their own.
 *
 * Note that data-mw & data-parsoid are unlikely to exist at this stage of
 * DOM processing. This is conservative but safe. In this case, it is also
 * sufficient since only p, li, tr can be deleted.
 */
public const ALLOWED_TPL_WRAPPER_ATTRS = [
	'about' => 1,
	'typeof' => 1,
];
119 | |
/**
 * Flag or delete elements that are "empty" (see ::isEmptyNode()).
 *
 * Only elements whose name is listed in Consts::$Output['FlaggedEmptyElts']
 * and that carry no attributes (other than the Parsoid data-object attribute
 * and, inside template content, the template-wrapping attributes) are
 * considered.
 *
 * @param Node $node
 * @param DTState $state
 * @return bool|Node|null true if $node was left in the DOM (possibly with the
 *   mw-empty-elt class added); otherwise the node that followed the deleted
 *   $node (null if it had no next sibling)
 */
public static function handleEmptyElements( Node $node, DTState $state ) {
	// Set by isEmptyNode() to indicate whether a node which is "empty" contained
	// invisible "rendering transparent" nodes.
	$hasRTNodes = false;

	if ( !( $node instanceof Element ) ||
		!isset( Consts::$Output['FlaggedEmptyElts'][DOMCompat::nodeName( $node )] ) ||
		!self::isEmptyNode( $node, $hasRTNodes )
	) {
		return true;
	}

	// Read tplInfo once, null-safely, so that both uses below agree with
	// how the rest of this class accesses it.
	$tplInfo = $state->tplInfo ?? null;

	foreach ( DOMUtils::attributes( $node ) as $name => $value ) {
		// Skip the Parsoid-added data attribute and template-wrapping attributes
		if ( $name === DOMDataUtils::DATA_OBJECT_ATTR_NAME ||
			( $tplInfo && isset( self::ALLOWED_TPL_WRAPPER_ATTRS[$name] ) )
		) {
			continue;
		}

		// Any other attribute means the element is not a candidate.
		return true;
	}

	// The node is known to be empty and a deletion candidate
	// - If node is part of template content and is not the
	//   first encapsulation wrapper node, and doesn't contain
	//   any rendering transparent nodes, it can be deleted.
	// - If not, we add the mw-empty-elt class so that wikis
	//   can decide what to do with them.
	if ( $tplInfo && $tplInfo->first !== $node && !$hasRTNodes ) {
		$nextNode = $node->nextSibling;
		$node->parentNode->removeChild( $node );
		return $nextNode;
	}

	DOMCompat::getClassList( $node )->add( 'mw-empty-elt' );
	return true;
}
168 | |
/**
 * Check whether $node, or any ancestor below the top of the tree
 * (as judged by DOMUtils::atTheTop()), is reported by
 * WTUtils::getNativeExt() as native extension content.
 *
 * FIXME: Worry about "about" siblings
 *
 * @param Env $env
 * @param Element $node
 * @return bool
 */
private static function inNativeContent( Env $env, Element $node ): bool {
	for ( $cursor = $node; !DOMUtils::atTheTop( $cursor ); $cursor = $cursor->parentNode ) {
		if ( WTUtils::getNativeExt( $env, $cursor ) !== null ) {
			return true;
		}
	}
	return false;
}
185 | |
/**
 * Trim whitespace from the start and the end of $node's content.
 * Whitespace in this function refers to [ \t] only.
 *
 * Whitespace-only text children are removed outright, rendering
 * transparent children are stepped over, and the first (resp. last)
 * other text child has its leading (resp. trailing) whitespace cut off.
 *
 * When $dsr is given, its leadingWS / trailingWS fields receive the number
 * of bytes trimmed on that side, or -1 if whitespace was trimmed after a
 * rendering transparent node had been stepped over (the count is then
 * not reliable).
 *
 * @param Element $node
 * @param ?DomSourceRange $dsr
 */
private static function trimWhiteSpace( Element $node, ?DomSourceRange $dsr ): void {
	// Leading side (on the first line)
	$removedBytes = 0;
	$countIsReliable = true;
	$sawTransparent = false;
	$cur = $node->firstChild;
	while ( $cur !== null ) {
		$isBlankText = $cur instanceof Text && preg_match( '/^[ \t]*$/D', $cur->nodeValue );
		if ( !$isBlankText && !WTUtils::isRenderingTransparentNode( $cur ) ) {
			// Reached real content; stop scanning.
			break;
		}
		$following = $cur->nextSibling;
		if ( $isBlankText ) {
			$removedBytes += strlen( $cur->nodeValue );
			$node->removeChild( $cur );
			$countIsReliable = !$sawTransparent;
		} else {
			// We are now skipping over a rendering transparent node
			// and will trim additional whitespace => we cannot reliably
			// maintain info about trimmed whitespace.
			$sawTransparent = true;
		}
		$cur = $following;
	}

	if ( $cur instanceof Text &&
		preg_match( '/^([ \t]+)([\s\S]*)$/D', $cur->nodeValue, $parts )
	) {
		$removedBytes += strlen( $parts[1] );
		$cur->nodeValue = $parts[2];
		$countIsReliable = !$sawTransparent;
	}

	if ( $dsr ) {
		$dsr->leadingWS = $countIsReliable ? $removedBytes : -1;
	}

	// Trailing side (on the last line)
	$removedBytes = 0;
	$countIsReliable = true;
	$sawTransparent = false;
	$cur = $node->lastChild;
	while ( $cur !== null ) {
		$isBlankText = $cur instanceof Text && preg_match( '/^[ \t]*$/D', $cur->nodeValue );
		if ( !$isBlankText && !WTUtils::isRenderingTransparentNode( $cur ) ) {
			// Reached real content; stop scanning.
			break;
		}
		$preceding = $cur->previousSibling;
		if ( $isBlankText ) {
			$removedBytes += strlen( $cur->nodeValue );
			$node->removeChild( $cur );
			$countIsReliable = !$sawTransparent;
		} else {
			// Same caveat as on the leading side: once a rendering
			// transparent node has been skipped, the count is unreliable.
			$sawTransparent = true;
		}
		$cur = $preceding;
	}

	if ( $cur instanceof Text &&
		preg_match( '/^([\s\S]*\S)([ \t]+)$/D', $cur->nodeValue, $parts )
	) {
		$removedBytes += strlen( $parts[2] );
		$cur->nodeValue = $parts[1];
		$countIsReliable = !$sawTransparent;
	}

	if ( $dsr ) {
		$dsr->trailingWS = $countIsReliable ? $removedBytes : -1;
	}
}
256 | |
/**
 * Final per-node cleanup: prune data-parsoid fields that are no longer
 * needed (autoInsertedEnd, src, tsr, tmp, extLinkContentOffsets, null dsr),
 * collapse dsr of fostered content, unwrap nowiki spans inside encapsulated
 * content, strip indent-pre marker metas, and trim whitespace inside
 * selected wikitext-originating tags.
 *
 * @param Node $node
 * @param DTState $state
 * @return bool|Node|null true to continue with $node->nextSibling; otherwise
 *   the node to continue from after $node was removed (null if none)
 */
public static function finalCleanup( Node $node, DTState $state ) {
	if ( !( $node instanceof Element ) ) {
		return true;
	}

	$nodeName = DOMCompat::nodeName( $node );
	$dp = DOMDataUtils::getDataParsoid( $node );

	// Delete from data parsoid, wikitext originating autoInsertedEnd info
	$hasWikitextAutoEnd = !empty( $dp->autoInsertedEnd ) &&
		!WTUtils::hasLiteralHTMLMarker( $dp ) &&
		isset( Consts::$WTTagsWithNoClosingTags[$nodeName] );
	if ( $hasWikitextAutoEnd ) {
		unset( $dp->autoInsertedEnd );
	}

	// Traversal isn't done with tplInfo for section tags, but we should
	// still clean them up as if they are the head of encapsulation.
	$isEncapHead = ( $state->tplInfo->first ?? null ) === $node ||
		WTUtils::isParsoidSectionTag( $node );

	// Remove dp.src from elements that have valid data-mw and dsr.
	// This should reduce data-parsoid bloat.
	//
	// Presence of data-mw is a proxy for us knowing how to serialize
	// this content from HTML. Token handlers should strip src for
	// content where data-mw isn't necessary and html2wt knows how to
	// handle the HTML markup.
	$hasDataMwAndDsr = DOMDataUtils::validDataMw( $node ) &&
		Utils::isValidDSR( $dp->dsr ?? null );
	$isPageProp = $nodeName === 'meta' &&
		str_starts_with( DOMCompat::getAttribute( $node, 'property' ) ?? '', 'mw:PageProp/' );
	$srcIsRedundant = ( $hasDataMwAndDsr && !$isPageProp ) ||
		// Transcluded nodes will not have dp.tsr set
		// and don't need dp.src either.
		( $isEncapHead && ( !$state->atTopLevel || empty( $dp->tsr ) ) );
	if ( $srcIsRedundant ) {
		unset( $dp->src );
	}

	// Remove tsr
	if ( property_exists( $dp, 'tsr' ) ) {
		unset( $dp->tsr );
	}

	// Remove temporary information
	// @phan-suppress-next-line PhanTypeObjectUnsetDeclaredProperty
	unset( $dp->tmp );
	// not stored in tmp currently
	unset( $dp->extLinkContentOffsets );

	// Various places, like ContentUtils::shiftDSR, can set this to `null`
	if ( property_exists( $dp, 'dsr' ) && $dp->dsr === null ) {
		unset( $dp->dsr );
	}

	// Make dsr zero-range for fostered content
	// to prevent selser from duplicating this content
	// outside the table from where this came.
	//
	// But, do not zero it out if the node has template encapsulation
	// information. That will be disastrous (see T54638, T54488).
	if ( !empty( $dp->fostered ) && !empty( $dp->dsr ) && !$isEncapHead ) {
		$dp->dsr->start = $dp->dsr->end;
	}

	// Strip nowiki spans from encapsulated content but leave behind
	// wrappers on root nodes since they have valid about ids and we
	// don't want to break the about-chain by stripping the wrapper
	// and associated ids (we cannot add an about id on the nowiki-ed
	// content since that would be a text node).
	$unwrapNowiki = ( $state->tplInfo ?? null ) &&
		!WTUtils::isEncapsulatedDOMForestRoot( $node ) &&
		DOMUtils::hasTypeOf( $node, 'mw:Nowiki' );
	if ( $unwrapNowiki ) {
		$container = $node->parentNode;
		DOMUtils::migrateChildren( $node, $container, $node->nextSibling );
		// Read nextSibling again only after the children have been
		// migrated, exactly as before, and resume the traversal there.
		$resumeAt = $node->nextSibling;
		$container->removeChild( $node );
		return $resumeAt;
	}

	// Strip IndentPre marker metas
	if ( PreHandler::isIndentPreWS( $node ) ) {
		$resumeAt = $node->nextSibling;
		$node->parentNode->removeChild( $node );
		return $resumeAt;
	}

	// Trim whitespace from some wikitext markup
	// not involving explicit HTML tags (T157481)
	if ( !WTUtils::hasLiteralHTMLMarker( $dp ) &&
		isset( Consts::$WikitextTagsWithTrimmableWS[$nodeName] )
	) {
		self::trimWhiteSpace( $node, $dp->dsr ?? null );
	}

	return true;
}
356 | |
/**
 * Store the node's data attributes via DOMDataUtils::storeDataAttribs(),
 * first deciding whether its data-parsoid should be discarded
 * (either because $env asks for that, or because the node is templated
 * content for which data-parsoid is unnecessary).
 *
 * @param array $usedIdIndex Passed through as the 'idIndex' option
 * @param Node $node
 * @param Env $env
 * @param DTState $state
 * @return bool|Node Always true: continue with $node->nextSibling
 */
public static function saveDataParsoid(
	array $usedIdIndex, Node $node, Env $env, DTState $state
) {
	if ( !( $node instanceof Element ) ) {
		return true;
	}

	$dp = DOMDataUtils::getDataParsoid( $node );
	$tplInfo = $state->tplInfo ?? null;

	// Traversal isn't done with tplInfo for section tags, but we should
	// still clean them up as if they are the head of encapsulation.
	$isEncapHead = ( $tplInfo->first ?? null ) === $node ||
		WTUtils::isParsoidSectionTag( $node );

	$discardDataParsoid = $env->discardDataParsoid;

	// Strip data-parsoid from templated content, where unnecessary.
	// - Always keep info for the first node.
	// - We can't remove data-parsoid from inside <references> text,
	//   as that's the only HTML representation we have left for it.
	if ( $tplInfo && !$isEncapHead && !self::inNativeContent( $env, $node ) ) {
		// FIXME: We can't remove dp from nodes with stx information
		// because the serializer uses stx information in some cases to
		// emit the right newline separators.
		//
		// For example, "a\n\nb" and "<p>a</p><p>b</p>" both generate
		// identical html but serialize to different wikitext.
		//
		// This is only needed for the last top-level node.
		$mustKeepForStx = !empty( $dp->stx ) && ( $tplInfo->last ?? null ) === $node;
		if ( !$mustKeepForStx ) {
			$discardDataParsoid = true;
		}
	}

	$storeOptions = [
		'discardDataParsoid' => $discardDataParsoid,
		// Even though we're passing in the `env`, this is the only place
		// we want the storage to happen, so don't refactor this in there.
		'storeInPageBundle' => $env->pageBundle,
		'idIndex' => $usedIdIndex,
		'env' => $env
	];
	DOMDataUtils::storeDataAttribs( $node, $storeOptions );

	return true;
}
412 | |
413 | } |