Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
40.99% |
66 / 161 |
|
14.29% |
1 / 7 |
CRAP | |
0.00% |
0 / 1 |
CleanUp | |
40.99% |
66 / 161 |
|
14.29% |
1 / 7 |
1463.40 | |
0.00% |
0 / 1 |
stripMarkerMetas | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
90 | |||
isEmptyNode | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
110 | |||
handleEmptyElements | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
132 | |||
inNativeContent | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
trimWhiteSpace | |
100.00% |
38 / 38 |
|
100.00% |
1 / 1 |
17 | |||
finalCleanup | |
77.78% |
28 / 36 |
|
0.00% |
0 / 1 |
30.32 | |||
saveDataParsoid | |
0.00% |
0 / 26 |
|
0.00% |
0 / 1 |
72 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html\DOM\Handlers; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Parsoid\Config\Env; |
8 | use Wikimedia\Parsoid\Core\DomSourceRange; |
9 | use Wikimedia\Parsoid\DOM\Comment; |
10 | use Wikimedia\Parsoid\DOM\Element; |
11 | use Wikimedia\Parsoid\DOM\Node; |
12 | use Wikimedia\Parsoid\DOM\Text; |
13 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
14 | use Wikimedia\Parsoid\NodeData\TempData; |
15 | use Wikimedia\Parsoid\Utils\DOMCompat; |
16 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
17 | use Wikimedia\Parsoid\Utils\DOMUtils; |
18 | use Wikimedia\Parsoid\Utils\DTState; |
19 | use Wikimedia\Parsoid\Utils\Utils; |
20 | use Wikimedia\Parsoid\Utils\WTUtils; |
21 | use Wikimedia\Parsoid\Wikitext\Consts; |
22 | use Wikimedia\Parsoid\Wt2Html\TT\PreHandler; |
23 | |
24 | class CleanUp { |
/**
 * Traversal handler that drops marker metas which are no longer needed.
 *
 * Indent-pre whitespace markers are kept in the DOM at this point (only
 * the parent's DSR open width is adjusted); every other strippable
 * marker is removed from its parent.
 *
 * @param Element $node
 * @return bool|Element True to let the traversal continue normally, or the
 *   former next sibling of $node when $node was removed from the DOM.
 */
public static function stripMarkerMetas( Element $node ) {
	// This meta tag can never carry data-mw. Had it been produced by a
	// template, a <pre> wrapper would hold the relevant data-mw & typeof.
	if ( PreHandler::isIndentPreWS( $node ) ) {
		$parentDsr = DOMDataUtils::getDataParsoid( $node->parentNode )->dsr ?? null;
		if ( $parentDsr ) {
			// @see explanation in PreHandler::newIndentPreWS()
			$parentDsr->openWidth = 1;
		}
		// The marker itself is stripped in the cleanup handler, since
		// DOM passes running until then may need DSR info from this tag.
		return true;
	}

	$strippable = DOMUtils::hasTypeOf( $node, "mw:Placeholder/UnclosedComment" );

	// Sometimes a non-tpl meta node might get the mw:Transclusion typeof
	// attached to it. If the node has valid data-mw, it has to be kept.
	if ( !$strippable && !DOMDataUtils::validDataMw( $node ) ) {
		$strippable =
			(
				DOMUtils::hasTypeOf( $node, 'mw:Placeholder/StrippedTag' ) &&
				// NOTE: In ComputeDSR, the width of these markers isn't
				// zeroed out because they stay in the DOM and
				// serializeDOMNode only handles a few cases of zero
				// width nodes.
				!DOMUtils::isNestedInListItem( $node )
			) ||
			DOMUtils::hasTypeOf( $node, 'mw:Transclusion' );
	}

	if ( !$strippable ) {
		return true;
	}

	$following = $node->nextSibling;
	$node->parentNode->removeChild( $node );
	// Hand back the old sibling: $node is no longer in the DOM, so the
	// traversal cannot continue from it.
	return $following;
}
70 | |
/**
 * Decide whether all children of $node count as "empty" content.
 *
 * The following are treated as empty:
 * - Comments, rendering transparent nodes, nowiki spans without content
 *   (all of which are stripped by the core parser).
 * - Text nodes consisting only of whitespace.
 * - Parsoid-added span wrappers around other "empty" nodes.
 *
 * @param Node $node
 * @param bool &$hasRTNodes Set to true if the node contained rendering transparent nodes.
 *   Note this value is only reliable if ::isEmptyNode() returns true.
 * @return bool
 */
private static function isEmptyNode( Node $node, bool &$hasRTNodes ): bool {
	$child = $node->firstChild;
	while ( $child !== null ) {
		if ( $child instanceof Text ) {
			// Anything other than pure whitespace is real content.
			if ( !preg_match( '/^[ \t\r\n]*$/D', $child->nodeValue ) ) {
				return false;
			}
		} elseif ( $child instanceof Element ) {
			if ( WTUtils::isRenderingTransparentNode( $child ) ) {
				$hasRTNodes = true;
			} else {
				// Only nowiki spans and Parsoid-added wrappers may be
				// looked into; any other element is content.
				$mayRecurse = DOMUtils::hasTypeOf( $child, 'mw:Nowiki' ) ||
					DOMDataUtils::getDataParsoid( $child )->getTempFlag( TempData::WRAPPER );
				if ( !$mayRecurse || !self::isEmptyNode( $child, $hasRTNodes ) ) {
					return false;
				}
			}
		} elseif ( !( $child instanceof Comment ) ) {
			// Unknown node kinds are conservatively treated as content.
			return false;
		}
		$child = $child->nextSibling;
	}
	return true;
}
112 | |
/**
 * Template-wrapping attributes that may be ignored while looking for
 * empty elements (keyed by attribute name for isset() lookups).
 *
 * Note that data-mw & data-parsoid are unlikely to exist at this stage
 * of DOM processing. This is conservative but safe, and also sufficient
 * here since only p, li, tr can be deleted.
 */
public const ALLOWED_TPL_WRAPPER_ATTRS = [ 'about' => 1, 'typeof' => 1 ];
121 | |
/**
 * Traversal handler that deletes or flags elements which are "empty".
 *
 * Only elements whose tag name is listed in
 * Consts::$Output['FlaggedEmptyElts'] and which carry no attributes
 * besides the ignorable ones are considered.
 *
 * @param Node $node
 * @param DTState $state
 * @return bool|Node True to continue normally, or the former next sibling
 *   of $node when $node was deleted.
 */
public static function handleEmptyElements( Node $node, DTState $state ) {
	if ( !( $node instanceof Element ) ) {
		return true;
	}
	if ( !isset( Consts::$Output['FlaggedEmptyElts'][DOMCompat::nodeName( $node )] ) ) {
		return true;
	}

	// Filled in by isEmptyNode(): did the "empty" node contain invisible
	// "rendering transparent" nodes?
	$hasRTNodes = false;
	if ( !self::isEmptyNode( $node, $hasRTNodes ) ) {
		return true;
	}

	foreach ( DOMUtils::attributes( $node ) as $name => $value ) {
		// The Parsoid-added data attribute never counts.
		if ( $name === DOMDataUtils::DATA_OBJECT_ATTR_NAME ) {
			continue;
		}
		// Inside template content, the template-wrapping attributes
		// don't count either.
		if ( ( $state->tplInfo ?? null ) && isset( self::ALLOWED_TPL_WRAPPER_ATTRS[$name] ) ) {
			continue;
		}
		// Any other attribute disqualifies the node.
		return true;
	}

	// The node is now known to be empty and a deletion candidate.
	// - If it is part of template content, is not the first
	//   encapsulation wrapper node, and doesn't contain any rendering
	//   transparent nodes, it can be deleted.
	// - Otherwise, the mw-empty-elt class is added so that wikis can
	//   decide what to do with it.
	$tplInfo = $state->tplInfo;
	if ( !$tplInfo || $tplInfo->first === $node || $hasRTNodes ) {
		DOMCompat::getClassList( $node )->add( 'mw-empty-elt' );
		return true;
	}

	$following = $node->nextSibling;
	$node->parentNode->removeChild( $node );
	return $following;
}
170 | |
/**
 * Check whether $node or one of its ancestors (below the top of the
 * tree) is reported by WTUtils::getNativeExt() as a native extension.
 *
 * FIXME: Worry about "about" siblings
 *
 * @param Env $env
 * @param Element $node
 * @return bool
 */
private static function inNativeContent( Env $env, Element $node ): bool {
	for ( $cursor = $node; !DOMUtils::atTheTop( $cursor ); $cursor = $cursor->parentNode ) {
		if ( WTUtils::getNativeExt( $env, $cursor ) !== null ) {
			return true;
		}
	}
	return false;
}
187 | |
/**
 * Trim leading and trailing whitespace from the content of $node and
 * record the number of trimmed bytes in $dsr (if provided).
 *
 * Whitespace in this function refers to [ \t] only.
 *
 * @param Element $node
 * @param ?DomSourceRange $dsr Receives leadingWS / trailingWS; each is set
 *   to -1 when the trimmed amount could not be tracked reliably.
 */
private static function trimWhiteSpace( Element $node, ?DomSourceRange $dsr ): void {
	// ---- Leading whitespace (on the first line) ----
	$removed = 0;
	$reliable = true;
	$sawTransparent = false;
	$cur = $node->firstChild;
	while ( $cur ) {
		$following = $cur->nextSibling;
		if ( $cur instanceof Text && preg_match( '/^[ \t]*$/D', $cur->nodeValue ) ) {
			// Whitespace-only text node: drop it entirely.
			$removed += strlen( $cur->nodeValue );
			$node->removeChild( $cur );
			$reliable = !$sawTransparent;
		} elseif ( WTUtils::isRenderingTransparentNode( $cur ) ) {
			// A rendering transparent node is being skipped over and more
			// whitespace may be trimmed after it => info about trimmed
			// whitespace can no longer be maintained reliably.
			$sawTransparent = true;
		} else {
			break;
		}
		$cur = $following;
	}

	// The node the scan stopped at may still start with whitespace.
	if ( $cur instanceof Text &&
		preg_match( '/^([ \t]+)([\s\S]*)$/D', $cur->nodeValue, $parts )
	) {
		$reliable = !$sawTransparent;
		$cur->nodeValue = $parts[2];
		$removed += strlen( $parts[1] );
	}

	if ( $dsr ) {
		if ( $reliable ) {
			$dsr->leadingWS = $removed;
		} else {
			$dsr->leadingWS = -1;
		}
	}

	// ---- Trailing whitespace (on the last line) ----
	$removed = 0;
	$reliable = true;
	$sawTransparent = false;
	$cur = $node->lastChild;
	while ( $cur ) {
		$preceding = $cur->previousSibling;
		if ( $cur instanceof Text && preg_match( '/^[ \t]*$/D', $cur->nodeValue ) ) {
			$removed += strlen( $cur->nodeValue );
			$node->removeChild( $cur );
			$reliable = !$sawTransparent;
		} elseif ( WTUtils::isRenderingTransparentNode( $cur ) ) {
			// Same caveat as for leading whitespace: tracking becomes
			// unreliable once a rendering transparent node is skipped.
			$sawTransparent = true;
		} else {
			break;
		}
		$cur = $preceding;
	}

	// The node the scan stopped at may still end with whitespace.
	if ( $cur instanceof Text &&
		preg_match( '/^([\s\S]*\S)([ \t]+)$/D', $cur->nodeValue, $parts )
	) {
		$reliable = !$sawTransparent;
		$cur->nodeValue = $parts[1];
		$removed += strlen( $parts[2] );
	}

	if ( $dsr ) {
		if ( $reliable ) {
			$dsr->trailingWS = $removed;
		} else {
			$dsr->trailingWS = -1;
		}
	}
}
258 | |
/**
 * Final per-node cleanup of data-parsoid and marker nodes.
 *
 * Prunes data-parsoid fields that are no longer needed (autoInsertedEnd,
 * src, tsr, null dsr), zeroes the DSR range of fostered content, unwraps
 * nowiki spans inside encapsulated content, removes indent-pre marker
 * metas and trims whitespace in some wikitext-originated elements.
 *
 * @param Node $node
 * @param DTState $state
 * @return bool|Node The next node or true to continue with $node->nextSibling
 */
public static function finalCleanup( Node $node, DTState $state ) {
	if ( !( $node instanceof Element ) ) {
		return true;
	}

	Assert::invariant( $state->atTopLevel, 'This pass should only be run on the top-level' );

	$dp = DOMDataUtils::getDataParsoid( $node );
	$tagName = DOMCompat::nodeName( $node );

	// Drop autoInsertedEnd info that originates from wikitext.
	$droppableAutoEnd = !empty( $dp->autoInsertedEnd ) &&
		!WTUtils::hasLiteralHTMLMarker( $dp ) &&
		isset( Consts::$WTTagsWithNoClosingTags[$tagName] );
	if ( $droppableAutoEnd ) {
		unset( $dp->autoInsertedEnd );
	}

	// Traversal isn't done with tplInfo for section tags, but they should
	// still be cleaned up as if they were the head of encapsulation.
	$isEncapHead = ( $state->tplInfo->first ?? null ) === $node ||
		WTUtils::isParsoidSectionTag( $node );

	// Remove dp.src from elements that have valid data-mw and dsr, to
	// reduce data-parsoid bloat.
	//
	// Presence of data-mw is a proxy for knowing how to serialize this
	// content from HTML. Token handlers should strip src for content
	// where data-mw isn't necessary and html2wt knows how to handle the
	// HTML markup.
	$hasValidDsr = DOMDataUtils::validDataMw( $node ) &&
		Utils::isValidDSR( $dp->dsr ?? null );
	$isPageProp = $tagName === 'meta' &&
		str_starts_with( DOMCompat::getAttribute( $node, 'property' ) ?? '', 'mw:PageProp/' );
	// Transcluded nodes will not have dp.tsr set and don't need dp.src
	// either, hence the second alternative.
	$srcIsRedundant = ( $hasValidDsr && !$isPageProp ) ||
		( $isEncapHead && empty( $dp->tsr ) );
	if ( $srcIsRedundant ) {
		unset( $dp->src );
	}

	// tsr is not needed past this point.
	if ( property_exists( $dp, 'tsr' ) ) {
		unset( $dp->tsr );
	}

	// Various places, like ContentUtils::shiftDSR, can set dsr to `null`.
	if ( property_exists( $dp, 'dsr' ) && $dp->dsr === null ) {
		unset( $dp->dsr );
	}

	// Make dsr zero-range for fostered content to prevent selser from
	// duplicating this content outside the table from where it came.
	//
	// But, do not zero it out if the node has template encapsulation
	// information. That will be disastrous (see T54638, T54488).
	if ( !empty( $dp->fostered ) && !empty( $dp->dsr ) && !$isEncapHead ) {
		$dp->dsr->start = $dp->dsr->end;
	}

	// Strip nowiki spans from encapsulated content but leave behind
	// wrappers on root nodes: those have valid about ids, and stripping
	// the wrapper would break the about-chain (an about id cannot be
	// added to the nowiki-ed content since that would be a text node).
	if ( ( $state->tplInfo ?? null ) &&
		!WTUtils::isEncapsulatedDOMForestRoot( $node ) &&
		DOMUtils::hasTypeOf( $node, 'mw:Nowiki' )
	) {
		$container = $node->parentNode;
		DOMUtils::migrateChildren( $node, $container, $node->nextSibling );
		// Read the sibling only after the migration, so that it reflects
		// the DOM as it is once the children have been moved out.
		$resumeAt = $node->nextSibling;
		$container->removeChild( $node );
		return $resumeAt;
	}

	// Strip IndentPre marker metas.
	if ( PreHandler::isIndentPreWS( $node ) ) {
		$following = $node->nextSibling;
		$node->parentNode->removeChild( $node );
		return $following;
	}

	// Trim whitespace from some wikitext markup not involving explicit
	// HTML tags (T157481).
	if ( !WTUtils::hasLiteralHTMLMarker( $dp ) &&
		isset( Consts::$WikitextTagsWithTrimmableWS[$tagName] )
	) {
		self::trimWhiteSpace( $node, $dp->dsr ?? null );
	}

	return true;
}
355 | |
/**
 * Replace data-parsoid with an empty, new blob on templated content
 * where the original information is unnecessary.
 *
 * @param Node $node
 * @param DTState $state
 * @return bool|Node Always true here, i.e. continue with $node->nextSibling
 */
public static function saveDataParsoid( Node $node, DTState $state ) {
	if ( !( $node instanceof Element ) ) {
		return true;
	}
	Assert::invariant( $state->atTopLevel, 'This pass should only be run on the top-level' );

	$env = $state->env;
	$dp = DOMDataUtils::getDataParsoid( $node );
	// Traversal isn't done with tplInfo for section tags, but they should
	// still be treated as if they were the head of encapsulation.
	$isEncapHead = ( $state->tplInfo->first ?? null ) === $node ||
		WTUtils::isParsoidSectionTag( $node );

	// Only templated content is a candidate for discarding.
	if ( !( $state->tplInfo ?? null ) ) {
		return true;
	}

	// Always keep info for the first node.
	if ( $isEncapHead ) {
		return true;
	}

	// data-parsoid can't be removed from inside <references> text, as
	// that's the only HTML representation left for it.
	if ( self::inNativeContent( $env, $node ) ) {
		return true;
	}

	// FIXME: dp can't be removed from nodes with stx information because
	// the serializer uses stx information in some cases to emit the
	// right newline separators.
	//
	// For example, "a\n\nb" and "<p>a</p><p>b</p>" both generate
	// identical html but serialize to different wikitext.
	//
	// This is only needed for the last top-level node.
	if ( !empty( $dp->stx ) && ( $state->tplInfo->last ?? null ) === $node ) {
		return true;
	}

	// data-parsoid cannot simply be unset: any later code calling
	// DOMDataUtils::getDataParsoid would reinitialize it to an empty
	// object. So the re-init is done here, with the IS_NEW flag set to
	// ensure DOMDataUtils::storeDataParsoid discards it if unmodified.
	// The empty data-parsoid blob is considered unmodified.
	$freshDp = new DataParsoid;
	$freshDp->setTempFlag( TempData::IS_NEW );
	DOMDataUtils::setDataParsoid( $node, $freshDp );

	return true;
}
409 | |
410 | } |