Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 281 |
|
0.00% |
0 / 10 |
CRAP | |
0.00% |
0 / 1 |
TableFixups | |
0.00% |
0 / 281 |
|
0.00% |
0 / 10 |
11990 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
stripDoubleTDs | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
110 | |||
isSimpleTemplatedSpan | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
fillDSRGap | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
hoistTransclusionInfo | |
0.00% |
0 / 55 |
|
0.00% |
0 / 1 |
182 | |||
collectAttributishContent | |
0.00% |
0 / 38 |
|
0.00% |
0 / 1 |
210 | |||
reparseTemplatedAttributes | |
0.00% |
0 / 29 |
|
0.00% |
0 / 1 |
72 | |||
combineAttrsWithPreviousCell | |
0.00% |
0 / 37 |
|
0.00% |
0 / 1 |
72 | |||
getReparseType | |
0.00% |
0 / 35 |
|
0.00% |
0 / 1 |
600 | |||
handleTableCellTemplates | |
0.00% |
0 / 62 |
|
0.00% |
0 / 1 |
702 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html\DOM\Handlers; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Parsoid\Config\Env; |
8 | use Wikimedia\Parsoid\Core\Sanitizer; |
9 | use Wikimedia\Parsoid\DOM\Comment; |
10 | use Wikimedia\Parsoid\DOM\Element; |
11 | use Wikimedia\Parsoid\DOM\Node; |
12 | use Wikimedia\Parsoid\DOM\Text; |
13 | use Wikimedia\Parsoid\NodeData\DataMw; |
14 | use Wikimedia\Parsoid\NodeData\TempData; |
15 | use Wikimedia\Parsoid\NodeData\TemplateInfo; |
16 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
17 | use Wikimedia\Parsoid\Utils\DOMCompat; |
18 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
19 | use Wikimedia\Parsoid\Utils\DOMUtils; |
20 | use Wikimedia\Parsoid\Utils\DTState; |
21 | use Wikimedia\Parsoid\Utils\PHPUtils; |
22 | use Wikimedia\Parsoid\Utils\Utils; |
23 | use Wikimedia\Parsoid\Utils\WTUtils; |
24 | use Wikimedia\Parsoid\Wt2Html\Frame; |
25 | use Wikimedia\Parsoid\Wt2Html\PegTokenizer; |
26 | |
27 | /** |
28 | * Provides DOMTraverser visitors that fix template-induced interrupted table cell parsing |
29 | * by recombining table cells and/or reparsing table cell content as attributes. |
30 | * - stripDoubleTDs |
31 | * - handleTableCellTemplates |
32 | */ |
33 | class TableFixups { |
/**
 * Tokenizer used to reparse wikitext fragments as table cell attributes.
 *
 * This is the regular tokenizer, but this class only ever calls
 * tokenizeTableCellAttributes() on it.
 */
private PegTokenizer $tokenizer;

/**
 * Set up the helper objects needed by the attribute-reparsing code paths.
 *
 * @param Env $env Environment the tokenizer is constructed with.
 */
public function __construct( Env $env ) {
	$this->tokenizer = new PegTokenizer( $env );
}
50 | |
/**
 * DOM visitor that collapses the doubled cell produced by wikitext like:
 * ```
 * |{{1x|{{!}} Foo}}
 * ```
 * The essentially-empty first cell is removed and its wikitext source is
 * prepended to the data-mw parts of the templated <td> that follows it.
 *
 * @see https://phabricator.wikimedia.org/T52603
 * @param Element $node
 * @param Frame $frame
 * @return bool|Node The following <td> when $node was removed, true otherwise.
 */
public function stripDoubleTDs( Element $node, Frame $frame ) {
	$sibling = $node->nextSibling;

	// Candidate shape: a non-literal-HTML, essentially empty $node that is
	// immediately followed by a non-literal-HTML <td> element.
	if ( WTUtils::isLiteralHTMLNode( $node ) ||
		!( $sibling instanceof Element ) ||
		DOMCompat::nodeName( $sibling ) !== 'td' ||
		WTUtils::isLiteralHTMLNode( $sibling ) ||
		!DiffDOMUtils::nodeEssentiallyEmpty( $node )
	) {
		return true;
	}

	// typeof is not set for nested templates, so fall back to a hacky
	// check of the recorded source for those.
	if ( !DOMUtils::hasTypeOf( $sibling, 'mw:Transclusion' ) &&
		!preg_match( '/^{{.*?}}$/D', DOMDataUtils::getDataParsoid( $sibling )->src ?? '' )
	) {
		return true;
	}

	// $node is about to be deleted, so the surviving cell's DSR has to
	// start where $node's DSR started.
	$removedDsr = DOMDataUtils::getDataParsoid( $node )->dsr ?? null;
	$siblingDp = DOMDataUtils::getDataParsoid( $sibling );
	if ( $removedDsr && !empty( $siblingDp->dsr ) ) {
		$siblingDp->dsr->start = $removedDsr->start;
	}

	// Record the deleted cell's source as the leading data-mw part.
	$siblingMw = DOMDataUtils::getDataMw( $sibling );
	$removedSrc = WTUtils::getWTSource( $frame, $node );
	$siblingMw->parts ??= [];
	array_unshift( $siblingMw->parts, $removedSrc );

	// Drop the duplicated <td>. Since it no longer exists, hand the
	// following cell back instead of continuing to process $node.
	$node->parentNode->removeChild( $node );
	return $sibling;
}
99 | |
/**
 * Is $node a <span> carrying the mw:Transclusion typeof whose children
 * are all text or comment nodes?
 *
 * @param Node $node
 * @return bool
 */
private function isSimpleTemplatedSpan( Node $node ): bool {
	$isTemplatedSpan = DOMCompat::nodeName( $node ) === 'span' &&
		DOMUtils::hasTypeOf( $node, 'mw:Transclusion' );

	return $isTemplatedSpan && DOMUtils::allChildrenAreTextOrComments( $node );
}
105 | |
/**
 * Append the source text found between two offsets to $parts.
 * Nothing is appended unless $offset1 is strictly smaller than $offset2.
 *
 * @param list<string|TemplateInfo> &$parts
 * @param Frame $frame Frame whose source text the offsets refer to.
 * @param int $offset1 Start offset of the gap.
 * @param int $offset2 End offset of the gap.
 */
private function fillDSRGap( array &$parts, Frame $frame, int $offset1, int $offset2 ): void {
	if ( $offset1 >= $offset2 ) {
		// No gap to fill.
		return;
	}

	$gapLength = $offset2 - $offset1;
	$parts[] = PHPUtils::safeSubstr( $frame->getSrcText(), $offset1, $gapLength );
}
117 | |
/**
 * Hoist transclusion information from cell content / attributes
 * onto the cell itself.
 *
 * The data-mw parts and data-parsoid "pi" info of all $transclusions are
 * merged (with source text plugged into any DSR gaps) and stored on $td,
 * which becomes the mw:Transclusion wrapper. The data-mw of every entry in
 * $transclusions is cleared and span wrappers sharing the last
 * transclusion's about id lose their encapsulation attributes.
 *
 * @param Frame $frame Frame providing the source text for DSR gaps.
 * @param array $transclusions Transclusion wrapper elements, in document
 *   order. Each one is expected to carry a valid DSR.
 * @param Element $td Cell that receives the combined transclusion info.
 * @param DTState $dtState Traversal state; its tplInfo is initialized to
 *   point at $td when it is not set yet.
 */
private function hoistTransclusionInfo(
	Frame $frame, array $transclusions, Element $td, DTState $dtState
): void {
	// Initialize dsr for $td
	// In `handleTableCellTemplates`, we're creating a cell w/o dsr info.
	$tdDp = DOMDataUtils::getDataParsoid( $td );
	if ( !Utils::isValidDSR( $tdDp->dsr ?? null ) ) {
		$tplDp = DOMDataUtils::getDataParsoid( $transclusions[0] );
		Assert::invariant( Utils::isValidDSR( $tplDp->dsr ?? null ), 'Expected valid DSR' );
		$tdDp->dsr = clone $tplDp->dsr;
	}

	// Build up $parts, $pi to set up the combined transclusion info on $td.
	// Note that content for all but the last template has been swallowed into
	// the attributes of $td.
	$parts = [];
	$pi = [];
	$lastTpl = null;
	$prevDp = null;

	$index = 0;
	foreach ( $transclusions as $tpl ) {
		$tplDp = DOMDataUtils::getDataParsoid( $tpl );
		Assert::invariant( Utils::isValidDSR( $tplDp->dsr ?? null ), 'Expected valid DSR' );

		// Plug DSR gaps between transclusions
		if ( !$prevDp ) {
			$this->fillDSRGap( $parts, $frame, $tdDp->dsr->start, $tplDp->dsr->start );
		} else {
			$this->fillDSRGap( $parts, $frame, $prevDp->dsr->end, $tplDp->dsr->start );
		}

		// Assimilate $tpl's data-mw and data-parsoid pi info
		$dmw = DOMDataUtils::getDataMw( $tpl );
		foreach ( $dmw->parts ?? [] as $part ) {
			// Template index is relative to other transclusions.
			// This index is used to extract whitespace information from
			// data-parsoid and that array only includes info for templates.
			// So skip over strings here.
			if ( !is_string( $part ) ) {
				// Cloning is strictly not needed here, but mimicking
				// code in WrapSectionsState.php
				$part = clone $part;
				$part->i = $index++;
			}
			$parts[] = $part;
		}
		PHPUtils::pushArray( $pi, $tplDp->pi ?? [ [] ] );
		DOMDataUtils::setDataMw( $tpl, null );

		$lastTpl = $tpl;
		$prevDp = $tplDp;
	}

	$aboutId = DOMCompat::getAttribute( $lastTpl, 'about' );

	// Hoist transclusion information to $td.
	$td->setAttribute( 'typeof', 'mw:Transclusion' );
	$td->setAttribute( 'about', $aboutId );

	// Add wikitext for the table cell content following $lastTpl
	$this->fillDSRGap( $parts, $frame, $prevDp->dsr->end, $tdDp->dsr->end );

	// Save the new data-mw on the td
	$dmw = new DataMw( [] );
	$dmw->parts = $parts;
	DOMDataUtils::setDataMw( $td, $dmw );
	$tdDp->pi = $pi;

	// td wraps everything now.
	// Remove template encapsulation from here on.
	// This simplifies the problem of analyzing the <td>
	// for additional fixups (|| Boo || Baz) by potentially
	// invoking 'reparseTemplatedAttributes' on split cells
	// with some modifications.
	$child = $lastTpl;

	// Transclusions may be nested in elements in some ugly wikitext so
	// make sure we're starting at a direct descendant of td
	while ( $child->parentNode !== $td ) {
		$child = $child->parentNode;
	}

	while ( $child ) {
		if (
			DOMCompat::nodeName( $child ) === 'span' &&
			DOMCompat::getAttribute( $child, 'about' ) === $aboutId
		) {
			// Remove the encapsulation attributes. If there are no more attributes left,
			// the span wrapper is useless and can be removed.
			$child->removeAttribute( 'about' );
			$child->removeAttribute( 'typeof' );
			if ( DOMDataUtils::noAttrs( $child ) ) {
				// Pick the resume point before the span is unwrapped: its
				// first child (about to become a child of $td) or, for an
				// empty span, its next sibling.
				$next = $child->firstChild ?: $child->nextSibling;
				DOMUtils::migrateChildren( $child, $td, $child );
				$child->parentNode->removeChild( $child );
				$child = $next;
				continue;
			}
		}
		$child = $child->nextSibling;
	}

	// $dtState->tplInfo can be null when information is hoisted
	// from children to $td because DOMTraverser hasn't seen the
	// children yet!
	if ( !$dtState->tplInfo ) {
		$dtState->tplInfo = (object)[
			'first' => $td,
			'last' => $td,
			'clear' => false
		];
	}
}
238 | |
/**
 * Collect potential attribute content.
 *
 * We expect this to be text nodes without a pipe character followed by one or
 * more nowiki spans, followed by a template encapsulation with pure-text and
 * nowiki content. Collection stops when encountering a pipe character.
 *
 * @param Env $env Not used by this method.
 * @param Element $cell known to be <td> / <th>
 * @param ?Element $templateWrapper When non-null, used as the first entry
 *   of the returned 'transclusions' list.
 * @return ?array Null when the walk over $cell's content ended without
 *   hitting a stop condition (a lone pipe in a text node, a wikilink or a
 *   generated figure). Otherwise an array with the keys:
 *   - 'txt': concatenation of the collected text, with every nowiki span
 *     replaced by the literal marker "<nowiki-marker>"
 *   - 'nowikis': text content of the nowiki spans, in encounter order
 *   - 'transclusions': elements with the mw:Transclusion typeof that were
 *     visited (preceded by $templateWrapper, if given)
 */
public function collectAttributishContent(
	Env $env, Element $cell, ?Element $templateWrapper
): ?array {
	$buf = [];
	$nowikis = [];
	$transclusions = $templateWrapper ? [ $templateWrapper ] : [];

	// Some of this logic could be replaced by DSR-based recovery of
	// wikitext that is outside templates. But since we have to walk over
	// templated content in this fashion anyway, we might as well use the
	// same logic uniformly.

	// Walks $child and its following siblings (recursing into plain
	// elements). Returns true as soon as collection should stop.
	$traverse = static function ( ?Node $child ) use (
		&$traverse, &$buf, &$nowikis, &$transclusions
	): bool {
		while ( $child ) {
			if ( $child instanceof Comment ) {
				// Legacy parser strips comments during parsing => drop them.
			} elseif ( $child instanceof Text ) {
				$text = $child->nodeValue;
				$buf[] = $text;

				// Are we done accumulating?
				// (i.e. does the text contain a "|" that is not part of "||"?)
				if ( preg_match( '/(?:^|[^|])\|(?:[^|]|$)/D', $text ) ) {
					return true;
				}
			} else {
				'@phan-var Element $child'; /** @var Element $child */
				if ( DOMUtils::hasTypeOf( $child, 'mw:Transclusion' ) ) {
					$transclusions[] = $child;
				}

				if ( WTUtils::isFirstExtensionWrapperNode( $child ) ) {
					// "|" chars in extension content don't trigger table-cell parsing
					// since they have higher precedence in tokenization. The extension
					// content will simply be dropped (but any side effects it had will
					// continue to apply. Ex: <ref> tags might leave an orphaned ref in
					// the <references> section).
					$child = WTUtils::skipOverEncapsulatedContent( $child );
					// $child has already been advanced => skip the nextSibling step below.
					continue;
				} elseif ( DOMUtils::hasTypeOf( $child, 'mw:Entity' ) ) {
					// Get entity's wikitext source, not rendered content.
					// "&#10;" is "\n" which breaks attribute parsing!
					$buf[] = DOMDataUtils::getDataParsoid( $child )->src ?? $child->textContent;
				} elseif ( DOMUtils::hasTypeOf( $child, 'mw:Nowiki' ) ) {
					// Nowiki spans were added to protect otherwise
					// meaningful wikitext chars used in attributes.
					// Save the content and add in a marker to splice out later.
					$nowikis[] = $child->textContent;
					$buf[] = '<nowiki-marker>';
				} elseif ( DOMUtils::hasRel( $child, 'mw:WikiLink' ) ||
					WTUtils::isGeneratedFigure( $child )
				) {
					// Wikilinks/images abort attribute parsing
					return true;
				} else {
					if ( $traverse( $child->firstChild ) ) {
						return true;
					}
				}
			}

			$child = $child->nextSibling;
		}

		return false;
	};

	if ( $traverse( $cell->firstChild ) ) {
		return [
			'txt' => implode( '', $buf ),
			'nowikis' => $nowikis,
			'transclusions' => $transclusions,
		];
	} else {
		return null;
	}
}
329 | |
/**
 * T46498, second part of T52603
 *
 * Handle wikitext like
 * ```
 * {|
 * |{{nom|Bar}}
 * |}
 * ```
 * where nom expands to `style="foo" class="bar"|Bar`. The attributes are
 * tokenized and stripped from the table contents.
 *
 * This method works well for the templates documented in
 * https://en.wikipedia.org/wiki/Template:Table_cell_templates/doc
 *
 * Nevertheless, there are some limitations:
 * - We assume that attributes don't contain wiki markup (apart from <nowiki>)
 *   and end up in text or nowiki nodes.
 * - Only a single table cell is produced / opened by the template that
 *   contains the attributes. This limitation could be lifted with more
 *   aggressive re-parsing if really needed in practice.
 * - There is only a single transclusion in the table cell content. This
 *   limitation can be lifted with more advanced data-mw construction.
 *
 * @param Frame $frame
 * @param Element $cell known to be <td> / <th>
 * @param ?Element $templateWrapper Passed through to collectAttributishContent().
 * @param DTState $dtState Passed through to hoistTransclusionInfo().
 */
public function reparseTemplatedAttributes(
	Frame $frame, Element $cell, ?Element $templateWrapper, DTState $dtState
): void {
	$env = $frame->getEnv();
	// Collect attribute content and examine it
	$attributishContent = $this->collectAttributishContent( $env, $cell, $templateWrapper );
	if ( !$attributishContent ) {
		return;
	}

	/**
	 * FIXME: These checks are insufficient.
	 * Previous rounds of table fixups might have created this cell without
	 * any templated content (the while loop in handleTableCellTemplates).
	 * Till we figure out a reliable test for this, we'll reparse attributes always.
	 *
	 * // This DOM pass is trying to bridge broken parses across
	 * // template boundaries. So, if templates aren't involved,
	 * // no reason to reparse.
	 * if ( count( $attributishContent['transclusions'] ) === 0 &&
	 *      !WTUtils::fromEncapsulatedContent( $cell )
	 * ) {
	 *     return;
	 * }
	 */

	// The attributish prefix is everything up to and including the first
	// "|" that is not immediately followed by another "|".
	$attrText = $attributishContent['txt'];
	if ( !preg_match( '/(^[^|]+\|)([^|]|$)/D', $attrText, $matches ) ) {
		return;
	}
	$attributishPrefix = $matches[1];

	// Splice in nowiki content. We added in <nowiki> markers to prevent the
	// above regexps from matching on nowiki-protected chars.
	if ( str_contains( $attributishPrefix, '<nowiki-marker>' ) ) {
		$attributishPrefix = preg_replace_callback(
			'/<nowiki-marker>/',
			static function ( $unused ) use ( &$attributishContent ) {
				// This is a little tricky. We want to use the content from the
				// nowikis to reparse the string to key/val pairs but the rule,
				// single_cell_table_args, will invariably get tripped up on
				// newlines which, to this point, were shuttled through in the
				// nowiki. Core sanitizer will do this replacement in attr vals
				// so it's a safe normalization to do here.
				//
				// array_shift() yields null once the collected nowikis run
				// out (e.g. the marker string showed up in plain text), and
				// preg_replace() rejects a null subject under strict_types.
				return preg_replace(
					'/\s+/', ' ', array_shift( $attributishContent['nowikis'] ) ?? ''
				);
			},
			$attributishPrefix
		);
	}

	// re-parse the attributish prefix
	$attributeTokens = $this->tokenizer->tokenizeTableCellAttributes( $attributishPrefix, false );

	// No attributes => nothing more to do!
	if ( !$attributeTokens ) {
		return;
	}

	// Note that `row_syntax_table_args` (the rule used for tokenizing above)
	// returns an array consisting of [table_attributes, spaces, pipe]
	$attrs = $attributeTokens[0];

	// Sanitize attrs and transfer them to the td node
	Sanitizer::applySanitizedArgs( $env->getSiteConfig(), $cell, $attrs );
	$cellDp = DOMDataUtils::getDataParsoid( $cell );
	// Reparsed cells start off as non-mergeable-table cells
	// and preserve that property after reparsing
	$cellDp->setTempFlag( TempData::NON_MERGEABLE_TABLE_CELL );
	$cellDp->setTempFlag( TempData::NO_ATTRS, false );

	// If the transclusion node was embedded within the td node,
	// lift up the about group to the td node.
	$transclusions = $attributishContent['transclusions'];
	if ( $transclusions && ( $cell !== $transclusions[0] || count( $transclusions ) > 1 ) ) {
		$this->hoistTransclusionInfo( $frame, $transclusions, $cell, $dtState );
	}

	// Drop content that has been consumed by the reparsed attribute content.
	// NOTE: We serialize and reparse data-object-id attributes as well which
	// ensures stashed data-* attributes continue to be usable.
	// FIXME: This is too naive. What about all the care we showed in `collectAttributishContent`?
	DOMCompat::setInnerHTML( $cell,
		preg_replace( '/^[^|]*\|/', '', DOMCompat::getInnerHTML( $cell ) ) );
}
440 | |
/**
 * Reparse the source of $cell's previous sibling cell as table cell
 * attributes and apply them to a single combined cell.
 *
 * Possibilities:
 * - $cell and $prev are both <td>s (or both <th>s)
 *   - Common case
 *   - Ex: "|align=left {{tpl returning | foobar}}"
 *     So, "|align=left |foobar" is the combined string
 *   - Combined cell is a <td> (will be <th> if both were <th>s)
 *   - We assign new attributes to $cell and drop $prev
 * - $cell is <td> and $prev is <th>
 *   - Ex: "!align=left {{tpl returning | foobar}}"
 *     So, "!align=left |foobar" is the combined string
 *   - The combined cell will be a <th> with attributes "align=left"
 *     and content "foobar"
 * - $cell is <th> and $prev is <td>
 *   - Ex: "|align=left {{tpl returning !scope=row | foobar}}"
 *     So "|align=left !scope=row | foobar" is the combined string
 *     and we need to drop the th-attributes entirely after combining
 *   - The combined cell will be a <td>
 *   - $cell's attribute is dropped
 *   - $prev's content is dropped
 *
 * FIXME: There are a number of other merge possibilities that end up
 * in this function that aren't accounted for yet! Couple of them are
 * in the unsupported scenario 1/2 buckets below.
 *
 * @param Frame $frame
 * @param Element $cell Cell whose previous sibling is an element with DSR info.
 * @return bool False when the previous cell's source could not be
 *   tokenized as cell attributes (the DOM is left untouched in that case),
 *   true once the cells have been combined.
 */
private function combineAttrsWithPreviousCell( Frame $frame, Element $cell ): bool {
	// UNSUPPORTED SCENARIO 1:
	// In this cell-combining scenario, $prev can have attributes only if it
	// also had content. See example below:
	// Ex: |class="foo"|bar{{1x|1={{!}}foo}}
	// should parse as <td class="foo">bar|foo</td>
	// In this case, the attributes/content of $cell would end up as the
	// content of the combined cell.
	//
	// UNSUPPORTED SCENARIO 2:
	// The template produced attributes as well as maybe a new cell.
	// Ex: |class="foo"{{1x| foo}} and |class="foo"{{1x| foo}}
	// We let the more general 'reparseTemplatedAttributes' code handle
	// this scenario for now.
	$prev = $cell->previousSibling;
	DOMUtils::assertElt( $prev );
	$prevDp = DOMDataUtils::getDataParsoid( $prev );

	// Build the attribute string
	$prevCellSrc = PHPUtils::safeSubstr(
		$frame->getSrcText(), $prevDp->dsr->start, $prevDp->dsr->length() );
	// "|" or "!", but doesn't matter since we discard that anyway
	$reparseSrc = substr( $prevCellSrc, $prevDp->dsr->openWidth ) . "|";

	// Reparse the attributish prefix
	$env = $frame->getEnv();
	$attributeTokens = $this->tokenizer->tokenizeTableCellAttributes( $reparseSrc, false );
	if ( !is_array( $attributeTokens ) ) {
		$env->log( "error/wt2html",
			"TableFixups: Failed to successfully reparse $reparseSrc as table cell attributes" );
		return false;
	}

	// Update data-mw, DSR if $cell is an encapsulation wrapper
	$cellDp = DOMDataUtils::getDataParsoid( $cell );
	if ( DOMUtils::hasTypeOf( $cell, 'mw:Transclusion' ) ) {
		$dataMW = DOMDataUtils::getDataMw( $cell );
		// parts may be unset; array_unshift() needs an actual array.
		$dataMW->parts ??= [];
		array_unshift( $dataMW->parts, $prevCellSrc );
		$cellDSR = $cellDp->dsr ?? null;
		if ( $cellDSR && $cellDSR->start ) {
			// The wrapper now also covers the previous cell's source.
			$cellDSR->start -= strlen( $prevCellSrc );
		}
	}

	$parent = $cell->parentNode;
	if ( DOMCompat::nodeName( $cell ) === DOMCompat::nodeName( $prev ) ) {
		// Matching cell types
		$combinedCell = $cell;
		$combinedCellDp = $cellDp;
		$parent->removeChild( $prev );
	} else {
		// Different cell types
		$combinedCell = $prev;

		// Remove all content on $prev which will
		// become the new combined cell
		while ( $prev->firstChild ) {
			$prev->removeChild( $prev->firstChild );
		}
		// Note that this implicitly migrates data-mw and data-parsoid
		foreach ( DOMUtils::attributes( $cell ) as $k => $v ) {
			$combinedCell->setAttribute( $k, $v );
		}
		DOMUtils::migrateChildren( $cell, $combinedCell );
		$parent->removeChild( $cell );

		$combinedCellDp = DOMDataUtils::getDataParsoid( $combinedCell );
	}

	// Note that `row_syntax_table_args` (the rule used for tokenizing above)
	// returns an array consisting of [table_attributes, spaces, pipe]
	$attrs = $attributeTokens[0];
	Sanitizer::applySanitizedArgs( $env->getSiteConfig(), $combinedCell, $attrs );
	// Combined cells don't merge further
	$combinedCellDp->setTempFlag( TempData::NON_MERGEABLE_TABLE_CELL );
	$combinedCellDp->setTempFlag( TempData::NO_ATTRS, false );

	return true;
}
549 | |
// Reparse types returned by getReparseType() and acted upon by
// handleTableCellTemplates().

// Nothing to fix up: the cell is left as is.
private const NO_REPARSING = 0;
// The cell should be combined with its previous sibling cell
// (see combineAttrsWithPreviousCell).
private const COMBINE_WITH_PREV_CELL = 1;
// The cell's content needs to be examined for templated attributes
// and/or hidden additional cells.
private const OTHER_REPARSE = 2;
553 | |
/**
 * Decide which kind of fixup (if any) $cell needs.
 *
 * $cell is known to be <td>/<th>
 *
 * @param Element $cell
 * @param DTState $dtState Traversal state; a non-null tplInfo means that
 *   $cell is being visited inside template content.
 * @return int One of self::NO_REPARSING, self::COMBINE_WITH_PREV_CELL
 *   or self::OTHER_REPARSE.
 */
private function getReparseType( Element $cell, DTState $dtState ): int {
	$inTplContent = $dtState->tplInfo !== null;
	$dp = DOMDataUtils::getDataParsoid( $cell );
	if ( !$dp->getTempFlag( TempData::NON_MERGEABLE_TABLE_CELL ) &&
		!$dp->getTempFlag( TempData::FAILED_REPARSE ) &&
		// This is a good proxy for what we need: "Is $cell a template wrapper?".
		// That info won't be available for nested templates unless we want
		// to use the more expensive hacky check as used in "stripDoubleTDs"
		// above. "inTplContent" is sufficient because we won't have mergeable
		// cells for wikitext that doesn't get any part of its content from
		// templates because NON_MERGEABLE_TABLE_CELL prevents such merges.
		$inTplContent
	) {
		// Look for opportunities where table cells could combine. This requires
		// $cell to be a templated cell. But, we don't support combining
		// templated cells with other templated cells. So, previous sibling
		// cannot be templated.
		//
		// So, bail out of scenarios where prevDp comes from a template (the checks
		// for isValidDSR( $prevDp->dsr ) and valid opening tag width catch this).
		$prev = $cell->previousSibling;
		$prevDp = $prev instanceof Element ? DOMDataUtils::getDataParsoid( $prev ) : null;
		if ( $prevDp &&
			!WTUtils::hasLiteralHTMLMarker( $prevDp ) &&
			Utils::isValidDSR( $prevDp->dsr ?? null, true ) &&
			!DOMUtils::hasTypeOf( $prev, 'mw:Transclusion' ) &&
			!str_contains( DOMCompat::getInnerHTML( $prev ), "\n" )
		) {
			return self::COMBINE_WITH_PREV_CELL;
		}
	}

	// Scan the cell's children for cell-separator characters in templated
	// content: "|" for <td>, "|" or "!" for <th>.
	$cellIsTd = DOMCompat::nodeName( $cell ) === 'td';
	$testRE = $cellIsTd ? '/[|]/' : '/[!|]/';
	$child = $cell->firstChild;
	while ( $child ) {
		// From the first transclusion child onwards, treat content as templated.
		if ( !$inTplContent && DOMUtils::hasTypeOf( $child, 'mw:Transclusion' ) ) {
			$inTplContent = true;
		}

		if ( $inTplContent &&
			$child instanceof Text &&
			preg_match( $testRE, $child->textContent )
		) {
			return self::OTHER_REPARSE;
		}

		if ( $child instanceof Element ) {
			if ( WTUtils::isFirstExtensionWrapperNode( $child ) ) {
				// "|" chars in extension/language variant content don't trigger
				// table-cell parsing since they have higher precedence in tokenization
				$child = WTUtils::skipOverEncapsulatedContent( $child );
			} else {
				if ( DOMUtils::hasRel( $child, 'mw:WikiLink' ) ||
					WTUtils::isGeneratedFigure( $child )
				) {
					// Wikilinks/images abort attribute parsing
					return self::NO_REPARSING;
				}
				// FIXME: Ugly for now
				$outerHTML = DOMCompat::getOuterHTML( $child );
				if ( preg_match( $testRE, $outerHTML ) &&
					( $inTplContent || preg_match( '/"mw:Transclusion"/', $outerHTML ) )
				) {
					// A "|" char in the HTML will trigger table cell tokenization.
					// Ex: "| foobar <div> x | y </div>" will split the <div>
					// in table-cell tokenization context.
					return self::OTHER_REPARSE;
				}
				$child = $child->nextSibling;
			}
		} else {
			$child = $child->nextSibling;
		}
	}

	return self::NO_REPARSING;
}
635 | |
/**
 * In a wikitext-syntax-table-parsing context, the meaning of
 * "|", "||", "!", "!!" is context-sensitive. Additionally, the
 * complete syntactical construct for a table cell (including leading
 * pipes, attributes, and content-separating pipe char) might straddle
 * a template boundary - with some content coming from the top-level and
 * some from a template.
 *
 * This impacts parsing of tables when some cells are templated since
 * Parsoid parses template content independent of top-level content
 * (without any preceding context). This means that Parsoid's table-cell
 * parsing in templated contexts might be incorrect.
 *
 * To deal with this, Parsoid implements this table-fixups pass that
 * has to deal with cell-merging and cell-reparsing scenarios.
 *
 * HTML-syntax cells and non-templated cells without any templated content
 * are not subject to this transformation and can be skipped right away.
 *
 * FIXME: This pass can benefit from a customized processor rather than
 * piggyback on top of DOMTraverser since the DOM can be significantly
 * mutated in these handlers.
 *
 * @param Element $cell $cell is known to be <td>/<th>
 * @param Frame $frame
 * @param DTState $dtState
 * @return bool|Element True in all cases except when combining $cell with
 *   its previous sibling failed; $cell itself is returned then so that the
 *   handler is run on it again.
 */
public function handleTableCellTemplates( Element $cell, Frame $frame, DTState $dtState ) {
	if ( WTUtils::isLiteralHTMLNode( $cell ) ) {
		return true;
	}

	// The cell's node name does not change in this method, so look it up once.
	$cellName = DOMCompat::nodeName( $cell );
	$isTd = $cellName === 'td';

	// Deal with <th> special case where "!! foo" is parsed as <th>! foo</th>
	// but should have been parsed as <th>foo</th> when not the first child
	if ( $cellName === 'th' &&
		DOMUtils::hasTypeOf( $cell, 'mw:Transclusion' ) &&
		// This is checking that previous sibling is not "\n" which would
		// signal that this <th> is on a fresh line and the "!" shouldn't be stripped.
		// If this weren't template output, we would check for "stx" === 'row'.
		// FIXME: Note that this check is fragile and doesn't work always, but this is
		// the price we pay for Parsoid's independent template parsing!
		$cell->previousSibling instanceof Element
	) {
		$fc = DiffDOMUtils::firstNonSepChild( $cell );
		if ( $fc instanceof Text ) {
			$leadingText = $fc->nodeValue;
			if ( str_starts_with( $leadingText, "!" ) ) {
				$fc->nodeValue = substr( $leadingText, 1 );
			}
		}
	}

	$reparseType = $this->getReparseType( $cell, $dtState );
	if ( $reparseType === self::NO_REPARSING ) {
		return true;
	}

	$cellDp = DOMDataUtils::getDataParsoid( $cell );
	if ( $reparseType === self::COMBINE_WITH_PREV_CELL ) {
		if ( $this->combineAttrsWithPreviousCell( $frame, $cell ) ) {
			return true;
		} else {
			// Clear property and retry $cell for other reparses
			// The DOMTraverser will resume the handler on the
			// returned $cell.
			$cellDp->setTempFlag( TempData::FAILED_REPARSE );
			return $cell;
		}
	}

	// If the cell didn't have attrs, extract and reparse templated attrs
	if ( $cellDp->getTempFlag( TempData::NO_ATTRS ) ) {
		$templateWrapper = DOMUtils::hasTypeOf( $cell, 'mw:Transclusion' ) ? $cell : null;
		$this->reparseTemplatedAttributes( $frame, $cell, $templateWrapper, $dtState );
	}

	// Now, examine the <td> to see if it hides additional <td>s
	// and split it up if required.
	//
	// DOMTraverser will process the new cell and invoke
	// handleTableCellTemplates on it which ensures that
	// if any additional attribute fixup or splits are required,
	// they will get done.
	$newCell = null;
	$ownerDoc = $cell->ownerDocument;
	$child = $cell->firstChild;
	while ( $child ) {
		// Remember the next sibling now: $child may be moved into $newCell.
		$next = $child->nextSibling;

		if ( $newCell ) {
			// Everything after the split point belongs to the new cell.
			$newCell->appendChild( $child );
		} elseif ( $child instanceof Text || $this->isSimpleTemplatedSpan( $child ) ) {
			// FIXME: This skips over scenarios like <div>foo||bar</div>.
			$hasSpanWrapper = !( $child instanceof Text );
			$match = $match1 = $match2 = null;

			// Find the first match of ||
			preg_match( '/^((?:[^|]*(?:\|[^|])?)*)\|\|([^|].*)?$/D', $child->textContent, $match1 );
			if ( $isTd ) {
				$match = $match1;
			} else {
				// Find the first match !!
				preg_match( '/^((?:[^!]*(?:\![^!])?)*)\!\!([^!].*)?$/D', $child->textContent, $match2 );

				// Pick the shortest match
				if ( $match1 && $match2 ) {
					$match = strlen( $match1[1] ?? '' ) < strlen( $match2[1] ?? '' )
						? $match1
						: $match2;
				} else {
					$match = $match1 ?: $match2;
				}
			}

			if ( $match ) {
				// Text before the separator stays in $cell; text after it
				// becomes the initial content of the new cell.
				$child->textContent = $match[1] ?? '';

				$newCell = $ownerDoc->createElement( $cellName );
				if ( $hasSpanWrapper ) {
					/**
					 * $hasSpanWrapper above ensures $child is a span.
					 *
					 * @var Element $child
					 */
					'@phan-var Element $child';
					// Fix up transclusion wrapping
					$about = DOMCompat::getAttribute( $child, 'about' );
					$this->hoistTransclusionInfo( $frame, [ $child ], $cell, $dtState );
				} else {
					// Refetch the about attribute since 'reparseTemplatedAttributes'
					// might have added one to it.
					$about = DOMCompat::getAttribute( $cell, 'about' );
				}

				// about may not be present if the cell was inside
				// wrapped template content rather than being part
				// of the outermost wrapper.
				if ( $about !== null ) {
					$newCell->setAttribute( 'about', $about );
					if ( $dtState->tplInfo && $dtState->tplInfo->last === $cell ) {
						$dtState->tplInfo->last = $newCell;
					}
				}
				$newCell->appendChild( $ownerDoc->createTextNode( $match[2] ?? '' ) );
				$cell->parentNode->insertBefore( $newCell, $cell->nextSibling );

				// Set data-parsoid noAttrs flag
				$newCellDp = DOMDataUtils::getDataParsoid( $newCell );
				// This new cell has 'row' stx (would be set if the tokenizer had parsed it)
				$newCellDp->stx = 'row';
				$newCellDp->setTempFlag( TempData::NO_ATTRS );
				// It is important to set this so that when $newCell is processed by this pass,
				// it won't accidentally recombine again with the previous cell!
				$newCellDp->setTempFlag( TempData::NON_MERGEABLE_TABLE_CELL );
			}
		}

		$child = $next;
	}

	return true;
}
801 | } |