Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 249 |
|
0.00% |
0 / 10 |
CRAP | |
0.00% |
0 / 1 |
TableFixups | |
0.00% |
0 / 249 |
|
0.00% |
0 / 10 |
9506 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
stripDoubleTDs | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
132 | |||
isSimpleTemplatedSpan | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
fillDSRGap | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
hoistTransclusionInfo | |
0.00% |
0 / 48 |
|
0.00% |
0 / 1 |
182 | |||
collectAttributishContent | |
0.00% |
0 / 38 |
|
0.00% |
0 / 1 |
210 | |||
reparseTemplatedAttributes | |
0.00% |
0 / 26 |
|
0.00% |
0 / 1 |
72 | |||
combineWithPreviousCell | |
0.00% |
0 / 34 |
|
0.00% |
0 / 1 |
72 | |||
getReparseType | |
0.00% |
0 / 27 |
|
0.00% |
0 / 1 |
342 | |||
handleTableCellTemplates | |
0.00% |
0 / 50 |
|
0.00% |
0 / 1 |
380 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html\PP\Handlers; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Parsoid\Config\Env; |
8 | use Wikimedia\Parsoid\Core\Sanitizer; |
9 | use Wikimedia\Parsoid\DOM\Comment; |
10 | use Wikimedia\Parsoid\DOM\Element; |
11 | use Wikimedia\Parsoid\DOM\Node; |
12 | use Wikimedia\Parsoid\DOM\Text; |
13 | use Wikimedia\Parsoid\NodeData\TempData; |
14 | use Wikimedia\Parsoid\Utils\DOMCompat; |
15 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
16 | use Wikimedia\Parsoid\Utils\DOMUtils; |
17 | use Wikimedia\Parsoid\Utils\PHPUtils; |
18 | use Wikimedia\Parsoid\Utils\Utils; |
19 | use Wikimedia\Parsoid\Utils\WTUtils; |
20 | use Wikimedia\Parsoid\Wt2Html\Frame; |
21 | use Wikimedia\Parsoid\Wt2Html\PegTokenizer; |
22 | |
23 | /** |
24 | * Provides DOMTraverser visitors that fix template-induced interrupted table cell parsing |
25 | * by recombining table cells and/or reparsing table cell content as attributes. |
26 | * - stripDoubleTDs |
27 | * - handleTableCellTemplates |
28 | */ |
29 | class TableFixups { |
30 | /** |
31 | * @var PegTokenizer |
32 | */ |
private PegTokenizer $tokenizer;

/**
 * @param Env $env
 */
public function __construct( Env $env ) {
	// Helper object for the attribute-reparsing methods of this class.
	// This is the regular tokenizer, but only its
	// tokenizeTableCellAttributes entry point is used here.
	$this->tokenizer = new PegTokenizer( $env );
}
49 | |
50 | /** |
51 | * DOM visitor that strips the double td for this test case: |
52 | * ``` |
53 | * |{{1x|{{!}} Foo}} |
54 | * ``` |
55 | * |
56 | * @see https://phabricator.wikimedia.org/T52603 |
57 | * @param Element $node |
58 | * @param Frame $frame |
59 | * @return bool|Node |
60 | */ |
public function stripDoubleTDs( Element $node, Frame $frame ) {
	$sibling = $node->nextSibling;

	// Nothing to do unless $node is an essentially empty, non-literal cell
	// that is directly followed by a non-literal <td> element.
	if ( WTUtils::isLiteralHTMLNode( $node ) ||
		!( $sibling instanceof Element ) ||
		DOMCompat::nodeName( $sibling ) !== 'td' ||
		WTUtils::isLiteralHTMLNode( $sibling ) ||
		!DOMUtils::nodeEssentiallyEmpty( $node )
	) {
		return true;
	}

	// The following <td> has to be template-generated.
	// FIXME: the typeof will not be set for nested templates, hence the
	// hacky fallback that looks at the source recorded on the cell.
	if ( !DOMUtils::hasTypeOf( $sibling, 'mw:Transclusion' ) &&
		!preg_match( '/^{{.*?}}$/D', DOMDataUtils::getDataParsoid( $sibling )->src ?? '' )
	) {
		return true;
	}

	// $node is about to be deleted in favour of $sibling, so the
	// surviving cell's DSR must start where $node started.
	$emptyCellDSR = DOMDataUtils::getDataParsoid( $node )->dsr ?? null;
	$siblingDp = DOMDataUtils::getDataParsoid( $sibling );
	if ( $emptyCellDSR && !empty( $siblingDp->dsr ) ) {
		$siblingDp->dsr->start = $emptyCellDSR->start;
	}

	// Prepend the wikitext of the dropped cell to the surviving cell's
	// data-mw parts.
	$siblingMw = DOMDataUtils::getDataMw( $sibling );
	$droppedSrc = WTUtils::getWTSource( $frame, $node );
	if ( !isset( $siblingMw->parts ) ) {
		$siblingMw->parts = [];
	}
	array_unshift( $siblingMw->parts, $droppedSrc );

	// Delete the duplicated <td> node.
	$node->parentNode->removeChild( $node );

	// $node no longer exists in the DOM, so don't continue processing
	// on it; hand back the surviving cell instead.
	return $sibling;
}
100 | |
101 | /** |
102 | * @param Node $node |
103 | * @return bool |
104 | */ |
private function isSimpleTemplatedSpan( Node $node ): bool {
	// "Simple templated span": a <span> carrying the mw:Transclusion
	// typeof whose children are all text or comment nodes.
	if ( DOMCompat::nodeName( $node ) !== 'span' ) {
		return false;
	}

	return DOMUtils::hasTypeOf( $node, 'mw:Transclusion' ) &&
		DOMUtils::allChildrenAreTextOrComments( $node );
}
110 | |
111 | /** |
112 | * @param array &$parts |
113 | * @param Frame $frame |
114 | * @param int $offset1 |
115 | * @param int $offset2 |
116 | */ |
private function fillDSRGap( array &$parts, Frame $frame, int $offset1, int $offset2 ): void {
	// Append the frame source found in [$offset1, $offset2) to $parts.
	// An empty or inverted range contributes nothing.
	$gapWidth = $offset2 - $offset1;
	if ( $gapWidth <= 0 ) {
		return;
	}

	$parts[] = PHPUtils::safeSubstr( $frame->getSrcText(), $offset1, $gapWidth );
}
122 | |
123 | /** |
124 | * Hoist transclusion information from cell content / attributes |
125 | * onto the cell itself. |
126 | * |
127 | * @param Frame $frame |
128 | * @param Element[] $transclusions |
129 | * @param Element $td |
130 | */ |
private function hoistTransclusionInfo(
	Frame $frame, array $transclusions, Element $td
): void {
	// Cells created by `handleTableCellTemplates` carry no dsr info yet.
	// In that case, seed the cell's dsr from the first transclusion.
	$tdDp = DOMDataUtils::getDataParsoid( $td );
	if ( !Utils::isValidDSR( $tdDp->dsr ?? null ) ) {
		$firstDp = DOMDataUtils::getDataParsoid( $transclusions[0] );
		Assert::invariant( Utils::isValidDSR( $firstDp->dsr ?? null ), 'Expected valid DSR' );
		$tdDp->dsr = clone $firstDp->dsr;
	}

	// Accumulators for the combined transclusion info that ends up on $td.
	// Note that content for all but the last template has been swallowed
	// into the attributes of $td.
	$mergedParts = [];
	$mergedPi = [];
	$tplCounter = 0;
	$lastTpl = null;
	$prevDp = null;

	foreach ( $transclusions as $tpl ) {
		$tplDp = DOMDataUtils::getDataParsoid( $tpl );
		Assert::invariant( Utils::isValidDSR( $tplDp->dsr ?? null ), 'Expected valid DSR' );

		// Plug the DSR gap before this transclusion: measured from the
		// start of the cell for the first one, and from the end of the
		// previous transclusion otherwise.
		$gapStart = $prevDp ? $prevDp->dsr->end : $tdDp->dsr->start;
		$this->fillDSRGap( $mergedParts, $frame, $gapStart, $tplDp->dsr->start );

		// Assimilate $tpl's data-mw parts and data-parsoid pi info
		$tplMw = DOMDataUtils::getDataMw( $tpl );
		foreach ( $tplMw->parts ?? [] as $part ) {
			// The template index is relative to other transclusions.
			// It is used to extract whitespace information from
			// data-parsoid, and that array only includes info for
			// templates. So string parts pass through unnumbered.
			if ( is_string( $part ) ) {
				$mergedParts[] = $part;
				continue;
			}

			// Cloning is strictly not needed here, but mimicking
			// code in WrapSectionsState.php
			$partCopy = clone $part;
			if ( isset( $partCopy->template ) ) {
				$partCopy->template->i = $tplCounter++;
			} else {
				$partCopy->templatearg->i = $tplCounter++;
			}
			$mergedParts[] = $partCopy;
		}
		PHPUtils::pushArray( $mergedPi, $tplDp->pi ?? [ [] ] );
		DOMDataUtils::setDataMw( $tpl, null );

		$lastTpl = $tpl;
		$prevDp = $tplDp;
	}

	$aboutId = $lastTpl->getAttribute( 'about' ) ?? '';

	// Hoist transclusion information to $td.
	$td->setAttribute( 'typeof', 'mw:Transclusion' );
	$td->setAttribute( 'about', $aboutId );

	// Add wikitext for the table cell content following the last template
	$this->fillDSRGap( $mergedParts, $frame, $prevDp->dsr->end, $tdDp->dsr->end );

	// Save the new data-mw on the td
	DOMDataUtils::setDataMw( $td, (object)[ 'parts' => $mergedParts ] );
	$tdDp->pi = $mergedPi;

	// td wraps everything now.
	// Remove template encapsulation from here on.
	// This simplifies the problem of analyzing the <td>
	// for additional fixups (|| Boo || Baz) by potentially
	// invoking 'reparseTemplatedAttributes' on split cells
	// with some modifications.
	$cursor = $lastTpl;

	// Transclusions may be nested in elements in some ugly wikitext, so
	// climb up until we are at a direct child of the td.
	while ( $cursor->parentNode !== $td ) {
		$cursor = $cursor->parentNode;
	}

	while ( $cursor ) {
		$isWrapperSpan = DOMCompat::nodeName( $cursor ) === 'span' &&
			$cursor->getAttribute( 'about' ) === $aboutId;
		if ( !$isWrapperSpan ) {
			$cursor = $cursor->nextSibling;
			continue;
		}

		// Remove the encapsulation attributes.
		$cursor->removeAttribute( 'about' );
		$cursor->removeAttribute( 'typeof' );
		if ( !DOMDataUtils::noAttrs( $cursor ) ) {
			$cursor = $cursor->nextSibling;
			continue;
		}

		// No attributes are left: the span wrapper is useless.
		// Splice its children into the td in its place and resume with
		// the first of them (or with the next sibling if it was empty).
		$resumeAt = $cursor->firstChild ?: $cursor->nextSibling;
		DOMUtils::migrateChildren( $cursor, $td, $cursor );
		$cursor->parentNode->removeChild( $cursor );
		$cursor = $resumeAt;
	}
}
235 | |
236 | /** |
237 | * Collect potential attribute content. |
238 | * |
239 | * We expect this to be text nodes without a pipe character followed by one or |
240 | * more nowiki spans, followed by a template encapsulation with pure-text and |
241 | * nowiki content. Collection stops when encountering a pipe character. |
242 | * |
243 | * @param Env $env |
244 | * @param Element $cell known to be <td> / <th> |
245 | * @param ?Element $templateWrapper |
246 | * @return ?array |
247 | */ |
public function collectAttributishContent(
	Env $env, Element $cell, ?Element $templateWrapper
): ?array {
	$chunks = [];
	$nowikiContents = [];
	$seenTransclusions = $templateWrapper ? [ $templateWrapper ] : [];

	// Some of this logic could be replaced by DSR-based recovery of
	// wikitext that is outside templates. But since we have to walk over
	// templated content in this fashion anyway, we might as well use the
	// same logic uniformly.

	// Depth-first walk over a sibling chain. Returns true as soon as
	// accumulation is complete, false if the chain was exhausted first.
	$walk = static function ( ?Node $child ) use (
		&$walk, &$chunks, &$nowikiContents, &$seenTransclusions
	): bool {
		while ( $child ) {
			if ( $child instanceof Comment ) {
				// Legacy parser strips comments during parsing => drop them.
				$child = $child->nextSibling;
				continue;
			}

			if ( $child instanceof Text ) {
				$text = $child->nodeValue;
				$chunks[] = $text;

				// A lone pipe (not part of "||") ends accumulation.
				if ( preg_match( '/(?:^|[^|])\|(?:[^|]|$)/D', $text ) ) {
					return true;
				}
				$child = $child->nextSibling;
				continue;
			}

			'@phan-var Element $child'; /** @var Element $child */
			if ( DOMUtils::hasTypeOf( $child, 'mw:Transclusion' ) ) {
				$seenTransclusions[] = $child;
			}

			if ( DOMUtils::matchTypeOf( $child, "#mw:Extension/#" ) ) {
				// "|" chars in extension content don't trigger table-cell parsing
				// since they have higher precedence in tokenization. The extension
				// content will simply be dropped (but any side effects it had will
				// continue to apply. Ex: <ref> tags might leave an orphaned ref in
				// the <references> section).
				$child = WTUtils::skipOverEncapsulatedContent( $child );
				continue;
			}

			if ( DOMUtils::hasTypeOf( $child, 'mw:Entity' ) ) {
				// Use the entity's wikitext source, not its rendered content:
				// e.g. "&#10;" renders as "\n", which breaks attribute parsing!
				$chunks[] = DOMDataUtils::getDataParsoid( $child )->src ?? $child->textContent;
			} elseif ( DOMUtils::hasTypeOf( $child, 'mw:Nowiki' ) ) {
				// Nowiki spans were added to protect otherwise
				// meaningful wikitext chars used in attributes.
				// Save the content and add in a marker to splice out later.
				$nowikiContents[] = $child->textContent;
				$chunks[] = '<nowiki-marker>';
			} elseif ( DOMUtils::hasRel( $child, 'mw:WikiLink' ) ||
				WTUtils::isGeneratedFigure( $child )
			) {
				// Wikilinks/images abort attribute parsing
				return true;
			} elseif ( $walk( $child->firstChild ) ) {
				// Accumulation finished somewhere inside this subtree.
				return true;
			}

			$child = $child->nextSibling;
		}

		return false;
	};

	if ( !$walk( $cell->firstChild ) ) {
		return null;
	}

	return [
		'txt' => implode( '', $chunks ),
		'nowikis' => $nowikiContents,
		'transclusions' => $seenTransclusions,
	];
}
326 | |
327 | /** |
328 | * T46498, second part of T52603 |
329 | * |
330 | * Handle wikitext like |
331 | * ``` |
332 | * {| |
333 | * |{{nom|Bar}} |
334 | * |} |
335 | * ``` |
336 | * where nom expands to `style="foo" class="bar"|Bar`. The attributes are |
337 | * tokenized and stripped from the table contents. |
338 | * |
339 | * This method works well for the templates documented in |
340 | * https://en.wikipedia.org/wiki/Template:Table_cell_templates/doc |
341 | * |
342 | * Nevertheless, there are some limitations: |
343 | * - We assume that attributes don't contain wiki markup (apart from <nowiki>) |
344 | * and end up in text or nowiki nodes. |
345 | * - Only a single table cell is produced / opened by the template that |
346 | * contains the attributes. This limitation could be lifted with more |
347 | * aggressive re-parsing if really needed in practice. |
348 | * - There is only a single transclusion in the table cell content. This |
349 | * limitation can be lifted with more advanced data-mw construction. |
350 | * |
351 | * @param Frame $frame |
352 | * @param Element $cell known to be <td> / <th> |
353 | * @param ?Element $templateWrapper |
354 | */ |
public function reparseTemplatedAttributes(
	Frame $frame, Element $cell, ?Element $templateWrapper
): void {
	$env = $frame->getEnv();

	// Collect attribute content and examine it
	$collected = $this->collectAttributishContent( $env, $cell, $templateWrapper );
	if ( !$collected ) {
		return;
	}

	// FIXME: There is deliberately no "were templates involved at all?"
	// check here. Such a check (no collected transclusions and $cell not
	// from encapsulated content => return) is insufficient: previous
	// rounds of table fixups might have created this cell without any
	// templated content (the while loop in handleTableCellTemplates).
	// Till there is a reliable test for this, attributes are always
	// reparsed.

	// The attributish prefix is everything up to and including the first
	// pipe, provided that pipe is not immediately followed by another one.
	if ( !preg_match( '/(^[^|]+\|)([^|]|$)/D', $collected['txt'], $prefixMatch ) ) {
		return;
	}
	$attrPrefix = $prefixMatch[1];

	// Splice in nowiki content. Markers were added in place of the nowiki
	// content to prevent the regexps above from matching on
	// nowiki-protected chars.
	if ( str_contains( $attrPrefix, '<nowiki-marker>' ) ) {
		$pendingNowikis = $collected['nowikis'];
		$attrPrefix = preg_replace_callback(
			'/<nowiki-marker>/',
			static function ( $unused ) use ( &$pendingNowikis ) {
				// This is a little tricky. We want to use the content from the
				// nowikis to reparse the string to key/val pairs, but the
				// tokenizer rule will invariably get tripped up on newlines
				// which, to this point, were shuttled through in the nowiki.
				// Core sanitizer will do this replacement in attr vals, so
				// it's a safe normalization to do here.
				return preg_replace( '/\s+/', ' ', array_shift( $pendingNowikis ) );
			},
			$attrPrefix
		);
	}

	// Re-parse the attributish prefix
	$attributeTokens = $this->tokenizer->tokenizeTableCellAttributes( $attrPrefix, false );

	// No attributes => nothing more to do!
	if ( !$attributeTokens ) {
		return;
	}

	// The tokenizer result is an array consisting of
	// [table_attributes, spaces, pipe]; only the attributes are needed.
	$attrs = $attributeTokens[0];

	// Sanitize attrs and transfer them to the td node
	Sanitizer::applySanitizedArgs( $env->getSiteConfig(), $cell, $attrs );

	// If the transclusion node was embedded within the td node,
	// lift up the about group to the td node.
	$transclusions = $collected['transclusions'];
	if ( $transclusions && ( $cell !== $transclusions[0] || count( $transclusions ) > 1 ) ) {
		$this->hoistTransclusionInfo( $frame, $transclusions, $cell );
	}

	// Drop content that has been consumed by the reparsed attribute content.
	// NOTE: We serialize and reparse data-object-id attributes as well which
	// ensures stashed data-* attributes continue to be usable.
	// FIXME: This is too naive. What about all the care we showed in `collectAttributishContent`?
	$cellHtml = DOMCompat::getInnerHTML( $cell );
	DOMCompat::setInnerHTML( $cell, preg_replace( '/^[^|]*\|/', '', $cellHtml ) );
}
434 | |
435 | /** |
436 | * @param Frame $frame |
437 | * @param Element $cell |
438 | * @return bool |
439 | */ |
private function combineWithPreviousCell( Frame $frame, Element $cell ): bool {
	// Tries to merge $cell with its previous sibling cell: the wikitext
	// content of the previous cell is reparsed as the attributes of $cell,
	// and the previous cell is then removed.
	// Returns true if the merge happened, false if it was not attempted
	// or the reparse failed.

	// UNSUPPORTED SCENARIO 1:
	// While in the general case, we should look for combinability no matter
	// whether $cell has attributes or not, we are currently restricting
	// our support to use cases where $cell doesn't have attributes since that
	// is the common scenario and use case for this kind of markup.
	//
	// Ex: |class="foo"{{1x|1={{!}}title="x"{{!}}foo}}
	//     should parse as <td class="foo">title="x"|foo</td>
	$cellDp = DOMDataUtils::getDataParsoid( $cell );
	if ( !$cellDp->getTempFlag( TempData::NO_ATTRS ) ) {
		return false;
	}

	$prev = $cell->previousSibling;
	DOMUtils::assertElt( $prev );

	// UNSUPPORTED SCENARIO 2:
	// If the previous cell had attributes, the attributes/content of $cell
	// would end up as the content of the combined cell.
	//
	// Ex: |class="foo"|bar{{1x|1={{!}}foo}}
	//     should parse as <td class="foo">bar|foo</td>
	//
	// UNSUPPORTED SCENARIO 3:
	// The template produced attributes as well as maybe a new cell.
	// Ex: |class="foo"{{1x| foo}} and |class="foo"{{1x|&nbsp;foo}}
	// We let the more general 'reparseTemplatedAttributes' code handle
	// this scenario for now.
	$prevDp = DOMDataUtils::getDataParsoid( $prev );
	if ( !$prevDp->getTempFlag( TempData::NO_ATTRS ) ) {
		return false;
	}

	// The previous cell's source is recovered via its DSR below. Cells
	// split off by `handleTableCellTemplates` are created with the
	// NO_ATTRS flag but without dsr info, so the DSR cannot be assumed to
	// be present. Without usable offsets there is nothing to combine with;
	// returning false lets the caller retry $cell with the other reparses.
	$prevDSR = $prevDp->dsr ?? null;
	if ( !Utils::isValidDSR( $prevDSR ) || !isset( $prevDSR->openWidth ) ) {
		return false;
	}

	// Build the attribute string
	$prevCellSrc = PHPUtils::safeSubstr(
		$frame->getSrcText(), $prevDSR->start, $prevDSR->length() );
	$cellAttrSrc = substr( $prevCellSrc, $prevDSR->openWidth );
	$reparseSrc = $cellAttrSrc . "|"; // "|" or "!", but doesn't matter since we discard that anyway

	// Reparse the attributish prefix
	$attributeTokens = $this->tokenizer->tokenizeTableCellAttributes( $reparseSrc, false );
	if ( !is_array( $attributeTokens ) ) {
		$frame->getEnv()->log( "error/wt2html",
			"TableFixups: Failed to successfully reparse $reparseSrc as table cell attributes" );
		return false;
	}

	// The tokenizer result is an array consisting of
	// [table_attributes, spaces, pipe]; only the attributes are needed.
	$attrs = $attributeTokens[0];

	Sanitizer::applySanitizedArgs( $frame->getEnv()->getSiteConfig(), $cell, $attrs );

	// Update data-mw, DSR: the merged cell now also covers the source of
	// the previous cell.
	$dataMW = DOMDataUtils::getDataMw( $cell );
	array_unshift( $dataMW->parts, $prevCellSrc );
	$cellDSR = $cellDp->dsr ?? null;
	if ( $cellDSR && $cellDSR->start ) {
		$cellDSR->start -= strlen( $prevCellSrc );
	}

	$parent = $cell->parentNode;
	// If $prev is not a <td> (has to be a <th>), the merged cell has
	// to be a <th> as well. Since $cell is filtered to be a <td> in
	// getReparseType(..), we create a $newCell as a <th>, transfer
	// $cell's attributes over, and remove $cell.
	//
	// This is an edge case.
	//
	// Doing it this way is simpler than trying to fix the logic above
	// since we'll have to deal with $cell's existing attrs & content.
	if ( DOMCompat::nodeName( $prev ) === 'th' ) {
		$newCell = $cell->ownerDocument->createElement( 'th' );
		foreach ( DOMUtils::attributes( $cell ) as $k => $v ) {
			$newCell->setAttribute( $k, $v );
		}
		DOMUtils::migrateChildren( $cell, $newCell );
		$parent->insertBefore( $newCell, $cell );
		$parent->removeChild( $cell );
	}

	$parent->removeChild( $prev );

	return true;
}
526 | |
527 | private const NO_REPARSING = 0; |
528 | private const COMBINE_WITH_PREV_CELL = 1; |
529 | private const OTHER_REPARSE = 2; |
530 | |
531 | /** |
532 | * @param Element $cell $cell is known to be <td>/<th> |
533 | * @return int |
534 | */ |
private function getReparseType( Element $cell ): int {
	// Classifies $cell as needing no reparsing, combination with the
	// previous cell, or one of the other reparses.
	$isTd = DOMCompat::nodeName( $cell ) === 'td';
	$dp = DOMDataUtils::getDataParsoid( $cell );

	// Parsoid parses content of templates independent of top-level content.
	// But, this breaks legacy-parser-supported use-cases where template
	// content combines with top-level content to yield a table cell whose
	// source straddles the template boundary.
	//
	// In Parsoid, we handle this by looking for opportunities where
	// table cells could combine. This obviously requires $cell to be
	// a templated cell.
	$mayCombine =
		// only | can separate attributes & content => $cell has to be <td>
		$isTd &&
		WTUtils::isFirstEncapsulationWrapperNode( $cell ) &&
		// an earlier combine attempt on this cell must not have failed
		!$dp->getTempFlag( TempData::FAILED_REPARSE ) &&
		// has to be first cell of the row
		!isset( $dp->stx );

	if ( $mayCombine ) {
		// We don't support combining templated cells with other templated
		// cells. So, the previous sibling cannot be templated.
		$prev = $cell->previousSibling;
		if ( $prev instanceof Element &&
			!WTUtils::hasLiteralHTMLMarker( DOMDataUtils::getDataParsoid( $prev ) ) &&
			!DOMUtils::hasTypeOf( $prev, 'mw:Transclusion' ) &&
			!str_contains( DOMCompat::getInnerHTML( $prev ), "\n" )
		) {
			return self::COMBINE_WITH_PREV_CELL;
		}
	}

	// Look for cell-separator characters in the direct children of $cell.
	// "!" only counts for cells that are not <td>.
	$separatorRE = $isTd ? '/[|]/' : '/[!|]/';
	$child = $cell->firstChild;
	while ( $child ) {
		if ( $child instanceof Text && preg_match( $separatorRE, $child->textContent ) ) {
			return self::OTHER_REPARSE;
		}

		if ( DOMUtils::matchTypeOf( $child, "#mw:Extension/#" ) ) {
			// "|" chars in extension content don't trigger table-cell parsing
			// since they have higher precedence in tokenization
			$child = WTUtils::skipOverEncapsulatedContent( $child );
			continue;
		}

		if ( $child instanceof Element ) {
			if ( DOMUtils::hasRel( $child, 'mw:WikiLink' ) ||
				WTUtils::isGeneratedFigure( $child )
			) {
				// Wikilinks/images abort attribute parsing
				return self::NO_REPARSING;
			}
			if ( preg_match( $separatorRE, DOMCompat::getOuterHTML( $child ) ) ) {
				// A "|" char in the HTML will trigger table cell tokenization.
				// Ex: "| foobar <div> x | y </div>" will split the <div>
				// in table-cell tokenization context.
				return self::OTHER_REPARSE;
			}
		}

		$child = $child->nextSibling;
	}

	return self::NO_REPARSING;
}
595 | |
596 | /** |
597 | * @param Element $cell $cell is known to be <td>/<th> |
598 | * @param Frame $frame |
599 | * @return mixed |
600 | */ |
public function handleTableCellTemplates(
	Element $cell, Frame $frame
) {
	// Literal HTML cells are never fixed up.
	if ( WTUtils::isLiteralHTMLNode( $cell ) ) {
		return true;
	}

	$reparseType = $this->getReparseType( $cell );
	if ( $reparseType === self::NO_REPARSING ) {
		return true;
	}

	if ( $reparseType === self::COMBINE_WITH_PREV_CELL ) {
		if ( !$this->combineWithPreviousCell( $frame, $cell ) ) {
			// Record the failure and retry $cell for other reparses.
			// The DOMTraverser will resume the handler on the
			// returned $cell.
			DOMDataUtils::getDataParsoid( $cell )->setTempFlag( TempData::FAILED_REPARSE );
			return $cell;
		}
		return true;
	}

	// If the cell didn't have attrs, extract and reparse templated attrs
	$dp = DOMDataUtils::getDataParsoid( $cell );
	if ( $dp->getTempFlag( TempData::NO_ATTRS ) ) {
		$templateWrapper = DOMUtils::hasTypeOf( $cell, 'mw:Transclusion' ) ? $cell : null;
		$this->reparseTemplatedAttributes( $frame, $cell, $templateWrapper );
	}

	// Now, examine the <td> to see if it hides additional <td>s
	// and split it up if required.
	//
	// DOMTraverser will process the new cell and invoke
	// handleTableCellTemplates on it which ensures that
	// if any additional attribute fixup or splits are required,
	// they will get done.
	$cellName = DOMCompat::nodeName( $cell );
	$isTd = $cellName === 'td';
	$ownerDoc = $cell->ownerDocument;
	$newCell = null;

	for ( $child = $cell->firstChild; $child; $child = $next ) {
		// Grab the sibling first: $child may get moved to $newCell below.
		$next = $child->nextSibling;

		// Once a split has happened, all remaining children belong
		// to the new cell.
		if ( $newCell ) {
			$newCell->appendChild( $child );
			continue;
		}

		// Only text nodes and simple templated spans are examined.
		// FIXME: This skips over scenarios like <div>foo||bar</div>.
		$isText = $child instanceof Text;
		if ( !$isText && !$this->isSimpleTemplatedSpan( $child ) ) {
			continue;
		}

		$text = $child->textContent;
		preg_match( '/^(.*?[^|])?\|\|([^|].*)?$/D', $text, $pipeSplit );
		if ( $isTd ) {
			$split = $pipeSplit;
		} else { /* cellName === 'th' */
			// Find the first match of || or !!
			preg_match( '/^(.*?[^!])?\!\!([^!].*)?$/D', $text, $bangSplit );
			if ( $pipeSplit && $bangSplit ) {
				// Both separators occur: the one with the shorter
				// leading text wins ("!!" on ties).
				$pipeLead = strlen( $pipeSplit[1] ?? '' );
				$bangLead = strlen( $bangSplit[1] ?? '' );
				$split = $pipeLead < $bangLead ? $pipeSplit : $bangSplit;
			} else {
				$split = $pipeSplit ?: $bangSplit;
			}
		}

		if ( !$split ) {
			continue;
		}

		// Text before the separator stays in $child; text after it
		// becomes the initial content of the new cell.
		$child->textContent = $split[1] ?? '';
		$newCell = $ownerDoc->createElement( $cellName );

		if ( !$isText ) {
			/**
			 * Not a text node, so the check above ensures $child is a span.
			 *
			 * @var Element $child
			 */
			'@phan-var Element $child';
			// Fix up transclusion wrapping. The about id has to be read
			// before hoisting, since hoisting strips it from the span.
			$about = $child->getAttribute( 'about' ) ?? '';
			$this->hoistTransclusionInfo( $frame, [ $child ], $cell );
		} else {
			// Refetch the about attribute since 'reparseTemplatedAttributes'
			// might have added one to it.
			$about = $cell->getAttribute( 'about' ) ?? '';
		}

		// about may not be present if the cell was inside
		// wrapped template content rather than being part
		// of the outermost wrapper.
		if ( $about ) {
			$newCell->setAttribute( 'about', $about );
		}
		$newCell->appendChild( $ownerDoc->createTextNode( $split[2] ?? '' ) );
		$cell->parentNode->insertBefore( $newCell, $cell->nextSibling );

		// Set data-parsoid noAttrs flag
		$newCellDP = DOMDataUtils::getDataParsoid( $newCell );
		$newCellDP->setTempFlag( TempData::NO_ATTRS );
	}

	return true;
}
709 | } |