Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 274 |
|
0.00% |
0 / 8 |
CRAP | |
0.00% |
0 / 1 |
TableFixups | |
0.00% |
0 / 274 |
|
0.00% |
0 / 8 |
10302 | |
0.00% |
0 / 1 |
isSimpleTemplatedSpan | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
12 | |||
fillDSRGap | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
hoistTransclusionInfo | |
0.00% |
0 / 56 |
|
0.00% |
0 / 1 |
182 | |||
collectAttributishContent | |
0.00% |
0 / 38 |
|
0.00% |
0 / 1 |
210 | |||
reparseTemplatedAttributes | |
0.00% |
0 / 32 |
|
0.00% |
0 / 1 |
90 | |||
combineAttrsWithPreviousCell | |
0.00% |
0 / 45 |
|
0.00% |
0 / 1 |
110 | |||
getReparseType | |
0.00% |
0 / 35 |
|
0.00% |
0 / 1 |
600 | |||
handleTableCellTemplates | |
0.00% |
0 / 63 |
|
0.00% |
0 / 1 |
702 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html\DOM\Handlers; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Parsoid\Config\Env; |
8 | use Wikimedia\Parsoid\Core\Sanitizer; |
9 | use Wikimedia\Parsoid\DOM\Comment; |
10 | use Wikimedia\Parsoid\DOM\Element; |
11 | use Wikimedia\Parsoid\DOM\Node; |
12 | use Wikimedia\Parsoid\DOM\Text; |
13 | use Wikimedia\Parsoid\NodeData\DataMw; |
14 | use Wikimedia\Parsoid\NodeData\TempData; |
15 | use Wikimedia\Parsoid\NodeData\TemplateInfo; |
16 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
17 | use Wikimedia\Parsoid\Utils\DOMCompat; |
18 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
19 | use Wikimedia\Parsoid\Utils\DOMUtils; |
20 | use Wikimedia\Parsoid\Utils\DTState; |
21 | use Wikimedia\Parsoid\Utils\PHPUtils; |
22 | use Wikimedia\Parsoid\Utils\Utils; |
23 | use Wikimedia\Parsoid\Utils\WTUtils; |
24 | use Wikimedia\Parsoid\Wt2Html\Frame; |
25 | use Wikimedia\Parsoid\Wt2Html\PegTokenizer; |
26 | |
27 | /** |
28 | * Provides DOMTraverser visitors that fix template-induced interrupted table cell parsing |
29 | * by recombining table cells and/or reparsing table cell content as attributes. |
30 | * - handleTableCellTemplates |
31 | */ |
32 | class TableFixups { |
33 | |
/**
 * Check whether $node is a <span> carrying the mw:Transclusion type
 * whose children are all text or comment nodes.
 *
 * @param Node $node
 * @return bool
 */
private static function isSimpleTemplatedSpan( Node $node ): bool {
    // Cheapest test first: anything that is not a <span> is rejected outright.
    if ( DOMCompat::nodeName( $node ) !== 'span' ) {
        return false;
    }

    return DOMUtils::hasTypeOf( $node, 'mw:Transclusion' ) &&
        DOMUtils::allChildrenAreTextOrComments( $node );
}
39 | |
/**
 * Append the slice of the frame's source text that starts at $offset1 and
 * spans ( $offset2 - $offset1 ) to $parts. Nothing is appended unless
 * $offset1 is strictly smaller than $offset2.
 *
 * @param list<string|TemplateInfo> &$parts
 * @param Frame $frame
 * @param int $offset1
 * @param int $offset2
 */
private static function fillDSRGap( array &$parts, Frame $frame, int $offset1, int $offset2 ): void {
    if ( $offset1 >= $offset2 ) {
        // Empty (or inverted) gap: there is no source to add.
        return;
    }

    $gapWidth = $offset2 - $offset1;
    $parts[] = PHPUtils::safeSubstr( $frame->getSrcText(), $offset1, $gapWidth );
}
51 | |
/**
 * Hoist transclusion information from cell content / attributes
 * onto the cell itself.
 *
 * The data-mw parts and data-parsoid `pi` entries of every node in
 * $transclusions are merged (with source-text gaps between them filled
 * in from the frame) and stored on $td. $td is then marked as the
 * mw:Transclusion wrapper using the about id of the last transclusion,
 * and span wrappers sharing that about id lose their encapsulation
 * attributes.
 *
 * @param DTState $dtState
 * @param array $transclusions Transclusion nodes; the code dereferences the
 *   first and last entries, so the list must be non-empty and every entry
 *   must carry a valid DSR (asserted below).
 * @param Element $td
 */
private static function hoistTransclusionInfo(
    DTState $dtState, array $transclusions, Element $td
): void {
    // In `handleTableCellTemplates`, we're creating a cell w/o dsr info,
    // so seed $td's dsr from the first transclusion when it is missing.
    $tdDp = DOMDataUtils::getDataParsoid( $td );
    if ( !Utils::isValidDSR( $tdDp->dsr ?? null ) ) {
        $firstTplDp = DOMDataUtils::getDataParsoid( $transclusions[0] );
        Assert::invariant( Utils::isValidDSR( $firstTplDp->dsr ?? null ), 'Expected valid DSR' );
        $tdDp->dsr = clone $firstTplDp->dsr;
    }

    // Accumulate the combined transclusion info for $td.
    // Note that content for all but the last template has been swallowed
    // into the attributes of $td.
    $parts = [];
    $pi = [];
    $lastTpl = null;
    $prevDp = null;
    $frame = $dtState->options['frame'];
    $tplIndex = 0;

    foreach ( $transclusions as $tpl ) {
        $tplDp = DOMDataUtils::getDataParsoid( $tpl );
        Assert::invariant( Utils::isValidDSR( $tplDp->dsr ?? null ), 'Expected valid DSR' );

        // Plug the DSR gap in front of this transclusion: it starts at the
        // cell start for the first one, at the previous template's end otherwise.
        $gapStart = $prevDp ? $prevDp->dsr->end : $tdDp->dsr->start;
        self::fillDSRGap( $parts, $frame, $gapStart, $tplDp->dsr->start );

        // Assimilate $tpl's data-mw and data-parsoid pi info
        $tplDmw = DOMDataUtils::getDataMw( $tpl );
        foreach ( $tplDmw->parts ?? [] as $part ) {
            if ( is_string( $part ) ) {
                // Template index is relative to other transclusions.
                // This index is used to extract whitespace information from
                // data-parsoid and that array only includes info for templates.
                // So strings are passed through without consuming an index.
                $parts[] = $part;
                continue;
            }
            // Cloning is strictly not needed here, but mimicking
            // code in WrapSectionsState.php
            $renumbered = clone $part;
            $renumbered->i = $tplIndex++;
            $parts[] = $renumbered;
        }
        PHPUtils::pushArray( $pi, $tplDp->pi ?? [ [] ] );
        DOMDataUtils::setDataMw( $tpl, null );

        $lastTpl = $tpl;
        $prevDp = $tplDp;
    }

    $aboutId = DOMCompat::getAttribute( $lastTpl, 'about' );

    // Hoist transclusion information to $td.
    $td->setAttribute( 'typeof', 'mw:Transclusion' );
    $td->setAttribute( 'about', $aboutId );

    // Add wikitext for the table cell content following $lastTpl
    self::fillDSRGap( $parts, $frame, $prevDp->dsr->end, $tdDp->dsr->end );

    // Save the new data-mw on the td
    $combinedDmw = new DataMw( [] );
    $combinedDmw->parts = $parts;
    DOMDataUtils::setDataMw( $td, $combinedDmw );
    $tdDp->pi = $pi;

    // td wraps everything now.
    // Remove template encapsulation from here on.
    // This simplifies the problem of analyzing the <td>
    // for additional fixups (|| Boo || Baz) by potentially
    // invoking 'reparseTemplatedAttributes' on split cells
    // with some modifications.
    //
    // Transclusions may be nested in elements in some ugly wikitext so
    // climb up until we're at a direct child of td.
    $child = $lastTpl;
    while ( $child->parentNode !== $td ) {
        $child = $child->parentNode;
    }

    while ( $child ) {
        $isAboutSpan = DOMCompat::nodeName( $child ) === 'span' &&
            DOMCompat::getAttribute( $child, 'about' ) === $aboutId;
        if ( !$isAboutSpan ) {
            $child = $child->nextSibling;
            continue;
        }

        // Remove the encapsulation attributes. If there are no more attributes left,
        // the span wrapper is useless and can be removed.
        $child->removeAttribute( 'about' );
        $child->removeAttribute( 'typeof' );
        if ( !DOMDataUtils::noAttrs( $child ) ) {
            $child = $child->nextSibling;
            continue;
        }

        // Work out where to resume before the span is unwrapped: its first
        // child if it has one, its next sibling otherwise.
        $resumeAt = $child->firstChild ?: $child->nextSibling;
        DOMUtils::migrateChildren( $child, $td, $child );
        $child->parentNode->removeChild( $child );
        $child = $resumeAt;
    }

    // $dtState->tplInfo can be null when information is hoisted
    // from children to $td because DOMTraverser hasn't seen the
    // children yet!
    if ( !$dtState->tplInfo ) {
        $dtState->tplInfo = (object)[
            'first' => $td,
            'last' => $td,
            'clear' => false
        ];
    }
}
173 | |
/**
 * Collect potential attribute content.
 *
 * We expect this to be text nodes without a pipe character followed by one or
 * more nowiki spans, followed by a template encapsulation with pure-text and
 * nowiki content. Collection stops when a text node containing a single
 * (non-doubled) pipe character is found, or when a wikilink / generated
 * figure is encountered.
 *
 * @param Env $env Not used by this method.
 * @param Element $cell known to be <td> / <th>
 * @param ?Element $templateWrapper If given, seeds the list of transclusions
 * @return ?array Null if the walk ran off the end of the cell without hitting
 *   a stop condition; otherwise an array with keys 'txt' (the concatenated
 *   text), 'nowikis' (nowiki contents in document order) and 'transclusions'
 *   (transclusion elements seen).
 */
public static function collectAttributishContent(
    Env $env, Element $cell, ?Element $templateWrapper
): ?array {
    $buf = [];
    $nowikis = [];
    $transclusions = $templateWrapper ? [ $templateWrapper ] : [];

    // Some of this logic could be replaced by DSR-based recovery of
    // wikitext that is outside templates. But since we have to walk over
    // templated content in this fashion anyway, we might as well use the
    // same logic uniformly.

    // Walks $child and its following siblings (recursing into plain
    // elements). Returns true as soon as collection should stop.
    $traverse = static function ( ?Node $child ) use (
        &$traverse, &$buf, &$nowikis, &$transclusions
    ): bool {
        while ( $child ) {
            if ( $child instanceof Text ) {
                $text = $child->nodeValue;
                $buf[] = $text;

                // Are we done accumulating?
                if ( preg_match( '/(?:^|[^|])\|(?:[^|]|$)/D', $text ) ) {
                    return true;
                }
            } elseif ( !( $child instanceof Comment ) ) {
                // (Comments fall through untouched: the legacy parser strips
                // comments during parsing => drop them.)
                '@phan-var Element $child'; /** @var Element $child */
                if ( DOMUtils::hasTypeOf( $child, 'mw:Transclusion' ) ) {
                    $transclusions[] = $child;
                }

                if ( WTUtils::isFirstExtensionWrapperNode( $child ) ) {
                    // "|" chars in extension content don't trigger table-cell parsing
                    // since they have higher precedence in tokenization. The extension
                    // content will simply be dropped (but any side effects it had will
                    // continue to apply. Ex: <ref> tags might leave an orphaned ref in
                    // the <references> section).
                    $child = WTUtils::skipOverEncapsulatedContent( $child );
                    continue;
                }

                if ( DOMUtils::hasTypeOf( $child, 'mw:Entity' ) ) {
                    // Get entity's wikitext source, not rendered content.
                    // For example, "&#10;" is "\n" which breaks attribute parsing!
                    $buf[] = DOMDataUtils::getDataParsoid( $child )->src ?? $child->textContent;
                } elseif ( DOMUtils::hasTypeOf( $child, 'mw:Nowiki' ) ) {
                    // Nowiki span were added to protect otherwise
                    // meaningful wikitext chars used in attributes.
                    // Save the content and add in a marker to splice out later.
                    $nowikis[] = $child->textContent;
                    $buf[] = '<nowiki-marker>';
                } elseif (
                    DOMUtils::matchRel( $child, '#^mw:WikiLink(/Interwiki)?$#' ) ||
                    WTUtils::isGeneratedFigure( $child )
                ) {
                    // Wikilinks/images abort attribute parsing
                    return true;
                } elseif ( $traverse( $child->firstChild ) ) {
                    return true;
                }
            }

            $child = $child->nextSibling;
        }

        return false;
    };

    if ( !$traverse( $cell->firstChild ) ) {
        return null;
    }

    return [
        'txt' => implode( '', $buf ),
        'nowikis' => $nowikis,
        'transclusions' => $transclusions,
    ];
}
265 | |
/**
 * T46498, second part of T52603
 *
 * Handle wikitext like
 * ```
 * {|
 * |{{nom|Bar}}
 * |}
 * ```
 * where nom expands to `style="foo" class="bar"|Bar`. The attributes are
 * tokenized and stripped from the table contents.
 *
 * This method works well for the templates documented in
 * https://en.wikipedia.org/wiki/Template:Table_cell_templates/doc
 *
 * Nevertheless, there are some limitations:
 * - We assume that attributes don't contain wiki markup (apart from <nowiki>)
 *   and end up in text or nowiki nodes.
 * - Only a single table cell is produced / opened by the template that
 *   contains the attributes. This limitation could be lifted with more
 *   aggressive re-parsing if really needed in practice.
 * - There is only a single transclusion in the table cell content. This
 *   limitation can be lifted with more advanced data-mw construction.
 *
 * @param DTState $dtState
 * @param Element $cell known to be <td> / <th>
 * @param ?Element $templateWrapper Passed through to collectAttributishContent
 */
public static function reparseTemplatedAttributes(
    DTState $dtState, Element $cell, ?Element $templateWrapper
): void {
    $env = $dtState->env;

    // Collect attribute content and examine it
    $attributishContent = self::collectAttributishContent( $env, $cell, $templateWrapper );
    if ( !$attributishContent ) {
        return;
    }

    /**
     * FIXME: These checks are insufficient.
     * Previous rounds of table fixups might have created this cell without
     * any templated content (the while loop in handleTableCellTemplates).
     * Till we figure out a reliable test for this, we'll reparse attributes always.
     *
     * // This DOM pass is trying to bridge broken parses across
     * // template boundaries. So, if templates aren't involved,
     * // no reason to reparse.
     * if ( count( $attributishContent['transclusions'] ) === 0 &&
     *     !WTUtils::fromEncapsulatedContent( $cell )
     * ) {
     *     return;
     * }
     */

    // The prefix is everything up to and including the first single pipe.
    $attrText = $attributishContent['txt'];
    if ( !preg_match( '/(^[^|]+\|)([^|]|$)/D', $attrText, $matches ) ) {
        return;
    }
    $attributishPrefix = $matches[1];

    // Splice in nowiki content. We added in <nowiki> markers to prevent the
    // above regexps from matching on nowiki-protected chars.
    if ( str_contains( $attributishPrefix, '<nowiki-marker>' ) ) {
        $attributishPrefix = preg_replace_callback(
            '/<nowiki-marker>/',
            static function ( array $markerMatch ) use ( &$attributishContent ) {
                $nowiki = array_shift( $attributishContent['nowikis'] );
                if ( $nowiki === null ) {
                    // More markers than stashed nowikis: this marker was
                    // literal text in the cell rather than one we inserted.
                    // Leave it as-is instead of handing null to preg_replace,
                    // which is a TypeError under strict_types.
                    return $markerMatch[0];
                }
                // This is a little tricky. We want to use the content from the
                // nowikis to reparse the string to key/val pairs but the rule,
                // single_cell_table_args, will invariably get tripped up on
                // newlines which, to this point, were shuttled through in the
                // nowiki. Core sanitizer will do this replacement in attr vals
                // so it's a safe normalization to do here.
                return preg_replace( '/\s+/', ' ', $nowiki );
            },
            $attributishPrefix
        );
    }

    // re-parse the attributish prefix
    if ( !$dtState->tokenizer ) {
        $dtState->tokenizer = new PegTokenizer( $env );
    }
    $attributeTokens = $dtState->tokenizer->tokenizeTableCellAttributes( $attributishPrefix, false );

    // No attributes => nothing more to do!
    if ( !$attributeTokens ) {
        return;
    }

    // Note that `row_syntax_table_args` (the rule used for tokenizing above)
    // returns an array consisting of [table_attributes, spaces, pipe]
    $attrs = $attributeTokens[0];

    // Sanitize attrs and transfer them to the td node
    Sanitizer::applySanitizedArgs( $env->getSiteConfig(), $cell, $attrs );
    $cellDp = DOMDataUtils::getDataParsoid( $cell );
    // Reparsed cells start off as non-mergeable-table cells
    // and preserve that property after reparsing
    $cellDp->setTempFlag( TempData::NON_MERGEABLE_TABLE_CELL );
    $cellDp->setTempFlag( TempData::NO_ATTRS, false );

    // If the transclusion node was embedded within the td node,
    // lift up the about group to the td node.
    $transclusions = $attributishContent['transclusions'];
    if ( $transclusions && ( $cell !== $transclusions[0] || count( $transclusions ) > 1 ) ) {
        self::hoistTransclusionInfo( $dtState, $transclusions, $cell );
    }

    // Drop content that has been consumed by the reparsed attribute content.
    // NOTE: We serialize and reparse data-object-id attributes as well which
    // ensures stashed data-* attributes continue to be usable.
    // FIXME: This is too naive. What about all the care we showed in `collectAttributishContent`?
    DOMCompat::setInnerHTML( $cell,
        preg_replace( '/^[^|]*\|/', '', DOMCompat::getInnerHTML( $cell ) ) );
}
380 | |
/**
 * Try to merge $cell with its previous sibling cell by reparsing the
 * previous cell's source as the attributes of the combined cell.
 *
 * Possibilities:
 * - $cell and $prev are both <td>s (or both <th>s)
 *   - Common case
 *   - Ex: "|align=left {{tpl returning | foobar}}"
 *     So, "|align=left |foobar" is the combined string
 *   - Combined cell is a <td> (will be <th> if both were <th>s)
 *   - We assign new attributes to $cell and drop $prev
 * - $cell is <td> and $prev is <th>
 *   - Ex: "!align=left {{tpl returning | foobar}}"
 *     So, "!align=left |foobar" is the combined string
 *   - The combined cell will be a <th> with attributes "align=left"
 *     and content "foobar"
 * - $cell is <th> and $prev is <td>
 *   - Ex: "|align=left {{tpl returning !scope=row | foobar}}"
 *     So "|align=left !scope=row | foobar" is the combined string
 *     and we need to drop the th-attributes entirely after combining
 *   - The combined cell will be a <td>
 *   - $cell's attribute is dropped
 *   - $prev's content is dropped
 *
 * FIXME: There are a number of other merge possibilities that end up
 * in this function that aren't accounted for yet! Couple of them are
 * in the unsupported scenario 1/2 buckets below.
 *
 * @param DTState $dtState
 * @param Element $cell Its previous sibling is asserted to be an element.
 * @return bool True if the cells were combined, false if nothing was changed
 */
private static function combineAttrsWithPreviousCell( DTState $dtState, Element $cell ): bool {
    // UNSUPPORTED SCENARIO 1:
    // In this cell-combining scenario, $prev can have attributes only if it
    // also had content. See example below:
    // Ex: |class="foo"|bar{{1x|1={{!}}foo}}
    //     should parse as <td class="foo">bar|foo</td>
    // In this case, the attributes/content of $cell would end up as the
    // content of the combined cell.
    //
    // UNSUPPORTED SCENARIO 2:
    // The template produced attributes as well as maybe a new cell.
    // Ex: |class="foo"{{1x| foo}} and |class="foo"{{1x| foo}}
    // We let the more general 'reparseTemplatedAttributes' code handle
    // this scenario for now.
    $prev = $cell->previousSibling;
    DOMUtils::assertElt( $prev );
    $prevDp = DOMDataUtils::getDataParsoid( $prev );

    // If $prevDp has attributes already, we don't want to reparse content
    // as the attributes. However, we might be in unsupported scenario 1
    // above, but that's definitionally still unsupported so bail for now.
    if ( !$prevDp->getTempFlag( TempData::NO_ATTRS ) ) {
        return false;
    }

    // Build the attribute string: $prev's full source minus its opening markup.
    $prevCellSrc = PHPUtils::safeSubstr(
        $dtState->options['frame']->getSrcText(), $prevDp->dsr->start, $prevDp->dsr->length()
    );
    $reparseSrc = substr( $prevCellSrc, $prevDp->dsr->openWidth );

    // The previous cell had NO_ATTRS, from the check above, but the cell
    // ends in a vertical bar. This isn't a scenario where we'll combine
    // the cell content to form attributes, so there's no sense in trying
    // to tokenize them below; they probably already failed during the
    // original tokenizing. However, the trailing vertical probably does
    // want to be hoisted into the next cell, to combine to form row syntax.
    if ( str_ends_with( $reparseSrc, '|' ) ) {
        return false;
    }

    // "|" or "!", but doesn't matter since we discard that anyway
    $reparseSrc .= '|';

    // Reparse the attributish prefix
    $env = $dtState->env;
    if ( !$dtState->tokenizer ) {
        $dtState->tokenizer = new PegTokenizer( $env );
    }
    $attributeTokens = $dtState->tokenizer->tokenizeTableCellAttributes( $reparseSrc, false );
    if ( !is_array( $attributeTokens ) ) {
        $env->log( "error/wt2html",
            "TableFixups: Failed to successfully reparse $reparseSrc as table cell attributes" );
        return false;
    }

    // Update data-mw, DSR if $cell is an encapsulation wrapper:
    // $prev's source becomes the leading part, and the range grows to cover it.
    $cellDp = DOMDataUtils::getDataParsoid( $cell );
    if ( DOMUtils::hasTypeOf( $cell, 'mw:Transclusion' ) ) {
        $cellDmw = DOMDataUtils::getDataMw( $cell );
        array_unshift( $cellDmw->parts, $prevCellSrc );
        $cellDSR = $cellDp->dsr ?? null;
        if ( $cellDSR && $cellDSR->start ) {
            $cellDSR->start -= strlen( $prevCellSrc );
        }
    }

    $parent = $cell->parentNode;
    if ( DOMCompat::nodeName( $cell ) !== DOMCompat::nodeName( $prev ) ) {
        // Different cell types: $prev survives as the combined cell.
        $combinedCell = $prev;

        // Remove all content on $prev which will
        // become the new combined cell
        DOMCompat::replaceChildren( $prev );

        // Note that this implicitly migrates data-mw and data-parsoid
        foreach ( DOMUtils::attributes( $cell ) as $attrName => $attrValue ) {
            $combinedCell->setAttribute( $attrName, $attrValue );
        }
        DOMUtils::migrateChildren( $cell, $combinedCell );
        $parent->removeChild( $cell );

        $combinedCellDp = DOMDataUtils::getDataParsoid( $combinedCell );
    } else {
        // Matching cell types: $cell survives and $prev is dropped.
        $combinedCell = $cell;
        $combinedCellDp = $cellDp;
        $parent->removeChild( $prev );
    }

    // Note that `row_syntax_table_args` (the rule used for tokenizing above)
    // returns an array consisting of [table_attributes, spaces, pipe]
    Sanitizer::applySanitizedArgs( $env->getSiteConfig(), $combinedCell, $attributeTokens[0] );
    // Combined cells don't merge further
    $combinedCellDp->setTempFlag( TempData::NON_MERGEABLE_TABLE_CELL );
    $combinedCellDp->setTempFlag( TempData::NO_ATTRS, false );

    return true;
}
512 | |
// Possible results of getReparseType().
private const NO_REPARSING = 0;
private const COMBINE_WITH_PREV_CELL = 1;
private const OTHER_REPARSE = 2;

/**
 * Decide which kind of fixup, if any, $cell needs.
 *
 * @param Element $cell known to be <td>/<th>
 * @param DTState $dtState
 * @return int One of self::NO_REPARSING, self::COMBINE_WITH_PREV_CELL,
 *   self::OTHER_REPARSE
 */
private static function getReparseType( Element $cell, DTState $dtState ): int {
    $inTplContent = $dtState->tplInfo !== null;
    $dp = DOMDataUtils::getDataParsoid( $cell );

    // "inTplContent" is a good proxy for what we need: "Is $cell a template
    // wrapper?". That info won't be available for nested templates unless we
    // want to use a more expensive hacky check.
    // It is sufficient because we won't have mergeable cells for wikitext
    // that doesn't get any part of its content from templates because
    // NON_MERGEABLE_TABLE_CELL prevents such merges.
    $isMergeCandidate = !$dp->getTempFlag( TempData::NON_MERGEABLE_TABLE_CELL ) &&
        !$dp->getTempFlag( TempData::FAILED_REPARSE ) &&
        $inTplContent;

    if ( $isMergeCandidate ) {
        // Look for opportunities where table cells could combine. This requires
        // $cell to be a templated cell. But, we don't support combining
        // templated cells with other templated cells. So, previous sibling
        // cannot be templated.
        //
        // So, bail out of scenarios where prevDp comes from a template (the checks
        // for isValidDSR( $prevDp->dsr ) and valid opening tag width catch this).
        $prev = $cell->previousSibling;
        $prevDp = $prev instanceof Element ? DOMDataUtils::getDataParsoid( $prev ) : null;
        if ( $prevDp &&
            !WTUtils::hasLiteralHTMLMarker( $prevDp ) &&
            Utils::isValidDSR( $prevDp->dsr ?? null, true ) &&
            !DOMUtils::hasTypeOf( $prev, 'mw:Transclusion' ) &&
            !str_contains( DOMCompat::getInnerHTML( $prev ), "\n" )
        ) {
            return self::COMBINE_WITH_PREV_CELL;
        }
    }

    // In a <th>, "!" is a cell separator too; in a <td> only "|" is.
    $testRE = DOMCompat::nodeName( $cell ) === 'td' ? '/[|]/' : '/[!|]/';

    $child = $cell->firstChild;
    while ( $child ) {
        // From the first transclusion onwards, content counts as templated.
        if ( !$inTplContent && DOMUtils::hasTypeOf( $child, 'mw:Transclusion' ) ) {
            $inTplContent = true;
        }

        if ( $inTplContent &&
            $child instanceof Text &&
            preg_match( $testRE, $child->textContent )
        ) {
            return self::OTHER_REPARSE;
        }

        if ( !( $child instanceof Element ) ) {
            $child = $child->nextSibling;
            continue;
        }

        if ( WTUtils::isFirstExtensionWrapperNode( $child ) ) {
            // "|" chars in extension/language variant content don't trigger
            // table-cell parsing since they have higher precedence in tokenization
            $child = WTUtils::skipOverEncapsulatedContent( $child );
            continue;
        }

        if (
            DOMUtils::matchRel( $child, '#^mw:WikiLink(/Interwiki)?$#' ) ||
            WTUtils::isGeneratedFigure( $child )
        ) {
            // Wikilinks/images abort attribute parsing
            return self::NO_REPARSING;
        }

        // FIXME: Ugly for now
        $outerHTML = DOMCompat::getOuterHTML( $child );
        if ( preg_match( $testRE, $outerHTML ) &&
            ( $inTplContent || preg_match( '/"mw:Transclusion"/', $outerHTML ) )
        ) {
            // A "|" char in the HTML will trigger table cell tokenization.
            // Ex: "| foobar <div> x | y </div>" will split the <div>
            // in table-cell tokenization context.
            return self::OTHER_REPARSE;
        }

        $child = $child->nextSibling;
    }

    return self::NO_REPARSING;
}
599 | |
/**
 * In a wikitext-syntax-table-parsing context, the meaning of
 * "|", "||", "!", "!!" is context-sensitive. Additionally, the
 * complete syntactical construct for a table cell (including leading
 * pipes, attributes, and content-separating pipe char) might straddle
 * a template boundary - with some content coming from the top-level and
 * some from a template.
 *
 * This impacts parsing of tables when some cells are templated since
 * Parsoid parses template content independent of top-level content
 * (without any preceding context). This means that Parsoid's table-cell
 * parsing in templated contexts might be incorrect.
 *
 * To deal with this, Parsoid implements this table-fixups pass that
 * has to deal with cell-merging and cell-reparsing scenarios.
 *
 * HTML-syntax cells and non-templated cells without any templated content
 * are not subject to this transformation and can be skipped right away.
 *
 * FIXME: This pass can benefit from a customized processor rather than
 * piggyback on top of DOMTraverser since the DOM can be significantly
 * mutated in these handlers.
 *
 * @param Element $cell $cell is known to be <td>/<th>
 * @param DTState $dtState
 * @return bool|Element True in all cases except a failed attempt to combine
 *   $cell with its previous sibling; then $cell itself is returned so that
 *   the DOMTraverser resumes the handler on it.
 */
public static function handleTableCellTemplates( Element $cell, DTState $dtState ) {
    if ( WTUtils::isLiteralHTMLNode( $cell ) ) {
        return true;
    }

    // Nothing below renames $cell, so its node name can be looked up once.
    $cellName = DOMCompat::nodeName( $cell );

    // Deal with <th> special case where "!! foo" is parsed as <th>! foo</th>
    // but should have been parsed as <th>foo</th> when not the first child
    if ( $cellName === 'th' &&
        DOMUtils::hasTypeOf( $cell, 'mw:Transclusion' ) &&
        // This is checking that previous sibling is not "\n" which would
        // signal that this <th> is on a fresh line and the "!" shouldn't be stripped.
        // If this weren't template output, we would check for "stx" === 'row'.
        // FIXME: Note that this check is fragile and doesn't work always, but this is
        // the price we pay for Parsoid's independent template parsing!
        $cell->previousSibling instanceof Element
    ) {
        $fc = DiffDOMUtils::firstNonSepChild( $cell );
        if ( $fc instanceof Text ) {
            $leadingText = $fc->nodeValue;
            if ( str_starts_with( $leadingText, "!" ) ) {
                $fc->nodeValue = substr( $leadingText, 1 );
            }
        }
    }

    $reparseType = self::getReparseType( $cell, $dtState );
    if ( $reparseType === self::NO_REPARSING ) {
        return true;
    }

    $cellDp = DOMDataUtils::getDataParsoid( $cell );
    if ( $reparseType === self::COMBINE_WITH_PREV_CELL ) {
        if ( self::combineAttrsWithPreviousCell( $dtState, $cell ) ) {
            return true;
        }
        // Mark the failure and retry $cell for other reparses.
        // The DOMTraverser will resume the handler on the
        // returned $cell.
        $cellDp->setTempFlag( TempData::FAILED_REPARSE );
        return $cell;
    }

    // If the cell didn't have attrs, extract and reparse templated attrs
    if ( $cellDp->getTempFlag( TempData::NO_ATTRS ) ) {
        $templateWrapper = DOMUtils::hasTypeOf( $cell, 'mw:Transclusion' ) ? $cell : null;
        self::reparseTemplatedAttributes( $dtState, $cell, $templateWrapper );
    }

    // Now, examine the <td> to see if it hides additional <td>s
    // and split it up if required.
    //
    // DOMTraverser will process the new cell and invoke
    // handleTableCellTemplates on it which ensures that
    // if any additional attribute fixups or splits are required,
    // they will get done.
    $newCell = null;
    $isTd = $cellName === 'td';
    $ownerDoc = $cell->ownerDocument;
    $child = $cell->firstChild;
    while ( $child ) {
        // Grab the sibling first: $child may be moved into $newCell below.
        $next = $child->nextSibling;

        if ( $newCell ) {
            // Everything after the split point belongs to the new cell.
            $newCell->appendChild( $child );
        } elseif ( $child instanceof Text || self::isSimpleTemplatedSpan( $child ) ) {
            // FIXME: This skips over scenarios like <div>foo||bar</div>.
            $hasSpanWrapper = !( $child instanceof Text );
            $match = $match1 = $match2 = null;

            // Find the first match of ||
            preg_match( '/^((?:[^|]*(?:\|[^|])?)*)\|\|([^|].*)?$/D', $child->textContent, $match1 );
            if ( $isTd ) {
                $match = $match1;
            } else {
                // Find the first match !!
                preg_match( '/^((?:[^!]*(?:\![^!])?)*)\!\!([^!].*)?$/D', $child->textContent, $match2 );

                // Pick the shortest match
                if ( $match1 && $match2 ) {
                    $match = strlen( $match1[1] ?? '' ) < strlen( $match2[1] ?? '' )
                        ? $match1
                        : $match2;
                } else {
                    $match = $match1 ?: $match2;
                }
            }

            if ( $match ) {
                // Text before the separator stays in $cell ...
                $child->textContent = $match[1] ?? '';

                // ... and the text after it seeds a new cell of the same type.
                $newCell = $ownerDoc->createElement( $cellName );
                if ( $hasSpanWrapper ) {
                    /**
                     * $hasSpanWrapper above ensures $child is a span.
                     *
                     * @var Element $child
                     */
                    '@phan-var Element $child';
                    // Fix up transclusion wrapping.
                    // The about id is read first because hoisting strips
                    // it from the span.
                    $about = DOMCompat::getAttribute( $child, 'about' );
                    self::hoistTransclusionInfo( $dtState, [ $child ], $cell );
                } else {
                    // Refetch the about attribute since 'reparseTemplatedAttributes'
                    // might have added one to it.
                    $about = DOMCompat::getAttribute( $cell, 'about' );
                }

                // about may not be present if the cell was inside
                // wrapped template content rather than being part
                // of the outermost wrapper.
                if ( $about !== null ) {
                    $newCell->setAttribute( 'about', $about );
                    if ( $dtState->tplInfo && $dtState->tplInfo->last === $cell ) {
                        $dtState->tplInfo->last = $newCell;
                    }
                }
                $newCell->appendChild( $ownerDoc->createTextNode( $match[2] ?? '' ) );
                $cell->parentNode->insertBefore( $newCell, $cell->nextSibling );

                // Set data-parsoid noAttrs flag
                $newCellDp = DOMDataUtils::getDataParsoid( $newCell );
                // This new cell has 'row' stx (would be set if the tokenizer had parsed it)
                $newCellDp->stx = 'row';
                $newCellDp->setTempFlag( TempData::NO_ATTRS );
                // It is important to set this so that when $newCell is processed by this pass,
                // it won't accidentally recombine again with the previous cell!
                $newCellDp->setTempFlag( TempData::NON_MERGEABLE_TABLE_CELL );
            }
        }

        $child = $next;
    }

    return true;
}
765 | } |