Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 433 |
|
0.00% |
0 / 14 |
CRAP | |
0.00% |
0 / 1 |
| TableFixups | |
0.00% |
0 / 433 |
|
0.00% |
0 / 14 |
29756 | |
0.00% |
0 / 1 |
| isSimpleTemplatedSpan | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
20 | |||
| fillDSRGap | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
| hoistTransclusionInfo | |
0.00% |
0 / 62 |
|
0.00% |
0 / 1 |
210 | |||
| collectAttributishContent | |
0.00% |
0 / 35 |
|
0.00% |
0 / 1 |
156 | |||
| reparseTemplatedAttributes | |
0.00% |
0 / 31 |
|
0.00% |
0 / 1 |
90 | |||
| stripTrailingPipe | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
20 | |||
| transferSourceBetweenCells | |
0.00% |
0 / 26 |
|
0.00% |
0 / 1 |
210 | |||
| mergeCells | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
42 | |||
| convertAttribsToContent | |
0.00% |
0 / 53 |
|
0.00% |
0 / 1 |
342 | |||
| reparseWithPreviousCell | |
0.00% |
0 / 57 |
|
0.00% |
0 / 1 |
506 | |||
| shouldAbortAttr | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
| pipeStatusInContent | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
210 | |||
| getReparseType | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
240 | |||
| handleTableCellTemplates | |
0.00% |
0 / 93 |
|
0.00% |
0 / 1 |
1332 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Wt2Html\DOM\Handlers; |
| 5 | |
| 6 | use Wikimedia\Assert\Assert; |
| 7 | use Wikimedia\Parsoid\Config\Env; |
| 8 | use Wikimedia\Parsoid\Core\DOMCompat; |
| 9 | use Wikimedia\Parsoid\Core\Sanitizer; |
| 10 | use Wikimedia\Parsoid\Core\Source; |
| 11 | use Wikimedia\Parsoid\Core\SourceRange; |
| 12 | use Wikimedia\Parsoid\DOM\Comment; |
| 13 | use Wikimedia\Parsoid\DOM\Element; |
| 14 | use Wikimedia\Parsoid\DOM\Node; |
| 15 | use Wikimedia\Parsoid\DOM\Text; |
| 16 | use Wikimedia\Parsoid\NodeData\DataMw; |
| 17 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
| 18 | use Wikimedia\Parsoid\NodeData\TempData; |
| 19 | use Wikimedia\Parsoid\NodeData\TemplateInfo; |
| 20 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
| 21 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
| 22 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 23 | use Wikimedia\Parsoid\Utils\DTState; |
| 24 | use Wikimedia\Parsoid\Utils\PHPUtils; |
| 25 | use Wikimedia\Parsoid\Utils\PipelineUtils; |
| 26 | use Wikimedia\Parsoid\Utils\Utils; |
| 27 | use Wikimedia\Parsoid\Utils\WTUtils; |
| 28 | use Wikimedia\Parsoid\Wt2Html\Frame; |
| 29 | use Wikimedia\Parsoid\Wt2Html\PegTokenizer; |
| 30 | |
| 31 | /** |
| 32 | * Provides DOMTraverser visitors that fix template-induced interrupted table cell parsing |
| 33 | * by recombining table cells and/or reparsing table cell content as attributes. |
| 34 | * - handleTableCellTemplates |
| 35 | */ |
| 36 | class TableFixups { |
| 37 | |
/**
 * Check whether $node is a <span> element that carries an 'about'
 * attribute and whose children are all text or comment nodes.
 *
 * @param Node $node
 * @return bool
 */
private static function isSimpleTemplatedSpan( Node $node ): bool {
	if ( DOMUtils::nodeName( $node ) !== 'span' ) {
		return false;
	}

	// Also narrows the type so that phan is happy with the calls below.
	if ( !( $node instanceof Element ) ) {
		return false;
	}

	// Testing for 'about' is sufficient only because this is called
	// from a context where we are tracking templates.
	if ( DOMCompat::getAttribute( $node, 'about' ) === null ) {
		return false;
	}

	return DOMUtils::allChildrenAreTextOrComments( $node );
}
| 46 | |
/**
 * Append the source text found in [$offset1, $offset2) to $parts.
 * Nothing is appended when the range is empty or inverted.
 *
 * @param list<string|TemplateInfo> &$parts
 * @param Source $source
 * @param int $offset1
 * @param int $offset2
 */
private static function fillDSRGap( array &$parts, Source $source, int $offset1, int $offset2 ): void {
	$gapWidth = $offset2 - $offset1;
	if ( $gapWidth <= 0 ) {
		return;
	}

	$parts[] = PHPUtils::safeSubstr( $source->getSrcText(), $offset1, $gapWidth );
}
| 58 | |
/**
 * Hoist transclusion information from cell content / attributes
 * onto the cell itself.
 *
 * After this runs, $td is marked as the 'mw:Transclusion' wrapper: it
 * receives the 'about' id of the last transclusion, a fresh data-mw whose
 * parts are assembled from every transclusion (with source text filling
 * the DSR gaps between them), and the concatenated data-parsoid 'pi' info.
 * The encapsulation attributes are then stripped from the children.
 *
 * @param DTState $dtState Supplies the frame (options['frame']); its
 *   tplInfo is initialized here when it is not set yet.
 * @param array $transclusions Transclusion wrapper nodes in document order.
 *   The code reads $transclusions[0], so the list must be non-empty
 *   (NOTE(review): relies on callers guaranteeing this).
 * @param Element $td The cell that takes over the encapsulation.
 */
private static function hoistTransclusionInfo(
	DTState $dtState, array $transclusions, Element $td
): void {
	// Initialize dsr for $td
	// In `handleTableCellTemplates`, we're creating a cell w/o dsr info.
	$tdDp = DOMDataUtils::getDataParsoid( $td );
	if ( !Utils::isValidDSR( $tdDp->dsr ?? null ) ) {
		$tplDp = DOMDataUtils::getDataParsoid( $transclusions[0] );
		Assert::invariant( Utils::isValidDSR( $tplDp->dsr ?? null ), 'Expected valid DSR' );
		$tdDp->dsr = clone $tplDp->dsr;
	}

	// Build up $parts, $pi to set up the combined transclusion info on $td.
	// Note that content for all but the last template has been swallowed into
	// the attributes of $td.
	$parts = [];
	$pi = [];
	$lastTpl = null;
	$prevDp = null;
	$frame = $dtState->options['frame'];
	// 'about' ids of every hoisted transclusion; used further down to find
	// the children whose encapsulation attributes must be removed.
	$aboutIdArray = [];

	// Running index assigned to the non-string (template) parts.
	$index = 0;
	foreach ( $transclusions as $tpl ) {
		$aboutIdArray[] = DOMCompat::getAttribute( $tpl, 'about' );

		$tplDp = DOMDataUtils::getDataParsoid( $tpl );
		Assert::invariant( Utils::isValidDSR( $tplDp->dsr ?? null ), 'Expected valid DSR' );

		// Plug DSR gaps between transclusions
		if ( !$prevDp ) {
			// First transclusion: the gap runs from the start of $td.
			self::fillDSRGap(
				$parts, $tdDp->dsr->source ?? $frame->getSource(),
				$tdDp->dsr->start, $tplDp->dsr->start );
		} else {
			// Later transclusions: the gap runs from the end of the previous one.
			self::fillDSRGap(
				$parts, $prevDp->dsr->source ?? $frame->getSource(),
				$prevDp->dsr->end, $tplDp->dsr->start );
		}

		// Assimilate $tpl's data-mw and data-parsoid pi info
		$dmw = DOMDataUtils::getDataMw( $tpl );
		foreach ( $dmw->parts ?? [] as $part ) {
			// Template index is relative to other transclusions.
			// This index is used to extract whitespace information from
			// data-parsoid and that array only includes info for templates.
			// So skip over strings here.
			if ( !is_string( $part ) ) {
				// Cloning is strictly not needed here, but mimicking
				// code in WrapSectionsState.php
				$part = clone $part;
				$part->i = $index++;
			}
			$parts[] = $part;
		}
		PHPUtils::pushArray( $pi, $tplDp->pi ?? [ [] ] );
		// The combined data-mw lives on $td from now on.
		DOMDataUtils::setDataMw( $tpl, null );

		$lastTpl = $tpl;
		$prevDp = $tplDp;
	}

	$aboutId = DOMCompat::getAttribute( $lastTpl, 'about' );

	// Hoist transclusion information to $td.
	$td->setAttribute( 'typeof', 'mw:Transclusion' );
	$td->setAttribute( 'about', $aboutId );

	// Add wikitext for the table cell content following $lastTpl
	self::fillDSRGap( $parts, $prevDp->dsr->source ?? $frame->getSource(), $prevDp->dsr->end, $tdDp->dsr->end );

	// Save the new data-mw on the td
	$dmw = new DataMw( [] );
	$dmw->parts = $parts;
	DOMDataUtils::setDataMw( $td, $dmw );
	$tdDp->pi = $pi;

	// td wraps everything now.
	// Remove template encapsulation from here on.
	// This simplifies the problem of analyzing the <td>
	// for additional fixups (|| Boo || Baz) by potentially
	// invoking 'reparseTemplatedAttributes' on split cells
	// with some modifications.
	$cell = $td;
	// Walk $td and any following siblings that share the hoisted about id.
	while ( $cell instanceof Element && DOMCompat::getAttribute( $cell, 'about' ) === $aboutId ) {
		$child = $cell->firstChild;
		while ( $child ) {
			// Remember the successor up front since $child may be removed below.
			$next = $child->nextSibling;
			if ( $child instanceof Element &&
				in_array( DOMCompat::getAttribute( $child, 'about' ), $aboutIdArray, true )
			) {
				// Remove the encapsulation attributes.
				$child->removeAttribute( 'about' );
				DOMUtils::removeTypeOf( $child, 'mw:Transclusion' );
				// If there are no more attributes left, useless spans wrapper can be removed.
				if ( DOMDataUtils::getDataParsoid( $child )->getTempFlag( TempData::WRAPPER ) ) {
					// Continue with the wrapper's own children (they are moved
					// in front of the wrapper), or with its sibling if it is empty.
					$next = $child->firstChild ?: $child->nextSibling;
					DOMUtils::migrateChildren( $child, $cell, $child );
					$child->parentNode->removeChild( $child );
				}
			}
			$child = $next;
		}
		$cell = $cell->nextSibling;
	}

	// $dtState->tplInfo can be null when information is hoisted
	// from children to $td because DOMTraverser hasn't seen the
	// children yet!
	if ( !$dtState->tplInfo ) {
		$dtState->tplInfo = (object)[
			'first' => $td,
			'last' => $td,
			'clear' => false
		];
	}
}
| 180 | |
/**
 * Gather the text that could potentially be (re)parsed as cell attributes.
 *
 * The expected shape is: text nodes without a pipe character, followed by
 * one or more nowiki spans, followed by a template encapsulation with
 * pure-text and nowiki content. Gathering ends as soon as a text node with
 * a single (non-doubled) pipe character is seen, or when an element is
 * found that aborts attribute parsing.
 *
 * @param Env $env
 * @param Element $cell known to be <td> / <th>
 * @param ?Element $templateWrapper
 * @return ?array{txt: string, frags: list<?string>, transclusions: list<Element>}
 *   null when the walk over $cell's content finished without hitting a
 *   stop condition.
 */
public static function collectAttributishContent(
	Env $env, Element $cell, ?Element $templateWrapper
): ?array {
	$textPieces = [];
	$fragTexts = [];
	$tplNodes = $templateWrapper ? [ $templateWrapper ] : [];

	// DSR-based recovery of the wikitext outside templates could replace
	// part of this walk. However, templated content has to be walked like
	// this in any case, so the same logic is applied uniformly.

	$walk = static function ( ?Node $child ) use (
		&$walk, &$textPieces, &$fragTexts, &$tplNodes
	): bool {
		for ( ; $child; $child = $child->nextSibling ) {
			if ( $child instanceof Comment ) {
				// The legacy parser strips comments while parsing, so skip them.
				continue;
			}

			if ( $child instanceof Text ) {
				$value = $child->nodeValue;
				$textPieces[] = $value;

				// A lone pipe ends the accumulation.
				if ( preg_match( '/(?:^|[^|])\|(?:[^|]|$)/D', $value ) ) {
					return true;
				}
				continue;
			}

			'@phan-var Element $child'; /** @var Element $child */
			if ( DOMUtils::hasTypeOf( $child, 'mw:Transclusion' ) ) {
				$tplNodes[] = $child;
			}

			if ( DOMUtils::hasTypeOf( $child, 'mw:Entity' ) ) {
				// Prefer the entity's wikitext source over its rendered
				// content: a rendered entity (e.g. a newline) can break
				// attribute parsing.
				$textPieces[] = DOMDataUtils::getDataParsoid( $child )->src ?? $child->textContent;
				continue;
			}

			if ( DOMUtils::hasTypeOf( $child, 'mw:DOMFragment' ) ) {
				$fragDOM = WTUtils::getDOMFragmentContents( $child );
				// FIXME: This is correct only for nowikis.
				// Anything else needs a decision about how to deal with
				// the extension's opening & closing tags.
				$fragTexts[] = $fragDOM->firstChild->textContent;
				$textPieces[] = '<frag-marker>';
				continue;
			}

			// Either this element aborts attribute parsing outright, or
			// the stop condition is found somewhere in its subtree.
			if ( self::shouldAbortAttr( $child ) || $walk( $child->firstChild ) ) {
				return true;
			}
		}

		return false;
	};

	if ( !$walk( $cell->firstChild ) ) {
		return null;
	}

	return [
		'txt' => implode( '', $textPieces ),
		'frags' => $fragTexts,
		'transclusions' => $tplNodes,
	];
}
| 261 | |
/**
 * T46498, second part of T52603
 *
 * Handle wikitext like
 * ```
 * {|
 * |{{nom|Bar}}
 * |}
 * ```
 * where nom expands to `style="foo" class="bar"|Bar`. The attributes are
 * tokenized and stripped from the table contents.
 *
 * This method works well for the templates documented in
 * https://en.wikipedia.org/wiki/Template:Table_cell_templates/doc
 *
 * Nevertheless, there are some limitations:
 * - We assume that attributes don't contain wiki markup (apart from <nowiki>)
 *   and end up in text or nowiki nodes.
 * - Only a single table cell is produced / opened by the template that
 *   contains the attributes. This limitation could be lifted with more
 *   aggressive re-parsing if really needed in practice.
 * - There is only a single transclusion in the table cell content. This
 *   limitation can be lifted with more advanced data-mw construction.
 *
 * @param DTState $dtState Supplies the env and caches the tokenizer.
 * @param Element $cell known to be <td> / <th>
 * @param ?Element $templateWrapper
 */
public static function reparseTemplatedAttributes(
	DTState $dtState, Element $cell, ?Element $templateWrapper
): void {
	$env = $dtState->env;
	// Collect attribute content and examine it
	$attributishContent = self::collectAttributishContent( $env, $cell, $templateWrapper );
	if ( !$attributishContent ) {
		return;
	}

	/**
	 * FIXME: These checks are insufficient.
	 * Previous rounds of table fixups might have created this cell without
	 * any templated content (the while loop in handleTableCellTemplates).
	 * Till we figure out a reliable test for this, we'll reparse attributes always.
	 *
	 * // This DOM pass is trying to bridge broken parses across
	 * // template boundaries. So, if templates aren't involved,
	 * // no reason to reparse.
	 * if ( count( $attributishContent['transclusions'] ) === 0 &&
	 *      !WTUtils::fromEncapsulatedContent( $cell )
	 * ) {
	 *     return;
	 * }
	 */

	$attrText = $attributishContent['txt'];
	if ( !preg_match( '/(^[^|]+\|)([^|]|$)/D', $attrText, $matches ) ) {
		return;
	}
	$attributishPrefix = $matches[1];

	// Splice in fragment content. We added in <frag-marker> markers to prevent
	// the above regexps from matching protected chars.
	if ( str_contains( $attributishPrefix, '<frag-marker>' ) ) {
		$frags = $attributishContent['frags'];
		$attributishPrefix = preg_replace_callback(
			'/<frag-marker>/',
			static function ( array $match ) use ( &$frags ) {
				// No collected fragment is left for this marker (the marker
				// text did not originate from a fragment): keep it as is
				// instead of handing null to preg_replace(), which is a
				// TypeError under strict_types.
				if ( !$frags ) {
					return $match[0];
				}
				// This is a little tricky. We want to use the content from the
				// fragments to reparse the string to key/val pairs but the rule,
				// single_cell_table_args, will invariably get tripped up on
				// newlines which, to this point, were shuttled through the fragment.
				// Core sanitizer will do this replacement in attr vals
				// so it's a safe normalization to do here.
				// A fragment entry may be null (see the 'frags' shape), so
				// fall back to the empty string.
				return preg_replace( '/\s+/', ' ', array_shift( $frags ) ?? '' );
			},
			$attributishPrefix
		);
	}

	// re-parse the attributish prefix
	if ( !$dtState->tokenizer ) {
		$dtState->tokenizer = new PegTokenizer( $env );
	}
	$attributeTokens = $dtState->tokenizer->tokenizeTableCellAttributes( $attributishPrefix, false );

	// No attributes => nothing more to do!
	if ( !$attributeTokens ) {
		return;
	}

	// Note that `row_syntax_table_args` (the rule used for tokenizing above)
	// returns an array consisting of [table_attributes, spaces, pipe]
	$attrs = $attributeTokens[0];

	// Sanitize attrs and transfer them to the td node
	Sanitizer::applySanitizedArgs( $env->getSiteConfig(), $cell, $attrs );
	$cellDp = DOMDataUtils::getDataParsoid( $cell );
	// Reparsed cells start off as non-mergeable-table cells
	// and preserve that property after reparsing
	$cellDp->setTempFlag( TempData::MERGED_TABLE_CELL );
	$cellDp->setTempFlag( TempData::TABLE_CELL_WITH_NO_ATTRIBUTE_SYNTAX, false );

	// If the transclusion node was embedded within the td node,
	// lift up the about group to the td node.
	$transclusions = $attributishContent['transclusions'];
	if ( $transclusions && ( $cell !== $transclusions[0] || count( $transclusions ) > 1 ) ) {
		self::hoistTransclusionInfo( $dtState, $transclusions, $cell );
	}

	// Drop content that has been consumed by the reparsed attribute content.
	// NOTE: We serialize and reparse data-object-id attributes as well which
	// ensures stashed data-* attributes continue to be usable.
	// FIXME: This is too naive. What about all the care we showed in `collectAttributishContent`?
	DOMCompat::setInnerHTML( $cell,
		preg_replace( '/^[^|]*\|/', '', DOMCompat::getInnerHTML( $cell ) ) );
}
| 375 | |
/**
 * Remove the last character of the last (deepest, right-most) text node
 * of $cell and return it.
 *
 * $cell's last character is known to be a '|' (for <td>) or '!' (for <th>)
 *
 * @param Element $cell
 * @return ?string The stripped character, or null when there is no
 *   trailing text node or that text node is empty.
 */
private static function stripTrailingPipe( Element $cell ): ?string {
	// Descend along the last-child chain until a text node is reached.
	$lc = $cell->lastChild;
	while ( $lc && !( $lc instanceof Text ) ) {
		$lc = $lc->lastChild;
	}

	if ( !$lc ) {
		// FIXME: Is this code reachable?
		return null;
	}

	$txt = $lc->textContent;
	if ( $txt === '' ) {
		// Nothing to strip. Reading $txt[-1] here would be an
		// uninitialized string offset; report failure instead.
		return null;
	}

	// The stripped char is ASCII ('|' or '!'), so byte-wise ops are fine.
	$lc->textContent = substr( $txt, 0, -1 );
	return substr( $txt, -1 );
}
| 395 | |
/**
 * Attribute names that hold Parsoid's own bookkeeping rather than
 * attributes authored in the cell's wikitext. They are skipped when
 * attributes are copied between cells of identical type in mergeCells()
 * and are preserved when convertAttribsToContent() strips a cell's
 * attributes.
 */
private const PARSOID_ATTRIBUTES = [
	'data-object-id', 'typeof', 'about', 'data-parsoid', 'data-mw'
];
| 399 | |
/**
 * Move $src (a piece of wikitext source) from the end of one cell to the
 * start of another, adjusting data-mw, syntax info and DSR on both.
 *
 * This is called in two cases:
 * (a) when two cells are merged, source is transferred from the source
 *     cell to the target cell.
 *
 *     This is called from mergeCells( .. )
 *
 * (b) when a pipe (| for td, ! for th) is being transferred from one cell
 *     to another, making the recipient cell a 'row' syntax cell. In this
 *     case, the pipe char could come from content (when the cell has content)
 *     OR from the attribute-terminator (when the cell has no content).
 *     In the attribute-terminator case, the pipe transfer requires that
 *     the openWidth dsr property be decremented by 1 for the source cell.
 *
 *     This is called from reparseWithPreviousCell( .. )
 *
 * @param string $src
 * @param Element $from
 * @param Element $to
 * @param bool $emptyFromContent
 */
private static function transferSourceBetweenCells(
	string $src, Element $from, Element $to, bool $emptyFromContent
): void {
	// If $to is an encapsulation wrapper, the source becomes its first part.
	if ( DOMUtils::hasTypeOf( $to, 'mw:Transclusion' ) ) {
		$toDataMw = DOMDataUtils::getDataMw( $to );
		array_unshift( $toDataMw->parts, $src );
	}

	$fromDp = DOMDataUtils::getDataParsoid( $from );
	$toDp = DOMDataUtils::getDataParsoid( $to );

	$pipeChar = DOMUtils::nodeName( $to ) === 'td' ? '|' : '!';
	if ( $pipeChar === '|' ) {
		unset( $fromDp->startTagSrc, $fromDp->attrSepSrc );
	}

	// Source ending in the cell's pipe char turns $to into a row-syntax cell.
	$endsWithPipe = str_ends_with( $src, $pipeChar );
	if ( $endsWithPipe ) {
		$toDp->stx = 'row';
	}

	$width = strlen( $src );

	// $to now starts earlier in the source.
	$toRange = $toDp->dsr ?? null;
	if ( $toRange ) {
		if ( $toRange->start ) {
			$toRange->start -= $width;
		}
		if ( $endsWithPipe && $toRange->openWidth ) {
			$toRange->openWidth++;
		}
	}

	// $from now ends earlier in the source.
	$fromRange = $fromDp->dsr ?? null;
	if ( $fromRange ) {
		if ( $fromRange->end ) {
			$fromRange->end -= $width;
		}
		if ( $endsWithPipe && $fromRange->openWidth && $emptyFromContent ) {
			$fromRange->openWidth--;
		}
	}
}
| 459 | |
/**
 * Merge two adjacent cells into one.
 *
 * When both cells have the same node name, $from is folded into $to.
 * Otherwise $to is folded into $from.
 *
 * @param string $fromSrc Source text of $from
 * @param Element $from
 * @param Element $to
 */
private static function mergeCells( string $fromSrc, Element $from, Element $to ): void {
	// Update data-mw, DSR if $to is an encapsulation wrapper
	self::transferSourceBetweenCells( $fromSrc, $from, $to, false );

	$sameCellType = DOMUtils::nodeName( $from ) === DOMUtils::nodeName( $to );
	if ( $sameCellType ) {
		$donor = $from;
		$survivor = $to;
	} else {
		// $from is the authoritative cell here, but $to has the
		// transclusion attributes. So data-mw, data-parsoid, etc. have
		// to be migrated to the surviving cell ($from) as well.
		$donor = $to;
		$survivor = $from;
	}

	foreach ( $donor->attributes as $attr ) {
		// Parsoid's own attributes are left alone for identical cell types.
		if ( $sameCellType && in_array( $attr->name, self::PARSOID_ATTRIBUTES, true ) ) {
			continue;
		}
		$survivor->setAttribute( $attr->name, $attr->value );
	}

	$insertBefore = $sameCellType ? $survivor->firstChild : null;
	DOMUtils::migrateChildren( $donor, $survivor, $insertBefore );
	$donor->parentNode->removeChild( $donor );

	// Combined cells don't merge further
	$survivorDp = DOMDataUtils::getDataParsoid( $survivor );
	$survivorDp->setTempFlag( TempData::MERGED_TABLE_CELL );
	$survivorDp->setTempFlag( TempData::TABLE_CELL_WITH_NO_ATTRIBUTE_SYNTAX, false );
}
| 485 | |
/**
 * Reprocess attribute source as a WT -> HTML transform
 * - If $cell's attributes were templated (mw:ExpandedAttrs typeof),
 *   we would have already processed these in AttributeExpander and
 *   stuffed it in data-mw. Just pull it out of there.
 * - If not, extract attribute source from the $cell and process it
 *   in a wikitext-to-fragment pipeline.
 *
 * The resulting nodes are inserted at the start of $cell, all attributes
 * other than Parsoid's own are removed, and the cell is flagged as having
 * no attribute syntax.
 *
 * @param Env $env
 * @param Frame $frame
 * @param Element $cell
 * @param bool $leadingPipe Prefix the new content with the cell's pipe
 *   char ('|' for <td>, '!' for <th>).
 * @param bool $trailingPipe Suffix the new content with a '|'.
 */
private static function convertAttribsToContent(
	Env $env, Frame $frame, Element $cell, bool $leadingPipe, bool $trailingPipe
): void {
	$doc = $cell->ownerDocument;
	$cellDp = DOMDataUtils::getDataParsoid( $cell );
	// Default to the empty string: the value is handed to preg_match()
	// and strlen() below, which reject null under strict_types.
	$cellAttrSrc = $cellDp->getTemp()->attrSrc ?? '';

	if ( DOMUtils::matchTypeOf( $cell, "/\bmw:ExpandedAttrs\b/" ) ) {
		DOMUtils::removeTypeOf( $cell, 'mw:ExpandedAttrs' );
		$dataMw = DOMDataUtils::getDataMw( $cell );
		unset( $dataMw->attribs );
	}

	// Process attribute wikitext as HTML
	$leadingPipeChar = DOMUtils::nodeName( $cell ) === 'td' ? '|' : '!';
	// FIXME: Encapsulated doesn't necessarily mean templated
	$fromTpl = WTUtils::fromEncapsulatedContent( $cell );
	if ( !preg_match( "#['[{<]#", $cellAttrSrc ) ) {
		// Optimization:
		// - SOL constructs like =-*# won't be found here
		// - If no non-sol wikitext constructs, this will just a plain string
		$str = ( $leadingPipe ? $leadingPipeChar : '' ) .
			$cellAttrSrc .
			( $cellAttrSrc && $trailingPipe ? '|' : '' );
		$children = [ $doc->createTextNode( $str ) ];
	} else {
		// Width of the cell's opening syntax that precedes the attribute
		// source within the tsr range.
		if ( isset( $cellDp->startTagSrc ) ) {
			$attrSrcOffset = strlen( $cellDp->startTagSrc );
		} elseif ( ( $cellDp->stx ?? '' ) === 'row' ) {
			$attrSrcOffset = 2;
		} else {
			$attrSrcOffset = 1;
		}
		$frag = PipelineUtils::processContentInPipeline(
			$env, $frame, $cellAttrSrc, [
				'sol' => false,
				'toplevel' => !$fromTpl,
				'srcOffsets' => $fromTpl ? null : new SourceRange(
					$cellDp->tsr->start + $attrSrcOffset, $cellDp->tsr->end - 1
				),
				'pipelineType' => 'wikitext-to-fragment',
				'pipelineOpts' => [ 'inlineContext' => true ]
			]
		);

		if ( $leadingPipe ) {
			$fc = $frag->firstChild;
			if ( $fc instanceof Text ) {
				$fc->textContent = $leadingPipeChar . $fc->textContent;
			} else {
				$frag->insertBefore( $doc->createTextNode( $leadingPipeChar ), $fc );
			}
		}
		if ( $trailingPipe ) {
			$lc = $frag->lastChild;
			if ( $lc instanceof Text ) {
				$lc->textContent .= '|';
			} else {
				$frag->appendChild( $doc->createTextNode( '|' ) );
			}
		}
		$children = DOMUtils::childNodes( $frag );
	}

	// Insert the new children ahead of the cell's existing content
	$sentinel = $cell->firstChild;
	foreach ( $children as $c ) {
		$cell->insertBefore( $c, $sentinel );
	}

	// Remove $cell's attributes
	// (iterate over a copy since the attribute list is mutated)
	foreach ( iterator_to_array( $cell->attributes ) as $attr ) {
		if ( !in_array( $attr->name, self::PARSOID_ATTRIBUTES, true ) ) {
			$cell->removeAttribute( $attr->name );
		}
	}

	// Remove shadow attributes to suppress them from wt2wt output!
	unset( $cellDp->a );
	unset( $cellDp->sa );

	// Update DSR
	if ( !$fromTpl ) {
		$excessDP = strlen( $cellAttrSrc ) + (int)$leadingPipe + (int)$trailingPipe;
		$cellDp->dsr->openWidth -= $excessDP;
	}

	// This has no attributes now
	$cellDp->setTempFlag( TempData::TABLE_CELL_WITH_NO_ATTRIBUTE_SYNTAX );
}
| 584 | |
/**
 * Given: $cell is not a NON_MERGEABLE_TABLE_CELL
 *   => $cell syntax is of the form: "|..." or "|..|.." (if <td>)
 *                               or: "!..." or "!..|.." (if <th>)
 *
 * Examine combined $prev and $cell syntax to see how it should
 * have actually parsed and fix up $prev & $cell appropriately.
 *
 * Depending on the shape of $prev, one of the following happens:
 * - $prev ends in a pipe char: the char is stripped from $prev and
 *   handed to $cell (no merging).
 * - $prev is a non-mergeable, non-row-syntax <td>: $cell's attributes
 *   become content and the cells merge, or (empty $prev) a pipe is
 *   handed to $cell.
 * - $prev has no attribute syntax: $prev's content is reparsed as
 *   $cell's attributes where possible, and the cells merge.
 * - $prev has attributes but no content: $prev's attributes become its
 *   content and a pipe is handed to $cell.
 * - otherwise: $cell's attributes and pipes become content and the
 *   cells merge.
 *
 * The previous sibling of $cell is used as $prev and is assumed to be
 * an Element with valid dsr / tsr (see the notes in the body).
 *
 * @param DTState $dtState
 * @param Element $cell
 * @return bool Always true in this implementation.
 */
private static function reparseWithPreviousCell( DTState $dtState, Element $cell ): bool {
	// NOTE: The comments in this method always assume
	// <td> && '|', but sometimes <th> & '!' are involved.

	$env = $dtState->env;
	$frame = $dtState->options['frame'];

	$prev = $cell->previousSibling;
	'@phan-var Element $prev'; // @var Element $prev

	$prevIsTd = DOMUtils::nodeName( $prev ) === 'td';
	$prevDp = DOMDataUtils::getDataParsoid( $prev );
	$prevHasAttrs = !$prevDp->getTempFlag( TempData::TABLE_CELL_WITH_NO_ATTRIBUTE_SYNTAX );

	$cellIsTd = DOMUtils::nodeName( $cell ) === 'td';
	$cellDp = DOMDataUtils::getDataParsoid( $cell );
	$cellHasAttrs = !$cellDp->getTempFlag( TempData::TABLE_CELL_WITH_NO_ATTRIBUTE_SYNTAX );

	// Even though we have valid dsr for $prev as a condition of entering
	// here, use tsr start because dsr computation may have expanded the range
	// to include fostered content
	$prevDsr = clone $prevDp->dsr;
	$prevDsr->start = $prevDp->tsr->start;

	// Full source of $prev (opening syntax + content)
	$prevCellSrc = $prevDsr->substr( $frame->getSource() );

	// $prevCellContent = substr( $prevCellSrc, $prevDp->dsr->openWidth );
	// The following is equivalent because td/th has zero end-tag width
	$prevCellContent = $prevDsr->innerSubstr( $frame->getSource() );

	// Parsoid currently doesn't support parsing "|<--cmt-->|" as
	// a "||" which legacy parser does. We won't support this.
	//
	// FIXME: $prev content could have a {{..}} that ended in a "|"
	// and that check is missing here. For now, we won't support this
	// use case unless necessary.
	$prevHasTrailingPipe =
		( $cellIsTd && str_ends_with( $prevCellContent, "|" ) ) ||
		( !$cellIsTd && !$prevIsTd && str_ends_with( $prevCellContent, "!" ) );

	if ( $prevHasTrailingPipe ) {
		// $prev is of form "..|"
		// => no cell merging
		//    strip "|" from $prev
		//    migrate "|" to $cell
		$strippedChar = self::stripTrailingPipe( $prev );
		if ( !$strippedChar ) {
			// We saw these in T384737, it's worth keeping around these conservative
			// checks for the time being
			$env->log( "error/wt2html", "TableFixups: stripTrailingPipe failed." );
		} else {
			self::transferSourceBetweenCells(
				// $prevHasTrailingPipe => $prevCellContent !== '' => last arg is false
				$strippedChar, $prev, $cell, false /* emptyFromContent */
			);
		}
	} elseif ( $prevIsTd &&
		$prevDp->getTempFlag( TempData::NON_MERGEABLE_TABLE_CELL )
		&& ( $prevDp->stx ?? '' ) !== 'row'
	) {
		if ( $prevCellContent !== '' ) {
			// $prev is of form "||.." in SOL position, no attributes, some content
			// Combined wikitext is "||..|.."
			// => <td>..|..</td>
			self::convertAttribsToContent( $env, $frame, $cell, true, true );
			self::mergeCells( $prevCellSrc, $prev, $cell );
		} else {
			// $prev is of form "||" in SOL position, no attributes, no content
			// Combined wikitext is "|||.."
			// => <td></td><td..>..</td>
			//    migrate "|" to $cell
			self::transferSourceBetweenCells( '|', $prev, $cell, true /* emptyFromContent */ ); // '!'
		}
	} elseif ( !$prevHasAttrs ) {
		// $prev has no attributes and is of form "|.." in SOL posn OR "||.." in non-SOL posn
		// => merge $prev into $cell
		//    if $cell had attributes, those become $cell's leading content with a trailing pipe
		if ( $cellIsTd && $cellHasAttrs ) {
			self::convertAttribsToContent( $env, $frame, $cell, false, true );
		}

		// If $cell is a <th>, we need a pipe for us to reprocess $prev's content
		// as $cell's attributes. So, <th> without attributes need special handling.
		if ( !$cellIsTd && !$cellHasAttrs ) {
			// $cell's "!" char should become content now when $prev
			// and $cell are merged below. This code is equivalent to
			// calling convertAttribsToContent( $env, $frame, $cell, true, false )
			$pipe = $cell->ownerDocument->createTextNode( '!' );
			$cell->insertBefore( $pipe, $cell->firstChild );
		} elseif ( $prevCellContent !== '' ) {
			// If $prev cell had content, those become $cell's attributes
			$reparseSrc = $prevCellContent . '|';

			// Reparse the attributish prefix
			if ( !$dtState->tokenizer ) {
				$dtState->tokenizer = new PegTokenizer( $env );
			}
			$attributeTokens = $dtState->tokenizer->tokenizeTableCellAttributes( $reparseSrc, false );
			if ( is_array( $attributeTokens ) ) {
				// Note that `row_syntax_table_args` (the rule used for tokenizing above)
				// returns an array consisting of [table_attributes, spaces, pipe]
				$attrs = $attributeTokens[0];
				Sanitizer::applySanitizedArgs( $env->getSiteConfig(), $cell, $attrs );

				// Remove all $prev's children
				// (their source has been consumed as $cell's attributes)
				DOMCompat::replaceChildren( $prev );
			} else {
				// FIXME: Why would this happen?
				// For now, should we just log errors to better understand this?
				//
				// Failed to successfully reparse $reparseSrc as table cell attributes
				// We'll let the cells merge, but we have to convert cell's attributes to content as well
				if ( $cellIsTd ) {
					// The leading pipe should become content since we skipped it
					// in the call to convertAttribsToContent above.
					$pipe = $cell->ownerDocument->createTextNode( '|' );
					$cell->insertBefore( $pipe, $cell->firstChild );
				} elseif ( $cellHasAttrs ) {
					// We skipped <th> above
					self::convertAttribsToContent( $env, $frame, $cell, true, true );
				}
			}
		}

		// Merge cells
		self::mergeCells( $prevCellSrc, $prev, $cell );
	} elseif ( $prevCellContent === '' ) {
		// $prev has attributes and is of form "|..|" in SOL or "||..|" in non-SOL
		// => no cell merging,
		//    $prev's attributes are actually its contents
		//    migrate "|" to $cell
		self::convertAttribsToContent( $env, $frame, $prev, false, false );
		self::transferSourceBetweenCells( '|', $prev, $cell, true /* emptyFromContent */ );
	} else {
		// $prev has attributes and is of form "|..|.." in SOL or "||..|.." in non-SOL
		// => $cell merges into $prev (its attrs & pipes become content)
		self::convertAttribsToContent( $env, $frame, $cell, true, true );
		self::mergeCells( $prevCellSrc, $prev, $cell );
	}

	return true;
}
| 739 | |
/**
 * Tell whether $child is a construct that aborts attribute reparsing.
 *
 * The legacy parser naively aborts attributes on '/\[\[|-\{/', so
 * wikilink-syntax and language converter constructs should follow suit
 * here. Nodes accepted by WTUtils::isGeneratedFigure() are treated the
 * same way.
 *
 * @param Element $child
 * @return bool
 */
private static function shouldAbortAttr( Element $child ): bool {
	if ( DOMUtils::matchRel( $child, WTUtils::WIKILINK_SYNTAX_CONSTRUCTS_REGEXP ) ) {
		return true;
	}
	return (bool)WTUtils::isGeneratedFigure( $child );
}
| 748 | |
/**
 * Walk the subtree under $node looking for a text node that matches
 * $testRE while the walk is inside templated content.
 *
 * @param Element $node Element whose children are examined (recursively)
 * @param string $testRE Regexp tested against the content of text nodes
 * @param bool $inTplContent Whether the content of $node is already
 *   considered templated when the walk starts
 * @param bool $noAttrReparsing If true, a hit is reported as
 *   MAYBE_SPLIT_CELL instead of MAYBE_REPARSE_ATTRS
 * @return ReparseScenario NOT_NEEDED if no matching text node was found
 */
private static function pipeStatusInContent(
	Element $node, string $testRE, bool $inTplContent, bool $noAttrReparsing = false
): ReparseScenario {
	// 'about' id of the transclusion that switched $inTplContent on in this walk
	$activeAbout = null;

	for ( $cur = $node->firstChild; $cur; $cur = $cur->nextSibling ) {
		if ( $cur instanceof Text ) {
			// Only text found in templated content is of interest
			if ( $inTplContent && preg_match( $testRE, $cur->textContent ) ) {
				return $noAttrReparsing
					? ReparseScenario::MAYBE_SPLIT_CELL
					: ReparseScenario::MAYBE_REPARSE_ATTRS;
			}
			continue;
		}

		if ( !( $cur instanceof Element ) ) {
			continue;
		}

		// We walked past the last node of the transclusion we were tracking
		if ( $activeAbout && DOMCompat::getAttribute( $cur, 'about' ) !== $activeAbout ) {
			$inTplContent = false;
			$activeAbout = null;
		}

		// A new transclusion starts at this element
		if ( !$inTplContent && DOMUtils::hasTypeOf( $cur, 'mw:Transclusion' ) ) {
			$inTplContent = true;
			$activeAbout = DOMCompat::getAttribute( $cur, 'about' );
		}

		// FIXME: Extlinks can hide pipes in dom fragments, but that is not handled
		// right now -- it is likely an edge case and it is icky to deal with.
		// Other sources of dom fragments (parser functions returning HTML, ...)
		// could hide pipes as well, but it is unclear that this form of string
		// gluing needs support here. It is treated as unsupported behavior
		// unless real uses show up that need to be dealt with.
		if ( DOMUtils::hasTypeOf( $cur, 'mw:DOMFragment' ) ) {
			continue;
		}

		// "|" chars in extension/language variant content don't trigger
		// table-cell parsing since they have higher precedence in tokenization
		if ( self::shouldAbortAttr( $cur ) ) {
			$noAttrReparsing = true;
		}

		// A "|" char in the HTML will trigger table cell tokenization.
		// Ex: "| foobar <div> x | y </div>" will split the <div>
		// in table-cell tokenization context.
		$nested = self::pipeStatusInContent( $cur, $testRE, $inTplContent, $noAttrReparsing );
		if ( $nested !== ReparseScenario::NOT_NEEDED ) {
			return $nested;
		}

		// Nothing found below $cur: keep looking in the following siblings
	}

	return ReparseScenario::NOT_NEEDED;
}
| 804 | |
/**
 * Work out which kind of reparse (if any) $cell may need.
 *
 * $cell is known to be <td>/<th>
 *
 * @param Element $cell
 * @param DTState $dtState
 * @return ReparseScenario
 */
private static function getReparseType( Element $cell, DTState $dtState ): ReparseScenario {
	$cellDp = DOMDataUtils::getDataParsoid( $cell );

	// Template wrapping, which happens prior to this pass, may have combined
	// various regions. The important indicator of whether we want to try
	// to combine is whether $cell was the first node of a template.
	$eligibleForMerge = $cellDp->getTempFlag( TempData::AT_SRC_START ) &&
		!$cellDp->getTempFlag( TempData::NON_MERGEABLE_TABLE_CELL ) &&
		!$cellDp->getTempFlag( TempData::MERGED_TABLE_CELL ) &&
		!$cellDp->getTempFlag( TempData::FAILED_REPARSE );

	if ( $eligibleForMerge ) {
		// Look for opportunities where table cells could combine. This requires
		// $cell to be a templated cell. But combining templated cells with other
		// templated cells is not supported, so the previous sibling cannot be
		// templated.
		//
		// So, bail out of scenarios where the previous cell comes from a template
		// (the checks for isValidDSR( $prevDp->dsr ) and a valid opening tag
		// width catch this).
		$prev = $cell->previousSibling;
		if ( $prev instanceof Element ) {
			$prevDp = DOMDataUtils::getDataParsoid( $prev );
			if (
				!WTUtils::hasLiteralHTMLMarker( $prevDp ) &&
				Utils::isValidDSR( $prevDp->dsr ?? null, true ) &&
				!DOMUtils::hasTypeOf( $prev, 'mw:Transclusion' ) &&
				!str_contains( DOMCompat::getInnerHTML( $prev ), "\n" )
			) {
				return ReparseScenario::MAYBE_COMBINE_WITH_PREV_CELL;
			}
		}
	}

	// FIXME: We're traversing with the outermost encapsulation, but encapsulations
	// can be nested (ie. template in extension content) so the check is insufficient
	$inTplContent = $dtState->tplInfo !== null &&
		DOMUtils::hasTypeOf( $dtState->tplInfo->first, 'mw:Transclusion' );

	// In a <th>, "!" is a cell separator char in addition to "|"
	$testRE = match ( DOMUtils::nodeName( $cell ) ) {
		'td' => '/[|]/',
		default => '/[!|]/',
	};

	if ( !$cellDp->getTempFlag( TempData::TABLE_CELL_WITH_NO_ATTRIBUTE_SYNTAX ) ) {
		$noAttrReparsing = true;
	} else {
		// In TokenizerUtils::buildTableTokens(), there is a special case that adds
		// the no attribute syntax flag to || found in SOL position, since, coming
		// from a template, we don't have the context of whether the cell is truly
		// at SOL or if this should be interpreted as row syntax.
		//
		// Alternatively, we could check for SOL:
		// ( $cell->previousSibling instanceof Text &&
		// preg_match( '/\n/', $cell->previousSibling->nodeValue ?? '' ) )
		$noAttrReparsing = $cellDp->getTempFlag( TempData::NON_MERGEABLE_TABLE_CELL ) &&
			( $cellDp->stx ?? '' ) !== 'row';
	}

	return self::pipeStatusInContent( $cell, $testRE, $inTplContent, $noAttrReparsing );
}
| 855 | |
/**
 * In a wikitext-syntax-table-parsing context, the meaning of
 * "|", "||", "!", "!!" is context-sensitive. On top of that, the
 * complete syntactical construct for a table cell (leading pipes,
 * attributes, and the content-separating pipe char) might straddle
 * a template boundary - with part of it coming from the top-level
 * and part of it from a template.
 *
 * Since Parsoid parses template content independent of top-level
 * content (without any preceding context), its table-cell parsing in
 * templated contexts might be incorrect.
 *
 * This table-fixups pass deals with the resulting cell-merging and
 * cell-reparsing scenarios.
 *
 * HTML-syntax cells and non-templated cells without any templated content
 * are not subject to this transformation and are skipped right away.
 *
 * FIXME: This pass can benefit from a customized processor rather than
 * piggyback on top of DOMTraverser since the DOM can be significantly
 * mutated in these handlers.
 *
 * @param Element $tableOrCell If a cell, $cell is known to be <td>/<th>
 * @param DTState $dtState
 * @return bool|Node|null true in the common case; the cell itself when it
 *   should be handed to this handler again; for a skipped table, its next
 *   sibling (which may be null)
 */
public static function handleTableCellTemplates( Element $tableOrCell, DTState $dtState ) {
	$tagName = DOMUtils::nodeName( $tableOrCell );
	$isTemplated = DOMUtils::hasTypeOf( $tableOrCell, 'mw:Transclusion' );

	if ( $tagName === 'table' ) {
		// If the table is templated and is from a well-balanced template, individual
		// cells had been expanded in the preprocessor and there is no need to examine
		// individual cells for reparsing. Skip the entire table.
		$skipTable = $isTemplated &&
			DOMDataUtils::getDataMw( $tableOrCell )->fromWellBalancedTemplate();
		return $skipTable ? $tableOrCell->nextSibling : true;
	}

	$cell = $tableOrCell;
	if ( WTUtils::isLiteralHTMLNode( $cell ) ) {
		return true;
	}

	$cellDp = DOMDataUtils::getDataParsoid( $cell );
	if ( isset( $cellDp->getTemp()->cellAttrTerminatorSeen ) ) {
		self::convertAttribsToContent( $dtState->env, $dtState->options['frame'], $cell, false, true );
		unset( $cellDp->getTemp()->cellAttrTerminatorSeen );
		// Hand $cell back so it is reprocessed, in case this round
		// made it suitable for additional processing.
		return $cell;
	}

	// <th> special case: "!! foo" is parsed as <th>! foo</th> but should have
	// been parsed as <th>foo</th> when not the first child.
	// - The "!" wouldn't be the first content char if attrs were present, hence
	//   the no-attribute-syntax flag check.
	// - The previous-sibling check verifies that the previous sibling is not "\n",
	//   which would signal that this <th> is on a fresh line and the "!" shouldn't
	//   be stripped. If this weren't template output, "stx" === 'row' would be
	//   checked instead.
	// FIXME: Note that this check is fragile and doesn't always work, but this is
	// the price we pay for Parsoid's independent template parsing!
	$bangMayBeStray = $tagName === 'th' && $isTemplated &&
		$cellDp->getTempFlag( TempData::TABLE_CELL_WITH_NO_ATTRIBUTE_SYNTAX ) &&
		$cell->previousSibling instanceof Element;
	if ( $bangMayBeStray ) {
		$firstContent = DiffDOMUtils::firstNonSepChild( $cell );
		if ( $firstContent instanceof Text && str_starts_with( $firstContent->nodeValue, '!' ) ) {
			$firstContent->nodeValue = substr( $firstContent->nodeValue, 1 );
			$cellDp->stx = 'row';
			$cellDp->setTempFlag( TempData::NON_MERGEABLE_TABLE_CELL );
		}
	}

	$scenario = self::getReparseType( $cell, $dtState );
	if ( $scenario === ReparseScenario::NOT_NEEDED ) {
		return true;
	}

	if ( $scenario === ReparseScenario::MAYBE_COMBINE_WITH_PREV_CELL ) {
		if ( !self::reparseWithPreviousCell( $dtState, $cell ) ) {
			// Record the failure and retry $cell for other reparses.
			// The DOMTraverser will resume the handler on the returned $cell.
			$cellDp->setTempFlag( TempData::FAILED_REPARSE );
			return $cell;
		}
		return true;
	}

	// If the cell didn't have attrs, extract and reparse templated attrs
	if (
		$scenario === ReparseScenario::MAYBE_REPARSE_ATTRS &&
		$cellDp->getTempFlag( TempData::TABLE_CELL_WITH_NO_ATTRIBUTE_SYNTAX )
	) {
		self::reparseTemplatedAttributes( $dtState, $cell, $isTemplated ? $cell : null );
	}

	// Now, examine the cell to see if it hides additional cells
	// and split it up if required.
	//
	// DOMTraverser will process each new cell and invoke
	// handleTableCellTemplates on it, which ensures that any additional
	// attribute fixups or splits that are required will get done.
	$origCell = $cell;
	$newCell = null;
	$insertBefore = $cell->nextSibling;
	$doc = $cell->ownerDocument;
	$activeTpl = null;
	$activeAbout = null;
	$transclusions = [];
	$needsTplInfoHoisted = false;
	$isTd = $tagName === 'td';

	for ( $child = $cell->firstChild; $child; $child = $next ) {
		// Grab the sibling first: $child may be moved to a new cell below
		$next = $child->nextSibling;

		// Once a split happened, all following content belongs to the newest cell
		if ( $newCell ) {
			$newCell->appendChild( $child );
			$cell = $newCell;
		}

		// Track the transclusion (if any) that $child is part of
		if ( DOMUtils::hasTypeOf( $child, 'mw:Transclusion' ) ) {
			'@phan-var Element $child'; // @var Element $child
			$activeTpl = $child;
			$transclusions[] = $child;
			$activeAbout = DOMCompat::getAttribute( $child, 'about' );
		} elseif (
			!( $child instanceof Element && DOMCompat::getAttribute( $child, 'about' ) === $activeAbout )
		) {
			$activeTpl = null;
			$activeAbout = null;
		}

		// Only text nodes and simple templated spans are examined.
		// FIXME: This skips over scenarios like <div>foo||bar</div>.
		$isText = $child instanceof Text;
		if ( !$isText && ( $activeTpl === null || !self::isSimpleTemplatedSpan( $child ) ) ) {
			continue;
		}
		$hasSpanWrapper = !$isText;

		$text = $child->textContent;

		// Find the first match of ||
		$pipeMatch = null;
		preg_match( '/^((?:[^|]*(?:\|[^|])?)*)\|\|(.*)$/D', $text, $pipeMatch );
		$match = $pipeMatch;
		if ( !$isTd ) {
			// Find the first match of !!
			$bangMatch = null;
			preg_match( '/^((?:[^!]*(?:\![^!])?)*)\!\!(.*)$/D', $text, $bangMatch );

			if ( !$pipeMatch || !$bangMatch ) {
				$match = $pipeMatch ?: $bangMatch;
			} elseif ( strlen( $bangMatch[1] ?? '' ) <= strlen( $pipeMatch[1] ?? '' ) ) {
				// Both matched: the || match is kept only if its prefix is strictly shorter
				$match = $bangMatch;
			}
		}

		if ( !$match ) {
			continue;
		}

		// Trim $child down to what precedes the separator; the rest seeds $newCell
		$child->textContent = $match[1] ?? '';

		$newCell = $doc->createElement( DOMUtils::nodeName( $cell ) );
		$newCell->appendChild( $doc->createTextNode( $match[2] ?? '' ) );

		$newCellDp = new DataParsoid;
		// This new cell has 'row' stx (would be set if the tokenizer had parsed it)
		$newCellDp->stx = 'row';
		$newCellDp->setTempFlag( TempData::TABLE_CELL_WITH_NO_ATTRIBUTE_SYNTAX );
		// It is important to set this so that when $newCell is processed by this pass,
		// it won't accidentally recombine again with the previous cell!
		$newCellDp->setTempFlag( TempData::NON_MERGEABLE_TABLE_CELL );
		DOMDataUtils::setDataParsoid( $newCell, $newCellDp );

		$origCell->parentNode->insertBefore( $newCell, $insertBefore );

		if ( $hasSpanWrapper ) {
			// $hasSpanWrapper above ensures $child is a span.
			'@phan-var Element $child'; // @var Element $child
			$about = DOMCompat::getAttribute( $child, 'about' );
			$needsTplInfoHoisted = true;
		} else {
			// Refetch the about attribute since 'reparseTemplatedAttributes'
			// might have added one to it.
			$about = DOMCompat::getAttribute( $origCell, 'about' );
		}

		// about may not be present if the cell was inside wrapped template
		// content rather than being part of the outermost wrapper.
		if ( $about !== null ) {
			$newCell->setAttribute( 'about', $about );
			// This update is necessary to prevent DOMTraverser
			// from clearing dtState->tplInfo prematurely.
			if ( $dtState->tplInfo?->last === $cell ) {
				$dtState->tplInfo->last = $newCell;
			}
		}
	}

	// Fix up transclusion wrapping (but only if we created new cells)
	if ( $needsTplInfoHoisted ) {
		self::hoistTransclusionInfo( $dtState, $transclusions, $origCell );
		$dtState->tplInfo->last = $newCell;
	}

	return true;
}
| 1071 | } |