Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
83.57% |
239 / 286 |
|
25.00% |
2 / 8 |
CRAP | |
0.00% |
0 / 1 |
| ComputeDSR | |
83.57% |
239 / 286 |
|
25.00% |
2 / 8 |
256.89 | |
0.00% |
0 / 1 |
| tsrSpansTagDOM | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
3 | |||
| acceptableInconsistency | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
6 | |||
| computeListEltWidth | |
78.57% |
11 / 14 |
|
0.00% |
0 / 1 |
10.98 | |||
| computeATagWidth | |
80.00% |
12 / 15 |
|
0.00% |
0 / 1 |
11.97 | |||
| computeTagWidths | |
88.89% |
24 / 27 |
|
0.00% |
0 / 1 |
15.31 | |||
| trace | |
33.33% |
2 / 6 |
|
0.00% |
0 / 1 |
5.67 | |||
| computeNodeDSR | |
83.16% |
163 / 196 |
|
0.00% |
0 / 1 |
151.66 | |||
| run | |
91.67% |
11 / 12 |
|
0.00% |
0 / 1 |
3.01 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; |
| 5 | |
| 6 | use Wikimedia\Parsoid\Config\Env; |
| 7 | use Wikimedia\Parsoid\Core\DomSourceRange; |
| 8 | use Wikimedia\Parsoid\DOM\Comment; |
| 9 | use Wikimedia\Parsoid\DOM\Element; |
| 10 | use Wikimedia\Parsoid\DOM\Node; |
| 11 | use Wikimedia\Parsoid\DOM\Text; |
| 12 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
| 13 | use Wikimedia\Parsoid\Utils\DOMCompat; |
| 14 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
| 15 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 16 | use Wikimedia\Parsoid\Utils\PHPUtils; |
| 17 | use Wikimedia\Parsoid\Utils\Utils; |
| 18 | use Wikimedia\Parsoid\Utils\WTUtils; |
| 19 | use Wikimedia\Parsoid\Wikitext\Consts; |
| 20 | use Wikimedia\Parsoid\Wt2Html\Frame; |
| 21 | use Wikimedia\Parsoid\Wt2Html\TT\PreHandler; |
| 22 | use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor; |
| 23 | |
class ComputeDSR implements Wt2HtmlDOMProcessor {
	/**
	 * For an explanation of what TSR is, see ComputeDSR::computeNodeDSR()
	 *
	 * TSR info on all these tags is only valid for the opening tag.
	 * (hr and br are void elements, so for them the opening tag is
	 * the whole element anyway.)
	 *
	 * On other tags (e.g. a, meta-marker tags), the tsr spans
	 * the entire DOM subtree, not just the tag.
	 *
	 * This code is not in Wikitext\Consts.php because this
	 * information is Parsoid-implementation-specific.
	 */
	private const WT_TAGS_WITH_LIMITED_TSR = [
		"b" => true,
		"i" => true,
		"h1" => true,
		"h2" => true,
		"h3" => true,
		"h4" => true,
		"h5" => true,
		"h6" => true,
		"ul" => true,
		"ol" => true,
		"dl" => true,
		"li" => true,
		"dt" => true,
		"dd" => true,
		"table" => true,
		"caption" => true,
		"tr" => true,
		"td" => true,
		"th" => true,
		"hr" => true, // void element
		"br" => true, // void element
		"pre" => true,
	];

	/**
	 * Do $parsoidData->tsr values span the entire DOM subtree rooted at $n?
	 *
	 * @param Element $n
	 * @param DataParsoid $parsoidData
	 * @return bool
	 */
	private function tsrSpansTagDOM( Element $n, DataParsoid $parsoidData ): bool {
		// The tsr is limited to the tag itself (i.e. we return false) for:
		// - tags known to have tag-specific tsr
		// - html tags with 'stx' set
		// - tags with certain typeof properties (Parsoid-generated
		//   constructs: placeholders, lang variants)
		$name = DOMCompat::nodeName( $n );
		return !(
			isset( self::WT_TAGS_WITH_LIMITED_TSR[$name] ) ||
			DOMUtils::matchTypeOf(
				$n,
				'/^mw:(Placeholder|LanguageVariant)$/D'
			) ||
			WTUtils::hasLiteralHTMLMarker( $parsoidData )
		);
	}

	/**
	 * Is the inconsistency between two different ways of computing
	 * start offset ($cs, $s) explainable and acceptable?
	 * If so, we can suppress warnings.
	 *
	 * @param array $opts Only 'attrExpansion' is consulted
	 * @param Node $node
	 * @param int $cs
	 * @param int $s
	 * @return bool
	 */
	private function acceptableInconsistency( array $opts, Node $node, int $cs, int $s ): bool {
		/**
		 * 1. For wikitext URL links, suppress cs-s diff warnings because
		 *    the diffs can come about because of various reasons since the
		 *    canonicalized/decoded href will become the a-link text whose width
		 *    will not match the tsr width of source wikitext
		 *
		 *    (a) urls with encoded chars (ex: 'http://example.com/?foo&#61;bar')
		 *    (b) non-canonical spaces (ex: 'RFC  123' instead of 'RFC 123')
		 *
		 * 2. We currently don't have source offsets for attributes.
		 *    So, we get a lot of spurious complaints about cs/s mismatch
		 *    when DSR computation hits the <body> tag on this attribute.
		 *    $opts['attrExpansion'] tells us when we are processing an attribute
		 *    and lets us suppress the mismatch warning on the <body> tag.
		 *
		 * 3. Other scenarios .. to be added
		 */
		if ( $node instanceof Element && (
			WTUtils::isATagFromURLLinkSyntax( $node ) ||
			WTUtils::isATagFromMagicLinkSyntax( $node )
		) ) {
			return true;
		}

		// run() always sets the 'attrExpansion' key (to false when the caller
		// did not ask for it), so the flag's value has to be tested rather
		// than its mere presence; isset() would suppress the warning on
		// every top-level node of every parse.
		return !empty( $opts['attrExpansion'] ) && DOMUtils::atTheTop( $node );
	}

	/**
	 * Compute wikitext string length that contributes to this
	 * list item's open tag. Closing tag width is always 0 for lists.
	 *
	 * @param Element $li
	 * @return int
	 */
	private function computeListEltWidth( Element $li ): int {
		if ( !$li->previousSibling && $li->firstChild ) {
			if ( DOMUtils::isList( $li->firstChild ) ) {
				// Special case!!
				// First child of a list that is on a chain
				// of nested lists doesn't get a width.
				return 0;
			}
		}

		// count nest listing depth and assign
		// that to the opening tag width.
		$depth = 0;

		// This is the crux of the algorithm in DOMHandler::getListBullets()
		while ( !DOMUtils::atTheTop( $li ) ) {
			$dp = DOMDataUtils::getDataParsoid( $li );
			if ( DOMUtils::isListOrListItem( $li ) ) {
				// Only list items add to the depth; the list containers
				// themselves are simply walked through.
				if ( DOMUtils::isListItem( $li ) ) {
					$depth++;
				}
			} elseif (
				!WTUtils::isLiteralHTMLNode( $li ) ||
				empty( $dp->autoInsertedStart ) || empty( $dp->autoInsertedEnd )
			) {
				// Keep climbing only through literal HTML nodes whose start
				// and end tags were both auto-inserted; anything else ends
				// the list chain.
				break;
			}
			$li = $li->parentNode;
		}

		return $depth;
	}

	/**
	 * Compute wikitext string lengths that contribute to this
	 * anchor's opening (<a>) and closing (</a>) tags.
	 *
	 * @param Element $node
	 * @param ?DataParsoid $dp
	 * @return int[]|null [ start tag width, end tag width ], or null if
	 *   the widths cannot be determined
	 */
	private function computeATagWidth(
		Element $node, ?DataParsoid $dp
	): ?array {
		/* -------------------------------------------------------------
		 * Tag widths are computed as per this logic here:
		 *
		 * 1. [[Foo|bar]] <-- piped mw:WikiLink
		 *     -> start-tag: "[[Foo|"
		 *     -> content  : "bar"
		 *     -> end-tag  : "]]"
		 *
		 * 2. [[Foo]] <-- non-piped mw:WikiLink
		 *     -> start-tag: "[["
		 *     -> content  : "Foo"
		 *     -> end-tag  : "]]"
		 *
		 * 3. [[{{1x|Foo}}|Foo]] <-- tpl-attr mw:WikiLink
		 *    Don't bother setting tag widths since dp->sa['href'] will be
		 *    the expanded target and won't correspond to original source.
		 *
		 * 4. [http://wp.org foo] <-- mw:ExtLink
		 *     -> start-tag: "[http://wp.org "
		 *     -> content  : "foo"
		 *     -> end-tag  : "]"
		 * -------------------------------------------------------------- */
		if ( !$dp ) {
			return null;
		}

		if ( WTUtils::isATagFromWikiLinkSyntax( $node ) && !WTUtils::hasExpandedAttrsType( $node ) ) {
			if ( isset( $dp->stx ) && $dp->stx === "piped" ) {
				$href = $dp->sa['href'] ?? null;
				// Test for a non-empty string explicitly: a plain truthiness
				// check would treat the perfectly valid target "0"
				// (as in [[0|zero]]) as missing.
				if ( is_string( $href ) && $href !== '' ) {
					// "[[" + target + "|" for the start tag, "]]" for the end tag
					return [ strlen( $href ) + 3, 2 ];
				}
				return null;
			}
			return [ 2, 2 ];
		}

		if ( isset( $dp->tsr ) && WTUtils::isATagFromExtLinkSyntax( $node ) ) {
			// Start tag runs from the opening "[" up to where the link
			// content begins; the end tag is the closing "]".
			return [ $dp->tmp->extLinkContentOffsets->start - $dp->tsr->start, 1 ];
		}

		if ( WTUtils::isATagFromURLLinkSyntax( $node ) ||
			WTUtils::isATagFromMagicLinkSyntax( $node )
		) {
			return [ 0, 0 ];
		}

		return null;
	}

	/**
	 * Compute wikitext string lengths that contribute to this
	 * node's opening and closing tags.
	 *
	 * @param int|null $stWidth Start tag width
	 * @param int|null $etWidth End tag width
	 * @param Element $node
	 * @param DataParsoid $dp
	 * @return int[] Start and end tag widths (either may still be null
	 *   if it could not be determined)
	 */
	private function computeTagWidths( $stWidth, $etWidth, Element $node, DataParsoid $dp ): array {
		// Extension tags record their own widths; those always win.
		if ( isset( $dp->extTagOffsets ) ) {
			return [
				$dp->extTagOffsets->openWidth,
				$dp->extTagOffsets->closeWidth
			];
		}

		if ( WTUtils::hasLiteralHTMLMarker( $dp ) ) {
			// Literal HTML: keep the incoming widths, except that a
			// self-closed tag has no separate end tag.
			if ( !empty( $dp->selfClose ) ) {
				$etWidth = 0;
			}
		} elseif ( DOMUtils::hasTypeOf( $node, 'mw:LanguageVariant' ) ) {
			$stWidth = 2; // -{
			$etWidth = 2; // }-
		} else {
			$nodeName = DOMCompat::nodeName( $node );
			// 'tr' tags not in the original source have zero width
			if ( $nodeName === 'tr' && !isset( $dp->startTagSrc ) ) {
				$stWidth = 0;
				$etWidth = 0;
			} else {
				$wtTagWidth = Consts::$WtTagWidths[$nodeName] ?? null;
				if ( $stWidth === null ) {
					// we didn't have a tsr to tell us how wide this tag was.
					if ( $nodeName === 'a' ) {
						// Note: this replaces $wtTagWidth, so the end tag
						// width below also comes from computeATagWidth().
						$wtTagWidth = $this->computeATagWidth( $node, $dp );
						$stWidth = $wtTagWidth ? $wtTagWidth[0] : null;
					} elseif ( $nodeName === 'li' || $nodeName === 'dd' ) {
						$stWidth = $this->computeListEltWidth( $node );
					} elseif ( $wtTagWidth ) {
						$stWidth = $wtTagWidth[0];
					}
				}

				if ( $etWidth === null && $wtTagWidth ) {
					$etWidth = $wtTagWidth[1];
				}
			}
		}

		return [ $stWidth, $etWidth ];
	}

	/**
	 * Emit a "trace/dsr" log entry built by concatenating $args.
	 * String arguments are used verbatim, everything else is JSON-encoded.
	 * The message is assembled inside a closure handed to the logger.
	 *
	 * @param Env $env
	 * @param mixed ...$args
	 */
	private function trace( Env $env, ...$args ): void {
		$env->log( "trace/dsr", static function () use ( $args ) {
			$buf = '';
			foreach ( $args as $arg ) {
				$buf .= is_string( $arg ) ? $arg : PHPUtils::jsonEncode( $arg );
			}
			return $buf;
		} );
	}

	/**
	 * TSR = "Tag Source Range". Start and end offsets giving the location
	 * where the tag showed up in the original source.
	 *
	 * DSR = "DOM Source Range". dsr->start and dsr->end are open and end,
	 * dsr->openWidth and dsr->closeWidth are widths of the container tag.
	 *
	 * TSR is set by the tokenizer. In most cases, it only applies to the
	 * specific tag (opening or closing). However, for self-closing
	 * tags that the tokenizer generates, the TSR values applies to the entire
	 * DOM subtree (opening tag + content + closing tag).
	 *
	 * Ex: So [[foo]] will get tokenized to a SelfClosingTagTk(...) with a TSR
	 * value of [0,7]. The DSR algorithm will then use that info and assign
	 * the a-tag rooted at the <a href='...'>foo</a> DOM subtree a DSR value of
	 * [0,7,2,2], where 2 and 2 refer to the opening and closing tag widths.
	 *
	 * [s,e) -- if defined, start/end position of wikitext source that generated
	 *          node's subtree
	 *
	 * The children of $node are visited from last to first, so the end
	 * offset of each child is derived from the start offset of the child
	 * to its right.
	 *
	 * @param Frame $frame
	 * @param Node $node node to process
	 * @param ?int $s start position, inclusive
	 * @param ?int $e end position, exclusive
	 * @param int $dsrCorrection
	 * @param array $opts
	 * @return array [ start, end ] computed for $node's children; either
	 *   entry may be null
	 */
	private function computeNodeDSR(
		Frame $frame, Node $node, ?int $s, ?int $e, int $dsrCorrection,
		array $opts
	): array {
		$env = $frame->getEnv();
		if ( $e === null && !$node->hasChildNodes() ) {
			$e = $s;
		}

		$this->trace( $env, "BEG: ", DOMCompat::nodeName( $node ), " with [s, e]=", [ $s, $e ] );

		/** @var int|null $ce Child end */
		$ce = $e;
		// Initialize $cs to $ce to handle the zero-children case properly
		// if this $node has no child content, then the start and end for
		// the child dom are indeed identical. Alternatively, we could
		// explicitly code this check before everything and bypass this.
		/** @var int|null $cs Child start */
		$cs = $ce;

		$child = $node->lastChild;
		while ( $child !== null ) {
			$prevChild = $child->previousSibling;
			$origCE = $ce;
			$cType = $child->nodeType;
			$fosteredNode = false;
			$cs = null;

			if ( $child instanceof Element ) {
				$dp = DOMDataUtils::getDataParsoid( $child );
				$endTSR = $dp->tmp->endTSR ?? null;
				if ( $endTSR ) {
					$ce = $endTSR->end;
				}
			} else {
				$endTSR = null;
			}

			// StrippedTag marker tags will be removed and won't
			// be around to fill in the missing gap. So, absorb its width into
			// the DSR of its previous sibling. Currently, this fix is only for
			// B and I tags where the fix is clear-cut and obvious.
			$next = $child->nextSibling;
			if ( $next instanceof Element ) {
				$ndp = DOMDataUtils::getDataParsoid( $next );
				if (
					isset( $ndp->src ) &&
					DOMUtils::hasTypeOf( $next, 'mw:Placeholder/StrippedTag' ) &&
					// NOTE: This inlist check matches the case in CleanUp where
					// the placeholders are not removed from the DOM. We don't want
					// to move the width into the sibling here and then leave around a
					// a zero width placeholder because serializeDOMNode only handles
					// a few cases of zero width nodes, so we'll end up duplicating
					// it from ->src.
					!DOMUtils::isNestedInListItem( $next )
				) {
					if ( isset( Consts::$WTQuoteTags[$ndp->name] ) &&
						isset( Consts::$WTQuoteTags[DOMCompat::nodeName( $child )] ) ) {
						$correction = strlen( $ndp->src );
						$ce += $correction;
						$dsrCorrection = $correction;
						if ( Utils::isValidDSR( $ndp->dsr ?? null ) ) {
							// Record original DSR for the meta tag
							// since it will now get corrected to zero width
							// since child acquires its width.
							$ndp->getTemp()->origDSR = new DomSourceRange(
								$ndp->dsr->start, $ndp->dsr->end, null, null );
						}
					}
				}
			}

			$env->log( "trace/dsr", static function () use ( $child, $cs, $ce ) {
				// slow, for debugging only
				$i = 0;
				foreach ( $child->parentNode->childNodes as $x ) {
					if ( $x === $child ) {
						break;
					}
					$i++;
				}
				return "     CHILD: <" . DOMCompat::nodeName( $child->parentNode ) . ":" . $i .
					">=" .
					( $child instanceof Element ? '' : ( $child instanceof Text ? '#' : '!' ) ) .
					( ( $child instanceof Element ) ?
						( DOMCompat::nodeName( $child ) === 'meta' ?
							DOMCompat::getOuterHTML( $child ) : DOMCompat::nodeName( $child ) ) :
						PHPUtils::jsonEncode( $child->nodeValue ) ) .
					" with " . PHPUtils::jsonEncode( [ $cs, $ce ] );
			} );

			if ( $cType === XML_TEXT_NODE ) {
				if ( $ce !== null ) {
					$cs = $ce - strlen( $child->textContent );
				}
			} elseif ( $cType === XML_COMMENT_NODE ) {
				'@phan-var Comment $child'; // @var Comment $child
				if ( $ce !== null ) {
					// Decode HTML entities & re-encode as wikitext to find length
					$cs = $ce - WTUtils::decodedCommentLength( $child );
				}
			} elseif ( $cType === XML_ELEMENT_NODE ) {
				DOMUtils::assertElt( $child );
				$dp = DOMDataUtils::getDataParsoid( $child );
				$tsr = $dp->tsr ?? null;
				$oldCE = $tsr ? $tsr->end : null;
				$propagateRight = false;
				$stWidth = null;
				$etWidth = null;

				$fosteredNode = $dp->fostered ?? false;

				// We are making dsr corrections to account for
				// stripped tags (end tags usually). When stripping happens,
				// in most common use cases, a corresponding end tag is added
				// back elsewhere in the DOM.
				//
				// So, when an autoInsertedEnd tag is encountered and a matching
				// dsr-correction is found, make a 1-time correction in the
				// other direction.
				//
				// Currently, this fix is only for
				// B and I tags where the fix is clear-cut and obvious.
				if ( $ce !== null && !empty( $dp->autoInsertedEnd ) &&
					DOMUtils::isQuoteElt( $child )
				) {
					$correction = 3 + strlen( DOMCompat::nodeName( $child ) );
					if ( $correction === $dsrCorrection ) {
						$ce -= $correction;
						$dsrCorrection = 0;
					}
				}

				if ( DOMCompat::nodeName( $child ) === "meta" ) {
					if ( $tsr ) {
						if ( WTUtils::isTplMarkerMeta( $child ) ) {
							// If this is a meta-marker tag (for templates, extensions),
							// we have a new valid '$cs'. This marker also effectively resets tsr
							// back to the top-level wikitext source range from nested template
							// source range.
							$cs = $tsr->start;
							$ce = $tsr->end;
							$propagateRight = true;
						} else {
							// All other meta-tags: <includeonly>, <noinclude>, etc.
							$cs = $tsr->start;
							$ce = $tsr->end;
						}
					} elseif ( PreHandler::isIndentPreWS( $child ) ) {
						// Adjust start DSR; see PreHandler::newIndentPreWS()
						// NOTE(review): unlike the neighbouring branches there is
						// no "$ce !== null" guard here, so an unknown $ce would
						// yield $cs = -1 -- confirm whether that can happen.
						$cs = $ce - 1;
					} elseif ( DOMUtils::matchTypeOf( $child, '#^mw:Placeholder(/\w*)?$#D' ) &&
						$ce !== null && $dp->src
					) {
						$cs = $ce - strlen( $dp->src );
					}
					if ( isset( $dp->extTagOffsets ) ) {
						$stWidth = $dp->extTagOffsets->openWidth;
						$etWidth = $dp->extTagOffsets->closeWidth;
						unset( $dp->extTagOffsets );
					}
				} elseif ( DOMUtils::hasTypeOf( $child, "mw:Entity" ) && $ce !== null && $dp->src ) {
					$cs = $ce - strlen( $dp->src );
				} else {
					if ( DOMUtils::matchTypeOf( $child, '#^mw:Placeholder(/\w*)?$#D' ) &&
						$ce !== null && $dp->src
					) {
						$cs = $ce - strlen( $dp->src );
					} else {
						// Non-meta tags
						if ( $endTSR ) {
							$etWidth = $endTSR->length();
						}
						if ( $tsr && empty( $dp->autoInsertedStart ) ) {
							$cs = $tsr->start;
							if ( $this->tsrSpansTagDOM( $child, $dp ) ) {
								if ( $tsr->end !== null && $tsr->end > 0 ) {
									$ce = $tsr->end;
									$propagateRight = true;
								}
							} else {
								$stWidth = $tsr->end - $tsr->start;
							}

							$this->trace( $env, "     TSR: ", $tsr, "; cs: ", $cs, "; ce: ", $ce );
						} elseif ( $s && $child->previousSibling === null ) {
							// NOTE(review): "$s" is a truthiness test, so a first
							// child of a node starting at offset 0 does not take
							// this branch -- confirm that this is intended.
							$cs = $s;
						}
					}

					// Compute width of opening/closing tags for this dom $node
					[ $stWidth, $etWidth ] =
						$this->computeTagWidths( $stWidth, $etWidth, $child, $dp );

					if ( !empty( $dp->autoInsertedStart ) ) {
						$stWidth = 0;
					}
					if ( !empty( $dp->autoInsertedEnd ) ) {
						$etWidth = 0;
					}

					// Source range of $child's content, i.e. its own range
					// minus the widths of its start and end tags.
					$ccs = $cs !== null && $stWidth !== null ? $cs + $stWidth : null;
					$cce = $ce !== null && $etWidth !== null ? $ce - $etWidth : null;

					/* -----------------------------------------------------------------
					 * Process DOM rooted at '$child'.
					 *
					 * NOTE: You might wonder why we are not checking for the zero-$children
					 * case. It is strictly not necessary and you can set newDsr directly.
					 *
					 * But, you have 2 options: [$ccs, $ccs] or [$cce, $cce]. Setting it to
					 * [$cce, $cce] would be consistent with the RTL approach. We should
					 * then compare $ccs and $cce and verify that they are identical.
					 *
					 * But, if we handled the zero-child case like the other scenarios,
					 * we don't have to worry about the above decisions and checks.
					 * ----------------------------------------------------------------- */

					if ( WTUtils::isDOMFragmentWrapper( $child ) ||
						DOMUtils::hasTypeOf( $child, 'mw:LanguageVariant' )
					) {
						// Eliminate artificial $cs/s mismatch warnings since this is
						// just a wrapper token with the right DSR but without any
						// nested subtree that could account for the DSR span.
						$newDsr = [ $ccs, $cce ];
					} elseif ( $child instanceof Element
						&& WTUtils::isATagFromWikiLinkSyntax( $child )
						&& ( !isset( $dp->stx ) || $dp->stx !== "piped" ) ) {
						/* -------------------------------------------------------------
						 * This check here eliminates artificial DSR mismatches on content
						 * text of the A-node because of entity expansion, etc.
						 *
						 * Ex: [[7%25 solution]] will be rendered as:
						 *    <a href=....>7% solution</a>
						 * If we descend into the text for the a-node, we'll have a 2-char
						 * DSR mismatch which will trigger artificial error warnings.
						 *
						 * In the non-piped link scenario, all dsr info is already present
						 * in the link target and so we get nothing new by processing
						 * content.
						 * ------------------------------------------------------------- */
						$newDsr = [ $ccs, $cce ];
					} else {
						$env->log( "trace/dsr", static function () use (
							$cs, $ce, $stWidth, $etWidth, $ccs, $cce
						) {
							return "     before-recursing:" .
								"[cs,ce]=" . PHPUtils::jsonEncode( [ $cs, $ce ] ) .
								"; [sw,ew]=" . PHPUtils::jsonEncode( [ $stWidth, $etWidth ] ) .
								"; subtree-[cs,ce]=" . PHPUtils::jsonEncode( [ $ccs, $cce ] );
						} );

						$this->trace( $env, "<recursion>" );
						$newDsr = $this->computeNodeDSR( $frame, $child, $ccs, $cce, $dsrCorrection, $opts );
						$this->trace( $env, "</recursion>" );
					}

					// $cs = min($child-dom-tree dsr->start - tag-width, current dsr->start)
					if ( $stWidth !== null && $newDsr[0] !== null ) {
						$newCs = $newDsr[0] - $stWidth;
						if ( $cs === null || ( !$tsr && $newCs < $cs ) ) {
							$cs = $newCs;
						}
					}

					// $ce = max($child-dom-tree dsr->end + tag-width, current dsr->end)
					if ( $etWidth !== null && $newDsr[1] !== null ) {
						$newCe = $newDsr[1] + $etWidth;
						if ( $newCe > $ce ) {
							$ce = $newCe;
						}
					}
				}

				if ( $cs !== null || $ce !== null ) {
					if ( $ce < 0 ) {
						if ( !$fosteredNode ) {
							$env->log( "info/dsr/negative",
								"Negative DSR for node: " . DOMCompat::nodeName( $node ) . "; resetting to zero" );
						}
						$ce = 0;
					}

					// Fostered $nodes get a zero-dsr width range.
					if ( $fosteredNode ) {
						// Reset to 0, if necessary.
						// This is critical to avoid duplication of fostered content in selser mode.
						if ( $origCE < 0 ) {
							$origCE = 0;
						}
						$dp->dsr = new DomSourceRange( $origCE, $origCE, null, null );
					} else {
						$dp->dsr = new DomSourceRange( $cs, $ce, $stWidth, $etWidth );
					}

					$env->log( "trace/dsr", static function () use ( $child, $cs, $ce ) {
						return "     UPDATING " . DOMCompat::nodeName( $child ) .
							" with " . PHPUtils::jsonEncode( [ $cs, $ce ] ) .
							"; typeof: " . ( DOMCompat::getAttribute( $child, "typeof" ) ?? '' );
					} );
				}

				// Propagate any required changes to the right
				// taking care not to cross-over into template content
				if ( $ce !== null &&
					( $propagateRight || $oldCE !== $ce || $e === null ) &&
					!WTUtils::isTplStartMarkerMeta( $child )
				) {
					$sibling = $child->nextSibling;
					$newCE = $ce;
					while ( $newCE !== null && $sibling && !WTUtils::isTplStartMarkerMeta( $sibling ) ) {
						$nType = $sibling->nodeType;
						if ( $nType === XML_TEXT_NODE ) {
							$newCE += strlen( $sibling->textContent );
						} elseif ( $nType === XML_COMMENT_NODE ) {
							'@phan-var Comment $sibling'; // @var Comment $sibling
							$newCE += WTUtils::decodedCommentLength( $sibling );
						} elseif ( $nType === XML_ELEMENT_NODE ) {
							DOMUtils::assertElt( $sibling );
							$siblingDP = DOMDataUtils::getDataParsoid( $sibling );
							$siblingDP->dsr ??= new DomSourceRange( null, null, null, null );
							$sdsrStart = $siblingDP->dsr->start;
							if ( !empty( $siblingDP->fostered ) ||
								( $sdsrStart !== null && $sdsrStart === $newCE ) ||
								( $sdsrStart !== null && $sdsrStart < $newCE && isset( $siblingDP->tsr ) )
							) {
								// $sibling is fostered
								//   => nothing to propagate past it
								// $sibling's dsr->start matches what we might propagate
								//   => nothing will change
								// $sibling's dsr value came from tsr and it is not outside expected range
								//   => stop propagation so you don't overwrite it
								break;
							}

							// Update and move right
							$env->log( "trace/dsr", static function () use ( $newCE, $sibling, $siblingDP ) {
								return "     CHANGING ce.start of " . DOMCompat::nodeName( $sibling ) .
									" from " . $siblingDP->dsr->start . " to " . $newCE;
							} );

							$siblingDP->dsr->start = $newCE;
							// If we have a dsr->end as well and since we updated
							// dsr->start, we have to ensure that the two values don't
							// introduce an inconsistency where dsr->start > dsr->end.
							// Since we are in a LTR pass and are pushing updates
							// forward, we are resolving it by updating dsr->end as
							// well. There could be scenarios where this would be
							// incorrect, but there is no universal fix here.
							if ( $siblingDP->dsr->end !== null && $newCE > $siblingDP->dsr->end ) {
								$siblingDP->dsr->end = $newCE;
							}
							$newCE = $siblingDP->dsr->end;

						} else {
							break;
						}
						$sibling = $sibling->nextSibling;
					}

					// Propagate new end information
					if ( !$sibling ) {
						$e = $newCE;
					}
				}
			}

			// Don't change state if we processed a fostered $node
			if ( $fosteredNode ) {
				$ce = $origCE;
			} else {
				// $ce for next $child = $cs of current $child
				$ce = $cs;
			}

			$child = $prevChild;
		}

		if ( $cs === null ) {
			$cs = $s;
		}

		// Detect errors
		if ( $s !== null && $cs !== $s && !$this->acceptableInconsistency( $opts, $node, $cs, $s ) ) {
			$env->log( "info/dsr/inconsistent", "DSR inconsistency: cs/s mismatch for node:",
				DOMCompat::nodeName( $node ), "s:", $s, "; cs:", $cs );
		}

		$this->trace( $env, "END: ", DOMCompat::nodeName( $node ), ", returning: ", $cs, ", ", $e );

		return [ $cs, $e ];
	}

	/**
	 * Computes DSR ranges for every node of a DOM tree.
	 * This pass is only invoked on the top-level page.
	 *
	 * @param Env $env The environment/context for the parse pipeline
	 * @param Node $root The root of the tree for which DSR has to be computed
	 * @param array $options Options governing DSR computation
	 * - inTemplate: If truthy, this pass is skipped entirely.
	 * - frame: Frame to use. If missing, this defaults to $env->topFrame
	 * - sourceOffsets: [start, end] source offset. If missing, this defaults to
	 *                  [0, strlen($frame->getSrcText())]
	 * - attrExpansion: Is this an attribute expansion pipeline?
	 * @param bool $atTopLevel Are we running this on the top level?
	 */
	public function run(
		Env $env, Node $root, array $options = [], bool $atTopLevel = false
	): void {
		// Don't run this in template content.
		// $options defaults to [], so the key may legitimately be absent;
		// empty() avoids an undefined-key warning in that case.
		if ( !empty( $options['inTemplate'] ) ) {
			return;
		}

		$frame = $options['frame'] ?? $env->topFrame;
		$startOffset = $options['sourceOffsets']->start ?? 0;
		$endOffset = $options['sourceOffsets']->end ?? strlen( $frame->getSrcText() );
		$env->log( "trace/dsr", "------- tracing DSR computation -------" );

		// The actual computation buried in trace/debug stmts.
		$opts = [ 'attrExpansion' => $options['attrExpansion'] ?? false ];
		$this->computeNodeDSR( $frame, $root, $startOffset, $endOffset, 0, $opts );

		if ( $root instanceof Element ) {
			// The root always spans the full source range with zero-width tags.
			$dp = DOMDataUtils::getDataParsoid( $root );
			$dp->dsr = new DomSourceRange( $startOffset, $endOffset, 0, 0 );
		}
		$env->log( "trace/dsr", "------- done tracing computation -------" );
	}
}