Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
84.56% |
241 / 285 |
|
25.00% |
2 / 8 |
CRAP | |
0.00% |
0 / 1 |
ComputeDSR | |
84.56% |
241 / 285 |
|
25.00% |
2 / 8 |
237.02 | |
0.00% |
0 / 1 |
tsrSpansTagDOM | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
3 | |||
acceptableInconsistency | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
6 | |||
computeListEltWidth | |
78.57% |
11 / 14 |
|
0.00% |
0 / 1 |
10.98 | |||
computeATagWidth | |
85.71% |
12 / 14 |
|
0.00% |
0 / 1 |
10.29 | |||
computeTagWidths | |
92.59% |
25 / 27 |
|
0.00% |
0 / 1 |
15.09 | |||
trace | |
33.33% |
2 / 6 |
|
0.00% |
0 / 1 |
5.67 | |||
computeNodeDSR | |
83.67% |
164 / 196 |
|
0.00% |
0 / 1 |
147.28 | |||
run | |
91.67% |
11 / 12 |
|
0.00% |
0 / 1 |
3.01 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; |
5 | |
6 | use Wikimedia\Parsoid\Config\Env; |
7 | use Wikimedia\Parsoid\Core\DomSourceRange; |
8 | use Wikimedia\Parsoid\DOM\Comment; |
9 | use Wikimedia\Parsoid\DOM\Element; |
10 | use Wikimedia\Parsoid\DOM\Node; |
11 | use Wikimedia\Parsoid\DOM\Text; |
12 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
13 | use Wikimedia\Parsoid\Utils\DOMCompat; |
14 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
15 | use Wikimedia\Parsoid\Utils\DOMUtils; |
16 | use Wikimedia\Parsoid\Utils\PHPUtils; |
17 | use Wikimedia\Parsoid\Utils\Utils; |
18 | use Wikimedia\Parsoid\Utils\WTUtils; |
19 | use Wikimedia\Parsoid\Wikitext\Consts; |
20 | use Wikimedia\Parsoid\Wt2Html\Frame; |
21 | use Wikimedia\Parsoid\Wt2Html\TT\PreHandler; |
22 | use Wikimedia\Parsoid\Wt2Html\Wt2HtmlDOMProcessor; |
23 | |
24 | class ComputeDSR implements Wt2HtmlDOMProcessor { |
/**
 * Wikitext-origin tags whose TSR is only valid for the opening tag.
 *
 * For an explanation of what TSR is, see ComputeDSR::computeNodeDSR().
 *
 * For elements that are not listed here (for example a-tags and
 * meta-marker tags), the TSR spans the entire DOM subtree and not just
 * the tag. hr and br are listed as well; they are void elements.
 *
 * This table is kept here instead of in Wikitext\Consts.php because
 * the information is specific to Parsoid's implementation.
 */
private const WT_TAGS_WITH_LIMITED_TSR = [
	// Quotes
	'b' => true,
	'i' => true,
	// Headings
	'h1' => true,
	'h2' => true,
	'h3' => true,
	'h4' => true,
	'h5' => true,
	'h6' => true,
	// Lists and list items
	'ul' => true,
	'ol' => true,
	'dl' => true,
	'li' => true,
	'dt' => true,
	'dd' => true,
	// Tables
	'table' => true,
	'caption' => true,
	'tr' => true,
	'td' => true,
	'th' => true,
	// Void elements
	'hr' => true,
	'br' => true,
	// Preformatted blocks
	'pre' => true,
];
60 | |
/**
 * Decide whether the tsr stored in $parsoidData covers the entire DOM
 * subtree rooted at $n (true) or only describes a tag (false).
 *
 * @param Element $n
 * @param DataParsoid $parsoidData
 * @return bool
 */
private function tsrSpansTagDOM( Element $n, DataParsoid $parsoidData ): bool {
	// Tags known to have tag-specific tsr
	if ( isset( self::WT_TAGS_WITH_LIMITED_TSR[DOMCompat::nodeName( $n )] ) ) {
		return false;
	}

	// Tags with certain typeof properties
	// (Parsoid-generated constructs: placeholders, lang variants)
	if ( DOMUtils::matchTypeOf( $n, '/^mw:(Placeholder|LanguageVariant)$/D' ) ) {
		return false;
	}

	// HTML tags with 'stx' set
	return !WTUtils::hasLiteralHTMLMarker( $parsoidData );
}
83 | |
/**
 * Is the inconsistency between two different ways of computing
 * start offset ($cs, $s) explainable and acceptable?
 * If so, we can suppress warnings.
 *
 * @param array $opts Only $opts['attrExpansion'] is consulted
 * @param Node $node
 * @param int $cs Currently unused; kept for the caller's signature
 * @param int $s Currently unused; kept for the caller's signature
 * @return bool
 */
private function acceptableInconsistency( array $opts, Node $node, int $cs, int $s ): bool {
	/**
	 * 1. For wikitext URL links, suppress cs-s diff warnings because
	 *    the diffs can come about because of various reasons since the
	 *    canonicalized/decoded href will become the a-link text whose width
	 *    will not match the tsr width of source wikitext
	 *
	 *    (a) urls with encoded chars (ex: 'http://example.com/?foo&#61;bar')
	 *    (b) non-canonical spaces (ex: 'RFC  123' instead of 'RFC 123')
	 */
	if ( $node instanceof Element && (
		WTUtils::isATagFromURLLinkSyntax( $node ) ||
		WTUtils::isATagFromMagicLinkSyntax( $node )
	) ) {
		return true;
	}

	/**
	 * 2. We currently don't have source offsets for attributes.
	 *    So, we get a lot of spurious complaints about cs/s mismatch
	 *    when DSR computation hits the <body> tag on this attribute.
	 *    $opts['attrExpansion'] tells us when we are processing an attribute
	 *    and lets us suppress the mismatch warning on the <body> tag.
	 *
	 *    The flag has to be tested for truthiness and not with isset():
	 *    run() always stores the key, defaulting it to false, so isset()
	 *    would hold for every pipeline and hide genuine mismatches.
	 *
	 * 3. Other scenarios .. to be added
	 */
	return !empty( $opts['attrExpansion'] ) && DOMUtils::atTheTop( $node );
}
124 | |
/**
 * Compute the length of the wikitext that contributes to this list
 * item's opening tag. The closing tag width is always 0 for lists.
 *
 * @param Element $li
 * @return int
 */
private function computeListEltWidth( Element $li ): int {
	// Special case!!
	// First child of a list that is on a chain
	// of nested lists doesn't get a width.
	if ( !$li->previousSibling && $li->firstChild && DOMUtils::isList( $li->firstChild ) ) {
		return 0;
	}

	// The opening tag width equals the list nesting depth, i.e. the
	// number of list items met while walking up the ancestor chain.
	// This is the crux of the algorithm in DOMHandler::getListBullets()
	$nesting = 0;
	for ( $cur = $li; !DOMUtils::atTheTop( $cur ); $cur = $cur->parentNode ) {
		$curDp = DOMDataUtils::getDataParsoid( $cur );

		if ( DOMUtils::isListOrListItem( $cur ) ) {
			if ( DOMUtils::isListItem( $cur ) ) {
				$nesting++;
			}
			continue;
		}

		// Only climb past a non-list ancestor when it is a literal HTML
		// node whose start and end tags were both auto-inserted.
		$keepClimbing = WTUtils::isLiteralHTMLNode( $cur ) &&
			!empty( $curDp->autoInsertedStart ) &&
			!empty( $curDp->autoInsertedEnd );
		if ( !$keepClimbing ) {
			break;
		}
	}

	return $nesting;
}
164 | |
/**
 * Compute the lengths of the wikitext that contribute to this
 * anchor's opening (<a>) and closing (</a>) tags.
 *
 * The widths follow this scheme:
 *
 * 1. [[Foo|bar]] <-- piped mw:WikiLink
 *    -> start-tag: "[[Foo|"
 *    -> content  : "bar"
 *    -> end-tag  : "]]"
 *
 * 2. [[Foo]] <-- non-piped mw:WikiLink
 *    -> start-tag: "[["
 *    -> content  : "Foo"
 *    -> end-tag  : "]]"
 *
 * 3. [[{{1x|Foo}}|Foo]] <-- tpl-attr mw:WikiLink
 *    Don't bother setting tag widths since dp->sa['href'] will be
 *    the expanded target and won't correspond to original source.
 *
 * 4. [http://wp.org foo] <-- mw:ExtLink
 *    -> start-tag: "[http://wp.org "
 *    -> content  : "foo"
 *    -> end-tag  : "]"
 *
 * @param Element $node
 * @param ?DataParsoid $dp
 * @return int[]|null [ start tag width, end tag width ], or null when
 *   no widths can be derived
 */
private function computeATagWidth(
	Element $node, ?DataParsoid $dp
): ?array {
	if ( !$dp ) {
		return null;
	}

	// Cases 1 and 2; case 3 (expanded attributes) is excluded here and
	// falls through to the remaining checks.
	if ( WTUtils::isATagFromWikiLinkSyntax( $node ) && !WTUtils::hasExpandedAttrsType( $node ) ) {
		if ( ( $dp->stx ?? null ) !== 'piped' ) {
			return [ 2, 2 ];
		}
		$pipeWidth = strlen( $dp->firstPipeSrc ?? '|' );
		$target = $dp->sa['href'];
		return [ 2 + strlen( $target ) + $pipeWidth, 2 ];
	}

	// Case 4: everything from tsr start up to the content start is the open tag.
	if ( isset( $dp->tsr ) && WTUtils::isATagFromExtLinkSyntax( $node ) ) {
		$openWidth = $dp->tmp->extLinkContentOffsets->start - $dp->tsr->start;
		return [ $openWidth, 1 ];
	}

	// URL links and magic links get zero-width tags.
	$isBareLink = WTUtils::isATagFromURLLinkSyntax( $node ) ||
		WTUtils::isATagFromMagicLinkSyntax( $node );

	return $isBareLink ? [ 0, 0 ] : null;
}
220 | |
/**
 * Compute the lengths of the wikitext that contribute to this
 * node's opening and closing tags.
 *
 * Widths passed in by the caller are kept unless one of the cases
 * below overrides them; missing (null) widths are filled in where possible.
 *
 * @param int|null $stWidth Start tag width
 * @param int|null $etWidth End tag width
 * @param Element $node
 * @param DataParsoid $dp
 * @return (int|null)[] Start and end tag widths
 */
private function computeTagWidths( $stWidth, $etWidth, Element $node, DataParsoid $dp ): array {
	// Recorded extension tag offsets take precedence over everything else.
	if ( isset( $dp->extTagOffsets ) ) {
		$offsets = $dp->extTagOffsets;
		return [ $offsets->openWidth, $offsets->closeWidth ];
	}

	// Literal HTML: only a self-closed tag changes anything (no end tag).
	if ( WTUtils::hasLiteralHTMLMarker( $dp ) ) {
		return [ $stWidth, empty( $dp->selfClose ) ? $etWidth : 0 ];
	}

	// Language variants are delimited by "-{" and "}-".
	if ( DOMUtils::hasTypeOf( $node, 'mw:LanguageVariant' ) ) {
		return [ 2, 2 ];
	}

	$tag = DOMCompat::nodeName( $node );

	// 'tr' tags not in the original source have zero width
	if ( $tag === 'tr' && !isset( $dp->startTagSrc ) ) {
		return [ 0, 0 ];
	}

	$widths = Consts::$WtTagWidths[$tag] ?? null;

	if ( $stWidth === null ) {
		// we didn't have a tsr to tell us how wide this tag was.
		if ( $tag === 'a' ) {
			// The a-tag specific widths also feed the end tag width below.
			$widths = $this->computeATagWidth( $node, $dp );
			$stWidth = $widths[0] ?? null;
		} elseif ( $tag === 'li' || $tag === 'dd' ) {
			$stWidth = $this->computeListEltWidth( $node );
		} elseif ( $widths ) {
			$stWidth = $widths[0];
		}
	}

	if ( $etWidth === null && $widths ) {
		$etWidth = $widths[1];
	}

	return [ $stWidth, $etWidth ];
}
274 | |
/**
 * Emit a "trace/dsr" log entry built by concatenating all arguments.
 *
 * Strings are used verbatim and every other value is JSON-encoded.
 * The message is assembled inside a callback handed to the logger.
 *
 * @param Env $env
 * @param mixed ...$args
 */
private function trace( Env $env, ...$args ): void {
	$env->log(
		"trace/dsr",
		static fn () => implode( '', array_map(
			static fn ( $piece ) => is_string( $piece ) ? $piece : PHPUtils::jsonEncode( $piece ),
			$args
		) )
	);
}
288 | |
/**
 * TSR = "Tag Source Range". Start and end offsets giving the location
 * where the tag showed up in the original source.
 *
 * DSR = "DOM Source Range". dsr->start and dsr->end are the start and
 * end offsets, dsr->openWidth and dsr->closeWidth are the widths of the
 * container tags.
 *
 * TSR is set by the tokenizer. In most cases, it only applies to the
 * specific tag (opening or closing). However, for self-closing
 * tags that the tokenizer generates, the TSR values apply to the entire
 * DOM subtree (opening tag + content + closing tag).
 *
 * Ex: So [[foo]] will get tokenized to a SelfClosingTagTk(...) with a TSR
 * value of [0,7]. The DSR algorithm will then use that info and assign
 * the a-tag rooted at the <a href='...'>foo</a> DOM subtree a DSR value of
 * [0,7,2,2], where 2 and 2 refer to the opening and closing tag widths.
 *
 * [s,e) -- if defined, start/end position of wikitext source that generated
 *          node's subtree
 *
 * The children of $node are visited from last to first: the end offset
 * of a child is the start offset of its right sibling. Updated end
 * offsets are pushed forward onto right siblings where required.
 *
 * @param Frame $frame
 * @param Node $node node to process
 * @param ?int $s start position, inclusive
 * @param ?int $e end position, exclusive
 * @param int $dsrCorrection
 * @param array $opts
 * @return (int|null)[] [ start, end ] computed for the children of $node
 */
private function computeNodeDSR(
	Frame $frame, Node $node, ?int $s, ?int $e, int $dsrCorrection,
	array $opts
): array {
	$env = $frame->getEnv();
	if ( $e === null && !$node->hasChildNodes() ) {
		$e = $s;
	}

	$this->trace( $env, "BEG: ", DOMCompat::nodeName( $node ), " with [s, e]=", [ $s, $e ] );

	/** @var int|null $ce Child end */
	$ce = $e;
	// Initialize $cs to $ce to handle the zero-children case properly
	// if this $node has no child content, then the start and end for
	// the child dom are indeed identical. Alternatively, we could
	// explicitly code this check before everything and bypass this.
	/** @var int|null $cs Child start */
	$cs = $ce;

	$child = $node->lastChild;
	while ( $child !== null ) {
		$prevChild = $child->previousSibling;
		$origCE = $ce;
		$cType = $child->nodeType;
		$fosteredNode = false;
		$cs = null;

		if ( $child instanceof Element ) {
			$dp = DOMDataUtils::getDataParsoid( $child );
			$endTSR = $dp->tmp->endTSR ?? null;
			if ( $endTSR ) {
				$ce = $endTSR->end;
			}
		} else {
			$endTSR = null;
		}

		// StrippedTag marker tags will be removed and won't
		// be around to fill in the missing gap. So, absorb its width into
		// the DSR of its previous sibling. Currently, this fix is only for
		// B and I tags where the fix is clear-cut and obvious.
		$next = $child->nextSibling;
		if ( $next instanceof Element ) {
			$ndp = DOMDataUtils::getDataParsoid( $next );
			if (
				isset( $ndp->src ) &&
				DOMUtils::hasTypeOf( $next, 'mw:Placeholder/StrippedTag' ) &&
				// NOTE: This inlist check matches the case in CleanUp where
				// the placeholders are not removed from the DOM. We don't want
				// to move the width into the sibling here and then leave around
				// a zero width placeholder because serializeDOMNode only handles
				// a few cases of zero width nodes, so we'll end up duplicating
				// it from ->src.
				!DOMUtils::isNestedInListItem( $next )
			) {
				if ( isset( Consts::$WTQuoteTags[$ndp->name] ) &&
					isset( Consts::$WTQuoteTags[DOMCompat::nodeName( $child )] ) ) {
					$correction = strlen( $ndp->src );
					$ce += $correction;
					$dsrCorrection = $correction;
					if ( Utils::isValidDSR( $ndp->dsr ?? null ) ) {
						// Record original DSR for the meta tag
						// since it will now get corrected to zero width
						// since child acquires its width.
						$ndp->getTemp()->origDSR = new DomSourceRange(
							$ndp->dsr->start, $ndp->dsr->end, null, null );
					}
				}
			}
		}

		$env->log( "trace/dsr", static function () use ( $child, $cs, $ce ) {
			// slow, for debugging only
			$i = 0;
			foreach ( $child->parentNode->childNodes as $x ) {
				if ( $x === $child ) {
					break;
				}
				$i++;
			}
			return "     CHILD: <" . DOMCompat::nodeName( $child->parentNode ) . ":" . $i .
				">=" .
				( $child instanceof Element ? '' : ( $child instanceof Text ? '#' : '!' ) ) .
				( ( $child instanceof Element ) ?
					( DOMCompat::nodeName( $child ) === 'meta' ?
						DOMCompat::getOuterHTML( $child ) : DOMCompat::nodeName( $child ) ) :
					PHPUtils::jsonEncode( $child->nodeValue ) ) .
				" with " . PHPUtils::jsonEncode( [ $cs, $ce ] );
		} );

		if ( $cType === XML_TEXT_NODE ) {
			if ( $ce !== null ) {
				$cs = $ce - strlen( $child->textContent );
			}
		} elseif ( $cType === XML_COMMENT_NODE ) {
			'@phan-var Comment $child'; // @var Comment $child
			if ( $ce !== null ) {
				// Decode HTML entities & re-encode as wikitext to find length
				$cs = $ce - WTUtils::decodedCommentLength( $child );
			}
		} elseif ( $cType === XML_ELEMENT_NODE ) {
			DOMUtils::assertElt( $child );
			$dp = DOMDataUtils::getDataParsoid( $child );
			$tsr = $dp->tsr ?? null;
			$oldCE = $tsr ? $tsr->end : null;
			$propagateRight = false;
			$stWidth = null;
			$etWidth = null;

			$fosteredNode = $dp->fostered ?? false;

			// We are making dsr corrections to account for
			// stripped tags (end tags usually). When stripping happens,
			// in most common use cases, a corresponding end tag is added
			// back elsewhere in the DOM.
			//
			// So, when an autoInsertedEnd tag is encountered and a matching
			// dsr-correction is found, make a 1-time correction in the
			// other direction.
			//
			// Currently, this fix is only for
			// B and I tags where the fix is clear-cut and obvious.
			if ( $ce !== null && !empty( $dp->autoInsertedEnd ) &&
				DOMUtils::isQuoteElt( $child )
			) {
				$correction = 3 + strlen( DOMCompat::nodeName( $child ) );
				if ( $correction === $dsrCorrection ) {
					$ce -= $correction;
					$dsrCorrection = 0;
				}
			}

			if ( DOMCompat::nodeName( $child ) === "meta" ) {
				if ( $tsr ) {
					if ( WTUtils::isTplMarkerMeta( $child ) ) {
						// If this is a meta-marker tag (for templates, extensions),
						// we have a new valid '$cs'. This marker also effectively resets tsr
						// back to the top-level wikitext source range from nested template
						// source range.
						$cs = $tsr->start;
						$ce = $tsr->end;
						$propagateRight = true;
					} else {
						// All other meta-tags: <includeonly>, <noinclude>, etc.
						$cs = $tsr->start;
						$ce = $tsr->end;
					}
				} elseif ( PreHandler::isIndentPreWS( $child ) ) {
					// Adjust start DSR; see PreHandler::newIndentPreWS()
					$cs = $ce - 1;
				} elseif ( DOMUtils::matchTypeOf( $child, '#^mw:Placeholder(/\w*)?$#D' ) &&
					$ce !== null && $dp->src
				) {
					$cs = $ce - strlen( $dp->src );
				}
				if ( isset( $dp->extTagOffsets ) ) {
					$stWidth = $dp->extTagOffsets->openWidth;
					$etWidth = $dp->extTagOffsets->closeWidth;
					unset( $dp->extTagOffsets );
				}
			} elseif ( DOMUtils::hasTypeOf( $child, "mw:Entity" ) && $ce !== null && $dp->src ) {
				$cs = $ce - strlen( $dp->src );
			} else {
				if ( DOMUtils::matchTypeOf( $child, '#^mw:Placeholder(/\w*)?$#D' ) &&
					$ce !== null && $dp->src
				) {
					$cs = $ce - strlen( $dp->src );
				} else {
					// Non-meta tags
					if ( $endTSR ) {
						$etWidth = $endTSR->length();
					}
					if ( $tsr && empty( $dp->autoInsertedStart ) ) {
						$cs = $tsr->start;
						if ( $this->tsrSpansTagDOM( $child, $dp ) ) {
							if ( $tsr->end !== null && $tsr->end > 0 ) {
								$ce = $tsr->end;
								$propagateRight = true;
							}
						} else {
							$stWidth = $tsr->end - $tsr->start;
						}

						$this->trace( $env, "     TSR: ", $tsr, "; cs: ", $cs, "; ce: ", $ce );
					} elseif ( $s && $child->previousSibling === null ) {
						$cs = $s;
					}
				}

				// Compute width of opening/closing tags for this dom $node
				[ $stWidth, $etWidth ] =
					$this->computeTagWidths( $stWidth, $etWidth, $child, $dp );

				if ( !empty( $dp->autoInsertedStart ) ) {
					$stWidth = 0;
				}
				if ( !empty( $dp->autoInsertedEnd ) ) {
					$etWidth = 0;
				}

				// Source range of the child's content, i.e. without its tags
				$ccs = $cs !== null && $stWidth !== null ? $cs + $stWidth : null;
				$cce = $ce !== null && $etWidth !== null ? $ce - $etWidth : null;

				/* -----------------------------------------------------------------
				 * Process DOM rooted at '$child'.
				 *
				 * NOTE: You might wonder why we are not checking for the zero-$children
				 * case. It is strictly not necessary and you can set newDsr directly.
				 *
				 * But, you have 2 options: [$ccs, $ccs] or [$cce, $cce]. Setting it to
				 * [$cce, $cce] would be consistent with the RTL approach. We should
				 * then compare $ccs and $cce and verify that they are identical.
				 *
				 * But, if we handled the zero-child case like the other scenarios,
				 * we don't have to worry about the above decisions and checks.
				 * ----------------------------------------------------------------- */

				if ( WTUtils::isDOMFragmentWrapper( $child ) ||
					DOMUtils::hasTypeOf( $child, 'mw:LanguageVariant' )
				) {
					// Eliminate artificial $cs/s mismatch warnings since this is
					// just a wrapper token with the right DSR but without any
					// nested subtree that could account for the DSR span.
					$newDsr = [ $ccs, $cce ];
				} elseif ( $child instanceof Element
					&& WTUtils::isATagFromWikiLinkSyntax( $child )
					&& ( !isset( $dp->stx ) || $dp->stx !== "piped" ) ) {
					/* -------------------------------------------------------------
					 * This check here eliminates artificial DSR mismatches on content
					 * text of the A-node because of entity expansion, etc.
					 *
					 * Ex: [[7%25 solution]] will be rendered as:
					 *    <a href=....>7% solution</a>
					 * If we descend into the text for the a-node, we'll have a 2-char
					 * DSR mismatch which will trigger artificial error warnings.
					 *
					 * In the non-piped link scenario, all dsr info is already present
					 * in the link target and so we get nothing new by processing
					 * content.
					 * ------------------------------------------------------------- */
					$newDsr = [ $ccs, $cce ];
				} else {
					$env->log( "trace/dsr", static function () use (
						$cs, $ce, $stWidth, $etWidth, $ccs, $cce
					) {
						return "     before-recursing:" .
							"[cs,ce]=" . PHPUtils::jsonEncode( [ $cs, $ce ] ) .
							"; [sw,ew]=" . PHPUtils::jsonEncode( [ $stWidth, $etWidth ] ) .
							"; subtree-[cs,ce]=" . PHPUtils::jsonEncode( [ $ccs, $cce ] );
					} );

					$this->trace( $env, "<recursion>" );
					$newDsr = $this->computeNodeDSR( $frame, $child, $ccs, $cce, $dsrCorrection, $opts );
					$this->trace( $env, "</recursion>" );
				}

				// $cs = min($child-dom-tree dsr->start - tag-width, current dsr->start)
				if ( $stWidth !== null && $newDsr[0] !== null ) {
					$newCs = $newDsr[0] - $stWidth;
					if ( $cs === null || ( !$tsr && $newCs < $cs ) ) {
						$cs = $newCs;
					}
				}

				// $ce = max($child-dom-tree dsr->end + tag-width, current dsr->end)
				if ( $etWidth !== null && $newDsr[1] !== null ) {
					$newCe = $newDsr[1] + $etWidth;
					if ( $newCe > $ce ) {
						$ce = $newCe;
					}
				}
			}

			if ( $cs !== null || $ce !== null ) {
				if ( $ce < 0 ) {
					if ( !$fosteredNode ) {
						$env->log( "info/dsr/negative",
							"Negative DSR for node: " . DOMCompat::nodeName( $node ) . "; resetting to zero" );
					}
					$ce = 0;
				}

				// Fostered $nodes get a zero-dsr width range.
				if ( $fosteredNode ) {
					// Reset to 0, if necessary.
					// This is critical to avoid duplication of fostered content in selser mode.
					if ( $origCE < 0 ) {
						$origCE = 0;
					}
					$dp->dsr = new DomSourceRange( $origCE, $origCE, null, null );
				} else {
					$dp->dsr = new DomSourceRange( $cs, $ce, $stWidth, $etWidth );
				}

				$env->log( "trace/dsr", static function () use ( $child, $cs, $ce ) {
					return "     UPDATING " . DOMCompat::nodeName( $child ) .
						" with " . PHPUtils::jsonEncode( [ $cs, $ce ] ) .
						"; typeof: " . ( DOMCompat::getAttribute( $child, "typeof" ) ?? '' );
				} );
			}

			// Propagate any required changes to the right
			// taking care not to cross-over into template content
			if ( $ce !== null &&
				( $propagateRight || $oldCE !== $ce || $e === null ) &&
				!WTUtils::isTplStartMarkerMeta( $child )
			) {
				$sibling = $child->nextSibling;
				$newCE = $ce;
				while ( $newCE !== null && $sibling && !WTUtils::isTplStartMarkerMeta( $sibling ) ) {
					$nType = $sibling->nodeType;
					if ( $nType === XML_TEXT_NODE ) {
						$newCE += strlen( $sibling->textContent );
					} elseif ( $nType === XML_COMMENT_NODE ) {
						'@phan-var Comment $sibling'; // @var Comment $sibling
						$newCE += WTUtils::decodedCommentLength( $sibling );
					} elseif ( $nType === XML_ELEMENT_NODE ) {
						DOMUtils::assertElt( $sibling );
						$siblingDP = DOMDataUtils::getDataParsoid( $sibling );
						$siblingDP->dsr ??= new DomSourceRange( null, null, null, null );
						$sdsrStart = $siblingDP->dsr->start;
						if ( !empty( $siblingDP->fostered ) ||
							( $sdsrStart !== null && $sdsrStart === $newCE ) ||
							( $sdsrStart !== null && $sdsrStart < $newCE && isset( $siblingDP->tsr ) )
						) {
							// $sibling is fostered
							// => nothing to propagate past it
							// $sibling's dsr->start matches what we might propagate
							// => nothing will change
							// $sibling's dsr value came from tsr and it is not outside expected range
							// => stop propagation so you don't overwrite it
							break;
						}

						// Update and move right
						$env->log( "trace/dsr", static function () use ( $newCE, $sibling, $siblingDP ) {
							return "     CHANGING ce.start of " . DOMCompat::nodeName( $sibling ) .
								" from " . $siblingDP->dsr->start . " to " . $newCE;
						} );

						$siblingDP->dsr->start = $newCE;
						// If we have a dsr->end as well and since we updated
						// dsr->start, we have to ensure that the two values don't
						// introduce an inconsistency where dsr->start > dsr->end.
						// Since we are in a LTR pass and are pushing updates
						// forward, we are resolving it by updating dsr->end as
						// well. There could be scenarios where this would be
						// incorrect, but there is no universal fix here.
						if ( $siblingDP->dsr->end !== null && $newCE > $siblingDP->dsr->end ) {
							$siblingDP->dsr->end = $newCE;
						}
						$newCE = $siblingDP->dsr->end;

					} else {
						break;
					}
					$sibling = $sibling->nextSibling;
				}

				// Propagate new end information
				if ( !$sibling ) {
					$e = $newCE;
				}
			}
		}

		// Don't change state if we processed a fostered $node
		if ( $fosteredNode ) {
			$ce = $origCE;
		} else {
			// $ce for next $child = $cs of current $child
			$ce = $cs;
		}

		$child = $prevChild;
	}

	if ( $cs === null ) {
		$cs = $s;
	}

	// Detect errors
	if ( $s !== null && $cs !== $s && !$this->acceptableInconsistency( $opts, $node, $cs, $s ) ) {
		$env->log( "info/dsr/inconsistent", "DSR inconsistency: cs/s mismatch for node:",
			DOMCompat::nodeName( $node ), "s:", $s, "; cs:", $cs );
	}

	$this->trace( $env, "END: ", DOMCompat::nodeName( $node ), ", returning: ", $cs, ", ", $e );

	return [ $cs, $e ];
}
709 | |
/**
 * Computes DSR ranges for every node of a DOM tree.
 * This pass is only invoked on the top-level page.
 *
 * @param Env $env The environment/context for the parse pipeline
 * @param Node $root The root of the tree for which DSR has to be computed
 * @param array $options Options governing DSR computation
 * - inTemplate: If truthy, nothing is computed. Treated as false when missing.
 * - frame: The frame to use. If missing, this defaults to $env->topFrame
 * - sourceOffsets: Object exposing the source range as ->start and ->end.
 *   If missing, this defaults to [0, strlen($frame->getSrcText())]
 * - attrExpansion: Is this an attribute expansion pipeline?
 * @param bool $atTopLevel Are we running this on the top level?
 *   (not consulted by this method)
 */
public function run(
	Env $env, Node $root, array $options = [], bool $atTopLevel = false
): void {
	// Don't run this in template content.
	// $options defaults to [], so the key may be absent: test it with
	// empty() instead of reading it directly.
	if ( !empty( $options['inTemplate'] ) ) {
		return;
	}

	$frame = $options['frame'] ?? $env->topFrame;
	$startOffset = $options['sourceOffsets']->start ?? 0;
	$endOffset = $options['sourceOffsets']->end ?? strlen( $frame->getSrcText() );
	$env->log( "trace/dsr", "------- tracing DSR computation -------" );

	// The actual computation buried in trace/debug stmts.
	$opts = [ 'attrExpansion' => $options['attrExpansion'] ?? false ];
	$this->computeNodeDSR( $frame, $root, $startOffset, $endOffset, 0, $opts );

	// The root itself spans the whole source range and has zero-width tags.
	if ( $root instanceof Element ) {
		$dp = DOMDataUtils::getDataParsoid( $root );
		$dp->dsr = new DomSourceRange( $startOffset, $endOffset, 0, 0 );
	}
	$env->log( "trace/dsr", "------- done tracing computation -------" );
}
745 | } |