Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
81.30% |
287 / 353 |
|
29.41% |
5 / 17 |
CRAP | |
0.00% |
0 / 1 |
WrapSectionsState | |
81.30% |
287 / 353 |
|
29.41% |
5 / 17 |
251.36 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
computeSectionMetadata | |
82.35% |
28 / 34 |
|
0.00% |
0 / 1 |
9.45 | |||
shouldOmitFromTOC | |
83.33% |
5 / 6 |
|
0.00% |
0 / 1 |
3.04 | |||
createNewSection | |
78.26% |
18 / 23 |
|
0.00% |
0 / 1 |
12.24 | |||
isEmptySpan | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
30 | |||
wrapSectionsInDOM | |
88.00% |
66 / 75 |
|
0.00% |
0 / 1 |
32.66 | |||
isParsoidSection | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
findSectionAncestor | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
getDSR | |
54.55% |
12 / 22 |
|
0.00% |
0 / 1 |
22.36 | |||
fillDSRGap | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
4.25 | |||
collapseWrappers | |
81.25% |
26 / 32 |
|
0.00% |
0 / 1 |
9.53 | |||
resolveTplExtSectionConflicts | |
80.36% |
45 / 56 |
|
0.00% |
0 / 1 |
14.28 | |||
convertTOCOffsets | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
3 | |||
findTOCInsertionPoint | |
84.62% |
11 / 13 |
|
0.00% |
0 / 1 |
7.18 | |||
insertSyntheticSection | |
94.12% |
16 / 17 |
|
0.00% |
0 / 1 |
7.01 | |||
addSyntheticTOCMarker | |
83.78% |
31 / 37 |
|
0.00% |
0 / 1 |
15.96 | |||
run | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html\PP\Processors; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Parsoid\Config\Env; |
8 | use Wikimedia\Parsoid\Core\DomSourceRange; |
9 | use Wikimedia\Parsoid\Core\InternalException; |
10 | use Wikimedia\Parsoid\Core\SectionMetadata; |
11 | use Wikimedia\Parsoid\DOM\Comment; |
12 | use Wikimedia\Parsoid\DOM\Document; |
13 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
14 | use Wikimedia\Parsoid\DOM\Element; |
15 | use Wikimedia\Parsoid\DOM\Node; |
16 | use Wikimedia\Parsoid\DOM\Text; |
17 | use Wikimedia\Parsoid\NodeData\DataMw; |
18 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
19 | use Wikimedia\Parsoid\Utils\DOMCompat; |
20 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
21 | use Wikimedia\Parsoid\Utils\DOMUtils; |
22 | use Wikimedia\Parsoid\Utils\PHPUtils; |
23 | use Wikimedia\Parsoid\Utils\TokenUtils; |
24 | use Wikimedia\Parsoid\Utils\WTUtils; |
25 | use Wikimedia\Parsoid\Wt2Html\Frame; |
26 | |
27 | class WrapSectionsState { |
28 | private Env $env; |
29 | private Frame $frame; |
30 | |
31 | /** @var Element|DocumentFragment */ |
32 | private $rootNode; |
33 | |
34 | /** |
35 | * The next section debug ID |
36 | */ |
37 | private int $count = 1; |
38 | |
39 | /** |
40 | * Pseudo section count is needed to determine TOC rendering |
41 | */ |
42 | private int $pseudoSectionCount = 0; |
43 | private Document $doc; |
44 | |
45 | /** |
46 | * Map of about ID to first element |
47 | * @var Element[] |
48 | */ |
49 | private array $aboutIdMap = []; |
50 | private int $sectionNumber = 0; |
51 | private ?WrapSectionsTplInfo $tplInfo = null; |
52 | |
53 | /** @var WrapSectionsTplInfo[] */ |
54 | private array $tplsAndExtsToExamine = []; |
55 | private int $oldLevel = 0; |
56 | |
57 |     public function __construct( // Capture the env, frame and root node this state object operates on
58 |         Env $env,
59 |         Frame $frame,
60 |         Node $rootNode // stored in $this->rootNode, which is documented as Element|DocumentFragment
61 |     ) {
62 |         $this->env = $env;
63 |         $this->frame = $frame;
64 |         $this->rootNode = $rootNode;
65 |         $this->doc = $rootNode->ownerDocument; // owner document; handed to new Section(...) and used for createElement()
66 |     }
67 | |
68 |     /**
69 |      * Fill in the TOC-related fields of $metadata for a heading.
70 |      * Unless the page config suppresses the TOC, $metadata is also registered with the env's TOC data.
71 |      * @param SectionMetadata $metadata Receives fromTitle, index, codepointOffset, anchor, line and linkAnchor
72 |      * @param Element $heading The heading element the section starts with
73 |      * @param int $newLevel Level of $heading; remembered in $this->oldLevel for the next call
74 |      */
75 |     private function computeSectionMetadata(
76 |         SectionMetadata $metadata, Element $heading, int $newLevel
77 |     ): void {
78 |         if ( !$this->env->getPageConfig()->getSuppressTOC() ) {
79 |             $tocData = $this->env->getTOCData();
80 |             $tocData->addSection( $metadata );
81 |             $tocData->processHeading( $this->oldLevel, $newLevel, $metadata );
82 |         }
83 |         $this->oldLevel = $newLevel;
84 |
85 |         if ( WTUtils::isLiteralHTMLNode( $heading ) ) {
86 |             // Literal HTML tags in wikitext don't get section edit links
87 |             $metadata->fromTitle = null;
88 |             $metadata->index = '';
89 |             $metadata->codepointOffset = null;
90 |         } elseif ( $this->tplInfo !== null ) {
91 |             $dmw = DOMDataUtils::getDataMw( $this->tplInfo->first );
92 |             $metadata->index = ''; // Match legacy parser
93 |             if ( !isset( $dmw->parts ) ) {
94 |                 // Extension or language-variant
95 |                 // Need to determine what the output should be here
96 |                 $metadata->fromTitle = null;
97 |             } elseif ( count( $dmw->parts ) > 1 ) {
98 |                 // Multi-part content -- cannot pick a title
99 |                 $metadata->fromTitle = null;
100 |             } else {
101 |                 $p0 = $dmw->parts[0];
102 |                 // If just a single part (guaranteed with count above), it will be stdclass
103 |                 '@phan-var \stdClass $p0';
104 |                 if ( !empty( $p0->templatearg ) ) {
105 |                     // Since we currently don't process templates in Parsoid,
106 |                     // this has to be a top-level {{{...}}} and so the content
107 |                     // comes from the current page. But, legacy parser returns 'false'
108 |                     // for this, so we'll return null as well instead of current title.
109 |                     $metadata->fromTitle = null;
110 |                 } elseif ( !empty( $p0->template->target->href ) ) {
111 |                     // Pick template title, but strip leading "./" prefix (dot escaped so only a literal "./" matches)
112 |                     $metadata->fromTitle = preg_replace(
113 |                         '#^\./#', "", $p0->template->target->href );
114 |                     if ( $this->sectionNumber >= 0 ) {
115 |                         // Legacy parser sets this to '' in some cases
116 |                         // See "Templated sections (heading from template arg)" parser test
117 |                         $metadata->index = 'T-' . $this->sectionNumber;
118 |                     }
119 |                 } else {
120 |                     // Legacy parser returns null here
121 |                     $metadata->fromTitle = null;
122 |                 }
123 |             }
124 |             $metadata->codepointOffset = null;
125 |         } else {
126 |             $title = $this->env->getContextTitle();
127 |             // Use the dbkey (underscores) instead of text (spaces)
128 |             $metadata->fromTitle = $title->getPrefixedDBKey();
129 |             $metadata->index = (string)$this->sectionNumber;
130 |             // Note that our DSR counts *are* byte counts, while this core
131 |             // interface expects *codepoint* counts. We are going to convert
132 |             // these in a batch (for efficiency) in ::convertTOCOffsets() below
133 |             $metadata->codepointOffset = DOMDataUtils::getDataParsoid( $heading )->dsr->start ?? -1;
134 |         }
135 |
136 |         $metadata->anchor = DOMCompat::getAttribute( $heading, 'id' );
137 |         $section = DOMDataUtils::getDataParsoid( $heading )->getTemp()->section;
138 |         $metadata->line = $section['line'];
139 |         $metadata->linkAnchor = $section['linkAnchor'];
140 |     }
141 | |
142 |     /**
143 |      * Should we omit this heading from TOC?
144 |      * Yes if $heading is:
145 |      * - generated by an extension (some ancestor of $heading is a first extension wrapper node)
146 |      */
147 |     private function shouldOmitFromTOC( Element $heading ): bool {
148 |         $node = $heading->parentNode;
149 |         while ( $node ) {
150 |             // NOTE: Here, we are making the assumption that extensions never
151 |             // emit a DOM forest and only ever have a single wrapper node.
152 |             // While ExtensionHandler doesn't assume that, this seems to be borne out
153 |             // in reality. But, if this assumption were not true, we would be adding
154 |             // TOC entries from extension-generated about siblings into the TOC.
155 |             // In scenarios where templates generated the extension and the extension
156 |             // is part of the template's wrapper, we cannot reliably determine what
157 |             // part of the output came from extensions in that case (because the
158 |             // template wrapping clobbers that information). So, for now, we ignore
159 |             // this edge case where extensions generate multiple DOM nodes (that also
160 |             // have headings). Later on, we may enforce a single-wrapper-node
161 |             // requirement for extensions.
162 |             if ( WTUtils::isFirstExtensionWrapperNode( $node ) ) {
163 |                 return true;
164 |             }
165 |             $node = $node->parentNode;
166 |         }
167 |
168 |         return false;
169 |     }
170 | |
171 |     /**
172 |      * Create a new section, attach it to its parent section (or to $rootNode) and add $heading to it
173 |      *
174 |      * @param Element|DocumentFragment $rootNode Node the section container is inserted into when there is no parent section
175 |      * @param array<Section> &$sectionStack Stack of enclosing sections; popped/pushed here to match $newLevel
176 |      * @param ?Section $currSection The section being filled before this call, if any
177 |      * @param Element $heading the heading node (callers pass the wrapped element itself for pseudo-sections)
178 |      * @param int $newLevel Level of the new section
179 |      * @param bool $pseudoSection True for a pseudo-section: it gets id -2 and no TOC metadata
180 |      * @return Section The newly created section
181 |      */
182 |     private function createNewSection(
183 |         Node $rootNode, array &$sectionStack,
184 |         ?Section $currSection, Element $heading, int $newLevel,
185 |         bool $pseudoSection
186 |     ): Section {
187 |         /* Structure for regular (editable or not) sections
188 |          *   <section data-mw-section-id="..">
189 |          *     <h*>..</h*>
190 |          *     ..
191 |          *   </section>
192 |          *
193 |          * Lead sections and pseudo-sections won't have <h*> or <div> tags
194 |          */
195 |         $section = new Section( $newLevel, $this->count++, $this->doc );
196 |
197 |         /* Step 1. Get section stack to the right nesting level
198 |          * 1a. Pop stack till we have a higher-level section.
199 |          */
200 |         $stack = &$sectionStack;
201 |         $sc = count( $stack );
202 |         while ( $sc > 0 && !( $stack[$sc - 1]->hasNestedLevel( $newLevel ) ) ) {
203 |             array_pop( $stack );
204 |             $sc--;
205 |         }
206 |
207 |         /* 1b. Push current section onto stack if it is a higher-level section */
208 |         if ( $currSection && $currSection->hasNestedLevel( $newLevel ) ) {
209 |             $stack[] = $currSection;
210 |             $sc++;
211 |         }
212 |
213 |         /* Step 2: Add new section where it belongs: a parent section OR body */
214 |         $parentSection = $sc > 0 ? $stack[$sc - 1] : null;
215 |         if ( $parentSection ) {
216 |             $parentSection->addSection( $section );
217 |         } else {
218 |             $rootNode->insertBefore( $section->container, $heading );
219 |         }
220 |
221 |         /* Step 3: Add <h*> to the <section> */
222 |         $section->addNode( $heading );
223 |
224 |         /* Step 4: Assign data-mw-section-id attribute
225 |          *
226 |          * CX wants <section> tags with a distinguishing attribute so that
227 |          * it can differentiate between its internal use of <section> tags
228 |          * with what Parsoid adds. So, we will add a data-mw-section-id
229 |          * attribute always.
230 |          *
231 |          * data-mw-section-id = 0 for the lead section
232 |          * data-mw-section-id = -1 for non-editable sections
233 |          *     Note that templated content cannot be edited directly.
234 |          * data-mw-section-id = -2 for pseudo sections
235 |          * data-mw-section-id > 0 for everything else and this number
236 |          *     matches PHP parser / MediaWiki's notion of that section.
237 |          *
238 |          * The code here handles uneditable sections because of templating.
239 |          */
240 |         if ( $pseudoSection ) {
241 |             $this->pseudoSectionCount++;
242 |             $section->setId( -2 );
243 |         } elseif ( $this->tplInfo !== null ) {
244 |             $section->setId( -1 );
245 |         } else {
246 |             $section->setId( $this->sectionNumber );
247 |         }
248 |
249 |         // Sections from extensions shouldn't show up in TOC (and neither do pseudo-sections)
250 |         if ( !$pseudoSection && !$this->shouldOmitFromTOC( $heading ) ) {
251 |             $this->computeSectionMetadata( $section->metadata, $heading, $newLevel );
252 |         }
253 |
254 |         return $section;
255 |     }
256 | |
257 |     private function isEmptySpan( Element $span ): bool { // True when $span has no element children and only whitespace text
258 |         $n = $span->firstChild;
259 |         while ( $n ) {
260 |             if ( $n instanceof Element ) { // any element child makes the span non-empty
261 |                 return false;
262 |             } elseif ( $n instanceof Text && !preg_match( '/^\s*$/D', $n->nodeValue ) ) {
263 |                 return false;
264 |             }
265 |             $n = $n->nextSibling; // children that are neither Element nor Text are skipped
266 |         }
267 |         return true;
268 |     }
269 | |
270 |     /**
271 |      * Walk the DOM and add <section> wrappers where required.
272 |      * This is the workhorse code that wrapSections relies on.
273 |      *
274 |      * @param ?Section $currSection Section that receives nodes until the first heading is seen (null for nested subtrees)
275 |      * @param Element|DocumentFragment $rootNode Node whose children are walked
276 |      * @return int Smallest heading level found among the direct children of $rootNode (7 if there is none)
277 |      */
278 |     private function wrapSectionsInDOM(
279 |         ?Section $currSection, Node $rootNode
280 |     ): int {
281 |         // Since template wrapping is done and template wrappers are well-nested,
282 |         // we can reset template state for every subtree.
283 |         $tplInfo = null;
284 |         $sectionStack = [];
285 |         $highestSectionLevel = 7;
286 |         $node = $rootNode->firstChild;
287 |         while ( $node ) {
288 |             $next = $node->nextSibling;
289 |             $addedNode = false;
290 |             $expandSectionBoundary = false;
291 |
292 |             // Track entry into templated and extension output
293 |             if ( !$this->tplInfo && WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
294 |                 DOMUtils::assertElt( $node );
295 |                 $this->tplInfo = $tplInfo = new WrapSectionsTplInfo;
296 |                 $tplInfo->first = $node;
297 |                 $about = DOMCompat::getAttribute( $node, 'about' );
298 |                 // NOTE: could be null because of language variant markup!
299 |                 $tplInfo->about = $about;
300 |                 $aboutSiblings = WTUtils::getAboutSiblings( $node, $about );
301 |                 $tplInfo->last = end( $aboutSiblings );
302 |                 $this->aboutIdMap[$about] = $node;
303 |
304 |                 // Collect a sequence of rendering transparent nodes starting at $node.
305 |                 // This could be while ( true ), but being defensive.
306 |                 while ( $node ) {
307 |                     // If we hit the end of the template, we are done!
308 |                     // - If this is a heading, we'll process it below.
309 |                     // - If not, the template never had a heading, so
310 |                     //   we can continue default section wrapping behavior.
311 |                     if ( $tplInfo->last === $node ) {
312 |                         break;
313 |                     }
314 |
315 |                     // If we hit a non-rendering-transparent node or a non-empty span,
316 |                     // we are done! We cannot expand the section boundary any further.
317 |                     if ( !WTUtils::isRenderingTransparentNode( $node ) &&
318 |                         !(
319 |                             DOMCompat::nodeName( $node ) === 'span' &&
320 |                             !WTUtils::isLiteralHTMLNode( $node ) &&
321 |                             $this->isEmptySpan( $node )
322 |                         )
323 |                     ) {
324 |                         break;
325 |                     }
326 |
327 |                     // Accumulate the rendering-transparent node and loop
328 |                     $tplInfo->rtContentNodes[] = $node;
329 |                     $node = $node->nextSibling;
330 |                 }
331 |
332 |                 if ( count( $tplInfo->rtContentNodes ) > 0 && DOMUtils::isHeading( $node ) ) {
333 |                     // In this scenario, we can expand the section boundary to include these nodes
334 |                     // rather than start with the heading. This eliminates unnecessary conflicts
335 |                     // between section & template boundaries.
336 |                     $expandSectionBoundary = true;
337 |                     $next = $node->nextSibling;
338 |                 } else {
339 |                     // Reset to normal sectioning behavior!
340 |                     $node = $tplInfo->first;
341 |                     $tplInfo->rtContentNodes = [];
342 |                 }
343 |             }
344 |
345 |             if ( DOMUtils::isHeading( $node ) ) {
346 |                 DOMUtils::assertElt( $node ); // headings are elements
347 |                 $level = (int)DOMCompat::nodeName( $node )[1];
348 |
349 |                 $dp = DOMDataUtils::getDataParsoid( $node );
350 |                 if ( WTUtils::isLiteralHTMLNode( $node ) ) {
351 |                     // HTML <h*> tags get section wrappers, but the sections are uneditable
352 |                     // via the section editing API.
353 |                     $this->sectionNumber = -1;
354 |                 } elseif ( isset( $dp->tmp->headingIndex ) ) {
355 |                     // This could be just `$this->sectionNumber++` without the
356 |                     // complicated if-guard if T214538 were fixed in core;
357 |                     // see T213468 where this more-complicated behavior was
358 |                     // added to match core's eccentricities.
359 |                     $this->sectionNumber = $dp->tmp->headingIndex;
360 |                 }
361 |                 if ( $level < $highestSectionLevel ) {
362 |                     $highestSectionLevel = $level;
363 |                 }
364 |                 $currSection = $this->createNewSection(
365 |                     $rootNode, $sectionStack,
366 |                     $currSection, $node, $level, false
367 |                 );
368 |                 if ( $tplInfo && $expandSectionBoundary ) {
369 |                     foreach ( $tplInfo->rtContentNodes as $rtn ) {
370 |                         $currSection->container->insertBefore( $rtn, $node );
371 |                     }
372 |                     $tplInfo->firstSection = $currSection;
373 |                 }
374 |                 $addedNode = true;
375 |             } elseif ( $node instanceof Element ) {
376 |                 $nestedHighestSectionLevel = $this->wrapSectionsInDOM( null, $node );
377 |                 if ( $currSection && !$currSection->hasNestedLevel( $nestedHighestSectionLevel ) ) {
378 |                     // If we find a higher level nested section,
379 |                     // (a) Make current section non-editable
380 |                     // (b) There are 2 options here best illustrated with an example.
381 |                     //     Consider the wikitext below.
382 |                     //       <div>
383 |                     //       =1=
384 |                     //       b
385 |                     //       </div>
386 |                     //       c
387 |                     //       =2=
388 |                     //     1. Create a new pseudo-section to wrap '$node'
389 |                     //        There will be a <section> around the <div> which includes 'c'.
390 |                     //     2. Don't create the pseudo-section by setting '$currSection = null'
391 |                     //        But, this can leave some content outside any top-level section.
392 |                     //        'c' will not be in any section.
393 |                     //     The code below implements strategy 1.
394 |                     $currSection->setId( -1 );
395 |                     $currSection = $this->createNewSection(
396 |                         $rootNode, $sectionStack,
397 |                         $currSection, $node, $nestedHighestSectionLevel, true
398 |                     );
399 |                     $addedNode = true;
400 |                 }
401 |             }
402 |
403 |             if ( $currSection && !$addedNode ) {
404 |                 $currSection->addNode( $node );
405 |             }
406 |
407 |             if ( $tplInfo && $tplInfo->first === $node ) {
408 |                 $tplInfo->firstSection = $currSection;
409 |             }
410 |
411 |             // Track exit from templated output
412 |             if ( $tplInfo && $tplInfo->last === $node ) {
413 |                 if ( $currSection !== $tplInfo->firstSection ) {
414 |                     // The opening $node and closing $node of the template
415 |                     // are in different sections! This might require resolution.
416 |                     // While 'firstSection' could be null, if we get here,
417 |                     // 'lastSection' is guaranteed to always be non-null.
418 |                     $tplInfo->lastSection = $currSection;
419 |                     $this->tplsAndExtsToExamine[] = $tplInfo;
420 |                 }
421 |
422 |                 $this->tplInfo = $tplInfo = null;
423 |             }
424 |
425 |             $node = $next;
426 |         }
427 |
428 |         // The last section embedded in a non-body DOM element
429 |         // should always be marked non-editable since it will have
430 |         // the closing tag (ex: </div>) showing up in the source editor
431 |         // which we cannot support in a visual editing environment.
432 |         if ( $currSection && !DOMUtils::atTheTop( $rootNode ) ) {
433 |             $currSection->setId( -1 );
434 |         }
435 |
436 |         return $highestSectionLevel;
437 |     }
438 | |
439 |     /**
440 |      * Is this a Parsoid-inserted section (vs. a section node generated by
441 |      * other page-components / content-generators like extensions)?
442 |      * The test is: a <section> element that has a data-mw-section-id attribute.
443 |      * @param Element $n Element to test
444 |      * @return bool
445 |      */
446 |     private static function isParsoidSection( Element $n ): bool {
447 |         return DOMCompat::nodeName( $n ) === 'section' && $n->hasAttribute( 'data-mw-section-id' );
448 |     }
449 | |
450 |     /**
451 |      * Find an ancestor that is a Parsoid-inserted section
452 |      * (the invariant assertion below fails if there is no such ancestor)
453 |      * @param Node $n Node whose ancestors are searched
454 |      * @return Element
455 |      */
456 |     private static function findSectionAncestor( Node $n ): Element {
457 |         do {
458 |             $n = DOMUtils::findAncestorOfName( $n, 'section' );
459 |         } while ( $n && !self::isParsoidSection( $n ) ); // skip <section> ancestors that are not Parsoid's
460 |
461 |         Assert::invariant( $n instanceof Element, "Expected to find Parsoid-section ancestor" );
462 |         return $n;
463 |     }
464 | |
465 |     /**
466 |      * Get opening/closing DSR offset for the subtree rooted at $node.
467 |      * This handles scenarios where $node is a section or template wrapper
468 |      * and if a section, when it has leading/trailing non-element nodes
469 |      * that don't have recorded DSR values.
470 |      *
471 |      * @param Element $node
472 |      * @param bool $start True for the opening offset, false for the closing offset
473 |      * @return ?int -1 when $node is a Parsoid section without any element child
474 |      */
475 |     private function getDSR( Element $node, bool $start ): ?int {
476 |         if ( !self::isParsoidSection( $node ) ) {
477 |             $dsr = DOMDataUtils::getDataParsoid( $node )->dsr ?? null;
478 |             if ( !$dsr ) {
479 |                 Assert::invariant(
480 |                     $node->hasAttribute( 'about' ),
481 |                     'Expected an about id'
482 |                 );
483 |                 $about = DOMCompat::getAttribute( $node, 'about' );
484 |                 $dsr = DOMDataUtils::getDataParsoid( $this->aboutIdMap[$about] )->dsr; // fall back to the first node recorded for this about id
485 |             }
486 |
487 |             return $start ? $dsr->start : $dsr->end;
488 |         }
489 |
490 |         $offset = 0; // total length of text/comment nodes passed over before reaching an element
491 |         $c = $start ? $node->firstChild : $node->lastChild;
492 |         while ( $c ) {
493 |             if ( $c instanceof Text ) {
494 |                 $offset += strlen( $c->textContent );
495 |             } elseif ( $c instanceof Comment ) {
496 |                 $offset += WTUtils::decodedCommentLength( $c );
497 |             } else {
498 |                 DOMUtils::assertElt( $c );
499 |                 $ret = $this->getDSR( $c, $start );
500 |                 return $ret === null ? null : $ret + ( $start ? -$offset : $offset ); // widen by the skipped text/comment length
501 |             }
502 |             $c = $start ? $c->nextSibling : $c->previousSibling;
503 |         }
504 |
505 |         return -1;
506 |     }
507 | |
508 |     /**
509 |      * FIXME: Duplicated with TableFixups code.
510 |      * @param array &$parts Gets the frame source text from $offset1 up to $offset2 appended when $offset1 < $offset2
511 |      * @param ?int $offset1 Start offset of the gap
512 |      * @param ?int $offset2 End offset of the gap
513 |      * @throws InternalException if either offset is null
514 |      */
515 |     private function fillDSRGap( array &$parts, ?int $offset1, ?int $offset2 ): void {
516 |         if ( $offset1 === null || $offset2 === null ) {
517 |             throw new InternalException();
518 |         }
519 |         if ( $offset1 < $offset2 ) {
520 |             $parts[] = PHPUtils::safeSubstr( $this->frame->getSrcText(), $offset1, $offset2 - $offset1 );
521 |         }
522 |     }
523 | |
524 |     /**
525 |      * FIXME: There is strong overlap with TableFixups code.
526 |      *
527 |      * $wrapper will hold tpl/ext encap info for the array of tpls/exts as well as
528 |      * content before, after and in between them. Right now, this will always be a
529 |      * <section> node, but not asserting this since code doesn't depend on it being so.
530 |      *
531 |      * @param Element $wrapper Node that receives the combined typeof, data-parsoid pi and data-mw parts
532 |      * @param array $encapWrappers Encapsulation wrapper nodes to fold into $wrapper, in order
533 |      */
534 |     private function collapseWrappers( Element $wrapper, array $encapWrappers ): void {
535 |         $wrapperDp = DOMDataUtils::getDataParsoid( $wrapper );
536 |
537 |         // Build up $parts, $pi to set up the combined transclusion info on $wrapper
538 |         $parts = [];
539 |         $pi = [];
540 |         $index = 0;
541 |         $prevDp = null;
542 |         $haveTemplate = false;
543 |         try {
544 |             foreach ( $encapWrappers as $encapNode ) {
545 |                 $dp = DOMDataUtils::getDataParsoid( $encapNode );
546 |
547 |                 // Plug DSR gaps between encapWrappers
548 |                 if ( !$prevDp ) {
549 |                     $this->fillDSRGap( $parts, $wrapperDp->dsr->start, $dp->dsr->start );
550 |                 } else {
551 |                     $this->fillDSRGap( $parts, $prevDp->dsr->end, $dp->dsr->start );
552 |                 }
553 |
554 |                 if ( DOMUtils::hasTypeOf( $encapNode, "mw:Transclusion" ) ) {
555 |                     $haveTemplate = true;
556 |                     // Assimilate $encapNode's data-mw and data-parsoid pi info
557 |                     $dmw = DOMDataUtils::getDataMw( $encapNode );
558 |                     foreach ( $dmw->parts ?? [] as $part ) {
559 |                         '@phan-var string|\stdClass $part';
560 |                         // Template index is relative to other transclusions.
561 |                         // This index is used to extract whitespace information from
562 |                         // data-parsoid and that array only includes info for templates.
563 |                         // So skip over strings here.
564 |                         if ( !is_string( $part ) ) {
565 |                             $part = clone $part;
566 |                             if ( isset( $part->template ) ) {
567 |                                 $part->template->i = $index++;
568 |                             } else {
569 |                                 $part->templatearg->i = $index++;
570 |                             }
571 |                         }
572 |                         $parts[] = $part;
573 |                     }
574 |                     PHPUtils::pushArray( $pi, $dp->pi ?? [ [] ] );
575 |                 } else {
576 |                     // Where a non-template type is present, we are going to treat that
577 |                     // segment as a "string" in the parts array. So, we effectively treat
578 |                     // "mw:Transclusion" as a generic type that covers a single template
579 |                     // as well as a run of segments where at least one segment comes from
580 |                     // a template but others may be from other generators (ex: extensions).
581 |                     $this->fillDSRGap( $parts, $dp->dsr->start, $dp->dsr->end );
582 |                 }
583 |
584 |                 $prevDp = $dp;
585 |             }
586 |
587 |             if ( !$haveTemplate ) {
588 |                 throw new InternalException(); // handled by the catch below: $wrapper becomes mw:Placeholder
589 |             }
590 |
591 |             DOMUtils::addTypeOf( $wrapper, "mw:Transclusion" );
592 |             $wrapperDp->pi = $pi;
593 |             $this->fillDSRGap( $parts, $prevDp->dsr->end, $wrapperDp->dsr->end );
594 |             DOMDataUtils::setDataMw( $wrapper, new DataMw( [ 'parts' => $parts ] ) );
595 |         } catch ( InternalException $e ) {
596 |             // We don't have accurate template wrapping information.
597 |             // Set typeof to 'mw:Placeholder' since 'mw:Transclusion'
598 |             // typeof is not actionable without valid data-mw.
599 |             //
600 |             // FIXME:
601 |             // 1. If we stop stripping section wrappers in the html->wt direction,
602 |             //    we will need to add a DOMHandler for <section> or mw:Placeholder typeof
603 |             //    on arbitrary Elements to traverse into children and serialize and
604 |             //    prevent page corruption.
605 |             // 2. This may be a good place to collect stats for T191641#6357136
606 |             // 3. Maybe we need a special error typeof rather than mw:Placeholder
607 |             $wrapper->setAttribute( 'typeof', 'mw:Placeholder' );
608 |         }
609 |     }
610 | |
611 |     /**
612 |      * Section wrappers and encapsulation wrappers can conflict because of
613 |      * partial overlaps. This method identifies those conflicts and fixes up
614 |      * the encapsulation by expanding those ranges as necessary.
615 |      * It works through the entries collected in $this->tplsAndExtsToExamine.
616 |      */
617 |     private function resolveTplExtSectionConflicts(): void {
618 |         $secRanges = [];
619 |         '@phan-var array[] $secRanges';
620 |         foreach ( $this->tplsAndExtsToExamine as $tplInfo ) {
621 |             $s1 = $tplInfo->firstSection->container ??
622 |                 self::findSectionAncestor( $tplInfo->first );
623 |
624 |             // guaranteed to be non-null
625 |             $s2 = $tplInfo->lastSection->container;
626 |
627 |             // Find a common ancestor of s1 and s2 (could be s1 or s2)
628 |             $s2Ancestors = DOMUtils::pathToRoot( $s2 );
629 |             $s1Ancestors = [];
630 |             $n = 0; // number of entries in $s1Ancestors
631 |             $ancestor = $s1;
632 |             while ( !in_array( $ancestor, $s2Ancestors, true ) ) {
633 |                 $s1Ancestors[] = $ancestor;
634 |                 $ancestor = $ancestor->parentNode;
635 |                 $n++;
636 |             }
637 |
638 |             // ancestor is now the common ancestor of s1 and s2
639 |             $s1Ancestors[] = $ancestor;
640 |             $n++;
641 |
642 |             // Set up start/end of the new encapsulation range
643 |             if ( $ancestor === $s1 || $ancestor === $s2 ) {
644 |                 $start = $ancestor;
645 |                 $end = $ancestor;
646 |             } else {
647 |                 // While creating a new section (see createNewSection), it only
648 |                 // gets added where its parent is either another section,
649 |                 // or body, so all ancestors are themselves sections, or body.
650 |                 $start = $s1Ancestors[$n - 2];
651 |                 $i = array_search( $ancestor, $s2Ancestors, true );
652 |                 $end = $s2Ancestors[$i - 1];
653 |             }
654 |
655 |             '@phan-var Element $start'; // @var Element $start
656 |             '@phan-var Element $end'; // @var Element $end
657 |
658 |             // Add new OR update existing range
659 |             if ( $start->hasAttribute( 'about' ) ) {
660 |                 // Overlaps with an existing range.
661 |                 $about = DOMCompat::getAttribute( $start, 'about' );
662 |                 if ( !$end->hasAttribute( 'about' ) ) {
663 |                     // Extend existing range till $end
664 |                     $secRanges[$about]['end'] = $end;
665 |                     $end->setAttribute( 'about', $about );
666 |                 } else {
667 |                     Assert::invariant( DOMCompat::getAttribute( $end, 'about' ) === $about,
668 |                         "Expected end-range about id to be $about instead of " .
669 |                         DOMCompat::getAttribute( $end, 'about' ) . " in the overlap scenario." );
670 |                 }
671 |             } else {
672 |                 // Check for nesting in another range.  Since $start and $end
673 |                 // are siblings, this is sufficient to know the entire range
674 |                 // is nested
675 |                 $about = null;
676 |                 $n = $start->parentNode; // NOTE: $n is reused here as a node (it was a counter above)
677 |                 $body = DOMCompat::getBody( $start->ownerDocument );
678 |                 while ( $n !== $body ) { // NOTE(review): assumes the upward walk always reaches <body> - confirm
679 |                     '@phan-var Element $n'; // @var Element $n
680 |                     if ( self::isParsoidSection( $n ) && $n->hasAttribute( 'about' ) ) {
681 |                         $about = DOMCompat::getAttribute( $n, 'about' );
682 |                         break;
683 |                     }
684 |                     $n = $n->parentNode;
685 |                 }
686 |
687 |                 if ( !$about ) {
688 |                     // Not overlapping, not nested => new range
689 |                     $about = $this->env->newAboutId();
690 |                     $start->setAttribute( 'about', $about );
691 |                     $end->setAttribute( 'about', $about );
692 |                     $secRanges[$about] = [ 'start' => $start, 'end' => $end, 'encapWrappers' => [] ];
693 |                 }
694 |             }
695 |             $secRanges[$about]['encapWrappers'][] = $tplInfo->first;
696 |         }
697 |
698 |         // Process recorded ranges into new encapsulation information
699 |         // that spans all content in that range.
700 |         foreach ( $secRanges as $about => $range ) {
701 |             // Ensure that all top level nodes of the range have the same about id
702 |             for ( $n = $range['start']; $n !== $range['end']->nextSibling; $n = $n->nextSibling ) {
703 |                 Assert::invariant( self::isParsoidSection( $n ),
704 |                     "Encountered non-Parsoid-section node (" .
705 |                     DOMCompat::nodeName( $n ) .
706 |                     ") while updating template wrappers" );
707 |                 $n->setAttribute( 'about', $about );
708 |             }
709 |
710 |             $dsr1 = $this->getDSR( $range['start'], true ); // Traverses non-tpl content => will succeed
711 |             $dsr2 = $this->getDSR( $range['end'], false ); // Traverses non-tpl content => will succeed
712 |             $dp = new DataParsoid;
713 |             $dp->dsr = new DomSourceRange( $dsr1, $dsr2, null, null );
714 |             DOMDataUtils::setDataParsoid( $range['start'], $dp );
715 |
716 |             $this->collapseWrappers( $range['start'], $range['encapWrappers'] );
717 |         }
718 |     }
718 | |
719 |     private function convertTOCOffsets(): void {
720 |         // Collect references to all non-null codepointOffsets so that convertOffsets() can update them in one batch
721 |         $offsets = [];
722 |         foreach ( $this->env->getTOCData()->getSections() as $section ) {
723 |             if ( $section->codepointOffset !== null ) {
724 |                 $offsets[] = &$section->codepointOffset;
725 |             }
726 |         }
727 |         TokenUtils::convertOffsets(
728 |             $this->env->topFrame->getSrcText(),
729 |             $this->env->getCurrentOffsetType(), // source offset type
730 |             'char', // target offset type
731 |             $offsets
732 |         );
733 |     }
734 | |
735 |     /**
736 |      * In core, Parser.php adds a TOC marker before the *first* heading element
737 |      * independent of how that heading element is nested. In the common case,
738 |      * that insertion point corresponds to the last element of the lead section
739 |      * as computed by section wrapping code in this file. In the edge case, when
740 |      * a <div> wraps the heading, the insertion point lies inside the <div> and
741 |      * has no relation to the lead section. Returns the first heading found, or null.
742 |      */
743 |     private static function findTOCInsertionPoint( Node $elt ): ?Element {
744 |         while ( $elt ) {
745 |             // Ignore extension content while finding TOC insertion point
746 |             if ( WTUtils::isFirstExtensionWrapperNode( $elt ) ) {
747 |                 $elt = WTUtils::skipOverEncapsulatedContent( $elt );
748 |                 continue;
749 |             }
750 |             if ( $elt instanceof Element ) {
751 |                 if ( DOMUtils::isHeading( $elt ) ) {
752 |                     return $elt;
753 |                 } elseif ( $elt->firstChild ) {
754 |                     $tocIP = self::findTOCInsertionPoint( $elt->firstChild ); // search the subtree before moving to the next sibling
755 |                     if ( $tocIP ) {
756 |                         return $tocIP;
757 |                     }
758 |                 }
759 |             }
760 |             $elt = $elt->nextSibling;
761 |         }
762 |         return null;
763 |     }
764 | |
765 |     /**
766 |      * Insert a synthetic section (a pseudo-section placed before $insertionPoint) in which to place the TOC
767 |      */
768 |     private function insertSyntheticSection(
769 |         Element $syntheticTocMeta, Element $insertionPoint
770 |     ): Element {
771 |         $prev = $insertionPoint->previousSibling;
772 |
773 |         // Create a pseudo-section containing the TOC
774 |         $syntheticTocSection = $this->doc->createElement( 'section' );
775 |         $syntheticTocSection->setAttribute( 'data-mw-section-id', '-2' );
776 |         $insertionPoint->parentNode->insertBefore( $syntheticTocSection, $insertionPoint );
777 |         $this->pseudoSectionCount++;
778 |         $syntheticTocSection->appendChild( $syntheticTocMeta );
779 |
780 |         // Ensure template continuity is not broken!
781 |         // If $prev is not an encapsulation wrapper, nothing to do!
782 |         if ( $prev && WTUtils::isEncapsulationWrapper( $prev ) ) {
783 |             '@phan-var Element $prev';
784 |             $prevAbout = DOMCompat::getAttribute( $prev, 'about' );
785 |
786 |             // First, handle the case of section-tag-stripping that VE does.
787 |             // So, compare $prev against the first child of $insertionPoint.
788 |             // If the about ids are different, $next & $prev belong to
789 |             // different transclusions and the TOC meta can be left alone.
790 |             $next = $insertionPoint->firstChild;
791 |             $nextAbout = $next instanceof Element ? DOMCompat::getAttribute( $next, 'about' ) : null;
792 |             if ( $prevAbout === $nextAbout ) {
793 |                 $syntheticTocMeta->setAttribute( 'about', $prevAbout );
794 |             }
795 |
796 |             // Now handle case of section-tags not being stripped
797 |             // NOTE that $syntheticTocMeta is before $insertionPoint
798 |             // If it is not-null, it is known to be a <section>.
799 |             $next = $insertionPoint;
800 |             '@phan-var Element $next';
801 |             $nextAbout = $next ? DOMCompat::getAttribute( $next, 'about' ) : null;
802 |             if ( $prevAbout === $nextAbout ) {
803 |                 $syntheticTocSection->setAttribute( 'about', $prevAbout );
804 |             }
805 |         }
806 |
807 |         return $syntheticTocSection;
808 |     }
809 | |
/**
 * Decide whether this page shows a table of contents, record the
 * 'show-toc' / 'no-toc' output flags accordingly and, when a TOC is shown
 * but the 'toc' behavior switch is not set, add an auto-generated
 * <meta property="mw:PageProp/toc"> ahead of the first heading.
 *
 * Nothing is recorded or inserted when the page config suppresses the TOC.
 */
private function addSyntheticTOCMarker(): void {
	// Add a synthetic TOC at the end of the first section, if necessary
	$tocBS = $this->env->getBehaviorSwitch( 'toc' );
	$noTocBS = $this->env->getBehaviorSwitch( 'notoc' );
	$forceTocBS = $this->env->getBehaviorSwitch( 'forcetoc' );

	// $this->count is initialized to 1
	$numHeadings = $this->count - 1 - $this->pseudoSectionCount;

	// 'notoc' hides the TOC unless 'toc' is set as well
	$showToc = !( $noTocBS && !$tocBS );
	$enoughToc = $showToc && ( $numHeadings >= 4 || $tocBS );
	if ( $forceTocBS ) {
		$showToc = true;
		$enoughToc = true;
	}
	// Without any heading there is never a TOC, not even a forced one
	if ( $numHeadings === 0 ) {
		$enoughToc = false;
	}

	if ( $this->env->getPageConfig()->getSuppressTOC() ) {
		return;
	}

	// $enoughToc implies $showToc (see above), so at most one of the two
	// flags below is ever set and their relative order is irrelevant.
	if ( !$showToc ) {
		// ParserOutputFlags::NO_TOC
		$this->env->getMetadata()->setOutputFlag( 'no-toc' );
	}
	if ( !$enoughToc ) {
		return;
	}

	// ParserOutputFlags::SHOW_TOC
	$this->env->getMetadata()->setOutputFlag( 'show-toc' );
	if ( $tocBS ) {
		// The 'toc' switch is set: no synthetic marker is added
		return;
	}

	$syntheticTocMeta = $this->doc->createElement( 'meta' );
	$syntheticTocMeta->setAttribute( 'property', 'mw:PageProp/toc' );
	$dmw = DOMDataUtils::getDataMw( $syntheticTocMeta );
	$dmw->autoGenerated = true;

	$tocIP = self::findTOCInsertionPoint( DOMCompat::getBody( $this->doc ) );
	if ( $tocIP === null ) {
		// should not happen, but nothing to do here!
		return;
	}

	// NOTE: Given how <section>s are computed in this file, headings
	// will never have previous siblings. So, we always look at the
	// previous sibling of the heading's enclosing section instead.
	$insertionPoint = self::findSectionAncestor( $tocIP );

	$insertionContainer = $insertionPoint->previousSibling;
	if ( !$insertionContainer || DOMCompat::nodeName( $insertionContainer ) !== 'section' ) {
		// No preceding <section> to host the marker: create one
		$insertionContainer = $this->insertSyntheticSection(
			$syntheticTocMeta, $insertionPoint
		);
	}
	$insertionContainer->appendChild( $syntheticTocMeta );

	// Set a synthetic zero-length dsr to suppress noisy warnings
	// from the round trip testing script.
	$syntheticOffset = DOMDataUtils::getDataParsoid( $tocIP )->dsr->start ?? null;
	if ( $syntheticOffset !== null ) {
		$dp = DOMDataUtils::getDataParsoid( $syntheticTocMeta );
		$dp->dsr = new DomSourceRange( $syntheticOffset, $syntheticOffset, 0, 0 );
	}
}
873 | |
/**
 * DOM postprocessor entry point: walks the DOM rooted at the root node
 * and adds <section> wrappers as necessary.
 * Implements the algorithm documented @ mw:Parsing/Notes/Section_Wrapping
 */
public function run(): void {
	// The lead section is created at level 6, the lowest possible level,
	// since we don't want any nesting of h-tags in the lead section.
	$lead = new Section( 6, 0, $this->doc );
	$lead->setId( 0 );

	$this->wrapSectionsInDOM( $lead, $this->rootNode );

	// A lead section is always emitted as the first child of the root,
	// even if sometimes it only contains whitespace + comments.
	$root = $this->rootNode;
	$root->insertBefore( $lead->container, $root->firstChild );

	// Template conflicts can only be resolved once all sections
	// have been added to the DOM.
	$this->resolveTplExtSectionConflicts();

	// Convert byte offsets to codepoint offsets in TOCData
	// (done in a batch to avoid O(N^2) string traversals)
	$this->convertTOCOffsets();

	$this->addSyntheticTOCMarker();
}
900 | } |