Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
80.67% |
288 / 357 |
|
29.41% |
5 / 17 |
CRAP | |
0.00% |
0 / 1 |
WrapSectionsState | |
80.67% |
288 / 357 |
|
29.41% |
5 / 17 |
269.54 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
computeSectionMetadata | |
77.50% |
31 / 40 |
|
0.00% |
0 / 1 |
12.38 | |||
shouldOmitFromTOC | |
83.33% |
5 / 6 |
|
0.00% |
0 / 1 |
3.04 | |||
createNewSection | |
78.26% |
18 / 23 |
|
0.00% |
0 / 1 |
12.24 | |||
isEmptySpan | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
30 | |||
wrapSectionsInDOM | |
86.49% |
64 / 74 |
|
0.00% |
0 / 1 |
33.37 | |||
isParsoidSection | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
findSectionAncestor | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
getDSR | |
52.38% |
11 / 21 |
|
0.00% |
0 / 1 |
24.07 | |||
fillDSRGap | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
4.25 | |||
collapseWrappers | |
84.38% |
27 / 32 |
|
0.00% |
0 / 1 |
8.24 | |||
resolveTplExtSectionConflicts | |
80.36% |
45 / 56 |
|
0.00% |
0 / 1 |
14.28 | |||
convertTOCOffsets | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
3 | |||
findTOCInsertionPoint | |
84.62% |
11 / 13 |
|
0.00% |
0 / 1 |
7.18 | |||
insertSyntheticSection | |
94.12% |
16 / 17 |
|
0.00% |
0 / 1 |
7.01 | |||
addSyntheticTOCMarker | |
83.78% |
31 / 37 |
|
0.00% |
0 / 1 |
17.09 | |||
run | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Assert\UnreachableException; |
8 | use Wikimedia\Parsoid\Config\Env; |
9 | use Wikimedia\Parsoid\Core\DomSourceRange; |
10 | use Wikimedia\Parsoid\Core\InternalException; |
11 | use Wikimedia\Parsoid\Core\SectionMetadata; |
12 | use Wikimedia\Parsoid\DOM\Comment; |
13 | use Wikimedia\Parsoid\DOM\Document; |
14 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
15 | use Wikimedia\Parsoid\DOM\Element; |
16 | use Wikimedia\Parsoid\DOM\Node; |
17 | use Wikimedia\Parsoid\DOM\Text; |
18 | use Wikimedia\Parsoid\NodeData\DataMw; |
19 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
20 | use Wikimedia\Parsoid\NodeData\TemplateInfo; |
21 | use Wikimedia\Parsoid\Utils\DOMCompat; |
22 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
23 | use Wikimedia\Parsoid\Utils\DOMUtils; |
24 | use Wikimedia\Parsoid\Utils\PHPUtils; |
25 | use Wikimedia\Parsoid\Utils\TokenUtils; |
26 | use Wikimedia\Parsoid\Utils\Utils; |
27 | use Wikimedia\Parsoid\Utils\WTUtils; |
28 | use Wikimedia\Parsoid\Wt2Html\Frame; |
29 | |
30 | class WrapSectionsState { |
31 | private Env $env; |
32 | private Frame $frame; |
33 | |
34 | /** @var Element|DocumentFragment */ |
35 | private $rootNode; |
36 | |
37 | /** |
38 | * The next section debug ID |
39 | */ |
40 | private int $count = 1; |
41 | |
42 | /** |
43 | * Pseudo section count is needed to determine TOC rendering |
44 | */ |
45 | private int $pseudoSectionCount = 0; |
46 | private Document $doc; |
47 | |
48 | /** |
49 | * Map of about ID to first element |
50 | * @var Element[] |
51 | */ |
52 | private array $aboutIdMap = []; |
53 | private int $sectionNumber = 0; |
54 | private ?WrapSectionsTplInfo $tplInfo = null; |
55 | |
56 | /** @var WrapSectionsTplInfo[] */ |
57 | private array $tplsAndExtsToExamine = []; |
58 | private int $oldLevel = 0; |
59 | |
/**
 * Remember the environment, frame and DOM root that section wrapping
 * operates on; the owning document is taken from the root node.
 *
 * @param Env $env
 * @param Frame $frame
 * @param Node $rootNode Stored as the root of the tree to wrap
 *   (the property is documented as Element|DocumentFragment)
 */
public function __construct(
	Env $env,
	Frame $frame,
	Node $rootNode
) {
	$this->rootNode = $rootNode;
	$this->doc = $rootNode->ownerDocument;
	$this->frame = $frame;
	$this->env = $env;
}
70 | |
/**
 * Populate the section metadata needed to generate the TOC.
 *
 * Unless the page config suppresses the TOC, the metadata is first
 * registered with the environment's TOC data. The title / index /
 * offset fields are then filled in according to one of three cases:
 * headings without an edit link, top-level headings, and headings
 * found while inside template / extension output.
 *
 * @param SectionMetadata $metadata
 * @param Element $heading
 * @param int $newLevel
 */
private function computeSectionMetadata(
	SectionMetadata $metadata, Element $heading, int $newLevel
): void {
	$pageConfig = $this->env->getPageConfig();
	if ( !$pageConfig->getSuppressTOC() ) {
		$toc = $this->env->getTOCData();
		$toc->addSection( $metadata );
		$toc->processHeading( $this->oldLevel, $newLevel, $metadata );
	}
	$this->oldLevel = $newLevel;
	$headingDp = DOMDataUtils::getDataParsoid( $heading );

	// Literal HTML tags in wikitext don't get section edit links, and
	// neither do cases where the legacy preprocessor didn't tokenize
	// a heading.
	$getsEditLink = !WTUtils::isLiteralHTMLNode( $heading ) &&
		isset( $headingDp->tmp->headingIndex );

	if ( !$getsEditLink ) {
		$metadata->fromTitle = null;
		$metadata->index = '';
		$metadata->codepointOffset = null;
	} elseif ( $this->tplInfo === null ) {
		// Top-level heading.
		// Use the dbkey (underscores) instead of text (spaces)
		$metadata->fromTitle = $this->env->getContextTitle()->getPrefixedDBKey();
		$metadata->index = (string)$this->sectionNumber;
		// Note that our DSR counts *are* byte counts, while this core
		// interface expects *codepoint* counts. These get converted
		// in a batch (for efficiency) in ::convertTOCOffsets()
		$metadata->codepointOffset = $headingDp->dsr->start ?? -1;
	} else {
		// Heading found inside template / extension output.
		$dataMw = DOMDataUtils::getDataMw( $this->tplInfo->first );
		$metadata->index = ''; // Match legacy parser

		// The title stays null for:
		// - extensions / language-variants (no parts); what the output
		//   should be here is still to be determined
		// - multi-part content, where no single title can be picked
		// - template args and parts without an href (see below)
		$fromTitle = null;
		if ( isset( $dataMw->parts ) && count( $dataMw->parts ) <= 1 ) {
			$onlyPart = $dataMw->parts[0];
			if ( !( $onlyPart instanceof TemplateInfo ) ) {
				throw new UnreachableException(
					"a single part will always be a TemplateInfo not a string"
				);
			}
			// Since we currently don't process templates in Parsoid, a
			// 'templatearg' has to be a top-level {{{...}}} and so the
			// content comes from the current page. But, legacy parser
			// returns 'false' for this, so null is used as well instead
			// of the current title. Legacy parser also returns null when
			// there is no href.
			if ( $onlyPart->type !== 'templatearg' && $onlyPart->href !== null ) {
				// Pick template title, but strip leading "./" prefix
				$decodedHref = Utils::decodeURIComponent( $onlyPart->href );
				$fromTitle = PHPUtils::stripPrefix( $decodedHref, './' );
				if ( $this->sectionNumber >= 0 ) {
					// Legacy parser sets this to '' in some cases
					// See "Templated sections (heading from template arg)" parser test
					$metadata->index = 'T-' . $this->sectionNumber;
				}
			}
		}
		$metadata->fromTitle = $fromTitle;
		$metadata->codepointOffset = null;
	}

	$metadata->anchor = DOMCompat::getAttribute( $heading, 'id' );
	$sectionInfo = $headingDp->getTemp()->section;
	$metadata->line = $sectionInfo['line'];
	$metadata->linkAnchor = $sectionInfo['linkAnchor'];
}
152 | |
/**
 * Should this heading be left out of the TOC?
 * Yes, if $heading is generated by an extension, i.e. if any of its
 * ancestors is the first wrapper node of an extension.
 *
 * NOTE: This assumes that extensions never emit a DOM forest and only
 * ever have a single wrapper node. While ExtensionHandler doesn't assume
 * that, this seems to be borne out in reality. If the assumption were not
 * true, TOC entries from extension-generated about siblings would be
 * added to the TOC. In scenarios where templates generated the extension
 * and the extension is part of the template's wrapper, it is not possible
 * to reliably determine what part of the output came from extensions
 * (template wrapping clobbers that information). So, for now, the edge
 * case where extensions generate multiple DOM nodes (that also have
 * headings) is ignored. Later on, a single-wrapper-node requirement
 * may be enforced for extensions.
 *
 * @param Element $heading
 * @return bool
 */
private function shouldOmitFromTOC( Element $heading ): bool {
	for ( $ancestor = $heading->parentNode; $ancestor; $ancestor = $ancestor->parentNode ) {
		if ( WTUtils::isFirstExtensionWrapperNode( $ancestor ) ) {
			return true;
		}
	}

	return false;
}
181 | |
/**
 * Create a new section, attach it at the right nesting level, move the
 * heading into it and assign its section id.
 *
 * Structure for regular (editable or not) sections
 *   <section data-mw-section-id="..">
 *     <h*>..</h*>
 *     ..
 *   </section>
 *
 * Lead sections and pseudo-sections won't have <h*> or <div> tags
 *
 * @param Element|DocumentFragment $rootNode
 * @param array<Section> &$sectionStack
 * @param ?Section $currSection
 * @param Element $heading the heading node
 * @param int $newLevel
 * @param bool $pseudoSection
 * @return Section
 */
private function createNewSection(
	Node $rootNode, array &$sectionStack,
	?Section $currSection, Element $heading, int $newLevel,
	bool $pseudoSection
): Section {
	$newSection = new Section( $newLevel, $this->count++, $this->doc );

	// Step 1a. Unwind the stack until its top is a section that can
	// nest a section of the new level (or the stack is empty).
	for ( $depth = count( $sectionStack ); $depth > 0; $depth-- ) {
		if ( $sectionStack[$depth - 1]->hasNestedLevel( $newLevel ) ) {
			break;
		}
		array_pop( $sectionStack );
	}

	// Step 1b. The current section joins the stack if it can nest
	// the new level as well.
	if ( $currSection && $currSection->hasNestedLevel( $newLevel ) ) {
		$sectionStack[] = $currSection;
		$depth++;
	}

	// Step 2. Attach the new section to its parent section if there is
	// one, and directly to the root node (before the heading) otherwise.
	if ( $depth > 0 ) {
		$sectionStack[$depth - 1]->addSection( $newSection );
	} else {
		$rootNode->insertBefore( $newSection->container, $heading );
	}

	// Step 3. Add <h*> to the <section>
	$newSection->addNode( $heading );

	// Step 4. Assign data-mw-section-id attribute
	//
	// CX wants <section> tags with a distinguishing attribute so that
	// it can differentiate between its internal use of <section> tags
	// with what Parsoid adds. So, a data-mw-section-id attribute is
	// always added.
	//
	// data-mw-section-id = 0 for the lead section
	// data-mw-section-id = -1 for non-editable sections
	//     Note that templated content cannot be edited directly.
	// data-mw-section-id = -2 for pseudo sections
	// data-mw-section-id > 0 for everything else and this number
	//     matches PHP parser / MediaWiki's notion of that section.
	//
	// The code here handles uneditable sections because of templating.
	if ( $pseudoSection ) {
		$this->pseudoSectionCount++;
		$sectionId = -2;
	} elseif ( $this->tplInfo !== null ) {
		$sectionId = -1;
	} else {
		$sectionId = $this->sectionNumber;
	}
	$newSection->setId( $sectionId );

	// Pseudo sections and sections from extensions shouldn't show up in TOC
	if ( !$pseudoSection && !$this->shouldOmitFromTOC( $heading ) ) {
		$this->computeSectionMetadata( $newSection->metadata, $heading, $newLevel );
	}

	return $newSection;
}
267 | |
/**
 * Is this span free of element children and of text children that
 * contain anything other than whitespace?
 *
 * @param Element $span
 * @return bool
 */
private function isEmptySpan( Element $span ): bool {
	for ( $child = $span->firstChild; $child; $child = $child->nextSibling ) {
		if ( $child instanceof Element ) {
			return false;
		}
		if ( $child instanceof Text && !preg_match( '/^\s*$/D', $child->nodeValue ) ) {
			return false;
		}
	}
	return true;
}
280 | |
/**
 * Walk the children of $rootNode and add <section> wrappers where
 * required, recursing into non-heading elements.
 * This is the workhorse code that wrapSections relies on.
 *
 * @param ?Section $currSection
 * @param Element|DocumentFragment $rootNode
 * @return int The numerically smallest heading level seen among the
 *   direct children of $rootNode (7 if there was no heading)
 */
private function wrapSectionsInDOM(
	?Section $currSection, Node $rootNode
): int {
	// Since template wrapping is done and template wrappers are well-nested,
	// template state can be reset for every subtree.
	$tplInfo = null;
	$sectionStack = [];
	$highestSectionLevel = 7;

	$node = $rootNode->firstChild;
	while ( $node ) {
		$next = $node->nextSibling;
		$addedNode = false;
		$expandSectionBoundary = false;

		// Track entry into templated and extension output
		if ( !$this->tplInfo && WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
			'@phan-var Element $node'; // @var Element $node
			$tplInfo = new WrapSectionsTplInfo;
			$this->tplInfo = $tplInfo;
			$tplInfo->first = $node;
			// NOTE: $about could be null because of language variant markup!
			$about = DOMCompat::getAttribute( $node, 'about' );
			$tplInfo->about = $about;
			$aboutSiblings = WTUtils::getAboutSiblings( $node, $about );
			$tplInfo->last = end( $aboutSiblings );
			$this->aboutIdMap[$about] = $node;

			// Collect a sequence of rendering transparent nodes starting at $node.
			// The loop condition is defensive; one of the breaks is expected to fire.
			for ( ; $node; $node = $node->nextSibling ) {
				// If we hit the end of the template, we are done!
				// - If this is a heading, it is processed below.
				// - If not, the template never had a heading, so
				//   default section wrapping behavior continues.
				if ( $node === $tplInfo->last ) {
					break;
				}

				// If we hit a non-rendering-transparent node or a non-empty span,
				// we are done! The section boundary cannot expand any further.
				$isTransparent = WTUtils::isRenderingTransparentNode( $node ) || (
					DOMCompat::nodeName( $node ) === 'span' &&
					!WTUtils::isLiteralHTMLNode( $node ) &&
					$this->isEmptySpan( $node )
				);
				if ( !$isTransparent ) {
					break;
				}

				// Accumulate the rendering-transparent node and move on
				$tplInfo->rtContentNodes[] = $node;
			}

			if ( count( $tplInfo->rtContentNodes ) > 0 && DOMUtils::isHeading( $node ) ) {
				// In this scenario, the section boundary can be expanded to include
				// these nodes rather than start with the heading. This eliminates
				// unnecessary conflicts between section & template boundaries.
				$expandSectionBoundary = true;
				$next = $node->nextSibling;
			} else {
				// Reset to normal sectioning behavior!
				$node = $tplInfo->first;
				$tplInfo->rtContentNodes = [];
			}
		}

		if ( DOMUtils::isHeading( $node ) ) {
			'@phan-var Element $node'; // @var Element $node // headings are elements
			// The second character of the node name is the heading level
			$level = (int)DOMCompat::nodeName( $node )[1];

			$headingDp = DOMDataUtils::getDataParsoid( $node );
			// HTML <h*> tags get section wrappers, but the sections are
			// uneditable via the section editing API (section number -1).
			// Headings without a recorded heading index get -1 as well.
			//
			// Using the recorded index could be just `$this->sectionNumber++`
			// if T214538 were fixed in core; see T213468 where this
			// more-complicated behavior was added to match core's
			// eccentricities.
			$hasIndex = !WTUtils::isLiteralHTMLNode( $node ) &&
				isset( $headingDp->tmp->headingIndex );
			$this->sectionNumber = $hasIndex ? $headingDp->tmp->headingIndex : -1;

			$highestSectionLevel = min( $highestSectionLevel, $level );

			$currSection = $this->createNewSection(
				$rootNode, $sectionStack,
				$currSection, $node, $level, false
			);
			if ( $tplInfo && $expandSectionBoundary ) {
				// Pull the collected transparent nodes into the new
				// section, ahead of the heading.
				foreach ( $tplInfo->rtContentNodes as $transparentNode ) {
					$currSection->container->insertBefore( $transparentNode, $node );
				}
				$tplInfo->firstSection = $currSection;
			}
			$addedNode = true;
		} elseif ( $node instanceof Element ) {
			$nestedHighestSectionLevel = $this->wrapSectionsInDOM( null, $node );
			if ( $currSection && !$currSection->hasNestedLevel( $nestedHighestSectionLevel ) ) {
				// If we find a higher level nested section,
				// (a) Make current section non-editable
				// (b) There are 2 options here best illustrated with an example.
				//     Consider the wikitext below.
				//        <div>
				//        =1=
				//        b
				//        </div>
				//        c
				//        =2=
				//     1. Create a new pseudo-section to wrap '$node'
				//        There will be a <section> around the <div> which includes 'c'.
				//     2. Don't create the pseudo-section by setting '$currSection = null'
				//        But, this can leave some content outside any top-level section.
				//        'c' will not be in any section.
				// The code below implements strategy 1.
				$currSection->setId( -1 );
				$currSection = $this->createNewSection(
					$rootNode, $sectionStack,
					$currSection, $node, $nestedHighestSectionLevel, true
				);
				$addedNode = true;
			}
		}

		if ( !$addedNode && $currSection ) {
			$currSection->addNode( $node );
		}

		if ( $tplInfo ) {
			if ( $node === $tplInfo->first ) {
				$tplInfo->firstSection = $currSection;
			}

			// Track exit from templated output
			if ( $node === $tplInfo->last ) {
				if ( $tplInfo->firstSection !== $currSection ) {
					// The opening $node and closing $node of the template
					// are in different sections! This might require resolution.
					// While 'firstSection' could be null, if we get here,
					// 'lastSection' is guaranteed to always be non-null.
					$tplInfo->lastSection = $currSection;
					$this->tplsAndExtsToExamine[] = $tplInfo;
				}

				$tplInfo = null;
				$this->tplInfo = null;
			}
		}

		$node = $next;
	}

	// The last section embedded in a non-body DOM element
	// should always be marked non-editable since it will have
	// the closing tag (ex: </div>) showing up in the source editor
	// which cannot be supported in a visual editing environment.
	if ( $currSection && !DOMUtils::atTheTop( $rootNode ) ) {
		$currSection->setId( -1 );
	}

	return $highestSectionLevel;
}
451 | |
/**
 * Is this a Parsoid-inserted section (vs. a section node generated by
 * other page-components / content-generators like extensions)?
 * Parsoid's sections are <section> elements that carry a
 * data-mw-section-id attribute.
 *
 * @param Element $n
 * @return bool
 */
private static function isParsoidSection( Element $n ): bool {
	if ( DOMCompat::nodeName( $n ) !== 'section' ) {
		return false;
	}
	return $n->hasAttribute( 'data-mw-section-id' );
}
462 | |
/**
 * Find the closest ancestor of $n that is a Parsoid-inserted section.
 * An invariant assertion fails if there is no such ancestor.
 *
 * @param Node $n
 * @return Element
 */
private static function findSectionAncestor( Node $n ): Element {
	$candidate = DOMUtils::findAncestorOfName( $n, 'section' );
	while ( $candidate && !self::isParsoidSection( $candidate ) ) {
		// A <section> that Parsoid did not insert: keep climbing
		$candidate = DOMUtils::findAncestorOfName( $candidate, 'section' );
	}

	Assert::invariant( $candidate instanceof Element, "Expected to find Parsoid-section ancestor" );
	'@phan-var Element $candidate'; // @var Element $candidate
	return $candidate;
}
478 | |
/**
 * Get opening/closing DSR offset for the subtree rooted at $node.
 * This handles scenarios where $node is a section or template wrapper
 * and if a section, when it has leading/trailing non-element nodes
 * that don't have recorded DSR values.
 *
 * @param Element $node
 * @param bool $start true for the opening offset, false for the closing one
 * @return ?int -1 when $node is a Parsoid section without any
 *   element (or other non-text, non-comment) child
 */
private function getDSR( Element $node, bool $start ): ?int {
	if ( !self::isParsoidSection( $node ) ) {
		$dsr = DOMDataUtils::getDataParsoid( $node )->dsr ?? null;
		if ( !$dsr ) {
			// No DSR on the node itself: fall back to the DSR of the
			// element recorded for its about id.
			Assert::invariant(
				$node->hasAttribute( 'about' ),
				'Expected an about id'
			);
			$about = DOMCompat::getAttribute( $node, 'about' );
			$dsr = DOMDataUtils::getDataParsoid( $this->aboutIdMap[$about] )->dsr;
		}

		if ( $start ) {
			return $dsr->start;
		}
		return $dsr->end;
	}

	// For a section, scan inwards from the relevant edge, tallying the
	// source length of text and comments until a child is reached whose
	// offset can be looked up recursively.
	$skipped = 0;
	$child = $start ? $node->firstChild : $node->lastChild;
	for ( ; $child; $child = $start ? $child->nextSibling : $child->previousSibling ) {
		if ( $child instanceof Text ) {
			$skipped += strlen( $child->textContent );
			continue;
		}
		if ( $child instanceof Comment ) {
			$skipped += WTUtils::decodedCommentLength( $child );
			continue;
		}

		'@phan-var Element $child'; // @var Element $child
		$inner = $this->getDSR( $child, $start );
		if ( $inner === null ) {
			return null;
		}
		// Move the child's offset outwards over the skipped content
		return $start ? $inner - $skipped : $inner + $skipped;
	}

	return -1;
}
521 | |
/**
 * Append the source text between two offsets to $parts, if the range
 * is non-empty.
 *
 * FIXME: Duplicated with TableFixups code.
 *
 * @param list<string|TemplateInfo> &$parts
 * @param ?int $offset1
 * @param ?int $offset2
 * @throws InternalException if either offset is null
 */
private function fillDSRGap( array &$parts, ?int $offset1, ?int $offset2 ): void {
	if ( !isset( $offset1, $offset2 ) ) {
		throw new InternalException();
	}

	$gapLength = $offset2 - $offset1;
	if ( $gapLength > 0 ) {
		$src = $this->frame->getSrcText();
		$parts[] = PHPUtils::safeSubstr( $src, $offset1, $gapLength );
	}
}
537 | |
/**
 * Combine the encapsulation info of several tpl/ext wrappers onto a
 * single wrapper element.
 *
 * FIXME: There is strong overlap with TableFixups code.
 *
 * $wrapper will hold tpl/ext encap info for the array of tpls/exts as well as
 * content before, after and in between them. Right now, this will always be a
 * <section> node, but not asserting this since code doesn't depend on it being so.
 *
 * If the combined info cannot be built (a DSR offset is missing or none
 * of the wrappers is a transclusion), $wrapper's typeof is set to
 * 'mw:Placeholder' instead.
 *
 * @param Element $wrapper
 * @param array $encapWrappers
 */
private function collapseWrappers( Element $wrapper, array $encapWrappers ): void {
	$wrapperDp = DOMDataUtils::getDataParsoid( $wrapper );

	// Accumulators for the combined transclusion info set on $wrapper
	$parts = [];
	$piEntries = [];
	$tplIndex = 0;
	$lastDp = null;
	$sawTemplate = false;
	try {
		foreach ( $encapWrappers as $encapNode ) {
			$encapDp = DOMDataUtils::getDataParsoid( $encapNode );

			// Plug the DSR gap between the previous wrapper (or the start
			// of $wrapper, for the first one) and this wrapper
			$gapStart = $lastDp ? $lastDp->dsr->end : $wrapperDp->dsr->start;
			$this->fillDSRGap( $parts, $gapStart, $encapDp->dsr->start );

			if ( !DOMUtils::hasTypeOf( $encapNode, "mw:Transclusion" ) ) {
				// Where a non-template type is present, that segment is treated
				// as a "string" in the parts array. So, "mw:Transclusion" is
				// effectively treated as a generic type that covers a single
				// template as well as a run of segments where at least one
				// segment comes from a template but others may be from other
				// generators (ex: extensions).
				$this->fillDSRGap( $parts, $encapDp->dsr->start, $encapDp->dsr->end );
			} else {
				$sawTemplate = true;
				// Assimilate $encapNode's data-mw and data-parsoid pi info
				$encapDmw = DOMDataUtils::getDataMw( $encapNode );
				foreach ( $encapDmw->parts ?? [] as $part ) {
					if ( is_string( $part ) ) {
						$parts[] = $part;
						continue;
					}
					// Template index is relative to other transclusions.
					// This index is used to extract whitespace information from
					// data-parsoid and that array only includes info for
					// templates, which is why strings are not numbered.
					$renumbered = clone $part;
					$renumbered->i = $tplIndex++;
					$parts[] = $renumbered;
				}
				PHPUtils::pushArray( $piEntries, $encapDp->pi ?? [ [] ] );
			}

			$lastDp = $encapDp;
		}

		if ( !$sawTemplate ) {
			throw new InternalException();
		}

		DOMUtils::addTypeOf( $wrapper, "mw:Transclusion" );
		$wrapperDp->pi = $piEntries;
		// Plug the DSR gap between the last wrapper and the end of $wrapper
		$this->fillDSRGap( $parts, $lastDp->dsr->end, $wrapperDp->dsr->end );
		$combinedDmw = new DataMw( [] );
		$combinedDmw->parts = $parts;
		DOMDataUtils::setDataMw( $wrapper, $combinedDmw );
	} catch ( InternalException ) {
		// We don't have accurate template wrapping information.
		// Set typeof to 'mw:Placeholder' since 'mw:Transclusion'
		// typeof is not actionable without valid data-mw.
		//
		// FIXME:
		// 1. If we stop stripping section wrappers in the html->wt direction,
		//    we will need to add a DOMHandler for <section> or mw:Placeholder typeof
		//    on arbitrary Elements to traverse into children and serialize and
		//    prevent page corruption.
		// 2. This may be a good place to collect stats for T191641#6357136
		// 3. Maybe we need a special error typeof rather than mw:Placeholder
		$wrapper->setAttribute( 'typeof', 'mw:Placeholder' );
	}
}
621 | |
/**
 * Section wrappers and encapsulation wrappers can conflict because of
 * partial overlaps. This method identifies those conflicts and fixes up
 * the encapsulation by expanding those ranges as necessary.
 *
 * It works in two passes over the templates/extensions recorded by
 * wrapSectionsInDOM():
 * 1. For each one, find the sibling sections that span its first and
 *    last node, and record (or extend, or reuse) a range keyed by an
 *    about id.
 * 2. For each recorded range, stamp the about id on every section of
 *    the range, compute the range's DSR and collapse the encapsulation
 *    info of all affected wrappers onto the range's first section.
 */
private function resolveTplExtSectionConflicts(): void {
	$secRanges = [];
	'@phan-var array[] $secRanges';
	foreach ( $this->tplsAndExtsToExamine as $tplInfo ) {
		$s1 = $tplInfo->firstSection->container ??
			self::findSectionAncestor( $tplInfo->first );

		// guaranteed to be non-null
		$s2 = $tplInfo->lastSection->container;

		// Find a common ancestor of s1 and s2 (could be s1 or s2)
		$s2Ancestors = DOMUtils::pathToRoot( $s2 );
		$s1Ancestors = [];
		$ancestor = $s1;
		while ( !in_array( $ancestor, $s2Ancestors, true ) ) {
			$s1Ancestors[] = $ancestor;
			$ancestor = $ancestor->parentNode;
		}

		// ancestor is now the common ancestor of s1 and s2
		$s1Ancestors[] = $ancestor;

		// Set up start/end of the new encapsulation range
		if ( $ancestor === $s1 || $ancestor === $s2 ) {
			$start = $ancestor;
			$end = $ancestor;
		} else {
			// While creating a new section (see createNewSection), it only
			// gets added where its parent is either another section,
			// or body, so all ancestors are themselves sections, or body.
			//
			// $start / $end are the children of the common ancestor that
			// lie on the paths to s1 / s2 respectively.
			$start = $s1Ancestors[count( $s1Ancestors ) - 2];
			$i = array_search( $ancestor, $s2Ancestors, true );
			$end = $s2Ancestors[$i - 1];
		}

		'@phan-var Element $start'; // @var Element $start
		'@phan-var Element $end'; // @var Element $end

		// Add new OR update existing range
		if ( $start->hasAttribute( 'about' ) ) {
			// Overlaps with an existing range.
			$about = DOMCompat::getAttribute( $start, 'about' );
			if ( !$end->hasAttribute( 'about' ) ) {
				// Extend existing range till $end
				$secRanges[$about]['end'] = $end;
				$end->setAttribute( 'about', $about );
			} else {
				Assert::invariant( DOMCompat::getAttribute( $end, 'about' ) === $about,
					"Expected end-range about id to be $about instead of " .
					DOMCompat::getAttribute( $end, 'about' ) . " in the overlap scenario." );
			}
		} else {
			// Check for nesting in another range. Since $start and $end
			// are siblings, this is sufficient to know the entire range
			// is nested
			$about = null;
			$body = DOMCompat::getBody( $start->ownerDocument );
			// Only Element ancestors are examined: when the tree being
			// processed is rooted at a DocumentFragment, the walk never
			// meets <body>, and has to stop at the fragment instead of
			// handing a non-Element (or null) to isParsoidSection().
			$outer = $start->parentNode;
			while ( $outer instanceof Element && $outer !== $body ) {
				if ( self::isParsoidSection( $outer ) && $outer->hasAttribute( 'about' ) ) {
					$about = DOMCompat::getAttribute( $outer, 'about' );
					break;
				}
				$outer = $outer->parentNode;
			}

			if ( !$about ) {
				// Not overlapping, not nested => new range
				$about = $this->env->newAboutId();
				$start->setAttribute( 'about', $about );
				$end->setAttribute( 'about', $about );
				$secRanges[$about] = [ 'start' => $start, 'end' => $end, 'encapWrappers' => [] ];
			}
		}
		$secRanges[$about]['encapWrappers'][] = $tplInfo->first;
	}

	// Process recorded ranges into new encapsulation information
	// that spans all content in that range.
	foreach ( $secRanges as $about => $range ) {
		// Ensure that all top level nodes of the range have the same about id
		for ( $n = $range['start']; $n !== $range['end']->nextSibling; $n = $n->nextSibling ) {
			Assert::invariant( self::isParsoidSection( $n ),
				"Encountered non-Parsoid-section node (" .
				DOMCompat::nodeName( $n ) .
				") while updating template wrappers" );
			$n->setAttribute( 'about', $about );
		}

		$dsr1 = $this->getDSR( $range['start'], true ); // Traverses non-tpl content => will succeed
		$dsr2 = $this->getDSR( $range['end'], false ); // Traverses non-tpl content => will succeed
		$dp = new DataParsoid;
		$dp->dsr = new DomSourceRange( $dsr1, $dsr2, null, null );
		DOMDataUtils::setDataParsoid( $range['start'], $dp );

		$this->collapseWrappers( $range['start'], $range['encapWrappers'] );
	}
}
729 | |
/**
 * Convert the codepointOffset of every TOC section that has one from
 * the environment's current offset type to 'char' offsets, in one batch.
 */
private function convertTOCOffsets(): void {
	// Gather references to all the non-null codepointOffsets so that
	// the conversion can update them in place
	$offsetRefs = [];
	$tocSections = $this->env->getTOCData()->getSections();
	foreach ( $tocSections as $tocSection ) {
		if ( $tocSection->codepointOffset === null ) {
			continue;
		}
		$offsetRefs[] = &$tocSection->codepointOffset;
	}

	$srcText = $this->env->topFrame->getSrcText();
	$fromType = $this->env->getCurrentOffsetType();
	TokenUtils::convertOffsets( $srcText, $fromType, 'char', $offsetRefs );
}
745 | |
/**
 * Find the first heading element, in document order, starting at $elt
 * and looking through its following siblings and their descendants.
 * Extension content is skipped over.
 *
 * In core, Parser.php adds a TOC marker before the *first* heading element
 * independent of how that heading element is nested. In the common case,
 * that insertion point corresponds to the last element of the lead section
 * as computed by section wrapping code in this file. In the edge case, when
 * a <div> wraps the heading, the insertion point lies inside the <div> and
 * has no relation to the lead section.
 *
 * @param Node $elt
 * @return ?Element The heading, or null if there is none
 */
private static function findTOCInsertionPoint( Node $elt ): ?Element {
	$cursor = $elt;
	while ( $cursor ) {
		// Ignore extension content while finding TOC insertion point
		if ( WTUtils::isFirstExtensionWrapperNode( $cursor ) ) {
			$cursor = WTUtils::skipOverEncapsulatedContent( $cursor );
			continue;
		}

		if ( $cursor instanceof Element ) {
			if ( DOMUtils::isHeading( $cursor ) ) {
				return $cursor;
			}
			// Not a heading itself: look for one among its descendants
			$nested = $cursor->firstChild
				? self::findTOCInsertionPoint( $cursor->firstChild )
				: null;
			if ( $nested ) {
				return $nested;
			}
		}

		$cursor = $cursor->nextSibling;
	}
	return null;
}
775 | |
/**
 * Insert a synthetic section in which to place the TOC.
 *
 * A new pseudo section (data-mw-section-id="-2") holding
 * $syntheticTocMeta is inserted right before $insertionPoint. If that
 * lands in the middle of a transclusion, the about id of the
 * transclusion is copied over so that template continuity isn't broken.
 *
 * @param Element $syntheticTocMeta
 * @param Element $insertionPoint
 * @return Element The newly created <section>
 */
private function insertSyntheticSection(
	Element $syntheticTocMeta, Element $insertionPoint
): Element {
	// Captured before the insertion below changes the sibling chain
	$prev = $insertionPoint->previousSibling;

	// Create a pseudo-section containing the TOC
	$syntheticTocSection = $this->doc->createElement( 'section' );
	$syntheticTocSection->setAttribute( 'data-mw-section-id', '-2' );
	$insertionPoint->parentNode->insertBefore( $syntheticTocSection, $insertionPoint );
	$this->pseudoSectionCount++;
	$syntheticTocSection->appendChild( $syntheticTocMeta );

	// Ensure template continuity is not broken!
	// If $prev is not an encapsulation wrapper, nothing to do!
	if ( $prev && WTUtils::isEncapsulationWrapper( $prev ) ) {
		'@phan-var Element $prev';
		$prevAbout = DOMCompat::getAttribute( $prev, 'about' );

		// An encapsulation wrapper need not have an about id. Without
		// one, there is nothing to propagate; comparing null to null
		// below would report a match and try to set a null attribute
		// value, which is a TypeError under strict types.
		if ( $prevAbout !== null ) {
			// First, handle the case of section-tag-stripping that VE does:
			// compare against the first node inside $insertionPoint.
			// If the about ids are different, $next & $prev belong to
			// different transclusions and the TOC meta can be left alone.
			$next = $insertionPoint->firstChild;
			$nextAbout = $next instanceof Element ? DOMCompat::getAttribute( $next, 'about' ) : null;
			if ( $prevAbout === $nextAbout ) {
				$syntheticTocMeta->setAttribute( 'about', $prevAbout );
			}

			// Now handle the case of section-tags not being stripped.
			// NOTE that $syntheticTocSection sits right before
			// $insertionPoint, which is what it is compared against here.
			$nextAbout = DOMCompat::getAttribute( $insertionPoint, 'about' );
			if ( $prevAbout === $nextAbout ) {
				$syntheticTocSection->setAttribute( 'about', $prevAbout );
			}
		}
	}

	return $syntheticTocSection;
}
820 | |
/**
 * Set the TOC-related output flags and, when a TOC is to be shown but the
 * page has no 'toc' behavior switch, add an auto-generated TOC marker
 * <meta> to the section preceding the one that holds the first heading
 * (creating a synthetic section if there is none).
 *
 * Nothing is done when the page config suppresses the TOC.
 */
private function addSyntheticTOCMarker(): void {
	$tocBS = $this->env->getBehaviorSwitch( 'toc' );
	$noTocBS = $this->env->getBehaviorSwitch( 'notoc' );
	$forceTocBS = $this->env->getBehaviorSwitch( 'forcetoc' );

	// $this->count is initialized to 1
	$numHeadings = $this->count - 1 - $this->pseudoSectionCount;

	if ( $forceTocBS ) {
		// forcetoc trumps both notoc and the heading-count threshold
		$showToc = true;
		$enoughToc = true;
	} else {
		// notoc only wins when toc is not requested as well
		$showToc = $tocBS || !$noTocBS;
		$enoughToc = $showToc && ( $numHeadings >= 4 || $tocBS );
	}
	// Without any headings, there is never enough content for a TOC
	$enoughToc = $enoughToc && $numHeadings != 0;

	if ( $this->env->getPageConfig()->getSuppressTOC() ) {
		return;
	}

	if ( !$enoughToc ) {
		// $enoughToc implies $showToc, so this flag can only ever be
		// needed on this path.
		if ( $numHeadings > 0 && !$showToc ) {
			// ParserOutputFlags::NO_TOC
			$this->env->getMetadata()->setOutputFlag( 'no-toc' );
		}
		return;
	}

	// ParserOutputFlags::SHOW_TOC
	$this->env->getMetadata()->setOutputFlag( 'show-toc' );
	if ( $tocBS ) {
		// With a 'toc' behavior switch, no synthetic marker is added
		return;
	}

	$tocMeta = $this->doc->createElement( 'meta' );
	$tocMeta->setAttribute( 'property', 'mw:PageProp/toc' );
	DOMDataUtils::getDataMw( $tocMeta )->autoGenerated = true;

	$firstHeading = self::findTOCInsertionPoint( DOMCompat::getBody( $this->doc ) );
	if ( $firstHeading === null ) {
		// should not happen, but nothing to do here!
		return;
	}

	// NOTE: Given how <section>s are computed in this file, headings
	// will never have previous siblings. So, it is always the previous
	// sibling of the heading's section that is examined.
	$headingSection = self::findSectionAncestor( $firstHeading );

	$container = $headingSection->previousSibling;
	$precededBySection = $container && DOMCompat::nodeName( $container ) === 'section';
	if ( !$precededBySection ) {
		$container = $this->insertSyntheticSection( $tocMeta, $headingSection );
	}
	$container->appendChild( $tocMeta );

	// Set a synthetic zero-length dsr to suppress noisy warnings
	// from the round trip testing script.
	$headingStart = DOMDataUtils::getDataParsoid( $firstHeading )->dsr->start ?? null;
	if ( $headingStart !== null ) {
		$metaDp = DOMDataUtils::getDataParsoid( $tocMeta );
		$metaDp->dsr = new DomSourceRange( $headingStart, $headingStart, 0, 0 );
	}
}
884 | |
/**
 * DOM postprocessor entry point: walks the DOM rooted at the root node
 * and adds <section> wrappers as necessary.
 *
 * Implements the algorithm documented @ mw:Parsing/Notes/Section_Wrapping
 */
public function run(): void {
	// The lead section gets level 6, the lowest possible level, since we
	// don't want any nesting of h-tags in the lead section.
	$lead = new Section( 6, 0, $this->doc );
	$lead->setId( 0 );

	$this->wrapSectionsInDOM( $lead, $this->rootNode );

	// A lead section is always emitted, even when all it holds is
	// whitespace + comments. It becomes the first child of the root.
	$this->rootNode->insertBefore( $lead->container, $this->rootNode->firstChild );

	// Template conflicts can only be resolved once every section is
	// in the DOM.
	$this->resolveTplExtSectionConflicts();

	// Convert byte offsets to codepoint offsets in TOCData
	// (done in a batch to avoid O(N^2) string traversals)
	$this->convertTOCOffsets();

	$this->addSyntheticTOCMarker();
}
911 | } |