Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
79.36% |
296 / 373 |
|
27.78% |
5 / 18 |
CRAP | |
0.00% |
0 / 1 |
WrapSectionsState | |
79.36% |
296 / 373 |
|
27.78% |
5 / 18 |
305.53 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
1 | |||
computeSectionMetadata | |
76.32% |
29 / 38 |
|
0.00% |
0 / 1 |
11.33 | |||
shouldOmitFromTOC | |
83.33% |
5 / 6 |
|
0.00% |
0 / 1 |
3.04 | |||
createNewSection | |
78.26% |
18 / 23 |
|
0.00% |
0 / 1 |
12.24 | |||
isEmptySpan | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
30 | |||
wrapSectionsInDOM | |
88.00% |
66 / 75 |
|
0.00% |
0 / 1 |
32.66 | |||
isParsoidSection | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
findSectionAncestor | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
getDSR | |
54.55% |
12 / 22 |
|
0.00% |
0 / 1 |
22.36 | |||
fillDSRGap | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
4.25 | |||
collapseWrappers | |
84.38% |
27 / 32 |
|
0.00% |
0 / 1 |
8.24 | |||
resolveTplExtSectionConflicts | |
80.36% |
45 / 56 |
|
0.00% |
0 / 1 |
14.28 | |||
convertTOCOffsets | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
3 | |||
findTOCInsertionPoint | |
84.62% |
11 / 13 |
|
0.00% |
0 / 1 |
7.18 | |||
insertSyntheticSection | |
94.12% |
16 / 17 |
|
0.00% |
0 / 1 |
7.01 | |||
addSyntheticTOCMarker | |
83.78% |
31 / 37 |
|
0.00% |
0 / 1 |
15.96 | |||
addSectionInfo | |
40.00% |
6 / 15 |
|
0.00% |
0 / 1 |
7.46 | |||
run | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
1 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html\DOM\Processors; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Assert\UnreachableException; |
8 | use Wikimedia\Parsoid\Config\Env; |
9 | use Wikimedia\Parsoid\Core\DomSourceRange; |
10 | use Wikimedia\Parsoid\Core\InternalException; |
11 | use Wikimedia\Parsoid\Core\SectionMetadata; |
12 | use Wikimedia\Parsoid\DOM\Comment; |
13 | use Wikimedia\Parsoid\DOM\Document; |
14 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
15 | use Wikimedia\Parsoid\DOM\Element; |
16 | use Wikimedia\Parsoid\DOM\Node; |
17 | use Wikimedia\Parsoid\DOM\Text; |
18 | use Wikimedia\Parsoid\NodeData\DataMw; |
19 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
20 | use Wikimedia\Parsoid\NodeData\TemplateInfo; |
21 | use Wikimedia\Parsoid\Utils\DOMCompat; |
22 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
23 | use Wikimedia\Parsoid\Utils\DOMUtils; |
24 | use Wikimedia\Parsoid\Utils\PHPUtils; |
25 | use Wikimedia\Parsoid\Utils\TokenUtils; |
26 | use Wikimedia\Parsoid\Utils\Utils; |
27 | use Wikimedia\Parsoid\Utils\WTUtils; |
28 | use Wikimedia\Parsoid\Wt2Html\Frame; |
29 | |
30 | class WrapSectionsState { |
31 | private Env $env; |
32 | private Frame $frame; |
33 | |
34 | /** @var Element|DocumentFragment */ |
35 | private $rootNode; |
36 | |
37 | /** |
38 | * The next section debug ID |
39 | */ |
40 | private int $count = 1; |
41 | |
42 | /** |
43 | * Pseudo section count is needed to determine TOC rendering |
44 | */ |
45 | private int $pseudoSectionCount = 0; |
46 | private Document $doc; |
47 | |
48 | /** |
49 | * Map of about ID to first element |
50 | * @var Element[] |
51 | */ |
52 | private array $aboutIdMap = []; |
53 | private int $sectionNumber = 0; |
54 | private ?WrapSectionsTplInfo $tplInfo = null; |
55 | |
56 | /** @var WrapSectionsTplInfo[] */ |
57 | private array $tplsAndExtsToExamine = []; |
58 | private int $oldLevel = 0; |
59 | |
/**
 * @param Env $env Parsing environment
 * @param Frame $frame Frame whose source text is read when filling DSR gaps
 * @param Node $rootNode Root of the subtree to wrap; the owning document
 *   is taken from this node
 */
public function __construct( Env $env, Frame $frame, Node $rootNode ) {
    $this->rootNode = $rootNode;
    $this->doc = $rootNode->ownerDocument;
    $this->frame = $frame;
    $this->env = $env;
}
70 | |
/**
 * Populate the TOC-related fields of a section's metadata from its heading.
 *
 * Unless the page config suppresses the TOC, the metadata is also
 * registered with the environment's TOC data. $this->oldLevel is
 * updated to $newLevel in every case.
 *
 * @param SectionMetadata $metadata Metadata object to fill in
 * @param Element $heading Heading element that opens the section
 * @param int $newLevel Level of $heading
 */
private function computeSectionMetadata(
    SectionMetadata $metadata, Element $heading, int $newLevel
): void {
    $suppressTOC = $this->env->getPageConfig()->getSuppressTOC();
    if ( !$suppressTOC ) {
        $toc = $this->env->getTOCData();
        $toc->addSection( $metadata );
        $toc->processHeading( $this->oldLevel, $newLevel, $metadata );
    }
    $this->oldLevel = $newLevel;

    if ( WTUtils::isLiteralHTMLNode( $heading ) ) {
        // Literal HTML tags in wikitext don't get section edit links
        $metadata->fromTitle = null;
        $metadata->index = '';
        $metadata->codepointOffset = null;
    } elseif ( $this->tplInfo !== null ) {
        $dataMw = DOMDataUtils::getDataMw( $this->tplInfo->first );
        // Match legacy parser
        $metadata->index = '';

        // A title is only picked in the single-template case below. It stays
        // null for extensions and language variants (no parts; the right
        // output there is still to be determined) and for multi-part content
        // (cannot pick a title).
        $fromTitle = null;
        $parts = $dataMw->parts ?? null;
        if ( $parts !== null && count( $parts ) <= 1 ) {
            $p0 = $parts[0];
            if ( !( $p0 instanceof TemplateInfo ) ) {
                throw new UnreachableException(
                    "a single part will always be a TemplateInfo not a string"
                );
            }
            // 'templatearg': since we currently don't process templates in
            // Parsoid, this has to be a top-level {{{...}}} and so the content
            // comes from the current page. But the legacy parser returns
            // 'false' for this, so we return null instead of the current
            // title. The legacy parser also returns null when there is no
            // href.
            if ( $p0->type !== 'templatearg' && !empty( $p0->href ) ) {
                // Pick template title, but strip leading "./" prefix
                $fromTitle = PHPUtils::stripPrefix(
                    Utils::decodeURIComponent( $p0->href ), './'
                );
                if ( $this->sectionNumber >= 0 ) {
                    // Legacy parser sets this to '' in some cases
                    // See "Templated sections (heading from template arg)" parser test
                    $metadata->index = 'T-' . $this->sectionNumber;
                }
            }
        }
        $metadata->fromTitle = $fromTitle;
        $metadata->codepointOffset = null;
    } else {
        // Use the dbkey (underscores) instead of text (spaces)
        $metadata->fromTitle = $this->env->getContextTitle()->getPrefixedDBKey();
        $metadata->index = (string)$this->sectionNumber;
        // Our DSR counts *are* byte counts, while this core interface
        // expects *codepoint* counts. These are converted in a batch
        // (for efficiency) in ::convertTOCOffsets()
        $metadata->codepointOffset = DOMDataUtils::getDataParsoid( $heading )->dsr->start ?? -1;
    }

    $metadata->anchor = DOMCompat::getAttribute( $heading, 'id' );
    $sectionInfo = DOMDataUtils::getDataParsoid( $heading )->getTemp()->section;
    $metadata->line = $sectionInfo['line'];
    $metadata->linkAnchor = $sectionInfo['linkAnchor'];
}
147 | |
/**
 * Should we omit this heading from the TOC?
 * Yes, if $heading is:
 * - generated by an extension (i.e. one of its ancestors is the first
 *   wrapper node of an extension)
 *
 * @param Element $heading
 * @return bool
 */
private function shouldOmitFromTOC( Element $heading ): bool {
    // NOTE: This assumes that extensions never emit a DOM forest and only
    // ever have a single wrapper node. While ExtensionHandler doesn't assume
    // that, this seems to be borne out in reality. If the assumption were
    // not true, we would be adding TOC entries from extension-generated
    // about siblings into the TOC.
    //
    // In scenarios where templates generated the extension and the extension
    // is part of the template's wrapper, we cannot reliably determine what
    // part of the output came from extensions (because the template wrapping
    // clobbers that information). So, for now, we ignore this edge case
    // where extensions generate multiple DOM nodes (that also have headings).
    // Later on, we may enforce a single-wrapper-node requirement for
    // extensions.
    for ( $ancestor = $heading->parentNode; $ancestor; $ancestor = $ancestor->parentNode ) {
        if ( WTUtils::isFirstExtensionWrapperNode( $ancestor ) ) {
            return true;
        }
    }

    return false;
}
176 | |
/**
 * Create a new section, attach it to its parent section (or to $rootNode),
 * move $heading into it and assign its section id.
 *
 * Structure for regular (editable or not) sections:
 *   <section data-mw-section-id="..">
 *     <h*>..</h*>
 *     ..
 *   </section>
 *
 * Lead sections and pseudo-sections won't have <h*> or <div> tags.
 *
 * @param Element|DocumentFragment $rootNode
 * @param array<Section> &$sectionStack Stack of enclosing sections; popped
 *   and pushed in place to reach the nesting level for $newLevel
 * @param ?Section $currSection Section that was current before this call
 * @param Element $heading The node that opens the section; a heading for
 *   regular sections, the wrapped element for pseudo-sections
 * @param int $newLevel
 * @param bool $pseudoSection
 * @return Section
 */
private function createNewSection(
    Node $rootNode, array &$sectionStack,
    ?Section $currSection, Element $heading, int $newLevel,
    bool $pseudoSection
): Section {
    $newSection = new Section( $newLevel, $this->count++, $this->doc );

    // Step 1a. Pop the stack until its top can nest a section of $newLevel.
    $depth = count( $sectionStack );
    while ( $depth > 0 && !$sectionStack[$depth - 1]->hasNestedLevel( $newLevel ) ) {
        array_pop( $sectionStack );
        $depth--;
    }

    // Step 1b. Push the current section if it is a higher-level section.
    if ( $currSection && $currSection->hasNestedLevel( $newLevel ) ) {
        $sectionStack[] = $currSection;
        $depth++;
    }

    // Step 2. Add the new section where it belongs: a parent section OR body.
    if ( $depth > 0 ) {
        $sectionStack[$depth - 1]->addSection( $newSection );
    } else {
        $rootNode->insertBefore( $newSection->container, $heading );
    }

    // Step 3. Add <h*> to the <section>.
    $newSection->addNode( $heading );

    // Step 4. Assign the data-mw-section-id attribute.
    //
    // CX wants <section> tags with a distinguishing attribute so that it can
    // differentiate between its internal use of <section> tags and what
    // Parsoid adds. So, a data-mw-section-id attribute is always added:
    //
    //   0   for the lead section
    //   -1  for non-editable sections
    //       (note that templated content cannot be edited directly)
    //   -2  for pseudo sections
    //   >0  for everything else; this number matches PHP parser /
    //       MediaWiki's notion of that section.
    //
    // The code here handles sections made uneditable by templating.
    if ( $pseudoSection ) {
        $this->pseudoSectionCount++;
        $newSection->setId( -2 );
    } else {
        $newSection->setId( $this->tplInfo !== null ? -1 : $this->sectionNumber );

        // Sections from extensions shouldn't show up in TOC
        if ( !$this->shouldOmitFromTOC( $heading ) ) {
            $this->computeSectionMetadata( $newSection->metadata, $heading, $newLevel );
        }
    }

    return $newSection;
}
262 | |
/**
 * Does $span contain nothing but whitespace-only text?
 * Child nodes that are neither elements nor text are ignored.
 *
 * @param Element $span
 * @return bool
 */
private function isEmptySpan( Element $span ): bool {
    for ( $child = $span->firstChild; $child; $child = $child->nextSibling ) {
        if ( $child instanceof Element ) {
            return false;
        }
        if ( $child instanceof Text && !preg_match( '/^\s*$/D', $child->nodeValue ) ) {
            return false;
        }
    }
    return true;
}
275 | |
/**
 * Walk the children of $rootNode and add <section> wrappers where required,
 * recursing into non-heading elements.
 * This is the workhorse code that section wrapping relies on.
 *
 * @param ?Section $currSection Section that is open on entry, if any
 * @param Element|DocumentFragment $rootNode
 * @return int The numerically smallest heading level found among the direct
 *   children of $rootNode, or 7 if none of them is a heading
 */
private function wrapSectionsInDOM(
    ?Section $currSection, Node $rootNode
): int {
    // Since template wrapping is done and template wrappers are well-nested,
    // we can reset template state for every subtree.
    $tplInfo = null;
    $sectionStack = [];
    $highestSectionLevel = 7;
    $node = $rootNode->firstChild;
    while ( $node ) {
        $next = $node->nextSibling;
        $addedNode = false;
        $expandSectionBoundary = false;

        // Track entry into templated and extension output
        if ( !$this->tplInfo && WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
            DOMUtils::assertElt( $node );
            $this->tplInfo = $tplInfo = new WrapSectionsTplInfo;
            $tplInfo->first = $node;
            $about = DOMCompat::getAttribute( $node, 'about' );
            // NOTE: could be null because of language variant markup!
            $tplInfo->about = $about;
            $aboutSiblings = WTUtils::getAboutSiblings( $node, $about );
            $tplInfo->last = end( $aboutSiblings );
            // Only record wrappers that actually have an about id: a null
            // array key would be coerced to '' and create a bogus map entry.
            if ( $about !== null ) {
                $this->aboutIdMap[$about] = $node;
            }

            // Collect a sequence of rendering transparent nodes starting at $node.
            // This could be while ( true ), but being defensive.
            while ( $node ) {
                // If we hit the end of the template, we are done!
                // - If this is a heading, we'll process it below.
                // - If not, the template never had a heading, so
                //   we can continue default section wrapping behavior.
                if ( $tplInfo->last === $node ) {
                    break;
                }

                // If we hit a non-rendering-transparent node or a non-empty span,
                // we are done! We cannot expand the section boundary any further.
                $canSkipOver = WTUtils::isRenderingTransparentNode( $node ) ||
                    (
                        DOMCompat::nodeName( $node ) === 'span' &&
                        !WTUtils::isLiteralHTMLNode( $node ) &&
                        $this->isEmptySpan( $node )
                    );
                if ( !$canSkipOver ) {
                    break;
                }

                // Accumulate the rendering-transparent node and loop
                $tplInfo->rtContentNodes[] = $node;
                $node = $node->nextSibling;
            }

            if ( count( $tplInfo->rtContentNodes ) > 0 && DOMUtils::isHeading( $node ) ) {
                // In this scenario, we can expand the section boundary to include these nodes
                // rather than start with the heading. This eliminates unnecessary conflicts
                // between section & template boundaries.
                $expandSectionBoundary = true;
                $next = $node->nextSibling;
            } else {
                // Reset to normal sectioning behavior!
                $node = $tplInfo->first;
                $tplInfo->rtContentNodes = [];
            }
        }

        if ( DOMUtils::isHeading( $node ) ) {
            DOMUtils::assertElt( $node ); // headings are elements
            // Node name is h1..h6; the second character is the level
            $level = (int)DOMCompat::nodeName( $node )[1];

            $dp = DOMDataUtils::getDataParsoid( $node );
            if ( WTUtils::isLiteralHTMLNode( $node ) ) {
                // HTML <h*> tags get section wrappers, but the sections are uneditable
                // via the section editing API.
                $this->sectionNumber = -1;
            } elseif ( isset( $dp->tmp->headingIndex ) ) {
                // This could be just `$this->sectionNumber++` without the
                // complicated if-guard if T214538 were fixed in core;
                // see T213468 where this more-complicated behavior was
                // added to match core's eccentricities.
                $this->sectionNumber = $dp->tmp->headingIndex;
            }
            if ( $level < $highestSectionLevel ) {
                $highestSectionLevel = $level;
            }
            $currSection = $this->createNewSection(
                $rootNode, $sectionStack,
                $currSection, $node, $level, false
            );
            if ( $tplInfo && $expandSectionBoundary ) {
                // Pull the skipped-over nodes into the new section, ahead of the heading
                foreach ( $tplInfo->rtContentNodes as $rtn ) {
                    $currSection->container->insertBefore( $rtn, $node );
                }
                $tplInfo->firstSection = $currSection;
            }
            $addedNode = true;
        } elseif ( $node instanceof Element ) {
            $nestedHighestSectionLevel = $this->wrapSectionsInDOM( null, $node );
            if ( $currSection && !$currSection->hasNestedLevel( $nestedHighestSectionLevel ) ) {
                // If we find a higher level nested section,
                // (a) Make current section non-editable
                // (b) There are 2 options here best illustrated with an example.
                //     Consider the wikitext below.
                //        <div>
                //        =1=
                //        b
                //        </div>
                //        c
                //        =2=
                //     1. Create a new pseudo-section to wrap '$node'
                //        There will be a <section> around the <div> which includes 'c'.
                //     2. Don't create the pseudo-section by setting '$currSection = null'
                //        But, this can leave some content outside any top-level section.
                //        'c' will not be in any section.
                //     The code below implements strategy 1.
                $currSection->setId( -1 );
                $currSection = $this->createNewSection(
                    $rootNode, $sectionStack,
                    $currSection, $node, $nestedHighestSectionLevel, true
                );
                $addedNode = true;
            }
        }

        if ( $currSection && !$addedNode ) {
            $currSection->addNode( $node );
        }

        if ( $tplInfo && $tplInfo->first === $node ) {
            $tplInfo->firstSection = $currSection;
        }

        // Track exit from templated output
        if ( $tplInfo && $tplInfo->last === $node ) {
            if ( $currSection !== $tplInfo->firstSection ) {
                // The opening $node and closing $node of the template
                // are in different sections! This might require resolution.
                // While 'firstSection' could be null, if we get here,
                // 'lastSection' is guaranteed to always be non-null.
                $tplInfo->lastSection = $currSection;
                $this->tplsAndExtsToExamine[] = $tplInfo;
            }

            $this->tplInfo = $tplInfo = null;
        }

        $node = $next;
    }

    // The last section embedded in a non-body DOM element
    // should always be marked non-editable since it will have
    // the closing tag (ex: </div>) showing up in the source editor
    // which we cannot support in a visual editing environment.
    if ( $currSection && !DOMUtils::atTheTop( $rootNode ) ) {
        $currSection->setId( -1 );
    }

    return $highestSectionLevel;
}
444 | |
/**
 * Is this a Parsoid-inserted section (vs. a section node generated by
 * other page-components / content-generators like extensions)?
 * Parsoid's own sections are recognized by their data-mw-section-id attribute.
 *
 * @param Element $n
 * @return bool
 */
private static function isParsoidSection( Element $n ): bool {
    if ( DOMCompat::nodeName( $n ) !== 'section' ) {
        return false;
    }
    return $n->hasAttribute( 'data-mw-section-id' );
}
455 | |
/**
 * Find the nearest ancestor that is a Parsoid-inserted section.
 * Fails an invariant assertion if there is none.
 *
 * @param Node $n
 * @return Element
 */
private static function findSectionAncestor( Node $n ): Element {
    $ancestor = DOMUtils::findAncestorOfName( $n, 'section' );
    // Skip over <section> elements that did not come from Parsoid
    while ( $ancestor && !self::isParsoidSection( $ancestor ) ) {
        $ancestor = DOMUtils::findAncestorOfName( $ancestor, 'section' );
    }

    Assert::invariant( $ancestor instanceof Element, "Expected to find Parsoid-section ancestor" );
    return $ancestor;
}
470 | |
/**
 * Get the opening or closing DSR offset for the subtree rooted at $node.
 *
 * For a non-section node, its own DSR is used; if it has none, the DSR of
 * the element recorded in $this->aboutIdMap for its about id is used.
 *
 * For a Parsoid section, children are scanned from the relevant edge.
 * Leading/trailing text and comment nodes have no recorded DSR, so their
 * lengths are accumulated and applied to the offset of the first element
 * that is reached.
 *
 * @param Element $node
 * @param bool $start True for the opening offset, false for the closing one
 * @return ?int -1 if a section has no element children
 */
private function getDSR( Element $node, bool $start ): ?int {
    if ( !self::isParsoidSection( $node ) ) {
        $dsr = DOMDataUtils::getDataParsoid( $node )->dsr ?? null;
        if ( !$dsr ) {
            Assert::invariant(
                $node->hasAttribute( 'about' ),
                'Expected an about id'
            );
            $about = DOMCompat::getAttribute( $node, 'about' );
            $dsr = DOMDataUtils::getDataParsoid( $this->aboutIdMap[$about] )->dsr;
        }

        if ( $start ) {
            return $dsr->start;
        }
        return $dsr->end;
    }

    $skipped = 0;
    $child = $start ? $node->firstChild : $node->lastChild;
    while ( $child ) {
        if ( !( $child instanceof Text ) && !( $child instanceof Comment ) ) {
            DOMUtils::assertElt( $child );
            $edge = $this->getDSR( $child, $start );
            if ( $edge === null ) {
                return null;
            }
            // Widen the range to cover the nodes skipped on the way here
            return $start ? $edge - $skipped : $edge + $skipped;
        }

        if ( $child instanceof Text ) {
            $skipped += strlen( $child->textContent );
        } else {
            $skipped += WTUtils::decodedCommentLength( $child );
        }
        $child = $start ? $child->nextSibling : $child->previousSibling;
    }

    return -1;
}
513 | |
/**
 * Append the source text between two offsets to $parts.
 * Nothing is appended unless $offset1 is smaller than $offset2.
 *
 * FIXME: Duplicated with TableFixups code.
 *
 * @param list<string|TemplateInfo> &$parts
 * @param ?int $offset1 Offset where the gap starts
 * @param ?int $offset2 Offset where the gap ends
 * @throws InternalException if either offset is null
 */
private function fillDSRGap( array &$parts, ?int $offset1, ?int $offset2 ): void {
    if ( $offset1 === null || $offset2 === null ) {
        throw new InternalException();
    }
    if ( $offset1 >= $offset2 ) {
        return;
    }
    $src = $this->frame->getSrcText();
    $parts[] = PHPUtils::safeSubstr( $src, $offset1, $offset2 - $offset1 );
}
529 | |
/**
 * Merge the encapsulation info of several tpl/ext wrappers into $wrapper.
 *
 * $wrapper will hold tpl/ext encap info for the array of tpls/exts as well as
 * content before, after and in between them. Right now, this will always be a
 * <section> node, but not asserting this since code doesn't depend on it being so.
 *
 * If the combined info cannot be built (a DSR offset is missing, or none of
 * the wrappers is a transclusion), $wrapper's typeof is set to
 * 'mw:Placeholder' instead.
 *
 * FIXME: There is strong overlap with TableFixups code.
 *
 * @param Element $wrapper
 * @param array $encapWrappers
 */
private function collapseWrappers( Element $wrapper, array $encapWrappers ): void {
    $wrapperDp = DOMDataUtils::getDataParsoid( $wrapper );

    // Combined transclusion info that will be set on $wrapper
    $combinedParts = [];
    $combinedPi = [];
    $tplIndex = 0;
    $prevDp = null;
    $sawTransclusion = false;
    try {
        foreach ( $encapWrappers as $encapNode ) {
            $dp = DOMDataUtils::getDataParsoid( $encapNode );

            // Plug the DSR gap between the previous wrapper (or the start of
            // $wrapper for the first one) and this wrapper
            $gapStart = $prevDp ? $prevDp->dsr->end : $wrapperDp->dsr->start;
            $this->fillDSRGap( $combinedParts, $gapStart, $dp->dsr->start );

            if ( !DOMUtils::hasTypeOf( $encapNode, "mw:Transclusion" ) ) {
                // Where a non-template type is present, we are going to treat that
                // segment as a "string" in the parts array. So, we effectively treat
                // "mw:Transclusion" as a generic type that covers a single template
                // as well as a run of segments where at least one segment comes from
                // a template but others may be from other generators (ex: extensions).
                $this->fillDSRGap( $combinedParts, $dp->dsr->start, $dp->dsr->end );
            } else {
                $sawTransclusion = true;
                // Assimilate $encapNode's data-mw and data-parsoid pi info
                $dmw = DOMDataUtils::getDataMw( $encapNode );
                foreach ( $dmw->parts ?? [] as $part ) {
                    // Template index is relative to other transclusions.
                    // This index is used to extract whitespace information from
                    // data-parsoid and that array only includes info for templates.
                    // So skip over strings here.
                    if ( !is_string( $part ) ) {
                        $part = clone $part;
                        $part->i = $tplIndex++;
                    }
                    $combinedParts[] = $part;
                }
                PHPUtils::pushArray( $combinedPi, $dp->pi ?? [ [] ] );
            }

            $prevDp = $dp;
        }

        if ( !$sawTransclusion ) {
            throw new InternalException();
        }

        DOMUtils::addTypeOf( $wrapper, "mw:Transclusion" );
        $wrapperDp->pi = $combinedPi;
        // Plug the gap between the last wrapper and the end of $wrapper
        $this->fillDSRGap( $combinedParts, $prevDp->dsr->end, $wrapperDp->dsr->end );
        $dataMw = new DataMw( [] );
        $dataMw->parts = $combinedParts;
        DOMDataUtils::setDataMw( $wrapper, $dataMw );
    } catch ( InternalException $e ) {
        // We don't have accurate template wrapping information.
        // Set typeof to 'mw:Placeholder' since 'mw:Transclusion'
        // typeof is not actionable without valid data-mw.
        //
        // FIXME:
        // 1. If we stop stripping section wrappers in the html->wt direction,
        //    we will need to add a DOMHandler for <section> or mw:Placeholder typeof
        //    on arbitrary Elements to traverse into children and serialize and
        //    prevent page corruption.
        // 2. This may be a good place to collect stats for T191641#6357136
        // 3. Maybe we need a special error typeof rather than mw:Placeholder
        $wrapper->setAttribute( 'typeof', 'mw:Placeholder' );
    }
}
613 | |
/**
 * Section wrappers and encapsulation wrappers can conflict because of
 * partial overlaps. This method identifies those conflicts and fixes up
 * the encapsulation by expanding those ranges as necessary.
 *
 * It processes the entries collected in $this->tplsAndExtsToExamine: for
 * each, it finds the sibling sections spanning the template's first and
 * last section, groups them into ranges keyed by about id, and then moves
 * the combined encapsulation info onto the first section of each range.
 */
private function resolveTplExtSectionConflicts(): void {
    $secRanges = [];
    '@phan-var array[] $secRanges';
    foreach ( $this->tplsAndExtsToExamine as $tplInfo ) {
        $s1 = $tplInfo->firstSection->container ??
            self::findSectionAncestor( $tplInfo->first );

        // guaranteed to be non-null
        $s2 = $tplInfo->lastSection->container;

        // Find a common ancestor of s1 and s2 (could be s1 or s2)
        $s2Ancestors = DOMUtils::pathToRoot( $s2 );
        $s1Ancestors = [];
        $ancestor = $s1;
        while ( !in_array( $ancestor, $s2Ancestors, true ) ) {
            $s1Ancestors[] = $ancestor;
            $ancestor = $ancestor->parentNode;
        }

        // ancestor is now the common ancestor of s1 and s2
        $s1Ancestors[] = $ancestor;

        // Set up start/end of the new encapsulation range
        if ( $ancestor === $s1 || $ancestor === $s2 ) {
            $start = $ancestor;
            $end = $ancestor;
        } else {
            // While creating a new section (see createNewSection), it only
            // gets added where its parent is either another section,
            // or body, so all ancestors are themselves sections, or body.
            // $start / $end are the children of the common ancestor on the
            // s1 / s2 paths respectively.
            $start = $s1Ancestors[count( $s1Ancestors ) - 2];
            $i = array_search( $ancestor, $s2Ancestors, true );
            $end = $s2Ancestors[$i - 1];
        }

        '@phan-var Element $start'; // @var Element $start
        '@phan-var Element $end'; // @var Element $end

        // Add new OR update existing range
        if ( $start->hasAttribute( 'about' ) ) {
            // Overlaps with an existing range.
            $about = DOMCompat::getAttribute( $start, 'about' );
            if ( !$end->hasAttribute( 'about' ) ) {
                // Extend existing range till $end
                $secRanges[$about]['end'] = $end;
                $end->setAttribute( 'about', $about );
            } else {
                Assert::invariant( DOMCompat::getAttribute( $end, 'about' ) === $about,
                    "Expected end-range about id to be $about instead of " .
                    DOMCompat::getAttribute( $end, 'about' ) . " in the overlap scenario." );
            }
        } else {
            // Check for nesting in another range. Since $start and $end
            // are siblings, this is sufficient to know the entire range
            // is nested
            $about = null;
            $body = DOMCompat::getBody( $start->ownerDocument );
            $parent = $start->parentNode;
            // Stop at <body>, and also when the walk leaves the element tree
            // (e.g. the root is a DocumentFragment that is not under <body>):
            // isParsoidSection() only accepts Elements.
            while ( $parent instanceof Element && $parent !== $body ) {
                if ( self::isParsoidSection( $parent ) && $parent->hasAttribute( 'about' ) ) {
                    $about = DOMCompat::getAttribute( $parent, 'about' );
                    break;
                }
                $parent = $parent->parentNode;
            }

            if ( !$about ) {
                // Not overlapping, not nested => new range
                $about = $this->env->newAboutId();
                $start->setAttribute( 'about', $about );
                $end->setAttribute( 'about', $about );
                $secRanges[$about] = [ 'start' => $start, 'end' => $end, 'encapWrappers' => [] ];
            }
        }
        $secRanges[$about]['encapWrappers'][] = $tplInfo->first;
    }

    // Process recorded ranges into new encapsulation information
    // that spans all content in that range.
    foreach ( $secRanges as $about => $range ) {
        // Ensure that all top level nodes of the range have the same about id
        for ( $n = $range['start']; $n !== $range['end']->nextSibling; $n = $n->nextSibling ) {
            Assert::invariant( self::isParsoidSection( $n ),
                "Encountered non-Parsoid-section node (" .
                DOMCompat::nodeName( $n ) .
                ") while updating template wrappers" );
            $n->setAttribute( 'about', $about );
        }

        $dsr1 = $this->getDSR( $range['start'], true ); // Traverses non-tpl content => will succeed
        $dsr2 = $this->getDSR( $range['end'], false );  // Traverses non-tpl content => will succeed
        $dp = new DataParsoid;
        $dp->dsr = new DomSourceRange( $dsr1, $dsr2, null, null );
        DOMDataUtils::setDataParsoid( $range['start'], $dp );

        $this->collapseWrappers( $range['start'], $range['encapWrappers'] );
    }
}
721 | |
/**
 * Convert the codepointOffset of every TOC section, in one batch, from the
 * environment's current offset type to 'char' offsets.
 *
 * Sections whose codepointOffset is null are left untouched. The offsets
 * are collected by reference and handed to TokenUtils::convertOffsets()
 * together with the top frame's source text.
 */
private function convertTOCOffsets(): void {
    // Create reference array from all the codepointOffsets
    $offsets = [];
    foreach ( $this->env->getTOCData()->getSections() as $section ) {
        if ( $section->codepointOffset !== null ) {
            $offsets[] = &$section->codepointOffset;
        }
    }
    TokenUtils::convertOffsets(
        $this->env->topFrame->getSrcText(),
        $this->env->getCurrentOffsetType(),
        'char',
        $offsets
    );
}
737 | |
/**
 * Find the first heading at or after $elt (searching depth-first through
 * $elt, its following siblings and their descendants), skipping over
 * extension content.
 *
 * In core, Parser.php adds a TOC marker before the *first* heading element
 * independent of how that heading element is nested. In the common case,
 * that insertion point corresponds to the last element of the lead section
 * as computed by section wrapping code in this file. In the edge case, when
 * a <div> wraps the heading, the insertion point lies inside the <div> and
 * has no relation to the lead section.
 *
 * @param Node $elt
 * @return ?Element The heading, or null if none was found
 */
private static function findTOCInsertionPoint( Node $elt ): ?Element {
    $cursor = $elt;
    while ( $cursor ) {
        // Ignore extension content while finding TOC insertion point
        if ( WTUtils::isFirstExtensionWrapperNode( $cursor ) ) {
            $cursor = WTUtils::skipOverEncapsulatedContent( $cursor );
            continue;
        }

        if ( $cursor instanceof Element ) {
            if ( DOMUtils::isHeading( $cursor ) ) {
                return $cursor;
            }
            $firstChild = $cursor->firstChild;
            if ( $firstChild ) {
                $nested = self::findTOCInsertionPoint( $firstChild );
                if ( $nested ) {
                    return $nested;
                }
            }
        }

        $cursor = $cursor->nextSibling;
    }
    return null;
}
767 | |
/**
 * Insert a synthetic pseudo-section (data-mw-section-id="-2") holding the
 * TOC meta immediately before $insertionPoint.
 *
 * If the node preceding $insertionPoint is an encapsulation wrapper whose
 * about id continues into $insertionPoint, that about id is copied onto the
 * new nodes so that the template's about-sibling run is not broken.
 *
 * @param Element $syntheticTocMeta Meta element to place in the new section
 * @param Element $insertionPoint Node before which the section is inserted
 * @return Element The newly created <section>
 */
private function insertSyntheticSection(
    Element $syntheticTocMeta, Element $insertionPoint
): Element {
    $prev = $insertionPoint->previousSibling;

    // Create a pseudo-section containing the TOC
    $syntheticTocSection = $this->doc->createElement( 'section' );
    $syntheticTocSection->setAttribute( 'data-mw-section-id', '-2' );
    $insertionPoint->parentNode->insertBefore( $syntheticTocSection, $insertionPoint );
    $this->pseudoSectionCount++;
    $syntheticTocSection->appendChild( $syntheticTocMeta );

    // Ensure template continuity is not broken!
    // If $prev is not an encapsulation wrapper, nothing to do!
    if ( $prev && WTUtils::isEncapsulationWrapper( $prev ) ) {
        '@phan-var Element $prev';
        $prevAbout = DOMCompat::getAttribute( $prev, 'about' );

        // A wrapper without an about id has no about-sibling run to keep
        // intact. Without this guard, two missing about ids would compare
        // equal (null === null) and null would be passed to setAttribute().
        if ( $prevAbout !== null ) {
            // First, handle the case of section-tag-stripping that VE does:
            // compare against the first node inside $insertionPoint.
            // If the about ids are different, that node & $prev belong to
            // different transclusions and the TOC meta can be left alone.
            $firstInside = $insertionPoint->firstChild;
            $insideAbout = $firstInside instanceof Element
                ? DOMCompat::getAttribute( $firstInside, 'about' )
                : null;
            if ( $prevAbout === $insideAbout ) {
                $syntheticTocMeta->setAttribute( 'about', $prevAbout );
            }

            // Now handle the case of section-tags not being stripped.
            // NOTE that the synthetic section is before $insertionPoint.
            if ( $prevAbout === DOMCompat::getAttribute( $insertionPoint, 'about' ) ) {
                $syntheticTocSection->setAttribute( 'about', $prevAbout );
            }
        }
    }

    return $syntheticTocSection;
}
812 | |
/**
 * Record the TOC output flags and, if necessary, add a synthetic TOC
 * marker in front of the first heading.
 *
 * Nothing at all happens when the page config suppresses the TOC.
 * Otherwise either 'no-toc' or (when there is enough content) 'show-toc'
 * is set on the metadata, and an auto-generated
 * <meta property="mw:PageProp/toc"> is inserted unless the 'toc'
 * behavior switch is set.
 */
private function addSyntheticTOCMarker(): void {
	$tocBS = $this->env->getBehaviorSwitch( 'toc' );
	$noTocBS = $this->env->getBehaviorSwitch( 'notoc' );
	$forceTocBS = $this->env->getBehaviorSwitch( 'forcetoc' );

	// $this->count is initialized to 1
	$numHeadings = $this->count - 1 - $this->pseudoSectionCount;

	// 'notoc' only wins when neither 'toc' nor 'forcetoc' is present
	$tocAllowed = $tocBS || !$noTocBS;
	$showToc = $forceTocBS || $tocAllowed;

	// No headings means no TOC, whatever the switches say. Otherwise
	// 'forcetoc' always suffices, and an allowed TOC needs either
	// four or more headings or an explicit 'toc' switch.
	$enoughToc = $numHeadings != 0 && (
		$forceTocBS || ( $tocAllowed && ( $numHeadings >= 4 || $tocBS ) )
	);

	if ( $this->env->getPageConfig()->getSuppressTOC() ) {
		return;
	}

	if ( !$showToc ) {
		// ParserOutputFlags::NO_TOC
		// ($enoughToc is necessarily false here, so nothing else is left to do)
		$this->env->getMetadata()->setOutputFlag( 'no-toc' );
		return;
	}

	if ( !$enoughToc ) {
		return;
	}

	// ParserOutputFlags::SHOW_TOC
	$this->env->getMetadata()->setOutputFlag( 'show-toc' );
	if ( $tocBS ) {
		// No synthetic marker is added when the 'toc' switch is set
		return;
	}

	$tocMeta = $this->doc->createElement( 'meta' );
	$tocMeta->setAttribute( 'property', 'mw:PageProp/toc' );
	DOMDataUtils::getDataMw( $tocMeta )->autoGenerated = true;

	$firstHeading = self::findTOCInsertionPoint( DOMCompat::getBody( $this->doc ) );
	if ( $firstHeading === null ) {
		// should not happen, but nothing to do here!
		return;
	}

	// NOTE: Given how <section>s are computed in this file, headings
	// will never have previous siblings. So, it is always the previous
	// sibling of the heading's section ancestor that is inspected.
	$headingSection = self::findSectionAncestor( $firstHeading );

	$container = $headingSection->previousSibling;
	$precededBySection = $container && DOMCompat::nodeName( $container ) === 'section';
	if ( !$precededBySection ) {
		$container = $this->insertSyntheticSection( $tocMeta, $headingSection );
	}
	// Place the marker at the end of the chosen section
	$container->appendChild( $tocMeta );

	// Set a synthetic zero-length dsr to suppress noisy warnings
	// from the round trip testing script.
	$offset = DOMDataUtils::getDataParsoid( $firstHeading )->dsr->start ?? null;
	if ( $offset === null ) {
		return;
	}
	DOMDataUtils::getDataParsoid( $tocMeta )->dsr = new DomSourceRange( $offset, $offset, 0, 0 );
}
876 | |
/**
 * Transfer information about section links from behaviour switches to CMC
 * (the object returned by $this->env->getMetadata()).
 *
 * Every behaviour switch listed below that is set (i.e. not null) has its
 * value copied to the corresponding output flag; unset switches leave the
 * metadata untouched.
 */
private function addSectionInfo(): void {
	// behaviour switch name => output flag name
	$outputFlagBySwitch = [
		// ParserOutputFlags::NEW_SECTION
		'newsectionlink' => 'mw-NewSection',
		// ParserOutputFlags::HIDE_NEW_SECTION
		'nonewsectionlink' => 'mw-HideNewSection',
		// ParserOutputFlags::NO_SECTION_EDIT_LINKS
		'noeditsection' => 'no-section-edit-links',
	];

	foreach ( $outputFlagBySwitch as $switchName => $flagName ) {
		$switchValue = $this->env->getBehaviorSwitch( $switchName );
		if ( $switchValue !== null ) {
			$this->env->getMetadata()->setOutputFlag( $flagName, $switchValue );
		}
	}
}
901 | |
/**
 * Entry point of this DOM postprocessor: walks the DOM rooted at
 * $this->rootNode and adds <section> wrappers as necessary.
 * Implements the algorithm documented @ mw:Parsing/Notes/Section_Wrapping
 */
public function run(): void {
	// The lead section gets level 6, the lowest possible level, since
	// we don't want any nesting of h-tags in the lead section.
	$lead = new Section( 6, 0, $this->doc );
	$lead->setId( 0 );

	$this->wrapSectionsInDOM( $lead, $this->rootNode );

	// A lead section is always emitted as the very first child, even if
	// sometimes it only contains whitespace + comments.
	$root = $this->rootNode;
	$root->insertBefore( $lead->container, $root->firstChild );

	// Template conflicts can only be resolved once all sections
	// have been added to the DOM.
	$this->resolveTplExtSectionConflicts();

	// Convert byte offsets to codepoint offsets in TOCData
	// (done in a batch to avoid O(N^2) string traversals)
	$this->convertTOCOffsets();

	$this->addSyntheticTOCMarker();

	$this->addSectionInfo();
}
930 | } |