Code Coverage for /src/src/Wt2Html/PP/Handlers/Headings.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	41.94% covered (danger)	41.94%	39 / 93	25.00% covered (danger)	25.00%	1 / 4	CRAP	0.00% covered (danger)	0.00%	0 / 1
Headings	41.94% covered (danger)	41.94%	39 / 93	25.00% covered (danger)	25.00%	1 / 4	219.13	0.00% covered (danger)	0.00%	0 / 1
processHeadingContent	0.00% covered (danger)	0.00%	0 / 28	0.00% covered (danger)	0.00%	0 / 1	240
genAnchors	100.00% covered (success)	100.00%	39 / 39	100.00% covered (success)	100.00%	1 / 1	6
normalizeSectionName	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	6
dedupeHeadingIds	0.00% covered (danger)	0.00%	0 / 22	0.00% covered (danger)	0.00%	0 / 1	72

1	<?php
2	declare( strict_types = 1 );
3
4	namespace Wikimedia\Parsoid\Wt2Html\PP\Handlers;
5
6	use Wikimedia\Parsoid\Config\Env;
7	use Wikimedia\Parsoid\Core\DomSourceRange;
8	use Wikimedia\Parsoid\Core\Sanitizer;
9	use Wikimedia\Parsoid\DOM\Element;
10	use Wikimedia\Parsoid\DOM\Node;
11	use Wikimedia\Parsoid\DOM\Text;
12	use Wikimedia\Parsoid\Utils\DOMCompat;
13	use Wikimedia\Parsoid\Utils\DOMDataUtils;
14	use Wikimedia\Parsoid\Utils\DOMUtils;
15	use Wikimedia\Parsoid\Utils\TitleException;
16	use Wikimedia\Parsoid\Utils\Utils;
17	use Wikimedia\Parsoid\Utils\WTUtils;
18
/**
 * DOM handlers for headings: they compute the anchor id of each heading
 * and afterwards make sure the ids handed out are unique.
 */
class Headings {
	/**
	 * See the safe-heading transform code in Parser::finalizeHeadings in core
	 *
	 * Allowed HTML tags are:
	 * - <sup> and <sub> (T10393)
	 * - <i> (T28375)
	 * - <b> (r105284)
	 * - <bdi> (T74884)
	 * - <span dir="rtl"> and <span dir="ltr"> (T37167)
	 *   (handled separately in code below)
	 * - <s> and <strike> (T35715)
	 * - <q> (T251672)
	 */
	private const ALLOWED_NODES_IN_ANCHOR = [ 'span', 'sup', 'sub', 'i', 'b', 'bdi', 's', 'strike', 'q' ];

	/**
	 * This method implements the equivalent of the regexp-based safe-headline
	 * transform in Parser::finalizeHeadings in core.
	 *
	 * The subtree rooted at $node is modified in place:
	 * - language variant markup is replaced by a text node holding its source,
	 * - <style> and <script> elements are dropped,
	 * - elements left without children are dropped,
	 * - elements that are not in ALLOWED_NODES_IN_ANCHOR (and entity spans)
	 *   are unwrapped, i.e. replaced by their children,
	 * - allowed elements lose all attributes, except dir="ltr|rtl" on <span>,
	 * - anything that is neither an element nor a text node is dropped.
	 *
	 * @param Node $node Node whose children are sanitized (callers pass a clone)
	 */
	private static function processHeadingContent( Node $node ): void {
		$c = $node->firstChild;
		while ( $c ) {
			// Remember the sibling now: $c may be removed or replaced below.
			$next = $c->nextSibling;
			if ( $c instanceof Element ) {
				$cName = DOMCompat::nodeName( $c );
				if ( DOMUtils::hasTypeOf( $c, 'mw:LanguageVariant' ) ) {
					// Special case for -{...}-
					$dp = DOMDataUtils::getDataParsoid( $c );
					$node->replaceChild(
						$node->ownerDocument->createTextNode( $dp->src ?? '' ), $c
					);
				} elseif ( in_array( $cName, [ 'style', 'script' ], true ) ) {
					# Remove any <style> or <script> tags (T198618)
					$node->removeChild( $c );
				} else {
					// Clean up the descendants first, then decide what to do with $c itself.
					self::processHeadingContent( $c );
					if ( !$c->firstChild ) {
						// Empty now - strip it!
						$node->removeChild( $c );
					} elseif (
						!in_array( $cName, self::ALLOWED_NODES_IN_ANCHOR, true ) ||
						( $cName === 'span' && DOMUtils::hasTypeOf( $c, 'mw:Entity' ) )
					) {
						# Strip all unallowed tag wrappers
						DOMUtils::migrateChildren( $c, $node, $next );
						// Continue with whatever now follows $c, so that the
						// children just lifted out of $c (expected to sit between
						// $c and the old $next) are visited at this level as well.
						$next = $c->nextSibling;
						$node->removeChild( $c );
					} else {
						# We strip any parameter from accepted tags except dir="rtl|ltr" from <span>,
						# to allow setting directionality in toc items.
						foreach ( DOMUtils::attributes( $c ) as $key => $val ) {
							if ( $cName === 'span' ) {
								if ( $key !== 'dir' || ( $val !== 'ltr' && $val !== 'rtl' ) ) {
									$c->removeAttribute( $key );
								}
							} else {
								$c->removeAttribute( $key );
							}
						}
					}
				}
			} elseif ( !( $c instanceof Text ) ) {
				// Strip everything else but text nodes
				$node->removeChild( $c );
			}

			$c = $next;
		}
	}

	/**
	 * Generate anchor ids that the PHP parser assigns to headings.
	 * This is to ensure that links that are out there in the wild
	 * continue to be valid links into Parsoid HTML.
	 *
	 * Non-heading nodes are left untouched. For headings, the sanitized
	 * heading line and the link anchor are recorded in the 'section' entry
	 * of the node's temporary data-parsoid. A heading that already carries
	 * an id keeps it (and is flagged with reusedId); otherwise an id is
	 * derived from the heading text and, when the fallback encoding of that
	 * text differs, a span typed mw:FallbackId carrying the fallback id is
	 * inserted as the heading's first child.
	 *
	 * @param Node $node
	 * @param Env $env
	 * @return bool Always true
	 */
	public static function genAnchors( Node $node, Env $env ): bool {
		if ( !DOMUtils::isHeading( $node ) ) {
			return true;
		}
		'@phan-var Element $node'; /** @var Element $node */

		// Deep clone the heading to mutate it to strip unwanted tags and attributes.
		$clone = DOMDataUtils::cloneNode( $node, true );
		'@phan-var Element $clone'; // @var Element $clone
		// Don't bother storing data-attribs on $clone,
		// processHeadingContent is about to strip them

		self::processHeadingContent( $clone );
		$buf = DOMCompat::getInnerHTML( $clone );
		$line = trim( $buf );

		$dp = DOMDataUtils::getDataParsoid( $node );
		$tmp = $dp->getTemp();

		// Cannot generate an anchor id if the heading already has an id!
		//
		// NOTE: Divergence from PHP parser behavior.
		//
		// The PHP parser generates a <h*><span id="anchor-id-here">..</span></h*>
		// So, it can preserve the existing id if any. However, in Parsoid, we are
		// generating a <h* id="anchor-id-here"> ..</h*> => we either overwrite or
		// preserve the existing id and use it for TOC, etc. We choose to preserve it.
		if ( $node->hasAttribute( 'id' ) ) {
			$linkAnchorId = DOMCompat::getAttribute( $node, 'id' );
			$dp->reusedId = true;
			$tmp->section = [
				'line' => $line,
				'linkAnchor' => $linkAnchorId,
			];
			return true;
		}

		// Additional processing for $anchor
		$anchorText = $clone->textContent; // strip all tags
		$anchorText = Sanitizer::normalizeSectionNameWhiteSpace( $anchorText );
		$anchorText = self::normalizeSectionName( $anchorText, $env );

		# NOTE: Parsoid defaults to html5 mode. So, if we want to replicate
		# legacy output, we should handle that explicitly.
		$anchorId = Sanitizer::escapeIdForAttribute( $anchorText );
		$linkAnchorId = Sanitizer::escapeIdForLink( $anchorText );
		$fallbackId = Sanitizer::escapeIdForAttribute( $anchorText, Sanitizer::ID_FALLBACK );
		if ( $anchorId === $fallbackId ) {
			$fallbackId = null; /* not needed */
		}

		// The ids need to be unique, but we'll enforce this in a post-processing
		// step.

		$node->setAttribute( 'id', $anchorId );
		$tmp->section = [
			'line' => $line,
			'linkAnchor' => $linkAnchorId,
		];

		if ( $fallbackId ) {
			$span = $node->ownerDocument->createElement( 'span' );
			$span->setAttribute( 'id', $fallbackId );
			DOMUtils::addTypeOf( $span, 'mw:FallbackId' );
			// $dp is the heading's data-parsoid fetched above; no need to look it up again.
			$nodeDsr = $dp->dsr ?? null;
			// Set a zero-width dsr range for the fallback id
			if ( Utils::isValidDSR( $nodeDsr ) ) {
				$offset = $nodeDsr->innerStart();
				DOMDataUtils::getDataParsoid( $span )->dsr = new DomSourceRange( $offset, $offset, null, null );
			}
			$node->insertBefore( $span, $node->firstChild );
		}

		return true;
	}

	/**
	 * see Parser::normalizeSectionName in Parser.php and T90902
	 *
	 * The text is turned into a fragment-only title ("#text") and the
	 * fragment of the resulting title is returned. If no title can be made
	 * from it (TitleException), the text is returned unchanged.
	 *
	 * @param string $text
	 * @param Env $env
	 * @return string
	 */
	private static function normalizeSectionName( string $text, Env $env ): string {
		try {
			$title = $env->makeTitleFromURLDecodedStr( "#{$text}" );
			return $title->getFragment();
		} catch ( TitleException $e ) {
			// Not a valid title: keep the section name as it is.
			return $text;
		}
	}

	/**
	 * Make the ids of headings and of their fallback-id spans unique by
	 * appending a numeric "_N" suffix to ids that were already seen.
	 *
	 * Nodes that are not elements, or that have no id attribute, are ignored.
	 * The first element seen with a given (lowercased) id only gets recorded.
	 * On a repeated id, only headings and fallback-id spans are renamed; any
	 * other element keeps its duplicate id. For a renamed heading, the same
	 * suffix is appended to the 'linkAnchor' stored in the 'section' entry of
	 * its temporary data-parsoid.
	 *
	 * @param array &$seenIds Map from lowercased id to a counter; updated in place
	 * @param Node $node
	 * @return bool Always true
	 */
	public static function dedupeHeadingIds( array &$seenIds, Node $node ): bool {
		// NOTE: This is not completely compliant with how PHP parser does it.
		// If there is an id in the doc elsewhere, this will assign
		// the heading a suffixed id, whereas the PHP parser processes
		// headings in textual order and can introduce duplicate ids
		// in a document in the process.
		//
		// However, we believe this implementation behavior is more
		// consistent when handling this edge case, and in the common
		// case (where heading ids won't conflict with ids elsewhere),
		// matches PHP parser behavior.
		if ( !$node instanceof Element ) {
			// Not an Element
			return true;
		}

		$origKey = DOMCompat::getAttribute( $node, 'id' );
		if ( $origKey === null ) {
			return true;
		}
		// IE 7 required attributes to be case-insensitively unique (T12721)
		// but it did not support non-ASCII IDs. We don't support IE 7 anymore,
		// but changing the algorithm would change the relevant fragment URLs.
		// This case folding and matching algorithm has to stay exactly the
		// same to preserve external links to the page.
		$key = strtolower( $origKey );
		if ( !isset( $seenIds[$key] ) ) {
			$seenIds[$key] = 1;
			return true;
		}
		// Only update headings and legacy links (first children of heading)
		$isHeading = DOMUtils::isHeading( $node );
		if ( $isHeading || WTUtils::isFallbackIdSpan( $node ) ) {
			// Bump the counter of this id and use it as the suffix candidate;
			// keep bumping both while the suffixed id is itself already taken.
			$suffix = ++$seenIds[$key];
			while ( !empty( $seenIds[$key . '_' . $suffix] ) ) {
				$suffix++;
				$seenIds[$key]++;
			}
			// The attribute keeps the original casing; only the lookup key is lowercased.
			$node->setAttribute( 'id', $origKey . '_' . $suffix );
			if ( $isHeading ) {
				$tmp = DOMDataUtils::getDataParsoid( $node )->getTemp();
				$linkAnchorId = $tmp->section['linkAnchor'];
				$tmp->section['linkAnchor'] = $linkAnchorId . '_' . $suffix;
			}
			// Reserve the suffixed id so that later nodes cannot reuse it.
			$seenIds[$key . '_' . $suffix] = 1;
		}
		return true;
	}
}