Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
63.83% |
60 / 94 |
|
25.00% |
1 / 4 |
CRAP | |
0.00% |
0 / 1 |
Headings | |
63.83% |
60 / 94 |
|
25.00% |
1 / 4 |
76.48 | |
0.00% |
0 / 1 |
processHeadingContent | |
0.00% |
0 / 28 |
|
0.00% |
0 / 1 |
240 | |||
genAnchors | |
100.00% |
39 / 39 |
|
100.00% |
1 / 1 |
6 | |||
normalizeSectionName | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
dedupeHeadingIds | |
91.30% |
21 / 23 |
|
0.00% |
0 / 1 |
8.04 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html\DOM\Handlers; |
5 | |
6 | use Wikimedia\Parsoid\Config\Env; |
7 | use Wikimedia\Parsoid\Core\DomSourceRange; |
8 | use Wikimedia\Parsoid\Core\Sanitizer; |
9 | use Wikimedia\Parsoid\DOM\Element; |
10 | use Wikimedia\Parsoid\DOM\Node; |
11 | use Wikimedia\Parsoid\DOM\Text; |
12 | use Wikimedia\Parsoid\Utils\DOMCompat; |
13 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
14 | use Wikimedia\Parsoid\Utils\DOMUtils; |
15 | use Wikimedia\Parsoid\Utils\DTState; |
16 | use Wikimedia\Parsoid\Utils\TitleException; |
17 | use Wikimedia\Parsoid\Utils\Utils; |
18 | use Wikimedia\Parsoid\Utils\WTUtils; |
19 | |
class Headings {
	/**
	 * Element names whose wrappers processHeadingContent() keeps (after
	 * stripping their attributes); every other element wrapper is removed.
	 *
	 * See the safe-heading transform code in Parser::finalizeHeadings in core
	 *
	 * Allowed HTML tags are:
	 * - <sup> and <sub> (T10393)
	 * - <i> (T28375)
	 * - <b> (r105284)
	 * - <bdi> (T74884)
	 * - <span dir="rtl"> and <span dir="ltr"> (T37167)
	 *   (handled separately in code below)
	 * - <s> and <strike> (T35715)
	 * - <q> (T251672)
	 */
	private const ALLOWED_NODES_IN_ANCHOR = [ 'span', 'sup', 'sub', 'i', 'b', 'bdi', 's', 'strike', 'q' ];

	/**
	 * DOM-based equivalent of the regexp-based safe-headline transform in
	 * Parser::finalizeHeadings in core.
	 *
	 * Walks the children of $node and rewrites them in place:
	 * - language-variant markup is replaced by its recorded source text,
	 * - <style> and <script> elements are dropped,
	 * - other elements are processed recursively and then either removed
	 *   (if left empty), unwrapped (if not allowed), or kept with their
	 *   attributes stripped,
	 * - any node that is neither an element nor text is removed.
	 *
	 * @param Node $node Node whose children are sanitized (mutated in place)
	 */
	private static function processHeadingContent( Node $node ): void {
		$child = $node->firstChild;
		while ( $child ) {
			// Remember the successor first: $child may get detached below.
			$resume = $child->nextSibling;

			if ( !( $child instanceof Element ) ) {
				if ( !( $child instanceof Text ) ) {
					// Strip everything else but text nodes
					$node->removeChild( $child );
				}
				$child = $resume;
				continue;
			}

			$tag = DOMCompat::nodeName( $child );
			if ( DOMUtils::hasTypeOf( $child, 'mw:LanguageVariant' ) ) {
				// Special case for -{...}-
				$variantDp = DOMDataUtils::getDataParsoid( $child );
				$srcText = $node->ownerDocument->createTextNode( $variantDp->src ?? '' );
				$node->replaceChild( $srcText, $child );
			} elseif ( $tag === 'style' || $tag === 'script' ) {
				# Remove any <style> or <script> tags (T198618)
				$node->removeChild( $child );
			} else {
				// Sanitize the subtree before deciding what to do with the wrapper.
				self::processHeadingContent( $child );

				if ( !$child->firstChild ) {
					// Empty now - strip it!
					$node->removeChild( $child );
				} elseif (
					!in_array( $tag, self::ALLOWED_NODES_IN_ANCHOR, true ) ||
					( $tag === 'span' && DOMUtils::hasTypeOf( $child, 'mw:Entity' ) )
				) {
					# Strip all unallowed tag wrappers
					DOMUtils::migrateChildren( $child, $node, $resume );
					// Continue from whatever now follows the emptied wrapper.
					$resume = $child->nextSibling;
					$node->removeChild( $child );
				} else {
					# We strip any parameter from accepted tags except dir="rtl|ltr" from <span>,
					# to allow setting directionality in toc items.
					$isSpan = ( $tag === 'span' );
					foreach ( DOMUtils::attributes( $child ) as $attrName => $attrValue ) {
						$isDirection = $isSpan && $attrName === 'dir' &&
							( $attrValue === 'ltr' || $attrValue === 'rtl' );
						if ( !$isDirection ) {
							$child->removeAttribute( $attrName );
						}
					}
				}
			}

			$child = $resume;
		}
	}

	/**
	 * Generate anchor ids that the PHP parser assigns to headings.
	 * This is to ensure that links that are out there in the wild
	 * continue to be valid links into Parsoid HTML.
	 *
	 * For a heading node this records the sanitized heading HTML ('line')
	 * and the link anchor ('linkAnchor') in the node's temporary
	 * data-parsoid 'section' entry. A heading without an id gets one; a
	 * heading that already has an id keeps it and is flagged as 'reusedId'.
	 * When the fallback-mode id differs from the primary id, a
	 * <span typeof="mw:FallbackId"> carrying it is prepended to the heading.
	 *
	 * @param Node $node Node to process; non-heading nodes are left untouched
	 * @param DTState $state Supplies the Env used to normalize the section name
	 * @return bool Always true
	 */
	public static function genAnchors( Node $node, DTState $state ): bool {
		if ( !DOMUtils::isHeading( $node ) ) {
			return true;
		}
		'@phan-var Element $node'; /** @var Element $node */

		// Deep clone the heading to mutate it to strip unwanted tags and attributes.
		// Don't bother storing data-attribs on $clone,
		// processHeadingContent is about to strip them
		$clone = DOMDataUtils::cloneNode( $node, true );
		'@phan-var Element $clone'; // @var Element $clone

		self::processHeadingContent( $clone );
		$line = trim( DOMCompat::getInnerHTML( $clone ) );

		$dp = DOMDataUtils::getDataParsoid( $node );
		$tmp = $dp->getTemp();

		// Cannot generate an anchor id if the heading already has an id!
		//
		// NOTE: Divergence from PHP parser behavior.
		//
		// The PHP parser generates a <h*><span id="anchor-id-here-">..</span></h*>
		// So, it can preserve the existing id if any. However, in Parsoid, we are
		// generating a <h* id="anchor-id-here"> ..</h*> => we either overwrite or
		// preserve the existing id and use it for TOC, etc. We choose to preserve it.
		if ( $node->hasAttribute( 'id' ) ) {
			$dp->reusedId = true;
			$tmp->section = [
				'line' => $line,
				'linkAnchor' => DOMCompat::getAttribute( $node, 'id' ),
			];
			return true;
		}

		// Additional processing for the anchor: textContent strips all tags.
		$sectionName = self::normalizeSectionName(
			Sanitizer::normalizeSectionNameWhiteSpace( $clone->textContent ),
			$state->env
		);

		# NOTE: Parsoid defaults to html5 mode. So, if we want to replicate
		# legacy output, we should handle that explicitly.
		$primaryId = Sanitizer::escapeIdForAttribute( $sectionName );
		$linkId = Sanitizer::escapeIdForLink( $sectionName );
		$legacyId = Sanitizer::escapeIdForAttribute( $sectionName, Sanitizer::ID_FALLBACK );

		// The ids need to be unique, but we'll enforce this in a post-processing step.
		$node->setAttribute( 'id', $primaryId );
		$tmp->section = [
			'line' => $line,
			'linkAnchor' => $linkId,
		];

		// A fallback id identical to the primary one (or an empty one) is not needed.
		if ( $legacyId === $primaryId || !$legacyId ) {
			return true;
		}

		$fallbackSpan = $node->ownerDocument->createElement( 'span' );
		$fallbackSpan->setAttribute( 'id', $legacyId );
		DOMUtils::addTypeOf( $fallbackSpan, 'mw:FallbackId' );

		// Set a zero-width dsr range for the fallback id
		$headingDsr = $dp->dsr ?? null;
		if ( Utils::isValidDSR( $headingDsr ) ) {
			$start = $headingDsr->innerStart();
			DOMDataUtils::getDataParsoid( $fallbackSpan )->dsr =
				new DomSourceRange( $start, $start, null, null );
		}
		$node->insertBefore( $fallbackSpan, $node->firstChild );

		return true;
	}

	/**
	 * Normalize a section name by round-tripping it through a title fragment.
	 *
	 * see Parser::normalizeSectionName in Parser.php and T90902
	 *
	 * @param string $text Section name to normalize
	 * @param Env $env Environment used to build the title from "#$text"
	 * @return string The fragment of the resulting title, or $text unchanged
	 *   when building the title throws a TitleException
	 */
	private static function normalizeSectionName( string $text, Env $env ): string {
		try {
			return $env->makeTitleFromURLDecodedStr( "#{$text}" )->getFragment();
		} catch ( TitleException $e ) {
			// No title could be built from this text: keep it as is.
			return $text;
		}
	}

	/**
	 * Make duplicated heading ids unique by appending a numeric suffix.
	 *
	 * Every element id is recorded (lower-cased) in $state->seenIds the first
	 * time it is encountered. When an id has been seen before, headings and
	 * fallback-id spans are given the id "<id>_<n>", using the first <n> whose
	 * suffixed key is still unused; for headings the stored 'linkAnchor' gets
	 * the same suffix. Other elements with an already-seen id are left as is.
	 *
	 * @param Node $node Node to inspect; only Elements with an id are considered
	 * @param DTState $state Holds the seenIds map, which is updated in place
	 * @return bool Always true
	 */
	public static function dedupeHeadingIds( Node $node, DTState $state ): bool {
		// NOTE: This is not completely compliant with how PHP parser does it.
		// If there is an id in the doc elsewhere, this will assign
		// the heading a suffixed id, whereas the PHP parser processes
		// headings in textual order and can introduce duplicate ids
		// in a document in the process.
		//
		// However, we believe this implementation behavior is more
		// consistent when handling this edge case, and in the common
		// case (where heading ids won't conflict with ids elsewhere),
		// matches PHP parser behavior.
		// FIXME: Maybe we should lint this issue away
		if ( !( $node instanceof Element ) ) {
			// Not an Element
			return true;
		}

		$id = DOMCompat::getAttribute( $node, 'id' );
		if ( $id === null ) {
			return true;
		}

		// IE 7 required attributes to be case-insensitively unique (T12721)
		// but it did not support non-ASCII IDs. We don't support IE 7 anymore,
		// but changing the algorithm would change the relevant fragment URLs.
		// This case folding and matching algorithm has to stay exactly the
		// same to preserve external links to the page.
		$foldedId = strtolower( $id );
		$seen = &$state->seenIds;
		if ( !isset( $seen[$foldedId] ) ) {
			// First occurrence: just record it.
			$seen[$foldedId] = 1;
			return true;
		}

		// Only update headings and legacy links (first children of heading)
		$isHeading = DOMUtils::isHeading( $node );
		if ( !$isHeading && !WTUtils::isFallbackIdSpan( $node ) ) {
			return true;
		}

		// Bump the per-id counter until the suffixed key is unused.
		do {
			$suffix = ++$seen[$foldedId];
		} while ( !empty( $seen[$foldedId . '_' . $suffix] ) );

		$node->setAttribute( 'id', $id . '_' . $suffix );
		if ( $isHeading ) {
			$tmp = DOMDataUtils::getDataParsoid( $node )->getTemp();
			$baseAnchor = $tmp->section['linkAnchor'];
			$tmp->section['linkAnchor'] = $baseAnchor . '_' . $suffix;
		}
		$seen[$foldedId . '_' . $suffix] = 1;

		return true;
	}
}