Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
41.94% |
39 / 93 |
|
25.00% |
1 / 4 |
CRAP | |
0.00% |
0 / 1 |
Headings | |
41.94% |
39 / 93 |
|
25.00% |
1 / 4 |
219.13 | |
0.00% |
0 / 1 |
processHeadingContent | |
0.00% |
0 / 28 |
|
0.00% |
0 / 1 |
240 | |||
genAnchors | |
100.00% |
39 / 39 |
|
100.00% |
1 / 1 |
6 | |||
normalizeSectionName | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
dedupeHeadingIds | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
72 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html\PP\Handlers; |
5 | |
6 | use Wikimedia\Parsoid\Config\Env; |
7 | use Wikimedia\Parsoid\Core\DomSourceRange; |
8 | use Wikimedia\Parsoid\Core\Sanitizer; |
9 | use Wikimedia\Parsoid\DOM\Element; |
10 | use Wikimedia\Parsoid\DOM\Node; |
11 | use Wikimedia\Parsoid\DOM\Text; |
12 | use Wikimedia\Parsoid\Utils\DOMCompat; |
13 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
14 | use Wikimedia\Parsoid\Utils\DOMUtils; |
15 | use Wikimedia\Parsoid\Utils\TitleException; |
16 | use Wikimedia\Parsoid\Utils\Utils; |
17 | use Wikimedia\Parsoid\Utils\WTUtils; |
18 | |
class Headings {
	/**
	 * Element names that may remain as wrappers inside a heading once it has
	 * been reduced to its "safe" form by processHeadingContent(); any other
	 * non-empty element is unwrapped (its children are kept, the tag is dropped).
	 *
	 * See the safe-heading transform code in Parser::finalizeHeadings in core
	 *
	 * Allowed HTML tags are:
	 * - <sup> and <sub> (T10393)
	 * - <i> (T28375)
	 * - <b> (r105284)
	 * - <bdi> (T74884)
	 * - <span dir="rtl"> and <span dir="ltr"> (T37167)
	 *   (handled separately in code below)
	 * - <s> and <strike> (T35715)
	 * - <q> (T251672)
	 */
	private const ALLOWED_NODES_IN_ANCHOR = [ 'span', 'sup', 'sub', 'i', 'b', 'bdi', 's', 'strike', 'q' ];

	/**
	 * This method implements the equivalent of the regexp-based safe-headline
	 * transform in Parser::finalizeHeadings in core.
	 *
	 * The children of $node are rewritten in place, recursively:
	 * - mw:LanguageVariant elements are replaced by a text node holding their
	 *   data-parsoid "src" (or an empty string when there is none);
	 * - <style> and <script> elements are removed;
	 * - elements that end up with no children are removed;
	 * - elements not listed in ALLOWED_NODES_IN_ANCHOR, and mw:Entity spans,
	 *   are unwrapped;
	 * - allowed elements lose all attributes, except dir="ltr|rtl" on <span>;
	 * - every other non-text node is removed.
	 *
	 * @param Node $node Node whose children are sanitized. It is mutated, so
	 *   callers that need the original should pass a clone.
	 */
	private static function processHeadingContent( Node $node ): void {
		$c = $node->firstChild;
		while ( $c ) {
			// Capture the sibling first: $c may be removed or replaced below.
			$next = $c->nextSibling;
			if ( $c instanceof Element ) {
				$cName = DOMCompat::nodeName( $c );
				if ( DOMUtils::hasTypeOf( $c, 'mw:LanguageVariant' ) ) {
					// Special case for -{...}-
					$dp = DOMDataUtils::getDataParsoid( $c );
					$node->replaceChild(
						$node->ownerDocument->createTextNode( $dp->src ?? '' ), $c
					);
				} elseif ( in_array( $cName, [ 'style', 'script' ], true ) ) {
					// Remove any <style> or <script> tags (T198618)
					$node->removeChild( $c );
				} else {
					// Sanitize the subtree first, so the checks below look at
					// the already-cleaned content of $c.
					self::processHeadingContent( $c );
					if ( !$c->firstChild ) {
						// Empty now - strip it!
						$node->removeChild( $c );
					} elseif (
						!in_array( $cName, self::ALLOWED_NODES_IN_ANCHOR, true ) ||
						( $cName === 'span' && DOMUtils::hasTypeOf( $c, 'mw:Entity' ) )
					) {
						// Strip all unallowed tag wrappers: hoist the children
						// of $c into $node (in front of $next), then continue
						// the walk from whatever now follows $c before
						// dropping the wrapper itself.
						DOMUtils::migrateChildren( $c, $node, $next );
						$next = $c->nextSibling;
						$node->removeChild( $c );
					} else {
						// We strip any parameter from accepted tags except
						// dir="rtl|ltr" from <span>, to allow setting
						// directionality in toc items.
						foreach ( DOMUtils::attributes( $c ) as $key => $val ) {
							if ( $cName === 'span' ) {
								if ( $key !== 'dir' || ( $val !== 'ltr' && $val !== 'rtl' ) ) {
									$c->removeAttribute( $key );
								}
							} else {
								$c->removeAttribute( $key );
							}
						}
					}
				}
			} elseif ( !( $c instanceof Text ) ) {
				// Strip everything else but text nodes
				$node->removeChild( $c );
			}

			$c = $next;
		}
	}

	/**
	 * Generate anchor ids that the PHP parser assigns to headings.
	 * This is to ensure that links that are out there in the wild
	 * continue to be valid links into Parsoid HTML.
	 *
	 * For a heading node this records `line` (the trimmed, sanitized inner
	 * HTML of the heading) and `linkAnchor` in the node's temporary
	 * data-parsoid `section` entry. If the heading has no id yet, it also sets
	 * the id attribute and, when the fallback-mode id differs from the primary
	 * one, prepends a <span typeof="mw:FallbackId"> carrying the fallback id.
	 * Non-heading nodes are left untouched.
	 *
	 * @param Node $node
	 * @param Env $env
	 * @return bool Always true.
	 */
	public static function genAnchors( Node $node, Env $env ): bool {
		if ( !DOMUtils::isHeading( $node ) ) {
			return true;
		}
		'@phan-var Element $node'; /** @var Element $node */

		// Deep clone the heading to mutate it to strip unwanted tags and attributes.
		$clone = DOMDataUtils::cloneNode( $node, true );
		'@phan-var Element $clone'; // @var Element $clone
		// Don't bother storing data-attribs on $clone,
		// processHeadingContent is about to strip them

		self::processHeadingContent( $clone );
		$buf = DOMCompat::getInnerHTML( $clone );
		$line = trim( $buf );

		$dp = DOMDataUtils::getDataParsoid( $node );
		$tmp = $dp->getTemp();

		// Cannot generate an anchor id if the heading already has an id!
		//
		// NOTE: Divergence from PHP parser behavior.
		//
		// The PHP parser generates a <h*><span id="anchor-id-here">..</span></h*>
		// So, it can preserve the existing id if any. However, in Parsoid, we are
		// generating a <h* id="anchor-id-here"> ..</h*> => we either overwrite or
		// preserve the existing id and use it for TOC, etc. We choose to preserve it.
		if ( $node->hasAttribute( 'id' ) ) {
			$linkAnchorId = DOMCompat::getAttribute( $node, 'id' );
			$dp->reusedId = true;
			$tmp->section = [
				'line' => $line,
				'linkAnchor' => $linkAnchorId,
			];
			return true;
		}

		// Additional processing for $anchor
		$anchorText = $clone->textContent; // strip all tags
		$anchorText = Sanitizer::normalizeSectionNameWhiteSpace( $anchorText );
		$anchorText = self::normalizeSectionName( $anchorText, $env );

		// NOTE: Parsoid defaults to html5 mode. So, if we want to replicate
		// legacy output, we should handle that explicitly.
		$anchorId = Sanitizer::escapeIdForAttribute( $anchorText );
		$linkAnchorId = Sanitizer::escapeIdForLink( $anchorText );
		$fallbackId = Sanitizer::escapeIdForAttribute( $anchorText, Sanitizer::ID_FALLBACK );
		if ( $anchorId === $fallbackId ) {
			$fallbackId = null; /* not needed */
		}

		// The ids need to be unique, but we'll enforce this in a post-processing
		// step.

		$node->setAttribute( 'id', $anchorId );
		$tmp->section = [
			'line' => $line,
			'linkAnchor' => $linkAnchorId,
		];

		if ( $fallbackId ) {
			$span = $node->ownerDocument->createElement( 'span' );
			$span->setAttribute( 'id', $fallbackId );
			DOMUtils::addTypeOf( $span, 'mw:FallbackId' );
			// $dp is the heading's own data-parsoid object, fetched above.
			$nodeDsr = $dp->dsr ?? null;
			// Set a zero-width dsr range for the fallback id
			if ( Utils::isValidDSR( $nodeDsr ) ) {
				$offset = $nodeDsr->innerStart();
				DOMDataUtils::getDataParsoid( $span )->dsr = new DomSourceRange( $offset, $offset, null, null );
			}
			// The fallback span becomes the heading's first child.
			$node->insertBefore( $span, $node->firstChild );
		}

		return true;
	}

	/**
	 * see Parser::normalizeSectionName in Parser.php and T90902
	 *
	 * Builds a title from "#<text>" and returns that title's fragment. If
	 * title construction throws a TitleException, the input text is returned
	 * unchanged.
	 *
	 * @param string $text
	 * @param Env $env
	 * @return string
	 */
	private static function normalizeSectionName( string $text, Env $env ): string {
		try {
			$title = $env->makeTitleFromURLDecodedStr( "#{$text}" );
			return $title->getFragment();
		} catch ( TitleException $e ) {
			return $text;
		}
	}

	/**
	 * Ensure that headings and fallback-id spans do not reuse an id that has
	 * already been seen.
	 *
	 * Ids are compared after strtolower(). The first element seen with a given
	 * id keeps it and is recorded in $seenIds. When a later heading or
	 * fallback-id span carries an id that was already seen, "_N" is appended
	 * to its id attribute, using the first N whose suffixed key is still free;
	 * for headings the same suffix is appended to the recorded `linkAnchor`.
	 * Any other element with a duplicate id is left unchanged.
	 *
	 * @param array<string,int> &$seenIds Map of lower-cased id to counter;
	 *   updated in place and meant to be shared across calls.
	 * @param Node $node
	 * @return bool Always true.
	 */
	public static function dedupeHeadingIds( array &$seenIds, Node $node ): bool {
		// NOTE: This is not completely compliant with how PHP parser does it.
		// If there is an id in the doc elsewhere, this will assign
		// the heading a suffixed id, whereas the PHP parser processes
		// headings in textual order and can introduce duplicate ids
		// in a document in the process.
		//
		// However, we believe this implementation behavior is more
		// consistent when handling this edge case, and in the common
		// case (where heading ids won't conflict with ids elsewhere),
		// matches PHP parser behavior.
		if ( !$node instanceof Element ) {
			// Not an Element
			return true;
		}

		$origKey = DOMCompat::getAttribute( $node, 'id' );
		if ( $origKey === null ) {
			return true;
		}
		// IE 7 required attributes to be case-insensitively unique (T12721)
		// but it did not support non-ASCII IDs. We don't support IE 7 anymore,
		// but changing the algorithm would change the relevant fragment URLs.
		// This case folding and matching algorithm has to stay exactly the
		// same to preserve external links to the page.
		$key = strtolower( $origKey );
		if ( !isset( $seenIds[$key] ) ) {
			$seenIds[$key] = 1;
			return true;
		}
		// Only update headings and legacy links (first children of heading)
		$isHeading = DOMUtils::isHeading( $node );
		if ( $isHeading || WTUtils::isFallbackIdSpan( $node ) ) {
			// Bump the counter until "<key>_<suffix>" is not already taken.
			$suffix = ++$seenIds[$key];
			while ( !empty( $seenIds[$key . '_' . $suffix] ) ) {
				$suffix++;
				$seenIds[$key]++;
			}
			// The attribute keeps the original casing; only the lookup key
			// is lower-cased.
			$node->setAttribute( 'id', $origKey . '_' . $suffix );
			if ( $isHeading ) {
				$tmp = DOMDataUtils::getDataParsoid( $node )->getTemp();
				// genAnchors() records the section info for headings. Guard
				// against it being absent so that we neither raise a warning
				// nor fabricate a bare "_N" link anchor.
				if ( isset( $tmp->section['linkAnchor'] ) ) {
					$linkAnchorId = $tmp->section['linkAnchor'];
					$tmp->section['linkAnchor'] = $linkAnchorId . '_' . $suffix;
				}
			}
			$seenIds[$key . '_' . $suffix] = 1;
		}
		return true;
	}
}