Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
64.58% |
62 / 96 |
|
25.00% |
1 / 4 |
CRAP | |
0.00% |
0 / 1 |
| Headings | |
64.58% |
62 / 96 |
|
25.00% |
1 / 4 |
73.69 | |
0.00% |
0 / 1 |
| processHeadingContent | |
0.00% |
0 / 28 |
|
0.00% |
0 / 1 |
240 | |||
| genAnchors | |
100.00% |
41 / 41 |
|
100.00% |
1 / 1 |
6 | |||
| normalizeSectionName | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| dedupeHeadingIds | |
91.30% |
21 / 23 |
|
0.00% |
0 / 1 |
8.04 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Wt2Html\DOM\Handlers; |
| 5 | |
| 6 | use Wikimedia\Parsoid\Config\Env; |
| 7 | use Wikimedia\Parsoid\Core\DOMCompat; |
| 8 | use Wikimedia\Parsoid\Core\DomSourceRange; |
| 9 | use Wikimedia\Parsoid\Core\Sanitizer; |
| 10 | use Wikimedia\Parsoid\DOM\Element; |
| 11 | use Wikimedia\Parsoid\DOM\Node; |
| 12 | use Wikimedia\Parsoid\DOM\Text; |
| 13 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
| 14 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 15 | use Wikimedia\Parsoid\Utils\DTState; |
| 16 | use Wikimedia\Parsoid\Utils\TitleException; |
| 17 | use Wikimedia\Parsoid\Utils\Utils; |
| 18 | use Wikimedia\Parsoid\Utils\WTUtils; |
| 19 | |
class Headings {
	/**
	 * Element names allowed to remain inside a sanitized heading line.
	 *
	 * This mirrors the safe-heading transform in Parser::finalizeHeadings
	 * in core. The allowed tags are:
	 * - <sup> and <sub> (T10393)
	 * - <i> (T28375)
	 * - <b> (r105284)
	 * - <bdi> (T74884)
	 * - <span dir="rtl"> and <span dir="ltr"> (T37167); the attribute
	 *   filtering for <span> is done in processHeadingContent()
	 * - <s> and <strike> (T35715)
	 * - <q> (T251672)
	 */
	private const ALLOWED_NODES_IN_ANCHOR = [ 'span', 'sup', 'sub', 'i', 'b', 'bdi', 's', 'strike', 'q' ];

	/**
	 * DOM-based counterpart of the regexp-based safe-headline transform
	 * in Parser::finalizeHeadings in core.
	 *
	 * Walks the children of $node and mutates the tree in place:
	 * - text nodes are kept untouched;
	 * - anything that is neither an Element nor a Text node is removed;
	 * - mw:LanguageVariant elements are replaced by a text node holding
	 *   their data-parsoid "src" (or the empty string);
	 * - <style> and <script> elements are removed (T198618);
	 * - every other element is processed recursively, then removed if it
	 *   ended up empty, unwrapped if it is not an allowed tag (or is a
	 *   mw:Entity <span>), or otherwise stripped of its attributes.
	 *
	 * @param Node $node
	 */
	private static function processHeadingContent( Node $node ): void {
		for ( $child = $node->firstChild; $child; $child = $following ) {
			// Remember the sibling first: $child may be detached below.
			$following = $child->nextSibling;

			if ( $child instanceof Text ) {
				continue;
			}

			if ( !( $child instanceof Element ) ) {
				// Only elements and text are of interest here.
				$node->removeChild( $child );
				continue;
			}

			$tag = DOMUtils::nodeName( $child );

			if ( DOMUtils::hasTypeOf( $child, 'mw:LanguageVariant' ) ) {
				// -{...}- markup is represented by its recorded source text.
				$variantSrc = DOMDataUtils::getDataParsoid( $child )->src ?? '';
				$node->replaceChild( $node->ownerDocument->createTextNode( $variantSrc ), $child );
				continue;
			}

			if ( $tag === 'style' || $tag === 'script' ) {
				// <style> and <script> never contribute to the heading line (T198618)
				$node->removeChild( $child );
				continue;
			}

			self::processHeadingContent( $child );

			if ( !$child->firstChild ) {
				// Nothing is left inside this element, so drop it.
				$node->removeChild( $child );
				continue;
			}

			$isSpan = ( $tag === 'span' );
			$keepWrapper = in_array( $tag, self::ALLOWED_NODES_IN_ANCHOR, true ) &&
				!( $isSpan && DOMUtils::hasTypeOf( $child, 'mw:Entity' ) );

			if ( !$keepWrapper ) {
				// Hoist the children into $node (using $following as the
				// reference sibling), resume from whatever now follows
				// $child, and discard the emptied wrapper.
				DOMUtils::migrateChildren( $child, $node, $following );
				$following = $child->nextSibling;
				$node->removeChild( $child );
				continue;
			}

			// Allowed tags lose every attribute, except that <span> may keep
			// dir="ltr" / dir="rtl" so that toc items can set directionality.
			foreach ( DOMCompat::attributes( $child ) as $attrName => $attrValue ) {
				$isDirection = $isSpan && $attrName === 'dir' &&
					( $attrValue === 'ltr' || $attrValue === 'rtl' );
				if ( !$isDirection ) {
					$child->removeAttribute( $attrName );
				}
			}
		}
	}

	/**
	 * Assign to headings the anchor ids that the PHP parser would assign,
	 * so that existing links into a page remain valid in Parsoid HTML.
	 *
	 * Non-heading nodes are left untouched. For a heading, the sanitized
	 * heading line and the link anchor are recorded in the temporary
	 * data-parsoid "section" entry. If the heading already has an id, that
	 * id is reused (and flagged via "reusedId"); otherwise an id attribute
	 * is set and, when the fallback id differs from it, a
	 * <span typeof="mw:FallbackId"> is inserted as the first child.
	 *
	 * @param Node $node
	 * @param DTState $state
	 * @return bool Always true.
	 */
	public static function genAnchors( Node $node, DTState $state ): bool {
		if ( !DOMUtils::isHeading( $node ) ) {
			return true;
		}
		'@phan-var Element $node'; /** @var Element $node */

		// Work on a deep copy so that stripping tags and attributes does not
		// affect the real heading. No data-attribs are stored on the copy
		// since processHeadingContent would strip them anyway.
		$clone = DOMDataUtils::cloneNode( $node, true );
		'@phan-var Element $clone'; // @var Element $clone
		self::processHeadingContent( $clone );
		$line = trim( DOMCompat::getInnerHTML( $clone ) );

		$dp = DOMDataUtils::getDataParsoid( $node );
		$tmp = $dp->getTemp();

		if ( $node->hasAttribute( 'id' ) ) {
			// An existing id wins: no anchor id can be generated.
			$existingId = DOMCompat::getAttribute( $node, 'id' );
			$dp->reusedId = true;
			$tmp->section = [
				'line' => $line,
				'linkAnchor' => $existingId,
			];
			return true;
		}

		// textContent drops all remaining tags from the sanitized copy.
		$anchorText = self::normalizeSectionName(
			Sanitizer::normalizeSectionNameWhiteSpace( $clone->textContent ),
			$state->env
		);

		# NOTE: Parsoid defaults to html5 mode. Replicating legacy output
		# would have to be handled explicitly.
		$anchorId = Sanitizer::escapeIdForAttribute( $anchorText );
		$linkAnchorId = Sanitizer::escapeIdForLink( $anchorText );
		$fallbackId = Sanitizer::escapeIdForAttribute( $anchorText, Sanitizer::ID_FALLBACK );

		// Uniqueness of ids is enforced later, in a post-processing step.
		$node->setAttribute( 'id', $anchorId );
		$tmp->section = [
			'line' => $line,
			'linkAnchor' => $linkAnchorId,
		];

		// A fallback id identical to the primary id is not needed.
		if ( $anchorId !== $fallbackId && $fallbackId ) {
			$fallbackSpan = $node->ownerDocument->createElement( 'span' );
			$fallbackSpan->setAttribute( 'id', $fallbackId );
			DOMUtils::addTypeOf( $fallbackSpan, 'mw:FallbackId' );

			$headingDsr = $dp->dsr ?? null;
			if ( Utils::isValidDSR( $headingDsr ) ) {
				// The fallback span gets a zero-width dsr range located at
				// the heading's inner start offset.
				$start = $headingDsr->innerStart();
				DOMDataUtils::getDataParsoid( $fallbackSpan )->dsr = new DomSourceRange(
					$start, $start, null, null, source: $headingDsr->source
				);
			}

			$node->insertBefore( $fallbackSpan, $node->firstChild );
		}

		return true;
	}

	/**
	 * Normalize a section name by round-tripping it through a title
	 * fragment. See Parser::normalizeSectionName in Parser.php and T90902.
	 *
	 * @param string $text
	 * @param Env $env
	 * @return string The fragment of the title built from "#$text", or
	 *   $text unchanged when a TitleException is thrown.
	 */
	private static function normalizeSectionName( string $text, Env $env ): string {
		try {
			return $env->makeTitleFromURLDecodedStr( "#{$text}" )->getFragment();
		} catch ( TitleException ) {
			return $text;
		}
	}

	/**
	 * Make the id of a heading (or of a fallback-id span) unique by
	 * appending a numeric "_N" suffix when its case-folded id has already
	 * been seen.
	 *
	 * Ids are tracked in $state->seenIds, keyed by the lowercased id. The
	 * first node seen with an id only records it. Later nodes with the same
	 * folded id are renamed only if they are headings or fallback-id spans;
	 * for headings the stored "linkAnchor" receives the same suffix.
	 *
	 * NOTE: This is not completely compliant with how the PHP parser does
	 * it. If the id exists elsewhere in the document, the heading gets a
	 * suffixed id here, whereas the PHP parser processes headings in
	 * textual order and can introduce duplicate ids in the process.
	 * This behavior is believed to be more consistent for that edge case,
	 * and it matches the PHP parser in the common case where heading ids
	 * do not conflict with ids elsewhere.
	 * FIXME: Maybe we should lint this issue away
	 *
	 * @param Node $node
	 * @param DTState $state
	 * @return bool Always true.
	 */
	public static function dedupeHeadingIds( Node $node, DTState $state ): bool {
		if ( !( $node instanceof Element ) ) {
			return true;
		}

		$id = DOMCompat::getAttribute( $node, 'id' );
		if ( $id === null ) {
			return true;
		}

		// IE 7 required attributes to be case-insensitively unique (T12721)
		// but did not support non-ASCII IDs. IE 7 is no longer supported,
		// yet changing the algorithm would change the fragment URLs, so the
		// case folding and matching must stay exactly as they are to
		// preserve external links to the page.
		$folded = strtolower( $id );
		$seen = &$state->seenIds;

		if ( !isset( $seen[$folded] ) ) {
			$seen[$folded] = 1;
			return true;
		}

		// Only headings and legacy links (first children of a heading)
		// are ever renamed.
		$isHeading = DOMUtils::isHeading( $node );
		if ( !$isHeading && !WTUtils::isFallbackIdSpan( $node ) ) {
			return true;
		}

		// Advance the counter until the suffixed key is unused too.
		$counter = ++$seen[$folded];
		while ( !empty( $seen[$folded . '_' . $counter] ) ) {
			$seen[$folded]++;
			$counter++;
		}

		$tail = '_' . $counter;
		$node->setAttribute( 'id', $id . $tail );

		if ( $isHeading ) {
			$tmp = DOMDataUtils::getDataParsoid( $node )->getTemp();
			$previousAnchor = $tmp->section['linkAnchor'];
			$tmp->section['linkAnchor'] = $previousAnchor . $tail;
		}

		$seen[$folded . $tail] = 1;
		return true;
	}
}