Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 817 |
|
0.00% |
0 / 16 |
CRAP | |
0.00% |
0 / 1 |
| LinkHandlerUtils | |
0.00% |
0 / 817 |
|
0.00% |
0 / 16 |
89102 | |
0.00% |
0 / 1 |
| splitLinkContentString | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
30 | |||
| getHref | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
56 | |||
| normalizeIWP | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| escapeLinkTarget | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 | |||
| getContentString | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
42 | |||
| getLinkRoundTripData | |
0.00% |
0 / 130 |
|
0.00% |
0 / 1 |
3192 | |||
| escapeExtLinkURL | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
| addColonEscape | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
30 | |||
| isURLLink | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
56 | |||
| hasAutoUrlTerminatingChars | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
| isSimpleWikiLink | |
0.00% |
0 / 49 |
|
0.00% |
0 / 1 |
306 | |||
| serializeAsWikiLink | |
0.00% |
0 / 126 |
|
0.00% |
0 / 1 |
2352 | |||
| serializeAsExtLink | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
72 | |||
| linkHandler | |
0.00% |
0 / 68 |
|
0.00% |
0 / 1 |
380 | |||
| figureHandler | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 | |||
| figureToConstrainedText | |
0.00% |
0 / 339 |
|
0.00% |
0 / 1 |
12882 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Html2Wt; |
| 5 | |
| 6 | use stdClass; |
| 7 | use UnexpectedValueException; |
| 8 | use Wikimedia\Parsoid\Config\Env; |
| 9 | use Wikimedia\Parsoid\Core\MediaStructure; |
| 10 | use Wikimedia\Parsoid\DOM\Element; |
| 11 | use Wikimedia\Parsoid\DOM\Node; |
| 12 | use Wikimedia\Parsoid\DOM\Text; |
| 13 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\AutoURLLinkText; |
| 14 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ConstrainedText; |
| 15 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ExtLinkText; |
| 16 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\MagicLinkText; |
| 17 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\WikiLinkText; |
| 18 | use Wikimedia\Parsoid\Html2Wt\DOMHandlers\FallbackHTMLHandler; |
| 19 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
| 20 | use Wikimedia\Parsoid\NodeData\TempData; |
| 21 | use Wikimedia\Parsoid\Utils\DOMCompat; |
| 22 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
| 23 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 24 | use Wikimedia\Parsoid\Utils\PHPUtils; |
| 25 | use Wikimedia\Parsoid\Utils\TokenUtils; |
| 26 | use Wikimedia\Parsoid\Utils\UrlUtils; |
| 27 | use Wikimedia\Parsoid\Utils\Utils; |
| 28 | use Wikimedia\Parsoid\Utils\WTUtils; |
| 29 | use Wikimedia\Parsoid\Wt2Html\TokenizerUtils; |
| 30 | |
| 31 | /** |
| 32 | * Serializes link markup. |
| 33 | */ |
| 34 | class LinkHandlerUtils { |
| 35 | private const REDIRECT_TEST_RE = '/^([ \t\n\r\0\x0b])*$/D'; |
| 36 | private const MW_TITLE_WHITESPACE_RE |
| 37 | = '/[ _\xA0\x{1680}\x{180E}\x{2000}-\x{200A}\x{2028}\x{2029}\x{202F}\x{205F}\x{3000}]+/u'; |
| 38 | |
/**
 * Peel the recorded link tail and link prefix off a content string.
 *
 * The tail is removed first; the prefix is then looked for in whatever
 * remains. A prefix or tail that is missing from $dp, is empty, or does
 * not actually sit at the corresponding end of the string is reported
 * back as ''.
 *
 * @param string $contentString Text to split
 * @param DataParsoid $dp Containing ->prefix and ->tail
 * @return stdClass With string properties `contentString` (what is left
 *   after stripping), `tail` and `prefix` (the pieces actually removed)
 */
private static function splitLinkContentString( string $contentString, DataParsoid $dp ): stdClass {
	$strippedTail = '';
	$strippedPrefix = '';

	$wantedTail = $dp->tail ?? '';
	if ( $wantedTail !== '' && str_ends_with( $contentString, $wantedTail ) ) {
		// strip the tail off the content
		$strippedTail = $wantedTail;
		$contentString = substr( $contentString, 0, -strlen( $wantedTail ) );
	}

	$wantedPrefix = $dp->prefix ?? '';
	if ( $wantedPrefix !== '' && str_starts_with( $contentString, $wantedPrefix ) ) {
		// strip the prefix off what is left
		$strippedPrefix = $wantedPrefix;
		$contentString = substr( $contentString, strlen( $wantedPrefix ) );
	}

	return (object)[
		'contentString' => $contentString,
		'tail' => $strippedTail,
		'prefix' => $strippedPrefix,
	];
}
| 71 | |
/**
 * Read the node's href, resolving host-less absolute paths where possible.
 *
 * An href that starts with a single '/' (but not '//') has no protocol or
 * host. For such values, each interwiki map entry that is flagged
 * `localinterwiki` and has a `url` is tried as a base: the href is
 * expanded against it, and the first expansion that fits the entry's URL
 * pattern (with `$1` standing for any text) is returned.
 *
 * @param Env $env
 * @param Element $node
 * @return string The expanded URL, or the href attribute unchanged
 *   ('' when the attribute is absent)
 */
private static function getHref( Env $env, Element $node ): string {
	$href = DOMCompat::getAttribute( $node, 'href' ) ?? '';

	$isHostlessAbsolute = str_starts_with( $href, '/' ) && !str_starts_with( $href, '//' );
	if ( !$isHostlessAbsolute ) {
		return $href;
	}

	// protocol-less but absolute. let's find a base href
	foreach ( $env->getSiteConfig()->interwikiMapNoNamespaces() as $entry ) {
		if ( !isset( $entry['localinterwiki'] ) || !isset( $entry['url'] ) ) {
			continue;
		}
		$baseUrl = $entry['url'];

		// evaluate the url relative to this base
		$expanded = UrlUtils::expandUrl( $href, $baseUrl );

		// can this match the pattern?
		$pattern = '/^' . strtr( preg_quote( $baseUrl, '/' ), [ '\\$1' => '.*' ] ) . '$/sD';
		if ( preg_match( $pattern, $expanded ) ) {
			return $expanded;
		}
	}

	return $href;
}
| 102 | |
/**
 * Normalize an interwiki prefix: lower-case it, trim surrounding
 * whitespace, then hand it to PHPUtils::stripPrefix() to drop a
 * leading ':'.
 *
 * @param string $str
 * @return string
 */
private static function normalizeIWP( string $str ): string {
	$lowered = strtolower( $str );
	$trimmed = trim( $lowered );
	return PHPUtils::stripPrefix( $trimmed, ':' );
}
| 111 | |
/**
 * Entity-escape a link target and report whether the result is usable.
 *
 * @param string $linkTarget
 * @param SerializerState $state
 * @return stdClass With `linkTarget` (the escaped string) and
 *   `invalidLink` (bool, true when the escaped target must not be
 *   emitted as a link target)
 */
private static function escapeLinkTarget( string $linkTarget, SerializerState $state ): stdClass {
	// Entity-escape the content.
	$escaped = Utils::escapeWtEntities( $linkTarget );

	// Is this an invalid link?
	// `isValidLinkTarget` omits fragments (the part after #) so,
	// even though "|" is an invalid character, we still need to ensure
	// it doesn't appear in there. The percent encoded version is fine
	// in the fragment, since it won't break the parse.
	$isInvalid = !$state->getEnv()->isValidLinkTarget( $escaped ) ||
		str_contains( $escaped, '|' );

	return (object)[
		'linkTarget' => $escaped,
		'invalidLink' => $isInvalid,
	];
}
| 132 | |
/**
 * Flatten the node's children into plain text, when that is possible.
 *
 * Text nodes contribute their value, mw:DisplaySpace children contribute
 * a single space, and diff markers are skipped. Any other kind of child
 * makes the content non-textual and the result null.
 *
 * NOTE: This function seems a little inconsistent about what's considered
 * null and what's an empty string. For example, no children is null
 * but a single diffMarker gets a string? One of the current callers
 * seems to subtly depend on that though.
 *
 * FIXME(T254501): This function can return `$node->textContent` instead
 * of the string concatenation once mw:DisplaySpace is preprocessed away.
 *
 * @param Node $node
 * @return ?string Null when there are no children or a child cannot be
 *   represented as text
 */
private static function getContentString( Node $node ): ?string {
	if ( !$node->hasChildNodes() ) {
		return null;
	}

	$text = '';
	for ( $kid = $node->firstChild; $kid; $kid = $kid->nextSibling ) {
		if ( $kid instanceof Text ) {
			$text .= $kid->nodeValue;
			continue;
		}
		if ( DOMUtils::hasTypeOf( $kid, 'mw:DisplaySpace' ) ) {
			$text .= ' ';
			continue;
		}
		if ( !DiffUtils::isDiffMarker( $kid ) ) {
			// Anything else cannot be expressed as a plain string
			return null;
		}
		// Diff markers add nothing to the text
	}

	return $text;
}
| 166 | |
| 167 | /**
| 168 | * Collect the round-trip data needed to serialize a link node.
| | *
| | * The returned object always carries:
| | * - type: link type derived from the rel attribute ('mw:WikiLink' and
| | * 'mw:ExtLink' have any subtype stripped; other matches keep the full
| | * matched value). Falls back to 'mw:ExtLink' when nothing matched and
| | * DOMUtils::selectMediaElt() finds nothing; may otherwise stay null.
| | * - origHref: the href as returned by self::getHref().
| | * - href: origHref with a leading run of './' or '../' segments removed
| | * (for mw:MediaLink it is derived from the resource value instead).
| | * - target: the value returned by serializedAttrVal() for 'href'; the
| | * code below reads its 'value', 'modified' and 'fromsrc' keys.
| | * - tail, prefix: link trail / prefix strings.
| | * - content: stdClass; ->string is set when the content is plain text.
| | * - linkType: initialized to null.
| | *
| | * Depending on the path taken it may additionally carry contentModified,
| | * contentNode, isRedirect, isInterwiki, isInterwikiLang and isLocal.
| | *
| 169 | * @param Env $env
| 170 | * @param Element $node
| 171 | * @param SerializerState $state
| 172 | * @return stdClass
| 173 | */
| 174 | private static function getLinkRoundTripData(
| 175 | Env $env, Element $node, SerializerState $state
| 176 | ): stdClass {
| 177 | $dp = DOMDataUtils::getDataParsoid( $node );
| 178 | $siteConfig = $env->getSiteConfig();
| 179 | $rtData = (object)[
| 180 | 'type' => null, // could be null
| 181 | 'href' => null, // filled in below
| 182 | 'origHref' => null, // filled in below
| 183 | 'target' => null, // filled in below
| 184 | 'tail' => $dp->tail ?? '',
| 185 | 'prefix' => $dp->prefix ?? '',
| 186 | 'linkType' => null // not assigned anywhere else in this function
| 187 | ];
| 188 | $rtData->content = new stdClass;
| 189 | $isIW = false;
| 190 | |
| 191 | // Figure out the type of the link
| 192 | if ( $node->hasAttribute( 'rel' ) ) {
| 193 | $rel = DOMCompat::getAttribute( $node, 'rel' ) ?? '';
| 194 | // Parsoid only emits and recognizes ExtLink, WikiLink, MediaLink and PageProp rel values.
| 195 | // Everything else defaults to ExtLink during serialization (unless it is
| 196 | // serializable to a wikilink)
| 197 | // We're keeping the preg_match here instead of going through DOMUtils::matchRel
| 198 | // because we have \b guards to handle the multivalue, and we're keeping the matches,
| 199 | // which matchRel doesn't do.
| 200 | if ( preg_match( '/\b(mw:(WikiLink|ExtLink|MediaLink|PageProp)(\S*))\b/', $rel, $typeMatch ) ) {
| 201 | $rtData->type = $typeMatch[1];
| 202 | // Strip link subtype info
| 203 | if ( $typeMatch[2] === 'WikiLink' || $typeMatch[2] === 'ExtLink' ) {
| 204 | $rtData->type = 'mw:' . $typeMatch[2];
| 205 | }
| 206 | $isIW = (
| 207 | ( $typeMatch[2] === 'WikiLink' && ( $typeMatch[3] ?? '' ) === '/Interwiki' ) ||
| 208 | // TODO: Remove this when we no longer have to worry about Flow boards
| 209 | ( $typeMatch[2] === 'ExtLink' && ( $dp->isIW ?? false ) )
| 210 | );
| 211 | }
| 212 | }
| 213 | |
| 214 | // Default link type if nothing else is set
| 215 | if ( $rtData->type === null && !DOMUtils::selectMediaElt( $node ) ) {
| 216 | $rtData->type = 'mw:ExtLink';
| 217 | }
| 218 | |
| 219 | // Get href, and save the token's "real" href for comparison
| 220 | $href = self::getHref( $env, $node );
| 221 | $rtData->origHref = $href;
| 222 | $rtData->href = preg_replace( '#^(\.\.?/)+#', '', $href, 1 );
| 223 | |
| 224 | // WikiLinks should be relative (but see below); fixup the link type
| 225 | // if a WikiLink has an absolute URL.
| 226 | // (This may get converted back to a WikiLink below, in the interwiki
| 227 | // handling code.)
| 228 | if ( $rtData->type === 'mw:WikiLink' &&
| 229 | ( preg_match( '#^(\w+:)?//#', $rtData->href ) ||
| 230 | substr( $rtData->origHref ?? '', 0, 1 ) === '/' )
| 231 | ) {
| 232 | $rtData->type = 'mw:ExtLink';
| 233 | }
| 234 | |
| 235 | // Now get the target from rt data
| 236 | $rtData->target = $state->serializer->serializedAttrVal( $node, 'href' );
| 237 | |
| 238 | // Check if the link content has been modified or is newly inserted content.
| 239 | // FIXME: This will only work with selser of course. Hard to test without selser.
| 240 | if (
| 241 | $state->inInsertedContent ||
| 242 | DiffUtils::hasDiffMark( $node, DiffMarkers::SUBTREE_CHANGED )
| 243 | ) {
| 244 | $rtData->contentModified = true;
| 245 | }
| 246 | |
| 247 | // Get the content string or tokens
| 248 | $contentString = self::getContentString( $node );
| 249 | if ( $contentString !== null ) {
| 250 | if ( !empty( $rtData->target['value'] ) && $rtData->target['value'] !== $contentString ) {
| 251 | // Try to identify a new potential tail
| 252 | $contentParts = self::splitLinkContentString( $contentString, $dp );
| 253 | $rtData->content->string = $contentParts->contentString;
| 254 | $rtData->tail = $contentParts->tail;
| 255 | $rtData->prefix = $contentParts->prefix;
| 256 | } else {
| 257 | $rtData->tail = '';
| 258 | $rtData->prefix = '';
| 259 | $rtData->content->string = $contentString;
| 260 | }
| 261 | } elseif ( $node->hasChildNodes() ) {
| 262 | $rtData->contentNode = $node;
| 263 | } elseif ( $rtData->type === 'mw:PageProp/redirect' ) {
| 264 | $rtData->isRedirect = true;
| 265 | $rtData->prefix = $dp->src
| 266 | ?? ( ( $siteConfig->mwAliases()['redirect'][0] ?? '#REDIRECT' ) . ' ' );
| 267 | }
| 268 | |
| 269 | // Update link type based on additional analysis.
| 270 | // What might look like external links might be serializable as a wikilink.
| | // $target is a reference: writes to $target['value'] below update $rtData->target.
| 271 | $target = &$rtData->target;
| 272 | |
| 273 | // mw:MediaLink annotations are considered authoritative
| 274 | // and interwiki link matches aren't made for these
| 275 | if ( $rtData->type === 'mw:MediaLink' ) {
| 276 | // Parse title from resource attribute (see analog in image handling)
| 277 | $resource = $state->serializer->serializedAttrVal( $node, 'resource' );
| 278 | if ( $resource['value'] === null ) {
| 279 | // from non-parsoid HTML: try to reconstruct resource from href?
| 280 | // (See similar code which tries to guess resource from <img src>)
| 281 | $mediaPrefix = $siteConfig->namespaceName( $siteConfig->namespaceId( 'media' ) );
| 282 | $slashPos = strrpos( $rtData->origHref, '/' );
| 283 | $fileName = $slashPos === false ? $rtData->origHref :
| 284 | substr( $rtData->origHref, $slashPos + 1 );
| 285 | $resource = [
| 286 | 'value' => $mediaPrefix . ':' . $fileName,
| 287 | 'fromsrc' => false,
| 288 | 'modified' => false
| 289 | ];
| 290 | }
| 291 | $rtData->target = $resource;
| 292 | $rtData->href = preg_replace( '#^(\.\.?/)+#', '', $rtData->target['value'], 1 );
| 293 | return $rtData;
| 294 | }
| 295 | |
| 296 | // Check if the href matches any of our interwiki URL patterns
| 297 | $interwikiMatch = $siteConfig->interwikiMatcher( $href );
| 298 | if ( !$interwikiMatch ) {
| 299 | if ( $isIW ) {
| 300 | // If this is an interwiki but we can't find it then ignore the
| 301 | // data-parsoid href (which is probably just the interwiki link again)
| 302 | // and use the href from the <a> tag.
| | // NOTE(review): this replaces the array-shaped target (read via
| | // ['value'] elsewhere in this function) with a plain string or null;
| | // confirm that callers cope with that shape.
| 303 | $rtData->target = DOMCompat::getAttribute( $node, 'href' );
| 304 | }
| 305 | return $rtData;
| 306 | }
| 307 | |
| | // From here on $interwikiMatch[0] is used as the interwiki prefix and
| | // $interwikiMatch[1] as the target within that wiki.
| 308 | $iw = $siteConfig->interwikiMapNoNamespaces()[ltrim( $interwikiMatch[0], ':' )]; // NOTE(review): keyed via ltrim(), unlike the normalizeIWP() lookups below; confirm
| 309 | $localInterwiki = !empty( $iw['local'] );
| 310 | |
| 311 | // Only to be used in question mark check, since other checks want to include the fragment
| 312 | $targetForQmarkCheck = $interwikiMatch[1];
| 313 | // FIXME: If ever the default value for $wgExternalInterwikiFragmentMode
| 314 | // changes, we can reduce this by always stripping off the fragment
| 315 | // identifier, since in "html5" mode, that isn't encoded. At present,
| 316 | // we can only do that if we know it's a local interwiki link.
| 317 | if ( $localInterwiki ) {
| 318 | $withoutFragment = strstr( $targetForQmarkCheck, '#', true );
| 319 | if ( $withoutFragment !== false ) {
| 320 | $targetForQmarkCheck = $withoutFragment;
| 321 | }
| 322 | }
| 323 | |
| 324 | if (
| 325 | // Question mark is a valid title char, so it won't fail the test below,
| 326 | // but gets percent encoded on the way out since it has special
| 327 | // semantics in a url. That will break the url we're serializing, so
| 328 | // protect it.
| 329 | strpos( $targetForQmarkCheck, '?' ) === false &&
| 330 | // Ensure we have a valid link target, otherwise falling back to extlink
| 331 | // is preferable, since it won't serialize as a link.
| 332 | (
| 333 | $interwikiMatch[1] === '' || !self::escapeLinkTarget(
| 334 | // Append the prefix since we want to validate the target
| 335 | // with respect to it being an interwiki.
| 336 | $interwikiMatch[0] . ':' . $interwikiMatch[1],
| 337 | $state
| 338 | )->invalidLink
| 339 | ) &&
| 340 | // ExtLinks should have content to convert.
| 341 | (
| 342 | $rtData->type !== 'mw:ExtLink' ||
| 343 | !empty( $rtData->content->string ) ||
| 344 | !empty( $rtData->contentNode )
| 345 | ) &&
| | // Only convert links marked as interwiki or whose target/content was edited.
| 346 | ( $isIW || !empty( $target['modified'] ) || !empty( $rtData->contentModified ) )
| 347 | ) {
| 348 | // External link that is really an interwiki link. Convert it.
| 349 | if ( $rtData->type === 'mw:ExtLink' ) {
| 350 | $rtData->type = 'mw:WikiLink';
| 351 | }
| 352 | $rtData->isInterwiki = true;
| 353 | $iwMap = $siteConfig->interwikiMapNoNamespaces();
| 354 | // could this be confused with a language link?
| 355 | $iwi = $iwMap[self::normalizeIWP( $interwikiMatch[0] )] ?? null;
| 356 | $rtData->isInterwikiLang = $iwi && isset( $iwi['language'] );
| 357 | // is this our own wiki?
| 358 | $rtData->isLocal = $iwi && isset( $iwi['localinterwiki'] );
| 359 | // strip off localinterwiki prefixes
| | // ($localPrefix accumulates the leading local prefixes; the loop exits
| | // with $oldPrefix holding the first non-local prefix match, or an
| | // empty match array when no further "prefix:" is present.)
| 360 | $localPrefix = '';
| 361 | $oldPrefix = null;
| 362 | while ( true ) {
| 363 | $tmp = substr( $target['value'], strlen( $localPrefix ) );
| 364 | if ( !preg_match( '/^(:?([^:]+)):/', $tmp, $oldPrefix ) ) {
| 365 | break;
| 366 | }
| 367 | $iwi = $iwMap[Utils::normalizeNamespaceName( $oldPrefix[2] )] ?? null;
| 368 | if ( !$iwi || !isset( $iwi['localinterwiki'] ) ) {
| 369 | break;
| 370 | }
| 371 | $localPrefix .= $oldPrefix[1] . ':';
| 372 | }
| 373 | |
| 374 | if ( !empty( $target['fromsrc'] ) && empty( $target['modified'] ) ) {
| 375 | // Leave the target alone!
| 376 | } else {
| 377 | if ( $rtData->type === 'mw:PageProp/Language' ) {
| 378 | $targetValue = implode( ':', $interwikiMatch );
| 379 | // Strip initial colon
| 380 | if ( $targetValue[0] === ':' ) {
| 381 | $targetValue = substr( $targetValue, 1 );
| 382 | }
| 383 | $target['value'] = $targetValue;
| 384 | } elseif (
| 385 | $oldPrefix && ( // Should we preserve the old prefix?
| 386 | strcasecmp( $oldPrefix[1], $interwikiMatch[0] ) === 0 ||
| 387 | // Check if the old prefix mapped to the same URL as
| 388 | // the new one. Use the old one if that's the case.
| 389 | // Example: [[w:Foo]] vs. [[:en:Foo]]
| 390 | ( $iwMap[self::normalizeIWP( $oldPrefix[1] )]['url'] ?? null )
| 391 | === ( $iwMap[self::normalizeIWP( $interwikiMatch[0] )]['url'] ?? null )
| 392 | )
| 393 | ) {
| 394 | // Reuse old prefix capitalization
| 395 | if ( Utils::decodeWtEntities( substr( $target['value'], strlen( $oldPrefix[1] ) + 1 ) )
| 396 | !== $interwikiMatch[1]
| 397 | ) {
| 398 | // Modified, update target.value.
| 399 | $target['value'] = $localPrefix . $oldPrefix[1] . ':' . $interwikiMatch[1];
| 400 | }
| 401 | // Ensure that we generate an interwiki link and not a language link!
| 402 | if ( $rtData->isInterwikiLang && $target['value'][0] !== ':' ) {
| 403 | $target['value'] = ':' . $target['value'];
| 404 | }
| 405 | } else { // Else: preserve old encoding
| 406 | if ( !empty( $rtData->isLocal ) ) {
| 407 | // - interwikiMatch[0] will be something like ":en" or "w"
| 408 | // - This tests whether the interwiki-like link is actually
| 409 | // a local wikilink.
| 410 | |
| 411 | $target['value'] = $interwikiMatch[1];
| 412 | // interwikiMatch[1] may start with a language link prefix,
| 413 | // ensure that we generate interwiki link syntax in that case. (T292022)
| 414 | if (
| 415 | preg_match( '/^([^:]+):/', $target['value'], $match ) &&
| 416 | !empty( $iwMap[self::normalizeIWP( $match[1] )]['language'] )
| 417 | ) {
| 418 | $target['value'] = ':' . $target['value'];
| 419 | }
| 420 | |
| 421 | $rtData->isInterwiki = $rtData->isInterwikiLang = false;
| 422 | } else {
| 423 | $target['value'] = implode( ':', $interwikiMatch );
| 424 | }
| 425 | }
| 426 | }
| 427 | }
| 428 | |
| 429 | return $rtData;
| 430 | }
| 431 | |
/**
 * The provided URL is already percent-encoded -- but it may still
 * not be safe for wikitext. Add additional escapes to make the URL
 * wikitext-safe. Don't touch percent escapes already in the url,
 * though!
 *
 * Works in two passes: first every character that may not appear raw in
 * an external link URL is replaced by Utils::entityEncodeAll(); then the
 * square brackets around a leading IPv6 host, which the first pass has
 * just encoded, are turned back into literal brackets.
 *
 * @param string $urlStr
 * @return string
 */
private static function escapeExtLinkURL( string $urlStr ): string {
	// this regexp is the negation of EXT_LINK_URL_CLASS in the PHP parser;
	// it additionally matches a "-" that is immediately followed by "{".
	$encoded = preg_replace_callback(
		// phpcs:ignore Generic.Files.LineLength.TooLong
		'/[\]\[<>"\x00-\x20\x7F\x{A0}\x{1680}\x{180E}\x{2000}-\x{200A}\x{202F}\x{205F}\x{3000}]|-(?=\{)/u',
		static function ( $m ) {
			return Utils::entityEncodeAll( $m[0] );
		},
		$urlStr
	);

	// IPv6 host names are bracketed with []. Entity-decode these.
	// By this point the brackets are no longer literal characters, so the
	// pattern has to look for their encoded form. This assumes
	// Utils::entityEncodeAll() writes them as hexadecimal numeric entities
	// (&#x5B; / &#x5D;); the /i flag accepts either letter case.
	return preg_replace(
		'!^([a-z][^:/]*:)?//&#x5B;([0-9a-f:.]+)&#x5D;(:\d|/|$)!iD',
		'$1//[$2]$3',
		$encoded,
		1
	);
}
| 457 | |
/**
 * Prefix a wikilink target with ':' when it would otherwise be read as a
 * category or file inclusion.
 *
 * The colon is added only when the target's title is in the category or
 * file namespace, the link type is 'mw:WikiLink', and the target does not
 * already begin with ':'.
 *
 * @param Env $env
 * @param string $linkTarget
 * @param stdClass $linkData Only ->type is read
 * @return string The target, colon-escaped if needed
 */
private static function addColonEscape(
	Env $env, string $linkTarget, stdClass $linkData
): string {
	$targetNs = $env->makeTitleFromText( $linkTarget )->getNamespace();
	$siteConfig = $env->getSiteConfig();
	$escapableNamespaces = [
		$siteConfig->canonicalNamespaceId( 'category' ),
		$siteConfig->canonicalNamespaceId( 'file' ),
	];

	// Escape category and file links
	$needsColon = in_array( $targetNs, $escapableNamespaces, true ) &&
		$linkData->type === 'mw:WikiLink' &&
		$linkTarget[0] !== ':';

	return $needsColon ? ':' . $linkTarget : $linkTarget;
}
| 481 | |
/**
 * Check whether the link can be written as a bare URL, i.e. its visible
 * text is itself the URL it points to.
 *
 * @param Env $env
 * @param Element $node
 * @param stdClass $linkData Only ->target is read
 * @return bool
 */
private static function isURLLink( Env $env, Element $node, stdClass $linkData ): bool {
	$target = $linkData->target;

	// Get plain text content, if any
	$text = self::getContentString( $node );
	if ( $text === null || $text === '' ) {
		return false;
	}

	// Can we minimize this? The text has to equal the target or the href.
	if ( $target['value'] !== $text && self::getHref( $env, $node ) !== $text ) {
		return false;
	}

	// protocol-relative url links not allowed in text
	// (see autourl rule in peg tokenizer, T32269)
	if ( str_starts_with( $text, '//' ) ) {
		return false;
	}
	if ( !Utils::isProtocolValid( $text, $env ) ) {
		return false;
	}

	return !self::hasAutoUrlTerminatingChars( $text );
}
| 504 | |
/**
 * The legacy parser Parser.php::makeFreeExternalLink terminates an autourl when encountering
 * some characters; since we wish to mimic that behaviour we need this method to check whether
 * the provided URL is in that case.
 *
 * The set of terminating characters is obtained from
 * TokenizerUtils::getAutoUrlTerminatingChars(), which is told whether the
 * URL contains a '('. Only the final character of the URL is tested.
 *
 * @param string $url
 * @return bool
 */
private static function hasAutoUrlTerminatingChars( string $url ): bool {
	$hasOpenParen = str_contains( $url, '(' );
	$terminators = TokenizerUtils::getAutoUrlTerminatingChars( $hasOpenParen );
	$lastChar = substr( $url, -1 );
	return str_contains( $terminators, $lastChar );
}
| 516 | |
/**
 * Figure out if we need a piped or simple link.
 *
 * Returns true when the link's plain-text content matches its target
 * closely enough that no piped label is required.
 *
 * @param Env $env
 * @param DataParsoid $dp Only ->stx is read
 * @param array $target Its 'value' and 'modified' keys are read
 * @param stdClass $linkData ->content->string, ->contentModified,
 *   ->isInterwiki and ->href are read
 * @return bool
 */
private static function isSimpleWikiLink(
	Env $env, DataParsoid $dp, array $target, stdClass $linkData
): bool {
	// FIXME (SSS):
	// 1. Revisit this logic to see if all these checks
	// are still relevant or whether this can be simplified somehow.
	// 2. There are also duplicate computations for env.normalizedTitleKey(..)
	// and Util.decodeURIComponent(..) that could be removed.

	$label = $linkData->content->string ?? null;

	// Would need to pipe for any non-string content.
	if ( $label === null ) {
		return false;
	}

	// Preserve unmodified or non-minimal piped links.
	$mayChangeForm = !empty( $target['modified'] ) ||
		!empty( $linkData->contentModified ) ||
		( $dp->stx ?? null ) !== 'piped';
	if ( !$mayChangeForm ) {
		return false;
	}

	// Relative links are not simple
	if ( str_starts_with( $label, './' ) ) {
		return false;
	}

	// Strip colon escapes from the original target as that is
	// stripped when deriving the content string.
	// Strip ./ prefixes as well since they are relative link prefixes
	// added to all titles.
	// The prefix stripping, when it occurs, also includes spaces before the prefix.
	// Finally, we also remove trailing spaces because these are removed for <a> links
	// by DOMNormalizer::moveTrailingSpacesOut, and we wouldn't want that to lead to the
	// link getting piped for only that reason.
	$bareTarget = rtrim(
		preg_replace( '#^\s*(:|\./)#', '', $target['value'], 1 )
	);

	// Strip colon escape after prefix for interwikis
	if ( !empty( $linkData->isInterwiki ) ) {
		$bareTarget = preg_replace( '#^(\w+:):#', '$1', $bareTarget, 1 );
	}

	$decodedTarget = Utils::decodeWtEntities( $bareTarget );
	// Deal with the protocol-relative link scenario as well
	$hrefHasProto = preg_match( '#^(\w+:)?//#', $linkData->href );

	// Normalize content string and decoded target before comparison.
	// Piped links don't come down this path => it is safe to normalize both.
	$label = str_replace( '_', ' ', $label );
	$decodedTarget = str_replace( '_', ' ', $decodedTarget );

	// See if the (normalized) content matches the
	// target, either shadowed or actual.
	if ( $label === $decodedTarget ) {
		return true;
	}

	// try wrapped in forward slashes in case they were stripped
	if ( ( '/' . $label . '/' ) === $decodedTarget ) {
		return true;
	}

	// normalize as titles and compare
	// FIXME: This will strip an interwiki prefix. Is that right?
	if (
		$env->normalizedTitleKey( $label, true )
			=== preg_replace( self::MW_TITLE_WHITESPACE_RE, '_', $decodedTarget )
	) {
		return true;
	}

	// Relative link, resolved against the context title
	if (
		$env->getSiteConfig()->namespaceHasSubpages(
			$env->getContextTitle()->getNamespace()
		) &&
		preg_match( '#^\.\./.*[^/]$#D', $bareTarget ) &&
		$label === $env->resolveTitle( $bareTarget )
	) {
		return true;
	}

	// Relative link with a trailing slash: compare against the part
	// between the leading ../ run and that slash
	if (
		preg_match( '#^\.\./.*?/$#D', $bareTarget ) &&
		$label === preg_replace( '#^(?:\.\./)+(.*?)/$#D', '$1', $bareTarget, 1 )
	) {
		return true;
	}

	// if content == href this could be a simple link... eg [[Foo]].
	// but if href is an absolute url with protocol, this won't
	// work: [[http://example.com]] is not a valid simple link!
	if ( $hrefHasProto ) {
		return false;
	}

	// Always compare against decoded uri because
	// <a rel="mw:WikiLink" href="7%25 Solution">7%25 Solution</a></p>
	// should serialize as [[7% Solution|7%25 Solution]]
	$decodedHref = Utils::decodeURIComponent( $linkData->href );
	if ( $label === $decodedHref ) {
		return true;
	}

	// normalize with underscores for comparison with href
	return $env->normalizedTitleKey( $label, true ) === $decodedHref;
}
| 616 | |
| 617 | /** |
| 618 | * Serialize as wiki link |
| 619 | * @param Element $node |
| 620 | * @param SerializerState $state |
| 621 | * @param stdClass $linkData |
| 622 | */ |
| 623 | private static function serializeAsWikiLink( |
| 624 | Element $node, SerializerState $state, stdClass $linkData |
| 625 | ): void { |
| 626 | $contentParts = null; |
| 627 | $contentSrc = ''; |
| 628 | $isPiped = false; |
| 629 | $needsEscaping = true; |
| 630 | $env = $state->getEnv(); |
| 631 | $siteConfig = $env->getSiteConfig(); |
| 632 | $target = $linkData->target; |
| 633 | $dp = DOMDataUtils::getDataParsoid( $node ); |
| 634 | |
| 635 | // Decode any link that did not come from the source (data-mw/parsoid) |
| 636 | // Links that come from data-mw/data-parsoid will be true titles, |
| 637 | // but links that come from hrefs will need to be url-decoded. |
| 638 | // Ex: <a href="/wiki/A%3Fb">Foobar</a> |
| 639 | if ( empty( $target['fromsrc'] ) ) { |
| 640 | // Omit fragments from decoding |
| 641 | $hash = strpos( $target['value'], '#' ); |
| 642 | if ( $hash !== false ) { |
| 643 | $target['value'] = Utils::decodeURIComponent( substr( $target['value'], 0, $hash ) ) |
| 644 | . substr( $target['value'], $hash ); |
| 645 | } else { |
| 646 | $target['value'] = Utils::decodeURIComponent( $target['value'] ); |
| 647 | } |
| 648 | } |
| 649 | |
| 650 | // Special-case handling for category links |
| 651 | if ( $linkData->type === 'mw:PageProp/Category' ) { |
| 652 | // Split target and sort key in $target['value']. |
| 653 | // The sort key shows up as "#something" in there. |
| 654 | // However, watch out for parser functions that start with "{{#" |
| 655 | // The atomic group is essential to prevent "{{#" parser function prefix |
| 656 | // from getting split at the "{{" and "#" where the "{{" matches the |
| 657 | // [^#]* and the "#" matches after separately. |
| 658 | if ( preg_match( '/^((?>{{#|[^#])*)#(.*)/', $target['value'], $targetParts ) ) { |
| 659 | $target['value'] = strtr( preg_replace( '#^(\.\.?/)*#', '', $targetParts[1], 1 ), '_', ' ' ); |
| 660 | // FIXME: Reverse `Sanitizer.sanitizeTitleURI(strContent).replace(/#/g, '%23');` |
| 661 | $strContent = Utils::decodeURIComponent( $targetParts[2] ); |
| 662 | $contentParts = self::splitLinkContentString( $strContent, $dp ); |
| 663 | $linkData->content->string = $contentParts->contentString; |
| 664 | $dp->tail = $linkData->tail = $contentParts->tail; |
| 665 | $dp->prefix = $linkData->prefix = $contentParts->prefix; |
| 666 | } else { // No sort key, will serialize to simple link |
| 667 | // Normalize the content string |
| 668 | $linkData->content->string = strtr( |
| 669 | PHPUtils::stripPrefix( $target['value'], './' ), '_', ' ' |
| 670 | ); |
| 671 | } |
| 672 | |
| 673 | // Special-case handling for template-affected sort keys |
| 674 | // FIXME: sort keys cannot be modified yet, but if they are, |
| 675 | // we need to fully shadow the sort key. |
| 676 | // if ( !target.modified ) { |
| 677 | // The target and source key was not modified |
| 678 | $sortKeySrc = $state->serializer->serializedAttrVal( $node, 'mw:sortKey' ); |
| 679 | if ( isset( $sortKeySrc['value'] ) ) { |
| 680 | $linkData->contentNode = null; |
| 681 | $linkData->content->string = $sortKeySrc['value']; |
| 682 | // TODO: generalize this flag. It is already used by |
| 683 | // getAttributeShadowInfo. Maybe use the same |
| 684 | // structure as its return value? |
| 685 | $linkData->content->fromsrc = true; |
| 686 | } |
| 687 | // } |
| 688 | } else { |
| 689 | if ( $linkData->type === 'mw:PageProp/Language' ) { |
| 690 | // Fix up the content string |
| 691 | // TODO: see if linkData can be cleaner! |
| 692 | $linkData->content->string ??= Utils::decodeWtEntities( $target['value'] ); |
| 693 | } |
| 694 | } |
| 695 | |
| 696 | // The string value of the content, if it is plain text. |
| 697 | $linkTarget = null; |
| 698 | $escapedTgt = null; |
| 699 | if ( !empty( $linkData->isRedirect ) ) { |
| 700 | $linkTarget = $target['value']; |
| 701 | if ( !empty( $target['modified'] ) || empty( $target['fromsrc'] ) ) { |
| 702 | $linkTarget = strtr( preg_replace( '#^(\.\.?/)*#', '', $linkTarget, 1 ), '_', ' ' ); |
| 703 | $escapedTgt = self::escapeLinkTarget( $linkTarget, $state ); |
| 704 | $linkTarget = $escapedTgt->linkTarget; |
| 705 | // Determine if it's a redirect to a category, in which case |
| 706 | // it needs a ':' on front to distinguish from a category link. |
| 707 | if ( preg_match( '/^([^:]+)[:]/', $linkTarget, $categoryMatch ) ) { |
| 708 | $ns = $siteConfig->namespaceId( Utils::normalizeNamespaceName( $categoryMatch[1] ) ); |
| 709 | if ( $ns === $siteConfig->canonicalNamespaceId( 'category' ) ) { |
| 710 | // Check that the next node isn't a category link, |
| 711 | // in which case we don't want the ':'. |
| 712 | $nextNode = $node->nextSibling; |
| 713 | if ( !( |
| 714 | $nextNode instanceof Element && DOMCompat::nodeName( $nextNode ) === 'link' && |
| 715 | DOMUtils::hasRel( $nextNode, 'mw:PageProp/Category' ) && |
| 716 | DOMCompat::getAttribute( $nextNode, 'href' ) === DOMCompat::getAttribute( $node, 'href' ) |
| 717 | ) ) { |
| 718 | $linkTarget = ':' . $linkTarget; |
| 719 | } |
| 720 | } |
| 721 | } |
| 722 | } |
| 723 | } elseif ( self::isSimpleWikiLink( $env, $dp, $target, $linkData ) ) { |
| 724 | // Simple case |
| 725 | if ( empty( $target['modified'] ) && empty( $linkData->contentModified ) ) { |
| 726 | $linkTarget = PHPUtils::stripPrefix( $target['value'], './' ); |
| 727 | } else { |
| 728 | // If token has templated attrs or is a subpage, use target.value |
| 729 | // since content string will be drastically different. |
| 730 | if ( WTUtils::hasExpandedAttrsType( $node ) || |
| 731 | preg_match( '#(^|/)\.\./#', $target['value'] ) |
| 732 | ) { |
| 733 | $linkTarget = PHPUtils::stripPrefix( $target['value'], './' ); |
| 734 | } else { |
| 735 | $escapedTgt = self::escapeLinkTarget( $linkData->content->string, $state ); |
| 736 | if ( !$escapedTgt->invalidLink ) { |
| 737 | $linkTarget = self::addColonEscape( $env, $escapedTgt->linkTarget, $linkData ); |
| 738 | } else { |
| 739 | $linkTarget = $escapedTgt->linkTarget; |
| 740 | } |
| 741 | } |
| 742 | if ( !empty( $linkData->isInterwikiLang ) && |
| 743 | $linkTarget[0] !== ':' && |
| 744 | $linkData->type !== 'mw:PageProp/Language' |
| 745 | ) { |
| 746 | // ensure interwiki links can't be confused with |
| 747 | // interlanguage links. |
| 748 | $linkTarget = ':' . $linkTarget; |
| 749 | } |
| 750 | } |
| 751 | } elseif ( self::isURLLink( $state->getEnv(), $node, $linkData ) |
| 752 | /* && empty( $linkData->isInterwiki ) */ |
| 753 | ) { |
| 754 | // Uncomment the above check if we want [[wikipedia:Foo|http://en.wikipedia.org/wiki/Foo]] |
| 755 | // for '<a href="http://en.wikipedia.org/wiki/Foo">http://en.wikipedia.org/wiki/Foo</a>' |
| 756 | $linkData->linkType = 'mw:URLLink'; |
| 757 | } else { |
| 758 | // Emit piped wikilink syntax |
| 759 | $isPiped = true; |
| 760 | |
| 761 | // First get the content source |
| 762 | if ( !empty( $linkData->contentNode ) ) { |
| 763 | $cs = $state->serializeLinkChildrenToString( |
| 764 | $linkData->contentNode, |
| 765 | [ $state->serializer->wteHandlers, 'wikilinkHandler' ] |
| 766 | ); |
| 767 | // strip off the tail and handle the pipe trick |
| 768 | $contentParts = self::splitLinkContentString( $cs, $dp ); |
| 769 | $contentSrc = $contentParts->contentString; |
| 770 | $dp->tail = $contentParts->tail; |
| 771 | $linkData->tail = $contentParts->tail; |
| 772 | $dp->prefix = $contentParts->prefix; |
| 773 | $linkData->prefix = $contentParts->prefix; |
| 774 | $needsEscaping = false; |
| 775 | } else { |
| 776 | $contentSrc = $linkData->content->string ?? ''; |
| 777 | $needsEscaping = empty( $linkData->content->fromsrc ); |
| 778 | } |
| 779 | |
| 780 | if ( $contentSrc === '' && $linkData->type !== 'mw:PageProp/Category' ) { |
| 781 | // Protect empty link content from PST pipe trick |
| 782 | $contentSrc = '<nowiki/>'; |
| 783 | $needsEscaping = false; |
| 784 | } |
| 785 | |
| 786 | $linkTarget = $target['value']; |
| 787 | if ( !empty( $target['modified'] ) || empty( $target['fromsrc'] ) ) { |
| 788 | // Links starting with ./ shouldn't get _ replaced with ' ' |
| 789 | $linkContentIsRelative = str_starts_with( $linkData->content->string ?? '', './' ); |
| 790 | $linkTarget = preg_replace( '#^(\.\.?/)*#', '', $linkTarget, 1 ); |
| 791 | if ( empty( $linkData->isInterwiki ) && !$linkContentIsRelative ) { |
| 792 | $linkTarget = strtr( $linkTarget, '_', ' ' ); |
| 793 | } |
| 794 | $escapedTgt = self::escapeLinkTarget( $linkTarget, $state ); |
| 795 | $linkTarget = $escapedTgt->linkTarget; |
| 796 | } |
| 797 | |
| 798 | // If we are reusing the target from source, we don't |
| 799 | // need to worry about colon-escaping because it will |
| 800 | // be in the right form already. |
| 801 | // |
| 802 | // Trying to eliminate this check and always check for |
| 803 | // colon-escaping seems a bit tricky when the reused |
| 804 | // target has encoded entities that won't resolve to |
| 805 | // valid titles. |
| 806 | if ( ( !$escapedTgt || !$escapedTgt->invalidLink ) && empty( $target['fromsrc'] ) ) { |
| 807 | $linkTarget = self::addColonEscape( $env, $linkTarget, $linkData ); |
| 808 | } |
| 809 | } |
| 810 | if ( $linkData->linkType === 'mw:URLLink' ) { |
| 811 | $state->emitChunk( new AutoURLLinkText( $node->textContent, $node ), $node ); |
| 812 | return; |
| 813 | } |
| 814 | |
| 815 | if ( !empty( $linkData->isRedirect ) ) { |
| 816 | // Drop duplicates |
| 817 | if ( $state->redirectText !== null ) { |
| 818 | return; |
| 819 | } |
| 820 | |
| 821 | // Buffer redirect text if it is not in start of file position |
| 822 | if ( !preg_match( self::REDIRECT_TEST_RE, $state->out . $state->currLine->text ) ) { |
| 823 | $state->redirectText = $linkData->prefix . '[[' . $linkTarget . ']]'; |
| 824 | $state->emitChunk( '', $node ); // Flush separators for this node |
| 825 | // Flush separators for this node |
| 826 | return; |
| 827 | } |
| 828 | |
| 829 | // Set to some non-null string |
| 830 | $state->redirectText = 'unbuffered'; |
| 831 | } |
| 832 | |
| 833 | $pipedText = null; |
| 834 | if ( $escapedTgt && $escapedTgt->invalidLink ) { |
| 835 | // If the link target was invalid, instead of emitting an invalid link, |
| 836 | // omit the link and serialize just the content instead. But, log the |
| 837 | // invalid html for Parsoid clients to investigate later. |
| 838 | $state->getEnv()->log( |
| 839 | 'error/html2wt/link', 'Bad title text', DOMCompat::getOuterHTML( $node ) |
| 840 | ); |
| 841 | |
| 842 | // For non-piped content, use the original invalid link text |
| 843 | $pipedText = $isPiped ? $contentSrc : $linkTarget; |
| 844 | $state->needsEscaping = $needsEscaping; |
| 845 | $state->emitChunk( $linkData->prefix . $pipedText . $linkData->tail, $node ); |
| 846 | } else { |
| 847 | $pipe = $dp->firstPipeSrc ?? '|'; |
| 848 | if ( $isPiped && $needsEscaping ) { |
| 849 | // We are definitely not in sol context since content |
| 850 | // will be preceded by "[[" or "[" text in target wikitext. |
| 851 | $pipedText = $pipe . $state->serializer->wteHandlers |
| 852 | ->escapeLinkContent( $state, $contentSrc, false, $node, false ); |
| 853 | } elseif ( $isPiped ) { |
| 854 | $pipedText = $pipe . $contentSrc; |
| 855 | } else { |
| 856 | $pipedText = ''; |
| 857 | } |
| 858 | if ( $isPiped ) { |
| 859 | $state->singleLineContext->disable(); |
| 860 | } |
| 861 | $state->emitChunk( new WikiLinkText( |
| 862 | $linkData->prefix . '[[' . $linkTarget . $pipedText . ']]' . $linkData->tail, |
| 863 | $node, $siteConfig, $linkData->type |
| 864 | ), $node ); |
| 865 | if ( $isPiped ) { |
| 866 | $state->singleLineContext->pop(); |
| 867 | } |
| 868 | } |
| 869 | } |
| 870 | |
/**
 * Serialize a link node as external-link style wikitext.
 *
 * Exactly one chunk is emitted on $state, in one of three shapes:
 *  - a bare URL (AutoURLLinkText) when isURLLink() accepts the node;
 *  - `[[#frag]]` or `[[#frag|content]]` (WikiLinkText) when the URL starts with '#';
 *  - `[url]` or `[url content]` (ExtLinkText) otherwise.
 *
 * @param Element $node
 * @param SerializerState $state
 * @param stdClass $linkData Round-trip data; only `target` and `type` are read here
 */
private static function serializeAsExtLink(
	Element $node, SerializerState $state, stdClass $linkData
): void {
	$target = $linkData->target;
	$urlStr = $target['value'];
	if ( !empty( $target['modified'] ) || empty( $target['fromsrc'] ) ) {
		// We expect modified hrefs to be percent-encoded already, so
		// don't need to encode them here any more. Unmodified hrefs are
		// just using the original encoding anyway.
		// BUT we do have to encode certain special wikitext
		// characters (like []) which aren't necessarily
		// percent-encoded because they are valid in URLs and HTML5
		$urlStr = self::escapeExtLinkURL( $urlStr );
	}

	if ( self::isURLLink( $state->getEnv(), $node, $linkData ) ) {
		// Serialize as URL link
		$state->emitChunk( new AutoURLLinkText( $urlStr, $node ), $node );
		return;
	}

	$siteConfig = $state->getEnv()->getSiteConfig();

	// A URL that is nothing but a fragment is serialized as an internal
	// link, so its content goes through the wikilink escaper instead.
	$pureHashMatch = str_starts_with( $urlStr, '#' );

	// Fully serialize the content
	$contentStr = $state->serializeLinkChildrenToString(
		$node,
		[ $state->serializer->wteHandlers, $pureHashMatch ? 'wikilinkHandler' : 'aHandler' ]
	);

	// Compare against '' rather than relying on truthiness: PHP treats the
	// string "0" as falsy, which would silently drop link text of "0".
	$hasContent = (string)$contentStr !== '';

	if ( $pureHashMatch ) {
		// If it's just anchor text, serialize as an internal link.
		$chunk = new WikiLinkText(
			'[[' . $urlStr . ( $hasContent ? '|' . $contentStr : '' ) . ']]',
			$node, $siteConfig, $linkData->type
		);
	} else {
		// Without content this is an auto-numbered external link:
		// [http://example.com]
		$chunk = new ExtLinkText(
			'[' . $urlStr . ( $hasContent ? ' ' . $contentStr : '' ) . ']',
			$node, $siteConfig, $linkData->type
		);
	}
	$state->emitChunk( $chunk, $node );
}
| 921 | |
/**
 * Main link handler.
 *
 * Picks the wikitext form for a link node and emits it on $state:
 *  - a magic link, when the site config's ext-resource matcher recognises
 *    the original href and the result can be written in magic-link form;
 *  - `[[...]]` via serializeAsWikiLink() for mw:WikiLink, mw:MediaLink and
 *    types matching TokenUtils::SOL_TRANSPARENT_LINK_REGEX;
 *  - `[...]` via serializeAsExtLink() for mw:ExtLink;
 *  - for nodes without type/target info: a basic `<a><img></a>` figure,
 *    plain text (or a named span) when there is no href, or a generic
 *    external link otherwise.
 *
 * @param SerializerState $state
 * @param Element $node
 * @throws UnexpectedValueException if the node has both a link type and a
 *   target value, but the type is none of the handled ones
 */
public static function linkHandler( SerializerState $state, Element $node ): void {
	// TODO: handle internal/external links etc using RDFa and dataParsoid
	// Also convert unannotated html links without advanced attributes to
	// external wiki links for html import. Might want to consider converting
	// relative links without path component and file extension to wiki links.
	$env = $state->getEnv();
	$siteConfig = $env->getSiteConfig();

	// Get the rt data from the token and tplAttrs
	$linkData = self::getLinkRoundTripData( $env, $node, $state );
	$linkType = $linkData->type;

	// If the original href matches an external-resource (magic link)
	// pattern, try to serialize it as a magic link. (If magic links are
	// disabled, then the matcher will return false.)
	$matchExtResource = $siteConfig->getExtResourceURLPatternMatcher();
	$magicLinkMatch = $matchExtResource( Utils::decodeURI( $linkData->origHref ) );
	if ( $magicLinkMatch !== false ) {
		// Round-trip PMIDs as interwikis if that's how they were
		// originally. (Don't change the link type.)
		$keepAsInterwiki = $magicLinkMatch[0] === 'PMID' &&
			DOMUtils::matchRel( $node, '|^mw:WikiLink/Interwiki\b|' ) !== null &&
			$linkType === 'mw:WikiLink';

		if ( !$keepAsInterwiki ) {
			$magicContent = $state->serializeLinkChildrenToString(
				$node,
				[ $state->serializer->wteHandlers, 'aHandler' ]
			);
			$serialized = $siteConfig->makeExtResourceURL(
				$magicLinkMatch, $linkData->origHref, $magicContent
			);
			// A result starting with '[' is not in magic-link form;
			// in that case fall through to the regular handling below.
			if ( $serialized[0] !== '[' ) {
				// Successfully serialized as a magic link
				$state->emitChunk( new MagicLinkText( $serialized, $node ), $node );
				return;
			}
		}
	}

	if ( $linkType !== null && isset( $linkData->target['value'] ) ) {
		// We have a type and target info
		$isBracketedWikiLink = $linkType === 'mw:WikiLink' ||
			$linkType === 'mw:MediaLink' ||
			preg_match( TokenUtils::SOL_TRANSPARENT_LINK_REGEX, $linkType );

		if ( $isBracketedWikiLink ) {
			// [[..]] links: normal, category, redirect, or lang links
			// (except images)
			self::serializeAsWikiLink( $node, $state, $linkData );
			return;
		}

		if ( $linkType === 'mw:ExtLink' ) {
			// [..] links, autolinks, ISBN, RFC, PMID
			self::serializeAsExtLink( $node, $state, $linkData );
			return;
		}

		throw new UnexpectedValueException(
			'Unhandled link serialization scenario: ' . DOMCompat::getOuterHTML( $node )
		);
	}

	// No usable type/target info from here on.

	// Attributes that do not, on their own, make the link "complex".
	$allowedAttrs = [
		'href' => true,
		'rel' => true,
		'class' => true,
		'title' => true,
		DOMDataUtils::DATA_OBJECT_ATTR_NAME => true
	];

	$hasUnsupportedAttrs = false;
	foreach ( DOMUtils::attributes( $node ) as $attrName => $unusedValue ) {
		// XXX: Don't drop rel and class in every case once a tags are
		// actually supported in the MW default config?
		if ( !isset( $allowedAttrs[$attrName] ) ) {
			$hasUnsupportedAttrs = true;
			break;
		}
	}

	if ( $hasUnsupportedAttrs ) {
		$env->log( 'error/html2wt/link', 'Encountered', DOMCompat::getOuterHTML( $node ),
			'-- serializing as extlink and dropping <a> attributes unsupported in wikitext.'
		);
	} else {
		$media = DOMUtils::selectMediaElt( $node ); // TODO: Handle missing media too
		if ( $media instanceof Element && $media->parentNode === $node ) {
			// this is a basic html figure: <a><img></a>
			self::figureHandler( $state, $node, new MediaStructure( $media, $node ) );
			return;
		}
	}

	// href is already percent-encoded, etc., but it might contain
	// spaces or other wikitext nasties. escape the nasties.
	$escapedHref = self::escapeExtLinkURL( self::getHref( $env, $node ) );
	$contentWt = $state->serializeLinkChildrenToString(
		$node,
		[ $state->serializer->wteHandlers, 'aHandler' ]
	);

	if ( $escapedHref ) {
		$chunk = new ExtLinkText( '[' . $escapedHref . ' ' . $contentWt . ']',
			$node, $siteConfig, 'mw:ExtLink'
		);
	} else {
		// Without an href, we just emit the string as text.
		// However, to preserve targets for anchor links,
		// serialize as a span with a name.
		$anchorName = DOMCompat::getAttribute( $node, 'name' );
		if ( $anchorName === null ) {
			$chunk = $contentWt;
		} else {
			$doc = $node->ownerDocument;
			$span = $doc->createElement( 'span' );
			$span->setAttribute( 'name', $anchorName );
			$span->appendChild( $doc->createTextNode( $contentWt ) );
			$chunk = DOMCompat::getOuterHTML( $span );
		}
	}
	$state->emitChunk( $chunk, $node );
}
| 1044 | |
/**
 * Main figure handler.
 *
 * Converts the given media structure to constrained text and emits it as a
 * single chunk for $node. When no media structure is supplied, the problem
 * is logged and the node is handed to FallbackHTMLHandler instead.
 *
 * @param SerializerState $state
 * @param Element $node
 * @param ?MediaStructure $ms
 */
public static function figureHandler(
	SerializerState $state, Element $node, ?MediaStructure $ms
): void {
	if ( $ms !== null ) {
		// figureToConstrainedText() may return null; an empty chunk is
		// emitted in that case.
		$constrained = self::figureToConstrainedText( $state, $ms );
		$state->emitChunk( $constrained ?? '', $node );
		return;
	}

	$state->getEnv()->log(
		'error/html2wt/figure',
		"Couldn't parse media structure: ",
		DOMCompat::getOuterHTML( $node )
	);
	$fallback = new FallbackHTMLHandler;
	$fallback->handle( $node, $state );
}
| 1067 | |
| 1068 | /** |
| 1069 | * Serialize a figure to constrained text. |
| 1070 | * |
| 1071 | * WARN: There's probably more to do to ensure this is purely functional, |
| 1072 | * no side-effects (ie. calls to state->emit) happen while processing. |
| 1073 | * |
| 1074 | * @param SerializerState $state |
| 1075 | * @param MediaStructure $ms |
| 1076 | * @return ?ConstrainedText |
| 1077 | */ |
| 1078 | public static function figureToConstrainedText( |
| 1079 | SerializerState $state, MediaStructure $ms |
| 1080 | ): ?ConstrainedText { |
| 1081 | $env = $state->getEnv(); |
| 1082 | $outerElt = $ms->containerElt ?? $ms->mediaElt; |
| 1083 | $linkElt = $ms->linkElt; |
| 1084 | $elt = $ms->mediaElt; |
| 1085 | $captionElt = $ms->captionElt; |
| 1086 | $format = WTUtils::getMediaFormat( $outerElt ); |
| 1087 | |
| 1088 | // Try to identify the local title to use for this image. |
| 1089 | $resource = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'resource' ); |
| 1090 | if ( !isset( $resource['value'] ) ) { |
| 1091 | // from non-parsoid HTML: try to reconstruct resource from src? |
| 1092 | // (this won't work for manual-thumb images) |
| 1093 | $src = DOMCompat::getAttribute( $elt, 'src' ); |
| 1094 | if ( $src === null ) { |
| 1095 | $env->log( 'error/html2wt/figure', |
| 1096 | 'In WSP.figureHandler, img does not have resource or src:', |
| 1097 | DOMCompat::getOuterHTML( $outerElt ) |
| 1098 | ); |
| 1099 | return null; |
| 1100 | } |
| 1101 | if ( preg_match( '/^https?:/', $src ) ) { |
| 1102 | // external image link, presumably $wgAllowExternalImages=true |
| 1103 | return new AutoURLLinkText( $src, $outerElt ); |
| 1104 | } |
| 1105 | $resource = [ |
| 1106 | 'value' => $src, |
| 1107 | 'fromsrc' => false, |
| 1108 | 'modified' => false |
| 1109 | ]; |
| 1110 | } |
| 1111 | if ( empty( $resource['fromsrc'] ) ) { |
| 1112 | $resource['value'] = preg_replace( '#^(\.\.?/)+#', '', $resource['value'], 1 ); |
| 1113 | } |
| 1114 | |
| 1115 | $nopts = []; |
| 1116 | $outerDP = DOMDataUtils::getDataParsoid( $outerElt ); |
| 1117 | $outerDMW = DOMDataUtils::getDataMw( $outerElt ); |
| 1118 | $mwAliases = $state->getEnv()->getSiteConfig()->mwAliases(); |
| 1119 | |
| 1120 | // Return ref to the array element in case it is modified |
| 1121 | $getOpt = static function & ( $key ) use ( &$outerDP ): ?array { |
| 1122 | $null = null; |
| 1123 | if ( empty( $outerDP->optList ) ) { |
| 1124 | return $null; |
| 1125 | } |
| 1126 | foreach ( $outerDP->optList as $opt ) { |
| 1127 | if ( ( $opt['ck'] ?? null ) === $key ) { |
| 1128 | return $opt; |
| 1129 | } |
| 1130 | } |
| 1131 | return $null; |
| 1132 | }; |
| 1133 | // Return ref to the array element in case it is modified |
| 1134 | $getLastOpt = static function & ( $key ) use ( &$outerDP ): ?array { |
| 1135 | $null = null; |
| 1136 | $opts = $outerDP->optList ?? []; |
| 1137 | for ( $i = count( $opts ) - 1; $i >= 0; $i-- ) { |
| 1138 | if ( ( $opts[$i]['ck'] ?? null ) === $key ) { |
| 1139 | return $opts[$i]; |
| 1140 | } |
| 1141 | } |
| 1142 | return $null; |
| 1143 | }; |
| 1144 | |
| 1145 | // Try to identify the local title to use for the link. |
| 1146 | $link = null; |
| 1147 | |
| 1148 | $linkFromDataMw = WTSUtils::getAttrFromDataMw( $outerDMW, 'link', true ); |
| 1149 | if ( $linkFromDataMw !== null ) { |
| 1150 | // "link" attribute on the `outerElt` takes precedence |
| 1151 | if ( isset( $linkFromDataMw->value['html'] ) ) { |
| 1152 | $link = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, 'link' ); |
| 1153 | } else { |
| 1154 | $link = [ |
| 1155 | 'value' => "link={$linkFromDataMw->value['txt']}", |
| 1156 | 'modified' => false, |
| 1157 | 'fromsrc' => false, |
| 1158 | 'fromDataMW' => true |
| 1159 | ]; |
| 1160 | } |
| 1161 | } elseif ( $linkElt && $linkElt->hasAttribute( 'href' ) ) { |
| 1162 | $link = $state->serializer->serializedImageAttrVal( $outerElt, $linkElt, 'href' ); |
| 1163 | if ( empty( $link['fromsrc'] ) ) { |
| 1164 | // strip page or lang parameter if present on href |
| 1165 | $strippedHref = preg_replace( |
| 1166 | '#[?]((?:page=\d+)|(?:lang=[a-z]+(?:-[a-z]+)*))$#Di', |
| 1167 | '', |
| 1168 | DOMCompat::getAttribute( $linkElt, 'href' ) ?? '' |
| 1169 | ); |
| 1170 | if ( $strippedHref === DOMCompat::getAttribute( $elt, 'resource' ) ) { |
| 1171 | // default link: same place as resource |
| 1172 | $link = $resource; |
| 1173 | } |
| 1174 | $link['value'] = preg_replace( '#^(\.\.?/)+#', '', $link['value'], 1 ); |
| 1175 | } |
| 1176 | } else { |
| 1177 | // Otherwise, just try and get it from data-mw |
| 1178 | $link = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, 'href' ); |
| 1179 | } |
| 1180 | |
| 1181 | if ( $link && empty( $link['modified'] ) && empty( $link['fromsrc'] ) ) { |
| 1182 | $linkOpt = $getOpt( 'link' ); |
| 1183 | if ( $linkOpt ) { |
| 1184 | $link['fromsrc'] = true; |
| 1185 | $link['value'] = $linkOpt['ak']; |
| 1186 | } |
| 1187 | } |
| 1188 | |
| 1189 | // Reconstruct the caption |
| 1190 | if ( !$captionElt && ( $outerDMW->caption ?? null ) !== null ) { |
| 1191 | $fragment = $outerDMW->caption; |
| 1192 | // FIXME: We should just be able to serialize the children of the |
| 1193 | // fragment, however, we need some way of marking this as being |
| 1194 | // inInsertedContent so that any bare text is assured to be escaped |
| 1195 | $captionElt = $outerElt->ownerDocument->createElement( 'div' ); |
| 1196 | DOMDataUtils::getDataParsoid( $captionElt )->setTempFlag( TempData::IS_NEW ); |
| 1197 | DOMUtils::migrateChildren( $fragment, $captionElt ); |
| 1198 | // Needs a parent node in order for WTS to be happy |
| 1199 | $fragment->appendChild( $captionElt ); |
| 1200 | } |
| 1201 | |
| 1202 | $caption = null; |
| 1203 | if ( $captionElt ) { |
| 1204 | $caption = $state->serializeCaptionChildrenToString( |
| 1205 | $captionElt, [ $state->serializer->wteHandlers, 'mediaOptionHandler' ] |
| 1206 | ); |
| 1207 | |
| 1208 | // Alt stuff |
| 1209 | if ( !WTUtils::hasVisibleCaption( $outerElt ) && $elt->hasAttribute( 'alt' ) ) { |
| 1210 | $altOnElt = trim( DOMCompat::getAttribute( $elt, 'alt' ) ?? '' ); |
| 1211 | $altFromCaption = trim( WTUtils::textContentFromCaption( $captionElt ) ); |
| 1212 | // The first condition is to support an empty \alt=\ option |
| 1213 | // when no caption is present |
| 1214 | if ( $altOnElt && ( $altOnElt === $altFromCaption ) ) { |
| 1215 | $elt->removeAttribute( 'alt' ); |
| 1216 | } |
| 1217 | } |
| 1218 | } |
| 1219 | |
| 1220 | // Fetch the alt (if any) |
| 1221 | $alt = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'alt' ); |
| 1222 | // Fetch the lang (if any) |
| 1223 | $lang = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'lang' ); |
| 1224 | // Fetch the muted (if any) |
| 1225 | $muted = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'muted' ); |
| 1226 | // Fetch the loop (if any) |
| 1227 | $loop = $state->serializer->serializedImageAttrVal( $outerElt, $elt, 'loop' ); |
| 1228 | |
| 1229 | // Ok, start assembling options, beginning with link & alt & lang |
| 1230 | // Other media don't have links in output. |
| 1231 | $linkCond = DOMCompat::nodeName( $elt ) === 'img'; |
| 1232 | if ( $linkCond && $link ) { |
| 1233 | // Check whether the link goes to the default place, in which |
| 1234 | // case an explicit link tag isn't needed. |
| 1235 | // The link may be external, or may include wikitext template markup, |
| 1236 | // therefore check first that it parses to a title. |
| 1237 | $linkTitle = $env->normalizedTitleKey( |
| 1238 | Utils::decodeURIComponent( $link['value'] ), true |
| 1239 | ); |
| 1240 | $resourceTitle = $env->normalizedTitleKey( |
| 1241 | Utils::decodeURIComponent( $resource['value'] ), true |
| 1242 | ); |
| 1243 | if ( |
| 1244 | $link['value'] === $resource['value'] || |
| 1245 | ( $linkTitle !== null && $linkTitle === $resourceTitle ) |
| 1246 | ) { |
| 1247 | $linkCond = false; // No explicit link attribute needed |
| 1248 | } |
| 1249 | } |
| 1250 | |
| 1251 | // "alt" for non-image is handled below |
| 1252 | $altCond = $alt['value'] !== null && DOMCompat::nodeName( $elt ) === 'img'; |
| 1253 | |
| 1254 | // This loop handles media options which *mostly* correspond 1-1 with |
| 1255 | // HTML attributes. `img_$name` is the name of the media option, |
| 1256 | // and $value is the Parsoid "shadow info" for the attribute. |
| 1257 | // $cond tells us whether we need to explicitly output this option; |
| 1258 | // if it is false we are using an implicit default. |
| 1259 | // `lang` and `alt` are fairly straightforward. `link` |
| 1260 | // is a little trickier, since we need to massage/fake the shadow |
| 1261 | // info because it doesn't come *directly* from the attribute. |
| 1262 | // link comes from the combination of a[href], img[src], and |
| 1263 | // img[resource], etc; |
| 1264 | foreach ( [ |
| 1265 | [ 'name' => 'link', 'value' => $link, 'cond' => $linkCond, 'alias' => 'img_link' ], |
| 1266 | [ 'name' => 'alt', 'value' => $alt, 'cond' => $altCond, 'alias' => 'img_alt' ], |
| 1267 | [ 'name' => 'lang', 'value' => $lang, 'cond' => isset( $lang['value'] ), 'alias' => 'img_lang' ], |
| 1268 | [ 'name' => 'muted', 'value' => $muted, 'cond' => isset( $muted['value'] ), 'alias' => 'timedmedia_muted' ], |
| 1269 | [ 'name' => 'loop', 'value' => $loop, 'cond' => isset( $loop['value'] ), 'alias' => 'timedmedia_loop' ], |
| 1270 | ] as $o ) { |
| 1271 | if ( !$o['cond'] ) { |
| 1272 | continue; |
| 1273 | } |
| 1274 | if ( $o['value'] && !empty( $o['value']['fromsrc'] ) ) { |
| 1275 | $nopts[] = [ |
| 1276 | 'ck' => $o['name'], |
| 1277 | 'ak' => [ $o['value']['value'] ], |
| 1278 | ]; |
| 1279 | } else { |
| 1280 | $value = $o['value'] ? $o['value']['value'] : ''; |
| 1281 | if ( $o['value'] && in_array( $o['name'], [ 'link', 'alt' ], true ) ) { |
| 1282 | // see WikiLinkHandler::isWikitextOpt(): link and alt are allowed |
| 1283 | // to contain arbitrary wikitext, even though it is stripped |
| 1284 | // to a string before emitting. |
| 1285 | $value = $state->serializer->wteHandlers->escapeLinkContent( |
| 1286 | $state, $value, false, $outerElt, true |
| 1287 | ); |
| 1288 | } |
| 1289 | $nopts[] = [ |
| 1290 | 'ck' => $o['name'], |
| 1291 | 'v' => $value, |
| 1292 | 'ak' => $mwAliases[$o['alias']], |
| 1293 | ]; |
| 1294 | } |
| 1295 | } |
| 1296 | |
| 1297 | // Now we handle media options which all come from space-separated |
| 1298 | // values in a single HTML attribute, `class`. (But note that there |
| 1299 | // can also be "extra" classes added by `img_class` as well.) |
| 1300 | $classes = DOMCompat::getClassList( $outerElt ); |
| 1301 | $extra = []; // 'extra' classes |
| 1302 | $val = null; |
| 1303 | |
| 1304 | foreach ( $classes as $c ) { |
| 1305 | switch ( $c ) { |
| 1306 | case 'mw-halign-none': |
| 1307 | case 'mw-halign-right': |
| 1308 | case 'mw-halign-left': |
| 1309 | case 'mw-halign-center': |
| 1310 | $val = substr( $c, 10 ); // strip mw-halign- prefix |
| 1311 | $nopts[] = [ |
| 1312 | 'ck' => $val, |
| 1313 | 'ak' => $mwAliases['img_' . $val], |
| 1314 | ]; |
| 1315 | break; |
| 1316 | |
| 1317 | case 'mw-valign-top': |
| 1318 | case 'mw-valign-middle': |
| 1319 | case 'mw-valign-baseline': |
| 1320 | case 'mw-valign-sub': |
| 1321 | case 'mw-valign-super': |
| 1322 | case 'mw-valign-text-top': |
| 1323 | case 'mw-valign-bottom': |
| 1324 | case 'mw-valign-text-bottom': |
| 1325 | $val = strtr( substr( $c, 10 ), '-', '_' ); // strip mw-valign and '-' to '_' |
| 1326 | $nopts[] = [ |
| 1327 | 'ck' => $val, |
| 1328 | 'ak' => $mwAliases['img_' . $val], |
| 1329 | ]; |
| 1330 | break; |
| 1331 | |
| 1332 | case 'mw-image-border': |
| 1333 | $nopts[] = [ |
| 1334 | 'ck' => 'border', |
| 1335 | 'ak' => $mwAliases['img_border'], |
| 1336 | ]; |
| 1337 | break; |
| 1338 | |
| 1339 | case 'mw-default-size': |
| 1340 | case 'mw-default-audio-height': |
| 1341 | // handled below |
| 1342 | break; |
| 1343 | |
| 1344 | default: |
| 1345 | $extra[] = $c; |
| 1346 | break; |
| 1347 | } |
| 1348 | } |
| 1349 | |
| 1350 | if ( count( $extra ) ) { |
| 1351 | $nopts[] = [ |
| 1352 | 'ck' => 'class', |
| 1353 | 'v' => implode( ' ', $extra ), |
| 1354 | 'ak' => $mwAliases['img_class'], |
| 1355 | ]; |
| 1356 | } |
| 1357 | |
| 1358 | // Now we handle parameters which don't have a representation |
| 1359 | // as HTML attributes; they are set only from the data-mw |
| 1360 | // values. (In theory they could perhaps be reverse engineered |
| 1361 | // from the thumbnail URL, but that would be fragile and expose |
| 1362 | // thumbnail implementation to the editor so we don't do that.) |
| 1363 | $mwParams = [ |
| 1364 | [ 'prop' => 'thumb', 'ck' => 'manualthumb', 'alias' => 'img_manualthumb' ], |
| 1365 | [ 'prop' => 'page', 'ck' => 'page', 'alias' => 'img_page' ], |
| 1366 | // Video specific |
| 1367 | [ 'prop' => 'starttime', 'ck' => 'starttime', 'alias' => 'timedmedia_starttime' ], |
| 1368 | [ 'prop' => 'endtime', 'ck' => 'endtime', 'alias' => 'timedmedia_endtime' ], |
| 1369 | [ 'prop' => 'thumbtime', 'ck' => 'thumbtime', 'alias' => 'timedmedia_thumbtime' ] |
| 1370 | ]; |
| 1371 | |
| 1372 | // `img_link` and `img_alt` are only surfaced as HTML attributes |
| 1373 | // for image media. For all other media we treat them as set only |
| 1374 | // from data-mw. |
| 1375 | if ( DOMCompat::nodeName( $elt ) !== 'img' ) { |
| 1376 | $mwParams[] = [ 'prop' => 'link', 'ck' => 'link', 'alias' => 'img_link' ]; |
| 1377 | $mwParams[] = [ 'prop' => 'alt', 'ck' => 'alt', 'alias' => 'img_alt' ]; |
| 1378 | } |
| 1379 | |
| 1380 | $hasManualthumb = false; |
| 1381 | foreach ( $mwParams as $o ) { |
| 1382 | $v = $outerDMW->{$o['prop']} ?? null; |
| 1383 | if ( $v === null ) { |
| 1384 | $a = WTSUtils::getAttrFromDataMw( $outerDMW, $o['ck'], true ); |
| 1385 | if ( $a !== null ) { |
| 1386 | if ( isset( $a->value['html'] ) ) { |
| 1387 | $si = $state->serializer->getAttributeValueAsShadowInfo( $outerElt, $o['ck'] ); |
| 1388 | if ( isset( $si['value'] ) ) { |
| 1389 | $nopts[] = [ |
| 1390 | 'ck' => $o['ck'], |
| 1391 | 'ak' => [ $si['value'] ], |
| 1392 | ]; |
| 1393 | continue; |
| 1394 | } |
| 1395 | } else { |
| 1396 | $v = $a->value['txt']; |
| 1397 | } |
| 1398 | } |
| 1399 | } |
| 1400 | if ( $v !== null ) { |
| 1401 | $ak = $state->serializer->getAttributeValue( |
| 1402 | $outerElt, $o['ck'] |
| 1403 | ) ?? $mwAliases[$o['alias']]; |
| 1404 | $nopts[] = [ |
| 1405 | 'ck' => $o['ck'], |
| 1406 | 'ak' => $ak, |
| 1407 | 'v' => $v |
| 1408 | ]; |
| 1409 | // Piggyback this here ... |
| 1410 | if ( $o['prop'] === 'thumb' ) { |
| 1411 | $hasManualthumb = true; |
| 1412 | $format = ''; |
| 1413 | } |
| 1414 | } |
| 1415 | } |
| 1416 | |
| 1417 | // These media options come from the HTML `typeof` attribute. |
| 1418 | switch ( $format ) { |
| 1419 | case 'Thumb': |
| 1420 | $nopts[] = [ |
| 1421 | 'ck' => 'thumbnail', |
| 1422 | 'ak' => $state->serializer->getAttributeValue( |
| 1423 | $outerElt, 'thumbnail' |
| 1424 | ) ?? $mwAliases['img_thumbnail'], |
| 1425 | ]; |
| 1426 | break; |
| 1427 | case 'Frame': |
| 1428 | $nopts[] = [ |
| 1429 | 'ck' => 'framed', |
| 1430 | 'ak' => $state->serializer->getAttributeValue( |
| 1431 | $outerElt, 'framed' |
| 1432 | ) ?? $mwAliases['img_framed'], |
| 1433 | ]; |
| 1434 | break; |
| 1435 | case 'Frameless': |
| 1436 | $nopts[] = [ |
| 1437 | 'ck' => 'frameless', |
| 1438 | 'ak' => $state->serializer->getAttributeValue( |
| 1439 | $outerElt, 'frameless' |
| 1440 | ) ?? $mwAliases['img_frameless'], |
| 1441 | ]; |
| 1442 | break; |
| 1443 | } |
| 1444 | |
| 1445 | // Now handle the size-related options. This is complicated! |
| 1446 | // We consider the `height`, `data-height`, `width`, and |
| 1447 | // `data-width` attributes, as well as the `typeof` and the `class`. |
| 1448 | |
| 1449 | // Get the user-specified height from wikitext |
| 1450 | $wh = $state->serializer->serializedImageAttrVal( |
| 1451 | $outerElt, $elt, $ms->isRedLink() ? 'data-height' : 'height' |
| 1452 | ); |
| 1453 | // Get the user-specified width from wikitext |
| 1454 | $ww = $state->serializer->serializedImageAttrVal( |
| 1455 | $outerElt, $elt, $ms->isRedLink() ? 'data-width' : 'width' |
| 1456 | ); |
| 1457 | |
| 1458 | $sizeUnmodified = !empty( $ww['fromDataMW'] ) || |
| 1459 | ( empty( $ww['modified'] ) && empty( $wh['modified'] ) ); |
| 1460 | $upright = $getOpt( 'upright' ); |
| 1461 | |
| 1462 | // XXX: Infer upright factor from default size for all thumbs by default? |
| 1463 | // Better for scaling with user prefs, but requires knowledge about |
| 1464 | // default used in VE. |
| 1465 | if ( $sizeUnmodified && $upright && |
| 1466 | // Only serialize upright where it is actually respected |
| 1467 | // This causes some dirty diffs, but makes sure that we don't |
| 1468 | // produce nonsensical output after a type switch. |
| 1469 | // TODO: Only strip if type was actually modified. |
| 1470 | in_array( $format, [ 'Frameless', 'Thumb' ], true ) |
| 1471 | ) { |
| 1472 | // preserve upright option |
| 1473 | $nopts[] = [ |
| 1474 | 'ck' => $upright['ck'], |
| 1475 | 'ak' => [ $upright['ak'] ], // FIXME: don't use ak here! |
| 1476 | ]; |
| 1477 | } |
| 1478 | |
| 1479 | if ( |
| 1480 | !DOMUtils::hasClass( $outerElt, 'mw-default-size' ) && |
| 1481 | $format !== 'Frame' && !$hasManualthumb |
| 1482 | ) { |
| 1483 | $size = $getLastOpt( 'width' ); |
| 1484 | $sizeString = (string)( $size['ak'] ?? '' ); |
| 1485 | if ( $sizeString === '' && !empty( $ww['fromDataMW'] ) ) { |
| 1486 | $sizeString = (string)( $ww['value'] ?? '' ); |
| 1487 | } |
| 1488 | if ( $sizeUnmodified && $sizeString !== '' ) { |
| 1489 | // preserve original width/height string if not touched |
| 1490 | $nopts[] = [ |
| 1491 | 'ck' => 'width', |
| 1492 | 'v' => $sizeString, // original size string |
| 1493 | 'ak' => [ '$1' ], // don't add px or the like |
| 1494 | ]; |
| 1495 | } else { |
| 1496 | $bbox = null; |
| 1497 | // Serialize to a square bounding box |
| 1498 | if ( isset( $ww['value'] ) && preg_match( '/^\d+/', $ww['value'] ) ) { |
| 1499 | $bbox = intval( $ww['value'] ); |
| 1500 | } |
| 1501 | if ( isset( $wh['value'] ) && preg_match( '/^\d+/', $wh['value'] ) && |
| 1502 | // As with "mw-default-size", editing clients should remove the |
| 1503 | // "mw-default-audio-height" if they want to factor a defined |
| 1504 | // height into the bounding box size. However, note that, at |
| 1505 | // present, a defined height for audio is ignored while parsing, |
| 1506 | // so this only has the effect of modifying the width. |
| 1507 | ( |
| 1508 | DOMCompat::nodeName( $elt ) !== 'audio' || |
| 1509 | !DOMUtils::hasClass( $outerElt, 'mw-default-audio-height' ) |
| 1510 | ) |
| 1511 | ) { |
| 1512 | $height = intval( $wh['value'] ); |
| 1513 | if ( $bbox === null || $height > $bbox ) { |
| 1514 | $bbox = $height; |
| 1515 | } |
| 1516 | } |
| 1517 | if ( $bbox !== null ) { |
| 1518 | $nopts[] = [ |
| 1519 | 'ck' => 'width', |
| 1520 | // MediaWiki interprets 100px as a width |
| 1521 | // restriction only, so we need to make the bounding |
| 1522 | // box explicitly square (100x100px). The 'px' is |
| 1523 | // added by the alias though, and can be localized. |
| 1524 | 'v' => $bbox . 'x' . $bbox, |
| 1525 | 'ak' => $mwAliases['img_width'], // adds the 'px' suffix |
| 1526 | ]; |
| 1527 | } |
| 1528 | } |
| 1529 | } |
| 1530 | |
| 1531 | $opts = $outerDP->optList ?? []; // original wikitext options |
| 1532 | |
| 1533 | // Add bogus options from old optlist in order to round-trip cleanly (T64500) |
| 1534 | foreach ( $opts as $o ) { |
| 1535 | if ( ( $o['ck'] ?? null ) === 'bogus' ) { |
| 1536 | $nopts[] = [ |
| 1537 | 'ck' => 'bogus', |
| 1538 | 'ak' => [ $o['ak'] ], |
| 1539 | ]; |
| 1540 | } |
| 1541 | } |
| 1542 | |
| 1543 | // Put the caption last, by default. |
| 1544 | if ( is_string( $caption ) ) { |
| 1545 | $nopts[] = [ |
| 1546 | 'ck' => 'caption', |
| 1547 | 'ak' => [ $caption ], |
| 1548 | ]; |
| 1549 | } |
| 1550 | |
| 1551 | // ok, sort the new options to match the order given in the old optlist |
| 1552 | // and try to match up the aliases used |
| 1553 | $changed = false; |
| 1554 | foreach ( $nopts as &$no ) { |
| 1555 | // Make sure we have an array here. Default in data-parsoid is |
| 1556 | // actually a string. |
| 1557 | // FIXME: don't reuse ak for two different things! |
| 1558 | if ( !is_array( $no['ak'] ) ) { |
| 1559 | $no['ak'] = [ $no['ak'] ]; |
| 1560 | } |
| 1561 | |
| 1562 | $no['sortId'] = count( $opts ); |
| 1563 | $idx = -1; |
| 1564 | foreach ( $opts as $i => $o ) { |
| 1565 | if ( ( $o['ck'] ?? null ) === $no['ck'] && |
| 1566 | // for bogus options, make sure the source matches too. |
| 1567 | ( $o['ck'] !== 'bogus' || $o['ak'] === $no['ak'][0] ) |
| 1568 | ) { |
| 1569 | $idx = $i; |
| 1570 | break; |
| 1571 | } |
| 1572 | } |
| 1573 | if ( $idx < 0 ) { |
| 1574 | // Preferred words are first in the alias list |
| 1575 | // (but not in old versions of MediaWiki). |
| 1576 | $no['ak'] = $no['ak'][0]; |
| 1577 | $changed = true; |
| 1578 | continue; |
| 1579 | } |
| 1580 | |
| 1581 | $no['sortId'] = $idx; |
| 1582 | // use a matching alias, if there is one |
| 1583 | $a = null; |
| 1584 | foreach ( $no['ak'] as $b ) { |
| 1585 | // note the trim() here; that allows us to snarf eccentric |
| 1586 | // whitespace from the original option wikitext |
| 1587 | $b2 = $b; |
| 1588 | if ( isset( $no['v'] ) ) { |
| 1589 | $b2 = str_replace( '$1', $no['v'], $b ); |
| 1590 | } |
| 1591 | if ( $b2 === trim( implode( ',', (array)$opts[$idx]['ak'] ) ) ) { |
| 1592 | $a = $b; |
| 1593 | break; |
| 1594 | } |
| 1595 | } |
| 1596 | // use the alias (incl whitespace) from the original option wikitext |
| 1597 | // if found; otherwise use the last alias given (English default by |
| 1598 | // convention that works everywhere). |
| 1599 | // TODO: use first alias (localized) instead for RTL languages (T53852) |
| 1600 | if ( $a !== null && $no['ck'] !== 'caption' ) { |
| 1601 | $no['ak'] = $opts[$idx]['ak']; |
| 1602 | unset( $no['v'] ); // prevent double substitution |
| 1603 | } else { |
| 1604 | $no['ak'] = PHPUtils::lastItem( $no['ak'] ); |
| 1605 | if ( !( $no['ck'] === 'caption' && $a !== null ) ) { |
| 1606 | $changed = true; |
| 1607 | } |
| 1608 | } |
| 1609 | } |
| 1610 | |
| 1611 | // Filter out bogus options if the image options/caption have changed. |
| 1612 | if ( $changed ) { |
| 1613 | $nopts = array_filter( $nopts, static function ( $no ) { |
| 1614 | return $no['ck'] !== 'bogus'; |
| 1615 | } ); |
| 1616 | // empty captions should get filtered out in this case, too (T64264) |
| 1617 | $nopts = array_filter( $nopts, static function ( $no ) { |
| 1618 | return !( $no['ck'] === 'caption' && $no['ak'] === '' ); |
| 1619 | } ); |
| 1620 | } |
| 1621 | |
| 1622 | // sort! |
| 1623 | usort( $nopts, static function ( $a, $b ) { |
| 1624 | return $a['sortId'] <=> $b['sortId']; |
| 1625 | } ); |
| 1626 | |
| 1627 | // emit all the options as wikitext! |
| 1628 | $wikitext = '[[' . $resource['value']; |
| 1629 | foreach ( $nopts as $o ) { |
| 1630 | $wikitext .= '|'; |
| 1631 | if ( isset( $o['v'] ) ) { |
| 1632 | $wikitext .= str_replace( '$1', $o['v'], $o['ak'] ); |
| 1633 | } else { |
| 1634 | $wikitext .= $o['ak']; |
| 1635 | } |
| 1636 | } |
| 1637 | $wikitext .= ']]'; |
| 1638 | |
| 1639 | return new WikiLinkText( |
| 1640 | $wikitext, $outerElt, $state->getEnv()->getSiteConfig(), 'mw:File' |
| 1641 | ); |
| 1642 | } |
| 1643 | |
| 1644 | } |