/**
* Serializes link markup.
* @module
*/
'use strict';
require('../../core-upgrade.js');
var url = require('url');
var CT = require('./ConstrainedText.js');
var ContentUtils = require('../utils/ContentUtils.js').ContentUtils;
var DiffUtils = require('./DiffUtils.js').DiffUtils;
var DOMDataUtils = require('../utils/DOMDataUtils.js').DOMDataUtils;
var DOMUtils = require('../utils/DOMUtils.js').DOMUtils;
var JSUtils = require('../utils/jsutils.js').JSUtils;
var Promise = require('../utils/promise.js');
var TokenUtils = require('../utils/TokenUtils.js').TokenUtils;
var Util = require('../utils/Util.js').Util;
var WTUtils = require('../utils/WTUtils.js').WTUtils;
const { WTSUtils } = require('../html2wt/WTSUtils.js');
// Local aliases for the chunk constructors exported by ConstrainedText
// and for JSUtils.lastItem.
var AutoURLLinkText = CT.AutoURLLinkText;
var ExtLinkText = CT.ExtLinkText;
var MagicLinkText = CT.MagicLinkText;
var WikiLinkText = CT.WikiLinkText;
var lastItem = JSUtils.lastItem;
// Matches a string consisting solely of space, tab, newline, carriage
// return, NUL and vertical-tab characters (the empty string included).
// The wikilink serializer tests the output emitted so far against it to
// decide whether a redirect has to be buffered.
var REDIRECT_TEST_RE = /^([ \t\n\r\0\x0b])*$/;
// Matches runs of spaces, underscores and assorted Unicode space/separator
// characters. Used to collapse such runs to a single '_' before comparing
// a decoded link target against a normalized title key.
// NOTE: this regexp carries the `g` flag; it is only used with
// String.prototype.replace in the code visible here, which is not affected
// by `lastIndex` state.
var MW_TITLE_WHITESPACE_RE = /[ _\u00A0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]+/g;
/**
 * Remove a trailing `suffix` from `text`.
 *
 * @param {string} text The string to trim.
 * @param {string} suffix The candidate suffix.
 * @return {string} `text` without the suffix when `suffix` is non-empty
 *   and `text` ends with it; otherwise `text` unchanged.
 */
var stripSuffix = function(text, suffix) {
    const suffixLength = suffix.length;
    // An empty suffix never matches, so the input is returned untouched.
    const endsWithSuffix = suffixLength && text.substr(-suffixLength) === suffix;
    return endsWithSuffix ? text.substr(0, text.length - suffixLength) : text;
};
/**
 * Split a link's content string into prefix, content and tail, using the
 * prefix/tail recorded in data-parsoid.
 *
 * A recorded tail (or prefix) is only honoured when the content string
 * actually ends (or starts) with it; otherwise it is reported as ''.
 *
 * @param {string} contentString The full link text.
 * @param {Object} dp data-parsoid; `dp.tail` and `dp.prefix` are read.
 * @param {Object} [target] Unused.
 * @return {Object} `{ contentString, tail, prefix }`, all strings.
 */
var splitLinkContentString = function(contentString, dp, target) {
    let remaining = contentString;
    let tail = dp.tail;
    let prefix = dp.prefix;

    if (tail) {
        if (remaining.slice(remaining.length - tail.length) === tail) {
            // strip the tail off the content
            remaining = remaining.slice(0, remaining.length - tail.length);
        } else {
            tail = '';
        }
    }

    if (prefix) {
        if (remaining.slice(0, prefix.length) === prefix) {
            remaining = remaining.slice(prefix.length);
        } else {
            prefix = '';
        }
    }

    return {
        contentString: remaining || '',
        tail: tail || '',
        prefix: prefix || '',
    };
};
/**
 * Read a node's href, munging protocol-less absolute URLs.
 *
 * If the href is absolute but carries no protocol or host (a single
 * leading slash), each interwiki entry flagged as `localinterwiki` is
 * tried as a base; the first one whose URL pattern the resolved href
 * matches wins.
 *
 * @param {Object} env Environment; `env.conf.wiki.interwikiMap` is read.
 * @param {Node} node Node whose `href` attribute is read.
 * @return {string} The resolved href, or the original href ('' if absent).
 */
var getHref = function(env, node) {
    const href = node.getAttribute('href') || '';
    if (!/^\/[^\/]/.test(href)) {
        return href;
    }

    // protocol-less but absolute. let's find a base href
    const candidateBases = [];
    env.conf.wiki.interwikiMap.forEach(function(info) {
        if (info.localinterwiki === undefined || info.url === undefined) {
            return;
        }
        // this is a possible base href
        candidateBases.push(info.url);
    });

    for (const base of candidateBases) {
        // evaluate the url relative to this base
        const resolved = url.resolve(base, href);
        // can this match the pattern?
        const escapedParts = base.split('$1').map(JSUtils.escapeRegExp);
        const pattern = '^' + escapedParts.join('[\\s\\S]*') + '$';
        if (new RegExp(pattern).test(resolved)) {
            return resolved;
        }
    }
    return href;
};
/**
 * Normalize an interwiki prefix for lookup: lower-case it, trim
 * surrounding whitespace, then drop a single leading colon.
 *
 * @param {string} str Raw prefix.
 * @return {string} Normalized prefix.
 */
function normalizeIWP(str) {
    const lowered = str.toLowerCase();
    const trimmed = lowered.trim();
    return trimmed.replace(/^:/, '');
}
/**
 * Entity-escape a link target and report whether it is usable.
 *
 * @param {string} linkTarget Raw link target.
 * @param {Object} state Serializer state; `state.env.isValidLinkTarget`
 *   is called.
 * @return {Object} `{ linkTarget, invalidLink }` where `linkTarget` is the
 *   escaped target and `invalidLink` is a boolean.
 */
var escapeLinkTarget = function(linkTarget, state) {
    // Entity-escape the content.
    const escaped = Util.escapeWtEntities(linkTarget);
    const isValid = state.env.isValidLinkTarget(escaped);
    // `isValidLinkTarget` omits fragments (the part after #) so,
    // even though "|" is an invalid character, we still need to ensure
    // it doesn't appear in there. The percent encoded version is fine
    // in the fragment, since it won't break the parse.
    const invalidLink = !isValid || /\|/.test(escaped);
    return {
        linkTarget: escaped,
        // Is this an invalid link?
        invalidLink: invalidLink,
    };
};
/**
* Collect the round-trip data needed to serialize a link node.
*
* Builds an `rtData` record from the node's data-parsoid, its `rel` and
* `href` attributes and its children, then refines the link type and
* target:
* - for `mw:MediaLink`, the target is taken from the `resource` attribute
*   (or reconstructed from the href) and the record is returned early;
* - an href matching one of the wiki's interwiki URL patterns may be
*   turned into an interwiki (or local) wikilink target.
*
* Besides the fields initialised below, the record may carry
* `contentModified`, `contentNode`, `isRedirect`, `isInterwiki`,
* `isInterwikiLang` and `isLocal`.
*
* @param {Object} env Environment; `env.conf.wiki` is read.
* @param {Node} node The link node.
* @param {Object} state Serializer state; `state.serializer.serializedAttrVal`
*   and `state.inModifiedContent` are used.
* @return {Promise} Presumably resolves to `rtData` (wrapped by
*   `Promise.async`, defined outside this file).
*/
var getLinkRoundTripData = Promise.async(function *(env, node, state) {
var dp = DOMDataUtils.getDataParsoid(node);
var wiki = env.conf.wiki;
var rtData = {
type: null, // could be null
href: null, // filled in below
origHref: null, // filled in below
target: null, // filled in below
tail: dp.tail || '',
prefix: dp.prefix || '',
content: {}, // string or tokens
};
// Figure out the type of the link
if (node.hasAttribute('rel')) {
var rel = node.getAttribute('rel');
// Parsoid only emits and recognizes ExtLink, WikiLink, and PageProp rel values.
// Everything else defaults to ExtLink during serialization (unless it is
// serializable to a wikilink)
var typeMatch = rel.match(/\b(mw:(WikiLink|ExtLink|MediaLink|PageProp)[^\s]*)\b/);
if (typeMatch) {
rtData.type = typeMatch[1];
// Strip link subtype info
if (/^mw:(Wiki|Ext)Link\//.test(rtData.type)) {
rtData.type = 'mw:' + typeMatch[2];
}
}
}
// Default link type if nothing else is set
// (links wrapping a media element keep a null type)
if (rtData.type === null && !DOMUtils.selectMediaElt(node)) {
rtData.type = 'mw:ExtLink';
}
// Get href, and save the token's "real" href for comparison
var href = getHref(env, node);
rtData.origHref = href;
// Drop any leading run of "./" and "../" segments
rtData.href = href.replace(/^(\.\.?\/)+/, '');
// WikiLinks should be relative (but see below); fixup the link type
// if a WikiLink has an absolute URL.
// (This may get converted back to a WikiLink below, in the interwiki
// handling code.)
if (rtData.type === 'mw:WikiLink' &&
(/^(\w+:)?\/\//.test(rtData.href) || /^\//.test(rtData.origHref))) {
rtData.type = 'mw:ExtLink';
}
// Now get the target from rt data
rtData.target = yield state.serializer.serializedAttrVal(node, 'href');
// Check if the link content has been modified or is newly inserted content.
// FIXME: This will only work with selser of course. Hard to test without selser.
if (state.inModifiedContent || DiffUtils.hasDiffMark(node, env, 'subtree-changed')) {
rtData.contentModified = true;
}
// Get the content string or tokens
var contentParts;
if (node.hasChildNodes() && DOMUtils.allChildrenAreText(node)) {
var contentString = node.textContent;
if (rtData.target.value && rtData.target.value !== contentString) {
// Try to identify a new potential tail
contentParts = splitLinkContentString(contentString, dp, rtData.target);
rtData.content.string = contentParts.contentString;
rtData.tail = contentParts.tail;
rtData.prefix = contentParts.prefix;
} else {
// Content equals the target (or there is no target value):
// no prefix/tail splitting applies.
rtData.tail = '';
rtData.prefix = '';
rtData.content.string = contentString;
}
} else if (node.hasChildNodes()) {
// Non-text children: keep the node so the content can be serialized later
rtData.contentNode = node;
} else if (/^mw:PageProp\/redirect$/.test(rtData.type)) {
// Childless redirect link: the prefix is the original source if recorded,
// otherwise the first redirect magic-word alias followed by a space.
rtData.isRedirect = true;
rtData.prefix = dp.src ||
((wiki.mwAliases.redirect[0] || '#REDIRECT') + ' ');
}
// Update link type based on additional analysis.
// What might look like external links might be serializable as a wikilink.
var target = rtData.target;
// mw:MediaLink annotations are considered authoritative
// and interwiki link matches aren't made for these
if (/\bmw:MediaLink\b/.test(rtData.type)) {
// Parse title from resource attribute (see analog in image handling)
var resource = yield state.serializer.serializedAttrVal(node, 'resource');
if (resource.value === null) {
// from non-parsoid HTML: try to reconstruct resource from href?
// (See similar code which tries to guess resource from <img src>)
var mediaPrefix = wiki.namespaceNames[wiki.namespaceIds.get('media')];
resource = {
value: mediaPrefix + ':' + rtData.origHref.replace(/.*\//, ''),
fromsrc: false,
modified: false,
};
}
rtData.target = resource;
rtData.href = rtData.target.value.replace(/^(\.\.?\/)+/, '');
return rtData;
}
// Check if the href matches any of our interwiki URL patterns
// (the code below uses interWikiMatch[0] as the interwiki prefix and
// interWikiMatch[1] as the title part)
var interWikiMatch = wiki.interWikiMatcher().match(href);
if (interWikiMatch
// Question mark is a valid title char, so it won't fail the test below,
// but gets percent encoded on the way out since it has special
// semantics in a url. That will break the url we're serializing, so
// protect it.
// FIXME: If ever the default value for $wgExternalInterwikiFragmentMode
// changes, we can reduce this by always stripping off the fragment
// identifier, since in "html5" mode, that isn't encoded. At present,
// we can only do that if we know it's a local interwiki link.
&& !/\?/.test(interWikiMatch[1])
// Ensure we have a valid link target, otherwise falling back to extlink
// is preferable, since it won't serialize as a link.
&& (!interWikiMatch[1].length ||
!escapeLinkTarget(interWikiMatch[1], state).invalidLink)
// ExtLinks should have content to convert.
&& (rtData.type !== 'mw:ExtLink' || rtData.content.string || rtData.contentNode)
&& (dp.isIW || target.modified || rtData.contentModified)) {
// External link that is really an interwiki link. Convert it.
// TODO: Leaving this for backwards compatibility, remove when 1.5 is no longer bound
if (rtData.type === 'mw:ExtLink') {
rtData.type = 'mw:WikiLink';
}
rtData.isInterwiki = true;
// could this be confused with a language link?
var iwi = wiki.interwikiMap.get(normalizeIWP(interWikiMatch[0]));
rtData.isInterwikiLang = iwi && iwi.language !== undefined;
// is this our own wiki?
rtData.isLocal = iwi && iwi.localinterwiki !== undefined;
// strip off localinterwiki prefixes
// After the loop, `localPrefix` holds the leading run of prefixes that map
// to localinterwiki entries and `oldPrefix` is the match for the first
// prefix that does not (or null if no further prefix was found).
var localPrefix = '';
var oldPrefix;
while (true) { // eslint-disable-line
oldPrefix = target.value.slice(localPrefix.length).match(/^(:?[^:]+):/);
if (!oldPrefix) {
break;
}
iwi = wiki.interwikiMap.get(
Util.normalizeNamespaceName(oldPrefix[1].replace(/^:/, ''))
);
if (!iwi || iwi.localinterwiki === undefined) {
break;
}
localPrefix += oldPrefix[1] + ':';
}
if (target.fromsrc && !target.modified) {
// Leave the target alone!
} else if (/\bmw:PageProp\/Language\b/.test(rtData.type)) {
target.value = interWikiMatch.join(':').replace(/^:/, '');
} else if (
oldPrefix && ( // Should we preserve the old prefix?
oldPrefix[1].toLowerCase() === interWikiMatch[0].toLowerCase() ||
// Check if the old prefix mapped to the same URL as
// the new one. Use the old one if that's the case.
// Example: [[w:Foo]] vs. [[:en:Foo]]
(wiki.interwikiMap.get(normalizeIWP(oldPrefix[1])) || {}).url ===
(wiki.interwikiMap.get(normalizeIWP(interWikiMatch[0])) || {}).url
)
) {
// Reuse old prefix capitalization
// NOTE(review): the substr offset below does not add localPrefix.length,
// although oldPrefix was matched after skipping localPrefix -- confirm
// this is intended when local interwiki prefixes were stripped.
if (Util.decodeWtEntities(target.value.substr(oldPrefix[1].length + 1)) !== interWikiMatch[1]) {
// Modified, update target.value.
target.value = localPrefix + oldPrefix[1] + ':' + interWikiMatch[1];
}
// Ensure that we generate an interwiki link and not a language link!
if (rtData.isInterwikiLang && !(/^:/.test(target.value))) {
target.value = ':' + target.value;
}
// Else: preserve old encoding
} else if (rtData.isLocal) {
// - interwikiMatch will be ":en", ":de", etc.
// - This tests whether the interwiki-like link is actually
// a local wikilink.
target.value = interWikiMatch[1];
rtData.isInterwiki = rtData.isInterwikiLang = false;
} else {
target.value = interWikiMatch.join(':');
}
}
return rtData;
});
/**
 * The provided URL is already percent-encoded -- but it may still
 * not be safe for wikitext. Add additional escapes to make the URL
 * wikitext-safe. Don't touch percent escapes already in the url,
 * though!
 *
 * Two passes are made:
 * 1. every character that may not appear in an external link URL (and a
 *    `-` directly followed by `{`) is entity-encoded;
 * 2. the brackets of a bracketed IPv6 host, which pass 1 has just
 *    entity-encoded, are restored.
 *
 * @private
 * @param {string} urlStr Percent-encoded URL.
 * @return {string} The URL with wikitext-unsafe characters entity-encoded.
 */
var escapeExtLinkURL = function(urlStr) {
    // this regexp is the negation of EXT_LINK_URL_CLASS in the PHP parser
    return urlStr.replace(/[\]\[<>"\x00-\x20\x7F\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]|-(?=\{)/g, function(m) {
        return Util.entityEncodeAll(m);
    }).replace(
        // IPv6 host names are bracketed with []. Entity-decode these.
        // The pattern has to look for the *encoded* brackets, since the
        // first pass has already replaced the raw ones. This assumes
        // Util.entityEncodeAll emits hexadecimal character references
        // (`&#x5B;` / `&#x5D;`); the `i` flag tolerates either hex case.
        /^([a-z][^:\/]*:)?\/\/&#x5B;([0-9a-f:.]+)&#x5D;(:\d|\/|$)/i,
        '$1//[$2]$3'
    );
};
/**
 * Prefix a wikilink target with ':' when it would otherwise be read as a
 * category or file inclusion.
 *
 * The colon is added only if the target's namespace is Category or File,
 * the link type is exactly `mw:WikiLink`, and the target does not already
 * start with a colon.
 *
 * @private
 * @param {Object} env Environment; `env.makeTitleFromText` is called.
 * @param {string} linkTarget Wikilink target text.
 * @param {Object} linkData Link round-trip data; `linkData.type` is read.
 * @return {string} The (possibly colon-escaped) target.
 */
var addColonEscape = function(env, linkTarget, linkData) {
    const title = env.makeTitleFromText(linkTarget);
    const isCategoryOrFile =
        title.getNamespace().isCategory() || title.getNamespace().isFile();
    if (!isCategoryOrFile) {
        return linkTarget;
    }
    if (linkData.type !== 'mw:WikiLink') {
        return linkTarget;
    }
    if (/^:/.test(linkTarget)) {
        return linkTarget;
    }
    // Escape category and file links
    return ':' + linkTarget;
};
/**
 * Decide whether a link can be serialized as a bare URL.
 *
 * That requires plain-text content equal to the link target (or to the
 * node's href), not protocol-relative, and carrying a valid protocol.
 *
 * @param {Object} env Environment.
 * @param {Node} node The link node.
 * @param {Object} linkData Link round-trip data; `linkData.target` is read.
 * @return {*} Falsy (`null`, `''` or `false`) when the link cannot be a
 *   URL link; otherwise the result of `Util.isProtocolValid`.
 */
var isURLLink = function(env, node, linkData) {
    const target = linkData.target;
    // Get plain text content, if any
    let text = null;
    if (node.hasChildNodes() && DOMUtils.allChildrenAreText(node)) {
        text = node.textContent;
    }
    if (!text) {
        return text;
    }
    // Can we minimize this?
    const matchesTarget = target.value === text || getHref(env, node) === text;
    if (!matchesTarget) {
        return false;
    }
    // protocol-relative url links not allowed in text
    // (see autourl rule in peg tokenizer, T32269)
    if (/^\/\//.test(text)) {
        return false;
    }
    return Util.isProtocolValid(text, env);
};
/**
* Figure out if we need a piped or simple link.
*
* A link can be "simple" (`[[Target]]`) only when it has plain string
* content that is equivalent, after normalization, to its target or href.
*
* @param {Object} env Environment; `normalizedTitleKey`, `resolveTitle`,
*   `conf.wiki.namespacesWithSubpages` and `page.ns` are used.
* @param {Object} dp data-parsoid of the link node; `dp.stx` is read.
* @param {Object} target Shadowed href info; `value` and `modified` are read.
* @param {Object} linkData Link round-trip data; `content.string`,
*   `contentModified` and `href` are read.
* @return {boolean} Whether the link can be serialized without a pipe.
*/
var isSimpleWikiLink = function(env, dp, target, linkData) {
var canUseSimple = false;
var contentString = linkData.content.string;
// FIXME (SSS):
// 1. Revisit this logic to see if all these checks
// are still relevant or whether this can be simplified somehow.
// 2. There are also duplicate computations for env.normalizedTitleKey(..)
// and Util.decodeURIComponent(..) that could be removed.
// 3. This could potentially be refactored as if-then chains.
// Would need to pipe for any non-string content.
// Preserve unmodified or non-minimal piped links.
if (contentString !== undefined
&& (target.modified || linkData.contentModified || dp.stx !== 'piped')
// Relative links are not simple
&& !contentString.match(/^\.\//)
) {
// Strip colon escapes from the original target as that is
// stripped when deriving the content string.
// Strip ./ prefixes as well since they are relative link prefixes
// added to all titles.
var strippedTargetValue = target.value.replace(/^(:|\.\/)/, '');
var decodedTarget = Util.decodeWtEntities(strippedTargetValue);
// Deal with the protocol-relative link scenario as well
var hrefHasProto = /^(\w+:)?\/\//.test(linkData.href);
// Normalize content string and decoded target before comparison.
// Piped links don't come down this path => it is safe to normalize both.
contentString = contentString.replace(/_/g, ' ');
decodedTarget = decodedTarget.replace(/_/g, ' ');
// See if the (normalized) content matches the
// target, either shadowed or actual.
canUseSimple = (
contentString === decodedTarget
// try wrapped in forward slashes in case they were stripped
|| ('/' + contentString + '/') === decodedTarget
// normalize as titles and compare
|| env.normalizedTitleKey(contentString, true) === decodedTarget.replace(MW_TITLE_WHITESPACE_RE, '_')
// Relative link
// NOTE(review): `&&` binds tighter than `||`, so as written the
// namespacesWithSubpages check guards only the first ("../Foo")
// alternative, not the second ("../Foo/") one -- confirm this is intended.
|| (env.conf.wiki.namespacesWithSubpages[env.page.ns] &&
(/^\.\.\/.*[^\/]$/.test(strippedTargetValue) &&
contentString === env.resolveTitle(strippedTargetValue)) ||
(/^\.\.\/.*?\/$/.test(strippedTargetValue) &&
contentString === strippedTargetValue.replace(/^(?:\.\.\/)+(.*?)\/$/, '$1')))
// if content == href this could be a simple link... eg [[Foo]].
// but if href is an absolute url with protocol, this won't
// work: [[http://example.com]] is not a valid simple link!
|| (!hrefHasProto &&
// Always compare against decoded uri because
// <a rel="mw:WikiLink" href="7%25 Solution">7%25 Solution</a></p>
// should serialize as [[7% Solution|7%25 Solution]]
(contentString === Util.decodeURIComponent(linkData.href) ||
// normalize with underscores for comparison with href
env.normalizedTitleKey(contentString, true) === Util.decodeURIComponent(linkData.href)))
);
}
return canUseSimple;
};
/**
* Serialize a link node using wikilink syntax and emit the result through
* `state.emitChunk`.
*
* Depending on the link data this emits one of:
* - a bare URL (when the content is just the URL itself),
* - a redirect (buffered in `state.redirectText` when not at the start of
*   the output; duplicates are dropped),
* - a simple link `[[Target]]`,
* - a piped link `[[Target|content]]`,
* - or, when the target is not a valid title, only the (escaped) text,
*   after logging an error.
*
* Note that this mutates `linkData` (and its `target`) as well as the
* node's data-parsoid `tail`/`prefix`.
*
* @param {Node} node The link node.
* @param {Object} state Serializer state.
* @param {Object} linkData Round-trip data as built by getLinkRoundTripData.
* @return {Promise} Presumably a promise settling once the chunk has been
*   emitted (wrapped by `Promise.async`, defined outside this file).
*/
var serializeAsWikiLink = Promise.async(function *(node, state, linkData) {
var contentParts;
var contentSrc = '';
var isPiped = false;
var requiresEscaping = true;
var env = state.env;
var wiki = env.conf.wiki;
// Remembered so invalid-link text can be escaped in the original SOL context
var oldSOLState = state.onSOL;
var target = linkData.target;
var dp = DOMDataUtils.getDataParsoid(node);
// Decode any link that did not come from the source (data-mw/parsoid)
// Links that come from data-mw/data-parsoid will be true titles,
// but links that come from hrefs will need to be url-decoded.
// Ex: <a href="/wiki/A%3Fb">Foobar</a>
if (!target.fromsrc) {
// Omit fragments from decoding
var hash = target.value.indexOf('#');
if (hash > -1) {
target.value = Util.decodeURIComponent(target.value.substring(0, hash)) + target.value.substring(hash);
} else {
target.value = Util.decodeURIComponent(target.value);
}
}
// Special-case handling for category links
if (linkData.type === 'mw:PageProp/Category') {
// Split target and sort key
var targetParts = target.value.match(/^([^#]*)#(.*)/);
if (targetParts) {
target.value = targetParts[1]
.replace(/^(\.\.?\/)*/, '')
.replace(/_/g, ' ');
// FIXME: Reverse `Sanitizer.sanitizeTitleURI(strContent).replace(/#/g, '%23');`
var strContent = Util.decodeURIComponent(targetParts[2]);
contentParts = splitLinkContentString(strContent, dp);
linkData.content.string = contentParts.contentString;
dp.tail = linkData.tail = contentParts.tail;
dp.prefix = linkData.prefix = contentParts.prefix;
} else { // No sort key, will serialize to simple link
// Normalize the content string
linkData.content.string = target.value.replace(/^\.\//, '').replace(/_/g, ' ');
}
// Special-case handling for template-affected sort keys
// FIXME: sort keys cannot be modified yet, but if they are,
// we need to fully shadow the sort key.
// if ( !target.modified ) {
// The target and source key was not modified
var sortKeySrc =
yield state.serializer.serializedAttrVal(node, 'mw:sortKey');
if (sortKeySrc.value !== null) {
linkData.contentNode = undefined;
linkData.content.string = sortKeySrc.value;
// TODO: generalize this flag. It is already used by
// getAttributeShadowInfo. Maybe use the same
// structure as its return value?
linkData.content.fromsrc = true;
}
// }
} else if (linkData.type === 'mw:PageProp/Language') {
// Fix up the content string
// TODO: see if linkData can be cleaner!
if (linkData.content.string === undefined) {
linkData.content.string = Util.decodeWtEntities(target.value);
}
}
// `linkTarget` is the wikitext target to emit; `escapedTgt` is only set on
// the paths that run escapeLinkTarget and carries its `invalidLink` flag.
var linkTarget, escapedTgt;
if (linkData.isRedirect) {
linkTarget = target.value;
if (target.modified || !target.fromsrc) {
linkTarget = linkTarget.replace(/^(\.\.?\/)*/, '').replace(/_/g, ' ');
escapedTgt = escapeLinkTarget(linkTarget, state);
linkTarget = escapedTgt.linkTarget;
// Determine if it's a redirect to a category, in which case
// it needs a ':' on front to distinguish from a category link.
var categoryMatch = linkTarget.match(/^([^:]+)[:]/);
if (categoryMatch) {
var ns = wiki.namespaceIds.get(Util.normalizeNamespaceName(categoryMatch[1]));
if (ns === wiki.canonicalNamespaces.category) {
// Check that the next node isn't a category link,
// in which case we don't want the ':'.
var nextNode = node.nextSibling;
if (!(
nextNode && DOMUtils.isElt(nextNode) && nextNode.nodeName === "LINK" &&
nextNode.getAttribute('rel') === "mw:PageProp/Category" &&
nextNode.getAttribute('href') === node.getAttribute('href')
)) {
linkTarget = ':' + linkTarget;
}
}
}
}
} else if (isSimpleWikiLink(env, dp, target, linkData)) {
// Simple case
if (!target.modified && !linkData.contentModified) {
linkTarget = target.value.replace(/^\.\//, '');
} else {
// If token has templated attrs or is a subpage, use target.value
// since content string will be drastically different.
if (WTUtils.hasExpandedAttrsType(node) ||
/(^|\/)\.\.\//.test(target.value)) {
linkTarget = target.value.replace(/^\.\//, '');
} else {
escapedTgt = escapeLinkTarget(linkData.content.string, state);
if (!escapedTgt.invalidLink) {
linkTarget = addColonEscape(env, escapedTgt.linkTarget, linkData);
} else {
linkTarget = escapedTgt.linkTarget;
}
}
if (linkData.isInterwikiLang && !/^[:]/.test(linkTarget) &&
linkData.type !== 'mw:PageProp/Language') {
// ensure interwiki links can't be confused with
// interlanguage links.
linkTarget = ':' + linkTarget;
}
}
} else if (isURLLink(state.env, node, linkData)/* && !linkData.isInterwiki */) {
// Uncomment the above check if we want [[wikipedia:Foo|http://en.wikipedia.org/wiki/Foo]]
// for '<a href="http://en.wikipedia.org/wiki/Foo">http://en.wikipedia.org/wiki/Foo</a>'
linkData.linkType = "mw:URLLink";
} else {
// Emit piped wikilink syntax
isPiped = true;
// First get the content source
if (linkData.contentNode) {
var cs = yield state.serializeLinkChildrenToString(
linkData.contentNode,
state.serializer.wteHandlers.wikilinkHandler
);
// strip off the tail and handle the pipe trick
contentParts = splitLinkContentString(cs, dp);
contentSrc = contentParts.contentString;
dp.tail = contentParts.tail;
linkData.tail = contentParts.tail;
dp.prefix = contentParts.prefix;
linkData.prefix = contentParts.prefix;
// Serialized children are not escaped again below
requiresEscaping = false;
} else {
contentSrc = linkData.content.string || '';
requiresEscaping = !linkData.content.fromsrc;
}
if (contentSrc === '' &&
linkData.type !== 'mw:PageProp/Category') {
// Protect empty link content from PST pipe trick
contentSrc = '<nowiki/>';
requiresEscaping = false;
}
linkTarget = target.value;
if (target.modified || !target.fromsrc) {
// Links starting with ./ shouldn't get _ replaced with ' '
var linkContentIsRelative =
linkData.content && linkData.content.string &&
linkData.content.string.match(/^\.\//);
linkTarget = linkTarget.replace(/^(\.\.?\/)*/, '');
if (!linkData.isInterwiki && !linkContentIsRelative) {
linkTarget = linkTarget.replace(/_/g, ' ');
}
escapedTgt = escapeLinkTarget(linkTarget, state);
linkTarget = escapedTgt.linkTarget;
}
// If we are reusing the target from source, we don't
// need to worry about colon-escaping because it will
// be in the right form already.
//
// Trying to eliminate this check and always check for
// colon-escaping seems a bit tricky when the reused
// target has encoded entities that won't resolve to
// valid titles.
if ((!escapedTgt || !escapedTgt.invalidLink) && !target.fromsrc) {
linkTarget = addColonEscape(env, linkTarget, linkData);
}
}
// Bare URL: emit the node's text as an autolink and stop
if (linkData.linkType === "mw:URLLink") {
state.emitChunk(new AutoURLLinkText(node.textContent, node), node);
return;
}
if (linkData.isRedirect) {
// Drop duplicates
if (state.redirectText !== null) {
return;
}
// Buffer redirect text if it is not in start of file position
if (!REDIRECT_TEST_RE.test(state.out + state.currLine.text)) {
state.redirectText = linkData.prefix + '[[' + linkTarget + ']]';
state.emitChunk('', node); // Flush separators for this node
return;
}
// Set to some non-null string
state.redirectText = 'unbuffered';
}
var pipedText;
if (escapedTgt && escapedTgt.invalidLink) {
// If the link target was invalid, instead of emitting an invalid link,
// omit the link and serialize just the content instead. But, log the
// invalid html for Parsoid clients to investigate later.
state.env.log("error/html2wt/link", "Bad title text", node.outerHTML);
// For non-piped content, use the original invalid link text
pipedText = isPiped ? contentSrc : linkTarget;
if (requiresEscaping) {
// Escape the text in the old sol context
state.onSOL = oldSOLState;
pipedText = state.serializer.wteHandlers.escapeWikiText(state, pipedText, { node: node });
}
state.emitChunk(linkData.prefix + pipedText + linkData.tail, node);
} else {
if (isPiped && requiresEscaping) {
// We are definitely not in sol context since content
// will be preceded by "[[" or "[" text in target wikitext.
pipedText = '|' + state.serializer.wteHandlers.escapeLinkContent(state, contentSrc, false, node, false);
} else if (isPiped) {
pipedText = '|' + contentSrc;
} else {
pipedText = '';
}
state.emitChunk(new WikiLinkText(
linkData.prefix + '[[' + linkTarget + pipedText + ']]' + linkData.tail,
node, wiki, linkData.type), node);
}
});
/**
 * Serialize a link node using external-link syntax and emit the result
 * through `state.emitChunk`.
 *
 * In order of preference this produces a bare URL, a magic link
 * (ISBN/RFC/PMID), an anchor-only wikilink (`[[#frag|text]]`), or a
 * bracketed external link (`[url text]`).
 *
 * @param {Node} node The link node.
 * @param {Object} state Serializer state.
 * @param {Object} linkData Round-trip data as built by getLinkRoundTripData.
 * @return {Promise} Presumably a promise settling once the chunk has been
 *   emitted (wrapped by `Promise.async`, defined outside this file).
 */
var serializeAsExtLink = Promise.async(function *(node, state, linkData) {
    const target = linkData.target;
    let urlStr = target.value;
    const mustEscapeUrl = target.modified || !target.fromsrc;
    if (mustEscapeUrl) {
        // We expect modified hrefs to be percent-encoded already, so
        // don't need to encode them here any more. Unmodified hrefs are
        // just using the original encoding anyway.
        // BUT we do have to encode certain special wikitext
        // characters (like []) which aren't necessarily
        // percent-encoded because they are valid in URLs and HTML5
        urlStr = escapeExtLinkURL(urlStr);
    }

    // Serialize as URL link
    if (isURLLink(state.env, node, linkData)) {
        state.emitChunk(new AutoURLLinkText(urlStr, node), node);
        return;
    }

    const wiki = state.env.conf.wiki;
    // TODO: match vs. interwikis too
    const magicLinkMatch = wiki.ExtResourceURLPatternMatcher.match(Util.decodeURI(linkData.origHref));
    const pureHashMatch = /^#/.test(urlStr);

    // Fully serialize the content
    const contentStr = yield state.serializeLinkChildrenToString(
        node,
        pureHashMatch ?
            state.serializer.wteHandlers.wikilinkHandler :
            state.serializer.wteHandlers.aHandler
    );

    // First check for ISBN/RFC/PMID links. We rely on selser to
    // preserve non-minimal forms.
    if (magicLinkMatch) {
        const serializer = wiki.ExtResourceSerializer[magicLinkMatch[0]];
        const serialized = serializer(magicLinkMatch, target.value, contentStr);
        let magicChunk;
        if (serialized[0] !== '[') {
            magicChunk = new MagicLinkText(serialized, node);
        } else if (magicLinkMatch[0] === 'ISBN') {
            // Serialization as a magic link failed (perhaps the
            // content string wasn't appropriate).
            magicChunk = new WikiLinkText(serialized, node, wiki, 'mw:WikiLink');
        } else {
            magicChunk = new ExtLinkText(serialized, node, wiki, 'mw:ExtLink');
        }
        state.emitChunk(magicChunk, node);
        return;
    }
    // There is an interwiki for RFCs, but strangely none for PMIDs.

    // serialize as auto-numbered external link
    // [http://example.com]
    let Construct;
    let linktext;
    if (pureHashMatch) {
        // If it's just anchor text, serialize as an internal link.
        Construct = WikiLinkText;
        linktext = '[[' + urlStr + (contentStr ? '|' + contentStr : '') + ']]';
    } else {
        Construct = ExtLinkText;
        linktext = '[' + urlStr + (contentStr ? ' ' + contentStr : '') + ']';
    }
    state.emitChunk(new Construct(linktext, node, wiki, linkData.type), node);
});
/**
* Main link handler.
*
* Gathers the link's round-trip data and dispatches on the link type:
* wikilink-like types go to serializeAsWikiLink, `mw:ExtLink` goes to
* serializeAsExtLink, and any other typed link throws. Links without a
* usable type or target are serialized as a figure (`<a><img></a>`), as an
* external link, or as plain text (a `<span name=...>` when the anchor has
* a `name` attribute but no href).
*
* @function
* @param {Object} state Serializer state.
* @param {Node} node The link node.
* @return {Promise}
*/
var linkHandler = Promise.async(function *(state, node) {
// TODO: handle internal/external links etc using RDFa and dataAttribs
// Also convert unannotated html links without advanced attributes to
// external wiki links for html import. Might want to consider converting
// relative links without path component and file extension to wiki links.
var env = state.env;
var wiki = env.conf.wiki;
// Get the rt data from the token and tplAttrs
var linkData = yield getLinkRoundTripData(env, node, state);
var linkType = linkData.type;
if (wiki.ExtResourceURLPatternMatcher.match(Util.decodeURI(linkData.origHref))) {
// Override the 'rel' type if this is a magic link
linkType = 'mw:ExtLink';
}
if (linkType !== null && linkData.target.value !== null) {
// We have a type and target info
// NOTE(review): the alternation below is not grouped, so it reads as
// "starts with mw:WikiLink" OR "ends with mw:MediaLink" -- confirm whether
// /^mw:(WikiLink|MediaLink)$/ was intended.
if (/^mw:WikiLink|mw:MediaLink$/.test(linkType) ||
TokenUtils.solTransparentLinkRegexp.test(linkType)) {
// [[..]] links: normal, category, redirect, or lang links
// (except images)
return (yield serializeAsWikiLink(node, state, linkData));
} else if (linkType === 'mw:ExtLink') {
// [..] links, autolinks, ISBN, RFC, PMID
return (yield serializeAsExtLink(node, state, linkData));
} else {
throw new Error('Unhandled link serialization scenario: ' +
node.outerHTML);
}
} else {
// Attributes that can be dropped without logging an error
var safeAttr = new Set(["href", "rel", "class", "title", DOMDataUtils.DataObjectAttrName()]);
// True if the element carries any attribute outside the safe set
var isComplexLink = function(attributes) {
for (var i = 0; i < attributes.length; i++) {
var attr = attributes.item(i);
// XXX: Don't drop rel and class in every case once a tags are
// actually supported in the MW default config?
if (attr.name && !safeAttr.has(attr.name)) {
return true;
}
}
return false;
};
var isFigure = false;
if (isComplexLink(node.attributes)) {
env.log("error/html2wt/link", "Encountered", node.outerHTML,
"-- serializing as extlink and dropping <a> attributes unsupported in wikitext.");
} else {
// Only a media element that is a direct child of the link counts
var media = DOMUtils.selectMediaElt(node);
isFigure = !!(media && media.parentElement === node);
}
var hrefStr;
if (isFigure) {
// this is a basic html figure: <a><img></a>
return (yield state.serializer.figureHandler(node));
} else {
// href is already percent-encoded, etc., but it might contain
// spaces or other wikitext nasties. escape the nasties.
hrefStr = escapeExtLinkURL(getHref(env, node));
var handler = state.serializer.wteHandlers.aHandler;
var str = yield state.serializeLinkChildrenToString(node, handler);
var chunk;
if (!hrefStr) {
// Without an href, we just emit the string as text.
// However, to preserve targets for anchor links,
// serialize as a span with a name.
if (node.hasAttribute('name')) {
var name = node.getAttribute('name');
var doc = node.ownerDocument;
var span = doc.createElement('span');
span.setAttribute('name', name);
span.appendChild(doc.createTextNode(str));
chunk = span.outerHTML;
} else {
chunk = str;
}
} else {
chunk = new ExtLinkText('[' + hrefStr + ' ' + str + ']',
node, wiki, 'mw:ExtLink');
}
state.emitChunk(chunk, node);
}
}
});
/**
 * Map a media RDFa type to the tag name of the element that carries it.
 *
 * @param {string} type RDFa type such as 'mw:Audio' or 'mw:Video'.
 * @return {string} 'AUDIO', 'VIDEO', or 'IMG' for anything else.
 */
function eltNameFromMediaType(type) {
    if (type === 'mw:Audio') {
        return 'AUDIO';
    }
    if (type === 'mw:Video') {
        return 'VIDEO';
    }
    return 'IMG';
}
/**
* Main figure handler.
*
* All figures have a fixed structure:
* ```
* <figure or figure-inline typeof="mw:Image...">
* <a or span><img ...><a or span>
* <figcaption>....</figcaption>
* </figure or figure-inline>
* ```
* Pull out this fixed structure, being as generous as possible with
* possibly-broken HTML.
*
* @function
* @param {Node} node
* @return {Promise}
*/
var figureHandler = Promise.async(function *(state, node) {
var env = state.env;
var outerElt = node;
const mediaTypeInfo = WTSUtils.getMediaType(node);
const { rdfaType } = mediaTypeInfo;
let { format } = mediaTypeInfo;
var eltName = eltNameFromMediaType(rdfaType);
var elt = node.querySelector(eltName);
// TODO: Remove this when version 1.7.0 of the content is no longer supported
if (!elt && rdfaType === 'mw:Audio') {
eltName = 'VIDEO';
elt = node.querySelector(eltName);
}
var linkElt = null;
// parent of elt is probably the linkElt
if (elt &&
(elt.parentElement.tagName === 'A' ||
(elt.parentElement.tagName === 'SPAN' &&
elt.parentElement !== outerElt))) {
linkElt = elt.parentElement;
}
// FIGCAPTION or last child (which is not the linkElt) is the caption.
var captionElt = node.querySelector('FIGCAPTION');
if (!captionElt) {
for (captionElt = node.lastElementChild;
captionElt;
captionElt = captionElt.previousElementSibling) {
if (captionElt !== linkElt && captionElt !== elt &&
/^(SPAN|DIV)$/.test(captionElt.tagName)) {
break;
}
}
}
// special case where `node` is the ELT tag itself!
if (node.tagName === eltName) {
linkElt = captionElt = null;
outerElt = elt = node;
}
// Maybe this is "missing" media, i.e. a redlink
let isMissing = false;
if (!elt &&
/^FIGURE/.test(outerElt.nodeName) &&
outerElt.firstChild && outerElt.firstChild.nodeName === 'A' &&
outerElt.firstChild.firstChild && outerElt.firstChild.firstChild.nodeName === 'SPAN') {
linkElt = outerElt.firstChild;
elt = linkElt.firstChild;
isMissing = true;
}
// The only essential thing is the ELT tag!
if (!elt) {
env.log("error/html2wt/figure",
"In WSP.figureHandler, node does not have any " + eltName + " elements:",
node.outerHTML);
state.emitChunk('', node);
return;
}
// Try to identify the local title to use for this image.
var resource = yield state.serializer.serializedImageAttrVal(outerElt, elt, 'resource');
if (resource.value === null) {
// from non-parsoid HTML: try to reconstruct resource from src?
// (this won't work for manual-thumb images)
if (!elt.hasAttribute('src')) {
env.log("error/html2wt/figure",
"In WSP.figureHandler, img does not have resource or src:",
node.outerHTML);
state.emitChunk('', node);
return;
}
var src = elt.getAttribute('src');
if (/^https?:/.test(src)) {
// external image link, presumably $wgAllowExternalImages=true
state.emitChunk(new AutoURLLinkText(src, node), node);
return;
}
resource = {
value: src,
fromsrc: false,
modified: false,
};
}
if (!resource.fromsrc) {
resource.value = resource.value.replace(/^(\.\.?\/)+/, '');
}
var nopts = [];
var outerDP = outerElt ? DOMDataUtils.getDataParsoid(outerElt) : {};
var outerDMW = outerElt ? DOMDataUtils.getDataMw(outerElt) : {};
var mwAliases = state.env.conf.wiki.mwAliases;
var getOpt = function(key) {
if (!outerDP.optList) {
return null;
}
return outerDP.optList.find(function(o) { return o.ck === key; });
};
var getLastOpt = function(key) {
var o = outerDP.optList || [];
for (var i = o.length - 1; i >= 0; i--) {
if (o[i].ck === key) {
return o[i];
}
}
return null;
};
// Try to identify the local title to use for the link.
let link;
const linkFromDataMw = WTSUtils.getAttrFromDataMw(outerDMW, 'link', true);
if (linkFromDataMw !== null) {
// "link" attribute on the `outerElt` takes precedence
if (linkFromDataMw[1].html !== undefined) {
link = yield state.serializer.getAttributeValueAsShadowInfo(outerElt, 'link');
} else {
link = {
value: `link=${linkFromDataMw[1].txt}`,
modified: false,
fromsrc: false,
fromDataMW: true,
};
}
} else if (linkElt && linkElt.hasAttribute('href')) {
link = yield state.serializer.serializedImageAttrVal(outerElt, linkElt, 'href');
if (!link.fromsrc) {
if (linkElt.getAttribute('href') ===
elt.getAttribute('resource')) {
// default link: same place as resource
link = resource;
}
link.value = link.value.replace(/^(\.\.?\/)+/, '');
}
} else {
// Otherwise, just try and get it from data-mw
link = yield state.serializer.getAttributeValueAsShadowInfo(outerElt, 'href');
}
if (link && !link.modified && !link.fromsrc) {
const linkOpt = getOpt('link');
if (linkOpt) {
link.fromsrc = true;
link.value = linkOpt.ak;
}
}
// Reconstruct the caption
if (!captionElt && typeof outerDMW.caption === 'string') {
captionElt = outerElt.ownerDocument.createElement('div');
ContentUtils.ppToDOM(env, outerDMW.caption, { node: captionElt, markNew: true });
// Needs a parent node in order for WTS to be happy:
// DocumentFragment to the rescue!
outerElt.ownerDocument.createDocumentFragment().appendChild(captionElt);
}
var caption = null;
if (captionElt) {
caption = yield state.serializeCaptionChildrenToString(
captionElt, state.serializer.wteHandlers.mediaOptionHandler
);
}
// Fetch the alt (if any)
var alt =
yield state.serializer.serializedImageAttrVal(outerElt, elt, 'alt');
// Fetch the lang (if any)
var lang =
yield state.serializer.serializedImageAttrVal(outerElt, elt, 'lang');
// Ok, start assembling options, beginning with link & alt & lang
// Other media don't have links in output.
const linkCond = elt.nodeName === 'IMG' && (!link || link.value !== resource.value);
// "alt" for non-images is handled below
const altCond = alt.value !== null && elt.nodeName === 'IMG';
[
{ name: 'link', value: link, cond: linkCond },
{ name: 'alt', value: alt, cond: altCond },
{ name: 'lang', value: lang, cond: lang.value !== null },
].forEach(function(o) {
if (!o.cond) { return; }
if (o.value && o.value.fromsrc) {
nopts.push({
ck: o.name,
ak: [ o.value.value ],
});
} else {
let value = o.value ? o.value.value : '';
if (o.value && /^(link|alt)$/.test(o.name)) {
// see wt2html/tt/WikiLinkHandler.js: link and alt are whitelisted
// for accepting arbitrary wikitext, even though it is stripped
// to a string before emitting.
value = state.serializer.wteHandlers.escapeLinkContent(state, value, false, node, true);
}
nopts.push({
ck: o.name,
v: value,
ak: mwAliases['img_' + o.name],
});
}
});
// Handle class-signified options
var classes = outerElt ? outerElt.classList : [];
var extra = []; // 'extra' classes
var val;
for (var ix = 0; ix < classes.length; ix++) {
switch (classes[ix]) {
case 'mw-halign-none':
case 'mw-halign-right':
case 'mw-halign-left':
case 'mw-halign-center':
val = classes[ix].replace(/^mw-halign-/, '');
nopts.push({
ck: val,
ak: mwAliases['img_' + val],
});
break;
case 'mw-valign-top':
case 'mw-valign-middle':
case 'mw-valign-baseline':
case 'mw-valign-sub':
case 'mw-valign-super':
case 'mw-valign-text-top':
case 'mw-valign-bottom':
case 'mw-valign-text-bottom':
val = classes[ix].replace(/^mw-valign-/, '')
.replace(/-/g, '_');
nopts.push({
ck: val,
ak: mwAliases['img_' + val],
});
break;
case 'mw-image-border':
nopts.push({
ck: 'border',
ak: mwAliases.img_border,
});
break;
case 'mw-default-size':
case 'mw-default-audio-height':
// handled below
break;
default:
extra.push(classes[ix]);
break;
}
}
if (extra.length) {
nopts.push({
ck: 'class',
v: extra.join(' '),
ak: mwAliases.img_class,
});
}
var paramFromDataMw = Promise.async(function *(o) {
var v = outerDMW[o.prop];
if (v === undefined) {
var a = WTSUtils.getAttrFromDataMw(outerDMW, o.ck, true);
if (a !== null && a[1].html === undefined) { v = a[1].txt; }
}
if (v !== undefined) {
var ak = yield state.serializer.getAttributeValue(
outerElt, o.ck, mwAliases[o.alias]
);
nopts.push({
ck: o.ck,
ak: ak,
v: v,
});
// Piggyback this here ...
if (o.prop === 'thumb') { format = ''; }
}
});
let mwParams = [
{ prop: 'thumb', ck: 'manualthumb', alias: 'img_manualthumb' },
{ prop: 'page', ck: 'page', alias: 'img_page' },
// mw:Video specific
{ prop: 'starttime', ck: 'starttime', alias: 'timedmedia_starttime' },
{ prop: 'endtime', ck: 'endtime', alias: 'timedmedia_endtime' },
{ prop: 'thumbtime', ck: 'thumbtime', alias: 'timedmedia_thumbtime' },
];
// "alt" for images is handled above
if (elt.nodeName !== 'IMG') {
mwParams = mwParams.concat([
{ prop: 'link', ck: 'link', alias: 'img_link' },
{ prop: 'alt', ck: 'alt', alias: 'img_alt' },
]);
}
yield Promise.map(mwParams, paramFromDataMw);
switch (format) {
case 'Thumb':
nopts.push({
ck: 'thumbnail',
ak: yield state.serializer.getAttributeValue(
outerElt, 'thumbnail', mwAliases.img_thumbnail
)
});
break;
case 'Frame':
nopts.push({
ck: 'framed',
ak: yield state.serializer.getAttributeValue(
outerElt, 'framed', mwAliases.img_framed
)
});
break;
case 'Frameless':
nopts.push({
ck: 'frameless',
ak: yield state.serializer.getAttributeValue(
outerElt, 'frameless', mwAliases.img_frameless
)
});
break;
}
// Get the user-specified height from wikitext
var wh =
yield state.serializer.serializedImageAttrVal(outerElt, elt, `${isMissing ? 'data-' : ''}height`);
// Get the user-specified width from wikitext
var ww =
yield state.serializer.serializedImageAttrVal(outerElt, elt, `${isMissing ? 'data-' : ''}width`);
var sizeUnmodified = ww.fromDataMW || (!ww.modified && !wh.modified);
var upright = getOpt('upright');
// XXX: Infer upright factor from default size for all thumbs by default?
// Better for scaling with user prefs, but requires knowledge about
// default used in VE.
if (sizeUnmodified && upright
// Only serialize upright where it is actually respected
// This causes some dirty diffs, but makes sure that we don't
// produce nonsensical output after a type switch.
// TODO: Only strip if type was actually modified.
&& format in { 'Frameless': 1, 'Thumb': 1 }) {
// preserve upright option
nopts.push({
ck: upright.ck,
ak: [upright.ak], // FIXME: don't use ak here!
});
}
if (!(outerElt && outerElt.classList.contains('mw-default-size'))) {
var size = getLastOpt('width');
var sizeString = (size && size.ak) || (ww.fromDataMW && ww.value);
if (sizeUnmodified && sizeString) {
// preserve original width/height string if not touched
nopts.push({
ck: 'width',
v: sizeString, // original size string
ak: ['$1'], // don't add px or the like
});
} else {
var bbox = null;
// Serialize to a square bounding box
if (ww.value !== null && ww.value !== ''
&& ww.value !== undefined) {
bbox = +ww.value;
}
if (wh.value !== null && wh.value !== '' && wh.value !== undefined &&
// As with "mw-default-size", editing clients should remove the
// "mw-default-audio-height" if they want to factor a defined
// height into the bounding box size. However, note that, at
// present, a defined height for audio is ignored while parsing,
// so this only has the effect of modifying the width.
(rdfaType !== 'mw:Audio' ||
!outerElt.classList.contains('mw-default-audio-height'))) {
var height = +wh.value;
if (bbox === null || height > bbox) {
bbox = height;
}
}
if (bbox !== null) {
nopts.push({
ck: 'width',
// MediaWiki interprets 100px as a width
// restriction only, so we need to make the bounding
// box explicitly square (100x100px). The 'px' is
// added by the alias though, and can be localized.
v: bbox + 'x' + bbox,
ak: mwAliases.img_width, // adds the 'px' suffix
});
}
}
}
var opts = outerDP.optList || []; // original wikitext options
// Add bogus options from old optlist in order to round-trip cleanly (T64500)
opts.forEach(function(o) {
if (o.ck === 'bogus') {
nopts.push({
ck: 'bogus',
ak: [ o.ak ],
});
}
});
// Put the caption last, by default.
if (typeof (caption) === 'string') {
nopts.push({
ck: 'caption',
ak: [caption],
});
}
// ok, sort the new options to match the order given in the old optlist
// and try to match up the aliases used
var changed = false;
nopts.forEach(function(no) {
// Make sure we have an array here. Default in data-parsoid is
// actually a string.
// FIXME: don't reuse ak for two different things!
if (!Array.isArray(no.ak)) {
no.ak = [no.ak];
}
no.sortId = opts.length;
var idx = opts.findIndex(function(o) {
return o.ck === no.ck &&
// for bogus options, make sure the source matches too.
(o.ck !== 'bogus' || o.ak === no.ak[0]);
});
if (idx < 0) {
// Preferred words are first in the alias list
// (but not in old versions of mediawiki).
no.ak = state.env.conf.wiki.useOldAliasOrder ?
lastItem(no.ak) : no.ak[0];
changed = true;
return; /* new option */
}
no.sortId = idx;
// use a matching alias, if there is one
var a = no.ak.find(function(b) {
// note the trim() here; that allows us to snarf eccentric
// whitespace from the original option wikitext
if ('v' in no) { b = b.replace('$1', no.v); }
return b === String(opts[idx].ak).trim();
});
// use the alias (incl whitespace) from the original option wikitext
// if found; otherwise use the last alias given (English default by
// convention that works everywhere).
// TODO: use first alias (localized) instead for RTL languages (T53852)
if (a !== undefined && no.ck !== 'caption') {
no.ak = opts[idx].ak;
no.v = undefined; // prevent double substitution
} else {
no.ak = lastItem(no.ak);
if (!(no.ck === 'caption' && a !== undefined)) {
changed = true;
}
}
});
// Filter out bogus options if the image options/caption have changed.
if (changed) {
nopts = nopts.filter(function(no) { return no.ck !== 'bogus'; });
// empty captions should get filtered out in this case, too (T64264)
nopts = nopts.filter(function(no) {
return !(no.ck === 'caption' && no.ak === '');
});
}
// sort!
nopts.sort(function(a, b) { return a.sortId - b.sortId; });
// emit all the options as wikitext!
var wikitext = '[[' + resource.value;
nopts.forEach(function(o) {
wikitext += '|';
if (o.v !== undefined) {
wikitext += o.ak.replace('$1', o.v);
} else {
wikitext += o.ak;
}
});
wikitext += ']]';
state.emitChunk(
new WikiLinkText(wikitext, node, state.env.conf.wiki, rdfaType),
node);
});
// Publish both serializer handlers on the CommonJS exports object, but only
// when a `module` object is actually present in this environment.
if (typeof module === "object") {
	Object.assign(module.exports, { linkHandler, figureHandler });
}