/** @module */
'use strict';
require('../../core-upgrade.js');
const { DOMUtils } = require('../utils/DOMUtils.js');
const { JSUtils } = require('../utils/jsutils.js');
const { PegTokenizer } = require('../wt2html/tokenizer.js');
const { SanitizerConstants } = require('../wt2html/tt/Sanitizer.js');
const { TagTk, EndTagTk, SelfclosingTagTk, NlTk, EOFTk, CommentTk } = require('../tokens/TokenTypes.js');
const { TokenUtils } = require('../utils/TokenUtils.js');
const { Util } = require('../utils/Util.js');
const { WTUtils } = require('../utils/WTUtils.js');
const { WikitextConstants: Consts } = require('../config/WikitextConstants.js');
// ignore the cases where the serializer adds newlines not present in the dom
function startsOnANewLine(node) {
var name = node.nodeName.toUpperCase();
return Consts.BlockScopeOpenTags.has(name) &&
!WTUtils.isLiteralHTMLNode(node) &&
name !== "BLOCKQUOTE";
}
// look ahead on current line for block content
function hasBlocksOnLine(node, first) {
// special case for firstNode:
// we're at sol so ignore possible \n at first char
if (first) {
if (node.textContent.substring(1).match(/\n/)) {
return false;
}
node = node.nextSibling;
}
while (node) {
if (DOMUtils.isElt(node)) {
if (DOMUtils.isBlockNode(node)) {
return !startsOnANewLine(node);
}
if (node.hasChildNodes()) {
if (hasBlocksOnLine(node.firstChild, false)) {
return true;
}
}
} else {
if (node.textContent.match(/\n/)) {
return false;
}
}
node = node.nextSibling;
}
return false;
}
function hasLeadingEscapableQuoteChar(text, opts) {
var node = opts.node;
// Use 'node.textContent' to do the tests since it hasn't had newlines
// stripped out from it.
// Ex: For this DOM: <i>x</i>\n'\n<i>y</i>
// node.textContent = \n'\n and text = '
// Those newline separators can prevent unnecessary <nowiki/> protection
// if the string begins with one or more newlines before a leading quote.
var origText = node.textContent;
if (origText.match(/^'/)) {
var prev = DOMUtils.previousNonDeletedSibling(node);
if (!prev) {
prev = node.parentNode;
}
if (DOMUtils.isQuoteElt(prev)) {
return true;
}
}
return false;
}
function hasTrailingEscapableQuoteChar(text, opts) {
var node = opts.node;
// Use 'node.textContent' to do the tests since it hasn't had newlines
// stripped out from it.
// Ex: For this DOM: <i>x</i>\n'\n<i>y</i>
// node.textContent = \n'\n and text = '
// Those newline separators can prevent unnecessary <nowiki/> protection
// if the string ends with a trailing quote and then one or more newlines.
var origText = node.textContent;
if (origText.match(/'$/)) {
var next = DOMUtils.nextNonDeletedSibling(node);
if (!next) {
next = node.parentNode;
}
if (DOMUtils.isQuoteElt(next)) {
return true;
}
}
return false;
}
// SSS FIXME: By doing a DOM walkahead to identify what else is on the current line,
// these heuristics can be improved. Ex: '<i>foo</i> blah blah does not require a
// <nowiki/> after the single quote since we know that there are no other quotes on
// the rest of the line that will be emitted. Similarly, '' does not need a <nowiki>
// wrapper since there are on other quote chars on the line.
//
// This is checking text-node siblings of i/b tags.
function escapedIBSiblingNodeText(state, text, opts) {
// For a sequence of 2+ quote chars, we have to
// fully wrap the sequence in <nowiki>...</nowiki>
// <nowiki/> at the start and end doesn't work.
//
// Ex: ''<i>foo</i> should serialize to <nowiki>''</nowiki>''foo''.
//
// Serializing it to ''<nowiki/>''foo'' breaks html2html semantics
// since it will parse back to <i><meta../></i>foo<i></i>
if (text.match(/''+/)) {
// Minimize the length of the string that is wrapped in <nowiki>.
var pieces = text.split("'");
var first = pieces.shift();
var last = pieces.pop();
return first + "<nowiki>'" + pieces.join("'") + "'</nowiki>" + last;
}
// Check whether the head and/or tail of the text needs <nowiki/> protection.
var out = '';
if (hasTrailingEscapableQuoteChar(text, opts)) {
state.hasQuoteNowikis = true;
out = text + "<nowiki/>";
}
if (hasLeadingEscapableQuoteChar(text, opts)) {
state.hasQuoteNowikis = true;
out = "<nowiki/>" + (out || text);
}
return out;
}
const linkEscapeRE = /(\[\[)|(\]\])|(-\{)|(^[^\[]*\]$)/;
/**
* @class
* @alias module:html2wt/WikitextEscapeHandlers
* @param {MWParserEnvironment} env
* @param {WikitextSerializer} serializer
*/
class WikitextEscapeHandlers {
constructor(options) {
this.tokenizer = new PegTokenizer(options.env);
this.options = options;
}
isFirstContentNode(node) {
// Conservative but safe
if (!node) {
return true;
}
// Skip deleted-node markers
return DOMUtils.previousNonDeletedSibling(node) === null;
}
liHandler(liNode, state, text, opts) {
if (opts.node.parentNode !== liNode) {
return false;
}
// For <dt> nodes, ":" trigger nowiki outside of elements
// For first nodes of <li>'s, bullets in sol posn trigger escaping
if (liNode.nodeName === 'DT' && /:/.test(text)) {
return true;
} else if (/^[#*:;]*$/.test(state.currLine.text) && this.isFirstContentNode(opts.node)) {
// Wikitext styling might require whitespace insertion after list bullets.
// In those scenarios, presence of bullet-wiktext in the text node is okay.
// Hence the check for /^[#*:;]*$/ above.
return text.match(/^[#*:;]/);
} else {
return false;
}
}
thHandler(thNode, state, text, opts) {
// {|
// !a<div>!!b</div>
// !c<div>||d</div>
// |}
//
// The <div> will get split across two <th> tags because
// the !! and | has higher precedence in the tokenizer.
//
// So, no matter where in the DOM subtree of the <th> node
// that text shows up in, we have to unconditionally escape
// the !! and | characters.
//
// That is, so long as it serializes to the same line as the
// heading was started.
return state.currLine.text.match(/^\s*!/) && text.match(/^[^\n]*!!|\|/);
}
mediaOptionHandler(state, text) {
return /\|/.test(text) || linkEscapeRE.test(text);
}
wikilinkHandler(state, text) {
return linkEscapeRE.test(text);
}
aHandler(state, text) {
return text.match(/\]/);
}
tdHandler(tdNode, inWideTD, state, text, opts) {
/*
* "|" anywhere in a text node of the <td> subtree can be trouble!
* It is not sufficient to just look at immediate child of <td>
* Try parsing the following table:
*
* {|
* |a''b|c''
* |}
*
* Similarly, "-" or "+" when emitted after a "|" in sol position
* is trouble, but in addition to showing up as the immediate first
* child of tdNode, they can appear on the leftmost path from
* tdNode as long as the path only has nodes don't emit any wikitext.
* Ex: <td><p>-</p></td>, but not: <td><small>-</small></td>
*/
// If 'text' is on the same wikitext line as the "|" corresponding
// to the <td>
// * | in a td should be escaped
// * +-} in SOL position (if they show up on the leftmost path with
// only zero-wt-emitting nodes on that path)
return (!opts.node || state.currLine.firstNode === tdNode) &&
(/\|/.test(text) || (
!inWideTD &&
state.currLine.text === '|' &&
text.match(/^[\-+}]/) &&
opts.node && DOMUtils.pathToAncestor(opts.node, tdNode).every(function(n) {
return this.isFirstContentNode(n) &&
(n === opts.node || WTUtils.isZeroWidthWikitextElt(n));
}, this)
));
}
// Tokenize string and pop EOFTk
tokenizeStr(str, sol) {
var tokens = this.tokenizer.tokenizeSync(str, { sol });
console.assert(tokens.pop().constructor === EOFTk, 'Expected EOF token!');
return tokens;
}
textCanParseAsLink(node, state, text) {
state.env.log("trace/wt-escape", "link-test-text=", function() { return JSON.stringify(text); });
// Strip away extraneous characters after a ]] or a ]
// They are inessential to the test of whether the ]]/]
// will get parsed into a wikilink and only complicate
// the logic (needing to ignore entities, etc.).
text = text.replace(/\][^\]]*$/, ']');
// text only contains ']' chars.
// Since we stripped everything after ']' above, if a newline is
// present, a link would have to straddle newlines which is not valid.
if (/\n/.test(text)) {
return false;
}
var str = state.currLine.text + text;
var tokens = this.tokenizeStr(str, false); // sol state is irrelevant here
var n = tokens.length;
var lastToken = tokens[n - 1];
state.env.log("trace/wt-escape", "str=", str, ";tokens=", tokens);
// If 'text' remained outside of any non-string tokens,
// it does not need nowiking.
if (lastToken === text || (typeof lastToken === 'string' &&
text === lastToken.substring(lastToken.length - text.length))) {
return false;
}
// Verify that the tokenized links are valid links
var buf = '';
for (var i = n - 1; i >= 0; i--) {
var t = tokens[i];
if (typeof t === 'string') {
buf = t + buf;
} else if (t.name === 'wikilink') {
var target = t.getAttribute('href');
if (Array.isArray(target)) {
// FIXME: in theory template expansion *could* make this a link.
return false;
}
if (state.env.isValidLinkTarget(target) &&
!state.env.conf.wiki.hasValidProtocol(target)) {
return true;
}
// Assumes 'src' will always be present which it seems to be.
// Tests will fail if anything changes in the tokenizer.
buf = t.dataAttribs.src + buf;
} else if (t.name === 'extlink') {
// Check if the extlink came from a template which in the end
// would not really parse as an extlink.
var href = t.getAttribute('href');
if (Array.isArray(href)) {
href = href[0];
}
if (!TokenUtils.isTemplateToken(href)) {
// Not a template and a real href => needs nowiking
if (typeof href === 'string' && /https?:\/\//.test(href)) {
return true;
}
} else {
while (node) {
node = DOMUtils.previousNonSepSibling(node);
if (WTUtils.isFirstEncapsulationWrapperNode(node)) {
// FIXME: This is not entirely correct.
// Assumes that extlink content doesn't have templates.
// Solution: Count # of non-nested templates encountered
// and skip over intermediate templates.
// var content = t.getAttribute('mw:content');
// var n = intermediateNonNestedTemplates(content);
break;
}
}
if (node && node.nodeName === 'A' &&
node.textContent === node.getAttribute('href')) {
// The template expands to an url link => needs nowiking
return true;
}
}
// Since this will not parse to a real extlink,
// update buf with the wikitext src for this token.
var tsr = t.dataAttribs.tsr;
buf = str.substring(tsr[0], tsr[1]) + buf;
} else {
// We have no other smarts => be conservative.
return true;
}
if (text === buf.substring(buf.length - text.length)) {
// 'text' emerged unscathed
return false;
}
}
// We couldn't prove safety of skipping nowiki-ing.
return true;
}
hasWikitextTokens(state, onNewline, options, text) {
state.env.log("trace/wt-escape", "nl:", onNewline, ":text=", function() { return JSON.stringify(text); });
// tokenize the text
var sol = onNewline && !(state.inIndentPre || state.inPPHPBlock);
// If we're inside a <pre>, we need to add an extra space after every
// newline so that the tokenizer correctly parses all tokens in a pre
// instead of just the first one. See T95794.
if (state.inIndentPre) {
text = text.replace(/\n/g, "\n ");
}
var tokens = this.tokenizeStr(text, sol);
// If the token stream has a TagTk, SelfclosingTagTk, EndTagTk or CommentTk
// then this text needs escaping!
var numEntities = 0;
for (var i = 0, n = tokens.length; i < n; i++) {
var t = tokens[i];
state.env.log("trace/wt-escape", "T:", function() { return JSON.stringify(t); });
var tc = t.constructor;
// Ignore non-whitelisted html tags
if (TokenUtils.isHTMLTag(t)) {
if (/(?:^|\s)mw:Extension(?=$|\s)/.test(t.getAttribute("typeof")) &&
options.extName !== t.getAttribute("name")) {
return true;
}
// Always escape isolated extension tags (T59469). Consider this:
// echo "<ref>foo<p></ref></p>" | node parse --html2wt
// The <ref> and </ref> tag-like text is spread across the DOM, and in
// the worst case can be anywhere. So, we conservatively escape these
// elements always (which can lead to excessive nowiki-escapes in some
// cases, but is always safe).
if ((tc === TagTk || tc === EndTagTk) &&
state.env.conf.wiki.extConfig.tags.has(t.name.toLowerCase())) {
return true;
}
// If the tag is one that's allowed in wikitext, we need to escape
// it inside <nowiki>s, because a text node nodeValue always returns
// non-escaped entities (that is, converts "<h2>" to "<h2>").
// TODO: We should also do this for <a> tags because even if they
// aren't allowed in wikitext and thus don't need to be escaped, the
// result can be confusing for editors. However, doing it here in a
// simple way interacts badly with normal link escaping, so it's
// left for later.
if (Consts.Sanitizer.TagWhiteList.has(t.name.toUpperCase())) {
return true;
} else {
continue;
}
}
if (tc === SelfclosingTagTk) {
// * Ignore RFC/ISBN/PMID tokens when those are encountered in the
// context of another link's content -- those are not parsed to
// ext-links in that context. (T109371)
if ((t.name === 'extlink' || t.name === 'wikilink') && t.dataAttribs && t.dataAttribs.stx === 'magiclink' && (state.inAttribute || state.inLink)) {
continue;
}
// Ignore url links in attributes (href, mostly)
// since they are not in danger of being autolink-ified there.
if (t.name === 'urllink' && (state.inAttribute || state.inLink)) {
continue;
}
// Ignore invalid behavior-switch tokens
if (t.name === 'behavior-switch' && !state.env.conf.wiki.isMagicWord(t.attribs[0].v)) {
continue;
}
// ignore TSR marker metas
if (t.name === 'meta' && t.getAttribute('typeof') === 'mw:TSRMarker') {
continue;
}
if (t.name === 'wikilink') {
if (state.env.isValidLinkTarget(t.getAttribute("href"))) {
return true;
} else {
continue;
}
}
return true;
}
if (state.inCaption && tc === TagTk && t.name === 'listItem') {
continue;
}
if (tc === TagTk) {
var ttype = t.getAttribute('typeof');
// Ignore mw:Entity tokens
if (t.name === 'span' && ttype === 'mw:Entity') {
numEntities++;
continue;
}
// Ignore table tokens outside of tables
if (t.name in { caption: 1, td: 1, tr: 1, th: 1 } && !TokenUtils.isHTMLTag(t) && state.wikiTableNesting === 0) {
continue;
}
// Ignore display-hack placeholders and display spaces -- they dont need nowiki escaping
// They are added as a display-hack by the tokenizer (and we should probably
// find a better solution than that if one exists).
if (ttype && ttype.match(/(?:\b|mw:DisplaySpace\s+)mw:Placeholder\b/) && t.dataAttribs.isDisplayHack) {
// Skip over the entity and the end-tag as well
i += 2;
continue;
}
// Headings have both SOL and EOL requirements. This tokenization
// here only verifies SOL requirements, not EOL requirements.
// So, record this information so that we can strip unnecessary
// nowikis after the fact.
if (t.name.match(/^h\d$/)) {
state.hasHeadingEscapes = true;
}
return true;
}
if (tc === EndTagTk) {
// Ignore mw:Entity tokens
if (numEntities > 0 && t.name === 'span') {
numEntities--;
continue;
}
// Ignore heading tokens
if (t.name.match(/^h\d$/)) {
continue;
}
// Ignore table tokens outside of tables
if (t.name in { caption: 1, table: 1 } && state.wikiTableNesting === 0) {
continue;
}
// </br>!
if (SanitizerConstants.noEndTagSet.has(t.name.toLowerCase())) {
continue;
}
return true;
}
}
return false;
}
/**
* This function attempts to wrap smallest escapable units into
* nowikis (which can potentially add multiple nowiki pairs in a
* single string). The idea here is that since this should all be
* text, anything that tokenizes to another construct needs to be
* wrapped.
*
* Full-wrapping is enabled if the string is being escaped within
* context-specific handlers where the tokenization context might
* be different from what we use in this code.
*/
escapedText(state, sol, origText, fullWrap, dontWrapIfUnnecessary) {
var match = origText.match(/^([^]*?)((?:\r?\n)*)$/);
var text = match[1];
var nls = match[2];
if (fullWrap) {
return "<nowiki>" + text + "</nowiki>" + nls;
}
var buf = '';
var inNowiki = false;
var nowikisAdded = false;
var tokensWithoutClosingTag = new Set([
// These token types don't come with a closing tag
'listItem', 'td', 'tr',
]);
// reverse escaping nowiki tags
// we do this so that they tokenize as nowikis
// instead of entity enclosed text
text = text.replace(/<(\/?nowiki\s*\/?\s*)>/gi, '<$1>');
var tokens = this.tokenizeStr(text, sol);
var nowikiWrap = function(str, close) {
if (!inNowiki) {
buf += '<nowiki>';
inNowiki = true;
nowikisAdded = true;
}
buf += str;
if (close) {
buf += '</nowiki>';
inNowiki = false;
}
};
for (var i = 0, n = tokens.length; i < n; i++) {
var t = tokens[i];
if (t.constructor === String) {
if (t.length > 0) {
t = WTUtils.escapeNowikiTags(t);
if (!inNowiki && ((sol && t.match(/^ /)) || t.match(/\n /))) {
var x = t.split(/(^|\n) /g);
buf += x[0];
for (var k = 1; k < x.length - 1; k += 2) {
buf += x[k];
if (k !== 1 || x[k] === '\n' || sol) {
nowikiWrap(' ', true);
} else {
buf += ' ';
}
buf += x[k + 1];
}
} else {
buf += t;
}
sol = false;
}
continue;
}
// Ignore display hacks, so text like "A : B" doesn't produce
// an unnecessary nowiki.
if (t.dataAttribs && t.dataAttribs.isDisplayHack) {
continue;
}
var tsr = (t.dataAttribs || {}).tsr;
if (!Array.isArray(tsr)) {
state.env.log('error/html2wt/escapeNowiki',
'Missing tsr for token ',
JSON.stringify(t),
'while processing text ',
text);
// Bail and wrap the whole thing in a nowiki
// if we have missing information.
// Use match[1] since text has been clobbered above.
return '<nowiki>' + match[1] + '</nowiki>' + nls;
}
// Now put back the escaping we removed above
var tSrc = WTUtils.escapeNowikiTags(text.substring(tsr[0], tsr[1]));
switch (t.constructor) {
case NlTk:
buf += tSrc;
sol = true;
break;
case CommentTk:
// Comments are sol-transparent
buf += tSrc;
break;
case TagTk:
// Treat tokens with missing tags as self-closing tokens
// for the purpose of minimal nowiki escaping
nowikiWrap(tSrc, tokensWithoutClosingTag.has(t.name));
sol = false;
break;
case EndTagTk:
nowikiWrap(tSrc, true);
sol = false;
break;
case SelfclosingTagTk:
if (t.name !== 'meta' || !/^mw:(TSRMarker|EmptyLine)$/.test(t.getAttribute('typeof'))) {
// Don't bother with marker or empty-line metas
nowikiWrap(tSrc, true);
}
sol = false;
break;
}
}
// close any unclosed nowikis
if (inNowiki) {
buf += '</nowiki>';
}
// Make sure nowiki is always added
// Ex: "foo]]" won't tokenize into tags at all
if (!nowikisAdded && !dontWrapIfUnnecessary) {
buf = '';
nowikiWrap(text, true);
}
buf += nls;
return buf;
}
/**
* @param {Object} state
* @param {string} text
* @param {Object} opts
*/
escapeWikiText(state, text, opts) {
state.env.log("trace/wt-escape", "EWT:", function() { return JSON.stringify(text); });
/* -----------------------------------------------------------------
* General strategy: If a substring requires escaping, we can escape
* the entire string without further analysis of the rest of the string.
* ----------------------------------------------------------------- */
var hasMagicWord = /(^|\W)(RFC|ISBN|PMID)\s/.test(text);
var hasAutolink = state.env.conf.wiki.findValidProtocol(text);
var fullCheckNeeded = !state.inLink && (hasMagicWord || hasAutolink);
var hasLanguageConverter = false;
var hasQuoteChar = false;
var indentPreUnsafe = false;
var hasNonQuoteEscapableChars = false;
var indentPreSafeMode = state.inIndentPre || state.inPHPBlock;
var sol = state.onSOL && !indentPreSafeMode;
// Fast path for special protected characters.
if (state.protect && state.protect.test(text)) {
return this.escapedText(state, sol, text);
}
if (!fullCheckNeeded) {
hasQuoteChar = /'/.test(text);
indentPreUnsafe = (!indentPreSafeMode && (/\n +[^\r\n]*?[^\s]+/).test(text) || sol && (/^ +[^\r\n]*?[^\s]+/).test(text));
hasNonQuoteEscapableChars = /[<>\[\]\-\+\|!=#\*:;~{}]|__[^_]*__/.test(text);
hasLanguageConverter = /-\{|\}-/.test(text);
if (hasLanguageConverter) { fullCheckNeeded = true; }
}
// Quick check for the common case (useful to kill a majority of requests)
//
// Pure white-space or text without wt-special chars need not be analyzed
if (!fullCheckNeeded && !hasQuoteChar && !indentPreUnsafe && !hasNonQuoteEscapableChars) {
state.env.log("trace/wt-escape", "---No-checks needed---");
return text;
}
// Context-specific escape handler
var wteHandler = JSUtils.lastItem(state.wteHandlerStack);
if (wteHandler && wteHandler(state, text, opts)) {
state.env.log("trace/wt-escape", "---Context-specific escape handler---");
return this.escapedText(state, false, text, true);
}
// Quote-escape test
if (/''+/.test(text)
|| hasLeadingEscapableQuoteChar(text, opts)
|| hasTrailingEscapableQuoteChar(text, opts)) {
// Check if we need full-wrapping <nowiki>..</nowiki>
// or selective <nowiki/> escaping for quotes.
if (fullCheckNeeded
|| indentPreUnsafe
|| (hasNonQuoteEscapableChars &&
this.hasWikitextTokens(state, sol, this.options, text))) {
state.env.log("trace/wt-escape", "---quotes: escaping text---");
// If the reason for full wrap is that the text contains non-quote
// escapable chars, it's still possible to minimize the contents
// of the <nowiki> (T71950).
return this.escapedText(state, sol, text);
} else {
var quoteEscapedText = escapedIBSiblingNodeText(state, text, opts);
if (quoteEscapedText) {
state.env.log("trace/wt-escape", "---sibling of i/b tag---");
return quoteEscapedText;
}
}
}
// Template and template-arg markers are escaped unconditionally!
// Conditional escaping requires matching brace pairs and knowledge
// of whether we are in template arg context or not.
if (text.match(/\{\{\{|\{\{|\}\}\}|\}\}/)) {
state.env.log("trace/wt-escape", "---Unconditional: transclusion chars---");
return this.escapedText(state, false, text);
}
// Once we eliminate the possibility of multi-line tokens, split the text
// around newlines and escape each line separately.
if (/\n./.test(text)) {
state.env.log("trace/wt-escape", "-- <multi-line-escaping-mode> --");
// We've already processed the full string in a context-specific handler.
// No more additional processing required. So, push/pop a null handler.
state.wteHandlerStack.push(null);
var ret = text.split(/\n/).map((line, i) => {
if (i > 0) {
// Update state
state.onSOL = true;
state.currLine.text = '';
opts.inMultilineMode = true;
}
return this.escapeWikiText(state, line, opts);
}).join('\n');
state.wteHandlerStack.pop();
// If nothing changed, check if the original multiline string has
// any wikitext tokens (ex: multi-line html tags <div\n>foo</div\n>).
if (ret === text &&
this.hasWikitextTokens(state, sol, this.options, text)) {
state.env.log("trace/wt-escape", "---Found multi-line wt tokens---");
ret = this.escapedText(state, sol, text);
}
state.env.log("trace/wt-escape", "-- </multi-line-escaping-mode> --");
return ret;
}
state.env.log("trace/wt-escape", "SOL:", sol, function() { return JSON.stringify(text); });
var hasTildes = text.match(/~{3,5}/);
if (!fullCheckNeeded && !hasTildes) {
// {{, {{{, }}}, }} are handled above.
// Test 1: '', [], <>, __FOO__ need escaping wherever they occur
// = needs escaping in end-of-line context
// Test 2: {|, |}, ||, |-, |+, , *#:;, ----, =*= need escaping only in SOL context.
if (!sol && !text.match(/''|[<>]|\[.*\]|\]|(=[ ]*(\n|$))|__[^_]*__/)) {
// It is not necessary to test for an unmatched opening bracket ([)
// as long as we always escape an unmatched closing bracket (]).
state.env.log("trace/wt-escape", "---Not-SOL and safe---");
return text;
}
// Quick checks when on a newline
// + can only occur as "|+" and - can only occur as "|-" or ----
if (sol && !text.match(/(^|\n)[ #*:;=]|[<\[\]>\|'!]|\-\-\-\-|__[^_]*__/)) {
state.env.log("trace/wt-escape", "---SOL and safe---");
return text;
}
}
// The front-end parser eliminated pre-tokens in the tokenizer
// and moved them to a stream handler. So, we always conservatively
// escape text with ' ' in sol posn with one caveat:
// * and when the current line has block tokens
if (indentPreUnsafe &&
(!hasBlocksOnLine(state.currLine.firstNode, true) || opts.inMultilineMode)
) {
state.env.log("trace/wt-escape", "---SOL and pre---");
state.hasIndentPreNowikis = true;
return this.escapedText(state, sol, text);
}
// escape nowiki tags
text = WTUtils.escapeNowikiTags(text);
// Use the tokenizer to see if we have any wikitext tokens
//
// Ignores entities
if (hasTildes) {
state.env.log("trace/wt-escape", "---Found tildes---");
return this.escapedText(state, sol, text);
} else if (this.hasWikitextTokens(state, sol, this.options, text)) {
state.env.log("trace/wt-escape", "---Found WT tokens---");
return this.escapedText(state, sol, text);
} else if (text.match(/[^\[]*\]/) && this.textCanParseAsLink(opts.node, state, text)) {
// we have an closing bracket, and
// - the text will get parsed as a link in
state.env.log("trace/wt-escape", "---Links: complex single-line test---");
return this.escapedText(state, sol, text);
} else if (opts.isLastChild && text.match(/=$/)) {
// 1. we have an open heading char, and
// - text ends in a '='
// - text comes from the last child
var headingMatch = state.currLine.firstNode.nodeName.match(/^H(\d)/);
if (headingMatch) {
var n = headingMatch[1];
if ((state.currLine.text + text)[n] === '=') {
// The first character after the heading wikitext is/will be a '='.
// So, the trailing '=' can change semantics if it is not nowikied.
state.env.log("trace/wt-escape", "---Heading: complex single-line test---");
return this.escapedText(state, sol, text);
} else {
return text;
}
} else if (state.currLine.text[0] === '=') {
state.env.log("trace/wt-escape", "---Text-as-heading: complex single-line test---");
return this.escapedText(state, sol, text);
} else {
return text;
}
} else {
state.env.log("trace/wt-escape", "---All good!---");
return text;
}
}
/**
* General strategy:
*
* Tokenize the arg wikitext. Anything that parses as tags
* are good and we need not bother with those. Check for harmful
* characters `[[]]{{}}` or additionally `=` in positional parameters and escape
* those fragments since these characters could change semantics of the entire
* template transclusion.
*
* This function makes a couple of assumptions:
*
* 1. The tokenizer sets tsr on all non-string tokens.
* 2. The tsr on TagTk and EndTagTk corresponds to the
* width of the opening and closing wikitext tags and not
* the entire DOM range they span in the end.
*/
escapeTplArgWT(arg, opts) {
var env = this.options.env;
var serializeAsNamed = opts.serializeAsNamed;
var buf = '';
var openNowiki;
var isTemplate = opts.type === 'template';
function appendStr(str, isLast, checkNowiki) {
if (!checkNowiki) {
if (openNowiki) {
buf += "</nowiki>";
openNowiki = false;
}
buf += str;
return;
}
// '=' is not allowed in positional parameters. We can either
// nowiki escape it or convert the named parameter into a
// positional param to avoid the escaping.
if (isTemplate && !serializeAsNamed && /[=]/.test(str)) {
// In certain situations, it is better to add a nowiki escape
// rather than convert this to a named param.
//
// Ex: Consider: {{funky-tpl|a|b|c|d|e|f|g|h}}
//
// If an editor changes 'a' to 'f=oo' and we convert it to
// a named param 1=f=oo, we are effectively converting all
// the later params into named params as well and we get
// {{funky-tpl|1=f=oo|2=b|3=c|...|8=h}} instead of
// {{funky-tpl|<nowiki>f=oo</nowiki>|b|c|...|h}}
//
// The latter is better in this case. This is a real problem
// in production.
//
// For now, we use a simple heuristic below and can be
// refined later, if necessary
//
// 1. Either there were no original positional args
// 2. Or, only the last positional arg uses '='
if (opts.numPositionalArgs === 0 ||
opts.numPositionalArgs === opts.argPositionalIndex) {
serializeAsNamed = true;
}
}
// Count how many reasons for nowiki
var needNowikiCount = 0;
var neededSubstitution;
// Protect against unmatched pairs of braces and brackets, as they
// should never appear in template arguments.
var bracketPairStrippedStr =
str.replace(/\[\[([^\[\]]*)\]\]|\{\{([^\{\}]*)\}\}|-\{([^\{\}]*)\}-/g, '_$1_');
if (/\{\{|\}\}|\[\[|\]\]|-\{/.test(bracketPairStrippedStr)) {
needNowikiCount++;
}
if (opts.type !== 'templatearg' && !serializeAsNamed && /[=]/.test(str)) {
needNowikiCount++;
}
if (opts.argIndex === opts.numArgs && isLast && /\}$/.test(str)) {
// If this is the last part of the last argument, we need to protect
// against an ending }, as it would get confused with the template ending }}.
needNowikiCount++;
neededSubstitution = [/(\})$/, "<nowiki>}</nowiki>"];
}
if (/\|/.test(str)) {
// If there's an unprotected |, guard it so it doesn't get confused
// with the beginning of a different paramenter.
needNowikiCount++;
neededSubstitution = [/\|/g, "{{!}}"];
}
// Now, if arent' already in a <nowiki> and there's only one reason to
// protect, avoid guarding too much text by just substituting.
if (!openNowiki && needNowikiCount === 1 && neededSubstitution) {
str = str.replace(neededSubstitution[0], neededSubstitution[1]);
needNowikiCount = false;
}
if (!openNowiki && needNowikiCount) {
buf += "<nowiki>";
openNowiki = true;
}
if (!needNowikiCount && openNowiki) {
buf += "</nowiki>";
openNowiki = false;
}
buf += str;
}
var tokens = this.tokenizeStr(arg, false);
for (var i = 0, n = tokens.length; i < n; i++) {
var t = tokens[i];
var da = t.dataAttribs;
var last = i === n - 1;
// For mw:Entity spans, the opening and closing tags have 0 width
// and the enclosed content is the decoded entity. Hence the
// special case to serialize back the entity's source.
if (t.constructor === TagTk) {
var type = t.getAttribute("typeof");
if (type && type.match(/\bmw:(?:(?:DisplaySpace\s+mw:)?Placeholder|Entity)\b/)) {
i += 2;
appendStr(arg.substring(da.tsr[0], tokens[i].dataAttribs.tsr[1]), last, false);
continue;
} else if (type === "mw:Nowiki") {
i++;
while (i < n && (tokens[i].constructor !== EndTagTk || tokens[i].getAttribute("typeof") !== "mw:Nowiki")) {
i++;
}
if (i < n) {
// After tokenization, we can get here:
// * Text explicitly protected by <nowiki> in the parameter.
// * Other things that should be protected but weren't
// according to the tokenizer.
// In template argument, we only need to check for unmatched
// braces and brackets pairs (which is done in appendStr),
// but only if they weren't explicitly protected in the
// passed wikitext.
var substr = arg.substring(da.tsr[0], tokens[i].dataAttribs.tsr[1]);
appendStr(substr, last, !/<nowiki>[^<]*<\/nowiki>/.test(substr));
}
continue;
}
}
var errors;
switch (t.constructor) {
case TagTk:
case EndTagTk:
case NlTk:
case CommentTk:
if (!da.tsr) {
errors = ["Missing tsr for: " + JSON.stringify(t)];
errors.push("Arg : " + JSON.stringify(arg));
errors.push("Toks: " + JSON.stringify(tokens));
env.log("error/html2wt/wtescape", errors.join("\n"));
}
appendStr(arg.substring(da.tsr[0], da.tsr[1]), last, false);
break;
case SelfclosingTagTk:
if (!da.tsr) {
errors = ["Missing tsr for: " + JSON.stringify(t)];
errors.push("Arg : " + JSON.stringify(arg));
errors.push("Toks: " + JSON.stringify(tokens));
env.log("error/html2wt/wtescape", errors.join("\n"));
}
var tkSrc = arg.substring(da.tsr[0], da.tsr[1]);
// Replace pipe by an entity. This is not completely safe.
if (t.name === 'extlink' || t.name === 'urllink') {
var tkBits = this.tokenizer.tokenizeSync(tkSrc, {
startRule: "tplarg_or_template_or_bust",
});
tkBits.forEach(function(bit) {
if (typeof bit === "object") {
appendStr(bit.dataAttribs.src, last, false);
} else {
// Convert to a named param w/ the same reasoning
// as above for escapeStr, however, here we replace
// with an entity to avoid breaking up querystrings
// with nowikis.
if (isTemplate && !serializeAsNamed && /[=]/.test(bit)) {
if (opts.numPositionalArgs === 0 ||
opts.numPositionalArgs === opts.argIndex) {
serializeAsNamed = true;
} else {
bit = bit.replace(/=/g, '=');
}
}
buf += bit.replace(/\|/g, '|');
}
});
} else {
appendStr(tkSrc, last, false);
}
break;
case String:
appendStr(t, last, true);
break;
case EOFTk:
break;
}
}
// If nowiki still open, close it now.
if (openNowiki) {
buf += "</nowiki>";
}
return { serializeAsNamed: serializeAsNamed, v: buf };
}
// See also `escapeLinkTarget` in LinkHandler.js
escapeLinkContent(state, str, solState, node, isMedia) {
// Entity-escape the content.
str = Util.escapeWtEntities(str);
// Wikitext-escape content.
state.onSOL = solState || false;
state.wteHandlerStack.push(isMedia
? this.mediaOptionHandler
: this.wikilinkHandler);
state.inLink = true;
var res = this.escapeWikiText(state, str, { node: node });
state.inLink = false;
state.wteHandlerStack.pop();
return res;
}
}
if (typeof module === "object") {
module.exports.WikitextEscapeHandlers = WikitextEscapeHandlers;
}