/**
* General token sanitizer. Strips out (or encapsulates) unsafe and disallowed
* tag types and attributes. Should run last in the third, synchronous
* expansion stage.
*
* A large part of this code is a straight port from the PHP version.
* @module
*/
'use strict';
require('../../../core-upgrade.js');
var semver = require('semver');
var JSUtils = require('../../utils/jsutils.js').JSUtils;
var TokenHandler = require('./TokenHandler.js');
var TokenUtils = require('../../utils/TokenUtils.js').TokenUtils;
var Util = require('../../utils/Util.js').Util;
var WikitextConstants = require('../../config/WikitextConstants.js').WikitextConstants;
const { KV, TagTk, EndTagTk, SelfclosingTagTk } = require('../../tokens/TokenTypes.js');
var UTF8Utils = {
/**
* Returns true if a given Unicode codepoint is a valid character in
* both HTML5 and XML.
*/
validateCodepoint: function(codepoint) {
// U+000C is valid in HTML5 but not allowed in XML.
// U+000D is valid in XML but not allowed in HTML5.
// U+007F - U+009F are disallowed in HTML5 (control characters).
return codepoint === 0x09
|| codepoint === 0x0a
|| (codepoint >= 0x20 && codepoint <= 0x7e)
|| (codepoint >= 0xa0 && codepoint <= 0xd7ff)
|| (codepoint >= 0xe000 && codepoint <= 0xfffd)
|| (codepoint >= 0x10000 && codepoint <= 0x10ffff);
},
/**
* Returns a JS string from the provided code point.
*/
codepointToUtf8: function(cp) {
return String.fromCodePoint(cp);
},
/**
* Returns the code point at the first position of the string.
*/
utf8ToCodepoint: function(str) {
return str.codePointAt(0);
},
};
/**
* @namespace
*/
var SanitizerConstants = {
/** Character entity aliases accepted by MediaWiki */
htmlEntityAliases: {
'רלמ': 'rlm',
'رلم': 'rlm',
},
/**
* List of all named character entities defined in HTML 4.01
* http://www.w3.org/TR/html4/sgml/entities.html
* As well as ' which is only defined starting in XHTML1.
*/
htmlEntities: {
'Aacute': 193,
'aacute': 225,
'Acirc': 194,
'acirc': 226,
'acute': 180,
'AElig': 198,
'aelig': 230,
'Agrave': 192,
'agrave': 224,
'alefsym': 8501,
'Alpha': 913,
'alpha': 945,
'amp': 38,
'and': 8743,
'ang': 8736,
'apos': 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
'Aring': 197,
'aring': 229,
'asymp': 8776,
'Atilde': 195,
'atilde': 227,
'Auml': 196,
'auml': 228,
'bdquo': 8222,
'Beta': 914,
'beta': 946,
'brvbar': 166,
'bull': 8226,
'cap': 8745,
'Ccedil': 199,
'ccedil': 231,
'cedil': 184,
'cent': 162,
'Chi': 935,
'chi': 967,
'circ': 710,
'clubs': 9827,
'cong': 8773,
'copy': 169,
'crarr': 8629,
'cup': 8746,
'curren': 164,
'dagger': 8224,
'Dagger': 8225,
'darr': 8595,
'dArr': 8659,
'deg': 176,
'Delta': 916,
'delta': 948,
'diams': 9830,
'divide': 247,
'Eacute': 201,
'eacute': 233,
'Ecirc': 202,
'ecirc': 234,
'Egrave': 200,
'egrave': 232,
'empty': 8709,
'emsp': 8195,
'ensp': 8194,
'Epsilon': 917,
'epsilon': 949,
'equiv': 8801,
'Eta': 919,
'eta': 951,
'ETH': 208,
'eth': 240,
'Euml': 203,
'euml': 235,
'euro': 8364,
'exist': 8707,
'fnof': 402,
'forall': 8704,
'frac12': 189,
'frac14': 188,
'frac34': 190,
'frasl': 8260,
'Gamma': 915,
'gamma': 947,
'ge': 8805,
'gt': 62,
'harr': 8596,
'hArr': 8660,
'hearts': 9829,
'hellip': 8230,
'Iacute': 205,
'iacute': 237,
'Icirc': 206,
'icirc': 238,
'iexcl': 161,
'Igrave': 204,
'igrave': 236,
'image': 8465,
'infin': 8734,
'int': 8747,
'Iota': 921,
'iota': 953,
'iquest': 191,
'isin': 8712,
'Iuml': 207,
'iuml': 239,
'Kappa': 922,
'kappa': 954,
'Lambda': 923,
'lambda': 955,
'lang': 9001,
'laquo': 171,
'larr': 8592,
'lArr': 8656,
'lceil': 8968,
'ldquo': 8220,
'le': 8804,
'lfloor': 8970,
'lowast': 8727,
'loz': 9674,
'lrm': 8206,
'lsaquo': 8249,
'lsquo': 8216,
'lt': 60,
'macr': 175,
'mdash': 8212,
'micro': 181,
'middot': 183,
'minus': 8722,
'Mu': 924,
'mu': 956,
'nabla': 8711,
'nbsp': 160,
'ndash': 8211,
'ne': 8800,
'ni': 8715,
'not': 172,
'notin': 8713,
'nsub': 8836,
'Ntilde': 209,
'ntilde': 241,
'Nu': 925,
'nu': 957,
'Oacute': 211,
'oacute': 243,
'Ocirc': 212,
'ocirc': 244,
'OElig': 338,
'oelig': 339,
'Ograve': 210,
'ograve': 242,
'oline': 8254,
'Omega': 937,
'omega': 969,
'Omicron': 927,
'omicron': 959,
'oplus': 8853,
'or': 8744,
'ordf': 170,
'ordm': 186,
'Oslash': 216,
'oslash': 248,
'Otilde': 213,
'otilde': 245,
'otimes': 8855,
'Ouml': 214,
'ouml': 246,
'para': 182,
'part': 8706,
'permil': 8240,
'perp': 8869,
'Phi': 934,
'phi': 966,
'Pi': 928,
'pi': 960,
'piv': 982,
'plusmn': 177,
'pound': 163,
'prime': 8242,
'Prime': 8243,
'prod': 8719,
'prop': 8733,
'Psi': 936,
'psi': 968,
'quot': 34,
'radic': 8730,
'rang': 9002,
'raquo': 187,
'rarr': 8594,
'rArr': 8658,
'rceil': 8969,
'rdquo': 8221,
'real': 8476,
'reg': 174,
'rfloor': 8971,
'Rho': 929,
'rho': 961,
'rlm': 8207,
'rsaquo': 8250,
'rsquo': 8217,
'sbquo': 8218,
'Scaron': 352,
'scaron': 353,
'sdot': 8901,
'sect': 167,
'shy': 173,
'Sigma': 931,
'sigma': 963,
'sigmaf': 962,
'sim': 8764,
'spades': 9824,
'sub': 8834,
'sube': 8838,
'sum': 8721,
'sup': 8835,
'sup1': 185,
'sup2': 178,
'sup3': 179,
'supe': 8839,
'szlig': 223,
'Tau': 932,
'tau': 964,
'there4': 8756,
'Theta': 920,
'theta': 952,
'thetasym': 977,
'thinsp': 8201,
'THORN': 222,
'thorn': 254,
'tilde': 732,
'times': 215,
'trade': 8482,
'Uacute': 218,
'uacute': 250,
'uarr': 8593,
'uArr': 8657,
'Ucirc': 219,
'ucirc': 251,
'Ugrave': 217,
'ugrave': 249,
'uml': 168,
'upsih': 978,
'Upsilon': 933,
'upsilon': 965,
'Uuml': 220,
'uuml': 252,
'weierp': 8472,
'Xi': 926,
'xi': 958,
'Yacute': 221,
'yacute': 253,
'yen': 165,
'Yuml': 376,
'yuml': 255,
'Zeta': 918,
'zeta': 950,
'zwj': 8205,
'zwnj': 8204,
},
UTF8_REPLACEMENT: "\xef\xbf\xbd",
/**
* Regular expression to match various types of character references in
* Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
*/
CHAR_REFS_RE_G: /&([A-Za-z0-9\x80-\xff]+);|&\#([0-9]+);|&\#[xX]([0-9A-Fa-f]+);|(&)/g,
/**
* Blacklist for evil uris like javascript:
* WARNING: DO NOT use this in any place that actually requires blacklisting
* for security reasons. There are NUMEROUS[1] ways to bypass blacklisting, the
* only way to be secure from javascript: uri based xss vectors is to whitelist
* things that you know are safe and deny everything else.
* [1]: http://ha.ckers.org/xss.html
*/
EVIL_URI_PATTERN: /(^|\s|\*\/\s*)(javascript|vbscript)([^\w]|$)/i,
XMLNS_ATTRIBUTE_PATTERN: /^xmlns:[:A-Z_a-z-.0-9]+$/,
IDN_RE_G: new RegExp(
"[\t ]|" + // general whitespace
"\u00ad|" + // 00ad SOFT HYPHEN
"\u1806|" + // 1806 MONGOLIAN TODO SOFT HYPHEN
"\u200b|" + // 200b ZERO WIDTH SPACE
"\u2060|" + // 2060 WORD JOINER
"\ufeff|" + // feff ZERO WIDTH NO-BREAK SPACE
"\u034f|" + // 034f COMBINING GRAPHEME JOINER
"\u180b|" + // 180b MONGOLIAN FREE VARIATION SELECTOR ONE
"\u180c|" + // 180c MONGOLIAN FREE VARIATION SELECTOR TWO
"\u180d|" + // 180d MONGOLIAN FREE VARIATION SELECTOR THREE
"\u200c|" + // 200c ZERO WIDTH NON-JOINER
"\u200d|" + // 200d ZERO WIDTH JOINER
"[\ufe00-\ufe0f]", // fe00-fe0f VARIATION SELECTOR-1-16
"g"
),
setDerivedConstants: function() {
function computeCSSDecodeRegexp() {
// Decode escape sequences and line continuation
// See the grammar in the CSS 2 spec, appendix D.
// This has to be done AFTER decoding character references.
// This means it isn't possible for this function to return
// unsanitized escape sequences. It is possible to manufacture
// input that contains character references that decode to
// escape sequences that decode to character references, but
// it's OK for the return value to contain character references
// because the caller is supposed to escape those anyway.
var space = '[\\x20\\t\\r\\n\\f]';
var nl = '(?:\\n|\\r\\n|\\r|\\f)';
var backslash = '\\\\';
return new RegExp(backslash +
"(?:" +
"(" + nl + ")|" + // 1. Line continuation
"([0-9A-Fa-f]{1,6})" + space + "?|" + // 2. character number
"(.)|" + // 3. backslash cancelling special meaning
"()$" + // 4. backslash at end of string
")");
}
function setupAttributeWhitelist() {
var common = [
// HTML
'id',
'class',
'style',
'lang',
'dir',
'title',
// WAI-ARIA
'aria-describedby',
'aria-flowto',
'aria-label',
'aria-labelledby',
'aria-owns',
'role',
// RDFa
// These attributes are specified in section 9 of
// https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
'about',
'property',
'resource',
'datatype',
'typeof',
// Microdata. These are specified by
// https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
'itemid',
'itemprop',
'itemref',
'itemscope',
'itemtype',
];
var block = common.concat(["align"]);
var tablealign = ["align", "valign"];
var tablecell = [
'abbr',
'axis',
'headers',
'scope',
'rowspan',
'colspan',
'nowrap', // deprecated
'width', // deprecated
'height', // deprecated
'bgcolor', // deprecated
];
// Numbers refer to sections in HTML 4.01 standard describing the element.
// See: http://www.w3.org/TR/html4/
return {
// 7.5.4
'div': block,
'center': common, // deprecated
'span': common,
// 7.5.5
'h1': block,
'h2': block,
'h3': block,
'h4': block,
'h5': block,
'h6': block,
// 7.5.6
// address
// 8.2.4
'bdo': common,
// 9.2.1
'em': common,
'strong': common,
'cite': common,
'dfn': common,
'code': common,
'samp': common,
'kbd': common,
'var': common,
'abbr': common,
// acronym
// 9.2.2
'blockquote': common.concat([ 'cite' ]),
'q': common.concat([ 'cite' ]),
// 9.2.3
'sub': common,
'sup': common,
// 9.3.1
'p': block,
// 9.3.2
'br': common.concat([ 'clear' ]),
// https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
'wbr': common,
// 9.3.4
'pre': common.concat([ 'width' ]),
// 9.4
'ins': common.concat([ 'cite', 'datetime' ]),
'del': common.concat([ 'cite', 'datetime' ]),
// 10.2
'ul': common.concat([ 'type' ]),
'ol': common.concat([ 'type', 'start', 'reversed' ]),
'li': common.concat([ 'type', 'value' ]),
// 10.3
'dl': common,
'dd': common,
'dt': common,
// 11.2.1
'table': common.concat([
'summary', 'width', 'border', 'frame',
'rules', 'cellspacing', 'cellpadding',
'align', 'bgcolor',
]),
// 11.2.2
'caption': block,
// 11.2.3
'thead': common,
'tfoot': common,
'tbody': common,
// 11.2.4
'colgroup': common.concat([ 'span' ]),
'col': common.concat([ 'span' ]),
// 11.2.5
'tr': common.concat([ 'bgcolor' ]).concat(tablealign),
// 11.2.6
'td': common.concat(tablecell, tablealign),
'th': common.concat(tablecell, tablealign),
// 12.2
// NOTE: <a> is not allowed directly, but the attrib
// whitelist is used from the Parser object
'a': common.concat([ 'href', 'rel', 'rev' ]), // rel/rev esp. for RDFa
// 13.2
// Not usually allowed, but may be used for extension-style hooks
// such as <math> when it is rasterized, or if wgAllowImageTag is
// true
'img': common.concat([ 'alt', 'src', 'width', 'height', 'srcset' ]),
// Attributes for A/V tags added in T163583 / T133673
'audio': common.concat([ 'controls', 'preload', 'width', 'height' ]),
'video': common.concat([ 'poster', 'controls', 'preload', 'width', 'height' ]),
'source': common.concat([ 'type', 'src' ]),
'track': common.concat([ 'type', 'src', 'srclang', 'kind', 'label' ]),
// 15.2.1
'tt': common,
'b': common,
'i': common,
'big': common,
'small': common,
'strike': common,
's': common,
'u': common,
// 15.2.2
'font': common.concat([ 'size', 'color', 'face' ]),
// basefont
// 15.3
'hr': common.concat([ 'width' ]),
// HTML Ruby annotation text module, simple ruby only.
// https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
'ruby': common,
// rbc
'rb': common,
'rp': common,
'rt': common, // common.concat([ 'rbspan' ]),
'rtc': common,
// MathML root element, where used for extensions
// 'title' may not be 100% valid here; it's XHTML
// http://www.w3.org/TR/REC-MathML/
'math': [ 'class', 'style', 'id', 'title' ],
// HTML 5 section 4.5
'figure': common,
'figure-inline': common, // T118520
'figcaption': common,
// HTML 5 section 4.6
'bdi': common,
// HTML5 elements, defined by:
// https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
'data': common.concat(['value']),
'time': common.concat(['datetime']),
'mark': common,
// meta and link are only permitted by removeHTMLtags when Microdata
// is enabled so we don't bother adding a conditional to hide these
// Also meta and link are only valid in WikiText as Microdata elements
// (ie: validateTag rejects tags missing the attributes needed for Microdata)
// So we don't bother including $common attributes that have no purpose.
'meta': ['itemprop', 'content'],
'link': ['itemprop', 'href', 'title'],
};
}
// Tags whose end tags are not accepted, but whose start /
// self-closing version might be legal.
this.noEndTagSet = new Set(['br']);
// |/?[^/])[^\\s]+$");
this.cssDecodeRE = computeCSSDecodeRegexp();
this.attributeWhitelist = setupAttributeWhitelist();
},
};
// init caches, convert lists to hashtables, etc.
SanitizerConstants.setDerivedConstants();
var ignoreFields;
if (semver.gte(process.version, '6.5.0')) {
// We're ignoring non-global RegExps in >=6.5.0 because it's the first
// version of node to contain this lastIndex writable bug,
// https://github.com/nodejs/node/blob/2cc29517966de7257a2f1b34c58c77225a21e05d/deps/v8/test/webkit/fast/regex/lastIndex-expected.txt#L45
ignoreFields = {
EVIL_URI_RE: true,
XMLNS_ATTRIBUTE_RE: true,
};
} else {
ignoreFields = {};
}
// Can't freeze the regexp state variables w/ global flag
ignoreFields.IDN_RE_G = true;
ignoreFields.CHAR_REFS_RE_G = true;
// Freeze it blocking all accidental changes
JSUtils.deepFreezeButIgnore(SanitizerConstants, ignoreFields);
/* The sanitizer is a stand-alone object ("static class") that is not tied to
* the parsing pipeline. This lets it be usable by extensions and code that
* don't have access to the parsing pipeline. The SanitizerHandler provides
* the parsing pipeline a hook into the sanitizer's abilities */
var Sanitizer = {};
Sanitizer.attributeWhitelistCache = {};
Sanitizer.attributeWhitelist = function(tag) {
var awlCache = this.attributeWhitelistCache;
if (!awlCache[tag]) {
awlCache[tag] = new Set(SanitizerConstants.attributeWhitelist[tag] || []);
}
return awlCache[tag];
};
Sanitizer._stripIDNs = function(host) {
return host.replace(SanitizerConstants.IDN_RE_G, '');
};
Sanitizer.cleanUrl = function(env, href, mode) {
if (mode !== 'wikilink') {
href = href.replace(/([\][<>"\x00-\x20\x7F\|])/g, Util.phpURLEncode);
}
var bits = href.match(/^((?:[a-zA-Z][^:\/]*:)?(?:\/\/)?)([^\/]+)(\/?.*)/);
var proto, host, path;
if (bits) {
proto = bits[1];
if (proto && !env.conf.wiki.hasValidProtocol(proto)) {
// invalid proto, disallow URL
return null;
}
host = Sanitizer._stripIDNs(bits[2]);
var match = /^%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$/.exec(host);
if (match) {
// IPv6 host names
host = '[' + match[1] + ']' + match[2];
}
path = bits[3];
} else {
proto = '';
host = '';
path = href;
}
return proto + host + path;
};
/**
* If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
* return the UTF-8 encoding of that character. Otherwise, returns
* pseudo-entity source (eg "&foo;").
*/
// gwicke: Use Util.decodeWtEntities instead?
Sanitizer.decodeEntity = function(name) {
if (SanitizerConstants.htmlEntityAliases[name]) {
name = SanitizerConstants.htmlEntityAliases[name];
}
var e = SanitizerConstants.htmlEntities[name];
return e ? UTF8Utils.codepointToUtf8(e) : "&" + name + ";";
};
/**
* Return UTF-8 string for a codepoint if that is a valid
* character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
*/
Sanitizer.decodeChar = function(codepoint) {
if (UTF8Utils.validateCodepoint(codepoint)) {
return UTF8Utils.codepointToUtf8(codepoint);
} else {
return SanitizerConstants.UTF8_REPLACEMENT;
}
};
/**
* Decode any character references, numeric or named entities,
* in the text and return a UTF-8 string.
*/
Sanitizer.decodeCharReferences = function(text) {
return text.replace(SanitizerConstants.CHAR_REFS_RE_G, function() {
if (arguments[1]) {
return Sanitizer.decodeEntity(arguments[1]);
} else if (arguments[2]) {
return Sanitizer.decodeChar(parseInt(arguments[2], 10));
} else if (arguments[3]) {
return Sanitizer.decodeChar(parseInt(arguments[3], 16));
} else {
return arguments[4];
}
});
};
var ieReplace = new Map(Object.entries({
'ʀ': 'r',
'ɴ': 'n',
'ⁿ': 'n',
'ʟ': 'l',
'ɪ': 'i',
'⁽': '(',
'₍': '(',
}));
Sanitizer.normalizeCss = function(text) {
// Decode character references like {
text = Sanitizer.decodeCharReferences(text);
// Decode escape sequences and line continuation
// See the grammar in the CSS 2 spec, appendix D.
// This has to be done AFTER decoding character references.
// This means it isn't possible for this function to return
// unsanitized escape sequences. It is possible to manufacture
// input that contains character references that decode to
// escape sequences that decode to character references, but
// it's OK for the return value to contain character references
// because the caller is supposed to escape those anyway.
text = text.replace(SanitizerConstants.cssDecodeRE, function cssDecodeCallback() {
var c;
if (arguments[1] !== undefined) {
// Line continuation
return '';
} else if (arguments[2] !== undefined) {
c = UTF8Utils.codepointToUtf8(parseInt(arguments[2], 16));
} else if (arguments[3] !== undefined) {
c = arguments[3];
} else {
c = '\\';
}
if (c === "\n" || c === '"' || c === "'" || c === '\\') {
// These characters need to be escaped in strings
// Clean up the escape sequence to avoid parsing errors by clients
return '\\' + (c.charCodeAt(0)).toString(16) + ' ';
} else {
// Decode unnecessary escape
return c;
}
});
// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
text = text.replace(
// U+FF01 to U+FF5A, excluding U+FF3C (T60088)
/[\uFF01-\uFF3B\uFF3D-\uFF5A]/g,
function(u) {
var cp = UTF8Utils.utf8ToCodepoint(u);
return String.fromCodePoint(cp - 65248); // ASCII range \x21-\x7A
});
// Convert more characters IE6 might treat as ascii
// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
text = text.replace(/\u0280|\u0274|\u207F|\u029F|\u026A|\u207D|\u208D/g, function(u) {
return ieReplace.get(u) || u;
});
// Let the value through if it's nothing but a single comment, to
// allow other functions which may reject it to pass some error
// message through.
if (!/^\s*\/\*[^*\/]*\*\/\s*$/.test(text)) {
// Remove any comments; IE gets token splitting wrong
// This must be done AFTER decoding character references and
// escape sequences, because those steps can introduce comments
// This step cannot introduce character references or escape
// sequences, because it replaces comments with spaces rather
// than removing them completely.
text = Sanitizer.delimiterReplace('/*', '*/', ' ', text);
// Remove anything after a comment-start token, to guard against
// incorrect client implementations.
const commentPos = text.indexOf('/*');
if (commentPos !== -1) {
text = text.slice(0, commentPos);
}
}
// S followed by repeat, iteration, or prolonged sound marks,
// which IE will treat as "ss"
text = text.replace(/s(?:\u3031|\u309D|\u30FC|\u30FD|\uFE7C|\uFE7D|\uFF70)/ig, 'ss');
return text;
};
// Cut-down version of StringUtils::delimiterReplace in mediawiki-core
Sanitizer.delimiterReplace = function(startDelim, endDelim, replace, subject, flags) {
// Rather than port the full functionality of StringUtils::delimiterReplace,
// just port the small subset actually used by Sanitizer. Protect
// ourselves with assertions in case someone naively tries to use
// the stuff we lazily omitted.
console.assert(flags === undefined, "Not Implemented");
console.assert(replace.indexOf('$') === -1, "Not Implemented");
const re = JSUtils.rejoin(
JSUtils.escapeRegExp(startDelim),
/[^]*?/,
JSUtils.escapeRegExp(endDelim),
{ flags: 'g' }
);
return subject.replace(re, replace);
};
var insecureRE = new RegExp(
"expression" +
"|filter\\s*:" +
"|accelerator\\s*:" +
"|-o-link\\s*:" +
"|-o-link-source\\s*:" +
"|-o-replace\\s*:" +
"|url\\s*\\(" +
"|image\\s*\\(" +
"|image-set\\s*\\(" +
"|attr\\s*\\([^)]+[\\s,]+url",
"i"
);
Sanitizer.checkCss = function(text) {
text = Sanitizer.normalizeCss(text);
// \000-\010\013\016-\037\177 are the octal escape sequences
if (/[\u0000-\u0008\u000B\u000E-\u001F\u007F]/.test(text) ||
text.indexOf(SanitizerConstants.UTF8_REPLACEMENT) > -1) {
return '/* invalid control char */';
} else if (insecureRE.test(text)) {
return '/* insecure input */';
} else {
return text;
}
};
Sanitizer.normalizeSectionIdWhiteSpace = function(id) {
return Util.phpTrim(id.replace(/[ _]+/g, ' '));
};
/**
* Given a section name or other user-generated or otherwise unsafe string, escapes it to be
* a valid HTML id attribute.
*
* WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
* be sure to use proper escaping.
*
* In Parsoid, proper escaping is usually handled for us by the HTML
* serialization algorithm, but be careful of corner cases (such as
* emitting attributes in wikitext).
*
* @param {string} id String to escape
* @param {Object} options Specify whether the primary or fallback encoding
* should be used.
* @return {string} Escaped ID
*
* @since 1.30
*/
Sanitizer.escapeIdForAttribute = function(id, options) {
// For consistency with PHP's API, we accept "primary" or "fallback" as
// the mode in 'options'. This (slightly) abstracts the actual details
// of the id encoding from the Parsoid code which handles ids; we could
// swap primary and fallback here, or even transition to a new HTML6
// encoding (!), without touching all the call sites.
var mode = (options && options.fallback) ? 'legacy' : 'html5';
return Sanitizer.escapeIdInternal(id, mode);
};
/**
* Given a section name or other user-generated or otherwise unsafe string, escapes it to be
* a valid URL fragment.
*
* WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
* be sure to use proper escaping.
*
* @param {string} id String to escape
* @return {string} Escaped ID
*
* @since 1.30
*/
Sanitizer.escapeIdForLink = function(id) {
return Sanitizer.escapeIdInternal(id, 'html5');
};
/**
* Given a section name or other user-generated or otherwise unsafe string, escapes it to be
* a valid URL fragment for external interwikis.
*
* @param {string} id String to escape
* @return {string} Escaped ID
*
* @since 1.30
*/
Sanitizer.escapeIdForExternalInterwiki = function(id) {
// Assume $wgExternalInterwikiFragmentMode = 'legacy'
return Sanitizer.escapeIdInternal(id, 'legacy');
};
/**
* Helper for escapeIdFor*() functions. Performs most of the actual escaping.
*
* @param {string} id String to escape.
* @param {string} mode 'html5' or 'legacy'
* @return {string}
*/
Sanitizer.escapeIdInternal = function(id, mode) {
switch (mode) {
case 'html5':
id = id.replace(/ /g, '_');
break;
case 'legacy':
// This corresponds to 'noninitial' mode of the old escapeId
id = id.replace(/ /g, '_');
id = Util.phpURLEncode(id);
id = id.replace(/%3A/g, ':');
id = id.replace(/%/g, '.');
break;
default:
throw new Error("Invalid mode: " + mode);
}
return id;
};
/**
* Given a string containing a space delimited list of ids, escape each id
* to match ids escaped by the escapeIdForAttribute() function.
*
* @since 1.27
*
* @param {string} referenceString Space delimited list of ids
* @return {string}
*/
Sanitizer.escapeIdReferenceList = function(referenceString) {
// Explode the space delimited list string into an array of tokens
const references =
String(referenceString).split(/\s+/g).filter(s => s.length)
// Escape each token as an id
.map(ref => Sanitizer.escapeIdForAttribute(ref));
// Merge the array back to a space delimited list string
// If the array is empty, the result will be an empty string ('')
return references.join(' ');
};
// SSS FIXME: There is a test in mediawiki.environment.js that doles out
// and tests about ids. There are probably some tests in mediawiki.Util.js
// as well. We should move all these kind of tests somewhere else.
Sanitizer.isParsoidAttr = function(k, v, attrs) {
// NOTES:
// 1. Currently the tokenizer unconditionally escapes typeof and about
// attributes from wikitxt to data-x-typeof and data-x-about. So,
// this check will only pass through Parsoid inserted attrs.
// 2. But, if we fix the over-aggressive escaping in the tokenizer to
// not escape non-Parsoid typeof and about, then this will return
// true for something like typeof='mw:Foo evilScriptHere'. But, that
// is safe since this check is only used to see if we should
// unconditionally discard the entire attribute or process it further.
// That further processing will catch and discard any dangerous
// strings in the rest of the attribute
return (/^(?:typeof|property|rel)$/).test(k) && /(?:^|\s)mw:.+?(?=$|\s)/.test(v) ||
k === "about" && /^#mwt\d+$/.test(v) ||
k === "content" && /(?:^|\s)mw:.+?(?=$|\s)/.test(KV.lookup(attrs, 'property'));
};
/**
* Given an attribute name, checks whether it is a reserved data attribute
* (such as data-mw-foo) which is unavailable to user-generated HTML so MediaWiki
* core and extension code can safely use it to communicate with frontend code.
* @param {string} attr Attribute name.
* @return {bool}
*/
Sanitizer.isReservedDataAttribute = function(attr) {
// data-ooui is reserved for ooui.
// data-mw and data-parsoid are reserved for parsoid.
// data-mw-<name here> is reserved for extensions (or core) if
// they need to communicate some data to the client and want to be
// sure that it isn't coming from an untrusted user.
// We ignore the possibility of namespaces since user-generated HTML
// can't use them anymore.
if (/^data-(mw|parsoid)/.test(attr)) {
return false; // PARSOID SPECIFIC
}
return /^data-(ooui|mw|parsoid)/i.test(attr);
};
// php's `Sanitizer::getAttribsRegex()` only permits attribute keys matching
// these classes. Transpiled by regexpu v4.1.1 on https://mothereff.in/regexpu
// which corresponds to Unicode v10.0.0
//
// From, /^[:_\p{L}\p{N}][:_\.\-\p{L}\p{N}]*$/u
var getAttribsRegex = /^(?:[0-:A-Z_a-z\xAA\xB2\xB3\xB5\xB9\xBA\xBC-\xBE\xC0-\xD6\xD8-\xF6\xF8-\u02C1\u02C6-\u02D1\u02E0-\u02E4\u02EC\u02EE\u0370-\u0374\u0376\u0377\u037A-\u037D\u037F\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03F5\u03F7-\u0481\u048A-\u052F\u0531-\u0556\u0559\u0561-\u0587\u05D0-\u05EA\u05F0-\u05F2\u0620-\u064A\u0660-\u0669\u066E\u066F\u0671-\u06D3\u06D5\u06E5\u06E6\u06EE-\u06FC\u06FF\u0710\u0712-\u072F\u074D-\u07A5\u07B1\u07C0-\u07EA\u07F4\u07F5\u07FA\u0800-\u0815\u081A\u0824\u0828\u0840-\u0858\u0860-\u086A\u08A0-\u08B4\u08B6-\u08BD\u0904-\u0939\u093D\u0950\u0958-\u0961\u0966-\u096F\u0971-\u0980\u0985-\u098C\u098F\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09BD\u09CE\u09DC\u09DD\u09DF-\u09E1\u09E6-\u09F1\u09F4-\u09F9\u09FC\u0A05-\u0A0A\u0A0F\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32\u0A33\u0A35\u0A36\u0A38\u0A39\u0A59-\u0A5C\u0A5E\u0A66-\u0A6F\u0A72-\u0A74\u0A85-\u0A8D\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2\u0AB3\u0AB5-\u0AB9\u0ABD\u0AD0\u0AE0\u0AE1\u0AE6-\u0AEF\u0AF9\u0B05-\u0B0C\u0B0F\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32\u0B33\u0B35-\u0B39\u0B3D\u0B5C\u0B5D\u0B5F-\u0B61\u0B66-\u0B6F\u0B71-\u0B77\u0B83\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99\u0B9A\u0B9C\u0B9E\u0B9F\u0BA3\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB9\u0BD0\u0BE6-\u0BF2\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C39\u0C3D\u0C58-\u0C5A\u0C60\u0C61\u0C66-\u0C6F\u0C78-\u0C7E\u0C80\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBD\u0CDE\u0CE0\u0CE1\u0CE6-\u0CEF\u0CF1\u0CF2\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D3A\u0D3D\u0D4E\u0D54-\u0D56\u0D58-\u0D61\u0D66-\u0D78\u0D7A-\u0D7F\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6\u0DE6-\u0DEF\u0E01-\u0E30\u0E32\u0E33\u0E40-\u0E46\u0E50-\u0E59\u0E81\u0E82\u0E84\u0E87\u0E88\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F\u0EA1-\u0EA3\u0EA5\u0EA7\u0EAA\u0EAB\u0EAD-\u0EB0\u0EB2\u0EB3\u0EBD\u0EC0-\u0EC4\u0EC6\u0ED0-\u0ED9\u0EDC-\u0EDF\u0F00\u0F20-\u0F33\u0F40-\u0F47\u0F49-\u0F6C\u0F88-\u0F8C\u1000-\u102A\u103F-\u1049\u1050-\u1055\u105A-\u105D\u1061\u1065\u1066\u106E-\u1070\u1075-\u1081\u108E\u1090-\u1099\u10A0-\u10C5\u10C7\u10CD\u10D0-\u10FA\u10FC-\u1248\u124A-\u124D\u1250-\u1256\u1258\u125A-\u125D\u1260-\u1288\u128A-\u128D\u1290-\u12B0\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2-\u12C5\u12C8-\u12D6\u12D8-\u1310\u1312-\u1315\u1318-\u135A\u1369-\u137C\u1380-\u138F\u13A0-\u13F5\u13F8-\u13FD\u1401-\u166C\u166F-\u167F\u1681-\u169A\u16A0-\u16EA\u16EE-\u16F8\u1700-\u170C\u170E-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176C\u176E-\u1770\u1780-\u17B3\u17D7\u17DC\u17E0-\u17E9\u17F0-\u17F9\u1810-\u1819\u1820-\u1877\u1880-\u1884\u1887-\u18A8\u18AA\u18B0-\u18F5\u1900-\u191E\u1946-\u196D\u1970-\u1974\u1980-\u19AB\u19B0-\u19C9\u19D0-\u19DA\u1A00-\u1A16\u1A20-\u1A54\u1A80-\u1A89\u1A90-\u1A99\u1AA7\u1B05-\u1B33\u1B45-\u1B4B\u1B50-\u1B59\u1B83-\u1BA0\u1BAE-\u1BE5\u1C00-\u1C23\u1C40-\u1C49\u1C4D-\u1C7D\u1C80-\u1C88\u1CE9-\u1CEC\u1CEE-\u1CF1\u1CF5\u1CF6\u1D00-\u1DBF\u1E00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2070\u2071\u2074-\u2079\u207F-\u2089\u2090-\u209C\u2102\u2107\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u212F-\u2139\u213C-\u213F\u2145-\u2149\u214E\u2150-\u2189\u2460-\u249B\u24EA-\u24FF\u2776-\u2793\u2C00-\u2C2E\u2C30-\u2C5E\u2C60-\u2CE4\u2CEB-\u2CEE\u2CF2\u2CF3\u2CFD\u2D00-\u2D25\u2D27\u2D2D\u2D30-\u2D67\u2D6F\u2D80-\u2D96\u2DA0-\u2DA6\u2DA8-\u2DAE\u2DB0-\u2DB6\u2DB8-\u2DBE\u2DC0-\u2DC6\u2DC8-\u2DCE\u2DD0-\u2DD6\u2DD8-\u2DDE\u2E2F\u3005-\u3007\u3021-\u3029\u3031-\u3035\u3038-\u303C\u3041-\u3096\u309D-\u309F\u30A1-\u30FA\u30FC-\u30FF\u3105-\u312E\u3131-\u318E\u3192-\u3195\u31A0-\u31BA\u31F0-\u31FF\u3220-\u3229\u3248-\u324F\u3251-\u325F\u3280-\u3289\u32B1-\u32BF\u3400-\u4DB5\u4E00-\u9FEA\uA000-\uA48C\uA4D0-\uA4FD\uA500-\uA60C\uA610-\uA62B\uA640-\uA66E\uA67F-\uA69D\uA6A0-\uA6EF\uA717-\uA71F\uA722-\uA788\uA78B-\uA7AE\uA7B0-\uA7B7\uA7F7-\uA801\uA803-\uA805\uA807-\uA80A\uA80C-\uA822\uA830-\uA835\uA840-\uA873\uA882-\uA8B3\uA8D0-\uA8D9\uA8F2-\uA8F7\uA8FB\uA8FD\uA900-\uA925\uA930-\uA946\uA960-\uA97C\uA984-\uA9B2\uA9CF-\uA9D9\uA9E0-\uA9E4\uA9E6-\uA9FE\uAA00-\uAA28\uAA40-\uAA42\uAA44-\uAA4B\uAA50-\uAA59\uAA60-\uAA76\uAA7A\uAA7E-\uAAAF\uAAB1\uAAB5\uAAB6\uAAB9-\uAABD\uAAC0\uAAC2\uAADB-\uAADD\uAAE0-\uAAEA\uAAF2-\uAAF4\uAB01-\uAB06\uAB09-\uAB0E\uAB11-\uAB16\uAB20-\uAB26\uAB28-\uAB2E\uAB30-\uAB5A\uAB5C-\uAB65\uAB70-\uABE2\uABF0-\uABF9\uAC00-\uD7A3\uD7B0-\uD7C6\uD7CB-\uD7FB\uF900-\uFA6D\uFA70-\uFAD9\uFB00-\uFB06\uFB13-\uFB17\uFB1D\uFB1F-\uFB28\uFB2A-\uFB36\uFB38-\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46-\uFBB1\uFBD3-\uFD3D\uFD50-\uFD8F\uFD92-\uFDC7\uFDF0-\uFDFB\uFE70-\uFE74\uFE76-\uFEFC\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC]|\uD800[\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD07-\uDD33\uDD40-\uDD78\uDD8A\uDD8B\uDE80-\uDE9C\uDEA0-\uDED0\uDEE1-\uDEFB\uDF00-\uDF23\uDF2D-\uDF4A\uDF50-\uDF75\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]|\uD801[\uDC00-\uDC9D\uDCA0-\uDCA9\uDCB0-\uDCD3\uDCD8-\uDCFB\uDD00-\uDD27\uDD30-\uDD63\uDE00-\uDF36\uDF40-\uDF55\uDF60-\uDF67]|\uD802[\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDC58-\uDC76\uDC79-\uDC9E\uDCA7-\uDCAF\uDCE0-\uDCF2\uDCF4\uDCF5\uDCFB-\uDD1B\uDD20-\uDD39\uDD80-\uDDB7\uDDBC-\uDDCF\uDDD2-\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE40-\uDE47\uDE60-\uDE7E\uDE80-\uDE9F\uDEC0-\uDEC7\uDEC9-\uDEE4\uDEEB-\uDEEF\uDF00-\uDF35\uDF40-\uDF55\uDF58-\uDF72\uDF78-\uDF91\uDFA9-\uDFAF]|\uD803[\uDC00-\uDC48\uDC80-\uDCB2\uDCC0-\uDCF2\uDCFA-\uDCFF\uDE60-\uDE7E]|\uD804[\uDC03-\uDC37\uDC52-\uDC6F\uDC83-\uDCAF\uDCD0-\uDCE8\uDCF0-\uDCF9\uDD03-\uDD26\uDD36-\uDD3F\uDD50-\uDD72\uDD76\uDD83-\uDDB2\uDDC1-\uDDC4\uDDD0-\uDDDA\uDDDC\uDDE1-\uDDF4\uDE00-\uDE11\uDE13-\uDE2B\uDE80-\uDE86\uDE88\uDE8A-\uDE8D\uDE8F-\uDE9D\uDE9F-\uDEA8\uDEB0-\uDEDE\uDEF0-\uDEF9\uDF05-\uDF0C\uDF0F\uDF10\uDF13-\uDF28\uDF2A-\uDF30\uDF32\uDF33\uDF35-\uDF39\uDF3D\uDF50\uDF5D-\uDF61]|\uD805[\uDC00-\uDC34\uDC47-\uDC4A\uDC50-\uDC59\uDC80-\uDCAF\uDCC4\uDCC5\uDCC7\uDCD0-\uDCD9\uDD80-\uDDAE\uDDD8-\uDDDB\uDE00-\uDE2F\uDE44\uDE50-\uDE59\uDE80-\uDEAA\uDEC0-\uDEC9\uDF00-\uDF19\uDF30-\uDF3B]|\uD806[\uDCA0-\uDCF2\uDCFF\uDE00\uDE0B-\uDE32\uDE3A\uDE50\uDE5C-\uDE83\uDE86-\uDE89\uDEC0-\uDEF8]|\uD807[\uDC00-\uDC08\uDC0A-\uDC2E\uDC40\uDC50-\uDC6C\uDC72-\uDC8F\uDD00-\uDD06\uDD08\uDD09\uDD0B-\uDD30\uDD46\uDD50-\uDD59]|\uD808[\uDC00-\uDF99]|\uD809[\uDC00-\uDC6E\uDC80-\uDD43]|[\uD80C\uD81C-\uD820\uD840-\uD868\uD86A-\uD86C\uD86F-\uD872\uD874-\uD879][\uDC00-\uDFFF]|\uD80D[\uDC00-\uDC2E]|\uD811[\uDC00-\uDE46]|\uD81A[\uDC00-\uDE38\uDE40-\uDE5E\uDE60-\uDE69\uDED0-\uDEED\uDF00-\uDF2F\uDF40-\uDF43\uDF50-\uDF59\uDF5B-\uDF61\uDF63-\uDF77\uDF7D-\uDF8F]|\uD81B[\uDF00-\uDF44\uDF50\uDF93-\uDF9F\uDFE0\uDFE1]|\uD821[\uDC00-\uDFEC]|\uD822[\uDC00-\uDEF2]|\uD82C[\uDC00-\uDD1E\uDD70-\uDEFB]|\uD82F[\uDC00-\uDC6A\uDC70-\uDC7C\uDC80-\uDC88\uDC90-\uDC99]|\uD834[\uDF60-\uDF71]|\uD835[\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB\uDFCE-\uDFFF]|\uD83A[\uDC00-\uDCC4\uDCC7-\uDCCF\uDD00-\uDD43\uDD50-\uDD59]|\uD83B[\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]|\uD83C[\uDD00-\uDD0C]|\uD869[\uDC00-\uDED6\uDF00-\uDFFF]|\uD86D[\uDC00-\uDF34\uDF40-\uDFFF]|\uD86E[\uDC00-\uDC1D\uDC20-\uDFFF]|\uD873[\uDC00-\uDEA1\uDEB0-\uDFFF]|\uD87A[\uDC00-\uDFE0]|\uD87E[\uDC00-\uDE1D])(?:[\-\.0-:A-Z_a-z\xAA\xB2\xB3\xB5\xB9\xBA\xBC-\xBE\xC0-\xD6\xD8-\xF6\xF8-\u02C1\u02C6-\u02D1\u02E0-\u02E4\u02EC\u02EE\u0370-\u0374\u0376\u0377\u037A-\u037D\u037F\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03F5\u03F7-\u0481\u048A-\u052F\u0531-\u0556\u0559\u0561-\u0587\u05D0-\u05EA\u05F0-\u05F2\u0620-\u064A\u0660-\u0669\u066E\u066F\u0671-\u06D3\u06D5\u06E5\u06E6\u06EE-\u06FC\u06FF\u0710\u0712-\u072F\u074D-\u07A5\u07B1\u07C0-\u07EA\u07F4\u07F5\u07FA\u0800-\u0815\u081A\u0824\u0828\u0840-\u0858\u0860-\u086A\u08A0-\u08B4\u08B6-\u08BD\u0904-\u0939\u093D\u0950\u0958-\u0961\u0966-\u096F\u0971-\u0980\u0985-\u098C\u098F\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09BD\u09CE\u09DC\u09DD\u09DF-\u09E1\u09E6-\u09F1\u09F4-\u09F9\u09FC\u0A05-\u0A0A\u0A0F\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32\u0A33\u0A35\u0A36\u0A38\u0A39\u0A59-\u0A5C\u0A5E\u0A66-\u0A6F\u0A72-\u0A74\u0A85-\u0A8D\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2\u0AB3\u0AB5-\u0AB9\u0ABD\u0AD0\u0AE0\u0AE1\u0AE6-\u0AEF\u0AF9\u0B05-\u0B0C\u0B0F\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32\u0B33\u0B35-\u0B39\u0B3D\u0B5C\u0B5D\u0B5F-\u0B61\u0B66-\u0B6F\u0B71-\u0B77\u0B83\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99\u0B9A\u0B9C\u0B9E\u0B9F\u0BA3\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB9\u0BD0\u0BE6-\u0BF2\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C39\u0C3D\u0C58-\u0C5A\u0C60\u0C61\u0C66-\u0C6F\u0C78-\u0C7E\u0C80\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBD\u0CDE\u0CE0\u0CE1\u0CE6-\u0CEF\u0CF1\u0CF2\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D3A\u0D3D\u0D4E\u0D54-\u0D56\u0D58-\u0D61\u0D66-\u0D78\u0D7A-\u0D7F\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6\u0DE6-\u0DEF\u0E01-\u0E30\u0E32\u0E33\u0E40-\u0E46\u0E50-\u0E59\u0E81\u0E82\u0E84\u0E87\u0E88\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F\u0EA1-\u0EA3\u0EA5\u0EA7\u0EAA\u0EAB\u0EAD-\u0EB0\u0EB2\u0EB3\u0EBD\u0EC0-\u0EC4\u0EC6\u0ED0-\u0ED9\u0EDC-\u0EDF\u0F00\u0F20-\u0F33\u0F40-\u0F47\u0F49-\u0F6C\u0F88-\u0F8C\u1000-\u102A\u103F-\u1049\u1050-\u1055\u105A-\u105D\u1061\u1065\u1066\u106E-\u1070\u1075-\u1081\u108E\u1090-\u1099\u10A0-\u10C5\u10C7\u10CD\u10D0-\u10FA\u10FC-\u1248\u124A-\u124D\u1250-\u1256\u1258\u125A-\u125D\u1260-\u1288\u128A-\u128D\u1290-\u12B0\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2-\u12C5\u12C8-\u12D6\u12D8-\u1310\u1312-\u1315\u1318-\u135A\u1369-\u137C\u1380-\u138F\u13A0-\u13F5\u13F8-\u13FD\u1401-\u166C\u166F-\u167F\u1681-\u169A\u16A0-\u16EA\u16EE-\u16F8\u1700-\u170C\u170E-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176C\u176E-\u1770\u1780-\u17B3\u17D7\u17DC\u17E0-\u17E9\u17F0-\u17F9\u1810-\u1819\u1820-\u1877\u1880-\u1884\u1887-\u18A8\u18AA\u18B0-\u18F5\u1900-\u191E\u1946-\u196D\u1970-\u1974\u1980-\u19AB\u19B0-\u19C9\u19D0-\u19DA\u1A00-\u1A16\u1A20-\u1A54\u1A80-\u1A89\u1A90-\u1A99\u1AA7\u1B05-\u1B33\u1B45-\u1B4B\u1B50-\u1B59\u1B83-\u1BA0\u1BAE-\u1BE5\u1C00-\u1C23\u1C40-\u1C49\u1C4D-\u1C7D\u1C80-\u1C88\u1CE9-\u1CEC\u1CEE-\u1CF1\u1CF5\u1CF6\u1D00-\u1DBF\u1E00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2070\u2071\u2074-\u2079\u207F-\u2089\u2090-\u209C\u2102\u2107\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u212F-\u2139\u213C-\u213F\u2145-\u2149\u214E\u2150-\u2189\u2460-\u249B\u24EA-\u24FF\u2776-\u2793\u2C00-\u2C2E\u2C30-\u2C5E\u2C60-\u2CE4\u2CEB-\u2CEE\u2CF2\u2CF3\u2CFD\u2D00-\u2D25\u2D27\u2D2D\u2D30-\u2D67\u2D6F\u2D80-\u2D96\u2DA0-\u2DA6\u2DA8-\u2DAE\u2DB0-\u2DB6\u2DB8-\u2DBE\u2DC0-\u2DC6\u2DC8-\u2DCE\u2DD0-\u2DD6\u2DD8-\u2DDE\u2E2F\u3005-\u3007\u3021-\u3029\u3031-\u3035\u3038-\u303C\u3041-\u3096\u309D-\u309F\u30A1-\u30FA\u30FC-\u30FF\u3105-\u312E\u3131-\u318E\u3192-\u3195\u31A0-\u31BA\u31F0-\u31FF\u3220-\u3229\u3248-\u324F\u3251-\u325F\u3280-\u3289\u32B1-\u32BF\u3400-\u4DB5\u4E00-\u9FEA\uA000-\uA48C\uA4D0-\uA4FD\uA500-\uA60C\uA610-\uA62B\uA640-\uA66E\uA67F-\uA69D\uA6A0-\uA6EF\uA717-\uA71F\uA722-\uA788\uA78B-\uA7AE\uA7B0-\uA7B7\uA7F7-\uA801\uA803-\uA805\uA807-\uA80A\uA80C-\uA822\uA830-\uA835\uA840-\uA873\uA882-\uA8B3\uA8D0-\uA8D9\uA8F2-\uA8F7\uA8FB\uA8FD\uA900-\uA925\uA930-\uA946\uA960-\uA97C\uA984-\uA9B2\uA9CF-\uA9D9\uA9E0-\uA9E4\uA9E6-\uA9FE\uAA00-\uAA28\uAA40-\uAA42\uAA44-\uAA4B\uAA50-\uAA59\uAA60-\uAA76\uAA7A\uAA7E-\uAAAF\uAAB1\uAAB5\uAAB6\uAAB9-\uAABD\uAAC0\uAAC2\uAADB-\uAADD\uAAE0-\uAAEA\uAAF2-\uAAF4\uAB01-\uAB06\uAB09-\uAB0E\uAB11-\uAB16\uAB20-\uAB26\uAB28-\uAB2E\uAB30-\uAB5A\uAB5C-\uAB65\uAB70-\uABE2\uABF0-\uABF9\uAC00-\uD7A3\uD7B0-\uD7C6\uD7CB-\uD7FB\uF900-\uFA6D\uFA70-\uFAD9\uFB00-\uFB06\uFB13-\uFB17\uFB1D\uFB1F-\uFB28\uFB2A-\uFB36\uFB38-\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46-\uFBB1\uFBD3-\uFD3D\uFD50-\uFD8F\uFD92-\uFDC7\uFDF0-\uFDFB\uFE70-\uFE74\uFE76-\uFEFC\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC]|\uD800[\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD07-\uDD33\uDD40-\uDD78\uDD8A\uDD8B\uDE80-\uDE9C\uDEA0-\uDED0\uDEE1-\uDEFB\uDF00-\uDF23\uDF2D-\uDF4A\uDF50-\uDF75\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]|\uD801[\uDC00-\uDC9D\uDCA0-\uDCA9\uDCB0-\uDCD3\uDCD8-\uDCFB\uDD00-\uDD27\uDD30-\uDD63\uDE00-\uDF36\uDF40-\uDF55\uDF60-\uDF67]|\uD802[\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDC58-\uDC76\uDC79-\uDC9E\uDCA7-\uDCAF\uDCE0-\uDCF2\uDCF4\uDCF5\uDCFB-\uDD1B\uDD20-\uDD39\uDD80-\uDDB7\uDDBC-\uDDCF\uDDD2-\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE40-\uDE47\uDE60-\uDE7E\uDE80-\uDE9F\uDEC0-\uDEC7\uDEC9-\uDEE4\uDEEB-\uDEEF\uDF00-\uDF35\uDF40-\uDF55\uDF58-\uDF72\uDF78-\uDF91\uDFA9-\uDFAF]|\uD803[\uDC00-\uDC48\uDC80-\uDCB2\uDCC0-\uDCF2\uDCFA-\uDCFF\uDE60-\uDE7E]|\uD804[\uDC03-\uDC37\uDC52-\uDC6F\uDC83-\uDCAF\uDCD0-\uDCE8\uDCF0-\uDCF9\uDD03-\uDD26\uDD36-\uDD3F\uDD50-\uDD72\uDD76\uDD83-\uDDB2\uDDC1-\uDDC4\uDDD0-\uDDDA\uDDDC\uDDE1-\uDDF4\uDE00-\uDE11\uDE13-\uDE2B\uDE80-\uDE86\uDE88\uDE8A-\uDE8D\uDE8F-\uDE9D\uDE9F-\uDEA8\uDEB0-\uDEDE\uDEF0-\uDEF9\uDF05-\uDF0C\uDF0F\uDF10\uDF13-\uDF28\uDF2A-\uDF30\uDF32\uDF33\uDF35-\uDF39\uDF3D\uDF50\uDF5D-\uDF61]|\uD805[\uDC00-\uDC34\uDC47-\uDC4A\uDC50-\uDC59\uDC80-\uDCAF\uDCC4\uDCC5\uDCC7\uDCD0-\uDCD9\uDD80-\uDDAE\uDDD8-\uDDDB\uDE00-\uDE2F\uDE44\uDE50-\uDE59\uDE80-\uDEAA\uDEC0-\uDEC9\uDF00-\uDF19\uDF30-\uDF3B]|\uD806[\uDCA0-\uDCF2\uDCFF\uDE00\uDE0B-\uDE32\uDE3A\uDE50\uDE5C-\uDE83\uDE86-\uDE89\uDEC0-\uDEF8]|\uD807[\uDC00-\uDC08\uDC0A-\uDC2E\uDC40\uDC50-\uDC6C\uDC72-\uDC8F\uDD00-\uDD06\uDD08\uDD09\uDD0B-\uDD30\uDD46\uDD50-\uDD59]|\uD808[\uDC00-\uDF99]|\uD809[\uDC00-\uDC6E\uDC80-\uDD43]|[\uD80C\uD81C-\uD820\uD840-\uD868\uD86A-\uD86C\uD86F-\uD872\uD874-\uD879][\uDC00-\uDFFF]|\uD80D[\uDC00-\uDC2E]|\uD811[\uDC00-\uDE46]|\uD81A[\uDC00-\uDE38\uDE40-\uDE5E\uDE60-\uDE69\uDED0-\uDEED\uDF00-\uDF2F\uDF40-\uDF43\uDF50-\uDF59\uDF5B-\uDF61\uDF63-\uDF77\uDF7D-\uDF8F]|\uD81B[\uDF00-\uDF44\uDF50\uDF93-\uDF9F\uDFE0\uDFE1]|\uD821[\uDC00-\uDFEC]|\uD822[\uDC00-\uDEF2]|\uD82C[\uDC00-\uDD1E\uDD70-\uDEFB]|\uD82F[\uDC00-\uDC6A\uDC70-\uDC7C\uDC80-\uDC88\uDC90-\uDC99]|\uD834[\uDF60-\uDF71]|\uD835[\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB\uDFCE-\uDFFF]|\uD83A[\uDC00-\uDCC4\uDCC7-\uDCCF\uDD00-\uDD43\uDD50-\uDD59]|\uD83B[\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]|\uD83C[\uDD00-\uDD0C]|\uD869[\uDC00-\uDED6\uDF00-\uDFFF]|\uD86D[\uDC00-\uDF34\uDF40-\uDFFF]|\uD86E[\uDC00-\uDC1D\uDC20-\uDFFF]|\uD873[\uDC00-\uDEA1\uDEB0-\uDFFF]|\uD87A[\uDC00-\uDFE0]|\uD87E[\uDC00-\uDE1D])*$/;
Sanitizer.sanitizeTagAttrs = function(env, tagName, token, attrs) {
var tag = tagName || token.name;
var xmlnsRE = SanitizerConstants.XMLNS_ATTRIBUTE_PATTERN;
var evilUriRE = SanitizerConstants.EVIL_URI_PATTERN;
var wlist = Sanitizer.attributeWhitelist(tag);
var newAttrs = {};
var n = attrs.length;
for (var i = 0; i < n; i++) {
var a = attrs[i];
if (!a.v) { a.v = ""; }
// Convert attributes to string, if necessary.
a.k = TokenUtils.tokensToString(a.k);
a.v = TokenUtils.tokensToString(a.v, false, {
unpackDOMFragments: true,
env, // FIXME: Sneaking in `env` to avoid changing the signature
});
var origK = a.ksrc || a.k;
var k = a.k.toLowerCase();
var v = a.v;
var origV = a.vsrc || v;
var psdAttr = this.isParsoidAttr(k, v, attrs);
// Bypass RDFa/whitelisting checks for Parsoid-inserted attrs
// Safe to do since the tokenizer renames about/typeof attrs.
// unconditionally. FIXME: The escaping solution in the tokenizer
// may be aggressive. There is no need to escape typeof strings
// that or about ids that don't resemble Parsoid tokens/about ids.
if (!psdAttr) {
if (!getAttribsRegex.test(k)) {
newAttrs[k] = [null, origV, origK];
continue;
}
// Allow XML namespace declaration to allow RDFa
if (xmlnsRE.test(k)) {
if (!evilUriRE.test(v)) {
newAttrs[k] = [v, origV, origK];
} else {
newAttrs[k] = [null, origV, origK];
}
continue;
}
// Allow any attribute beginning with "data-"
// However:
// * Disallow data attributes used by MediaWiki code
// * Ensure that the attribute is not namespaced by banning
// colons.
if (
(!/^data-[^:]*$/i.test(k) && !wlist.has(k)) ||
Sanitizer.isReservedDataAttribute(k)
) {
newAttrs[k] = [null, origV, origK];
continue;
}
}
// Strip javascript "expression" from stylesheets.
// http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
if (k === 'style') {
v = Sanitizer.checkCss(v);
}
// Escape HTML id attributes
if (k === 'id') {
v = Sanitizer.escapeIdForAttribute(v, { fallback:false });
}
// Escape HTML id reference lists
if (
k === 'aria-describedby'
|| k === 'aria-flowto'
|| k === 'aria-labelledby'
|| k === 'aria-owns'
) {
v = Sanitizer.escapeIdReferenceList(v);
}
// RDFa and microdata properties allow URLs, URIs and/or CURIs.
// Check them for sanity
if (
k === 'rel' || k === 'rev'
// RDFa
|| k === 'about' || k === 'property'
|| k === 'resource' || k === 'datatype'
|| k === 'typeof'
// HTML5 microdata
|| k === 'itemid' || k === 'itemprop'
|| k === 'itemref' || k === 'itemscope'
|| k === 'itemtype'
) {
// Paranoia. Allow "simple" values but suppress javascript
if (evilUriRE.test(v)) {
// Retain the Parsoid typeofs for Parsoid attrs
var newV = psdAttr ? origV.replace(/(?:^|\s)(?!mw:\w)[^\s]*/g, '').trim() : null;
newAttrs[k] = [newV, origV, origK];
continue;
}
}
// NOTE: Even though elements using href/src are not allowed directly,
// supply validation code that can be used by tag hook handlers, etc
if (token && (k === 'href' || k === 'src' || k === 'poster')) { // T163583
// `origV` will always be `v`, because `a.vsrc` isn't set, since
// this attribute didn't come from source. However, in the
// LinkHandler, we may have already shadowed this value so use
// that instead.
var rel = token.getAttributeShadowInfo('rel');
var mode = (k === 'href' && rel && (/^mw:WikiLink(\/Interwiki)?$/.test(rel.value))) ?
'wikilink' : 'external';
var origHref = token.getAttributeShadowInfo(k).value;
var newHref = Sanitizer.cleanUrl(env, v, mode);
if (newHref !== v) {
newAttrs[k] = [newHref, origHref, origK];
continue;
}
}
// SSS FIXME: This logic is not RT-friendly.
// If this attribute was previously set, override it.
// Output should only have one attribute of each name.
newAttrs[k] = [v, origV, origK];
}
// itemtype, itemid, itemref don't make sense without itemscope
if (newAttrs.itemscope === undefined) {
// SSS FIXME: This logic is not RT-friendly.
delete newAttrs.itemtype;
delete newAttrs.itemid;
delete newAttrs.itemref;
}
// TODO: Strip itemprop if we aren't descendants of an itemscope.
return newAttrs;
};
/**
* Sanitize and apply attributes to a wrapper element.
*
* Used primarily when we're applying tokenized attributes directly to
* dom elements, which wouldn't have had a chance to be sanitized before
* tree building.
*/
Sanitizer.applySanitizedArgs = function(env, wrapper, attrs) {
var sanitizedAttrs = Sanitizer.sanitizeTagAttrs(env, wrapper.nodeName.toLowerCase(), null, attrs);
Object.keys(sanitizedAttrs).forEach(function(k) {
if (sanitizedAttrs[k][0] !== null) {
wrapper.setAttribute(k, sanitizedAttrs[k][0]);
}
});
};
/**
* Sanitize a token.
*
* XXX: Make attribute sanitation reversible by storing round-trip info in
* token.dataAttribs object (which is serialized as JSON in a data-parsoid
* attribute in the DOM).
*/
Sanitizer.sanitizeToken = function(env, frame, token, inTemplate) {
var i, l, kv;
var attribs = token.attribs;
var noEndTagSet = SanitizerConstants.noEndTagSet;
var tagWhiteList = WikitextConstants.Sanitizer.TagWhiteList;
if (TokenUtils.isHTMLTag(token) && (
!tagWhiteList.has(token.name.toUpperCase()) ||
(token.constructor === EndTagTk && noEndTagSet.has(token.name))
)) { // unknown tag -- convert to plain text
if (!inTemplate && token.dataAttribs.tsr) {
// Just get the original token source, so that we can avoid
// whitespace differences.
token = token.getWTSource(frame);
} else if (token.constructor !== EndTagTk) {
// Handle things without a TSR: For example template or extension
// content. Whitespace in these is not necessarily preserved.
var buf = "<" + token.name;
for (i = 0, l = attribs.length; i < l; i++) {
kv = attribs[i];
buf += " " + kv.k + "='" + kv.v + "'";
}
if (token.constructor === SelfclosingTagTk) {
buf += " /";
}
buf += ">";
token = buf;
} else {
token = "</" + token.name + ">";
}
} else {
if (attribs && attribs.length > 0) {
// Sanitize attributes
if (token.constructor === TagTk || token.constructor === SelfclosingTagTk) {
var newAttrs = this.sanitizeTagAttrs(env, null, token, attribs);
// Reset token attribs and rebuild
token.attribs = [];
// SSS FIXME: We are right now adding shadow information for all sanitized
// attributes. This is being done to minimize dirty diffs for the first
// cut. It can be reasonably argued that we can permanently delete dangerous
// and unacceptable attributes in the interest of safety/security and the
// resultant dirty diffs should be acceptable. But, this is something to do
// in the future once we have passed the initial tests of parsoid acceptance.
Object.keys(newAttrs).forEach(function(j) {
var vs = newAttrs[j];
// explicit check against null to prevent discarding empty strings
if (vs[0] !== null) {
token.addNormalizedAttribute(j, vs[0], vs[1]);
} else {
token.setShadowInfo(vs[2], vs[0], vs[1]);
}
});
} else {
// EndTagTk, drop attributes
token.attribs = [];
}
}
}
return token;
};
/**
* Sanitize a title to be used in a URI?
*/
Sanitizer.sanitizeTitleURI = function(title, isInterwiki) {
var bits = title.split('#');
var anchor = null;
if (bits.length > 1) { // split at first '#'
anchor = title.substring(bits[0].length + 1);
title = bits[0];
}
title = title.replace(/[%? \[\]#|<>]/g, function(m) {
return encodeURIComponent(m);
});
if (anchor !== null) {
title += '#' + (isInterwiki
? Sanitizer.escapeIdForExternalInterwiki(anchor)
: Sanitizer.escapeIdForLink(anchor));
}
return title;
};
/**
* @class
* @param {TokenTransformManager} manager The manager for this part of the pipeline.
* @param {Object} options
*/
class SanitizerHandler extends TokenHandler {
constructor(manager, options) {
super(manager, options);
this.register(manager);
this.inTemplate = options.inTemplate;
}
}
// Register this transformer with the TokenTransformer
SanitizerHandler.prototype.register = function(manager) {
this.manager = manager;
};
SanitizerHandler.prototype.onAny = function(token) {
var env = this.manager.env;
env.log("trace/sanitizer", this.manager.pipelineId, function() {
return JSON.stringify(token);
});
// Pass through a transparent line meta-token
if (TokenUtils.isEmptyLineMetaToken(token)) {
env.log("trace/sanitizer", this.manager.pipelineId, "--unchanged--");
return { tokens: [ token ] };
}
token = Sanitizer.sanitizeToken(env, this.manager.frame, token, this.inTemplate);
env.log("trace/sanitizer", this.manager.pipelineId, function() {
return " ---> " + JSON.stringify(token);
});
return { tokens: [ token ] };
};
if (typeof module === "object") {
module.exports.Sanitizer = Sanitizer;
module.exports.SanitizerHandler = SanitizerHandler;
module.exports.SanitizerConstants = SanitizerConstants;
}