Source: html2wt/ConstrainedText.js: JSDoc

/**
 * Chunk-based serialization support.
 *
 * Keeping wikitext output in `ConstrainedText` chunks allows us to
 * preserve meta-information about boundary conditions at the edges
 * of chunks.  This allows us to more easily add `<nowiki>` and other
 * fixups where needed to prevent misparsing caused by juxtaposition.
 *
 * For example, the chunk corresponding to a magic link can "remember"
 * that it needs to have word boundaries on either side.  If these aren't
 * present (after the chunks on either side have been serialized) then
 * we can add <nowiki> escapes at the proper places.
 * @module
 */

'use strict';

const { DOMDataUtils } = require('../utils/DOMDataUtils.js');
const { DOMUtils } = require('../utils/DOMUtils.js');
const { JSUtils } = require('../utils/jsutils.js');
const { Util } = require('../utils/Util.js');

/**
 * A chunk of wikitext output.  This base class contains the
 * wikitext and a pointer to the DOM node which is responsible for
 * generating it.  Subclasses can add additional properties to record
 * context or wikitext boundary restrictions for proper escaping.
 * The chunk is serialized with the `escape` method, which might
 * alter the wikitext in order to ensure it doesn't run together
 * with its context (usually by adding `<nowiki>` tags).
 *
 * The main entry point is the static function `ConstrainedText.escapeLine()`.
 */
class ConstrainedText {
	/**
	 * This adds necessary escapes to a line of chunks.  We provide
	 * the `ConstrainedText#escape` function with its left and right
	 * context, and it can determine what escapes are needed.
	 *
	 * The `line` parameter is an array of `ConstrainedText` *chunks*
	 * which make up a line (or part of a line, in some cases of nested
	 * processing).
	 *
	 * Each chunk's `escape` method receives a `state` object with these
	 * properties:
	 *  - `leftContext`: the already-escaped output to the left of the chunk
	 *    (reset to the empty string after a chunk reports `greedy`),
	 *  - `rightContext`: the concatenated, *unescaped* `text` of every
	 *    chunk to the right,
	 *  - `line`: the array of chunks being processed,
	 *  - `pos`: the index of the current chunk in `line`.
	 * @param {ConstrainedText[]} line
	 * @return {string} The concatenated, escaped output for the whole line.
	 * @static
	 */
	static escapeLine(line) {
		// The left context will be precise (that is, it is the result
		// of `ConstrainedText#escape` and will include any escapes
		// triggered by chunks on the left), but the right context
		// is just the (unescaped) text property from the chunk.
		// As we work left to right we will piece together a fully-escaped
		// string.  Be careful not to shoot yourself in the foot -- if the
		// escaped text is significantly different from the chunk's `text`
		// property, the preceding chunk may not have made the correct
		// decisions about emitting an escape suffix.  We could solve
		// this by looping until the state converges (or until we detect
		// a loop) but for now let's hope that's not necessary.
		const state = {
			leftContext: '',
			rightContext: line.map(function(ct) { return ct.text; }).join(''),
			line: line,
			pos: 0,
		};
		// Output which has been "sealed off" by a greedy chunk and can no
		// longer influence the escaping decisions of later chunks.
		let safeLeft = '';
		for (state.pos = 0; state.pos < line.length; state.pos++) {
			const chunk = line[state.pos];
			// The right context starts just after the current chunk's raw text.
			state.rightContext = state.rightContext.slice(chunk.text.length);
			// Process the escapes for this chunk, given escaped previous chunk
			const thisEscape = chunk.escape(state);
			state.leftContext +=
				(thisEscape.prefix || '') + thisEscape.text + (thisEscape.suffix || '');
			if (thisEscape.greedy) {
				// protect the left context: this will be matched greedily
				// by this chunk, so there's no chance that a subsequent
				// token will include this in its prefix.
				safeLeft += state.leftContext;
				state.leftContext = '';
			}
		}
		// right context should be empty here.
		return safeLeft + state.leftContext;
	}

	/**
	 * @param {Object} args Options.
	 * @param {string} args.text The text string associated with this chunk.
	 * @param {Node} args.node The DOM {@link Node} associated with this chunk.
	 * @param {string} [args.prefix]
	 *  The prefix string to add if the start of the chunk doesn't match its
	 *  constraints.
	 * @param {string} [args.suffix]
	 *  The suffix string to add if the end of the chunk doesn't match its
	 *  constraints.
	 */
	constructor(args) {
		this.text = args.text;
		this.node = args.node;
		if (args.prefix !== undefined || args.suffix !== undefined) {
			// save space in the object in the common case of no prefix/suffix
			this.prefix = args.prefix;
			this.suffix = args.suffix;
		}
	}

	/**
	 * Ensure that the argument `o`, which is perhaps a string, is an
	 * instance of `ConstrainedText`.  Existing instances are returned
	 * unchanged; anything else is wrapped in a new plain chunk, with a
	 * falsy `o` becoming the empty string.
	 * @param {string|ConstrainedText} o
	 * @param {Node} node
	 *   The DOM {@link Node} corresponding to `o`.
	 * @return {ConstrainedText}
	 * @static
	 */
	static cast(o, node) {
		if (o instanceof ConstrainedText) { return o; }
		return new ConstrainedText({ text: o || '', node: node });
	}

	/**
	 * Use the provided `state`, which gives context and access to the entire
	 * list of chunks, to determine the proper escape prefix/suffix.
	 * Returns an object with a `text` property as well as optional
	 * `prefix` and `suffix` properties giving desired escape strings.
	 * @param {Object} state
	 * @return {Object}
	 * @return {string} return.text
	 * @return {string} [return.prefix]
	 * @return {string} [return.suffix]
	 */
	escape(state) {
		// default implementation: no escaping, no prefixes or suffixes
		// beyond whatever was supplied to the constructor.
		return { text: this.text, prefix: this.prefix, suffix: this.suffix };
	}

	/**
	 * Simple equality.  This enforces type equality (ie subclasses are not equal).
	 * A `null` or `undefined` argument is never equal to a chunk.
	 * @param {Object} ct
	 * @return {boolean}
	 */
	equals(ct) {
		if (this === ct) { return true; }
		// Guard before dereferencing `ct.constructor`, so that comparing
		// against a missing chunk yields `false` instead of a TypeError.
		if (ct === null || ct === undefined) { return false; }
		return this.constructor === ConstrainedText &&
			ct.constructor === ConstrainedText &&
			this.text === ct.text;
	}

	/**
	 * Useful shortcut: execute a regular expression on the raw wikitext.
	 * @param {RegExp} re
	 * @return {Array|null}
	 *  An Array containing the matched results or null if there were no matches.
	 */
	match(re) {
		return this.text.match(re);
	}

	/**
	 * SelSer support: when we come across an unmodified node during
	 * selective serialization, we know we can use the original wikitext
	 * for that node unmodified.  *But* there may be boundary conditions
	 * on the left and right sides of the selser'ed text which are going
	 * to require escaping.
	 *
	 * So rather than turning the node into a plain old `ConstrainedText`
	 * chunk, allow subclasses of `ConstrainedText` to register as potential
	 * handlers of selser nodes.  A selser'ed magic link, for example,
	 * will then turn into a `MagicLinkText` and thus be able to enforce
	 * the proper boundary constraints.
	 *
	 * This is the main dispatch point: iterate through the registered
	 * types, asking each if it can handle this node (by invoking its
	 * `_fromSelSer`).  The first truthy answer wins; it is normalized to
	 * an array and every chunk in it is tagged with `selser = true`.
	 * @static
	 * @param {string} text
	 * @param {Node} node
	 * @param {Object} dataParsoid
	 * @param {MWParserEnvironment} env
	 * @param {Object} opts
	 * @return {ConstrainedText[]}
	 * @throws {Error} If no registered type accepts the node.
	 */
	static fromSelSer(text, node, dataParsoid, env, opts) {
		// We define parent types before subtypes, so search the list backwards
		// to be sure we check subtypes before parent types.
		const types = this._types;
		for (let i = types.length - 1; i >= 0; i--) {
			let ct = types[i]._fromSelSer &&
				types[i]._fromSelSer(text, node, dataParsoid, env, opts);
			if (!ct) { continue; }
			if (!Array.isArray(ct)) { ct = [ct]; }
			// tag these chunks as coming from selser
			ct.forEach(function(t) { t.selser = true; });
			return ct;
		}
		// ConstrainedText._fromSelSer should handle everything which reaches it
		// so nothing should make it here.
		throw new Error("Should never happen.");
	}

	/**
	 * Base case: the given node type does not correspond to a special
	 * `ConstrainedText` subclass.  We still have to be careful: the leftmost
	 * (rightmost) children of `node` may still be exposed to our left (right)
	 * context.  If so (ie, their DSR bounds coincide) split the selser text
	 * and emit multiple `ConstrainedText` chunks to preserve the proper
	 * boundary conditions.
	 * @static
	 * @private
	 * @param {string} text
	 * @param {Node} node
	 * @param {Object} dataParsoid
	 * @param {MWParserEnvironment} env
	 * @param {Object} [opts]
	 * @param {boolean} [opts.ignorePrefix]
	 *   Skip the leftmost-child check (set on recursive calls for a
	 *   rightmost child, whose left context is protected).
	 * @param {boolean} [opts.ignoreSuffix]
	 *   Skip the rightmost-child check (set on recursive calls for a
	 *   leftmost child, whose right context is protected).
	 * @return {ConstrainedText[]}
	 */
	static _fromSelSer(text, node, dataParsoid, env, opts) {
		// look at leftmost and rightmost children, it may be that we need
		// to turn these into ConstrainedText chunks in order to preserve
		// the proper escape conditions on the prefix/suffix text.
		const firstChild = DOMUtils.firstNonDeletedChild(node);
		const lastChild = DOMUtils.lastNonDeletedChild(node);
		const firstChildDp = DOMUtils.isElt(firstChild) && DOMDataUtils.getDataParsoid(firstChild);
		const lastChildDp = DOMUtils.isElt(lastChild) && DOMDataUtils.getDataParsoid(lastChild);
		const ignorePrefix = opts && opts.ignorePrefix;
		const ignoreSuffix = opts && opts.ignoreSuffix;
		let prefixChunks = [];
		let suffixChunks = [];
		// The part of `text` not (yet) handed off to a child.
		let remaining = text;
		// check to see if first child's DSR start is the same as this node's
		// DSR start.  If so, the first child is exposed to the (modified)
		// left-hand context, and so recursively convert it to the proper
		// list of specialized chunks.
		if (!ignorePrefix &&
			firstChildDp && Util.isValidDSR(firstChildDp.dsr) &&
			dataParsoid.dsr[0] === firstChildDp.dsr[0]) {
			const prefixLen = firstChildDp.dsr[1] - firstChildDp.dsr[0];
			prefixChunks = ConstrainedText.fromSelSer(
				remaining.slice(0, prefixLen), firstChild, firstChildDp, env,
				// this child node's right context will be protected:
				{ ignoreSuffix: true }
			);
			remaining = remaining.slice(prefixLen);
		}
		// check to see if last child's DSR end is the same as this node's
		// DSR end.  If so, the last child is exposed to the (modified)
		// right-hand context, and so recursively convert it to the proper
		// list of specialized chunks.
		if (!ignoreSuffix && lastChild !== firstChild &&
			lastChildDp && Util.isValidDSR(lastChildDp.dsr) &&
			dataParsoid.dsr[1] === lastChildDp.dsr[1]) {
			const suffixLen = lastChildDp.dsr[1] - lastChildDp.dsr[0];
			// Split at an absolute index instead of using `slice(-suffixLen)`:
			// for a zero-width child `-0` is just `0`, so `slice(-0)` would
			// hand the *entire* remaining text to the child and
			// `slice(0, -0)` would leave nothing for `node`.
			const split = Math.max(0, remaining.length - suffixLen);
			suffixChunks = ConstrainedText.fromSelSer(
				remaining.slice(split), lastChild, lastChildDp, env,
				// this child node's left context will be protected:
				{ ignorePrefix: true }
			);
			remaining = remaining.slice(0, split);
		}
		// glue together prefixChunks, whatever's left of `text`, and suffixChunks
		const chunks = prefixChunks.concat(
			[ ConstrainedText.cast(remaining, node) ], suffixChunks
		);
		// top-level chunks only:
		if (!(ignorePrefix || ignoreSuffix)) {
			// ensure that the first chunk belongs to `node` in order to
			// emit separators correctly before `node`
			if (chunks[0].node !== node) {
				chunks.unshift(ConstrainedText.cast('', node));
			}
			// set 'noSep' flag on all but the first chunk, so we don't get
			// extra separators from `SSP.emitChunk`
			chunks.forEach(function(t, i) {
				if (i > 0) { t.noSep = true; }
			});
		}
		return chunks;
	}
}

/**
 * This subclass allows specification of a regular expression for
 * acceptable (or prohibited) leading (and/or trailing) contexts.
 *
 * This is an *abstract* class; it's intended to be subclassed, not
 * used directly, and so it is not included in the lists of types
 * tried by `fromSelSer`.
 *
 * @class
 * @extends ~ConstrainedText
 * @inheritdoc
 * @param {Object} args
 * @param {RegExp} args.goodPrefix
 * @param {RegExp} args.goodSuffix
 */
class RegExpConstrainedText extends ConstrainedText {
	/**
	 * @param {Object} args
	 *   Options; the whole object is also handed to the base constructor.
	 * @param {string} [args.prefix]
	 *   Escape string emitted before the chunk; defaults to `<nowiki/>`.
	 * @param {string} [args.suffix]
	 *   Escape string emitted after the chunk; defaults to `<nowiki/>`.
	 * @param {RegExp} [args.goodPrefix]
	 *   Left context which is acceptable: an escape is needed when it does
	 *   *not* match.  Takes precedence over `badPrefix`.
	 * @param {RegExp} [args.badPrefix]
	 *   Left context which is prohibited: an escape is needed when it matches.
	 * @param {RegExp} [args.goodSuffix]
	 *   Right context which is acceptable; takes precedence over `badSuffix`.
	 * @param {RegExp} [args.badSuffix]
	 *   Right context which is prohibited.
	 */
	constructor(args) {
		super(args);
		// Unlike the base class, the escape strings are always stored here.
		this.prefix = args.prefix === undefined ? '<nowiki/>' : args.prefix;
		this.suffix = args.suffix === undefined ? '<nowiki/>' : args.suffix;
		// Build a predicate answering "does this context need an escape?".
		// A "good" expression wins over a "bad" one; with neither there is
		// no predicate at all (`false`).
		const needsEscape = (goodRe, badRe) => {
			if (goodRe) {
				return (context) => !goodRe.test(context);
			}
			if (badRe) {
				return (context) => badRe.test(context);
			}
			return false;
		};
		this.prefixMatcher = needsEscape(args.goodPrefix, args.badPrefix);
		this.suffixMatcher = needsEscape(args.goodSuffix, args.badSuffix);
	}

	/**
	 * Consult the prefix matcher with `state.leftContext` and the suffix
	 * matcher with `state.rightContext`, and request the corresponding
	 * escape string for each side whose matcher reports a problem.
	 * @inheritdoc
	 */
	escape(state) {
		const wantPrefix = this.prefixMatcher && this.prefixMatcher(state.leftContext);
		const wantSuffix = this.suffixMatcher && this.suffixMatcher(state.rightContext);
		const escaped = { text: this.text };
		if (wantPrefix) {
			escaped.prefix = this.prefix;
		}
		if (wantSuffix) {
			escaped.suffix = this.suffix;
		}
		return escaped;
	}
}

/**
 * An internal wiki link, like `[[Foo]]`.
 * @class
 * @extends ~RegExpConstrainedText
 * @param {string} text
 * @param {Node} node
 * @param {WikiConfig} wikiConfig
 * @param {string} type
 *   The type of the link, as described by the `rel` attribute.
 */
class WikiLinkText extends RegExpConstrainedText {
	/**
	 * @param {string} text
	 * @param {Node} node
	 * @param {WikiConfig} wikiConfig
	 *   Read for `linkPrefixRegex` and `linkTrailRegex`, and only when
	 *   `type` is a (possibly interwiki) wiki link.
	 * @param {string} type
	 *   The type of the link, as described by the `rel` attribute.
	 */
	constructor(text, node, wikiConfig, type) {
		// category links/external links/images don't use link trails or prefixes
		const usesTrails = /^mw:WikiLink(\/Interwiki)?$/.test(type);
		// Left context ending in an odd number of `[` characters.
		const openBracket = /(^|[^\[])(\[\[)*\[$/;
		const badPrefix = (usesTrails && wikiConfig.linkPrefixRegex) ?
			JSUtils.rejoin('(', wikiConfig.linkPrefixRegex, ')|(', openBracket, ')', { flags: 'u' }) :
			openBracket;
		super({
			text,
			node,
			badPrefix,
			badSuffix: usesTrails ? wikiConfig.linkTrailRegex : undefined,
		});
		// We match link trails greedily when they exist.
		if (usesTrails && !/\]$/.test(text)) {
			this.greedy = true;
		}
	}

	/**
	 * Same as the inherited implementation, but additionally reports this
	 * chunk's `greedy` flag on the result.
	 * @inheritdoc
	 */
	escape(state) {
		// If previous token was also a WikiLink, its linktrail will
		// eat up any possible linkprefix characters, so we don't need
		// a <nowiki> in this case.  (Eg: [[a]]-[[b]] in iswiki; the -
		// character is both a link prefix and a link trail, but it gets
		// preferentially associated with the [[a]] as a link trail.)
		return Object.assign(super.escape(state), { greedy: this.greedy });
	}

	/**
	 * Claim selser nodes whose `rel` marks them as a (possibly interwiki)
	 * wiki link and whose `stx` is `simple` or `piped`.
	 * @static
	 * @private
	 * @return {WikiLinkText|undefined}
	 */
	static _fromSelSer(text, node, dataParsoid, env) {
		const rel = node.getAttribute('rel') || '';
		const syntax = dataParsoid.stx || '';
		// TODO: Leaving this for backwards compatibility, remove when 1.5 is no longer bound
		const linkType = (rel === 'mw:ExtLink') ? 'mw:WikiLink/Interwiki' : rel;
		const isWikiLink = /^mw:WikiLink(\/Interwiki)?$/.test(linkType);
		if (!isWikiLink || !/^(simple|piped)$/.test(syntax)) {
			return undefined;
		}
		return new WikiLinkText(text, node, env.conf.wiki, linkType);
	}
}

/**
 * An external link, like `[http://example.com]`.
 * @class
 * @extends ~ConstrainedText
 * @param {string} text
 * @param {Node} node
 * @param {WikiConfig} wikiConfig
 * @param {string} type
 *   The type of the link, as described by the `rel` attribute.
 */
class ExtLinkText extends ConstrainedText {
	/**
	 * @param {string} text
	 * @param {Node} node
	 * @param {WikiConfig} wikiConfig Accepted but not used by this class.
	 * @param {string} type
	 *   The type of the link, as described by the `rel` attribute.
	 *   Accepted but not used by this class.
	 */
	constructor(text, node, wikiConfig, type) {
		super({ text, node });
	}

	/**
	 * Claim selser nodes whose `rel` is exactly `mw:ExtLink` and whose
	 * `stx` is neither `simple` nor `piped`.
	 * @static
	 * @private
	 * @return {ExtLinkText|undefined}
	 */
	static _fromSelSer(text, node, dataParsoid, env) {
		const rel = node.getAttribute('rel') || '';
		const syntax = dataParsoid.stx || '';
		const accepted = rel === 'mw:ExtLink' && !/^(simple|piped)$/.test(syntax);
		return accepted ? new ExtLinkText(text, node, env.conf.wiki, rel) : undefined;
	}
}

/**
 * An autolink to an external resource, like `http://example.com`.
 * @class
 * @extends ~RegExpConstrainedText
 * @param {string} url
 * @param {Node} node
 */
class AutoURLLinkText extends RegExpConstrainedText {
	/**
	 * @param {string} url The text of the autolinked URL.
	 * @param {Node} node
	 */
	constructor(url, node) {
		super({
			text: url,
			node,
			// there's a \b boundary at start, and first char of url is a word char
			badPrefix: /\w$/,
			badSuffix: AutoURLLinkText.badSuffix(url),
		});
		// Hack around the difference between PHP \w and JS \w: this
		// replaces the matcher derived from `badPrefix` above.
		this.prefixMatcher = (leftContext) => Util.isUniWord(Util.lastUniChar(leftContext));
	}

	/**
	 * Pick the regular expression describing right-hand context which must
	 * not directly follow `url`.  Two expressions are built on first use
	 * and cached on `_badSuffix`; which one is returned depends on whether
	 * `url` contains an open paren.
	 * @static
	 * @param {string} url
	 * @return {RegExp}
	 */
	static badSuffix(url) {
		if (!this._badSuffix) {
			// build regexps representing the trailing context for an autourl link
			// This regexp comes from the PHP parser's EXT_LINK_URL_CLASS regexp.
			const EXT_LINK_URL_CLASS = /[^\[\]<>"\x00-\x20\x7F\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]/.source.slice(1, -1);
			// This set of trailing punctuation comes from Parser.php::makeFreeExternalLink
			const TRAILING_PUNCT = /[,;\\.:!?]/.source.slice(1, -1);
			const NOT_LTGTNBSP = /(?!&(lt|gt|nbsp|#x0*(3[CcEe]|[Aa]0)|#0*(60|62|160));)/.source;
			const NOT_QQ = /(?!'')/.source;

			// `extraPunct` is appended to the punctuation set in both
			// character classes.
			const build = (extraPunct) => new RegExp(
				"^" + NOT_LTGTNBSP + NOT_QQ +
				"[" + TRAILING_PUNCT + extraPunct + "]*" +
				"[" + EXT_LINK_URL_CLASS + TRAILING_PUNCT + extraPunct + "]"
			);
			const parenAutoUrlBadSuffix = build("");
			// if the URL doesn't have an open paren in it, TRAILING PUNCT will
			// include ')' as well.
			const noParenAutoUrlBadSuffix = build("\\)");
			// Cache the constructed regular expressions.
			this._badSuffix = (u) => (
				/\(/.test(u) ? parenAutoUrlBadSuffix : noParenAutoUrlBadSuffix
			);
		}
		return this._badSuffix(url);
	}

	/**
	 * Claim `A` nodes whose `stx` is `url`, and `IMG` nodes whose
	 * data-parsoid `type` is `extlink`.
	 * @static
	 * @private
	 * @return {AutoURLLinkText|undefined}
	 */
	static _fromSelSer(text, node, dataParsoid, env) {
		const isUrlAnchor = node.tagName === 'A' && dataParsoid.stx === 'url';
		if (isUrlAnchor || (node.tagName === 'IMG' && dataParsoid.type === 'extlink')) {
			return new AutoURLLinkText(text, node);
		}
		return undefined;
	}

	/**
	 * Special case for entities which "leak off the end": on top of the
	 * inherited checks, request the suffix escape when the text ends with
	 * an incomplete entity which the right context could complete.
	 * @inheritdoc
	 */
	escape(state) {
		const escaped = super.escape(state);
		if (escaped.suffix) {
			// A suffix escape is already being emitted; nothing to add.
			return escaped;
		}
		// If the text ends with an incomplete entity, be careful of
		// suffix text which could complete it.
		const endsWithPartialEntity = /&[#0-9a-zA-Z]*$/.test(escaped.text);
		if (endsWithPartialEntity && /^[#0-9a-zA-Z]*;/.test(state.rightContext)) {
			escaped.suffix = this.suffix;
		}
		return escaped;
	}
}

/**
 * An autolink to an RFC/PMID/ISBN, like `RFC 1234`.
 * @class
 * @extends ~RegExpConstrainedText
 * @param {string} text
 * @param {Node} node
 */
class MagicLinkText extends RegExpConstrainedText {
	/**
	 * @param {string} text
	 * @param {Node} node
	 */
	constructor(text, node) {
		// there are \b boundaries on either side, and first/last characters
		// are word characters.
		super({ text, node, badPrefix: /\w$/, badSuffix: /^\w/ });
		// Hack around the difference between PHP \w and JS \w: both
		// matchers derived from the expressions above are replaced by
		// checks based on `Util.isUniWord`.
		this.prefixMatcher = (leftContext) => Util.isUniWord(Util.lastUniChar(leftContext));
		this.suffixMatcher = (rightContext) => Util.isUniWord(rightContext);
	}

	/**
	 * Claim selser nodes whose `stx` is `magiclink`.
	 * @static
	 * @private
	 * @return {MagicLinkText|undefined}
	 */
	static _fromSelSer(text, node, dataParsoid, env) {
		return dataParsoid.stx === 'magiclink' ?
			new MagicLinkText(text, node) :
			undefined;
	}
}

/**
 * Language Variant markup, like `-{ ... }-`.
 * @class
 * @extends ~RegExpConstrainedText
 * @param {string} text
 * @param {Node} node
 */
class LanguageVariantText extends RegExpConstrainedText {
	/**
	 * @param {string} text
	 * @param {Node} node
	 */
	constructor(text, node) {
		// at sol vertical bars immediately preceding cause problems in tables
		const badPrefix = /^\|$/;
		super({ text, node, badPrefix });
	}

	/**
	 * Claim selser nodes whose `typeof` attribute is exactly
	 * `mw:LanguageVariant`.
	 * @static
	 * @private
	 * @return {LanguageVariantText|undefined}
	 */
	static _fromSelSer(text, node, dataParsoid, env) {
		const isVariant = node.getAttribute('typeof') === 'mw:LanguageVariant';
		return isVariant ? new LanguageVariantText(text, node) : undefined;
	}
}

/**
 * List of types we attempt `fromSelSer` with.  This should include all the
 * concrete subclasses of `ConstrainedText` (`RegExpConstrainedText` is
 * missing since it is an abstract class).  We also include the
 * `ConstrainedText` class as the first element (even though it is
 * an abstract base class) as a little bit of a hack: it simplifies
 * `ConstrainedText.fromSelSer` by factoring some of its work into
 * `ConstrainedText._fromSelSer`.
 */
// Order matters: `ConstrainedText.fromSelSer` walks this list from the
// last entry to the first, so the base class is only consulted once
// every concrete subclass has declined the node.
ConstrainedText._types = [
	ConstrainedText, // base class first, as a special case
	// all concrete subclasses of ConstrainedText
	WikiLinkText,
	ExtLinkText,
	AutoURLLinkText,
	MagicLinkText,
	LanguageVariantText,
];

module.exports = {
	ConstrainedText: ConstrainedText,
	RegExpConstrainedText: RegExpConstrainedText,
	AutoURLLinkText: AutoURLLinkText,
	ExtLinkText: ExtLinkText,
	LanguageVariantText: LanguageVariantText,
	MagicLinkText: MagicLinkText,
	WikiLinkText: WikiLinkText,
};