/**
* Generic attribute expansion handler.
* @module
*/
'use strict';
const { AttributeTransformManager } = require('../TokenTransformManager.js');
const { PegTokenizer } = require('../tokenizer.js');
const Promise = require('../../utils/promise.js');
const TokenHandler = require('./TokenHandler.js');
const { PipelineUtils } = require('../../utils/PipelineUtils.js');
const { TokenUtils } = require('../../utils/TokenUtils.js');
const { NlTk, TagTk, SelfclosingTagTk } = require('../../tokens/TokenTypes.js');
const { Util } = require('../../utils/Util.js');
/**
* Generic attribute expansion handler.
*
* @class
* @extends module:wt2html/tt/TokenHandler
*/
class AttributeExpander extends TokenHandler {
constructor(manager, options) {
super(manager, options);
this.tokenizer = new PegTokenizer(this.env);
if (!this.options.standalone) {
// XXX: only register for tag tokens?
this.manager.addTransform(
(token, cb) => this.onToken(token, cb),
'AttributeExpander:onToken',
AttributeExpander.rank(),
'any'
);
}
}
static rank() { return 1.12; }
static skipRank() { return 1.13; /* must be higher than the ranks above so re-emitted tokens skip re-expansion */ }
/**
* Token handler.
*
* Expands the attribute keys and values of eligible Tag / SelfclosingTag
* tokens via the AttributeTransformManager and, once any asynchronous
* expansion completes, passes the rebuilt token(s) back through the callback.
*
* @private
* @param {Token} token Token whose attributes are being expanded.
* @param {Function} cb Results passed back via this callback.
*/
onToken(token, cb) {
const attribs = token.attribs;
// console.warn( 'AttributeExpander.onToken: ', JSON.stringify( token ) );
if ((token.constructor === TagTk || token.constructor === SelfclosingTagTk) &&
// Do not process dom-fragment tokens: a separate handler deals with them.
attribs && attribs.length &&
token.name !== 'mw:dom-fragment-token' &&
(
token.name !== 'meta' ||
!/mw:(TSRMarker|Placeholder|Transclusion|Param|Includes)/.test(token.getAttribute('typeof'))
)
) {
const atm = new AttributeTransformManager(
this.manager,
{ expandTemplates: this.options.expandTemplates, inTemplate: this.options.inTemplate }
);
const ret = atm.process(attribs);
if (ret.async) {
cb({ async: true });
ret.promises.then(
() => this.buildExpandedAttrs(token, atm.getNewKVs(attribs))
).then(
ret => cb(ret)
).done();
} else {
cb({ tokens: [token] });
}
} else {
cb({ tokens: [token] });
}
}
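// Illustrative sketch (not executed): nlTkIndex() finds the first newline
// token that would force a line split, e.g.
//   const toks = [ new TagTk('b'), 'foo', new NlTk(), 'bar' ];
//   AttributeExpander.nlTkIndex(false, toks, true);  // => 2 (index of the NlTk)
//   AttributeExpander.nlTkIndex(true, toks, true);   // => -1 (newlines okay, scan skipped)
// An NlTk bracketed by <*include*> start/end metas is ignored, so -1 is
// returned in that case as well.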
static nlTkIndex(nlTkOkay, tokens, atTopLevel) {
// Moving this check here since it makes the
// callsite cleaner and simpler.
if (nlTkOkay) {
return -1;
}
// Check if we have a newline token in the attribute key/value token stream.
// However, newlines are acceptable inside a <*include*>..</*include*> directive
// since they are stripped out.
//
// let includeRE = !atTopLevel ? /(?:^|\s)mw:Includes\/NoInclude(\/.*)?(?:\s|$)/ : /(?:^|\s)mw:Includes\/(?:Only)?Include(?:Only)?(\/.*)?(?:\s|$)/;
//
// SSS FIXME: We cannot support this usage for <*include*> directives currently
// since they don't go through template encapsulation and don't have a data-mw
// format with "wt" and "transclusion" parts that we can use to just track bits
// of wikitext that don't have a DOM representation.
//
// So, for now, we just suppress all newlines contained within these directives.
//
const includeRE = /(?:^|\s)mw:Includes\/(?:No|Only)?Include(?:Only)?(\/.*)?(?:\s|$)/;
let inInclude = false;
for (let i = 0, n = tokens.length; i < n; i++) {
const t = tokens[i];
if (t.constructor === SelfclosingTagTk) {
const type = t.getAttribute("typeof");
const typeMatch = type ? type.match(includeRE) : null;
if (typeMatch) {
inInclude = !typeMatch[1] || !typeMatch[1].match(/\/End$/);
}
} else if (!inInclude && t.constructor === NlTk) {
// newline token outside <*include*>
return i;
}
}
return -1;
}
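// metaTypeMatcher() matches the typeof values emitted on template- and
// extension-related meta tokens, e.g. "mw:Transclusion", "mw:Transclusion/End",
// "mw:Param", "mw:LanguageVariant", "mw:Includes/NoInclude". Callers check the
// match for a "/End" suffix separately so they don't trip on end metas.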
static metaTypeMatcher() {
return /(mw:(LanguageVariant|Transclusion|Param|Includes\/)(.*)?$)/;
}
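// Illustrative sketch of the split: for an expanded attribute token stream like
//   [ <mw:Transclusion start meta>, ...attr tokens..., NlTk, ...trailing content... ]
// splitTokens() returns roughly
//   { metaTokens: [ <start meta> ],  // hoisted before the token; tsr/unwrappedWT updated
//     preNLBuf:   [ ...tokens before the NlTk... ],
//     postNLBuf:  [ NlTk, ...trailing content... ] }
// If no start meta is found (or template wrapping is off), metaTokens and
// postNLBuf are empty and the entire stream stays in preNLBuf.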
static splitTokens(frame, token, nlTkPos, tokens, wrapTemplates) {
const buf = [];
let postNLBuf, startMeta, metaTokens;
// Split the token array around the first newline token.
for (let i = 0, l = tokens.length; i < l; i++) {
const t = tokens[i];
if (i === nlTkPos) {
// split here!
postNLBuf = tokens.slice(i);
break;
} else {
if (wrapTemplates && t.constructor === SelfclosingTagTk) {
const type = t.getAttribute("typeof");
const typeMatch = type && type.match(this.metaTypeMatcher());
// Don't trip on transclusion end tags
if (typeMatch && !typeMatch[1].match(/\/End$/)) {
startMeta = t;
}
}
buf.push(t);
}
}
if (wrapTemplates && startMeta) {
// Support template wrapping with the following steps:
// - Hoist the transclusion start-meta from the first line
// to before the token.
// - Update the start-meta tsr to that of the token.
// - Record the wikitext between the token and the transclusion
// as an unwrappedWT data-parsoid attribute of the start-meta.
const dp = startMeta.dataAttribs;
dp.unwrappedWT = frame.srcText.substring(token.dataAttribs.tsr[0], dp.tsr[0]);
// unwrappedWT will be added to the data-mw.parts array which makes
// this a multi-template-content-block.
// Record the first wikitext node of this block (required by html->wt serialization)
// Match the casing from DOM nodes
const tokenName = token.name.toUpperCase();
dp.firstWikitextNode = token.dataAttribs.stx ? tokenName + "_" + token.dataAttribs.stx : tokenName;
// Update tsr[0] only. Unless the end-meta token is moved as well,
// updating tsr[1] can introduce bugs in cases like:
//
// {|
// |{{singlechart|Australia|93|artist=Madonna|album=Girls Gone Wild}}|x
// |}
//
// which can then cause dirty diffs (the "|" before the x gets dropped).
dp.tsr[0] = token.dataAttribs.tsr[0];
metaTokens = [startMeta];
return { metaTokens: metaTokens, preNLBuf: buf, postNLBuf: postNLBuf };
} else {
return { metaTokens: [], preNLBuf: tokens, postNLBuf: [] };
}
}
/* ----------------------------------------------------------
* This helper method strips all meta tags introduced by
* transclusions, etc. and returns the content.
* ---------------------------------------------------------- */
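// Illustrative sketch (not executed): with wrapTemplates enabled,
//   [ <meta typeof="mw:Transclusion">, 'red', <meta typeof="mw:Transclusion/End"> ]
// yields { hasGeneratedContent: true, value: [ 'red' ] } -- the transclusion
// metas are dropped, and a start meta (or a DOM-fragment-typed token) flags
// the content as template/extension generated.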
static stripMetaTags(env, tokens, wrapTemplates) {
const buf = [];
let hasGeneratedContent = false;
for (let i = 0, l = tokens.length; i < l; i++) {
const t = tokens[i];
if ([TagTk, SelfclosingTagTk].indexOf(t.constructor) !== -1) {
// Take advantage of this iteration of `tokens` to seek out
// document fragments. They're an indication that an attribute
// value wasn't present as literal text in the input and the
// token should be annotated with "mw:ExpandedAttrs".
if (TokenUtils.isDOMFragmentType(t.getAttribute('typeof'))) {
hasGeneratedContent = true;
}
if (wrapTemplates) {
// Strip all meta tags.
const type = t.getAttribute("typeof");
const typeMatch = type && type.match(this.metaTypeMatcher());
if (typeMatch) {
if (!typeMatch[1].match(/\/End$/)) {
hasGeneratedContent = true;
}
} else {
buf.push(t);
continue;
}
}
if (t.name !== "meta") {
// Don't strip the token if it is not a meta tag.
buf.push(t);
}
} else {
buf.push(t);
}
}
return { hasGeneratedContent: hasGeneratedContent, value: buf };
}
/**
* Post-processes the attributes expanded by the AttributeTransformManager:
* splits around newlines, hoists transclusion metas, and records templated
* attributes in data-mw.
* @private
*/
*buildExpandedAttrsG(token, expandedAttrs) {
// If we're not in a template, we'll be doing template wrapping in dom
// post-processing (same conditional there), so take care of meta markers
// found while processing tokens.
const wrapTemplates = !this.options.inTemplate;
const env = this.manager.env;
let metaTokens = [];
let postNLToks = [];
let tmpDataMW;
const oldAttrs = token.attribs;
// Build newAttrs lazily (on-demand) to avoid creating
// objects in the common case where nothing of significance
// happens in this code.
let newAttrs = null;
let nlTkPos = -1;
let i, l;
const nlTkOkay = TokenUtils.isHTMLTag(token) || !TokenUtils.isTableTag(token);
// Identify attributes that were generated in full or in part using templates
for (i = 0, l = oldAttrs.length; i < l; i++) {
const oldA = oldAttrs[i];
const expandedA = expandedAttrs[i];
// Preserve the key and value source, if available.
// But, if 'oldA' wasn't cloned, expandedA will be the same as 'oldA'.
if (oldA !== expandedA) {
expandedA.ksrc = oldA.ksrc;
expandedA.vsrc = oldA.vsrc;
expandedA.srcOffsets = oldA.srcOffsets;
}
// Deal with two template-expansion scenarios for the attribute key (not value)
//
// 1. We have a template that generates multiple attributes of this token
// as well as content after the token.
// Ex: infobox templates from aircraft, ship, and other pages
// See enwiki:Boeing_757
//
// - Split the expanded tokens into multiple lines.
// - Expanded attributes associated with the token are retained in the
// first line before a NlTk.
// - Content tokens after the NlTk are moved to subsequent lines.
// - The meta tags are hoisted before the original token to make sure
// that the entire token and following content is encapsulated as a unit.
//
// 2. We have a template that only generates multiple attributes of this
// token. In that case, we strip all template meta tags from the expanded
// tokens and assign it a mw:ExpandedAttrs type with orig/expanded
// values in data-mw.
//
// Reparse-KV-string scenario with templated attributes:
// -----------------------------------------------------
// In either scenario above, we need additional special handling if the
// template generates one or more k=v style strings:
// <div {{echo|1=style='color:red'}}></div>
// <div {{echo|1=style='color:red' title='boo'}}></div>
//
// Real use case: Template {{ligne grise}} on frwp.
//
// To support this, we utilize the following hack. If we got a string of the
// form "k=v" and our orig-v was "", we convert the token array to a string
// and retokenize it to extract one or more attributes.
//
// But, we won't support scenarios like this:
// {| title={{echo|1='name' style='color:red;'\n|-\n|foo}}\n|}
// Here, part of one attribute and additional complete attribute strings
// need reparsing, and that isn't a use case that is worth more complexity here.
//
// FIXME:
// ------
// 1. It is not possible for multiple instances of scenario 1 to be triggered
// for the same token. So, I am not bothering trying to test and deal with it.
//
// 2. We trigger the Reparse-KV-string scenario only for attribute keys,
// since it isn't possible for attribute values to require this reparsing.
// However, it is possible to come up with scenarios where a template
// returns the value for one attribute and additional k=v strings for newer
// attributes. We don't support that scenario, but don't even test for it.
//
// Reparse-KV-string scenario with non-string attributes:
// ------------------------------------------------------
// This is only going to be the case with table wikitext that has special syntax
// for attribute strings.
//
// {| <div>a</div> style='border:1px solid black;'
// |- <div>b</div> style='border:1px dotted blue;'
// | <div>c</div> style='color:red;'
// |}
//
// In wikitext like the above, the PEG tokenizer doesn't recognize these as
// valid attributes (the templated attribute scenario is a special case) and
// orig-v will be "". So, the same strategy as above is applied here as well.
const origK = expandedA.k;
const origV = expandedA.v;
let updatedK = null;
let updatedV = null;
let expandedK = expandedA.k;
let reparsedKV = false;
if (expandedK) {
// FIXME: We should get rid of these array/string/non-string checks
// and probably use appropriately-named flags to convey type information.
if (Array.isArray(oldA.k)) {
if (!(expandedK.constructor === String && /(^|\s)mw:maybeContent(\s|$)/.test(expandedK))) {
nlTkPos = AttributeExpander.nlTkIndex(nlTkOkay, expandedK, wrapTemplates);
if (nlTkPos !== -1) {
// Scenario 1 from the documentation comment above.
updatedK = AttributeExpander.splitTokens(this.manager.frame, token, nlTkPos, expandedK, wrapTemplates);
expandedK = updatedK.preNLBuf;
postNLToks = updatedK.postNLBuf;
metaTokens = updatedK.metaTokens;
} else {
// Scenario 2 from the documentation comment above.
updatedK = AttributeExpander.stripMetaTags(env, expandedK, wrapTemplates);
expandedK = updatedK.value;
}
expandedA.k = expandedK;
// Check if we need to deal with the Reparse-KV-string scenario.
// (See documentation comment above)
// So far, "standalone" mode is only for expanding template
// targets, which by definition do not have values, so this
// scenario doesn't apply. It was wrongly being triggered
// by the "#ifexpr" parser function, which can expect the
// "=" equality operator.
if (expandedA.v === '' && !this.options.standalone) {
// Extract a parsable string from the token array.
// Trim whitespace to ensure tokenizer isn't tripped up
// by the presence of unnecessary whitespace.
const kStr = TokenUtils.tokensToString(expandedK, false, {
unpackDOMFragments: true,
env, // FIXME: Sneaking in `env` to avoid changing the signature
}).trim();
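// For example, the documented case {{echo|1=style='color:red' title='boo'}}
// expands to the single "key" string "style='color:red' title='boo'" with an
// empty value; retokenizing that string below recovers both real attributes.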
const rule = nlTkOkay ? 'generic_newline_attributes' : 'table_attributes';
const kvs = /=/.test(kStr) ? this.tokenizer.tokenizeAs(kStr, rule, /* sol */true) : new Error('null');
if (!(kvs instanceof Error)) {
// At this point, templates should have been
// expanded. Returning a template token here
// probably means that when we just converted to
// string and reparsed, we put back together a
// failed expansion. This can be particularly bad
// when we make iterative calls to expand template
// names.
const convertTemplates = function(p) {
return p.map(function(t) {
if (!TokenUtils.isTemplateToken(t)) { return t; }
return t.dataAttribs.src;
});
};
kvs.forEach(function(kv) {
if (Array.isArray(kv.k)) {
kv.k = convertTemplates(kv.k);
}
if (Array.isArray(kv.v)) {
kv.v = convertTemplates(kv.v);
}
// These `kv`s come from tokenizing the string
// we produced above, and will therefore have
// offsets starting at zero. Shift them by the
// old amount if available.
if (Array.isArray(expandedA.srcOffsets)) {
const offset = expandedA.srcOffsets[0];
if (Array.isArray(kv.srcOffsets)) {
kv.srcOffsets = kv.srcOffsets.map(function(n) {
n += offset;
return n;
});
}
}
});
// SSS FIXME: Collect all keys here, not just the first key
// i.e. in a string like {{echo|1=id='v1' title='foo' style='..'}}
// that string is setting attributes for [id, title, style], not just id.
//
// That requires the ability for the data-mw.attribs[i].txt to be an array.
// However, the spec at [[mw:Parsoid/MediaWiki_DOM_spec]] says:
// "This spec also assumes that a template can only
// generate one attribute rather than multiple attributes."
//
// So, revision of the spec is another FIXME at which point this code can
// be updated to reflect the revised spec.
expandedK = kvs[0].k;
reparsedKV = true;
if (!newAttrs) {
newAttrs = i === 0 ? [] : expandedAttrs.slice(0, i);
}
newAttrs = newAttrs.concat(kvs);
}
}
}
}
// We have a potentially expanded value.
// Check if the value came from a template/extension expansion.
let attrValTokens = origV;
if (expandedK.constructor === String && Array.isArray(oldA.v)) {
if (!expandedK.match(/^mw:/)) {
nlTkPos = AttributeExpander.nlTkIndex(nlTkOkay, attrValTokens, wrapTemplates);
if (nlTkPos !== -1) {
// Scenario 1 from the documentation comment above.
updatedV = AttributeExpander.splitTokens(this.manager.frame, token, nlTkPos, attrValTokens, wrapTemplates);
attrValTokens = updatedV.preNLBuf;
postNLToks = updatedV.postNLBuf;
metaTokens = updatedV.metaTokens;
} else {
// Scenario 2 from the documentation comment above.
updatedV = AttributeExpander.stripMetaTags(env, attrValTokens, wrapTemplates);
attrValTokens = updatedV.value;
}
expandedA.v = attrValTokens;
}
}
// Update data-mw to account for templated attributes.
// For editability, set HTML property.
//
// If we encountered a reparse-KV-string scenario,
// we set the value's HTML to [] since we can edit
// the transclusion either via the key's HTML or the
// value's HTML, but not both.
if ((reparsedKV && (updatedK.hasGeneratedContent || metaTokens.length > 0)) ||
(updatedK && updatedK.hasGeneratedContent) ||
(updatedV && updatedV.hasGeneratedContent)) {
const key = TokenUtils.tokensToString(expandedK);
if (!tmpDataMW) {
tmpDataMW = new Map();
}
tmpDataMW.set(key, {
k: {
txt: key,
html: reparsedKV || (updatedK && updatedK.hasGeneratedContent) ? origK : undefined,
srcOffsets: expandedA.srcOffsets.slice(0, 2),
},
v: {
html: reparsedKV ? [] : origV,
srcOffsets: expandedA.srcOffsets.slice(2, 4),
},
});
}
}
// Update newAttrs
if (newAttrs && !reparsedKV) {
newAttrs.push(expandedA);
}
}
token.attribs = newAttrs || expandedAttrs;
// If the token already has an about, it already has transclusion/extension
// wrapping. No need to record information about templated attributes in addition.
//
// FIXME: If there is a real use case for extension attributes getting
// templated, this check can be relaxed to allow that.
// https://gerrit.wikimedia.org/r/#/c/65575 has some reference code that
// can be used then.
if (!token.getAttribute('about') && tmpDataMW && tmpDataMW.size > 0) {
// Flatten k-v pairs.
let vals = [];
tmpDataMW.forEach(function(obj) {
vals.push(obj.k, obj.v);
});
// Clone the vals since they'll be passed to another pipeline
// for expanding, which may destructively mutate them in the
// process.
//
// This is a problem since subsequent handlers to the
// AttributeExpander may interact with the original tokens still
// present as attributes of `token`.
//
// For example, while treebuilding, the object holding dataAttribs
// of a token is reused as the data-parsoid attribute of the
// corresponding node. Thus, when we get to the DOM cleanup pass,
// unsetting properties changes the token as well. This was
// the issue when an "href" was expanded and then the
// ExternalLinkHandler tried to call tokensToString on it,
// resulting in a transcluded entity missing its src (which,
// by the way, had already been clobbered by WrapTemplates,
// similar to T214241).
//
// The general principle here being, don't share tokens between
// pipelines.
vals = Util.clone(vals);
// Async-expand all token arrays to DOM.
const eVals = yield PipelineUtils.expandValuesToDOM(
this.manager.env, this.manager.frame, vals,
this.options.expandTemplates,
this.options.inTemplate
);
// Rebuild flattened k-v pairs.
const expAttrs = [];
for (let j = 0; j < eVals.length; j += 2) {
expAttrs.push([eVals[j], eVals[j + 1]]);
}
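// expAttrs is now a list of [key, value] records -- roughly
// [ [ { txt, html?, srcOffsets }, { html, srcOffsets } ], ... ] -- with token
// arrays expanded to HTML by expandValuesToDOM() above. It is stored either in
// data-parsoid's tmp (template tokens) or in the data-mw "attribs" field below.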
if (token.name === 'template') {
// Don't add Parsoid about, typeof, data-mw attributes here since
// we won't be able to distinguish between Parsoid-added attributes
// and actual template attributes in cases like:
// {{some-tpl|about=#mwt1|typeof=mw:Transclusion}}
// In both cases, we will encounter a template token that looks like:
// { ... "attribs":[{"k":"about","v":"#mwt1"},{"k":"typeof","v":"mw:Transclusion"}] .. }
// So, record these in the tmp attribute for the template handler
// to retrieve and process.
if (!token.dataAttribs.tmp) {
token.dataAttribs.tmp = {};
}
token.dataAttribs.tmp.templatedAttribs = expAttrs;
} else {
// Mark token as having expanded attrs.
token.addAttribute("about", this.manager.env.newAboutId());
token.addSpaceSeparatedAttribute("typeof", "mw:ExpandedAttrs");
token.addAttribute("data-mw", JSON.stringify({
attribs: expAttrs,
}));
}
}
const newTokens = metaTokens.concat([token], postNLToks);
if (metaTokens.length === 0) {
// No more attribute expansion required for token after this
newTokens.rank = AttributeExpander.skipRank();
}
return { tokens: newTokens };
}
}
// This is clunky, but we don't have async/await until Node >= 7 (T206035)
AttributeExpander.prototype.buildExpandedAttrs =
Promise.async(AttributeExpander.prototype.buildExpandedAttrsG);
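// Callers thus get a promise-returning method, used roughly as (sketch):
//   expander.buildExpandedAttrs(token, expandedAttrs)
//       .then(ret => { /* ret.tokens is [ ...metaTokens, token, ...postNLToks ] */ });
// which is exactly what onToken() chains on above.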
if (typeof module === "object") {
module.exports.AttributeExpander = AttributeExpander;
}