Source: utils/PipelineUtils.js: JSDoc

/**
 * This file contains parsing pipeline related utilities.
 *
 * @module
 */

'use strict';

require('../../core-upgrade.js');

var JSUtils = require('./jsutils.js').JSUtils;
var Promise = require('./promise.js');
var Util = require('./Util.js').Util;
var TokenUtils = require('./TokenUtils.js').TokenUtils;
const { ContentUtils } = require('./ContentUtils.js');
const { DOMDataUtils } = require('./DOMDataUtils.js');
const { DOMUtils } = require('./DOMUtils.js');
const { WTUtils } = require('./WTUtils.js');
const { KV, TagTk, EndTagTk, SelfclosingTagTk, EOFTk, CommentTk } = require('../tokens/TokenTypes.js');
const tu = require('../wt2html/tokenizer.utils.js');

/**
 * @namespace
 */
var PipelineUtils = {
	/**
	 * Creates a dom-fragment-token for processing 'content' (an array of tokens)
	 * in its own subpipeline all the way to DOM. These tokens will be processed
	 * by their own handler (DOMFragmentBuilder) in the last stage of the async
	 * pipeline.
	 *
	 * srcOffsets should always be provided to process top-level page content in a
	 * subpipeline. Without it, DSR computation and template wrapping cannot be done
	 * in the subpipeline. While unpackDOMFragment can do this on unwrapping, that can
	 * be a bit fragile and makes dom-fragments a leaky abstraction by leaking subpipeline
	 * processing into the top-level pipeline.
	 *
	 * @param {Token[]|string} content
	 *   The array of tokens to process.
	 * @param {number[]} srcOffsets
	 *   Wikitext source offsets (start/end) of these tokens.
	 * @param {Object} [opts]
	 *   Parsing options.
	 * @param {Token} opts.contextTok
	 *   The token that generated the content.
	 * @param {boolean} opts.inlineContext
	 *   Is this DOM fragment used in an inline context?
	 * @param {boolean} opts.inPHPBlock
	 *   Is this DOM fragment used inside a "PHP Block"
	 *   FIXME: This primarily exists for backward compatibility
	 *   reasons and is likely to eventually go away.
	 */
	getDOMFragmentToken: function(content, srcOffsets, opts) {
		if (!opts) {
			opts = {};
		}

		return new SelfclosingTagTk('mw:dom-fragment-token', [
			new KV('contextTok', opts.token, tu.expandTsrV(opts.token.dataAttribs.tsr)),
			new KV('content', content, tu.expandTsrV(srcOffsets)),
			new KV('inlineContext',  opts.inlineContext || false),
			new KV('inPHPBLock',  opts.inPHPBLock || false),
		]);
	},

	/**
	 * Processes content (wikitext, array of tokens, whatever) in its own pipeline
	 * based on options.
	 *
	 * @param {MWParserEnvironment} env
	 *    The environment/context for the expansion.
	 *
	 * @param {Object} frame
	 *    The parent frame within which the expansion is taking place.
	 *    This param is mostly defunct now that we are not doing native
	 *    expansion anymore.
	 *
	 * @param {Object} content
	 *    This could be wikitext or single token or an array of tokens.
	 *    How this content is processed depends on what kind of pipeline
	 *    is constructed specified by opts.
	 *
	 * @param {Object} opts
	 *    Processing options that specify pipeline-type, opts, and callbacks.
	 * @param {string} opts.pipelineType
	 * @param {Object} opts.pipelineOpts
	 * @param {Object} [opts.tplArgs]
	 * @param {string} opts.tplArgs.name
	 * @param {Title} opts.tplArgs.title
	 * @param {Array} opts.tplArgs.attribs
	 * @param {Array} [opts.srcOffsets]
	 * @param {Function} [opts.chunkCB]
	 * @param {Function} [opts.endCB]
	 * @param {Function} [opts.documentCB]
	 * @param {boolean} opts.sol
	 */
	processContentInPipeline: function(env, frame, content, opts) {
		// Build a pipeline
		var pipeline = env.pipelineFactory.getPipeline(
			opts.pipelineType,
			opts.pipelineOpts
		);

		// Set frame if necessary
		const srcText = opts.srcText || frame.srcText;
		console.assert(typeof (srcText) === 'string');
		if (opts.tplArgs) {
			pipeline.setFrame(frame, opts.tplArgs.title, opts.tplArgs.attribs, srcText);
		} else {
			pipeline.setFrame(frame, null, [], srcText);
		}

		// Set source offsets for this pipeline's content
		if (opts.srcOffsets) {
			console.assert(Array.isArray(opts.srcOffsets) && opts.srcOffsets.length === 2);
			pipeline.setSourceOffsets(opts.srcOffsets[0], opts.srcOffsets[1]);
		}

		// Set up provided callbacks
		if (opts.chunkCB) {
			pipeline.addListener('chunk', opts.chunkCB);
		}
		if (opts.endCB) {
			pipeline.addListener('end', opts.endCB);
		}
		if (opts.documentCB) {
			pipeline.addListener('document', opts.documentCB);
		}

		// Off the starting block ... ready, set, go!
		pipeline.process(content, opts.sol);
	},

	/**
	 * A promise returning wrapper around processContentInPipeline that
	 * resolves with the docuemnt.
	 *
	 * @param {MWParserEnvironment} env
	 *    The environment/context for the expansion.
	 *
	 * @param {Object} frame
	 *    The parent frame within which the expansion is taking place.
	 *    This param is mostly defunct now that we are not doing native
	 *    expansion anymore.
	 *
	 * @param {Object} content
	 *    This could be wikitext or single token or an array of tokens.
	 *    How this content is processed depends on what kind of pipeline
	 *    is constructed specified by opts.
	 *
	 * @param {Object} opts
	 *    Processing options that specify pipeline-type, opts, and callbacks.
	 * @param {string} opts.pipelineType
	 * @param {Object} opts.pipelineOpts
	 * @param {Object} [opts.tplArgs]
	 * @param {string} opts.tplArgs.name
	 * @param {Array} opts.tplArgs.attribs
	 * @param {Array} [opts.srcOffsets]
	 * @param {boolean} opts.sol
	 * @return {Promise<Document>}
	 */
	promiseToProcessContent: function(env, frame, content, opts, cb) {
		cb = JSUtils.mkPromised(cb);
		// FIXME: refactor these, so that Promise.async style is default
		// (with proper error handling) and the three-callback
		// style is built on top of that.
		PipelineUtils.processContentInPipeline(
			env, frame, content,
			Object.assign({}, opts, {
				// processContentInPipeline has no error callback :(
				documentCB: dom => cb(null, dom),
			})
		);
		return cb.promise;
	},

	/**
	 * Expands value all the way to DOM and pass it back to a callback.
	 *
	 * FIXME: More of the users of `PipelineUtils.promiseToProcessContent` and
	 * `PipelineUtils.processContentInPipeline` could be converted to use this method
	 * if we could find a good way to abstract the different use cases.
	 *
	 * @param {Object} env
	 *    The environment/context for the expansion.
	 *
	 * @param {Object} frame
	 *    The parent frame within which the expansion is taking place.
	 *    This param is mostly defunct now that we are not doing native
	 *    expansion anymore.
	 *
	 * @param {Object[]} v
	 *    The value to process.
	 *    The value is expected to be an object with a "html" property.
	 *    The html property is expanded to DOM only if it is an array (of tokens).
	 *    Non-arrays are passed back unexpanded.
	 *
	 * @param {boolean} expandTemplates
	 *    Should any templates encountered here be expanded
	 *    (usually false for nested templates since they are never directly editable).
	 *
	 * @param {boolean} inTemplate
	 *    Unexpanded templates can occur in the content of extension tags.
	 *
	 * @return {Promise}
	 *    A promise that will be resolved with the expanded values.
	 */
	expandValueToDOM: Promise.async(function *(env, frame, v, expandTemplates, inTemplate) {
		if (Array.isArray(v.html)) {
			// Set up pipeline options
			var opts = {
				pipelineType: 'tokens/x-mediawiki/expanded',
				pipelineOpts: {
					attrExpansion: true,
					inlineContext: true,
					expandTemplates: expandTemplates,
					inTemplate: inTemplate,
				},
				srcOffsets: v.srcOffsets,
				sol: true,
			};
			var content = v.html.concat([new EOFTk()]);
			try {
				var dom = yield PipelineUtils.promiseToProcessContent(
					env, frame, content, opts
				);
				// Since we aren't at the top level, data attrs
				// were not applied in cleanup.  However, tmp
				// was stripped.
				v.html = ContentUtils.ppToXML(dom.body, { innerXML: true });
			} catch (err) {
				env.log('error', 'Expanding values to DOM', err);
			}
		}
		// Remove srcOffsets after value is expanded, so they don't show
		// up in the output data-mw attribute
		v.srcOffsets = undefined;
		return v;
	}),

	/**
	 * See `expandValueToDOM` above.
	 */
	expandValuesToDOM: function(env, frame, vals, expandTemplates, inTemplate) {
		return Promise.all(vals.map((v) => {
			return PipelineUtils.expandValueToDOM(env, frame, v, expandTemplates, inTemplate);
		}));
	},

	/**
	 * @param {Token[]} tokBuf This is where the tokens get stored.
	 */
	convertDOMtoTokens: function(tokBuf, node) {
		function domAttrsToTagAttrs(attrs) {
			var out = [];
			for (var j = 0, m = attrs.length; j < m; j++) {
				var a = attrs.item(j);
				// Not super important since they'll be overwritten in prepareDOM.js
				if (!['data-parsoid', DOMDataUtils.DataObjectAttrName()].includes(a.name)) {
					out.push(new KV(a.name, a.value));
				}
			}
			return { attrs: out, dataAttrs: DOMDataUtils.getDataParsoid(node) };
		}

		switch (node.nodeType) {
			case node.ELEMENT_NODE:
				var nodeName = node.nodeName.toLowerCase();
				var attrInfo = domAttrsToTagAttrs(node.attributes);

				if (Util.isVoidElement(nodeName)) {
					tokBuf.push(new SelfclosingTagTk(nodeName, attrInfo.attrs, attrInfo.dataAttrs));
				} else {
					tokBuf.push(new TagTk(nodeName, attrInfo.attrs, attrInfo.dataAttrs));
					for (var child = node.firstChild; child; child = child.nextSibling) {
						tokBuf = PipelineUtils.convertDOMtoTokens(tokBuf, child);
					}
					var endTag = new EndTagTk(nodeName);
					// Keep stx parity
					if (WTUtils.isLiteralHTMLNode(node)) {
						endTag.dataAttribs = { 'stx': 'html' };
					}
					tokBuf.push(endTag);
				}
				break;

			case node.TEXT_NODE:
				tokBuf = tokBuf.concat(TokenUtils.newlinesToNlTks(node.nodeValue));
				break;

			case node.COMMENT_NODE:
				tokBuf.push(new CommentTk(node.nodeValue));
				break;

			default:
				console.warn("Unhandled node type: " + node.outerHTML);
				break;
		}
		return tokBuf;
	},

	/**
	 * Get tokens representing a DOM forest (from transclusions, extensions,
	 * whatever that were generated as part of a separate processing pipeline)
	 * in the token stream. These tokens will tunnel the subtree through the
	 * token processing while preserving token stream semantics as if
	 * the DOM had been converted to tokens.
	 *
	 * @param {Node[]} nodes List of DOM nodes that need to be tunneled through.
	 * @param {Object} opts
	 *    See encapsulateExpansionHTML's doc. for more info about these options.
	 * @return {Array} List of token representatives.
	 */
	getWrapperTokens: function(nodes, opts) {
		if (!nodes.length) {
			return [new TagTk('span'), new EndTagTk('span')];
		}

		var node = nodes[0];

		// Do we represent this with inline or block elements?
		// This is to ensure that we get p-wrapping correct.
		//
		// * If all content is inline, we use inline-elements to represent this
		//   so that this content gets swallowed into the P tag that wraps
		//   adjacent inline content.
		//
		// * If any part of this is a block content, we treat extension content
		//   independent of surrounding content and don't want inline content
		//   here to be swallowed into a P tag that wraps adjacent inline content.
		//
		// This behavior ensures that we and clients can "drop-in" extension content
		// into the DOM without messing with fixing up paragraph tags of surrounding
		// content. It could potentially introduce minor rendering differences when
		// compared to PHP parser output, but we'll swallow it for now.
		var wrapperType = 'INLINE';
		if (opts.pipelineOpts && (opts.pipelineOpts.inlineContext || opts.pipelineOpts.inPHPBlock)) {
			// If the DOM fragment is being processed in the context where P wrapping
			// has been suppressed, we represent the DOM fragment with inline-tokens.
			//
			// FIXME(SSS): Looks like we have some "impedance mismatch" here. But, this
			// is correct in scenarios where link-content or image-captions are being
			// processed in a sub-pipeline and we don't want a <div> in the link-caption
			// to cause the <a>..</a> to get split apart.
			//
			// Filed as T49963
		} else if (opts.sealFragment) {
			// Sealed fragments aren't amenable to inspection, since the
			// ultimate content is unknown.  For example, refs shuttle content
			// through treebuilding that ends up in the references list.
			//
			// FIXME(arlolra): Do we need a mechanism to specify content
			// categories?
		} else {
			for (var i = 0; i < nodes.length; i++) {
				if (DOMUtils.isBlockNode(nodes[i]) || DOMUtils.hasBlockElementDescendant(nodes[i])) {
					wrapperType = 'BLOCK';
					break;
				}
			}
		}

		var wrapperName;
		if (wrapperType === 'BLOCK' && !DOMUtils.isBlockNode(node)) {
			wrapperName = 'DIV';
		} else if (node.nodeName === 'A') {
			// Do not use 'A' as a wrapper node because it could
			// end up getting nested inside another 'A' and the DOM
			// structure can change where the wrapper tokens are no
			// longer siblings.
			// Ex: "[http://foo.com Bad nesting [[Here]]].
			wrapperName = 'SPAN';
		} else if (['STYLE', 'SCRIPT'].includes(node.nodeName) && nodes.length > 1) {
			// <style>/<script> tags are not fostered, so if we're wrapping
			// more than a single node, they aren't a good representation for
			// the content.  It can lead to fosterable content being inserted
			// in a fosterable position after treebuilding is done, which isn't
			// roundtrippable.
			wrapperName = 'SPAN';
		} else if (!DOMUtils.isElt(node)) {
			wrapperName = 'SPAN';
		} else {
			wrapperName = node.nodeName;
		}

		// Assumed to only be called on nodes that have had data
		// attributes stored.
		if (DOMUtils.isElt(node)) {
			console.assert(!node.hasAttribute(DOMDataUtils.DataObjectAttrName()));
		}

		var workNode;
		if (!DOMUtils.isElt(node)) {
			workNode = node.ownerDocument.createElement(wrapperName);
		} else if (wrapperName !== node.nodeName) {
			// Create a copy of the node without children
			workNode = node.ownerDocument.createElement(wrapperName);
			// Copy over attributes
			for (var j = 0; j < node.attributes.length; j++) {
				var attribute = node.attributes.item(j);
				// "typeof" is ignored since it'll be remove below.
				// "data-parsoid" will be overwritten with `dataAttribs` when
				// the token gets to the tree builder, so skip it.  It's
				// present on the `node` since it has already had its data
				// attributes stored.
				if (!['typeof', 'data-parsoid'].includes(attribute.name)) {
					workNode.setAttribute(attribute.name, attribute.value);
				}
			}
		} else {
			// Shallow clone since we don't want to convert the whole tree
			// to tokens.
			workNode = node.clone(false);

			// dataAttribs are not copied over so that we don't inject
			// broken tsr or dsr values. This also lets these tokens pass
			// through the sanitizer as stx.html is not set.
			//
			// FIXME(arlolra): Presumably, the tsr/dsr portion of the above
			// comment is with respect to the case where the child nodes are
			// being dropped.  The stx part though is suspect considering
			// below where `workNode` is set to the `node` that information
			// would be preserved.
			//
			// We've filed T204279 to look into all this, but for now we'll do
			// the safe thing and preserve dataAttribs where it makes sense.
			//
			// As indicated above, data attributes have already been stored
			// for `node` so we need to peel them off for the purpose of
			// cloning.
			const storedDp = DOMDataUtils.getJSONAttribute(node, 'data-parsoid', {});
			storedDp.tsr = undefined;
			DOMDataUtils.setDataParsoid(workNode, storedDp);
		}

		var tokens = [];
		PipelineUtils.convertDOMtoTokens(tokens, workNode);

		// Remove the typeof attribute from the first token.
		// It will be replaced with mw:DOMFragment.
		tokens[0].removeAttribute('typeof');
		// Remove the about attribute from the first token.
		// We want to be able to distinguish when this wrapper was template
		// annotated.
		tokens[0].removeAttribute('about');

		return tokens;
	},

	/**
	 * Generates wrapper tokens for a HTML expansion -- the wrapper
	 * tokens are placeholders that adequately represent semantics
	 * of the HTML DOM for the purposes of additional token transformations
	 * that will be applied to them.
	 *
	 * @param {MWParserEnvironment} env
	 *    The active environment/context.
	 *
	 * @param {Token} token
	 *    The token that generated the DOM.
	 *
	 * @param {Object} expansion
	 * @param {string} expansion.html
	 *    HTML of the expansion.
	 * @param {Node[]} expansion.nodes
	 *    Outermost nodes of the HTML.
	 *
	 * @param {Object} [opts]
	 * @param {Object} opts.tsr
	 *    The TSR to set on the generated tokens. This TSR is
	 *    used to compute DSR on the placeholder tokens.
	 *    The computed DSR is transferred over to the unpacked DOM
	 *    if setDSR is true (see below).
	 * @param {boolean} opts.setDSR
	 *    When the DOM fragment is unpacked, this option governs
	 *    whether the DSR from the placeholder node is transferred
	 *    over to the unpacked DOM or not.
	 *    For example: Cite, reused transclusions.
	 * @param {boolean} opts.fromCache
	 * @param {Object} opts.pipelineOpts
	 * @param {boolean} opts.sealFragment
	 * @param {string} opts.wrapperName
	 */
	encapsulateExpansionHTML: function(env, token, expansion, opts) {
		opts = opts || {};

		// Get placeholder tokens to get our subdom through the token processing
		// stages. These will be finally unwrapped on the DOM.
		var toks = PipelineUtils.getWrapperTokens(expansion.nodes, opts);
		var firstWrapperToken = toks[0];

		// Add the DOMFragment type so that we get unwrapped later.
		firstWrapperToken.setAttribute('typeof', 'mw:DOMFragment' + (opts.sealFragment ? '/sealed/' + opts.wrapperName : ''));

		// Assign the HTML fragment to the data-parsoid.html on the first wrapper token.
		firstWrapperToken.dataAttribs.html = expansion.html;

		// Pass through setDSR flag
		if (opts.setDSR) {
			if (!firstWrapperToken.dataAttribs.tmp) {
				firstWrapperToken.dataAttribs.tmp = {};
			}
			firstWrapperToken.dataAttribs.tmp.setDSR = opts.setDSR;
		}

		// Pass through fromCache flag
		if (opts.fromCache) {
			if (!firstWrapperToken.dataAttribs.tmp) {
				firstWrapperToken.dataAttribs.tmp = {};
			}
			firstWrapperToken.dataAttribs.tmp.fromCache = opts.fromCache;
		}

		// Transfer the tsr.
		// The first token gets the full width, the following tokens zero width.
		var tokenTsr = opts.tsr || (token.dataAttribs ? token.dataAttribs.tsr : null);
		if (tokenTsr) {
			firstWrapperToken.dataAttribs.tsr = tokenTsr;
			firstWrapperToken.dataAttribs.extTagOffsets = token.dataAttribs ? token.dataAttribs.extTagOffsets : null;
			var endTsr = [tokenTsr[1], tokenTsr[1]];
			for (var i = 1; i < toks.length; i++) {
				toks[i].dataAttribs.tsr = endTsr;
			}
		}

		return toks;
	},

	/**
	 * Wrap text and comment nodes in a node list into spans, so that all
	 * top-level nodes are elements.
	 *
	 * @param {Node[]} nodes List of DOM nodes to wrap, mix of node types.
	 */
	addSpanWrappers: function(nodes) {
		let textCommentAccum = [];
		const doc = nodes[0] && nodes[0].ownerDocument;

		function wrapAccum() {
			// Wrap accumulated nodes in a span
			const span = doc.createElement('span');
			const parentNode = textCommentAccum[0].parentNode;
			parentNode.insertBefore(span, textCommentAccum[0]);
			textCommentAccum.forEach(function(n) {
				span.appendChild(n);
			});
			DOMDataUtils.setDataParsoid(span, { tmp: { wrapper: true } });
			textCommentAccum = [];
		}

		// Build a real array out of nodes.
		//
		// Operating directly on DOM child-nodes array
		// and manipulating them by adding span wrappers
		// changes the traversal itself
		const nodeBuf = Array.from(nodes);

		nodeBuf.forEach(function(node) {
			if (DOMUtils.isText(node) || DOMUtils.isComment(node)) {
				textCommentAccum.push(node);
			} else {
				if (textCommentAccum.length) {
					wrapAccum();
				}
			}
		});

		if (textCommentAccum.length) {
			wrapAccum();
		}
	},

	/**
	 * Convert a HTML5 DOM into a mw:DOMFragment and generate appropriate
	 * tokens to insert into the token stream for further processing.
	 *
	 * The DOMPostProcessor will unpack the fragment and insert the HTML
	 * back into the DOM.
	 *
	 * @param {MWParserEnvironment} env
	 *    The active environment/context.
	 *
	 * @param {Token} token
	 *    The token that generated the DOM.
	 *
	 * @param {Node} body
	 *    The DOM that the token expanded to.
	 *
	 * @param {Object} opts
	 *    Options to be passed onto the encapsulation code
	 *    See encapsulateExpansionHTML's doc. for more info about these options.
	 */
	tunnelDOMThroughTokens: function(env, token, body, opts) {
		console.assert(DOMUtils.isBody(body), 'DOMFragment expected body node.');
		// Get placeholder tokens to get our subdom through the token processing
		// stages. These will be finally unwrapped on the DOM.
		var expansion = PipelineUtils.makeExpansion(env, body.childNodes);
		return PipelineUtils.encapsulateExpansionHTML(env, token, expansion, opts);
	},

	makeExpansion: function(env, nodes) {
		nodes.forEach(function(n) {
			DOMDataUtils.visitAndStoreDataAttribs(n);
		});
		return { nodes: nodes, html: env.setFragment(nodes) };
	},

	/**
	 * Extract transclusion and extension expansions from a DOM, and return
	 * them in a structure like this:
	 * ```
	 *     {
	 *         transclusions: {
	 *             'key1': {
	 *                  html: 'html1',
	 *                  nodes: [<node1>, <node2>]
	 *             }
	 *         },
	 *         extensions: {
	 *             'key2': {
	 *                  html: 'html2',
	 *                  nodes: [<node1>, <node2>]
	 *             }
	 *         },
	 *         files: {
	 *             'key3': {
	 *                  html: 'html3',
	 *                  nodes: [<node1>, <node2>]
	 *             }
	 *         }
	 *     }
	 * ```
	 */
	extractExpansions: function(env, body) {
		var expansions = {
			transclusions: {},
			extensions: {},
			media: {},
		};
		function doExtractExpansions(node) {
			var nodes, expAccum;
			while (node) {
				if (DOMUtils.isElt(node)) {
					var typeOf = node.getAttribute('typeof') || '';
					if ((/(?:^|\s)(?:mw:(?:Transclusion(?=$|\s)|Extension\/))/.test(typeOf) && node.hasAttribute('about')) ||
							/(?:^|\s)(?:mw:(?:Image|Video|Audio)(?:(?=$|\s)|\/))/.test(typeOf)) {
						var dp = DOMDataUtils.getDataParsoid(node);
						var about = node.hasAttribute('about') ?
							node.getAttribute('about') : null;
						nodes = WTUtils.getAboutSiblings(node, about);

						var key;
						if (/(?:^|\s)mw:Transclusion(?=$|\s)/.test(typeOf)) {
							expAccum = expansions.transclusions;
							key = dp.src;
						} else if (/(?:^|\s)mw:Extension\//.test(typeOf)) {
							expAccum = expansions.extensions;
							key = dp.src;
						} else {
							expAccum = expansions.media;
							// XXX gwicke: use proper key that is not
							// source-based? This also needs to work for
							// transclusion output.
							key = null;
						}

						if (key) {
							expAccum[key] = PipelineUtils.makeExpansion(env, nodes);
						}

						node = JSUtils.lastItem(nodes);
					} else {
						doExtractExpansions(node.firstChild);
					}
				}
				node = node.nextSibling;
			}
		}
		// Kick off the extraction
		doExtractExpansions(body.firstChild);
		return expansions;
	},

};

// Expose PipelineUtils on module.exports whenever a `module` object is
// available (i.e. under a CommonJS-style loader).
if (typeof module === "object") {
	module.exports.PipelineUtils = PipelineUtils;
}