/**
* This module assembles parser pipelines from parser stages with
* asynchronous communication between stages based on events. Apart from the
* default pipeline which converts WikiText to HTML DOM, it also provides
* sub-pipelines for the processing of template transclusions.
*
* See http://www.mediawiki.org/wiki/Parsoid and
* http://www.mediawiki.org/wiki/Parsoid/Token_stream_transformations
* for illustrations of the pipeline architecture.
* @module
*/
'use strict';
var Promise = require('../utils/promise.js');
var PegTokenizer = require('./tokenizer.js').PegTokenizer;
var TokenTransformManager = require('./TokenTransformManager.js');
var ExtensionHandler = require('./tt/ExtensionHandler.js').ExtensionHandler;
var NoIncludeOnly = require('./tt/NoIncludeOnly.js');
var QuoteTransformer = require('./tt/QuoteTransformer.js').QuoteTransformer;
var TokenStreamPatcher = require('./tt/TokenStreamPatcher.js').TokenStreamPatcher;
var PreHandler = require('./tt/PreHandler.js').PreHandler;
var ParagraphWrapper = require('./tt/ParagraphWrapper.js').ParagraphWrapper;
var SanitizerHandler = require('./tt/Sanitizer.js').SanitizerHandler;
var TemplateHandler = require('./tt/TemplateHandler.js').TemplateHandler;
var AttributeExpander = require('./tt/AttributeExpander.js').AttributeExpander;
var ListHandler = require('./tt/ListHandler.js').ListHandler;
var WikiLinkHandler = require('./tt/WikiLinkHandler.js').WikiLinkHandler;
var ExternalLinkHandler = require('./tt/ExternalLinkHandler.js').ExternalLinkHandler;
var BehaviorSwitchHandler = require('./tt/BehaviorSwitchHandler.js').BehaviorSwitchHandler;
var LanguageVariantHandler = require('./tt/LanguageVariantHandler.js').LanguageVariantHandler;
var DOMFragmentBuilder = require('./tt/DOMFragmentBuilder.js').DOMFragmentBuilder;
var HTML5TreeBuilder = require('./HTML5TreeBuilder.js').HTML5TreeBuilder;
var DOMPostProcessor = require('./DOMPostProcessor.js').DOMPostProcessor;
var JSUtils = require('../utils/jsutils.js').JSUtils;
var SyncTokenTransformManager = TokenTransformManager.SyncTokenTransformManager;
var AsyncTokenTransformManager = TokenTransformManager.AsyncTokenTransformManager;
var IncludeOnly = NoIncludeOnly.IncludeOnly;
var NoInclude = NoIncludeOnly.NoInclude;
var OnlyInclude = NoIncludeOnly.OnlyInclude;
var ParserPipeline; // forward declaration
var globalPipelineId = 0;
/**
 * Factory for parser pipelines; caches and reuses pipelines per option set.
 * @class
 * @param {MWParserEnvironment} env
 */
function ParserPipelineFactory(env) {
	this.env = env;
	// Reusable pipelines, keyed by the cache key computed in getCacheKey().
	this.pipelineCache = {};
}
/**
 * Recipe for parser pipelines and sub-pipelines, depending on input types.
 *
 * Token stream transformations to register by type and per phase. The
 * possible ranks for individual transformation registrations are [0,1)
 * (excluding 1.0) for sync01, [1,2) for async12 and [2,3) for sync23.
 *
 * NOTE: The order of handlers within each stage is load-bearing; the
 * rank comments record each handler's position within the phase range.
 *
 * Should perhaps be moved to {@link MWParserEnvironment}, so that all
 * configuration can be found in a single place.
 */
ParserPipelineFactory.prototype.recipes = {
	// The full wikitext pipeline: tokenize + expand, then final transforms
	// and DOM building (composed from the two sub-recipes below).
	'text/x-mediawiki/full': [
		// Input pipeline including the tokenizer
		'text/x-mediawiki',
		// Final synchronous token transforms and DOM building / processing
		'tokens/x-mediawiki/expanded',
	],
	// A pipeline from wikitext to expanded tokens. The input pipeline for
	// wikitext.
	'text/x-mediawiki': [
		[ PegTokenizer, [] ],
		'tokens/x-mediawiki',
	],
	// Synchronous per-input and async token stream transformations. Produces
	// a fully expanded token stream ready for consumption by the
	// tokens/expanded pipeline.
	'tokens/x-mediawiki': [
		// Synchronous in-order per input
		[
			SyncTokenTransformManager,
			// [ phaseEndRank, attributeType ] constructor args
			[ 1, 'tokens/x-mediawiki' ],
			[
				// PHASE RANGE: [0,1)
				OnlyInclude, // 0.01
				IncludeOnly, // 0.02
				NoInclude, // 0.03
			],
		],
		/*
		 * Asynchronous out-of-order per input. Each async transform can only
		 * operate on a single input token, but can emit multiple output
		 * tokens. If multiple tokens need to be collected per-input, then a
		 * separate collection transform in sync01 can be used to wrap the
		 * collected tokens into a single one later processed in an async12
		 * transform.
		 */
		[
			AsyncTokenTransformManager,
			[ 2, 'tokens/x-mediawiki' ],
			[
				// PHASE RANGE: [1,2)
				TemplateHandler, // 1.1
				ExtensionHandler, // 1.11
				// Expand attributes after templates to avoid expanding unused branches
				// No expansion of quotes, paragraphs etc in attributes, as in
				// PHP parser- up to text/x-mediawiki/expanded only.
				AttributeExpander, // 1.12
				// now all attributes expanded to tokens or string
				// more convenient after attribute expansion
				WikiLinkHandler, // 1.15
				ExternalLinkHandler, // 1.15
				LanguageVariantHandler, // 1.16
				// This converts dom-fragment-token tokens all the way to DOM
				// and wraps them in DOMFragment wrapper tokens which will then
				// get unpacked into the DOM by a dom-fragment unpacker.
				DOMFragmentBuilder, // 1.99
			],
		],
	],
	// Final stages of main pipeline, operating on fully expanded tokens of
	// potentially mixed origin.
	'tokens/x-mediawiki/expanded': [
		// Synchronous in-order on fully expanded token stream (including
		// expanded templates etc). In order to support mixed input (from
		// wikitext and plain HTML, say) all applicable transforms need to be
		// included here. Input-specific token types avoid any runtime
		// overhead for unused transforms.
		[
			SyncTokenTransformManager,
			// PHASE RANGE: [2,3)
			[ 3, 'tokens/x-mediawiki/expanded' ],
			[
				TokenStreamPatcher, // 2.001 -- 2.003
				// add <pre>s
				PreHandler, // 2.051 -- 2.054
				QuoteTransformer, // 2.1
				// add before transforms that depend on behavior switches
				// examples: toc generation, edit sections
				BehaviorSwitchHandler, // 2.14
				ListHandler, // 2.49
				SanitizerHandler, // 2.90, 2.91
				// Wrap tokens into paragraphs post-sanitization so that
				// tags that converted to text by the sanitizer have a chance
				// of getting wrapped into paragraphs. The sanitizer does not
				// require the existence of p-tags for its functioning.
				ParagraphWrapper, // 2.95 -- 2.97
			],
		],
		// Build a tree out of the fully processed token stream
		[ HTML5TreeBuilder, [] ],
		/*
		 * Final processing on the HTML DOM.
		 */
		/*
		 * Generic DOM transformer.
		 * This performs a lot of post-processing of the DOM
		 * (Template wrapping, broken wikitext/html detection, etc.)
		 */
		[ DOMPostProcessor, [] ],
	],
};
// The complete set of option keys accepted by the pipeline-creating entry
// points (makePipeline / getPipeline). Unknown keys are rejected by
// defaultOptions; most of these also contribute to the pipeline cache key.
var supportedOptions = new Set([
	// Expand the contents of templates found in the input
	'expandTemplates',
	// Set while the pipeline processes the expanded content of a template
	// or of template arguments
	'inTemplate',
	// Set when we are in an <includeonly> context
	// (in current usage, isInclude === inTemplate)
	'isInclude',
	// The extension tag being processed (ex: ref, references);
	// in current usage, only used for native tag implementations
	'extTag',
	// Extension-specific options
	'extTagOpts',
	// The content being parsed is used in an inline context
	'inlineContext',
	// FIXME: Related to PHP parser doBlockLevels side effect.
	// Primarily exists for backward compatibility reasons.
	// Might eventually go away in favor of something else.
	'inPHPBlock',
	// Set while processing the content of attributes
	// (in current usage, transcluded attribute keys/values)
	'attrExpansion',
]);
/**
 * Normalize pipeline options: validate the supplied keys against
 * supportedOptions and fill in defaults for `isInclude` and
 * `expandTemplates`.
 *
 * Operates on a shallow copy so the caller's options object is never
 * mutated (every caller in this file uses the returned object).
 *
 * @param {Object} [options]
 * @return {Object} The normalized options object.
 */
var defaultOptions = function(options) {
	// Shallow-copy to avoid mutating the caller's object.
	options = options ? Object.assign({}, options) : {};
	Object.keys(options).forEach(function(k) {
		// These keys also feed getCacheKey, so reject anything unknown.
		console.assert(supportedOptions.has(k), 'Invalid pipeline option: ' + k);
	});
	// default: not an include context
	if (options.isInclude === undefined) {
		options.isInclude = false;
	}
	// default: wrap templates
	if (options.expandTemplates === undefined) {
		options.expandTemplates = true;
	}
	return options;
};
/**
 * Generic pipeline creation from the above recipes.
 *
 * @param {string} type - A key of {@link ParserPipelineFactory#recipes}.
 * @param {Object} [options] - Pipeline options (see supportedOptions).
 * @return {ParserPipeline}
 * @throws {Error} If no recipe exists for the requested type.
 */
ParserPipelineFactory.prototype.makePipeline = function(type, options) {
	// SSS FIXME: maybe there is some built-in method for this already?
	options = defaultOptions(options);
	var recipe = this.recipes[type];
	if (!recipe) {
		console.trace();
		// Throw a real Error (not a bare string) so callers get a stack trace.
		throw new Error('Error while trying to construct pipeline for ' + type);
	}
	var stages = [];
	for (var i = 0, l = recipe.length; i < l; i++) {
		// create the stage
		var stageData = recipe[i];
		var stage;
		if (stageData.constructor === String) {
			// Points to another subpipeline, get it recursively
			// Clone options object and clear cache type
			var newOpts = Object.assign({}, options);
			stage = this.makePipeline(stageData, newOpts);
		} else {
			// stageData = [ StageClass, [ctorArg1, ctorArg2], transforms? ]
			console.assert(stageData[1].length <= 2);
			stage = new (stageData[0])(this.env, options, this, stageData[1][0], stageData[1][1]);
			if (stageData.length >= 3) {
				// FIXME: This code here adds the 'transformers' property to every stage
				// behind the back of that stage. There are two alternatives to this:
				//
				// 1. Add 'recordTransformer' and 'getTransformers' functions to every stage.
				//    But, seems excessive compared to current approach where the stages
				//    aren't concerned with unnecessary details of state maintained for
				//    the purposes of top-level orchestration.
				// 2. Alternatively, we could also maintain this information as a separate
				//    object rather than tack it onto '.transformers' property of each stage.
				stage.transformers = [];
				// Create (and implicitly register) transforms
				var transforms = stageData[2];
				for (var j = 0; j < transforms.length; j++) {
					const T = transforms[j];
					stage.transformers.push(new T(stage, options));
				}
			}
		}
		// connect with previous stage
		if (i) {
			stage.addListenersOn(stages[i - 1]);
		}
		stages.push(stage);
	}
	return new ParserPipeline(
		type,
		stages,
		this.env
	);
};
/**
 * Compute the cache key under which pipelines with the given options are
 * pooled. Every option that changes pipeline behavior contributes a
 * distinct '::'-separated segment.
 *
 * @param {string} cacheKey - Base key (usually the pipeline type).
 * @param {Object} options - Normalized pipeline options.
 * @return {string}
 */
function getCacheKey(cacheKey, options) {
	var segments = [cacheKey || ''];
	if (!options.isInclude) {
		segments.push('noInclude');
	}
	if (!options.expandTemplates) {
		segments.push('noExpand');
	}
	if (options.inlineContext) {
		segments.push('inlineContext');
	}
	if (options.inPHPBlock) {
		segments.push('inPHPBlock');
	}
	if (options.inTemplate) {
		segments.push('inTemplate');
	}
	if (options.attrExpansion) {
		segments.push('attrExpansion');
	}
	if (options.extTag) {
		segments.push(options.extTag);
		// FIXME: This is not the best strategy. But, instead of
		// premature complexity, let us see how extensions want to
		// use this and then figure out what constraints are needed.
		if (options.extTagOpts) {
			segments.push(JSON.stringify(options.extTagOpts));
		}
	}
	return segments.join('::');
}
/**
 * Parse a wikitext source string through the full pipeline.
 *
 * @param {string} src
 * @param {Function} [cb] - Optional node-style callback.
 * @return {Promise} Resolves with the parsed document.
 */
ParserPipelineFactory.prototype.parse = function(src, cb) {
	var p = new Promise((resolve) => {
		// Kick off the actual parse; the top-level pipeline emits a
		// single 'document' event when DOM processing is complete.
		var pipeline = this.getPipeline('text/x-mediawiki/full');
		pipeline.once('document', resolve);
		pipeline.processToplevelDoc(src);
	});
	return p.nodify(cb);
};
/**
 * Get a subpipeline (not the top-level one) of a given type.
 *
 * Subpipelines are cached (per cache key) as they are frequently created.
 *
 * @param {string} type - Recipe key.
 * @param {Object} [options] - Pipeline options (see supportedOptions).
 * @return {ParserPipeline}
 */
ParserPipelineFactory.prototype.getPipeline = function(type, options) {
	options = defaultOptions(options);
	var cacheKey = getCacheKey(type, options);
	var cached = this.pipelineCache[cacheKey];
	if (!cached) {
		cached = this.pipelineCache[cacheKey] = [];
	}
	var pipe;
	if (cached.length === 0) {
		pipe = this.makePipeline(type, options);
	} else {
		// Reuse a pooled pipeline after scrubbing its state and listeners.
		pipe = cached.pop();
		pipe.resetState();
		// Clear both 'end' and 'document' handlers
		pipe.removeAllListeners('end');
		pipe.removeAllListeners('document');
		// Also remove chunk listeners, although ideally that would already
		// happen in resetState. We'd need to avoid doing so when called from
		// processToplevelDoc though, so lets do it here for now.
		pipe.removeAllListeners('chunk');
	}
	// Return the pipeline to the cache once it finishes processing.
	var recycle = () => this.returnPipeline(cacheKey, pipe);
	// Token pipelines emit an 'end' event; document pipelines emit a
	// final 'document' event instead.
	pipe.addListener('end', recycle);
	pipe.addListener('document', recycle);
	// Debugging aid: assign a unique id to the pipeline
	pipe.setPipelineId(globalPipelineId++);
	return pipe;
};
/**
 * Callback called by a pipeline at the end of its processing. Returns the
 * pipeline to the cache so it can be reused by getPipeline().
 *
 * @param {string} cacheKey - Key computed by getCacheKey().
 * @param {ParserPipeline} pipe
 */
ParserPipelineFactory.prototype.returnPipeline = function(cacheKey, pipe) {
	var cache = this.pipelineCache[cacheKey] || (this.pipelineCache[cacheKey] = []);
	// Cap the pool size per key so the cache cannot grow without bound.
	if (cache.length < 100) {
		cache.push(pipe);
	}
};
/* ******************* ParserPipeline *************************** */
/**
 * Wrap some stages into a pipeline. The last member of the pipeline is
 * supposed to emit events, while the first is supposed to support a process()
 * method that sets the pipeline in motion.
 * @class
 */
ParserPipeline = function(type, stages, env) {
	Object.assign(this, {
		pipeLineType: type,
		stages: stages,
		first: stages[0],               // receives input via process()
		last: JSUtils.lastItem(stages), // emits the pipeline's output events
		env: env,
	});
};
/**
 * Applies the named function (with the given argument list) across all
 * stages, and across all transformers registered at each stage.
 *
 * @param {string} fn - Method name to invoke where implemented.
 * @param {Array} args - Arguments to pass through.
 * @private
 */
ParserPipeline.prototype._applyToStage = function(fn, args) {
	var invoke = function(target) {
		// Only call fn on targets that actually implement it.
		if (target[fn] && target[fn].constructor === Function) {
			target[fn].apply(target, args);
		}
	};
	this.stages.forEach(function(stage) {
		invoke(stage);
		if (stage.transformers) {
			stage.transformers.forEach(invoke);
		}
	});
};
/**
 * Assign a debugging id to this pipeline and broadcast it to every stage
 * and transformer that implements setPipelineId.
 * @param {number} id
 */
ParserPipeline.prototype.setPipelineId = function(id) {
	this.id = id;
	this._applyToStage("setPipelineId", [id]);
};
/**
 * Reset state in all stages and transformers of this pipeline.
 *
 * This is primarily required to reset native extensions
 * which might be shared globally per parsing environment
 * (unlike pipeline stages and transformers that exist one per
 * pipeline). So, we cannot rely on the 'end' event to reset pipeline
 * state, because there will be one 'end' event per pipeline.
 *
 * Ex: cite needs to maintain a global sequence across all
 * template transclusion pipelines, extension, and top-level
 * pipelines.
 *
 * This lets us reuse pipelines to parse unrelated top-level pages
 * Ex: parser tests. Currently only parser tests exercise
 * this functionality.
 *
 * @param {Object} [opts] - Passed through to each stage's resetState
 *   (ex: { toplevel: true } from processToplevelDoc).
 */
ParserPipeline.prototype.resetState = function(opts) {
	this._applyToStage("resetState", [opts]);
};
/**
 * Set source offsets for the source that this pipeline will process.
 *
 * This lets us use different pipelines to parse fragments of the same page
 * Ex: extension content (found on the same page) is parsed with a different
 * pipeline than the top-level page.
 *
 * Because of this, the source offsets are not always [0, page.length)
 * and need to be explicitly initialized.
 *
 * @param {number} start - Offset of the fragment's start in the source.
 * @param {number} end - Offset of the fragment's end in the source.
 */
ParserPipeline.prototype.setSourceOffsets = function(start, end) {
	this._applyToStage("setSourceOffsets", [start, end]);
};
/**
 * Feed input tokens to the first pipeline stage.
 *
 * @param {Array|string} input - Tokens (or source text) to process.
 * @param {boolean} sol - Whether tokens should be processed in start-of-line
 *   context.
 * @return {any} Whatever the first stage's process() returns, or undefined
 *   if processing threw (the error is routed to the fatal logger).
 */
ParserPipeline.prototype.process = function(input, sol) {
	try {
		return this.first.process(input, sol);
	} catch (err) {
		// Errors from any stage are reported through the environment's
		// fatal logger rather than propagated to the caller.
		this.env.log("fatal", err);
	}
};
/**
 * Feed a top-level document's input to the first pipeline stage,
 * resetting per-document state first.
 *
 * @param {Array|string} input - Top-level source/tokens to process.
 */
ParserPipeline.prototype.processToplevelDoc = function(input) {
	// Reset pipeline state once per top-level doc.
	// This clears state from any per-doc global state
	// maintained across all pipelines used by the document.
	// (Ex: Cite state)
	this.resetState({ toplevel: true });
	// Record the parse start time once (used by the trace/time log below).
	if (!this.env.startTime) {
		this.env.startTime = JSUtils.startTime();
	}
	this.env.log('trace/time', 'Starting parse at ', this.env.startTime);
	// Top-level documents always start in start-of-line context.
	this.process(input, /* sol */true);
};
/**
 * Broadcast frame information to every stage and transformer that
 * implements setFrame. (NOTE(review): despite earlier docs saying this
 * targets only the last stage, _applyToStage applies it to all stages;
 * the AsyncTokenTransformManager is presumably the main consumer.)
 */
ParserPipeline.prototype.setFrame = function(frame, title, args, srcText) {
	return this._applyToStage("setFrame", [frame, title, args, srcText]);
};
/**
 * Register the first pipeline stage with the last stage from a separate
 * pipeline (used to chain sub-pipelines together).
 * @param {Object} stage - The emitting stage to listen on.
 */
ParserPipeline.prototype.addListenersOn = function(stage) {
	return this.first.addListenersOn(stage);
};
// Forward the EventEmitter API to this.last: the final stage is the one
// that emits this pipeline's output events ('chunk', 'end', 'document'),
// so listener management is delegated to it wholesale.
ParserPipeline.prototype.on = function(ev, cb) {
	return this.last.on(ev, cb);
};
ParserPipeline.prototype.once = function(ev, cb) {
	return this.last.once(ev, cb);
};
ParserPipeline.prototype.addListener = function(ev, cb) {
	return this.last.addListener(ev, cb);
};
ParserPipeline.prototype.removeListener = function(ev, cb) {
	return this.last.removeListener(ev, cb);
};
ParserPipeline.prototype.setMaxListeners = function(n) {
	return this.last.setMaxListeners(n);
};
ParserPipeline.prototype.listeners = function(ev) {
	return this.last.listeners(ev);
};
ParserPipeline.prototype.removeAllListeners = function(event) {
	this.last.removeAllListeners(event);
};
// Export the factory when running under CommonJS (the typeof guard keeps
// this harmless in environments without a module object).
if (typeof module === "object") {
	module.exports.ParserPipelineFactory = ParserPipelineFactory;
}