/**
* State object for the wikitext serializers.
*
* Here is what the state attributes mean:
*
* rtTestMode
* - Are we currently running round-trip tests? If yes, then we know
* there won't be any edits and we more aggressively try to use original
* source and source flags during serialization since this is a test of
* Parsoid's efficacy in preserving information.
*
* sep
* - Separator information:
* - constraints: min/max number of newlines
* - text: collected separator text from DOM text/comment nodes
* - lastSourceNode: Seems to be bookkeeping to make sure we don't reuse
* original separators when `emitChunk` is called
* consecutively on the same node. However, it also
* differs from `state.prevNode` in that it only gets
* updated when a node calls `emitChunk` so that nodes
* serializing `justChildren` don't mix up `buildSep`.
*
* onSOL
* - Is the serializer at the start of a new wikitext line?
*
* atStartOfOutput
* - True when wts kicks off, false after the first char has been output
*
* inLink
* - Is the serializer currently handling link content (children of `<a>`)?
*
* inIndentPre
* - Is the serializer currently handling indent-pre tags?
*
* inPHPBlock
* - Is the serializer currently handling a tag that the PHP parser
* treats as a block tag?
*
* inAttribute
* - Is the serializer being invoked recursively to serialize a
* template-generated attribute (via `WSP.getAttributeValue`'s
* template handling). If so, we should suppress some
* serialization escapes, like autolink protection, since
* these are not valid for attribute values.
*
* hasIndentPreNowikis
* - Did we introduce nowikis for indent-pre protection?
* If yes, we might run a post-pass to strip useless ones.
*
* hasQuoteNowikis
* - Did we introduce nowikis to preserve quote semantics?
* If yes, we might run a post-pass to strip useless ones.
*
* hasSelfClosingNowikis
* - Did we introduce `<nowiki />`s?
* If yes, we do a post-pass to remove unnecessary trailing ones.
*
* hasHeadingEscapes
* - Did we introduce nowikis around `=.*=` text?
* If yes, we do a post-pass to remove unnecessary escapes.
*
* wikiTableNesting
* - Records the nesting level of wikitext tables
*
* wteHandlerStack
* - Stack of wikitext escaping handlers -- these handlers are responsible
* for smart escaping when the surrounding wikitext context is known.
*
* currLine
* - This object is used by the wikitext escaping algorithm -- represents
* a "single line" of output wikitext as represented by a block node in
* the DOM.
* - firstNode: first DOM node processed on this line
* - text: output so far from all nodes on the current line
* - chunks: list of ConstrainedText chunks comprising the current line
*
* singleLineContext
* - Stack used to enforce single-line context
*
* redirectText
* - Text to be emitted at the start of file, for redirects
* @module
*/
'use strict';
require('../../core-upgrade.js');
const semver = require('semver');
const { ConstrainedText } = require('./ConstrainedText.js');
const { DOMDataUtils } = require('../utils/DOMDataUtils.js');
const { DOMUtils } = require('../utils/DOMUtils.js');
const { JSUtils } = require('../utils/jsutils.js');
const Promise = require('../utils/promise.js');
const { Util } = require('../utils/Util.js');
const { WTSUtils } = require('./WTSUtils.js');
const { WTUtils } = require('../utils/WTUtils.js');
// Default values for a fresh SerializerState. The SerializerState constructor
// clones this object and merges the clone into the new instance, so the
// frozen original below is never shared between serializer runs.
const initialState = {
// Starts out true; initMode() forces it to false when running in selser mode.
rtTestMode: true,
// Pending separator bookkeeping (see `sep` in the file header); emitSep()
// replaces it with a fresh object after every emitted separator.
sep: {},
// Serialization begins at the start of a line.
onSOL: true,
// When true, emitChunk() wikitext-escapes the next chunk and then clears the flag.
escapeText: false,
atStartOfOutput: true, // SSS FIXME: Can this be done away with in some way?
inIndentPre: false,
inPHPBlock: false,
inAttribute: false,
// Flags recording which kinds of nowiki escapes were introduced
// (see the file header for the post-passes they gate).
hasIndentPreNowikis: false,
hasSelfClosingNowikis: false,
hasQuoteNowikis: false,
hasHeadingEscapes: false,
redirectText: null,
wikiTableNesting: 0,
// serializeChildrenG() pushes/pops wikitext escaping handlers here.
wteHandlerStack: [],
// XXX: replace with output buffering per line
// Placeholder only: the constructor calls resetCurrLine() right after merging.
currLine: null,
// Accumulated output; flushLine() appends each escaped line to it.
out: '',
// Prefix used by emitChunk() when logging output; temporarily switched to
// 'OUT(C):' while children are being serialized to a string.
logPrefix: 'OUT:',
};
// Make sure the initialState is never modified
JSUtils.deepFreeze(initialState);
/**
 * Tracks, as a stack of booleans, whether serialization is currently
 * required to stay on a single line.
 *
 * Each `enforce()` / `disable()` call opens a nested scope that must be
 * closed again with `pop()`; the innermost open scope decides the answer
 * given by `enforced()`.
 * @class
 */
class SingleLineContext {
	constructor() {
		this._stack = [];
	}

	/** Open a scope in which single-line output is required. */
	enforce() {
		this._stack.push(true);
	}

	/**
	 * @return {boolean} Whether the innermost open scope requires single-line
	 *   output; `false` when no scope is open.
	 */
	enforced() {
		if (this._stack.length === 0) {
			return false;
		}
		return JSUtils.lastItem(this._stack);
	}

	/** Open a scope in which single-line output is not required. */
	disable() {
		this._stack.push(false);
	}

	/** Close the innermost scope opened by `enforce()` or `disable()`. */
	pop() {
		this._stack.pop();
	}
}
/**
 * Mutable state shared by the wikitext serializer and its DOM handlers while
 * a document is being serialized: the pending separator, the chunks buffered
 * for the current output line, the accumulated output string, and the various
 * context flags described in the header of this file.
 * @class
 */
class SerializerState {
	/**
	 * @param {Object} serializer The serializer that owns this state; its `env`
	 *   is cached on the state as `this.env`.
	 * @param {Object} options Per-run settings merged into the state together
	 *   with a clone of `initialState`.
	 */
	constructor(serializer, options) {
		this.env = serializer.env;
		this.serializer = serializer;
		// Make sure options and initialState are cloned,
		// so we don't alter the initial state for later serializer runs.
		Util.extendProps(this, Util.clone(options), Util.clone(initialState));
		this.resetCurrLine(null);
		this.singleLineContext = new SingleLineContext();
	}

	/**
	 * Set the mode flags for this serialization run.
	 *
	 * Whitespace heuristics are switched on when the input content version is
	 * at least 1.7.0, and round-trip test mode is switched off whenever selser
	 * mode is active.
	 *
	 * @param {boolean} [selserMode] Whether selective serialization is in use;
	 *   any falsy value is stored as `false`.
	 */
	initMode(selserMode) {
		this.useWhitespaceHeuristics = semver.gte(this.env.inputContentVersion, '1.7.0');
		this.selserMode = selserMode || false;
		this.rtTestMode = this.rtTestMode &&
			!this.selserMode; // Always false in selser mode.
	}

	/**
	 * Appends the separator source and updates the SOL state if necessary.
	 *
	 * @param {string} src Separator text to add to `this.sep.src`.
	 */
	appendSep(src) {
		this.sep.src = (this.sep.src || '') + src;
		this.sepIntroducedSOL(src);
	}

	/**
	 * Cycle the state after processing a node: remember it as the last node
	 * for which separator handling took place.
	 *
	 * @param {Node} node
	 */
	updateSep(node) {
		this.sep.lastSourceNode = node;
	}

	/**
	 * Reset the current line state.
	 *
	 * @param {Node|null} node First DOM node of the new line.
	 */
	resetCurrLine(node) {
		this.currLine = {
			text: '',
			chunks: [],
			firstNode: node,
		};
	}

	/**
	 * Pass the chunks buffered for the current line through
	 * `ConstrainedText.escapeLine`, append the result to `this.out`, and empty
	 * the chunk buffer in place.
	 */
	flushLine() {
		this.out += ConstrainedText.escapeLine(this.currLine.chunks);
		this.currLine.chunks.length = 0;
	}

	/**
	 * Extracts a subset of the page source bound by the supplied indices.
	 * Only expected to be called in selser mode (asserted).
	 *
	 * @param {number} start
	 * @param {number} end
	 * @return {string|null} The source substring, or `null` unless
	 *   `start <= end` holds.
	 */
	getOrigSrc(start, end) {
		console.assert(this.selserMode);
		return start <= end ? this.env.page.src.substring(start, end) : null;
	}

	/**
	 * Shift the "unmodified" bookkeeping forward by one node: the current
	 * node's flag becomes the previous node's flag, the current flag is
	 * cleared, and `node` is recorded as the previous node.
	 *
	 * @param {Node} node
	 */
	updateModificationFlags(node) {
		this.prevNodeUnmodified = this.currNodeUnmodified;
		this.currNodeUnmodified = false;
		this.prevNode = node;
	}

	/**
	 * Separators put us in SOL state: if `sep`, with comments removed, ends
	 * in a newline, mark the serializer as being at the start of a line.
	 *
	 * @param {string} sep
	 */
	sepIntroducedSOL(sep) {
		// Don't get tripped by newlines in comments! Be wary of nowikis added
		// by makeSepIndentPreSafe on the last line.
		if (sep.replace(Util.COMMENT_REGEXP_G, '').search(/\n$/) !== -1) {
			// Since we are stashing away newlines for emitting
			// before the next element, we are in SOL state wrt
			// the content of that next element.
			//
			// FIXME: The only serious caveat is if all these newlines
			// will get stripped out in the context of any parent node
			// that suppress newlines (ex: <li> nodes that are forcibly
			// converted to non-html wikitext representation -- newlines
			// will get suppressed in those context). We currently don't
			// handle arbitrary HTML which cause these headaches. And,
			// in any case, we might decide to emit such HTML as native
			// HTML to avoid these problems. To be figured out later when
			// it is a real issue.
			this.onSOL = true;
		}
	}

	/**
	 * Accumulates chunks on the current line.
	 *
	 * @param {ConstrainedText} text Chunk to buffer (asserted to be a
	 *   ConstrainedText).
	 * @param {Node} node Not used by this method; callers pass the node the
	 *   chunk originated from.
	 */
	pushToCurrLine(text, node) {
		console.assert(text instanceof ConstrainedText);
		this.currLine.chunks.push(text);
	}

	/**
	 * Pushes the separator to the current line and resets the separator state.
	 *
	 * @param {string|ConstrainedText} sep Separator to emit.
	 * @param {Node} node Node the separator precedes; becomes
	 *   `sep.lastSourceNode` of the fresh separator state.
	 * @param {string} debugPrefix Label used in the log line.
	 */
	emitSep(sep, node, debugPrefix) {
		sep = ConstrainedText.cast(sep, node);

		// Replace newlines if we're in a single-line context
		if (this.singleLineContext.enforced()) {
			sep.text = sep.text.replace(/\n/g, ' ');
		}

		this.pushToCurrLine(sep, node);

		// Reset separator state
		this.sep = {};
		this.updateSep(node);

		this.sepIntroducedSOL(sep.text);

		this.env.log(this.serializer.logType,
			"--->", debugPrefix,
			() => JSON.stringify(sep.text));
	}

	/**
	 * Determines if we can use the original separator for this node or if we
	 * need to build one based on its constraints, and then emits it.
	 *
	 * The following comment applies to `origSepUsable` but is placed outside the
	 * function body since character count (including comments) can prevent
	 * inlining in older versions of v8 (node < 8.3).
	 *
	 * ---
	 *
	 * When block nodes are deleted, the deletion affects whether unmodified
	 * newline separators between a pair of unmodified P tags can be reused.
	 *
	 * Example:
	 * ```
	 * Original WT  : "<div>x</div>foo\nbar"
	 * Original HTML: "<div>x</div><p>foo</p>\n<p>bar</p>"
	 * Edited HTML  : "<p>foo</p>\n<p>bar</p>"
	 * Annotated DOM: "<mw:DiffMarker is-block><p>foo</p>\n<p>bar</p>"
	 * Expected WT  : "foo\n\nbar"
	 * ```
	 *
	 * Note the additional newline between "foo" and "bar" even though originally,
	 * there was just a single newline.
	 *
	 * So, even though the two P tags and the separator between them is
	 * unmodified, it is insufficient to rely on just that. We have to look at
	 * what has happened on the two wikitext lines onto which the two P tags
	 * will get serialized.
	 *
	 * Now, if you check the code for `nextToDeletedBlockNodeInWT`, that code is
	 * not really looking at ALL the nodes before/after the nodes that could
	 * serialize onto the wikitext lines. It is looking at the immediately
	 * adjacent nodes, i.e. it is not necessary to look if a block-tag was
	 * deleted 2 or 5 siblings away. If we had to actually examine all of those,
	 * nodes, this would get very complex, and it would be much simpler to just
	 * discard the original separators => potentially lots of dirty diffs.
	 *
	 * To understand why it is sufficient (for correctness) to examine just
	 * the immediately adjacent nodes, let us look at an additional example.
	 * ```
	 * Original WT  : "a<div>b</div>c<div>d</div>e\nf"
	 * Original HTML: "<p>a</p><div>b</div><p>c</p><div>d</div><p>e</p>\n<p>f</p>"
	 * ```
	 * Note how `<block>` tags and `<p>` tags interleave in the HTML. This would be
	 * the case always no matter how much inline content showed up between the
	 * block tags in wikitext. If the b-`<div>` was deleted, we don't care
	 * about it, since we still have the d-`<div>` before the P tag that preserves
	 * the correctness of the single `"\n"` separator. If the d-`<div>` was deleted,
	 * we conservatively ignore the original separator and let normal P-P constraints
	 * take care of it. At worst, we might generate a dirty diff in this scenario.
	 *
	 * @param {Node} node Node whose preceding separator is to be emitted.
	 */
	emitSepForNode(node) {
		// Being asked again for the node we last handled a separator for
		// rules out reusing the original separator.
		const again = (node === this.sep.lastSourceNode);
		const origSepUsable = !again &&
			this.prevNodeUnmodified && !WTSUtils.nextToDeletedBlockNodeInWT(this.prevNode, true) &&
			this.currNodeUnmodified && !WTSUtils.nextToDeletedBlockNodeInWT(node, false);

		let origSep = null;
		if (origSepUsable) {
			if (DOMUtils.isElt(this.prevNode) && DOMUtils.isElt(node)) {
				// Both neighbours are elements: take the source between the
				// end of the previous node's dsr and the start of this one's.
				origSep = this.getOrigSrc(
					DOMDataUtils.getDataParsoid(this.prevNode).dsr[1],
					DOMDataUtils.getDataParsoid(node).dsr[0]
				);
			} else {
				origSep = this.sep.src || null;
			}
		}

		if (origSep !== null && WTSUtils.isValidSep(origSep)) {
			this.emitSep(origSep, node, 'ORIG-SEP:');
		} else {
			const sep = this.serializer.buildSep(node);
			this.emitSep(sep || '', node, 'SEP:');
		}
	}

	/**
	 * Pushes the chunk to the current line, emitting its separator first,
	 * starting a new buffered line when at SOL, and applying wikitext escaping
	 * where required.
	 *
	 * @param {string|ConstrainedText} res Chunk to emit.
	 * @param {Node} node DOM node the chunk was produced for.
	 */
	emitChunk(res, node) {
		res = ConstrainedText.cast(res, node);

		// Replace newlines if we're in a single-line context
		if (this.singleLineContext.enforced()) {
			res.text = res.text.replace(/\n/g, ' ');
		}

		// Emit separator first
		// (separators are skipped for internal tokens from selser)
		if (!res.noSep) {
			this.emitSepForNode(node);
		}

		if (this.onSOL) {
			// process escapes in our full line
			this.flushLine();
			this.resetCurrLine(node);
		}

		// Escape 'res' if necessary
		if (this.escapeText) {
			res = new ConstrainedText({
				text: this.serializer.wteHandlers.escapeWikiText(this, res.text, {
					node: node,
					isLastChild: DOMUtils.nextNonDeletedSibling(node) === null,
				}),
				prefix: res.prefix,
				suffix: res.suffix,
				node: res.node,
			});
			// The flag applies to a single chunk only.
			this.escapeText = false;
		} else {
			// If 'res' is coming from selser and the current node is a paragraph tag,
			// check if 'res' might need some leading chars nowiki-escaped before being output.
			// Because of block-tag p-wrapping behavior, sol-sensitive characters that used to
			// be in non-sol positions, but yet wrapped in p-tags, could end up in sol-position
			// if those block tags get deleted during edits.
			//
			// Ex: a<div>foo</div>*b
			// -- wt2html --> <p>a</p><div>foo</div><p>*b</p>
			// --   EDIT  --> <p>a</p><p>*b</p>
			// -- html2wt --> a\n\n<nowiki>*</nowiki>b
			//
			// In this scenario, <p>a</p> and <p>*b</p> will be marked
			// unmodified and will be processed below.
			if (this.selserMode
				&& this.onSOL
				&& this.currNodeUnmodified
				// 'node' came from original Parsoid HTML unmodified. So, if its content
				// needs nowiki-escaping, we know that the reason it didn't parse into
				// lists/headings/whatever is because it didn't occur at the start of the
				// line => it had a block-tag in the original wikitext. So if the previous
				// node was also unmodified (and since it also came from original Parsoid
				// HTML), we can safely infer that it couldn't have been an inline node or
				// a P-tag (if it were, the p-wrapping code would have swallowed that content
				// into 'node'). So, it would have to be some sort of block tag => this.onSOL
				// couldn't have been true (because we could have serialized 'node' on the
				// same line as the block tag) => we can save some effort by eliminating
				// scenarios where 'this.prevNodeUnmodified' is true.
				&& !this.prevNodeUnmodified
				&& node.nodeName === 'P' && !WTUtils.isLiteralHTMLNode(node)
			) {
				const pChild = DOMUtils.firstNonSepChild(node);
				// If a text node, we have to make sure that the text doesn't
				// get reparsed as non-text in the wt2html pipeline.
				if (pChild && DOMUtils.isText(pChild)) {
					const solWikitextRE = JSUtils.rejoin(
						'^((?:',
						this.env.conf.wiki.solTransparentWikitextNoWsRegexp,
						'|',
						// SSS FIXME: What about onlyinclude and noinclude?
						/<includeonly>.*?<\/includeonly>/,
						')*)',
						/([ \*#:;{\|!=].*)$/
					);
					// Note that res is a ConstrainedText, not a string
					const match = res.match(solWikitextRE);
					if (match && match[2]) {
						if (/^([\*#:;]|{\||.*=$)/.test(match[2]) ||
							// ! and | chars are harmless outside tables
							(/^[\|!]/.test(match[2]) && this.wikiTableNesting > 0) ||
							// indent-pres are suppressed inside <blockquote>
							(/^ [^\s]/.test(match[2]) && !DOMUtils.hasAncestorOfName(node, 'BLOCKQUOTE'))) {
							// Wrap only the first sol-sensitive char in a nowiki.
							res = ConstrainedText.cast((match[1] || '') +
								'<nowiki>' + match[2][0] + '</nowiki>' +
								match[2].substring(1), node);
						}
					}
				}
			}
		}

		// Emitting text that has not been escaped
		this.currLine.text += res.text;

		// Output res
		this.env.log(this.serializer.logType, '--->', this.logPrefix, () => {
			return JSON.stringify(res instanceof ConstrainedText ? res.text : res);
		});
		this.pushToCurrLine(res, node);

		// Update sol flag. Test for
		// newlines followed by optional includeonly or comments
		const solRE = JSUtils.rejoin(
			/(^|\n)/,
			'(',
			// SSS FIXME: What about onlyinclude and noinclude?
			/<includeonly>.*?<\/includeonly>/,
			'|',
			this.env.conf.wiki.solTransparentWikitextNoWsRegexp,
			')*$'
		);
		// Note that res is a ConstrainedText, not a string
		if (!res.match(solRE)) {
			this.onSOL = false;
		}

		// We've emitted something so we're no longer at SOO.
		this.atStartOfOutput = false;
	}

	/**
	 * Serialize the children of a DOM node, sharing the global serializer state.
	 * Typically called by a DOM-based handler to continue handling its children.
	 *
	 * Generator form; each yielded value is whatever
	 * `serializer._serializeNode(child)` returns, and the value sent back in
	 * is treated as the next node to process.
	 *
	 * @param {Node} node Parent whose children are serialized.
	 * @param {Function} [wtEscaper] Escaping handler kept on `wteHandlerStack`
	 *   for the duration of the call.
	 * @param {Node} [firstChild] Child to start from; defaults to
	 *   `node.firstChild`.
	 */
	*serializeChildrenG(node, wtEscaper, firstChild) {
		// SSS FIXME: Unsure if this is the right thing always
		if (wtEscaper) { this.wteHandlerStack.push(wtEscaper); }

		try {
			let child = firstChild || node.firstChild;
			while (child !== null) {
				let next = yield this.serializer._serializeNode(child);
				if (next === node) { break; } // Serialized all children
				if (next === child) { next = next.nextSibling; } // Advance
				child = next;
			}
		} finally {
			// Pop even when a child fails, so the handler stack stays
			// balanced for whoever handles the error.
			if (wtEscaper) { this.wteHandlerStack.pop(); }
		}

		// If we serialized children explicitly,
		// we were obviously processing a modified node.
		this.currNodeUnmodified = false;
	}

	/**
	 * Abstracts some steps taken in `_serializeChildrenToString` and `serializeDOM`:
	 * primes the separator / modification / current-line state for `node`,
	 * serializes its children, emits the trailing separator and flushes the
	 * last buffered line.
	 *
	 * @private
	 * @param {Node} node
	 * @param {Function} [wtEscaper]
	 */
	*_kickOffSerializeG(node, wtEscaper) {
		this.updateSep(node);
		this.currNodeUnmodified = false;
		this.updateModificationFlags(node);
		this.resetCurrLine(node.firstChild);
		yield this.serializeChildren(node, wtEscaper);
		// Emit child-parent seps.
		this.emitSepForNode(node);
		// We've reached EOF, flush the remaining buffered text.
		this.flushLine();
	}

	/**
	 * Serialize children to a string.
	 *
	 * The output, separator, SOL, current-line, log-prefix and modification
	 * state is saved up front, replaced by a clean slate while the children
	 * are serialized with `this[inState]` set to `true`, and put back
	 * afterwards -- including when serializing the children throws.
	 *
	 * FIXME(arlorla): Shouldn't affect the separator state, but accidents
	 * have been known to happen. T109793 suggests using its own wts / state.
	 *
	 * @param {Node} node Parent whose children are serialized.
	 * @param {Function} [wtEscaper]
	 * @param {string} inState Name of the context flag to raise during the
	 *   call (e.g. 'inLink'); it is set to `false` afterwards.
	 * @return {string} The wikitext produced for the children.
	 */
	*_serializeChildrenToStringG(node, wtEscaper, inState) {
		// FIXME: Make sure that the separators emitted here conform to the
		// syntactic constraints of syntactic context.
		const saved = {
			sep: this.sep,
			onSOL: this.onSOL,
			out: this.out,
			atStartOfOutput: this.atStartOfOutput,
			currLine: this.currLine,
			logPrefix: this.logPrefix,
			// Modification flags
			prevNodeUnmodified: this.prevNodeUnmodified,
			currNodeUnmodified: this.currNodeUnmodified,
			prevNode: this.prevNode,
		};

		this.out = '';
		this.logPrefix = 'OUT(C):';
		this.sep = {};
		this.onSOL = false;
		this.atStartOfOutput = false;
		this[inState] = true;

		let bits;
		try {
			yield this._kickOffSerialize(node, wtEscaper);
			// Grab the children's output before the outer buffer is restored.
			bits = this.out;
		} finally {
			// restore the state
			this.out = saved.out;
			this[inState] = false;
			this.sep = saved.sep;
			this.onSOL = saved.onSOL;
			this.atStartOfOutput = saved.atStartOfOutput;
			this.currLine = saved.currLine;
			this.logPrefix = saved.logPrefix;
			// Modification flags
			this.prevNodeUnmodified = saved.prevNodeUnmodified;
			this.currNodeUnmodified = saved.currNodeUnmodified;
			this.prevNode = saved.prevNode;
		}
		return bits;
	}

	/**
	 * Serialize `node`'s children to a string with the `inLink` flag raised.
	 *
	 * @param {Node} node
	 * @param {Function} [wtEscaper]
	 * @return {Promise} Whatever `_serializeChildrenToString` returns.
	 */
	_serializeLinkChildrenToString(node, wtEscaper) {
		return this._serializeChildrenToString(node, wtEscaper, 'inLink');
	}

	/**
	 * Serialize `node`'s children to a string with the `inCaption` flag raised.
	 *
	 * @param {Node} node
	 * @param {Function} [wtEscaper]
	 * @return {Promise} Whatever `_serializeChildrenToString` returns.
	 */
	_serializeCaptionChildrenToString(node, wtEscaper) {
		return this._serializeChildrenToString(node, wtEscaper, 'inCaption');
	}

	/**
	 * Serialize `node`'s children to a string with the `inIndentPre` flag raised.
	 *
	 * @param {Node} node
	 * @param {Function} [wtEscaper]
	 * @return {Promise} Whatever `_serializeChildrenToString` returns.
	 */
	_serializeIndentPreChildrenToString(node, wtEscaper) {
		return this._serializeChildrenToString(node, wtEscaper, 'inIndentPre');
	}
}
// The class body defines these methods as generators under a "G"-suffixed
// name; publish each one under its plain name, wrapped by Promise.async
// (from ../utils/promise.js).
for (const name of [ "serializeChildren", "_kickOffSerialize", "_serializeChildrenToString" ]) {
	SerializerState.prototype[name] = Promise.async(SerializerState.prototype[name + "G"]);
}

// The "_"-prefixed helpers are ordinary methods; publish each one under its
// unprefixed name, wrapped by Promise.method.
for (const name of [ "serializeLinkChildrenToString", "serializeCaptionChildrenToString", "serializeIndentPreChildrenToString" ]) {
	SerializerState.prototype[name] = Promise.method(SerializerState.prototype["_" + name]);
}

// Export only when a CommonJS-style `module` object is present.
if (typeof module === "object") {
	module.exports.SerializerState = SerializerState;
}