Code Coverage for /src/src/Html2Wt/SerializerState.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	23.90% covered (danger)	23.90%	60 / 251	37.04% covered (danger)	37.04%	10 / 27	CRAP	0.00% covered (danger)	0.00%	0 / 1
SerializerState	23.90% covered (danger)	23.90%	60 / 251	37.04% covered (danger)	37.04%	10 / 27	3987.45	0.00% covered (danger)	0.00%	0 / 1
solWikitextRegexp	0.00% covered (danger)	0.00%	0 / 11	0.00% covered (danger)	0.00%	0 / 1	6
solRegexp	0.00% covered (danger)	0.00%	0 / 8	0.00% covered (danger)	0.00%	0 / 1	6
__construct	100.00% covered (success)	100.00%	13 / 13	100.00% covered (success)	100.00%	1 / 1	1
getEnv	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
initMode	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
appendSep	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
updateSep	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
resetSep	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	2
resetCurrLine	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	2
flushLine	0.00% covered (danger)	0.00%	0 / 2	0.00% covered (danger)	0.00%	0 / 1	2
getOrigSrc	80.00% covered (warning)	80.00%	4 / 5	0.00% covered (danger)	0.00%	0 / 1	3.07
isValidDSR	0.00% covered (danger)	0.00%	0 / 45	0.00% covered (danger)	0.00%	0 / 1	342
updateModificationFlags	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	1
sepIntroducedSOL	0.00% covered (danger)	0.00%	0 / 6	0.00% covered (danger)	0.00%	0 / 1	12
pushToCurrLine	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	2
emitSep	0.00% covered (danger)	0.00%	0 / 7	0.00% covered (danger)	0.00%	0 / 1	6
emitSepForNode	0.00% covered (danger)	0.00%	0 / 30	0.00% covered (danger)	0.00%	0 / 1	272
recoverTrimmedWhitespace	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	2
emitChunk	40.48% covered (danger)	40.48%	17 / 42	0.00% covered (danger)	0.00%	0 / 1	156.81
serializeChildren	100.00% covered (success)	100.00%	8 / 8	100.00% covered (success)	100.00%	1 / 1	5
kickOffSerialize	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	1
serializeChildrenToString	0.00% covered (danger)	0.00%	0 / 33	0.00% covered (danger)	0.00%	0 / 1	2
serializeLinkChildrenToString	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
serializeCaptionChildrenToString	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
serializeIndentPreChildrenToString	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
openAnnotationRange	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
closeAnnotationRange	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2

1	<?php
2	declare( strict_types = 1 );
3
4	namespace Wikimedia\Parsoid\Html2Wt;
5
6	use Composer\Semver\Semver;
7	use stdClass;
8	use Wikimedia\Assert\Assert;
9	use Wikimedia\Parsoid\Config\Env;
10	use Wikimedia\Parsoid\Core\DomSourceRange;
11	use Wikimedia\Parsoid\Core\SelectiveUpdateData;
12	use Wikimedia\Parsoid\DOM\DocumentFragment;
13	use Wikimedia\Parsoid\DOM\Element;
14	use Wikimedia\Parsoid\DOM\Node;
15	use Wikimedia\Parsoid\DOM\Text;
16	use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI;
17	use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ConstrainedText;
18	use Wikimedia\Parsoid\Tokens\SourceRange;
19	use Wikimedia\Parsoid\Utils\DiffDOMUtils;
20	use Wikimedia\Parsoid\Utils\DOMCompat;
21	use Wikimedia\Parsoid\Utils\DOMDataUtils;
22	use Wikimedia\Parsoid\Utils\DOMUtils;
23	use Wikimedia\Parsoid\Utils\PHPUtils;
24	use Wikimedia\Parsoid\Utils\Utils;
25	use Wikimedia\Parsoid\Utils\WTUtils;
26
27	/**
28	* State object for the wikitext serializers.
29	*/
30	class SerializerState {
31
/**
 * Per-instance cache for {@see solWikitextRegexp()}.
 *
 * The pattern is derived from this instance's site config, so it is cached on
 * the instance rather than in a function-level `static` (which would be shared
 * by every SerializerState in the process, whatever its site config).
 *
 * @var string|null
 */
private ?string $solWikitextRegexpCache = null;

/**
 * Regexp for checking whether we have consumed wikimarkup that has special meaning at the
 * beginning of the line, and is indeed at the beginning of the line (modulo comments and
 * other ignored elements).
 *
 * Capture group 1 is the SOL-transparent prefix; capture group 2 starts with the
 * SOL-sensitive character and runs to the end of the subject.
 *
 * @return string
 */
private function solWikitextRegexp(): string {
	if ( $this->solWikitextRegexpCache === null ) {
		// Strip the site config regexp's own delimiters so it can be embedded below.
		$sol = PHPUtils::reStrip(
			$this->env->getSiteConfig()->solTransparentWikitextNoWsRegexp( true ),
			'@'
		);
		$this->solWikitextRegexpCache = '@' .
			'^(' . $sol . ')' .
			'([\ \*#:;{\|!=].*)$' .
			'@D';
	}
	return $this->solWikitextRegexpCache;
}
53
/**
 * Per-instance cache for {@see solRegexp()}.
 *
 * The pattern is derived from this instance's site config, so it is cached on
 * the instance rather than in a function-level `static` (which would be shared
 * by every SerializerState in the process, whatever its site config).
 *
 * @var string|null
 */
private ?string $solRegexpCache = null;

/**
 * Regexp for checking whether we are at the start of the line (modulo comments and
 * other ignored elements).
 *
 * Matches a subject that is, or ends with a newline followed by, only
 * SOL-transparent wikitext.
 *
 * @return string
 */
private function solRegexp(): string {
	if ( $this->solRegexpCache === null ) {
		// Strip the site config regexp's own delimiters so it can be embedded below.
		$sol = PHPUtils::reStrip(
			$this->env->getSiteConfig()->solTransparentWikitextNoWsRegexp( true ),
			'@'
		);
		$this->solRegexpCache = '@(^|\n)' . $sol . '$@D';
	}
	return $this->solRegexpCache;
}
71
/**
 * Separator information:
 * - constraints (array<array|int>|null): min/max number of newlines
 * - src (string|null): collected separator text from DOM text/comment nodes
 * - lastSourceNode (?Node): Seems to be bookkeeping to make sure we don't reuse
 *   original separators when `emitChunk` is called
 *   consecutively on the same node. However, it also
 *   differs from `state.prevNode` in that it only gets
 *   updated when a node calls `emitChunk` so that nodes
 *   serializing `justChildren` don't mix up `buildSep`.
 * FIXME: could use a dedicated class
 * @var stdClass
 */
public $sep;

/**
 * Is the serializer at the start of a new wikitext line?
 * @var bool
 */
public $onSOL = true;

/**
 * True when wts kicks off, false after the first char has been output
 * SSS FIXME: Can this be done away with in some way?
 * @var bool
 */
public $atStartOfOutput = true;

/**
 * Is the serializer currently handling link content (children of `<a>`)?
 * @var bool
 */
public $inLink = false;

/**
 * Is the serializer currently handling caption content?
 * @var bool
 */
public $inCaption = false;

/**
 * Is the serializer currently handling an indent-pre tag?
 * @var bool
 */
public $inIndentPre = false;

/**
 * Is the serializer currently handling a html-pre tag?
 * @var bool
 */
public $inHTMLPre = false;

/**
 * Is the serializer currently handling a tag that the PHP parser
 * treats as a block tag?
 * @var bool
 */
public $inPHPBlock = false;

/**
 * Is the serializer being invoked recursively to serialize a
 * template-generated attribute (via `WSP.getAttributeValue`'s
 * template handling). If so, we should suppress some
 * serialization escapes, like autolink protection, since
 * these are not valid for attribute values.
 * @var bool
 */
public $inAttribute = false;

/**
 * Is the serializer currently processing a subtree that has been
 * marked inserted compared to original content (ex: via VE / CX)?
 *
 * NOTE(review): declared without a default, so it is null until assigned.
 *
 * @var bool
 */
public $inInsertedContent;

/**
 * Did we introduce nowikis for indent-pre protection?
 * If yes, we might run a post-pass to strip useless ones.
 * @var bool
 */
public $hasIndentPreNowikis = false;

/**
 * Did we introduce nowikis to preserve quote semantics?
 * If yes, we might run a post-pass to strip useless ones.
 * @var bool
 */
public $hasQuoteNowikis = false;

/**
 * Did we introduce `<nowiki />`s?
 * If yes, we do a postpass to remove unnecessary trailing ones.
 * @var bool
 */
public $hasSelfClosingNowikis = false;

/**
 * Did we introduce nowikis around `=.*=` text?
 * If yes, we do a postpass to remove unnecessary escapes.
 * @var bool
 */
public $hasHeadingEscapes = false;

/**
 * Records the nesting level of wikitext tables
 * @var int
 */
public $wikiTableNesting = 0;

/**
 * Stack of wikitext escaping handlers -- these handlers are responsible
 * for smart escaping when the surrounding wikitext context is known.
 * @var callable[] See {@link serializeChildren()}
 */
public $wteHandlerStack = [];

/**
 * This array is used by the wikitext escaping algorithm -- represents
 * a "single line" of output wikitext as represented by a block node in
 * the DOM.
 * - firstNode (?Node): first DOM node processed on this line
 * - text (string): output so far from all nodes on the current line
 * - chunks (ConstrainedText[]): list of chunks comprising the current line
 * @var stdClass
 * XXX: replace with output buffering per line
 * FIXME: could use a dedicated class
 */
public $currLine;

/**
 * Stack used to enforce single-line context
 * @var SingleLineContext
 */
public $singleLineContext;

/**
 * Text to be emitted at the start of file, for redirects
 * @var string|null
 */
public $redirectText = null;

/** @var WikitextSerializer */
public $serializer;

/** @var ParsoidExtensionAPI */
public $extApi;

/** @var string The serialized output */
public $out = '';

/**
 * Whether to use heuristics to determine if a list item, heading, table cell, etc.
 * should have whitespace inserted after the "*#=|!" wikitext chars? This is normally
 * true by default, but not so if HTML content version is older than 1.7.0.
 * In practice, we are now at version 2.1, but Flow stores HTML, so till Flow migrates
 * all its content over to a later version, we need a boolean flag.
 * @var bool
 */
public $useWhitespaceHeuristics;

/**
 * Are we in selective serialization mode?
 * @see SelectiveSerializer
 * @var bool
 */
public $selserMode;

/**
 * Selective-serialization data taken from the 'selserData' constructor option
 * (null when the option is absent). Its `revText` is read as the original page
 * source by getOrigSrc() and isValidDSR().
 */
private ?SelectiveUpdateData $selserData;

/**
 * If in selser mode, while processing a node, do we know if
 * its previous node has not been modified in an edit?
 * @var bool
 */
public $prevNodeUnmodified;

/**
 * If in selser mode, while processing a node, do we know if
 * it has not been modified in an edit?
 * @var bool
 */
public $currNodeUnmodified;

/**
 * Should we run the wikitext escaping code on the wikitext chunk
 * that will be emitted?
 * @var bool
 */
public $needsEscaping = false;

/**
 * Used as fast patch for special protected characters in WikitextEscapeHandlers and
 * comes from LanguageVariantHandler
 * @var string|null
 */
public $protect;

/** @var Separators */
public $separators;

/** @var Env */
private $env;

/**
 * Node currently being serialized. emitChunk() tests it against Text, so it
 * is not necessarily an Element.
 * NOTE(review): never assigned inside this class; presumably set by the serializer.
 * @var Node|null
 */
public $currNode;

/**
 * Node last passed to updateModificationFlags(); null until the first call.
 * @var Node|null
 */
private $prevNode;

/**
 * Open annotation ranges, keyed by annotation name; each value records whether
 * the range has been extended. Null until openAnnotationRange() is first called.
 * @var array<string,bool>|null
 */
public $openAnnotations;

/**
 * Log prefix to use in trace output
 * @var string
 */
private $logPrefix = 'OUT:';

/**
 * Whether the input content version satisfies '>=2.1.1' (computed in the constructor).
 * NOTE(review): presumably signals that DSR already accounts for trimmed
 * whitespace -- confirm against Separators.
 * @var bool
 */
public $haveTrimmedWsDSR = false;
293
/**
 * Build a fresh serializer state bound to the given serializer.
 *
 * @param WikitextSerializer $serializer
 * @param array $options Optional overrides for the initial state:
 * - onSOL: (bool)
 * - inPHPBlock: (bool)
 * - inAttribute: (bool)
 * - protect: (string)
 * - selserData: (SelectiveUpdateData)
 */
public function __construct( WikitextSerializer $serializer, array $options = [] ) {
	$env = $serializer->env;
	$this->env = $env;
	$this->serializer = $serializer;
	$this->extApi = new ParsoidExtensionAPI( $env, [ 'html2wt' => [ 'state' => $this ] ] );

	// A missing (or null) option leaves the property's declared default in place.
	if ( isset( $options['onSOL'] ) ) {
		$this->onSOL = $options['onSOL'];
	}
	if ( isset( $options['inPHPBlock'] ) ) {
		$this->inPHPBlock = $options['inPHPBlock'];
	}
	if ( isset( $options['inAttribute'] ) ) {
		$this->inAttribute = $options['inAttribute'];
	}
	$this->protect = $options['protect'] ?? null;
	$this->selserData = $options['selserData'] ?? null;

	// Start with an empty current line and no pending separator.
	$this->resetCurrLine( null );
	$this->singleLineContext = new SingleLineContext();
	$this->resetSep();

	$inputVersion = $env->getInputContentVersion();
	$this->haveTrimmedWsDSR = Semver::satisfies( $inputVersion, '>=2.1.1' );
	$this->separators = new Separators( $env, $this );
}
318
/**
 * Accessor for the environment this state was constructed with.
 *
 * @note Porting note: this replaces direct access
 * @return Env
 */
public function getEnv(): Env {
	$env = $this->env;
	return $env;
}
326
/**
 * Set the flags that depend on the serialization mode: the whitespace
 * heuristics flag (input content version '>=1.7.0') and the selser flag.
 *
 * FIXME: Ideally, this should be private. Requires shuffling around
 * where SerializerState is constructed so that $selserMode is known
 * at the time of construction.
 * @private for use by WikitextSerializer only
 * @param bool $selserMode Are we running selective serialization?
 */
public function initMode( bool $selserMode ): void {
	$inputVersion = $this->env->getInputContentVersion();
	$this->useWhitespaceHeuristics = Semver::satisfies( $inputVersion, '>=1.7.0' );
	$this->selserMode = $selserMode;
}
340
/**
 * Add separator source text to the end of the pending separator buffer.
 *
 * $state->onSOL is deliberately left alone because this string hasn't been
 * emitted yet. Content handlers whose behavior depends on whether this newline
 * will be emitted should peek into this buffer (ex: see TDHandler and
 * THHandler code).
 *
 * @param string $src
 */
public function appendSep( string $src ): void {
	// The buffer is null until the first append.
	$pending = $this->sep->src ?? '';
	$this->sep->src = $pending . $src;
}
353
/**
 * Cycle the separator state after processing a node by recording it as the
 * last source node.
 *
 * @param Node $node
 */
public function updateSep( Node $node ): void {
	$sepState = $this->sep;
	$sepState->lastSourceNode = $node;
}
361
/**
 * Reset the separator state: no constraints, no collected source text and
 * no last source node.
 */
private function resetSep(): void {
	$this->sep = (object)[
		'constraints' => null,
		'src' => null,
		'lastSourceNode' => null,
	];
}
369
/**
 * Start a new, empty current line whose first node is $node.
 *
 * @param ?Node $node
 */
private function resetCurrLine( ?Node $node ): void {
	$line = new stdClass;
	$line->text = '';
	$line->chunks = [];
	$line->firstNode = $node;
	$this->currLine = $line;
}
381
/**
 * Run the buffered ConstrainedText chunks of the current line through
 * ConstrainedText::escapeLine(), append the result to the output, and empty
 * the chunk buffer.
 *
 * (Start of line and end of line are always safe for ConstrainedText chunks,
 * so we don't need to buffer more than the last line.)
 */
private function flushLine(): void {
	$escapedLine = ConstrainedText::escapeLine( $this->currLine->chunks );
	$this->out .= $escapedLine;
	$this->currLine->chunks = [];
}
391
/**
 * Return the slice of the original page source covered by $sr.
 *
 * Requires selser mode. Yields null when the range is inverted or when it
 * starts past the end of the page source.
 *
 * @param SourceRange $sr
 * @return string|null
 */
public function getOrigSrc( SourceRange $sr ): ?string {
	Assert::invariant( $this->selserMode, 'SerializerState::$selserMode must be set' );

	if ( !( $sr->start <= $sr->end ) ) {
		return null;
	}

	$revText = $this->selserData->revText;
	// FIXME: Having a $start greater than the source length is
	// probably a canary for corruption. Maybe we should be throwing
	// here instead. See T240053.
	// But, see comment in UnpackDOMFragments where we very very rarely
	// can deliberately set DSR to point outside page source.
	if ( !( $sr->start <= strlen( $revText ) ) ) {
		return null;
	}

	// XXX should use $frame->getSrcText() like WTUtils::getWTSource
	return $sr->substr( $revText );
}
414
/**
 * Check the validity of a DSR in the context of the page source.
 *
 * Returns false if Utils::isValidDSR() would return false, if the range is
 * inverted or extends past the end of the page source, or if the DSR offsets
 * would create a bad UTF-8 string (ie, a range that starts on a continuation
 * byte or ends in the middle of a multi-byte sequence).
 *
 * @param ?DomSourceRange $dsr DSR source range values
 * @param bool $all Also check the widths of the container tag
 * @return bool
 */
public function isValidDSR( ?DomSourceRange $dsr, bool $all = false ) {
	if ( !Utils::isValidDSR( $dsr, $all ) ) {
		return false;
	}
	if ( !( $dsr->start <= $dsr->end ) ) {
		return false;
	}
	$src = $this->selserData->revText;
	if ( !( $dsr->end <= strlen( $src ) ) ) {
		return false;
	}

	// Byte-level UTF-8 boundary check for the half-open range [$from, $to).
	$isUtf8Aligned = static function ( $from, $to ) use ( $src ) {
		if ( $from === $to ) {
			// zero-length string is always ok
			return true;
		}
		if ( ( ord( $src[$from] ) & 0xC0 ) === 0x80 ) {
			// range starts on a continuation byte (10xx xxxx)
			return false;
		}
		// Walk back from the end over continuation bytes to the lead byte of
		// the final character. This cannot run past $from because the byte at
		// $from was just shown not to be a continuation byte.
		$back = 0;
		do {
			$back++;
			if ( $back >= 5 ) {
				// more than 4 bytes without a lead byte: bad UTF-8 at end
				return false;
			}
			$lead = ord( $src[$to - $back] );
		} while ( ( $lead & 0xC0 ) === 0x80 );

		// The lead byte announces the sequence length; it must equal the
		// number of bytes walked back, or the range ends mid-character.
		if ( ( $lead & 0x80 ) === 0 ) {
			return $back === 1;
		}
		if ( ( $lead & 0xE0 ) === 0xC0 ) {
			return $back === 2;
		}
		if ( ( $lead & 0xF0 ) === 0xE0 ) {
			return $back === 3;
		}
		if ( ( $lead & 0xF8 ) === 0xF0 ) {
			return $back === 4;
		}
		return false;
	};

	if ( !$all ) {
		return $isUtf8Aligned( $dsr->start, $dsr->end );
	}

	// Check each inner range: opening tag, closing tag, then the content in
	// between. Every bounds test precedes the byte check that relies on it.
	$openEnd = $dsr->start + $dsr->openWidth;
	$closeStart = $dsr->end - $dsr->closeWidth;
	return $openEnd <= $dsr->end
		&& $isUtf8Aligned( $dsr->start, $openEnd )
		&& $dsr->start <= $closeStart
		&& $isUtf8Aligned( $closeStart, $dsr->end )
		&& $openEnd <= $closeStart
		&& $isUtf8Aligned( $openEnd, $closeStart );
}
492
/**
 * Shift the modification flags along by one node: the current node's flag
 * becomes the previous node's flag, the current flag is cleared, and $node is
 * remembered as the previous node.
 *
 * @param Node $node
 */
public function updateModificationFlags( Node $node ): void {
	$this->prevNode = $node;
	$wasUnmodified = $this->currNodeUnmodified;
	$this->prevNodeUnmodified = $wasUnmodified;
	$this->currNodeUnmodified = false;
}
502
/**
 * Update the start-of-line state for a separator that was just emitted.
 *
 * Comments are ignored: a separator puts us in SOL state only if its
 * non-comment text ends in a newline, and any newline in that text flushes
 * the buffered line and starts a new one at $node.
 *
 * @param string $sep
 * @param Node $node
 */
private function sepIntroducedSOL( string $sep, Node $node ): void {
	// Don't get tripped by newlines in comments! Be wary of nowikis added
	// by makeSepIndentPreSafe on the last line.
	$withoutComments = preg_replace( Utils::COMMENT_REGEXP, '', $sep );
	$endsWithNewline = substr( $withoutComments, -1 ) === "\n";
	if ( $endsWithNewline ) {
		$this->onSOL = true;
	}

	if ( !str_contains( $withoutComments, "\n" ) ) {
		return;
	}

	// process escapes in our full line
	$this->flushLine();
	$this->resetCurrLine( $node );
}
522
/**
 * Accumulates chunks on the current line.
 *
 * Appends the chunk's (unescaped) text to the line's text, adds the chunk to
 * the line's chunk list, and traces it under the given prefix.
 *
 * @param ConstrainedText $chunk
 * @param string $logPrefix
 */
private function pushToCurrLine( ConstrainedText $chunk, string $logPrefix ): void {
	// Emitting text that has not been escaped
	$this->currLine->text .= $chunk->text;

	$this->currLine->chunks[] = $chunk;

	// The closure defers JSON encoding to the tracer.
	$this->serializer->trace(
		'--->',
		$logPrefix,
		static fn () => PHPUtils::jsonEncode( $chunk->text )
	);
}
538
/**
 * Pushes the separator to the current line and resets the separator state.
 *
 * @param string $sep
 * @param Node $node
 * @param string $debugPrefix Prefix used when tracing the separator
 */
private function emitSep( string $sep, Node $node, string $debugPrefix ): void {
	$sepChunk = ConstrainedText::cast( $sep, $node );

	// Replace newlines if we're in a single-line context.
	// str_replace matches what emitChunk() does and, unlike preg_replace,
	// cannot return null.
	if ( $this->singleLineContext->enforced() ) {
		$sepChunk->text = str_replace( "\n", ' ', $sepChunk->text );
	}

	$this->pushToCurrLine( $sepChunk, $debugPrefix );
	$this->sepIntroducedSOL( $sepChunk->text, $node );

	// Reset separator state
	$this->resetSep();
	$this->updateSep( $node );
}
560
/**
 * Determines if we can use the original separator for this node or if we
 * need to build one based on its constraints, and then emits it.
 *
 * @param Node $node
 */
private function emitSepForNode( Node $node ): void {
	/* When block nodes are deleted, the deletion affects whether unmodified
	 * newline separators between a pair of unmodified P tags can be reused.
	 *
	 * Example:
	 * ```
	 * Original WT  : "<div>x</div>foo\nbar"
	 * Original HTML: "<div>x</div><p>foo</p>\n<p>bar</p>"
	 * Edited HTML  : "<p>foo</p>\n<p>bar</p>"
	 * Annotated DOM: "<mw:DiffMarker is-block><p>foo</p>\n<p>bar</p>"
	 * Expected WT  : "foo\n\nbar"
	 * ```
	 *
	 * Note the additional newline between "foo" and "bar" even though originally,
	 * there was just a single newline.
	 *
	 * So, even though the two P tags and the separator between them is
	 * unmodified, it is insufficient to rely on just that. We have to look at
	 * what has happened on the two wikitext lines onto which the two P tags
	 * will get serialized.
	 *
	 * Now, if you check the code for `nextToDeletedBlockNodeInWT`, that code is
	 * not really looking at ALL the nodes before/after the nodes that could
	 * serialize onto the wikitext lines. It is looking at the immediately
	 * adjacent nodes, i.e. it is not necessary to look if a block-tag was
	 * deleted 2 or 5 siblings away. If we had to actually examine all of those
	 * nodes, this would get very complex, and it would be much simpler to just
	 * discard the original separators => potentially lots of dirty diffs.
	 *
	 * To understand why it is sufficient (for correctness) to examine just
	 * the immediately adjacent nodes, let us look at an additional example.
	 * ```
	 * Original WT  : "a<div>b</div>c<div>d</div>e\nf"
	 * Original HTML: "<p>a</p><div>b</div><p>c</p><div>d</div><p>e</p>\n<p>f</p>"
	 * ```
	 * Note how `<block>` tags and `<p>` tags interleave in the HTML. This would be
	 * the case always no matter how much inline content showed up between the
	 * block tags in wikitext. If the b-`<div>` was deleted, we don't care
	 * about it, since we still have the d-`<div>` before the P tag that preserves
	 * the correctness of the single `"\n"` separator. If the d-`<div>` was deleted,
	 * we conservatively ignore the original separator and let normal P-P constraints
	 * take care of it. At worst, we might generate a dirty diff in this scenario. */
	$origSepNeeded = ( $node !== $this->sep->lastSourceNode );
	$origSepUsable = $origSepNeeded &&
		(
			// first-content-node of <body> ($this->prevNode)
			(
				DOMUtils::isBody( $this->prevNode ) &&
				$node->parentNode === $this->prevNode
			)
			||
			// unmodified sibling node of $this->prevNode
			(
				$this->prevNode && $this->prevNodeUnmodified &&
				$node->parentNode === $this->prevNode->parentNode &&
				!WTSUtils::nextToDeletedBlockNodeInWT( $this->prevNode, true )
			)
		) &&
		$this->currNodeUnmodified && !WTSUtils::nextToDeletedBlockNodeInWT( $node, false );

	$origSep = null;
	if ( $origSepUsable ) {
		if ( $this->prevNode instanceof Element && $node instanceof Element ) {
			'@phan-var Element $node';/** @var Element $node */
			if ( DOMUtils::isBody( $this->prevNode ) ) {
				// <body> won't have DSR in body_only scenarios
				$sr = new SourceRange( 0, 0 );
			} else {
				$sr = DOMDataUtils::getDataParsoid( $this->prevNode )->dsr;
			}
			// Source range spanning from the previous node's range to this node's range.
			$sr = $sr->to( DOMDataUtils::getDataParsoid( $node )->dsr );
			$origSep = $this->getOrigSrc( $sr );
		} elseif ( $this->sep->src && WTSUtils::isValidSep( $this->sep->src ) ) {
			// We don't know where '$this->sep->src' comes from. So, reuse it
			// only if it is a valid separator string.
			$origSep = $this->sep->src;
		}
	}

	if ( $origSep !== null ) {
		$this->emitSep( $origSep, $node, 'ORIG-SEP:' );
	} else {
		// No reusable original separator: build one from the constraints.
		$sep = $this->separators->buildSep( $node );
		$this->emitSep( $sep ?? '', $node, 'SEP:' );
	}
}
653
/**
 * Recover any trimmed whitespace for $node via Separators, trace it, and
 * return it to the caller.
 *
 * @param Node $node
 * @param bool $leading
 *   if true, trimmed leading whitespace is recovered
 *   if false, trimmed trailing whitespace is recovered
 * @return string|null
 */
public function recoverTrimmedWhitespace( Node $node, bool $leading ): ?string {
	$trimmedWs = $this->separators->recoverTrimmedWhitespace( $node, $leading );
	$this->serializer->trace(
		'--->',
		"TRIMMED-SEP:",
		static fn () => PHPUtils::jsonEncode( $trimmedWs )
	);
	return $trimmedWs;
}
669
/**
 * Pushes the chunk to the current line.
 *
 * Steps, in order: flatten newlines in single-line contexts; emit the
 * separator for $node (unless the chunk is flagged noSep); escape the chunk
 * text if escaping was requested, or, in selser mode, nowiki-escape a leading
 * SOL-sensitive character of an unmodified paragraph; buffer the chunk; and
 * finally update the onSOL / atStartOfOutput flags.
 *
 * @param ConstrainedText|string $res
 * @param Node $node
 */
public function emitChunk( $res, Node $node ): void {
	$res = ConstrainedText::cast( $res, $node );

	// Replace newlines if we're in a single-line context
	if ( $this->singleLineContext->enforced() ) {
		$res->text = str_replace( "\n", ' ', $res->text );
	}

	// Emit separator first
	if ( $res->noSep ) {
		/* skip separators for internal tokens from SelSer */
		if ( $this->onSOL ) {
			// process escapes in our full line
			$this->flushLine();
			$this->resetCurrLine( $node );
		}
	} else {
		$this->emitSepForNode( $node );
	}

	$needsEscaping = $this->needsEscaping;
	if ( $needsEscaping && $this->currNode instanceof Text ) {
		$needsEscaping = !$this->inHTMLPre && ( $this->onSOL || !$this->currNodeUnmodified );
	}

	// Escape 'res' if necessary
	if ( $needsEscaping ) {
		$res = new ConstrainedText( [
			'text' => $this->serializer->escapeWikitext( $this, $res->text, [
				'node' => $node,
				'isLastChild' => DiffDOMUtils::nextNonDeletedSibling( $node ) === null,
			] ),
			'prefix' => $res->prefix,
			'suffix' => $res->suffix,
			'node' => $res->node,
		] );
		$this->needsEscaping = false;
	} elseif (
		// If 'res' is coming from selser and the current node is a paragraph tag,
		// check if 'res' might need some leading chars nowiki-escaped before being output.
		// Because of block-tag p-wrapping behavior, sol-sensitive characters that used to
		// be in non-sol positions, but yet wrapped in p-tags, could end up in sol-position
		// if those block tags get deleted during edits.
		//
		// Ex: a<div>foo</div>*b
		// -- wt2html --> <p>a</p><div>foo</div><p>*b</p>
		// -- EDIT --> <p>a</p><p>*b</p>
		// -- html2wt --> a\n\n<nowiki>*</nowiki>b
		//
		// In this scenario, the <p>a</p> and <p>*b</p>
		// will be marked unmodified and will be processed below.
		$this->selserMode
		&& $this->onSOL
		&& $this->currNodeUnmodified
		// 'node' came from original Parsoid HTML unmodified. So, if its content
		// needs nowiki-escaping, we know that the reason it didn't parse into
		// lists/headings/whatever is because it didn't occur at the start of the
		// line => it had a block-tag in the original wikitext. So if the previous
		// node was also unmodified (and since it also came from original Parsoid
		// HTML), we can safely infer that it couldn't have been an inline node or
		// a P-tag (if it were, the p-wrapping code would have swallowed that content
		// into 'node'). So, it would have to be some sort of block tag => this.onSOL
		// couldn't have been true (because we could have serialized 'node' on the
		// same line as the block tag) => we can save some effort by eliminating
		// scenarios where 'this.prevNodeUnmodified' is true.
		&& !$this->prevNodeUnmodified
		&& DOMCompat::nodeName( $node ) === 'p' && !WTUtils::isLiteralHTMLNode( $node )
	) {
		$pChild = DiffDOMUtils::firstNonSepChild( $node );
		// If a text node, we have to make sure that the text doesn't
		// get reparsed as non-text in the wt2html pipeline.
		if ( $pChild instanceof Text ) {
			// $match[1]: SOL-transparent prefix; $match[2]: the SOL-sensitive
			// character followed by the rest of the chunk.
			$match = $res->matches( $this->solWikitextRegexp(), $this->env );
			if ( $match && isset( $match[2] ) ) {
				if ( preg_match( '/^([\*#:;]|{\||.*=$)/D', $match[2] )
					// ! and | chars are harmless outside tables
					|| ( strspn( $match[2], '|!' ) && $this->wikiTableNesting > 0 )
					// indent-pres are suppressed inside <blockquote>
					|| ( preg_match( '/^ \S/', $match[2] )
						&& !DOMUtils::hasNameOrHasAncestorOfName( $node, 'blockquote' ) )
				) {
					// Wrap only the first character in <nowiki>.
					$res = ConstrainedText::cast( ( $match[1] ?: '' )
						. '<nowiki>' . substr( $match[2], 0, 1 ) . '</nowiki>'
						. substr( $match[2], 1 ), $node );
				}
			}
		}
	}

	// Output res
	$this->pushToCurrLine( $res, $this->logPrefix );

	// Update sol flag. Test for newlines followed by optional includeonly or comments
	if ( !$res->matches( $this->solRegexp(), $this->env ) ) {
		$this->onSOL = false;
	}

	// We've emitted something so we're no longer at SOO.
	$this->atStartOfOutput = false;
}
776
/**
 * Serialize the children of a DOM node, sharing the global serializer state.
 * Typically called by a DOM-based handler to continue handling its children.
 *
 * While the children are processed, $wtEscaper (if given) sits on top of the
 * wikitext escaping handler stack.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper ( $state, $text, $opts )
 *   PORT-FIXME document better; should this be done via WikitextEscapeHandlers somehow?
 * @param ?Node $firstChild Child to start from; defaults to $node's first child
 */
public function serializeChildren(
	Node $node, ?callable $wtEscaper = null, ?Node $firstChild = null
): void {
	$hasEscaper = $wtEscaper !== null;

	// SSS FIXME: Unsure if this is the right thing always
	if ( $hasEscaper ) {
		$this->wteHandlerStack[] = $wtEscaper;
	}

	// serializeNode() hands back the next child to process (null when done).
	$next = $firstChild ?? $node->firstChild;
	while ( $next !== null ) {
		$next = $this->serializer->serializeNode( $next );
	}

	if ( $hasEscaper ) {
		array_pop( $this->wteHandlerStack );
	}

	// If we serialized children explicitly,
	// we were obviously processing a modified node.
	$this->currNodeUnmodified = false;
}
807
/**
 * Shared driver for `serializeChildrenToString` and `serializeDOM`: prime the
 * separator, modification-flag and current-line state for $node, serialize its
 * children, emit the trailing child-parent separator, and flush the last line.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper See {@link serializeChildren()}
 * @internal For use by WikitextSerializer only
 */
public function kickOffSerialize(
	Node $node, ?callable $wtEscaper = null
): void {
	// Prime the state with $node as the starting point.
	$this->updateSep( $node );
	$this->currNodeUnmodified = false;
	$this->updateModificationFlags( $node );
	$firstChild = $node->firstChild;
	$this->resetCurrLine( $firstChild );

	$this->serializeChildren( $node, $wtEscaper );

	// Emit child-parent seps.
	$this->emitSepForNode( $node );
	// We've reached EOF, flush the remaining buffered text.
	$this->flushLine();
}
828
/**
 * Serialize children to a string
 *
 * Runs kickOffSerialize() against a temporarily emptied output buffer with
 * the $inState flag set, then restores the saved state and returns what was
 * produced. The state is restored even if serialization throws, and the
 * $inState flag goes back to its previous value (not unconditionally to
 * false), so nested calls for the same state don't clear the outer flag.
 *
 * FIXME(arlorla): Shouldn't affect the separator state, but accidents
 * have been known to happen. T109793 suggests using its own wts / state.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper See {@link serializeChildren()}
 * @param string $inState Name of the boolean state flag to set while serializing
 * @return string
 */
private function serializeChildrenToString(
	Node $node, ?callable $wtEscaper, string $inState
): string {
	$states = [ 'inLink', 'inCaption', 'inIndentPre', 'inHTMLPre', 'inPHPBlock', 'inAttribute' ];
	Assert::parameter( in_array( $inState, $states, true ), '$inState', 'Must be one of: '
		. implode( ', ', $states ) );
	// FIXME: Make sure that the separators emitted here conform to the
	// syntactic constraints of syntactic context.
	$oldSep = $this->sep;
	$oldSOL = $this->onSOL;
	$oldOut = $this->out;
	$oldStart = $this->atStartOfOutput;
	$oldCurrLine = $this->currLine;
	$oldLogPrefix = $this->logPrefix;
	$oldInState = $this->$inState;
	// Modification flags
	$oldPrevNodeUnmodified = $this->prevNodeUnmodified;
	$oldCurrNodeUnmodified = $this->currNodeUnmodified;
	$oldPrevNode = $this->prevNode;

	$this->out = '';
	$this->logPrefix = 'OUT(C):';
	$this->resetSep();
	$this->onSOL = false;
	$this->atStartOfOutput = false;
	$this->$inState = true;

	$this->singleLineContext->disable();
	try {
		$this->kickOffSerialize( $node, $wtEscaper );
		$bits = $this->out;
	} finally {
		$this->singleLineContext->pop();

		// restore the state
		$this->out = $oldOut;
		$this->$inState = $oldInState;
		$this->sep = $oldSep;
		$this->onSOL = $oldSOL;
		$this->atStartOfOutput = $oldStart;
		$this->currLine = $oldCurrLine;
		$this->logPrefix = $oldLogPrefix;
		// Modification flags
		$this->prevNodeUnmodified = $oldPrevNodeUnmodified;
		$this->currNodeUnmodified = $oldCurrNodeUnmodified;
		$this->prevNode = $oldPrevNode;
	}
	return $bits;
}
885
/**
 * Serialize a link's children to a string, with the `inLink` flag set for
 * the duration.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper See {@link serializeChildren()}
 * @return string
 */
public function serializeLinkChildrenToString(
	Node $node, ?callable $wtEscaper = null
): string {
	$linkText = $this->serializeChildrenToString( $node, $wtEscaper, 'inLink' );
	return $linkText;
}
897
/**
 * Serialize a caption's children to a string, with the `inCaption` flag set
 * for the duration.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper See {@link serializeChildren()}
 * @return string
 */
public function serializeCaptionChildrenToString(
	Node $node, ?callable $wtEscaper = null
): string {
	$captionText = $this->serializeChildrenToString( $node, $wtEscaper, 'inCaption' );
	return $captionText;
}
909
/**
 * Serialize an indent-pre's children to a string, with the `inIndentPre`
 * flag set for the duration.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper See {@link serializeChildren()}
 * @return string
 */
public function serializeIndentPreChildrenToString(
	Node $node, ?callable $wtEscaper = null
): string {
	$preText = $this->serializeChildrenToString( $node, $wtEscaper, 'inIndentPre' );
	return $preText;
}
921
/**
 * Record an annotation range as open, together with whether it has been
 * extended. Re-opening the same annotation overwrites the recorded flag.
 *
 * @param string $ann Annotation name, used as the key
 * @param bool $extended
 */
public function openAnnotationRange( string $ann, bool $extended ) {
	$this->openAnnotations[$ann] = $extended;
}
930
/**
 * Drop the given annotation from the set of open ranges. Does nothing if it
 * was not recorded as open.
 *
 * @param string $ann Annotation name, used as the key
 */
public function closeAnnotationRange( string $ann ) {
	unset( $this->openAnnotations[$ann] );
}
938
939	}