Code Coverage for /src/src/Html2Wt/Separators.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	1.33% covered (danger)	1.33%	6 / 450	7.14% covered (danger)	7.14%	1 / 14	CRAP	0.00% covered (danger)	0.00%	0 / 1
Separators	1.33% covered (danger)	1.33%	6 / 450	7.14% covered (danger)	7.14%	1 / 14	43382.10	0.00% covered (danger)	0.00%	0 / 1
loggableConstraints	0.00% covered (danger)	0.00%	0 / 15	0.00% covered (danger)	0.00%	0 / 1	6
precedingSeparatorTextLen	0.00% covered (danger)	0.00%	0 / 11	0.00% covered (danger)	0.00%	0 / 1	30
getSepNlConstraints	0.00% covered (danger)	0.00%	0 / 35	0.00% covered (danger)	0.00%	0 / 1	72
makeSeparator	0.00% covered (danger)	0.00%	0 / 64	0.00% covered (danger)	0.00%	0 / 1	702
mergeConstraints	0.00% covered (danger)	0.00%	0 / 15	0.00% covered (danger)	0.00%	0 / 1	6
debugOut	0.00% covered (danger)	0.00%	0 / 6	0.00% covered (danger)	0.00%	0 / 1	12
updateSeparatorConstraints	0.00% covered (danger)	0.00%	0 / 37	0.00% covered (danger)	0.00%	0 / 1	72
__construct	0.00% covered (danger)	0.00%	0 / 2	0.00% covered (danger)	0.00%	0 / 1	2
makeSepIndentPreSafe	0.00% covered (danger)	0.00%	0 / 60	0.00% covered (danger)	0.00%	0 / 1	1260
handleAutoInserted	0.00% covered (danger)	0.00%	0 / 9	0.00% covered (danger)	0.00%	0 / 1	20
fetchLeadingTrimmedSpace	0.00% covered (danger)	0.00%	0 / 31	0.00% covered (danger)	0.00%	0 / 1	342
fetchTrailingTrimmedSpace	0.00% covered (danger)	0.00%	0 / 27	0.00% covered (danger)	0.00%	0 / 1	240
recoverTrimmedWhitespace	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	5
buildSep	0.00% covered (danger)	0.00%	0 / 132	0.00% covered (danger)	0.00%	0 / 1	6480

1	<?php
2
3	declare( strict_types = 1 );
4
5	namespace Wikimedia\Parsoid\Html2Wt;
6
7	use Wikimedia\Assert\Assert;
8	use Wikimedia\Parsoid\Config\Env;
9	use Wikimedia\Parsoid\Core\DomSourceRange;
10	use Wikimedia\Parsoid\DOM\Comment;
11	use Wikimedia\Parsoid\DOM\DocumentFragment;
12	use Wikimedia\Parsoid\DOM\Element;
13	use Wikimedia\Parsoid\DOM\Node;
14	use Wikimedia\Parsoid\Html2Wt\DOMHandlers\DOMHandler;
15	use Wikimedia\Parsoid\Utils\DiffDOMUtils;
16	use Wikimedia\Parsoid\Utils\DOMCompat;
17	use Wikimedia\Parsoid\Utils\DOMDataUtils;
18	use Wikimedia\Parsoid\Utils\DOMUtils;
19	use Wikimedia\Parsoid\Utils\PHPUtils;
20	use Wikimedia\Parsoid\Utils\TokenUtils;
21	use Wikimedia\Parsoid\Utils\Utils;
22	use Wikimedia\Parsoid\Utils\WTUtils;
23	use Wikimedia\Parsoid\Wikitext\Consts;
24
25	class Separators {
26	/*
27	* This regexp looks for leading whitespace on the last line of a separator string.
28	* So, only comments (single or multi-line) or other newlines can precede that
29	* whitespace-of-interest. But, also account for any whitespace preceding newlines
30	* since that needs to be skipped over (Ex: " \n ").
	*
	* The pattern has three capture groups, which makeSepIndentPreSafe() reassembles:
	* group 1 is everything that precedes the whitespace-of-interest, group 2 is
	* the whitespace itself, and group 3 is the rest of that last line.
31	*/
32	private const INDENT_PRE_WS_IN_SEP_REGEXP =
33	'/^((?: \n\|(?:' . Utils::COMMENT_REGEXP_FRAGMENT . ')))( +)([^\n]*)$/D';
34
35	/**
	* Serializer state that the separator code reads (selser flags, SOL state,
	* pending separator constraints, original source) and updates.
	*
36	* @var SerializerState
37	*/
38	private $state;
39
40	/**
	* Environment; used in this class for logging.
	*
41	* @var Env
42	*/
43	private $env;
44
/**
 * Build a compact, log-friendly summary of a constraints array.
 *
 * Only the 'a', 'b', 'min' and 'max' entries are copied over (null when
 * absent). A non-empty 'constraintInfo' entry is reduced to its 'onSOL' and
 * 'sepType' values plus the node *names* of 'nodeA' and 'nodeB', so that
 * whole DOM nodes never end up in log files or test output.
 *
 * @param array $constraints
 * @return array
 */
private static function loggableConstraints( array $constraints ): array {
	$summary = [];
	foreach ( [ 'a', 'b', 'min', 'max' ] as $key ) {
		$summary[$key] = $constraints[$key] ?? null;
	}

	$info = $constraints['constraintInfo'] ?? null;
	if ( empty( $info ) ) {
		return $summary;
	}

	$summary['constraintInfo'] = [
		'onSOL' => $info['onSOL'] ?? false,
		'sepType' => $info['sepType'] ?? null,
		'nodeA' => DOMCompat::nodeName( $info['nodeA'] ),
		'nodeB' => DOMCompat::nodeName( $info['nodeB'] ),
	];
	return $summary;
}
70
/**
 * Add up the source length of the separator content (inter-element
 * whitespace and comments) found at $n and at all of its previous siblings.
 *
 * Rationale: because of the CSS white-space property ("pre", "pre-line"),
 * a sensible HTML editor has to preserve inter-element whitespace (IEW) to
 * preserve rendering, unless the user explicitly asks for reformatting
 * (e.g. pretty-printing raw source). We exploit that here. The result is
 * only used locally to extrapolate DSR values and to pull a separator
 * string out of the source, and the extracted text is verified to be a
 * valid separator. So the worst case is a local dirty diff around
 * separators, and the best case is a clean diff.
 *
 * @param Node $n
 * @return ?int Total length, or null as soon as a previous sibling is
 *   neither IEW nor a comment. The input node itself never causes a null
 *   return; it simply contributes nothing if it is not separator content.
 */
private static function precedingSeparatorTextLen( Node $n ): ?int {
	$total = 0;
	for ( $cur = $n; $cur; $cur = $cur->previousSibling ) {
		if ( DOMUtils::isIEW( $cur ) ) {
			$total += strlen( $cur->nodeValue );
			continue;
		}
		if ( $cur instanceof Comment ) {
			$total += WTUtils::decodedCommentLength( $cur );
			continue;
		}
		// Non-separator content: tolerated only for the input node itself
		if ( $cur !== $n ) {
			return null;
		}
	}

	return $total;
}
103
/**
 * Helper for updateSeparatorConstraints.
 *
 * Folds the newline requirements of nodeA ($aCons) and nodeB ($bCons) into
 * a single array with 'min', 'max' and an (empty) 'constraintInfo' entry.
 * Whenever the two sides contradict each other, the conflict is logged on
 * 'info/html2wt' and nodeB's requirement wins.
 *
 * @param Node $nodeA
 * @param array $aCons
 * @param Node $nodeB
 * @param array $bCons
 * @return array
 */
private function getSepNlConstraints(
	Node $nodeA, array $aCons, Node $nodeB, array $bCons
): array {
	$env = $this->state->getEnv();

	$min = $aCons['min'] ?? null;
	$max = $aCons['max'] ?? null;

	$bMin = $bCons['min'] ?? null;
	if ( $bMin !== null ) {
		if ( $max !== null && $max < $bMin ) {
			// Conflict, warn and let nodeB win.
			$env->log(
				'info/html2wt',
				'Incompatible constraints 1:',
				DOMCompat::nodeName( $nodeA ),
				DOMCompat::nodeName( $nodeB ),
				self::loggableConstraints(
					[ 'min' => $min, 'max' => $max, 'constraintInfo' => [] ]
				)
			);
			$min = $bMin;
			$max = $bMin;
		} else {
			$min = max( $min ?? 0, $bMin );
		}
	}

	$bMax = $bCons['max'] ?? null;
	if ( $bMax !== null ) {
		if ( ( $min ?? 0 ) > $bMax ) {
			// Conflict, warn and let nodeB win.
			$env->log(
				'info/html2wt',
				'Incompatible constraints 2:',
				DOMCompat::nodeName( $nodeA ),
				DOMCompat::nodeName( $nodeB ),
				self::loggableConstraints(
					[ 'min' => $min, 'max' => $max, 'constraintInfo' => [] ]
				)
			);
			$min = $bMax;
			$max = $bMax;
		} else {
			$max = min( $max ?? $bMax, $bMax );
		}
	}

	// More than two newlines would trigger paragraphs, so fall back to two
	// when no handler supplied a max. (FIXME: This is conservative; strictly
	// speaking it is not always true. It is a cautious fallback for DOM
	// handlers that are missing a necessary max constraint.)
	$max ??= 2;

	// The minimum always has the last word
	if ( ( $min ?? 0 ) > $max ) {
		$max = $min;
	}

	return [
		'min' => $min,
		'max' => $max,
		'constraintInfo' => [],
	];
}
176
177	/**
178	* Create a separator given a (potentially empty) separator text and newline constraints.
	*
	* The newlines in $sep are counted after comments and comment-only lines have
	* been split out. If that count is below the 'min' constraint (reduced by one
	* at start of output), the missing newlines are added before or after $sep.
	* If it is above the 'max' constraint, excess newlines are stripped from the
	* non-comment parts of $sep, unless the selser exemption checked below applies.
	* The outcome is logged on the 'debug/wts/sep' channel.
179	*
180	* @param Node $node Only consulted for the rendering-transparency check that can
	* exempt a separator from newline stripping
181	* @param string $sep Existing separator text (may be empty)
182	* @param array $nlConstraints May carry 'min', 'max' and 'constraintInfo'
	* (with 'sepType', 'nodeA', 'nodeB') entries
183	* @return string The adjusted separator
184	*/
185	private function makeSeparator( Node $node, string $sep, array $nlConstraints ): string {
186	$origSep = $sep;
187	$sepType = $nlConstraints['constraintInfo']['sepType'] ?? null;
188
189	// Split on comment/ws-only lines, consuming subsequent newlines since
190	// those lines are ignored by the PHP parser
191	// Ignore lines with ws and a single comment in them
192	$splitRe = implode( [ "#(?:\n(?:[ \t]*?",
193	Utils::COMMENT_REGEXP_FRAGMENT,
194	"[ \t]*?)+(?=\n))+\|",
195	Utils::COMMENT_REGEXP_FRAGMENT,
196	"#"
197	] );
	// Count only the newlines left over once the split-out pieces are removed
198	$sepNlCount = substr_count( implode( preg_split( $splitRe, $sep ) ), "\n" );
199	$minNls = $nlConstraints['min'] ?? 0;
200
201	if ( $this->state->atStartOfOutput && $minNls > 0 ) {
202	// Skip first newline as we are in start-of-line context
203	$minNls--;
204	}
205
206	if ( $minNls > 0 && $sepNlCount < $minNls ) {
207	// Append newlines
	// ($nlBuf holds exactly the number of newlines that are missing)
208	$nlBuf = [];
209	for ( $i = 0; $i < ( $minNls - $sepNlCount ); $i++ ) {
210	$nlBuf[] = "\n";
211	}
212
213	/* ------------------------------------------------------------------
214	* The following two heuristics try to do a best-guess on where to
215	* add the newlines relative to nodeA and nodeB that best matches
216	* wikitext output expectations.
217	*
218	* 1. In a parent-child separator scenario, where the first child of
219	* nodeA is not an element, it could have contributed to the separator.
220	* In that case, the newlines should be prepended because they
221	* usually correspond to the parent's constraints,
222	* and the separator was plucked from the child.
223	*
224	* Try html2wt on this snippet:
225	*
226	* a<p><!--cmt-->b</p>
227	*
228	* 2. In a sibling scenario, if nodeB is a literal-HTML element, nodeA is
229	* forcing the newline and hence the newline should be emitted right
230	* after it.
231	*
232	* Try html2wt on this snippet:
233	*
234	* <p>foo</p> <p data-parsoid='{"stx":"html"}'>bar</p>
235	* -------------------------------------------------------------------- */
236	$constraintInfo = $nlConstraints['constraintInfo'] ?? [];
237	$sepType = $constraintInfo['sepType'] ?? null;
238	$nodeA = $constraintInfo['nodeA'] ?? null;
239	$nodeB = $constraintInfo['nodeB'] ?? null;
240	if (
241	$sepType === 'parent-child' &&
242	!DiffDOMUtils::isContentNode( DiffDOMUtils::firstNonDeletedChild( $nodeA ) ) &&
243	!(
244	isset( Consts::$HTML['ChildTableTags'][DOMCompat::nodeName( $nodeB )] ) &&
245	!WTUtils::isLiteralHTMLNode( $nodeB )
246	)
247	) {
	// Heuristic 1: prepend
248	$sep = implode( $nlBuf ) . $sep;
249	} elseif ( $sepType === 'sibling' && WTUtils::isLiteralHTMLNode( $nodeB ) ) {
	// Heuristic 2: prepend
250	$sep = implode( $nlBuf ) . $sep;
251	} else {
	// Default: append
252	$sep .= implode( $nlBuf );
253	}
254	} elseif ( isset( $nlConstraints['max'] ) && $sepNlCount > $nlConstraints['max'] && (
255	// In selser mode, if the current node is an unmodified rendering-transparent node
256	// of a sibling pair, leave the separator alone since the excess newlines aren't
257	// going to change the semantics of how this node will be parsed in wt->html direction.
258	// This will instead eliminate a dirty diff on the page.
259	!$this->state->selserMode \|\|
260	$sepType !== 'sibling' \|\|
261	!$this->state->currNodeUnmodified \|\|
262	!WTUtils::isRenderingTransparentNode( $node )
263	) ) {
264	// Strip some newlines outside of comments.
265	//
266	// Capture separators in a single array with a capturing version of
267	// the split regexp, so that we can work on the non-separator bits
268	// when stripping newlines.
269	//
270	// Dirty-diff minimizing heuristic: Strip newlines away from an unmodified node.
271	// If both nodes are unmodified, this dirties the separator before the current node.
272	// If both nodes are modified, this dirties the separator after the previous node.
273	$allBits = preg_split( '#(' . PHPUtils::reStrip( $splitRe, '#' ) . ')#',
274	$sep, -1, PREG_SPLIT_DELIM_CAPTURE );
275	$newBits = [];
	// Number of excess newlines that still have to be removed
276	$n = $sepNlCount - $nlConstraints['max'];
277
	// Consume bits from the end of the separator when the previous node is
	// unmodified, and from the start otherwise
278	$stripAtEnd = $this->state->prevNodeUnmodified;
279	while ( $n > 0 ) {
280	$bit = $stripAtEnd ? array_pop( $allBits ) : array_shift( $allBits );
281	while ( $bit && preg_match( $splitRe, $bit ) ) {
282	// Retain comment-only lines as is
283	$newBits[] = $bit;
284	$bit = $stripAtEnd ? array_pop( $allBits ) : array_shift( $allBits );
285	}
286	// @phan-suppress-next-line PhanPluginLoopVariableReuse
287	while ( $n > 0 && str_contains( $bit, "\n" ) ) {
	// Drop the first newline of this bit, keeping the text that follows it
288	$bit = preg_replace( '/\n([^\n]*)/', '$1', $bit, 1 );
289	$n--;
290	}
291	$newBits[] = $bit;
292	}
	// Reassemble: bits taken from the end were collected back-to-front, so
	// they are reversed and placed after the untouched remainder; bits taken
	// from the start are followed by the untouched remainder.
293	if ( $stripAtEnd ) {
294	$newBits = array_merge( $allBits, array_reverse( $newBits ) );
295	} else {
296	PHPUtils::pushArray( $newBits, $allBits );
297	}
298	$sep = implode( $newBits );
299	}
300
301	$this->state->getEnv()->log(
302	'debug/wts/sep',
303	'make-new \|',
304	static function () use ( $nlConstraints, $sepNlCount, $minNls, $sep, $origSep ) {
	// Log a copy of the constraints without the 'constraintInfo' entry
305	$constraints = Utils::clone( $nlConstraints, true, true );
306	unset( $constraints['constraintInfo'] );
307	return PHPUtils::jsonEncode( $sep ) . ', ' . PHPUtils::jsonEncode( $origSep ) . ', ' .
308	$minNls . ', ' . $sepNlCount . ', ' . PHPUtils::jsonEncode( $constraints );
309	}
310	);
311
312	return $sep;
313	}
314
/**
 * Merge two constraints.
 *
 * The merged minimum is the larger of the two minimums (a missing one counts
 * as 0) and the merged maximum is the smaller of the two maximums (a missing
 * one counts as 2). If that leaves min above max, max is raised to min and
 * the incompatibility is logged on 'info/html2wt'. The returned array always
 * carries an empty 'constraintInfo' entry.
 *
 * @param Env $env
 * @param array $oldConstraints
 * @param array $newConstraints
 * @return array
 */
private static function mergeConstraints(
	Env $env, array $oldConstraints, array $newConstraints
): array {
	$lower = max( $oldConstraints['min'] ?? 0, $newConstraints['min'] ?? 0 );
	$upper = min( $oldConstraints['max'] ?? 2, $newConstraints['max'] ?? 2 );
	$conflict = $lower > $upper;

	$merged = [
		'min' => $lower,
		'max' => $conflict ? $lower : $upper,
		'constraintInfo' => [],
	];

	if ( $conflict ) {
		$env->log(
			'info/html2wt',
			'Incompatible constraints (merge):',
			$merged,
			self::loggableConstraints( $oldConstraints ),
			self::loggableConstraints( $newConstraints )
		);
	}

	return $merged;
}
344
/**
 * Short, JSON-encoded description of a node for debug logs.
 *
 * Elements are described by their outer HTML; everything else (and any
 * element whose outer HTML is falsy) by its nodeValue. The JSON encoding of
 * that value is cut off after 40 characters.
 *
 * @param Node $node
 * @return string
 */
public static function debugOut( Node $node ): string {
	$html = $node instanceof Element ? DOMCompat::getOuterHTML( $node ) : '';
	$described = $html ?: $node->nodeValue;
	return mb_substr( PHPUtils::jsonEncode( $described ), 0, 40 );
}
355
356	/**
357	* Figure out separator constraints and merge them with existing constraints
358	* in state so that they can be emitted when the next content emits source.
	*
	* The relationship between the two nodes selects which handler callbacks are
	* consulted: firstChild/before when nodeA is the parent of nodeB,
	* after/lastChild when nodeB is the parent of nodeA, and after/before for
	* any other pair (treated as siblings). The before/after callbacks are only
	* invoked for Element nodes; other nodes contribute no constraints.
359	*
360	* @param Node $nodeA
361	* @param DOMHandler $sepHandlerA
362	* @param Node $nodeB
363	* @param DOMHandler $sepHandlerB
364	*/
365	public function updateSeparatorConstraints(
366	Node $nodeA, DOMHandler $sepHandlerA, Node $nodeB, DOMHandler $sepHandlerB
367	): void {
368	$state = $this->state;
369
370	if ( $nodeB->parentNode === $nodeA ) {
371	// parent-child separator, nodeA parent of nodeB
372	'@phan-var Element\|DocumentFragment $nodeA'; // @var Element\|DocumentFragment $nodeA
373	$sepType = 'parent-child';
374	$aCons = $sepHandlerA->firstChild( $nodeA, $nodeB, $state );
375	$bCons = $nodeB instanceof Element ? $sepHandlerB->before( $nodeB, $nodeA, $state ) : [];
376	} elseif ( $nodeA->parentNode === $nodeB ) {
377	// parent-child separator, nodeB parent of nodeA
378	'@phan-var Element\|DocumentFragment $nodeB'; // @var Element\|DocumentFragment $nodeB
379	$sepType = 'child-parent';
380	$aCons = $nodeA instanceof Element ? $sepHandlerA->after( $nodeA, $nodeB, $state ) : [];
381	$bCons = $sepHandlerB->lastChild( $nodeB, $nodeA, $state );
382	} else {
383	// sibling separator
384	$sepType = 'sibling';
385	$aCons = $nodeA instanceof Element ? $sepHandlerA->after( $nodeA, $nodeB, $state ) : [];
386	$bCons = $nodeB instanceof Element ? $sepHandlerB->before( $nodeB, $nodeA, $state ) : [];
387	}
	// Reduce both sides' requirements to a single min/max pair
388	$nlConstraints = $this->getSepNlConstraints( $nodeA, $aCons, $nodeB, $bCons );
389
390	if ( !empty( $state->sep->constraints ) ) {
391	// Merge the constraints
392	$state->sep->constraints = self::mergeConstraints(
393	$this->env,
394	$state->sep->constraints,
395	$nlConstraints
396	);
397	} else {
398	$state->sep->constraints = $nlConstraints;
399	}
400
401	$this->env->log(
402	'debug/wts/sep',
403	function () use ( $sepType, $nodeA, $nodeB, $state ) {
404	return 'constraint' . ' \| ' .
405	$sepType . ' \| ' .
406	'<' . DOMCompat::nodeName( $nodeA ) . ',' . DOMCompat::nodeName( $nodeB ) .
407	'>' . ' \| ' . PHPUtils::jsonEncode( $state->sep->constraints ) . ' \| ' .
408	self::debugOut( $nodeA ) . ' \| ' . self::debugOut( $nodeB );
409	}
410	);
411
	// Record the context of this (latest) node pair alongside the constraints;
	// any 'constraintInfo' from an earlier call is replaced.
412	$state->sep->constraints['constraintInfo'] = [
413	'onSOL' => $state->onSOL,
414	// force SOL state when separator is built/emitted
415	'forceSOL' => $sepHandlerB->forceSOL(),
416	'sepType' => $sepType,
417	'nodeA' => $nodeA,
418	'nodeB' => $nodeB,
419	];
420	}
421
/**
 * @param Env $env Environment, kept for logging
 * @param SerializerState $state Serializer state that separators are computed against
 */
public function __construct( Env $env, SerializerState $state ) {
	$this->state = $state;
	$this->env = $env;
}
426
427	private function makeSepIndentPreSafe(
428	string $sep, array $nlConstraints
429	): string {
	// Purpose: return $sep, rewritten if necessary so that whitespace at the
	// start of its last line cannot trigger an indent-pre in wikitext.
	//
	// $nlConstraints['constraintInfo'] supplies the context ('sepType',
	// 'nodeA', 'nodeB', 'onSOL', 'forceSOL'). When a rewrite is needed, the
	// offending whitespace is either dropped or wrapped in <nowiki>; in the
	// <nowiki> case $state->onSOL and $state->hasIndentPreNowikis are updated.
	// The result is logged on the 'debug/wts/sep' channel.
430	$state = $this->state;
431	$constraintInfo = $nlConstraints['constraintInfo'] ?? [];
432	$sepType = $constraintInfo['sepType'] ?? null;
433	$nodeA = $constraintInfo['nodeA'] ?? null;
434	$nodeB = $constraintInfo['nodeB'] ?? null;
	// forceSOL is not honoured for child-parent separators
435	$forceSOL = ( $constraintInfo['forceSOL'] ?? false ) && $sepType !== 'child-parent';
	// $nodeB may be advanced along its sibling chain below; keep the original around
436	$origNodeB = $nodeB;
437
438	// Ex: "<div>foo</div>\n <span>bar</span>"
439	//
440	// We also should test for onSOL state to deal with HTML like
441	// <ul> <li>foo</li></ul>
442	// and strip the leading space before non-indent-pre-safe tags
443	if (
444	!$state->inPHPBlock &&
445	!$state->inIndentPre &&
446	preg_match( self::INDENT_PRE_WS_IN_SEP_REGEXP, $sep ) && (
447	str_contains( $sep, "\n" ) \|\| !empty( $constraintInfo['onSOL'] ) \|\| $forceSOL
448	)
449	) {
450	// 'sep' is the separator before 'nodeB' and it has leading spaces on a newline.
451	// We have to decide whether that leading space will trigger indent-pres in wikitext.
452	// The decision depends on where this separator will be emitted relative
453	// to 'nodeA' and 'nodeB'.
454
455	$isIndentPreSafe = false;
456
457	// Example sepType scenarios:
458	//
459	// 1. sibling
460	// <div>foo</div>
461	// <span>bar</span>
462	// The span will be wrapped in an indent-pre if the leading space
463	// is not stripped since span is not a block tag
464	//
465	// 2. child-parent
466	// <span>foo
467	// </span>bar
468	// The " </span>bar" will be wrapped in an indent-pre if the
469	// leading space is not stripped since span is not a block tag
470	//
471	// 3. parent-child
472	// <div>foo
473	// <span>bar</span>
474	// </div>
475	//
476	// In all cases, only block-tags prevent indent-pres.
477	// (except for a special case for <br> nodes)
478	if ( $nodeB && WTSUtils::precedingSpaceSuppressesIndentPre( $nodeB, $origNodeB ) ) {
479	$isIndentPreSafe = true;
480	} elseif ( $sepType === 'sibling' \|\| ( $nodeA && DOMUtils::atTheTop( $nodeA ) ) ) {
481	Assert::invariant( !DOMUtils::atTheTop( $nodeA ) \|\| $sepType === 'parent-child', __METHOD__ );
482
483	// 'nodeB' is the first non-separator child of 'nodeA'.
484	//
485	// Walk past sol-transparent nodes in the right-sibling chain
486	// of 'nodeB' till we establish indent-pre safety.
487	while ( $nodeB &&
488	( DiffUtils::isDiffMarker( $nodeB ) \|\| WTUtils::emitsSolTransparentSingleLineWT( $nodeB ) )
489	) {
490	$nodeB = $nodeB->nextSibling;
491	}
492
	// Running off the end of the sibling chain counts as safe
493	$isIndentPreSafe = !$nodeB \|\| WTSUtils::precedingSpaceSuppressesIndentPre( $nodeB, $origNodeB );
494	}
495
496	// Check whether nodeB is nested inside an element that suppresses
497	// indent-pres.
498	if ( $nodeB && !$isIndentPreSafe && !DOMUtils::atTheTop( $nodeB ) ) {
499	$parentB = $nodeB->parentNode; // could be nodeA
500	while ( WTUtils::isZeroWidthWikitextElt( $parentB ) ) {
501	$parentB = $parentB->parentNode;
502	}
503
504	// The token stream paragraph wrapper (and legacy doBlockLevels)
505	// tracks this separately with $inBlockquote
506	$isIndentPreSafe = DOMUtils::hasNameOrHasAncestorOfName(
507	$parentB, 'blockquote'
508	);
509
510	// First scope wins
511	while ( !$isIndentPreSafe && !DOMUtils::atTheTop( $parentB ) ) {
512	if (
513	TokenUtils::tagOpensBlockScope( DOMCompat::nodeName( $parentB ) ) &&
514	// Only html p-tag is indent pre suppressing
515	( DOMCompat::nodeName( $parentB ) !== 'p' \|\| WTUtils::isLiteralHTMLNode( $parentB ) )
516	) {
517	$isIndentPreSafe = true;
518	break;
519	} elseif ( TokenUtils::tagClosesBlockScope( DOMCompat::nodeName( $parentB ) ) ) {
520	break;
521	}
522	$parentB = $parentB->parentNode;
523	}
524	}
525
	// In SOL context, a non-literal-HTML nodeB whose tag requires SOL context
	// gets the leading space removed outright rather than nowiki-wrapped
526	$stripLeadingSpace = ( !empty( $constraintInfo['onSOL'] ) \|\| $forceSOL ) &&
527	$nodeB && !WTUtils::isLiteralHTMLNode( $nodeB ) &&
528	isset( Consts::$HTMLTagsRequiringSOLContext[DOMCompat::nodeName( $nodeB )] );
529	if ( !$isIndentPreSafe \|\| $stripLeadingSpace ) {
530	// Wrap non-nl ws from last line, but preserve comments.
531	// This avoids triggering indent-pres.
532	$sep = preg_replace_callback(
533	self::INDENT_PRE_WS_IN_SEP_REGEXP,
534	static function ( $matches ) use ( $stripLeadingSpace, $state ) {
535	if ( !$stripLeadingSpace ) {
536	// Since we nowiki-ed, we are no longer in sol state
537	$state->onSOL = false;
538	$state->hasIndentPreNowikis = true;
539	$space = '<nowiki>' . $matches[2] . '</nowiki>';
540	}
	// When stripping, $space is never set and the whitespace is dropped
541	return ( $matches[1] ?? '' ) . ( $space ?? '' ) . ( $matches[3] ?? '' );
542	},
543	$sep
544	);
545	}
546	}
547
548	$state->getEnv()->log(
549	'debug/wts/sep',
550	'ipre-safe \|',
551	static function () use ( $sep, $nlConstraints ) {
	// Log a copy of the constraints without the 'constraintInfo' entry
552	$constraints = Utils::clone( $nlConstraints, true, true );
553	unset( $constraints['constraintInfo'] );
554	return PHPUtils::jsonEncode( $sep ) . ', ' . PHPUtils::jsonEncode( $constraints );
555	}
556	);
557
558	return $sep;
559	}
560
/**
 * Serializing auto inserted content should invalidate the original separator.
 *
 * Hands back a copy of the node's DSR in which the open width is nulled out
 * if the start tag was auto-inserted and the close width is nulled out if
 * the end tag was auto-inserted. The DSR stored on the node is not modified.
 *
 * @param Element $node
 * @return DomSourceRange|null null when the node carries no DSR
 */
private static function handleAutoInserted( Element $node ): ?DomSourceRange {
	$dataParsoid = DOMDataUtils::getDataParsoid( $node );
	if ( !isset( $dataParsoid->dsr ) ) {
		return null;
	}

	$adjusted = clone $dataParsoid->dsr;
	if ( !empty( $dataParsoid->autoInsertedStart ) ) {
		$adjusted->openWidth = null;
	}
	if ( !empty( $dataParsoid->autoInsertedEnd ) ) {
		$adjusted->closeWidth = null;
	}

	return $adjusted;
}
581
582	/**
583	* $node is embedded inside a parent node that has its leading/trailing whitespace trimmed
584	* in the wt->html direction. In this method, we attempt to recover leading trimmed whitespace
585	* using DSR information on $node.
586	*
587	* In some cases, $node might have an additional "data-mw-selser-wrapper" span
588	* that is added by SelSer - look past those wrappers.
589	*
590	* The recovery is attempted in two different ways:
591	* 1. If we have additional DSR fields about leading/trailing WS
592	* (represented by $state->haveTrimmedWsDSR), that info is used.
593	* 2. If not, we simply inspect source at $dsr->innerStart and if it
594	* happens to be whitespace, we use that.
595	*
596	* @param Node $node
597	* @return ?string The recovered spaces/tabs (which may be an empty string in
	* case 1 above), or null when nothing can be recovered
598	*/
599	private function fetchLeadingTrimmedSpace( Node $node ): ?string {
600	$origNode = $node;
601	$parentNode = $node->parentNode;
602
603	// Skip past the artificial span wrapper
604	if ( $parentNode instanceof Element && $parentNode->hasAttribute( 'data-mw-selser-wrapper' ) ) {
605	$node = $parentNode;
606	$parentNode = $parentNode->parentNode;
607	}
608
609	// Leading trimmed whitespace only makes sense for first child.
610	// Ignore comments (which are part of separators) + deletion markers.
611	if ( DiffDOMUtils::previousNonSepSibling( $node ) ) {
612	return null;
613	}
614
	// Only proceed if the parent's tag has trimmable whitespace and, for
	// non-element nodes, the node does not already start with a space or tab
615	'@phan-var Element\|DocumentFragment $parentNode'; // @var Element\|DocumentFragment $parentNode
616	if ( isset( Consts::$WikitextTagsWithTrimmableWS[DOMCompat::nodeName( $parentNode )] ) &&
617	( $origNode instanceof Element \|\| !preg_match( '/^[ \t]/', $origNode->nodeValue ) )
618	) {
619	// Don't reintroduce whitespace that's already been captured as a DisplaySpace
620	if ( DOMUtils::hasTypeOf( $origNode, 'mw:DisplaySpace' ) ) {
621	return null;
622	}
623
624	// FIXME: Is this complexity worth some minor dirty diff on this test?
625	// ParserTest: "3. List embedded in a formatting tag in a misnested way"
626	// I've not added an equivalent check in the trailing whitespace case.
627	if ( $origNode instanceof Element &&
628	isset( DOMDataUtils::getDataParsoid( $origNode )->autoInsertedStart ) &&
629	strspn( $origNode->firstChild->textContent ?? '', " \t" ) >= 1
630	) {
631	return null;
632	}
633
634	$state = $this->state;
635	$dsr = DOMDataUtils::getDataParsoid( $parentNode )->dsr ?? null;
636	if ( Utils::isValidDSR( $dsr, true ) ) {
637	if (
638	$state->haveTrimmedWsDSR &&
639	$dsr->hasTrimmedWS() &&
640	$dsr->hasValidLeadingWS()
641	) {
	// Case 1: the DSR records how much leading whitespace was trimmed
642	if ( preg_match(
643	'/^([ \t]*)/',
644	$state->getOrigSrc( $dsr->innerRange() ) ?? '',
645	$matches
646	) ) {
647	// $matches[1] is just spaces and tabs
	// Return at most $dsr->leadingWS characters of it
648	return substr( $matches[1], 0, $dsr->leadingWS );
649	}
650	} elseif ( $dsr->innerStart() < $dsr->innerEnd() ) {
	// Case 2: no usable trimmed-WS info, and the inner range is non-empty
651	$sep = $state->getOrigSrc( $dsr->innerRange() ) ?? '';
652	// return first character of inner range iff it is
653	// tab or space
654	return preg_match( '/^[ \t]/', $sep ) ? $sep[0] : null;
655	}
656	}
657	}
658
659	return null;
660	}
661
662	/**
663	* $node is embedded inside a parent node that has its leading/trailing whitespace trimmed
664	* in the wt->html direction. In this method, we attempt to recover trailing trimmed whitespace
665	* using DSR information on $node.
666	*
667	* In some cases, $node might have an additional "data-mw-selser-wrapper" span
668	* that is added by SelSer - look past those wrappers.
669	*
670	* The recovery is attempted in two different ways:
671	* 1. If we have additional DSR fields about leading/trailing WS
672	* (represented by $state->haveTrimmedWsDSR), that info is used.
673	* 2. If not, we simply inspect source at $dsr->innerEnd and if it
674	* happens to be whitespace, we use that.
675	*
676	* @param Node $node
677	* @return ?string The recovered spaces/tabs (which may be an empty string in
	* case 1 above), or null when nothing can be recovered
678	*/
679	private function fetchTrailingTrimmedSpace( Node $node ): ?string {
680	$origNode = $node;
681	$parentNode = $node->parentNode;
682
683	// Skip past the artificial span wrapper
684	if ( $parentNode instanceof Element && $parentNode->hasAttribute( 'data-mw-selser-wrapper' ) ) {
685	$node = $parentNode;
686	$parentNode = $parentNode->parentNode;
687	}
688
689	// Trailing trimmed whitespace only makes sense for last child.
690	// Ignore comments (which are part of separators) + deletion markers.
691	if ( DiffDOMUtils::nextNonSepSibling( $node ) ) {
692	return null;
693	}
694
	// Only proceed if the parent's tag has trimmable whitespace and, for
	// non-element nodes, the node does not already end with a space or tab
695	'@phan-var Element\|DocumentFragment $parentNode'; // @var Element\|DocumentFragment $parentNode
696	if ( isset( Consts::$WikitextTagsWithTrimmableWS[DOMCompat::nodeName( $parentNode )] ) &&
697	( $origNode instanceof Element \|\| !preg_match( '/[ \t]$/', $origNode->nodeValue ) )
698	) {
699	// Don't reintroduce whitespace that's already been captured as a DisplaySpace
700	if ( DOMUtils::hasTypeOf( $origNode, 'mw:DisplaySpace' ) ) {
701	return null;
702	}
703
704	$state = $this->state;
705	$dsr = DOMDataUtils::getDataParsoid( $parentNode )->dsr ?? null;
706	if ( Utils::isValidDSR( $dsr, true ) ) {
707	if (
708	$state->haveTrimmedWsDSR &&
709	$dsr->hasTrimmedWS() &&
710	$dsr->hasValidTrailingWS()
711	) {
	// Case 1: the DSR records how much trailing whitespace was trimmed
712	if ( preg_match(
713	'/([ \t]*)$/',
714	$state->getOrigSrc( $dsr->innerRange() ) ?? '',
715	$matches
716	) ) {
717	// $matches[1] is just spaces and tabs
718	// note that trailingWS can be zero
	// (in which case the offset equals the length and substr yields '')
719	return substr( $matches[1], strlen( $matches[1] ) - $dsr->trailingWS );
720	}
721	} elseif ( ( $dsr->innerEnd() - 1 ) > $dsr->innerStart() ) {
722	// The > instead of >= in the test above is to
723	// deal with an edge case where that single space
724	// is captured by the leading-space case
	// (presumably fetchLeadingTrimmedSpace; verify)
725	$sep = $state->getOrigSrc( $dsr->innerRange() ) ?? '';
726	// Return last character of $sep iff it is space or tab
727	return preg_match( '/[ \t]$/', $sep ) ? substr( $sep, -1 ) : null;
728	}
729	}
730	}
731
732	return null;
733	}
734
/**
 * Try to recover whitespace that was trimmed in the wt->html direction.
 *
 * Recovery is only attempted when whitespace heuristics are enabled and we
 * are in selser mode; in every other situation null is returned.
 *
 * @param Node $node
 * @param bool $leading
 *   if true, leading whitespace trimmed before $node is looked up
 *   if false, trailing whitespace trimmed after the last non-deleted
 *   child of $node is looked up
 * @return string|null
 */
public function recoverTrimmedWhitespace( Node $node, bool $leading ): ?string {
	$state = $this->state;
	if ( !$state->useWhitespaceHeuristics || !$state->selserMode ) {
		return null;
	}

	if ( $leading ) {
		return $this->fetchLeadingTrimmedSpace( $node );
	}

	$lastChild = DiffDOMUtils::lastNonDeletedChild( $node );
	if ( !$lastChild ) {
		return null;
	}
	return $this->fetchTrailingTrimmedSpace( $lastChild );
}
758
759	/**
760	* Emit a separator based on the collected (and merged) constraints
761	* and existing separator text. Called when new output is triggered.
762	* @param Node $node
763	* @return string\|null
764	*/
765	public function buildSep( Node $node ): ?string {
766	$state = $this->state;
767	$sepType = $state->sep->constraints['constraintInfo']['sepType'] ?? null;
768	$sep = null;
769	$origNode = $node;
770	$prevNode = $state->sep->lastSourceNode;
771	$dsrA = null;
772	$dsrB = null;
773
774	/* ----------------------------------------------------------------------
775	* Assuming we have access to the original source, we can use DSR offsets
776	* to extract separators from source only if:
777	* - we are in selser mode AND
778	* - this node is not part of a newly inserted subtree (marked 'modified')
779	* for which DSR isn't available
780	* - neither node is adjacent to a deleted block node
781	* (see the long comment in SerializerState::emitChunk in the middle)
782	*
783	* In other scenarios, DSR values on "adjacent" nodes in the edited DOM
784	* may not reflect deleted content between them.
785	* ---------------------------------------------------------------------- */
786	$origSepNeeded = $node !== $prevNode && $state->selserMode;
787	$origSepNeededAndUsable =
788	$origSepNeeded && !$state->inInsertedContent &&
789	!WTSUtils::nextToDeletedBlockNodeInWT( $prevNode, true ) &&
790	!WTSUtils::nextToDeletedBlockNodeInWT( $node, false ) &&
791	WTSUtils::origSrcValidInEditedContext( $state, $prevNode ) &&
792	WTSUtils::origSrcValidInEditedContext( $state, $node );
793
794	if ( $origSepNeededAndUsable ) {
795	if ( $prevNode instanceof Element ) {
796	$dsrA = self::handleAutoInserted( $prevNode );
797	} elseif ( !( $prevNode instanceof DocumentFragment ) ) {
798	// Check if $prevNode is the last child of a zero-width element,
799	// and use that for dsr purposes instead. Typical case: text in p.
800	if (
801	!$prevNode->nextSibling &&
802	$prevNode->parentNode !== $node &&
803	$prevNode->parentNode instanceof Element &&
804	( DOMDataUtils::getDataParsoid( $prevNode->parentNode )->dsr->closeWidth ?? null ) === 0
805	) {
806	$dsrA = self::handleAutoInserted( $prevNode->parentNode );
807	} elseif (
808	// Can we extrapolate DSR from $prevNode->previousSibling?
809	// Yes, if $prevNode->parentNode didn't have its children edited.
810	$prevNode->previousSibling instanceof Element &&
811	!DiffUtils::directChildrenChanged( $prevNode->parentNode )
812	) {
813	$endDsr = DOMDataUtils::getDataParsoid( $prevNode->previousSibling )->dsr->end ?? null;
814	$correction = null;
815	if ( is_int( $endDsr ) ) {
816	if ( $prevNode instanceof Comment ) {
817	'@phan-var Comment $prevNode'; // @var Comment $prevNode
818	$correction = WTUtils::decodedCommentLength( $prevNode );
819	} else {
820	$correction = strlen( $prevNode->nodeValue );
821	}
822	$dsrA = new DomSourceRange(
823	$endDsr,
824	$endDsr + $correction + WTUtils::indentPreDSRCorrection( $prevNode ),
825	0,
826	0
827	);
828	}
829	}
830	}
831
832	if ( !$dsrA ) {
833	// nothing to do -- no reason to compute dsrB if dsrA is null
834	} elseif ( $node instanceof Element ) {
835	// $node is parent of $prevNode
836	if ( $prevNode->parentNode === $node ) {
837	'@phan-var Element\|DocumentFragment $node'; // @var Element\|DocumentFragment $node
838	// FIXME: Maybe we shouldn't set dsr in the dsr pass if both aren't valid?
839	//
840	// When we are in the lastChild sep scenario and the parent doesn't have
841	// useable dsr, if possible, walk up the ancestor nodes till we find
842	// a dsr-bearing node
843	//
844	// This fix is needed to handle trailing newlines in this wikitext:
845	// [[File:foo.jpg\|thumb\|300px\|foo\n{{1x\|A}}\n{{1x\|B}}\n{{1x\|C}}\n\n]]
846	while (
847	!$node->nextSibling &&
848	!DOMUtils::atTheTop( $node ) &&
849	(
850	empty( DOMDataUtils::getDataParsoid( $node )->dsr ) \|\|
851	DOMDataUtils::getDataParsoid( $node )->dsr->start === null \|\|
852	DOMDataUtils::getDataParsoid( $node )->dsr->end === null
853	)
854	) {
855	$node = $node->parentNode;
856	}
857	}
858
859	// The top node could be a document fragment
860	$dsrB = $node instanceof Element ? self::handleAutoInserted( $node ) : null;
861	} elseif ( !( $node instanceof DocumentFragment ) ) {
862	// $node is text/comment. Can we extrapolate DSR from $node->parentNode?
863	// Yes, if this is the child of a zero-width element and
864	// is only preceded by separator elements.
865	//
866	// 1. text in p.
867	// 2. ws-only child of a node with auto-inserted start tag
868	// Ex: "<span> <s>x</span> </s>" --> <span> <s>x</s></span><s> </s>
869	// 3. ws-only children of a node with auto-inserted start tag
870	// Ex: "{\|\n\|-\n <!--foo--> \n\|}"
871	$nodeParent = $node->parentNode;
872	// phpcs:ignore Generic.Files.LineLength.TooLong
873	'@phan-var Element\|DocumentFragment $nodeParent'; // @var Element\|DocumentFragment $nodeParent
874
875	if (
876	$nodeParent !== $prevNode &&
877	$nodeParent instanceof Element &&
878	( DOMDataUtils::getDataParsoid( $nodeParent )->dsr->openWidth ?? null ) === 0
879	) {
880	$sepLen = self::precedingSeparatorTextLen( $node );
881	if ( $sepLen !== null ) {
882	$dsrB = DOMDataUtils::getDataParsoid( $nodeParent )->dsr;
883	if ( is_int( $dsrB->start ) && $sepLen > 0 ) {
884	$dsrB = clone $dsrB;
885	$dsrB->start += $sepLen;
886	}
887	}
888	}
889	}
890
891	// FIXME: Maybe we shouldn't set dsr in the dsr pass if both aren't valid?
892	// NOTE: Synthetic DSR ranges
893	// may not necessarily have offsets that correspond to valid
894	// UTF-8 characters. So use $state->isValidDSR() to ensure that
895	// all offsets land on valid UTF-8 characters before trying to
896	// construct substrings based on relations between them.
897	if (
898	$state->isValidDSR( $dsrA ) &&
899	$state->isValidDSR( $dsrB )
900	) {
901	// Figure out containment relationship
902	if ( $dsrA->start <= $dsrB->start ) {
903	if ( $dsrB->end <= $dsrA->end ) {
904	if ( $dsrA->start === $dsrB->start && $dsrA->end === $dsrB->end ) {
905	// Both have the same dsr range, so there can't be any
906	// separators between them
907	$sep = '';
908	} elseif ( isset( $dsrA->openWidth ) && $state->isValidDSR( $dsrA, true ) ) {
909	// B in A, from parent to child
910	$sep = $state->getOrigSrc( $dsrA->openRange()->to( $dsrB ) );
911	}
912	} elseif ( $dsrA->end <= $dsrB->start ) {
913	// B following A (siblingish)
914	$sep = $state->getOrigSrc( $dsrA->to( $dsrB ) );
915	} elseif ( isset( $dsrB->closeWidth ) && $state->isValidDSR( $dsrB, true ) ) {
916	// A in B, from child to parent
917	$sep = $state->getOrigSrc( $dsrA->to( $dsrB->closeRange() ) );
918	}
919	} elseif ( $dsrA->end <= $dsrB->end ) {
920	if ( isset( $dsrB->closeWidth ) && $state->isValidDSR( $dsrB, true ) ) {
921	// A in B, from child to parent
922	$sep = $state->getOrigSrc( $dsrA->to( $dsrB->closeRange() ) );
923	}
924	} else {
925	$this->env->log( 'info/html2wt', 'dsr backwards: should not happen!' );
926	}
927
928	// Reset if $sep is invalid
929	if ( $sep && !WTSUtils::isValidSep( $sep ) ) {
930	$sep = null;
931	}
932	}
933	} elseif ( $origSepNeeded && !DiffUtils::hasDiffMarkers( $prevNode ) ) {
934	// Given the following conditions:
935	// - $prevNode has no diff markers. (checked above)
936	// - $prevNode's next non-sep sibling ($next) was inserted.
937	// - $next is an ancestor of $node.
938	// - all of those ancestor nodes from $node->$next have zero-width
939	// wikitext (otherwise, the separator isn't usable)
940	// Try to extract a separator from original source that existed
941	// between $prevNode and its original next sibling or its parent
942	// (if $prevNode was the last non-sep child).
943	//
944	// This minimizes dirty-diffs to that separator text from
945	// the insertion of $next after $prevNode.
946	$next = DiffDOMUtils::nextNonSepSibling( $prevNode );
947	$origSepUsable = $next && DiffUtils::hasInsertedDiffMark( $next );
948
949	// Check that $next is an ancestor of $node and all nodes
950	// on that path have zero-width wikitext
951	if ( $origSepUsable && $node !== $next ) {
952	$n = $node->parentNode;
953	while ( $n && $next !== $n ) {
954	if ( !WTUtils::isZeroWidthWikitextElt( $n ) ) {
955	$origSepUsable = false;
956	break;
957	}
958	$n = $n->parentNode;
959	}
960	$origSepUsable = $origSepUsable && $n !== null;
961	}
962
963	// Extract separator from original source if possible
964	if ( $origSepUsable ) {
965	$origNext = DiffDOMUtils::nextNonSepSibling( $next );
966	if ( !$origNext ) { // $prevNode was last non-sep child of its parent
967	// We could work harder for text/comments and extrapolate, but skipping that here
968	// FIXME: If we had a generic DSR extrapolation utility, that would be useful
969	$o1 = $prevNode instanceof Element ?
970	DOMDataUtils::getDataParsoid( $prevNode )->dsr ?? null : null;
971	if ( $o1 !== null ) {
972	$dsr2 = DOMDataUtils::getDataParsoid( $prevNode->parentNode )->dsr ?? null;
973	$sep = $dsr2 !== null ? $state->getOrigSrc( $o1->to( $dsr2->closeRange() ) ) : null;
974	}
975	} elseif ( !DiffUtils::hasDiffMarkers( $origNext ) ) {
976	// We could work harder for text/comments and extrapolate, but skipping that here
977	// FIXME: If we had a generic DSR extrapolation utility, that would be useful
978	$o1 = $prevNode instanceof Element ?
979	DOMDataUtils::getDataParsoid( $prevNode )->dsr ?? null : null;
980	if ( $o1 !== null ) {
981	$o2 = $origNext instanceof Element ?
982	DOMDataUtils::getDataParsoid( $origNext )->dsr ?? null : null;
983	$sep = $o2 !== null ? $state->getOrigSrc( $o1->to( $o2 ) ) : null;
984	}
985	}
986
987	if ( $sep !== null ) {
988	// Since this is an inserted node, we might have to augment this
989	// with newline constraints, so we store the recovered sep as the
990	// buffered sep in $state->sep->src and reset $sep.
991	$state->sep->src = $sep;
992	$sep = null;
993	}
994	}
995	}
996
997	// If all efforts failed, use special-purpose heuristics to recover
998	// trimmed leading / trailing whitespace from lists, headings, table-cells
999	if ( $sep === null ) {
1000	if ( $sepType === 'parent-child' ) {
1001	$sep = $this->recoverTrimmedWhitespace( $node, true );
1002	$state->sep->src = ( $sep ?? '' ) . $state->sep->src;
1003	} elseif ( $sepType === 'child-parent' ) {
1004	$sep = $this->recoverTrimmedWhitespace( $node, false );
1005	$state->sep->src .= $sep ?? '';
1006	}
1007	}
1008
1009	$this->env->log(
1010	'debug/wts/sep',
1011	static function () use ( $prevNode, $origNode, $sep, $state ) {
1012	return 'maybe-sep \| ' .
1013	'prev:' . ( $prevNode ? DOMCompat::nodeName( $prevNode ) : '--none--' ) .
1014	', node:' . DOMCompat::nodeName( $origNode ) .
1015	', sep: ' . PHPUtils::jsonEncode( $sep ) .
1016	', state.sep.src: ' . PHPUtils::jsonEncode( $state->sep->src ?? null );
1017	}
1018	);
1019
1020	// If the separator is being emitted before a node that emits sol-transparent WT,
1021	// go through makeSeparator to verify indent-pre constraints are met.
1022	$sepConstraints = $state->sep->constraints ?? [ 'max' => 0 ];
1023	if ( $sep === null \|\| ( $state->sep->src && $state->sep->src !== $sep ) ) {
1024	if ( !empty( $state->sep->constraints ) \|\| !empty( $state->sep->src ) ) {
1025	// TODO: set modified flag if start or end node (but not both) are
1026	// modified / new so that the selser can use the separator
1027	$sep = $this->makeSeparator( $node, $state->sep->src ?? '', $sepConstraints );
1028	} else {
1029	$sep = null;
1030	}
1031	}
1032
1033	if ( $sep !== null ) {
1034	$sep = self::makeSepIndentPreSafe( $sep, $sepConstraints );
1035	}
1036	return $sep;
1037	}
1038	}