Code Coverage for /src/src/Html2Wt/ConstrainedText/ConstrainedText.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	66.06% covered (warning)	66.06%	72 / 109	25.00% covered (danger)	25.00%	2 / 8	CRAP	0.00% covered (danger)	0.00%	0 / 1
ConstrainedText	66.06% covered (warning)	66.06%	72 / 109	25.00% covered (danger)	25.00%	2 / 8	106.75	0.00% covered (danger)	0.00%	0 / 1
escapeLine	100.00% covered (success)	100.00%	14 / 14	100.00% covered (success)	100.00%	1 / 1	3
__construct	0.00% covered (danger)	0.00%	0 / 6	0.00% covered (danger)	0.00%	0 / 1	2
cast	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	6
escape	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
equals	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	20
matches	0.00% covered (danger)	0.00%	0 / 8	0.00% covered (danger)	0.00%	0 / 1	20
fromSelSer	92.86% covered (success)	92.86%	13 / 14	0.00% covered (danger)	0.00%	0 / 1	5.01
fromSelSerImpl	75.86% covered (warning)	75.86%	44 / 58	0.00% covered (danger)	0.00%	0 / 1	27.20

1	<?php
2	declare( strict_types = 1 );
3
4	namespace Wikimedia\Parsoid\Html2Wt\ConstrainedText;
5
6	use Wikimedia\Parsoid\Config\Env;
7	use Wikimedia\Parsoid\DOM\Element;
8	use Wikimedia\Parsoid\DOM\Node;
9	use Wikimedia\Parsoid\NodeData\DataParsoid;
10	use Wikimedia\Parsoid\Utils\DiffDOMUtils;
11	use Wikimedia\Parsoid\Utils\DOMCompat;
12	use Wikimedia\Parsoid\Utils\DOMDataUtils;
13	use Wikimedia\Parsoid\Utils\DOMUtils;
14	use Wikimedia\Parsoid\Utils\PHPUtils;
15	use Wikimedia\Parsoid\Utils\Utils;
16
/**
 * A chunk of wikitext output. This base class contains the
 * wikitext and a pointer to the DOM node which is responsible for
 * generating it. Subclasses can add additional properties to record
 * context or wikitext boundary restrictions for proper escaping.
 * The chunk is serialized with the `escape` method, which might
 * alter the wikitext in order to ensure it doesn't run together
 * with its context (usually by adding `<nowiki>` tags).
 *
 * The main entry point is the static function `ConstrainedText::escapeLine()`.
 */
class ConstrainedText {
	/**
	 * This adds necessary escapes to a line of chunks. We provide
	 * the `ConstrainedText#escape` function with its left and right
	 * context, and it can determine what escapes are needed.
	 *
	 * The `line` parameter is an array of `ConstrainedText` chunks
	 * which make up a line (or part of a line, in some cases of nested
	 * processing).
	 *
	 * @param ConstrainedText[] $line Chunks to serialize, in output order.
	 * @return string The concatenation of every chunk's prefix, text and suffix.
	 */
	public static function escapeLine( array $line ): string {
		// The left context will be precise (that is, it is the result
		// of `ConstrainedText#escape` and will include any escapes
		// triggered by chunks on the left), but the right context
		// is just the (unescaped) text property from the chunk.
		// As we work left to right we will piece together a fully-escaped
		// string. Be careful not to shoot yourself in the foot -- if the
		// escaped text is significantly different from the chunk's `text`
		// property, the preceding chunk may not have made the correct
		// decisions about emitting an escape suffix. We could solve
		// this by looping until the state converges (or until we detect
		// a loop) but for now let's hope that's not necessary.
		$state = new State( $line );
		$safeLeft = '';
		// $line is a by-value array, so its size cannot change while we iterate.
		$numChunks = count( $line );
		for ( $state->pos = 0; $state->pos < $numChunks; $state->pos++ ) {
			$chunk = $line[$state->pos];
			// Process the escapes for this chunk, given escaped previous chunk.
			// Drop this chunk's own text from the front of the right context
			// before asking the chunk to escape itself.
			$state->rightContext = substr( $state->rightContext, strlen( $chunk->text ) );
			$thisEscape = $chunk->escape( $state );
			$state->leftContext .=
				( $thisEscape->prefix ?? '' ) .
				$thisEscape->text .
				( $thisEscape->suffix ?? '' );
			if ( $thisEscape->greedy ) {
				// protect the left context: this will be matched greedily
				// by this chunk, so there's no chance that a subsequent
				// token will include this in its prefix.
				$safeLeft .= $state->leftContext;
				$state->leftContext = '';
			}
		}
		// right context should be empty here.
		return $safeLeft . $state->leftContext;
	}

	/**
	 * The wikitext string associated with this chunk.
	 * @var string
	 */
	public $text;
	/**
	 * The DOM Node associated with this chunk.
	 * @var Node
	 */
	public $node;
	/**
	 * The prefix string to add if the start of the chunk doesn't match its
	 * constraints.
	 * @var ?string
	 */
	public $prefix;
	/**
	 * The suffix string to add if the end of the chunk doesn't match its
	 * constraints.
	 * @var ?string
	 */
	public $suffix;
	/**
	 * Does this chunk come from selser?
	 * @var bool
	 */
	public $selser;
	/**
	 * Suppress separators?
	 * @var bool
	 */
	public $noSep;

	/**
	 * Build a chunk from an options array. `prefix` and `suffix` default to
	 * null when absent; `selser` and `noSep` always start out false.
	 *
	 * @param array{text:string,node:Node,prefix?:string,suffix?:string} $args Options.
	 */
	public function __construct( array $args ) {
		$this->text = $args['text'];
		$this->node = $args['node'];
		$this->prefix = $args['prefix'] ?? null;
		$this->suffix = $args['suffix'] ?? null;
		$this->selser = false;
		$this->noSep = false;
	}

	/**
	 * Ensure that the argument `o`, which is perhaps a string, is an instance of
	 * `ConstrainedText`.
	 *
	 * An existing `ConstrainedText` is returned as-is (its node is not
	 * replaced); anything else is wrapped in a new plain `ConstrainedText`,
	 * with null treated as the empty string.
	 *
	 * @param string|ConstrainedText $o
	 * @param Node $node
	 *  The {@link Node} corresponding to `o`.
	 * @return ConstrainedText
	 */
	public static function cast( $o, Node $node ): ConstrainedText {
		if ( $o instanceof ConstrainedText ) {
			return $o;
		}
		return new ConstrainedText( [ 'text' => $o ?? '', 'node' => $node ] );
	}

	/**
	 * Use the provided `state`, which gives context and access to the entire
	 * list of chunks, to determine the proper escape prefix/suffix.
	 * Returns an object with a `text` property as well as optional
	 * `prefix` and `suffix` properties giving desired escape strings.
	 *
	 * @param State $state Context state
	 * @return Result
	 */
	public function escape( State $state ): Result {
		// default implementation: no escaping; the chunk's own prefix and
		// suffix (possibly null) are passed through unchanged.
		return new Result( $this->text, $this->prefix, $this->suffix );
	}

	/**
	 * Simple equality. This enforces type equality
	 * (ie subclasses are not equal).
	 *
	 * Two distinct objects are equal only when both are exactly
	 * `ConstrainedText` (not a subclass) and their `text` is identical;
	 * prefix, suffix and node are not compared.
	 *
	 * @param ConstrainedText $ct
	 * @return bool
	 */
	public function equals( ConstrainedText $ct ): bool {
		if ( $this === $ct ) {
			return true;
		}
		return get_class( $this ) === self::class &&
			get_class( $ct ) === self::class &&
			$this->text === $ct->text;
	}

	/**
	 * Useful shortcut: execute a regular expression on the raw wikitext.
	 *
	 * @param string $re
	 * @param Env $env Used to log the PCRE error if matching fails.
	 * @return array|null
	 *  An array containing the matched results or null if there were no matches.
	 * @throws \Error If `preg_match` reports a failure.
	 */
	public function matches( string $re, Env $env ): ?array {
		$r = preg_match( $re, $this->text, $m );
		if ( $r === false ) {
			// preg_last_error_msg() is available from PHP 8.0.0 inclusive,
			// so the comparison must be '>=' rather than '>'.
			if ( version_compare( PHP_VERSION, '8.0.0', '>=' ) ) {
				$error_msg = preg_last_error_msg();
			} else {
				$error_msg = "preg_last_error: " . preg_last_error();
			}
			$env->log( 'error', $error_msg, $re, $this->text );
			throw new \Error( 'Bad regular expression' );
		}
		return $r === 0 ? null : $m;
	}

	/**
	 * SelSer support: when we come across an unmodified node during
	 * selective serialization, we know we can use the original wikitext
	 * for that node unmodified. But there may be boundary conditions
	 * on the left and right sides of the selser'ed text which are going
	 * to require escaping.
	 *
	 * So rather than turning the node into a plain old `ConstrainedText`
	 * chunk, allow subclasses of `ConstrainedText` to register as potential
	 * handlers of selser nodes. A selser'ed magic link, for example,
	 * will then turn into a `MagicLinkText` and thus be able to enforce
	 * the proper boundary constraints.
	 *
	 * @param string $text
	 * @param Element $node
	 * @param DataParsoid $dataParsoid
	 * @param Env $env
	 * @param array $opts Recognized by the base implementation:
	 *  `ignorePrefix` and `ignoreSuffix` (both default to false).
	 * @return ConstrainedText[] Chunks, each with `selser` set to true.
	 * @throws \Error If no registered type produced a result.
	 */
	public static function fromSelSer(
		string $text, Element $node, DataParsoid $dataParsoid,
		Env $env, array $opts = []
	): array {
		// Main dispatch point: iterate through registered subclasses, asking
		// each if they can handle this node (by invoking `fromSelSerImpl`).

		// We define parent types before subtypes, so search the list backwards
		// to be sure we check subtypes before parent types.
		$types = self::$types;
		for ( $i = count( $types ) - 1; $i >= 0; $i-- ) {
			$type = $types[$i];
			$ct = $type::fromSelSerImpl( $text, $node, $dataParsoid, $env, $opts );
			if ( !$ct ) {
				// This type declined the node; try the next one.
				continue;
			}
			if ( !is_array( $ct ) ) {
				$ct = [ $ct ];
			}
			// tag these chunks as coming from selser
			foreach ( $ct as $t ) {
				$t->selser = true;
			}
			return $ct;
		}
		// ConstrainedText::fromSelSerImpl should handle everything which reaches it
		// so nothing should make it here.
		throw new \Error( 'Should never happen.' );
	}

	/**
	 * Validate a child's DSR length against the text still available.
	 *
	 * A length outside `[0, strlen( $text )]` is logged as a bad DSR
	 * (T254412). A negative length is unusable; an over-long one is clamped
	 * to the length of `$text`.
	 *
	 * @param DataParsoid $childDp Data-parsoid of the child; its `dsr` must
	 *  already have been validated by the caller.
	 * @param Element $child The child node (only used for logging).
	 * @param string $text The text the child's source is to be cut from.
	 * @param Env $env
	 * @return ?int The number of bytes to cut, or null if the DSR is unusable.
	 */
	private static function clampChildDsrLength(
		DataParsoid $childDp, Element $child, string $text, Env $env
	): ?int {
		$len = $childDp->dsr->length();
		$available = strlen( $text );
		if ( $len >= 0 && $len <= $available ) {
			return $len;
		}
		// T254412: Bad DSR
		$env->log( "error/html2wt/dsr",
			"Bad DSR: " . PHPUtils::jsonEncode( $childDp->dsr ),
			"Node: " . DOMCompat::getOuterHTML( $child ) );
		return $len < 0 ? null : $available;
	}

	/**
	 * Base case: the given node type does not correspond to a special
	 * `ConstrainedText` subclass. We still have to be careful: the leftmost
	 * (rightmost) children of `node` may still be exposed to our left (right)
	 * context. If so (ie, their DSR bounds coincide) split the selser text
	 * and emit multiple `ConstrainedText` chunks to preserve the proper
	 * boundary conditions.
	 *
	 * NOTE(review): `$dataParsoid->dsr` is dereferenced without a null check
	 * whenever a child has a valid DSR; callers are assumed to pass a node
	 * with a valid DSR -- confirm.
	 *
	 * @param string $text
	 * @param Element $node
	 * @param DataParsoid $dataParsoid
	 * @param Env $env
	 * @param array $opts `ignorePrefix` / `ignoreSuffix` suppress the
	 *  recursive handling of the first / last child respectively.
	 * @return ConstrainedText|ConstrainedText[]
	 */
	protected static function fromSelSerImpl(
		string $text, Element $node, DataParsoid $dataParsoid,
		Env $env, array $opts
	) {
		// look at leftmost and rightmost children, it may be that we need
		// to turn these into ConstrainedText chunks in order to preserve
		// the proper escape conditions on the prefix/suffix text.
		$firstChild = DiffDOMUtils::firstNonDeletedChild( $node );
		$lastChild = DiffDOMUtils::lastNonDeletedChild( $node );
		$firstChildDp = $firstChild instanceof Element ?
			DOMDataUtils::getDataParsoid( $firstChild ) : null;
		$lastChildDp = $lastChild instanceof Element ?
			DOMDataUtils::getDataParsoid( $lastChild ) : null;
		$prefixChunks = [];
		$suffixChunks = [];
		$ignorePrefix = $opts['ignorePrefix'] ?? false;
		$ignoreSuffix = $opts['ignoreSuffix'] ?? false;
		// check to see if first child's DSR start is the same as this node's
		// DSR start. If so, the first child is exposed to the (modified)
		// left-hand context, and so recursively convert it to the proper
		// list of specialized chunks.
		if (
			!$ignorePrefix &&
			$firstChildDp && Utils::isValidDSR( $firstChildDp->dsr ?? null ) &&
			$dataParsoid->dsr->start === $firstChildDp->dsr->start
		) {
			DOMUtils::assertElt( $firstChild ); // implied by $firstChildDp
			$len = self::clampChildDsrLength( $firstChildDp, $firstChild, $text, $env );
			if ( $len !== null ) {
				$prefixChunks = self::fromSelSer(
					substr( $text, 0, $len ), $firstChild, $firstChildDp, $env,
					// this child node's right context will be protected:
					[ 'ignoreSuffix' => true ]
				);
				$text = substr( $text, $len );
			}
		}
		// check to see if last child's DSR end is the same as this node's
		// DSR end. If so, the last child is exposed to the (modified)
		// right-hand context, and so recursively convert it to the proper
		// list of specialized chunks.
		if (
			!$ignoreSuffix && $lastChild !== $firstChild &&
			$lastChildDp && Utils::isValidDSR( $lastChildDp->dsr ?? null ) &&
			$dataParsoid->dsr->end === $lastChildDp->dsr->end
		) {
			DOMUtils::assertElt( $lastChild ); // implied by $lastChildDp
			$len = self::clampChildDsrLength( $lastChildDp, $lastChild, $text, $env );
			if ( $len !== null ) {
				// Split at an explicit offset rather than with a negative
				// offset: `substr( $text, -$len )` returns the *whole* string
				// when $len is 0 (because -0 === 0), which would hand all of
				// the remaining text to a zero-width child.
				$splitAt = strlen( $text ) - $len;
				$suffixChunks = self::fromSelSer(
					substr( $text, $splitAt ), $lastChild, $lastChildDp, $env,
					// this child node's left context will be protected:
					[ 'ignorePrefix' => true ]
				);
				$text = substr( $text, 0, $splitAt );
			}
		}
		// glue together prefixChunks, whatever's left of `text`, and suffixChunks
		$chunks = [ self::cast( $text, $node ) ];
		$chunks = array_merge( $prefixChunks, $chunks, $suffixChunks );
		// top-level chunks only:
		if ( !( $ignorePrefix || $ignoreSuffix ) ) {
			// ensure that the first chunk belongs to `node` in order to
			// emit separators correctly before `node`
			if ( $chunks[0]->node !== $node ) {
				array_unshift( $chunks, self::cast( '', $node ) );
			}
			// set 'noSep' flag on all but the first chunk, so we don't get
			// extra separators from `SSP.emitChunk`
			foreach ( $chunks as $i => $t ) {
				if ( $i > 0 ) {
					$t->noSep = true;
				}
			}
		}
		return $chunks;
	}

	/**
	 * List of types we attempt `fromSelSer` with. This should include all the
	 * concrete subclasses of `ConstrainedText` (`RegExpConstrainedText` is
	 * missing since it is an abstract class). We also include the
	 * `ConstrainedText` class as the first element (even though it is
	 * an abstract base class) as a little bit of a hack: it simplifies
	 * `ConstrainedText.fromSelSer` by factoring some of its work into
	 * `ConstrainedText.fromSelSerImpl`.
	 *
	 * `fromSelSer` walks this list from last to first, so the base class
	 * is consulted only after every other entry has declined.
	 * @var class-string[]
	 */
	private static $types = [
		// Base class is first, as a special case
		self::class,
		// All concrete subclasses of ConstrainedText
		WikiLinkText::class, ExtLinkText::class, AutoURLLinkText::class,
		MagicLinkText::class, LanguageVariantText::class
	];
}