Code Coverage for /src/src/Utils/TokenUtils.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	40.14% covered (danger)	40.14%	116 / 289	40.74% covered (danger)	40.74%	11 / 27	CRAP	0.00% covered (danger)	0.00%	0 / 1
TokenUtils	40.14% covered (danger)	40.14%	116 / 289	40.74% covered (danger)	40.74%	11 / 27	6668.46	0.00% covered (danger)	0.00%	0 / 1
getTokenType	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	2
isWikitextBlockTag	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
tagOpensBlockScope	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	2
tagClosesBlockScope	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	2
isTemplateToken	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	2
isHTMLTag	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	7
hasDOMFragmentType	100.00% covered (success)	100.00%	1 / 1	100.00% covered (success)	100.00%	1 / 1	1
isTableTag	100.00% covered (success)	100.00%	2 / 2	100.00% covered (success)	100.00%	1 / 1	3
isSolTransparentLinkTag	100.00% covered (success)	100.00%	7 / 7	100.00% covered (success)	100.00%	1 / 1	5
isBehaviorSwitch	0.00% covered (danger)	0.00%	0 / 9	0.00% covered (danger)	0.00%	0 / 1	30
isSolTransparent	0.00% covered (danger)	0.00%	0 / 11	0.00% covered (danger)	0.00%	0 / 1	90
isTranslationUnitMarker	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	12
isEmptyLineMetaToken	100.00% covered (success)	100.00%	3 / 3	100.00% covered (success)	100.00%	1 / 1	3
matchTypeOf	0.00% covered (danger)	0.00%	0 / 10	0.00% covered (danger)	0.00%	0 / 1	20
hasTypeOf	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	2
shiftTokenTSR	0.00% covered (danger)	0.00%	0 / 32	0.00% covered (danger)	0.00%	0 / 1	552
stripEOFTkFromTokens	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	12
convertOffsets	95.16% covered (success)	95.16%	59 / 62	0.00% covered (danger)	0.00%	0 / 1	32
convertTokenOffsets	0.00% covered (danger)	0.00%	0 / 24	0.00% covered (danger)	0.00%	0 / 1	90
collectOffsets	0.00% covered (danger)	0.00%	0 / 23	0.00% covered (danger)	0.00%	0 / 1	156
isEntitySpanToken	100.00% covered (success)	100.00%	4 / 4	100.00% covered (success)	100.00%	1 / 1	4
newlinesToNlTks	0.00% covered (danger)	0.00%	0 / 7	0.00% covered (danger)	0.00%	0 / 1	6
tokensToString	50.00% covered (danger)	50.00%	19 / 38	0.00% covered (danger)	0.00%	0 / 1	82.50
kvToHash	100.00% covered (success)	100.00%	6 / 6	100.00% covered (success)	100.00%	1 / 1	2
tokenTrim	13.04% covered (danger)	13.04%	3 / 23	0.00% covered (danger)	0.00%	0 / 1	90.56
isAnnotationStartToken	0.00% covered (danger)	0.00%	0 / 2	0.00% covered (danger)	0.00%	0 / 1	6
isAnnotationEndToken	0.00% covered (danger)	0.00%	0 / 2	0.00% covered (danger)	0.00%	0 / 1	6

1	<?php
2	declare( strict_types = 1 );
3
4	/**
5	* This file contains general utilities for:
6	* (a) querying token properties and token types
7	* (b) manipulating tokens, individually and as collections.
8	*/
9
10	namespace Wikimedia\Parsoid\Utils;
11
12	use Wikimedia\Assert\Assert;
13	use Wikimedia\Assert\UnreachableException;
14	use Wikimedia\Parsoid\Config\Env;
15	use Wikimedia\Parsoid\Core\DomSourceRange;
16	use Wikimedia\Parsoid\Tokens\CommentTk;
17	use Wikimedia\Parsoid\Tokens\EndTagTk;
18	use Wikimedia\Parsoid\Tokens\EOFTk;
19	use Wikimedia\Parsoid\Tokens\KV;
20	use Wikimedia\Parsoid\Tokens\KVSourceRange;
21	use Wikimedia\Parsoid\Tokens\NlTk;
22	use Wikimedia\Parsoid\Tokens\SelfclosingTagTk;
23	use Wikimedia\Parsoid\Tokens\SourceRange;
24	use Wikimedia\Parsoid\Tokens\TagTk;
25	use Wikimedia\Parsoid\Tokens\Token;
26	use Wikimedia\Parsoid\Wikitext\Consts;
27
28	class TokenUtils {
/**
 * Matches a `rel` attribute value that contains, as a whole
 * whitespace-delimited word, one of the page-property link types
 * `mw:PageProp/Category`, `mw:PageProp/redirect` or `mw:PageProp/Language`.
 */
public const SOL_TRANSPARENT_LINK_REGEX =
	'/(?:^|\s)mw:PageProp\/(?:Category|redirect|Language)(?=$|\s)/D';
31
/**
 * Report the type name of a token.
 *
 * Plain strings are reported as 'string'; anything else is asked for its
 * own type through getType().
 *
 * @param Token|string $token
 * @return string
 */
public static function getTokenType( $token ): string {
	if ( is_string( $token ) ) {
		return 'string';
	}
	return $token->getType();
}
40
/**
 * Check whether a tag name has an entry in Consts::$wikitextBlockElems.
 *
 * @param string $name
 * @return bool
 */
public static function isWikitextBlockTag( string $name ): bool {
	if ( isset( Consts::$wikitextBlockElems[$name] ) ) {
		return true;
	}
	return false;
}
48
/**
 * Does this tag open block-tag scope in the legacy parser?
 * See doBlockLevels in the PHP parser (includes/parser/Parser.php).
 *
 * A tag qualifies when it is listed in either Consts::$blockElems or
 * Consts::$alwaysBlockElems.
 *
 * @param string $name
 * @return bool
 */
public static function tagOpensBlockScope( string $name ): bool {
	if ( isset( Consts::$blockElems[$name] ) ) {
		return true;
	}
	return isset( Consts::$alwaysBlockElems[$name] );
}
60
/**
 * Does this tag close block-tag scope in the legacy parser?
 * See doBlockLevels in the PHP parser (includes/parser/Parser.php).
 *
 * A tag qualifies when it is listed in either Consts::$antiBlockElems or
 * Consts::$neverBlockElems.
 *
 * @param string $name
 * @return bool
 */
public static function tagClosesBlockScope( string $name ): bool {
	if ( isset( Consts::$antiBlockElems[$name] ) ) {
		return true;
	}
	return isset( Consts::$neverBlockElems[$name] );
}
72
/**
 * Is this a template token, i.e. a self-closing tag token named 'template'?
 *
 * @param Token|string|null $token
 * @return bool
 */
public static function isTemplateToken( $token ): bool {
	if ( !( $token instanceof SelfclosingTagTk ) ) {
		return false;
	}
	return $token->getName() === 'template';
}
81
/**
 * Determine whether the current token was an HTML tag in wikitext.
 *
 * True only for start, end and self-closing tag tokens whose
 * dataParsoid->stx is set to 'html'. Strings, null and any other value
 * yield false.
 *
 * @param Token|string|null $token
 * @return bool
 */
public static function isHTMLTag( $token ): bool {
	// The instanceof checks already reject null, strings and other
	// scalars, so no separate truthiness / is_string() guard is needed.
	$isTagToken = $token instanceof TagTk ||
		$token instanceof EndTagTk ||
		$token instanceof SelfclosingTagTk;

	return $isTagToken &&
		isset( $token->dataParsoid->stx ) &&
		$token->dataParsoid->stx === 'html';
}
96
/**
 * Does the token carry a DOMFragment typeof value
 * (`mw:DOMFragment`, optionally followed by `/sealed/<name>`)?
 *
 * @param Token $token
 * @return bool
 */
public static function hasDOMFragmentType( Token $token ): bool {
	$matchedType = self::matchTypeOf( $token, '#^mw:DOMFragment(/sealed/\w+)?$#D' );
	return $matchedType !== null;
}
106
/**
 * Is the token a start or end tag whose name is listed in
 * Consts::$HTML['TableTags']?
 *
 * @param Token|string $token
 * @return bool
 */
public static function isTableTag( $token ): bool {
	if ( !( $token instanceof TagTk ) && !( $token instanceof EndTagTk ) ) {
		return false;
	}
	return isset( Consts::$HTML['TableTags'][$token->getName()] );
}
117
/**
 * Determine if the token is a transparent link tag: a `link` tag token
 * (start, end or self-closing) whose `rel` attribute matches
 * SOL_TRANSPARENT_LINK_REGEX.
 *
 * @param Token|string $token
 * @return bool
 */
public static function isSolTransparentLinkTag( $token ): bool {
	$isTagToken = $token instanceof SelfclosingTagTk ||
		$token instanceof TagTk ||
		$token instanceof EndTagTk;
	if ( !$isTagToken || $token->getName() !== 'link' ) {
		return false;
	}
	// A missing rel attribute is treated as the empty string.
	$rel = $token->getAttributeV( 'rel' ) ?? '';
	return (bool)preg_match( self::SOL_TRANSPARENT_LINK_REGEX, $rel );
}
133
/**
 * Does this token represent a behavior switch?
 *
 * Two shapes are recognised, both self-closing tag tokens:
 * - a `behavior-switch` token (before BehaviorSwitchHandler, i.e. in
 *   PreHandler etc.);
 * - a `meta` token with a `property` attribute matching the site
 *   config's bswPagePropRegexp() (after BehaviorSwitchHandler, i.e. in
 *   ListHandler, ParagraphWrapper etc.).
 *
 * @param Env $env
 * @param Token|string $token
 * @return bool
 */
public static function isBehaviorSwitch( Env $env, $token ): bool {
	if ( !( $token instanceof SelfclosingTagTk ) ) {
		return false;
	}
	if ( $token->getName() === 'behavior-switch' ) {
		return true;
	}
	if ( $token->getName() !== 'meta' || !$token->hasAttribute( 'property' ) ) {
		return false;
	}
	return (bool)preg_match(
		$env->getSiteConfig()->bswPagePropRegexp(),
		$token->getAttributeV( 'property' ) ?? ''
	);
}
153
/**
 * Is the token transparent with respect to start-of-line position?
 *
 * This should come close to matching
 * {@link WTUtils::emitsSolTransparentSingleLineWT},
 * without the single line caveat.
 *
 * Accepted: strings made only of spaces/tabs, transparent link tags,
 * comments that are not translation unit markers, behavior switches,
 * and `meta` tokens that did not come from HTML syntax.
 *
 * @param Env $env
 * @param Token|string $token
 * @return bool
 */
public static function isSolTransparent( Env $env, $token ): bool {
	if ( is_string( $token ) ) {
		return (bool)preg_match( '/^[ \t]*$/D', $token );
	}
	if ( self::isSolTransparentLinkTag( $token ) ) {
		return true;
	}
	if ( $token instanceof CommentTk && !self::isTranslationUnitMarker( $env, $token ) ) {
		return true;
	}
	if ( self::isBehaviorSwitch( $env, $token ) ) {
		return true;
	}
	if ( !( $token instanceof SelfclosingTagTk ) || $token->getName() !== 'meta' ) {
		return false;
	}
	// Only metas are left: transparent unless written in HTML syntax.
	$stx = $token->dataParsoid->stx ?? null;
	return $stx !== 'html';
}
177
/**
 * HACK: Returns true if $token looks like a TU marker (<!--T:XXX-->) and
 * if we could be in a translate-annotated page, i.e. the environment has
 * annotations and 'translate' is an annotation tag for the site.
 *
 * @param Env $env
 * @param CommentTk $token
 * @return bool
 */
public static function isTranslationUnitMarker( Env $env, CommentTk $token ): bool {
	if ( !$env->hasAnnotations ) {
		return false;
	}
	if ( !$env->getSiteConfig()->isAnnotationTag( 'translate' ) ) {
		return false;
	}
	return preg_match( '/^T:/', $token->value ) === 1;
}
190
/**
 * Is the token an empty-line meta token, i.e. a self-closing `meta` tag
 * whose `typeof` attribute is exactly 'mw:EmptyLine'?
 *
 * @param Token|string $token
 * @return bool
 */
public static function isEmptyLineMetaToken( $token ): bool {
	if ( !( $token instanceof SelfclosingTagTk ) ) {
		return false;
	}
	if ( $token->getName() !== 'meta' ) {
		return false;
	}
	return $token->getAttributeV( 'typeof' ) === 'mw:EmptyLine';
}
202
/**
 * Find the first whitespace-separated `typeof` value of the token that
 * matches the given regular expression.
 *
 * @param Token $t The token to test
 * @param string $typeRe Regular expression matching the expected value of
 *   the `typeof` attribute.
 * @return ?string The matching `typeof` value, or `null` if the token has
 *   no `typeof` attribute or none of its values match.
 */
public static function matchTypeOf( Token $t, string $typeRe ): ?string {
	$typeofValue = $t->getAttributeV( 'typeof' );
	if ( $typeofValue === null ) {
		return null;
	}
	Assert::invariant( is_string( $typeofValue ), "Typeof is not simple" );

	$candidates = preg_split( '/\s+/', $typeofValue, -1, PREG_SPLIT_NO_EMPTY );
	foreach ( $candidates as $candidate ) {
		$matched = preg_match( $typeRe, $candidate );
		// preg_match() yields false only when the pattern itself is broken.
		Assert::invariant( $matched !== false, "Bad regexp" );
		if ( $matched ) {
			return $candidate;
		}
	}
	return null;
}
227
/**
 * Determine whether one of the token's `typeof` values equals the given
 * literal string.
 *
 * @param Token $t
 * @param string $type Expected value of "typeof" attribute, as a literal
 *   string (it is regex-quoted before matching).
 * @return bool True if the token matches.
 */
public static function hasTypeOf( Token $t, string $type ): bool {
	$exactPattern = '/^' . preg_quote( $type, '/' ) . '$/D';
	return self::matchTypeOf( $t, $exactPattern ) !== null;
}
241
/**
 * Shift (or clear) the TSR of every tag-like token in a token array.
 *
 * With a non-zero integer $offset, the token's tsr, extTagOffsets,
 * tmp->extLinkContentOffsets and each attribute's srcOffsets are shifted
 * by that amount. With a falsy non-zero $offset (false, or null for JS
 * b/c), tsr and attribute srcOffsets are cleared instead and the other
 * offsets are left untouched. An $offset of exactly 0 is a no-op.
 * Attribute keys and values that are token arrays are processed
 * recursively.
 *
 * PORT-FIXME: In JS this was sometimes called with $offset=undefined,
 * which meant do nothing by default, except if there was a third
 * parameter set to true, in which case it meant the same thing as
 * $offset = null. PHP cannot pass undefined, so false is used here for
 * whatever the previous code meant by a null offset.
 *
 * @param array<Token|string> $tokens
 * @param int|false $offset
 */
public static function shiftTokenTSR( array $tokens, $offset ): void {
	// Nothing to do for a zero shift.
	if ( $offset === 0 ) {
		return;
	}

	// JS b/c: null means the same as false.
	if ( $offset === null ) {
		$offset = false;
	}

	// Only these exact token classes carry offsets that get adjusted.
	$handledClasses = [
		TagTk::class,
		SelfclosingTagTk::class,
		NlTk::class,
		CommentTk::class,
		EndTagTk::class,
	];

	$total = count( $tokens );
	for ( $idx = 0; $idx < $total; $idx++ ) {
		$tok = $tokens[$idx];
		if ( !is_object( $tok ) || !in_array( get_class( $tok ), $handledClasses, true ) ) {
			continue;
		}

		$dp = $tok->dataParsoid;

		// update/clear tsr
		$oldTsr = $dp->tsr;
		if ( $oldTsr ) {
			$dp->tsr = $offset ? $oldTsr->offset( $offset ) : null;
		}

		if ( $offset && isset( $dp->extTagOffsets ) ) {
			$dp->extTagOffsets = $dp->extTagOffsets->offset( $offset );
		}

		// SSS FIXME: offset will always be available in chunky-tokenizer
		// mode, in which case we won't have buggy offsets below. The
		// null scenario is only for when the token-stream-patcher
		// attempts to reparse a string -- it is likely to only patch up
		// small string fragments and the complicated use cases below
		// should not materialize.
		// CSA: token-stream-patcher shouldn't have problems now that
		// $frame->srcText is always accurate?

		// content offsets for ext-links
		if ( $offset && isset( $dp->tmp->extLinkContentOffsets ) ) {
			$dp->tmp->extLinkContentOffsets =
				$dp->tmp->extLinkContentOffsets->offset( $offset );
		}

		// Process attributes
		if ( !isset( $tok->attribs ) ) {
			continue;
		}
		$attrCount = count( $tok->attribs );
		for ( $j = 0; $j < $attrCount; $j++ ) {
			$attr = $tok->attribs[$j];
			if ( is_array( $attr->k ) ) {
				self::shiftTokenTSR( $attr->k, $offset );
			}
			if ( is_array( $attr->v ) ) {
				self::shiftTokenTSR( $attr->v, $offset );
			}

			// src offsets used to set mw:TemplateParams
			if ( !$offset ) {
				$attr->srcOffsets = null;
			} elseif ( $attr->srcOffsets !== null ) {
				$attr->srcOffsets = $attr->srcOffsets->offset( $offset );
			}
		}
	}
}
332
/**
 * Strip a trailing EOFTk token from a token chunk.
 * The EOFTk is expected to be the last token of the chunk; the array is
 * left untouched when the last token is anything else or the array is
 * empty.
 *
 * @param array &$tokens Modified in place.
 * @return array return the modified token array so that this call can be chained
 */
public static function stripEOFTkFromTokens( array &$tokens ): array {
	$lastIdx = count( $tokens ) - 1;
	if ( $lastIdx >= 0 && $tokens[$lastIdx] instanceof EOFTk ) {
		array_pop( $tokens );
	}
	return $tokens;
}
347
/**
 * Convert string offsets between offset types.
 *
 * Offset types are:
 * - 'byte': Bytes (UTF-8 encoding), e.g. PHP `substr()` or `strlen()`.
 * - 'char': Unicode code points (encoding irrelevant), e.g. PHP `mb_substr()` or `mb_strlen()`.
 * - 'ucs2': 16-bit code units (UTF-16 encoding), e.g. JavaScript `.substring()` or `.length`.
 *
 * Offsets that are mid-Unicode character are "rounded" up to the next full
 * character, i.e. the output offset will always point to the start of a
 * Unicode code point (or just past the end of the string). Offsets outside
 * the string are "rounded" to 0 or just-past-the-end.
 *
 * The string is walked once, one UTF-8 lead byte at a time, while the
 * position in all three offset types is tracked; the sorted offsets are
 * rewritten as the walk reaches them.
 *
 * @note When constructing the array of offsets to pass to this method,
 * populate it with references as `$offsets[] = &$var;`.
 *
 * @param string $s Unicode string the offsets are offsets into, UTF-8 encoded.
 * @param ('byte'|'ucs2'|'char') $from Offset type to convert from.
 * @param ('byte'|'ucs2'|'char') $to Offset type to convert to.
 * @param int[] $offsets References to the offsets to convert.
 * @throws \InvalidArgumentException If $from or $to is not a known offset
 *   type, or if an invalid UTF-8 lead byte is encountered in $s.
 */
public static function convertOffsets(
	string $s, string $from, string $to, array $offsets
): void {
	static $valid = [ 'byte', 'char', 'ucs2' ];
	if ( !in_array( $from, $valid, true ) ) {
		throw new \InvalidArgumentException( 'Invalid $from' );
	}
	if ( !in_array( $to, $valid, true ) ) {
		throw new \InvalidArgumentException( 'Invalid $to' );
	}

	$offsetCt = count( $offsets );
	if ( $offsetCt === 0 ) {
		// Nothing to do
		return;
	}
	sort( $offsets, SORT_NUMERIC );

	// Current position in each offset type; $fromPos / $toPos alias the
	// entries selected by the caller.
	$pos = [ 'byte' => 0, 'ucs2' => 0, 'char' => 0 ];
	$fromPos = &$pos[$from];
	$toPos = &$pos[$to];

	$next = 0;
	$byteLen = strlen( $s );
	while ( $pos['byte'] < $byteLen ) {
		// Rewrite every offset the walk has reached.
		while ( $offsets[$next] <= $fromPos ) {
			$offsets[$next] = $toPos;
			if ( ++$next >= $offsetCt ) {
				return;
			}
		}

		// Advance by one code point, sized by its UTF-8 lead byte.
		++$pos['char'];
		$lead = ord( $s[$pos['byte']] );
		if ( $lead < 0x80 ) {
			// 1-byte sequence
			$pos['byte'] += 1;
			$pos['ucs2'] += 1;
		} elseif ( $lead >= 0xc0 && $lead < 0xe0 ) {
			// 2-byte sequence
			$pos['byte'] += 2;
			$pos['ucs2'] += 1;
		} elseif ( $lead >= 0xe0 && $lead < 0xf0 ) {
			// 3-byte sequence
			$pos['byte'] += 3;
			$pos['ucs2'] += 1;
		} elseif ( $lead >= 0xf0 && $lead < 0xf8 ) {
			// 4-byte sequence: two UTF-16 code units
			$pos['byte'] += 4;
			$pos['ucs2'] += 2;
		} else {
			// Continuation byte or 0xf8..0xff in lead position
			throw new \InvalidArgumentException( '$s is not UTF-8' );
		}
	}

	// Convert any offsets past the end of the string to the length of the
	// string.
	for ( ; $next < $offsetCt; $next++ ) {
		$offsets[$next] = $toPos;
	}
}
459
/**
 * Convert offsets in a token array.
 *
 * Works in three passes: gather references to every offset reachable
 * from the tokens, convert them all at once, then undo the temporary
 * width-to-position rewrite applied to DomSourceRange widths.
 *
 * @see TokenUtils::convertOffsets()
 *
 * @param string $s The offset reference string
 * @param ('byte'|'ucs2'|'char') $from Offset type to convert from
 * @param ('byte'|'ucs2'|'char') $to Offset type to convert to
 * @param array<Token|string|array> $tokens
 */
public static function convertTokenOffsets(
	string $s, string $from, string $to, array $tokens
): void {
	/** @var array<int> $offsets */
	$offsets = [];

	$gather = static function ( $sr ) use ( &$offsets ) {
		if ( $sr instanceof DomSourceRange ) {
			// Widths are relative, so temporarily turn them into
			// absolute positions that can be converted like any offset.
			if ( $sr->openWidth !== null ) {
				Assert::invariant( $sr->start !== null, "width w/o start" );
				$sr->openWidth = $sr->start + $sr->openWidth;
				$offsets[] =& $sr->openWidth;
			}
			if ( $sr->closeWidth !== null ) {
				Assert::invariant( $sr->end !== null, "width w/o end" );
				$sr->closeWidth = $sr->end - $sr->closeWidth;
				$offsets[] =& $sr->closeWidth;
			}
		}
		if ( $sr->start !== null ) {
			$offsets[] =& $sr->start;
		}
		if ( $sr->end !== null ) {
			$offsets[] =& $sr->end;
		}
	};

	$restoreWidths = static function ( $sr ) {
		if ( !( $sr instanceof DomSourceRange ) ) {
			return;
		}
		// Turn the converted positions back into widths.
		if ( $sr->openWidth !== null ) {
			$sr->openWidth = $sr->openWidth - $sr->start;
		}
		if ( $sr->closeWidth !== null ) {
			$sr->closeWidth = $sr->end - $sr->closeWidth;
		}
	};

	self::collectOffsets( $tokens, $gather );
	self::convertOffsets( $s, $from, $to, $offsets );
	self::collectOffsets( $tokens, $restoreWidths );
}
508
/**
 * Recursively walk tokens, KVs and source ranges, invoking $offsetFunc on
 * every SourceRange (including DomSourceRange) that is found. Strings and
 * any other values are ignored.
 *
 * @param array<Token|string>|array<KV>|KV|Token|DomSourceRange|KVSourceRange|SourceRange|string $input
 * @param callable $offsetFunc Called with each source range.
 */
private static function collectOffsets( $input, callable $offsetFunc ): void {
	if ( is_array( $input ) ) {
		foreach ( $input as $item ) {
			self::collectOffsets( $item, $offsetFunc );
		}
		return;
	}

	if ( $input instanceof KV ) {
		self::collectOffsets( $input->k, $offsetFunc );
		self::collectOffsets( $input->v, $offsetFunc );
		if ( $input->srcOffsets ) {
			self::collectOffsets( $input->srcOffsets, $offsetFunc );
		}
		return;
	}

	if ( $input instanceof Token ) {
		if ( isset( $input->dataParsoid->tsr ) ) {
			self::collectOffsets( $input->dataParsoid->tsr, $offsetFunc );
		}
		if ( isset( $input->dataParsoid->tmp->extLinkContentOffsets ) ) {
			self::collectOffsets( $input->dataParsoid->tmp->extLinkContentOffsets, $offsetFunc );
		}
		if ( isset( $input->dataParsoid->tokens ) ) {
			self::collectOffsets( $input->dataParsoid->tokens, $offsetFunc );
		}
		if ( isset( $input->dataParsoid->extTagOffsets ) ) {
			self::collectOffsets( $input->dataParsoid->extTagOffsets, $offsetFunc );
		}
		self::collectOffsets( $input->attribs, $offsetFunc );
		return;
	}

	if ( $input instanceof KVSourceRange ) {
		self::collectOffsets( $input->key, $offsetFunc );
		self::collectOffsets( $input->value, $offsetFunc );
		return;
	}

	if ( $input instanceof SourceRange ) {
		// This includes DomSourceRange
		$offsetFunc( $input );
	}
}
546
/**
 * Tests whether token represents an HTML entity, i.e. is a `span` start
 * tag with typeof `mw:Entity`.
 * Think `<span typeof="mw:Entity">`.
 *
 * @param Token|string|null $token
 * @return bool
 */
public static function isEntitySpanToken( $token ): bool {
	if ( !( $token instanceof TagTk ) ) {
		return false;
	}
	if ( $token->getName() !== 'span' ) {
		return false;
	}
	return self::hasTypeOf( $token, 'mw:Entity' );
}
559
/**
 * Transform `"\n"` and `"\r\n"` in the input string to {@link NlTk} tokens.
 *
 * The string is split at each newline and one NlTk is placed between each
 * pair of adjacent pieces, so the result always starts and ends with a
 * (possibly empty) string.
 *
 * @param string $str
 * @return array (interspersed string and NlTk tokens)
 */
public static function newlinesToNlTks( string $str ): array {
	$pieces = preg_split( '/\n|\r\n/', $str );
	// The final piece is not followed by a newline token.
	$lastPiece = array_pop( $pieces );

	$result = [];
	foreach ( $pieces as $piece ) {
		$result[] = $piece;
		$result[] = new NlTk( null );
	}
	$result[] = $lastPiece;
	return $result;
}
576
/**
 * Flatten/convert a token array into a string.
 *
 * Strings are concatenated, nested arrays are flattened recursively, and
 * comments are dropped. Recognised $opts keys:
 * - 'retainNLs': when empty, NlTk tokens are dropped as well.
 * - 'stripEmptyLineMeta': also drop empty-line meta tokens.
 * - 'includeEntities': emit dataParsoid->src for entity spans and skip
 *   their child and end tag.
 * - 'unpackDOMFragments' (with 'env'): emit the text content of DOM
 *   fragment tokens. This option shouldn't be used if the tokens have
 *   been expanded to DOM.
 *
 * @param string|Token|array<Token|string> $tokens
 * @param bool $strict Whether to abort as soon as we find a token we
 *   can't stringify.
 * @param array<string,bool|Env> $opts
 * @return string|array{0:string,1:Array<Token|string>}
 *   The stringified tokens. If $strict is true, returns a two-element
 *   array containing string prefix and the remainder of the tokens as
 *   soon as we encounter something we can't stringify.
 */
public static function tokensToString( $tokens, bool $strict = false, array $opts = [] ) {
	if ( is_string( $tokens ) ) {
		return $tokens;
	}
	if ( !is_array( $tokens ) ) {
		$tokens = [ $tokens ];
	}

	$dropNLs = empty( $opts['retainNLs'] );
	$stripEmptyLineMeta = !empty( $opts['stripEmptyLineMeta'] );
	$includeEntities = !empty( $opts['includeEntities'] );
	$unpackDOMFragments = !empty( $opts['unpackDOMFragments'] );

	$text = '';
	$total = count( $tokens );
	for ( $idx = 0; $idx < $total; $idx++ ) {
		$tok = $tokens[$idx];

		if ( $tok === null ) {
			throw new UnreachableException( "No nulls expected." );
		}
		if ( $tok instanceof KV ) {
			// Since this function is occasionally called on KV->v,
			// whose signature recursively includes KV[], a mismatch with
			// this function, we assert that those values are only
			// included in safe places that don't intend to stringify
			// their tokens.
			throw new UnreachableException( "No KVs expected." );
		}

		if ( is_string( $tok ) ) {
			$text .= $tok;
			continue;
		}
		if ( is_array( $tok ) ) {
			Assert::invariant( !$strict, "strict case handled above" );
			$text .= self::tokensToString( $tok, $strict, $opts );
			continue;
		}
		if ( $tok instanceof CommentTk || ( $dropNLs && $tok instanceof NlTk ) ) {
			// strip comments and newlines
			continue;
		}
		if ( $stripEmptyLineMeta && self::isEmptyLineMetaToken( $tok ) ) {
			// If requested, strip empty line meta tokens too.
			continue;
		}
		if ( $includeEntities && self::isEntitySpanToken( $tok ) ) {
			$text .= $tok->dataParsoid->src;
			// Skip child and end tag.
			$idx += 2;
			continue;
		}
		if ( $strict ) {
			// If strict, return accumulated string on encountering first non-text token
			return [ $text, array_slice( $tokens, $idx ) ];
		}

		$isFragmentCarrier = $unpackDOMFragments &&
			( $tok instanceof TagTk || $tok instanceof SelfclosingTagTk ) &&
			self::hasDOMFragmentType( $tok );
		if ( !$isFragmentCarrier ) {
			// Any other token contributes nothing to the string.
			continue;
		}

		// Handle dom fragments
		$domFragment = $opts['env']->getDOMFragment(
			$tok->dataParsoid->html
		);
		// Calling `env->removeDOMFragment()` here is case dependent
		// but should be rare enough when permissible that it can be
		// ignored.
		// FIXME: The correct thing to do would be to return
		// `$domFragment.innerHTML` for the current scenarios where
		// `unpackDOMFragments` is used (expanded attribute
		// values and reparses thereof) but we'd need to remove
		// the span wrapping and typeof annotation of extension
		// content and nowikis. Since we're primarily expecting
		// to find <translate> and <nowiki> here, this will do.
		$text .= $domFragment->textContent;
		if ( $tok instanceof TagTk ) {
			// Skip the EndTagTK
			$idx += 1;
			Assert::invariant(
				$idx >= $total || $tokens[$idx] instanceof EndTagTk,
				"tag should be followed by endtag"
			);
		}
	}
	return $text;
}
663
/**
 * Convert an array of key-value pairs into a hash of keys to values.
 * Keys are stringified, trimmed and lower-cased; values are stringified
 * and trimmed. For duplicate keys, the last entry wins.
 *
 * @param array<KV> $kvs
 * @return array<string,array<Token|string>>|array<string,string>
 */
public static function kvToHash( array $kvs ): array {
	$hash = [];
	foreach ( $kvs as $pair ) {
		$name = trim( self::tokensToString( $pair->k ) );
		// SSS FIXME: Temporary fix to handle extensions which use
		// entities in attribute values. We need more robust handling
		// of non-string template attribute values in general.
		$value = self::tokensToString( $pair->v );
		$hash[mb_strtolower( $name )] = self::tokenTrim( $value );
	}
	return $hash;
}
682
/**
 * Trim space and newlines from leading and trailing text tokens.
 *
 * A plain string is simply trimmed and any other non-array value is
 * returned unchanged. For arrays, NlTk tokens at either end are replaced
 * by empty strings and whitespace is stripped from string tokens, working
 * inwards from each end until a non-empty string or a non-text token is
 * reached.
 *
 * @param string|Token|(Token|string)[] $tokens
 * @return string|Token|(Token|string)[]
 */
public static function tokenTrim( $tokens ) {
	if ( is_string( $tokens ) ) {
		return trim( $tokens );
	}
	if ( !is_array( $tokens ) ) {
		return $tokens;
	}

	$total = count( $tokens );

	// strip leading space
	foreach ( $tokens as &$leading ) {
		if ( $leading instanceof NlTk ) {
			$leading = '';
			continue;
		}
		if ( !is_string( $leading ) ) {
			break;
		}
		$leading = preg_replace( '/^\s+/', '', $leading, 1 );
		if ( $leading !== '' ) {
			break;
		}
	}
	unset( $leading );

	// strip trailing space
	for ( $idx = $total - 1; $idx >= 0; $idx-- ) {
		$trailing = &$tokens[$idx];
		if ( $trailing instanceof NlTk ) {
			// replace newline with empty
			$trailing = '';
			continue;
		}
		if ( !is_string( $trailing ) ) {
			break;
		}
		$trailing = preg_replace( '/\s+$/D', '', $trailing, 1 );
		if ( $trailing !== '' ) {
			break;
		}
	}
	unset( $trailing );

	return $tokens;
}
729
/**
 * Checks whether the provided meta tag token is an annotation start
 * token: one of its typeof values matches
 * WTUtils::ANNOTATION_META_TYPE_REGEXP and does not end in '/End'.
 *
 * @param Token $t
 * @return bool
 */
public static function isAnnotationStartToken( Token $t ): bool {
	$annotationType = self::matchTypeOf( $t, WTUtils::ANNOTATION_META_TYPE_REGEXP );
	if ( $annotationType === null ) {
		return false;
	}
	return !str_ends_with( $annotationType, '/End' );
}
739
/**
 * Checks whether the provided meta tag token is an annotation end token:
 * one of its typeof values matches WTUtils::ANNOTATION_META_TYPE_REGEXP
 * and ends in '/End'.
 *
 * @param Token $t
 * @return bool
 */
public static function isAnnotationEndToken( Token $t ): bool {
	$annotationType = self::matchTypeOf( $t, WTUtils::ANNOTATION_META_TYPE_REGEXP );
	if ( $annotationType === null ) {
		return false;
	}
	return str_ends_with( $annotationType, '/End' );
}
749	}