Code Coverage for /src/src/Wt2Html/TokenizerUtils.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	0.00% covered (danger)	0.00%	0 / 194	0.00% covered (danger)	0.00%	0 / 13	CRAP	0.00% covered (danger)	0.00%	0 / 1
TokenizerUtils	0.00% covered (danger)	0.00%	0 / 194	0.00% covered (danger)	0.00%	0 / 13	12432	0.00% covered (danger)	0.00%	0 / 1
internalFlatten	0.00% covered (danger)	0.00%	0 / 15	0.00% covered (danger)	0.00%	0 / 1	72
flattenIfArray	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
flattenString	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	12
flattenStringlist	0.00% covered (danger)	0.00%	0 / 15	0.00% covered (danger)	0.00%	0 / 1	42
getAttrVal	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
buildTableTokens	0.00% covered (danger)	0.00%	0 / 28	0.00% covered (danger)	0.00%	0 / 1	240
buildXMLTag	0.00% covered (danger)	0.00%	0 / 13	0.00% covered (danger)	0.00%	0 / 1	20
inlineBreaks	0.00% covered (danger)	0.00%	0 / 74	0.00% covered (danger)	0.00%	0 / 1	2256
popComments	0.00% covered (danger)	0.00%	0 / 17	0.00% covered (danger)	0.00%	0 / 1	156
getAutoUrlTerminatingChars	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	6
enforceParserResourceLimits	0.00% covered (danger)	0.00%	0 / 16	0.00% covered (danger)	0.00%	0 / 1	90
protectAttrs	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	6
resetAnnotationIncludeRegex	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2

1	<?php
2	/**
3	* Utilities used in the tokenizer.
4	* @module wt2html/tokenizer_utils
5	*/
6
7	declare( strict_types = 1 );
8
9	namespace Wikimedia\Parsoid\Wt2Html;
10
11	use Wikimedia\Parsoid\Config\Env;
12	use Wikimedia\Parsoid\NodeData\DataParsoid;
13	use Wikimedia\Parsoid\NodeData\TempData;
14	use Wikimedia\Parsoid\Tokens\CommentTk;
15	use Wikimedia\Parsoid\Tokens\EndTagTk;
16	use Wikimedia\Parsoid\Tokens\SelfclosingTagTk;
17	use Wikimedia\Parsoid\Tokens\SourceRange;
18	use Wikimedia\Parsoid\Tokens\TagTk;
19	use Wikimedia\Parsoid\Tokens\Token;
20	use Wikimedia\Parsoid\Utils\DOMDataUtils;
21	use Wikimedia\Parsoid\Utils\PHPUtils;
22	use Wikimedia\Parsoid\Wikitext\Consts;
23
class TokenizerUtils {
	/** @var string|null Lazily built pattern used by protectAttrs() */
	private static $protectAttrsRegExp;

	/**
	 * @var string|null Lazily built regex fragment (starting with "|") that
	 *   matches include / annotation tags; used by inlineBreaks().
	 */
	private static $inclAnnRegExp;

	/**
	 * Recursively flatten nested arrays into a single list.
	 *
	 * Nothing is copied while $e looks like a shallow array: $res stays null
	 * until the first nested array is seen. From that point on, every element
	 * (of this array and of all nested ones) is appended to $res.
	 *
	 * @param mixed $e Value to flatten; non-arrays are returned untouched.
	 * @param ?array &$res Accumulator shared across the recursion.
	 * @return mixed (same type as $e)
	 * @throws \RuntimeException if a null element is encountered
	 */
	private static function internalFlatten( $e, ?array &$res ) {
		// Don't bother flattening if we don't have an array
		if ( !is_array( $e ) ) {
			return $e;
		}

		// $e is never modified in the loop, so its size can be computed once.
		$n = count( $e );
		for ( $i = 0; $i < $n; $i++ ) {
			$v = $e[$i];
			if ( $v === null ) {
				throw new \RuntimeException( __METHOD__ . ": found falsy element $i" );
			}
			if ( is_array( $v ) ) {
				// Change in assumption from a shallow array to a nested array:
				// seed the accumulator with everything seen so far.
				if ( $res === null ) {
					$res = array_slice( $e, 0, $i );
				}
				self::internalFlatten( $v, $res );
			} elseif ( $res !== null ) {
				$res[] = $v;
			}
		}

		// If no nesting was found, $e is already flat and is returned as is.
		return $res ?? $e;
	}

	/**
	 * If $a is an array, this recursively flattens all nested arrays.
	 * Any other value is returned unchanged.
	 *
	 * @param mixed $a
	 * @return mixed
	 * @throws \RuntimeException if a null element is encountered
	 */
	public static function flattenIfArray( $a ) {
		// Explicit accumulator rather than relying on by-reference
		// auto-creation of an undefined variable.
		$res = null;
		return self::internalFlatten( $a, $res );
	}

	/**
	 * Flatten $c and merge adjacent strings (see flattenStringlist()).
	 * If the result is exactly one string, that string is returned;
	 * otherwise the flattened list is returned.
	 *
	 * @param mixed $c Passed to flattenStringlist(), which requires an array.
	 * @return mixed string or array
	 */
	public static function flattenString( $c ) {
		$out = self::flattenStringlist( $c );
		if ( count( $out ) === 1 && is_string( $out[0] ) ) {
			return $out[0];
		}
		return $out;
	}

	/**
	 * Flatten a (possibly nested) array and concatenate every run of
	 * adjacent strings into a single string. Empty strings are dropped;
	 * non-string elements are kept in place and act as run separators.
	 *
	 * @param array $c
	 * @return array
	 */
	public static function flattenStringlist( array $c ): array {
		$out = [];
		$text = '';
		$c = self::flattenIfArray( $c );
		for ( $i = 0, $l = count( $c ); $i < $l; $i++ ) {
			$ci = $c[$i];
			if ( is_string( $ci ) ) {
				// Appending '' is a no-op, so no special case is needed.
				$text .= $ci;
				continue;
			}
			// Non-string element: emit the pending text run first.
			if ( $text !== '' ) {
				$out[] = $text;
				$text = '';
			}
			$out[] = $ci;
		}
		if ( $text !== '' ) {
			$out[] = $text;
		}
		return $out;
	}

	/**
	 * Wrap an attribute value together with its source range.
	 *
	 * @param mixed $value
	 * @param int $start start of TSR range
	 * @param int $end end of TSR range
	 * @return array with keys 'value' and 'srcOffsets'
	 */
	public static function getAttrVal( $value, int $start, int $end ): array {
		return [ 'value' => $value, 'srcOffsets' => new SourceRange( $start, $end ) ];
	}

	/**
	 * Build a token array representing <tag>$content</tag> along with
	 * appropriate attributes and TSR info set on the tokens.
	 *
	 * @param string $tagName
	 * @param string $wtChar
	 * @param mixed $attrInfo Falsy when no attribute box was seen; otherwise
	 *   indexed as [ 0 => attributes, 1 => attribute source, 2 => separator ].
	 * @param SourceRange $tsr
	 * @param int $endPos
	 * @param mixed $content
	 * @param bool $addEndTag
	 * @return array (of tokens)
	 */
	public static function buildTableTokens(
		string $tagName, string $wtChar, $attrInfo, SourceRange $tsr,
		int $endPos, $content, bool $addEndTag = false
	): array {
		$dp = new DataParsoid;
		$dp->tsr = $tsr;

		if ( $tagName === 'td' ) {
			if ( !$attrInfo ) {
				// Add a flag that indicates that the tokenizer didn't
				// encounter a "|...|" attribute box. This is useful when
				// deciding which <td>/<th> cells need attribute fixups.
				$dp->setTempFlag( TempData::NO_ATTRS );
			} elseif ( !$attrInfo[0] && $attrInfo[1] === "" ) {
				// FIXME: Skip comments between the two "|" chars
				// [ [], "", "|" ] => "||" syntax for first <td> on line
				$dp->setTempFlag( TempData::NON_MERGEABLE_TABLE_CELL );
				$dp->setTempFlag( TempData::NO_ATTRS );
			}
		} elseif ( $tagName === 'th' ) {
			if ( !$attrInfo ) {
				// Add a flag that indicates that the tokenizer didn't
				// encounter a "|...|" attribute box. This is useful when
				// deciding which <td>/<th> cells need attribute fixups.
				$dp->setTempFlag( TempData::NO_ATTRS );

				// FIXME: Skip comments between the two "!" chars
				// "!!foo" in sol context parses as <th>!foo</th>
				if (
					is_string( $content[0][0] ?? null ) &&
					str_starts_with( $content[0][0], "!" )
				) {
					$dp->setTempFlag( TempData::NON_MERGEABLE_TABLE_CELL );
				}
			}
		}

		$a = [];
		if ( $attrInfo ) {
			$a = $attrInfo[0];
			if ( !$a ) {
				$dp->startTagSrc = $wtChar . $attrInfo[1];
			}
			if ( ( !$a && $attrInfo[2] ) || $attrInfo[2] !== '|' ) {
				// Variation from default
				// 1. Separator present with an empty attribute block
				// 2. Not "|"
				$dp->attrSepSrc = $attrInfo[2];
			}
		}

		$tokens = [ new TagTk( $tagName, $a, $dp ) ];
		PHPUtils::pushArray( $tokens, $content );

		// When no end tag is requested, we rely on our tree builder to close
		// the table cell (td/th) as needed. We cannot close the cell here
		// because cell content can come from multiple parsing contexts and we
		// cannot close the tag in the same parsing context in which the td
		// was opened:
		// Ex: {{1x|{{!}}foo}}{{1x|bar}} has to output <td>foobar</td>
		//
		// Previously a meta marker was added here for DSR computation, but
		// that's complicated now that marker meta handling has been removed
		// from ComputeDSR.
		if ( $addEndTag ) {
			$endDp = new DataParsoid;
			$endDp->tsr = new SourceRange( $endPos, $endPos );
			$tokens[] = new EndTagTk( $tagName, [], $endDp );
		}

		return $tokens;
	}

	/**
	 * Build a token representing <tag>, <tag />, or </tag>
	 * with appropriate attributes set on the token.
	 *
	 * @param string $name Tag name as written in the source
	 * @param string $lcName Lower-cased tag name used for the token
	 * @param array $attribs
	 * @param mixed $endTag Non-null when this is an end tag
	 * @param bool $selfClose
	 * @param SourceRange $tsr
	 * @return Token
	 */
	public static function buildXMLTag( string $name, string $lcName, array $attribs, $endTag,
		bool $selfClose, SourceRange $tsr
	): Token {
		$da = new DataParsoid;
		$da->tsr = $tsr;
		$da->stx = 'html';

		// Remember the original spelling only when it differs.
		if ( $name !== $lcName ) {
			$da->srcTagName = $name;
		}

		if ( $endTag !== null ) {
			return new EndTagTk( $lcName, $attribs, $da );
		}
		if ( $selfClose ) {
			$da->selfClose = true;
			return new SelfclosingTagTk( $lcName, $attribs, $da );
		}
		return new TagTk( $lcName, $attribs, $da );
	}

	/**
	 * Inline breaks, flag-enabled rule which detects end positions for
	 * active higher-level rules in inline and other nested rules.
	 * Those inner rules are then exited, so that the outer rule can
	 * handle the end marker.
	 *
	 * @param string $input
	 * @param int $pos Offset in $input of the character to test
	 * @param array $stops Currently active stop flags, keyed by name
	 * @param Env $env
	 * @return bool
	 * @throws \RuntimeException if $input[$pos] is not a handled character
	 */
	public static function inlineBreaks( string $input, int $pos, array $stops, Env $env ): bool {
		$c = $input[$pos];
		$c2 = $input[$pos + 1] ?? '';

		switch ( $c ) {
			case '=':
				if ( $stops['arrow'] && $c2 === '>' ) {
					return true;
				}
				if ( $stops['equal'] ) {
					return true;
				}
				if ( $stops['h'] ) {
					if ( self::$inclAnnRegExp === null ) {
						$tags = array_merge(
							[ 'noinclude', 'includeonly', 'onlyinclude' ],
							$env->getSiteConfig()->getAnnotationTags()
						);
						// Tag names are inserted into a pattern: quote them so
						// they can only ever match literally.
						$quoted = [];
						foreach ( $tags as $tag ) {
							$quoted[] = preg_quote( (string)$tag, '/' );
						}
						self::$inclAnnRegExp = '|<\/?(?:' . implode( '|', $quoted ) . ')>';
					}
					return ( $pos === strlen( $input ) - 1
						// possibly more equals followed by spaces or comments
						|| preg_match( '/^=*(?:[ \t]|<\!--(?:(?!-->).)*-->'
							. self::$inclAnnRegExp . ')*(?:[\r\n]|$)/sD',
							substr( $input, $pos + 1 ) ) );
				}
				return false;

			case '|':
				// $c2 is '' at end of input, so no separate bounds check is
				// needed for the table case.
				return !$stops['annOrExtTag'] && (
					$stops['templateArg']
					|| $stops['tableCellArg']
					|| $stops['linkdesc']
					|| ( $stops['table'] && ( $c2 === '}' || $c2 === '|' ) )
				);

			case '!':
				return $stops['th']
					&& !$stops['intemplate']
					&& $c2 === '!';

			case '{':
				// {{!}} pipe templates..
				// FIXME: Presumably these should mix with and match | above.
				// phpcs:ignore Squiz.WhiteSpace.LanguageConstructSpacing.IncorrectSingle
				return ( $stops['tableCellArg']
						&& substr( $input, $pos, 5 ) === '{{!}}' )
					|| ( $stops['table']
						&& substr( $input, $pos, 10 ) === '{{!}}{{!}}' );

			case '}':
				$preproc = $stops['preproc'];
				return ( $c2 === '}' && $preproc === '}}' )
					|| ( $c2 === '-' && $preproc === '}-' );

			case ':':
				return $stops['colon']
					&& !$stops['extlink']
					&& !$stops['intemplate']
					&& !$stops['linkdesc']
					&& !( $stops['preproc'] === '}-' );

			case ';':
				return $stops['semicolon'];

			case "\r":
				// Anchor the match at $pos (\G): only a table marker that
				// follows *this* line break is relevant, not one that shows
				// up anywhere later in the input. Matching with an offset
				// also avoids copying the rest of the input.
				return $stops['table']
					&& preg_match( '/\G\r\n?\s*[!|]/', $input, $m, 0, $pos ) === 1;

			case "\n":
				// The code below is just a manual / efficient
				// version of this check.
				//
				// stops.table && /^\n\s*[!|]/.test(input.substr(pos));
				//
				// It eliminates a substr on the string and eliminates
				// a potential perf problem since "\n" and the inline_breaks
				// test is common during tokenization.
				if ( !$stops['table'] ) {
					return false;
				}

				// Allow leading whitespace in tables

				// Since we switched on 'c' which is input[pos],
				// we know that input[pos] is "\n".
				// So, the /^\n/ part of the regexp is already satisfied.
				// Look for /\s*[!|]/ below.
				$n = strlen( $input );
				for ( $i = $pos + 1; $i < $n; $i++ ) {
					$d = $input[$i];
					if ( $d === '!' || $d === '|' ) {
						return true;
					}
					if ( !preg_match( '/\s/', $d ) ) {
						return false;
					}
				}
				return false;

			case '[':
				// This is a special case in php's doTableStuff, added in
				// response to T2553. If it encounters a `[[`, it bails on
				// parsing attributes and interprets it all as content.
				return $stops['tableCellArg'] && $c2 === '[';

			case '-':
				// Same as above: a special case in doTableStuff, added
				// as part of T153140
				return $stops['tableCellArg'] && $c2 === '{';

			case ']':
				if ( $stops['extlink'] ) {
					return true;
				}
				return $stops['preproc'] === ']]'
					&& $c2 === ']';

			default:
				throw new \RuntimeException( 'Unhandled case!' );
		}
	}

	/**
	 * Pop off the end comments, if any.
	 *
	 * Walks $attrs from the end, collecting value-less attributes whose key
	 * is either whitespace-only text or an array made up solely of comment
	 * tokens. Leading whitespace-only entries of the collected run are then
	 * discarded, so that the run starts with a comment. The attributes that
	 * make up the run are removed from $attrs.
	 *
	 * @param array &$attrs List of objects exposing ->k and ->v
	 * @return array|null null if no trailing comment was found; otherwise
	 *   [ 'buf' => collected strings and comment tokens,
	 *     'commentStartPos' => TSR start of the first comment ]
	 */
	public static function popComments( array &$attrs ): ?array {
		// One entry per trailing attribute, in source order. Each entry is the
		// list of buffer items that attribute contributes. Keeping them apart
		// matters because one attribute may hold several comment tokens: the
		// number of attributes to remove is not the number of buffer items.
		$segments = [];
		for ( $i = count( $attrs ) - 1; $i > -1; $i-- ) {
			$kv = $attrs[$i];
			if ( $kv->v ) {
				break;
			}
			if ( is_string( $kv->k ) ) {
				// permit whitespace
				if ( !preg_match( '/^\s*$/D', $kv->k ) ) {
					break;
				}
				array_unshift( $segments, [ $kv->k ] );
			} elseif ( is_array( $kv->k ) ) {
				// all should be comments
				foreach ( $kv->k as $k ) {
					if ( !( $k instanceof CommentTk ) ) {
						break 2;
					}
				}
				array_unshift( $segments, array_values( $kv->k ) );
			} else {
				break;
			}
		}

		// ensure we found a comment: drop leading entries (whitespace or
		// empty) until the run starts with a comment token
		while ( $segments && !( ( $segments[0][0] ?? null ) instanceof CommentTk ) ) {
			array_shift( $segments );
		}
		if ( !$segments ) {
			return null;
		}

		$buf = array_merge( ...$segments );
		// Remove exactly the attributes that were consumed.
		array_splice( $attrs, -count( $segments ) );
		return [ 'buf' => $buf, 'commentStartPos' => $buf[0]->dataParsoid->tsr->start ];
	}

	/**
	 * Get a string containing all the autourl terminating characters (as in legacy parser
	 * Parser.php::makeFreeExternalLink). This list is slightly context-dependent because the
	 * right parenthesis is only a terminating character when the URL does not contain a
	 * left parenthesis.
	 *
	 * @param bool $hasLeftParen should be true if the URL in question contains
	 *   a left parenthesis.
	 * @return string
	 */
	public static function getAutoUrlTerminatingChars( bool $hasLeftParen ): string {
		$chars = Consts::$strippedUrlCharacters;
		return $hasLeftParen ? $chars : $chars . ')';
	}

	/**
	 * Bump the wt2html resource counter that corresponds to $token (list
	 * items, transclusions, table cells) and log a warning when the bump
	 * crosses the configured limit. Tokens of other kinds are ignored.
	 *
	 * @param Env $env
	 * @param mixed $token
	 */
	public static function enforceParserResourceLimits( Env $env, $token ) {
		if ( !( $token instanceof TagTk || $token instanceof SelfclosingTagTk ) ) {
			return;
		}

		// Token name => resource counter name
		$resourceByTag = [
			'listItem' => 'listItem',
			'template' => 'transclusion',
			'td' => 'tableCell',
			'th' => 'tableCell',
		];
		$resource = $resourceByTag[$token->getName()] ?? null;

		if (
			$resource !== null &&
			$env->bumpWt2HtmlResourceUse( $resource ) === false
		) {
			// `false` indicates that this bump pushed us over the threshold
			// We don't want to log every token above that, which would be `null`
			$env->log( 'warn', "wt2html: $resource limit exceeded" );
		}
	}

	/**
	 * Protect Parsoid-inserted attributes by escaping them to prevent
	 * Parsoid-HTML spoofing in wikitext.
	 *
	 * Reserved attribute names are prefixed with "data-x-"; any other name
	 * is returned unchanged.
	 *
	 * @param string $name
	 * @return string
	 */
	public static function protectAttrs( string $name ): string {
		if ( self::$protectAttrsRegExp === null ) {
			// "data-mw", "data-parsoid" and "data-x" are protected as
			// prefixes, hence the ".*" after each of them.
			self::$protectAttrsRegExp = '/^(about|data-mw.*|data-parsoid.*|data-x.*|' .
				preg_quote( DOMDataUtils::DATA_OBJECT_ATTR_NAME, '/' ) .
				'|property|rel|typeof)$/i';
		}
		return preg_replace( self::$protectAttrsRegExp, 'data-x-$1', $name );
	}

	/**
	 * Resets $inclAnnRegExp to null to avoid test environment side effects
	 */
	public static function resetAnnotationIncludeRegex(): void {
		self::$inclAnnRegExp = null;
	}

}