Code Coverage for /src/src/Wt2Html/TT/TokenStreamPatcher.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	0.00% covered (danger)	0.00%	0 / 191	0.00% covered (danger)	0.00%	0 / 11	CRAP	0.00% covered (danger)	0.00%	0 / 1
TokenStreamPatcher	0.00% covered (danger)	0.00%	0 / 191	0.00% covered (danger)	0.00%	0 / 11	7482	0.00% covered (danger)	0.00%	0 / 1
__construct	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	2
resetState	0.00% covered (danger)	0.00%	0 / 2	0.00% covered (danger)	0.00%	0 / 1	6
reset	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	2
onNewline	0.00% covered (danger)	0.00%	0 / 14	0.00% covered (danger)	0.00%	0 / 1	30
onEnd	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	2
clearSOL	0.00% covered (danger)	0.00%	0 / 2	0.00% covered (danger)	0.00%	0 / 1	2
reprocessTokens	0.00% covered (danger)	0.00%	0 / 11	0.00% covered (danger)	0.00%	0 / 1	6
convertTokenToString	0.00% covered (danger)	0.00%	0 / 22	0.00% covered (danger)	0.00%	0 / 1	182
onAny	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	6
handleT2529Hack	0.00% covered (danger)	0.00%	0 / 8	0.00% covered (danger)	0.00%	0 / 1	56
onAnyInternal	0.00% covered (danger)	0.00%	0 / 116	0.00% covered (danger)	0.00%	0 / 1	2652

1	<?php
2	declare( strict_types = 1 );
3
4	namespace Wikimedia\Parsoid\Wt2Html\TT;
5
6	use Wikimedia\Parsoid\NodeData\DataParsoid;
7	use Wikimedia\Parsoid\Tokens\EndTagTk;
8	use Wikimedia\Parsoid\Tokens\EOFTk;
9	use Wikimedia\Parsoid\Tokens\KV;
10	use Wikimedia\Parsoid\Tokens\NlTk;
11	use Wikimedia\Parsoid\Tokens\SelfclosingTagTk;
12	use Wikimedia\Parsoid\Tokens\TagTk;
13	use Wikimedia\Parsoid\Tokens\Token;
14	use Wikimedia\Parsoid\Utils\PHPUtils;
15	use Wikimedia\Parsoid\Utils\TokenUtils;
16	use Wikimedia\Parsoid\Wt2Html\PegTokenizer;
17	use Wikimedia\Parsoid\Wt2Html\TokenTransformManager;
18
/**
 * This class is an attempt to fixup the token stream to reparse strings
 * as tokens that failed to parse in the tokenizer because of SOL or
 * other constraints OR because tags were being constructed in pieces
 * or whatever.
 *
 * This is a pure hack to improve compatibility with the core parser
 * given that we don't have a preprocessor. This will be a grab-bag of
 * heuristics and tricks to handle different scenarios.
 */
class TokenStreamPatcher extends TokenHandler {
	/** Tokenizer used to re-tokenize strings and token source ranges */
	private PegTokenizer $tokenizer;

	/**
	 * Source offset tracked while in start-of-line state; null once
	 * SOL is cleared or when a token carries no tsr.
	 *
	 * @var int|null
	 */
	private $srcOffset;

	/** Whether the stream is currently at start-of-line */
	private bool $sol;

	/** Buffered newline / whitespace / transclusion-meta tokens awaiting emission */
	private array $tokenBuf;

	/** Count of currently open wikitext tables */
	private int $wikiTableNesting;

	/** True only for top-level & attribute value pipelines */
	private bool $inIndependentParse;

	/** @var Token|null */
	private $lastConvertedTableCellToken;

	/** @var SelfclosingTagTk|null */
	private $tplStartToken = null;

	/** @var NlTk|null */
	private $discardableNlTk = null;

	/**
	 * @param TokenTransformManager $manager
	 * @param array $options Handler options; 'tsp' => true is added unless
	 *   the caller already supplied a 'tsp' key.
	 */
	public function __construct( TokenTransformManager $manager, array $options ) {
		$newOptions = [ 'tsp' => true ] + $options;
		parent::__construct( $manager, $newOptions );
		$this->tokenizer = new PegTokenizer( $this->env );
		$this->reset();
	}

	/**
	 * Resets any internal state for this token handler.
	 *
	 * @param array $parseOpts
	 */
	public function resetState( array $parseOpts ): void {
		parent::resetState( $parseOpts );
		$this->inIndependentParse = $this->atTopLevel || isset( $this->options['attrExpansion'] );
	}

	/**
	 * Restore the per-stream tracking state (offset, SOL flag, token
	 * buffer, table nesting, converted-cell marker) to initial values.
	 */
	private function reset() {
		$this->srcOffset = 0;
		$this->sol = true;
		$this->tokenBuf = [];
		$this->wikiTableNesting = 0;
		// This marker tries to track the most recent table-cell token (td/th)
		// that was converted to string. For those, we want to get rid
		// of their corresponding mw:TSRMarker meta tag.
		//
		// This marker is set when we convert a td/th token to string
		//
		// This marker is cleared in one of the following scenarios:
		// 1. When we clear a mw:TSRMarker corresponding to the token set earlier
		// 2. When we change table nesting
		// 3. When we hit a tr/td/th/caption token that wasn't converted to string
		$this->lastConvertedTableCellToken = null;
	}

	/**
	 * Build the 'trace/tsp' log line for a token: the current
	 * independent-parse and SOL flags followed by the JSON-encoded token.
	 *
	 * @param mixed $token
	 * @return string
	 */
	private function traceMessage( $token ): string {
		return "(indep=" . ( $this->inIndependentParse ? "yes" : "no " ) .
			";sol=" . ( $this->sol ? "yes" : "no " ) . ") " .
			PHPUtils::jsonEncode( $token );
	}

	/**
	 * @inheritDoc
	 */
	public function onNewline( NlTk $token ): ?TokenHandlerResult {
		$self = $this;
		$this->env->log( 'trace/tsp', $this->pipelineId,
			static function () use ( $self, $token ) {
				return $self->traceMessage( $token );
			}
		);
		$this->srcOffset = $token->dataParsoid->tsr->end ?? null;
		if ( $this->sol && $this->tplStartToken ) {
			// When using core preprocessor, start-of-line start is forced by
			// inserting a newline in certain cases (the "T2529 hack"). In the
			// legacy parser, the T2529 hack is never applied if the template was
			// already at the start of the line (the `!$piece['lineStart']`
			// check in Parser::braceSubstitution where T2529 is handled), but
			// that context (`$this->sol`) isn't passed through when Parsoid
			// invokes the core preprocessor. Thus, when $this->sol is true,
			// prepare to (if the following tokens warrant it) remove an unnecessary
			// T2529 newline added by the legacy preprocessor.
			$this->discardableNlTk = $token;
		}
		// The newline is buffered rather than emitted; it is flushed (or
		// replaced / dropped) when a later token is processed.
		$this->tokenBuf[] = $token;
		$this->sol = true;
		return new TokenHandlerResult( [] );
	}

	/**
	 * @inheritDoc
	 */
	public function onEnd( EOFTk $token ): ?TokenHandlerResult {
		// Route EOF through onAny first so buffered tokens are flushed,
		// then drop all per-stream state.
		$res = $this->onAny( $token );
		$this->reset();
		return $res;
	}

	/**
	 * Clear start of line info
	 */
	private function clearSOL() {
		// clear tsr and sol flag
		$this->srcOffset = null;
		$this->sol = false;
	}

	/**
	 * Fully reprocess the output tokens from the tokenizer through
	 * all the other handlers in stage 2.
	 *
	 * @param int $srcOffset Offset by which the tokens' tsr is shifted
	 * @param array $toks
	 * @param bool $popEOF Whether to drop the last token of the result
	 * @return array
	 */
	private function reprocessTokens( int $srcOffset, array $toks, bool $popEOF = false ): array {
		// Update tsr
		TokenUtils::shiftTokenTSR( $toks, $srcOffset );
		$pipe = $this->env->getPipelineFactory()->getPipeline( "tokens/x-mediawiki" );
		$pipe->init( [
			'frame' => $this->manager->getFrame(),
			'toplevel' => $this->atTopLevel,
			// The tokens should be reprocessed in the context of the original frame's source
			'srcText' => $this->manager->getFrame()->getSrcText()
		] );
		$toks = (array)$pipe->parse( $toks, [] );
		if ( $popEOF ) {
			array_pop( $toks ); // pop EOFTk
		}
		return $toks;
	}

	/**
	 * Convert a wikitext-syntax token back to plain content.
	 *
	 * If the token has a non-empty tsr, the corresponding source text is
	 * re-tokenized in non-SOL mode and reprocessed. Otherwise a fixed
	 * wikitext string is produced based on the token name, or the token
	 * itself is returned when no conversion applies.
	 *
	 * @param Token $token
	 * @return array Replacement tokens / strings
	 */
	private function convertTokenToString( Token $token ): array {
		$da = $token->dataParsoid;
		$tsr = $da->tsr ?? null;

		if ( $tsr && $tsr->end > $tsr->start ) {
			// > will only hold if these are valid numbers
			$str = $tsr->substr( $this->manager->getFrame()->getSrcText() );
			// sol === false ensures that the pipe will not be parsed as a <td>/listItem again
			$toks = $this->tokenizer->tokenizeSync( $str, [ 'sol' => false ] );
			return $this->reprocessTokens( $tsr->start, $toks, true );
		} elseif ( !empty( $da->autoInsertedStart ) && !empty( $da->autoInsertedEnd ) ) {
			return [ '' ];
		} else {
			// SSS FIXME: What about "!!" and "||"??
			switch ( $token->getName() ) {
				case 'td':
					return [ '|' ];
				case 'th':
					return [ '!' ];
				case 'tr':
					return [ '|-' ];
				case 'caption':
					return [ $token instanceof TagTk ? '|+' : '' ];
				case 'table':
					return [ $token instanceof EndTagTk ? '|}' : $token ];
				case 'listItem':
					return [ implode( '', $token->getAttributeV( 'bullets' ) ) ];
			}

			// No conversion if we get here
			return [ $token ];
		}
	}

	/**
	 * @inheritDoc
	 */
	public function onAny( $token ): ?TokenHandlerResult {
		try {
			return $this->onAnyInternal( $token );
		} finally {
			// Ensure we always clean up discardableNlTk and tplStartToken even
			// in the presence of exceptions.
			$this->discardableNlTk = null;
			// Keep tplStartToken only while the token just processed is the
			// transclusion start itself, so that it is visible to exactly
			// the next token.
			if ( $this->tplStartToken !== $token ) {
				$this->tplStartToken = null;
			}
		}
	}

	/**
	 * The legacy parser's "T2529 hack" attempts to ensure templates are
	 * always evaluated in start-of-line context by prepending a newline
	 * if necessary. However, it is inconsistent: in particular it
	 * only treats {| : ; # * as SOL-sensitive tokens, neglecting ==
	 * (headings) and ! | |} (in table context).
	 *
	 * If we're using the core preprocessor for template expansion:
	 * - The core preprocessor as invoked by Parsoid will always insert the
	 *   newline in the "T2529 cases" (even though it's not necessary; Parsoid
	 *   is already in SOL mode) HOWEVER
	 * - As described in ::onNewline() above, the newline insertion is
	 *   /supposed/ to be suppressed if the template was already
	 *   at the start of the line. So we need to strip the unnecessarily
	 *   added NlTk to avoid "extra" whitespace in Parsoid's expansion.
	 *   Ex: "{{my-tpl}}" in sol-context which will get expanded to "\n*foo"
	 *   but the "\n" wasn't necessary
	 *
	 * If we're in native preprocessor mode:
	 * - If we are in SOL state, we don't need to add a newline.
	 * - If we are not in SOL state, we need to insert a newline in 'T2529' cases.
	 *   Ex: "{{my-tpl}}" in sol-context which expands to "*foo" but in
	 *   non-sol context expands to "\n*foo"
	 *
	 * @param string $tokenName
	 */
	private function handleT2529Hack( string $tokenName ): void {
		// Only table and list-item tokens are handled here.
		if ( $tokenName === 'table' || $tokenName === 'listItem' ) {
			// We're in a context when the core preprocessor would apply
			// the "T2529 hack" to ensure start-of-line context.
			if ( $this->discardableNlTk ) {
				// We're using core preprocessor and were already at
				// the start of the line, so the core preprocessor wouldn't
				// actually have inserted a newline here. Swallow up ours.
				// (discardableNlTk is reset after every onAny() call, so it
				// is only set when the NlTk was the most recently buffered
				// token.)
				array_pop( $this->tokenBuf );
			} elseif ( !$this->sol &&
				$this->tplStartToken &&
				$this->env->nativeTemplateExpansionEnabled()
			) {
				// Native preprocessor; add a newline in "T2529 cases"
				// for correct whitespace. (Remember that this only happens
				// if we weren't already at the start of the line.)
				// Add a newline & force SOL
				$this->tokenBuf[] = new NlTk( null );
				$this->sol = true;
			}
		}
	}

	/**
	 * Core token handling: updates SOL / offset / table-nesting state for
	 * the token, possibly replaces it with reparsed tokens or wikitext
	 * strings, and emits it together with any buffered tokens.
	 *
	 * Returns an empty result when the token is buffered instead of emitted.
	 *
	 * @param mixed $token
	 * @return ?TokenHandlerResult
	 */
	public function onAnyInternal( $token ): ?TokenHandlerResult {
		$self = $this;
		$this->env->log( 'trace/tsp', $this->pipelineId,
			static function () use ( $self, $token ) {
				return $self->traceMessage( $token );
			}
		);

		$tokens = [ $token ];
		$tc = TokenUtils::getTokenType( $token );
		switch ( $tc ) {
			case 'string':
				// While we are buffering newlines to suppress them
				// in case we see a category, buffer all intervening
				// white-space as well.
				if ( count( $this->tokenBuf ) > 0 && preg_match( '/^\s*$/D', $token ) ) {
					$this->tokenBuf[] = $token;
					return new TokenHandlerResult( [] );
				}

				// This is only applicable where we use Parsoid's (broken) native preprocessor.
				// This supports scenarios like "{{1x|*bar}}". When "{{{1}}}" is tokenized
				// "*bar" isn't available and so won't become a list.
				// FIXME: {{1x|1===foo==}} will still be broken. So, this fix below is somewhat
				// independent of T2529 for our broken preprocessor but we are restricting the
				// fix to T2529.
				$T2529hack = false;
				if ( $this->env->nativeTemplateExpansionEnabled() &&
					$this->tplStartToken &&
					preg_match( '/^(?:{\\||[:;#*])/', $token )
				) {
					// Add a newline & force SOL
					$T2529hack = true;
					// Remember that newline insertion in the core preprocessor
					// only occurs if we weren't already at the start of
					// the line (see discussion in ::onNewline() above).
					if ( !$this->sol ) {
						$this->tokenBuf[] = new NlTk( null );
						$this->sol = true;
					}
				}

				if ( $this->sol ) {
					// Attempt to match "{|" after a newline and convert
					// it to a table token.
					if ( $this->inIndependentParse && str_starts_with( $token, '{|' ) ) {
						// Reparse string with the 'table_start_tag' rule
						// and fully reprocess them.
						$retoks = $this->tokenizer->tokenizeAs( $token, 'table_start_tag', /* sol */true );
						if ( $retoks === false ) {
							// XXX: The string begins with table start syntax,
							// we really shouldn't be here. Anything else on the
							// line would get swallowed up as attributes.
							$this->env->log( 'error', 'Failed to tokenize table start tag.' );
							$this->clearSOL();
						} else {
							// NOTE(review): $this->srcOffset is declared int|null but
							// reprocessTokens() requires int -- confirm it cannot be
							// null on this path (e.g. after an NlTk without tsr).
							$tokens = $this->reprocessTokens( $this->srcOffset, $retoks );
							$this->wikiTableNesting++;
							$this->lastConvertedTableCellToken = null;
						}
					} elseif ( $this->inIndependentParse && $T2529hack ) { // {| has been handled above
						$retoks = $this->tokenizer->tokenizeAs( $token, 'list_item', /* sol */true );
						if ( $retoks === false ) {
							$this->env->log( 'error', 'Failed to tokenize list item.' );
							$this->clearSOL();
						} else {
							$tokens = $this->reprocessTokens( $this->srcOffset, $retoks );
						}
					} elseif ( preg_match( '/^\s*$/D', $token ) ) {
						// White-space doesn't change SOL state
						// Update srcOffset
						// NOTE(review): if srcOffset is null here, += treats it
						// as 0 -- confirm that is intended.
						$this->srcOffset += strlen( $token );
					} else {
						$this->clearSOL();
					}
				} else {
					$this->clearSOL();
				}
				break;

			case 'CommentTk':
				// Comments don't change SOL state
				// Update srcOffset
				$this->srcOffset = $token->dataParsoid->tsr->end ?? null;
				break;

			case 'SelfclosingTagTk':
				if ( $token->getName() === 'meta' && ( $token->dataParsoid->stx ?? '' ) !== 'html' ) {
					if ( TokenUtils::hasTypeOf( $token, 'mw:Transclusion' ) ) {
						$this->tplStartToken = $token;
					}
					$this->srcOffset = $token->dataParsoid->tsr->end ?? null;
					if ( count( $this->tokenBuf ) > 0 &&
						TokenUtils::hasTypeOf( $token, 'mw:Transclusion' )
					) {
						// If we have buffered newlines, we might very well encounter
						// a category link, so continue buffering.
						$this->tokenBuf[] = $token;
						return new TokenHandlerResult( [] );
					}
				} elseif ( $token->getName() === 'link' &&
					$token->getAttributeV( 'rel' ) === 'mw:PageProp/Category'
				) {
					// Replace buffered newline & whitespace tokens with mw:EmptyLine
					// meta-tokens. This tunnels them through the rest of the transformations
					// without affecting them. During HTML building, they are expanded
					// back to newlines / whitespace.
					$n = count( $this->tokenBuf );
					if ( $n > 0 ) {
						// Find the first buffered tag token (the transclusion
						// meta, if one was buffered); everything before it is
						// newline / whitespace.
						$i = 0;
						while ( $i < $n &&
							!( $this->tokenBuf[$i] instanceof SelfclosingTagTk )
						) {
							$i++;
						}

						$dp = new DataParsoid;
						$dp->tokens = array_slice( $this->tokenBuf, 0, $i );
						$toks = [
							new SelfclosingTagTk( 'meta',
								[ new KV( 'typeof', 'mw:EmptyLine' ) ],
								$dp
							)
						];
						if ( $i < $n ) {
							// Emit the buffered tag token as-is, then wrap
							// whatever followed it in a second mw:EmptyLine.
							$toks[] = $this->tokenBuf[$i];
							if ( $i + 1 < $n ) {
								$dp = new DataParsoid;
								$dp->tokens = array_slice( $this->tokenBuf, $i + 1 );
								$toks[] = new SelfclosingTagTk( 'meta',
									[ new KV( 'typeof', 'mw:EmptyLine' ) ],
									$dp
								);
							}
						}
						$tokens = array_merge( $toks, $tokens );
						$this->tokenBuf = [];
					}
					$this->clearSOL();
				} else {
					$this->clearSOL();
				}
				break;

			case 'TagTk':
				if ( $this->inIndependentParse && !TokenUtils::isHTMLTag( $token ) ) {
					$tokenName = $token->getName();
					$this->handleT2529Hack( $tokenName );
					if ( $tokenName === 'listItem' && isset( $this->options['attrExpansion'] ) ) {
						// Convert list items back to bullet wikitext in attribute context
						$tokens = $this->convertTokenToString( $token );
					} elseif ( $tokenName === 'table' ) {
						$this->lastConvertedTableCellToken = null;
						$this->wikiTableNesting++;
					} elseif ( in_array( $tokenName, [ 'td', 'th', 'tr', 'caption' ], true ) ) {
						if ( $this->wikiTableNesting === 0 ) {
							// Table syntax outside of any open wikitext table:
							// turn it back into wikitext.
							if ( $tokenName === 'td' || $tokenName === 'th' ) {
								$this->lastConvertedTableCellToken = $token;
							}
							$tokens = $this->convertTokenToString( $token );
						} else {
							$this->lastConvertedTableCellToken = null;
						}
					}
				}
				$this->clearSOL();
				break;

			case 'EndTagTk':
				if ( $this->inIndependentParse && !TokenUtils::isHTMLTag( $token ) ) {
					if ( $this->wikiTableNesting > 0 ) {
						if ( $token->getName() === 'table' ) {
							$this->lastConvertedTableCellToken = null;
							$this->wikiTableNesting--;
						}
					} elseif ( $token->getName() === 'table' || $token->getName() === 'caption' ) {
						// Convert this to "|}"
						$tokens = $this->convertTokenToString( $token );
					}
				}
				$this->clearSOL();
				break;

			default:
				break;
		}

		// Emit buffered newlines (and a transclusion meta-token, if any)
		if ( count( $this->tokenBuf ) > 0 ) {
			$tokens = array_merge( $this->tokenBuf, $tokens );
			$this->tokenBuf = [];
		}
		return new TokenHandlerResult( $tokens );
	}
}