Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 194 |
|
0.00% |
0 / 11 |
CRAP | |
0.00% |
0 / 1 |
TokenStreamPatcher | |
0.00% |
0 / 194 |
|
0.00% |
0 / 11 |
7310 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
resetState | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
reset | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
onNewline | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
30 | |||
onEnd | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
clearSOL | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
reprocessTokens | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
6 | |||
convertTokenToString | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
182 | |||
onAny | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
handleT2529Hack | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
56 | |||
onAnyInternal | |
0.00% |
0 / 115 |
|
0.00% |
0 / 1 |
2550 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html\TT; |
5 | |
6 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
7 | use Wikimedia\Parsoid\Tokens\EndTagTk; |
8 | use Wikimedia\Parsoid\Tokens\EOFTk; |
9 | use Wikimedia\Parsoid\Tokens\KV; |
10 | use Wikimedia\Parsoid\Tokens\NlTk; |
11 | use Wikimedia\Parsoid\Tokens\SelfclosingTagTk; |
12 | use Wikimedia\Parsoid\Tokens\TagTk; |
13 | use Wikimedia\Parsoid\Tokens\Token; |
14 | use Wikimedia\Parsoid\Utils\PHPUtils; |
15 | use Wikimedia\Parsoid\Utils\PipelineUtils; |
16 | use Wikimedia\Parsoid\Utils\TokenUtils; |
17 | use Wikimedia\Parsoid\Wt2Html\PegTokenizer; |
18 | use Wikimedia\Parsoid\Wt2Html\TokenTransformManager; |
19 | |
20 | /** |
21 | * This class is an attempt to fixup the token stream to reparse strings |
22 | * as tokens that failed to parse in the tokenizer because of SOL or |
23 | * other constraints OR because tags were being constructed in pieces |
24 | * or whatever. |
25 | * |
26 | * This is a pure hack to improve compatibility with the core parser |
27 | * given that we don't have a preprocessor. This will be a grab-bag of
28 | * heuristics and tricks to handle different scenarios. |
29 | */ |
30 | class TokenStreamPatcher extends TokenHandler { |
31 | private PegTokenizer $tokenizer; |
32 | |
33 | /** @var int|null */ |
34 | private $srcOffset; |
35 | |
36 | private bool $sol; |
37 | |
38 | private array $tokenBuf; |
39 | private int $wikiTableNesting; |
40 | /** True only for top-level & attribute value pipelines */ |
41 | private bool $inIndependentParse; |
42 | |
43 | /** @var Token|null */ |
44 | private $lastConvertedTableCellToken; |
45 | |
46 | /** @var SelfclosingTagTk|null */ |
47 | private $tplStartToken = null; |
48 | |
49 | /** @var NlTk|null */ |
50 | private $discardableNlTk = null; |
51 | |
/**
 * Construct the handler: force the 'tsp' option on, create a private
 * tokenizer used for re-tokenizing strings, and initialise stream state.
 *
 * @param TokenTransformManager $manager
 * @param array $options Handler options; any caller-supplied 'tsp' entry is
 *   overridden because the array union keeps the left-hand value.
 */
public function __construct( TokenTransformManager $manager, array $options ) {
	parent::__construct( $manager, [ 'tsp' => true ] + $options );
	$this->tokenizer = new PegTokenizer( $this->env );
	$this->reset();
}
58 | |
/**
 * Resets any internal state for this token handler, then recomputes
 * whether this pipeline is an "independent parse" (top-level, or an
 * attribute-expansion pipeline).
 *
 * @param array $parseOpts
 */
public function resetState( array $parseOpts ): void {
	parent::resetState( $parseOpts );

	if ( $this->atTopLevel ) {
		$this->inIndependentParse = true;
	} else {
		// Nested pipelines only qualify when they expand attribute values.
		$this->inIndependentParse = isset( $this->options['attrExpansion'] );
	}
}
68 | |
/**
 * Return the per-stream bookkeeping (offset, SOL flag, token buffer,
 * table nesting, converted-cell marker) to its start-of-input values.
 */
private function reset() {
	$this->tokenBuf = [];
	$this->sol = true;
	$this->srcOffset = 0;
	$this->wikiTableNesting = 0;

	// lastConvertedTableCellToken remembers the most recent table-cell
	// token (td/th) that was turned back into a string, so that its
	// matching mw:TSRMarker meta tag can be dropped.
	//
	// It is set when a td/th token is converted to a string, and it is
	// cleared when:
	// 1. the mw:TSRMarker belonging to that token is cleared,
	// 2. the table nesting level changes, or
	// 3. a tr/td/th/caption token is seen that was not converted to string.
	$this->lastConvertedTableCellToken = null;
}
86 | |
/**
 * Buffer a newline token instead of emitting it, record the source offset
 * that follows it, and switch the handler into start-of-line state.
 *
 * @inheritDoc
 */
public function onNewline( NlTk $token ): ?TokenHandlerResult {
	$handler = $this;
	$this->env->log(
		'trace/tsp',
		$this->pipelineId,
		static function () use ( $handler, $token ) {
			$indep = $handler->inIndependentParse ? "yes" : "no ";
			$sol = $handler->sol ? "yes" : "no ";
			return "(indep=" . $indep . ";sol=" . $sol . ') ' . PHPUtils::jsonEncode( $token );
		}
	);

	// The offset is unknown (null) when the newline carries no tsr.
	$this->srcOffset = $token->dataParsoid->tsr->end ?? null;

	if ( $this->tplStartToken && $this->sol ) {
		// With the core preprocessor, start-of-line context is forced by
		// inserting a newline in certain cases (the "T2529 hack"). The
		// legacy parser never applies that hack when the template was
		// already at the start of the line (the `!$piece['lineStart']`
		// check in Parser::braceSubstitution where T2529 is handled), but
		// that context (`$this->sol`) isn't passed through when Parsoid
		// invokes the core preprocessor. So, when we were already at SOL,
		// remember this newline: if the tokens that follow warrant it, the
		// unnecessary T2529 newline added by the legacy preprocessor will
		// be removed.
		$this->discardableNlTk = $token;
	}

	$this->sol = true;
	$this->tokenBuf[] = $token;

	// Nothing is emitted now; the buffer is flushed by a later token.
	return new TokenHandlerResult( [] );
}
116 | |
/**
 * Handle the end-of-input token: run it through ::onAny() so buffered
 * tokens are flushed, then reset the handler for the next stream.
 *
 * The reset happens in a `finally` block so that stale buffer / SOL state
 * is not left behind on this handler when ::onAny() throws.
 *
 * @inheritDoc
 */
public function onEnd( EOFTk $token ): ?TokenHandlerResult {
	try {
		return $this->onAny( $token );
	} finally {
		$this->reset();
	}
}
125 | |
/**
 * Leave start-of-line state: drop the SOL flag and forget the tracked
 * source offset (null means "unknown").
 */
private function clearSOL() {
	$this->sol = false;
	$this->srcOffset = null;
}
134 | |
/**
 * Shift the TSR of freshly tokenized tokens and run them through the
 * 'peg-tokens-to-expanded-tokens' pipeline so that all the other stage 2
 * handlers get to see them.
 *
 * @param int|false|null $srcOffset See TokenUtils::shiftTokenTSR, which has
 *   b/c for null. Callers in this class pass the tracked offset, which may
 *   be null.
 * @param array $toks Tokens produced by the tokenizer
 * @param bool $popEOF Whether to drop the last token of the result
 *   (the trailing EOFTk)
 * @return array
 */
private function reprocessTokens( $srcOffset, array $toks, bool $popEOF = false ): array {
	// TSR must be updated before the tokens enter the nested pipeline.
	TokenUtils::shiftTokenTSR( $toks, $srcOffset );

	$pipelineConfig = [
		'pipelineType' => 'peg-tokens-to-expanded-tokens',
		'pipelineOpts' => [],
		'sol' => true,
		'toplevel' => $this->atTopLevel,
	];
	$expanded = (array)PipelineUtils::processContentInPipeline(
		$this->env,
		$this->manager->getFrame(),
		$toks,
		$pipelineConfig
	);

	if ( !$popEOF ) {
		return $expanded;
	}

	// Discard the trailing EOFTk
	array_pop( $expanded );
	return $expanded;
}
165 | |
/**
 * Turn a wikitext-syntax token back into the string (or tokens) it
 * should have been.
 *
 * If the token has a non-empty TSR range, the matching source text is
 * re-tokenized in non-SOL mode and fully reprocessed. Otherwise, tokens
 * flagged as auto-inserted at both ends become the empty string, and the
 * remaining known token names map to their literal wikitext markup.
 *
 * @param Token $token
 * @return array Replacement tokens / strings; [ $token ] if no conversion applies
 */
private function convertTokenToString( Token $token ): array {
	$dp = $token->dataParsoid;
	$range = $dp->tsr ?? null;

	// The > comparison only holds if both ends are valid numbers
	if ( $range && $range->end > $range->start ) {
		$wikitext = $range->substr( $this->manager->getFrame()->getSrcText() );
		// sol === false ensures that the pipe will not be parsed as a <td>/listItem again
		$retokenized = $this->tokenizer->tokenizeSync( $wikitext, [ 'sol' => false ] );
		return $this->reprocessTokens( $range->start, $retokenized, true );
	}

	if ( !empty( $dp->autoInsertedStart ) && !empty( $dp->autoInsertedEnd ) ) {
		return [ '' ];
	}

	// SSS FIXME: What about "!!" and "||"??
	$name = $token->getName();
	if ( $name === 'td' ) {
		return [ '|' ];
	}
	if ( $name === 'th' ) {
		return [ '!' ];
	}
	if ( $name === 'tr' ) {
		return [ '|-' ];
	}
	if ( $name === 'caption' ) {
		// Only the opening tag has markup of its own
		return [ $token instanceof TagTk ? '|+' : '' ];
	}
	if ( $name === 'table' ) {
		// Only the end tag is converted; other table tokens pass through
		return [ $token instanceof EndTagTk ? '|}' : $token ];
	}
	if ( $name === 'listItem' ) {
		return [ implode( '', $token->getAttributeV( 'bullets' ) ) ];
	}

	// No conversion if we get here
	return [ $token ];
}
199 | |
/**
 * Delegate to ::onAnyInternal() and afterwards clear the one-token-lived
 * markers (discardableNlTk, tplStartToken), even if an exception is thrown.
 *
 * @inheritDoc
 */
public function onAny( $token ): ?TokenHandlerResult {
	try {
		$result = $this->onAnyInternal( $token );
		return $result;
	} finally {
		// tplStartToken survives only while it is the token just handled.
		if ( $this->tplStartToken !== $token ) {
			$this->tplStartToken = null;
		}
		$this->discardableNlTk = null;
	}
}
215 | |
/**
 * Reconcile newline handling with the legacy parser's "T2529 hack".
 *
 * That hack tries to make sure templates are always evaluated in
 * start-of-line context by prepending a newline if necessary. It is
 * inconsistent, though: it only treats }| : ; # * as SOL-sensitive
 * tokens and neglects == (headings) and ! | |} (in table context).
 *
 * When the core preprocessor does the template expansion:
 * - As invoked by Parsoid it always inserts the newline in the
 *   "T2529 cases" (although Parsoid is already in SOL mode), *HOWEVER*
 * - as described in ::onNewline(), that insertion is /supposed/ to be
 *   suppressed when the template was *already* at the start of the line.
 *   The unnecessarily added NlTk therefore has to be stripped to avoid
 *   "extra" whitespace in Parsoid's expansion.
 *   Ex: "{{my-tpl}}" in sol-context gets expanded to "\n*foo" although
 *       the "\n" wasn't necessary.
 *
 * When the native preprocessor is used:
 * - In SOL state no newline needs to be added.
 * - Outside SOL state a newline is inserted in the 'T2529' cases.
 *   Ex: "{{my-tpl}}" in sol-context expands to "*foo" but in
 *       non-sol context expands to "\n*foo".
 *
 * @param string $tokenName Name of the tag token being handled; only
 *   'table' and 'listItem' trigger any action.
 */
private function handleT2529Hack( string $tokenName ): void {
	// Only these tokens are contexts in which the core preprocessor would
	// apply the "T2529 hack" to ensure start-of-line context.
	if ( $tokenName !== 'table' && $tokenName !== 'listItem' ) {
		return;
	}

	if ( $this->discardableNlTk ) {
		// Core preprocessor, and we were already at the start of the line:
		// core wouldn't actually have inserted a newline here, so swallow
		// the one sitting at the end of the buffer.
		array_pop( $this->tokenBuf );
		return;
	}

	if ( $this->sol || !$this->tplStartToken ) {
		return;
	}

	if ( $this->env->nativeTemplateExpansionEnabled() ) {
		// Native preprocessor: add a newline in "T2529 cases" for correct
		// whitespace (this only happens because we weren't already at the
		// start of the line) and force SOL.
		$this->tokenBuf[] = new NlTk( null );
		$this->sol = true;
	}
}
265 | |
/**
 * Worker for ::onAny(): tracks start-of-line state and source offsets,
 * re-tokenizes strings that should have been table / list-item markup,
 * converts stray table / list tokens back to strings, and flushes any
 * buffered newline / whitespace tokens ahead of the current token.
 *
 * @param mixed $token A token object or a plain string
 * @return ?TokenHandlerResult The tokens to emit; an empty result while
 *   the token is only being buffered
 */
public function onAnyInternal( $token ): ?TokenHandlerResult {
	$self = $this;
	$this->env->log( 'trace/tsp', $this->pipelineId,
		static function () use ( $self, $token ) {
			return "(indep=" . ( $self->inIndependentParse ? "yes" : "no " ) .
				";sol=" . ( $self->sol ? "yes" : "no " ) . ') ' .
				PHPUtils::jsonEncode( $token );
		}
	);

	$tokens = [ $token ];
	$tc = TokenUtils::getTokenType( $token );
	switch ( $tc ) {
		case 'string':
			// While we are buffering newlines to suppress them
			// in case we see a category, buffer all intervening
			// white-space as well.
			if ( count( $this->tokenBuf ) > 0 && preg_match( '/^\s*$/D', $token ) ) {
				$this->tokenBuf[] = $token;
				return new TokenHandlerResult( [] );
			}

			// This is only applicable where we use Parsoid's (broken) native preprocessor.
			// This supports scenarios like "{{1x|*bar}}". When "{{{1}}}" is tokenized
			// "*bar" isn't available and so won't become a list.
			// FIXME: {{1x|1===foo==}} will still be broken. So, this fix below is somewhat
			// independent of T2529 for our broken preprocessor but we are restricting the
			// fix to T2529.
			$T2529hack = false;
			if ( $this->env->nativeTemplateExpansionEnabled() &&
				$this->tplStartToken &&
				preg_match( '/^(?:{\\||[:;#*])/', $token )
			) {
				$T2529hack = true;
				// Newline insertion in the core preprocessor only occurs
				// if we weren't already at the start of the line (see
				// discussion in ::onNewline() above). Mirror that here:
				// add a newline & force SOL only when not already at SOL.
				if ( !$this->sol ) {
					$this->tokenBuf[] = new NlTk( null );
					$this->sol = true;
				}
			}

			if ( $this->sol ) {
				// Attempt to match "{|" after a newline and convert
				// it to a table token.
				if ( $this->inIndependentParse && str_starts_with( $token, '{|' ) ) {
					// Reparse string with the 'table_start_tag' rule
					// and fully reprocess them.
					$retoks = $this->tokenizer->tokenizeAs( $token, 'table_start_tag', /* sol */true );
					if ( $retoks === false ) {
						// XXX: The string begins with table start syntax,
						// we really shouldn't be here. Anything else on the
						// line would get swallowed up as attributes.
						$this->env->log( 'error', 'Failed to tokenize table start tag.' );
						$this->clearSOL();
					} else {
						$tokens = $this->reprocessTokens( $this->srcOffset, $retoks );
						$this->wikiTableNesting++;
						$this->lastConvertedTableCellToken = null;
					}
				} elseif ( $this->inIndependentParse && $T2529hack ) { // {| has been handled above
					$retoks = $this->tokenizer->tokenizeAs( $token, 'list_item', /* sol */true );
					if ( $retoks === false ) {
						$this->env->log( 'error', 'Failed to tokenize list item.' );
						$this->clearSOL();
					} else {
						$tokens = $this->reprocessTokens( $this->srcOffset, $retoks );
					}
				} elseif ( preg_match( '/^\s*$/D', $token ) ) {
					// White-space doesn't change SOL state; just advance
					// srcOffset. An unknown (null) offset must stay unknown:
					// `null += n` would silently coerce it to a bogus
					// integer offset that later gets applied to token TSRs.
					if ( $this->srcOffset !== null ) {
						$this->srcOffset += strlen( $token );
					}
				} else {
					$this->clearSOL();
				}
			} else {
				$this->clearSOL();
			}
			break;

		case 'CommentTk':
			// Comments don't change SOL state
			// Update srcOffset
			$this->srcOffset = $token->dataParsoid->tsr->end ?? null;
			break;

		case 'SelfclosingTagTk':
			if ( $token->getName() === 'meta' && ( $token->dataParsoid->stx ?? '' ) !== 'html' ) {
				if ( TokenUtils::hasTypeOf( $token, 'mw:Transclusion' ) ) {
					$this->tplStartToken = $token;
				}
				$this->srcOffset = $token->dataParsoid->tsr->end ?? null;
				if ( count( $this->tokenBuf ) > 0 &&
					TokenUtils::hasTypeOf( $token, 'mw:Transclusion' )
				) {
					// If we have buffered newlines, we might very well encounter
					// a category link, so continue buffering.
					$this->tokenBuf[] = $token;
					return new TokenHandlerResult( [] );
				}
			} elseif ( TokenUtils::isSolTransparentLinkTag( $token ) ) {
				// Replace buffered newline & whitespace tokens with mw:EmptyLine
				// meta-tokens. This tunnels them through the rest of the transformations
				// without affecting them. During HTML building, they are expanded
				// back to newlines / whitespace.
				$n = count( $this->tokenBuf );
				if ( $n > 0 ) {
					// Find the first buffered meta token (if any); everything
					// before it goes into the first mw:EmptyLine wrapper.
					$i = 0;
					while ( $i < $n &&
						!( $this->tokenBuf[$i] instanceof SelfclosingTagTk )
					) {
						$i++;
					}

					$dp = new DataParsoid;
					$dp->tokens = array_slice( $this->tokenBuf, 0, $i );
					$toks = [
						new SelfclosingTagTk( 'meta',
							[ new KV( 'typeof', 'mw:EmptyLine' ) ],
							$dp
						)
					];
					if ( $i < $n ) {
						// Keep the buffered meta token itself, then wrap
						// whatever was buffered after it.
						$toks[] = $this->tokenBuf[$i];
						if ( $i + 1 < $n ) {
							$dp = new DataParsoid;
							$dp->tokens = array_slice( $this->tokenBuf, $i + 1 );
							$toks[] = new SelfclosingTagTk( 'meta',
								[ new KV( 'typeof', 'mw:EmptyLine' ) ],
								$dp
							);
						}
					}
					$tokens = array_merge( $toks, $tokens );
					$this->tokenBuf = [];
				}
				$this->clearSOL();
			} else {
				$this->clearSOL();
			}
			break;

		case 'TagTk':
			if ( $this->inIndependentParse && !TokenUtils::isHTMLTag( $token ) ) {
				$tokenName = $token->getName();
				$this->handleT2529Hack( $tokenName );
				if ( $tokenName === 'listItem' && isset( $this->options['attrExpansion'] ) ) {
					// Convert list items back to bullet wikitext in attribute context
					$tokens = $this->convertTokenToString( $token );
				} elseif ( $tokenName === 'table' ) {
					$this->lastConvertedTableCellToken = null;
					$this->wikiTableNesting++;
				} elseif ( in_array( $tokenName, [ 'td', 'th', 'tr', 'caption' ], true ) ) {
					if ( $this->wikiTableNesting === 0 ) {
						// Table markup outside of any wikitext table:
						// turn it back into a string.
						if ( $tokenName === 'td' || $tokenName === 'th' ) {
							$this->lastConvertedTableCellToken = $token;
						}
						$tokens = $this->convertTokenToString( $token );
					} else {
						$this->lastConvertedTableCellToken = null;
					}
				}
			}
			$this->clearSOL();
			break;

		case 'EndTagTk':
			if ( $this->inIndependentParse && !TokenUtils::isHTMLTag( $token ) ) {
				$tokenName = $token->getName();
				if ( $this->wikiTableNesting > 0 ) {
					if ( $tokenName === 'table' ) {
						$this->lastConvertedTableCellToken = null;
						$this->wikiTableNesting--;
					}
				} elseif ( $tokenName === 'table' || $tokenName === 'caption' ) {
					// Convert this to "|}"
					$tokens = $this->convertTokenToString( $token );
				}
			}
			$this->clearSOL();
			break;

		default:
			break;
	}

	// Emit buffered newlines (and a transclusion meta-token, if any)
	if ( count( $this->tokenBuf ) > 0 ) {
		$tokens = array_merge( $this->tokenBuf, $tokens );
		$this->tokenBuf = [];
	}
	return new TokenHandlerResult( $tokens );
}
464 | } |