Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 191 |
|
0.00% |
0 / 11 |
CRAP | |
0.00% |
0 / 1 |
TokenStreamPatcher | |
0.00% |
0 / 191 |
|
0.00% |
0 / 11 |
7482 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
resetState | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
reset | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
onNewline | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
30 | |||
onEnd | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
clearSOL | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
reprocessTokens | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
6 | |||
convertTokenToString | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
182 | |||
onAny | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
handleT2529Hack | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
56 | |||
onAnyInternal | |
0.00% |
0 / 116 |
|
0.00% |
0 / 1 |
2652 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html\TT; |
5 | |
6 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
7 | use Wikimedia\Parsoid\Tokens\EndTagTk; |
8 | use Wikimedia\Parsoid\Tokens\EOFTk; |
9 | use Wikimedia\Parsoid\Tokens\KV; |
10 | use Wikimedia\Parsoid\Tokens\NlTk; |
11 | use Wikimedia\Parsoid\Tokens\SelfclosingTagTk; |
12 | use Wikimedia\Parsoid\Tokens\TagTk; |
13 | use Wikimedia\Parsoid\Tokens\Token; |
14 | use Wikimedia\Parsoid\Utils\PHPUtils; |
15 | use Wikimedia\Parsoid\Utils\TokenUtils; |
16 | use Wikimedia\Parsoid\Wt2Html\PegTokenizer; |
17 | use Wikimedia\Parsoid\Wt2Html\TokenTransformManager; |
18 | |
/**
 * This class is an attempt to fix up the token stream by reparsing strings
 * as tokens when they failed to parse in the tokenizer because of SOL
 * (start-of-line) or other constraints, OR because tags were being
 * constructed in pieces.
 *
 * This is a pure hack to improve compatibility with the core parser
 * given that we don't have a preprocessor. This will be a grab-bag of
 * heuristics and tricks to handle different scenarios.
 */
class TokenStreamPatcher extends TokenHandler {
	private PegTokenizer $tokenizer;

	/**
	 * Source offset used to shift the TSR of tokens that get re-tokenized
	 * while in start-of-line state. Null whenever the offset is unknown
	 * (see ::clearSOL()).
	 *
	 * @var int|null
	 */
	private $srcOffset;

	/** True while only newlines, whitespace, comments and metas were seen on the current line */
	private bool $sol;

	/** Newlines (plus intervening whitespace / transclusion metas) held back from the output */
	private array $tokenBuf;
	private int $wikiTableNesting;
	/** True only for top-level & attribute value pipelines */
	private bool $inIndependentParse;

	/** @var Token|null */
	private $lastConvertedTableCellToken;

	/** @var SelfclosingTagTk|null */
	private $tplStartToken = null;

	/** @var NlTk|null */
	private $discardableNlTk = null;

	/**
	 * @param TokenTransformManager $manager
	 * @param array $options Handler options; 'tsp' is always forced to true.
	 */
	public function __construct( TokenTransformManager $manager, array $options ) {
		$newOptions = [ 'tsp' => true ] + $options;
		parent::__construct( $manager, $newOptions );
		$this->tokenizer = new PegTokenizer( $this->env );
		$this->reset();
	}

	/**
	 * Resets any internal state for this token handler.
	 *
	 * @param array $parseOpts
	 */
	public function resetState( array $parseOpts ): void {
		parent::resetState( $parseOpts );
		$this->inIndependentParse = $this->atTopLevel || isset( $this->options['attrExpansion'] );
	}

	/**
	 * Restore the per-document state: start of line at offset 0, nothing
	 * buffered, not inside any wikitext table.
	 */
	private function reset() {
		$this->srcOffset = 0;
		$this->sol = true;
		$this->tokenBuf = [];
		$this->wikiTableNesting = 0;
		// This marker tries to track the most recent table-cell token (td/th)
		// that was converted to string. For those, we want to get rid
		// of their corresponding mw:TSRMarker meta tag.
		//
		// This marker is set when we convert a td/th token to string
		//
		// This marker is cleared in one of the following scenarios:
		// 1. When we clear a mw:TSRMarker corresponding to the token set earlier
		// 2. When we change table nesting
		// 3. When we hit a tr/td/th/caption token that wasn't converted to string
		$this->lastConvertedTableCellToken = null;
	}

	/**
	 * Emit a 'trace/tsp' log entry describing the handler state and the
	 * token about to be processed. The message is built lazily inside a
	 * closure handed to the logger.
	 *
	 * @param Token|string $token
	 */
	private function traceTspToken( $token ): void {
		$self = $this;
		$this->env->log( 'trace/tsp', $this->pipelineId,
			static function () use ( $self, $token ) {
				return "(indep=" . ( $self->inIndependentParse ? "yes" : "no " ) .
					";sol=" . ( $self->sol ? "yes" : "no " ) . ") " .
					PHPUtils::jsonEncode( $token );
			}
		);
	}

	/**
	 * Buffers the newline (it is emitted later, possibly wrapped in an
	 * mw:EmptyLine meta) and enters start-of-line state.
	 *
	 * @inheritDoc
	 */
	public function onNewline( NlTk $token ): ?TokenHandlerResult {
		$this->traceTspToken( $token );
		$this->srcOffset = $token->dataParsoid->tsr->end ?? null;
		if ( $this->sol && $this->tplStartToken ) {
			// When using core preprocessor, start-of-line start is forced by
			// inserting a newline in certain cases (the "T2529 hack"). In the
			// legacy parser, the T2529 hack is never applied if the template was
			// already at the start of the line (the `!$piece['lineStart']`
			// check in Parser::braceSubstitution where T2529 is handled), but
			// that context (`$this->sol`) isn't passed through when Parsoid
			// invokes the core preprocessor. Thus, when $this->sol is true,
			// prepare to (if the following tokens warrant it) remove an unnecessary
			// T2529 newline added by the legacy preprocessor.
			$this->discardableNlTk = $token;
		}
		$this->tokenBuf[] = $token;
		$this->sol = true;
		return new TokenHandlerResult( [] );
	}

	/**
	 * Processes the EOF token like any other token (flushing the buffer)
	 * and then resets the handler for the next document.
	 *
	 * @inheritDoc
	 */
	public function onEnd( EOFTk $token ): ?TokenHandlerResult {
		$res = $this->onAny( $token );
		$this->reset();
		return $res;
	}

	/**
	 * Clear start of line info
	 */
	private function clearSOL() {
		// clear tsr and sol flag
		$this->srcOffset = null;
		$this->sol = false;
	}

	/**
	 * Fully reprocess the output tokens from the tokenizer through
	 * all the other handlers in stage 2.
	 *
	 * @param int|null $srcOffset Offset to shift the tokens' TSR by. May be
	 *   null when the current source offset is unknown (e.g. after
	 *   ::clearSOL()); it is forwarded as-is to TokenUtils::shiftTokenTSR().
	 *   NOTE(review): confirm shiftTokenTSR() accepts null (it is expected to
	 *   drop TSR info in that case).
	 * @param array $toks
	 * @param bool $popEOF Whether to drop the trailing token (the EOFTk)
	 *   from the reprocessed output.
	 * @return array
	 */
	private function reprocessTokens( ?int $srcOffset, array $toks, bool $popEOF = false ): array {
		// Update tsr
		TokenUtils::shiftTokenTSR( $toks, $srcOffset );
		$pipe = $this->env->getPipelineFactory()->getPipeline( "tokens/x-mediawiki" );
		$pipe->init( [
			'frame' => $this->manager->getFrame(),
			'toplevel' => $this->atTopLevel,
			// The tokens should be reprocessed in the context of the original frame's source
			'srcText' => $this->manager->getFrame()->getSrcText()
		] );
		$toks = (array)$pipe->parse( $toks, [] );
		if ( $popEOF ) {
			array_pop( $toks ); // pop EOFTk
		}
		return $toks;
	}

	/**
	 * Turn a wikitext-syntax token back into the tokens / strings it would
	 * have produced had it not been recognized as markup.
	 *
	 * @param Token $token
	 * @return array Replacement tokens; [ $token ] if no conversion applies.
	 */
	private function convertTokenToString( Token $token ): array {
		$da = $token->dataParsoid;
		$tsr = $da->tsr ?? null;

		if ( $tsr && $tsr->end > $tsr->start ) {
			// > will only hold if these are valid numbers
			$str = $tsr->substr( $this->manager->getFrame()->getSrcText() );
			// sol === false ensures that the pipe will not be parsed as a <td>/listItem again
			$toks = $this->tokenizer->tokenizeSync( $str, [ 'sol' => false ] );
			return $this->reprocessTokens( $tsr->start, $toks, true );
		}

		if ( !empty( $da->autoInsertedStart ) && !empty( $da->autoInsertedEnd ) ) {
			return [ '' ];
		}

		// No usable source range: fall back to canonical wikitext.
		// SSS FIXME: What about "!!" and "||"??
		switch ( $token->getName() ) {
			case 'td':
				return [ '|' ];
			case 'th':
				return [ '!' ];
			case 'tr':
				return [ '|-' ];
			case 'caption':
				return [ $token instanceof TagTk ? '|+' : '' ];
			case 'table':
				return [ $token instanceof EndTagTk ? '|}' : $token ];
			case 'listItem':
				return [ implode( '', $token->getAttributeV( 'bullets' ) ) ];
		}

		// No conversion if we get here
		return [ $token ];
	}

	/**
	 * @inheritDoc
	 */
	public function onAny( $token ): ?TokenHandlerResult {
		try {
			return $this->onAnyInternal( $token );
		} finally {
			// Ensure we always clean up discardableNlTk and tplStartToken even
			// in the presence of exceptions.
			$this->discardableNlTk = null;
			// Keep tplStartToken only for the call that just recorded it, so
			// that it is still set when the *next* token is processed.
			if ( $this->tplStartToken !== $token ) {
				$this->tplStartToken = null;
			}
		}
	}

	/**
	 * The legacy parser's "T2529 hack" attempts to ensure templates are
	 * always evaluated in start-of-line context by prepending a newline
	 * if necessary. However, it is inconsistent: in particular it
	 * only treats {| : ; # * as SOL-sensitive tokens, neglecting ==
	 * (headings) and ! | |} (in table context).
	 *
	 * If we're using the core preprocessor for template expansion:
	 * - The core preprocessor as invoked by Parsoid will always insert the
	 *   newline in the "T2529 cases" (even though it's not necessary; Parsoid
	 *   is already in SOL mode) *HOWEVER*
	 * - As described in ::onNewline() above, the newline insertion is
	 *   /supposed/ to be suppressed if the template was *already*
	 *   at the start of the line. So we need to strip the unnecessarily
	 *   added NlTk to avoid "extra" whitespace in Parsoid's expansion.
	 *   Ex: "{{my-tpl}}" in sol-context which will get expanded to "\n*foo"
	 *   but the "\n" wasn't necessary
	 *
	 * If we're in native preprocessor mode:
	 * - If we are in SOL state, we don't need to add a newline.
	 * - If we are not in SOL state, we need to insert a newline in 'T2529' cases.
	 *   Ex: "{{my-tpl}}" in sol-context which expands to "*foo" but in
	 *   non-sol context expands to "\n*foo"
	 *
	 * @param string $tokenName
	 */
	private function handleT2529Hack( string $tokenName ): void {
		// Only table and list item tokens are handled here; they correspond
		// to the {| : ; # * syntax that core's T2529 hack looks for.
		if ( $tokenName === 'table' || $tokenName === 'listItem' ) {
			// We're in a context when the core preprocessor would apply
			// the "T2529 hack" to ensure start-of-line context.
			if ( $this->discardableNlTk ) {
				// We're using core preprocessor and were already at
				// the start of the line, so the core preprocessor wouldn't
				// actually have inserted a newline here. Swallow up ours.
				// (discardableNlTk is cleared after every ::onAny() call, so
				// when it is set the NlTk is the last buffered token.)
				array_pop( $this->tokenBuf );
			} elseif ( !$this->sol &&
				$this->tplStartToken &&
				$this->env->nativeTemplateExpansionEnabled()
			) {
				// Native preprocessor; add a newline in "T2529 cases"
				// for correct whitespace. (Remember that this only happens
				// if we weren't already at the start of the line.)
				// Add a newline & force SOL
				$this->tokenBuf[] = new NlTk( null );
				$this->sol = true;
			}
		}
	}

	/**
	 * Build an mw:EmptyLine meta token that tunnels the given buffered
	 * newline / whitespace tokens through the remaining transformations.
	 *
	 * @param array $bufferedTokens
	 * @return SelfclosingTagTk
	 */
	private static function newEmptyLineMeta( array $bufferedTokens ): SelfclosingTagTk {
		$dp = new DataParsoid;
		$dp->tokens = $bufferedTokens;
		return new SelfclosingTagTk( 'meta', [ new KV( 'typeof', 'mw:EmptyLine' ) ], $dp );
	}

	/**
	 * Handle a plain string token.
	 *
	 * @param string $token
	 * @return array|null Tokens to emit, or null if the string was buffered
	 *   and nothing must be emitted (or flushed) yet.
	 */
	private function patchString( string $token ): ?array {
		// While we are buffering newlines to suppress them
		// in case we see a category, buffer all intervening
		// white-space as well.
		if ( count( $this->tokenBuf ) > 0 && preg_match( '/^\s*$/D', $token ) ) {
			$this->tokenBuf[] = $token;
			return null;
		}

		// This is only applicable where we use Parsoid's (broken) native preprocessor.
		// This supports scenarios like "{{1x|*bar}}". When "{{{1}}}" is tokenized
		// "*bar" isn't available and so won't become a list.
		// FIXME: {{1x|1===foo==}} will still be broken. So, this fix below is somewhat
		// independent of T2529 for our broken preprocessor but we are restricting the
		// fix to T2529.
		$T2529hack = false;
		if ( $this->env->nativeTemplateExpansionEnabled() &&
			$this->tplStartToken &&
			preg_match( '/^(?:{\\||[:;#*])/', $token )
		) {
			$T2529hack = true;
			// Newline insertion in the core preprocessor only occurs
			// if we weren't already at the start of the line (see
			// discussion in ::onNewline() above).
			if ( !$this->sol ) {
				// Add a newline & force SOL. Note that srcOffset is
				// null here since we were not in SOL state.
				$this->tokenBuf[] = new NlTk( null );
				$this->sol = true;
			}
		}

		if ( !$this->sol ) {
			$this->clearSOL();
			return [ $token ];
		}

		if ( $this->inIndependentParse && str_starts_with( $token, '{|' ) ) {
			// Attempt to match "{|" after a newline and convert
			// it to a table token: reparse the string with the
			// 'table_start_tag' rule and fully reprocess the result.
			$retoks = $this->tokenizer->tokenizeAs( $token, 'table_start_tag', /* sol */true );
			if ( $retoks === false ) {
				// XXX: The string begins with table start syntax,
				// we really shouldn't be here. Anything else on the
				// line would get swallowed up as attributes.
				$this->env->log( 'error', 'Failed to tokenize table start tag.' );
				$this->clearSOL();
				return [ $token ];
			}
			$tokens = $this->reprocessTokens( $this->srcOffset, $retoks );
			$this->wikiTableNesting++;
			$this->lastConvertedTableCellToken = null;
			return $tokens;
		}

		if ( $this->inIndependentParse && $T2529hack ) { // {| has been handled above
			$retoks = $this->tokenizer->tokenizeAs( $token, 'list_item', /* sol */true );
			if ( $retoks === false ) {
				$this->env->log( 'error', 'Failed to tokenize list item.' );
				$this->clearSOL();
				return [ $token ];
			}
			return $this->reprocessTokens( $this->srcOffset, $retoks );
		}

		if ( preg_match( '/^\s*$/D', $token ) ) {
			// White-space doesn't change SOL state.
			// Update srcOffset, but only if it is known: adding to a null
			// offset would silently fabricate a bogus position.
			if ( $this->srcOffset !== null ) {
				$this->srcOffset += strlen( $token );
			}
			return [ $token ];
		}

		$this->clearSOL();
		return [ $token ];
	}

	/**
	 * Handle a self-closing tag token (metas and category links are the
	 * interesting ones).
	 *
	 * @param Token $token
	 * @return array|null Tokens to emit, or null if the token was buffered
	 *   and nothing must be emitted (or flushed) yet.
	 */
	private function patchSelfclosingTag( Token $token ): ?array {
		if ( $token->getName() === 'meta' && ( $token->dataParsoid->stx ?? '' ) !== 'html' ) {
			$isTransclusion = TokenUtils::hasTypeOf( $token, 'mw:Transclusion' );
			if ( $isTransclusion ) {
				$this->tplStartToken = $token;
			}
			$this->srcOffset = $token->dataParsoid->tsr->end ?? null;
			if ( $isTransclusion && count( $this->tokenBuf ) > 0 ) {
				// If we have buffered newlines, we might very well encounter
				// a category link, so continue buffering.
				$this->tokenBuf[] = $token;
				return null;
			}
			return [ $token ];
		}

		if ( $token->getName() === 'link' &&
			$token->getAttributeV( 'rel' ) === 'mw:PageProp/Category'
		) {
			$tokens = [ $token ];
			// Replace buffered newline & whitespace tokens with mw:EmptyLine
			// meta-tokens. This tunnels them through the rest of the transformations
			// without affecting them. During HTML building, they are expanded
			// back to newlines / whitespace.
			$n = count( $this->tokenBuf );
			if ( $n > 0 ) {
				// Find the first buffered tag (a transclusion meta), if any.
				$i = 0;
				while ( $i < $n &&
					!( $this->tokenBuf[$i] instanceof SelfclosingTagTk )
				) {
					$i++;
				}

				$toks = [ self::newEmptyLineMeta( array_slice( $this->tokenBuf, 0, $i ) ) ];
				if ( $i < $n ) {
					// Keep the buffered tag itself and wrap whatever follows it.
					$toks[] = $this->tokenBuf[$i];
					if ( $i + 1 < $n ) {
						$toks[] = self::newEmptyLineMeta( array_slice( $this->tokenBuf, $i + 1 ) );
					}
				}
				$tokens = array_merge( $toks, $tokens );
				$this->tokenBuf = [];
			}
			$this->clearSOL();
			return $tokens;
		}

		$this->clearSOL();
		return [ $token ];
	}

	/**
	 * Handle a start tag token: track wikitext table nesting and convert
	 * stray table / list markup back to text where it cannot apply.
	 *
	 * @param Token $token
	 * @return array Tokens to emit.
	 */
	private function patchStartTag( Token $token ): array {
		$tokens = [ $token ];
		if ( $this->inIndependentParse && !TokenUtils::isHTMLTag( $token ) ) {
			$tokenName = $token->getName();
			$this->handleT2529Hack( $tokenName );
			if ( $tokenName === 'listItem' && isset( $this->options['attrExpansion'] ) ) {
				// Convert list items back to bullet wikitext in attribute context
				$tokens = $this->convertTokenToString( $token );
			} elseif ( $tokenName === 'table' ) {
				$this->lastConvertedTableCellToken = null;
				$this->wikiTableNesting++;
			} elseif ( in_array( $tokenName, [ 'td', 'th', 'tr', 'caption' ], true ) ) {
				if ( $this->wikiTableNesting === 0 ) {
					// Table markup outside of any wikitext table: treat as text.
					if ( $tokenName === 'td' || $tokenName === 'th' ) {
						$this->lastConvertedTableCellToken = $token;
					}
					$tokens = $this->convertTokenToString( $token );
				} else {
					$this->lastConvertedTableCellToken = null;
				}
			}
		}
		$this->clearSOL();
		return $tokens;
	}

	/**
	 * Handle an end tag token: track wikitext table nesting and convert
	 * unmatched table / caption ends back to text.
	 *
	 * @param Token $token
	 * @return array Tokens to emit.
	 */
	private function patchEndTag( Token $token ): array {
		$tokens = [ $token ];
		if ( $this->inIndependentParse && !TokenUtils::isHTMLTag( $token ) ) {
			if ( $this->wikiTableNesting > 0 ) {
				if ( $token->getName() === 'table' ) {
					$this->lastConvertedTableCellToken = null;
					$this->wikiTableNesting--;
				}
			} elseif ( $token->getName() === 'table' || $token->getName() === 'caption' ) {
				// Convert this to "|}"
				$tokens = $this->convertTokenToString( $token );
			}
		}
		$this->clearSOL();
		return $tokens;
	}

	/**
	 * Dispatch on the token type, then emit the resulting tokens preceded
	 * by anything that was buffered. Callers should go through ::onAny(),
	 * which additionally cleans up per-token state.
	 *
	 * @param mixed $token
	 * @return ?TokenHandlerResult
	 */
	public function onAnyInternal( $token ): ?TokenHandlerResult {
		$this->traceTspToken( $token );

		switch ( TokenUtils::getTokenType( $token ) ) {
			case 'string':
				$tokens = $this->patchString( $token );
				break;

			case 'CommentTk':
				// Comments don't change SOL state
				// Update srcOffset
				$this->srcOffset = $token->dataParsoid->tsr->end ?? null;
				$tokens = [ $token ];
				break;

			case 'SelfclosingTagTk':
				$tokens = $this->patchSelfclosingTag( $token );
				break;

			case 'TagTk':
				$tokens = $this->patchStartTag( $token );
				break;

			case 'EndTagTk':
				$tokens = $this->patchEndTag( $token );
				break;

			default:
				$tokens = [ $token ];
				break;
		}

		if ( $tokens === null ) {
			// The token went into the buffer; keep buffering.
			return new TokenHandlerResult( [] );
		}

		// Emit buffered newlines (and a transclusion meta-token, if any)
		if ( count( $this->tokenBuf ) > 0 ) {
			$tokens = array_merge( $this->tokenBuf, $tokens );
			$this->tokenBuf = [];
		}
		return new TokenHandlerResult( $tokens );
	}
}