Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 181 |
|
0.00% |
0 / 10 |
CRAP | |
0.00% |
0 / 1 |
TokenStreamPatcher | |
0.00% |
0 / 181 |
|
0.00% |
0 / 10 |
6162 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
resetState | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
6 | |||
reset | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
onNewline | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
12 | |||
onEnd | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
clearSOL | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
reprocessTokens | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
6 | |||
convertTokenToString | |
0.00% |
0 / 22 |
|
0.00% |
0 / 1 |
240 | |||
onAny | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
onAnyInternal | |
0.00% |
0 / 113 |
|
0.00% |
0 / 1 |
2550 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html\TT; |
5 | |
6 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
7 | use Wikimedia\Parsoid\Tokens\CommentTk; |
8 | use Wikimedia\Parsoid\Tokens\EndTagTk; |
9 | use Wikimedia\Parsoid\Tokens\EOFTk; |
10 | use Wikimedia\Parsoid\Tokens\KV; |
11 | use Wikimedia\Parsoid\Tokens\NlTk; |
12 | use Wikimedia\Parsoid\Tokens\SelfclosingTagTk; |
13 | use Wikimedia\Parsoid\Tokens\TagTk; |
14 | use Wikimedia\Parsoid\Tokens\Token; |
15 | use Wikimedia\Parsoid\Utils\PHPUtils; |
16 | use Wikimedia\Parsoid\Utils\PipelineUtils; |
17 | use Wikimedia\Parsoid\Utils\TokenUtils; |
18 | use Wikimedia\Parsoid\Wt2Html\PegTokenizer; |
19 | use Wikimedia\Parsoid\Wt2Html\TokenHandlerPipeline; |
20 | |
21 | /** |
22 | * This class is an attempt to fixup the token stream to reparse strings |
23 | * as tokens that failed to parse in the tokenizer because of SOL or |
24 | * other constraints OR because tags were being constructed in pieces |
25 | * or whatever. |
26 | * |
27 | * This is a pure hack to improve compatibility with the core parser |
28 | * given that we don't have a preprocessor. This will be a grab-bag of
29 | * heuristics and tricks to handle different scenarios. |
30 | */ |
class TokenStreamPatcher extends TokenHandler {
	/** Tokenizer used to re-tokenize source fragments and strings. */
	private PegTokenizer $tokenizer;

	/**
	 * Source offset handed to reprocessTokens() for shifting the TSR of
	 * re-tokenized content. Null means "offset unknown".
	 *
	 * @var int|null
	 */
	private $srcOffset;

	/** Whether the handler currently considers itself at start-of-line. */
	private bool $sol;

	/**
	 * Buffered newline tokens, intervening whitespace strings and
	 * transclusion meta tokens that have not been emitted yet.
	 */
	private array $tokenBuf;

	/**
	 * Incremented for every non-HTML table start seen (or synthesized from
	 * "{|") and decremented for every matching non-HTML table end.
	 */
	private int $wikiTableNesting;

	/** True only for top-level & attribute value pipelines */
	private bool $inIndependentParse;

	/**
	 * NOTE(review): within this class the property is only ever assigned,
	 * never read. Confirm whether it is still needed.
	 *
	 * @var Token|null
	 */
	private $lastConvertedTableCellToken;

	/**
	 * Most recent mw:Transclusion meta token; onAny() clears it as soon as
	 * a different token has been processed.
	 *
	 * @var SelfclosingTagTk|null
	 */
	private $tplStartToken = null;

	/**
	 * @param TokenHandlerPipeline $manager
	 * @param array $options Handler options. The 'tsp' option is always
	 *   forced to true (the left operand of the array union wins).
	 */
	public function __construct( TokenHandlerPipeline $manager, array $options ) {
		$newOptions = [ 'tsp' => true ] + $options;
		parent::__construct( $manager, $newOptions );
		$this->tokenizer = new PegTokenizer( $this->env );
		$this->reset();
	}

	/**
	 * Resets any internal state for this token handler.
	 *
	 * @param array $parseOpts
	 */
	public function resetState( array $parseOpts ): void {
		parent::resetState( $parseOpts );
		$this->inIndependentParse = $this->atTopLevel || isset( $this->options['attrExpansion'] );
	}

	/**
	 * Put the per-stream state back to its initial values: offset 0,
	 * start-of-line, empty buffer, no open wikitext tables.
	 */
	private function reset(): void {
		$this->srcOffset = 0;
		$this->sol = true;
		$this->tokenBuf = [];
		$this->wikiTableNesting = 0;
		// This marker tries to track the most recent table-cell token (td/th)
		// that was converted to string. For those, we want to get rid
		// of their corresponding mw:TSRMarker meta tag.
		//
		// This marker is set when we convert a td/th token to string
		//
		// This marker is cleared in one of the following scenarios:
		// 1. When we clear a mw:TSRMarker corresponding to the token set earlier
		// 2. When we change table nesting
		// 3. When we hit a tr/td/th/caption token that wasn't converted to string
		$this->lastConvertedTableCellToken = null;
	}

	/**
	 * Emit a 'tsp' trace entry describing the current state and the token.
	 *
	 * The message is built inside a closure so that the JSON encoding of
	 * the token is only paid for if the tracer actually invokes it.
	 *
	 * @param string|Token $token
	 */
	private function traceTsp( $token ): void {
		$self = $this;
		$this->env->trace( 'tsp', $this->pipelineId,
			static function () use ( $self, $token ) {
				return "(indep=" . ( $self->inIndependentParse ? "yes" : "no " ) .
					";sol=" . ( $self->sol ? "yes" : "no " ) . ') ' .
					PHPUtils::jsonEncode( $token );
			}
		);
	}

	/**
	 * Wrap the given tokens in a mw:EmptyLine meta token.
	 *
	 * @param array<string|Token> $tokens Tokens stored on the meta token's data-parsoid
	 * @return SelfclosingTagTk
	 */
	private function makeEmptyLineMeta( array $tokens ): SelfclosingTagTk {
		$dp = new DataParsoid;
		$dp->tokens = $tokens;
		return new SelfclosingTagTk( 'meta', [ new KV( 'typeof', 'mw:EmptyLine' ) ], $dp );
	}

	/**
	 * Buffers the newline (nothing is emitted yet), records the offset at
	 * its end (if known) and enters start-of-line state.
	 *
	 * @inheritDoc
	 */
	public function onNewline( NlTk $token ): ?array {
		$this->traceTsp( $token );
		$this->srcOffset = $token->dataParsoid->tsr->end ?? null;
		$this->tokenBuf[] = $token;
		$this->sol = true;
		return [];
	}

	/**
	 * Processes the EOF token like any other token (which flushes the
	 * buffer) and then resets the per-stream state.
	 *
	 * @inheritDoc
	 */
	public function onEnd( EOFTk $token ): ?array {
		$res = $this->onAny( $token );
		$this->reset();
		return $res;
	}

	/**
	 * Clear start of line info
	 */
	private function clearSOL(): void {
		// clear tsr and sol flag
		$this->srcOffset = null;
		$this->sol = false;
	}

	/**
	 * Fully reprocess the output tokens from the tokenizer through
	 * all the other handlers in stage 2.
	 *
	 * @param int|null|false $srcOffset Offset forwarded to
	 *   TokenUtils::shiftTokenTSR. Callers in this class pass an int or
	 *   null; per the original note that function has b/c handling for null
	 *   (NOTE(review): confirm against TokenUtils).
	 * @param array $toks
	 * @param bool $popEOF Drop the last token of the result, which is
	 *   expected to be the EOFTk
	 * @return array<string|Token>
	 */
	private function reprocessTokens( $srcOffset, array $toks, bool $popEOF = false ): array {
		// Update tsr
		TokenUtils::shiftTokenTSR( $toks, $srcOffset );

		$toks = (array)PipelineUtils::processContentInPipeline(
			$this->env,
			$this->manager->getFrame(),
			$toks,
			[
				'pipelineType' => 'peg-tokens-to-expanded-tokens',
				'pipelineOpts' => [],
				'sol' => true,
				'toplevel' => $this->atTopLevel,
			]
		);

		if ( $popEOF ) {
			array_pop( $toks ); // pop EOFTk
		}
		return $toks;
	}

	/**
	 * Turn a token back into the wikitext it came from.
	 *
	 * With a non-empty TSR the original source substring is re-tokenized
	 * (not in SOL position) and reprocessed. Without one, a token that is
	 * auto-inserted at both ends becomes an empty string, and otherwise a
	 * fixed wikitext string is chosen from the token name. Tokens with no
	 * known mapping are returned unchanged.
	 *
	 * @param Token $token
	 * @return array<string|Token>
	 */
	private function convertTokenToString( Token $token ): array {
		$da = $token->dataParsoid;
		$tsr = $da->tsr ?? null;

		if ( $tsr && $tsr->end > $tsr->start ) {
			// > will only hold if these are valid numbers
			$str = $tsr->substr( $this->manager->getFrame()->getSrcText() );
			// sol === false ensures that the pipe will not be parsed as a <td>/listItem again
			$toks = $this->tokenizer->tokenizeSync( $str, [ 'sol' => false ] );
			return $this->reprocessTokens( $tsr->start, $toks, true );
		}

		if ( !empty( $da->autoInsertedStart ) && !empty( $da->autoInsertedEnd ) ) {
			return [ '' ];
		}

		switch ( $token->getName() ) {
			case 'td':
				return [ ( $da->stx ?? '' ) === 'row' ? '||' : '|' ];
			case 'th':
				return [ ( $da->stx ?? '' ) === 'row' ? '!!' : '!' ];
			case 'tr':
				return [ '|-' ];
			case 'caption':
				return [ $token instanceof TagTk ? '|+' : '' ];
			case 'table':
				return [ $token instanceof EndTagTk ? '|}' : $token ];
			case 'listItem':
				return [ implode( '', $token->getAttributeV( 'bullets' ) ) ];
		}

		return [ $token ];
	}

	/**
	 * @inheritDoc
	 */
	public function onAny( $token ): ?array {
		try {
			return $this->onAnyInternal( $token );
		} finally {
			// Ensure we always clean up tplStartToken even
			// in the presence of exceptions.
			if ( $this->tplStartToken !== $token ) {
				$this->tplStartToken = null;
			}
		}
	}

	/**
	 * Core of onAny(): patches up the token depending on its type and the
	 * current SOL / table-nesting state, and flushes the buffer unless the
	 * token itself was buffered.
	 *
	 * @param string|Token $token
	 * @return ?array<string|Token>
	 */
	public function onAnyInternal( $token ): ?array {
		$this->traceTsp( $token );

		$tokens = [ $token ];

		switch ( true ) {
			case is_string( $token ):
				// While we are buffering newlines to suppress them
				// in case we see a category, buffer all intervening
				// white-space as well.
				if ( count( $this->tokenBuf ) > 0 && preg_match( '/^\s*$/D', $token ) ) {
					$this->tokenBuf[] = $token;
					return [];
				}

				// This is only applicable where we use Parsoid's (broken) native preprocessor.
				// This supports scenarios like "{{1x|*bar}}". When "{{{1}}}" is tokenized
				// "*bar" isn't available and so won't become a list.
				// FIXME: {{1x|1===foo==}} will still be broken. So, this fix below is somewhat
				// independent of T2529 for our broken preprocessor but we are restricting the
				// fix to T2529.
				$T2529hack = false;
				if ( $this->env->nativeTemplateExpansionEnabled() &&
					$this->tplStartToken &&
					preg_match( '/^(?:{\\||[:;#*])/', $token )
				) {
					// Add a newline & force SOL
					$T2529hack = true;
					// The core preprocessor only inserts a newline here if
					// we weren't already at the start of a line, so only
					// synthesize one when we are not in SOL state.
					if ( !$this->sol ) {
						$this->tokenBuf[] = new NlTk( null );
						$this->sol = true;
					}
				}

				if ( $this->sol ) {
					// Attempt to match "{|" after a newline and convert
					// it to a table token.
					if ( $this->inIndependentParse && str_starts_with( $token, '{|' ) ) {
						// Reparse string with the 'table_start_tag' rule
						// and fully reprocess them.
						$retoks = $this->tokenizer->tokenizeAs( $token, 'table_start_tag', /* sol */true );
						if ( $retoks === false ) {
							// XXX: The string begins with table start syntax,
							// we really shouldn't be here. Anything else on the
							// line would get swallowed up as attributes.
							$this->env->log( 'error', 'Failed to tokenize table start tag.' );
							$this->clearSOL();
						} else {
							$tokens = $this->reprocessTokens( $this->srcOffset, $retoks );
							$this->wikiTableNesting++;
							$this->lastConvertedTableCellToken = null;
						}
					} elseif ( $this->inIndependentParse && $T2529hack ) { // {| has been handled above
						$retoks = $this->tokenizer->tokenizeAs( $token, 'list_item', /* sol */true );
						if ( $retoks === false ) {
							$this->env->log( 'error', 'Failed to tokenize list item.' );
							$this->clearSOL();
						} else {
							$tokens = $this->reprocessTokens( $this->srcOffset, $retoks );
						}
					} elseif ( preg_match( '/^\s*$/D', $token ) ) {
						// White-space doesn't change SOL state.
						// Advance srcOffset only when it is known: null means
						// "offset unknown" and `null += int` would silently turn
						// it into a bogus small offset that later shifts TSRs.
						if ( $this->srcOffset !== null ) {
							$this->srcOffset += strlen( $token );
						}
					} else {
						$this->clearSOL();
					}
				} else {
					$this->clearSOL();
				}
				break;

			case $token instanceof CommentTk:
				// Comments don't change SOL state
				// Update srcOffset
				$this->srcOffset = $token->dataParsoid->tsr->end ?? null;
				break;

			case $token instanceof SelfclosingTagTk:
				if ( $token->getName() === 'meta' && ( $token->dataParsoid->stx ?? '' ) !== 'html' ) {
					$isTransclusion = TokenUtils::hasTypeOf( $token, 'mw:Transclusion' );
					if ( $isTransclusion ) {
						$this->tplStartToken = $token;
					}
					$this->srcOffset = $token->dataParsoid->tsr->end ?? null;
					if ( $isTransclusion && count( $this->tokenBuf ) > 0 ) {
						// If we have buffered newlines, we might very well encounter
						// a category link, so continue buffering.
						$this->tokenBuf[] = $token;
						return [];
					}
				} elseif ( TokenUtils::isSolTransparentLinkTag( $token ) ) {
					// Replace buffered newline & whitespace tokens with mw:EmptyLine
					// meta-tokens. This tunnels them through the rest of the transformations
					// without affecting them. During HTML building, they are expanded
					// back to newlines / whitespace.
					$n = count( $this->tokenBuf );
					if ( $n > 0 ) {
						// Find the first buffered meta token (if any); everything
						// before it and everything after it is wrapped separately.
						$i = 0;
						while ( $i < $n &&
							!( $this->tokenBuf[$i] instanceof SelfclosingTagTk )
						) {
							$i++;
						}

						$toks = [
							$this->makeEmptyLineMeta( array_slice( $this->tokenBuf, 0, $i ) )
						];
						if ( $i < $n ) {
							$toks[] = $this->tokenBuf[$i];
							if ( $i + 1 < $n ) {
								$toks[] = $this->makeEmptyLineMeta(
									array_slice( $this->tokenBuf, $i + 1 )
								);
							}
						}
						$tokens = array_merge( $toks, $tokens );
						$this->tokenBuf = [];
					}
					$this->clearSOL();
				} else {
					$this->clearSOL();
				}
				break;

			case $token instanceof TagTk:
				if ( $this->inIndependentParse && !TokenUtils::isHTMLTag( $token ) ) {
					$tokenName = $token->getName();
					if ( $tokenName === 'listItem' && isset( $this->options['attrExpansion'] ) ) {
						// Convert list items back to bullet wikitext in attribute context
						$tokens = $this->convertTokenToString( $token );
					} elseif ( $tokenName === 'table' ) {
						$this->lastConvertedTableCellToken = null;
						$this->wikiTableNesting++;
					} elseif ( in_array( $tokenName, [ 'td', 'th', 'tr', 'caption' ], true ) ) {
						if ( $this->wikiTableNesting === 0 ) {
							// Table syntax outside of any wikitext table:
							// turn it back into plain wikitext.
							if ( $tokenName === 'td' || $tokenName === 'th' ) {
								$this->lastConvertedTableCellToken = $token;
							}
							$tokens = $this->convertTokenToString( $token );
						} else {
							$this->lastConvertedTableCellToken = null;
						}
					}
				}
				$this->clearSOL();
				break;

			case $token instanceof EndTagTk:
				if ( $this->inIndependentParse && !TokenUtils::isHTMLTag( $token ) ) {
					if ( $this->wikiTableNesting > 0 ) {
						if ( $token->getName() === 'table' ) {
							$this->lastConvertedTableCellToken = null;
							$this->wikiTableNesting--;
						}
					} elseif ( $token->getName() === 'table' || $token->getName() === 'caption' ) {
						// Convert this to "|}"
						$tokens = $this->convertTokenToString( $token );
					}
				}
				$this->clearSOL();
				break;

			default:
				break;
		}

		// Emit buffered newlines (and a transclusion meta-token, if any)
		if ( count( $this->tokenBuf ) > 0 ) {
			$tokens = array_merge( $this->tokenBuf, $tokens );
			$this->tokenBuf = [];
		}

		return $tokens;
	}
}