Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
41.26% |
92 / 223 |
|
16.67% |
2 / 12 |
CRAP | |
0.00% |
0 / 1 |
TreeBuilderStage | |
41.26% |
92 / 223 |
|
16.67% |
2 / 12 |
1694.76 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
resetState | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
processChunk | |
56.52% |
13 / 23 |
|
0.00% |
0 / 1 |
18.22 | |||
finalizeDOM | |
58.33% |
7 / 12 |
|
0.00% |
0 / 1 |
5.16 | |||
kvArrToAttr | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
2.06 | |||
stashDataAttribs | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
2.01 | |||
processToken | |
43.59% |
51 / 117 |
|
0.00% |
0 / 1 |
358.65 | |||
handleDeletedStartTag | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
182 | |||
insertPlaceholderMeta | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
56 | |||
process | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
processChunkily | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
hasAfe | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
3.14 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | // Suppress UnusedPluginSuppression because |
4 | // Phan on PHP 7.4 and PHP 8.1 need different suppressions |
5 | // @phan-file-suppress UnusedPluginSuppression,UnusedPluginFileSuppression |
6 | |
7 | /** |
8 | * Front-end/Wrapper for a particular tree builder, in this case the |
9 | * parser/tree builder from RemexHtml. Feed it tokens and it will build |
10 | * you a DOM tree and emit an event. |
11 | */ |
12 | |
13 | namespace Wikimedia\Parsoid\Wt2Html\TreeBuilder; |
14 | |
15 | use Generator; |
16 | use Wikimedia\Parsoid\Config\Env; |
17 | use Wikimedia\Parsoid\DOM\Node; |
18 | use Wikimedia\Parsoid\NodeData\DataMw; |
19 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
20 | use Wikimedia\Parsoid\NodeData\NodeData; |
21 | use Wikimedia\Parsoid\NodeData\TempData; |
22 | use Wikimedia\Parsoid\Tokens\CommentTk; |
23 | use Wikimedia\Parsoid\Tokens\EndTagTk; |
24 | use Wikimedia\Parsoid\Tokens\EOFTk; |
25 | use Wikimedia\Parsoid\Tokens\NlTk; |
26 | use Wikimedia\Parsoid\Tokens\SelfclosingTagTk; |
27 | use Wikimedia\Parsoid\Tokens\TagTk; |
28 | use Wikimedia\Parsoid\Tokens\Token; |
29 | use Wikimedia\Parsoid\Utils\DOMCompat; |
30 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
31 | use Wikimedia\Parsoid\Utils\DOMUtils; |
32 | use Wikimedia\Parsoid\Utils\PHPUtils; |
33 | use Wikimedia\Parsoid\Utils\TokenUtils; |
34 | use Wikimedia\Parsoid\Utils\Utils; |
35 | use Wikimedia\Parsoid\Utils\WTUtils; |
36 | use Wikimedia\Parsoid\Wt2Html\PipelineStage; |
37 | use Wikimedia\RemexHtml\TreeBuilder\Marker; |
38 | |
39 | class TreeBuilderStage extends PipelineStage { |
40 | /** @var int */ |
41 | private $tagId; |
42 | |
43 | /** @var bool */ |
44 | private $inTransclusion; |
45 | |
46 | /** @var int */ |
47 | private $tableDepth; |
48 | |
49 | /** @var RemexPipeline */ |
50 | private $remexPipeline; |
51 | |
52 | /** @var string|Token|null */ |
53 | private $lastToken; |
54 | |
55 | /** @var string */ |
56 | private $textContentBuffer = ''; |
57 | |
/**
 * Build the stage and immediately put it into its initial state.
 *
 * @param Env $env
 * @param array $options Accepted for signature compatibility; not read here
 * @param string $stageId Accepted for signature compatibility; not read here
 * @param ?PipelineStage $prevStage
 */
public function __construct(
	Env $env,
	array $options = [],
	string $stageId = "",
	?PipelineStage $prevStage = null
) {
	parent::__construct( $env, $prevStage );

	// Initialise all per-run state; this is also where the Remex
	// pipeline gets fetched.
	$this->resetState( [] );
}
67 | |
/**
 * @inheritDoc
 */
public function resetState( array $options ): void {
	parent::resetState( $options );

	// Next id to hand out to a start / self-closing tag
	$this->tagId = 1;
	$this->inTransclusion = false;

	/* --------------------------------------------------------------------
	 * Rough count of currently open <table> tags.
	 *
	 * Fostered-content detection only needs one guarantee: while at least
	 * one <table> is still unclosed, the counter is positive.
	 *
	 * That holds as long as the counter is never allowed to drop below
	 * zero, no matter how many stray </table> tags show up: starting from
	 * a value >= 0, every <table> seen brings it to >= 1.
	 * -------------------------------------------------------------------- */
	$this->tableDepth = 0;

	// Fetched after the parent reset, because it is keyed on $this->toFragment
	$this->remexPipeline = $this->env->fetchRemexPipeline( $this->toFragment );

	// Nothing buffered and no token seen yet
	$this->textContentBuffer = '';
	$this->lastToken = null;
}
97 | |
/**
 * Feed a chunk of tokens, one at a time, to the HTML5 tree builder.
 * Nothing is returned; the tree is retrieved later via finalizeDOM().
 *
 * @param array $tokens Array of tokens to process
 */
public function processChunk( array $tokens ): void {
	$profile = null;
	$startTime = null;
	if ( $this->env->profiling() ) {
		$profile = $this->env->getCurrentProfile();
		$startTime = microtime( true );
	}

	$count = count( $tokens );
	for ( $pos = 0; $pos < $count; ) {
		$token = $tokens[$pos];

		// Count the newline tokens that directly follow a paragraph end
		// tag, but only while there are active formatting elements.
		$newlines = 0;
		if ( $token instanceof EndTagTk && $token->getName() === 'p' && $this->hasAfe() ) {
			while (
				$pos + 1 + $newlines < $count &&
				$tokens[$pos + 1 + $newlines] instanceof NlTk
			) {
				$newlines++;
			}
		}

		if ( $newlines === 2 ) {
			// Exactly two newlines: they are handed to the tree builder
			// ahead of the paragraph end tag instead of after it (T368720)
			$this->processToken( $tokens[$pos + 1] );
			$this->processToken( $tokens[$pos + 2] );
			$this->processToken( $token );
			$pos += 3;
		} else {
			$this->processToken( $token );
			$pos++;
		}
	}

	if ( $profile ) {
		$elapsedMs = 1000 * ( microtime( true ) - $startTime );
		$profile->bumpTimeUse( 'HTML5 TreeBuilder', $elapsedMs, 'HTML5' );
	}
}
140 | |
/**
 * Return the tree that has been built so far.
 *
 * In fragment mode the children of the Remex document's body are moved
 * into a new DocumentFragment owned by the top-level doc; otherwise the
 * body of the Remex document itself is returned.
 *
 * @return Node
 */
public function finalizeDOM(): Node {
	// If tokens were processed but the last one was not the EOFTk, the
	// EOFTk got lost somewhere upstream: flag the page.
	$last = $this->lastToken;
	if ( $last !== null && !( $last instanceof EOFTk ) ) {
		$this->env->log(
			'error', 'EOFTk was lost in page',
			$this->env->getContextTitle()->getPrefixedText()
		);
	}

	if ( !$this->toFragment ) {
		return DOMCompat::getBody( $this->remexPipeline->doc );
	}

	// This is similar to DOMCompat::setInnerHTML() in that we can
	// consider it equivalent to the fragment parsing algorithm,
	// https://html.spec.whatwg.org/#html-fragment-parsing-algorithm
	$fragment = $this->env->getTopLevelDoc()->createDocumentFragment();
	$body = DOMCompat::getBody( $this->remexPipeline->doc );
	DOMUtils::migrateChildrenBetweenDocs( $body, $fragment );
	return $fragment;
}
165 | |
/**
 * Flatten a list of key/value objects into a `key => value` map.
 * When a key occurs more than once, the last occurrence wins.
 *
 * @param array $kvArr Items exposing `k` and `v` properties
 * @return array
 */
private function kvArrToAttr( array $kvArr ): array {
	$map = [];
	foreach ( $kvArr as $pair ) {
		$map[$pair->k] = $pair->v;
	}
	return $map;
}
174 | |
/**
 * Bundle the data-parsoid / data-mw objects into a NodeData, stash it in
 * the top-level doc, and record the resulting id in the attribute array.
 *
 * Keep this in sync with `DOMDataUtils.setNodeData()`
 *
 * @param array $attribs
 * @param DataParsoid $dataParsoid
 * @param ?DataMw $dataMw Only attached to the NodeData when non-null
 * @return array $attribs plus the DOMDataUtils::DATA_OBJECT_ATTR_NAME entry
 */
private function stashDataAttribs( array $attribs, DataParsoid $dataParsoid, ?DataMw $dataMw ): array {
	$nodeData = new NodeData;
	$nodeData->parsoid = $dataParsoid;
	if ( $dataMw !== null ) {
		$nodeData->mw = $dataMw;
	}

	// Store in the top level doc since we'll be importing the nodes after treebuilding
	$topDoc = $this->env->getTopLevelDoc();
	$id = DOMDataUtils::stashObjectInDoc( $topDoc, $nodeData );

	$attribs[DOMDataUtils::DATA_OBJECT_ATTR_NAME] = (string)$id;
	return $attribs;
}
193 | |
/**
 * Adapt the token format to internal HTML tree builder format, call the actual
 * html tree builder by emitting the token.
 *
 * String tokens and NlTk are only appended to the text buffer; the buffer
 * is flushed to the tree builder as soon as any other token arrives. All
 * other token kinds are dispatched to a dedicated private helper.
 *
 * @param Token|string $token
 */
public function processToken( $token ): void {
	if ( $this->pipelineId === 0 ) {
		if ( $this->env->bumpWt2HtmlResourceUse( 'token' ) === false ) {
			// `false` indicates that this bump pushed us over the threshold
			// We don't want to log every token above that, which would be `null`
			$this->env->log( 'warn', "wt2html: token limit exceeded" );
		}
	}

	$attribs = !is_string( $token ) && $token->attribs !== null ? $this->kvArrToAttr( $token->attribs ) : [];
	$dataParsoid = !is_string( $token ) ? $token->dataParsoid : new DataParsoid;
	$dataMw = $token->dataMw ?? null;
	$tmp = $dataParsoid->getTemp();

	if ( $this->inTransclusion ) {
		$tmp->setFlag( TempData::IN_TRANSCLUSION );
	}

	// Assign tagId to open/self-closing tags
	if ( $token instanceof TagTk || $token instanceof SelfclosingTagTk ) {
		$tmp->tagId = $this->tagId++;
	}

	$this->env->log( 'trace/html', $this->pipelineId, static function () use ( $token ) {
		return PHPUtils::jsonEncode( $token );
	} );

	// Store the last token
	$this->lastToken = $token;

	if ( is_string( $token ) || $token instanceof NlTk ) {
		// Combine string tokens to be finalized later
		$this->textContentBuffer .= $token instanceof NlTk ? "\n" : $token;
		return;
	}

	// A non-text token ends the current run of text
	$this->flushTextContentBuffer();

	if ( $token instanceof TagTk ) {
		$this->emitStartTag( $token, $attribs, $dataParsoid, $dataMw );
	} elseif ( $token instanceof SelfclosingTagTk ) {
		$this->emitSelfclosingTag( $token, $attribs, $dataParsoid, $dataMw );
	} elseif ( $token instanceof EndTagTk ) {
		$this->emitEndTag( $token, $dataParsoid );
	} elseif ( $token instanceof CommentTk ) {
		$this->emitComment( $token );
	} elseif ( $token instanceof EOFTk ) {
		$this->remexPipeline->dispatcher->endDocument( 0 );
	} else {
		$errors = [
			'-------- Unhandled token ---------',
			'TYPE: ' . $token->getType(),
			'VAL : ' . PHPUtils::jsonEncode( $token )
		];
		$this->env->log( 'error', implode( "\n", $errors ) );
	}
}

/**
 * Send the buffered text (if any) to the tree builder and clear the buffer.
 */
private function flushTextContentBuffer(): void {
	if ( $this->textContentBuffer === '' ) {
		return;
	}

	// Finalize the combined string tokens
	$this->remexPipeline->dispatcher->characters(
		$this->textContentBuffer, 0, strlen( $this->textContentBuffer ), 0, 0
	);

	// If inside a table and a transclusion, add a meta tag after every
	// text node so that we can detect fostered content that came from
	// a transclusion.
	if ( $this->inTransclusion && $this->tableDepth > 0 ) {
		// The HTML spec says, "Space characters separated from non-space
		// characters by non-character tokens are not affected by foster
		// parenting"
		if ( !preg_match( '/^\s*$/D', $this->textContentBuffer ) ) {
			$this->env->log(
				'debug/html', $this->pipelineId,
				'Inserting shadow transclusion meta'
			);
			$this->remexPipeline->insertExplicitStartTag(
				'meta', [ 'typeof' => 'mw:TransclusionShadow' ], true
			);
		}
	}

	$this->textContentBuffer = '';
}

/**
 * Insert a start tag, tracking table depth and adding a foster box
 * before tables that are outside a transclusion.
 *
 * @param TagTk $token
 * @param array $attribs
 * @param DataParsoid $dataParsoid
 * @param ?DataMw $dataMw
 */
private function emitStartTag(
	TagTk $token, array $attribs, DataParsoid $dataParsoid, ?DataMw $dataMw
): void {
	$tName = $token->getName();
	if ( $tName === 'table' ) {
		$this->tableDepth++;
		// Don't add foster box in transclusion
		// Avoids unnecessary insertions, the case where a table
		// doesn't have tsr info, and the messy unbalanced table case,
		// like the navbox
		if ( !$this->inTransclusion ) {
			$this->env->log( 'debug/html', $this->pipelineId, 'Inserting foster box meta' );
			$this->remexPipeline->insertImplicitStartTag(
				'table',
				[ 'typeof' => 'mw:FosterBox' ]
			);
		}
	}

	$node = $this->remexPipeline->insertExplicitStartTag(
		$tName,
		$this->stashDataAttribs( $attribs, $dataParsoid, $dataMw ),
		false
	);
	if ( !$node ) {
		$this->handleDeletedStartTag( $tName, $dataParsoid );
	}
}

/**
 * Insert a self-closing tag: re-expand empty-line metas, insert
 * transclusion/param metas unfostered, and insert everything else as
 * a start tag that is closed again right away unless it is a void element.
 *
 * @param SelfclosingTagTk $token
 * @param array $attribs
 * @param DataParsoid $dataParsoid
 * @param ?DataMw $dataMw
 */
private function emitSelfclosingTag(
	SelfclosingTagTk $token, array $attribs, DataParsoid $dataParsoid, ?DataMw $dataMw
): void {
	$tName = $token->getName();

	// Re-expand an empty-line meta-token into its constituent comment + WS tokens
	if ( TokenUtils::isEmptyLineMetaToken( $token ) ) {
		$this->processChunk( $dataParsoid->tokens );
		return;
	}

	// Transclusion metas are placeholders and are eliminated after template-wrapping.
	// Fostering them unnecessarily expands template ranges. Same for mw:Param metas.
	if ( $tName === 'meta' ) {
		$shouldNotFoster = TokenUtils::matchTypeOf(
			$token,
			'#^mw:(Transclusion|Param)(/|$)#'
		);
		if ( $shouldNotFoster ) {
			// transclusions state
			$transType = TokenUtils::matchTypeOf( $token, '#^mw:Transclusion#' );
			if ( $transType ) {
				// typeof starts with mw:Transclusion
				$this->inTransclusion = ( $transType === 'mw:Transclusion' );
			}
			$this->remexPipeline->insertUnfosteredMeta(
				$this->stashDataAttribs( $attribs, $dataParsoid, $dataMw ) );
			return;
		}
	}

	$node = $this->remexPipeline->insertExplicitStartTag(
		$tName,
		$this->stashDataAttribs( $attribs, $dataParsoid, $dataMw ),
		false
	);
	if ( !$node ) {
		$this->insertPlaceholderMeta( $tName, $dataParsoid, true );
		return;
	}
	if ( !Utils::isVoidElement( $tName ) ) {
		$this->remexPipeline->insertExplicitEndTag(
			$tName, ( $dataParsoid->stx ?? '' ) === 'html' );
	}
}

/**
 * Insert an end tag and copy the end tag's data-parsoid information
 * onto the element that it closed, or leave a placeholder if the tag
 * was stripped.
 *
 * @param EndTagTk $token
 * @param DataParsoid $dataParsoid
 */
private function emitEndTag( EndTagTk $token, DataParsoid $dataParsoid ): void {
	$tName = $token->getName();
	if ( $tName === 'table' && $this->tableDepth > 0 ) {
		$this->tableDepth--;
	}
	$node = $this->remexPipeline->insertExplicitEndTag(
		$tName,
		( $dataParsoid->stx ?? '' ) === 'html'
	);
	if ( !$node ) {
		// The tag was stripped. Insert an mw:Placeholder for round-tripping
		$this->insertPlaceholderMeta( $tName, $dataParsoid, false );
		return;
	}

	// Copy data attribs from the end tag to the element
	$nodeDP = DOMDataUtils::getDataParsoid( $node );
	if ( !WTUtils::hasLiteralHTMLMarker( $nodeDP )
		&& isset( $dataParsoid->endTagSrc )
	) {
		$nodeDP->endTagSrc = $dataParsoid->endTagSrc;
	}
	if ( !empty( $dataParsoid->stx ) ) {
		// FIXME: Not sure why we do this. For example,
		// with "{|\n|x\n</table>", why should the entire table
		// be marked HTML syntax? This is probably entirely
		// 2013-era historical stuff. Investigate & fix.
		//
		// Same behavior with '''foo</b>
		//
		// Transfer stx flag
		$nodeDP->stx = $dataParsoid->stx;
	}
	if ( isset( $dataParsoid->tsr ) ) {
		$nodeDP->getTemp()->endTSR = $dataParsoid->tsr;
	}
	if ( isset( $nodeDP->autoInsertedStartToken ) ) {
		$nodeDP->autoInsertedStart = true;
		unset( $nodeDP->autoInsertedStartToken );
	}
	if ( isset( $nodeDP->autoInsertedEndToken ) ) {
		$nodeDP->autoInsertedEnd = true;
		unset( $nodeDP->autoInsertedEndToken );
	}
}

/**
 * Insert a comment, preceded by a marker meta if the comment was unclosed.
 *
 * @param CommentTk $token
 */
private function emitComment( CommentTk $token ): void {
	$dp = $token->dataParsoid;
	// @phan-suppress-next-line PhanUndeclaredProperty dynamic property
	if ( isset( $dp->unclosedComment ) ) {
		// Add a marker meta tag to aid accurate DSR computation
		$attribs = [ 'typeof' => 'mw:Placeholder/UnclosedComment' ];
		$this->remexPipeline->insertUnfosteredMeta(
			$this->stashDataAttribs( $attribs, $dp, $token->dataMw ) );
	}
	$this->remexPipeline->dispatcher->comment( $token->value, 0, 0 );
}
396 | |
/**
 * Deal with a start tag that the tree builder did not insert.
 *
 * For td/tr/th that did not come from HTML syntax, the original source
 * text of the tag is emitted as characters instead. Every other tag
 * is passed on to insertPlaceholderMeta().
 *
 * @param string $name
 * @param DataParsoid $dp
 */
private function handleDeletedStartTag( string $name, DataParsoid $dp ): void {
	// Wikitext shorthand used when the source range is not available
	$wikitextFallback = [ 'td' => '|', 'tr' => '|-', 'th' => '!' ];

	$isWikitextTableTag = ( $dp->stx ?? null ) !== 'html'
		&& in_array( $name, [ 'td', 'tr', 'th' ], true );
	if ( !$isWikitextTableTag ) {
		$this->insertPlaceholderMeta( $name, $dp, true );
		return;
	}

	// A stripped wikitext-syntax table tag outside of a table. Re-insert the original
	// page source.
	if ( !empty( $dp->tsr ) &&
		$dp->tsr->start !== null && $dp->tsr->end !== null
	) {
		$origTxt = $dp->tsr->substr( $this->frame->getSrcText() );
	} else {
		$origTxt = $wikitextFallback[$name];
	}

	if ( $origTxt !== '' ) {
		$this->remexPipeline->dispatcher->characters(
			$origTxt, 0, strlen( $origTxt ), 0, 0
		);
	}
}
437 | |
/**
 * Insert a placeholder meta for a deleted start or end tag
 *
 * The source recorded on the meta is taken, in order of preference, from
 * `$dp->src`, from the tag's source range, or — for tags carrying the
 * literal HTML marker — rebuilt from the tag name. If none of these
 * yields anything, no meta is inserted.
 *
 * @param string $name
 * @param DataParsoid $dp
 * @param bool $isStart Whether the deleted tag was a start tag
 */
private function insertPlaceholderMeta(
	string $name, DataParsoid $dp, bool $isStart
): void {
	// If node is in a position where the placeholder node will get fostered
	// out, don't bother adding one since the browser and other compliant
	// clients will move the placeholder out of the table.
	if ( $this->remexPipeline->isFosterablePosition() ) {
		return;
	}

	$src = $dp->src ?? null;

	if ( !$src ) {
		if ( !empty( $dp->tsr ) ) {
			$src = $dp->tsr->substr( $this->frame->getSrcText() );
		} elseif ( WTUtils::hasLiteralHTMLMarker( $dp ) ) {
			$src = $isStart ? '<' . $name . '>' : '</' . $name . '>';
		}
	}

	if ( !$src ) {
		// Nothing to round-trip from
		return;
	}

	$metaDP = new DataParsoid;
	$metaDP->src = $src;
	$metaDP->name = $name;
	$this->remexPipeline->insertUnfosteredMeta(
		$this->stashDataAttribs(
			[ 'typeof' => 'mw:Placeholder/StrippedTag' ],
			$metaDP, null
		)
	);
}
481 | |
/**
 * @inheritDoc
 */
public function process( $input, array $opts ) {
	'@phan-var array $input'; // @var array $input
	// Consume the whole input in one go, then hand back the built tree
	$this->processChunk( $input );
	$dom = $this->finalizeDOM();
	// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
	return $dom;
}
491 | |
/**
 * @inheritDoc
 */
public function processChunkily( $input, array $opts ): Generator {
	if ( !$this->prevStage ) {
		// No upstream stage: treat the input as a single chunk
		yield $this->process( $input, $opts );
		return;
	}

	// Drain the previous stage chunk by chunk; the tree is only
	// yielded once, after the last chunk has been consumed.
	foreach ( $this->prevStage->processChunkily( $input, $opts ) as $chunk ) {
		'@phan-var array $chunk'; // @var array $chunk
		$this->processChunk( $chunk );
	}
	yield $this->finalizeDOM();
}
506 | |
/**
 * Whether the tree builder's list of active formatting elements holds
 * at least one entry that is not a Marker.
 *
 * @return bool
 */
private function hasAfe(): bool {
	$entry = $this->remexPipeline->treeBuilder->afe->getTail();
	// Walk back from the tail, skipping markers. `instanceof` is false
	// for null, so the walk also stops when the list is exhausted.
	while ( $entry instanceof Marker ) {
		$entry = $entry->prevAFE;
	}
	return $entry !== null;
}
514 | } |