Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
40.10% |
83 / 207 |
|
18.18% |
2 / 11 |
CRAP | |
0.00% |
0 / 1 |
TreeBuilderStage | |
40.10% |
83 / 207 |
|
18.18% |
2 / 11 |
1420.55 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
resetState | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
processChunk | |
63.64% |
7 / 11 |
|
0.00% |
0 / 1 |
4.77 | |||
finalizeDOM | |
58.33% |
7 / 12 |
|
0.00% |
0 / 1 |
5.16 | |||
kvArrToAttr | |
75.00% |
3 / 4 |
|
0.00% |
0 / 1 |
2.06 | |||
stashDataAttribs | |
85.71% |
6 / 7 |
|
0.00% |
0 / 1 |
2.01 | |||
processToken | |
43.59% |
51 / 117 |
|
0.00% |
0 / 1 |
342.75 | |||
handleDeletedStartTag | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
182 | |||
insertPlaceholderMeta | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
56 | |||
process | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
processChunkily | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | // Suppress UnusedPluginSuppression because |
4 | // Phan on PHP 7.4 and PHP 8.1 need different suppressions |
5 | // @phan-file-suppress UnusedPluginSuppression,UnusedPluginFileSuppression |
6 | |
7 | /** |
8 | * Front-end/Wrapper for a particular tree builder, in this case the |
9 | * parser/tree builder from RemexHtml. Feed it tokens and it will build |
10 | * you a DOM tree and emit an event. |
11 | */ |
12 | |
13 | namespace Wikimedia\Parsoid\Wt2Html\TreeBuilder; |
14 | |
15 | use Generator; |
16 | use Wikimedia\Parsoid\Config\Env; |
17 | use Wikimedia\Parsoid\DOM\Node; |
18 | use Wikimedia\Parsoid\NodeData\DataMw; |
19 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
20 | use Wikimedia\Parsoid\NodeData\NodeData; |
21 | use Wikimedia\Parsoid\NodeData\TempData; |
22 | use Wikimedia\Parsoid\Tokens\CommentTk; |
23 | use Wikimedia\Parsoid\Tokens\EndTagTk; |
24 | use Wikimedia\Parsoid\Tokens\EOFTk; |
25 | use Wikimedia\Parsoid\Tokens\NlTk; |
26 | use Wikimedia\Parsoid\Tokens\SelfclosingTagTk; |
27 | use Wikimedia\Parsoid\Tokens\TagTk; |
28 | use Wikimedia\Parsoid\Tokens\Token; |
29 | use Wikimedia\Parsoid\Utils\DOMCompat; |
30 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
31 | use Wikimedia\Parsoid\Utils\DOMUtils; |
32 | use Wikimedia\Parsoid\Utils\PHPUtils; |
33 | use Wikimedia\Parsoid\Utils\TokenUtils; |
34 | use Wikimedia\Parsoid\Utils\Utils; |
35 | use Wikimedia\Parsoid\Utils\WTUtils; |
36 | use Wikimedia\Parsoid\Wt2Html\PipelineStage; |
37 | |
38 | class TreeBuilderStage extends PipelineStage { |
39 | /** @var int */ |
40 | private $tagId; |
41 | |
42 | /** @var bool */ |
43 | private $inTransclusion; |
44 | |
45 | /** @var int */ |
46 | private $tableDepth; |
47 | |
48 | /** @var RemexPipeline */ |
49 | private $remexPipeline; |
50 | |
51 | /** @var string|Token|null */ |
52 | private $lastToken; |
53 | |
54 | /** @var string */ |
55 | private $textContentBuffer = ''; |
56 | |
/**
 * Create the stage and bring it into its initial state.
 *
 * @param Env $env Passed on to the parent constructor
 * @param array $options Accepted but not read by this constructor
 * @param string $stageId Accepted but not read by this constructor
 * @param ?PipelineStage $prevStage Passed on to the parent constructor
 */
public function __construct(
	Env $env,
	array $options = [],
	string $stageId = "",
	?PipelineStage $prevStage = null
) {
	parent::__construct( $env, $prevStage );

	// Initialise the counters, the text buffer and the Remex pipeline.
	// Note that an empty option array is used here, not $options.
	$this->resetState( [] );
}
66 | |
/**
 * @inheritDoc
 */
public function resetState( array $options ): void {
	parent::resetState( $options );

	// No transclusion is open when a run starts.
	$this->inTransclusion = false;

	// Ids handed out to start and self-closing tags begin at 1.
	$this->tagId = 1;

	/* --------------------------------------------------------------------
	 * Crude tracking of whether we are inside a table.
	 *
	 * Detecting fostered content correctly only requires that this value
	 * is positive for as long as there is an unclosed <table> tag.
	 *
	 * That holds as long as the counter never becomes negative, no matter
	 * how many excess </table> tags are encountered: if $this->tableDepth
	 * is always >= 0, then after a <table> tag has been seen it is >= 1.
	 * -------------------------------------------------------------------- */
	$this->tableDepth = 0;

	// Fetch the Remex pipeline that matches the current fragment mode.
	$this->remexPipeline = $this->env->fetchRemexPipeline( $this->toFragment );

	// Nothing has been processed yet: no last token, no pending text.
	$this->lastToken = null;
	$this->textContentBuffer = '';
}
96 | |
/**
 * Process a chunk of tokens and feed it to the HTML5 tree builder.
 * This doesn't return anything.
 *
 * Tokens are handed to processToken() one by one, in array order. The
 * array is iterated by value, so it does not need to have contiguous
 * zero-based keys.
 *
 * When profiling is enabled in the environment, the wall-clock time spent
 * in this call is reported to the current profile in milliseconds.
 *
 * @param array $tokens Array of tokens to process
 */
public function processChunk( array $tokens ): void {
	$profile = null;
	$startTime = null;
	if ( $this->env->profiling() ) {
		$profile = $this->env->getCurrentProfile();
		$startTime = microtime( true );
	}

	foreach ( $tokens as $token ) {
		$this->processToken( $token );
	}

	if ( $profile ) {
		// microtime( true ) is in seconds; the profile is bumped in milliseconds
		$elapsedMs = 1000 * ( microtime( true ) - $startTime );
		$profile->bumpTimeUse( 'HTML5 TreeBuilder', $elapsedMs, 'HTML5' );
	}
}
119 | |
/**
 * Hand back the tree that has been built so far.
 *
 * Logs an error if tokens were processed but the most recent one was not
 * an EOFTk.
 *
 * @return Node The body of the Remex document, or, when this stage builds
 *   a fragment, a document fragment of the top level document that the
 *   body's children have been migrated into.
 */
public function finalizeDOM(): Node {
	// Check if the EOFTk actually made it all the way through, and flag the
	// page where it did not!
	$sawTokens = isset( $this->lastToken );
	if ( $sawTokens && !( $this->lastToken instanceof EOFTk ) ) {
		$pageTitle = $this->env->getContextTitle()->getPrefixedText();
		$this->env->log( 'error', 'EOFTk was lost in page', $pageTitle );
	}

	if ( !$this->toFragment ) {
		return DOMCompat::getBody( $this->remexPipeline->doc );
	}

	// This is similar to DOMCompat::setInnerHTML() in that we can
	// consider it equivalent to the fragment parsing algorithm,
	// https://html.spec.whatwg.org/#html-fragment-parsing-algorithm
	$fragment = $this->env->getTopLevelDoc()->createDocumentFragment();
	$body = DOMCompat::getBody( $this->remexPipeline->doc );
	DOMUtils::migrateChildrenBetweenDocs( $body, $fragment );

	return $fragment;
}
144 | |
/**
 * Flatten a list of key/value objects into an associative array.
 *
 * Each entry is read through its `k` and `v` properties. When a key occurs
 * more than once, the last value wins.
 *
 * @param array $kvArr
 * @return array Map from each entry's `k` to its `v`
 */
private function kvArrToAttr( array $kvArr ): array {
	$result = [];
	foreach ( $kvArr as $pair ) {
		$result[$pair->k] = $pair->v;
	}
	return $result;
}
153 | |
/**
 * Stash a node's data objects in the top level document and record the
 * resulting id in the attribute array.
 *
 * Keep this in sync with `DOMDataUtils.setNodeData()`
 *
 * @param array $attribs Attributes to extend
 * @param DataParsoid $dataParsoid Stored as the `parsoid` member of the stashed data
 * @param ?DataMw $dataMw Stored as the `mw` member of the stashed data, unless null
 * @return array $attribs with the stash id added, as a string, under
 *   DOMDataUtils::DATA_OBJECT_ATTR_NAME
 */
private function stashDataAttribs( array $attribs, DataParsoid $dataParsoid, ?DataMw $dataMw ): array {
	$nodeData = new NodeData;
	$nodeData->parsoid = $dataParsoid;
	if ( $dataMw !== null ) {
		$nodeData->mw = $dataMw;
	}

	// Store in the top level doc since we'll be importing the nodes after treebuilding
	$topLevelDoc = $this->env->getTopLevelDoc();
	$stashId = DOMDataUtils::stashObjectInDoc( $topLevelDoc, $nodeData );

	$attribs[DOMDataUtils::DATA_OBJECT_ATTR_NAME] = (string)$stashId;
	return $attribs;
}
172 | |
/**
 * Adapt the token format to internal HTML tree builder format, call the actual
 * html tree builder by emitting the token.
 *
 * Strings and NlTk tokens are only appended to a text buffer. Any other
 * token first flushes that buffer as a single run of characters and is then
 * handed to the helper for its token class. A token of an unrecognized
 * class is logged as an error.
 *
 * @param Token|string $token
 */
public function processToken( $token ): void {
	// Only the pipeline with id 0 counts tokens against the resource limit
	if ( $this->pipelineId === 0 &&
		$this->env->bumpWt2HtmlResourceUse( 'token' ) === false
	) {
		// `false` indicates that this bump pushed us over the threshold
		// We don't want to log every token above that, which would be `null`
		$this->env->log( 'warn', "wt2html: token limit exceeded" );
	}

	// For string tokens, isset() and ?? quietly yield [] and null here
	$attribs = isset( $token->attribs ) ? $this->kvArrToAttr( $token->attribs ) : [];
	$dataParsoid = !is_string( $token ) ? $token->dataParsoid : new DataParsoid;
	$dataMw = $token->dataMw ?? null;
	$tmp = $dataParsoid->getTemp();

	if ( $this->inTransclusion ) {
		$tmp->setFlag( TempData::IN_TRANSCLUSION );
	}

	// Assign tagId to open/self-closing tags
	if ( $token instanceof TagTk || $token instanceof SelfclosingTagTk ) {
		$tmp->tagId = $this->tagId++;
	}

	$this->env->log( 'trace/html', $this->pipelineId, static function () use ( $token ) {
		return PHPUtils::jsonEncode( $token );
	} );

	// Store the last token
	$this->lastToken = $token;

	if ( is_string( $token ) || $token instanceof NlTk ) {
		// Combine string tokens to be finalized later
		$this->textContentBuffer .= $token instanceof NlTk ? "\n" : $token;
		return;
	}

	// A non-text token ends the current run of text
	$this->emitBufferedText();

	if ( $token instanceof TagTk ) {
		$this->emitStartTag( $token, $attribs, $dataParsoid, $dataMw );
	} elseif ( $token instanceof SelfclosingTagTk ) {
		$this->emitSelfclosingTag( $token, $attribs, $dataParsoid, $dataMw );
	} elseif ( $token instanceof EndTagTk ) {
		$this->emitEndTag( $token, $dataParsoid );
	} elseif ( $token instanceof CommentTk ) {
		$this->emitComment( $token );
	} elseif ( $token instanceof EOFTk ) {
		$this->remexPipeline->dispatcher->endDocument( 0 );
	} else {
		$errors = [
			'-------- Unhandled token ---------',
			'TYPE: ' . $token->getType(),
			'VAL : ' . PHPUtils::jsonEncode( $token )
		];
		$this->env->log( 'error', implode( "\n", $errors ) );
	}
}

/**
 * Flush the combined string tokens, if any, to the tree builder.
 */
private function emitBufferedText(): void {
	$text = $this->textContentBuffer;
	if ( $text === '' ) {
		return;
	}

	// Finalize the combined string tokens
	$this->remexPipeline->dispatcher->characters( $text, 0, strlen( $text ), 0, 0 );

	// If inside a table and a transclusion, add a meta tag after every
	// text node so that we can detect fostered content that came from
	// a transclusion.
	//
	// The HTML spec says, "Space characters separated from non-space
	// characters by non-character tokens are not affected by foster
	// parenting", so whitespace-only text gets no meta.
	if ( $this->inTransclusion && $this->tableDepth > 0 &&
		!preg_match( '/^\s*$/D', $text )
	) {
		$this->env->log(
			'debug/html', $this->pipelineId,
			'Inserting shadow transclusion meta'
		);
		$this->remexPipeline->insertExplicitStartTag(
			'meta', [ 'typeof' => 'mw:TransclusionShadow' ], true
		);
	}

	$this->textContentBuffer = '';
}

/**
 * Insert a start tag, tracking table depth and adding a foster box for tables.
 *
 * @param TagTk $token
 * @param array $attribs
 * @param DataParsoid $dataParsoid
 * @param ?DataMw $dataMw
 */
private function emitStartTag(
	TagTk $token, array $attribs, DataParsoid $dataParsoid, ?DataMw $dataMw
): void {
	$tName = $token->getName();
	if ( $tName === 'table' ) {
		$this->tableDepth++;
		// Don't add foster box in transclusion
		// Avoids unnecessary insertions, the case where a table
		// doesn't have tsr info, and the messy unbalanced table case,
		// like the navbox
		if ( !$this->inTransclusion ) {
			$this->env->log( 'debug/html', $this->pipelineId, 'Inserting foster box meta' );
			$this->remexPipeline->insertImplicitStartTag(
				'table',
				[ 'typeof' => 'mw:FosterBox' ]
			);
		}
	}

	$node = $this->remexPipeline->insertExplicitStartTag(
		$tName,
		$this->stashDataAttribs( $attribs, $dataParsoid, $dataMw ),
		false
	);
	if ( !$node ) {
		$this->handleDeletedStartTag( $tName, $dataParsoid );
	}
}

/**
 * Insert a self-closing tag, with special handling for empty-line and placeholder metas.
 *
 * @param SelfclosingTagTk $token
 * @param array $attribs
 * @param DataParsoid $dataParsoid
 * @param ?DataMw $dataMw
 */
private function emitSelfclosingTag(
	SelfclosingTagTk $token, array $attribs, DataParsoid $dataParsoid, ?DataMw $dataMw
): void {
	$tName = $token->getName();

	// Re-expand an empty-line meta-token into its constituent comment + WS tokens
	if ( TokenUtils::isEmptyLineMetaToken( $token ) ) {
		$this->processChunk( $dataParsoid->tokens );
		return;
	}

	// Transclusion metas are placeholders and are eliminated after template-wrapping.
	// Fostering them unnecessarily expands template ranges. Same for mw:Param metas.
	if ( $tName === 'meta' ) {
		$shouldNotFoster = TokenUtils::matchTypeOf(
			$token,
			'#^mw:(Transclusion|Param)(/|$)#'
		);
		if ( $shouldNotFoster ) {
			// transclusions state
			$transType = TokenUtils::matchTypeOf( $token, '#^mw:Transclusion#' );
			if ( $transType ) {
				// typeof starts with mw:Transclusion
				$this->inTransclusion = ( $transType === 'mw:Transclusion' );
			}
			$this->remexPipeline->insertUnfosteredMeta(
				$this->stashDataAttribs( $attribs, $dataParsoid, $dataMw ) );
			return;
		}
	}

	$node = $this->remexPipeline->insertExplicitStartTag(
		$tName,
		$this->stashDataAttribs( $attribs, $dataParsoid, $dataMw ),
		false
	);
	if ( !$node ) {
		$this->insertPlaceholderMeta( $tName, $dataParsoid, true );
		return;
	}
	// Non-void elements also need their end tag
	if ( !Utils::isVoidElement( $tName ) ) {
		$this->remexPipeline->insertExplicitEndTag(
			$tName, ( $dataParsoid->stx ?? '' ) === 'html' );
	}
}

/**
 * Insert an end tag and copy the end tag's data onto the element it closed.
 *
 * @param EndTagTk $token
 * @param DataParsoid $dataParsoid
 */
private function emitEndTag( EndTagTk $token, DataParsoid $dataParsoid ): void {
	$tName = $token->getName();
	// Never let the depth go negative on excess </table> tags
	if ( $tName === 'table' && $this->tableDepth > 0 ) {
		$this->tableDepth--;
	}

	$node = $this->remexPipeline->insertExplicitEndTag(
		$tName,
		( $dataParsoid->stx ?? '' ) === 'html'
	);
	if ( !$node ) {
		// The tag was stripped. Insert an mw:Placeholder for round-tripping
		$this->insertPlaceholderMeta( $tName, $dataParsoid, false );
		return;
	}

	// Copy data attribs from the end tag to the element
	$nodeDP = DOMDataUtils::getDataParsoid( $node );
	if ( !WTUtils::hasLiteralHTMLMarker( $nodeDP )
		&& isset( $dataParsoid->endTagSrc )
	) {
		$nodeDP->endTagSrc = $dataParsoid->endTagSrc;
	}
	if ( !empty( $dataParsoid->stx ) ) {
		// FIXME: Not sure why we do this. For example,
		// with "{|\n|x\n</table>", why should the entire table
		// be marked HTML syntax? This is probably entirely
		// 2013-era historical stuff. Investigate & fix.
		//
		// Same behavior with '''foo</b>
		//
		// Transfer stx flag
		$nodeDP->stx = $dataParsoid->stx;
	}
	if ( isset( $dataParsoid->tsr ) ) {
		$nodeDP->getTemp()->endTSR = $dataParsoid->tsr;
	}
	if ( isset( $nodeDP->autoInsertedStartToken ) ) {
		$nodeDP->autoInsertedStart = true;
		unset( $nodeDP->autoInsertedStartToken );
	}
	if ( isset( $nodeDP->autoInsertedEndToken ) ) {
		$nodeDP->autoInsertedEnd = true;
		unset( $nodeDP->autoInsertedEndToken );
	}
}

/**
 * Emit a comment, preceded by a marker meta when the comment was unclosed.
 *
 * @param CommentTk $token
 */
private function emitComment( CommentTk $token ): void {
	$dp = $token->dataParsoid;
	// @phan-suppress-next-line PhanUndeclaredProperty dynamic property
	if ( isset( $dp->unclosedComment ) ) {
		// Add a marker meta tag to aid accurate DSR computation
		$attribs = [ 'typeof' => 'mw:Placeholder/UnclosedComment' ];
		$this->remexPipeline->insertUnfosteredMeta(
			$this->stashDataAttribs( $attribs, $dp, $token->dataMw ) );
	}
	$this->remexPipeline->dispatcher->comment( $token->value, 0, 0 );
}
375 | |
/**
 * Insert td/tr/th tag source or a placeholder meta
 *
 * A td, tr or th start tag whose syntax is not 'html' is re-emitted as
 * plain text: the slice of the frame's source text given by the tag's tsr
 * when both offsets are known, otherwise the bare wikitext markup for the
 * tag. Every other tag is delegated to insertPlaceholderMeta().
 *
 * @param string $name
 * @param DataParsoid $dp
 */
private function handleDeletedStartTag( string $name, DataParsoid $dp ): void {
	// Markup used when the original source range is not available
	$defaultMarkup = [
		'td' => '|',
		'tr' => '|-',
		'th' => '!',
	];

	$isHtmlSyntax = ( $dp->stx ?? null ) === 'html';
	if ( $isHtmlSyntax || !isset( $defaultMarkup[$name] ) ) {
		$this->insertPlaceholderMeta( $name, $dp, true );
		return;
	}

	// A stripped wikitext-syntax table tag outside of a table. Re-insert the original
	// page source.
	$hasFullRange = !empty( $dp->tsr ) &&
		$dp->tsr->start !== null && $dp->tsr->end !== null;
	if ( $hasFullRange ) {
		$origTxt = $dp->tsr->substr( $this->frame->getSrcText() );
	} else {
		$origTxt = $defaultMarkup[$name];
	}

	if ( $origTxt !== '' ) {
		$this->remexPipeline->dispatcher->characters(
			$origTxt, 0, strlen( $origTxt ), 0, 0
		);
	}
}
416 | |
/**
 * Insert a placeholder meta for a deleted start or end tag
 *
 * The source recorded on the meta is taken from, in order of preference:
 * the `src` of $dp, the slice of the frame's source text given by the tsr
 * of $dp, or, for tags with a literal HTML marker, a synthesized
 * `<name>` / `</name>`. Nothing is inserted when no source can be
 * determined.
 *
 * @param string $name
 * @param DataParsoid $dp
 * @param bool $isStart Whether the deleted tag was a start tag; only used
 *   when the tag source has to be synthesized
 */
private function insertPlaceholderMeta(
	string $name, DataParsoid $dp, bool $isStart
): void {
	// If node is in a position where the placeholder node will get fostered
	// out, don't bother adding one since the browser and other compliant
	// clients will move the placeholder out of the table.
	if ( $this->remexPipeline->isFosterablePosition() ) {
		return;
	}

	$src = $dp->src ?? null;
	if ( !$src ) {
		if ( !empty( $dp->tsr ) ) {
			$src = $dp->tsr->substr( $this->frame->getSrcText() );
		} elseif ( WTUtils::hasLiteralHTMLMarker( $dp ) ) {
			$src = ( $isStart ? '<' : '</' ) . $name . '>';
		}
	}

	// Without any source there is nothing to record
	if ( !$src ) {
		return;
	}

	$metaDP = new DataParsoid;
	$metaDP->src = $src;
	$metaDP->name = $name;
	$this->remexPipeline->insertUnfosteredMeta(
		$this->stashDataAttribs(
			[ 'typeof' => 'mw:Placeholder/StrippedTag' ],
			$metaDP, null
		)
	);
}
460 | |
/**
 * @inheritDoc
 */
public function process( $input, array $opts ) {
	// Tell Phan that $input is an array here
	'@phan-var array $input'; // @var array $input

	// Feed everything to the tree builder in one go ...
	$this->processChunk( $input );

	// ... and hand back whatever was built.
	// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
	return $this->finalizeDOM();
}
470 | |
/**
 * @inheritDoc
 */
public function processChunkily( $input, array $opts ): Generator {
	if ( !$this->prevStage ) {
		// No upstream stage to pull chunks from: treat the input as one chunk
		yield $this->process( $input, $opts );
		return;
	}

	// Consume every chunk the previous stage produces before yielding
	// the single finished tree.
	$chunks = $this->prevStage->processChunkily( $input, $opts );
	foreach ( $chunks as $chunk ) {
		'@phan-var array $chunk'; // @var array $chunk
		$this->processChunk( $chunk );
	}
	yield $this->finalizeDOM();
}
485 | } |