Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 214 |
|
0.00% |
0 / 15 |
CRAP | |
0.00% |
0 / 1 |
PipelineUtils | |
0.00% |
0 / 214 |
|
0.00% |
0 / 15 |
5112 | |
0.00% |
0 / 1 |
getDOMFragmentToken | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
12 | |||
processContentInPipeline | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
expandAttrValueToDOM | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
6 | |||
expandAttrValuesToDOM | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
domAttrsToTagAttrs | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
20 | |||
convertDOMtoTokens | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
56 | |||
getWrapperTokens | |
0.00% |
0 / 47 |
|
0.00% |
0 / 1 |
462 | |||
encapsulateExpansionHTML | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
42 | |||
wrapAccum | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
6 | |||
addSpanWrappers | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
110 | |||
tunnelDOMThroughTokens | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
makeExpansion | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
doExtractExpansions | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
72 | |||
extractExpansions | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
fetchHTML | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Assert\UnreachableException; |
8 | use Wikimedia\Parsoid\Config\Env; |
9 | use Wikimedia\Parsoid\DOM\Comment; |
10 | use Wikimedia\Parsoid\DOM\Document; |
11 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
12 | use Wikimedia\Parsoid\DOM\Element; |
13 | use Wikimedia\Parsoid\DOM\Node; |
14 | use Wikimedia\Parsoid\DOM\NodeList; |
15 | use Wikimedia\Parsoid\DOM\Text; |
16 | use Wikimedia\Parsoid\NodeData\DataMw; |
17 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
18 | use Wikimedia\Parsoid\NodeData\TempData; |
19 | use Wikimedia\Parsoid\Tokens\CommentTk; |
20 | use Wikimedia\Parsoid\Tokens\EndTagTk; |
21 | use Wikimedia\Parsoid\Tokens\EOFTk; |
22 | use Wikimedia\Parsoid\Tokens\KV; |
23 | use Wikimedia\Parsoid\Tokens\SelfclosingTagTk; |
24 | use Wikimedia\Parsoid\Tokens\SourceRange; |
25 | use Wikimedia\Parsoid\Tokens\TagTk; |
26 | use Wikimedia\Parsoid\Tokens\Token; |
27 | use Wikimedia\Parsoid\Wt2Html\Frame; |
28 | |
29 | /** |
30 | * This file contains parsing pipeline related utilities. |
31 | */ |
32 | class PipelineUtils { |
/**
 * Build the 'mw:dom-fragment-token' placeholder that requests 'content'
 * to be processed in its own subpipeline all the way to DOM. The original
 * notes say these tokens are handled by DOMFragmentBuilder in the last
 * stage of the async pipeline.
 *
 * srcOffsets should always be provided when top-level page content is
 * processed in a subpipeline: without it, DSR computation and template
 * wrapping cannot happen there. unpackDOMFragment can compensate while
 * unwrapping, but that is fragile and leaks subpipeline processing into
 * the top-level pipeline.
 *
 * @param Token[]|string $content The array of tokens to process.
 * @param SourceRange $srcOffsets Wikitext source offsets (start/end) of these tokens.
 * @param array $opts Parsing options.
 *    - Token token The token that generated the content. It is read
 *      unconditionally, so callers have to supply it.
 *    - bool inlineContext Is this DOM fragment used in an inline context?
 *    - bool inPHPBlock Forwarded as "1"/"0" in the 'inPHPBlock' attribute.
 * @return SelfclosingTagTk
 */
public static function getDOMFragmentToken(
	$content, SourceRange $srcOffsets, array $opts = []
): SelfclosingTagTk {
	$contextTok = $opts['token'];

	// Boolean options travel as "1" / "0" strings on the token.
	$inlineFlag = !empty( $opts['inlineContext'] ) ? '1' : '0';
	$phpBlockFlag = !empty( $opts['inPHPBlock'] ) ? '1' : '0';

	$attribs = [];
	$attribs[] = new KV(
		'contextTok', $contextTok, $contextTok->dataParsoid->tsr->expandTsrV()
	);
	$attribs[] = new KV( 'content', $content, $srcOffsets->expandTsrV() );
	$attribs[] = new KV( 'inlineContext', $inlineFlag );
	$attribs[] = new KV( 'inPHPBlock', $phpBlockFlag );

	return new SelfclosingTagTk( 'mw:dom-fragment-token', $attribs );
}
63 | |
/**
 * Run content (wikitext, a token, an array of tokens, ...) through a
 * freshly built pipeline whose kind is selected by $opts.
 *
 * @param Env $env The environment/context for the expansion.
 * @param Frame $frame
 *    The parent frame within which the expansion is taking place.
 *    Used for template expansion and source text tracking.
 * @param string|Token|Token[] $content
 *    Wikitext, a single token, or an array of tokens. How it is
 *    processed depends on the pipeline selected through $opts.
 * @param array $opts
 *    Processing options that specify pipeline-type, opts, and callbacks.
 *    - string pipelineType
 *    - array pipelineOpts
 *    - array tplArgs - if set, defines parameters for the child frame
 *      - string tplArgs['name']
 *      - KV[] tplArgs['attribs']
 *    - string srcText - if set, defines the source text for the expansion;
 *      otherwise the frame's source text is used
 *    - SourceRange srcOffsets - if set, defines the range within the
 *      source text that $content corresponds to
 *    - bool sol Whether tokens should be processed in start-of-line context.
 * @return Token[]|DocumentFragment (depending on pipeline type)
 */
public static function processContentInPipeline(
	Env $env, Frame $frame, $content, array $opts
) {
	// Construct the requested pipeline ...
	$factory = $env->getPipelineFactory();
	$pipeline = $factory->getPipeline( $opts['pipelineType'], $opts['pipelineOpts'] );

	// ... set it up as a nested (non-toplevel) pipeline ...
	$initOpts = [
		'toplevel' => false,
		'frame' => $frame,
		'tplArgs' => $opts['tplArgs'] ?? null,
		'srcText' => $opts['srcText'] ?? $frame->getSrcText(),
		'srcOffsets' => $opts['srcOffsets'] ?? null,
	];
	$pipeline->init( $initOpts );

	// ... and hand over the content.
	$parseOpts = [ 'sol' => $opts['sol'] ];
	return $pipeline->parse( $content, $parseOpts );
}
109 | |
/**
 * Expand a single attribute value all the way to DOM.
 *
 * @param Env $env
 *    The environment/context for the expansion.
 * @param Frame $frame
 *    The parent frame within which the expansion is taking place.
 *    Used for template expansion and source text tracking.
 * @param array $v
 *    The value to process: an associative array with a "html" entry.
 *    The entry is expanded only when it is an array (of tokens), in which
 *    case it is replaced by the serialized inner XML of the resulting
 *    fragment. Anything else is handed back unexpanded.
 * @param bool $expandTemplates
 *    Should any templates encountered here be expanded
 *    (usually false for nested templates since they are never directly editable).
 * @param bool $inTemplate
 *    Unexpanded templates can occur in the content of extension tags.
 * @return array $v without its "srcOffsets" entry.
 */
public static function expandAttrValueToDOM(
	Env $env, Frame $frame, array $v, bool $expandTemplates, bool $inTemplate
): array {
	$html = $v['html'] ?? null;

	if ( is_array( $html ) ) {
		// Options for the nested tokens -> DOM pipeline
		$pipelineArgs = [
			'pipelineType' => 'expanded-tokens-to-dom',
			'pipelineOpts' => [
				'attrExpansion' => true,
				'inlineContext' => true,
				'expandTemplates' => $expandTemplates,
				'inTemplate' => $inTemplate
			],
			'srcOffsets' => $v['srcOffsets'],
			'sol' => true
		];

		// The token stream handed to the pipeline is terminated with an EOF token
		$tokens = array_merge( $html, [ new EOFTk() ] );
		$fragment = self::processContentInPipeline( $env, $frame, $tokens, $pipelineArgs );

		// Since we aren't at the top level, data attrs were not applied
		// in cleanup. However, tmp was stripped.
		$v['html'] = ContentUtils::ppToXML( $fragment, [ 'innerXML' => true ] );
	}

	// srcOffsets must not leak into the output data-mw attribute,
	// so drop them once the value has been expanded.
	unset( $v['srcOffsets'] );

	return $v;
}
162 | |
/**
 * Expand every value of a list all the way to DOM.
 *
 * @param Env $env
 *    The environment/context for the expansion.
 * @param Frame $frame
 *    The parent frame within which the expansion is taking place.
 *    Used for template expansion and source text tracking.
 * @param array $vals
 *    Array of values to expand.
 *    Non-array elements of $vals are passed back unmodified.
 *    If an array element, it is expected to be an associative array with a "html" property.
 *    The html property is expanded to DOM only if it is an array (of tokens).
 * @param bool $expandTemplates
 *    Should any templates encountered here be expanded
 *    (usually false for nested templates since they are never directly editable).
 * @param bool $inTemplate
 *    Unexpanded templates can occur in the content of extension tags.
 * @return array One entry per element of $vals, in iteration order
 *    (keys of $vals are not preserved).
 */
public static function expandAttrValuesToDOM(
	Env $env, $frame, array $vals, bool $expandTemplates, bool $inTemplate
): array {
	$ret = [];
	foreach ( $vals as $v ) {
		// expandAttrValueToDOM() only accepts arrays; handing it a scalar
		// would raise a TypeError, so honour the documented contract and
		// pass non-array elements through untouched.
		$ret[] = is_array( $v )
			? self::expandAttrValueToDOM( $env, $frame, $v, $expandTemplates, $inTemplate )
			: $v;
	}
	return $ret;
}
190 | |
/**
 * Turn the attributes of a DOM element into the pieces needed to build a
 * tag token. The node comes from a DOM whose data attributes are stored
 * outside the DOM, so data-parsoid / data-mw are looked up separately and
 * the internal data-object attribute is not copied.
 *
 * @param Element $node
 * @param array<string,string> $attrs
 * @return array{attrs:KV[],dataParsoid:?DataParsoid,dataMw:?DataMw}
 */
private static function domAttrsToTagAttrs( Element $node, array $attrs ): array {
	$kvs = [];
	foreach ( $attrs as $attrName => $attrValue ) {
		if ( $attrName === DOMDataUtils::DATA_OBJECT_ATTR_NAME ) {
			// Bookkeeping attribute, not a real one
			continue;
		}
		$kvs[] = new KV( $attrName, $attrValue );
	}

	$dataParsoid = DOMDataUtils::getDataParsoid( $node );

	// data-mw is only fetched when DOMDataUtils considers it valid
	$dataMw = null;
	if ( DOMDataUtils::validDataMw( $node ) ) {
		$dataMw = DOMDataUtils::getDataMw( $node );
	}

	return [
		'attrs' => $kvs,
		'dataParsoid' => $dataParsoid,
		'dataMw' => $dataMw,
	];
}
213 | |
/**
 * Serialize a DOM subtree into tokens, appending them to $tokBuf.
 * Data attributes for nodes are stored outside the DOM.
 *
 * Text becomes the output of TokenUtils::newlinesToNlTks(), comments
 * become CommentTk, void elements a single SelfclosingTagTk, and every
 * other element a TagTk / EndTagTk pair around its converted children.
 *
 * @param Node $node The root of the DOM tree to convert to tokens
 * @param Token[] $tokBuf Tokens collected so far
 * @return array $tokBuf extended with the tokens for $node
 */
private static function convertDOMtoTokens( Node $node, array $tokBuf ): array {
	if ( $node instanceof Text ) {
		PHPUtils::pushArray( $tokBuf, TokenUtils::newlinesToNlTks( $node->nodeValue ) );
		return $tokBuf;
	}

	if ( $node instanceof Comment ) {
		$tokBuf[] = new CommentTk( $node->nodeValue );
		return $tokBuf;
	}

	if ( !( $node instanceof Element ) ) {
		// getWrapperTokens calls convertDOMToTokens with a Element
		// and children of dom elements are always text/comment/elements
		// which are all handled in this method.
		throw new UnreachableException( "Should never get here!" );
	}

	$tagName = DOMCompat::nodeName( $node );
	$info = self::domAttrsToTagAttrs( $node, DOMUtils::attributes( $node ) );

	if ( Utils::isVoidElement( $tagName ) ) {
		$tokBuf[] = new SelfclosingTagTk(
			$tagName, $info['attrs'], $info['dataParsoid'], $info['dataMw']
		);
		return $tokBuf;
	}

	$tokBuf[] = new TagTk(
		$tagName, $info['attrs'], $info['dataParsoid'], $info['dataMw']
	);

	$child = $node->firstChild;
	while ( $child ) {
		$tokBuf = self::convertDOMtoTokens( $child, $tokBuf );
		$child = $child->nextSibling;
	}

	$closer = new EndTagTk( $tagName );
	if ( WTUtils::isLiteralHTMLNode( $node ) ) {
		// Keep stx parity
		$closer->dataParsoid->stx = 'html';
	}
	$tokBuf[] = $closer;

	return $tokBuf;
}
259 | |
/**
 * Get tokens representing a DOM forest (from transclusions, extensions,
 * whatever that were generated as part of a separate processing pipeline)
 * in the token stream. These tokens tunnel the subtree through token
 * processing while preserving token stream semantics as if the DOM had
 * been converted to tokens.
 *
 * Only the first child of the fragment (or a stand-in wrapper element) is
 * converted; its 'typeof' and 'about' attributes are removed from the
 * first returned token. An empty fragment yields an empty <span> pair.
 *
 * @param DocumentFragment $domFragment List of DOM nodes that need to be tunneled through.
 * @param array $opts
 * @see encapsulateExpansionHTML's doc. for more info about these options.
 * @return Token[] List of token representatives.
 */
private static function getWrapperTokens(
	DocumentFragment $domFragment, array $opts
): array {
	if ( !$domFragment->hasChildNodes() ) {
		return [ new TagTk( 'span' ), new EndTagTk( 'span' ) ];
	}

	$firstNode = $domFragment->firstChild;
	$firstName = DOMCompat::nodeName( $firstNode );

	// Do we represent this with inline or block elements?
	// This is to ensure that we get p-wrapping correct.
	//
	// * If all content is inline, inline elements are used so that the
	//   content gets swallowed into the P tag that wraps adjacent inline
	//   content.
	//
	// * If any part of this is block content, extension content is treated
	//   independently of surrounding content: inline content here must not
	//   be swallowed into a P tag that wraps adjacent inline content.
	//
	// This lets us and clients "drop-in" extension content into the DOM
	// without fixing up paragraph tags of surrounding content. It could
	// potentially introduce minor rendering differences when compared to
	// PHP parser output, but we'll swallow it for now.
	//
	// Two situations always keep the inline representation:
	//
	// * inlineContext: P wrapping has been suppressed in this context.
	//   FIXME(SSS): Looks like we have some "impedance mismatch" here. But,
	//   this is correct in scenarios where link-content or image-captions
	//   are being processed in a sub-pipeline and we don't want a <div> in
	//   the link-caption to cause the <a>..</a> to get split apart.
	//   Filed as T49963
	//
	// * !unpackOutput: fragments that won't be unpacked aren't amenable to
	//   inspection, since the ultimate content is unknown. For example,
	//   refs shuttle content through treebuilding that ends up in the
	//   references list.
	//   FIXME(arlolra): Do we need a mechanism to specify content
	//   categories?
	$useBlockWrapper = false;
	if ( empty( $opts['pipelineOpts']['inlineContext'] ) && $opts['unpackOutput'] ) {
		foreach ( $domFragment->childNodes as $child ) {
			$isBlocky = DOMUtils::isWikitextBlockNode( $child ) ||
				DOMUtils::hasBlockElementDescendant( $child );
			if ( $isBlocky ) {
				$useBlockWrapper = true;
				break;
			}
		}
	}

	if ( $useBlockWrapper && !DOMUtils::isWikitextBlockNode( $firstNode ) ) {
		$wrapperName = 'div';
	} elseif (
		// Do not use 'A' as a wrapper node because it could end up getting
		// nested inside another 'A' and the DOM structure can change where
		// the wrapper tokens are no longer siblings.
		// Ex: "[http://foo.com Bad nesting [[Here]]].
		$firstName === 'a' ||
		// <style>/<script> tags are not fostered, so if we're wrapping
		// more than a single node, they aren't a good representation for
		// the content. It can lead to fosterable content being inserted
		// in a fosterable position after treebuilding is done, which isn't
		// roundtrippable.
		(
			( $firstName === 'style' || $firstName === 'script' ) &&
			$firstNode->nextSibling !== null
		) ||
		// Text / comment nodes cannot act as wrappers themselves
		!( $firstNode instanceof Element )
	) {
		$wrapperName = 'span';
	} else {
		$wrapperName = $firstName;
	}

	if ( !( $firstNode instanceof Element ) ) {
		$workNode = $firstNode->ownerDocument->createElement( $wrapperName );
	} else {
		Assert::invariant(
			// No need to look for data-mw as well.
			// Nodes that have data-mw also have data-parsoid.
			!$firstNode->hasAttribute( 'data-parsoid' ),
			"Expected node to have its data attributes loaded" );

		$nodeData = DOMDataUtils::getNodeData( $firstNode )->cloneNodeData();

		if ( $wrapperName === $firstName ) {
			// Shallow clone since we don't want to convert the whole tree to tokens.
			$workNode = $firstNode->cloneNode( false );

			// Reset 'tsr' since it isn't applicable. Neither is
			// any auxiliary info like 'endTSR'.
			// FIXME: The above comment is only true if we are reusing
			// DOM fragments from cache from previous revisions in
			// incremental parsing scenarios. See T98992
			if ( isset( $nodeData->parsoid->tsr ) ) {
				$nodeData->parsoid->tsr = null;
			}
			if ( isset( $nodeData->parsoid->tmp->endTSR ) ) {
				unset( $nodeData->parsoid->tmp->endTSR );
			}

			// The "in transclusion" flag was set on the first child for template
			// wrapping in the nested pipeline, and doesn't apply to the dom
			// fragment wrapper in this pipeline. Keeping it around can induce
			// template wrapping of a foster box if the dom fragment is found in
			// a fosterable position.
			if (
				isset( $nodeData->parsoid ) &&
				$nodeData->parsoid->getTempFlag( TempData::IN_TRANSCLUSION )
			) {
				$nodeData->parsoid->tmp->setFlag( TempData::IN_TRANSCLUSION, false );
			}
		} else {
			// A different wrapper is used: build a childless stand-in
			// element that carries the node's attributes.
			$workNode = $firstNode->ownerDocument->createElement( $wrapperName );
			foreach ( DOMUtils::attributes( $firstNode ) as $attrName => $attrValue ) {
				// "typeof" is ignored since it'll be removed below.
				if ( $attrName === 'typeof' ) {
					continue;
				}
				$workNode->setAttribute( $attrName, $attrValue );
			}

			// The node's data-parsoid isn't applicable to the new wrapper.
			$nodeData->parsoid = new DataParsoid;
		}

		DOMDataUtils::setNodeData( $workNode, $nodeData );
	}

	$wrapperToks = self::convertDOMtoTokens( $workNode, [] );
	$opener = $wrapperToks[0];

	// Remove the typeof attribute from the first token.
	// It will be replaced with mw:DOMFragment.
	$opener->removeAttribute( 'typeof' );

	// Remove the about attribute from the first token.
	// We want to be able to distinguish when this wrapper was template
	// annotated.
	$opener->removeAttribute( 'about' );

	return $wrapperToks;
}
423 | |
/**
 * Generates wrapper tokens for a HTML expansion -- the wrapper tokens are
 * placeholders that adequately represent the semantics of the HTML DOM for
 * the purposes of the token transformations still to be applied to them.
 *
 * @param Env $env
 *    The active environment/context.
 * @param Token $token
 *    The token that generated the DOM.
 * @param array $expansion
 *    - string html HTML of the expansion (stored on the first wrapper
 *      token's data-parsoid as 'html').
 *    - DocumentFragment domFragment Outermost nodes of the HTML.
 * @param array $opts
 *    - SourceRange tsr
 *      The TSR to set on the generated tokens; when absent, the TSR of
 *      $token is used. This TSR is used to compute DSR on the
 *      placeholder tokens. The computed DSR is transferred over to the
 *      unpacked DOM if setDSR is true (see below).
 *    - bool setDSR
 *      When the DOM fragment is unpacked, this option governs
 *      whether the DSR from the placeholder node is transferred
 *      over to the unpacked DOM or not.
 *      For example: Cite, reused transclusions.
 *    - bool fromCache
 *    - array pipelineOpts
 *    - bool unpackOutput (defaults to true)
 *    - string wrapperName (read only when unpackOutput is false)
 * @return Token[]
 */
public static function encapsulateExpansionHTML(
	Env $env, Token $token, array $expansion, array $opts
): array {
	$opts['unpackOutput'] ??= true; // Default

	// Placeholder tokens that carry our subdom through the token
	// processing stages. They are finally unwrapped on the DOM.
	$toks = self::getWrapperTokens( $expansion['domFragment'], $opts );
	$opener = $toks[0];

	// Mark the placeholder as a DOMFragment so that it gets unwrapped later.
	$typeOf = 'mw:DOMFragment';
	if ( !$opts['unpackOutput'] ) {
		$typeOf .= '/sealed/' . $opts['wrapperName'];
	}
	$opener->setAttribute( 'typeof', $typeOf );

	// The HTML fragment rides along in data-parsoid.html of the first wrapper token.
	$opener->dataParsoid->html = $expansion['html'];

	// Pass through setDSR flag
	if ( !empty( $opts['setDSR'] ) ) {
		$opener->dataParsoid->setTempFlag( TempData::SET_DSR, $opts['setDSR'] );
	}

	// Pass through fromCache flag
	if ( !empty( $opts['fromCache'] ) ) {
		$opener->dataParsoid->setTempFlag( TempData::FROM_CACHE, $opts['fromCache'] );
	}

	// Transfer the tsr.
	// The first token gets the full width, the following tokens zero width.
	$tsr = $opts['tsr'] ?? $token->dataParsoid->tsr ?? null;
	if ( $tsr ) {
		$opener->dataParsoid->tsr = $tsr;
		$opener->dataParsoid->extTagOffsets = $token->dataParsoid->extTagOffsets ?? null;

		// XXX to investigate: if $tsr->end is null, then we're losing
		// the 'hint' we'd like to provide here that this is a zero-width
		// source range.
		// ->end can be set to null by WikiLinkHandler::bailTokens()
		$zeroWidth = new SourceRange( $tsr->end, $tsr->end );
		foreach ( array_slice( $toks, 1 ) as $followingTok ) {
			$followingTok->dataParsoid->tsr = clone $zeroWidth;
		}
	}

	return $toks;
}
500 | |
/**
 * Move the accumulated nodes into a new <span> that takes the place of the
 * first accumulated node, flag that span as a wrapper in its data-parsoid,
 * and empty the accumulator.
 *
 * @param Document $doc Document used to create the span.
 * @param array &$textCommentAccum Nodes to wrap (must be non-empty); reset to [] on return.
 */
private static function wrapAccum(
	Document $doc, array &$textCommentAccum
): void {
	$wrapper = $doc->createElement( 'span' );

	// The span goes where the first accumulated node currently sits ...
	$anchor = $textCommentAccum[0];
	$anchor->parentNode->insertBefore( $wrapper, $anchor );

	// ... and then adopts every accumulated node, in order.
	foreach ( $textCommentAccum as $accumulated ) {
		$wrapper->appendChild( $accumulated );
	}

	$wrapperDp = new DataParsoid;
	$wrapperDp->setTempFlag( TempData::WRAPPER );
	DOMDataUtils::setDataParsoid( $wrapper, $wrapperDp );

	$textCommentAccum = [];
}
516 | |
/**
 * Wrap text and comment nodes in a node list into spans, so that all
 * top-level nodes are elements. Consecutive text/comment nodes share a
 * single span. An empty node list is a no-op.
 *
 * @param NodeList $nodes List of DOM nodes to wrap, mix of node types.
 * @param ?Node $startAt
 *    If given, nodes before this one are skipped; nothing is wrapped
 *    when it is not part of $nodes.
 * @param ?Node $stopAt
 *    If given, processing ends after this node (inclusive).
 */
public static function addSpanWrappers(
	$nodes,
	?Node $startAt = null,
	?Node $stopAt = null
): void {
	// Nothing to wrap in an empty list. Bail out before dereferencing
	// item( 0 ), which is null in that case.
	$firstNode = $nodes->item( 0 );
	if ( $firstNode === null ) {
		return;
	}

	$textCommentAccum = [];
	$doc = $firstNode->ownerDocument;

	// Build a real array out of nodes.
	//
	// Operating directly on DOM child-nodes array
	// and manipulating them by adding span wrappers
	// changes the traversal itself
	$nodeBuf = [];
	foreach ( $nodes as $node ) {
		$nodeBuf[] = $node;
	}

	$start = ( $startAt === null );
	foreach ( $nodeBuf as $node ) {
		if ( !$start ) {
			if ( $startAt !== $node ) {
				continue;
			}
			$start = true;
		}
		if ( $node instanceof Text || $node instanceof Comment ) {
			$textCommentAccum[] = $node;
		} elseif ( count( $textCommentAccum ) ) {
			// An element ends the current run of text/comment nodes
			self::wrapAccum( $doc, $textCommentAccum );
		}
		if ( $node === $stopAt ) {
			break;
		}
	}

	// Flush a run that extends to the end of the processed range
	if ( count( $textCommentAccum ) ) {
		self::wrapAccum( $doc, $textCommentAccum );
	}
}
565 | |
/**
 * Convert a HTML5 DOM into a mw:DOMFragment and generate appropriate
 * tokens to insert into the token stream for further processing.
 *
 * The fragment is registered through makeExpansion() and represented by
 * the placeholder tokens from encapsulateExpansionHTML(). According to the
 * original notes, the DOMPostProcessor later unpacks the fragment and
 * inserts the HTML back into the DOM.
 *
 * @param Env $env
 *    The active environment/context.
 * @param Token $token
 *    The token that generated the DOM.
 * @param DocumentFragment $domFragment
 *    The DOM that the token expanded to.
 * @param array $opts
 *    Options to be passed onto the encapsulation code
 *    See encapsulateExpansionHTML's doc. for more info about these options.
 * @return Token[]
 */
public static function tunnelDOMThroughTokens(
	Env $env, Token $token, DocumentFragment $domFragment, array $opts
): array {
	// Placeholder tokens carry our subdom through the token processing
	// stages; they are finally unwrapped on the DOM.
	return self::encapsulateExpansionHTML(
		$env,
		$token,
		self::makeExpansion( $env, $domFragment ),
		$opts
	);
}
592 | |
/**
 * Register a DOM fragment with the environment under a newly allocated
 * fragment id and describe it as an expansion.
 *
 * @param Env $env
 * @param DocumentFragment $domFragment
 * @return array
 *    - DocumentFragment domFragment The fragment that was passed in.
 *    - html The fragment id returned by Env::newFragmentId().
 */
public static function makeExpansion(
	Env $env, DocumentFragment $domFragment
): array {
	$id = $env->newFragmentId();
	$env->setDOMFragment( $id, $domFragment );

	$expansion = [];
	$expansion['domFragment'] = $domFragment;
	$expansion['html'] = $id;
	return $expansion;
}
600 | |
/**
 * Walk a chain of sibling nodes, recursing into element children, looking
 * for mw:Transclusion / mw:Extension/* elements that carry an "about"
 * attribute. The about-siblings of such an element are skipped as a group.
 *
 * NOTE: storing the expansions has not been ported. As soon as a matching
 * element with a non-empty data-parsoid "src" is encountered, an
 * UnreachableException is thrown, so $expansions is currently never
 * modified.
 *
 * @param Env $env
 * @param array &$expansions
 *    Accumulator with 'transclusions', 'extensions' and 'media' entries.
 * @param ?Node $node
 *    First node of the sibling chain. Null (e.g. the firstChild of a
 *    childless element) is accepted and means there is nothing to visit.
 * @throws UnreachableException when an expansion would have to be recorded.
 */
private static function doExtractExpansions( Env $env, array &$expansions, ?Node $node ): void {
	while ( $node ) {
		if ( $node instanceof Element ) {
			if ( DOMUtils::matchTypeOf( $node, '#^mw:(Transclusion$|Extension/)#' ) &&
				$node->hasAttribute( 'about' )
			) {
				$dp = DOMDataUtils::getDataParsoid( $node );
				$about = DOMCompat::getAttribute( $node, 'about' );
				$nodes = WTUtils::getAboutSiblings( $node, $about );

				// Transclusions and extensions are keyed by their source.
				// Anything else has no key.
				// XXX gwicke: use proper key that is not source-based?
				// This also needs to work for transclusion output.
				$isKeyedBySrc = DOMUtils::hasTypeOf( $node, 'mw:Transclusion' ) ||
					DOMUtils::matchTypeOf( $node, '#^mw:Extension/#' );
				$key = $isKeyedBySrc ? ( $dp->src ?? null ) : null;

				if ( $key ) {
					// FIXME: makeExpansion return type changed, so recording
					// the expansion under $key in the matching bucket of
					// $expansions still has to be ported.
					throw new UnreachableException( 'Callsite was not ported!' );
				}

				// Continue after the last node of the about group
				$node = end( $nodes );
			} else {
				self::doExtractExpansions( $env, $expansions, $node->firstChild );
			}
		}
		$node = $node->nextSibling;
	}
}
641 | |
/**
 * Extract transclusion and extension expansions from a DOM, and return
 * them grouped into three buckets:
 *   {
 *     transclusions: { 'key1': <expansion>, ... },
 *     extensions: { 'key2': <expansion>, ... },
 *     media: { 'key3': <expansion>, ... }
 *   }
 *
 * NOTE(review): the code that fills the buckets is not ported yet (the
 * traversal throws instead of recording), so the exact shape of an
 * <expansion> entry cannot be confirmed from this file. The buckets come
 * back empty whenever the traversal completes.
 *
 * @param Env $env
 * @param Element $body Root whose descendants are scanned; may be empty.
 * @return array
 */
public static function extractExpansions( Env $env, Element $body ): array {
	$expansions = [
		'transclusions' => [],
		'extensions' => [],
		'media' => []
	];

	// Kick off the extraction. An empty body has no first child and
	// the traversal must not be handed null.
	$firstChild = $body->firstChild;
	if ( $firstChild !== null ) {
		self::doExtractExpansions( $env, $expansions, $firstChild );
	}

	return $expansions;
}
680 | |
/**
 * Fetches output of encapsulations that return HTML from the legacy parser.
 *
 * @param Env $env
 * @param string $source Wikitext handed to DataAccess::parseWikitext().
 * @return ?DocumentFragment
 *    The returned HTML, with its P wrapper stripped, parsed into a
 *    fragment of the top-level document; null when the returned HTML is
 *    the empty string.
 */
public static function fetchHTML( Env $env, string $source ): ?DocumentFragment {
	$html = $env->getDataAccess()->parseWikitext(
		$env->getPageConfig(), $env->getMetadata(), $source
	);

	if ( $html === '' ) {
		return null;
	}

	$unwrapped = DOMUtils::stripPWrapper( $html );
	return DOMUtils::parseHTMLToFragment( $env->topLevelDoc, $unwrapped );
}
692 | } |