Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 205 |
|
0.00% |
0 / 15 |
CRAP | |
0.00% |
0 / 1 |
PipelineUtils | |
0.00% |
0 / 205 |
|
0.00% |
0 / 15 |
5112 | |
0.00% |
0 / 1 |
getDOMFragmentToken | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
12 | |||
processContentInPipeline | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
expandAttrValueToDOM | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
6 | |||
expandAttrValuesToDOM | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
domAttrsToTagAttrs | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
20 | |||
convertDOMtoTokens | |
0.00% |
0 / 18 |
|
0.00% |
0 / 1 |
56 | |||
getWrapperTokens | |
0.00% |
0 / 47 |
|
0.00% |
0 / 1 |
462 | |||
encapsulateExpansionHTML | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
42 | |||
wrapAccum | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
6 | |||
addSpanWrappers | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
110 | |||
tunnelDOMThroughTokens | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
makeExpansion | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
doExtractExpansions | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
72 | |||
extractExpansions | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
fetchHTML | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Assert\UnreachableException; |
8 | use Wikimedia\Parsoid\Config\Env; |
9 | use Wikimedia\Parsoid\DOM\Comment; |
10 | use Wikimedia\Parsoid\DOM\Document; |
11 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
12 | use Wikimedia\Parsoid\DOM\Element; |
13 | use Wikimedia\Parsoid\DOM\Node; |
14 | use Wikimedia\Parsoid\DOM\NodeList; |
15 | use Wikimedia\Parsoid\DOM\Text; |
16 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
17 | use Wikimedia\Parsoid\NodeData\TempData; |
18 | use Wikimedia\Parsoid\Tokens\CommentTk; |
19 | use Wikimedia\Parsoid\Tokens\EndTagTk; |
20 | use Wikimedia\Parsoid\Tokens\EOFTk; |
21 | use Wikimedia\Parsoid\Tokens\KV; |
22 | use Wikimedia\Parsoid\Tokens\SelfclosingTagTk; |
23 | use Wikimedia\Parsoid\Tokens\SourceRange; |
24 | use Wikimedia\Parsoid\Tokens\TagTk; |
25 | use Wikimedia\Parsoid\Tokens\Token; |
26 | use Wikimedia\Parsoid\Wt2Html\Frame; |
27 | |
/**
 * This file contains parsing pipeline related utilities.
 */
class PipelineUtils {
	/**
	 * Creates a dom-fragment-token for processing 'content' (an array of tokens)
	 * in its own subpipeline all the way to DOM. These tokens will be processed
	 * by their own handler (DOMFragmentBuilder) in the last stage of the async
	 * pipeline.
	 *
	 * srcOffsets should always be provided to process top-level page content in a
	 * subpipeline. Without it, DSR computation and template wrapping cannot be done
	 * in the subpipeline. While unpackDOMFragment can do this on unwrapping, that can
	 * be a bit fragile and makes dom-fragments a leaky abstraction by leaking subpipeline
	 * processing into the top-level pipeline.
	 *
	 * @param Token[]|string $content The array of tokens to process.
	 * @param SourceRange $srcOffsets Wikitext source offsets (start/end) of these tokens.
	 * @param array $opts Parsing options.
	 *    - Token token The token that generated the content. Although $opts has
	 *      a default, this entry is read unconditionally and its
	 *      dataParsoid->tsr is dereferenced, so callers must supply it.
	 *    - bool inlineContext Is this DOM fragment used in an inline context?
	 *      Defaults to false.
	 *    - bool inPHPBlock Forwarded on the token as "1"/"0". Defaults to false.
	 * @return SelfclosingTagTk
	 */
	public static function getDOMFragmentToken(
		$content, SourceRange $srcOffsets, array $opts = []
	): SelfclosingTagTk {
		$token = $opts['token'];
		return new SelfclosingTagTk( 'mw:dom-fragment-token', [
			new KV( 'contextTok', $token, $token->dataParsoid->tsr->expandTsrV() ),
			new KV( 'content', $content, $srcOffsets->expandTsrV() ),
			// Boolean options are serialized as "1"/"0" strings.
			new KV( 'inlineContext', ( $opts['inlineContext'] ?? false ) ? "1" : "0" ),
			new KV( 'inPHPBlock', ( $opts['inPHPBlock'] ?? false ) ? "1" : "0" ),
		] );
	}

	/**
	 * Processes content (wikitext, array of tokens, whatever) in its own
	 * pipeline based on options.
	 *
	 * @param Env $env The environment/context for the expansion.
	 * @param Frame $frame
	 *    The parent frame within which the expansion is taking place.
	 *    Used for template expansion and source text tracking.
	 * @param string|Token|Token[] $content
	 *    This could be wikitext or single token or an array of tokens.
	 *    How this content is processed depends on what kind of pipeline
	 *    is constructed specified by opts.
	 * @param array $opts
	 *    Processing options that specify pipeline-type, opts, and callbacks.
	 *    'pipelineType', 'pipelineOpts' and 'sol' are read without a fallback,
	 *    so callers are expected to always set them.
	 *    - string pipelineType
	 *    - array pipelineOpts
	 *    - array tplArgs - if set, defines parameters for the child frame
	 *      - string tplArgs['name']
	 *      - KV[] tplArgs['attribs']
	 *    - string srcText - if set, defines the source text for the expansion;
	 *      otherwise the frame's source text is used
	 *    - SourceRange srcOffsets - if set, defines the range within the
	 *          source text that $content corresponds to
	 *    - bool sol Whether tokens should be processed in start-of-line context.
	 * @return Token[]|DocumentFragment (depending on pipeline type)
	 */
	public static function processContentInPipeline(
		Env $env, Frame $frame, $content, array $opts
	) {
		// Build a pipeline
		$pipeline = $env->getPipelineFactory()->getPipeline(
			$opts['pipelineType'],
			$opts['pipelineOpts']
		);

		$pipeline->init( [
			'toplevel' => false,
			'frame' => $frame,
			'tplArgs' => $opts['tplArgs'] ?? null,
			'srcText' => $opts['srcText'] ?? $frame->getSrcText(),
			'srcOffsets' => $opts['srcOffsets'] ?? null,
		] );

		// Off the starting block ... ready, set, go!
		return $pipeline->parse( $content, [ 'sol' => $opts['sol'] ] );
	}

	/**
	 * Expands value all the way to DOM.
	 *
	 * @param Env $env
	 *    The environment/context for the expansion.
	 * @param Frame $frame
	 *    The parent frame within which the expansion is taking place.
	 *    Used for template expansion and source text tracking.
	 * @param array $v
	 *    The value to process.
	 *    The value is expected to be an associative array with a "html" property.
	 *    The html property is expanded to DOM only if it is an array (of tokens),
	 *    in which case a "srcOffsets" entry is read as well.
	 *    Non-arrays are passed back unexpanded.
	 * @param bool $expandTemplates
	 *    Should any templates encountered here be expanded
	 *    (usually false for nested templates since they are never directly editable).
	 * @param bool $inTemplate
	 *    Unexpanded templates can occur in the content of extension tags.
	 * @return array $v with "html" replaced by the serialized expansion (when it
	 *    was a token array) and with "srcOffsets" removed.
	 */
	public static function expandAttrValueToDOM(
		Env $env, Frame $frame, array $v, bool $expandTemplates, bool $inTemplate
	): array {
		if ( is_array( $v['html'] ?? null ) ) {
			// Set up pipeline options
			$opts = [
				'pipelineType' => 'tokens/x-mediawiki/expanded',
				'pipelineOpts' => [
					'attrExpansion' => true,
					'inlineContext' => true,
					'expandTemplates' => $expandTemplates,
					'inTemplate' => $inTemplate
				],
				'srcOffsets' => $v['srcOffsets'],
				'sol' => true
			];
			// The token stream handed to the pipeline is terminated with an EOF token.
			$content = array_merge( $v['html'], [ new EOFTk() ] );
			$domFragment = self::processContentInPipeline(
				$env, $frame, $content, $opts
			);
			// Since we aren't at the top level, data attrs
			// were not applied in cleanup. However, tmp
			// was stripped.
			$v['html'] = ContentUtils::ppToXML(
				$domFragment, [ 'innerXML' => true ]
			);
		}
		// Remove srcOffsets after value is expanded, so they don't show
		// up in the output data-mw attribute
		unset( $v['srcOffsets'] );
		return $v;
	}

	/**
	 * Expands each value of a list via expandAttrValueToDOM().
	 *
	 * @param Env $env
	 *    The environment/context for the expansion.
	 * @param Frame $frame
	 *    The parent frame within which the expansion is taking place.
	 *    Used for template expansion and source text tracking.
	 *    (Left untyped in the signature; it is forwarded to a Frame-typed
	 *    parameter whenever an array element is expanded.)
	 * @param array $vals
	 *    Array of values to expand.
	 *    Non-array elements of $vals are passed back unmodified.
	 *    If an array element, it is expected to be an associative array with a "html" property.
	 *    The html property is expanded to DOM only if it is an array (of tokens).
	 * @param bool $expandTemplates
	 *    Should any templates encountered here be expanded
	 *    (usually false for nested templates since they are never directly editable).
	 * @param bool $inTemplate
	 *    Unexpanded templates can occur in the content of extension tags.
	 * @return array List of processed values, in the order of $vals (keys are not kept).
	 */
	public static function expandAttrValuesToDOM(
		Env $env, $frame, array $vals, bool $expandTemplates, bool $inTemplate
	): array {
		$ret = [];
		foreach ( $vals as $v ) {
			// expandAttrValueToDOM() only accepts arrays; anything else is
			// returned as-is, as documented above, instead of raising a TypeError.
			$ret[] = is_array( $v )
				? self::expandAttrValueToDOM( $env, $frame, $v, $expandTemplates, $inTemplate )
				: $v;
		}
		return $ret;
	}

	/**
	 * Convert the attributes of a DOM node into token attributes. The node
	 * comes from a DOM whose data attributes are stored outside the DOM.
	 *
	 * @param Element $node
	 * @param string[] $attrs Attribute name => value map for $node.
	 * @return array Associative array with:
	 *    - 'attrs': KV[] built from $attrs (minus the data-object attribute),
	 *      plus a JSON-encoded 'data-mw' entry when the node has valid data-mw
	 *    - 'dataAttrs': the result of DOMDataUtils::getDataParsoid() for $node
	 */
	private static function domAttrsToTagAttrs( Element $node, array $attrs ): array {
		$out = [];
		foreach ( $attrs as $name => $value ) {
			if ( $name !== DOMDataUtils::DATA_OBJECT_ATTR_NAME ) {
				$out[] = new KV( $name, $value );
			}
		}
		if ( DOMDataUtils::validDataMw( $node ) ) {
			$out[] = new KV( 'data-mw', PHPUtils::jsonEncode( DOMDataUtils::getDataMw( $node ) ) );
		}
		return [ 'attrs' => $out, 'dataAttrs' => DOMDataUtils::getDataParsoid( $node ) ];
	}

	/**
	 * Convert a DOM to tokens. Data attributes for nodes are stored outside the DOM.
	 *
	 * @param Node $node The root of the DOM tree to convert to tokens
	 * @param Token[] $tokBuf Tokens collected so far; passed by value, so the
	 *    caller must use the returned array.
	 * @return array $tokBuf with the tokens for $node (and its subtree) appended
	 */
	private static function convertDOMtoTokens( Node $node, array $tokBuf ): array {
		if ( $node instanceof Element ) {
			$nodeName = DOMCompat::nodeName( $node );
			$attrInfo = self::domAttrsToTagAttrs( $node, DOMUtils::attributes( $node ) );

			if ( Utils::isVoidElement( $nodeName ) ) {
				$tokBuf[] = new SelfclosingTagTk( $nodeName, $attrInfo['attrs'], $attrInfo['dataAttrs'] );
			} else {
				$tokBuf[] = new TagTk( $nodeName, $attrInfo['attrs'], $attrInfo['dataAttrs'] );
				for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
					$tokBuf = self::convertDOMtoTokens( $child, $tokBuf );
				}
				$endTag = new EndTagTk( $nodeName );
				// Keep stx parity
				if ( WTUtils::isLiteralHTMLNode( $node ) ) {
					$endTag->dataParsoid->stx = 'html';
				}
				$tokBuf[] = $endTag;
			}
		} elseif ( $node instanceof Text ) {
			PHPUtils::pushArray( $tokBuf, TokenUtils::newlinesToNlTks( $node->nodeValue ) );
		} elseif ( $node instanceof Comment ) {
			$tokBuf[] = new CommentTk( $node->nodeValue );
		} else {
			// getWrapperTokens calls convertDOMtoTokens with an Element
			// and children of dom elements are always text/comment/elements
			// which are all covered above.
			throw new UnreachableException( "Should never get here!" );
		}

		return $tokBuf;
	}

	/**
	 * Get tokens representing a DOM forest (from transclusions, extensions,
	 * whatever that were generated as part of a separate processing pipeline)
	 * in the token stream. These tokens will tunnel the subtree through the
	 * token processing while preserving token stream semantics as if
	 * the DOM had been converted to tokens.
	 *
	 * @param DocumentFragment $domFragment List of DOM nodes that need to be tunneled through.
	 * @param array $opts
	 *    'unpackOutput' is read without a fallback here;
	 *    encapsulateExpansionHTML() sets its default before calling.
	 * @see encapsulateExpansionHTML's doc. for more info about these options.
	 * @return Token[] List of token representatives.
	 */
	private static function getWrapperTokens(
		DocumentFragment $domFragment, array $opts
	): array {
		if ( !$domFragment->hasChildNodes() ) {
			return [ new TagTk( 'span' ), new EndTagTk( 'span' ) ];
		}

		$node = $domFragment->firstChild;

		// Do we represent this with inline or block elements?
		// This is to ensure that we get p-wrapping correct.
		//
		// * If all content is inline, we use inline-elements to represent this
		// so that this content gets swallowed into the P tag that wraps
		// adjacent inline content.
		//
		// * If any part of this is a block content, we treat extension content
		// independent of surrounding content and don't want inline content
		// here to be swallowed into a P tag that wraps adjacent inline content.
		//
		// This behavior ensures that we and clients can "drop-in" extension content
		// into the DOM without messing with fixing up paragraph tags of surrounding
		// content. It could potentially introduce minor rendering differences when
		// compared to PHP parser output, but we'll swallow it for now.
		$wrapperType = 'INLINE';
		if ( !empty( $opts['pipelineOpts']['inlineContext'] ) ) {
			// If the DOM fragment is being processed in the context where P wrapping
			// has been suppressed, we represent the DOM fragment with inline-tokens.
			//
			// FIXME(SSS): Looks like we have some "impedance mismatch" here. But, this
			// is correct in scenarios where link-content or image-captions are being
			// processed in a sub-pipeline and we don't want a <div> in the link-caption
			// to cause the <a>..</a> to get split apart.
			//
			// Filed as T49963
		} elseif ( !$opts['unpackOutput'] ) {
			// Fragments that won't be unpacked aren't amenable to inspection, since
			// the ultimate content is unknown. For example, refs shuttle content
			// through treebuilding that ends up in the references list.
			//
			// FIXME(arlolra): Do we need a mechanism to specify content
			// categories?
		} else {
			foreach ( $domFragment->childNodes as $n ) {
				if (
					DOMUtils::isWikitextBlockNode( $n ) ||
					DOMUtils::hasBlockElementDescendant( $n )
				) {
					$wrapperType = 'BLOCK';
					break;
				}
			}
		}

		// Every branch below assigns $wrapperName.
		if ( $wrapperType === 'BLOCK' && !DOMUtils::isWikitextBlockNode( $node ) ) {
			$wrapperName = 'div';
		} elseif ( DOMCompat::nodeName( $node ) === 'a' ) {
			// Do not use 'A' as a wrapper node because it could
			// end up getting nested inside another 'A' and the DOM
			// structure can change where the wrapper tokens are no
			// longer siblings.
			// Ex: "[http://foo.com Bad nesting [[Here]]].
			$wrapperName = 'span';
		} elseif (
			in_array( DOMCompat::nodeName( $node ), [ 'style', 'script' ], true ) &&
			( $node->nextSibling !== null )
		) {
			// <style>/<script> tags are not fostered, so if we're wrapping
			// more than a single node, they aren't a good representation for
			// the content. It can lead to fosterable content being inserted
			// in a fosterable position after treebuilding is done, which isn't
			// roundtrippable.
			$wrapperName = 'span';
		} elseif ( !( $node instanceof Element ) ) {
			$wrapperName = 'span';
		} else {
			$wrapperName = DOMCompat::nodeName( $node );
		}

		if ( $node instanceof Element ) {
			Assert::invariant(
				// No need to look for data-mw as well.
				// Nodes that have data-mw also have data-parsoid.
				!$node->hasAttribute( 'data-parsoid' ),
				"Expected node to have its data attributes loaded" );

			$nodeData = DOMDataUtils::getNodeData( $node )->cloneNodeData();

			if ( $wrapperName !== DOMCompat::nodeName( $node ) ) {
				// Create a copy of the node without children
				$workNode = $node->ownerDocument->createElement( $wrapperName );

				// Copy over attributes
				foreach ( DOMUtils::attributes( $node ) as $name => $value ) {
					// "typeof" is ignored since it'll be removed below.
					if ( $name !== 'typeof' ) {
						$workNode->setAttribute( $name, $value );
					}
				}

				// We are applying a different wrapper.
				// So, node's data-parsoid isn't applicable.
				$nodeData->parsoid = new DataParsoid;
			} else {
				// Shallow clone since we don't want to convert the whole tree to tokens.
				$workNode = $node->cloneNode( false );

				// Reset 'tsr' since it isn't applicable. Neither is
				// any auxiliary info like 'endTSR'.
				// FIXME: The above comment is only true if we are reusing
				// DOM fragments from cache from previous revisions in
				// incremental parsing scenarios. See T98992
				if ( isset( $nodeData->parsoid->tsr ) ) {
					$nodeData->parsoid->tsr = null;
				}
				if ( isset( $nodeData->parsoid->tmp->endTSR ) ) {
					unset( $nodeData->parsoid->tmp->endTSR );
				}

				// The "in transclusion" flag was set on the first child for template
				// wrapping in the nested pipeline, and doesn't apply to the dom
				// fragment wrapper in this pipeline. Keeping it around can induce
				// template wrapping of a foster box if the dom fragment is found in
				// a fosterable position.
				if (
					isset( $nodeData->parsoid ) &&
					$nodeData->parsoid->getTempFlag( TempData::IN_TRANSCLUSION )
				) {
					$nodeData->parsoid->tmp->setFlag( TempData::IN_TRANSCLUSION, false );
				}
			}

			DOMDataUtils::setNodeData( $workNode, $nodeData );
		} else {
			// Text/comment first child: represent it with a bare wrapper element.
			$workNode = $node->ownerDocument->createElement( $wrapperName );
		}

		$tokens = self::convertDOMtoTokens( $workNode, [] );

		// Remove the typeof attribute from the first token.
		// It will be replaced with mw:DOMFragment.
		$tokens[0]->removeAttribute( 'typeof' );

		// Remove the about attribute from the first token.
		// We want to be able to distinguish when this wrapper was template
		// annotated.
		$tokens[0]->removeAttribute( 'about' );

		return $tokens;
	}

	/**
	 * Generates wrapper tokens for a HTML expansion -- the wrapper
	 * tokens are placeholders that adequately represent semantics
	 * of the HTML DOM for the purposes of additional token transformations
	 * that will be applied to them.
	 *
	 * @param Env $env
	 *    The active environment/context.
	 * @param Token $token
	 *    The token that generated the DOM.
	 * @param array $expansion
	 *    - string html HTML of the expansion. (makeExpansion() stores the
	 *      fragment id it registered with the Env under this key.)
	 *    - DocumentFragment domFragment Outermost nodes of the HTML.
	 * @param array $opts
	 *    - SourceRange tsr
	 *        The TSR to set on the generated tokens. This TSR is
	 *        used to compute DSR on the placeholder tokens.
	 *        The computed DSR is transferred over to the unpacked DOM
	 *        if setDSR is true (see below).
	 *        Falls back to the TSR of $token when not set.
	 *    - bool setDSR
	 *        When the DOM fragment is unpacked, this option governs
	 *        whether the DSR from the placeholder node is transferred
	 *        over to the unpacked DOM or not.
	 *        For example: Cite, reused transclusions.
	 *    - bool fromCache
	 *    - array pipelineOpts
	 *    - bool unpackOutput Defaults to true.
	 *    - string wrapperName Read only when unpackOutput is false, to build
	 *        the "mw:DOMFragment/sealed/<wrapperName>" typeof.
	 * @return Token[]
	 */
	public static function encapsulateExpansionHTML(
		Env $env, Token $token, array $expansion, array $opts
	): array {
		$opts['unpackOutput'] ??= true; // Default
		// Get placeholder tokens to get our subdom through the token processing
		// stages. These will be finally unwrapped on the DOM.
		$toks = self::getWrapperTokens( $expansion['domFragment'], $opts );
		$firstWrapperToken = $toks[0];

		// Add the DOMFragment type so that we get unwrapped later.
		$fragmentType = 'mw:DOMFragment' . ( !$opts['unpackOutput'] ? '/sealed/' . $opts['wrapperName'] : '' );
		$firstWrapperToken->setAttribute( 'typeof', $fragmentType );

		// Assign the HTML fragment to the data-parsoid.html on the first wrapper token.
		$firstWrapperToken->dataParsoid->html = $expansion['html'];

		// Pass through setDSR flag
		if ( !empty( $opts['setDSR'] ) ) {
			$firstWrapperToken->dataParsoid->setTempFlag(
				TempData::SET_DSR, $opts['setDSR'] );
		}

		// Pass through fromCache flag
		if ( !empty( $opts['fromCache'] ) ) {
			$firstWrapperToken->dataParsoid->setTempFlag(
				TempData::FROM_CACHE, $opts['fromCache'] );
		}

		// Transfer the tsr.
		// The first token gets the full width, the following tokens zero width.
		$tokenTsr = $opts['tsr'] ?? $token->dataParsoid->tsr ?? null;
		if ( $tokenTsr ) {
			$firstWrapperToken->dataParsoid->tsr = $tokenTsr;
			$firstWrapperToken->dataParsoid->extTagOffsets = $token->dataParsoid->extTagOffsets ?? null;
			// XXX to investigate: if $tokenTsr->end is null, then we're losing
			// the 'hint' we'd like to provide here that this is a zero-width
			// source range.
			// ->end can be set to null by WikiLinkHandler::bailTokens()
			$endTsr = new SourceRange( $tokenTsr->end, $tokenTsr->end );
			// The token list is not modified in the loop, so count it once.
			$numToks = count( $toks );
			for ( $i = 1; $i < $numToks; $i++ ) {
				// Each token gets its own copy of the zero-width range.
				$toks[$i]->dataParsoid->tsr = clone $endTsr;
			}
		}

		return $toks;
	}

	/**
	 * Wrap the accumulated nodes in a new <span> that is inserted where the
	 * first accumulated node was, mark the span's data-parsoid with the
	 * WRAPPER temp flag, and reset the accumulator.
	 *
	 * @param Document $doc Document used to create the span.
	 * @param Node[] &$textCommentAccum Nodes to wrap. Must be non-empty (the
	 *    first entry is dereferenced); emptied on return.
	 */
	private static function wrapAccum(
		Document $doc, array &$textCommentAccum
	): void {
		// Wrap accumulated nodes in a span
		$span = $doc->createElement( 'span' );
		$parentNode = $textCommentAccum[0]->parentNode;
		$parentNode->insertBefore( $span, $textCommentAccum[0] );
		foreach ( $textCommentAccum as $n ) {
			$span->appendChild( $n );
		}
		$dp = new DataParsoid;
		$dp->setTempFlag( TempData::WRAPPER );
		DOMDataUtils::setDataParsoid( $span, $dp );
		$textCommentAccum = [];
	}

	/**
	 * Wrap text and comment nodes in a node list into spans, so that all
	 * top-level nodes are elements.
	 *
	 * Consecutive text/comment nodes share a single span. An empty list is
	 * a no-op.
	 *
	 * @param NodeList $nodes List of DOM nodes to wrap, mix of node types.
	 * @param ?Node $startAt If set, nodes before this one are skipped.
	 * @param ?Node $stopAt If set, processing stops after this node.
	 */
	public static function addSpanWrappers(
		$nodes,
		?Node $startAt = null,
		?Node $stopAt = null
	): void {
		// Nothing to wrap in an empty list; bail out before dereferencing
		// the (non-existent) first node.
		$firstNode = $nodes->item( 0 );
		if ( $firstNode === null ) {
			return;
		}

		$textCommentAccum = [];
		$doc = $firstNode->ownerDocument;

		// Build a real array out of nodes.
		//
		// Operating directly on DOM child-nodes array
		// and manipulating them by adding span wrappers
		// changes the traversal itself
		$nodeBuf = [];
		foreach ( $nodes as $node ) {
			$nodeBuf[] = $node;
		}

		$start = ( $startAt === null );
		foreach ( $nodeBuf as $node ) {
			if ( !$start ) {
				if ( $startAt !== $node ) {
					continue;
				}
				$start = true;
			}
			if ( $node instanceof Text || $node instanceof Comment ) {
				$textCommentAccum[] = $node;
			} elseif ( count( $textCommentAccum ) ) {
				// An element ends the current run of text/comment nodes.
				self::wrapAccum( $doc, $textCommentAccum );
			}
			if ( $node === $stopAt ) {
				break;
			}
		}

		// Flush a trailing run of text/comment nodes.
		if ( count( $textCommentAccum ) ) {
			self::wrapAccum( $doc, $textCommentAccum );
		}
	}

	/**
	 * Convert a HTML5 DOM into a mw:DOMFragment and generate appropriate
	 * tokens to insert into the token stream for further processing.
	 *
	 * The DOMPostProcessor will unpack the fragment and insert the HTML
	 * back into the DOM.
	 *
	 * @param Env $env
	 *    The active environment/context.
	 * @param Token $token
	 *    The token that generated the DOM.
	 * @param DocumentFragment $domFragment
	 *    The DOM that the token expanded to.
	 * @param array $opts
	 *    Options to be passed onto the encapsulation code
	 *    See encapsulateExpansionHTML's doc. for more info about these options.
	 * @return Token[]
	 */
	public static function tunnelDOMThroughTokens(
		Env $env, Token $token, DocumentFragment $domFragment, array $opts
	): array {
		// Get placeholder tokens to get our subdom through the token processing
		// stages. These will be finally unwrapped on the DOM.
		$expansion = self::makeExpansion( $env, $domFragment );
		return self::encapsulateExpansionHTML( $env, $token, $expansion, $opts );
	}

	/**
	 * Register a DOM fragment with the environment under a fresh fragment id
	 * and describe it as an expansion.
	 *
	 * @param Env $env
	 * @param DocumentFragment $domFragment
	 * @return array
	 *    - DocumentFragment domFragment The fragment that was passed in.
	 *    - string html The fragment id under which it was stored in $env.
	 */
	public static function makeExpansion(
		Env $env, DocumentFragment $domFragment
	): array {
		$fragmentId = $env->newFragmentId();
		$env->setDOMFragment( $fragmentId, $domFragment );
		return [ 'domFragment' => $domFragment, 'html' => $fragmentId ];
	}

	/**
	 * Walk $node and its following siblings (recursing into elements that are
	 * not transclusion/extension wrappers) looking for expansions to extract.
	 *
	 * NOTE: recording an expansion was never ported; encountering a
	 * transclusion/extension wrapper with a non-empty source key throws.
	 *
	 * @param Env $env
	 * @param array &$expansions Accumulator with 'transclusions', 'extensions'
	 *    and 'media' buckets. Currently only forwarded to recursive calls.
	 * @param ?Node $node First node to visit; null (e.g. the firstChild of a
	 *    childless element) is a no-op.
	 * @throws UnreachableException When an expansion with a source key is found.
	 */
	private static function doExtractExpansions( Env $env, array &$expansions, ?Node $node ): void {
		while ( $node ) {
			if ( $node instanceof Element ) {
				if ( DOMUtils::matchTypeOf( $node, '#^mw:(Transclusion$|Extension/)#' ) &&
					$node->hasAttribute( 'about' )
				) {
					$dp = DOMDataUtils::getDataParsoid( $node );
					$about = DOMCompat::getAttribute( $node, 'about' );
					$nodes = WTUtils::getAboutSiblings( $node, $about );
					if ( DOMUtils::hasTypeOf( $node, 'mw:Transclusion' ) ) {
						// Would be stored in $expansions['transclusions'].
						$key = $dp->src ?? null;
					} elseif ( DOMUtils::matchTypeOf( $node, '#^mw:Extension/#' ) ) {
						// Would be stored in $expansions['extensions'].
						$key = $dp->src ?? null;
					} else {
						// Would be stored in $expansions['media'].
						// XXX gwicke: use proper key that is not
						// source-based? This also needs to work for
						// transclusion output.
						$key = null;
					}

					if ( $key ) {
						// FIXME: makeExpansion return type changed, so the
						// original
						//   $expAccum[$key] = self::makeExpansion( $env, $nodes );
						// can no longer be used as-is.
						throw new UnreachableException( 'Callsite was not ported!' );
					}

					// Skip past the whole about-sibling group.
					$node = end( $nodes );
				} else {
					self::doExtractExpansions( $env, $expansions, $node->firstChild );
				}
			}
			$node = $node->nextSibling;
		}
	}

	/**
	 * Extract transclusion and extension expansions from a DOM, and return
	 * them in a structure like this:
	 *   {
	 *     transclusions: { ... },
	 *     extensions: { ... },
	 *     media: { ... }
	 *   }
	 *
	 * Each bucket is meant to map a source-based key to an expansion, but
	 * storing expansions is currently not ported: the traversal throws when
	 * it finds a transclusion/extension with a source key, so the buckets
	 * are returned empty otherwise.
	 *
	 * @param Env $env
	 * @param Element $body
	 * @return array
	 * @throws UnreachableException See doExtractExpansions().
	 */
	public static function extractExpansions( Env $env, Element $body ): array {
		$expansions = [
			'transclusions' => [],
			'extensions' => [],
			'media' => []
		];
		// Kick off the extraction
		self::doExtractExpansions( $env, $expansions, $body->firstChild );
		return $expansions;
	}

	/**
	 * Fetches output of encapsulations that return HTML from the legacy parser
	 *
	 * @param Env $env
	 * @param string $source Wikitext handed to the data access layer's parseWikitext().
	 * @return ?DocumentFragment The returned HTML, with its p-wrapper stripped,
	 *    parsed into a fragment of the top-level document; null when the
	 *    returned HTML is the empty string.
	 */
	public static function fetchHTML( Env $env, string $source ): ?DocumentFragment {
		$ret = $env->getDataAccess()->parseWikitext(
			$env->getPageConfig(), $env->getMetadata(), $source
		);
		return $ret === '' ? null : DOMUtils::parseHTMLToFragment(
			$env->topLevelDoc, DOMUtils::stripPWrapper( $ret )
		);
	}
}