Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 224 |
|
0.00% |
0 / 16 |
CRAP | |
0.00% |
0 / 1 |
PipelineUtils | |
0.00% |
0 / 224 |
|
0.00% |
0 / 16 |
5402 | |
0.00% |
0 / 1 |
pFragmentToParsoidFragmentMarkers | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 | |||
getDOMFragmentToken | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
12 | |||
processContentInPipeline | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
expandAttrValueToDOM | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
6 | |||
expandAttrValuesToDOM | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
domAttrsToTagAttrs | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
20 | |||
convertDOMtoTokens | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
56 | |||
getWrapperTokens | |
0.00% |
0 / 47 |
|
0.00% |
0 / 1 |
462 | |||
encapsulateExpansionHTML | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
42 | |||
wrapAccum | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
6 | |||
addSpanWrappers | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
110 | |||
tunnelDOMThroughTokens | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
makeExpansion | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
doExtractExpansions | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
72 | |||
extractExpansions | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
fetchHTML | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Assert\UnreachableException; |
8 | use Wikimedia\Parsoid\Config\Env; |
9 | use Wikimedia\Parsoid\DOM\Comment; |
10 | use Wikimedia\Parsoid\DOM\Document; |
11 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
12 | use Wikimedia\Parsoid\DOM\Element; |
13 | use Wikimedia\Parsoid\DOM\Node; |
14 | use Wikimedia\Parsoid\DOM\NodeList; |
15 | use Wikimedia\Parsoid\DOM\Text; |
16 | use Wikimedia\Parsoid\Fragments\PFragment; |
17 | use Wikimedia\Parsoid\Fragments\WikitextPFragment; |
18 | use Wikimedia\Parsoid\NodeData\DataMw; |
19 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
20 | use Wikimedia\Parsoid\NodeData\TempData; |
21 | use Wikimedia\Parsoid\Tokens\CommentTk; |
22 | use Wikimedia\Parsoid\Tokens\EndTagTk; |
23 | use Wikimedia\Parsoid\Tokens\EOFTk; |
24 | use Wikimedia\Parsoid\Tokens\KV; |
25 | use Wikimedia\Parsoid\Tokens\SelfclosingTagTk; |
26 | use Wikimedia\Parsoid\Tokens\SourceRange; |
27 | use Wikimedia\Parsoid\Tokens\TagTk; |
28 | use Wikimedia\Parsoid\Tokens\Token; |
29 | use Wikimedia\Parsoid\Wt2Html\Frame; |
30 | |
31 | /** |
32 | * This file contains parsing pipeline related utilities. |
33 | */ |
34 | class PipelineUtils { |
35 | // keep in sync with internal_strip_marker in Grammar.pegphp |
36 | public const PARSOID_FRAGMENT_PREFIX = '{{#parsoid\0fragment:'; |
37 | |
/**
 * Flattens a PFragment into one wikitext string in which pieces of the
 * fragment have been replaced by parsoid fragment markers, and reports
 * which marker stands for which piece.
 *
 * The fragment is cast to a WikitextPFragment and split. Even-indexed
 * pieces of the split are copied into the string as-is; each odd-indexed
 * piece is replaced by a freshly numbered marker and recorded in the map.
 *
 * @param PFragment $fragment
 * @return array{0:string,1:array<string,PFragment>} An array consisting of
 *   the wikitext string, followed by the marker-to-PFragment map.
 */
public static function pFragmentToParsoidFragmentMarkers( PFragment $fragment ): array {
	// Persists across calls, so a marker number is never handed out twice
	// by this function.
	static $counter = 0;

	$pieces = WikitextPFragment::castFromPFragment( $fragment )->split();
	$numPieces = count( $pieces );

	$markerMap = [];
	$parts = [ $pieces[0] ];
	for ( $idx = 1; $idx < $numPieces; $idx += 2 ) {
		$marker = self::PARSOID_FRAGMENT_PREFIX . $counter . '}}';
		$counter++;
		$markerMap[$marker] = $pieces[$idx];
		array_push( $parts, $marker, $pieces[$idx + 1] );
	}

	return [ implode( '', $parts ), $markerMap ];
}
57 | |
/**
 * Builds a 'mw:dom-fragment-token' self-closing tag that carries 'content'
 * (an array of tokens) so it can be processed in its own subpipeline all
 * the way to DOM. The original documentation states that these tokens are
 * handled by DOMFragmentBuilder in the last stage of the async pipeline.
 *
 * srcOffsets should always be provided to process top-level page content in a
 * subpipeline. Without it, DSR computation and template wrapping cannot be done
 * in the subpipeline. While unpackDOMFragment can do this on unwrapping, that can
 * be a bit fragile and makes dom-fragments a leaky abstraction by leaking subpipeline
 * processing into the top-level pipeline.
 *
 * @param string|Token|array<Token|string> $content The array of tokens to process.
 * @param SourceRange $srcOffsets Wikitext source offsets (start/end) of these tokens.
 * @param array $opts Parsing options.
 *    - Token token The token that generated the content. Although $opts has
 *      a default of [], this entry is read unconditionally (including its
 *      dataParsoid->tsr), so callers must supply it.
 *    - bool inlineContext Is this DOM fragment used in an inline context?
 *      Defaults to false.
 *    - bool inPHPBlock Copied onto the token as "1"/"0". Defaults to false.
 * @return SelfclosingTagTk
 */
public static function getDOMFragmentToken(
	$content, SourceRange $srcOffsets, array $opts = []
): SelfclosingTagTk {
	$contextTok = $opts['token'];

	// Boolean options travel on the token as the strings "1" / "0".
	$inlineFlag = ( $opts['inlineContext'] ?? false ) ? "1" : "0";
	$phpBlockFlag = ( $opts['inPHPBlock'] ?? false ) ? "1" : "0";

	$attribs = [
		new KV( 'contextTok', $contextTok, $contextTok->dataParsoid->tsr->expandTsrV() ),
		new KV( 'content', $content, $srcOffsets->expandTsrV() ),
		new KV( 'inlineContext', $inlineFlag ),
		new KV( 'inPHPBlock', $phpBlockFlag ),
	];

	return new SelfclosingTagTk( 'mw:dom-fragment-token', $attribs );
}
88 | |
/**
 * Runs content (wikitext, array of tokens, whatever) through a freshly
 * obtained pipeline whose type and options are given by $opts.
 *
 * @param Env $env The environment/context for the expansion.
 * @param Frame $frame
 *    The parent frame within which the expansion is taking place.
 *    Used for template expansion and source text tracking.
 * @param string|Token|array<Token|string>|DocumentFragment|PFragment $content
 *    How this content is processed depends on what kind of pipeline
 *    is constructed specified by opts.
 * @param array $opts
 *    Processing options that specify pipeline-type, opts, and callbacks.
 *    - string pipelineType (required)
 *    - array pipelineOpts (required)
 *    - array tplArgs - if set, defines parameters for the child frame
 *      - string tplArgs['name']
 *      - KV[] tplArgs['attribs']
 *    - string srcText - if set, defines the source text for the expansion;
 *      otherwise the frame's source text is used
 *    - SourceRange srcOffsets - if set, defines the range within the
 *      source text that $content corresponds to
 *    - bool sol Whether tokens should be processed in start-of-line context.
 *      Read unconditionally, so it must be present.
 *    - bool toplevel Whether the pipeline is considered atTopLevel
 *      (defaults to false)
 * @return array<Token|string>|DocumentFragment (depending on pipeline type)
 */
public static function processContentInPipeline(
	Env $env, Frame $frame, $content, array $opts
) {
	// Build a pipeline
	$factory = $env->getPipelineFactory();
	$pipeline = $factory->getPipeline( $opts['pipelineType'], $opts['pipelineOpts'] );

	$initOpts = [
		// NOTE: some pipelines force toplevel to true
		'toplevel' => $opts['toplevel'] ?? false,
		'frame' => $frame,
		'tplArgs' => $opts['tplArgs'] ?? null,
		'srcText' => $opts['srcText'] ?? $frame->getSrcText(),
		'srcOffsets' => $opts['srcOffsets'] ?? null,
	];
	$pipeline->init( $initOpts );

	// Hand the content over and return whatever the pipeline produces.
	$parseOpts = [ 'sol' => $opts['sol'] ];
	return $pipeline->parse( $content, $parseOpts );
}
135 | |
/**
 * Expands a single attribute value all the way to DOM and serializes the
 * result back into the value's "html" entry.
 *
 * @param Env $env
 *    The environment/context for the expansion.
 * @param Frame $frame
 *    The parent frame within which the expansion is taking place.
 *    Used for template expansion and source text tracking.
 * @param array $v
 *    The value to process.
 *    The value is expected to be an associative array with a "html" property.
 *    The html property is expanded to DOM only if it is an array (of tokens),
 *    in which case a "srcOffsets" entry is also read.
 *    Non-arrays are passed back unexpanded.
 * @param bool $expandTemplates
 *    Should any templates encountered here be expanded
 *    (usually false for nested templates since they are never directly editable).
 * @param bool $inTemplate
 *    Unexpanded templates can occur in the content of extension tags.
 * @return array The value with "html" replaced by its serialized expansion
 *    (when it was an array) and with "srcOffsets" removed.
 */
public static function expandAttrValueToDOM(
	Env $env, Frame $frame, array $v, bool $expandTemplates, bool $inTemplate
): array {
	$html = $v['html'] ?? null;

	if ( !is_array( $html ) ) {
		// Nothing to expand. srcOffsets is still dropped so that it
		// doesn't show up in the output data-mw attribute.
		unset( $v['srcOffsets'] );
		return $v;
	}

	// Set up pipeline options
	$pipelineOpts = [
		'attrExpansion' => true,
		'inlineContext' => true,
		'expandTemplates' => $expandTemplates,
		'inTemplate' => $inTemplate
	];
	$opts = [
		'pipelineType' => 'expanded-tokens-to-fragment',
		'pipelineOpts' => $pipelineOpts,
		'srcOffsets' => $v['srcOffsets'],
		'sol' => true
	];

	// The token list handed to the pipeline is terminated with an EOF token.
	$tokens = array_merge( $html, [ new EOFTk() ] );
	$fragment = self::processContentInPipeline( $env, $frame, $tokens, $opts );

	// Since we aren't at the top level, data attrs
	// were not applied in cleanup. However, tmp
	// was stripped.
	$v['html'] = ContentUtils::ppToXML( $fragment, [ 'innerXML' => true ] );

	// Remove srcOffsets after value is expanded, so they don't show
	// up in the output data-mw attribute
	unset( $v['srcOffsets'] );
	return $v;
}
188 | |
/**
 * Expands every array-valued entry of $vals to DOM via
 * expandAttrValueToDOM() and returns the results as a list, in the same
 * order as the input.
 *
 * @param Env $env
 *    The environment/context for the expansion.
 * @param Frame $frame
 *    The parent frame within which the expansion is taking place.
 *    Used for template expansion and source text tracking.
 * @param array $vals
 *    Array of values to expand.
 *    Non-array elements of $vals are passed back unmodified.
 *    If an array element, it is expected to be an associative array with a "html" property.
 *    The html property is expanded to DOM only if it is an array (of tokens).
 * @param bool $expandTemplates
 *    Should any templates encountered here be expanded
 *    (usually false for nested templates since they are never directly editable).
 * @param bool $inTemplate
 *    Unexpanded templates can occur in the content of extension tags.
 * @return array The processed values, re-indexed from 0.
 */
public static function expandAttrValuesToDOM(
	Env $env, $frame, array $vals, bool $expandTemplates, bool $inTemplate
): array {
	$ret = [];
	foreach ( $vals as $v ) {
		// expandAttrValueToDOM() only accepts arrays (it would raise a
		// TypeError otherwise), so non-array entries are passed through
		// untouched, as documented above.
		$ret[] = is_array( $v )
			? self::expandAttrValueToDOM( $env, $frame, $v, $expandTemplates, $inTemplate )
			: $v;
	}
	return $ret;
}
216 | |
/**
 * Collects what is needed to build a token for a DOM element: its
 * attributes as KV pairs plus its data-parsoid and data-mw objects.
 * The node comes from a DOM whose data attributes are stored outside
 * the DOM.
 *
 * @param Element $node
 * @param array<string,string> $attrs The node's attributes (name => value).
 * @return array{attrs:KV[],dataParsoid:?DataParsoid,dataMw:?DataMw}
 *   'dataMw' is null unless DOMDataUtils::validDataMw() accepts the node.
 */
private static function domAttrsToTagAttrs( Element $node, array $attrs ): array {
	// The DOMDataUtils::DATA_OBJECT_ATTR_NAME attribute is never emitted
	// as a token attribute. ($attrs is a by-value copy, so the caller's
	// array is not affected.)
	unset( $attrs[DOMDataUtils::DATA_OBJECT_ATTR_NAME] );

	$kvs = [];
	foreach ( $attrs as $attrName => $attrValue ) {
		$kvs[] = new KV( $attrName, $attrValue );
	}

	$dataParsoid = DOMDataUtils::getDataParsoid( $node );
	$dataMw = null;
	if ( DOMDataUtils::validDataMw( $node ) ) {
		$dataMw = DOMDataUtils::getDataMw( $node );
	}

	return [
		'attrs' => $kvs,
		'dataParsoid' => $dataParsoid,
		'dataMw' => $dataMw,
	];
}
239 | |
/**
 * Recursively converts a DOM subtree into tokens appended to $tokBuf.
 * Data attributes for nodes are stored outside the DOM.
 *
 * - Text nodes contribute whatever TokenUtils::newlinesToNlTks() returns
 *   for their value.
 * - Comment nodes become a CommentTk.
 * - Void elements become a single SelfclosingTagTk.
 * - Other elements become a TagTk, the tokens of their children, and an
 *   EndTagTk.
 *
 * @param Node $node The root of the DOM tree to convert to tokens
 * @param array<Token|string> $tokBuf Tokens accumulated so far
 * @return array<Token|string> $tokBuf with the tokens for $node appended
 */
private static function convertDOMtoTokens( Node $node, array $tokBuf ): array {
	if ( $node instanceof Text ) {
		PHPUtils::pushArray( $tokBuf, TokenUtils::newlinesToNlTks( $node->nodeValue ) );
		return $tokBuf;
	}

	if ( $node instanceof Comment ) {
		$tokBuf[] = new CommentTk( $node->nodeValue );
		return $tokBuf;
	}

	if ( !( $node instanceof Element ) ) {
		// getWrapperTokens calls convertDOMtoTokens with an Element
		// and children of dom elements are always text/comment/elements
		// which are all covered here.
		throw new UnreachableException( "Should never get here!" );
	}

	$tagName = DOMCompat::nodeName( $node );
	$info = self::domAttrsToTagAttrs( $node, DOMUtils::attributes( $node ) );

	if ( Utils::isVoidElement( $tagName ) ) {
		$tokBuf[] = new SelfclosingTagTk(
			$tagName, $info['attrs'], $info['dataParsoid'], $info['dataMw']
		);
		return $tokBuf;
	}

	$tokBuf[] = new TagTk(
		$tagName, $info['attrs'], $info['dataParsoid'], $info['dataMw']
	);

	$child = $node->firstChild;
	while ( $child ) {
		$tokBuf = self::convertDOMtoTokens( $child, $tokBuf );
		$child = $child->nextSibling;
	}

	$closer = new EndTagTk( $tagName );
	// Keep stx parity
	if ( WTUtils::isLiteralHTMLNode( $node ) ) {
		$closer->dataParsoid->stx = 'html';
	}
	$tokBuf[] = $closer;

	return $tokBuf;
}
285 | |
/**
 * Get tokens representing a DOM forest (from transclusions, extensions,
 * whatever that were generated as part of a separate processing pipeline)
 * in the token stream. These tokens will tunnel the subtree through the
 * token processing while preserving token stream semantics as if
 * the DOM had been converted to tokens.
 *
 * The wrapper is modelled on the first child of the fragment: its tag name
 * is either that child's name, 'div', or 'span' (see the inline comments),
 * and when the first child is an element its attributes and node data are
 * carried over onto the wrapper. The 'typeof' and 'about' attributes are
 * always removed from the first returned token.
 *
 * @param DocumentFragment $domFragment List of DOM nodes that need to be tunneled through.
 * @param array $opts
 *   Reads $opts['pipelineOpts']['inlineContext'] and $opts['unpackOutput'].
 * @see encapsulateExpansionHTML's doc. for more info about these options.
 * @return array<Token|string> List of token representatives.
 */
private static function getWrapperTokens(
	DocumentFragment $domFragment, array $opts
): array {
	if ( !$domFragment->hasChildNodes() ) {
		return [ new TagTk( 'span' ), new EndTagTk( 'span' ) ];
	}

	$firstNode = $domFragment->firstChild;

	// Do we represent this with inline or block elements?
	// This is to ensure that we get p-wrapping correct.
	//
	// * If all content is inline, we use inline-elements to represent this
	//   so that this content gets swallowed into the P tag that wraps
	//   adjacent inline content.
	//
	// * If any part of this is a block content, we treat extension content
	//   independent of surrounding content and don't want inline content
	//   here to be swallowed into a P tag that wraps adjacent inline content.
	//
	// This behavior ensures that we and clients can "drop-in" extension content
	// into the DOM without messing with fixing up paragraph tags of surrounding
	// content. It could potentially introduce minor rendering differences when
	// compared to PHP parser output, but we'll swallow it for now.
	//
	// The fragment is only inspected for block content when neither of
	// these holds:
	//
	// * The DOM fragment is being processed in a context where P wrapping
	//   has been suppressed (inlineContext); it is then represented with
	//   inline tokens.
	//   FIXME(SSS): Looks like we have some "impedance mismatch" here. But, this
	//   is correct in scenarios where link-content or image-captions are being
	//   processed in a sub-pipeline and we don't want a <div> in the link-caption
	//   to cause the <a>..</a> to get split apart.
	//   Filed as T49963
	//
	// * The fragment won't be unpacked (!unpackOutput). Such fragments aren't
	//   amenable to inspection, since the ultimate content is unknown. For
	//   example, refs shuttle content through treebuilding that ends up in
	//   the references list.
	//   FIXME(arlolra): Do we need a mechanism to specify content
	//   categories?
	$needsBlockWrapper = false;
	if ( empty( $opts['pipelineOpts']['inlineContext'] ) && $opts['unpackOutput'] ) {
		foreach ( $domFragment->childNodes as $child ) {
			if (
				DOMUtils::isWikitextBlockNode( $child ) ||
				DOMUtils::hasBlockElementDescendant( $child )
			) {
				$needsBlockWrapper = true;
				break;
			}
		}
	}

	$firstNodeName = DOMCompat::nodeName( $firstNode );

	if ( $needsBlockWrapper && !DOMUtils::isWikitextBlockNode( $firstNode ) ) {
		$wrapperName = 'div';
	} elseif (
		// Do not use 'A' as a wrapper node because it could
		// end up getting nested inside another 'A' and the DOM
		// structure can change where the wrapper tokens are no
		// longer siblings.
		// Ex: "[http://foo.com Bad nesting [[Here]]].
		$firstNodeName === 'a' ||
		// <style>/<script> tags are not fostered, so if we're wrapping
		// more than a single node, they aren't a good representation for
		// the content. It can lead to fosterable content being inserted
		// in a fosterable position after treebuilding is done, which isn't
		// roundtrippable.
		(
			in_array( $firstNodeName, [ 'style', 'script' ], true ) &&
			$firstNode->nextSibling !== null
		) ||
		// Text and comment nodes have no tag of their own to reuse.
		!( $firstNode instanceof Element )
	) {
		$wrapperName = 'span';
	} else {
		$wrapperName = $firstNodeName;
	}

	if ( !( $firstNode instanceof Element ) ) {
		$workNode = $firstNode->ownerDocument->createElement( $wrapperName );
	} else {
		Assert::invariant(
			// No need to look for data-mw as well.
			// Nodes that have data-mw also have data-parsoid.
			!$firstNode->hasAttribute( 'data-parsoid' ),
			"Expected node to have its data attributes loaded" );

		// Work on a copy so the fragment's own node data is left alone.
		$nodeData = clone DOMDataUtils::getNodeData( $firstNode );

		if ( $wrapperName === $firstNodeName ) {
			// Shallow clone since we don't want to convert the whole tree to tokens.
			$workNode = $firstNode->cloneNode( false );

			// Reset 'tsr' since it isn't applicable. Neither is
			// any auxiliary info like 'endTSR'.
			// FIXME: The above comment is only true if we are reusing
			// DOM fragments from cache from previous revisions in
			// incremental parsing scenarios.  See T98992
			if ( isset( $nodeData->parsoid->tsr ) ) {
				$nodeData->parsoid->tsr = null;
			}
			if ( isset( $nodeData->parsoid->tmp->endTSR ) ) {
				unset( $nodeData->parsoid->tmp->endTSR );
			}

			// The "in transclusion" flag was set on the first child for template
			// wrapping in the nested pipeline, and doesn't apply to the dom
			// fragment wrapper in this pipeline.  Keeping it around can induce
			// template wrapping of a foster box if the dom fragment is found in
			// a fosterable position.
			if (
				isset( $nodeData->parsoid ) &&
				$nodeData->parsoid->getTempFlag( TempData::IN_TRANSCLUSION )
			) {
				$nodeData->parsoid->tmp->setFlag( TempData::IN_TRANSCLUSION, false );
			}
		} else {
			// Create a copy of the node without children
			$workNode = $firstNode->ownerDocument->createElement( $wrapperName );

			// Copy over attributes
			foreach ( DOMUtils::attributes( $firstNode ) as $attrName => $attrValue ) {
				// "typeof" is ignored since it'll be removed below.
				if ( $attrName !== 'typeof' ) {
					$workNode->setAttribute( $attrName, $attrValue );
				}
			}

			// We are applying a different wrapper.
			// So, node's data-parsoid isn't applicable.
			$nodeData->parsoid = new DataParsoid;
		}

		DOMDataUtils::setNodeData( $workNode, $nodeData );
	}

	$tokens = self::convertDOMtoTokens( $workNode, [] );
	$head = $tokens[0];

	// Remove the typeof attribute from the first token.
	// It will be replaced with mw:DOMFragment.
	$head->removeAttribute( 'typeof' );

	// Remove the about attribute from the first token.
	// We want to be able to distinguish when this wrapper was template
	// annotated.
	$head->removeAttribute( 'about' );

	return $tokens;
}
449 | |
/**
 * Generates wrapper tokens for a HTML expansion -- the wrapper
 * tokens are placeholders that adequately represent semantics
 * of the HTML DOM for the purposes of additional token transformations
 * that will be applied to them.
 *
 * The first wrapper token is tagged with a 'mw:DOMFragment' typeof and
 * carries $expansion['html'] in its data-parsoid 'html' field.
 *
 * @param Env $env
 *    The active environment/context. (Not used by this method's body.)
 * @param Token $token
 *    The token that generated the DOM.
 * @param array $expansion
 *    - string html HTML of the expansion.
 *    - DocumentFragment domFragment Outermost nodes of the HTML.
 * @param array $opts
 *    - SourceRange tsr
 *        The TSR to set on the generated tokens. This TSR is
 *        used to compute DSR on the placeholder tokens.
 *        The computed DSR is transferred over to the unpacked DOM
 *        if setDSR is true (see below).
 *        Falls back to the tsr of $token when not given.
 *    - bool  setDSR
 *        When the DOM fragment is unpacked, this option governs
 *        whether the DSR from the placeholder node is transferred
 *        over to the unpacked DOM or not.
 *        For example: Cite, reused transclusions.
 *    - bool  fromCache
 *    - array pipelineOpts
 *    - bool  unpackOutput Defaults to true.
 *    - string wrapperName Read only when unpackOutput is false; it is
 *        appended to the typeof as 'mw:DOMFragment/sealed/<wrapperName>'.
 * @return array<Token|string>
 */
public static function encapsulateExpansionHTML(
	Env $env, Token $token, array $expansion, array $opts
): array {
	$opts['unpackOutput'] ??= true; // Default

	// Get placeholder tokens to get our subdom through the token processing
	// stages. These will be finally unwrapped on the DOM.
	$toks = self::getWrapperTokens( $expansion['domFragment'], $opts );
	$firstWrapperToken = $toks[0];

	// Add the DOMFragment type so that we get unwrapped later.
	$fragmentType = 'mw:DOMFragment';
	if ( !$opts['unpackOutput'] ) {
		$fragmentType .= '/sealed/' . $opts['wrapperName'];
	}
	$firstWrapperToken->setAttribute( 'typeof', $fragmentType );

	// Assign the HTML fragment to the data-parsoid.html on the first wrapper token.
	$firstWrapperToken->dataParsoid->html = $expansion['html'];

	// Pass through the setDSR and fromCache flags
	if ( !empty( $opts['setDSR'] ) ) {
		$firstWrapperToken->dataParsoid->setTempFlag( TempData::SET_DSR, $opts['setDSR'] );
	}
	if ( !empty( $opts['fromCache'] ) ) {
		$firstWrapperToken->dataParsoid->setTempFlag( TempData::FROM_CACHE, $opts['fromCache'] );
	}

	// Transfer the tsr.
	// The first token gets the full width, the following tokens zero width.
	$tokenTsr = $opts['tsr'] ?? $token->dataParsoid->tsr ?? null;
	if ( !$tokenTsr ) {
		return $toks;
	}

	$firstWrapperToken->dataParsoid->tsr = $tokenTsr;
	$firstWrapperToken->dataParsoid->extTagOffsets = $token->dataParsoid->extTagOffsets ?? null;

	// XXX to investigate: if $tokenTsr->end is null, then we're losing
	// the 'hint' we'd like to provide here that this is a zero-width
	// source range.
	// ->end can be set to null by WikiLinkHandler::bailTokens()
	$endTsr = new SourceRange( $tokenTsr->end, $tokenTsr->end );
	foreach ( array_slice( $toks, 1 ) as $trailingTok ) {
		// Each trailing token gets its own copy of the zero-width range.
		$trailingTok->dataParsoid->tsr = clone $endTsr;
	}

	return $toks;
}
526 | |
/**
 * Moves the accumulated nodes into a new <span> that is inserted where
 * the first accumulated node used to be, marks the span's data-parsoid
 * with the TempData::WRAPPER flag, and empties the accumulator.
 *
 * The accumulator must not be empty: its first entry is dereferenced
 * unconditionally.
 *
 * @param Document $doc Document used to create the span.
 * @param Node[] &$textCommentAccum Nodes to wrap; reset to [] on return.
 */
private static function wrapAccum(
	Document $doc, array &$textCommentAccum
): void {
	$firstNode = $textCommentAccum[0];

	// Put the wrapper in place before the first node, then move all the
	// accumulated nodes into it.
	$wrapper = $doc->createElement( 'span' );
	$firstNode->parentNode->insertBefore( $wrapper, $firstNode );
	foreach ( $textCommentAccum as $accumulated ) {
		$wrapper->appendChild( $accumulated );
	}

	$wrapperDp = new DataParsoid;
	$wrapperDp->setTempFlag( TempData::WRAPPER );
	DOMDataUtils::setDataParsoid( $wrapper, $wrapperDp );

	$textCommentAccum = [];
}
542 | |
/**
 * Wrap text and comment nodes in a node list into spans, so that all
 * top-level nodes are elements. Each maximal run of adjacent text/comment
 * nodes ends up in a single span.
 *
 * An empty node list is a no-op.
 *
 * @param NodeList $nodes List of DOM nodes to wrap, mix of node types.
 * @param ?Node $startAt If given, nodes before this one are left alone.
 * @param ?Node $stopAt If given, processing stops after this node
 *   (the node itself is still included).
 */
public static function addSpanWrappers(
	$nodes,
	?Node $startAt = null,
	?Node $stopAt = null
): void {
	// Build a real array out of nodes.
	//
	// Operating directly on DOM child-nodes array
	// and manipulating them by adding span wrappers
	// changes the traversal itself
	$nodeBuf = [];
	foreach ( $nodes as $node ) {
		$nodeBuf[] = $node;
	}

	// Nothing to wrap. Bailing out here also avoids reading
	// ->ownerDocument off a non-existent first node.
	if ( !$nodeBuf ) {
		return;
	}

	$doc = $nodeBuf[0]->ownerDocument;
	$textCommentAccum = [];

	$start = ( $startAt === null );
	foreach ( $nodeBuf as $node ) {
		if ( !$start ) {
			if ( $startAt !== $node ) {
				continue;
			}
			$start = true;
		}
		if ( $node instanceof Text || $node instanceof Comment ) {
			$textCommentAccum[] = $node;
		} elseif ( count( $textCommentAccum ) ) {
			// An element ends the current run of text/comment nodes.
			self::wrapAccum( $doc, $textCommentAccum );
		}
		if ( $node === $stopAt ) {
			break;
		}
	}

	// Flush a run that reaches the end of the processed range.
	if ( count( $textCommentAccum ) ) {
		self::wrapAccum( $doc, $textCommentAccum );
	}
}
591 | |
/**
 * Convert a HTML5 DOM into a mw:DOMFragment and generate appropriate
 * tokens to insert into the token stream for further processing.
 *
 * The DOMPostProcessor will unpack the fragment and insert the HTML
 * back into the DOM.
 *
 * This is makeExpansion() followed by encapsulateExpansionHTML().
 *
 * @param Env $env
 *    The active environment/context.
 * @param Token $token
 *    The token that generated the DOM.
 * @param DocumentFragment $domFragment
 *    The DOM that the token expanded to.
 * @param array $opts
 *    Options to be passed onto the encapsulation code
 *    See encapsulateExpansionHTML's doc. for more info about these options.
 * @return array<Token|string>
 */
public static function tunnelDOMThroughTokens(
	Env $env, Token $token, DocumentFragment $domFragment, array $opts
): array {
	// Get placeholder tokens to get our subdom through the token processing
	// stages. These will be finally unwrapped on the DOM.
	return self::encapsulateExpansionHTML(
		$env,
		$token,
		self::makeExpansion( $env, $domFragment ),
		$opts
	);
}
618 | |
/**
 * Registers a DOM fragment with the environment under a newly allocated
 * fragment id and returns an expansion record describing it.
 *
 * @param Env $env Provides the fragment id and stores the fragment.
 * @param DocumentFragment $domFragment
 * @return array{domFragment:DocumentFragment,html:mixed}
 *   'domFragment' is the fragment passed in; 'html' holds the fragment id
 *   obtained from $env->newFragmentId() (not serialized HTML).
 */
public static function makeExpansion(
	Env $env, DocumentFragment $domFragment
): array {
	$expansion = [ 'domFragment' => $domFragment ];
	$expansion['html'] = $env->newFragmentId();
	$env->setDOMFragment( $expansion['html'], $domFragment );
	return $expansion;
}
626 | |
/**
 * Walks $node and its following siblings (recursing into children) looking
 * for elements whose typeof matches mw:Transclusion or mw:Extension/* and
 * that carry an 'about' attribute.
 *
 * NOTE: recording the expansions is currently disabled. As soon as such an
 * element with a non-empty data-parsoid 'src' is found, this method throws.
 * Elements without a usable 'src' are skipped along with their about
 * siblings.
 *
 * @param Env $env
 * @param array &$expansions Buckets to fill ('transclusions', 'extensions',
 *   'media'); not modified by the current implementation.
 * @param ?Node $node First node to visit; null (e.g. the firstChild of a
 *   childless element) is a no-op.
 * @throws UnreachableException When a keyed expansion is encountered.
 */
private static function doExtractExpansions( Env $env, array &$expansions, ?Node $node ): void {
	while ( $node ) {
		if ( $node instanceof Element ) {
			if ( DOMUtils::matchTypeOf( $node, '#^mw:(Transclusion$|Extension/)#' ) &&
				$node->hasAttribute( 'about' )
			) {
				$dp = DOMDataUtils::getDataParsoid( $node );
				$about = DOMCompat::getAttribute( $node, 'about' );
				$nodes = WTUtils::getAboutSiblings( $node, $about );

				// Both transclusions and extensions are keyed by their source.
				// XXX gwicke: use proper key that is not
				// source-based? This also needs to work for
				// transclusion output.
				$key = $dp->src ?? null;

				if ( $key ) {
					// FIXME: makeExpansion return type changed.
					// Once ported, the expansion should be stored under $key in
					// $expansions['transclusions'] for mw:Transclusion nodes and
					// in $expansions['extensions'] for mw:Extension/* nodes
					// (writing into $expansions itself, not into a copy).
					throw new UnreachableException( 'Callsite was not ported!' );
				}

				// Resume after the last node of this about group.
				$node = end( $nodes );
			} else {
				self::doExtractExpansions( $env, $expansions, $node->firstChild );
			}
		}
		$node = $node->nextSibling;
	}
}
667 | |
/**
 * Extract transclusion and extension expansions from a DOM, and return
 * them grouped into buckets like this:
 *   {
 *     transclusions: {
 *       'key1': <expansion>
 *     },
 *     extensions: {
 *       'key2': <expansion>
 *     },
 *     media: {
 *       'key3': <expansion>
 *     }
 *   }
 *
 * NOTE: the code in doExtractExpansions() that would populate the buckets
 * is currently disabled and throws instead, so this method either returns
 * three empty buckets or throws.
 *
 * @param Env $env
 * @param Element $body
 * @return array{transclusions:array,extensions:array,media:array}
 * @throws UnreachableException When doExtractExpansions() meets an
 *   expansion it would have to record.
 */
public static function extractExpansions( Env $env, Element $body ): array {
	$expansions = [
		'transclusions' => [],
		'extensions' => [],
		'media' => []
	];

	// Kick off the extraction. An empty body has nothing to walk, and
	// its null firstChild must not be handed to the walker.
	$firstChild = $body->firstChild;
	if ( $firstChild !== null ) {
		self::doExtractExpansions( $env, $expansions, $firstChild );
	}

	return $expansions;
}
706 | |
/**
 * Fetches output of encapsulations that return HTML from the legacy parser.
 *
 * The source is handed to the environment's data access object for
 * parsing; the returned HTML is passed through DOMUtils::stripPWrapper()
 * and parsed into a fragment owned by the top-level document.
 *
 * @param Env $env
 * @param string $source Wikitext to parse.
 * @return ?DocumentFragment Null when the returned HTML is the empty string.
 */
public static function fetchHTML( Env $env, string $source ): ?DocumentFragment {
	$dataAccess = $env->getDataAccess();
	$html = $dataAccess->parseWikitext(
		$env->getPageConfig(), $env->getMetadata(), $source
	);

	if ( $html === '' ) {
		return null;
	}

	return DOMUtils::parseHTMLToFragment(
		$env->getTopLevelDoc(),
		DOMUtils::stripPWrapper( $html )
	);
}
718 | } |