Code Coverage for /src/src/Utils/PipelineUtils.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	0.00% covered (danger)	0.00%	0 / 224	0.00% covered (danger)	0.00%	0 / 16	CRAP	0.00% covered (danger)	0.00%	0 / 1
PipelineUtils	0.00% covered (danger)	0.00%	0 / 224	0.00% covered (danger)	0.00%	0 / 16	5402	0.00% covered (danger)	0.00%	0 / 1
pFragmentToParsoidFragmentMarkers	0.00% covered (danger)	0.00%	0 / 10	0.00% covered (danger)	0.00%	0 / 1	6
getDOMFragmentToken	0.00% covered (danger)	0.00%	0 / 7	0.00% covered (danger)	0.00%	0 / 1	12
processContentInPipeline	0.00% covered (danger)	0.00%	0 / 12	0.00% covered (danger)	0.00%	0 / 1	2
expandAttrValueToDOM	0.00% covered (danger)	0.00%	0 / 21	0.00% covered (danger)	0.00%	0 / 1	6
expandAttrValuesToDOM	0.00% covered (danger)	0.00%	0 / 4	0.00% covered (danger)	0.00%	0 / 1	6
domAttrsToTagAttrs	0.00% covered (danger)	0.00%	0 / 10	0.00% covered (danger)	0.00%	0 / 1	20
convertDOMtoTokens	0.00% covered (danger)	0.00%	0 / 24	0.00% covered (danger)	0.00%	0 / 1	56
getWrapperTokens	0.00% covered (danger)	0.00%	0 / 47	0.00% covered (danger)	0.00%	0 / 1	462
encapsulateExpansionHTML	0.00% covered (danger)	0.00%	0 / 20	0.00% covered (danger)	0.00%	0 / 1	42
wrapAccum	0.00% covered (danger)	0.00%	0 / 9	0.00% covered (danger)	0.00%	0 / 1	6
addSpanWrappers	0.00% covered (danger)	0.00%	0 / 19	0.00% covered (danger)	0.00%	0 / 1	110
tunnelDOMThroughTokens	0.00% covered (danger)	0.00%	0 / 2	0.00% covered (danger)	0.00%	0 / 1	2
makeExpansion	0.00% covered (danger)	0.00%	0 / 3	0.00% covered (danger)	0.00%	0 / 1	2
doExtractExpansions	0.00% covered (danger)	0.00%	0 / 23	0.00% covered (danger)	0.00%	0 / 1	72
extractExpansions	0.00% covered (danger)	0.00%	0 / 7	0.00% covered (danger)	0.00%	0 / 1	2
fetchHTML	0.00% covered (danger)	0.00%	0 / 6	0.00% covered (danger)	0.00%	0 / 1	6

1	<?php
2	declare( strict_types = 1 );
3
4	namespace Wikimedia\Parsoid\Utils;
5
6	use Wikimedia\Assert\Assert;
7	use Wikimedia\Assert\UnreachableException;
8	use Wikimedia\Parsoid\Config\Env;
9	use Wikimedia\Parsoid\DOM\Comment;
10	use Wikimedia\Parsoid\DOM\Document;
11	use Wikimedia\Parsoid\DOM\DocumentFragment;
12	use Wikimedia\Parsoid\DOM\Element;
13	use Wikimedia\Parsoid\DOM\Node;
14	use Wikimedia\Parsoid\DOM\NodeList;
15	use Wikimedia\Parsoid\DOM\Text;
16	use Wikimedia\Parsoid\Fragments\PFragment;
17	use Wikimedia\Parsoid\Fragments\WikitextPFragment;
18	use Wikimedia\Parsoid\NodeData\DataMw;
19	use Wikimedia\Parsoid\NodeData\DataParsoid;
20	use Wikimedia\Parsoid\NodeData\TempData;
21	use Wikimedia\Parsoid\Tokens\CommentTk;
22	use Wikimedia\Parsoid\Tokens\EndTagTk;
23	use Wikimedia\Parsoid\Tokens\EOFTk;
24	use Wikimedia\Parsoid\Tokens\KV;
25	use Wikimedia\Parsoid\Tokens\SelfclosingTagTk;
26	use Wikimedia\Parsoid\Tokens\SourceRange;
27	use Wikimedia\Parsoid\Tokens\TagTk;
28	use Wikimedia\Parsoid\Tokens\Token;
29	use Wikimedia\Parsoid\Wt2Html\Frame;
30
31	/**
32	* This file contains parsing pipeline related utilities.
33	*/
34	class PipelineUtils {
35	// keep in sync with internal_strip_marker in Grammar.pegphp
36	public const PARSOID_FRAGMENT_PREFIX = '{{#parsoid\0fragment:';
37
/**
 * Flatten a PFragment into a wikitext string in which some pieces are
 * replaced by parsoid fragment markers.
 *
 * The fragment is cast to a WikitextPFragment and split(). Pieces at even
 * indices are concatenated as-is; each piece at an odd index is replaced
 * in the string by a marker of the form
 * PARSOID_FRAGMENT_PREFIX . <counter> . '}}' and recorded in the returned
 * map under that marker. The counter is a function-level static, so marker
 * IDs keep increasing across calls.
 *
 * @param PFragment $fragment
 * @return array{0:string,1:array<string,PFragment>} An array consisting of
 *  the wikitext string, followed by the marker-to-PFragment map.
 */
public static function pFragmentToParsoidFragmentMarkers( PFragment $fragment ): array {
	static $counter = 0;
	$pieces = WikitextPFragment::castFromPFragment( $fragment )->split();
	$total = count( $pieces );
	$wikitext = [ $pieces[0] ];
	$markerMap = [];
	$idx = 1;
	while ( $idx < $total ) {
		$marker = self::PARSOID_FRAGMENT_PREFIX . ( $counter++ ) . '}}';
		$markerMap[$marker] = $pieces[$idx];
		array_push( $wikitext, $marker, $pieces[$idx + 1] );
		$idx += 2;
	}
	return [ implode( '', $wikitext ), $markerMap ];
}
57
/**
 * Creates a dom-fragment-token for processing 'content' (an array of tokens)
 * in its own subpipeline all the way to DOM. These tokens will be processed
 * by their own handler (DOMFragmentBuilder) in the last stage of the async
 * pipeline.
 *
 * srcOffsets should always be provided to process top-level page content in a
 * subpipeline. Without it, DSR computation and template wrapping cannot be done
 * in the subpipeline. While unpackDOMFragment can do this on unwrapping, that can
 * be a bit fragile and makes dom-fragments a leaky abstraction by leaking subpipeline
 * processing into the top-level pipeline.
 *
 * The returned token carries four attributes: 'contextTok', 'content',
 * 'inlineContext' and 'inPHPBlock'. The two flags are encoded as the
 * strings "1" / "0".
 *
 * @param string|Token|array<Token|string> $content The array of tokens to process.
 * @param SourceRange $srcOffsets Wikitext source offsets (start/end) of these tokens.
 * @param array $opts Parsing options.
 *  - Token token The token that generated the content. It is read
 *    unconditionally (including its dataParsoid->tsr), so callers
 *    must supply it even though $opts has a default.
 *  - bool inlineContext Is this DOM fragment used in an inline context?
 *    (defaults to false)
 *  - bool inPHPBlock Forwarded as the 'inPHPBlock' attribute
 *    (defaults to false)
 * @return SelfclosingTagTk
 */
public static function getDOMFragmentToken(
	$content, SourceRange $srcOffsets, array $opts = []
): SelfclosingTagTk {
	$contextTok = $opts['token'];

	$attribs = [];
	$attribs[] = new KV( 'contextTok', $contextTok, $contextTok->dataParsoid->tsr->expandTsrV() );
	$attribs[] = new KV( 'content', $content, $srcOffsets->expandTsrV() );

	$inlineContext = $opts['inlineContext'] ?? false;
	$attribs[] = new KV( 'inlineContext', $inlineContext ? "1" : "0" );

	$inPHPBlock = $opts['inPHPBlock'] ?? false;
	$attribs[] = new KV( 'inPHPBlock', $inPHPBlock ? "1" : "0" );

	return new SelfclosingTagTk( 'mw:dom-fragment-token', $attribs );
}
88
/**
 * Processes content (wikitext, array of tokens, whatever) in its own
 * pipeline based on options.
 *
 * A pipeline of the requested type is obtained from the environment's
 * pipeline factory, initialized with the frame / source information, and
 * then asked to parse $content.
 *
 * @param Env $env The environment/context for the expansion.
 * @param Frame $frame
 *    The parent frame within which the expansion is taking place.
 *    Used for template expansion and source text tracking.
 * @param string|Token|array<Token|string>|DocumentFragment|PFragment $content
 *    How this content is processed depends on what kind of pipeline
 *    is constructed specified by opts.
 * @param array $opts
 *    Processing options that specify pipeline-type, opts, and callbacks.
 *    - string pipelineType (read unconditionally)
 *    - array pipelineOpts (read unconditionally)
 *    - array tplArgs - if set, defines parameters for the child frame
 *      - string tplArgs['name']
 *      - KV[] tplArgs['attribs']
 *    - string srcText - if set, defines the source text for the expansion;
 *      otherwise the frame's source text is used
 *    - SourceRange srcOffsets - if set, defines the range within the
 *      source text that $content corresponds to
 *    - bool sol Whether tokens should be processed in start-of-line context
 *      (read unconditionally).
 *    - bool toplevel Whether the pipeline is considered atTopLevel
 *      (defaults to false)
 * @return array<Token|string>|DocumentFragment (depending on pipeline type)
 */
public static function processContentInPipeline(
	Env $env, Frame $frame, $content, array $opts
) {
	// Build a pipeline of the requested flavour
	$factory = $env->getPipelineFactory();
	$pipeline = $factory->getPipeline( $opts['pipelineType'], $opts['pipelineOpts'] );

	// NOTE: some pipelines force toplevel to true
	$initOpts = [
		'toplevel' => $opts['toplevel'] ?? false,
		'frame' => $frame,
		'tplArgs' => $opts['tplArgs'] ?? null,
		'srcText' => $opts['srcText'] ?? $frame->getSrcText(),
		'srcOffsets' => $opts['srcOffsets'] ?? null,
	];
	$pipeline->init( $initOpts );

	// Everything is set up: run the content through the pipeline
	$parseOpts = [ 'sol' => $opts['sol'] ];
	return $pipeline->parse( $content, $parseOpts );
}
135
/**
 * Expands value all the way to DOM.
 *
 * When $v['html'] is an array (of tokens), an EOFTk is appended and the
 * tokens are run through an 'expanded-tokens-to-fragment' pipeline; the
 * resulting fragment is serialized via ContentUtils::ppToXML (innerXML)
 * and stored back into $v['html']. In every case the 'srcOffsets' entry
 * is dropped from the returned value.
 *
 * @param Env $env
 *    The environment/context for the expansion.
 * @param Frame $frame
 *    The parent frame within which the expansion is taking place.
 *    Used for template expansion and source text tracking.
 * @param array $v
 *    The value to process.
 *    The value is expected to be an associative array with a "html" property.
 *    The html property is expanded to DOM only if it is an array (of tokens).
 *    Non-arrays are passed back unexpanded.
 *    When "html" is an array, "srcOffsets" is read as well.
 * @param bool $expandTemplates
 *    Should any templates encountered here be expanded
 *    (usually false for nested templates since they are never directly editable).
 * @param bool $inTemplate
 *    Unexpanded templates can occur in the content of extension tags.
 * @return array
 */
public static function expandAttrValueToDOM(
	Env $env, Frame $frame, array $v, bool $expandTemplates, bool $inTemplate
): array {
	$html = $v['html'] ?? null;
	if ( is_array( $html ) ) {
		// Options for the nested pipeline
		$pipelineOpts = [
			'attrExpansion' => true,
			'inlineContext' => true,
			'expandTemplates' => $expandTemplates,
			'inTemplate' => $inTemplate
		];
		$opts = [
			'pipelineType' => 'expanded-tokens-to-fragment',
			'pipelineOpts' => $pipelineOpts,
			'srcOffsets' => $v['srcOffsets'],
			'sol' => true
		];
		$tokens = array_merge( $html, [ new EOFTk() ] );
		$domFragment = self::processContentInPipeline( $env, $frame, $tokens, $opts );
		// Since we aren't at the top level, data attrs
		// were not applied in cleanup. However, tmp
		// was stripped.
		$v['html'] = ContentUtils::ppToXML( $domFragment, [ 'innerXML' => true ] );
	}

	// srcOffsets must not leak into the output data-mw attribute,
	// so drop them once the value has been expanded.
	unset( $v['srcOffsets'] );
	return $v;
}
188
/**
 * Expands every value of $vals to DOM via expandAttrValueToDOM().
 *
 * The result is a freshly indexed list (keys of $vals are not preserved).
 *
 * @param Env $env
 *    The environment/context for the expansion.
 * @param Frame $frame
 *    The parent frame within which the expansion is taking place.
 *    Used for template expansion and source text tracking.
 * @param array $vals
 *    Array of values to expand.
 *    Non-array elements of $vals are passed back unmodified.
 *    If an array element, it is expected to be an associative array with a "html" property.
 *    The html property is expanded to DOM only if it is an array (of tokens).
 * @param bool $expandTemplates
 *    Should any templates encountered here be expanded
 *    (usually false for nested templates since they are never directly editable).
 * @param bool $inTemplate
 *    Unexpanded templates can occur in the content of extension tags.
 * @return array
 */
public static function expandAttrValuesToDOM(
	Env $env, $frame, array $vals, bool $expandTemplates, bool $inTemplate
): array {
	$ret = [];
	foreach ( $vals as $v ) {
		// expandAttrValueToDOM() only accepts arrays; handing it a scalar
		// would raise a TypeError. Honour the documented contract instead
		// and hand non-array elements back untouched.
		$ret[] = is_array( $v )
			? self::expandAttrValueToDOM( $env, $frame, $v, $expandTemplates, $inTemplate )
			: $v;
	}
	return $ret;
}
216
/**
 * Collect the pieces needed to build a token from a DOM element whose
 * data attributes are stored outside the DOM.
 *
 * Every entry of $attrs becomes a KV, except the one named
 * DOMDataUtils::DATA_OBJECT_ATTR_NAME, which is skipped. data-parsoid is
 * always fetched from the node; data-mw is fetched only when
 * DOMDataUtils::validDataMw() accepts the node, and is null otherwise.
 *
 * @param Element $node
 * @param array<string,string> $attrs
 * @return array{attrs:KV[],dataParsoid:?DataParsoid,dataMw:?DataMw}
 */
private static function domAttrsToTagAttrs( Element $node, array $attrs ): array {
	$kvs = [];
	foreach ( $attrs as $attrName => $attrValue ) {
		if ( $attrName === DOMDataUtils::DATA_OBJECT_ATTR_NAME ) {
			continue;
		}
		$kvs[] = new KV( $attrName, $attrValue );
	}

	$dataParsoid = DOMDataUtils::getDataParsoid( $node );
	$dataMw = null;
	if ( DOMDataUtils::validDataMw( $node ) ) {
		$dataMw = DOMDataUtils::getDataMw( $node );
	}

	return [
		'attrs' => $kvs,
		'dataParsoid' => $dataParsoid,
		'dataMw' => $dataMw,
	];
}
239
/**
 * Convert a DOM to tokens. Data attributes for nodes are stored outside the DOM.
 *
 * Text nodes are converted with TokenUtils::newlinesToNlTks(), comments
 * become a CommentTk, void elements become a single SelfclosingTagTk, and
 * any other element becomes a TagTk, the tokens of its children (in
 * document order), and an EndTagTk.
 *
 * @param Node $node The root of the DOM tree to convert to tokens
 * @param array<Token|string> $tokBuf Tokens accumulated so far; it is taken
 *   by value, so callers must use the returned array
 * @return array<Token|string> $tokBuf with the tokens for $node appended
 * @throws UnreachableException if $node is not an element, text or comment node
 */
private static function convertDOMtoTokens( Node $node, array $tokBuf ): array {
	if ( $node instanceof Text ) {
		PHPUtils::pushArray( $tokBuf, TokenUtils::newlinesToNlTks( $node->nodeValue ) );
		return $tokBuf;
	}

	if ( $node instanceof Comment ) {
		$tokBuf[] = new CommentTk( $node->nodeValue );
		return $tokBuf;
	}

	if ( !( $node instanceof Element ) ) {
		// getWrapperTokens calls convertDOMtoTokens with an Element
		// and children of dom elements are always text/comment/elements
		// which are all covered above.
		throw new UnreachableException( "Should never get here!" );
	}

	$tagName = DOMCompat::nodeName( $node );
	[ 'attrs' => $kvs, 'dataParsoid' => $dp, 'dataMw' => $dmw ] =
		self::domAttrsToTagAttrs( $node, DOMUtils::attributes( $node ) );

	if ( Utils::isVoidElement( $tagName ) ) {
		$tokBuf[] = new SelfclosingTagTk( $tagName, $kvs, $dp, $dmw );
		return $tokBuf;
	}

	$tokBuf[] = new TagTk( $tagName, $kvs, $dp, $dmw );

	$child = $node->firstChild;
	while ( $child ) {
		$tokBuf = self::convertDOMtoTokens( $child, $tokBuf );
		$child = $child->nextSibling;
	}

	$closer = new EndTagTk( $tagName );
	// Keep stx parity
	if ( WTUtils::isLiteralHTMLNode( $node ) ) {
		$closer->dataParsoid->stx = 'html';
	}
	$tokBuf[] = $closer;

	return $tokBuf;
}
285
/**
 * Get tokens representing a DOM forest (from transclusions, extensions,
 * whatever that were generated as part of a separate processing pipeline)
 * in the token stream. These tokens will tunnel the subtree through the
 * token processing while preserving token stream semantics as if
 * the DOM had been converted to tokens.
 *
 * Only the first child of the fragment is represented: a wrapper element
 * name is picked for it, a childless work node with that name is built,
 * and that work node is converted to tokens. The 'typeof' and 'about'
 * attributes are removed from the first resulting token.
 *
 * @param DocumentFragment $domFragment List of DOM nodes that need to be tunneled through.
 * @param array $opts
 *  - array pipelineOpts Only pipelineOpts['inlineContext'] is consulted here.
 *  - bool unpackOutput Read unconditionally when inlineContext is not set.
 * @see encapsulateExpansionHTML's doc. for more info about these options.
 * @return array<Token|string> List of token representatives.
 */
private static function getWrapperTokens(
	DocumentFragment $domFragment, array $opts
): array {
	if ( !$domFragment->hasChildNodes() ) {
		return [ new TagTk( 'span' ), new EndTagTk( 'span' ) ];
	}

	$node = $domFragment->firstChild;
	$nodeName = DOMCompat::nodeName( $node );

	// Do we represent this with inline or block elements?
	// This is to ensure that we get p-wrapping correct.
	//
	// * If all content is inline, we use inline-elements to represent this
	//   so that this content gets swallowed into the P tag that wraps
	//   adjacent inline content.
	//
	// * If any part of this is a block content, we treat extension content
	//   independent of surrounding content and don't want inline content
	//   here to be swallowed into a P tag that wraps adjacent inline content.
	//
	// This behavior ensures that we and clients can "drop-in" extension content
	// into the DOM without messing with fixing up paragraph tags of surrounding
	// content. It could potentially introduce minor rendering differences when
	// compared to PHP parser output, but we'll swallow it for now.
	$wrapperType = 'INLINE';
	if ( !empty( $opts['pipelineOpts']['inlineContext'] ) ) {
		// If the DOM fragment is being processed in the context where P wrapping
		// has been suppressed, we represent the DOM fragment with inline-tokens.
		//
		// FIXME(SSS): Looks like we have some "impedance mismatch" here. But, this
		// is correct in scenarios where link-content or image-captions are being
		// processed in a sub-pipeline and we don't want a <div> in the link-caption
		// to cause the <a>..</a> to get split apart.
		//
		// Filed as T49963
	} elseif ( !$opts['unpackOutput'] ) {
		// Fragments that won't be unpacked aren't amenable to inspection, since
		// the ultimate content is unknown. For example, refs shuttle content
		// through treebuilding that ends up in the references list.
		//
		// FIXME(arlolra): Do we need a mechanism to specify content
		// categories?
	} else {
		foreach ( $domFragment->childNodes as $n ) {
			if (
				DOMUtils::isWikitextBlockNode( $n ) ||
				DOMUtils::hasBlockElementDescendant( $n )
			) {
				$wrapperType = 'BLOCK';
				break;
			}
		}
	}

	if ( $wrapperType === 'BLOCK' && !DOMUtils::isWikitextBlockNode( $node ) ) {
		$wrapperName = 'div';
	} elseif ( $nodeName === 'a' ) {
		// Do not use 'A' as a wrapper node because it could
		// end up getting nested inside another 'A' and the DOM
		// structure can change where the wrapper tokens are no
		// longer siblings.
		// Ex: "[http://foo.com Bad nesting [[Here]]].
		$wrapperName = 'span';
	} elseif (
		in_array( $nodeName, [ 'style', 'script' ], true ) &&
		( $node->nextSibling !== null )
	) {
		// <style>/<script> tags are not fostered, so if we're wrapping
		// more than a single node, they aren't a good representation for
		// the content. It can lead to fosterable content being inserted
		// in a fosterable position after treebuilding is done, which isn't
		// roundtrippable.
		$wrapperName = 'span';
	} elseif ( !( $node instanceof Element ) ) {
		// Text / comment first child: there is no element name to reuse
		$wrapperName = 'span';
	} else {
		$wrapperName = $nodeName;
	}

	if ( $node instanceof Element ) {
		Assert::invariant(
			// No need to look for data-mw as well.
			// Nodes that have data-mw also have data-parsoid.
			!$node->hasAttribute( 'data-parsoid' ),
			"Expected node to have its data attributes loaded" );

		// Work on a copy so the original node's data is left untouched
		$nodeData = clone DOMDataUtils::getNodeData( $node );

		if ( $wrapperName !== $nodeName ) {
			// Create a copy of the node without children
			$workNode = $node->ownerDocument->createElement( $wrapperName );

			// Copy over attributes
			foreach ( DOMUtils::attributes( $node ) as $name => $value ) {
				// "typeof" is ignored since it'll be removed below.
				if ( $name !== 'typeof' ) {
					$workNode->setAttribute( $name, $value );
				}
			}

			// We are applying a different wrapper.
			// So, node's data-parsoid isn't applicable.
			$nodeData->parsoid = new DataParsoid;
		} else {
			// Shallow clone since we don't want to convert the whole tree to tokens.
			$workNode = $node->cloneNode( false );

			// Reset 'tsr' since it isn't applicable. Neither is
			// any auxiliary info like 'endTSR'.
			// FIXME: The above comment is only true if we are reusing
			// DOM fragments from cache from previous revisions in
			// incremental parsing scenarios. See T98992
			if ( isset( $nodeData->parsoid->tsr ) ) {
				$nodeData->parsoid->tsr = null;
			}
			if ( isset( $nodeData->parsoid->tmp->endTSR ) ) {
				unset( $nodeData->parsoid->tmp->endTSR );
			}

			// The "in transclusion" flag was set on the first child for template
			// wrapping in the nested pipeline, and doesn't apply to the dom
			// fragment wrapper in this pipeline. Keeping it around can induce
			// template wrapping of a foster box if the dom fragment is found in
			// a fosterable position.
			if (
				isset( $nodeData->parsoid ) &&
				$nodeData->parsoid->getTempFlag( TempData::IN_TRANSCLUSION )
			) {
				$nodeData->parsoid->tmp->setFlag( TempData::IN_TRANSCLUSION, false );
			}
		}

		DOMDataUtils::setNodeData( $workNode, $nodeData );
	} else {
		$workNode = $node->ownerDocument->createElement( $wrapperName );
	}

	$tokens = self::convertDOMtoTokens( $workNode, [] );

	// Remove the typeof attribute from the first token.
	// It will be replaced with mw:DOMFragment.
	$tokens[0]->removeAttribute( 'typeof' );

	// Remove the about attribute from the first token.
	// We want to be able to distinguish when this wrapper was template
	// annotated.
	$tokens[0]->removeAttribute( 'about' );

	return $tokens;
}
449
/**
 * Generates wrapper tokens for a HTML expansion -- the wrapper
 * tokens are placeholders that adequately represent semantics
 * of the HTML DOM for the purposes of additional token transformations
 * that will be applied to them.
 *
 * The first wrapper token is tagged with typeof "mw:DOMFragment" (or
 * "mw:DOMFragment/sealed/<wrapperName>" when unpackOutput is false) and
 * receives $expansion['html'] in its data-parsoid 'html' field.
 *
 * @param Env $env
 *    The active environment/context (not used by this method itself).
 * @param Token $token
 *    The token that generated the DOM.
 * @param array $expansion
 *    - string html HTML of the expansion.
 *    - DocumentFragment domFragment Outermost nodes of the HTML.
 * @param array $opts
 *    - SourceRange tsr
 *      The TSR to set on the generated tokens. This TSR is
 *      used to compute DSR on the placeholder tokens.
 *      The computed DSR is transferred over to the unpacked DOM
 *      if setDSR is true (see below).
 *      Falls back to the tsr of $token's data-parsoid when not given.
 *    - bool setDSR
 *      When the DOM fragment is unpacked, this option governs
 *      whether the DSR from the placeholder node is transferred
 *      over to the unpacked DOM or not.
 *      For example: Cite, reused transclusions.
 *    - bool fromCache
 *    - array pipelineOpts
 *    - bool unpackOutput (defaults to true)
 *    - string wrapperName (read only when unpackOutput is false)
 * @return array<Token|string>
 */
public static function encapsulateExpansionHTML(
	Env $env, Token $token, array $expansion, array $opts
): array {
	// Unpacking is the default
	$opts['unpackOutput'] ??= true;

	// Get placeholder tokens to get our subdom through the token processing
	// stages. These will be finally unwrapped on the DOM.
	$toks = self::getWrapperTokens( $expansion['domFragment'], $opts );
	$first = $toks[0];

	// Add the DOMFragment type so that we get unwrapped later.
	$typeOf = 'mw:DOMFragment';
	if ( !$opts['unpackOutput'] ) {
		$typeOf .= '/sealed/' . $opts['wrapperName'];
	}
	$first->setAttribute( 'typeof', $typeOf );

	// Assign the HTML fragment to the data-parsoid.html on the first wrapper token.
	$first->dataParsoid->html = $expansion['html'];

	// Pass through the setDSR and fromCache flags (in that order)
	$passThrough = [
		'setDSR' => TempData::SET_DSR,
		'fromCache' => TempData::FROM_CACHE,
	];
	foreach ( $passThrough as $optName => $tempFlag ) {
		if ( !empty( $opts[$optName] ) ) {
			$first->dataParsoid->setTempFlag( $tempFlag, $opts[$optName] );
		}
	}

	// Transfer the tsr.
	// The first token gets the full width, the following tokens zero width.
	$tsr = $opts['tsr'] ?? $token->dataParsoid->tsr ?? null;
	if ( !$tsr ) {
		return $toks;
	}

	$first->dataParsoid->tsr = $tsr;
	$first->dataParsoid->extTagOffsets = $token->dataParsoid->extTagOffsets ?? null;
	// XXX to investigate: if $tsr->end is null, then we're losing
	// the 'hint' we'd like to provide here that this is a zero-width
	// source range.
	// ->end can be set to null by WikiLinkHandler::bailTokens()
	$zeroWidth = new SourceRange( $tsr->end, $tsr->end );
	foreach ( array_slice( $toks, 1 ) as $laterTok ) {
		$laterTok->dataParsoid->tsr = clone $zeroWidth;
	}

	return $toks;
}
526
/**
 * Wrap the accumulated nodes in a new <span>.
 *
 * The span is inserted before the first accumulated node (in that node's
 * parent), all accumulated nodes are then moved into it in order, and the
 * span gets a fresh DataParsoid carrying the TempData::WRAPPER flag.
 * The accumulator is emptied afterwards.
 *
 * @param Document $doc Document used to create the span
 * @param array &$textCommentAccum Non-empty list of nodes to wrap;
 *   reset to [] on return
 */
private static function wrapAccum(
	Document $doc, array &$textCommentAccum
): void {
	$firstNode = $textCommentAccum[0];
	$wrapper = $doc->createElement( 'span' );
	$firstNode->parentNode->insertBefore( $wrapper, $firstNode );

	foreach ( $textCommentAccum as $accumulated ) {
		$wrapper->appendChild( $accumulated );
	}

	$wrapperDp = new DataParsoid;
	$wrapperDp->setTempFlag( TempData::WRAPPER );
	DOMDataUtils::setDataParsoid( $wrapper, $wrapperDp );

	$textCommentAccum = [];
}
542
/**
 * Wrap text and comment nodes in a node list into spans, so that all
 * top-level nodes are elements.
 *
 * Consecutive text/comment nodes share a single span. Nodes before
 * $startAt are skipped, and processing stops after $stopAt has been
 * handled. An empty list is a no-op.
 *
 * @param NodeList $nodes List of DOM nodes to wrap, mix of node types.
 * @param ?Node $startAt First node to consider; null means start at the
 *   beginning of the list.
 * @param ?Node $stopAt Last node to consider; null means run to the end
 *   of the list.
 */
public static function addSpanWrappers(
	$nodes,
	?Node $startAt = null,
	?Node $stopAt = null
): void {
	$firstNode = $nodes->item( 0 );
	if ( $firstNode === null ) {
		// Nothing to wrap, and no node to take the owner document from.
		return;
	}
	$doc = $firstNode->ownerDocument;
	$textCommentAccum = [];

	// Build a real array out of nodes.
	//
	// Operating directly on DOM child-nodes array
	// and manipulating them by adding span wrappers
	// changes the traversal itself
	$nodeBuf = [];
	foreach ( $nodes as $node ) {
		$nodeBuf[] = $node;
	}

	$start = ( $startAt === null );
	foreach ( $nodeBuf as $node ) {
		if ( !$start ) {
			if ( $startAt !== $node ) {
				continue;
			}
			$start = true;
		}
		if ( $node instanceof Text || $node instanceof Comment ) {
			$textCommentAccum[] = $node;
		} elseif ( count( $textCommentAccum ) ) {
			// An element ends the current run of text/comment nodes
			self::wrapAccum( $doc, $textCommentAccum );
		}
		if ( $node === $stopAt ) {
			break;
		}
	}

	// Wrap whatever run was still open when the loop ended
	if ( count( $textCommentAccum ) ) {
		self::wrapAccum( $doc, $textCommentAccum );
	}
}
591
/**
 * Convert a HTML5 DOM into a mw:DOMFragment and generate appropriate
 * tokens to insert into the token stream for further processing.
 *
 * The DOMPostProcessor will unpack the fragment and insert the HTML
 * back into the DOM.
 *
 * This is makeExpansion() followed by encapsulateExpansionHTML().
 *
 * @param Env $env
 *    The active environment/context.
 * @param Token $token
 *    The token that generated the DOM.
 * @param DocumentFragment $domFragment
 *    The DOM that the token expanded to.
 * @param array $opts
 *    Options to be passed onto the encapsulation code
 *    See encapsulateExpansionHTML's doc. for more info about these options.
 * @return array<Token|string>
 */
public static function tunnelDOMThroughTokens(
	Env $env, Token $token, DocumentFragment $domFragment, array $opts
): array {
	// Register the fragment, then produce the placeholder tokens that carry
	// our subdom through the token processing stages. These will be finally
	// unwrapped on the DOM.
	return self::encapsulateExpansionHTML(
		$env,
		$token,
		self::makeExpansion( $env, $domFragment ),
		$opts
	);
}
618
/**
 * Register a DOM fragment with the environment under a newly allocated
 * fragment id and describe it as an expansion.
 *
 * @param Env $env
 * @param DocumentFragment $domFragment
 * @return array{domFragment:DocumentFragment,html:mixed} The fragment
 *   itself, plus the fragment id (as returned by Env::newFragmentId())
 *   under the 'html' key.
 */
public static function makeExpansion(
	Env $env, DocumentFragment $domFragment
): array {
	$id = $env->newFragmentId();
	$env->setDOMFragment( $id, $domFragment );

	$expansion = [ 'domFragment' => $domFragment ];
	$expansion['html'] = $id;
	return $expansion;
}
626
/**
 * Walk $node and its following siblings (recursing into elements that are
 * not themselves expansion roots) looking for transclusion / extension
 * output, i.e. elements whose typeof matches mw:Transclusion or
 * mw:Extension/* and that carry an 'about' attribute.
 *
 * NOTE: recording of expansions has not been ported. As soon as an
 * expansion with a non-empty key is found, an UnreachableException is
 * thrown; $expansions is currently never written to.
 *
 * @param Env $env
 * @param array &$expansions Accumulator with 'transclusions', 'extensions'
 *   and 'media' buckets
 * @param ?Node $node First node to inspect; null (e.g. the firstChild of a
 *   childless element) is a no-op
 * @throws UnreachableException when an expansion with a non-empty key is found
 */
private static function doExtractExpansions( Env $env, array &$expansions, ?Node $node ): void {
	while ( $node ) {
		if ( $node instanceof Element ) {
			if (
				DOMUtils::matchTypeOf( $node, '#^mw:(Transclusion$|Extension/)#' ) &&
				$node->hasAttribute( 'about' )
			) {
				$dp = DOMDataUtils::getDataParsoid( $node );
				$about = DOMCompat::getAttribute( $node, 'about' );
				$nodes = WTUtils::getAboutSiblings( $node, $about );

				if ( DOMUtils::hasTypeOf( $node, 'mw:Transclusion' ) ) {
					// Would be stored in $expansions['transclusions']
					$key = $dp->src;
				} elseif ( DOMUtils::matchTypeOf( $node, '#^mw:Extension/#' ) ) {
					// Would be stored in $expansions['extensions']
					$key = $dp->src;
				} else {
					// Would be stored in $expansions['media']
					// XXX gwicke: use proper key that is not
					// source-based? This also needs to work for
					// transclusion output.
					$key = null;
				}

				if ( $key ) {
					// FIXME: makeExpansion return type changed, so the
					// original
					//   <bucket>[$key] = self::makeExpansion( $env, $nodes );
					// could not be carried over.
					throw new UnreachableException( 'Callsite was not ported!' );
				}

				// Skip over the rest of the about-group
				$node = end( $nodes );
			} else {
				self::doExtractExpansions( $env, $expansions, $node->firstChild );
			}
		}
		$node = $node->nextSibling;
	}
}
667
/**
 * Extract transclusion and extension expansions from a DOM.
 *
 * The returned array always has exactly these three buckets:
 *   [
 *     'transclusions' => [],
 *     'extensions' => [],
 *     'media' => [],
 *   ]
 *
 * The buckets are handed by reference to doExtractExpansions(), which
 * walks the children of $body.
 *
 * @param Env $env
 * @param Element $body
 * @return array
 */
public static function extractExpansions( Env $env, Element $body ): array {
	$expansions = array_fill_keys( [ 'transclusions', 'extensions', 'media' ], [] );
	// Kick off the extraction
	self::doExtractExpansions( $env, $expansions, $body->firstChild );
	return $expansions;
}
706
/**
 * Fetches output of encapsulations that return HTML from the legacy parser
 *
 * $source is handed to the data access layer's parseWikitext(). An empty
 * string result yields null; otherwise the result is passed through
 * DOMUtils::stripPWrapper() and parsed into a fragment owned by the
 * top-level document.
 *
 * @param Env $env
 * @param string $source Wikitext to parse
 * @return ?DocumentFragment
 */
public static function fetchHTML( Env $env, string $source ): ?DocumentFragment {
	$html = $env->getDataAccess()->parseWikitext(
		$env->getPageConfig(), $env->getMetadata(), $source
	);
	if ( $html === '' ) {
		return null;
	}
	return DOMUtils::parseHTMLToFragment(
		$env->getTopLevelDoc(), DOMUtils::stripPWrapper( $html )
	);
}
718	}