Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 268 |
|
0.00% |
0 / 19 |
CRAP | |
0.00% |
0 / 1 |
PipelineUtils | |
0.00% |
0 / 268 |
|
0.00% |
0 / 19 |
6480 | |
0.00% |
0 / 1 |
pFragmentToParsoidFragmentMarkers | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 | |||
getDOMFragmentToken | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
12 | |||
processContentInPipeline | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
2 | |||
dumpTplSrc | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 | |||
preparePFragment | |
0.00% |
0 / 12 |
|
0.00% |
0 / 1 |
6 | |||
processTemplateSource | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
6 | |||
expandAttrValueToDOM | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
6 | |||
expandAttrValuesToDOM | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
domAttrsToTagAttrs | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
20 | |||
convertDOMtoTokens | |
0.00% |
0 / 24 |
|
0.00% |
0 / 1 |
56 | |||
getWrapperTokens | |
0.00% |
0 / 49 |
|
0.00% |
0 / 1 |
506 | |||
encapsulateExpansionHTML | |
0.00% |
0 / 20 |
|
0.00% |
0 / 1 |
42 | |||
wrapAccum | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
6 | |||
addSpanWrappers | |
0.00% |
0 / 19 |
|
0.00% |
0 / 1 |
110 | |||
tunnelDOMThroughTokens | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
makeExpansion | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
doExtractExpansions | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
72 | |||
extractExpansions | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 | |||
parseToHTML | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Utils; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Assert\UnreachableException; |
8 | use Wikimedia\Parsoid\Config\Env; |
9 | use Wikimedia\Parsoid\DOM\Comment; |
10 | use Wikimedia\Parsoid\DOM\Document; |
11 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
12 | use Wikimedia\Parsoid\DOM\Element; |
13 | use Wikimedia\Parsoid\DOM\Node; |
14 | use Wikimedia\Parsoid\DOM\NodeList; |
15 | use Wikimedia\Parsoid\DOM\Text; |
16 | use Wikimedia\Parsoid\Fragments\PFragment; |
17 | use Wikimedia\Parsoid\Fragments\WikitextPFragment; |
18 | use Wikimedia\Parsoid\NodeData\DataMw; |
19 | use Wikimedia\Parsoid\NodeData\DataParsoid; |
20 | use Wikimedia\Parsoid\NodeData\TempData; |
21 | use Wikimedia\Parsoid\Tokens\CommentTk; |
22 | use Wikimedia\Parsoid\Tokens\EndTagTk; |
23 | use Wikimedia\Parsoid\Tokens\EOFTk; |
24 | use Wikimedia\Parsoid\Tokens\KV; |
25 | use Wikimedia\Parsoid\Tokens\SelfclosingTagTk; |
26 | use Wikimedia\Parsoid\Tokens\SourceRange; |
27 | use Wikimedia\Parsoid\Tokens\TagTk; |
28 | use Wikimedia\Parsoid\Tokens\Token; |
29 | use Wikimedia\Parsoid\Wt2Html\Frame; |
30 | |
31 | /** |
32 | * This file contains parsing pipeline related utilities. |
33 | */ |
34 | class PipelineUtils { |
35 | // keep in sync with internal_strip_marker in Grammar.pegphp |
36 | public const PARSOID_FRAGMENT_PREFIX = '{{#parsoid\0fragment:'; |
37 | |
/**
 * Flattens a PFragment into a wikitext string in which every embedded
 * (non-wikitext) piece has been replaced by a parsoid fragment marker.
 *
 * The fragment is cast to a WikitextPFragment and split; the pieces at even
 * indices are copied into the output as-is, while each piece at an odd
 * index is recorded in the returned map under a freshly generated marker
 * and the marker is emitted in its place. Marker ids come from a
 * function-static counter, so they keep increasing across calls.
 *
 * @param PFragment $fragment
 * @return array{0:string,1:array<string,PFragment>} An array consisting of
 *   the wikitext string, followed by the marker-to-PFragment map.
 */
public static function pFragmentToParsoidFragmentMarkers( PFragment $fragment ): array {
	static $nextMarkerId = 0;

	$pieces = WikitextPFragment::castFromPFragment( $fragment )->split();
	$numPieces = count( $pieces );

	$markerMap = [];
	$chunks = [ $pieces[0] ];
	$idx = 1;
	while ( $idx < $numPieces ) {
		$marker = self::PARSOID_FRAGMENT_PREFIX . ( $nextMarkerId++ ) . '}}';
		$markerMap[$marker] = $pieces[$idx];
		// Emit the marker followed by the piece that comes after the
		// embedded fragment.
		$chunks[] = $marker;
		$chunks[] = $pieces[$idx + 1];
		$idx += 2;
	}

	return [ implode( '', $chunks ), $markerMap ];
}
57 | |
/**
 * Builds a 'mw:dom-fragment-token' for processing 'content' (an array of
 * tokens) in its own subpipeline all the way to DOM. These tokens will be
 * processed by their own handler (DOMFragmentBuilder) in the last stage of
 * the async pipeline.
 *
 * srcOffsets should always be provided to process top-level page content in a
 * subpipeline. Without it, DSR computation and template wrapping cannot be done
 * in the subpipeline. While unpackDOMFragment can do this on unwrapping, that can
 * be a bit fragile and makes dom-fragments a leaky abstraction by leaking subpipeline
 * processing into the top-level pipeline.
 *
 * @param string|Token|array<Token|string> $content The array of tokens to process.
 * @param SourceRange $srcOffsets Wikitext source offsets (start/end) of these tokens.
 * @param array $opts Parsing options.
 *    - Token token The token that generated the content. Although $opts
 *      defaults to [], this entry is dereferenced unconditionally.
 *    - bool inlineContext Is this DOM fragment used in an inline context?
 *    - bool inPHPBlock Forwarded as the "inPHPBlock" attribute ("1"/"0").
 * @return SelfclosingTagTk
 */
public static function getDOMFragmentToken(
	$content, SourceRange $srcOffsets, array $opts = []
): SelfclosingTagTk {
	$contextTok = $opts['token'];

	$attribs = [];
	$attribs[] = new KV(
		'contextTok', $contextTok, $contextTok->dataParsoid->tsr->expandTsrV()
	);
	$attribs[] = new KV( 'content', $content, $srcOffsets->expandTsrV() );
	// Boolean options travel as "1" / "0" string attribute values.
	$attribs[] = new KV( 'inlineContext', !empty( $opts['inlineContext'] ) ? "1" : "0" );
	$attribs[] = new KV( 'inPHPBlock', !empty( $opts['inPHPBlock'] ) ? "1" : "0" );

	return new SelfclosingTagTk( 'mw:dom-fragment-token', $attribs );
}
88 | |
/**
 * Runs content (wikitext, array of tokens, whatever) through a freshly
 * obtained pipeline whose type and options are given by $opts.
 *
 * @param Env $env The environment/context for the expansion.
 * @param Frame $frame
 *    The parent frame within which the expansion is taking place.
 *    Used for template expansion and source text tracking.
 * @param string|Token|array<Token|string>|DocumentFragment|PFragment $content
 *    How this content is processed depends on what kind of pipeline
 *    is constructed specified by opts.
 * @param array $opts
 *    Processing options that specify pipeline-type, opts, and callbacks.
 *    - string pipelineType
 *    - array pipelineOpts
 *    - array tplArgs - if set, defines parameters for the child frame
 *      - string tplArgs['name']
 *      - KV[] tplArgs['attribs']
 *    - string srcText - if set, defines the source text for the expansion;
 *      otherwise the frame's source text is used
 *    - SourceRange srcOffsets - if set, defines the range within the
 *      source text that $content corresponds to
 *    - bool sol Whether tokens should be processed in start-of-line context.
 *    - bool toplevel Whether the pipeline is considered atTopLevel
 *      (defaults to false)
 * @return array<Token|string>|DocumentFragment (depending on pipeline type)
 */
public static function processContentInPipeline(
	Env $env, Frame $frame, $content, array $opts
) {
	// Build a pipeline of the requested type
	$factory = $env->getPipelineFactory();
	$pipeline = $factory->getPipeline( $opts['pipelineType'], $opts['pipelineOpts'] );

	$initOpts = [
		// NOTE: some pipelines force toplevel to true
		'toplevel' => $opts['toplevel'] ?? false,
		'frame' => $frame,
		'tplArgs' => $opts['tplArgs'] ?? null,
		'srcText' => $opts['srcText'] ?? $frame->getSrcText(),
		'srcOffsets' => $opts['srcOffsets'] ?? null,
	];
	$pipeline->init( $initOpts );

	// Everything is set up: hand the content over to the pipeline.
	$parseOpts = [ 'sol' => $opts['sol'] ];
	return $pipeline->parse( $content, $parseOpts );
}
135 | |
/**
 * Writes a human-readable dump of a template's source via Env::writeDump().
 *
 * This method does not check any flag itself; it is meant to be called
 * when the '--dump tplsrc' flag was set.
 *
 * @param Env $env
 * @param Token $token Token whose serialized data-parsoid 'src' is reported
 *   as the transclusion.
 * @param string $templateName
 * @param string $src The template source to dump.
 * @param bool $fragmentMode Adds a "(FRAGMENT)" label to the dump header.
 */
public static function dumpTplSrc(
	Env $env, Token $token, string $templateName, string $src,
	bool $fragmentMode = false
): void {
	$codec = DOMDataUtils::getCodec( $env->getTopLevelDoc() );
	$dp = $codec->toJsonArray( $token->dataParsoid, DataParsoid::class );

	$bar = str_repeat( '=', 28 );
	$rule = str_repeat( '-', 80 ) . "\n";
	$modeLabel = $fragmentMode ? '(FRAGMENT)' : '';

	$header = $bar . " template source " . $modeLabel . $bar . "\n";
	$summary = 'TEMPLATE:' . $templateName . 'TRANSCLUSION:' .
		PHPUtils::jsonEncode( $dp['src'] ) . "\n";

	// Layout: header, summary, rule, source, rule.
	$env->writeDump( $header . $summary . $rule . $src . "\n" . $rule );
}
154 | |
/**
 * Converts a PFragment to marker-bearing wikitext, registers its embedded
 * fragments with the environment, and works out which frame and source
 * offsets the wikitext should be processed with.
 *
 * @param Env $env Receives the marker-to-PFragment map.
 * @param Frame $frame The current frame.
 * @param PFragment $pFragment
 * @param array $opts
 *   - SourceRange srcOffsets Fallback used when the fragment itself has
 *     no source offsets.
 *   - bool processInNewFrame If truthy, a child frame is created whose
 *     source text is the generated wikitext, and the offsets cover
 *     that whole text.
 * @return array{frame:Frame,wikitext:string,srcOffsets:?SourceRange}
 */
public static function preparePFragment(
	Env $env,
	Frame $frame,
	PFragment $pFragment,
	array $opts
): array {
	$markerInfo = self::pFragmentToParsoidFragmentMarkers( $pFragment );
	$wikitext = $markerInfo[0];
	$fragmentsByMarker = $markerInfo[1];

	// FUTURE WORK: Fragment should probably contain a Frame pointer as
	// well, since srcOffsets are only meaningful in relation to a specific
	// Frame::$srcText.  When that happens, we should assign an appropriate
	// $frame here.
	$srcOffsets = $pFragment->getSrcOffsets();
	if ( $srcOffsets === null ) {
		$srcOffsets = $opts['srcOffsets'] ?? null;
	}

	$useNewFrame = !empty( $opts['processInNewFrame'] );
	if ( $useNewFrame ) {
		$frame = $frame->newChild( $frame->getTitle(), [], $wikitext );
		$srcOffsets = new SourceRange( 0, strlen( $wikitext ) );
	}

	$env->addToPFragmentMap( $fragmentsByMarker );

	$prepared = [];
	$prepared['frame'] = $frame;
	$prepared['wikitext'] = $wikitext;
	$prepared['srcOffsets'] = $srcOffsets;
	return $prepared;
}
179 | |
/**
 * Processes template source text in a nested
 * 'wikitext-to-expanded-tokens' pipeline and returns the result.
 *
 * @param Env $env
 * @param Frame $frame Parent frame for the nested pipeline.
 * @param Token $token Not used by this method's body.
 * @param ?array $tplArgs Passed through as the pipeline's 'tplArgs'.
 * @param string $src The source to process; '' yields an empty array
 *   without building a pipeline.
 * @param array $opts
 *   - bool expandTemplates (default false)
 *   - mixed extTag (default null)
 * @return array
 */
public static function processTemplateSource(
	Env $env, Frame $frame, Token $token, ?array $tplArgs,
	string $src, array $opts = []
): array {
	// Nothing to process for empty source.
	if ( $src === '' ) {
		return [];
	}

	// Get a nested transformation pipeline for the wikitext that takes
	// us through stages 1-2, with the appropriate pipeline options set.
	//
	// Simply returning the tokenized source here (which may be correct
	// when using the legacy preprocessor because we don't expect to
	// tokenize any templates or include directives so skipping those
	// handlers should be ok) won't work since the options for the pipeline
	// we're in probably aren't what we want.
	$pipelineOpts = [
		'inTemplate' => true,
		// FIXME: In reality, this is broken for parser tests where
		// we expand templates natively. We do want all nested templates
		// to be expanded. But, setting this to !usePHPPreProcessor seems
		// to break a number of tests. Not pursuing this line of enquiry
		// for now since this parserTests vs production distinction will
		// disappear with parser integration. We'll just bear the stench
		// till that time.
		//
		// NOTE: No expansion required for nested templates.
		'expandTemplates' => $opts['expandTemplates'] ?? false,
		'extTag' => $opts['extTag'] ?? null,
	];

	// HEADS UP: You might be wondering why we are forcing "sol" => true without
	// using information about whether the transclusion is used in a SOL context.
	//
	// Ex: "foo {{1x|*bar}}"  Here, "*bar" is not in SOL context relative to the
	// top-level page and so, should it actually be parsed as a list item?
	//
	// So, there is a use-case where one could argue that the sol value here
	// should be conditioned on the page-level context where "{{1x|*bar}}" showed
	// up. So, in this example "foo {{1x|*bar}}", sol would be false and in this
	// example "foo\n{{1x|*bar}}", sol would be true. That is effectively how
	// the legacy parser behaves. (Ignore T2529 for the moment.)
	//
	// But, Parsoid is a different beast. Since the Parsoid/JS days, templates
	// have been processed asynchronously. So, {{1x|*bar}} would be expanded and
	// tokenized before even its preceding context might have been processed.
	// From the start, Parsoid has aimed to decouple the processing of fragment
	// generators (be it templates, extensions, or something else) from the
	// processing of the page they are embedded in. This has been the
	// starting point of many a wikitext 2.0 proposal on mediawiki.org;
	// see also [[mw:Parsing/Notes/Wikitext_2.0#Implications_of_this_model]].
	//
	// The main performance implication is that you can process a transclusion
	// concurrently *and* cache the output of {{1x|*bar}} since its output is
	// the same no matter where on the page it appears. Without this decoupled
	// model, if you got "{{mystery-template-that-takes-30-secs}}{{1x|*bar}}"
	// you have to wait 30 secs before you get to expand {{1x|*bar}}
	// because you have to wait and see whether the mystery template will
	// leave you in SOL state or non-SOL state.
	//
	// In a stroke of good luck, wikitext editors seem to have agreed
	// that it is better for all templates to be expanded in a
	// consistent SOL state and not be dependent on their context;
	// turn now to phab task T2529 which (via a fragile hack) tried
	// to ensure that every template which started with
	// start-of-line-sensitive markup was evaluated in a
	// start-of-line context (by hackily inserting a newline). Not
	// everyone was satisfied with this hack (see T14974), but it's
	// been the way things work for over a decade now (as evidenced
	// by T14974 never having been "fixed").
	//
	// So, while we've established we would prefer *not* to use page
	// context to set the initial SOL value for tokenizing the
	// template, what *should* the initial SOL value be?
	//
	// * Treat every transclusion as a fresh document starting in SOL
	//   state, ie set "sol" => true always. This is supported by
	//   most current wiki use, and is the intent behind the original
	//   T2529 hack (although that hack left a number of edge cases,
	//   described below).
	//
	// * Use `"sol" => false` for templates -- this was the solution
	//   rejected by the original T2529 as being contrary to editor
	//   expectations.
	//
	// * In the future, one might allow the template itself to
	//   specify what its initial SOL state should be, using a
	//   mechanism similar to what might be necessary for typed
	//   templates. This could also address T14974. This is not
	//   excluded by Parsoid at this point; but it would probably be
	//   signaled by a template "return type" which is *not* DOM
	//   therefore the template wouldn't get parsed "as wikitext"
	//   (ie, T14974 wants an "attribute-value" return type which is
	//   a plain string, and some of the wikitext 2.0 proposals
	//   anticipate an "attribute name/value" dictionary as a possible
	//   return type).
	//
	// In support of using sol=>true as the default initial state,
	// let's examine the sol-sensitive wikitext constructs, and
	// implicitly the corner cases left open by the T2529 hack. (For
	// non-sol-sensitive constructs, the initial SOL state is
	// irrelevant.)
	//
	// - SOL-sensitive constructs include lists, headings, indent-pre,
	//   and table syntax.
	// - Of these, only lists, headings, and table syntax are actually handled in
	//   the PEG tokenizer and are impacted by SOL state.
	// - Indent-Pre has its own handler that operates in a full page token context
	//   and isn't impacted.
	// - T2529 effectively means for *#:; (lists) and {| (table start), newlines
	//   are added which means no matter what value we set here, they will get
	//   processed in sol state.
	// - This leaves us with headings (=), table heading (!), table row (|), and
	//   table close (|}) syntax that would be impacted by what we set here.
	// - Given that table row/heading/close templates are very very common on wikis
	//   and used for constructing complex tables, sol => true will let us handle
	//   those without hacks. We aren't fully off the hook there -- see the code
	//   in TokenStreamPatcher, AttributeExpander, TableFixups that all exist
	//   to work around the fact that decoupled processing isn't the wikitext
	//   default. But, without sol => true, we'll likely be in deeper trouble.
	// - But, this can cause some occasional bad parses where "=|!" aren't meant
	//   to be processed as a sol-wikitext construct.
	// - Note also that the workaround for T14974 (ie, the T2529 hack applying
	//   where sol=false is actually desired) has traditionally been to add an
	//   initial <nowiki/> which ensures that the "T2529 characters" are not
	//   initial. There are a number of alternative mechanisms to accomplish
	//   this (ie, HTML-encode the first character).
	//
	// To honor the spirit of T2529 it seems plausible to try to lint
	// away the remaining corner cases where T2529 does *not* result
	// in start-of-line state for template expansion, and to use the
	// various workarounds for compatibility in the meantime.
	//
	// We should also pick *one* of the workarounds for T14974
	// (probably `<nowiki/>` at the first position in the template),
	// support that (until a better mechanism exists), and (if
	// possible) lint away any others.
	$sol = true;

	$processingOpts = [
		'pipelineType' => 'wikitext-to-expanded-tokens',
		'pipelineOpts' => $pipelineOpts,
		'srcText' => $src,
		'srcOffsets' => new SourceRange( 0, strlen( $src ) ),
		'tplArgs' => $tplArgs,
		'sol' => $sol,
	];

	return self::processContentInPipeline( $env, $frame, $src, $processingOpts );
}
330 | |
/**
 * Expands a single attribute value all the way to DOM and serializes the
 * result back into the value's "html" entry.
 *
 * @param Env $env
 *    The environment/context for the expansion.
 * @param Frame $frame
 *    The parent frame within which the expansion is taking place.
 *    Used for template expansion and source text tracking.
 * @param array $v
 *    The value to process.
 *    The value is expected to be an associative array with a "html" property.
 *    The html property is expanded to DOM only if it is an array (of tokens).
 *    Non-arrays are passed back unexpanded.
 * @param bool $expandTemplates
 *    Should any templates encountered here be expanded
 *    (usually false for nested templates since they are never directly editable).
 * @param bool $inTemplate
 *    Unexpanded templates can occur in the content of extension tags.
 * @return array The value, with "srcOffsets" removed and, if "html" was a
 *    token array, with "html" replaced by the serialized fragment.
 */
public static function expandAttrValueToDOM(
	Env $env, Frame $frame, array $v, bool $expandTemplates, bool $inTemplate
): array {
	$html = $v['html'] ?? null;

	if ( is_array( $html ) ) {
		// Options for the nested tokens-to-fragment pipeline
		$pipelineOpts = [
			'attrExpansion' => true,
			'inlineContext' => true,
			'expandTemplates' => $expandTemplates,
			'inTemplate' => $inTemplate
		];
		$processingOpts = [
			'pipelineType' => 'expanded-tokens-to-fragment',
			'pipelineOpts' => $pipelineOpts,
			'srcOffsets' => $v['srcOffsets'],
			'sol' => true
		];

		// The token stream handed to the pipeline is terminated with EOF.
		$tokens = array_merge( $html, [ new EOFTk() ] );
		$fragment = self::processContentInPipeline( $env, $frame, $tokens, $processingOpts );

		// Since we aren't at the top level, data attrs
		// were not applied in cleanup.  However, tmp
		// was stripped.
		$serializeOpts = [ 'innerXML' => true, 'fragment' => true ];
		$v['html'] = ContentUtils::ppToXML( $fragment, $serializeOpts );
	}

	// Remove srcOffsets after value is expanded, so they don't show
	// up in the output data-mw attribute
	unset( $v['srcOffsets'] );

	return $v;
}
383 | |
/**
 * Expands each value in $vals all the way to DOM by delegating to
 * expandAttrValueToDOM().
 *
 * @param Env $env
 *    The environment/context for the expansion.
 * @param Frame $frame
 *    The parent frame within which the expansion is taking place.
 *    Used for template expansion and source text tracking.
 * @param array $vals
 *    Array of values to expand.
 *    Non-array elements of $vals are passed back unmodified.
 *    If an array element, it is expected to be an associative array with a "html" property.
 *    The html property is expanded to DOM only if it is an array (of tokens).
 * @param bool $expandTemplates
 *    Should any templates encountered here be expanded
 *    (usually false for nested templates since they are never directly editable).
 * @param bool $inTemplate
 *    Unexpanded templates can occur in the content of extension tags.
 * @return array A list (re-indexed from 0) with one entry per element of $vals.
 */
public static function expandAttrValuesToDOM(
	Env $env, $frame, array $vals, bool $expandTemplates, bool $inTemplate
): array {
	$ret = [];
	foreach ( $vals as $v ) {
		// expandAttrValueToDOM() declares its value parameter as `array`, so
		// forwarding a non-array would raise a TypeError. Honor the documented
		// contract instead and hand such elements back untouched.
		$ret[] = is_array( $v )
			? self::expandAttrValueToDOM( $env, $frame, $v, $expandTemplates, $inTemplate )
			: $v;
	}
	return $ret;
}
411 | |
/**
 * Collects what is needed to build a tag token for a DOM element: its
 * attributes as KV pairs plus its data-parsoid / data-mw objects. The node
 * comes from a DOM whose data attributes are stored outside the DOM.
 *
 * The attribute named DOMDataUtils::DATA_OBJECT_ATTR_NAME is left out of
 * the KV list, and an empty data-mw is reported as null.
 *
 * @param Element $node
 * @param array<string,string> $attrs
 * @return array{attrs:KV[],dataParsoid:?DataParsoid,dataMw:?DataMw}
 */
private static function domAttrsToTagAttrs( Element $node, array $attrs ): array {
	$kvs = [];
	foreach ( $attrs as $attrName => $attrValue ) {
		if ( $attrName === DOMDataUtils::DATA_OBJECT_ATTR_NAME ) {
			continue;
		}
		$kvs[] = new KV( $attrName, $attrValue );
	}

	$dataMw = DOMDataUtils::getDataMw( $node );
	$dataParsoid = DOMDataUtils::getDataParsoid( $node );
	if ( $dataMw->isEmpty() ) {
		$dataMw = null;
	}

	return [
		'attrs' => $kvs,
		'dataParsoid' => $dataParsoid,
		'dataMw' => $dataMw,
	];
}
434 | |
/**
 * Recursively converts a DOM subtree to tokens, appending them to $tokBuf.
 * Data attributes for nodes are stored outside the DOM.
 *
 * Void elements become a single self-closing tag token; other elements
 * become a start tag, the tokens of their children, and an end tag. Text
 * goes through TokenUtils::newlinesToNlTks() and comments become CommentTk.
 *
 * @param Node $node The root of the DOM tree to convert to tokens
 * @param array<Token|string> $tokBuf Tokens accumulated so far
 * @return array $tokBuf with the tokens for $node appended
 */
private static function convertDOMtoTokens( Node $node, array $tokBuf ): array {
	if ( $node instanceof Text ) {
		PHPUtils::pushArray( $tokBuf, TokenUtils::newlinesToNlTks( $node->nodeValue ) );
		return $tokBuf;
	}

	if ( $node instanceof Comment ) {
		$tokBuf[] = new CommentTk( $node->nodeValue );
		return $tokBuf;
	}

	if ( !( $node instanceof Element ) ) {
		// getWrapperTokens calls convertDOMtoTokens with an Element
		// and children of dom elements are always text/comment/elements
		// which are all covered above.
		throw new UnreachableException( "Should never get here!" );
	}

	$tagName = DOMCompat::nodeName( $node );
	$info = self::domAttrsToTagAttrs( $node, DOMUtils::attributes( $node ) );

	if ( Utils::isVoidElement( $tagName ) ) {
		$tokBuf[] = new SelfclosingTagTk(
			$tagName, $info['attrs'], $info['dataParsoid'], $info['dataMw']
		);
		return $tokBuf;
	}

	$tokBuf[] = new TagTk(
		$tagName, $info['attrs'], $info['dataParsoid'], $info['dataMw']
	);

	$child = $node->firstChild;
	while ( $child ) {
		$tokBuf = self::convertDOMtoTokens( $child, $tokBuf );
		$child = $child->nextSibling;
	}

	$closeTag = new EndTagTk( $tagName );
	// Keep stx parity
	if ( WTUtils::isLiteralHTMLNode( $node ) ) {
		$closeTag->dataParsoid->stx = 'html';
	}
	$tokBuf[] = $closeTag;

	return $tokBuf;
}
480 | |
481 | /**
482 | * Get tokens representing a DOM forest (from transclusions, extensions,
483 | * whatever that were generated as part of a separate processing pipeline)
484 | * in the token stream. These tokens will tunnel the subtree through the
485 | * token processing while preserving token stream semantics as if
486 | * the DOM had been converted to tokens.
487 | *
488 | * @param DocumentFragment $domFragment List of DOM nodes that need to be tunneled through.
489 | * @param array $opts Reads $opts['pipelineOpts']['inlineContext'] and $opts['unpackOutput'].
490 | * @see encapsulateExpansionHTML's doc. for more info about these options.
491 | * @return array<Token|string> List of token representatives (an empty span pair if the fragment has no children).
492 | */
493 | private static function getWrapperTokens(
494 | DocumentFragment $domFragment, array $opts
495 | ): array {
496 | if ( !$domFragment->hasChildNodes() ) {
497 | return [ new TagTk( 'span' ), new EndTagTk( 'span' ) ];
498 | }
499 | 
500 | $node = $domFragment->firstChild;
501 | 
502 | // Do we represent this with inline or block elements?
503 | // This is to ensure that we get p-wrapping correct.
504 | //
505 | // * If all content is inline, we use inline-elements to represent this
506 | // so that this content gets swallowed into the P tag that wraps
507 | // adjacent inline content.
508 | //
509 | // * If any part of this is a block content, we treat extension content
510 | // independent of surrounding content and don't want inline content
511 | // here to be swallowed into a P tag that wraps adjacent inline content.
512 | //
513 | // This behavior ensures that we and clients can "drop-in" extension content
514 | // into the DOM without messing with fixing up paragraph tags of surrounding
515 | // content. It could potentially introduce minor rendering differences when
516 | // compared to PHP parser output, but we'll swallow it for now.
517 | $wrapperType = 'INLINE';
518 | if ( !empty( $opts['pipelineOpts']['inlineContext'] ) ) {
519 | // If the DOM fragment is being processed in the context where P wrapping
520 | // has been suppressed, we represent the DOM fragment with inline-tokens.
521 | //
522 | // FIXME(SSS): Looks like we have some "impedance mismatch" here. But, this
523 | // is correct in scenarios where link-content or image-captions are being
524 | // processed in a sub-pipeline and we don't want a <div> in the link-caption
525 | // to cause the <a>..</a> to get split apart.
526 | //
527 | // Filed as T49963
528 | } elseif ( !$opts['unpackOutput'] ) {
529 | // Fragments that won't be unpacked aren't amenable to inspection, since
530 | // the ultimate content is unknown. For example, refs shuttle content
531 | // through treebuilding that ends up in the references list.
532 | //
533 | // FIXME(arlolra): Do we need a mechanism to specify content
534 | // categories?
535 | } else {
536 | foreach ( $domFragment->childNodes as $n ) {
537 | if (
538 | DOMUtils::isWikitextBlockNode( $n ) ||
539 | DOMUtils::hasBlockElementDescendant( $n )
540 | ) {
541 | $wrapperType = 'BLOCK';
542 | break;
543 | }
544 | }
545 | }
546 | 
547 | $wrapperName = null;
548 | if ( $wrapperType === 'BLOCK' && !DOMUtils::isWikitextBlockNode( $node ) ) {
549 | $wrapperName = 'div';
550 | } elseif ( DOMCompat::nodeName( $node ) === 'a' ) {
551 | // Do not use 'A' as a wrapper node because it could
552 | // end up getting nested inside another 'A' and the DOM
553 | // structure can change where the wrapper tokens are no
554 | // longer siblings.
555 | // Ex: "[http://foo.com Bad nesting [[Here]]]".
556 | $wrapperName = 'span';
557 | } elseif (
558 | in_array( DOMCompat::nodeName( $node ), [ 'style', 'script' ], true ) &&
559 | ( $node->nextSibling !== null )
560 | ) {
561 | // <style>/<script> tags are not fostered, so if we're wrapping
562 | // more than a single node, they aren't a good representation for
563 | // the content. It can lead to fosterable content being inserted
564 | // in a fosterable position after treebuilding is done, which isn't
565 | // roundtrippable.
566 | $wrapperName = 'span';
567 | } elseif ( !( $node instanceof Element ) ) {
568 | $wrapperName = 'span';
569 | } else {
570 | $wrapperName = DOMCompat::nodeName( $node );
571 | }
572 | 
573 | if ( $node instanceof Element ) {
574 | Assert::invariant(
575 | // No need to look for data-mw as well.
576 | // Nodes that have data-mw also have data-parsoid.
577 | !$node->hasAttribute( 'data-parsoid' ),
578 | "Expected node to have its data attributes loaded" );
579 | 
580 | $nodeData = clone DOMDataUtils::getNodeData( $node );
581 | 
582 | if ( $wrapperName !== DOMCompat::nodeName( $node ) ) {
583 | // Create a copy of the node without children
584 | $workNode = $node->ownerDocument->createElement( $wrapperName );
585 | 
586 | // Copy over attributes
587 | foreach ( DOMUtils::attributes( $node ) as $name => $value ) {
588 | // "typeof" is ignored since it'll be removed below.
589 | if ( $name !== 'typeof' ) {
590 | $workNode->setAttribute( $name, $value );
591 | }
592 | }
593 | 
594 | // We are applying a different wrapper.
595 | // So, node's data-parsoid isn't applicable.
596 | $nodeData->parsoid = new DataParsoid;
597 | } else {
598 | // Shallow clone since we don't want to convert the whole tree to tokens.
599 | $workNode = $node->cloneNode( false );
600 | 
601 | // Reset 'tsr' since it isn't applicable. Neither is
602 | // any auxiliary info like 'endTSR'.
603 | // FIXME: The above comment is only true if we are reusing
604 | // DOM fragments from cache from previous revisions in
605 | // incremental parsing scenarios. See T98992
606 | if ( isset( $nodeData->parsoid->tsr ) ) {
607 | $nodeData->parsoid->tsr = null;
608 | }
609 | if ( isset( $nodeData->parsoid->tmp->endTSR ) ) {
610 | unset( $nodeData->parsoid->tmp->endTSR );
611 | }
612 | 
613 | // The "in transclusion" flag was set on the first child for template
614 | // wrapping in the nested pipeline, and doesn't apply to the dom
615 | // fragment wrapper in this pipeline. Keeping it around can induce
616 | // template wrapping of a foster box if the dom fragment is found in
617 | // a fosterable position.
618 | if (
619 | $nodeData->parsoid !== null &&
620 | $nodeData->parsoid->getTempFlag( TempData::IN_TRANSCLUSION )
621 | ) {
622 | $nodeData->parsoid->tmp->setFlag( TempData::IN_TRANSCLUSION, false );
623 | }
624 | // Similarly for "fostered", it applies to the nested pipeline and,
625 | // if transferred, can interfere when unpacking
626 | if ( isset( $nodeData->parsoid->fostered ) ) {
627 | unset( $nodeData->parsoid->fostered );
628 | }
629 | 
630 | // Note that the TempData::WRAPPER flag may be transferred to the
631 | // fragment wrapper. Depending on the contents of the fragment,
632 | // it's questionable if that's truly representative. Our modeling
633 | // based on the first node of the fragment has limitations.
634 | }
635 | 
636 | DOMDataUtils::setNodeData( $workNode, $nodeData );
637 | } else {
638 | $workNode = $node->ownerDocument->createElement( $wrapperName );
639 | }
640 | 
641 | $tokens = self::convertDOMtoTokens( $workNode, [] );
642 | 
643 | // Remove the typeof attribute from the first token.
644 | // It will be replaced with mw:DOMFragment.
645 | $tokens[0]->removeAttribute( 'typeof' );
646 | 
647 | // Remove the about attribute from the first token.
648 | // We want to be able to distinguish when this wrapper was template
649 | // annotated.
650 | $tokens[0]->removeAttribute( 'about' );
651 | 
652 | return $tokens;
653 | }
654 | |
/**
 * Build the wrapper (placeholder) tokens that carry an expanded DOM
 * fragment through the remaining token transformations. The first
 * wrapper token is typed as a mw:DOMFragment so that the fragment is
 * unwrapped again once we are back on the DOM.
 *
 * @param Env $env
 *   The active environment/context.
 * @param Token $token
 *   The token that generated the DOM. Its data-parsoid supplies the
 *   fallback tsr and the extTagOffsets copied to the first wrapper token.
 * @param array $expansion
 *   - DocumentFragment domFragment Outermost nodes of the expansion;
 *     handed to getWrapperTokens().
 *   - string html Value stored in data-parsoid.html of the first
 *     wrapper token (makeExpansion() puts a fragment id here).
 * @param array $opts
 *   - SourceRange tsr
 *     The TSR to set on the generated tokens; when absent, the tsr of
 *     $token is used. This TSR is used to compute DSR on the
 *     placeholder tokens. The computed DSR is transferred over to the
 *     unpacked DOM if setDSR is true (see below).
 *   - bool setDSR
 *     When the DOM fragment is unpacked, this option governs
 *     whether the DSR from the placeholder node is transferred
 *     over to the unpacked DOM or not.
 *     For example: Cite, reused transclusions.
 *     Recorded as TempData::SET_DSR on the first wrapper token.
 *   - bool fromCache
 *     Recorded as TempData::FROM_CACHE on the first wrapper token.
 *   - array pipelineOpts
 *     Not read by this method.
 *   - bool unpackOutput
 *     Defaults to true. When false the typeof becomes
 *     mw:DOMFragment/sealed/<wrapperName>.
 *   - string wrapperName
 *     Read here only when unpackOutput is false.
 *   The whole array is also forwarded to getWrapperTokens().
 * @return array<Token|string>
 */
public static function encapsulateExpansionHTML(
	Env $env, Token $token, array $expansion, array $opts
): array {
	// Unpacking is the default behaviour unless the caller opted out.
	$opts['unpackOutput'] ??= true;

	// Placeholder tokens that shepherd the sub-DOM through the token
	// stages; they get unwrapped for good once we reach the DOM.
	$wrapperTokens = self::getWrapperTokens( $expansion['domFragment'], $opts );
	$head = $wrapperTokens[0];

	// Tag the first token as a DOM fragment so that it is unwrapped later.
	$typeOf = 'mw:DOMFragment';
	if ( !$opts['unpackOutput'] ) {
		$typeOf .= '/sealed/' . $opts['wrapperName'];
	}
	$head->setAttribute( 'typeof', $typeOf );

	// The fragment travels via data-parsoid.html of the first wrapper token.
	$headDp = $head->dataParsoid;
	$headDp->html = $expansion['html'];

	// Forward the setDSR / fromCache options as temporary flags.
	if ( !empty( $opts['setDSR'] ) ) {
		$headDp->setTempFlag( TempData::SET_DSR, $opts['setDSR'] );
	}
	if ( !empty( $opts['fromCache'] ) ) {
		$headDp->setTempFlag( TempData::FROM_CACHE, $opts['fromCache'] );
	}

	// Without any source range there is nothing left to transfer.
	$sourceRange = $opts['tsr'] ?? $token->dataParsoid->tsr ?? null;
	if ( !$sourceRange ) {
		return $wrapperTokens;
	}

	// The first token spans the full width ...
	$headDp->tsr = $sourceRange;
	$headDp->extTagOffsets = $token->dataParsoid->extTagOffsets ?? null;

	// ... and every following token is zero width, anchored at the end.
	// XXX to investigate: if $sourceRange->end is null, then we're losing
	// the 'hint' we'd like to provide here that this is a zero-width
	// source range.
	// ->end can be set to null by WikiLinkHandler::bailTokens()
	$zeroWidth = new SourceRange( $sourceRange->end, $sourceRange->end );
	$total = count( $wrapperTokens );
	for ( $idx = 1; $idx < $total; $idx++ ) {
		$wrapperTokens[$idx]->dataParsoid->tsr = clone $zeroWidth;
	}

	return $wrapperTokens;
}
731 | |
/**
 * Move the accumulated nodes into a new <span> that is inserted in
 * front of the first accumulated node, flag that span as a wrapper
 * and reset the accumulator.
 *
 * @param Document $doc Document used to create the span.
 * @param array &$textCommentAccum Nodes to wrap; must be non-empty
 *   (index 0 is dereferenced). Emptied on return.
 */
private static function wrapAccum(
	Document $doc, array &$textCommentAccum
): void {
	$firstAccumulated = $textCommentAccum[0];

	// The span takes the place of the first accumulated node ...
	$wrapper = $doc->createElement( 'span' );
	$firstAccumulated->parentNode->insertBefore( $wrapper, $firstAccumulated );

	// ... and then adopts every accumulated node, in order.
	foreach ( $textCommentAccum as $accumulated ) {
		$wrapper->appendChild( $accumulated );
	}

	// Flag the span as a wrapper through its data-parsoid.
	$wrapperDp = new DataParsoid;
	$wrapperDp->setTempFlag( TempData::WRAPPER );
	DOMDataUtils::setDataParsoid( $wrapper, $wrapperDp );

	$textCommentAccum = [];
}
747 | |
/**
 * Wrap text and comment nodes in a node list into spans, so that all
 * top-level nodes are elements. Consecutive text/comment nodes share
 * a single span.
 *
 * An empty node list is a no-op.
 *
 * @param NodeList $nodes List of DOM nodes to wrap, mix of node types.
 * @param ?Node $startAt Node at which processing starts (inclusive);
 *   null starts at the first node of the list.
 * @param ?Node $stopAt Node after which processing stops (inclusive);
 *   null processes up to the end of the list.
 */
public static function addSpanWrappers(
	$nodes,
	?Node $startAt = null,
	?Node $stopAt = null
): void {
	// item( 0 ) is null for an empty list: there is nothing to wrap and
	// no node to take the owner document from.
	$firstNode = $nodes->item( 0 );
	if ( $firstNode === null ) {
		return;
	}
	$doc = $firstNode->ownerDocument;
	$textCommentAccum = [];

	// Build a real array out of nodes.
	//
	// Operating directly on DOM child-nodes array
	// and manipulating them by adding span wrappers
	// changes the traversal itself
	$nodeBuf = [];
	foreach ( $nodes as $node ) {
		$nodeBuf[] = $node;
	}

	$start = ( $startAt === null );
	foreach ( $nodeBuf as $node ) {
		// Skip everything in front of $startAt.
		if ( !$start ) {
			if ( $startAt !== $node ) {
				continue;
			}
			$start = true;
		}
		if ( $node instanceof Text || $node instanceof Comment ) {
			$textCommentAccum[] = $node;
		} elseif ( count( $textCommentAccum ) ) {
			// Any other node ends the current run of text/comment nodes.
			self::wrapAccum( $doc, $textCommentAccum );
		}
		if ( $node === $stopAt ) {
			break;
		}
	}

	// Flush a run that reached the end of the list (or $stopAt).
	if ( count( $textCommentAccum ) ) {
		self::wrapAccum( $doc, $textCommentAccum );
	}
}
796 | |
/**
 * Turn a HTML5 DOM fragment into a mw:DOMFragment expansion and
 * produce the tokens to insert into the token stream for further
 * processing.
 *
 * The DOMProcessorPipeline will unpack the fragment and insert the HTML
 * back into the DOM.
 *
 * @param Env $env
 *   The active environment/context.
 * @param Token $token
 *   The token that generated the DOM.
 * @param DocumentFragment $domFragment
 *   The DOM that the token expanded to.
 * @param array $opts
 *   Options handed to the encapsulation code unchanged.
 *   See encapsulateExpansionHTML's doc. for more info about these options.
 * @return array<Token|string>
 */
public static function tunnelDOMThroughTokens(
	Env $env, Token $token, DocumentFragment $domFragment, array $opts
): array {
	// Register the fragment as an expansion, then wrap it in placeholder
	// tokens that survive the token stages and are unwrapped on the DOM.
	return self::encapsulateExpansionHTML(
		$env,
		$token,
		self::makeExpansion( $env, $domFragment ),
		$opts
	);
}
823 | |
/**
 * Register a DOM fragment with the environment under a newly
 * allocated fragment id and describe it as an expansion.
 *
 * @param Env $env
 * @param DocumentFragment $domFragment
 * @return array
 *   - DocumentFragment domFragment The fragment passed in.
 *   - string html The fragment id the fragment was registered under.
 */
public static function makeExpansion(
	Env $env, DocumentFragment $domFragment
): array {
	$id = $env->newFragmentId();
	$env->setDOMFragment( $id, $domFragment );

	$expansion = [];
	$expansion['domFragment'] = $domFragment;
	$expansion['html'] = $id;
	return $expansion;
}
831 | |
/**
 * Walk $node and its following siblings looking for transclusion and
 * extension wrappers (elements with a matching typeof and an about
 * attribute). Elements that are not such wrappers are descended into;
 * for a wrapper, the walk resumes after the last of its about siblings.
 *
 * @param Env $env
 * @param array &$expansions Accumulator with 'transclusions',
 *   'extensions' and 'media' keys.
 * @param ?Node $node First node to look at. Null is a no-op, which is
 *   what the recursion passes for the firstChild of a childless element.
 * @throws UnreachableException When a wrapper with a non-empty
 *   data-parsoid src is found: recording the expansion was never ported.
 */
private static function doExtractExpansions( Env $env, array &$expansions, ?Node $node ): void {
	$nodes = null;
	$expAccum = null;
	while ( $node ) {
		if ( $node instanceof Element ) {
			if ( DOMUtils::matchTypeOf( $node, '#^mw:(Transclusion$|Extension/)#' ) &&
				$node->hasAttribute( 'about' )
			) {
				$dp = DOMDataUtils::getDataParsoid( $node );
				$about = DOMCompat::getAttribute( $node, 'about' );
				$nodes = WTUtils::getAboutSiblings( $node, $about );
				$key = null;
				// NOTE: $expAccum is a by-value copy of the sub-array, so
				// the commented-out write below would not reach $expansions
				// as written.
				if ( DOMUtils::hasTypeOf( $node, 'mw:Transclusion' ) ) {
					$expAccum = $expansions['transclusions'];
					$key = $dp->src;
				} elseif ( DOMUtils::matchTypeOf( $node, '#^mw:Extension/#' ) ) {
					$expAccum = $expansions['extensions'];
					$key = $dp->src;
				} else {
					$expAccum = $expansions['media'];
					// XXX gwicke: use proper key that is not
					// source-based? This also needs to work for
					// transclusion output.
					$key = null;
				}

				if ( $key ) {
					throw new UnreachableException( 'Callsite was not ported!' );
					// FIXME: makeExpansion return type changed
					// $expAccum[$key] = self::makeExpansion( $env, $nodes );
				}

				// Continue after the last node of this about group.
				$node = end( $nodes );
			} else {
				// firstChild is null for a childless element, hence the
				// nullable $node parameter.
				self::doExtractExpansions( $env, $expansions, $node->firstChild );
			}
		}
		$node = $node->nextSibling;
	}
}
872 | |
/**
 * Extract transclusion and extension expansions from a DOM, and return
 * them in a structure like this:
 *   [
 *     'transclusions' => [ 'key1' => <expansion>, ... ],
 *     'extensions' => [ 'key2' => <expansion>, ... ],
 *     'media' => [ 'key3' => <expansion>, ... ],
 *   ]
 *
 * NOTE: recording of individual expansions is not ported yet:
 * doExtractExpansions() throws an UnreachableException as soon as it
 * finds a transclusion or extension with a non-empty data-parsoid src,
 * so the three lists are empty whenever this method returns.
 *
 * @param Env $env
 * @param Element $body Element whose children are scanned. A body
 *   without children yields the empty structure.
 * @return array
 */
public static function extractExpansions( Env $env, Element $body ): array {
	$expansions = [
		'transclusions' => [],
		'extensions' => [],
		'media' => []
	];
	// Kick off the extraction. firstChild is null for an empty body, in
	// which case there is nothing to walk.
	$firstChild = $body->firstChild;
	if ( $firstChild !== null ) {
		self::doExtractExpansions( $env, $expansions, $firstChild );
	}
	return $expansions;
}
911 | |
/**
 * Fetches output of encapsulations that return HTML from the legacy parser.
 *
 * The source is handed to the environment's data access object
 * (parseWikitext) together with the page config and metadata; the
 * result has its p-wrapper stripped and is parsed into a fragment of
 * the top-level document.
 *
 * @param Env $env
 * @param string $source Source text to parse.
 * @return ?DocumentFragment Null when parseWikitext returned an empty string.
 */
public static function parseToHTML( Env $env, string $source ): ?DocumentFragment {
	$html = $env->getDataAccess()->parseWikitext(
		$env->getPageConfig(), $env->getMetadata(), $source
	);
	if ( $html === '' ) {
		return null;
	}

	return DOMUtils::parseHTMLToFragment(
		$env->getTopLevelDoc(),
		DOMUtils::stripPWrapper( $html )
	);
}
923 | } |