Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
83.43% |
438 / 525 |
|
20.00% |
2 / 10 |
CRAP | |
0.00% |
0 / 1 |
DOMPostProcessor | |
83.43% |
438 / 525 |
|
20.00% |
2 / 10 |
84.23 | |
0.00% |
0 / 1 |
__construct | |
92.86% |
26 / 28 |
|
0.00% |
0 / 1 |
1.00 | |||
registerProcessors | |
73.33% |
22 / 30 |
|
0.00% |
0 / 1 |
9.21 | |||
getDefaultProcessors | |
90.88% |
259 / 285 |
|
0.00% |
0 / 1 |
10.08 | |||
setSourceOffsets | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
resetState | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
updateBodyClasslist | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
1 | |||
addMetaData | |
89.58% |
86 / 96 |
|
0.00% |
0 / 1 |
16.29 | |||
doPostProcess | |
50.00% |
33 / 66 |
|
0.00% |
0 / 1 |
96.00 | |||
process | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
processChunkily | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html; |
5 | |
6 | use Closure; |
7 | use DateTime; |
8 | use Generator; |
9 | use Wikimedia\Assert\Assert; |
10 | use Wikimedia\Parsoid\Config\Env; |
11 | use Wikimedia\Parsoid\DOM\Document; |
12 | use Wikimedia\Parsoid\DOM\Element; |
13 | use Wikimedia\Parsoid\DOM\Node; |
14 | use Wikimedia\Parsoid\Ext\DOMProcessor as ExtDOMProcessor; |
15 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
16 | use Wikimedia\Parsoid\Tokens\SourceRange; |
17 | use Wikimedia\Parsoid\Utils\ContentUtils; |
18 | use Wikimedia\Parsoid\Utils\DOMCompat; |
19 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
20 | use Wikimedia\Parsoid\Utils\DOMUtils; |
21 | use Wikimedia\Parsoid\Utils\PHPUtils; |
22 | use Wikimedia\Parsoid\Utils\Utils; |
23 | use Wikimedia\Parsoid\Utils\WTUtils; |
24 | use Wikimedia\Parsoid\Wt2Html\PP\Handlers\CleanUp; |
25 | use Wikimedia\Parsoid\Wt2Html\PP\Handlers\DedupeStyles; |
26 | use Wikimedia\Parsoid\Wt2Html\PP\Handlers\DisplaySpace; |
27 | use Wikimedia\Parsoid\Wt2Html\PP\Handlers\HandleLinkNeighbours; |
28 | use Wikimedia\Parsoid\Wt2Html\PP\Handlers\Headings; |
29 | use Wikimedia\Parsoid\Wt2Html\PP\Handlers\LiFixups; |
30 | use Wikimedia\Parsoid\Wt2Html\PP\Handlers\TableFixups; |
31 | use Wikimedia\Parsoid\Wt2Html\PP\Handlers\UnpackDOMFragments; |
32 | use Wikimedia\Parsoid\Wt2Html\PP\Processors\AddLinkAttributes; |
33 | use Wikimedia\Parsoid\Wt2Html\PP\Processors\AddMediaInfo; |
34 | use Wikimedia\Parsoid\Wt2Html\PP\Processors\AddRedLinks; |
35 | use Wikimedia\Parsoid\Wt2Html\PP\Processors\ComputeDSR; |
36 | use Wikimedia\Parsoid\Wt2Html\PP\Processors\ConvertOffsets; |
37 | use Wikimedia\Parsoid\Wt2Html\PP\Processors\I18n; |
38 | use Wikimedia\Parsoid\Wt2Html\PP\Processors\LangConverter; |
39 | use Wikimedia\Parsoid\Wt2Html\PP\Processors\Linter; |
40 | use Wikimedia\Parsoid\Wt2Html\PP\Processors\MarkFosteredContent; |
41 | use Wikimedia\Parsoid\Wt2Html\PP\Processors\MigrateTemplateMarkerMetas; |
42 | use Wikimedia\Parsoid\Wt2Html\PP\Processors\MigrateTrailingNLs; |
43 | use Wikimedia\Parsoid\Wt2Html\PP\Processors\Normalize; |
44 | use Wikimedia\Parsoid\Wt2Html\PP\Processors\ProcessTreeBuilderFixups; |
45 | use Wikimedia\Parsoid\Wt2Html\PP\Processors\PWrap; |
46 | use Wikimedia\Parsoid\Wt2Html\PP\Processors\WrapAnnotations; |
47 | use Wikimedia\Parsoid\Wt2Html\PP\Processors\WrapSections; |
48 | use Wikimedia\Parsoid\Wt2Html\PP\Processors\WrapTemplates; |
49 | |
50 | /** |
51 | * Perform post-processing steps on an already-built HTML DOM. |
52 | */ |
53 | class DOMPostProcessor extends PipelineStage { |
/**
 * Options handed to the constructor. 'inTemplate' is read when building the
 * default processor list and 'sourceOffsets' is written by setSourceOffsets().
 * @var array
 */
private $options;

/**
 * State shared (by reference) with the heading-id deduplication handler;
 * cleared by resetState().
 */
private array $seenIds = [];

/**
 * Processor specs registered via registerProcessors(); each entry carries a
 * callable under the 'proc' key.
 */
private array $processors = [];

/** @var ParsoidExtensionAPI Provides post-processing support to extensions */
private $extApi;

/**
 * Map from MediaWiki metadata names to the RDFa attributes emitted for them
 * in the document <head>.
 * @var array
 */
private $metadataMap;

/**
 * Text emitted as an HTML comment at the end of <body> when profiling is on.
 * @var string
 */
private $timeProfile = '';
68 | |
/**
 * @param Env $env
 * @param array $options Stage options; stored as-is on the instance
 * @param string $stageId Accepted for signature compatibility; not used here
 * @param ?PipelineStage $prevStage Forwarded to the parent constructor
 */
public function __construct(
	Env $env, array $options = [], string $stageId = "",
	?PipelineStage $prevStage = null
) {
	parent::__construct( $env, $prevStage );

	$this->options = $options;
	$this->extApi = new ParsoidExtensionAPI( $env );

	// Receives the full revision-property array and renders its
	// 'rev_timestamp' entry (TS_MW, i.e. "YmdHis") in ISO 8601 form.
	// DateTime::ISO8601 is not actually ISO 8601, hence the explicit format.
	$formatRevTimestamp = static function ( $m ) {
		return DateTime::createFromFormat( 'YmdHis', $m['rev_timestamp'] )
			->format( 'Y-m-d\TH:i:s.000\Z' );
	};

	// Map from mediawiki metadata names to RDFa property names.
	//
	// DO NOT ADD rev_user, rev_userid, and rev_comment (See T125266)
	//
	// 'rev_revid' is deliberately absent: it sets the overall subject of the
	// document, so no dedicated <meta> or <link> element is needed for it.
	$this->metadataMap = [
		'ns' => [
			'property' => 'mw:pageNamespace',
			'content' => '%d',
		],
		'id' => [
			'property' => 'mw:pageId',
			'content' => '%d',
		],
		'rev_parentid' => [
			'rel' => 'dc:replaces',
			'resource' => 'mwr:revision/%d',
		],
		'rev_timestamp' => [
			'property' => 'dc:modified',
			'content' => $formatRevTimestamp,
		],
		'rev_sha1' => [
			'property' => 'mw:revisionSHA1',
			'content' => '%s',
		],
	];
}
113 | |
/**
 * Turn processor specs into runnable entries and append them to
 * $this->processors.
 *
 * Every spec gains a 'proc' callable taking ( Node, array $options,
 * bool $atTopLevel ). Missing 'name' / 'shortcut' keys are filled in from
 * the processor class name / the name respectively.
 *
 * @param ?array $processors Specs to register; when null or empty the list
 *   from getDefaultProcessors() is used instead.
 */
public function registerProcessors( ?array $processors ): void {
	$specs = $processors ?: $this->getDefaultProcessors();

	foreach ( $specs as $spec ) {
		if ( empty( $spec['name'] ) ) {
			$spec['name'] = Utils::stripNamespace( $spec['Processor'] );
		}
		if ( empty( $spec['shortcut'] ) ) {
			$spec['shortcut'] = $spec['name'];
		}

		if ( !empty( $spec['isTraverser'] ) ) {
			// DOM traverser built from a list of per-node-name handlers
			$traverser = new DOMPPTraverser(
				$spec['tplInfo'] ?? false,
				$spec['applyToAttributeEmbeddedHTML'] ?? false
			);
			foreach ( $spec['handlers'] as $handler ) {
				$traverser->addHandler( $handler['nodeName'], $handler['action'] );
			}
			$spec['proc'] = fn ( Node $workNode, array $options, bool $atTopLevel ) =>
				$traverser->run( $this->env, $workNode, $options, $atTopLevel );
		} elseif ( empty( $spec['isExtPP'] ) ) {
			// Internal processor w/ ::run() method, class name given
			$className = $spec['Processor'];
			$internalProc = new $className();
			$spec['proc'] = fn ( Node $workNode, array $options, bool $atTopLevel ) =>
				$internalProc->run( $this->env, $workNode, $options, $atTopLevel );
		} else {
			// Extension post processor, object factory spec given
			$factorySpec = $spec['Processor'];
			$extProc = $this->env->getSiteConfig()->getObjectFactory()->createObject(
				$factorySpec,
				[
					'allowClassName' => true,
					'assertClass' => ExtDOMProcessor::class,
				]
			);
			// Extension processors are not told whether they run at top level
			$spec['proc'] = fn ( Node $workNode, array $options, bool $atTopLevel ) =>
				$extProc->wtPostprocess( $this->extApi, $workNode, $options );
		}

		$this->processors[] = $spec;
	}
}
156 | |
/**
 * Build the default, ordered list of DOM post-processing passes.
 *
 * Each entry is a spec array understood by registerProcessors(). Keys used
 * here are:
 *  - 'Processor': class name (or, for extension passes, an object factory spec)
 *  - 'name' / 'shortcut': identifiers for the pass
 *  - 'skipNested': flag carried on every spec
 *  - 'omit': flag set on passes that depend on $options['inTemplate'] or linting
 *  - 'isTraverser' + 'handlers': the pass is a DOM traversal made of
 *    [ 'nodeName' => ..., 'action' => callable ] handlers
 *  - 'tplInfo', 'applyToAttributeEmbeddedHTML': passed to the traverser
 *  - 'isExtPP': the pass is an extension DOM post processor
 *  - 'withAnnotations': set on the annotation-related passes
 *
 * @return array[] List of processor specs, in execution order
 */
public function getDefaultProcessors(): array {
	$env = $this->env;
	$options = $this->options;
	// The constructor accepts `$options = []`, so 'inTemplate' may be absent;
	// read it once with a default instead of indexing the array unguarded.
	$inTemplate = $options['inTemplate'] ?? false;
	$seenIds = &$this->seenIds;
	$usedIdIndex = [];
	$abouts = [];

	$tableFixer = new TableFixups( $env );

	/* ---------------------------------------------------------------------------
	 * FIXME:
	 * 1. PipelineFactory caches pipelines per env
	 * 2. PipelineFactory.parse uses a default cache key
	 * 3. ParserTests uses a shared/global env object for all tests.
	 * 4. ParserTests also uses PipelineFactory.parse (via env.getContentHandler())
	 *    => the pipeline constructed for the first test that runs wt2html
	 *       is used for all subsequent wt2html tests
	 * 5. If we are selectively turning on/off options on a per-test basis
	 *    in parser tests, those options won't work if those options are
	 *    also used to configure pipeline construction (including which DOM passes
	 *    are enabled).
	 *
	 *    Ex: if (env.wrapSections) { addPP('wrapSections', wrapSections); }
	 *
	 *    This won't do what you expect it to do. This is primarily a
	 *    parser tests script issue -- but given the abstraction layers that
	 *    are on top of the parser pipeline construction, fixing that is
	 *    not straightforward right now. So, this note is a warning to future
	 *    developers to pay attention to how they construct pipelines.
	 * --------------------------------------------------------------------------- */

	$processors = [
		// Common post processing
		[
			'Processor' => MarkFosteredContent::class,
			'shortcut' => 'fostered',
			'skipNested' => false
		],
		[
			'Processor' => ProcessTreeBuilderFixups::class,
			'shortcut' => 'process-fixups',
			'skipNested' => false
		],
		[
			'Processor' => Normalize::class,
			'skipNested' => false
		],
		[
			'Processor' => PWrap::class,
			'shortcut' => 'pwrap',
			'skipNested' => true
			// Don't need to process HTML in embedded attributes
		],
		// This is run at all levels for now - gallery extension's "packed" mode
		// would otherwise need a post-processing pass to scale media after it
		// has been fetched. That introduces an ordering dependency that may
		// or may not complicate things.
		[
			'Processor' => AddMediaInfo::class,
			'shortcut' => 'media',
			'skipNested' => false
		],
		// Run this after:
		// * ProcessTreeBuilderFixups because this pass needs
		//   autoInsertedStart / autoInsertedEnd information.
		// * PWrap because PWrap can add additional opportunities
		//   for meta migration which we will miss if we run this
		//   before p-wrapping.
		//   FIXME: But, pwrapping doesn't run on nested pipelines!
		//
		// We could potentially move this just before WrapTemplates
		// by seeing this as a preprocessing pass for that. But, we
		// will have to update the pass to update DSR properties
		// where required.
		//
		// In summary, this can at most be moved before AddMediaInfo or
		// after MigrateTrailingNLs without needing any other changes.
		[
			'Processor' => MigrateTemplateMarkerMetas::class,
			'shortcut' => 'migrate-metas',
			'omit' => $inTemplate,
			'skipNested' => false
		],
		[
			'Processor' => MigrateTrailingNLs::class,
			'shortcut' => 'migrate-nls',
			'skipNested' => false
		],
		// - DSR computation and template wrapping are only relevant for top-level
		//   content and hence are omitted. But, they cannot be skipped for
		//   top-level content even if they are part of nested level pipelines,
		//   because such content might be embedded in attributes and they may
		//   need to be processed independently.
		[
			'Processor' => ComputeDSR::class,
			'shortcut' => 'dsr',
			'omit' => $inTemplate,
			'skipNested' => false
		],
		[
			'Processor' => WrapTemplates::class,
			'shortcut' => 'tplwrap',
			'omit' => $inTemplate,
			'skipNested' => false
		],
		[
			'name' => 'AddAnnotationIds',
			'shortcut' => 'ann-ids',
			'skipNested' => false,
			'isTraverser' => true,
			'handlers' => [
				[
					'nodeName' => 'meta',
					'action' => static function ( $node ) use ( &$abouts, $env ) {
						// TODO: $abouts can be part of DTState
						$isStart = false;
						// $isStart gets modified (not read) by extractAnnotationType
						$t = WTUtils::extractAnnotationType( $node, $isStart );
						if ( $t === null ) {
							// Not an annotation meta; nothing to do
							return true;
						}

						// $abouts holds, per annotation type, a stack of ids of
						// the start tags seen so far that are still unmatched.
						$about = null;
						if ( $isStart ) {
							// The 'mwa' prefix is specific to annotations;
							// if other DOM ranges are to use this mechanism, another prefix
							// should be used.
							$about = $env->newAnnotationId();
							$abouts[$t][] = $about;
						} elseif ( array_key_exists( $t, $abouts ) ) {
							// Yields null when the stack for this type is empty
							$about = array_pop( $abouts[$t] );
						}

						if ( $about === null ) {
							// this doesn't have a start tag, so we don't handle it when creating
							// annotation ranges, and we replace it with a string
							$textAnn = $node->ownerDocument->createTextNode( '</' . $t . '>' );
							$parentNode = $node->parentNode;
							$parentNode->insertBefore( $textAnn, $node );
							DOMCompat::remove( $node );
							return $textAnn;
						}

						DOMDataUtils::getDataMw( $node )->rangeId = $about;
						return true;
					}
				]
			],
			'withAnnotations' => true
		],
		[
			'Processor' => WrapAnnotations::class,
			'shortcut' => 'annwrap',
			'skipNested' => false,
			'withAnnotations' => true
		],
		// 1. Link prefixes and suffixes
		// 2. Unpack DOM fragments
		// Always run this on nested pipelines so that
		// when we get to the top level pipeline, all
		// embedded fragments have been expanded!
		[
			'name' => 'HandleLinkNeighbours,UnpackDOMFragments',
			'shortcut' => 'dom-unpack',
			'skipNested' => false,
			'isTraverser' => true,
			'handlers' => [
				[
					'nodeName' => 'a',
					'action' => static fn ( $node ) => HandleLinkNeighbours::handler( $node, $env )
				],
				[
					'nodeName' => null,
					'action' => static fn ( $node ) => UnpackDOMFragments::handler( $node, $env )
				]
			]
		]
	];

	/**
	 * FIXME: There are two potential ordering problems here.
	 *
	 * 1. unpackDOMFragment should always run immediately
	 *    before these extensionPostProcessors, which we do currently.
	 *    This ensures packed content get processed correctly by extensions
	 *    before additional transformations are run on the DOM.
	 *
	 * This ordering issue is handled through documentation.
	 *
	 * 2. This has existed all along (in the PHP parser as well as Parsoid
	 *    which is probably how the ref-in-ref hack works - because of how
	 *    parser functions and extension tags are processed, #tag:ref doesn't
	 *    see a nested ref anymore) and this patch only exposes that problem
	 *    more clearly with the unpackOutput property.
	 *
	 * * Consider the set of extensions that
	 *   (a) process wikitext
	 *   (b) provide an extensionPostProcessor
	 *   (c) run the extensionPostProcessor only on the top-level
	 *   As of today, there is exactly one extension (Cite) that has all
	 *   these properties, so the problem below is a speculative problem
	 *   for today. But, this could potentially be a problem in the future.
	 *
	 * * Let us say there are at least two of them, E1 and E2 that
	 *   support extension tags <e1> and <e2> respectively.
	 *
	 * * Let us say in an instance of <e1> on the page, <e2> is present
	 *   and in another instance of <e2> on the page, <e1> is present.
	 *
	 * * In what order should E1's and E2's extensionPostProcessors be
	 *   run on the top-level? Depending on what these handlers do, you
	 *   could get potentially different results. You can see this quite
	 *   starkly with the unpackOutput flag.
	 *
	 * * The ideal solution to this problem is to require that every extension's
	 *   extensionPostProcessor be idempotent which lets us run these
	 *   post processors repeatedly till the DOM stabilizes. But, this
	 *   still doesn't necessarily guarantee that ordering doesn't matter.
	 *   It just guarantees that with the unpackOutput flag set to false
	 *   multiple extensions, all sealed fragments get fully processed.
	 *   So, we still need to worry about that problem.
	 *
	 *   But, idempotence *could* potentially be a sufficient property in most cases.
	 *   To see this, consider that there is a Footnotes extension which is similar
	 *   to the Cite extension in that they both extract inline content in the
	 *   page source to a separate section of output and leave behind pointers to
	 *   the global section in the output DOM. Given this, the Cite and Footnote
	 *   extension post processors would essentially walk the dom and
	 *   move any existing inline content into that global section till it is
	 *   done. So, even if a <footnote> has a <ref> and a <ref> has a <footnote>,
	 *   we ultimately end up with all footnote content in the footnotes section
	 *   and all ref content in the references section and the DOM stabilizes.
	 *   Ordering is irrelevant here.
	 *
	 *   So, perhaps one way of catching these problems would be in code review
	 *   by analyzing what the DOM postprocessor does and see if it introduces
	 *   potential ordering issues.
	 */
	foreach ( $env->getSiteConfig()->getExtDOMProcessors() as $extName => $domProcs ) {
		foreach ( $domProcs as $i => $domProcSpec ) {
			$processors[] = [
				'isExtPP' => true, // This is an extension DOM post processor
				'name' => "pp:$extName:$i",
				'Processor' => $domProcSpec,
				// This should be documented in the spec that an extension's
				// wtDOMProcess handler is run once on the top level document.
				'skipNested' => true
			];
		}
	}

	$processors = array_merge( $processors, [
		[
			'name' => 'MigrateTrailingCategories,TableFixups,DedupeStyles',
			'shortcut' => 'fixups',
			'skipNested' => true,
			'isTraverser' => true,
			'applyToAttributeEmbeddedHTML' => true,
			'tplInfo' => true,
			'handlers' => [
				// 1. Move trailing categories in <li>s out of the list
				[
					'nodeName' => 'li',
					'action' => static fn ( $node, $state ) => LiFixups::migrateTrailingCategories( $node, $state )
				],
				[
					'nodeName' => 'dt',
					'action' => static fn ( $node, $state ) => LiFixups::migrateTrailingCategories( $node, $state )
				],
				[
					'nodeName' => 'dd',
					'action' => static fn ( $node, $state ) => LiFixups::migrateTrailingCategories( $node, $state )
				],
				// 2. Fix up issues from templated table cells and table cell attributes
				// (these handlers are non-static because they read $this->frame)
				[
					'nodeName' => 'td',
					'action' => fn ( $node ) => $tableFixer->stripDoubleTDs( $node, $this->frame )
				],
				[
					'nodeName' => 'td',
					'action' => fn ( $node ) => $tableFixer->handleTableCellTemplates( $node, $this->frame )
				],
				[
					'nodeName' => 'th',
					'action' => fn ( $node ) => $tableFixer->handleTableCellTemplates( $node, $this->frame )
				],
				// 3. Deduplicate template styles
				// (should run after dom-fragment expansion + after extension post-processors)
				[
					'nodeName' => 'style',
					'action' => static fn ( $node, $dtState ) => DedupeStyles::dedupe( $node, $env, $dtState )
				]
			]
		],
		[
			'Processor' => Linter::class,
			'omit' => !$env->linting(),
			'skipNested' => true
			// FIXME: T214994: Have to process HTML in embedded attributes?
		],
		// Strip marker metas -- removes left over marker metas (ex: metas
		// nested in expanded tpl/extension output).
		[
			'name' => 'CleanUp-stripMarkerMetas',
			'shortcut' => 'strip-metas',
			'skipNested' => true,
			'isTraverser' => true,
			'applyToAttributeEmbeddedHTML' => true,
			'handlers' => [
				[
					'nodeName' => 'meta',
					'action' => static fn ( $node ) => CleanUp::stripMarkerMetas( $node ),
				]
			]
		],
		// Language conversion and Red link marking are done here
		// *before* we cleanup and save data-parsoid because they
		// are also used in pb2pb/html2html passes, and we want to
		// keep their input/output formats consistent.
		[
			'Processor' => LangConverter::class,
			'shortcut' => 'lang-converter',
			'skipNested' => true
			// FIXME: T214994: Have to process HTML in embedded attributes?
		],
		[
			'Processor' => AddRedLinks::class,
			'shortcut' => 'redlinks',
			'skipNested' => true,
			// FIXME: T214994: Have to process HTML in embedded attributes?
		],
		[
			'name' => 'DisplaySpace',
			'shortcut' => 'displayspace',
			'skipNested' => true,
			// Don't need to process HTML in embedded attributes
			'applyToAttributeEmbeddedHTML' => false,
			'isTraverser' => true,
			'handlers' => [
				[
					'nodeName' => null,
					'action' => static fn ( $node ) => DisplaySpace::leftHandler( $node )
				],
				[
					'nodeName' => null,
					'action' => static fn ( $node ) => DisplaySpace::rightHandler( $node )
				],
			]
		],
		[
			'Processor' => AddLinkAttributes::class,
			'shortcut' => 'linkclasses',
			// FIXME: T214994: Might be beneficial to process HTML in embedded attributes
			// since some (not yet known) use cases might benefit from this information
			// on these hidden links.
			'skipNested' => true
		],
		// Benefits from running after determining which media are redlinks
		[
			'name' => 'Headings-genAnchors',
			'shortcut' => 'heading-ids',
			'skipNested' => true,
			'isTraverser' => true,
			// No need to generate heading ids for HTML embedded in attributes
			'applyToAttributeEmbeddedHTML' => false,
			'handlers' => [
				[
					'nodeName' => null,
					'action' => static fn ( $node ) => Headings::genAnchors( $node, $env )
				],
				[
					'nodeName' => null,
					'action' => static function ( $node ) use ( &$seenIds ) {
						// TODO: $seenIds can be part of DTState
						return Headings::dedupeHeadingIds( $seenIds, $node );
					}
				]
			]
		],
		// Add <section> wrappers around sections
		[
			'Processor' => WrapSections::class,
			'shortcut' => 'sections',
			'skipNested' => true
			// Don't need to process HTML in embedded attributes
		],
		[
			'Processor' => ConvertOffsets::class,
			'shortcut' => 'convertoffsets',
			'skipNested' => true,
			// FIXME: T214994: Have to process HTML in embedded attributes?
		],
		[
			'Processor' => I18n::class,
			'shortcut' => 'i18n',
			// FIXME(T214994): This should probably be `true`, since we
			// want this to be another html2html type pass, but then our
			// processor would need to handle nested content. Redlinks,
			// displayspace, and others are ignoring that for now though,
			// so let's wait until there's a more general mechanism.
			'skipNested' => false,
		],
		[
			'name' => 'CleanUp-handleEmptyElts,CleanUp-cleanup',
			'shortcut' => 'cleanup',
			'skipNested' => true,
			'isTraverser' => true,
			'applyToAttributeEmbeddedHTML' => true,
			'tplInfo' => true,
			'handlers' => [
				// Strip empty elements from template content
				[
					'nodeName' => null,
					'action' => static fn ( $node, $state ) => CleanUp::handleEmptyElements( $node, $state )
				],
				// Additional cleanup
				[
					'nodeName' => null,
					'action' => static fn ( $node, $state ) => CleanUp::finalCleanup( $node, $state )
				]
			]
		],
		[
			'name' => 'CleanUp-saveDataParsoid',
			'shortcut' => 'saveDP',
			'skipNested' => true,
			'isTraverser' => true,
			// FIXME This means the data-* from embedded HTML fragments won't end up
			// in the pagebundle. But, if we try to call this on those fragments,
			// we get multiple calls to store embedded docs. So, we may need to
			// write a custom traverser if we want these embedded data* objects
			// in the pagebundle (this is not a regression since they weren't part
			// of the pagebundle all this while anyway.)
			'applyToAttributeEmbeddedHTML' => false,
			'tplInfo' => true,
			'handlers' => [
				// Save data.parsoid into data-parsoid html attribute.
				// Make this its own thing so that any changes to the DOM
				// don't affect other handlers that run alongside it.
				[
					'nodeName' => null,
					'action' => static function ( $node, $state ) use ( $env, &$usedIdIndex ) {
						// TODO: $usedIdIndex can be part of DTState
						if ( $state->atTopLevel && DOMUtils::isBody( $node ) ) {
							// Recomputed whenever the top-level <body> is visited
							$usedIdIndex = DOMDataUtils::usedIdIndex( $node );
						}
						return CleanUp::saveDataParsoid( $usedIdIndex, $node, $env, $state );
					}
				]
			]
		],
	] );

	return $processors;
}
613 | |
/**
 * Record the given source range under the 'sourceOffsets' key of this
 * stage's options.
 *
 * @inheritDoc
 */
public function setSourceOffsets( SourceRange $so ): void {
	$this->options['sourceOffsets'] = $so;
}
620 | |
/**
 * Reset the parent stage's state, then forget all heading ids recorded in
 * $this->seenIds.
 *
 * @inheritDoc
 */
public function resetState( array $options ): void {
	parent::resetState( $options );
	$this->seenIds = [];
}
628 | |
/**
 * Add the direction-dependent and fixed CSS classes to <body> and set its
 * 'dir' attribute from the page language direction.
 *
 * @param Element $body
 * @param Env $env
 */
private function updateBodyClasslist( Element $body, Env $env ): void {
	$dir = $env->getPageConfig()->getPageLanguageDir();
	$classList = DOMCompat::getClassList( $body );

	// Direction-dependent classes first, then the attribute itself
	foreach ( [ 'mw-content-' . $dir, 'sitedir-' . $dir, $dir ] as $dirClass ) {
		$classList->add( $dirClass );
	}
	$body->setAttribute( 'dir', $dir );

	$fixedClasses = [
		// Designated successor for #bodyContent in core skins.
		'mw-body-content',
		// Adds the desired layout styling from Vector.
		'parsoid-body',
		// Some MediaWiki:Common.css seem to target this selector.
		'mediawiki',
		// Templates target this class as part of the TemplateStyles RFC.
		// FIXME: This isn't expected to be found on the same element as the
		// body class above, since some css targets it as a descendant.
		// In visual diff'ing, we migrate the body contents to a wrapper div
		// with this class to reduce visual differences. Consider getting
		// rid of it.
		'mw-parser-output',
	];
	foreach ( $fixedClasses as $fixedClass ) {
		$classList->add( $fixedClass );
	}
}
654 | |
/**
 * Populate the document <head> (charset, RDFa prefixes, page and revision
 * metadata, version markers, canonical link, base href, language headers)
 * and decorate <body> with language, direction and CSS classes.
 *
 * FIXME: consider moving to DOMUtils or Env.
 *
 * @param Env $env
 * @param Document $document
 */
public function addMetaData( Env $env, Document $document ): void {
	$title = $env->getContextTitle();

	// The charset goes into <head> first; this call also creates the
	// <head> element if it was missing.
	DOMUtils::appendToHead( $document, 'meta', [ 'charset' => 'utf-8' ] );

	// dc: and mw: RDFa prefixes live on the root element
	$document->documentElement->setAttribute(
		'prefix',
		implode( ' ', [
			'dc: http://purl.org/dc/terms/',
			'mw: http://mediawiki.org/rdf/'
		] )
	);

	// (From wfParseUrl in core:)
	// Protocol-relative URLs are handled really badly by parse_url().
	// It's so bad that the easiest way to handle them is to just prepend
	// 'https:' and strip the protocol out later.
	$baseURI = $env->getSiteConfig()->baseURI();
	if ( substr( $baseURI, 0, 2 ) === '//' ) {
		$baseURI = "https:$baseURI";
	}
	// Prepend 'https://' when the base URI still carries no scheme
	$urlParts = parse_url( $baseURI );
	$schemePrefix = empty( $urlParts['scheme'] ) ? 'https://' : '';
	$mwrPrefix = $schemePrefix . $baseURI . 'Special:Redirect/';

	// The mwr: RDFa prefix lives on <head>
	DOMCompat::getHead( $document )->setAttribute( 'prefix', 'mwr: ' . $mwrPrefix );

	// Add page / revision metadata to the <head>
	// PORT-FIXME: We will need to do some refactoring to eliminate
	// this hardcoding. Probably even merge this into metadataMap
	$pageConfig = $env->getPageConfig();
	$revProps = [
		'id' => $pageConfig->getPageId(),
		'ns' => $title->getNamespace(),
		'rev_parentid' => $pageConfig->getParentRevisionId(),
		'rev_revid' => $pageConfig->getRevisionId(),
		'rev_sha1' => $pageConfig->getRevisionSha1(),
		'rev_timestamp' => $pageConfig->getRevisionTimestamp()
	];
	foreach ( $revProps as $propName => $propValue ) {
		// Only properties that have a value and a metadataMap entry
		// produce a <meta> or <link> tag.
		$hasValue = $propValue !== null && $propValue !== '';
		if ( !$hasValue || !isset( $this->metadataMap[$propName] ) ) {
			continue;
		}

		/** FIXME: The JS side has a bunch of other checks here */

		$attrs = [];
		foreach ( $this->metadataMap[$propName] as $attrName => $template ) {
			if ( $template instanceof Closure ) {
				// Closures receive the whole property array
				$attrs[$attrName] = $template( $revProps );
			} elseif ( strpos( $template, '%' ) !== false ) {
				// sprintf-style template
				// @phan-suppress-next-line PhanPluginPrintfVariableFormatString
				$attrs[$attrName] = sprintf( $template, $propValue );
			} else {
				// Plain string, used directly
				$attrs[$attrName] = $template;
			}
		}

		// <link> is used if there's a resource or href attribute.
		$isLink = isset( $attrs['resource'] ) || isset( $attrs['href'] );
		DOMUtils::appendToHead( $document, $isLink ? 'link' : 'meta', $attrs );
	}

	// The revision id, when present, becomes the subject of the document
	if ( $revProps['rev_revid'] ) {
		$document->documentElement->setAttribute(
			'about', $mwrPrefix . 'revision/' . $revProps['rev_revid']
		);
	}

	// Normalize before comparison
	if ( $title->isSameLinkAs( $env->getSiteConfig()->mainPageLinkTarget() ) ) {
		DOMUtils::appendToHead( $document, 'meta', [
			'property' => 'isMainPage',
			'content' => 'true' /* HTML attribute values should be strings */
		] );
	}

	// Set the parsoid content-type strings
	// FIXME: Should we be using http-equiv for this?
	DOMUtils::appendToHead( $document, 'meta', [
		'property' => 'mw:htmlVersion',
		'content' => $env->getOutputContentVersion()
	] );
	// Temporary backward compatibility for clients
	// This could be skipped if we support a version downgrade path
	// with a major version bump.
	DOMUtils::appendToHead( $document, 'meta', [
		'property' => 'mw:html:version',
		'content' => $env->getOutputContentVersion()
	] );

	// URI-encode each '/'-separated component of the prefixed DB key
	$encodedTitleParts = array_map(
		static fn ( $comp ) => PHPUtils::encodeURIComponent( $comp ),
		explode( '/', $title->getPrefixedDBKey() )
	);
	DOMUtils::appendToHead( $document, 'link', [
		'rel' => 'dc:isVersionOf',
		'href' => $env->getSiteConfig()->baseURI() . implode( '/', $encodedTitleParts )
	] );

	// Add base href pointing to the wiki root
	DOMUtils::appendToHead( $document, 'base', [
		'href' => $env->getSiteConfig()->baseURI()
	] );

	// Stick data attributes in the head
	if ( $env->pageBundle ) {
		DOMDataUtils::injectPageBundle( $document, DOMDataUtils::getPageBundle( $document ) );
	}

	// PageConfig guarantees language will always be non-null.
	$lang = $env->getPageConfig()->getPageLanguageBcp47();
	$body = DOMCompat::getBody( $document );
	$body->setAttribute( 'lang', $lang->toBcp47Code() );
	$this->updateBodyClasslist( $body, $env );
	$env->getSiteConfig()->exportMetadataToHeadBcp47(
		$document, $env->getMetadata(),
		$title->getPrefixedText(), $lang
	);

	// Indicate whether LanguageConverter is enabled, so that downstream
	// caches can split on variant (if necessary)
	DOMUtils::appendToHead( $document, 'meta', [
		'http-equiv' => 'content-language',
		// Note that this is "wrong": we should be returning
		// $env->htmlContentLanguageBcp47()->toBcp47Code() directly
		// but for back-compat we'll return the "old" mediawiki-internal
		// code for now
		'content' => Utils::bcp47ToMwCode( # T323052: remove this call
			$env->htmlContentLanguageBcp47()->toBcp47Code()
		),
	] );
	DOMUtils::appendToHead( $document, 'meta', [
		'http-equiv' => 'vary',
		'content' => $env->htmlVary()
	] );

	if ( $env->profiling() ) {
		// NOTE(review): the result of this call was never used by the
		// original code; the call is kept in case it has side effects.
		$env->getCurrentProfile();
		$doc = $body->ownerDocument;
		// Append the time profile as a comment, surrounded by newlines
		$body->appendChild( $doc->createTextNode( "\n" ) );
		$body->appendChild( $doc->createComment( $this->timeProfile ) );
		$body->appendChild( $doc->createTextNode( "\n" ) );
	}
}
821 | |
/**
 * Run the registered DOM post-processing passes over $node, in
 * registration order, and (for the top-level document only) add the
 * <head> metadata afterwards.
 *
 * A pass is skipped when:
 *  - its 'omit' entry is non-empty,
 *  - its 'skipNested' entry is truthy and this is not the top-level pipeline,
 *  - its 'withAnnotations' entry is non-empty and the env reports no annotations.
 *
 * When dump flags are set on the env, the DOM is dumped before/after each
 * pass as requested by the 'dom:pre-*' / 'dom:post-*' flags. When profiling
 * is enabled, the elapsed time of every pass is recorded on the current
 * profile, and (top level only) appended to $this->timeProfile.
 *
 * @param Node $node Root of the DOM (sub)tree to post-process; mutated in place.
 */
public function doPostProcess( Node $node ): void {
	$env = $this->env;

	$hasDumpFlags = $env->hasDumpFlags();

	if ( $hasDumpFlags && $env->hasDumpFlag( 'dom:post-builder' ) ) {
		$opts = [];
		$env->writeDump( ContentUtils::dumpDOM( $node, 'DOM: after tree builder', $opts ) );
	}

	$prefix = null;
	$resourceCategory = null;

	$profile = null;
	if ( $env->profiling() ) {
		$profile = $env->getCurrentProfile();
		if ( $this->atTopLevel ) {
			// Per-pass timing lines are only accumulated for the top-level
			// document; start a fresh report with a separator line.
			$this->timeProfile = str_repeat( "-", 85 ) . "\n";
			$prefix = 'TOP';
			$resourceCategory = 'DOMPasses:TOP';
		} else {
			$prefix = '---';
			$resourceCategory = 'DOMPasses:NESTED';
		}
	}

	foreach ( $this->processors as $pp ) {
		// - Nested pipelines are used for both top-level and non-top-level content.
		// - Omit is currently set only for templated content pipelines.
		// - But, skipNested can be set for both templated content as well as
		//   top-level content.
		if ( !empty( $pp['omit'] ) ) {
			continue;
		}
		Assert::invariant( isset( $pp['skipNested'] ),
			"skipNested property missing for " . $pp['name'] . " processor." );
		if ( $pp['skipNested'] && !$this->atTopLevel ) {
			continue;
		}

		// Avoids wondering why the pass doesn't run on attributes when the
		// flag is set to true on a non-traverser pass.
		if ( $pp['applyToAttributeEmbeddedHTML'] ?? false ) {
			Assert::invariant( ( $pp['isTraverser'] ?? false ) === true,
				'applyToAttributeEmbeddedHTML can only be executed for DOM traverser passes, and ' .
				$pp['name'] . ' is not such a pass' );
		}

		// error_log("RUNNING " . ($pp['shortcut'] ?? $pp['name']));

		if ( !empty( $pp['withAnnotations'] ) && !$this->env->hasAnnotations ) {
			continue;
		}

		$ppName = null;
		$ppStart = null;

		// Trace
		if ( $profile ) {
			// Right-pad the pass name to 30 columns so the report lines up;
			// longer names are left untouched.
			$ppName = str_pad( $pp['name'], 30 );
			$ppStart = microtime( true );
		}

		$opts = null;
		if ( $hasDumpFlags ) {
			$opts = [
				'env' => $env,
				'dumpFragmentMap' => $this->atTopLevel,
				'keepTmp' => true
			];

			if ( $env->hasDumpFlag( 'dom:pre-' . $pp['shortcut'] )
				|| $env->hasDumpFlag( 'dom:pre-*' )
			) {
				$env->writeDump(
					ContentUtils::dumpDOM( $node, 'DOM: pre-' . $pp['shortcut'], $opts )
				);
			}
		}

		// Excessive to do it here always, but protects against future changes
		// to how $this->frame may be updated.
		$pp['proc']( $node, [ 'frame' => $this->frame ] + $this->options, $this->atTopLevel );

		if ( $hasDumpFlags && ( $env->hasDumpFlag( 'dom:post-' . $pp['shortcut'] )
			|| $env->hasDumpFlag( 'dom:post-*' ) )
		) {
			$env->writeDump(
				ContentUtils::dumpDOM( $node, 'DOM: post-' . $pp['shortcut'], $opts )
			);
		}

		if ( $profile ) {
			// Elapsed wall-clock time of this pass, in milliseconds.
			$ppElapsed = 1000 * ( microtime( true ) - $ppStart );
			if ( $this->atTopLevel ) {
				$this->timeProfile .= str_pad( $prefix . '; ' . $ppName, 65 ) .
					' time = ' .
					str_pad( number_format( $ppElapsed, 2 ), 10, ' ', STR_PAD_LEFT ) . "\n";
			}
			$profile->bumpTimeUse( $resourceCategory, $ppElapsed, 'DOM' );
		}
	}

	// For sub-pipeline documents, we are done.
	// For the top-level document, we generate <head> and add it.
	if ( $this->atTopLevel ) {
		self::addMetaData( $env, $node->ownerDocument );
		if ( $env->hasDumpFlag( 'wt2html:limits' ) ) {
			/*
			 * PORT-FIXME: Not yet implemented
			$env->printWt2HtmlResourceUsage( [
				'HTML Size' => strlen( DOMCompat::getOuterHTML( $document->documentElement ) )
			] );
			*/
		}
	}
}
943 | |
/**
 * @inheritDoc
 *
 * Runs doPostProcess() on $node and returns the same node.
 * $opts is not used by this stage.
 */
public function process( $node, ?array $opts = null ) {
	'@phan-var Node $node'; // @var Node $node
	$this->doPostProcess( $node );
	// @phan-suppress-next-line PhanTypeMismatchReturnSuperType
	return $node;
}
953 | |
/**
 * @inheritDoc
 *
 * Yields exactly one value: the node after post-processing.
 */
public function processChunkily( $input, ?array $options ): Generator {
	// With a previous stage, only the current (first) value of that
	// stage's generator is taken as the DOM; otherwise $input is used
	// as the node directly.
	// FIXME: Should we change the signature of the previous stage to
	// return a DOM? If we do so, a pipeline stage returns either a
	// generator or concrete output (in this case, a DOM).
	$dom = $this->prevStage
		? $this->prevStage->processChunkily( $input, $options )->current()
		: $input;

	$this->process( $dom );
	yield $dom;
}
970 | } |