Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 90 |
|
0.00% |
0 / 9 |
CRAP | |
0.00% |
0 / 1 |
ParserPipelineFactory | |
0.00% |
0 / 90 |
|
0.00% |
0 / 9 |
650 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
defaultOptions | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
procNamesToProcs | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
12 | |||
makePipeline | |
0.00% |
0 / 33 |
|
0.00% |
0 / 1 |
42 | |||
getCacheKey | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
56 | |||
parse | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
selectiveDOMUpdate | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
getPipeline | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
returnPipeline | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Parsoid\Config\Env; |
8 | use Wikimedia\Parsoid\Core\InternalException; |
9 | use Wikimedia\Parsoid\Core\SelectiveUpdateData; |
10 | use Wikimedia\Parsoid\DOM\Document; |
11 | use Wikimedia\Parsoid\Utils\PHPUtils; |
12 | use Wikimedia\Parsoid\Utils\Utils; |
13 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\AddAnnotationIds; |
14 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\AddLinkAttributes; |
15 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\CleanUp; |
16 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\DedupeStyles; |
17 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\DisplaySpace; |
18 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\HandleLinkNeighbours; |
19 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\Headings; |
20 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\LiFixups; |
21 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\TableFixups; |
22 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\UnpackDOMFragments; |
23 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\AddMediaInfo; |
24 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\AddMetaData; |
25 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\AddRedLinks; |
26 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\ComputeDSR; |
27 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\ConvertOffsets; |
28 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\LangConverter; |
29 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\Linter; |
30 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\MarkFosteredContent; |
31 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\MigrateTemplateMarkerMetas; |
32 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\MigrateTrailingNLs; |
33 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\Normalize; |
34 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\ProcessEmbeddedDocs; |
35 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\ProcessTreeBuilderFixups; |
36 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\PWrap; |
37 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\RunExtensionProcessors; |
38 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\UpdateTemplateOutput; |
39 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\WrapAnnotations; |
40 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\WrapSections; |
41 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\WrapTemplates; |
42 | use Wikimedia\Parsoid\Wt2Html\TreeBuilder\TreeBuilderStage; |
43 | use Wikimedia\Parsoid\Wt2Html\TT\AttributeExpander; |
44 | use Wikimedia\Parsoid\Wt2Html\TT\BehaviorSwitchHandler; |
45 | use Wikimedia\Parsoid\Wt2Html\TT\DOMFragmentBuilder; |
46 | use Wikimedia\Parsoid\Wt2Html\TT\ExtensionHandler; |
47 | use Wikimedia\Parsoid\Wt2Html\TT\ExternalLinkHandler; |
48 | use Wikimedia\Parsoid\Wt2Html\TT\LanguageVariantHandler; |
49 | use Wikimedia\Parsoid\Wt2Html\TT\ListHandler; |
50 | use Wikimedia\Parsoid\Wt2Html\TT\OnlyInclude; |
51 | use Wikimedia\Parsoid\Wt2Html\TT\ParagraphWrapper; |
52 | use Wikimedia\Parsoid\Wt2Html\TT\PreHandler; |
53 | use Wikimedia\Parsoid\Wt2Html\TT\QuoteTransformer; |
54 | use Wikimedia\Parsoid\Wt2Html\TT\SanitizerHandler; |
55 | use Wikimedia\Parsoid\Wt2Html\TT\TemplateHandler; |
56 | use Wikimedia\Parsoid\Wt2Html\TT\TokenStreamPatcher; |
57 | use Wikimedia\Parsoid\Wt2Html\TT\WikiLinkHandler; |
58 | |
59 | /** |
60 | * This class assembles parser pipelines from parser stages |
61 | */ |
62 | class ParserPipelineFactory { |
/**
 * Next id handed to a pipeline via setPipelineId() in getPipeline().
 * Static, so the counter is shared by every factory instance.
 */
private static int $globalPipelineId = 0;
64 | |
/**
 * Registry of DOM processors, keyed by the shortcut names that the
 * *_DOM_TRANSFORMS lists in this class refer to.
 *
 * A value is either
 *  - a processor class name, or
 *  - an array describing a handler-based pass: a 'name', a list of
 *    'handlers' (each a 'nodeName' plus an 'action' callable), and
 *    optional flags ('tplInfo', 'withAnnotations').
 *
 * NOTE(review): a null 'nodeName' presumably means "run on every node";
 * confirm against the traverser that consumes these entries.
 */
private const DOM_PROCESSOR_CONFIG = [
	// ---- Passes implemented by a single processor class ----
	'addmetadata' => AddMetaData::class,
	'annwrap' => WrapAnnotations::class,
	'convertoffsets' => ConvertOffsets::class,
	'dsr' => ComputeDSR::class,
	'embedded-docs' => ProcessEmbeddedDocs::class,
	'extpp' => RunExtensionProcessors::class,
	'fostered' => MarkFosteredContent::class,
	'linter' => Linter::class,
	'lang-converter' => LangConverter::class,
	'media' => AddMediaInfo::class,
	'migrate-metas' => MigrateTemplateMarkerMetas::class,
	'migrate-nls' => MigrateTrailingNLs::class,
	'normalize' => Normalize::class,
	'process-fixups' => ProcessTreeBuilderFixups::class,
	'pwrap' => PWrap::class,
	'redlinks' => AddRedLinks::class,
	// Don't process HTML in embedded attributes
	'sections' => WrapSections::class,
	'tplwrap' => WrapTemplates::class,
	'update-template' => UpdateTemplateOutput::class,

	// ---- Passes implemented as a set of node handlers ----
	'ann-ids' => [
		'name' => 'AddAnnotationIds',
		'handlers' => [
			[ 'nodeName' => 'meta', 'action' => [ AddAnnotationIds::class, 'handler' ] ],
		],
		'withAnnotations' => true,
	],
	'linkneighbours+dom-unpack' => [
		'name' => 'HandleLinkNeighbours,UnpackDOMFragments',
		'handlers' => [
			// Link prefixes and suffixes
			[ 'nodeName' => 'a', 'action' => [ HandleLinkNeighbours::class, 'handler' ] ],
			[ 'nodeName' => null, 'action' => [ UnpackDOMFragments::class, 'handler' ] ],
		],
	],
	'fixups' => [
		'name' => 'MigrateTrailingCategories,TableFixups',
		'tplInfo' => true,
		'handlers' => [
			// (1) Trailing categories inside list items are moved out of the list
			[ 'nodeName' => 'li', 'action' => [ LiFixups::class, 'migrateTrailingSolTransparentLinks' ] ],
			[ 'nodeName' => 'dt', 'action' => [ LiFixups::class, 'migrateTrailingSolTransparentLinks' ] ],
			[ 'nodeName' => 'dd', 'action' => [ LiFixups::class, 'migrateTrailingSolTransparentLinks' ] ],
			// (2) Repair templated table cells and table cell attributes
			[ 'nodeName' => 'td', 'action' => [ TableFixups::class, 'handleTableCellTemplates' ] ],
			[ 'nodeName' => 'th', 'action' => [ TableFixups::class, 'handleTableCellTemplates' ] ],
		],
	],
	'fixups+dedupe-styles' => [
		'name' => 'MigrateTrailingCategories,TableFixups,DedupeStyles',
		'tplInfo' => true,
		'handlers' => [
			// (1) Trailing categories inside list items are moved out of the list
			[ 'nodeName' => 'li', 'action' => [ LiFixups::class, 'migrateTrailingSolTransparentLinks' ] ],
			[ 'nodeName' => 'dt', 'action' => [ LiFixups::class, 'migrateTrailingSolTransparentLinks' ] ],
			[ 'nodeName' => 'dd', 'action' => [ LiFixups::class, 'migrateTrailingSolTransparentLinks' ] ],
			// (2) Repair templated table cells and table cell attributes
			[ 'nodeName' => 'td', 'action' => [ TableFixups::class, 'handleTableCellTemplates' ] ],
			[ 'nodeName' => 'th', 'action' => [ TableFixups::class, 'handleTableCellTemplates' ] ],
			// (3) Deduplicate template styles; should run after dom-fragment
			//     expansion and after extension post-processors
			[ 'nodeName' => 'style', 'action' => [ DedupeStyles::class, 'dedupe' ] ],
		],
	],
	// Removes left-over marker metas (for example, metas nested in
	// expanded template / extension output).
	'strip-metas' => [
		'name' => 'CleanUp-stripMarkerMetas',
		'handlers' => [
			[ 'nodeName' => 'meta', 'action' => [ CleanUp::class, 'stripMarkerMetas' ] ],
		],
	],
	'displayspace' => [
		'name' => 'DisplaySpace',
		'handlers' => [
			[ 'nodeName' => null, 'action' => [ DisplaySpace::class, 'leftHandler' ] ],
			[ 'nodeName' => null, 'action' => [ DisplaySpace::class, 'rightHandler' ] ],
		],
	],
	'linkclasses' => [
		'name' => 'AddLinkAttributes',
		'handlers' => [
			[ 'nodeName' => 'a', 'action' => [ AddLinkAttributes::class, 'handler' ] ],
		],
	],
	'gen-anchors' => [
		'name' => 'Headings-genAnchors',
		'handlers' => [
			[ 'nodeName' => null, 'action' => [ Headings::class, 'genAnchors' ] ],
		],
	],
	'dedupe-heading-ids' => [
		'name' => 'Headings-dedupeIds',
		'handlers' => [
			[ 'nodeName' => null, 'action' => [ Headings::class, 'dedupeHeadingIds' ] ],
		],
	],
	// Combines the handlers of 'gen-anchors' and 'dedupe-heading-ids'
	'heading-ids' => [
		'name' => 'Headings-genAnchors',
		'handlers' => [
			[ 'nodeName' => null, 'action' => [ Headings::class, 'genAnchors' ] ],
			[ 'nodeName' => null, 'action' => [ Headings::class, 'dedupeHeadingIds' ] ],
		],
	],
	'cleanup' => [
		'name' => 'CleanUp-handleEmptyElts,CleanUp-cleanup',
		'tplInfo' => true,
		'handlers' => [
			// Empty elements are stripped from template content
			[ 'nodeName' => null, 'action' => [ CleanUp::class, 'handleEmptyElements' ] ],
			// Remaining cleanup work
			[ 'nodeName' => null, 'action' => [ CleanUp::class, 'finalCleanup' ] ],
		],
	],
	'saveDP' => [
		'name' => 'CleanUp-saveDataParsoid',
		'tplInfo' => true,
		'handlers' => [
			// Marks which data.parsoid objects get serialized into
			// data-parsoid HTML attributes. Kept as a separate pass so
			// that DOM changes made here cannot affect handlers that
			// would otherwise run alongside it.
			[ 'nodeName' => null, 'action' => [ CleanUp::class, 'saveDataParsoid' ] ],
		],
	],
];
191 | |
/**
 * DOM transforms run by nested (fragment) pipelines, in execution order.
 *
 * Ordering / inclusion notes:
 *
 * - media:
 *   Runs at every level for now. Otherwise the gallery extension's
 *   "packed" mode would need a post-processing pass to scale media after
 *   it has been fetched, which introduces an ordering dependency that may
 *   or may not complicate things.
 *
 * - migrate-metas:
 *   - Runs after 'pwrap', since p-wrapping can create additional
 *     opportunities for meta migration that would be missed otherwise.
 *   - It could potentially move to just before 'tplwrap' (as a
 *     preprocessing pass for it), but then the pass would have to update
 *     DSR properties where required.
 *   - In summary: without other changes, it can at most be moved before
 *     'media' or after 'migrate-nls'.
 *
 * - dsr, tplwrap:
 *   DSR computation and template wrapping cannot be skipped for top-level
 *   content even when they are part of nested-level pipelines, because
 *   such content might be embedded in attributes and may need to be
 *   processed independently.
 *
 * Nested (non-top-level) pipelines can never include:
 *   lang-converter, convertoffsets, dedupe-styles, cleanup, saveDP
 *
 * FIXME: Perhaps introduce a config flag in the processor config that
 * verifies this property against a pipeline's 'toplevel' state.
 */
public const NESTED_PIPELINE_DOM_TRANSFORMS = [
	'fostered',
	'process-fixups',
	'normalize',
	'pwrap',
	'media',
	'migrate-metas',
	'migrate-nls',
	'dsr',
	'tplwrap',
	'ann-ids',
	'annwrap',
	'linkneighbours+dom-unpack',
];
222 | |
/**
 * Additional DOM transforms run on the top-level document of a full
 * parse, in execution order.
 *
 * Ordering note (lang-converter, redlinks): language conversion and
 * redlink marking happen *before* cleanup and before data-parsoid is
 * saved, because they are also used in pb2pb/html2html passes and their
 * input/output formats should stay consistent.
 */
public const FULL_PARSE_GLOBAL_DOM_TRANSFORMS = [
	// FIXME: The spec should document that an extension's wtDOMProcess
	// handler is run once on the top level document.
	'extpp',
	'fixups+dedupe-styles',
	'linter',
	'strip-metas',
	'lang-converter',
	'redlinks',
	'displayspace',
	'linkclasses',
	// Benefits from running after determining which media are redlinks
	'heading-ids',
	'sections',
	'convertoffsets',
	'cleanup',
	'embedded-docs',
	'saveDP',
	'addmetadata',
];
241 | |
/**
 * DOM transforms run on attribute-embedded documents during a full parse.
 *
 * Relative to FULL_PARSE_GLOBAL_DOM_TRANSFORMS this skips 'sections' and
 * 'addmetadata'.
 *
 * FIXME: extpp, linter, lang-converter, redlinks, heading-ids,
 * convertoffsets and saveDP are skipped for now as well. This replicates
 * behavior prior to this refactor.
 */
public const FULL_PARSE_EMBEDDED_DOC_DOM_TRANSFORMS = [
	'fixups+dedupe-styles',
	'strip-metas',
	'displayspace',
	'linkclasses',
	'cleanup',
	// Has to be run recursively
	'embedded-docs',
	// FIXME: 'saveDP' is intentionally left out. As a consequence, the
	// data-* from embedded HTML fragments won't end up in the pagebundle.
	// Calling it on those fragments, however, results in multiple calls
	// to store embedded docs, so a custom traverser may be needed if
	// these embedded data-* objects are wanted in the pagebundle. (Not a
	// regression: they were never part of the pagebundle anyway.)
	/* 'saveDP' */
];
260 | |
/**
 * Additional DOM transforms run on fragments that are re-parsed during a
 * selective update, in execution order.
 */
public const SELECTIVE_UPDATE_FRAGMENT_GLOBAL_DOM_TRANSFORMS = [
	// FIXME: this should be a different processor
	'extpp',
	'fixups',
	'strip-metas',
	'redlinks',
	'displayspace',
	'linkclasses',
	'gen-anchors',
	'convertoffsets',
	'cleanup',
	// FIXME: This will probably need some special-case code that strips
	// old metadata first, before fresh metadata is added.
	'addmetadata',
];
269 | |
/**
 * DOM transforms run on the top-level page during a selective update,
 * in execution order.
 */
public const SELECTIVE_UPDATE_GLOBAL_DOM_TRANSFORMS = [
	'update-template',
	'linter',
	// FIXME: Are lang converters idempotent?
	'lang-converter',
	'heading-ids',
	'sections',
	'saveDP',
];
274 | |
/**
 * Stage definitions, keyed by the stage ids used in $pipelineRecipes.
 *
 * Every stage names the 'class' to instantiate. A stage may additionally
 * list 'token-handlers' (transformer classes added to the stage) or
 * 'processors' (DOM processor shortcut names, possibly nested lists,
 * which makePipeline() flattens before registering them).
 */
private static $stages = [
	'Tokenizer' => [
		'class' => PegTokenizer::class,
	],
	'TokenTransform2' => [
		'class' => TokenHandlerPipeline::class,
		'token-handlers' => [
			OnlyInclude::class,

			TemplateHandler::class,
			ExtensionHandler::class,

			// Attributes are expanded after templates so that unused
			// branches are not expanded. As with the legacy parser,
			// there is no expansion of quotes, paragraphs etc. in
			// attributes - up to the end of TokenTransform2.
			AttributeExpander::class,

			// From here on, all attributes are expanded to tokens or
			// strings, which is more convenient for these handlers.
			WikiLinkHandler::class,
			ExternalLinkHandler::class,
			LanguageVariantHandler::class,

			// Converts dom-fragment-token tokens all the way to DOM and
			// wraps them in DOMFragment wrapper tokens, which are later
			// unpacked into the DOM by a dom-fragment unpacker.
			DOMFragmentBuilder::class,
		],
	],
	'TokenTransform3' => [
		'class' => TokenHandlerPipeline::class,
		'token-handlers' => [
			TokenStreamPatcher::class,
			// Adds <pre>s
			PreHandler::class,
			QuoteTransformer::class,
			// Must be added before transforms that depend on behavior
			// switches (examples: toc generation, edit sections)
			BehaviorSwitchHandler::class,

			ListHandler::class,
			SanitizerHandler::class,
			// Paragraph wrapping happens post-sanitization so that tags
			// converted to text by the sanitizer have a chance of being
			// wrapped into paragraphs. The sanitizer does not need
			// p-tags to exist for its functioning.
			ParagraphWrapper::class,
		],
	],
	// Builds a tree out of the fully processed token stream
	'TreeBuilder' => [
		'class' => TreeBuilderStage::class,
	],
	// DOM transformer for top-level documents. Performs a lot of DOM
	// post-processing (template wrapping, broken wikitext/html
	// detection, etc.)
	'FullParseDOMTransform' => [
		'class' => DOMProcessorPipeline::class,
		'processors' => [
			self::NESTED_PIPELINE_DOM_TRANSFORMS,
			self::FULL_PARSE_GLOBAL_DOM_TRANSFORMS,
		],
	],
	// DOM transformer for fragments of a top-level document
	'NestedFragmentDOMTransform' => [
		'class' => DOMProcessorPipeline::class,
		'processors' => self::NESTED_PIPELINE_DOM_TRANSFORMS,
	],
	// DOM transformations for attribute-embedded docs of the top-level doc
	'FullParseEmbeddedDocsDOMTransform' => [
		'class' => DOMProcessorPipeline::class,
		'processors' => self::FULL_PARSE_EMBEDDED_DOC_DOM_TRANSFORMS,
	],
	// DOM transformer for fragments during selective updates. This may
	// eventually become identical to NestedFragmentDOMTransform, but at
	// this time it is unclear whether that will materialize.
	'SelectiveUpdateFragmentDOMTransform' => [
		'class' => DOMProcessorPipeline::class,
		'processors' => [
			self::NESTED_PIPELINE_DOM_TRANSFORMS,
			self::SELECTIVE_UPDATE_FRAGMENT_GLOBAL_DOM_TRANSFORMS,
		],
	],
	// DOM transformer for the top-level page during selective updates;
	// used at the top level of the selective-update pipeline.
	'SelectiveUpdateDOMTransform' => [
		'class' => DOMProcessorPipeline::class,
		'processors' => self::SELECTIVE_UPDATE_GLOBAL_DOM_TRANSFORMS,
	],
];
365 | |
/**
 * Pipeline recipes, keyed by pipeline type.
 *
 * Each recipe lists the ids of the 'stages' to chain (see $stages), the
 * 'outType' it produces, and optionally 'alwaysToplevel' (treated as
 * false when absent).
 */
private static $pipelineRecipes = [
	// Wikitext in, fully processed DOM out.
	// Used for all top-level documents.
	'fullparse-wikitext-to-dom' => [
		'alwaysToplevel' => true,
		'outType' => 'DOM',
		'stages' => [
			'Tokenizer',
			'TokenTransform2',
			'TokenTransform3',
			'TreeBuilder',
			'FullParseDOMTransform',
		],
	],

	'fullparse-embedded-docs-dom-to-dom' => [
		'alwaysToplevel' => true,
		'outType' => 'DOM',
		'stages' => [ 'FullParseEmbeddedDocsDOMTransform' ],
	],

	// DOM in, fully processed DOM out.
	'selective-update-dom-to-dom' => [
		'alwaysToplevel' => true,
		'outType' => 'DOM',
		'stages' => [ 'SelectiveUpdateDOMTransform' ],
	],

	// Wikitext in, partially processed DOM out. Used for processing
	// page fragments to DOM in a selective page update context. Always
	// toplevel, because the wikitext being updated is found at the
	// toplevel of the page.
	'selective-update-fragment-wikitext-to-dom' => [
		'alwaysToplevel' => true,
		'outType' => 'DOM',
		'stages' => [
			'Tokenizer',
			'TokenTransform2',
			'TokenTransform3',
			'TreeBuilder',
			'SelectiveUpdateFragmentDOMTransform',
		],
	],

	// Wikitext in, fully processed DOM out. Used for wikitext fragments
	// of a top-level document that should be processed to a DOM
	// fragment. This pipeline doesn't run all of the DOM transformations
	// in the DOMTransform pipeline. We will likely use a specialized
	// DOMTransform stage here.
	'wikitext-to-fragment' => [
		// FIXME: This is known to be always *not* top-level.
		// A different flag could be used to lock these pipelines too.
		'outType' => 'DOM',
		'stages' => [
			'Tokenizer',
			'TokenTransform2',
			'TokenTransform3',
			'TreeBuilder',
			'NestedFragmentDOMTransform',
		],
	],

	// Stage-2 tokens in, DOM fragment out. Runs the same DOM transforms
	// as the 'wikitext-to-fragment' pipeline and will get a specialized
	// DOMTransform stage as above.
	'expanded-tokens-to-fragment' => [
		'outType' => 'DOM',
		'stages' => [ 'TokenTransform3', 'TreeBuilder', 'NestedFragmentDOMTransform' ],
	],

	// Wikitext in; emits tokens that have had all templates, extensions,
	// links, images processed.
	'wikitext-to-expanded-tokens' => [
		'outType' => 'Tokens',
		'stages' => [ 'Tokenizer', 'TokenTransform2' ],
	],

	// PEG tokenizer tokens in; emits tokens that have had all templates
	// and extensions processed.
	'peg-tokens-to-expanded-tokens' => [
		'outType' => 'Tokens',
		'stages' => [ 'TokenTransform2' ],
	],
];
441 | |
/**
 * Names of the pipeline options accepted by getPipeline();
 * defaultOptions() asserts that every supplied option key is listed here.
 */
private static $supportedOptions = [
	// When true, templates found in content have their contents expanded
	'expandTemplates',

	// When true, the pipeline is processing the expanded content of a
	// template or its arguments
	'inTemplate',

	// Extension tag being processed (Ex: ref, references).
	// In current usage, only used for native tag implementations.
	'extTag',

	// Options specific to the extension
	'extTagOpts',

	// The content being parsed is used in an inline context
	'inlineContext',

	// Whether attribute content is being processed.
	// In current usage, used for transcluded attr. keys/values.
	'attrExpansion',
];
464 | |
/**
 * Idle pipelines available for reuse: cache key => list of pipelines.
 * Filled by returnPipeline(), drained by getPipeline().
 */
private array $pipelineCache = [];

/** Environment handed to every stage and pipeline built by this factory. */
private Env $env;

/**
 * @param Env $env
 */
public function __construct( Env $env ) {
	$this->env = $env;
}
472 | |
/**
 * Fill in option defaults and validate option names.
 *
 * 'inTemplate' falls back to false and 'expandTemplates' to true when
 * they are unset (or null). Every key is then asserted to be one of
 * self::$supportedOptions, which catches typos in option names.
 *
 * @param array $options
 * @return array The options with defaults applied
 */
private function defaultOptions( array $options ): array {
	$fallbacks = [
		// by default, we are not inside a template
		'inTemplate' => false,
		// by default, templates get expanded
		'expandTemplates' => true,
	];
	foreach ( $fallbacks as $optName => $fallback ) {
		if ( !isset( $options[$optName] ) ) {
			$options[$optName] = $fallback;
		}
	}

	// Reject unknown option names
	foreach ( array_keys( $options ) as $optName ) {
		$isKnown = in_array( $optName, self::$supportedOptions, true );
		Assert::invariant( $isKnown, 'Invalid cacheKey option: ' . $optName );
	}

	return $options;
}
496 | |
/**
 * Resolve DOM processor shortcut names to processor config arrays.
 *
 * Entries of DOM_PROCESSOR_CONFIG that are plain class names are wrapped
 * into an array with a 'name' (the class name with its namespace
 * stripped) and a 'Processor' (the class name). Array entries are used
 * as they are. In both cases the shortcut name is recorded under the
 * 'shortcut' key.
 *
 * @param array $procNames List of keys of DOM_PROCESSOR_CONFIG
 * @return array List of processor config arrays, in input order
 * @throws InternalException If a name has no entry in DOM_PROCESSOR_CONFIG
 */
public static function procNamesToProcs( array $procNames ): array {
	$processors = [];
	foreach ( $procNames as $name ) {
		// Fail with a clear message here; otherwise an unknown name
		// yields null and only blows up later inside stripNamespace().
		if ( !isset( self::DOM_PROCESSOR_CONFIG[$name] ) ) {
			throw new InternalException( 'Unknown DOM processor: ' . $name );
		}
		$proc = self::DOM_PROCESSOR_CONFIG[$name];
		if ( !is_array( $proc ) ) {
			$proc = [
				'name' => Utils::stripNamespace( $proc ),
				'Processor' => $proc,
			];
		}
		$proc['shortcut'] = $name;
		$processors[] = $proc;
	}
	return $processors;
}
512 | |
/**
 * Build a new pipeline from one of the recipes in self::$pipelineRecipes.
 *
 * Stages are instantiated in recipe order; each stage receives the
 * previously built stage (null for the first one). Token handlers or
 * DOM processors configured for a stage are attached to it.
 *
 * @param string $type Recipe name
 * @param string $cacheKey Cache key stored on the new pipeline
 * @param array $options Passed to every stage and token handler
 * @return ParserPipeline
 * @throws InternalException If there is no recipe for $type
 */
private function makePipeline(
	string $type, string $cacheKey, array $options
): ParserPipeline {
	$recipe = self::$pipelineRecipes[$type] ?? null;
	if ( $recipe === null ) {
		throw new InternalException( 'Unsupported Pipeline: ' . $type );
	}

	$builtStages = [];
	$upstream = null;
	foreach ( $recipe['stages'] as $stageName ) {
		$spec = self::$stages[$stageName];
		$stageClass = $spec['class'];
		$current = new $stageClass( $this->env, $options, $stageName, $upstream );

		if ( isset( $spec['token-handlers'] ) ) {
			foreach ( $spec['token-handlers'] as $handlerClass ) {
				$current->addTransformer( new $handlerClass( $current, $options ) );
			}
		} elseif ( isset( $spec['processors'] ) ) {
			// 'processors' may be a list of names or a list of such
			// lists: collect the leaves into one flat list of names.
			$flatNames = [];
			array_walk_recursive(
				$spec['processors'],
				static function ( $procName ) use ( &$flatNames ) {
					$flatNames[] = $procName;
				}
			);
			$current->registerProcessors( self::procNamesToProcs( $flatNames ) );
		}

		$builtStages[] = $current;
		$upstream = $current;
	}

	$alwaysToplevel = $recipe['alwaysToplevel'] ?? false;
	return new ParserPipeline(
		$alwaysToplevel,
		$type,
		$recipe['outType'],
		$cacheKey,
		$builtStages,
		$this->env
	);
}
564 | |
/**
 * Derive the full cache key for a pipeline from a base key and options.
 *
 * Each applicable option contributes a '::'-prefixed suffix, in this
 * order: noExpand (when 'expandTemplates' is empty), inlineContext,
 * inTemplate, attrExpansion, the 'extTag' value, and - only when
 * 'extTag' is set - the JSON encoding of 'extTagOpts'.
 *
 * @param string $cacheKey Base key
 * @param array $options
 * @return string
 */
private function getCacheKey( string $cacheKey, array $options ): string {
	$suffixes = [];

	if ( empty( $options['expandTemplates'] ) ) {
		$suffixes[] = 'noExpand';
	}
	foreach ( [ 'inlineContext', 'inTemplate', 'attrExpansion' ] as $flag ) {
		if ( !empty( $options[$flag] ) ) {
			$suffixes[] = $flag;
		}
	}
	if ( isset( $options['extTag'] ) ) {
		$suffixes[] = $options['extTag'];
		// FIXME: Not the best strategy. But rather than adding
		// complexity prematurely, see how extensions want to use this
		// first and then work out which constraints are needed.
		if ( isset( $options['extTagOpts'] ) ) {
			$suffixes[] = PHPUtils::jsonEncode( $options['extTagOpts'] );
		}
	}

	foreach ( $suffixes as $suffix ) {
		$cacheKey .= '::' . $suffix;
	}
	return $cacheKey;
}
589 | |
/**
 * Run wikitext through the 'fullparse-wikitext-to-dom' pipeline.
 *
 * @param string $src Wikitext source
 * @return Document Owner document of the pipeline's result
 */
public function parse( string $src ): Document {
	$pipeline = $this->getPipeline( 'fullparse-wikitext-to-dom' );
	$initOptions = [
		'frame' => $this->env->topFrame,
		'toFragment' => false,
	];
	$pipeline->init( $initOptions );

	// Parsing of a top-level doc always starts in SOL state
	$result = $pipeline->parseChunkily( $src, [ 'sol' => true ] );
	return $result->ownerDocument;
}
599 | |
/**
 * Run a selective update through the 'selective-update-dom-to-dom'
 * pipeline.
 *
 * @param SelectiveUpdateData $selparData
 * @param array $options Options for selective DOM update
 *  - mode: (string) One of "template", "section", "generic".
 *    For now, defaults to 'template', if absent
 * @return Document
 */
public function selectiveDOMUpdate( SelectiveUpdateData $selparData, array $options = [] ): Document {
	$pipeline = $this->getPipeline( 'selective-update-dom-to-dom' );
	$initOptions = [
		'frame' => $this->env->topFrame,
		'toFragment' => false,
	];
	$pipeline->init( $initOptions );

	return $pipeline->selectiveParse( $selparData, $options );
}
614 | |
/**
 * Obtain a pipeline of the given type. Pipelines are requested
 * frequently, so an idle one is taken from the cache when available and
 * a new one is built only otherwise.
 *
 * @param string $type
 * @param array $options Pipeline options; these also determine the key
 *   under which the pipeline is cached for reuse.
 * @return ParserPipeline
 */
public function getPipeline(
	string $type, array $options = []
): ParserPipeline {
	$options = $this->defaultOptions( $options );
	$key = $this->getCacheKey( $type, $options );

	// Make sure there is a (possibly empty) pool for this key
	$this->pipelineCache[$key] ??= [];

	// array_pop() yields null on an empty pool: build a new pipeline then
	$pipeline = array_pop( $this->pipelineCache[$key] )
		?? $this->makePipeline( $type, $key, $options );

	// Debugging aid: every handed-out pipeline gets a unique id
	$pipeline->setPipelineId( self::$globalPipelineId++ );

	return $pipeline;
}
643 | |
/**
 * Callback invoked by a pipeline once it has finished processing: puts
 * the pipeline back into the cache under its own cache key. A pool that
 * already holds 100 pipelines is left unchanged and the pipeline is
 * not cached.
 *
 * @param ParserPipeline $pipe
 */
public function returnPipeline( ParserPipeline $pipe ): void {
	$key = $pipe->getCacheKey();
	$pool = $this->pipelineCache[$key] ?? [];

	if ( count( $pool ) < 100 ) {
		$pool[] = $pipe;
	}

	$this->pipelineCache[$key] = $pool;
}
657 | } |