Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 90 |
|
0.00% |
0 / 9 |
CRAP | |
0.00% |
0 / 1 |
ParserPipelineFactory | |
0.00% |
0 / 90 |
|
0.00% |
0 / 9 |
650 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
defaultOptions | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
procNamesToProcs | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
12 | |||
makePipeline | |
0.00% |
0 / 33 |
|
0.00% |
0 / 1 |
42 | |||
getCacheKey | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
56 | |||
parse | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
selectiveDOMUpdate | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
getPipeline | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
returnPipeline | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Parsoid\Config\Env; |
8 | use Wikimedia\Parsoid\Core\InternalException; |
9 | use Wikimedia\Parsoid\Core\SelectiveUpdateData; |
10 | use Wikimedia\Parsoid\DOM\Document; |
11 | use Wikimedia\Parsoid\Utils\PHPUtils; |
12 | use Wikimedia\Parsoid\Utils\Utils; |
13 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\AddAnnotationIds; |
14 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\AddLinkAttributes; |
15 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\CleanUp; |
16 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\DedupeStyles; |
17 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\DisplaySpace; |
18 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\HandleLinkNeighbours; |
19 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\Headings; |
20 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\LiFixups; |
21 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\TableFixups; |
22 | use Wikimedia\Parsoid\Wt2Html\DOM\Handlers\UnpackDOMFragments; |
23 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\AddMediaInfo; |
24 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\AddMetaData; |
25 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\AddRedLinks; |
26 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\ComputeDSR; |
27 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\ConvertOffsets; |
28 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\LangConverter; |
29 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\Linter; |
30 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\MarkFosteredContent; |
31 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\MigrateTemplateMarkerMetas; |
32 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\MigrateTrailingNLs; |
33 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\Normalize; |
34 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\ProcessEmbeddedDocs; |
35 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\ProcessTreeBuilderFixups; |
36 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\PWrap; |
37 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\RunExtensionProcessors; |
38 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\UpdateTemplateOutput; |
39 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\WrapAnnotations; |
40 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\WrapSections; |
41 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\WrapTemplates; |
42 | use Wikimedia\Parsoid\Wt2Html\TreeBuilder\TreeBuilderStage; |
43 | use Wikimedia\Parsoid\Wt2Html\TT\AttributeExpander; |
44 | use Wikimedia\Parsoid\Wt2Html\TT\BehaviorSwitchHandler; |
45 | use Wikimedia\Parsoid\Wt2Html\TT\DOMFragmentBuilder; |
46 | use Wikimedia\Parsoid\Wt2Html\TT\ExtensionHandler; |
47 | use Wikimedia\Parsoid\Wt2Html\TT\ExternalLinkHandler; |
48 | use Wikimedia\Parsoid\Wt2Html\TT\LanguageVariantHandler; |
49 | use Wikimedia\Parsoid\Wt2Html\TT\ListHandler; |
50 | use Wikimedia\Parsoid\Wt2Html\TT\OnlyInclude; |
51 | use Wikimedia\Parsoid\Wt2Html\TT\ParagraphWrapper; |
52 | use Wikimedia\Parsoid\Wt2Html\TT\PreHandler; |
53 | use Wikimedia\Parsoid\Wt2Html\TT\QuoteTransformer; |
54 | use Wikimedia\Parsoid\Wt2Html\TT\SanitizerHandler; |
55 | use Wikimedia\Parsoid\Wt2Html\TT\TemplateHandler; |
56 | use Wikimedia\Parsoid\Wt2Html\TT\TokenStreamPatcher; |
57 | use Wikimedia\Parsoid\Wt2Html\TT\WikiLinkHandler; |
58 | |
59 | /** |
60 | * This class assembles parser pipelines from parser stages |
61 | */ |
62 | class ParserPipelineFactory { |
63 | private static $globalPipelineId = 0; |
64 | |
65 | private const DOM_PROCESSOR_CONFIG = [ |
66 | 'addmetadata' => AddMetaData::class, |
67 | 'annwrap' => WrapAnnotations::class, |
68 | 'convertoffsets' => ConvertOffsets::class, |
69 | 'dsr' => ComputeDSR::class, |
70 | 'embedded-docs' => ProcessEmbeddedDocs::class, |
71 | 'extpp' => RunExtensionProcessors::class, |
72 | 'fostered' => MarkFosteredContent::class, |
73 | 'linter' => Linter::class, |
74 | 'lang-converter' => LangConverter::class, |
75 | 'media' => AddMediaInfo::class, |
76 | 'migrate-metas' => MigrateTemplateMarkerMetas::class, |
77 | 'migrate-nls' => MigrateTrailingNLs::class, |
78 | 'normalize' => Normalize::class, |
79 | 'process-fixups' => ProcessTreeBuilderFixups::class, |
80 | 'pwrap' => PWrap::class, |
81 | 'redlinks' => AddRedLinks::class, |
82 | 'sections' => WrapSections::class, // Don't process HTML in embedded attributes |
83 | 'tplwrap' => WrapTemplates::class, |
84 | 'update-template' => UpdateTemplateOutput::class, |
85 | 'ann-ids' => [ |
86 | 'name' => 'AddAnnotationIds', |
87 | 'handlers' => [ |
88 | [ 'nodeName' => 'meta', 'action' => [ AddAnnotationIds::class, 'handler' ] ] |
89 | ], |
90 | 'withAnnotations' => true |
91 | ], |
92 | 'linkneighbours+dom-unpack' => [ |
93 | 'name' => 'HandleLinkNeighbours,UnpackDOMFragments', |
94 | 'handlers' => [ |
95 | // Link prefixes and suffixes |
96 | [ 'nodeName' => 'a', 'action' => [ HandleLinkNeighbours::class, 'handler' ] ], |
97 | [ 'nodeName' => null, 'action' => [ UnpackDOMFragments::class, 'handler' ] ] |
98 | ] |
99 | ], |
100 | 'fixups' => [ |
101 | 'name' => 'MigrateTrailingCategories,TableFixups', |
102 | 'tplInfo' => true, |
103 | 'handlers' => [ |
104 | // 1. Move trailing categories in <li>s out of the list |
105 | [ 'nodeName' => 'li', 'action' => [ LiFixups::class, 'migrateTrailingSolTransparentLinks' ] ], |
106 | [ 'nodeName' => 'dt', 'action' => [ LiFixups::class, 'migrateTrailingSolTransparentLinks' ] ], |
107 | [ 'nodeName' => 'dd', 'action' => [ LiFixups::class, 'migrateTrailingSolTransparentLinks' ] ], |
108 | // 2. Fix up issues from templated table cells and table cell attributes |
109 | [ 'nodeName' => 'td', 'action' => [ TableFixups::class, 'handleTableCellTemplates' ] ], |
110 | [ 'nodeName' => 'th', 'action' => [ TableFixups::class, 'handleTableCellTemplates' ] ], |
111 | ] |
112 | ], |
113 | 'fixups+dedupe-styles' => [ |
114 | 'name' => 'MigrateTrailingCategories,TableFixups,DedupeStyles', |
115 | 'tplInfo' => true, |
116 | 'handlers' => [ |
117 | // 1. Move trailing categories in <li>s out of the list |
118 | [ 'nodeName' => 'li', 'action' => [ LiFixups::class, 'migrateTrailingSolTransparentLinks' ] ], |
119 | [ 'nodeName' => 'dt', 'action' => [ LiFixups::class, 'migrateTrailingSolTransparentLinks' ] ], |
120 | [ 'nodeName' => 'dd', 'action' => [ LiFixups::class, 'migrateTrailingSolTransparentLinks' ] ], |
121 | // 2. Fix up issues from templated table cells and table cell attributes |
122 | [ 'nodeName' => 'td', 'action' => [ TableFixups::class, 'handleTableCellTemplates' ] ], |
123 | [ 'nodeName' => 'th', 'action' => [ TableFixups::class, 'handleTableCellTemplates' ] ], |
124 | // 3. Deduplicate template styles |
125 | // (should run after dom-fragment expansion + after extension post-processors) |
126 | [ 'nodeName' => 'style', 'action' => [ DedupeStyles::class, 'dedupe' ] ] |
127 | ] |
128 | ], |
129 | // Strip marker metas -- removes left over marker metas (ex: metas |
130 | // nested in expanded tpl/extension output). |
131 | 'strip-metas' => [ |
132 | 'name' => 'CleanUp-stripMarkerMetas', |
133 | 'handlers' => [ |
134 | [ 'nodeName' => 'meta', 'action' => [ CleanUp::class, 'stripMarkerMetas' ] ] |
135 | ] |
136 | ], |
137 | 'displayspace+linkclasses' => [ |
138 | 'name' => 'DisplaySpace+AddLinkAttributes', |
139 | 'handlers' => [ |
140 | [ 'nodeName' => null, 'action' => [ DisplaySpace::class, 'leftHandler' ] ], |
141 | [ 'nodeName' => null, 'action' => [ DisplaySpace::class, 'rightHandler' ] ], |
142 | [ 'nodeName' => 'a', 'action' => [ AddLinkAttributes::class, 'handler' ] ] |
143 | ] |
144 | ], |
145 | 'gen-anchors' => [ |
146 | 'name' => 'Headings-genAnchors', |
147 | 'handlers' => [ |
148 | [ 'nodeName' => null, 'action' => [ Headings::class, 'genAnchors' ] ], |
149 | ] |
150 | ], |
151 | 'dedupe-heading-ids' => [ |
152 | 'name' => 'Headings-dedupeIds', |
153 | 'handlers' => [ |
154 | [ 'nodeName' => null, 'action' => [ Headings::class, 'dedupeHeadingIds' ] ] |
155 | ] |
156 | ], |
157 | 'heading-ids' => [ |
158 | 'name' => 'Headings-genAnchors', |
159 | 'handlers' => [ |
160 | [ 'nodeName' => null, 'action' => [ Headings::class, 'genAnchors' ] ], |
161 | [ 'nodeName' => null, 'action' => [ Headings::class, 'dedupeHeadingIds' ] ] |
162 | ] |
163 | ], |
164 | 'cleanup' => [ |
165 | 'name' => 'CleanUp-handleEmptyElts,CleanUp-cleanup', |
166 | 'tplInfo' => true, |
167 | 'handlers' => [ |
168 | // Strip empty elements from template content |
169 | [ 'nodeName' => null, 'action' => [ CleanUp::class, 'handleEmptyElements' ] ], |
170 | // Additional cleanup |
171 | [ 'nodeName' => null, 'action' => [ CleanUp::class, 'finalCleanup' ] ] |
172 | ] |
173 | ], |
174 | 'saveDP' => [ |
175 | 'name' => 'CleanUp-saveDataParsoid', |
176 | 'tplInfo' => true, |
177 | 'handlers' => [ |
178 | // Mark which data.parsoid's should be serialized into |
179 | // data-parsoid html attributes. |
180 | // Make this its own thing so that any changes to the DOM |
181 | // don't affect other handlers that run alongside it. |
182 | [ 'nodeName' => null, 'action' => [ CleanUp::class, 'saveDataParsoid' ] ] |
183 | ] |
184 | ] |
185 | ]; |
186 | |
187 | // NOTES about ordering / inclusion: |
188 | // |
189 | // media: |
190 | // This is run at all levels for now - gallery extension's "packed" mode |
191 | // would otherwise need a post-processing pass to scale media after it |
192 | // has been fetched. That introduces an ordering dependency that may |
193 | // or may not complicate things. |
194 | // migrate-metas: |
195 | // - Run this after 'pwrap' because it can add additional opportunities for |
196 | // meta migration which we will miss if we run this before p-wrapping. |
197 | // - We could potentially move this just before 'tplwrap' by seeing this |
198 | // as a preprocessing pass for that. But, we will have to update the pass |
199 | // to update DSR properties where required. |
200 | // - In summary, this can at most be moved before 'media' or after |
201 | // 'migrate-nls' without needing any other changes. |
202 | // dsr, tplwrap: |
203 | // DSR computation and template wrapping cannot be skipped for top-level content |
204 | // even if they are part of nested level pipelines, because such content might be |
205 | // embedded in attributes and they may need to be processed independently. |
206 | // |
207 | // Nested (non-top-level) pipelines can never include the following: |
208 | // - lang-converter, convertoffsets, dedupe-styles, cleanup, saveDP |
209 | // |
210 | // FIXME: Perhaps introduce a config flag in the processor config that |
211 | // verifies this property against a pipeline's 'toplevel' state. |
212 | public const NESTED_PIPELINE_DOM_TRANSFORMS = [ |
213 | 'fostered', 'process-fixups', 'normalize', 'pwrap', |
214 | 'media', 'migrate-metas', 'migrate-nls', 'dsr', 'tplwrap', |
215 | 'ann-ids', 'annwrap', 'linkneighbours+dom-unpack' |
216 | ]; |
217 | |
218 | // NOTES about ordering: |
219 | // lang-converter, redlinks: |
220 | // Language conversion and redlink marking are done here |
221 | // *before* we cleanup and save data-parsoid because they |
222 | // are also used in pb2pb/html2html passes, and we want to |
223 | // keep their input/output formats consistent. |
224 | public const FULL_PARSE_GLOBAL_DOM_TRANSFORMS = [ |
225 | // FIXME: It should be documented in the spec that an extension's |
226 | // wtDOMProcess handler is run once on the top level document. |
227 | 'extpp', |
228 | 'fixups+dedupe-styles', 'linter', 'strip-metas', |
229 | 'lang-converter', 'redlinks', 'displayspace+linkclasses', |
230 | // Benefits from running after determining which media are redlinks |
231 | 'heading-ids', |
232 | 'sections', 'convertoffsets', 'cleanup', |
233 | 'embedded-docs', |
234 | 'saveDP', 'addmetadata' |
235 | ]; |
236 | |
237 | // Skipping sections, addmetadata from the above pipeline |
238 | // |
239 | // FIXME: Skip extpp, linter, lang-converter, redlinks, heading-ids, convertoffsets, saveDP for now. |
240 | // This replicates behavior prior to this refactor. |
241 | public const FULL_PARSE_EMBEDDED_DOC_DOM_TRANSFORMS = [ |
242 | 'fixups+dedupe-styles', 'strip-metas', |
243 | 'displayspace+linkclasses', |
244 | 'cleanup', |
245 | // Need to run this recursively |
246 | 'embedded-docs', |
247 | // FIXME This means the data-* from embedded HTML fragments won't end up |
248 | // in the pagebundle. But, if we try to call this on those fragments, |
249 | // we get multiple calls to store embedded docs. So, we may need to |
250 | // write a custom traverser if we want these embedded data* objects |
251 | // in the pagebundle (this is not a regression since they weren't part |
252 | // of the pagebundle all this while anyway.) |
253 | /* 'saveDP' */ |
254 | ]; |
255 | |
256 | public const SELECTIVE_UPDATE_FRAGMENT_GLOBAL_DOM_TRANSFORMS = [ |
257 | 'extpp', // FIXME: this should be a different processor |
258 | 'fixups', 'strip-metas', 'redlinks', 'displayspace+linkclasses', |
259 | 'gen-anchors', 'convertoffsets', 'cleanup', |
260 | // FIXME: This will probably need some special-case code to first |
261 | // strip old metadata before adding fresh metadata. |
262 | 'addmetadata' |
263 | ]; |
264 | |
265 | public const SELECTIVE_UPDATE_GLOBAL_DOM_TRANSFORMS = [ |
266 | 'update-template', 'linter', 'lang-converter', /* FIXME: Are lang converters idempotent? */ |
267 | 'heading-ids', 'sections', 'saveDP' |
268 | ]; |
269 | |
270 | private static $stages = [ |
271 | "Tokenizer" => [ |
272 | "class" => PegTokenizer::class, |
273 | ], |
274 | "TokenTransform2" => [ |
275 | "class" => TokenTransformManager::class, |
276 | "transformers" => [ |
277 | OnlyInclude::class, |
278 | |
279 | TemplateHandler::class, |
280 | ExtensionHandler::class, |
281 | |
282 | // Expand attributes after templates to avoid expanding unused branches. |
283 | // No expansion of quotes, paragraphs etc in attributes, |
284 | // as with the legacy parser - up to end of TokenTransform2. |
285 | AttributeExpander::class, |
286 | |
287 | // now all attributes expanded to tokens or string |
288 | // more convenient after attribute expansion |
289 | WikiLinkHandler::class, |
290 | ExternalLinkHandler::class, |
291 | LanguageVariantHandler::class, |
292 | |
293 | // This converts dom-fragment-token tokens all the way to DOM |
294 | // and wraps them in DOMFragment wrapper tokens which will then |
295 | // get unpacked into the DOM by a dom-fragment unpacker. |
296 | DOMFragmentBuilder::class |
297 | ], |
298 | ], |
299 | "TokenTransform3" => [ |
300 | "class" => TokenTransformManager::class, |
301 | "transformers" => [ |
302 | TokenStreamPatcher::class, |
303 | // add <pre>s |
304 | PreHandler::class, |
305 | QuoteTransformer::class, |
306 | // add before transforms that depend on behavior switches |
307 | // examples: toc generation, edit sections |
308 | BehaviorSwitchHandler::class, |
309 | |
310 | ListHandler::class, |
311 | SanitizerHandler::class, |
312 | // Wrap tokens into paragraphs post-sanitization so that |
313 | // tags that converted to text by the sanitizer have a chance |
314 | // of getting wrapped into paragraphs. The sanitizer does not |
315 | // require the existence of p-tags for its functioning. |
316 | ParagraphWrapper::class |
317 | ], |
318 | ], |
319 | // Build a tree out of the fully processed token stream |
320 | "TreeBuilder" => [ |
321 | "class" => TreeBuilderStage::class, |
322 | ], |
323 | // DOM transformer for top-level documents. |
324 | // This performs a lot of post-processing of the DOM |
325 | // (Template wrapping, broken wikitext/html detection, etc.) |
326 | "FullParseDOMTransform" => [ |
327 | "class" => DOMPostProcessor::class, |
328 | "processors" => [ |
329 | self::NESTED_PIPELINE_DOM_TRANSFORMS, |
330 | self::FULL_PARSE_GLOBAL_DOM_TRANSFORMS |
331 | ], |
332 | ], |
333 | // DOM transformer for fragments of a top-level document |
334 | "NestedFragmentDOMTransform" => [ |
335 | "class" => DOMPostProcessor::class, |
336 | "processors" => self::NESTED_PIPELINE_DOM_TRANSFORMS |
337 | ], |
338 | // DOM transformations to run on attribute-embedded docs of the top level doc |
339 | "FullParseEmbeddedDocsDOMTransform" => [ |
340 | "class" => DOMPostProcessor::class, |
341 | "processors" => self::FULL_PARSE_EMBEDDED_DOC_DOM_TRANSFORMS |
342 | ], |
343 | // DOM transformer for fragments during selective updates. |
344 | // This may eventually become identical to NestedFragmentDOMTransform,
345 | // but at this time, it is unclear if that will materialize. |
346 | "SelectiveUpdateFragmentDOMTransform" => [ |
347 | "class" => DOMPostProcessor::class, |
348 | "processors" => [ |
349 | self::NESTED_PIPELINE_DOM_TRANSFORMS, |
350 | self::SELECTIVE_UPDATE_FRAGMENT_GLOBAL_DOM_TRANSFORMS |
351 | ], |
352 | ], |
353 | // DOM transformer for the top-level page during selective updates. |
354 | "SelectiveUpdateDOMTransform" => [ |
355 | // For use in the top-level of the selective-update pipeline |
356 | "class" => DOMPostProcessor::class, |
357 | "processors" => self::SELECTIVE_UPDATE_GLOBAL_DOM_TRANSFORMS |
358 | ] |
359 | ]; |
360 | |
361 | private static $pipelineRecipes = [ |
362 | // This pipeline takes wikitext as input and emits a fully |
363 | // processed DOM as output. This is the pipeline used for |
364 | // all top-level documents. |
365 | "fullparse-wikitext-to-dom" => [ |
366 | "alwaysToplevel" => true, |
367 | "outType" => "DOM", |
368 | "stages" => [ |
369 | "Tokenizer", "TokenTransform2", "TokenTransform3", "TreeBuilder", "FullParseDOMTransform" |
370 | ] |
371 | ], |
372 | |
373 | "fullparse-embedded-docs-dom-to-dom" => [ |
374 | "alwaysToplevel" => true, |
375 | "outType" => "DOM", |
376 | "stages" => [ "FullParseEmbeddedDocsDOMTransform" ] |
377 | ], |
378 | |
379 | // This pipeline takes a DOM and emits a fully processed DOM as output. |
380 | "selective-update-dom-to-dom" => [ |
381 | "alwaysToplevel" => true, |
382 | "outType" => "DOM", |
383 | "stages" => [ "SelectiveUpdateDOMTransform" ] |
384 | ], |
385 | |
386 | // This pipeline takes wikitext as input and emits a partially |
387 | // processed DOM as output. This is the pipeline used for processing |
388 | // page fragments to DOM in a selective page update context |
389 | // This is always toplevel because the wikitext being updated |
390 | // is found at the toplevel of the page. |
391 | "selective-update-fragment-wikitext-to-dom" => [ |
392 | "alwaysToplevel" => true, |
393 | "outType" => "DOM", |
394 | "stages" => [ |
395 | "Tokenizer", "TokenTransform2", "TokenTransform3", "TreeBuilder", "SelectiveUpdateFragmentDOMTransform" |
396 | ] |
397 | ], |
398 | |
399 | // This pipeline takes wikitext as input and emits a fully |
400 | // processed DOM as output. This is the pipeline used for |
401 | // wikitext fragments of a top-level document that should be |
402 | // processed to a DOM fragment. This pipeline doesn't run all |
403 | // of the DOM transformations in the DOMTransform pipeline. |
404 | // We will likely use a specialized DOMTransform stage here.
405 | "wikitext-to-fragment" => [ |
406 | // FIXME: This is known to be always *not* top-level |
407 | // We could use a different flag to lock these pipelines too. |
408 | "outType" => "DOM", |
409 | "stages" => [ |
410 | "Tokenizer", "TokenTransform2", "TokenTransform3", "TreeBuilder", "NestedFragmentDOMTransform" |
411 | ] |
412 | ], |
413 | |
414 | // This pipeline takes tokens from stage 2 and emits a DOM fragment |
415 | // as output - this runs the same DOM transforms as the 'wikitext-to-fragment' |
416 | // pipeline and will get a specialized DOMTransform stage as above.
417 | "expanded-tokens-to-fragment" => [ |
418 | "outType" => "DOM", |
419 | "stages" => [ "TokenTransform3", "TreeBuilder", "NestedFragmentDOMTransform" ] |
420 | ], |
421 | |
422 | // This pipeline takes wikitext as input and emits tokens that |
423 | // have had all templates, extensions, links, images processed |
424 | "wikitext-to-expanded-tokens" => [ |
425 | "outType" => "Tokens", |
426 | "stages" => [ "Tokenizer", "TokenTransform2" ] |
427 | ], |
428 | |
429 | // This pipeline takes tokens from the PEG tokenizer and emits |
430 | // tokens that have had all templates and extensions processed. |
431 | "peg-tokens-to-expanded-tokens" => [ |
432 | "outType" => "Tokens", |
433 | "stages" => [ "TokenTransform2" ] |
434 | ] |
435 | ]; |
436 | |
437 | private static $supportedOptions = [ |
438 | // If true, templates found in content will have its contents expanded |
439 | 'expandTemplates', |
440 | |
441 | // If true, indicates pipeline is processing the expanded content of a |
442 | // template or its arguments |
443 | 'inTemplate', |
444 | |
445 | // The extension tag that is being processed (Ex: ref, references) |
446 | // (in current usage, only used for native tag implementation) |
447 | 'extTag', |
448 | |
449 | // Extension-specific options |
450 | 'extTagOpts', |
451 | |
452 | // Content being parsed is used in an inline context |
453 | 'inlineContext', |
454 | |
455 | // Are we processing content of attributes? |
456 | // (in current usage, used for transcluded attr. keys/values) |
457 | 'attrExpansion', |
458 | ]; |
459 | |
460 | private array $pipelineCache = []; |
461 | |
462 | private Env $env; |
463 | |
/**
 * @param Env $env Environment stored on the factory; it is handed to every
 *   stage and pipeline this factory constructs.
 */
public function __construct( Env $env ) {
	$this->env = $env;
}
467 | |
/**
 * Fill in defaults for pipeline options and reject unknown option names.
 *
 * @param array $options Caller-supplied pipeline options
 * @return array The options, with 'inTemplate' and 'expandTemplates'
 *   guaranteed to be set (non-null)
 */
private function defaultOptions( array $options ): array {
	// Unless the caller says otherwise, we are not inside a template ...
	$options['inTemplate'] = $options['inTemplate'] ?? false;
	// ... and templates found in the content are expanded.
	$options['expandTemplates'] = $options['expandTemplates'] ?? true;

	// Guard against typos in option names: every key has to be one of
	// the supported options.
	foreach ( array_keys( $options ) as $optionName ) {
		$isSupported = in_array( $optionName, self::$supportedOptions, true );
		Assert::invariant( $isSupported, 'Invalid cacheKey option: ' . $optionName );
	}

	return $options;
}
491 | |
/**
 * Resolve DOM processor shortcut names to their processor specs.
 *
 * Entries of DOM_PROCESSOR_CONFIG that are plain class names are expanded
 * to [ 'name' => <class name without namespace>, 'Processor' => <class> ];
 * entries that are already arrays are used as is. Every resulting spec
 * additionally records the shortcut it was looked up by under 'shortcut'.
 *
 * @param array $procNames List of keys of DOM_PROCESSOR_CONFIG
 * @return array List of processor specs, in the same order as $procNames
 * @throws InternalException If a name is not a key of DOM_PROCESSOR_CONFIG
 */
public static function procNamesToProcs( array $procNames ): array {
	$processors = [];
	foreach ( $procNames as $name ) {
		$proc = self::DOM_PROCESSOR_CONFIG[$name] ?? null;
		if ( $proc === null ) {
			// Fail fast with a message that names the offending shortcut,
			// instead of an undefined-index warning followed by a null
			// being passed on to Utils::stripNamespace().
			throw new InternalException( 'Unknown DOM processor: ' . $name );
		}
		if ( !is_array( $proc ) ) {
			// Bare class name: wrap it in the array form.
			$proc = [
				'name' => Utils::stripNamespace( $proc ),
				'Processor' => $proc,
			];
		}
		$proc['shortcut'] = $name;
		$processors[] = $proc;
	}
	return $processors;
}
507 | |
/**
 * Build a new pipeline by instantiating and chaining the stages named in
 * the recipe for the given pipeline type.
 *
 * @param string $type Key of self::$pipelineRecipes
 * @param string $cacheKey Cache key handed to the new pipeline
 * @param array $options Pipeline options, passed to every stage and
 *   token transformer constructor
 * @return ParserPipeline
 * @throws InternalException If there is no recipe for $type
 */
private function makePipeline(
	string $type, string $cacheKey, array $options
): ParserPipeline {
	$recipe = self::$pipelineRecipes[$type] ?? null;
	if ( $recipe === null ) {
		throw new InternalException( 'Unsupported Pipeline: ' . $type );
	}

	$builtStages = [];
	$previous = null;
	foreach ( $recipe["stages"] as $stageId ) {
		$spec = self::$stages[$stageId];
		$stageClass = $spec["class"];
		// Each stage is constructed with a handle on the stage before it.
		$current = new $stageClass( $this->env, $options, $stageId, $previous );

		$transformers = $spec["transformers"] ?? null;
		$procGroups = $spec["processors"] ?? null;
		if ( $transformers !== null ) {
			foreach ( $transformers as $transformerClass ) {
				$current->addTransformer( new $transformerClass( $current, $options ) );
			}
		} elseif ( $procGroups !== null ) {
			// "processors" is either a flat list of shortcut names or a
			// list of such lists; flatten it, preserving order.
			$procNames = [];
			array_walk_recursive(
				$procGroups,
				static function ( $leaf ) use ( &$procNames ) {
					$procNames[] = $leaf;
				}
			);
			$current->registerProcessors( self::procNamesToProcs( $procNames ) );
		}

		$builtStages[] = $current;
		$previous = $current;
	}

	$alwaysToplevel = $recipe['alwaysToplevel'] ?? false;
	return new ParserPipeline(
		$alwaysToplevel,
		$type,
		$recipe["outType"],
		$cacheKey,
		$builtStages,
		$this->env
	);
}
559 | |
/**
 * Derive the pipeline cache key from a base key and the pipeline options.
 *
 * The result is the base key followed by one '::'-separated segment per
 * option that affects it.
 *
 * @param string $cacheKey Base key (callers in this class pass the pipeline type)
 * @param array $options Pipeline options
 * @return string
 */
private function getCacheKey( string $cacheKey, array $options ): string {
	$segments = [ $cacheKey ];

	// Expansion is the default, so only its absence is recorded.
	if ( empty( $options['expandTemplates'] ) ) {
		$segments[] = 'noExpand';
	}

	// Boolean flags contribute their own name when set.
	foreach ( [ 'inlineContext', 'inTemplate', 'attrExpansion' ] as $flag ) {
		if ( !empty( $options[$flag] ) ) {
			$segments[] = $flag;
		}
	}

	if ( isset( $options['extTag'] ) ) {
		$segments[] = $options['extTag'];
		// FIXME: This is not the best strategy. But, instead of
		// premature complexity, let us see how extensions want to
		// use this and then figure out what constraints are needed.
		// Extension options are only considered when an extTag is set.
		if ( isset( $options['extTagOpts'] ) ) {
			$segments[] = PHPUtils::jsonEncode( $options['extTagOpts'] );
		}
	}

	return implode( '::', $segments );
}
584 | |
/**
 * Run the 'fullparse-wikitext-to-dom' pipeline on the given source.
 *
 * @param string $src Source text to parse
 * @return Document The owner document of the pipeline's output
 */
public function parse( string $src ): Document {
	$pipeline = $this->getPipeline( 'fullparse-wikitext-to-dom' );
	$initOptions = [
		'frame' => $this->env->topFrame,
		'toFragment' => false,
	];
	$pipeline->init( $initOptions );

	// Parsing of a top-level doc always starts in SOL state
	$output = $pipeline->parseChunkily( $src, [ 'sol' => true ] );
	return $output->ownerDocument;
}
594 | |
/**
 * Run the 'selective-update-dom-to-dom' pipeline for a selective update.
 *
 * @param SelectiveUpdateData $selparData
 * @param array $options Options for selective DOM update
 *  - mode: (string) One of "template", "section", "generic"
 *    For now, defaults to 'template', if absent
 * @return Document
 */
public function selectiveDOMUpdate( SelectiveUpdateData $selparData, array $options = [] ): Document {
	$pipeline = $this->getPipeline( 'selective-update-dom-to-dom' );
	$initOptions = [
		'frame' => $this->env->topFrame,
		'toFragment' => false,
	];
	$pipeline->init( $initOptions );

	return $pipeline->selectiveParse( $selparData, $options );
}
609 | |
/**
 * Hand out a pipeline of the requested type, reusing a cached one when
 * available and building a new one otherwise.
 *
 * @param string $type Pipeline type (a key of self::$pipelineRecipes)
 * @param array $options Pipeline options. Together with $type, these
 *   determine the key under which the pipeline is cached for reuse.
 * @return ParserPipeline
 */
public function getPipeline(
	string $type, array $options = []
): ParserPipeline {
	$options = $this->defaultOptions( $options );
	$cacheKey = $this->getCacheKey( $type, $options );

	// Make sure there is a (possibly empty) pool for this key.
	$this->pipelineCache[$cacheKey] ??= [];

	$pipe = $this->pipelineCache[$cacheKey] === []
		? $this->makePipeline( $type, $cacheKey, $options )
		: array_pop( $this->pipelineCache[$cacheKey] );

	// Debugging aid: every pipeline handed out gets a fresh id,
	// whether it is new or recycled.
	$pipelineId = self::$globalPipelineId++;
	$pipe->setPipelineId( $pipelineId );

	return $pipe;
}
638 | |
/**
 * Put a pipeline back into the cache so that getPipeline() can reuse it.
 * Pipelines call this back once they are done processing.
 *
 * At most 100 pipelines are kept per cache key; a pipeline returned
 * beyond that is not stored.
 *
 * @param ParserPipeline $pipe
 */
public function returnPipeline( ParserPipeline $pipe ): void {
	$key = $pipe->getCacheKey();
	$this->pipelineCache[$key] ??= [];

	if ( count( $this->pipelineCache[$key] ) >= 100 ) {
		// Pool for this key is full.
		return;
	}

	$this->pipelineCache[$key][] = $pipe;
}
652 | } |