Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 70 |
|
0.00% |
0 / 7 |
CRAP | |
0.00% |
0 / 1 |
ParserPipelineFactory | |
0.00% |
0 / 70 |
|
0.00% |
0 / 7 |
506 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
defaultOptions | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
6 | |||
makePipeline | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
42 | |||
getCacheKey | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
72 | |||
parse | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
2 | |||
getPipeline | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
returnPipeline | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wt2Html; |
5 | |
6 | use Wikimedia\Assert\Assert; |
7 | use Wikimedia\Parsoid\Config\Env; |
8 | use Wikimedia\Parsoid\Core\InternalException; |
9 | use Wikimedia\Parsoid\DOM\Document; |
10 | use Wikimedia\Parsoid\Utils\PHPUtils; |
11 | use Wikimedia\Parsoid\Wt2Html\TreeBuilder\TreeBuilderStage; |
12 | use Wikimedia\Parsoid\Wt2Html\TT\AttributeExpander; |
13 | use Wikimedia\Parsoid\Wt2Html\TT\BehaviorSwitchHandler; |
14 | use Wikimedia\Parsoid\Wt2Html\TT\DOMFragmentBuilder; |
15 | use Wikimedia\Parsoid\Wt2Html\TT\ExtensionHandler; |
16 | use Wikimedia\Parsoid\Wt2Html\TT\ExternalLinkHandler; |
17 | use Wikimedia\Parsoid\Wt2Html\TT\IncludeOnly; |
18 | use Wikimedia\Parsoid\Wt2Html\TT\LanguageVariantHandler; |
19 | use Wikimedia\Parsoid\Wt2Html\TT\ListHandler; |
20 | use Wikimedia\Parsoid\Wt2Html\TT\NoInclude; |
21 | use Wikimedia\Parsoid\Wt2Html\TT\OnlyInclude; |
22 | use Wikimedia\Parsoid\Wt2Html\TT\ParagraphWrapper; |
23 | use Wikimedia\Parsoid\Wt2Html\TT\PreHandler; |
24 | use Wikimedia\Parsoid\Wt2Html\TT\QuoteTransformer; |
25 | use Wikimedia\Parsoid\Wt2Html\TT\SanitizerHandler; |
26 | use Wikimedia\Parsoid\Wt2Html\TT\TemplateHandler; |
27 | use Wikimedia\Parsoid\Wt2Html\TT\TokenStreamPatcher; |
28 | use Wikimedia\Parsoid\Wt2Html\TT\WikiLinkHandler; |
29 | |
/**
 * Builds parser pipelines by chaining together the stages named in a
 * pipeline recipe, and keeps a per-cache-key pool of pipelines so that
 * they can be handed out again instead of being rebuilt.
 */
class ParserPipelineFactory {
	/** Counter used to hand out a fresh id on every getPipeline() call */
	private static $globalPipelineId = 0;

	/**
	 * Stage id => stage description. Every entry names the stage class;
	 * token transform stages also list their transformer classes (in the
	 * order they are added), and the DOM post-processing stage carries
	 * the processor list handed to registerProcessors().
	 */
	private static $stages = [
		'Tokenizer' => [
			'class' => PegTokenizer::class,
		],
		'TokenTransform2' => [
			'class' => TokenTransformManager::class,
			'transformers' => [
				OnlyInclude::class,
				IncludeOnly::class,
				NoInclude::class,

				TemplateHandler::class,
				ExtensionHandler::class,

				// Attributes are expanded after templates so that unused
				// branches are never expanded. Quotes, paragraphs etc. are
				// not expanded inside attributes, as in the PHP parser -
				// only up to text/x-mediawiki/expanded.
				AttributeExpander::class,

				// From here on all attributes are expanded to tokens or
				// strings, which is more convenient for these handlers.
				WikiLinkHandler::class,
				ExternalLinkHandler::class,
				LanguageVariantHandler::class,

				// Converts dom-fragment-token tokens all the way to DOM and
				// wraps them in DOMFragment wrapper tokens, which a
				// dom-fragment unpacker later unpacks into the DOM.
				DOMFragmentBuilder::class
			],
		],
		'TokenTransform3' => [
			'class' => TokenTransformManager::class,
			'transformers' => [
				TokenStreamPatcher::class,
				// Adds <pre>s
				PreHandler::class,
				QuoteTransformer::class,
				// Must be added before transforms that depend on behavior
				// switches (examples: toc generation, edit sections)
				BehaviorSwitchHandler::class,

				ListHandler::class,
				SanitizerHandler::class,
				// Paragraph wrapping happens post-sanitization so that tags
				// the sanitizer converted to text get a chance of being
				// wrapped into paragraphs. The sanitizer itself does not
				// need p-tags to exist in order to function.
				ParagraphWrapper::class
			],
		],
		'TreeBuilder' => [
			// Builds a tree out of the fully processed token stream
			'class' => TreeBuilderStage::class,
		],
		'DOMPP' => [
			// Generic DOM transformer that performs a lot of DOM
			// post-processing (template wrapping, broken wikitext/html
			// detection, etc.)
			'class' => DOMPostProcessor::class,
			'processors' => [],
		],
	];

	/**
	 * Pipeline type => recipe. A recipe gives the output type of the
	 * pipeline and the ordered list of stage ids (keys of self::$stages)
	 * that make it up.
	 */
	private static $pipelineRecipes = [
		// Wikitext in, fully processed DOM out. Used for all top-level
		// documents.
		// Stages 1-5 of the pipeline
		'text/x-mediawiki/full' => [
			'outType' => 'DOM',
			'stages' => [
				'Tokenizer', 'TokenTransform2', 'TokenTransform3', 'TreeBuilder', 'DOMPP'
			]
		],

		// Wikitext in, tokens out, with all templates, extensions, links
		// and images processed.
		// Stages 1-2 of the pipeline
		'text/x-mediawiki' => [
			'outType' => 'Tokens',
			'stages' => [ 'Tokenizer', 'TokenTransform2' ]
		],

		// Tokens from the PEG tokenizer in, tokens out, with all templates
		// and extensions processed.
		// Stage 2 of the pipeline
		'tokens/x-mediawiki' => [
			'outType' => 'Tokens',
			'stages' => [ 'TokenTransform2' ]
		],

		// Tokens from stage 2 in, fully processed DOM out.
		// Stages 3-5 of the pipeline
		'tokens/x-mediawiki/expanded' => [
			'outType' => 'DOM',
			'stages' => [ 'TokenTransform3', 'TreeBuilder', 'DOMPP' ]
		],
	];

	/** Names of the only option keys that defaultOptions() accepts */
	private static $supportedOptions = [
		// If true, templates found in content have their contents expanded
		'expandTemplates',

		// If true, the pipeline is processing the expanded content of a
		// template or its arguments
		'inTemplate',

		// If true, we are in an <includeonly> context
		// (in current usage, isInclude === inTemplate)
		'isInclude',

		// The extension tag being processed (Ex: ref, references)
		// (in current usage, only used for native tag implementation)
		'extTag',

		// Extension-specific options
		'extTagOpts',

		// The content being parsed is used in an inline context
		'inlineContext',

		// Whether attribute content is being processed
		// (in current usage, used for transcluded attr. keys/values)
		'attrExpansion'
	];

	/** Cache key => stack of idle pipelines available for reuse */
	private array $pipelineCache = [];

	/** @var Env */
	private $env;

	/**
	 * @param Env $env Environment passed on to every stage and pipeline
	 *   this factory creates.
	 */
	public function __construct( Env $env ) {
		$this->env = $env;
	}

	/**
	 * Fill in option defaults and reject unknown option names.
	 *
	 * An option that is missing or null receives its default value.
	 *
	 * @param array $options
	 * @return array The options with defaults applied
	 */
	private function defaultOptions( array $options ): array {
		$defaults = [
			// by default we are not inside a template
			'inTemplate' => false,
			// by default we are not in an include context
			'isInclude' => false,
			// by default templates are expanded
			'expandTemplates' => true,
		];
		foreach ( $defaults as $name => $fallback ) {
			if ( !isset( $options[$name] ) ) {
				$options[$name] = $fallback;
			}
		}

		// Guards against typos in pipeline option names
		foreach ( array_keys( $options ) as $name ) {
			$isKnown = in_array( $name, self::$supportedOptions, true );
			Assert::invariant( $isKnown, 'Invalid cacheKey option: ' . $name );
		}

		return $options;
	}

	/**
	 * Instantiate a pipeline of the given type from its recipe.
	 *
	 * Each stage is constructed with the previously built stage (null for
	 * the first one) as its last argument.
	 *
	 * @param string $type Key of self::$pipelineRecipes
	 * @param string $cacheKey Cache key recorded on the new pipeline
	 * @param array $options Options given to every stage and transformer
	 * @return ParserPipeline
	 * @throws InternalException if there is no recipe for $type
	 */
	private function makePipeline(
		string $type, string $cacheKey, array $options
	): ParserPipeline {
		$recipe = self::$pipelineRecipes[$type] ?? null;
		if ( $recipe === null ) {
			throw new InternalException( 'Unsupported Pipeline: ' . $type );
		}

		$builtStages = [];
		$previous = null;
		foreach ( $recipe['stages'] as $stageId ) {
			$spec = self::$stages[$stageId];
			$stageClass = $spec['class'];
			$current = new $stageClass( $this->env, $options, $stageId, $previous );

			if ( isset( $spec['transformers'] ) ) {
				foreach ( $spec['transformers'] as $transformerClass ) {
					$current->addTransformer( new $transformerClass( $current, $options ) );
				}
			} elseif ( isset( $spec['processors'] ) ) {
				$current->registerProcessors( $spec['processors'] );
			}

			$builtStages[] = $current;
			$previous = $current;
		}

		return new ParserPipeline(
			$type, $recipe['outType'], $cacheKey, $builtStages, $this->env
		);
	}

	/**
	 * Derive the cache key for a pipeline from a base key and its options.
	 *
	 * The result is the base key followed by '::'-separated markers for
	 * the options that affect caching.
	 *
	 * @param string $cacheKey Base key (getPipeline() passes the pipeline type)
	 * @param array $options
	 * @return string
	 */
	private function getCacheKey( string $cacheKey, array $options ): string {
		$segments = [ $cacheKey ];

		// These two markers are added when the option is off / absent
		if ( empty( $options['isInclude'] ) ) {
			$segments[] = 'noInclude';
		}
		if ( empty( $options['expandTemplates'] ) ) {
			$segments[] = 'noExpand';
		}

		// These markers are added when the option is on
		foreach ( [ 'inlineContext', 'inTemplate', 'attrExpansion' ] as $flag ) {
			if ( !empty( $options[$flag] ) ) {
				$segments[] = $flag;
			}
		}

		if ( isset( $options['extTag'] ) ) {
			$segments[] = $options['extTag'];
			// FIXME: This is not the best strategy. But, instead of
			// premature complexity, let us see how extensions want to
			// use this and then figure out what constraints are needed.
			if ( isset( $options['extTagOpts'] ) ) {
				$segments[] = PHPUtils::jsonEncode( $options['extTagOpts'] );
			}
		}

		return implode( '::', $segments );
	}

	/**
	 * Run wikitext through a 'text/x-mediawiki/full' pipeline.
	 *
	 * @param string $src Wikitext source
	 * @return Document The ownerDocument of the pipeline's result
	 */
	public function parse( string $src ): Document {
		$fullPipeline = $this->getPipeline( 'text/x-mediawiki/full' );
		$fullPipeline->init( [
			'toplevel' => true,
			'frame' => $this->env->topFrame,
		] );

		$parseOpts = [
			'atTopLevel' => true,
			// Parsing of a top-level doc always starts in SOL state
			'sol' => true,
		];

		return $fullPipeline->parseChunkily( $src, $parseOpts )->ownerDocument;
	}

	/**
	 * Get a pipeline of a given type, reusing an idle cached one when
	 * available and building a new one otherwise. Pipelines are cached
	 * because they are created frequently.
	 *
	 * @param string $type
	 * @param array $options These also determine the key under which the
	 *   pipeline is cached for reuse.
	 * @return ParserPipeline
	 */
	public function getPipeline(
		string $type, array $options = []
	): ParserPipeline {
		$options = $this->defaultOptions( $options );
		$key = $this->getCacheKey( $type, $options );

		$this->pipelineCache[$key] ??= [];

		$pipeline = $this->pipelineCache[$key]
			? array_pop( $this->pipelineCache[$key] )
			: $this->makePipeline( $type, $key, $options );

		// Debugging aid: every pipeline handed out gets a unique id
		$pipeline->setPipelineId( self::$globalPipelineId++ );

		return $pipeline;
	}

	/**
	 * Callback called by a pipeline at the end of its processing. Puts the
	 * pipeline back in the cache, unless 100 pipelines are already stored
	 * under its cache key.
	 *
	 * @param ParserPipeline $pipe
	 */
	public function returnPipeline( ParserPipeline $pipe ): void {
		$key = $pipe->getCacheKey();
		$this->pipelineCache[$key] ??= [];

		if ( count( $this->pipelineCache[$key] ) >= 100 ) {
			return;
		}

		$this->pipelineCache[$key][] = $pipe;
	}
}