Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
20.81% |
31 / 149 |
|
0.00% |
0 / 5 |
CRAP | |
0.00% |
0 / 1 |
ParsoidParser | |
20.81% |
31 / 149 |
|
0.00% |
0 / 5 |
335.43 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
genParserOutput | |
0.00% |
0 / 80 |
|
0.00% |
0 / 1 |
110 | |||
parse | |
88.57% |
31 / 35 |
|
0.00% |
0 / 1 |
9.12 | |||
parseFakeRevision | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
12 | |||
makeLimitReport | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Parser\Parsoid; |
4 | |
5 | use MediaWiki\Content\TextContent; |
6 | use MediaWiki\Content\WikitextContent; |
7 | use MediaWiki\Context\RequestContext; |
8 | use MediaWiki\Languages\LanguageConverterFactory; |
9 | use MediaWiki\MainConfigNames; |
10 | use MediaWiki\MediaWikiServices; |
11 | use MediaWiki\Page\PageReference; |
12 | use MediaWiki\Parser\ParserFactory; |
13 | use MediaWiki\Parser\ParserOptions; |
14 | use MediaWiki\Parser\ParserOutput; |
15 | use MediaWiki\Parser\Parsoid\Config\PageConfigFactory; |
16 | use MediaWiki\Revision\MutableRevisionRecord; |
17 | use MediaWiki\Revision\RevisionRecord; |
18 | use MediaWiki\Revision\SlotRecord; |
19 | use MediaWiki\Title\Title; |
20 | use MediaWiki\WikiMap\WikiMap; |
21 | use Wikimedia\Assert\Assert; |
22 | use Wikimedia\Parsoid\Config\PageConfig; |
23 | use Wikimedia\Parsoid\Parsoid; |
24 | |
/**
 * Parser implementation which uses Parsoid.
 *
 * Currently incomplete; see T236809 for the long-term plan.
 *
 * @since 1.41
 * @unstable since 1.41; see T236809 for plan.
 */
class ParsoidParser /* eventually this will extend \Parser */ {
	/**
	 * Extension-data key under which the prefixed dbkey of the parsed page's
	 * title is stored on every ParserOutput produced by this class.
	 *
	 * @unstable
	 * This should not be used widely right now since this may go away.
	 * This is being added to support DiscussionTools with Parsoid HTML
	 * and after initial exploration, this may be implemented differently.
	 */
	public const PARSOID_TITLE_KEY = "parsoid:title-dbkey";
	private Parsoid $parsoid;
	private PageConfigFactory $pageConfigFactory;
	private LanguageConverterFactory $languageConverterFactory;
	private ParserFactory $legacyParserFactory;

	/**
	 * @param Parsoid $parsoid Used to run the wikitext-to-HTML conversion
	 * @param PageConfigFactory $pageConfigFactory Used to build the PageConfig
	 *  objects handed to Parsoid
	 * @param LanguageConverterFactory $languageConverterFactory Used to find
	 *  the preferred variant for page views
	 * @param ParserFactory $legacyParserFactory Stored only; not read by any
	 *  method of this class at present
	 */
	public function __construct(
		Parsoid $parsoid,
		PageConfigFactory $pageConfigFactory,
		LanguageConverterFactory $languageConverterFactory,
		ParserFactory $legacyParserFactory
	) {
		$this->parsoid = $parsoid;
		$this->pageConfigFactory = $pageConfigFactory;
		$this->languageConverterFactory = $languageConverterFactory;
		$this->legacyParserFactory = $legacyParserFactory;
	}

	/**
	 * Pick the language passed to the PageConfigFactory for a parse.
	 *
	 * The target language from the parser options is used when set. When it
	 * is null and the options describe an interface message, the user
	 * language object from the options is used instead.
	 *
	 * @param ParserOptions $options
	 * @return mixed The value returned by ParserOptions::getTargetLanguage()
	 *  or ParserOptions::getUserLangObj(); may be null, in which case the
	 *  callers rely on the factory defaulting to the title's page language.
	 */
	private function resolveTargetLanguage( ParserOptions $options ) {
		$lang = $options->getTargetLanguage();
		if ( $lang === null && $options->getInterfaceMessage() ) {
			$lang = $options->getUserLangObj();
		}
		return $lang;
	}

	/**
	 * Internal helper to avoid code duplication across two methods
	 * (parse() and parseFakeRevision()).
	 *
	 * Runs Parsoid's wikitext-to-HTML conversion for $pageConfig, converts
	 * the resulting page bundle into a ParserOutput, and decorates that
	 * output with the title key, used parser options, timing data, the
	 * limit report, the Parsoid skinning module and version markers. It
	 * also reports parse metrics to the stats factory.
	 *
	 * @param PageConfig $pageConfig
	 * @param ParserOptions $options
	 * @param ?ParserOutput $previousOutput Result of a previous parse of the
	 *  page, or null. It is only forwarded to Parsoid when the current
	 *  request is sampled (T371713) and the output has a cache revision id.
	 * @return ParserOutput
	 */
	private function genParserOutput(
		PageConfig $pageConfig, ParserOptions $options, ?ParserOutput $previousOutput
	): ParserOutput {
		$parserOutput = new ParserOutput();

		// Parsoid itself does not vary output by parser options right now.
		// But, ensure that any option use by extensions, parser functions,
		// recursive parses, or (in the unlikely future scenario) Parsoid itself
		// are recorded as used.
		$options->registerWatcher( [ $parserOutput, 'recordOption' ] );

		// The enable/disable logic here matches that in Parser::internalParseHalfParsed(),
		// although __NOCONTENTCONVERT__ is handled internal to Parsoid.
		//
		// T349137: It might be preferable to handle __NOCONTENTCONVERT__ here rather than
		// by inspecting the DOM inside Parsoid. That will come in a separate patch.
		$htmlVariantLanguage = null;
		if ( !( $options->getDisableContentConversion() || $options->getInterfaceMessage() ) ) {
			// NOTES (some of these are TODOs for read views integration)
			// 1. This html variant conversion is a pre-cache transform. HtmlOutputRendererHelper
			//    has another variant conversion that is a post-cache transform based on the
			//    'Accept-Language' header. If that header is set, there is really no reason to
			//    do this conversion here. So, eventually, we are likely to either not pass in
			//    the htmlVariantLanguage option below OR disable language conversion from the
			//    wt2html path in Parsoid and this and the Accept-Language variant conversion
			//    both would have to be handled as post-cache transforms.
			//
			// 2. Parser.php calls convert() which computes a preferred variant from the
			//    target language. But, we cannot do that unconditionally here because REST API
			//    requests specify the exact variant via the 'Content-Language' header.
			//
			//    For Parsoid page views, either the callers will have to compute the
			//    preferred variant and set it in ParserOptions OR the REST API will have
			//    to set some other flag indicating that the preferred variant should not
			//    be computed. For now, I am adding a temporary hack, but this should be
			//    replaced with something more sensible (T267067).
			//
			// 3. Additionally, Parsoid's callers will have to set targetLanguage in ParserOptions
			//    to mimic the logic in Parser.php (missing right now).
			$langCode = $pageConfig->getPageLanguageBcp47();
			if ( $options->getRenderReason() === 'page-view' ) { // TEMPORARY HACK
				$langFactory = MediaWikiServices::getInstance()->getLanguageFactory();
				$lang = $langFactory->getLanguage( $langCode );
				$langConv = $this->languageConverterFactory->getLanguageConverter( $lang );
				$htmlVariantLanguage = $langFactory->getLanguage( $langConv->getPreferredVariant() );
			} else {
				$htmlVariantLanguage = $langCode;
			}
		}
		$oldPageConfig = null;
		$oldPageBundle = null;

		// T371713: Temporary statistics collection code to determine
		// feasibility of Parsoid selective update
		$sampleRate = MediaWikiServices::getInstance()->getMainConfig()->get(
			MainConfigNames::ParsoidSelectiveUpdateSampleRate
		);
		// A falsy sample rate disables sampling; otherwise roughly one
		// request in $sampleRate is sampled.
		$doSample = ( $sampleRate && mt_rand( 1, $sampleRate ) === 1 );
		if ( $doSample && $previousOutput !== null && $previousOutput->getCacheRevisionId() ) {
			// Allow fetching the old wikitext corresponding to the
			// $previousOutput
			$oldPageConfig = $this->pageConfigFactory->create(
				Title::newFromLinkTarget( $pageConfig->getLinkTarget() ),
				$options->getUserIdentity(),
				$previousOutput->getCacheRevisionId(),
				null,
				$previousOutput->getLanguage(),
			);
			$oldPageBundle =
				PageBundleParserOutputConverter::pageBundleFromParserOutput(
					$previousOutput
				);
		}

		$defaultOptions = [
			'pageBundle' => true,
			'wrapSections' => true,
			'logLinterData' => true,
			'body_only' => false,
			'htmlVariantLanguage' => $htmlVariantLanguage,
			'offsetType' => 'byte',
			'outputContentVersion' => Parsoid::defaultHTMLVersion(),
			'previousOutput' => $oldPageBundle,
			'previousInput' => $oldPageConfig,
			// The following are passed for metrics & labelling
			'sampleStats' => $doSample,
			'renderReason' => $options->getRenderReason(),
			'userAgent' => RequestContext::getMain()->getRequest()->getHeader( 'User-Agent' ),
		];

		$parserOutput->resetParseStartTime();

		// NOTE(review): $headers looks like an out-parameter of
		// Parsoid::wikitext2html() and its value is not used here. Define
		// it explicitly rather than passing an undefined variable; confirm
		// against the Parsoid signature.
		$headers = null;

		// This can throw ClientError or ResourceLimitExceededException.
		// Callers are responsible for figuring out how to handle them.
		$pageBundle = $this->parsoid->wikitext2html(
			$pageConfig,
			$defaultOptions,
			$headers,
			$parserOutput );

		$parserOutput = PageBundleParserOutputConverter::parserOutputFromPageBundle( $pageBundle, $parserOutput );

		// Record the page title in dbkey form so that post-cache transforms
		// have access to the title.
		$parserOutput->setExtensionData(
			self::PARSOID_TITLE_KEY,
			Title::newFromLinkTarget( $pageConfig->getLinkTarget() )->getPrefixedDBkey()
		);

		// Register a watcher again because the $parserOutput arg
		// and $parserOutput return value above are different objects!
		$options->registerWatcher( [ $parserOutput, 'recordOption' ] );

		$parserOutput->setFromParserOptions( $options );

		$parserOutput->recordTimeProfile();
		$this->makeLimitReport( $options, $parserOutput );

		// T371713: Collect statistics on parsing time -vs- presence of
		// $previousOutput
		$stats = MediaWikiServices::getInstance()->getStatsFactory();
		$labels = [
			'type' => $previousOutput === null ? 'full' : 'selective',
			'wiki' => WikiMap::getCurrentWikiId(),
			'reason' => $options->getRenderReason() ?: 'unknown',
		];
		// The CPU time profile may be unavailable (makeLimitReport() guards
		// against null as well); skip the counter rather than passing null.
		$cpuTime = $parserOutput->getTimeProfile( 'cpu' );
		if ( $cpuTime !== null ) {
			$stats
				->getCounter( 'Parsoid_parse_cpu_seconds' )
				->setLabels( $labels )
				->incrementBy( $cpuTime );
		}
		$stats
			->getCounter( 'Parsoid_parse_total' )
			->setLabels( $labels )
			->increment();

		// Add Parsoid skinning module
		$parserOutput->addModuleStyles( [ 'mediawiki.skinning.content.parsoid' ] );

		// Record Parsoid version in extension data; this allows
		// us to use the onRejectParserCacheValue hook to selectively
		// expire "bad" generated content in the event of a rollback.
		$parserOutput->setExtensionData(
			'core:parsoid-version', Parsoid::version()
		);
		$parserOutput->setExtensionData(
			'core:html-version', Parsoid::defaultHTMLVersion()
		);

		return $parserOutput;
	}

	/**
	 * Convert wikitext to HTML
	 * Do not call this function recursively.
	 *
	 * @param string|TextContent $text Text we want to parse
	 * @param-taint $text escapes_htmlnoent
	 * @param PageReference $page
	 * @param ParserOptions $options
	 * @param bool $linestart Must be true; false is not yet supported
	 * @param bool $clearState Must be true; false is not yet supported
	 * @param int|null $revId ID of the revision being rendered. This is used to render
	 *  REVISION* magic words. 0 means that any current revision will be used. Null means
	 *  that {{REVISIONID}}/{{REVISIONUSER}} will be empty and {{REVISIONTIMESTAMP}} will
	 *  use the current timestamp.
	 * @param ?ParserOutput $previousOutput The (optional) result of a
	 *  previous parse of this page, which can be used for selective update.
	 * @return ParserOutput
	 * @return-taint escaped
	 * @unstable since 1.41
	 */
	public function parse(
		$text, PageReference $page, ParserOptions $options,
		bool $linestart = true, bool $clearState = true, ?int $revId = null,
		?ParserOutput $previousOutput = null
	): ParserOutput {
		Assert::invariant( $linestart, '$linestart=false is not yet supported' );
		Assert::invariant( $clearState, '$clearState=false is not yet supported' );
		$title = Title::newFromPageReference( $page );
		$lang = $this->resolveTargetLanguage( $options );
		// Only look up a stored revision when a concrete revision id was given.
		$pageConfig = ( $revId === null || $revId === 0 ) ? null : $this->pageConfigFactory->create(
			$title,
			$options->getUserIdentity(),
			$revId,
			null, // unused
			$lang // defaults to title page language if null
		);
		$content = null;
		if ( $text instanceof TextContent ) {
			$content = $text;
			$text = $content->getText();
		}
		if ( !( $pageConfig && $pageConfig->getPageMainContent() === $text ) ) {
			// This is a bit awkward!  But we really need to parse $text, which
			// may or may not correspond to the $revId provided!
			// T332928 suggests one solution: splitting the "have revid"
			// callers from the "bare text, no associated revision" callers.
			$revisionRecord = new MutableRevisionRecord( $title );
			if ( $revId !== null ) {
				$revisionRecord->setId( $revId );
			}
			$revisionRecord->setSlot(
				SlotRecord::newUnsaved(
					SlotRecord::MAIN,
					$content ?? new WikitextContent( $text )
				)
			);
			$pageConfig = $this->pageConfigFactory->create(
				$title,
				$options->getUserIdentity(),
				$revisionRecord,
				null, // unused
				$lang // defaults to title page language if null
			);
		}

		return $this->genParserOutput( $pageConfig, $options, $previousOutput );
	}

	/**
	 * @internal
	 *
	 * Convert custom wikitext (stored in main slot of the $fakeRev arg) to HTML.
	 * Callers are expected NOT to stuff the result into ParserCache.
	 *
	 * @param RevisionRecord $fakeRev Revision to parse
	 * @param PageReference $page
	 * @param ParserOptions $options
	 * @return ParserOutput
	 * @unstable since 1.41
	 * @deprecated since 1.43
	 */
	public function parseFakeRevision(
		RevisionRecord $fakeRev, PageReference $page, ParserOptions $options
	): ParserOutput {
		wfDeprecated( __METHOD__, '1.43' );
		$title = Title::newFromPageReference( $page );
		$lang = $this->resolveTargetLanguage( $options );
		$pageConfig = $this->pageConfigFactory->create(
			$title,
			$options->getUserIdentity(),
			$fakeRev,
			null, // unused
			$lang // defaults to title page language if null
		);

		// No previous output is available for a fake revision.
		return $this->genParserOutput( $pageConfig, $options, null );
	}

	/**
	 * Set the limit report data in the current ParserOutput.
	 * This is ported from Parser::makeLimitReport() and should eventually
	 * use the method from the superclass directly.
	 *
	 * Records the CPU time (when available), the wall time, a placeholder
	 * timing profile, and the cache timestamp / TTL / reduced-expiry flag.
	 *
	 * @param ParserOptions $parserOptions
	 * @param ParserOutput $parserOutput Output to annotate; modified in place
	 */
	protected function makeLimitReport(
		ParserOptions $parserOptions, ParserOutput $parserOutput
	) {
		// NOTE(review): the value is never read in this method. The call is
		// kept because the getter may notify the option watcher registered
		// by the caller; confirm before removing.
		$maxIncludeSize = $parserOptions->getMaxIncludeSize();

		$cpuTime = $parserOutput->getTimeProfile( 'cpu' );
		if ( $cpuTime !== null ) {
			$parserOutput->setLimitReportData( 'limitreport-cputime',
				sprintf( "%.3f", $cpuTime )
			);
		}

		// Unlike the CPU time, the wall time is formatted unconditionally.
		$wallTime = $parserOutput->getTimeProfile( 'wall' );
		$parserOutput->setLimitReportData( 'limitreport-walltime',
			sprintf( "%.3f", $wallTime )
		);

		$parserOutput->setLimitReportData( 'limitreport-timingprofile', [ 'not yet supported' ] );

		// Add other cache related metadata
		$parserOutput->setLimitReportData( 'cachereport-timestamp',
			$parserOutput->getCacheTime() );
		$parserOutput->setLimitReportData( 'cachereport-ttl',
			$parserOutput->getCacheExpiry() );
		$parserOutput->setLimitReportData( 'cachereport-transientcontent',
			$parserOutput->hasReducedExpiry() );
	}

}