Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
50.50% |
153 / 303 |
|
11.11% |
2 / 18 |
CRAP | |
0.00% |
0 / 1 |
Parsoid | |
50.50% |
153 / 303 |
|
11.11% |
2 / 18 |
629.00 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
version | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
defaultHTMLVersion | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
resolveContentVersion | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
supportsLanguageConversion | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setupCommonOptions | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
72 | |||
parseWikitext | |
0.00% |
0 / 30 |
|
0.00% |
0 / 1 |
72 | |||
wikitext2html | |
97.22% |
35 / 36 |
|
0.00% |
0 / 1 |
6 | |||
recordParseMetrics | |
45.76% |
27 / 59 |
|
0.00% |
0 / 1 |
21.92 | |||
wikitext2lint | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
dom2wikitext | |
0.00% |
0 / 21 |
|
0.00% |
0 / 1 |
6 | |||
recordSerializationMetrics | |
69.23% |
18 / 26 |
|
0.00% |
0 / 1 |
4.47 | |||
html2wikitext | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
pb2pb | |
96.88% |
62 / 64 |
|
0.00% |
0 / 1 |
7 | |||
findDowngrade | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
downgrade | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
30 | |||
implementsLanguageConversionBcp47 | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
2.03 | |||
downgrade999to2 | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid; |
5 | |
6 | use Composer\InstalledVersions; |
7 | use Composer\Semver\Comparator; |
8 | use Composer\Semver\Semver; |
9 | use InvalidArgumentException; |
10 | use LogicException; |
11 | use Wikimedia\Assert\Assert; |
12 | use Wikimedia\Bcp47Code\Bcp47Code; |
13 | use Wikimedia\Parsoid\Config\DataAccess; |
14 | use Wikimedia\Parsoid\Config\Env; |
15 | use Wikimedia\Parsoid\Config\PageConfig; |
16 | use Wikimedia\Parsoid\Config\SiteConfig; |
17 | use Wikimedia\Parsoid\Config\StubMetadataCollector; |
18 | use Wikimedia\Parsoid\Core\ContentMetadataCollector; |
19 | use Wikimedia\Parsoid\Core\DomPageBundle; |
20 | use Wikimedia\Parsoid\Core\PageBundle; |
21 | use Wikimedia\Parsoid\Core\ResourceLimitExceededException; |
22 | use Wikimedia\Parsoid\Core\SelectiveUpdateData; |
23 | use Wikimedia\Parsoid\DOM\Document; |
24 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
25 | use Wikimedia\Parsoid\Language\LanguageConverter; |
26 | use Wikimedia\Parsoid\Logger\LintLogger; |
27 | use Wikimedia\Parsoid\Utils\ComputeSelectiveStats; |
28 | use Wikimedia\Parsoid\Utils\ContentUtils; |
29 | use Wikimedia\Parsoid\Utils\DOMCompat; |
30 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
31 | use Wikimedia\Parsoid\Utils\DOMUtils; |
32 | use Wikimedia\Parsoid\Utils\Timing; |
33 | use Wikimedia\Parsoid\Utils\Utils; |
34 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\AddRedLinks; |
35 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\ConvertOffsets; |
36 | |
37 | class Parsoid { |
38 | |
/**
 * HTML content versions that Parsoid is able to produce.
 * The first entry is the one returned by defaultHTMLVersion().
 * @see https://www.mediawiki.org/wiki/Parsoid/API#Content_Negotiation
 * @see https://www.mediawiki.org/wiki/Specs/HTML#Versioning
 */
public const AVAILABLE_VERSIONS = [
	'2.8.0',
	'999.0.0',
];

/**
 * Known downgrade paths between content versions. Each entry gives the
 * 'from' and 'to' versions (matched by findDowngrade() and downgrade())
 * and, in 'func', the name of the static method of this class that
 * downgrade() invokes on the page bundle.
 */
private const DOWNGRADES = [
	[
		'from' => '999.0.0',
		'to' => '2.0.0',
		'func' => 'downgrade999to2',
	],
];
49 | |
/** Site-wide configuration handed to every Env this object creates. */
private SiteConfig $siteConfig;

/** Data access service handed to every Env this object creates. */
private DataAccess $dataAccess;

/**
 * @param SiteConfig $siteConfig
 * @param DataAccess $dataAccess
 */
public function __construct( SiteConfig $siteConfig, DataAccess $dataAccess ) {
	$this->dataAccess = $dataAccess;
	$this->siteConfig = $siteConfig;
}
62 | |
/**
 * Report which version of the wikimedia/parsoid package is installed,
 * as known to the composer runtime API.
 *
 * @return string The installed version; the literal string 'null' when
 *   composer reports no version, or the literal string 'error' when the
 *   lookup itself throws.
 */
public static function version(): string {
	try {
		// See https://getcomposer.org/doc/07-runtime.md#knowing-the-version-of-package-x
		$installed = InstalledVersions::getVersion( 'wikimedia/parsoid' );
	} catch ( \Throwable $t ) {
		// Guard against parts of the composer runtime API being
		// missing in production.
		return 'error';
	}
	// The composer runtime API docs recommend handling a null return
	// value gracefully, so map it to a placeholder string.
	return $installed ?? 'null';
}
82 | |
/**
 * Get the HTML content version used by default, i.e. the first entry
 * of AVAILABLE_VERSIONS.
 * @return string
 */
public static function defaultHTMLVersion(): string {
	[ $default ] = self::AVAILABLE_VERSIONS;
	return $default;
}
90 | |
/**
 * Find a content version Parsoid knows how to produce that satisfies
 * the supplied version, when interpreted with semver caret semantics.
 * This lets us make backwards compatible changes without requiring
 * clients to bump the version in their headers all the time.
 *
 * @param string $version Requested content version
 * @return string|null The first matching entry of AVAILABLE_VERSIONS,
 *   or null if none matches.
 */
public static function resolveContentVersion( string $version ) {
	$caretConstraint = "^{$version}";
	foreach ( self::AVAILABLE_VERSIONS as $candidate ) {
		if ( !Semver::satisfies( $candidate, $caretConstraint ) ) {
			continue;
		}
		// The section wrapping in 1.6.x should have induced a major
		// version bump, since it requires upgrading clients to handle
		// it. The floor is therefore hardcoded so that requests for
		// anything older fail hard.
		if ( Comparator::greaterThanOrEqualTo( $version, '1.6.0' ) ) {
			return $candidate;
		}
	}
	return null;
}
114 | |
/**
 * Tell whether language conversion is enabled, which is the case when
 * the optional wikimedia/langconv library is installed.
 * @return bool True if the wikimedia/langconv library is available
 */
public static function supportsLanguageConversion(): bool {
	$langConvClass = '\Wikimedia\LangConv\ReplacementMachine';
	return class_exists( $langConvClass );
}
123 | |
/**
 * Pick out of the caller-supplied options those Env options that are
 * handled the same way by the parsing and serialization entry points.
 *
 * @param array $options Options as passed to wikitext2html/dom2wikitext.
 * @return array Options to pass on to the Env constructor.
 */
private function setupCommonOptions( array $options ): array {
	$envOptions = [];

	// Passed through whenever set (and not null).
	foreach ( [ 'offsetType', 'traceFlags', 'dumpFlags', 'debugFlags' ] as $name ) {
		if ( isset( $options[$name] ) ) {
			$envOptions[$name] = $options[$name];
		}
	}

	// Variant languages are passed through only when non-empty.
	foreach ( [ 'htmlVariantLanguage', 'wtVariantLanguage' ] as $name ) {
		if ( !empty( $options[$name] ) ) {
			$envOptions[$name] = $options[$name];
		}
	}

	if ( isset( $options['logLevels'] ) ) {
		$envOptions['logLevels'] = $options['logLevels'];
	}

	return $envOptions;
}
149 | |
/**
 * Parsing code shared by wikitext2html and wikitext2lint: builds the
 * Env, enforces the wikitext size limit and runs the content handler.
 *
 * @param PageConfig $pageConfig
 * @param ContentMetadataCollector $metadata
 * @param array $options See wikitext2html.
 * @param ?SelectiveUpdateData $selparData See wikitext2html.
 * @return array{0:Env,1:Document,2:?string}
 *  The returned document is in "prepared and loaded" form.
 * @throws ResourceLimitExceededException if the wikitextSize limit
 *  check fails.
 */
private function parseWikitext(
	PageConfig $pageConfig,
	ContentMetadataCollector $metadata,
	array $options = [],
	?SelectiveUpdateData $selparData = null
): array {
	$envOptions = $this->setupCommonOptions( $options );
	if ( isset( $options['outputContentVersion'] ) ) {
		$envOptions['outputContentVersion'] = $options['outputContentVersion'];
	}
	// Boolean switches: coerce whatever the caller supplied to bool.
	foreach ( [ 'wrapSections', 'pageBundle', 'logLinterData' ] as $flag ) {
		if ( isset( $options[$flag] ) ) {
			$envOptions[$flag] = (bool)$options[$flag];
		}
	}
	if ( isset( $options['linterOverrides'] ) ) {
		$envOptions['linterOverrides'] = $options['linterOverrides'];
	}
	$envOptions['skipLanguageConversionPass'] = $options['skipLanguageConversionPass'] ?? false;

	$env = new Env(
		$this->siteConfig, $pageConfig, $this->dataAccess, $metadata, $envOptions
	);

	$wikitextSize = strlen( $env->topFrame->getSrcText() );
	if ( !$env->compareWt2HtmlLimit( 'wikitextSize', $wikitextSize ) ) {
		throw new ResourceLimitExceededException(
			"wt2html: wikitextSize limit exceeded"
		);
	}

	$contentmodel = $options['contentmodel'] ?? null;
	$handler = $env->getContentHandler( $contentmodel );
	$extApi = new ParsoidExtensionAPI( $env );
	$doc = $handler->toDOM( $extApi, $selparData );
	if ( !DOMDataUtils::isPreparedAndLoaded( $doc ) ) {
		// DEPRECATED. Extensions for other content types might still
		// be returning plain/stored docs here. Prepare and load them
		// for consistency.
		$doc = ( new DomPageBundle( $doc ) )->toDom();
	}

	return [ $env, $doc, $contentmodel ];
}
208 | |
/**
 * Convert the wikitext supplied in a `PageConfig` to HTML.
 *
 * @param PageConfig $pageConfig
 * @param array $options [
 *   'wrapSections'         => (bool) Whether `<section>` wrappers should be added.
 *   'pageBundle'           => (bool) Sets ids on nodes and stores
 *                                    data-* attributes in a JSON blob.
 *   'body_only'            => (bool|null) Only return the <body> children (T181657)
 *   'outputContentVersion' => (string|null) Version of HTML to output.
 *                                           `null` returns the default version.
 *   'contentmodel'         => (string|null) The content model of the input.
 *   'offsetType'           => (string) ucs2, char, byte are valid values
 *                                      what kind of source offsets should be emitted?
 *   'skipLanguageConversionPass'  => (bool) Skip the language variant conversion pass (defaults to false)
 *   'htmlVariantLanguage'  => (Bcp47Code) If non-null, the language variant used for Parsoid HTML.
 *   'wtVariantLanguage'    => (Bcp47Code) If non-null, the language variant used for wikitext.
 *   'logLinterData'        => (bool) Should we log linter data if linting is enabled?
 *   'linterOverrides'      => (array) Override the site linting configs.
 *   // Debugging options, not for use in production
 *   'traceFlags'           => (array) associative array with tracing options
 *   'dumpFlags'            => (array) associative array with dump options
 *   'debugFlags'           => (array) associative array with debug options
 *   'logLevels'            => (string[]) Levels to log
 *   // Experimental options, not considered stable
 *   'sampleStats'          => (bool) If true, okay to perform "expensive"
 *                             analysis to generate metrics.
 *   'renderReason'         => (?string) Passed through from MediaWiki core
 *                             to classify metrics; see
 *                             ParserOptions::getRenderReason()
 *   'userAgent'            => (?string) Used to label the sampled metrics
 *                             (only read when 'sampleStats' is set).
 *   'previousInput'        => (?PageConfig) wikitext, revision ID, etc of
 *                             some recent parse of this page.
 *                             Not guaranteed to be usable for selective
 *                             update, and could even be from a "newer"
 *                             revision (if this is a render of an old
 *                             revision).
 *   'previousOutput'       => (?PageBundle) output of the prior parse of
 *                             'previousInput'
 * ]
 * @param ?array &$headers Set to the http-equiv headers found in the
 *   resulting document.
 * @param ?ContentMetadataCollector $metadata Pass in a CMC in order to
 *   collect and retrieve metadata about the parse.
 * @param ?SelectiveUpdateData $selparData
 * @return PageBundle|string A PageBundle when the Env has a page bundle,
 *   otherwise the serialized HTML.
 */
public function wikitext2html(
	PageConfig $pageConfig, array $options = [], ?array &$headers = null,
	?ContentMetadataCollector $metadata = null, ?SelectiveUpdateData $selparData = null
) {
	$metadata ??= new StubMetadataCollector( $this->siteConfig );

	// The timed section covers the parse plus storing data attributes.
	$parseTiming = Timing::start();
	[ $env, $doc, $contentmodel ] = $this->parseWikitext(
		$pageConfig, $metadata, $options, $selparData
	);
	DOMDataUtils::visitAndStoreDataAttribs( DOMCompat::getBody( $doc ), [
		'storeInPageBundle' => $env->pageBundle,
		'outputContentVersion' => $env->getOutputContentVersion(),
	] );
	$parseTimeMs = $parseTiming->end();

	// FIXME: Does this belong in parseWikitext so that the other endpoint
	// is covered as well?  It probably depends on expectations of the
	// Rest API.  If callers of /page/lint/ assume that will update the
	// results on the Special page.
	if ( $env->linting() ) {
		$lintLogger = new LintLogger( $env );
		$lintLogger->logLintOutput();
	}

	$headers = DOMUtils::findHttpEquivHeaders( $doc );
	$bodyOnly = !empty( $options['body_only'] );
	$node = $bodyOnly ? DOMCompat::getBody( $doc ) : $doc;

	$out = [];
	if ( $env->pageBundle ) {
		$out['pb'] = PageBundle::fromDomPageBundle( $env->pageBundle, [
			'body_only' => $bodyOnly,
			'contentversion' => $env->getOutputContentVersion(),
			'headers' => $headers,
			'contentmodel' => $contentmodel,
			'offsetType' => $env->getCurrentOffsetType(),
		] );
		// Kept alongside the bundle for use in metrics.
		$out['html'] = $out['pb']->html;
	} else {
		$out['html'] = ContentUtils::toXML( $node, [
			'innerXML' => $bodyOnly,
		] );
	}

	$this->recordParseMetrics(
		$env, $parseTimeMs, $out, $headers, $contentmodel, $options
	);

	return $env->pageBundle ? $out['pb'] : $out['html'];
}
311 | |
/**
 * Report timing, size and (optionally sampled) classification metrics
 * for a completed wikitext-to-HTML parse.
 *
 * @param Env $env Environment the parse ran in.
 * @param float $parseTimeMs Time measured for the parse.
 * @param array $out Parse result: always has an 'html' string; has a
 *   'pb' PageBundle as well when a page bundle was produced.
 * @param ?array $headers http-equiv headers found in the output document.
 * @param ?string $contentmodel Content model as returned by
 *   parseWikitext(), which documents it as nullable.
 * @param array $options Options passed to wikitext2html; this method
 *   reads 'sampleStats', 'previousInput', 'previousOutput',
 *   'renderReason' and 'userAgent'.
 */
private function recordParseMetrics(
	Env $env, float $parseTimeMs,
	array $out, ?array $headers, ?string $contentmodel,
	array $options
) {
	$metrics = $this->siteConfig->metrics();

	$pageConfig = $env->getPageConfig();

	// This is somewhat suspect because ParsoidHandler::tryToCreatePageConfig
	// can set a revision id on a MutableRevisionRecord, but it might be simpler
	// to make that go away
	$mstr = $pageConfig->getRevisionId() ? 'pageWithOldid' : 'wt';

	$timing = Timing::fakeTiming( $this->siteConfig, $parseTimeMs );
	$timing->end( "entry.wt2html.{$mstr}.parse", 'wt2html_parse_seconds', [ 'type' => $mstr ] );
	$version = 'default';

	if ( Semver::satisfies(
		$env->getOutputContentVersion(), '!=' . self::defaultHTMLVersion()
	) ) {
		if ( $metrics ) {
			$metrics->increment( 'entry.wt2html.parse.version.notdefault' );
		}
		$version = 'non-default';
	}

	$this->siteConfig->incrementCounter( 'wt2html_parse_total', [
		'type' => $mstr,
		'version' => $version
	] );

	// @phan-suppress-next-line PhanDeprecatedFunction
	$timing = Timing::fakeTiming( $this->siteConfig, strlen( $pageConfig->getPageMainContent() ) );
	$timing->end(
		"entry.wt2html.{$mstr}.size.input",
		"wt2html_size_input_bytes",
		[ "type" => $mstr ]
	);

	$outSize = strlen( $out['html'] );
	$timing = Timing::fakeTiming( $this->siteConfig, $outSize );
	$timing->end( "entry.wt2html.{$mstr}.size.output", "wt2html_size_output_bytes", [ "type" => $mstr ] );

	if ( $parseTimeMs > 10 && $outSize > 100 ) {
		// * Don't bother with this metric for really small parse times
		//   p99 for initialization time is ~7ms according to grafana.
		//   So, 10ms ensures that startup overheads don't skew the metrics
		// * For body_only=false requests, <head> section isn't generated
		//   and if the output is small, per-request overheads can skew
		//   the timePerKB metrics.
		//
		// NOTE: This is slightly misleading since there are fixed costs
		// for generating output like the <head> section and should be factored in,
		// but this is good enough for now as a useful first degree of approximation.
		$msPerKB = $parseTimeMs * 1024 / $outSize;
		$timing = Timing::fakeTiming( $this->siteConfig, $msPerKB );
		$timing->end(
			'entry.wt2html.timePerKB',
			'wt2html_msPerKB',
			[]
		);
	}

	// Expensive analyses: sampleStats is randomly sampled and will not
	// be true "often"
	$doSample = $options['sampleStats'] ?? false;
	if ( !$doSample ) {
		return;
	}

	try {
		// create new page bundle for this computation to ensure we
		// don't inadvertently corrupt the main document result.
		$newPb = new PageBundle(
			$out['html'],
			$out['pb']->parsoid ?? null, $out['pb']->mw ?? null,
			$env->getOutputContentVersion(),
			$headers,
			$contentmodel
		);
		$labels = ComputeSelectiveStats::classify(
			$env,
			$options['previousInput'] ?? null,
			$options['previousOutput'] ?? null,
			$pageConfig,
			$newPb
		);
		$labels['wiki'] = $this->siteConfig->iwp();
		$labels['reason'] = $options['renderReason'] ?? 'unknown';
		// 'userAgent' is optional: read it with ?? so that a missing key
		// does not raise an "undefined array key" warning, then keep the
		// original normalization of empty values to null.
		$userAgent = $options['userAgent'] ?? null;
		$labels['useragent'] = ComputeSelectiveStats::filterUserAgent( $userAgent ?: null );

		$this->siteConfig->incrementCounter( 'selective_update_total', $labels );
		$this->siteConfig->incrementCounter( 'selective_update_seconds', $labels, $parseTimeMs / 1000. );
	} catch ( \Throwable $t ) {
		// Don't ever allow bugs in the classification code to
		// impact the availability of content for read views/editing,
		// just log.
		$env->log( 'warn', 'Classification failure', $t->getTraceAsString() );
	}
}
420 | |
/**
 * Run the parser over the wikitext supplied in a `PageConfig` and
 * return the lints collected by the Env.
 *
 * @param PageConfig $pageConfig
 * @param array $options See wikitext2html.
 * @param ?ContentMetadataCollector $metadata Pass in a CMC in order to
 *   collect and retrieve metadata about the parse.
 * @return array
 */
public function wikitext2lint(
	PageConfig $pageConfig, array $options = [],
	?ContentMetadataCollector $metadata = null
): array {
	$metadata ??= new StubMetadataCollector( $this->siteConfig );
	// Only the Env is of interest here; the document is discarded.
	$env = $this->parseWikitext( $pageConfig, $metadata, $options )[0];
	return $env->getLints();
}
440 | |
/**
 * Convert a DOM back to wikitext.
 *
 * @param PageConfig $pageConfig
 * @param Document|PageBundle|DomPageBundle $doc This is either a page
 *   bundle or a "naive" DOM without special handling of
 *   data-parsoid/data-mw etc.  A naive DOM can either be in "single
 *   document" form (data attributes in an element in the <head>) or in
 *   "inline attributes" form. It must not already be prepared.
 * @param array $options [
 *   'inputContentVersion' => (string) The content version of the input.
 *     Necessary if it differs from the current default in order to
 *     account for any serialization differences.
 *   'offsetType'          => (string) ucs2, char, byte are valid values
 *                                     what kind of source offsets are present in the HTML?
 *   'contentmodel'        => (string|null) The content model of the input.
 *   'htmlVariantLanguage' => (Bcp47Code) If non-null, the language variant used for Parsoid HTML.
 *   'wtVariantLanguage'   => (Bcp47Code) If non-null, the language variant used for wikitext.
 *   'traceFlags'          => (array) associative array with tracing options
 *   'dumpFlags'           => (array) associative array with dump options
 *   'debugFlags'          => (array) associative array with debug options
 *   'logLevels'           => (string[]) Levels to log
 *   'htmlSize'            => (int) Size of the HTML that generated $doc
 * ]
 * @param ?SelectiveUpdateData $selserData
 * @return string
 */
public function dom2wikitext(
	PageConfig $pageConfig, $doc, array $options = [],
	?SelectiveUpdateData $selserData = null
): string {
	$alreadyPrepared = DOMDataUtils::isPrepared( $doc );
	Assert::invariant(
		!$alreadyPrepared,
		"document should not be already prepared"
	);

	$envOptions = $this->setupCommonOptions( $options );
	if ( isset( $options['inputContentVersion'] ) ) {
		$envOptions['inputContentVersion'] = $options['inputContentVersion'];
	}
	$envOptions['topLevelDoc'] = $doc;

	$env = new Env(
		$this->siteConfig,
		$pageConfig,
		$this->dataAccess,
		new StubMetadataCollector( $this->siteConfig ),
		$envOptions
	);
	$env->bumpHtml2WtResourceUse( 'htmlSize', $options['htmlSize'] ?? 0 );

	$contentmodel = $options['contentmodel'] ?? null;
	$handler = $env->getContentHandler( $contentmodel );
	$extApi = new ParsoidExtensionAPI( $env );

	// Only the serialization itself is timed.
	$serialTiming = Timing::start();
	$wikitext = $handler->fromDOM( $extApi, $selserData );
	$serialTime = $serialTiming->end();

	$this->recordSerializationMetrics( $options, $serialTime, $wikitext );

	return $wikitext;
}
498 | |
/**
 * Report size and timing metrics for a completed HTML-to-wikitext
 * serialization.
 *
 * @param array $options Options passed to dom2wikitext; this method
 *   reads 'htmlSize' and 'inputContentVersion'.
 * @param float $serialTime Time measured for the serialization.
 * @param string $wikitext The serialized wikitext.
 */
private function recordSerializationMetrics(
	array $options, float $serialTime, string $wikitext
) {
	$metrics = $this->siteConfig->metrics();

	$htmlSize = $options['htmlSize'] ?? 0;
	Timing::fakeTiming( $this->siteConfig, $htmlSize )
		->end( 'entry.html2wt.size.input', 'html2wt_size_input_bytes' );

	if ( isset( $options['inputContentVersion'] ) ) {
		$inputVersion = $options['inputContentVersion'];
		if ( $metrics ) {
			$metrics->increment(
				'entry.html2wt.original.version.' . $inputVersion
			);
		}
		$this->siteConfig->incrementCounter(
			'html2wt_original_version',
			[ 'input_content_version' => $inputVersion ]
		);
	}

	Timing::fakeTiming( $this->siteConfig, $serialTime )
		->end( 'entry.html2wt.total', 'html2wt_total_seconds', [] );

	Timing::fakeTiming( $this->siteConfig, strlen( $wikitext ) )
		->end( 'entry.html2wt.size.output', 'html2wt_size_output_bytes', [] );

	// Without a known input size the ratio below would divide by zero.
	if ( !$htmlSize ) {
		return;
	}

	// NOTE: the name timePerInputKB is misleading, since $htmlSize is
	// in characters, not bytes.
	$msPerKB = $serialTime * 1024 / $htmlSize;
	Timing::fakeTiming( $this->siteConfig, $msPerKB )
		->end( 'entry.html2wt.timePerInputKB', 'html2wt_msPerKB', [] );
}
542 | |
/**
 * Convert an HTML string to wikitext. Convenience wrapper that parses
 * the HTML and delegates to dom2wikitext.
 *
 * @param PageConfig $pageConfig
 * @param string $html
 * @param array $options See dom2wikitext. 'htmlSize' defaults to the
 *   character length of $html when not supplied.
 * @param ?SelectiveUpdateData $selserData
 * @return string
 */
public function html2wikitext(
	PageConfig $pageConfig, string $html, array $options = [],
	?SelectiveUpdateData $selserData = null
): string {
	$doc = DOMUtils::parseHTML( $html, true );
	if ( !isset( $options['htmlSize'] ) ) {
		$options['htmlSize'] = mb_strlen( $html );
	}
	return $this->dom2wikitext( $pageConfig, $doc, $options, $selserData );
}
560 | |
/**
 * Update the supplied PageBundle based on the `$update` type.
 *
 *   'convertoffsets': Convert offsets between formats (byte, char, ucs2)
 *   'redlinks': Refreshes the classes of known, missing, etc. links.
 *   'variant': Converts the HTML based on the supplied variant.
 *
 * Note that these are DOM transforms, and not roundtrips through wikitext.
 *
 * @param PageConfig $pageConfig
 * @param string $update 'convertoffsets'|'redlinks'|'variant'
 * @param PageBundle|DomPageBundle $pb
 * @param array $options Options read here:
 *   'inputOffsetType', 'outputOffsetType' (for 'convertoffsets'),
 *   'variant' => [ 'html' => ..., 'wikitext' => ... ] (for 'variant';
 *     'target' and 'source' are the deprecated names of these keys),
 *   'body_only'.
 * @return PageBundle
 * @throws LogicException if $update is not one of the known
 *   transformations.
 */
public function pb2pb(
	PageConfig $pageConfig, string $update, $pb,
	array $options = []
): PageBundle {
	$envOptions = [
		'pageBundle' => true,
		'topLevelDoc' => $pb,
	];
	$metadata = new StubMetadataCollector( $this->siteConfig );
	$env = new Env(
		$this->siteConfig, $pageConfig, $this->dataAccess, $metadata, $envOptions
	);
	$doc = $env->getTopLevelDoc();

	switch ( $update ) {
		case 'convertoffsets':
			// This method also calls Env::setCurrentOffsetType, which
			// is used by PageBundle::fromDomPageBundle() below to set
			// 'offsetType' in the 'parsoid' property of the page bundle
			ContentUtils::convertOffsets(
				$env, $doc, $options['inputOffsetType'], $options['outputOffsetType']
			);
			// Carry the id counter of the incoming bundle over to the
			// bundle held by the Env.
			if ( isset( $pb->parsoid['counter'] ) ) {
				$internalPB = $env->pageBundle;
				$internalPB->parsoid['counter'] = $pb->parsoid['counter'];
			}
			break;

		case 'redlinks':
			ContentUtils::convertOffsets(
				$env, $doc, $env->getRequestOffsetType(), 'byte'
			);
			( new AddRedLinks() )->run( $env, DOMCompat::getBody( $doc ) );
			( new ConvertOffsets() )->run( $env, DOMCompat::getBody( $doc ), [], true );
			break;

		case 'variant':
			ContentUtils::convertOffsets(
				$env, $doc, $env->getRequestOffsetType(), 'byte'
			);

			// Note that `maybeConvert` could still be a no-op, in case the
			// __NOCONTENTCONVERT__ magic word is present, or the htmlVariant
			// is a base language code or otherwise invalid.
			$hasWtVariant = $options['variant']['wikitext'] ??
				// Deprecated name for this option:
				$options['variant']['source'] ?? false;
			LanguageConverter::maybeConvert(
				$env, $doc,
				Utils::mwCodeToBcp47(
					$options['variant']['html'] ??
					// Deprecated name for this option:
					$options['variant']['target'],
					// Be strict in what we accept.
					true, $this->siteConfig->getLogger()
				),
				$hasWtVariant ?
				Utils::mwCodeToBcp47(
					$options['variant']['wikitext'] ??
					// Deprecated name for this option:
					$options['variant']['source'],
					// Be strict in what we accept.
					true, $this->siteConfig->getLogger()
				) : null
			);

			// NOTE: Keep this in sync with code in core's LanguageVariantConverter
			// Update content-language and vary headers.
			DOMUtils::addHttpEquivHeaders( $doc, [
				'content-language' => $env->htmlContentLanguageBcp47()->toBcp47Code(),
				'vary' => $env->htmlVary()
			] );

			( new ConvertOffsets() )->run( $env, DOMCompat::getBody( $doc ), [], true );
			break;

		default:
			// Leading space keeps the update name separate from the
			// rest of the message.
			throw new LogicException( $update . ' is an unknown transformation' );
	}

	DOMDataUtils::visitAndStoreDataAttribs(
		DOMCompat::getBody( $doc ), [
			'storeInPageBundle' => $env->pageBundle,
			'outputContentVersion' => $env->getOutputContentVersion(),
		]
	);
	return PageBundle::fromDomPageBundle( $env->pageBundle, [
		'body_only' => !empty( $options['body_only'] ),
		// Prefer the passed in version, since this was just a transformation
		'contentversion' => $pb->version ?? $env->getOutputContentVersion(),
		'headers' => DOMUtils::findHttpEquivHeaders( $doc ),
		// Prefer the passed in content model
		'contentmodel' => $pb->contentmodel ?? $pageConfig->getContentModel(),
		'offsetType' => $env->getCurrentOffsetType(),
	] );
}
672 | |
/**
 * Look for a known downgrade path that takes content at version $from
 * to the requested version $to. Both versions are matched against the
 * entries of DOWNGRADES using semver caret semantics.
 *
 * @param string $from Current content version
 * @param string $to Requested content version
 * @return string[]|null The downgrade that will fulfill the request, as
 *   [ 'from' => <old version>, 'to' => <new version> ], or null if it
 *   can't be fulfilled.
 */
public static function findDowngrade( string $from, string $to ): ?array {
	foreach ( self::DOWNGRADES as $candidate ) {
		$dgFrom = $candidate['from'];
		$dgTo = $candidate['to'];
		$applies = Semver::satisfies( $from, "^$dgFrom" ) &&
			Semver::satisfies( $to, "^$dgTo" );
		if ( $applies ) {
			// FIXME: Make this a class?
			return [ 'from' => $dgFrom, 'to' => $dgTo ];
		}
	}
	return null;
}
695 | |
/**
 * Downgrade a page bundle, in place, to an older content version.
 *
 * @param string[] $dg Value returned by findDowngrade().
 * @param PageBundle $pageBundle Bundle to modify; its html and version
 *   are updated.
 * @throws InvalidArgumentException if $dg does not name an entry of
 *   DOWNGRADES.
 */
public static function downgrade(
	array $dg, PageBundle $pageBundle
): void {
	foreach ( self::DOWNGRADES as $candidate ) {
		if ( $dg['from'] !== $candidate['from'] || $dg['to'] !== $candidate['to'] ) {
			continue;
		}

		$dgFunc = $candidate['func'];
		self::$dgFunc( $pageBundle );

		// FIXME: Maybe this resolve should just be part of the $dg
		$pageBundle->version = self::resolveContentVersion( $dg['to'] );

		// FIXME: Maybe this should be a helper to avoid the rt
		$doc = DOMUtils::parseHTML( $pageBundle->html );
		// Match the http-equiv meta to the content-type header
		$meta = DOMCompat::querySelector(
			$doc,
			'meta[property="mw:htmlVersion"], meta[property="mw:html:version"]'
		);
		if ( $meta ) {
			$meta->setAttribute( 'content', $pageBundle->version );
			$pageBundle->html = ContentUtils::toXML( $doc );
		}

		return;
	}

	throw new InvalidArgumentException(
		"Unsupported downgrade: {$dg['from']} -> {$dg['to']}"
	);
}
729 | |
/**
 * Tell whether language variant conversion is implemented for a language.
 *
 * @internal FIXME: Remove once Parsoid's language variant work is completed
 * @param PageConfig $pageConfig
 * @param Bcp47Code $htmlVariant Variant language to check
 * @return bool Always false when the page language is 'zh'.
 */
public function implementsLanguageConversionBcp47( PageConfig $pageConfig, Bcp47Code $htmlVariant ): bool {
	$pageLanguage = $pageConfig->getPageLanguageBcp47()->toBcp47Code();
	if ( $pageLanguage === 'zh' ) {
		// zh lang conversion support is hardcoded off, since Parsoid's
		// implementation is incomplete and not performant (T346657).
		return false;
	}

	$env = new Env(
		$this->siteConfig,
		$pageConfig,
		$this->dataAccess,
		new StubMetadataCollector( $this->siteConfig )
	);
	return LanguageConverter::implementsLanguageConversionBcp47( $env, $htmlVariant );
}
749 | |
/**
 * Downgrade the given pagebundle, in place, from 999.x to 2.x.
 *
 * @param PageBundle $pageBundle Bundle whose html and mw properties
 *   are rewritten.
 */
private static function downgrade999to2( PageBundle $pageBundle ) {
	// Build a bundle without data-parsoid so that, effectively, applying
	// it is skipped.  Note that if we were to support a pb2html
	// downgrade, we'd need to apply the full thing, but that would
	// create complications where ids would be left behind.
	// See the comment in around `DOMDataUtils::applyPageBundle`
	$withoutParsoidData = new PageBundle( $pageBundle->html, null, $pageBundle->mw );
	$inlinedHtml = $withoutParsoidData->toInlineAttributeHtml();
	$pageBundle->html = $inlinedHtml;

	// Now, modify the pagebundle to the expected form.  This is important
	// since, at least in the serialization path, the original pb will be
	// applied to the modified content and its presence could cause lost
	// deletions.
	$pageBundle->mw = [ 'ids' => [] ];
}
773 | } |