Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
52.35% |
156 / 298 |
|
11.11% |
2 / 18 |
CRAP | |
0.00% |
0 / 1 |
Parsoid | |
52.35% |
156 / 298 |
|
11.11% |
2 / 18 |
552.70 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
version | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
defaultHTMLVersion | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
resolveContentVersion | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
supportsLanguageConversion | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setupCommonOptions | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
72 | |||
parseWikitext | |
0.00% |
0 / 26 |
|
0.00% |
0 / 1 |
56 | |||
wikitext2html | |
97.22% |
35 / 36 |
|
0.00% |
0 / 1 |
6 | |||
recordParseMetrics | |
45.76% |
27 / 59 |
|
0.00% |
0 / 1 |
21.92 | |||
wikitext2lint | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
dom2wikitext | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
6 | |||
recordSerializationMetrics | |
69.23% |
18 / 26 |
|
0.00% |
0 / 1 |
4.47 | |||
html2wikitext | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
pb2pb | |
97.01% |
65 / 67 |
|
0.00% |
0 / 1 |
7 | |||
findDowngrade | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
downgrade | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
30 | |||
implementsLanguageConversionBcp47 | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
2.03 | |||
downgrade999to2 | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid; |
5 | |
6 | use Composer\InstalledVersions; |
7 | use Composer\Semver\Comparator; |
8 | use Composer\Semver\Semver; |
9 | use InvalidArgumentException; |
10 | use LogicException; |
11 | use Wikimedia\Bcp47Code\Bcp47Code; |
12 | use Wikimedia\Parsoid\Config\DataAccess; |
13 | use Wikimedia\Parsoid\Config\Env; |
14 | use Wikimedia\Parsoid\Config\PageConfig; |
15 | use Wikimedia\Parsoid\Config\SiteConfig; |
16 | use Wikimedia\Parsoid\Config\StubMetadataCollector; |
17 | use Wikimedia\Parsoid\Core\ContentMetadataCollector; |
18 | use Wikimedia\Parsoid\Core\PageBundle; |
19 | use Wikimedia\Parsoid\Core\ResourceLimitExceededException; |
20 | use Wikimedia\Parsoid\Core\SelectiveUpdateData; |
21 | use Wikimedia\Parsoid\DOM\Document; |
22 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
23 | use Wikimedia\Parsoid\Language\LanguageConverter; |
24 | use Wikimedia\Parsoid\Logger\LintLogger; |
25 | use Wikimedia\Parsoid\Utils\ComputeSelectiveStats; |
26 | use Wikimedia\Parsoid\Utils\ContentUtils; |
27 | use Wikimedia\Parsoid\Utils\DOMCompat; |
28 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
29 | use Wikimedia\Parsoid\Utils\DOMUtils; |
30 | use Wikimedia\Parsoid\Utils\Timing; |
31 | use Wikimedia\Parsoid\Utils\Utils; |
32 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\AddRedLinks; |
33 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\ConvertOffsets; |
34 | |
35 | class Parsoid { |
36 | |
37 | /** |
38 | * Available HTML content versions. |
39 | * @see https://www.mediawiki.org/wiki/Parsoid/API#Content_Negotiation |
40 | * @see https://www.mediawiki.org/wiki/Specs/HTML#Versioning |
41 | */ |
42 | public const AVAILABLE_VERSIONS = [ '2.8.0', '999.0.0' ]; |
43 | |
44 | private const DOWNGRADES = [ |
45 | [ 'from' => '999.0.0', 'to' => '2.0.0', 'func' => 'downgrade999to2' ], |
46 | ]; |
47 | |
48 | /** @var SiteConfig */ |
49 | private $siteConfig; |
50 | |
51 | /** @var DataAccess */ |
52 | private $dataAccess; |
53 | |
/**
 * Create a Parsoid entry point bound to a site configuration and a data
 * access backend. Both are stored and later passed to each Env that the
 * methods of this class construct.
 *
 * @param SiteConfig $siteConfig
 * @param DataAccess $dataAccess
 */
public function __construct(
    SiteConfig $siteConfig, DataAccess $dataAccess
) {
    $this->dataAccess = $dataAccess;
    $this->siteConfig = $siteConfig;
}
60 | |
/**
 * Report which version of the 'wikimedia/parsoid' package is installed.
 *
 * @return string The version string composer reports; the literal
 *   string 'null' when composer reports no version; or the literal
 *   string 'error' when the lookup throws.
 */
public static function version(): string {
    try {
        // See https://getcomposer.org/doc/07-runtime.md#knowing-the-version-of-package-x
        $installed = InstalledVersions::getVersion( 'wikimedia/parsoid' );
        if ( $installed === null ) {
            // From the composer runtime API docs:
            // "It is nonetheless a good idea to make sure you
            // handle the null return value as gracefully as
            // possible for safety."
            return 'null';
        }
        return $installed;
    } catch ( \Throwable $t ) {
        // Belt-and-suspenders protection against parts of the composer
        // runtime API being absent in production.
        return 'error';
    }
}
80 | |
/**
 * Returns the default HTML content version, which is the first entry
 * of self::AVAILABLE_VERSIONS.
 *
 * @return string
 */
public static function defaultHTMLVersion(): string {
    [ $default ] = self::AVAILABLE_VERSIONS;
    return $default;
}
88 | |
/**
 * See if any content version Parsoid knows how to produce satisfies
 * the supplied version, when interpreted with semver caret semantics.
 * This will allow us to make backwards compatible changes, without the need
 * for clients to bump the version in their headers all the time.
 *
 * @param string $version Requested content version
 * @return string|null The first entry of self::AVAILABLE_VERSIONS that
 *   satisfies "^$version", or null if none does or if $version is
 *   older than 1.6.0.
 */
public static function resolveContentVersion( string $version ) {
    foreach ( self::AVAILABLE_VERSIONS as $available ) {
        if ( Semver::satisfies( $available, "^{$version}" ) &&
            // The section wrapping in 1.6.x should have induced a major
            // version bump, since it requires upgrading clients to
            // handle it.  We therefore hardcode this in so that we can
            // fail hard.
            Comparator::greaterThanOrEqualTo( $version, '1.6.0' )
        ) {
            return $available;
        }
    }
    return null;
}
112 | |
/**
 * Tell whether language conversion is enabled, i.e. whether the optional
 * wikimedia/langconv library is installed. The check is done by looking
 * for its ReplacementMachine class.
 *
 * @return bool True if the wikimedia/langconv library is available
 */
public static function supportsLanguageConversion(): bool {
    $langConvClass = '\Wikimedia\LangConv\ReplacementMachine';
    return class_exists( $langConvClass );
}
121 | |
/**
 * Copy the options shared by the wt2html and html2wt entry points from
 * the caller-supplied option array into a fresh Env option array.
 *
 * @param array $options Caller options; see wikitext2html and dom2wikitext.
 * @return array Env options containing only the recognized keys that
 *   were supplied.
 */
private function setupCommonOptions( array $options ): array {
    // Option name => whether an "empty" value (null, false, '', 0, ...)
    // is treated the same as an absent one. Options mapped to false are
    // copied whenever they are set and non-null. The order here fixes
    // the key order of the returned array.
    $copiedOptions = [
        'offsetType' => false,
        'traceFlags' => false,
        'dumpFlags' => false,
        'debugFlags' => false,
        'htmlVariantLanguage' => true,
        'wtVariantLanguage' => true,
        'logLevels' => false,
    ];

    $envOptions = [];
    foreach ( $copiedOptions as $name => $skipWhenEmpty ) {
        $supplied = $skipWhenEmpty
            ? !empty( $options[$name] )
            : isset( $options[$name] );
        if ( $supplied ) {
            $envOptions[$name] = $options[$name];
        }
    }
    return $envOptions;
}
147 | |
/**
 * Parsing code shared by wikitext2html and wikitext2lint: builds the
 * Env, enforces the wikitext size limit and runs the content handler.
 *
 * @param PageConfig $pageConfig
 * @param ContentMetadataCollector $metadata
 * @param array $options See wikitext2html.
 * @param ?SelectiveUpdateData $selparData See wikitext2html.
 * @return array{0:Env,1:Document,2:?string}
 *  The returned document is in "prepared and loaded" form.
 * @throws ResourceLimitExceededException If the source text exceeds the
 *  'wikitextSize' wt2html limit.
 */
private function parseWikitext(
    PageConfig $pageConfig,
    ContentMetadataCollector $metadata,
    array $options = [],
    ?SelectiveUpdateData $selparData = null
): array {
    $envOptions = $this->setupCommonOptions( $options );

    // wt2html-only options: name => whether the value is coerced to bool
    // before being handed to the Env.
    $wt2htmlOptions = [
        'outputContentVersion' => false,
        'wrapSections' => true,
        'pageBundle' => true,
        'logLinterData' => true,
        'linterOverrides' => false,
    ];
    foreach ( $wt2htmlOptions as $name => $coerceToBool ) {
        if ( !isset( $options[$name] ) ) {
            continue;
        }
        $envOptions[$name] = $coerceToBool
            ? (bool)$options[$name]
            : $options[$name];
    }
    // Unlike the options above, this one is always set on the Env.
    $envOptions['skipLanguageConversionPass'] =
        $options['skipLanguageConversionPass'] ?? false;

    $env = new Env(
        $this->siteConfig, $pageConfig, $this->dataAccess, $metadata, $envOptions
    );

    $wikitextSize = strlen( $env->topFrame->getSrcText() );
    if ( !$env->compareWt2HtmlLimit( 'wikitextSize', $wikitextSize ) ) {
        throw new ResourceLimitExceededException(
            "wt2html: wikitextSize limit exceeded"
        );
    }

    $contentmodel = $options['contentmodel'] ?? null;
    $handler = $env->getContentHandler( $contentmodel );
    $extApi = new ParsoidExtensionAPI( $env );
    $doc = $handler->toDOM( $extApi, $selparData );

    return [ $env, $doc, $contentmodel ];
}
198 | |
/**
 * Parse the wikitext supplied in a `PageConfig` to HTML.
 *
 * @param PageConfig $pageConfig
 * @param array $options [
 *   'wrapSections'         => (bool) Whether `<section>` wrappers should be added.
 *   'pageBundle'           => (bool) Sets ids on nodes and stores
 *                                    data-* attributes in a JSON blob.
 *   'body_only'            => (bool|null) Only return the <body> children (T181657)
 *   'outputContentVersion' => (string|null) Version of HTML to output.
 *                                           `null` returns the default version.
 *   'contentmodel'         => (string|null) The content model of the input.
 *   'offsetType'           => (string) ucs2, char, byte are valid values
 *                                      what kind of source offsets should be emitted?
 *   'skipLanguageConversionPass'  => (bool) Skip the language variant conversion pass (defaults to false)
 *   'htmlVariantLanguage'  => (Bcp47Code) If non-null, the language variant used for Parsoid HTML.
 *   'wtVariantLanguage'    => (Bcp47Code) If non-null, the language variant used for wikitext.
 *   'logLinterData'        => (bool) Should we log linter data if linting is enabled?
 *   'linterOverrides'      => (array) Override the site linting configs.
 *   // Debugging options, not for use in production
 *   'traceFlags'           => (array) associative array with tracing options
 *   'dumpFlags'            => (array) associative array with dump options
 *   'debugFlags'           => (array) associative array with debug options
 *   'logLevels'            => (string[]) Levels to log
 *   // Experimental options, not considered stable
 *   'sampleStats'          => (bool) If true, okay to perform "expensive"
 *                             analysis to generate metrics.
 *   'renderReason'         => (?string) Passed through from MediaWiki core
 *                             to classify metrics; see
 *                             ParserOptions::getRenderReason()
 *   'userAgent'            => (?string) Only read when 'sampleStats' is
 *                             true; filtered and used as a metrics label.
 *   'previousInput'        => (?PageConfig) wikitext, revision ID, etc of
 *                             some recent parse of this page.
 *                             Not guaranteed to be usable for selective
 *                             update, and could even be from a "newer"
 *                             revision (if this is a render of an old
 *                             revision).
 *   'previousOutput'       => (?PageBundle) output of the prior parse of
 *                             'previousInput'
 * ]
 * @param ?array &$headers Overwritten with the http-equiv headers found
 *   in the output document.
 * @param ?ContentMetadataCollector $metadata Pass in a CMC in order to
 *  collect and retrieve metadata about the parse. A
 *  StubMetadataCollector is used when omitted.
 * @param ?SelectiveUpdateData $selparData
 * @return PageBundle|string A PageBundle when the Env carries a page
 *  bundle, otherwise the serialized HTML.
 */
public function wikitext2html(
    PageConfig $pageConfig, array $options = [], ?array &$headers = null,
    ?ContentMetadataCollector $metadata = null, ?SelectiveUpdateData $selparData = null
) {
    $metadata ??= new StubMetadataCollector( $this->siteConfig );

    // The timed section covers the parse itself plus storing the data
    // attributes on the resulting document.
    $parseTiming = Timing::start();
    [ $env, $doc, $contentmodel ] = $this->parseWikitext( $pageConfig, $metadata, $options, $selparData );
    DOMDataUtils::visitAndStoreDataAttribs( DOMCompat::getBody( $doc ), [
        'storeInPageBundle' => $env->pageBundle,
        'outputContentVersion' => $env->getOutputContentVersion(),
    ] );
    $parseTimeMs = $parseTiming->end();

    // FIXME: Does this belong in parseWikitext so that the other endpoint
    // is covered as well?  It probably depends on expectations of the
    // Rest API.  If callers of /page/lint/ assume that will update the
    // results on the Special page.
    if ( $env->linting() ) {
        $lintLogger = new LintLogger( $env );
        $lintLogger->logLintOutput();
    }

    $headers = DOMUtils::findHttpEquivHeaders( $doc );
    $body_only = !empty( $options['body_only'] );
    $node = $body_only ? DOMCompat::getBody( $doc ) : $doc;

    $out = [];
    if ( $env->pageBundle ) {
        $bundle = PageBundle::fromDomPageBundle( $env->pageBundle, [
            'body_only' => $body_only,
            'contentversion' => $env->getOutputContentVersion(),
            'headers' => $headers,
            'contentmodel' => $contentmodel,
            'offsetType' => $env->getCurrentOffsetType(),
        ] );
        $out['pb'] = $bundle;
        // The HTML is duplicated here only for use in metrics.
        $out['html'] = $bundle->html;
    } else {
        $out['html'] = ContentUtils::toXML( $node, [
            'innerXML' => $body_only,
        ] );
    }

    $this->recordParseMetrics(
        $env, $parseTimeMs, $out, $headers, $contentmodel, $options
    );

    return $env->pageBundle ? $out['pb'] : $out['html'];
}
301 | |
/**
 * Report metrics for a finished wt2html parse: parse time, input and
 * output sizes, content version counters and, when the caller opted in
 * via 'sampleStats', the selective-update classification counters.
 *
 * Failures in the sampled classification step are logged and swallowed
 * so that they never affect the parse result.
 *
 * @param Env $env Environment the parse ran in
 * @param float $parseTimeMs Value returned by the parse Timing's end()
 * @param array $out Parse output; 'html' is always read, 'pb' (a
 *   PageBundle) is read when present
 * @param ?array $headers http-equiv headers of the output document
 * @param string $contentmodel Content model of the parsed page.
 *   NOTE(review): parseWikitext() documents this value as ?string;
 *   confirm it can never be null here, since this file declares
 *   strict_types.
 * @param array $options Options passed to wikitext2html; 'sampleStats',
 *   'previousInput', 'previousOutput', 'renderReason' and 'userAgent'
 *   are read here
 */
private function recordParseMetrics(
    Env $env, float $parseTimeMs,
    array $out, ?array $headers, string $contentmodel,
    array $options
) {
    $metrics = $this->siteConfig->metrics();

    $pageConfig = $env->getPageConfig();

    // This is somewhat suspect because ParsoidHandler::tryToCreatePageConfig
    // can set a revision id on a MutableRevisionRecord, but it might be simpler
    // to make that go away
    if ( $pageConfig->getRevisionId() ) {
        $mstr = 'pageWithOldid';
    } else {
        $mstr = 'wt';
    }

    $timing = Timing::fakeTiming( $this->siteConfig, $parseTimeMs );
    $timing->end( "entry.wt2html.{$mstr}.parse", 'wt2html_parse_seconds', [ 'type' => $mstr ] );
    $version = 'default';

    if ( Semver::satisfies(
        $env->getOutputContentVersion(), '!=' . self::defaultHTMLVersion()
    ) ) {
        if ( $metrics ) {
            $metrics->increment( 'entry.wt2html.parse.version.notdefault' );
        }
        $version = 'non-default';
    }

    $this->siteConfig->incrementCounter( 'wt2html_parse_total', [
        'type' => $mstr,
        'version' => $version
    ] );

    // @phan-suppress-next-line PhanDeprecatedFunction
    $timing = Timing::fakeTiming( $this->siteConfig, strlen( $pageConfig->getPageMainContent() ) );
    $timing->end(
        "entry.wt2html.{$mstr}.size.input",
        "wt2html_size_input_bytes",
        [ "type" => $mstr ]
    );

    $outSize = strlen( $out['html'] );
    $timing = Timing::fakeTiming( $this->siteConfig, $outSize );
    $timing->end( "entry.wt2html.{$mstr}.size.output", "wt2html_size_output_bytes", [ "type" => $mstr ] );

    if ( $parseTimeMs > 10 && $outSize > 100 ) {
        // * Don't bother with this metric for really small parse times
        //   p99 for initialization time is ~7ms according to grafana.
        //   So, 10ms ensures that startup overheads don't skew the metrics
        // * For body_only=true requests, <head> section isn't generated
        //   and if the output is small, per-request overheads can skew
        //   the timePerKB metrics.
        //
        // NOTE: This is slightly misleading since there are fixed costs
        // for generating output like the <head> section and should be factored in,
        // but this is good enough for now as a useful first degree of approximation.
        $msPerKB = $parseTimeMs * 1024 / $outSize;
        $timing = Timing::fakeTiming( $this->siteConfig, $msPerKB );
        $timing->end(
            'entry.wt2html.timePerKB',
            'wt2html_msPerKB',
            []
        );
    }

    // Expensive analyses: callers are expected to set sampleStats only
    // for a random sample of requests, so it will not be true "often".
    $doSample = $options['sampleStats'] ?? false;
    if ( !$doSample ) {
        return;
    }

    try {
        // create new page bundle for this computation to ensure we
        // don't inadvertently corrupt the main document result.
        $newPb = new PageBundle(
            $out['html'],
            $out['pb']->parsoid ?? null, $out['pb']->mw ?? null,
            $env->getOutputContentVersion(),
            $headers,
            $contentmodel
        );
        $labels = ComputeSelectiveStats::classify(
            $env,
            $options['previousInput'] ?? null,
            $options['previousOutput'] ?? null,
            $pageConfig,
            $newPb
        );
        $labels['wiki'] = $this->siteConfig->iwp();
        $labels['reason'] = $options['renderReason'] ?? 'unknown';
        // 'userAgent' is optional: guard the read so a missing key does
        // not raise an "undefined array key" warning, and keep mapping
        // empty values to null as before.
        $userAgent = ( $options['userAgent'] ?? null ) ?: null;
        $labels['useragent'] = ComputeSelectiveStats::filterUserAgent( $userAgent );

        $this->siteConfig->incrementCounter( 'selective_update_total', $labels );
        $this->siteConfig->incrementCounter( 'selective_update_seconds', $labels, $parseTimeMs / 1000. );
    } catch ( \Throwable $t ) {
        // Don't ever allow bugs in the classification code to
        // impact the availability of content for read views/editing,
        // just log.
        $env->log( 'warn', 'Classification failure', $t->getTraceAsString() );
    }
}
410 | |
/**
 * Lint the wikitext supplied in a `PageConfig`: runs the same parse as
 * wikitext2html and returns the lints collected by the Env.
 *
 * @param PageConfig $pageConfig
 * @param array $options See wikitext2html.
 * @param ?ContentMetadataCollector $metadata Pass in a CMC in order to
 *  collect and retrieve metadata about the parse. A
 *  StubMetadataCollector is used when omitted.
 * @return array
 */
public function wikitext2lint(
    PageConfig $pageConfig, array $options = [],
    ?ContentMetadataCollector $metadata = null
): array {
    $metadata ??= new StubMetadataCollector( $this->siteConfig );
    $parsed = $this->parseWikitext( $pageConfig, $metadata, $options );
    // Only the Env (first element) is needed; the document is discarded.
    $env = $parsed[0];
    return $env->getLints();
}
430 | |
/**
 * Serialize DOM to wikitext.
 *
 * @param PageConfig $pageConfig
 * @param Document $doc Data attributes are expected to have been applied
 *   already.  Loading them will happen once the environment is created.
 * @param array $options [
 *   'inputContentVersion' => (string) The content version of the input.
 *     Necessary if it differs from the current default in order to
 *     account for any serialization differences.
 *   'offsetType'          => (string) ucs2, char, byte are valid values
 *                                     what kind of source offsets are present in the HTML?
 *   'contentmodel'        => (string|null) The content model of the input.
 *   'htmlVariantLanguage' => (Bcp47Code) If non-null, the language variant used for Parsoid HTML.
 *   'wtVariantLanguage'   => (Bcp47Code) If non-null, the language variant used for wikitext.
 *   'traceFlags'          => (array) associative array with tracing options
 *   'dumpFlags'           => (array) associative array with dump options
 *   'debugFlags'          => (array) associative array with debug options
 *   'logLevels'           => (string[]) Levels to log
 *   'htmlSize'            => (int) Size of the HTML that generated $doc
 * ]
 * @param ?SelectiveUpdateData $selserData
 * @return string The serialized wikitext
 */
public function dom2wikitext(
    PageConfig $pageConfig, Document $doc, array $options = [],
    ?SelectiveUpdateData $selserData = null
): string {
    $envOptions = $this->setupCommonOptions( $options );
    if ( isset( $options['inputContentVersion'] ) ) {
        $envOptions['inputContentVersion'] = $options['inputContentVersion'];
    }
    $envOptions['topLevelDoc'] = $doc;

    $env = new Env(
        $this->siteConfig,
        $pageConfig,
        $this->dataAccess,
        new StubMetadataCollector( $this->siteConfig ),
        $envOptions
    );

    // Account for the size of the input HTML (0 when the caller did not
    // supply it) against the html2wt resource limits.
    $htmlSize = $options['htmlSize'] ?? 0;
    $env->bumpHtml2WtResourceUse( 'htmlSize', $htmlSize );

    $contentmodel = $options['contentmodel'] ?? null;
    $handler = $env->getContentHandler( $contentmodel );
    $extApi = new ParsoidExtensionAPI( $env );

    // Only the serialization itself is timed.
    $serialTiming = Timing::start();
    $wikitext = $handler->fromDOM( $extApi, $selserData );
    $serialTime = $serialTiming->end();

    $this->recordSerializationMetrics( $options, $serialTime, $wikitext );

    return $wikitext;
}
481 | |
/**
 * Report metrics for a finished html2wt serialization: input size,
 * input content version, total time, output size and time per KB of
 * input.
 *
 * @param array $options Options passed to dom2wikitext; 'htmlSize' and
 *   'inputContentVersion' are read here
 * @param float $serialTime Value returned by the serialization Timing's
 *   end()
 * @param string $wikitext The serialized wikitext
 */
private function recordSerializationMetrics(
    array $options, float $serialTime, string $wikitext
) {
    $siteConfig = $this->siteConfig;
    $metrics = $siteConfig->metrics();

    $htmlSize = $options['htmlSize'] ?? 0;
    Timing::fakeTiming( $siteConfig, $htmlSize )
        ->end( 'entry.html2wt.size.input', 'html2wt_size_input_bytes' );

    $inputVersion = $options['inputContentVersion'] ?? null;
    if ( $inputVersion !== null ) {
        if ( $metrics ) {
            $metrics->increment(
                'entry.html2wt.original.version.' . $inputVersion
            );
        }
        $siteConfig->incrementCounter(
            'html2wt_original_version',
            [ 'input_content_version' => $inputVersion ]
        );
    }

    Timing::fakeTiming( $siteConfig, $serialTime )
        ->end( 'entry.html2wt.total', 'html2wt_total_seconds', [] );

    Timing::fakeTiming( $siteConfig, strlen( $wikitext ) )
        ->end( 'entry.html2wt.size.output', 'html2wt_size_output_bytes', [] );

    // Skip the ratio when the input size is unknown or zero, which also
    // avoids a division by zero.
    if ( !$htmlSize ) {
        return;
    }

    // NOTE: the name timePerInputKB is misleading, since $htmlSize is
    // in characters, not bytes.
    $msPerKB = $serialTime * 1024 / $htmlSize;
    Timing::fakeTiming( $siteConfig, $msPerKB )
        ->end( 'entry.html2wt.timePerInputKB', 'html2wt_msPerKB', [] );
}
525 | |
/**
 * Serialize HTML to wikitext.  Convenience method for dom2wikitext:
 * parses the HTML string and fills in the 'htmlSize' option when the
 * caller did not provide one.
 *
 * @param PageConfig $pageConfig
 * @param string $html
 * @param array $options See dom2wikitext.
 * @param ?SelectiveUpdateData $selserData
 * @return string
 */
public function html2wikitext(
    PageConfig $pageConfig, string $html, array $options = [],
    ?SelectiveUpdateData $selserData = null
): string {
    $doc = DOMUtils::parseHTML( $html, true );
    if ( !isset( $options['htmlSize'] ) ) {
        // Measured in characters, not bytes.
        $options['htmlSize'] = mb_strlen( $html );
    }
    return $this->dom2wikitext( $pageConfig, $doc, $options, $selserData );
}
543 | |
/**
 * Update the supplied PageBundle based on the `$update` type.
 *
 *   'convertoffsets': Convert offsets between formats (byte, char, ucs2)
 *   'redlinks': Refreshes the classes of known, missing, etc. links.
 *   'variant': Converts the HTML based on the supplied variant.
 *
 * Note that these are DOM transforms, and not roundtrips through wikitext.
 *
 * @param PageConfig $pageConfig
 * @param string $update 'convertoffsets'|'redlinks'|'variant'
 * @param PageBundle $pb
 * @param array $options [
 *   'inputOffsetType'  => (string) Required for 'convertoffsets'.
 *   'outputOffsetType' => (string) Required for 'convertoffsets'.
 *   'variant'          => (array) Used by 'variant'; keys 'html' and
 *                         'wikitext' (deprecated names: 'target' and
 *                         'source').
 *   'body_only'        => (bool|null) Forwarded to
 *                         PageBundle::fromDomPageBundle().
 * ]
 * @return PageBundle
 * @throws LogicException If $update is not one of the supported
 *   transformations.
 */
public function pb2pb(
    PageConfig $pageConfig, string $update, PageBundle $pb,
    array $options = []
): PageBundle {
    $envOptions = [
        'pageBundle' => true,
        'topLevelDoc' => DOMUtils::parseHTML( $pb->toHtml(), true ),
    ];
    $metadata = new StubMetadataCollector( $this->siteConfig );
    $env = new Env(
        $this->siteConfig, $pageConfig, $this->dataAccess, $metadata, $envOptions
    );
    $doc = $env->getTopLevelDoc();
    DOMDataUtils::visitAndLoadDataAttribs(
        DOMCompat::getBody( $doc ), [ 'markNew' => true ]
    );

    switch ( $update ) {
        case 'convertoffsets':
            // This method also calls Env::setCurrentOffsetType, which
            // is used by PageBundle::fromDomPageBundle() below to set
            // 'offsetType' in the 'parsoid' property of the page bundle
            ContentUtils::convertOffsets(
                $env, $doc, $options['inputOffsetType'], $options['outputOffsetType']
            );
            // Carry the id counter of the incoming bundle over to the
            // bundle that will be produced from the Env.
            if ( isset( $pb->parsoid['counter'] ) ) {
                $internalPB = $env->pageBundle;
                $internalPB->parsoid['counter'] = $pb->parsoid['counter'];
            }
            break;

        case 'redlinks':
            ContentUtils::convertOffsets(
                $env, $doc, $env->getRequestOffsetType(), 'byte'
            );
            ( new AddRedLinks() )->run( $env, DOMCompat::getBody( $doc ) );
            ( new ConvertOffsets() )->run( $env, DOMCompat::getBody( $doc ), [], true );
            break;

        case 'variant':
            ContentUtils::convertOffsets(
                $env, $doc, $env->getRequestOffsetType(), 'byte'
            );

            // Note that `maybeConvert` could still be a no-op, in case the
            // __NOCONTENTCONVERT__ magic word is present, or the htmlVariant
            // is a base language code or otherwise invalid.
            $hasWtVariant = $options['variant']['wikitext'] ??
                // Deprecated name for this option:
                $options['variant']['source'] ?? false;
            LanguageConverter::maybeConvert(
                $env, $doc,
                Utils::mwCodeToBcp47(
                    $options['variant']['html'] ??
                    // Deprecated name for this option:
                    $options['variant']['target'],
                    // Be strict in what we accept.
                    true, $this->siteConfig->getLogger()
                ),
                $hasWtVariant ?
                Utils::mwCodeToBcp47(
                    $options['variant']['wikitext'] ??
                    // Deprecated name for this option:
                    $options['variant']['source'],
                    // Be strict in what we accept.
                    true, $this->siteConfig->getLogger()
                ) : null
            );

            // NOTE: Keep this in sync with code in core's LanguageVariantConverter
            // Update content-language and vary headers.
            DOMUtils::addHttpEquivHeaders( $doc, [
                'content-language' => $env->htmlContentLanguageBcp47()->toBcp47Code(),
                'vary' => $env->htmlVary()
            ] );

            ( new ConvertOffsets() )->run( $env, DOMCompat::getBody( $doc ), [], true );
            break;

        default:
            // The leading space keeps the transformation name readable
            // in the message.
            throw new LogicException( $update . ' is an unknown transformation' );
    }

    DOMDataUtils::visitAndStoreDataAttribs(
        DOMCompat::getBody( $doc ), [
            'storeInPageBundle' => $env->pageBundle,
            'outputContentVersion' => $env->getOutputContentVersion(),
        ]
    );
    return PageBundle::fromDomPageBundle( $env->pageBundle, [
        'body_only' => !empty( $options['body_only'] ),
        // Prefer the passed in version, since this was just a transformation
        'contentversion' => $pb->version ?? $env->getOutputContentVersion(),
        'headers' => DOMUtils::findHttpEquivHeaders( $doc ),
        // Prefer the passed in content model
        'contentmodel' => $pb->contentmodel ?? $pageConfig->getContentModel(),
        'offsetType' => $env->getCurrentOffsetType(),
    ] );
}
658 | |
/**
 * Look for a registered downgrade that can take content at version
 * `$from` to the requested version `$to`. Both versions are matched
 * against the entries of self::DOWNGRADES with semver caret semantics.
 *
 * @param string $from Current content version
 * @param string $to Requested content version
 * @return string[]|null The downgrade that will fulfill the request, as
 *   [ 'from' => <old version>, 'to' => <new version> ], or null if it
 *   can't be fulfilled.
 */
public static function findDowngrade( string $from, string $to ): ?array {
    foreach ( self::DOWNGRADES as $candidate ) {
        $dgFrom = $candidate['from'];
        $dgTo = $candidate['to'];
        if ( !Semver::satisfies( $from, "^$dgFrom" ) ) {
            continue;
        }
        if ( !Semver::satisfies( $to, "^$dgTo" ) ) {
            continue;
        }
        // FIXME: Make this a class?
        return [ 'from' => $dgFrom, 'to' => $dgTo ];
    }
    return null;
}
681 | |
/**
 * Downgrade a document to an older content version. The page bundle is
 * modified in place: its content is rewritten by the matching downgrade
 * function, its version is updated, and the HTML version <meta> (when
 * present) is rewritten to match.
 *
 * @param string[] $dg Value returned by findDowngrade().
 * @param PageBundle $pageBundle
 * @throws InvalidArgumentException If $dg does not match any entry of
 *   self::DOWNGRADES.
 */
public static function downgrade(
    array $dg, PageBundle $pageBundle
): void {
    foreach ( self::DOWNGRADES as [ 'from' => $dgFrom, 'to' => $dgTo, 'func' => $dgFunc ] ) {
        if ( $dg['from'] !== $dgFrom || $dg['to'] !== $dgTo ) {
            continue;
        }

        call_user_func( [ self::class, $dgFunc ], $pageBundle );

        // FIXME: Maybe this resolve should just be part of the $dg
        $pageBundle->version = self::resolveContentVersion( $dg['to'] );

        // FIXME: Maybe this should be a helper to avoid the rt
        $doc = DOMUtils::parseHTML( $pageBundle->html );
        // Match the http-equiv meta to the content-type header
        $versionMeta = DOMCompat::querySelector(
            $doc,
            'meta[property="mw:htmlVersion"], meta[property="mw:html:version"]'
        );
        if ( $versionMeta ) {
            $versionMeta->setAttribute( 'content', $pageBundle->version );
            $pageBundle->html = ContentUtils::toXML( $doc );
        }

        return;
    }

    throw new InvalidArgumentException(
        "Unsupported downgrade: {$dg['from']} -> {$dg['to']}"
    );
}
715 | |
/**
 * Check if language variant conversion is implemented for a language.
 * Pages whose language is 'zh' always report false; everything else is
 * delegated to LanguageConverter.
 *
 * @internal FIXME: Remove once Parsoid's language variant work is completed
 * @param PageConfig $pageConfig
 * @param Bcp47Code $htmlVariant Variant language to check
 * @return bool
 */
public function implementsLanguageConversionBcp47( PageConfig $pageConfig, Bcp47Code $htmlVariant ): bool {
    // Hardcode disable zh lang conversion support since Parsoid's
    // implementation is incomplete and not performant (T346657).
    $pageLanguage = $pageConfig->getPageLanguageBcp47()->toBcp47Code();
    if ( $pageLanguage === 'zh' ) {
        return false;
    }

    $env = new Env(
        $this->siteConfig,
        $pageConfig,
        $this->dataAccess,
        new StubMetadataCollector( $this->siteConfig )
    );
    return LanguageConverter::implementsLanguageConversionBcp47( $env, $htmlVariant );
}
735 | |
/**
 * Downgrade the given pagebundle from 999.x to 2.x. The bundle's `mw`
 * data is inlined into its HTML and then emptied; the bundle is
 * modified in place.
 *
 * @param PageBundle $pageBundle
 */
private static function downgrade999to2( PageBundle $pageBundle ) {
    $noIds = [ 'ids' => [] ];

    // Effectively, skip applying data-parsoid.  Note that if we were to
    // support a pb2html downgrade, we'd need to apply the full thing,
    // but that would create complications where ids would be left behind.
    // See the comment in around `DOMDataUtils::applyPageBundle`
    $mwOnlyBundle = new PageBundle( $pageBundle->html, $noIds, $pageBundle->mw );
    $pageBundle->html = $mwOnlyBundle->toInlineAttributeHtml();

    // Now, modify the pagebundle to the expected form.  This is important
    // since, at least in the serialization path, the original pb will be
    // applied to the modified content and its presence could cause lost
    // deletions.
    $pageBundle->mw = $noIds;
}
759 | } |