Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
55.36% |
155 / 280 |
|
10.53% |
2 / 19 |
CRAP | |
0.00% |
0 / 1 |
Parsoid | |
55.36% |
155 / 280 |
|
10.53% |
2 / 19 |
440.91 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
version | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
defaultHTMLVersion | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
resolveContentVersion | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
supportsLanguageConversion | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setupCommonOptions | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
72 | |||
parseWikitext | |
0.00% |
0 / 27 |
|
0.00% |
0 / 1 |
56 | |||
wikitext2html | |
96.55% |
28 / 29 |
|
0.00% |
0 / 1 |
6 | |||
recordParseMetrics | |
69.44% |
25 / 36 |
|
0.00% |
0 / 1 |
7.03 | |||
wikitext2lint | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
dom2wikitext | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
6 | |||
recordSerializationMetrics | |
70.37% |
19 / 27 |
|
0.00% |
0 / 1 |
4.42 | |||
html2wikitext | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
pb2pb | |
97.30% |
72 / 74 |
|
0.00% |
0 / 1 |
7 | |||
substTopLevelTemplates | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
findDowngrade | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
downgrade | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
30 | |||
implementsLanguageConversionBcp47 | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
2.03 | |||
downgrade999to2 | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid; |
5 | |
6 | use Composer\InstalledVersions; |
7 | use Composer\Semver\Comparator; |
8 | use Composer\Semver\Semver; |
9 | use InvalidArgumentException; |
10 | use LogicException; |
11 | use Wikimedia\Bcp47Code\Bcp47Code; |
12 | use Wikimedia\Parsoid\Config\DataAccess; |
13 | use Wikimedia\Parsoid\Config\Env; |
14 | use Wikimedia\Parsoid\Config\PageConfig; |
15 | use Wikimedia\Parsoid\Config\SiteConfig; |
16 | use Wikimedia\Parsoid\Config\StubMetadataCollector; |
17 | use Wikimedia\Parsoid\Core\ContentMetadataCollector; |
18 | use Wikimedia\Parsoid\Core\PageBundle; |
19 | use Wikimedia\Parsoid\Core\ResourceLimitExceededException; |
20 | use Wikimedia\Parsoid\Core\SelectiveUpdateData; |
21 | use Wikimedia\Parsoid\DOM\Document; |
22 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
23 | use Wikimedia\Parsoid\Language\LanguageConverter; |
24 | use Wikimedia\Parsoid\Logger\LintLogger; |
25 | use Wikimedia\Parsoid\Utils\ContentUtils; |
26 | use Wikimedia\Parsoid\Utils\DOMCompat; |
27 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
28 | use Wikimedia\Parsoid\Utils\DOMUtils; |
29 | use Wikimedia\Parsoid\Utils\Timing; |
30 | use Wikimedia\Parsoid\Utils\Utils; |
31 | use Wikimedia\Parsoid\Wikitext\Wikitext; |
32 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\AddRedLinks; |
33 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\ConvertOffsets; |
34 | |
35 | class Parsoid { |
36 | |
37 | /** |
38 | * Available HTML content versions. |
39 | * @see https://www.mediawiki.org/wiki/Parsoid/API#Content_Negotiation |
40 | * @see https://www.mediawiki.org/wiki/Specs/HTML#Versioning |
41 | */ |
42 | public const AVAILABLE_VERSIONS = [ '2.8.0', '999.0.0' ]; |
43 | |
44 | private const DOWNGRADES = [ |
45 | [ 'from' => '999.0.0', 'to' => '2.0.0', 'func' => 'downgrade999to2' ], |
46 | ]; |
47 | |
48 | /** @var SiteConfig */ |
49 | private $siteConfig; |
50 | |
51 | /** @var DataAccess */ |
52 | private $dataAccess; |
53 | |
/**
 * @param SiteConfig $siteConfig Site configuration; passed to every Env
 *   this object creates and consulted for metrics.
 * @param DataAccess $dataAccess Data access object passed to every Env
 *   this object creates.
 */
public function __construct( SiteConfig $siteConfig, DataAccess $dataAccess ) {
    $this->dataAccess = $dataAccess;
    $this->siteConfig = $siteConfig;
}
60 | |
/**
 * Report which version of the wikimedia/parsoid package is installed.
 *
 * @return string The version string from Composer's runtime API, the
 *   literal 'null' if Composer reports no version, or the literal
 *   'error' if the lookup itself throws.
 */
public static function version(): string {
    try {
        // See https://getcomposer.org/doc/07-runtime.md#knowing-the-version-of-package-x
        $installed = InstalledVersions::getVersion( 'wikimedia/parsoid' );
    } catch ( \Throwable $t ) {
        // Belt-and-suspenders: parts of the composer runtime API may be
        // absent in production.
        return 'error';
    }
    // The composer runtime API docs recommend handling a null return
    // value as gracefully as possible.
    return $installed ?? 'null';
}
80 | |
/**
 * Get the HTML content version used by default, i.e. the first entry
 * of self::AVAILABLE_VERSIONS.
 *
 * @return string
 */
public static function defaultHTMLVersion(): string {
    $available = self::AVAILABLE_VERSIONS;
    return $available[0];
}
88 | |
/**
 * Find a content version Parsoid can produce that satisfies the supplied
 * version when the latter is interpreted with semver caret semantics.
 * This allows backwards compatible changes to be made without clients
 * having to bump the version in their headers all the time.
 *
 * @param string $version Requested content version
 * @return string|null The first entry of self::AVAILABLE_VERSIONS that
 *   matches, or null if none does (requests below 1.6.0 never match).
 */
public static function resolveContentVersion( string $version ) {
    $caretConstraint = "^{$version}";
    foreach ( self::AVAILABLE_VERSIONS as $candidate ) {
        if ( !Semver::satisfies( $candidate, $caretConstraint ) ) {
            continue;
        }
        // The section wrapping in 1.6.x should have induced a major
        // version bump, since it requires upgrading clients to
        // handle it. We therefore hardcode this in so that we can
        // fail hard.
        if ( Comparator::greaterThanOrEqualTo( $version, '1.6.0' ) ) {
            return $candidate;
        }
    }
    return null;
}
112 | |
/**
 * Report whether language conversion is enabled, which is the case when
 * the optional wikimedia/langconv library is installed.
 *
 * @return bool True if the wikimedia/langconv library is available
 */
public static function supportsLanguageConversion(): bool {
    $langconvClass = '\Wikimedia\LangConv\ReplacementMachine';
    return class_exists( $langconvClass );
}
121 | |
/**
 * Extract the Env options shared by the parsing and serialization
 * entry points from the caller-supplied options.
 *
 * @param array $options Caller options (see wikitext2html / dom2wikitext)
 * @return array Subset of $options to hand to the Env constructor
 */
private function setupCommonOptions( array $options ): array {
    $envOptions = [];

    // Copied through whenever the caller set them to a non-null value.
    foreach ( [ 'offsetType', 'traceFlags', 'dumpFlags', 'debugFlags' ] as $key ) {
        if ( isset( $options[$key] ) ) {
            $envOptions[$key] = $options[$key];
        }
    }

    // Variant languages are only forwarded when non-empty.
    foreach ( [ 'htmlVariantLanguage', 'wtVariantLanguage' ] as $key ) {
        if ( !empty( $options[$key] ) ) {
            $envOptions[$key] = $options[$key];
        }
    }

    if ( isset( $options['logLevels'] ) ) {
        $envOptions['logLevels'] = $options['logLevels'];
    }

    return $envOptions;
}
147 | |
/**
 * Parsing code shared by wikitext2html() and wikitext2lint().
 *
 * Builds an Env from the options, enforces the 'wikitextSize' limit and
 * runs the content handler's toDOM().
 *
 * @param PageConfig $pageConfig
 * @param ContentMetadataCollector $metadata
 * @param array $options See wikitext2html.
 * @param ?SelectiveUpdateData $selparData See wikitext2html.
 * @return array [ Env, result of the handler's toDOM(), content model or null ]
 * @throws ResourceLimitExceededException When the wikitextSize limit check fails
 */
private function parseWikitext(
    PageConfig $pageConfig,
    ContentMetadataCollector $metadata,
    array $options = [],
    ?SelectiveUpdateData $selparData = null
): array {
    $envOptions = $this->setupCommonOptions( $options );
    if ( isset( $options['outputContentVersion'] ) ) {
        $envOptions['outputContentVersion'] = $options['outputContentVersion'];
    }
    $envOptions['discardDataParsoid'] = !empty( $options['discardDataParsoid'] );
    // Boolean switches are only forwarded when the caller supplied them.
    foreach ( [ 'wrapSections', 'pageBundle', 'logLinterData' ] as $flag ) {
        if ( isset( $options[$flag] ) ) {
            $envOptions[$flag] = (bool)$options[$flag];
        }
    }
    if ( isset( $options['linterOverrides'] ) ) {
        $envOptions['linterOverrides'] = $options['linterOverrides'];
    }
    $envOptions['skipLanguageConversionPass'] =
        $options['skipLanguageConversionPass'] ?? false;

    $env = new Env(
        $this->siteConfig, $pageConfig, $this->dataAccess, $metadata, $envOptions
    );

    $wikitextSize = strlen( $env->topFrame->getSrcText() );
    $sizeOk = $env->compareWt2HtmlLimit( 'wikitextSize', $wikitextSize );
    if ( !$sizeOk ) {
        throw new ResourceLimitExceededException(
            "wt2html: wikitextSize limit exceeded"
        );
    }

    $contentmodel = $options['contentmodel'] ?? null;
    $handler = $env->getContentHandler( $contentmodel );
    $extApi = new ParsoidExtensionAPI( $env );
    // FIXME: Hardcoded to assume 'mode' is 'template'
    $doc = $handler->toDOM( $extApi, $selparData );

    return [ $env, $doc, $contentmodel ];
}
199 | |
/**
 * Parse the wikitext supplied in a `PageConfig` to HTML.
 *
 * @param PageConfig $pageConfig
 * @param array $options [
 *   'wrapSections'         => (bool) Whether `<section>` wrappers should be added.
 *   'pageBundle'           => (bool) Sets ids on nodes and stores
 *                                    data-* attributes in a JSON blob.
 *   'body_only'            => (bool|null) Only return the <body> children (T181657)
 *   'outputContentVersion' => (string|null) Version of HTML to output.
 *                                           `null` returns the default version.
 *   'contentmodel'         => (string|null) The content model of the input.
 *   'discardDataParsoid'   => (bool) Drop all data-parsoid annotations.
 *   'offsetType'           => (string) ucs2, char, byte are valid values
 *                                      what kind of source offsets should be emitted?
 *   'skipLanguageConversionPass'  => (bool) Skip the language variant conversion pass (defaults to false)
 *   'htmlVariantLanguage'  => (Bcp47Code) If non-null, the language variant used for Parsoid HTML.
 *   'wtVariantLanguage'    => (Bcp47Code) If non-null, the language variant used for wikitext.
 *   'logLinterData'        => (bool) Should we log linter data if linting is enabled?
 *   'linterOverrides'      => (array) Override the site linting configs.
 *   'traceFlags'           => (array) associative array with tracing options
 *   'dumpFlags'            => (array) associative array with dump options
 *   'debugFlags'           => (array) associative array with debug options
 *   'logLevels'            => (string[]) Levels to log
 * ]
 * @param ?array &$headers Overwritten with the http-equiv headers found
 *   in the parsed document.
 * @param ?ContentMetadataCollector $metadata Pass in a CMC in order to
 *   collect and retrieve metadata about the parse.
 * @param ?SelectiveUpdateData $selparData
 * @return PageBundle|string A PageBundle when the Env is in page bundle
 *   mode, otherwise the serialized HTML string.
 */
public function wikitext2html(
    PageConfig $pageConfig, array $options = [], ?array &$headers = null,
    ?ContentMetadataCollector $metadata = null, ?SelectiveUpdateData $selparData = null
) {
    $metadata ??= new StubMetadataCollector( $this->siteConfig );

    $parseTiming = Timing::start();
    [ $env, $doc, $contentmodel ] = $this->parseWikitext( $pageConfig, $metadata, $options, $selparData );
    $parseTime = $parseTiming->end();

    // FIXME: Does this belong in parseWikitext so that the other endpoint
    // is covered as well?  It probably depends on expectations of the
    // Rest API.  If callers of /page/lint/ assume that will update the
    // results on the Special page.
    if ( $env->linting() ) {
        $lintLogger = new LintLogger( $env );
        $lintLogger->logLintOutput();
    }

    $headers = DOMUtils::findHttpEquivHeaders( $doc );
    $body_only = !empty( $options['body_only'] );
    $node = $doc;
    if ( $body_only ) {
        $node = DOMCompat::getBody( $doc );
    }
    $serializeOpts = [
        'innerXML' => $body_only,
    ];

    $out = $env->pageBundle
        ? ContentUtils::extractDpAndSerialize( $node, $serializeOpts )
        : [ 'html' => ContentUtils::toXML( $node, $serializeOpts ) ];

    $this->recordParseMetrics( $env, $parseTime, $out );

    if ( !$env->pageBundle ) {
        return $out['html'];
    }

    return new PageBundle(
        $out['html'],
        $out['pb']->parsoid, $out['pb']->mw ?? null,
        $env->getOutputContentVersion(),
        $headers,
        $contentmodel
    );
}
281 | |
/**
 * Report timing and size metrics for a completed wt2html parse.
 *
 * Does nothing when the site config provides no metrics object.
 *
 * @param Env $env Environment the parse ran in
 * @param float $parseTime Elapsed parse time as returned by Timing::end()
 *   (the threshold below treats it as milliseconds)
 * @param array $out Serialization result; its 'html' entry is measured
 */
private function recordParseMetrics(
    Env $env, float $parseTime, array $out
) {
    $metrics = $this->siteConfig->metrics();
    if ( !$metrics ) {
        return;
    }

    $pageConfig = $env->getPageConfig();

    // This is somewhat suspect because ParsoidHandler::tryToCreatePageConfig
    // can set a revision id on a MutableRevisionRecord, but it might be simpler
    // to make that go away
    if ( $pageConfig->getRevisionId() ) {
        $mstr = 'pageWithOldid';
    } else {
        $mstr = 'wt';
    }

    $timing = Timing::fakeTiming( $this->siteConfig, $parseTime, true );
    $timing->end( "entry.wt2html.{$mstr}.parse", 'wt2html_parse_seconds', [ 'type' => $mstr ] );
    $version = 'default';

    if ( Semver::satisfies(
        $env->getOutputContentVersion(), '!=' . self::defaultHTMLVersion()
    ) ) {
        $metrics->increment( 'entry.wt2html.parse.version.notdefault' );
        $version = 'non-default';
    }

    // NOTE(review): this counter used to be emitted under the misspelled
    // name 'wt2hml_parse_total'; it now carries the same 'wt2html_' prefix
    // as the other metrics here. Dashboards keyed on the old name need
    // updating.
    $this->siteConfig->incrementCounter( 'wt2html_parse_total', [
        'type' => $mstr,
        'version' => $version
    ] );

    // @phan-suppress-next-line PhanDeprecatedFunction
    $timing = Timing::fakeTiming( $this->siteConfig, strlen( $pageConfig->getPageMainContent() ) );
    $timing->end(
        "entry.wt2html.{$mstr}.size.input",
        "wt2html_size_input_bytes",
        [ "type" => $mstr ]
    );

    $outSize = strlen( $out['html'] );
    $timing = Timing::fakeTiming( $this->siteConfig, $outSize );
    $timing->end( "entry.wt2html.{$mstr}.size.output", "wt2html_size_output_bytes", [ "type" => $mstr ] );

    if ( $parseTime > 10 && $outSize > 100 ) {
        // * Don't bother with this metric for really small parse times
        //   p99 for initialization time is ~7ms according to grafana.
        //   So, 10ms ensures that startup overheads don't skew the metrics
        // * For body_only=false requests, <head> section isn't generated
        //   and if the output is small, per-request overheads can skew
        //   the timePerKB metrics.
        //
        // NOTE: This is slightly misleading since there are fixed costs
        // for generating output like the <head> section and should be factored in,
        // but this is good enough for now as a useful first degree of approximation.
        $msPerKB = $parseTime * 1024 / $outSize;
        $timing = Timing::fakeTiming( $this->siteConfig, $msPerKB );
        $timing->end(
            'entry.wt2html.timePerKB',
            'wt2html_msPerKB',
            []
        );
    }
}
352 | |
/**
 * Lint the wikitext supplied in a `PageConfig`.
 *
 * @param PageConfig $pageConfig
 * @param array $options See wikitext2html.
 * @param ?ContentMetadataCollector $metadata Pass in a CMC in order to
 *   collect and retrieve metadata about the parse.
 * @return array The lints the Env collected during the parse
 */
public function wikitext2lint(
    PageConfig $pageConfig, array $options = [],
    ?ContentMetadataCollector $metadata = null
): array {
    $metadata ??= new StubMetadataCollector( $this->siteConfig );
    // Only the Env (first element of the result) is needed here.
    $parsed = $this->parseWikitext( $pageConfig, $metadata, $options );
    $env = $parsed[0];
    return $env->getLints();
}
372 | |
/**
 * Serialize DOM to wikitext.
 *
 * @param PageConfig $pageConfig
 * @param Document $doc Data attributes are expected to have been applied
 *   already.  Loading them will happen once the environment is created.
 * @param array $options [
 *   'inputContentVersion' => (string) The content version of the input.
 *     Necessary if it differs from the current default in order to
 *     account for any serialization differences.
 *   'offsetType'          => (string) ucs2, char, byte are valid values
 *                                     what kind of source offsets are present in the HTML?
 *   'contentmodel'        => (string|null) The content model of the input.
 *   'htmlVariantLanguage' => (Bcp47Code) If non-null, the language variant used for Parsoid HTML.
 *   'wtVariantLanguage'   => (Bcp47Code) If non-null, the language variant used for wikitext.
 *   'traceFlags'          => (array) associative array with tracing options
 *   'dumpFlags'           => (array) associative array with dump options
 *   'debugFlags'          => (array) associative array with debug options
 *   'logLevels'           => (string[]) Levels to log
 *   'htmlSize'            => (int) Size of the HTML that generated $doc
 * ]
 * @param ?SelectiveUpdateData $selserData
 * @return string
 */
public function dom2wikitext(
    PageConfig $pageConfig, Document $doc, array $options = [],
    ?SelectiveUpdateData $selserData = null
): string {
    $envOptions = $this->setupCommonOptions( $options );
    if ( isset( $options['inputContentVersion'] ) ) {
        $envOptions['inputContentVersion'] = $options['inputContentVersion'];
    }
    $envOptions['topLevelDoc'] = $doc;

    $env = new Env(
        $this->siteConfig,
        $pageConfig,
        $this->dataAccess,
        new StubMetadataCollector( $this->siteConfig ),
        $envOptions
    );
    $inputSize = $options['htmlSize'] ?? 0;
    $env->bumpHtml2WtResourceUse( 'htmlSize', $inputSize );

    $contentmodel = $options['contentmodel'] ?? null;
    $handler = $env->getContentHandler( $contentmodel );
    $extApi = new ParsoidExtensionAPI( $env );

    // Only the handler's fromDOM() call is timed.
    $stopwatch = Timing::start();
    $wikitext = $handler->fromDOM( $extApi, $selserData );
    $elapsed = $stopwatch->end();

    $this->recordSerializationMetrics( $options, $elapsed, $wikitext );

    return $wikitext;
}
423 | |
/**
 * Report timing and size metrics for a completed html2wt serialization.
 *
 * Does nothing when the site config provides no metrics object.
 *
 * @param array $options Options passed to dom2wikitext; 'htmlSize' and
 *   'inputContentVersion' are read
 * @param float $serialTime Elapsed serialization time as returned by Timing::end()
 * @param string $wikitext The serialized wikitext
 */
private function recordSerializationMetrics(
    array $options, float $serialTime, string $wikitext
) {
    $metrics = $this->siteConfig->metrics();
    if ( !$metrics ) {
        return;
    }

    $htmlSize = $options['htmlSize'] ?? 0;
    Timing::fakeTiming( $this->siteConfig, $htmlSize )
        ->end( 'entry.html2wt.size.input', 'html2wt_size_input_bytes' );

    if ( isset( $options['inputContentVersion'] ) ) {
        $inputVersion = $options['inputContentVersion'];
        $metrics->increment( 'entry.html2wt.original.version.' . $inputVersion );
        $this->siteConfig->incrementCounter(
            'html2wt_original_version',
            [ 'input_content_version' => $inputVersion ]
        );
    }

    Timing::fakeTiming( $this->siteConfig, $serialTime, true )
        ->end( 'entry.html2wt.total', 'html2wt_total_seconds', [] );

    Timing::fakeTiming( $this->siteConfig, strlen( $wikitext ) )
        ->end( 'entry.html2wt.size.output', 'html2wt_size_output_bytes', [] );

    if ( $htmlSize ) { // Avoid division by zero
        // NOTE: the name timePerInputKB is misleading, since $htmlSize is
        // in characters, not bytes.
        $msPerKB = $serialTime * 1024 / $htmlSize;
        Timing::fakeTiming( $this->siteConfig, $msPerKB )
            ->end( 'entry.html2wt.timePerInputKB', 'html2wt_msPerKB', [] );
    }
}
468 | |
/**
 * Serialize HTML to wikitext.  Convenience wrapper that parses the HTML
 * and delegates to dom2wikitext.
 *
 * @param PageConfig $pageConfig
 * @param string $html
 * @param array $options See dom2wikitext. 'htmlSize' defaults to
 *   mb_strlen( $html ) when not supplied.
 * @param ?SelectiveUpdateData $selserData
 * @return string
 */
public function html2wikitext(
    PageConfig $pageConfig, string $html, array $options = [],
    ?SelectiveUpdateData $selserData = null
): string {
    $doc = DOMUtils::parseHTML( $html, true );
    if ( !isset( $options['htmlSize'] ) ) {
        $options['htmlSize'] = mb_strlen( $html );
    }
    return $this->dom2wikitext( $pageConfig, $doc, $options, $selserData );
}
486 | |
/**
 * Update the supplied PageBundle based on the `$update` type.
 *
 *   'convertoffsets': Convert offsets between formats (byte, char, ucs2)
 *   'redlinks': Refreshes the classes of known, missing, etc. links.
 *   'variant': Converts the HTML based on the supplied variant.
 *
 * Note that these are DOM transforms, and not roundtrips through wikitext.
 *
 * @param PageConfig $pageConfig
 * @param string $update 'convertoffsets'|'redlinks'|'variant'
 * @param PageBundle $pb
 * @param array $options [
 *   'inputOffsetType'  => (string) Read for 'convertoffsets': offset type of the input.
 *   'outputOffsetType' => (string) Read for 'convertoffsets': offset type to convert to.
 *   'variant'          => (array) Read for 'variant': the 'html' entry (deprecated
 *                         name 'target') is the variant to convert to; the optional
 *                         'wikitext' entry (deprecated name 'source') is the wikitext variant.
 *   'body_only'        => (bool|null) Only serialize the <body> children.
 * ]
 * @return PageBundle
 * @throws LogicException If $update is not one of the known transformations
 */
public function pb2pb(
    PageConfig $pageConfig, string $update, PageBundle $pb,
    array $options = []
): PageBundle {
    $envOptions = [
        'pageBundle' => true,
        'topLevelDoc' => DOMUtils::parseHTML( $pb->toHtml(), true ),
    ];
    $metadata = new StubMetadataCollector( $this->siteConfig );
    $env = new Env(
        $this->siteConfig, $pageConfig, $this->dataAccess, $metadata, $envOptions
    );
    $doc = $env->topLevelDoc;
    DOMDataUtils::visitAndLoadDataAttribs(
        DOMCompat::getBody( $doc ), [ 'markNew' => true ]
    );

    $dataBagPB = DOMDataUtils::getPageBundle( $doc );
    switch ( $update ) {
        case 'convertoffsets':
            ContentUtils::convertOffsets(
                $env, $doc, $options['inputOffsetType'], $options['outputOffsetType']
            );
            $dataBagPB->parsoid['offsetType'] = $options['outputOffsetType'];
            $dataBagPB->parsoid['counter'] = $pb->parsoid['counter'];
            break;

        case 'redlinks':
            ContentUtils::convertOffsets(
                $env, $doc, $env->getRequestOffsetType(), 'byte'
            );
            ( new AddRedLinks() )->run( $env, DOMCompat::getBody( $doc ) );
            ( new ConvertOffsets() )->run( $env, DOMCompat::getBody( $doc ), [], true );
            break;

        case 'variant':
            ContentUtils::convertOffsets(
                $env, $doc, $env->getRequestOffsetType(), 'byte'
            );

            // Note that `maybeConvert` could still be a no-op, in case the
            // __NOCONTENTCONVERT__ magic word is present, or the htmlVariant
            // is a base language code or otherwise invalid.
            $hasWtVariant = $options['variant']['wikitext'] ??
                // Deprecated name for this option:
                $options['variant']['source'] ?? false;
            LanguageConverter::maybeConvert(
                $env, $doc,
                Utils::mwCodeToBcp47(
                    $options['variant']['html'] ??
                    // Deprecated name for this option:
                    $options['variant']['target'],
                    // Be strict in what we accept.
                    true, $this->siteConfig->getLogger()
                ),
                $hasWtVariant ?
                Utils::mwCodeToBcp47(
                    $options['variant']['wikitext'] ??
                    // Deprecated name for this option:
                    $options['variant']['source'],
                    // Be strict in what we accept.
                    true, $this->siteConfig->getLogger()
                ) : null
            );

            // NOTE: Keep this in sync with code in core's LanguageVariantConverter
            // Update content-language and vary headers.
            DOMUtils::addHttpEquivHeaders( $doc, [
                'content-language' => $env->htmlContentLanguageBcp47()->toBcp47Code(),
                'vary' => $env->htmlVary()
            ] );

            ( new ConvertOffsets() )->run( $env, DOMCompat::getBody( $doc ), [], true );
            break;

        default:
            // Keep a space between the offending value and the rest of the
            // message so the exception text is readable.
            throw new LogicException( $update . ' is an unknown transformation' );
    }

    DOMDataUtils::visitAndStoreDataAttribs(
        DOMCompat::getBody( $doc ), [
            'discardDataParsoid' => $env->discardDataParsoid,
            'storeInPageBundle' => $env->pageBundle,
            'env' => $env,
        ]
    );
    $body_only = !empty( $options['body_only'] );
    $node = $body_only ? DOMCompat::getBody( $doc ) : $doc;
    DOMDataUtils::injectPageBundle( $doc, $dataBagPB );
    $out = ContentUtils::extractDpAndSerialize( $node, [
        'innerXML' => $body_only,
    ] );
    return new PageBundle(
        $out['html'],
        $out['pb']->parsoid, $out['pb']->mw ?? null,
        // Prefer the passed in version, since this was just a transformation
        $pb->version ?? $env->getOutputContentVersion(),
        DOMUtils::findHttpEquivHeaders( $doc ),
        // Prefer the passed in content model
        $pb->contentmodel ?? $pageConfig->getContentModel()
    );
}
604 | |
/**
 * Run the pre-save transformation on the wikitext with top-level
 * templates subst'd.
 *
 * @param PageConfig $pageConfig
 * @param string $wikitext
 * @return string
 */
public function substTopLevelTemplates(
    PageConfig $pageConfig, string $wikitext
): string {
    $env = new Env(
        $this->siteConfig,
        $pageConfig,
        $this->dataAccess,
        new StubMetadataCollector( $this->siteConfig )
    );
    $substTLTemplates = true;
    return Wikitext::pst( $env, $wikitext, $substTLTemplates );
}
619 | |
/**
 * Look up a downgrade that takes content at one version to the
 * requested (older) content version.
 *
 * @param string $from Current content version
 * @param string $to Requested content version
 * @return string[]|null The downgrade that will fulfill the request, as
 *   [ 'from' => <old version>, 'to' => <new version> ], or null if it
 *   can't be fulfilled.
 */
public static function findDowngrade( string $from, string $to ): ?array {
    foreach ( self::DOWNGRADES as $entry ) {
        $dgFrom = $entry['from'];
        $dgTo = $entry['to'];
        // Both ends are matched with semver caret semantics.
        if ( !Semver::satisfies( $from, "^$dgFrom" ) ) {
            continue;
        }
        if ( !Semver::satisfies( $to, "^$dgTo" ) ) {
            continue;
        }
        // FIXME: Make this a class?
        return [ 'from' => $dgFrom, 'to' => $dgTo ];
    }
    return null;
}
642 | |
/**
 * Downgrade a page bundle, in place, to an older content version.
 *
 * @param string[] $dg Value returned by findDowngrade().
 * @param PageBundle $pageBundle Modified in place: html, version and
 *   whatever the specific downgrade function changes.
 * @throws InvalidArgumentException If $dg matches no known downgrade
 */
public static function downgrade(
    array $dg, PageBundle $pageBundle
): void {
    foreach ( self::DOWNGRADES as $entry ) {
        if ( $dg['from'] !== $entry['from'] || $dg['to'] !== $entry['to'] ) {
            continue;
        }

        $dgFunc = $entry['func'];
        self::$dgFunc( $pageBundle );

        // FIXME: Maybe this resolve should just be part of the $dg
        $pageBundle->version = self::resolveContentVersion( $dg['to'] );

        // FIXME: Maybe this should be a helper to avoid the rt
        $doc = DOMUtils::parseHTML( $pageBundle->html );
        // Match the http-equiv meta to the content-type header
        $versionMetaSelector =
            'meta[property="mw:htmlVersion"], meta[property="mw:html:version"]';
        $meta = DOMCompat::querySelector( $doc, $versionMetaSelector );
        if ( $meta ) {
            $meta->setAttribute( 'content', $pageBundle->version );
            $pageBundle->html = ContentUtils::toXML( $doc );
        }

        return;
    }

    throw new InvalidArgumentException(
        "Unsupported downgrade: {$dg['from']} -> {$dg['to']}"
    );
}
676 | |
/**
 * Report whether language variant conversion is implemented for a language.
 *
 * @internal FIXME: Remove once Parsoid's language variant work is completed
 * @param PageConfig $pageConfig
 * @param Bcp47Code $htmlVariant Variant language to check
 * @return bool Always false when the page language is 'zh'
 */
public function implementsLanguageConversionBcp47( PageConfig $pageConfig, Bcp47Code $htmlVariant ): bool {
    $pageLang = $pageConfig->getPageLanguageBcp47()->toBcp47Code();
    // Hardcode disable zh lang conversion support since Parsoid's
    // implementation is incomplete and not performant (T346657).
    if ( $pageLang === 'zh' ) {
        return false;
    }

    $env = new Env(
        $this->siteConfig,
        $pageConfig,
        $this->dataAccess,
        new StubMetadataCollector( $this->siteConfig )
    );
    return LanguageConverter::implementsLanguageConversionBcp47( $env, $htmlVariant );
}
696 | |
/**
 * Downgrade the given pagebundle, in place, from 999.x to 2.x.
 *
 * @param PageBundle $pageBundle Its html and mw fields are rewritten
 */
private static function downgrade999to2( PageBundle $pageBundle ) {
    // Effectively, skip applying data-parsoid.  Note that if we were to
    // support a pb2html downgrade, we'd need to apply the full thing,
    // but that would create complications where ids would be left behind.
    // See the comment in around `DOMDataUtils::applyPageBundle`
    $pageBundle->html = ( new PageBundle(
        $pageBundle->html,
        [ 'ids' => [] ],
        $pageBundle->mw
    ) )->toHtml();

    // Now, modify the pagebundle to the expected form.  This is important
    // since, at least in the serialization path, the original pb will be
    // applied to the modified content and its presence could cause lost
    // deletions.
    $pageBundle->mw = [ 'ids' => [] ];
}
719 | } |