Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
54.18% |
136 / 251 |
|
10.53% |
2 / 19 |
CRAP | |
0.00% |
0 / 1 |
Parsoid | |
54.18% |
136 / 251 |
|
10.53% |
2 / 19 |
457.94 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
version | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
defaultHTMLVersion | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
resolveContentVersion | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
supportsLanguageConversion | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
setupCommonOptions | |
0.00% |
0 / 16 |
|
0.00% |
0 / 1 |
72 | |||
parseWikitext | |
0.00% |
0 / 27 |
|
0.00% |
0 / 1 |
56 | |||
wikitext2html | |
96.55% |
28 / 29 |
|
0.00% |
0 / 1 |
6 | |||
recordParseMetrics | |
76.19% |
16 / 21 |
|
0.00% |
0 / 1 |
6.49 | |||
wikitext2lint | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
dom2wikitext | |
0.00% |
0 / 17 |
|
0.00% |
0 / 1 |
6 | |||
recordSerializationMetrics | |
71.43% |
10 / 14 |
|
0.00% |
0 / 1 |
4.37 | |||
html2wikitext | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
pb2pb | |
97.30% |
72 / 74 |
|
0.00% |
0 / 1 |
7 | |||
substTopLevelTemplates | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
findDowngrade | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
20 | |||
downgrade | |
0.00% |
0 / 14 |
|
0.00% |
0 / 1 |
30 | |||
implementsLanguageConversionBcp47 | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
2.03 | |||
downgrade999to2 | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid; |
5 | |
6 | use Composer\InstalledVersions; |
7 | use Composer\Semver\Comparator; |
8 | use Composer\Semver\Semver; |
9 | use InvalidArgumentException; |
10 | use LogicException; |
11 | use Wikimedia\Bcp47Code\Bcp47Code; |
12 | use Wikimedia\Parsoid\Config\DataAccess; |
13 | use Wikimedia\Parsoid\Config\Env; |
14 | use Wikimedia\Parsoid\Config\PageConfig; |
15 | use Wikimedia\Parsoid\Config\SiteConfig; |
16 | use Wikimedia\Parsoid\Config\StubMetadataCollector; |
17 | use Wikimedia\Parsoid\Core\ContentMetadataCollector; |
18 | use Wikimedia\Parsoid\Core\PageBundle; |
19 | use Wikimedia\Parsoid\Core\ResourceLimitExceededException; |
20 | use Wikimedia\Parsoid\Core\SelserData; |
21 | use Wikimedia\Parsoid\DOM\Document; |
22 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
23 | use Wikimedia\Parsoid\Language\LanguageConverter; |
24 | use Wikimedia\Parsoid\Logger\LintLogger; |
25 | use Wikimedia\Parsoid\Utils\ContentUtils; |
26 | use Wikimedia\Parsoid\Utils\DOMCompat; |
27 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
28 | use Wikimedia\Parsoid\Utils\DOMUtils; |
29 | use Wikimedia\Parsoid\Utils\Timing; |
30 | use Wikimedia\Parsoid\Utils\Utils; |
31 | use Wikimedia\Parsoid\Wikitext\Wikitext; |
32 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\AddRedLinks; |
33 | use Wikimedia\Parsoid\Wt2Html\DOM\Processors\ConvertOffsets; |
34 | |
35 | class Parsoid { |
36 | |
37 | /** |
38 | * Available HTML content versions. |
39 | * @see https://www.mediawiki.org/wiki/Parsoid/API#Content_Negotiation |
40 | * @see https://www.mediawiki.org/wiki/Specs/HTML#Versioning |
41 | */ |
42 | public const AVAILABLE_VERSIONS = [ '2.8.0', '999.0.0' ]; |
43 | |
44 | private const DOWNGRADES = [ |
45 | [ 'from' => '999.0.0', 'to' => '2.0.0', 'func' => 'downgrade999to2' ], |
46 | ]; |
47 | |
48 | /** @var SiteConfig */ |
49 | private $siteConfig; |
50 | |
51 | /** @var DataAccess */ |
52 | private $dataAccess; |
53 | |
/**
 * @param SiteConfig $siteConfig Site configuration handed to every Env
 *   this object creates; also the source of the metrics sink.
 * @param DataAccess $dataAccess Data access object handed to every Env
 *   this object creates.
 */
public function __construct( SiteConfig $siteConfig, DataAccess $dataAccess ) {
	$this->dataAccess = $dataAccess;
	$this->siteConfig = $siteConfig;
}
60 | |
/**
 * Report the installed version of the wikimedia/parsoid package, as
 * known to the composer runtime API.
 *
 * @return string The installed version. The literal string 'null' is
 *   returned when composer reports no version, and 'error' when the
 *   lookup itself throws.
 */
public static function version(): string {
	try {
		// See https://getcomposer.org/doc/07-runtime.md#knowing-the-version-of-package-x
		$installed = InstalledVersions::getVersion( 'wikimedia/parsoid' );
	} catch ( \Throwable $t ) {
		// Belt-and-suspenders: parts of the composer runtime API may be
		// absent in production.
		return 'error';
	}
	// The composer runtime API docs recommend handling a null return
	// value as gracefully as possible.
	return $installed ?? 'null';
}
80 | |
/**
 * Get the HTML content version used by default.
 *
 * @return string The first entry of self::AVAILABLE_VERSIONS.
 */
public static function defaultHTMLVersion(): string {
	$versions = self::AVAILABLE_VERSIONS;
	return $versions[0];
}
88 | |
/**
 * See if any content version Parsoid knows how to produce satisfies
 * the supplied version, when interpreted with semver caret semantics.
 * This will allow us to make backwards compatible changes, without the need
 * for clients to bump the version in their headers all the time.
 *
 * Versions are tried in the order they are listed in
 * self::AVAILABLE_VERSIONS and the first match is returned.
 *
 * @param string $version Requested content version
 * @return string|null The matching available version, or null if none
 *   of the available versions satisfies the request.
 */
public static function resolveContentVersion( string $version ) {
	foreach ( self::AVAILABLE_VERSIONS as $available ) {
		if ( Semver::satisfies( $available, "^{$version}" ) &&
			// The section wrapping in 1.6.x should have induced a major
			// version bump, since it requires upgrading clients to
			// handle it. We therefore hardcode this in so that we can
			// fail hard.
			Comparator::greaterThanOrEqualTo( $version, '1.6.0' )
		) {
			return $available;
		}
	}
	return null;
}
112 | |
/**
 * Report whether language conversion can be used, i.e. whether the
 * optional wikimedia/langconv library is installed.
 *
 * @return bool True if the ReplacementMachine class from
 *   wikimedia/langconv can be found.
 */
public static function supportsLanguageConversion(): bool {
	$replacementMachineClass = '\Wikimedia\LangConv\ReplacementMachine';
	return class_exists( $replacementMachineClass );
}
121 | |
/**
 * Build the Env options that the parse and serialize entry points have
 * in common, by copying a fixed set of keys from the caller's options.
 *
 * @param array $options Caller-supplied options.
 * @return array The subset of $options to pass on to Env. Keys keep the
 *   order of the table below; keys that are unset (or null) are omitted,
 *   and the two variant-language keys are also omitted when empty.
 */
private function setupCommonOptions( array $options ): array {
	// Option name => true if an "empty" value (per PHP's empty()) must be
	// dropped, false if anything non-null is passed through.
	$forwardedKeys = [
		'offsetType' => false,
		'traceFlags' => false,
		'dumpFlags' => false,
		'debugFlags' => false,
		'htmlVariantLanguage' => true,
		'wtVariantLanguage' => true,
		'logLevels' => false,
	];

	$envOptions = [];
	foreach ( $forwardedKeys as $key => $dropIfEmpty ) {
		$skip = $dropIfEmpty ? empty( $options[$key] ) : !isset( $options[$key] );
		if ( $skip ) {
			continue;
		}
		$envOptions[$key] = $options[$key];
	}
	return $envOptions;
}
147 | |
/**
 * Parsing code shared between wikitext2html() and wikitext2lint().
 *
 * Builds an Env from the options, enforces the 'wikitextSize' limit on
 * the source text, and runs the content handler's toDOM().
 *
 * @param PageConfig $pageConfig
 * @param ContentMetadataCollector $metadata
 * @param array $options See wikitext2html.
 * @return array [ Env, result of the content handler's toDOM(),
 *   the 'contentmodel' option or null ]
 * @throws ResourceLimitExceededException If the wikitext size limit is
 *   exceeded.
 */
private function parseWikitext(
	PageConfig $pageConfig,
	ContentMetadataCollector $metadata,
	array $options = []
): array {
	$envOptions = $this->setupCommonOptions( $options );
	if ( isset( $options['outputContentVersion'] ) ) {
		$envOptions['outputContentVersion'] = $options['outputContentVersion'];
	}
	$envOptions['discardDataParsoid'] = !empty( $options['discardDataParsoid'] );
	// These flags are only forwarded when the caller supplied them, and
	// are always normalized to real booleans.
	foreach ( [ 'wrapSections', 'pageBundle', 'logLinterData' ] as $flag ) {
		if ( isset( $options[$flag] ) ) {
			$envOptions[$flag] = (bool)$options[$flag];
		}
	}
	if ( isset( $options['linterOverrides'] ) ) {
		$envOptions['linterOverrides'] = $options['linterOverrides'];
	}
	$envOptions['skipLanguageConversionPass'] =
		$options['skipLanguageConversionPass'] ?? false;

	$env = new Env(
		$this->siteConfig, $pageConfig, $this->dataAccess, $metadata, $envOptions
	);

	$wikitextSize = strlen( $env->topFrame->getSrcText() );
	$withinLimit = $env->compareWt2HtmlLimit( 'wikitextSize', $wikitextSize );
	if ( !$withinLimit ) {
		throw new ResourceLimitExceededException(
			"wt2html: wikitextSize limit exceeded"
		);
	}

	$contentmodel = $options['contentmodel'] ?? null;
	$handler = $env->getContentHandler( $contentmodel );
	$extApi = new ParsoidExtensionAPI( $env );
	$doc = $handler->toDOM( $extApi );

	return [ $env, $doc, $contentmodel ];
}
196 | |
/**
 * Parse the wikitext supplied in a `PageConfig` to HTML.
 *
 * @param PageConfig $pageConfig
 * @param array $options [
 *   'wrapSections'         => (bool) Whether `<section>` wrappers should be added.
 *   'pageBundle'           => (bool) Sets ids on nodes and stores
 *                                    data-* attributes in a JSON blob.
 *   'body_only'            => (bool|null) Only return the <body> children (T181657)
 *   'outputContentVersion' => (string|null) Version of HTML to output.
 *                                           `null` returns the default version.
 *   'contentmodel'         => (string|null) The content model of the input.
 *   'discardDataParsoid'   => (bool) Drop all data-parsoid annotations.
 *   'offsetType'           => (string) ucs2, char, byte are valid values
 *                                      what kind of source offsets should be emitted?
 *   'skipLanguageConversionPass'  => (bool) Skip the language variant conversion pass (defaults to false)
 *   'htmlVariantLanguage'  => (Bcp47Code) If non-null, the language variant used for Parsoid HTML.
 *   'wtVariantLanguage'    => (Bcp47Code) If non-null, the language variant used for wikitext.
 *   'logLinterData'        => (bool) Should we log linter data if linting is enabled?
 *   'linterOverrides'      => (array) Override the site linting configs.
 *   'traceFlags'           => (array) associative array with tracing options
 *   'dumpFlags'            => (array) associative array with dump options
 *   'debugFlags'           => (array) associative array with debug options
 *   'logLevels'            => (string[]) Levels to log
 * ]
 * @param ?array &$headers Overwritten with the http-equiv headers found
 *   in the parsed document.
 * @param ?ContentMetadataCollector $metadata Pass in a CMC in order to
 *   collect and retrieve metadata about the parse. A stub collector is
 *   used when omitted.
 * @return PageBundle|string A PageBundle when the Env is in page bundle
 *   mode, otherwise the serialized HTML.
 */
public function wikitext2html(
	PageConfig $pageConfig, array $options = [], ?array &$headers = null,
	?ContentMetadataCollector $metadata = null
) {
	$metadata ??= new StubMetadataCollector( $this->siteConfig );

	$timer = Timing::start();
	[ $env, $doc, $contentmodel ] = $this->parseWikitext( $pageConfig, $metadata, $options );
	$parseTime = $timer->end();

	// FIXME: Does this belong in parseWikitext so that the other endpoint
	// is covered as well?  It probably depends on expectations of the
	// Rest API.  If callers of /page/lint/ assume that will update the
	// results on the Special page.
	if ( $env->linting() ) {
		$lintLogger = new LintLogger( $env );
		$lintLogger->logLintOutput();
	}

	$headers = DOMUtils::findHttpEquivHeaders( $doc );

	$bodyOnly = !empty( $options['body_only'] );
	$node = $bodyOnly ? DOMCompat::getBody( $doc ) : $doc;
	$serializeOptions = [
		'innerXML' => $bodyOnly,
	];

	// Both branches yield an array with an 'html' entry; the page bundle
	// branch additionally carries the extracted data under 'pb'.
	$out = $env->pageBundle
		? ContentUtils::extractDpAndSerialize( $node, $serializeOptions )
		: [ 'html' => ContentUtils::toXML( $node, $serializeOptions ) ];

	$this->recordParseMetrics( $env, $parseTime, $out );

	if ( !$env->pageBundle ) {
		return $out['html'];
	}

	return new PageBundle(
		$out['html'],
		$out['pb']->parsoid, $out['pb']->mw ?? null,
		$env->getOutputContentVersion(),
		$headers,
		$contentmodel
	);
}
277 | |
/**
 * Report timing and size metrics for a wikitext-to-HTML parse.
 * Does nothing when the site config provides no metrics object.
 *
 * @param Env $env Environment the parse ran in
 * @param float $parseTime Time spent parsing (the thresholds below
 *   treat it as milliseconds)
 * @param array $out Serialization result; only $out['html'] is read
 */
private function recordParseMetrics(
	Env $env, float $parseTime, array $out
) {
	$metrics = $this->siteConfig->metrics();
	if ( !$metrics ) {
		return;
	}

	$pageConfig = $env->getPageConfig();

	// This is somewhat suspect because ParsoidHandler::tryToCreatePageConfig
	// can set a revision id on a MutableRevisionRecord, but it might be simpler
	// to make that go away
	$mstr = $pageConfig->getRevisionId() ? 'pageWithOldid' : 'wt';

	$metrics->timing( "entry.wt2html.{$mstr}.parse", $parseTime );

	$notDefaultVersion = Semver::satisfies(
		$env->getOutputContentVersion(), '!=' . self::defaultHTMLVersion()
	);
	if ( $notDefaultVersion ) {
		$metrics->increment( 'entry.wt2html.parse.version.notdefault' );
	}

	// @phan-suppress-next-line PhanDeprecatedFunction
	$inSize = strlen( $pageConfig->getPageMainContent() );
	$metrics->timing( "entry.wt2html.{$mstr}.size.input", $inSize );

	$outSize = strlen( $out['html'] );
	$metrics->timing( "entry.wt2html.{$mstr}.size.output", $outSize );

	// * Don't bother with the per-KB metric for really small parse times:
	//   p99 for initialization time is ~7ms according to grafana, so a
	//   10ms floor ensures that startup overheads don't skew the metrics.
	// * When no <head> section is generated and the output is small,
	//   per-request overheads can skew the timePerKB metrics.
	//   NOTE(review): the original comment attributed the missing <head>
	//   to body_only=false requests; that reads inverted -- confirm.
	//
	// NOTE: This is slightly misleading since there are fixed costs
	// for generating output like the <head> section and should be factored in,
	// but this is good enough for now as a useful first degree of approximation.
	if ( $parseTime <= 10 || $outSize <= 100 ) {
		return;
	}
	$timePerKB = $parseTime * 1024 / $outSize;
	$metrics->timing( 'entry.wt2html.timePerKB', $timePerKB );
}
332 | |
/**
 * Lint the wikitext supplied in a `PageConfig`.
 *
 * Runs the same parse as wikitext2html() with a stub metadata collector
 * and returns the lints gathered on the Env.
 *
 * @param PageConfig $pageConfig
 * @param array $options See wikitext2html.
 * @return array The lints collected during the parse
 */
public function wikitext2lint(
	PageConfig $pageConfig, array $options = []
): array {
	$parsed = $this->parseWikitext(
		$pageConfig,
		new StubMetadataCollector( $this->siteConfig ),
		$options
	);
	// Only the Env (first element) is needed; the rest is discarded.
	$env = $parsed[0];
	return $env->getLints();
}
347 | |
/**
 * Serialize DOM to wikitext.
 *
 * @param PageConfig $pageConfig
 * @param Document $doc Data attributes are expected to have been applied
 *   already.  Loading them will happen once the environment is created.
 * @param array $options [
 *   'inputContentVersion' => (string) The content version of the input.
 *     Necessary if it differs from the current default in order to
 *     account for any serialization differences.
 *   'offsetType'          => (string) ucs2, char, byte are valid values
 *                                     what kind of source offsets are present in the HTML?
 *   'contentmodel'        => (string|null) The content model of the input.
 *   'htmlVariantLanguage' => (Bcp47Code) If non-null, the language variant used for Parsoid HTML.
 *   'wtVariantLanguage'   => (Bcp47Code) If non-null, the language variant used for wikitext.
 *   'traceFlags'          => (array) associative array with tracing options
 *   'dumpFlags'           => (array) associative array with dump options
 *   'debugFlags'          => (array) associative array with debug options
 *   'logLevels'           => (string[]) Levels to log
 *   'htmlSize'            => (int) Size of the HTML that generated $doc
 * ]
 * @param ?SelserData $selserData Passed through to the content handler's
 *   fromDOM().
 * @return string The wikitext produced by the content handler
 */
public function dom2wikitext(
	PageConfig $pageConfig, Document $doc, array $options = [],
	?SelserData $selserData = null
): string {
	$envOptions = $this->setupCommonOptions( $options );
	if ( isset( $options['inputContentVersion'] ) ) {
		$envOptions['inputContentVersion'] = $options['inputContentVersion'];
	}
	$envOptions['topLevelDoc'] = $doc;

	$env = new Env(
		$this->siteConfig,
		$pageConfig,
		$this->dataAccess,
		new StubMetadataCollector( $this->siteConfig ),
		$envOptions
	);
	// Account for the input size up front; a missing size counts as 0.
	$env->bumpHtml2WtResourceUse( 'htmlSize', $options['htmlSize'] ?? 0 );

	$handler = $env->getContentHandler( $options['contentmodel'] ?? null );
	$extApi = new ParsoidExtensionAPI( $env );

	// Only the serialization itself is timed, not the Env setup above.
	$timer = Timing::start();
	$wikitext = $handler->fromDOM( $extApi, $selserData );
	$serialTime = $timer->end();

	$this->recordSerializationMetrics( $options, $serialTime, $wikitext );

	return $wikitext;
}
398 | |
/**
 * Report timing and size metrics for an HTML-to-wikitext serialization.
 * Does nothing when the site config provides no metrics object.
 *
 * @param array $options Options given to dom2wikitext(); only 'htmlSize'
 *   and 'inputContentVersion' are read
 * @param float $serialTime Time spent serializing
 * @param string $wikitext The wikitext that was produced
 */
private function recordSerializationMetrics(
	array $options, float $serialTime, string $wikitext
) {
	$metrics = $this->siteConfig->metrics();
	if ( !$metrics ) {
		return;
	}

	$inputSize = $options['htmlSize'] ?? 0;
	$inputVersion = $options['inputContentVersion'] ?? null;

	$metrics->timing( 'entry.html2wt.size.input', $inputSize );

	if ( $inputVersion !== null ) {
		$metrics->increment(
			'entry.html2wt.original.version.' . $inputVersion
		);
	}

	$metrics->timing( 'entry.html2wt.total', $serialTime );
	$metrics->timing( 'entry.html2wt.size.output', strlen( $wikitext ) );

	if ( !$inputSize ) {
		// Unknown or zero input size: skip the ratio to avoid dividing
		// by zero.
		return;
	}

	// NOTE: the name timePerInputKB is misleading, since the input size
	// is in characters, not bytes.
	$timePerInputKB = $serialTime * 1024 / $inputSize;
	$metrics->timing( 'entry.html2wt.timePerInputKB', $timePerInputKB );
}
429 | |
/**
 * Serialize HTML to wikitext.  Convenience wrapper that parses the HTML
 * string and hands the resulting document to dom2wikitext().
 *
 * @param PageConfig $pageConfig
 * @param string $html
 * @param array $options See dom2wikitext. 'htmlSize' defaults to the
 *   character length of $html when the caller did not supply it.
 * @param ?SelserData $selserData
 * @return string
 */
public function html2wikitext(
	PageConfig $pageConfig, string $html, array $options = [],
	?SelserData $selserData = null
): string {
	if ( !isset( $options['htmlSize'] ) ) {
		$options['htmlSize'] = mb_strlen( $html );
	}
	return $this->dom2wikitext(
		$pageConfig,
		DOMUtils::parseHTML( $html, true ),
		$options,
		$selserData
	);
}
447 | |
/**
 * Update the supplied PageBundle based on the `$update` type.
 *
 *   'convertoffsets': Convert offsets between formats (byte, char, ucs2)
 *   'redlinks': Refreshes the classes of known, missing, etc. links.
 *   'variant': Converts the HTML based on the supplied variant.
 *
 * Note that these are DOM transforms, and not roundtrips through wikitext.
 *
 * @param PageConfig $pageConfig
 * @param string $update 'convertoffsets'|'redlinks'|'variant'
 * @param PageBundle $pb
 * @param array $options [
 *   'inputOffsetType'  => (string) 'convertoffsets' only: offset type to
 *                         convert from.
 *   'outputOffsetType' => (string) 'convertoffsets' only: offset type to
 *                         convert to.
 *   'variant'          => (array) 'variant' only:
 *                         'html' (deprecated alias 'target') is the code of
 *                         the variant to convert the HTML to;
 *                         'wikitext' (deprecated alias 'source') is the
 *                         optional code of the wikitext variant.
 *   'body_only'        => (bool) Only serialize the <body> children.
 * ]
 * @return PageBundle A new bundle holding the transformed HTML. The
 *   version and content model of `$pb` are kept when they are set.
 * @throws LogicException If `$update` is not a known transformation.
 */
public function pb2pb(
	PageConfig $pageConfig, string $update, PageBundle $pb,
	array $options = []
): PageBundle {
	$envOptions = [
		'pageBundle' => true,
		'topLevelDoc' => DOMUtils::parseHTML( $pb->toHtml(), true ),
	];
	$metadata = new StubMetadataCollector( $this->siteConfig );
	$env = new Env(
		$this->siteConfig, $pageConfig, $this->dataAccess, $metadata, $envOptions
	);
	$doc = $env->topLevelDoc;
	DOMDataUtils::visitAndLoadDataAttribs(
		DOMCompat::getBody( $doc ), [ 'markNew' => true ]
	);

	$dataBagPB = DOMDataUtils::getPageBundle( $doc );
	switch ( $update ) {
		case 'convertoffsets':
			ContentUtils::convertOffsets(
				$env, $doc, $options['inputOffsetType'], $options['outputOffsetType']
			);
			// Record the new offset type and carry over the id counter of
			// the incoming bundle.
			$dataBagPB->parsoid['offsetType'] = $options['outputOffsetType'];
			$dataBagPB->parsoid['counter'] = $pb->parsoid['counter'];
			break;

		case 'redlinks':
			// Work in byte offsets, then convert back at the end.
			ContentUtils::convertOffsets(
				$env, $doc, $env->getRequestOffsetType(), 'byte'
			);
			( new AddRedLinks() )->run( $env, DOMCompat::getBody( $doc ) );
			( new ConvertOffsets() )->run( $env, DOMCompat::getBody( $doc ), [], true );
			break;

		case 'variant':
			// Work in byte offsets, then convert back at the end.
			ContentUtils::convertOffsets(
				$env, $doc, $env->getRequestOffsetType(), 'byte'
			);

			// Note that `maybeConvert` could still be a no-op, in case the
			// __NOCONTENTCONVERT__ magic word is present, or the htmlVariant
			// is a base language code or otherwise invalid.
			$hasWtVariant = $options['variant']['wikitext'] ??
				// Deprecated name for this option:
				$options['variant']['source'] ?? false;
			LanguageConverter::maybeConvert(
				$env, $doc,
				Utils::mwCodeToBcp47(
					$options['variant']['html'] ??
					// Deprecated name for this option:
					$options['variant']['target'],
					// Be strict in what we accept.
					true, $this->siteConfig->getLogger()
				),
				$hasWtVariant ?
				Utils::mwCodeToBcp47(
					$options['variant']['wikitext'] ??
					// Deprecated name for this option:
					$options['variant']['source'],
					// Be strict in what we accept.
					true, $this->siteConfig->getLogger()
				) : null
			);

			// NOTE: Keep this in sync with code in core's LanguageVariantConverter
			// Update content-language and vary headers.
			DOMUtils::addHttpEquivHeaders( $doc, [
				'content-language' => $env->htmlContentLanguageBcp47()->toBcp47Code(),
				'vary' => $env->htmlVary()
			] );

			( new ConvertOffsets() )->run( $env, DOMCompat::getBody( $doc ), [], true );
			break;

		default:
			// The message previously ran the transformation name straight
			// into "is"; keep a separating space.
			throw new LogicException( $update . ' is an unknown transformation' );
	}

	DOMDataUtils::visitAndStoreDataAttribs(
		DOMCompat::getBody( $doc ), [
			'discardDataParsoid' => $env->discardDataParsoid,
			'storeInPageBundle' => $env->pageBundle,
			'env' => $env,
		]
	);
	$body_only = !empty( $options['body_only'] );
	$node = $body_only ? DOMCompat::getBody( $doc ) : $doc;
	DOMDataUtils::injectPageBundle( $doc, $dataBagPB );
	$out = ContentUtils::extractDpAndSerialize( $node, [
		'innerXML' => $body_only,
	] );
	return new PageBundle(
		$out['html'],
		$out['pb']->parsoid, $out['pb']->mw ?? null,
		// Prefer the passed in version, since this was just a transformation
		$pb->version ?? $env->getOutputContentVersion(),
		DOMUtils::findHttpEquivHeaders( $doc ),
		// Prefer the passed in content model
		$pb->contentmodel ?? $pageConfig->getContentModel()
	);
}
565 | |
/**
 * Run the pre-save transformation on the given wikitext, with
 * top-level templates subst'd.
 *
 * @param PageConfig $pageConfig
 * @param string $wikitext
 * @return string The result of Wikitext::pst()
 */
public function substTopLevelTemplates(
	PageConfig $pageConfig, string $wikitext
): string {
	$env = new Env(
		$this->siteConfig,
		$pageConfig,
		$this->dataAccess,
		new StubMetadataCollector( $this->siteConfig )
	);
	$substTLTemplates = true;
	return Wikitext::pst( $env, $wikitext, $substTLTemplates );
}
580 | |
/**
 * Check whether a given content version can be downgraded to the requested
 * content version.
 *
 * Each entry of self::DOWNGRADES is tried in order; an entry matches
 * when both versions satisfy its 'from' / 'to' version under semver
 * caret semantics.
 *
 * @param string $from Current content version
 * @param string $to Requested content version
 * @return string[]|null The downgrade that will fulfill the request, as
 *   [ 'from' => <old version>, 'to' => <new version> ], or null if it
 *   can't be fulfilled.
 */
public static function findDowngrade( string $from, string $to ): ?array {
	foreach ( self::DOWNGRADES as $candidate ) {
		$dgFrom = $candidate['from'];
		$dgTo = $candidate['to'];

		if ( !Semver::satisfies( $from, "^$dgFrom" ) ) {
			continue;
		}
		if ( !Semver::satisfies( $to, "^$dgTo" ) ) {
			continue;
		}

		// FIXME: Make this a class?
		return [ 'from' => $dgFrom, 'to' => $dgTo ];
	}
	return null;
}
603 | |
/**
 * Downgrade a document to an older content version.
 *
 * The page bundle is modified in place: the matching downgrade function
 * is applied, the bundle's version is updated, and the version <meta>
 * in its HTML (if there is one) is rewritten to match.
 *
 * @param string[] $dg Value returned by findDowngrade().
 * @param PageBundle $pageBundle
 * @throws InvalidArgumentException If $dg does not name a known downgrade.
 */
public static function downgrade(
	array $dg, PageBundle $pageBundle
): void {
	foreach ( self::DOWNGRADES as $entry ) {
		if ( $dg['from'] !== $entry['from'] || $dg['to'] !== $entry['to'] ) {
			continue;
		}

		call_user_func( [ self::class, $entry['func'] ], $pageBundle );

		// FIXME: Maybe this resolve should just be part of the $dg
		$pageBundle->version = self::resolveContentVersion( $dg['to'] );

		// FIXME: Maybe this should be a helper to avoid the rt
		$doc = DOMUtils::parseHTML( $pageBundle->html );
		// Match the http-equiv meta to the content-type header
		$versionMeta = DOMCompat::querySelector(
			$doc,
			'meta[property="mw:htmlVersion"], meta[property="mw:html:version"]'
		);
		if ( $versionMeta ) {
			$versionMeta->setAttribute( 'content', $pageBundle->version );
			$pageBundle->html = ContentUtils::toXML( $doc );
		}

		return;
	}

	throw new InvalidArgumentException(
		"Unsupported downgrade: {$dg['from']} -> {$dg['to']}"
	);
}
637 | |
/**
 * Check if language variant conversion is implemented for a language
 *
 * @internal FIXME: Remove once Parsoid's language variant work is completed
 * @param PageConfig $pageConfig
 * @param Bcp47Code $htmlVariant Variant language to check
 * @return bool Always false for pages whose language is 'zh'; otherwise
 *   whatever LanguageConverter reports for the variant.
 */
public function implementsLanguageConversionBcp47( PageConfig $pageConfig, Bcp47Code $htmlVariant ): bool {
	// Hardcode disable zh lang conversion support since Parsoid's
	// implementation is incomplete and not performant (T346657).
	$pageLanguage = $pageConfig->getPageLanguageBcp47()->toBcp47Code();
	if ( $pageLanguage === 'zh' ) {
		return false;
	}

	$env = new Env(
		$this->siteConfig,
		$pageConfig,
		$this->dataAccess,
		new StubMetadataCollector( $this->siteConfig )
	);
	return LanguageConverter::implementsLanguageConversionBcp47( $env, $htmlVariant );
}
657 | |
/**
 * Downgrade the given document and pagebundle from 999.x to 2.x.
 *
 * The page bundle is modified in place: its HTML is regenerated and its
 * `mw` data is replaced with an empty id map.
 *
 * @param PageBundle $pageBundle
 */
private static function downgrade999to2( PageBundle $pageBundle ) {
	$noIds = [ 'ids' => [] ];

	// Effectively, skip applying data-parsoid: the temporary bundle gets
	// an empty id map in place of the parsoid data, and only the mw data
	// is carried over.  Note that if we were to support a pb2html
	// downgrade, we'd need to apply the full thing, but that would create
	// complications where ids would be left behind.
	// See the comment in around `DOMDataUtils::applyPageBundle`
	$withMwOnly = new PageBundle( $pageBundle->html, $noIds, $pageBundle->mw );
	$pageBundle->html = $withMwOnly->toHtml();

	// Now, modify the pagebundle to the expected form.  This is important
	// since, at least in the serialization path, the original pb will be
	// applied to the modified content and its presence could cause lost
	// deletions.
	$pageBundle->mw = $noIds;
}
680 | } |