Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
65.99% covered (warning)
65.99%
229 / 347
15.38% covered (danger)
15.38%
2 / 13
CRAP
0.00% covered (danger)
0.00%
0 / 1
HtmlInputTransformHelper
65.99% covered (warning)
65.99%
229 / 347
15.38% covered (danger)
15.38%
2 / 13
416.64
0.00% covered (danger)
0.00%
0 / 1
 __construct
92.31% covered (success)
92.31%
12 / 13
0.00% covered (danger)
0.00%
0 / 1
2.00
 getParamSettings
0.00% covered (danger)
0.00%
0 / 44
0.00% covered (danger)
0.00%
0 / 1
2
 normalizeParameters
72.73% covered (warning)
72.73%
16 / 22
0.00% covered (danger)
0.00%
0 / 1
21.19
 init
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
2
 initInternal
77.78% covered (warning)
77.78%
49 / 63
0.00% covered (danger)
0.00%
0 / 1
24.39
 getTransform
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 setMetrics
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
12
 setOriginal
78.38% covered (warning)
78.38%
58 / 74
0.00% covered (danger)
0.00%
0 / 1
28.35
 getContent
72.22% covered (warning)
72.22%
13 / 18
0.00% covered (danger)
0.00%
0 / 1
4.34
 putContent
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
2
 fetchParserOutputFromParsoid
73.68% covered (warning)
73.68%
28 / 38
0.00% covered (danger)
0.00%
0 / 1
13.21
 fetchSelserContextFromStash
89.36% covered (warning)
89.36%
42 / 47
0.00% covered (danger)
0.00%
0 / 1
5.03
 throwHttpExceptionForStatus
0.00% covered (danger)
0.00%
0 / 9
0.00% covered (danger)
0.00%
0 / 1
6
1<?php
2/**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
17 *
18 * @file
19 */
20namespace MediaWiki\Rest\Handler\Helper;
21
22use InvalidArgumentException;
23use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface;
24use MediaWiki\Content\Content;
25use MediaWiki\Edit\ParsoidOutputStash;
26use MediaWiki\Edit\ParsoidRenderID;
27use MediaWiki\Edit\SelserContext;
28use MediaWiki\Language\LanguageCode;
29use MediaWiki\MainConfigNames;
30use MediaWiki\Page\PageIdentity;
31use MediaWiki\Page\PageLookup;
32use MediaWiki\Page\PageRecord;
33use MediaWiki\Page\ParserOutputAccess;
34use MediaWiki\Parser\ParserOptions;
35use MediaWiki\Parser\ParserOutput;
36use MediaWiki\Parser\Parsoid\HtmlToContentTransform;
37use MediaWiki\Parser\Parsoid\HtmlTransformFactory;
38use MediaWiki\Parser\Parsoid\PageBundleParserOutputConverter;
39use MediaWiki\Rest\Handler;
40use MediaWiki\Rest\HttpException;
41use MediaWiki\Rest\LocalizedHttpException;
42use MediaWiki\Rest\ResponseInterface;
43use MediaWiki\Revision\RevisionAccessException;
44use MediaWiki\Revision\RevisionLookup;
45use MediaWiki\Revision\RevisionRecord;
46use MediaWiki\Status\Status;
47use MWUnknownContentModelException;
48use Wikimedia\Bcp47Code\Bcp47Code;
49use Wikimedia\Message\MessageValue;
50use Wikimedia\ParamValidator\ParamValidator;
51use Wikimedia\Parsoid\Core\ClientError;
52use Wikimedia\Parsoid\Core\PageBundle;
53use Wikimedia\Parsoid\Core\ResourceLimitExceededException;
54use Wikimedia\Parsoid\Parsoid;
55use Wikimedia\Stats\StatsFactory;
56
57/**
58 * REST helper for converting HTML to page content source (e.g. wikitext).
59 *
60 * @since 1.40
61 *
62 * @unstable Pending consolidation of the Parsoid extension with core code.
63 */
64class HtmlInputTransformHelper {
65    /**
66     * @internal
67     */
68    public const CONSTRUCTOR_OPTIONS = [
69        MainConfigNames::ParsoidCacheConfig
70    ];
71
72    /** @var PageIdentity|null */
73    private $page = null;
74
75    /**
76     * @var HtmlToContentTransform
77     */
78    private $transform;
79
80    /**
81     * @var array
82     */
83    private $envOptions;
84
85    private StatsFactory $statsFactory;
86    private HtmlTransformFactory $htmlTransformFactory;
87    private ParsoidOutputStash $parsoidOutputStash;
88    private ParserOutputAccess $parserOutputAccess;
89    private PageLookup $pageLookup;
90    private RevisionLookup $revisionLookup;
91
92    /**
93     * @param StatsFactory $statsFactory
94     * @param HtmlTransformFactory $htmlTransformFactory
95     * @param ParsoidOutputStash $parsoidOutputStash
96     * @param ParserOutputAccess $parserOutputAccess
97     * @param PageLookup $pageLookup
98     * @param RevisionLookup $revisionLookup
99     * @param array $envOptions
100     * @param ?PageIdentity $page
101     * @param array|string $body Body structure, or an HTML string
102     * @param array $parameters
103     * @param RevisionRecord|null $originalRevision
104     * @param Bcp47Code|null $pageLanguage
105     */
public function __construct(
    StatsFactory $statsFactory,
    HtmlTransformFactory $htmlTransformFactory,
    ParsoidOutputStash $parsoidOutputStash,
    ParserOutputAccess $parserOutputAccess,
    PageLookup $pageLookup,
    RevisionLookup $revisionLookup,
    array $envOptions = [],
    ?PageIdentity $page = null,
    $body = '',
    array $parameters = [],
    ?RevisionRecord $originalRevision = null,
    ?Bcp47Code $pageLanguage = null
) {
    // Keep references to the injected services.
    $this->statsFactory = $statsFactory;
    $this->htmlTransformFactory = $htmlTransformFactory;
    $this->parsoidOutputStash = $parsoidOutputStash;
    $this->parserOutputAccess = $parserOutputAccess;
    $this->pageLookup = $pageLookup;
    $this->revisionLookup = $revisionLookup;

    // Array union: options supplied by the caller win, the defaults only
    // fill in keys that were not given.
    $defaultEnvOptions = [
        'outputContentVersion' => Parsoid::defaultHTMLVersion(),
        'offsetType' => 'byte',
    ];
    $this->envOptions = $envOptions + $defaultEnvOptions;

    if ( $page !== null ) {
        $this->initInternal( $page, $body, $parameters, $originalRevision, $pageLanguage );
        return;
    }

    // Constructing the helper without a page leaves it uninitialized;
    // that calling convention is deprecated.
    wfDeprecated( __METHOD__ . ' without $page', '1.43' );
}
136
137    /**
138     * @return array
139     */
public function getParamSettings(): array {
    // JSON body schema:
    /*
    doc:
        properties:
            headers:
                type: array
                items:
                    type: string
            body:
                type: [ string, object ]
        required: [ body ]

    body:
        properties:
            offsetType:
                type: string
            revid:
                type: integer
            renderid:
                type: string
            etag:
                type: string
            html:
                type: [ doc, string ]
            data-mw:
                type: doc
            original:
                properties:
                    html:
                        type: doc
                    source:
                        type: doc
                    data-mw:
                        type: doc
                    data-parsoid:
                        type: doc
        required: [ html ]
     */

    // FUTURE: more params
    // - slot (for loading the base content)

    // Every parameter declared here is optional and has the same shape,
    // so the settings arrays are produced by one local builder.
    $optional = static function ( string $source, string $type, $default, string $descriptionKey ): array {
        return [
            Handler::PARAM_SOURCE => $source,
            ParamValidator::PARAM_TYPE => $type,
            ParamValidator::PARAM_DEFAULT => $default,
            ParamValidator::PARAM_REQUIRED => false,
            Handler::PARAM_DESCRIPTION => new MessageValue( $descriptionKey ),
        ];
    };

    return [
        // XXX: should we really declare this here? Or should the endpoint do this?
        //      We are not reading this property...
        'title' => $optional( 'path', 'string', '', 'rest-param-desc-html-input-title' ),

        // XXX: Needed for compatibility with the parsoid transform endpoint.
        //      But revid should just be part of the info about the original data
        //      in the body.
        'oldid' => $optional( 'path', 'int', 0, 'rest-param-desc-html-input-oldid' ),

        // XXX: Supported for compatibility with the parsoid transform endpoint.
        //      If given, it should be 'html' or 'pagebundle'.
        'from' => $optional( 'path', 'string', '', 'rest-param-desc-html-input-from' ),

        // XXX: Supported for compatibility with the parsoid transform endpoint.
        //      Ignored.
        'format' => $optional( 'path', 'string', '', 'rest-param-desc-html-input-format' ),

        // XXX: get this from the Accept header?
        'contentmodel' => $optional( 'query', 'string', '', 'rest-param-desc-html-input-contentmodel' ),

        // TODO: get this from Accept-Language header?!
        'language' => $optional( 'query', 'string', '', 'rest-param-desc-html-input-language' ),
    ];
}
237
238    /**
239     * Modify body and parameters to provide compatibility with legacy endpoints.
240     *
241     * @see ParsoidHandler::getRequestAttributes
242     *
243     * @param array<string,mixed> &$body
244     * @param array<string,mixed> &$parameters
245     *
246     * @throws HttpException
247     *
248     * @return void
249     */
private static function normalizeParameters( array &$body, array &$parameters ) {
    // A positive revision ID from the path is treated as if it had been
    // supplied in the body.
    $pathRevId = (int)( $parameters['oldid'] ?? 0 );
    if ( $pathRevId > 0 ) {
        $body['original']['revid'] = $pathRevId;
    }

    // An etag in the body doubles as the render ID; the renderid field
    // accepts the ETag format as well.
    if ( !empty( $body['original']['etag'] ) ) {
        // @phan-suppress-next-line PhanTypeInvalidDimOffset false positive
        $body['original']['renderid'] = $body['original']['etag'];
    }

    // 'wikitext' is an alias for 'source': move the value over.
    if ( isset( $body['original']['wikitext'] ) ) {
        // @phan-suppress-next-line PhanTypeInvalidDimOffset false positive
        $body['original']['source'] = $body['original']['wikitext'];
        unset( $body['original']['wikitext'] );
    }

    // An unset, null or empty 'from' means "not specified".
    $from = $parameters['from'] ?? '';

    // Without 'from', page bundle style input is accepted alongside full HTML.
    // With 'from', the page bundle parts are only kept for FORMAT_PAGEBUNDLE.
    if ( $from !== '' && $from !== ParsoidFormatHelper::FORMAT_PAGEBUNDLE ) {
        unset( $body['original']['data-parsoid']['body'] );
        unset( $body['original']['data-mw']['body'] );
        unset( $body['data-mw']['body'] );
    }

    // A given 'from' has to be either html or pagebundle.
    if (
        $from !== '' &&
        $from !== ParsoidFormatHelper::FORMAT_HTML &&
        $from !== ParsoidFormatHelper::FORMAT_PAGEBUNDLE
    ) {
        throw new LocalizedHttpException(
            new MessageValue( "rest-unsupported-transform-input", [ $from ] ), 400
        );
    }

    // The content model from the body takes precedence over the legacy
    // 'format' path parameter.
    $bodyModel = $body['contentmodel'] ?? '';
    $pathFormat = $parameters['format'] ?? '';
    if ( $bodyModel !== '' ) {
        $parameters['contentmodel'] = $bodyModel;
    } elseif ( $pathFormat !== '' ) {
        $parameters['contentmodel'] = $pathFormat;
    }
}
298
299    /**
300     * @param PageIdentity $page
301     * @param array|string $body Body structure, or an HTML string
302     * @param array $parameters
303     * @param RevisionRecord|null $originalRevision
304     * @param Bcp47Code|null $pageLanguage
305     *
306     * @throws HttpException
307     * @deprecated since 1.43; pass arguments to constructor instead
308     */
public function init(
    PageIdentity $page,
    $body,
    array $parameters,
    ?RevisionRecord $originalRevision = null,
    ?Bcp47Code $pageLanguage = null
) {
    // Emit the deprecation notice first, then hand everything over to the
    // shared initialization routine that the constructor uses as well.
    wfDeprecated( __METHOD__, '1.43' );

    $this->initInternal(
        $page,
        $body,
        $parameters,
        $originalRevision,
        $pageLanguage
    );
}
319
320    /**
321     * @param PageIdentity $page
322     * @param array|string $body Body structure, or an HTML string
323     * @param array $parameters
324     * @param RevisionRecord|null $originalRevision
325     * @param Bcp47Code|null $pageLanguage
326     *
327     * @throws HttpException
328     */
private function initInternal(
    PageIdentity $page,
    $body,
    array $parameters,
    ?RevisionRecord $originalRevision = null,
    ?Bcp47Code $pageLanguage = null
) {
    // A bare string is shorthand for a body that carries only the modified HTML.
    if ( is_string( $body ) ) {
        $body = [ 'html' => $body ];
    }

    self::normalizeParameters( $body, $parameters );

    $this->page = $page;

    // The modified HTML is given either directly as a string, or as a
    // document structure that holds the markup in its 'body' field.
    $html = $body['html'] ?? null;
    if ( is_array( $html ) ) {
        $html = $html['body'] ?? null;
    }

    if ( $html === null ) {
        // A missing field is a client error, so report it with an explicit 400
        // like the other input validation failures in this class. This also
        // covers an 'html' structure without 'body', which would otherwise
        // trigger an undefined-key warning and pass null on to the factory.
        throw new LocalizedHttpException(
            new MessageValue( "rest-missing-body-field", [ 'html' ] ),
            400
        );
    }

    // TODO: validate $body against a proper schema.
    $this->transform = $this->htmlTransformFactory->getHtmlToContentTransform(
        $html,
        $this->page
    );

    $this->transform->setMetrics( $this->statsFactory );

    // NOTE: Env::getContentModel will fall back to the page's recorded content model
    //       if none is set here.
    $this->transform->setOptions( [
        'contentmodel' => $parameters['contentmodel'] ?? null,
        'offsetType' => $body['offsetType'] ?? $this->envOptions['offsetType'],
    ] );

    $original = $body['original'] ?? [];
    $originalRendering = null;

    if ( !isset( $original['html'] ) && !empty( $original['renderid'] ) ) {
        // No original HTML was sent, but a render ID (or ETag) identifies it.
        $key = $original['renderid'];
        if ( preg_match( '!^(W/)?".*"$!', $key ) ) {
            // Quoted, optionally weak: the key is in ETag format.
            $originalRendering = ParsoidRenderID::newFromETag( $key );

            if ( !$originalRendering ) {
                throw new LocalizedHttpException( new MessageValue( "rest-bad-etag", [ $key ] ), 400 );
            }
        } else {
            $originalRendering = ParsoidRenderID::newFromKey( $key );
        }
    } elseif ( !empty( $original['html'] ) || !empty( $original['data-parsoid'] ) ) {
        // NOTE: We might have an incomplete PageBundle here, with no HTML but with data-parsoid!
        // XXX: Do we need to support that, or can that just be a 400?
        $originalRendering = new PageBundle(
            $original['html']['body'] ?? '',
            $original['data-parsoid']['body'] ?? null,
            $original['data-mw']['body'] ?? null,
            null, // will be derived from $original['html']['headers']['content-type']
            $original['html']['headers'] ?? []
        );
    }

    // An explicitly passed revision takes precedence over the ID from the body.
    if ( !$originalRevision && !empty( $original['revid'] ) ) {
        $originalRevision = (int)$original['revid'];
    }

    if ( $originalRevision || $originalRendering ) {
        $this->setOriginal( $originalRevision, $originalRendering );
    } else {
        // Nothing is known about the original; only record whether the page exists.
        $pageExists = $this->page->exists();
        $this->statsFactory
            ->getCounter( 'html_input_transform_total' )
            ->setLabel( 'original_html_given', 'false' )
            ->setLabel( 'page_exists', $pageExists ? 'true' : 'false' )
            ->setLabel( 'status', 'unknown' )
            ->copyToStatsdAt(
                $pageExists
                    ? 'html_input_transform.original_html.not_given.page_exists'
                    : 'html_input_transform.original_html.not_given.page_not_exist'
            )
            ->increment();
    }

    if ( isset( $body['data-mw']['body'] ) ) {
        $this->transform->setModifiedDataMW( $body['data-mw']['body'] );
    }

    // A language passed by the caller wins over the 'language' request parameter.
    if ( $pageLanguage ) {
        $this->transform->setContentLanguage( $pageLanguage );
    } elseif ( isset( $parameters['language'] ) && $parameters['language'] !== '' ) {
        $pageLanguage = LanguageCode::normalizeNonstandardCodeAndWarn(
            $parameters['language']
        );
        $this->transform->setContentLanguage( $pageLanguage );
    }

    if ( isset( $original['source']['body'] ) ) {
        // XXX: do we really have to support wikitext overrides?
        $this->transform->setOriginalText( $original['source']['body'] );
    }
}
435
436    /**
437     * Return HTMLTransform object, so additional context can be provided by calling setters on it.
438     * @return HtmlToContentTransform
439     */
public function getTransform(): HtmlToContentTransform {
    // Hand out the transform that initInternal() created.
    return $this->transform;
}
443
444    /**
445     * Set metrics sink.
446     *
447     * @note Passing a StatsdDataFactoryInterface here has been deprecated
448     * since 1.43.
449     *
450     * @param StatsFactory|StatsdDataFactoryInterface $statsFactory
451     */
public function setMetrics( $statsFactory ) {
    if ( !( $statsFactory instanceof StatsdDataFactoryInterface ) ) {
        $this->statsFactory = $statsFactory;

        // Keep an already created transform in sync with the new sink.
        if ( $this->transform ) {
            $this->transform->setMetrics( $statsFactory );
        }

        return;
    }

    // Legacy statsd sinks are deprecated: emit the warning and keep the
    // current StatsFactory, the given object is not used.
    wfDeprecated( __METHOD__ . ' with StatsdDataFactoryInterface', '1.43' );
}
465
466    /**
467     * Supply information about the revision and rendering that was the original basis of
468     * the input HTML. This is used to apply selective serialization (selser), if possible.
469     *
470     * @param RevisionRecord|int|null $rev
471     * @param ParsoidRenderID|PageBundle|ParserOutput|null $originalRendering
472     */
public function setOriginal( $rev, $originalRendering ) {
    if ( $originalRendering instanceof ParsoidRenderID ) {
        $renderId = $originalRendering;

        // The client referred to a rendering by ID: load the original data from the stash.
        try {
            $selserContext = $this->fetchSelserContextFromStash( $renderId );
        } catch ( InvalidArgumentException $ex ) {
            $this->statsFactory
                ->getCounter( 'html_input_transform_total' )
                ->setLabel( 'original_html_given', 'as_renderid' )
                ->setLabel( 'page_exists', 'unknown' )
                ->setLabel( 'status', 'bad_renderid' )
                ->copyToStatsdAt( 'html_input_transform.original_html.given.as_renderid.bad' )
                ->increment();

            throw new LocalizedHttpException(
                new MessageValue( "rest-bad-stash-key" ),
                400,
                [
                    'reason' => $ex->getMessage(),
                    'key' => (string)$renderId,
                ]
            );
        }

        if ( !$selserContext ) {
            // NOTE: When the client asked for a specific stash key (resp. etag),
            //       we should fail with a 412 if we don't have the specific rendering.
            //       On the other hand, if the client only provided a base revision ID,
            //       we can re-parse and hope for the best.
            // TODO: This class should provide getETag and getLastModified methods for use by
            //       the REST endpoint, to provide proper support for conditionals.
            //       However, that requires some refactoring of how HTTP conditional checks
            //       work in the Handler base class.
            throw new LocalizedHttpException(
                new MessageValue( "rest-no-stashed-content", [ $renderId->getKey() ] ),
                412
            );
        }

        // Without an explicit revision, use the one the render ID refers to.
        if ( !$rev ) {
            $rev = $renderId->getRevisionID();
        }

        $originalRendering = $selserContext->getPageBundle();
        $stashedContent = $selserContext->getContent();

        if ( $stashedContent ) {
            $this->transform->setOriginalContent( $stashedContent );
        }
    } elseif ( !$originalRendering && $rev ) {
        // The client provided a revision ID, but not stash key.
        // Try to get a rendering for the given revision, and use it as the basis for selser.
        // Chances are good that the resulting diff will be reasonably clean.
        // NOTE: If we don't have a revision ID, we should not attempt selser!
        $originalRendering = $this->fetchParserOutputFromParsoid( $this->page, $rev, true );

        $wasFound = (bool)$originalRendering;
        $this->statsFactory->getCounter( 'html_input_transform_total' )
            ->setLabel( 'original_html_given', 'as_revid' )
            ->setLabel( 'page_exists', 'unknown' )
            ->setLabel( 'status', $wasFound ? 'found' : 'not_found' )
            ->copyToStatsdAt(
                $wasFound
                    ? 'html_input_transform.original_html.given.as_revid.found'
                    : 'html_input_transform.original_html.given.as_revid.not_found'
            )
            ->increment();
    } elseif ( $originalRendering ) {
        // The original rendering was passed in as-is.
        $this->statsFactory->getCounter( 'html_input_transform_total' )
            ->setLabel( 'original_html_given', 'true' )
            ->setLabel( 'page_exists', 'unknown' )
            ->setLabel( 'status', 'verbatim' )
            ->copyToStatsdAt( 'html_input_transform.original_html.given.verbatim' )
            ->increment();
    }

    if ( $originalRendering instanceof ParserOutput ) {
        $originalRendering = PageBundleParserOutputConverter::pageBundleFromParserOutput( $originalRendering );

        // NOTE: Use the default if we got a ParserOutput object.
        //       Don't apply the default if we got passed a PageBundle,
        //       in that case, we want to require the version to be explicit.
        $hasContentType = isset( $originalRendering->headers['content-type'] );
        if ( $originalRendering->version === null && !$hasContentType ) {
            $originalRendering->version = Parsoid::defaultHTMLVersion();
        }
    }

    // Everything below needs a page bundle to read the original data from.
    if ( !( $originalRendering instanceof PageBundle ) ) {
        return;
    }

    $bundle = $originalRendering;

    // Prefer the explicit version; otherwise try to derive it from the content type header.
    if ( $bundle->version !== null ) {
        $this->transform->setOriginalSchemaVersion( $bundle->version );
    } elseif ( !empty( $bundle->headers['content-type'] ) ) {
        $vOriginal = ParsoidFormatHelper::parseContentTypeHeader(
            // @phan-suppress-next-line PhanTypeArraySuspiciousNullable Silly Phan, we just checked.
            $bundle->headers['content-type']
        );

        if ( $vOriginal ) {
            $this->transform->setOriginalSchemaVersion( $vOriginal );
        }
    }

    if ( $rev instanceof RevisionRecord ) {
        $this->transform->setOriginalRevision( $rev );
    } elseif ( is_int( $rev ) && $rev ) {
        $this->transform->setOriginalRevisionId( $rev );
    }

    // NOTE: We might have an incomplete PageBundle here, with no HTML.
    //       PageBundle::$html is declared to not be nullable, so it would be set to the empty
    //       string if not given.
    if ( $bundle->html !== '' ) {
        $this->transform->setOriginalHtml( $bundle->html );
    }

    if ( $bundle->parsoid !== null ) {
        $this->transform->setOriginalDataParsoid( $bundle->parsoid );
    }

    if ( $bundle->mw !== null ) {
        $this->transform->setOriginalDataMW( $bundle->mw );
    }
}
603
604    /**
605     * @return Content the content derived from the input HTML.
606     * @throws HttpException
607     */
public function getContent(): Content {
    try {
        return $this->transform->htmlToContent();
    } catch ( ClientError $e ) {
        // Invalid input: report as a bad request and pass the backend's reason along.
        $reason = $e->getMessage();
        $message = new MessageValue( 'rest-html-backend-error', [ $reason ] );

        throw new LocalizedHttpException( $message, 400, [ 'reason' => $reason ] );
    } catch ( ResourceLimitExceededException $e ) {
        // The input exceeded a resource limit: report as "content too large".
        $message = new MessageValue( 'rest-resource-limit-exceeded' );

        throw new LocalizedHttpException( $message, 413, [ 'reason' => $e->getMessage() ] );
    } catch ( MWUnknownContentModelException $e ) {
        // The requested content model is not known.
        $message = new MessageValue( "rest-unknown-content-model", [ $e->getModelId() ] );

        throw new LocalizedHttpException( $message, 400 );
    }
}
630
631    /**
632     * Creates a response containing the content derived from the input HTML.
633     * This will set the appropriate Content-Type header.
634     *
635     * @param ResponseInterface $response
636     */
public function putContent( ResponseInterface $response ) {
    $content = $this->getContent();
    $serialized = $content->serialize();

    try {
        $mimeType = ParsoidFormatHelper::getContentType(
            $content->getModel(),
            $this->envOptions['outputContentVersion']
        );
    } catch ( InvalidArgumentException $e ) {
        // ParsoidFormatHelper has no content type for this model,
        // so use the content's own default format instead.
        $mimeType = $content->getDefaultFormat();
    }

    $response->setHeader( 'Content-Type', $mimeType );

    $stream = $response->getBody();
    $stream->write( $serialized );
}
655
656    /**
657     * @param PageIdentity $page
658     * @param RevisionRecord|int $revision
659     * @param bool $mayParse
660     *
661     * @return ParserOutput|null
662     * @throws HttpException
663     */
private function fetchParserOutputFromParsoid( PageIdentity $page, $revision, bool $mayParse ): ?ParserOutput {
    $parserOptions = ParserOptions::newFromAnon();
    $parserOptions->setUseParsoid();

    try {
        // Resolve the page reference to a page record.
        if ( !( $page instanceof PageRecord ) ) {
            $pageName = (string)$page;
            $page = $this->pageLookup->getPageByReference( $page );

            if ( !$page ) {
                throw new RevisionAccessException(
                    'Page {name} not found',
                    [ 'name' => $pageName ]
                );
            }
        }

        // Resolve a revision ID to a revision record.
        if ( is_int( $revision ) ) {
            $requestedRevId = $revision;
            $revision = $this->revisionLookup->getRevisionById( $requestedRevId, 0, $page );

            if ( !$revision ) {
                throw new RevisionAccessException(
                    'Revision {revId} not found',
                    [ 'revId' => $requestedRevId ]
                );
            }
        }

        // The revision has to belong to the page.
        if ( $page->getId() !== $revision->getPageId() ) {
            throw new RevisionAccessException(
                'Revision {revId} does not belong to page {name}',
                [
                    'name' => $page->getDBkey(),
                    'revId' => $revision->getId(),
                ]
            );
        }

        if ( !$mayParse ) {
            // Cache-only lookup; the result may be null.
            return $this->parserOutputAccess->getCachedParserOutput(
                $page, $parserOptions, $revision
            );
        }

        // Parsing is allowed: Parsoid failures are converted into a fatal status.
        try {
            $status = $this->parserOutputAccess->getParserOutput(
                $page, $parserOptions, $revision
            );
        } catch ( ClientError $e ) {
            $status = Status::newFatal( 'parsoid-client-error', $e->getMessage() );
        } catch ( ResourceLimitExceededException $e ) {
            $status = Status::newFatal( 'parsoid-resource-limit-exceeded', $e->getMessage() );
        }

        if ( !$status->isOK() ) {
            $this->throwHttpExceptionForStatus( $status );
        }

        return $status->getValue();
    } catch ( RevisionAccessException $e ) {
        // The client supplied bad revision ID, or the revision was deleted or suppressed.
        throw new LocalizedHttpException(
            new MessageValue( "rest-specified-revision-unavailable" ),
            404,
            [ 'reason' => $e->getMessage() ]
        );
    }
}
725
726    /**
727     * @param ParsoidRenderID $renderID
728     *
729     * @return SelserContext|null
730     */
731    private function fetchSelserContextFromStash( $renderID ): ?SelserContext {
732        $selserContext = $this->parsoidOutputStash->get( $renderID );
733        $labels = [
734            'original_html_given' => 'as_renderid',
735            'page_exists' => 'unknown',
736            'status' => 'hit-stashed'
737        ];
738        $counter = $this->statsFactory->getCounter( 'html_input_transform_total' );
739        if ( $selserContext ) {
740            $counter->setLabels( $labels )
741                ->copyToStatsdAt( 'html_input_transform.original_html.given.as_renderid.stash_hit.found.hit' )
742                ->increment();
743            return $selserContext;
744        } else {
745            // Looks like the rendering is gone from stash (or the client send us a bogus key).
746            // Try to load it from the parser cache instead.
747            // On a wiki with low edit frequency, there is a good chance that it's still there.
748            try {
749                $parserOutput = $this->fetchParserOutputFromParsoid( $this->page, $renderID->getRevisionID(), false );
750
751                if ( !$parserOutput ) {
752                    $labels[ 'status' ] = 'miss-fallback_not_found';
753                    $counter->setLabels( $labels )->copyToStatsdAt(
754                        'html_input_transform.original_html.given.as_renderid.stash_miss_pc_fallback.not_found.miss'
755                    )->increment();
756                    return null;
757                }
758
759                $cachedRenderID = ParsoidRenderID::newFromParserOutput( $parserOutput );
760                if ( $cachedRenderID->getKey() !== $renderID->getKey() ) {
761                    $labels[ 'status' ] = 'mismatch-fallback_not_found';
762                    $counter->setLabels( $labels )
763                        ->copyToStatsdAt(
764                            'html_input_transform.original_html.given.as_renderid.' .
765                            'stash_miss_pc_fallback.not_found.mismatch'
766                        )
767                        ->increment();
768
769                    // It's not the correct rendering.
770                    return null;
771                }
772                $labels[ 'status' ] = 'hit-fallback_found';
773                $counter->setLabels( $labels )
774                    ->copyToStatsdAt(
775                        'html_input_transform.original_html.given.as_renderid.' .
776                        'stash_miss_pc_fallback.found.hit'
777                    )
778                    ->increment();
779
780                $pb = PageBundleParserOutputConverter::pageBundleFromParserOutput( $parserOutput );
781                return new SelserContext( $pb, $renderID->getRevisionID() );
782            } catch ( HttpException $e ) {
783                $labels[ 'status' ] = 'failed-fallback_not_found';
784                $counter->setLabels( $labels )
785                    ->copyToStatsdAt(
786                        'html_input_transform.original_html.given.as_renderid.' .
787                        'stash_miss_pc_fallback.not_found.failed'
788                    )
789                    ->increment();
790
791                // If the revision isn't found, don't trigger a 404. Return null to trigger a 412.
792                return null;
793            }
794        }
795    }
796
797    /**
798     * @param Status $status
799     *
800     * @return never
801     * @throws HttpException
802     */
803    private function throwHttpExceptionForStatus( Status $status ) {
804        // TODO: make this nicer.
805        if ( $status->hasMessage( 'parsoid-resource-limit-exceeded' ) ) {
806            throw new LocalizedHttpException( new MessageValue( "rest-parsoid-resource-exceeded" ),
807                413,
808                [ 'reason' => $status->getHTML() ]
809            );
810        } else {
811            throw new LocalizedHttpException( new MessageValue( "rest-parsoid-error" ),
812                400,
813                [ 'reason' => $status->getHTML() ]
814            );
815        }
816    }
817
818}