Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
67.05% covered (warning)
67.05%
234 / 349
15.38% covered (danger)
15.38%
2 / 13
CRAP
0.00% covered (danger)
0.00%
0 / 1
HtmlInputTransformHelper
67.05% covered (warning)
67.05%
234 / 349
15.38% covered (danger)
15.38%
2 / 13
387.28
0.00% covered (danger)
0.00%
0 / 1
 __construct
92.31% covered (success)
92.31%
12 / 13
0.00% covered (danger)
0.00%
0 / 1
2.00
 getParamSettings
0.00% covered (danger)
0.00%
0 / 44
0.00% covered (danger)
0.00%
0 / 1
2
 normalizeParameters
72.73% covered (warning)
72.73%
16 / 22
0.00% covered (danger)
0.00%
0 / 1
21.19
 init
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
2
 initInternal
79.41% covered (warning)
79.41%
54 / 68
0.00% covered (danger)
0.00%
0 / 1
24.85
 getTransform
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 setMetrics
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
6
 setOriginal
78.38% covered (warning)
78.38%
58 / 74
0.00% covered (danger)
0.00%
0 / 1
28.35
 getContent
72.22% covered (warning)
72.22%
13 / 18
0.00% covered (danger)
0.00%
0 / 1
4.34
 putContent
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
2
 fetchParserOutputFromParsoid
73.68% covered (warning)
73.68%
28 / 38
0.00% covered (danger)
0.00%
0 / 1
13.21
 fetchSelserContextFromStash
89.36% covered (warning)
89.36%
42 / 47
0.00% covered (danger)
0.00%
0 / 1
5.03
 throwHttpExceptionForStatus
0.00% covered (danger)
0.00%
0 / 9
0.00% covered (danger)
0.00%
0 / 1
6
1<?php
2/**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
17 *
18 * @file
19 */
20namespace MediaWiki\Rest\Handler\Helper;
21
22use InvalidArgumentException;
23use MediaWiki\Content\Content;
24use MediaWiki\Edit\ParsoidOutputStash;
25use MediaWiki\Edit\ParsoidRenderID;
26use MediaWiki\Edit\SelserContext;
27use MediaWiki\Exception\MWUnknownContentModelException;
28use MediaWiki\Language\LanguageCode;
29use MediaWiki\MainConfigNames;
30use MediaWiki\Page\PageIdentity;
31use MediaWiki\Page\PageLookup;
32use MediaWiki\Page\PageRecord;
33use MediaWiki\Page\ParserOutputAccess;
34use MediaWiki\Parser\ParserOptions;
35use MediaWiki\Parser\ParserOutput;
36use MediaWiki\Parser\Parsoid\HtmlToContentTransform;
37use MediaWiki\Parser\Parsoid\HtmlTransformFactory;
38use MediaWiki\Parser\Parsoid\PageBundleParserOutputConverter;
39use MediaWiki\Rest\Handler;
40use MediaWiki\Rest\HttpException;
41use MediaWiki\Rest\LocalizedHttpException;
42use MediaWiki\Rest\ResponseInterface;
43use MediaWiki\Revision\RevisionAccessException;
44use MediaWiki\Revision\RevisionLookup;
45use MediaWiki\Revision\RevisionRecord;
46use MediaWiki\Status\Status;
47use Wikimedia\Bcp47Code\Bcp47Code;
48use Wikimedia\Message\MessageValue;
49use Wikimedia\ParamValidator\ParamValidator;
50use Wikimedia\Parsoid\Core\ClientError;
51use Wikimedia\Parsoid\Core\PageBundle;
52use Wikimedia\Parsoid\Core\ResourceLimitExceededException;
53use Wikimedia\Parsoid\Parsoid;
54use Wikimedia\Stats\StatsFactory;
55
56/**
57 * REST helper for converting HTML to page content source (e.g. wikitext).
58 *
59 * @since 1.40
60 *
61 * @unstable Pending consolidation of the Parsoid extension with core code.
62 */
63class HtmlInputTransformHelper {
64    /**
65     * @internal
66     */
67    public const CONSTRUCTOR_OPTIONS = [
68        MainConfigNames::ParsoidCacheConfig
69    ];
70
71    /** @var PageIdentity|null */
72    private $page = null;
73
74    /**
75     * @var HtmlToContentTransform
76     */
77    private $transform;
78
79    /**
80     * @var array
81     */
82    private $envOptions;
83
84    private StatsFactory $statsFactory;
85    private HtmlTransformFactory $htmlTransformFactory;
86    private ParsoidOutputStash $parsoidOutputStash;
87    private ParserOutputAccess $parserOutputAccess;
88    private PageLookup $pageLookup;
89    private RevisionLookup $revisionLookup;
90
/**
 * Store the collaborating services and, when a page is supplied, prepare
 * the HTML-to-content transform for it right away.
 *
 * Calling the constructor without a page is flagged as deprecated (since 1.43);
 * in that case the transform is not set up here.
 *
 * @param StatsFactory $statsFactory
 * @param HtmlTransformFactory $htmlTransformFactory
 * @param ParsoidOutputStash $parsoidOutputStash
 * @param ParserOutputAccess $parserOutputAccess
 * @param PageLookup $pageLookup
 * @param RevisionLookup $revisionLookup
 * @param array $envOptions Overrides for the default environment options
 *        ('outputContentVersion' and 'offsetType')
 * @param ?PageIdentity $page
 * @param array|string $body Body structure, or an HTML string
 * @param array $parameters
 * @param RevisionRecord|null $originalRevision
 * @param Bcp47Code|null $pageLanguage
 */
public function __construct(
    StatsFactory $statsFactory,
    HtmlTransformFactory $htmlTransformFactory,
    ParsoidOutputStash $parsoidOutputStash,
    ParserOutputAccess $parserOutputAccess,
    PageLookup $pageLookup,
    RevisionLookup $revisionLookup,
    array $envOptions = [],
    ?PageIdentity $page = null,
    $body = '',
    array $parameters = [],
    ?RevisionRecord $originalRevision = null,
    ?Bcp47Code $pageLanguage = null
) {
    // Injected services.
    $this->statsFactory = $statsFactory;
    $this->htmlTransformFactory = $htmlTransformFactory;
    $this->parsoidOutputStash = $parsoidOutputStash;

    // Caller-supplied options win; the defaults only fill in missing keys.
    $fallbackOptions = [
        'outputContentVersion' => Parsoid::defaultHTMLVersion(),
        'offsetType' => 'byte',
    ];
    $this->envOptions = $envOptions + $fallbackOptions;

    $this->parserOutputAccess = $parserOutputAccess;
    $this->pageLookup = $pageLookup;
    $this->revisionLookup = $revisionLookup;

    if ( $page !== null ) {
        $this->initInternal( $page, $body, $parameters, $originalRevision, $pageLanguage );
        return;
    }

    wfDeprecated( __METHOD__ . ' without $page', '1.43' );
}
135
/**
 * Describe the path and query parameters understood by this helper.
 *
 * Every parameter is optional and falls back to an "empty" default
 * (the empty string, or 0 for the revision ID).
 *
 * @return array[] Parameter settings, keyed by parameter name
 */
public function getParamSettings(): array {
    // JSON body schema:
    /*
    doc:
        properties:
            headers:
                type: array
                items:
                    type: string
            body:
                type: [ string, object ]
        required: [ body ]

    body:
        properties:
            offsetType:
                type: string
            revid:
                type: integer
            renderid:
                type: string
            etag:
                type: string
            html:
                type: [ doc, string ]
            data-mw:
                type: doc
            original:
                properties:
                    html:
                        type: doc
                    source:
                        type: doc
                    data-mw:
                        type: doc
                    data-parsoid:
                        type: doc
        required: [ html ]
     */

    // FUTURE: more params
    // - slot (for loading the base content)

    // All parameters share the same shape: optional, with a default value
    // and a localized description.
    $optional = static function ( string $source, string $type, $default, string $descKey ): array {
        return [
            Handler::PARAM_SOURCE => $source,
            ParamValidator::PARAM_TYPE => $type,
            ParamValidator::PARAM_DEFAULT => $default,
            ParamValidator::PARAM_REQUIRED => false,
            Handler::PARAM_DESCRIPTION => new MessageValue( $descKey ),
        ];
    };

    $settings = [];

    // XXX: should we really declare this here? Or should the endpoint do this?
    //      We are not reading this property...
    $settings['title'] = $optional( 'path', 'string', '', 'rest-param-desc-html-input-title' );

    // XXX: Needed for compatibility with the parsoid transform endpoint.
    //      But revid should just be part of the info about the original data
    //      in the body.
    $settings['oldid'] = $optional( 'path', 'int', 0, 'rest-param-desc-html-input-oldid' );

    // XXX: Supported for compatibility with the parsoid transform endpoint.
    //      If given, it should be 'html' or 'pagebundle'.
    $settings['from'] = $optional( 'path', 'string', '', 'rest-param-desc-html-input-from' );

    // XXX: Supported for compatibility with the parsoid transform endpoint.
    //      Ignored.
    $settings['format'] = $optional( 'path', 'string', '', 'rest-param-desc-html-input-format' );

    // XXX: get this from the Accept header?
    $settings['contentmodel'] = $optional(
        'query', 'string', '', 'rest-param-desc-html-input-contentmodel'
    );

    // TODO: get this from Accept-Language header?!
    $settings['language'] = $optional( 'query', 'string', '', 'rest-param-desc-html-input-language' );

    return $settings;
}
233
/**
 * Rewrite body and parameters in place so that input in the style of the
 * legacy endpoints ends up in the fields the rest of this class reads.
 *
 * @see ParsoidHandler::getRequestAttributes
 *
 * @param array<string,mixed> &$body
 * @param array<string,mixed> &$parameters
 *
 * @throws HttpException With status 400 if 'from' is given but is neither
 *         the HTML nor the page bundle format.
 *
 * @return void
 */
private static function normalizeParameters( array &$body, array &$parameters ) {
    // A positive revision ID from the path is treated as if it was given in the body.
    $pathRevId = (int)( $parameters['oldid'] ?? 0 );
    if ( $pathRevId > 0 ) {
        $body['original']['revid'] = $pathRevId;
    }

    // An etag in the body doubles as the render ID.
    // Note that the renderid field also accepts the ETag format.
    if ( !empty( $body['original']['etag'] ) ) {
        // @phan-suppress-next-line PhanTypeInvalidDimOffset false positive
        $body['original']['renderid'] = $body['original']['etag'];
    }

    // 'wikitext' is accepted as an alias for 'source'.
    if ( isset( $body['original']['wikitext'] ) ) {
        // @phan-suppress-next-line PhanTypeInvalidDimOffset false positive
        $body['original']['source'] = $body['original']['wikitext'];
        unset( $body['original']['wikitext'] );
    }

    // Without 'from', page bundle style input is accepted alongside full HTML.
    // With 'from', page bundle style input is only accepted for FORMAT_PAGEBUNDLE.
    $from = $parameters['from'] ?? '';
    if ( $from !== '' && $from !== ParsoidFormatHelper::FORMAT_PAGEBUNDLE ) {
        unset( $body['original']['data-parsoid']['body'] );
        unset( $body['original']['data-mw']['body'] );
        unset( $body['data-mw']['body'] );

        // The only other value allowed for 'from' is the HTML format.
        if ( $from !== ParsoidFormatHelper::FORMAT_HTML ) {
            throw new LocalizedHttpException(
                new MessageValue( "rest-unsupported-transform-input", [ $from ] ), 400
            );
        }
    }

    // The content model from the body takes precedence over the 'format' parameter.
    $model = $body['contentmodel'] ?? '';
    if ( $model === '' ) {
        $model = $parameters['format'] ?? '';
    }
    if ( $model !== '' ) {
        $parameters['contentmodel'] = $model;
    }
}
294
/**
 * Deprecated entry point for late initialisation: flags the call as
 * deprecated and forwards all arguments to initInternal().
 *
 * @param PageIdentity $page
 * @param array|string $body Body structure, or an HTML string
 * @param array $parameters
 * @param RevisionRecord|null $originalRevision
 * @param Bcp47Code|null $pageLanguage
 *
 * @throws HttpException
 * @deprecated since 1.43; pass arguments to constructor instead
 */
public function init(
    PageIdentity $page,
    $body,
    array $parameters,
    ?RevisionRecord $originalRevision = null,
    ?Bcp47Code $pageLanguage = null
) {
    wfDeprecated( __METHOD__, '1.43' );

    $this->initInternal(
        $page,
        $body,
        $parameters,
        $originalRevision,
        $pageLanguage
    );
}
315
/**
 * Prepare the HTML-to-content transform from the request body and parameters.
 *
 * A plain string body is treated as the HTML to convert. Body and parameters
 * are first passed through normalizeParameters(). Information about the
 * original revision and rendering, modified data-mw, the content language and
 * an original source override are handed to the transform when present.
 *
 * @param PageIdentity $page
 * @param array|string $body Body structure, or an HTML string
 * @param array $parameters
 * @param RevisionRecord|null $originalRevision
 * @param Bcp47Code|null $pageLanguage
 *
 * @throws HttpException With status 400 if the HTML to convert is missing from
 *         the body, or if the given render ID or ETag cannot be parsed.
 */
private function initInternal(
    PageIdentity $page,
    $body,
    array $parameters,
    ?RevisionRecord $originalRevision = null,
    ?Bcp47Code $pageLanguage = null
) {
    if ( is_string( $body ) ) {
        $body = [ 'html' => $body ];
    }

    self::normalizeParameters( $body, $parameters );

    $this->page = $page;

    // The HTML is either given directly, or wrapped in a document structure
    // that carries the actual markup under the 'body' key.
    $html = $body['html'] ?? null;
    if ( is_array( $html ) ) {
        $html = $html['body'] ?? null;
    }

    if ( $html === null ) {
        // A missing field is a mistake in the request, so report it with an
        // explicit 400 rather than relying on the exception's default status.
        throw new LocalizedHttpException(
            new MessageValue( "rest-missing-body-field", [ 'html' ] ),
            400
        );
    }

    // TODO: validate $body against a proper schema.
    $this->transform = $this->htmlTransformFactory->getHtmlToContentTransform(
        $html,
        $this->page
    );

    $this->transform->setMetrics( $this->statsFactory );

    // NOTE: Env::getContentModel will fall back to the page's recorded content model
    //       if none is set here.
    $this->transform->setOptions( [
        'contentmodel' => $parameters['contentmodel'] ?? null,
        'offsetType' => $body['offsetType'] ?? $this->envOptions['offsetType'],
    ] );

    $original = $body['original'] ?? [];
    $originalRendering = null;

    if ( !isset( $original['html'] ) && !empty( $original['renderid'] ) ) {
        // The original rendering is referenced by ID. Values that look like
        // a (possibly weak) ETag are parsed as such, anything else as a key.
        $key = $original['renderid'];
        if ( preg_match( '!^(W/)?".*"$!', $key ) ) {
            $originalRendering = ParsoidRenderID::newFromETag( $key );

            if ( !$originalRendering ) {
                throw new LocalizedHttpException( new MessageValue( "rest-bad-etag", [ $key ] ), 400 );
            }
        } else {
            try {
                $originalRendering = ParsoidRenderID::newFromKey( $key );
            } catch ( InvalidArgumentException $e ) {
                throw new LocalizedHttpException(
                    new MessageValue( 'rest-parsoid-bad-render-id', [ $key ] ),
                    400
                );
            }
        }
    } elseif ( !empty( $original['html'] ) || !empty( $original['data-parsoid'] ) ) {
        // NOTE: We might have an incomplete PageBundle here, with no HTML but with data-parsoid!
        // XXX: Do we need to support that, or can that just be a 400?
        $originalRendering = new PageBundle(
            $original['html']['body'] ?? '',
            $original['data-parsoid']['body'] ?? null,
            $original['data-mw']['body'] ?? null,
            null, // will be derived from $original['html']['headers']['content-type']
            $original['html']['headers'] ?? []
        );
    }

    // An explicitly passed revision takes precedence over the ID from the body.
    if ( !$originalRevision && !empty( $original['revid'] ) ) {
        $originalRevision = (int)$original['revid'];
    }

    if ( $originalRevision || $originalRendering ) {
        $this->setOriginal( $originalRevision, $originalRendering );
    } else {
        // Nothing is known about the original; only record that fact,
        // together with whether the page exists.
        $pageExists = $this->page->exists();
        $this->statsFactory
            ->getCounter( 'html_input_transform_total' )
            ->setLabel( 'original_html_given', 'false' )
            ->setLabel( 'page_exists', $pageExists ? 'true' : 'false' )
            ->setLabel( 'status', 'unknown' )
            ->copyToStatsdAt(
                $pageExists
                    ? 'html_input_transform.original_html.not_given.page_exists'
                    : 'html_input_transform.original_html.not_given.page_not_exist'
            )
            ->increment();
    }

    if ( isset( $body['data-mw']['body'] ) ) {
        $this->transform->setModifiedDataMW( $body['data-mw']['body'] );
    }

    // An explicitly passed language takes precedence over the 'language' parameter.
    if ( $pageLanguage ) {
        $this->transform->setContentLanguage( $pageLanguage );
    } elseif ( isset( $parameters['language'] ) && $parameters['language'] !== '' ) {
        $pageLanguage = LanguageCode::normalizeNonstandardCodeAndWarn(
            $parameters['language']
        );
        $this->transform->setContentLanguage( $pageLanguage );
    }

    if ( isset( $original['source']['body'] ) ) {
        // XXX: do we really have to support wikitext overrides?
        $this->transform->setOriginalText( $original['source']['body'] );
    }
}
438
/**
 * Expose the underlying transform object, so that callers can supply
 * additional context through its setters.
 *
 * @return HtmlToContentTransform
 */
public function getTransform(): HtmlToContentTransform {
    return $this->transform;
}
445
/**
 * Replace the metrics sink used by this helper and, if the transform has
 * already been created, by the transform as well.
 *
 * @param StatsFactory $statsFactory
 */
public function setMetrics( StatsFactory $statsFactory ) {
    $this->statsFactory = $statsFactory;

    if ( !$this->transform ) {
        // No transform yet, nothing to forward to.
        return;
    }

    $this->transform->setMetrics( $statsFactory );
}
456
/**
 * Tell the transform which revision and rendering the input HTML was
 * originally based on. This is used to apply selective serialization
 * (selser), if possible.
 *
 * The rendering may be given as a render ID (resolved via
 * fetchSelserContextFromStash()), as a ParserOutput or as a PageBundle.
 * If only a revision is given, a rendering for it is requested through
 * fetchParserOutputFromParsoid().
 *
 * @param RevisionRecord|int|null $rev
 * @param ParsoidRenderID|PageBundle|ParserOutput|null $originalRendering
 *
 * @throws HttpException With status 400 if resolving the render ID fails with
 *         an InvalidArgumentException, or 412 if no rendering is found for it.
 */
public function setOriginal( $rev, $originalRendering ) {
    if ( $originalRendering instanceof ParsoidRenderID ) {
        $renderId = $originalRendering;

        // The client referenced a specific rendering; resolve it to the original data.
        try {
            $selserContext = $this->fetchSelserContextFromStash( $renderId );
        } catch ( InvalidArgumentException $badKey ) {
            $this->statsFactory
                ->getCounter( 'html_input_transform_total' )
                ->setLabel( 'original_html_given', 'as_renderid' )
                ->setLabel( 'page_exists', 'unknown' )
                ->setLabel( 'status', 'bad_renderid' )
                ->copyToStatsdAt( 'html_input_transform.original_html.given.as_renderid.bad' )
                ->increment();

            $errorData = [
                'reason' => $badKey->getMessage(),
                'key' => (string)$renderId,
            ];
            throw new LocalizedHttpException( new MessageValue( "rest-bad-stash-key" ), 400, $errorData );
        }

        if ( !$selserContext ) {
            // NOTE: When the client asked for a specific stash key (resp. etag),
            //       we should fail with a 412 if we don't have the specific rendering.
            //       On the other hand, if the client only provided a base revision ID,
            //       we can re-parse and hope for the best.
            //
            // TODO: This class should provide getETag and getLastModified methods for use by
            //       the REST endpoint, to provide proper support for conditionals.
            //       However, that requires some refactoring of how HTTP conditional checks
            //       work in the Handler base class.
            throw new LocalizedHttpException(
                new MessageValue( "rest-no-stashed-content", [ $renderId->getKey() ] ), 412
            );
        }

        // Fall back to the revision recorded in the render ID.
        $rev = $rev ?: $renderId->getRevisionID();

        $originalRendering = $selserContext->getPageBundle();

        $originalContent = $selserContext->getContent();
        if ( $originalContent ) {
            $this->transform->setOriginalContent( $originalContent );
        }
    } elseif ( $originalRendering ) {
        // The rendering itself was handed to us; it is used as given.
        $this->statsFactory->getCounter( 'html_input_transform_total' )
            ->setLabel( 'original_html_given', 'true' )
            ->setLabel( 'page_exists', 'unknown' )
            ->setLabel( 'status', 'verbatim' )
            ->copyToStatsdAt( 'html_input_transform.original_html.given.verbatim' )
            ->increment();
    } elseif ( $rev ) {
        // The client provided a revision ID, but not stash key.
        // Try to get a rendering for the given revision, and use it as the basis for selser.
        // Chances are good that the resulting diff will be reasonably clean.
        // NOTE: If we don't have a revision ID, we should not attempt selser!
        $originalRendering = $this->fetchParserOutputFromParsoid( $this->page, $rev, true );

        $found = (bool)$originalRendering;
        $this->statsFactory->getCounter( 'html_input_transform_total' )
            ->setLabel( 'original_html_given', 'as_revid' )
            ->setLabel( 'page_exists', 'unknown' )
            ->setLabel( 'status', $found ? 'found' : 'not_found' )
            ->copyToStatsdAt(
                $found
                    ? 'html_input_transform.original_html.given.as_revid.found'
                    : 'html_input_transform.original_html.given.as_revid.not_found'
            )
            ->increment();
    }

    if ( $originalRendering instanceof ParserOutput ) {
        $originalRendering = PageBundleParserOutputConverter::pageBundleFromParserOutput( $originalRendering );

        // NOTE: Use the default if we got a ParserOutput object.
        //       Don't apply the default if we got passed a PageBundle,
        //       in that case, we want to require the version to be explicit.
        $hasContentType = isset( $originalRendering->headers['content-type'] );
        if ( $originalRendering->version === null && !$hasContentType ) {
            $originalRendering->version = Parsoid::defaultHTMLVersion();
        }
    }

    // From here on, only a page bundle is of any use.
    if ( !$originalRendering instanceof PageBundle ) {
        return;
    }

    $bundle = $originalRendering;

    // Prefer the explicit version; otherwise try to derive it from the content type header.
    if ( $bundle->version !== null ) {
        $this->transform->setOriginalSchemaVersion( $bundle->version );
    } elseif ( !empty( $bundle->headers['content-type'] ) ) {
        $headerVersion = ParsoidFormatHelper::parseContentTypeHeader(
            // @phan-suppress-next-line PhanTypeArraySuspiciousNullable Silly Phan, we just checked.
            $bundle->headers['content-type']
        );

        if ( $headerVersion ) {
            $this->transform->setOriginalSchemaVersion( $headerVersion );
        }
    }

    if ( $rev instanceof RevisionRecord ) {
        $this->transform->setOriginalRevision( $rev );
    } elseif ( is_int( $rev ) && $rev !== 0 ) {
        $this->transform->setOriginalRevisionId( $rev );
    }

    // NOTE: We might have an incomplete PageBundle here, with no HTML.
    //       PageBundle::$html is declared to not be nullable, so it would be set to the empty
    //       string if not given.
    if ( $bundle->html !== '' ) {
        $this->transform->setOriginalHtml( $bundle->html );
    }

    if ( $bundle->parsoid !== null ) {
        $this->transform->setOriginalDataParsoid( $bundle->parsoid );
    }

    if ( $bundle->mw !== null ) {
        $this->transform->setOriginalDataMW( $bundle->mw );
    }
}
594
/**
 * Run the transform and return the resulting content.
 *
 * @return Content the content derived from the input HTML.
 * @throws HttpException With status 400 for a ClientError or an unknown
 *         content model, and 413 if a resource limit was exceeded.
 */
public function getContent(): Content {
    try {
        $content = $this->transform->htmlToContent();
    } catch ( ClientError $clientError ) {
        $reason = $clientError->getMessage();
        throw new LocalizedHttpException(
            new MessageValue( 'rest-html-backend-error', [ $reason ] ),
            400,
            [ 'reason' => $reason ]
        );
    } catch ( ResourceLimitExceededException $limitExceeded ) {
        $reason = $limitExceeded->getMessage();
        throw new LocalizedHttpException(
            new MessageValue( 'rest-resource-limit-exceeded' ),
            413,
            [ 'reason' => $reason ]
        );
    } catch ( MWUnknownContentModelException $unknownModel ) {
        $modelId = $unknownModel->getModelId();
        throw new LocalizedHttpException(
            new MessageValue( "rest-unknown-content-model", [ $modelId ] ),
            400
        );
    }

    return $content;
}
621
/**
 * Write the content derived from the input HTML into the given response,
 * along with a matching Content-Type header.
 *
 * @param ResponseInterface $response
 */
public function putContent( ResponseInterface $response ) {
    $content = $this->getContent();
    $serialized = $content->serialize();

    try {
        $mimeType = ParsoidFormatHelper::getContentType(
            $content->getModel(),
            $this->envOptions['outputContentVersion']
        );
    } catch ( InvalidArgumentException $unknownToParsoid ) {
        // Parsoid has no content type for this model,
        // so use what the content object reports as its default format.
        $mimeType = $content->getDefaultFormat();
    }

    $response->setHeader( 'Content-Type', $mimeType );

    $stream = $response->getBody();
    $stream->write( $serialized );
}
644
/**
 * Obtain the Parsoid parser output for a revision of a page.
 *
 * The page is resolved to a page record and an integer revision to a
 * revision record first; the revision must belong to the page.
 *
 * @param PageIdentity $page
 * @param RevisionRecord|int $revision
 * @param bool $mayParse If true, the output is requested via getParserOutput();
 *        if false, only getCachedParserOutput() is consulted.
 *
 * @return ParserOutput|null
 * @throws HttpException With status 404 if the page or revision cannot be
 *         found or do not match; other failures are reported through
 *         throwHttpExceptionForStatus().
 */
private function fetchParserOutputFromParsoid( PageIdentity $page, $revision, bool $mayParse ): ?ParserOutput {
    $options = ParserOptions::newFromAnon();
    $options->setUseParsoid();

    try {
        if ( !$page instanceof PageRecord ) {
            // Remember the textual form for the error message before replacing $page.
            $pageName = (string)$page;
            $page = $this->pageLookup->getPageByReference( $page );
            if ( !$page ) {
                throw new RevisionAccessException(
                    'Page {name} not found',
                    [ 'name' => $pageName ]
                );
            }
        }

        if ( is_int( $revision ) ) {
            $requestedId = $revision;
            $revision = $this->revisionLookup->getRevisionById( $requestedId, 0, $page );

            if ( !$revision ) {
                throw new RevisionAccessException(
                    'Revision {revId} not found',
                    [ 'revId' => $requestedId ]
                );
            }
        }

        if ( $page->getId() !== $revision->getPageId() ) {
            throw new RevisionAccessException(
                'Revision {revId} does not belong to page {name}',
                [
                    'name' => $page->getDBkey(),
                    'revId' => $revision->getId(),
                ]
            );
        }

        if ( !$mayParse ) {
            // Cache lookup only.
            return $this->parserOutputAccess->getCachedParserOutput(
                $page, $options, $revision
            );
        }

        try {
            $status = $this->parserOutputAccess->getParserOutput(
                $page, $options, $revision
            );
        } catch ( ClientError $clientError ) {
            $status = Status::newFatal( 'parsoid-client-error', $clientError->getMessage() );
        } catch ( ResourceLimitExceededException $limitExceeded ) {
            $status = Status::newFatal( 'parsoid-resource-limit-exceeded', $limitExceeded->getMessage() );
        }

        if ( !$status->isOK() ) {
            $this->throwHttpExceptionForStatus( $status );
        }

        $output = $status->getValue();
    } catch ( RevisionAccessException $accessError ) {
        // The client supplied bad revision ID, or the revision was deleted or suppressed.
        throw new LocalizedHttpException(
            new MessageValue( "rest-specified-revision-unavailable" ),
            404,
            [ 'reason' => $accessError->getMessage() ]
        );
    }

    return $output;
}
714
715    /**
716     * @param ParsoidRenderID $renderID
717     *
718     * @return SelserContext|null
719     */
720    private function fetchSelserContextFromStash( $renderID ): ?SelserContext {
721        $selserContext = $this->parsoidOutputStash->get( $renderID );
722        $labels = [
723            'original_html_given' => 'as_renderid',
724            'page_exists' => 'unknown',
725            'status' => 'hit-stashed'
726        ];
727        $counter = $this->statsFactory->getCounter( 'html_input_transform_total' );
728        if ( $selserContext ) {
729            $counter->setLabels( $labels )
730                ->copyToStatsdAt( 'html_input_transform.original_html.given.as_renderid.stash_hit.found.hit' )
731                ->increment();
732            return $selserContext;
733        } else {
734            // Looks like the rendering is gone from stash (or the client send us a bogus key).
735            // Try to load it from the parser cache instead.
736            // On a wiki with low edit frequency, there is a good chance that it's still there.
737            try {
738                $parserOutput = $this->fetchParserOutputFromParsoid( $this->page, $renderID->getRevisionID(), false );
739
740                if ( !$parserOutput ) {
741                    $labels[ 'status' ] = 'miss-fallback_not_found';
742                    $counter->setLabels( $labels )->copyToStatsdAt(
743                        'html_input_transform.original_html.given.as_renderid.stash_miss_pc_fallback.not_found.miss'
744                    )->increment();
745                    return null;
746                }
747
748                $cachedRenderID = ParsoidRenderID::newFromParserOutput( $parserOutput );
749                if ( $cachedRenderID->getKey() !== $renderID->getKey() ) {
750                    $labels[ 'status' ] = 'mismatch-fallback_not_found';
751                    $counter->setLabels( $labels )
752                        ->copyToStatsdAt(
753                            'html_input_transform.original_html.given.as_renderid.' .
754                            'stash_miss_pc_fallback.not_found.mismatch'
755                        )
756                        ->increment();
757
758                    // It's not the correct rendering.
759                    return null;
760                }
761                $labels[ 'status' ] = 'hit-fallback_found';
762                $counter->setLabels( $labels )
763                    ->copyToStatsdAt(
764                        'html_input_transform.original_html.given.as_renderid.' .
765                        'stash_miss_pc_fallback.found.hit'
766                    )
767                    ->increment();
768
769                $pb = PageBundleParserOutputConverter::pageBundleFromParserOutput( $parserOutput );
770                return new SelserContext( $pb, $renderID->getRevisionID() );
771            } catch ( HttpException $e ) {
772                $labels[ 'status' ] = 'failed-fallback_not_found';
773                $counter->setLabels( $labels )
774                    ->copyToStatsdAt(
775                        'html_input_transform.original_html.given.as_renderid.' .
776                        'stash_miss_pc_fallback.not_found.failed'
777                    )
778                    ->increment();
779
780                // If the revision isn't found, don't trigger a 404. Return null to trigger a 412.
781                return null;
782            }
783        }
784    }
785
786    /**
787     * @param Status $status
788     *
789     * @return never
790     * @throws HttpException
791     */
792    private function throwHttpExceptionForStatus( Status $status ) {
793        // TODO: make this nicer.
794        if ( $status->hasMessage( 'parsoid-resource-limit-exceeded' ) ) {
795            throw new LocalizedHttpException( new MessageValue( "rest-parsoid-resource-exceeded" ),
796                413,
797                [ 'reason' => $status->getHTML() ]
798            );
799        } else {
800            throw new LocalizedHttpException( new MessageValue( "rest-parsoid-error" ),
801                400,
802                [ 'reason' => $status->getHTML() ]
803            );
804        }
805    }
806
807}