Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
66.57% covered (warning)
66.57%
229 / 344
15.38% covered (danger)
15.38%
2 / 13
CRAP
0.00% covered (danger)
0.00%
0 / 1
HtmlInputTransformHelper
66.57% covered (warning)
66.57%
229 / 344
15.38% covered (danger)
15.38%
2 / 13
392.62
0.00% covered (danger)
0.00%
0 / 1
 __construct
92.31% covered (success)
92.31%
12 / 13
0.00% covered (danger)
0.00%
0 / 1
2.00
 getParamSettings
0.00% covered (danger)
0.00%
0 / 44
0.00% covered (danger)
0.00%
0 / 1
2
 normalizeParameters
72.73% covered (warning)
72.73%
16 / 22
0.00% covered (danger)
0.00%
0 / 1
21.19
 init
0.00% covered (danger)
0.00%
0 / 2
0.00% covered (danger)
0.00%
0 / 1
2
 initInternal
77.78% covered (warning)
77.78%
49 / 63
0.00% covered (danger)
0.00%
0 / 1
24.39
 getTransform
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 setMetrics
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
6
 setOriginal
78.38% covered (warning)
78.38%
58 / 74
0.00% covered (danger)
0.00%
0 / 1
28.35
 getContent
72.22% covered (warning)
72.22%
13 / 18
0.00% covered (danger)
0.00%
0 / 1
4.34
 putContent
100.00% covered (success)
100.00%
10 / 10
100.00% covered (success)
100.00%
1 / 1
2
 fetchParserOutputFromParsoid
73.68% covered (warning)
73.68%
28 / 38
0.00% covered (danger)
0.00%
0 / 1
13.21
 fetchSelserContextFromStash
89.36% covered (warning)
89.36%
42 / 47
0.00% covered (danger)
0.00%
0 / 1
5.03
 throwHttpExceptionForStatus
0.00% covered (danger)
0.00%
0 / 9
0.00% covered (danger)
0.00%
0 / 1
6
1<?php
2/**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
17 *
18 * @file
19 */
20namespace MediaWiki\Rest\Handler\Helper;
21
22use InvalidArgumentException;
23use MediaWiki\Content\Content;
24use MediaWiki\Edit\ParsoidOutputStash;
25use MediaWiki\Edit\ParsoidRenderID;
26use MediaWiki\Edit\SelserContext;
27use MediaWiki\Language\LanguageCode;
28use MediaWiki\MainConfigNames;
29use MediaWiki\Page\PageIdentity;
30use MediaWiki\Page\PageLookup;
31use MediaWiki\Page\PageRecord;
32use MediaWiki\Page\ParserOutputAccess;
33use MediaWiki\Parser\ParserOptions;
34use MediaWiki\Parser\ParserOutput;
35use MediaWiki\Parser\Parsoid\HtmlToContentTransform;
36use MediaWiki\Parser\Parsoid\HtmlTransformFactory;
37use MediaWiki\Parser\Parsoid\PageBundleParserOutputConverter;
38use MediaWiki\Rest\Handler;
39use MediaWiki\Rest\HttpException;
40use MediaWiki\Rest\LocalizedHttpException;
41use MediaWiki\Rest\ResponseInterface;
42use MediaWiki\Revision\RevisionAccessException;
43use MediaWiki\Revision\RevisionLookup;
44use MediaWiki\Revision\RevisionRecord;
45use MediaWiki\Status\Status;
46use MWUnknownContentModelException;
47use Wikimedia\Bcp47Code\Bcp47Code;
48use Wikimedia\Message\MessageValue;
49use Wikimedia\ParamValidator\ParamValidator;
50use Wikimedia\Parsoid\Core\ClientError;
51use Wikimedia\Parsoid\Core\PageBundle;
52use Wikimedia\Parsoid\Core\ResourceLimitExceededException;
53use Wikimedia\Parsoid\Parsoid;
54use Wikimedia\Stats\StatsFactory;
55
56/**
57 * REST helper for converting HTML to page content source (e.g. wikitext).
58 *
59 * @since 1.40
60 *
61 * @unstable Pending consolidation of the Parsoid extension with core code.
62 */
63class HtmlInputTransformHelper {
64    /**
65     * @internal
66     */
67    public const CONSTRUCTOR_OPTIONS = [
68        MainConfigNames::ParsoidCacheConfig
69    ];
70
71    /** @var PageIdentity|null */
72    private $page = null;
73
74    /**
75     * @var HtmlToContentTransform
76     */
77    private $transform;
78
79    /**
80     * @var array
81     */
82    private $envOptions;
83
84    private StatsFactory $statsFactory;
85    private HtmlTransformFactory $htmlTransformFactory;
86    private ParsoidOutputStash $parsoidOutputStash;
87    private ParserOutputAccess $parserOutputAccess;
88    private PageLookup $pageLookup;
89    private RevisionLookup $revisionLookup;
90
91    /**
92     * @param StatsFactory $statsFactory
93     * @param HtmlTransformFactory $htmlTransformFactory
94     * @param ParsoidOutputStash $parsoidOutputStash
95     * @param ParserOutputAccess $parserOutputAccess
96     * @param PageLookup $pageLookup
97     * @param RevisionLookup $revisionLookup
98     * @param array $envOptions
99     * @param ?PageIdentity $page
100     * @param array|string $body Body structure, or an HTML string
101     * @param array $parameters
102     * @param RevisionRecord|null $originalRevision
103     * @param Bcp47Code|null $pageLanguage
104     */
105    public function __construct(
106        StatsFactory $statsFactory,
107        HtmlTransformFactory $htmlTransformFactory,
108        ParsoidOutputStash $parsoidOutputStash,
109        ParserOutputAccess $parserOutputAccess,
110        PageLookup $pageLookup,
111        RevisionLookup $revisionLookup,
112        array $envOptions = [],
113        ?PageIdentity $page = null,
114        $body = '',
115        array $parameters = [],
116        ?RevisionRecord $originalRevision = null,
117        ?Bcp47Code $pageLanguage = null
118    ) {
119        $this->statsFactory = $statsFactory;
120        $this->htmlTransformFactory = $htmlTransformFactory;
121        $this->parsoidOutputStash = $parsoidOutputStash;
122        $this->envOptions = $envOptions + [
123            'outputContentVersion' => Parsoid::defaultHTMLVersion(),
124            'offsetType' => 'byte',
125        ];
126        $this->parserOutputAccess = $parserOutputAccess;
127        $this->pageLookup = $pageLookup;
128        $this->revisionLookup = $revisionLookup;
129        if ( $page === null ) {
130            wfDeprecated( __METHOD__ . ' without $page', '1.43' );
131        } else {
132            $this->initInternal( $page, $body, $parameters, $originalRevision, $pageLanguage );
133        }
134    }
135
136    /**
137     * @return array
138     */
139    public function getParamSettings(): array {
140        // JSON body schema:
141        /*
142        doc:
143            properties:
144                headers:
145                    type: array
146                    items:
147                        type: string
148                body:
149                    type: [ string, object ]
150            required: [ body ]
151
152        body:
153            properties:
154                offsetType:
155                    type: string
156                revid:
157                    type: integer
158                renderid:
159                    type: string
160                etag:
161                    type: string
162                html:
163                    type: [ doc, string ]
164                data-mw:
165                    type: doc
166                original:
167                    properties:
168                        html:
169                            type: doc
170                        source:
171                            type: doc
172                        data-mw:
173                            type: doc
174                        data-parsoid:
175                            type: doc
176            required: [ html ]
177         */
178
179        // FUTURE: more params
180        // - slot (for loading the base content)
181
182        return [
183            // XXX: should we really declare this here? Or should end endpoint do this?
184            //      We are not reading this property...
185            'title' => [
186                Handler::PARAM_SOURCE => 'path',
187                ParamValidator::PARAM_TYPE => 'string',
188                ParamValidator::PARAM_DEFAULT => '',
189                ParamValidator::PARAM_REQUIRED => false,
190                Handler::PARAM_DESCRIPTION => new MessageValue( 'rest-param-desc-html-input-title' )
191            ],
192            // XXX: Needed for compatibility with the parsoid transform endpoint.
193            //      But revid should just be part of the info about the original data
194            //      in the body.
195            'oldid' => [
196                Handler::PARAM_SOURCE => 'path',
197                ParamValidator::PARAM_TYPE => 'int',
198                ParamValidator::PARAM_DEFAULT => 0,
199                ParamValidator::PARAM_REQUIRED => false,
200                Handler::PARAM_DESCRIPTION => new MessageValue( 'rest-param-desc-html-input-oldid' )
201            ],
202            // XXX: Supported for compatibility with the parsoid transform endpoint.
203            //      If given, it should be 'html' or 'pagebundle'.
204            'from' => [
205                Handler::PARAM_SOURCE => 'path',
206                ParamValidator::PARAM_TYPE => 'string',
207                ParamValidator::PARAM_DEFAULT => '',
208                ParamValidator::PARAM_REQUIRED => false,
209                Handler::PARAM_DESCRIPTION => new MessageValue( 'rest-param-desc-html-input-from' )
210            ],
211            // XXX: Supported for compatibility with the parsoid transform endpoint.
212            //      Ignored.
213            'format' => [
214                Handler::PARAM_SOURCE => 'path',
215                ParamValidator::PARAM_TYPE => 'string',
216                ParamValidator::PARAM_DEFAULT => '',
217                ParamValidator::PARAM_REQUIRED => false,
218                Handler::PARAM_DESCRIPTION => new MessageValue( 'rest-param-desc-html-input-format' )
219            ],
220            'contentmodel' => [ // XXX: get this from the Accept header?
221                Handler::PARAM_SOURCE => 'query',
222                ParamValidator::PARAM_TYPE => 'string',
223                ParamValidator::PARAM_DEFAULT => '',
224                ParamValidator::PARAM_REQUIRED => false,
225                Handler::PARAM_DESCRIPTION => new MessageValue( 'rest-param-desc-html-input-contentmodel' )
226            ],
227            'language' => [ // TODO: get this from Accept-Language header?!
228                Handler::PARAM_SOURCE => 'query',
229                ParamValidator::PARAM_TYPE => 'string',
230                ParamValidator::PARAM_DEFAULT => '',
231                ParamValidator::PARAM_REQUIRED => false,
232                Handler::PARAM_DESCRIPTION => new MessageValue( 'rest-param-desc-html-input-language' )
233            ]
234        ];
235    }
236
237    /**
238     * Modify body and parameters to provide compatibility with legacy endpoints.
239     *
240     * @see ParsoidHandler::getRequestAttributes
241     *
242     * @param array<string,mixed> &$body
243     * @param array<string,mixed> &$parameters
244     *
245     * @throws HttpException
246     *
247     * @return void
248     */
249    private static function normalizeParameters( array &$body, array &$parameters ) {
250        // If the revision ID is given in the path, pretend it was given in the body.
251        if ( isset( $parameters['oldid'] ) && (int)$parameters['oldid'] > 0 ) {
252            $body['original']['revid'] = (int)$parameters['oldid'];
253        }
254
255        // If an etag is given in the body, use it as the render ID.
256        // Note that we support ETag format in the renderid field.
257        if ( !empty( $body['original']['etag'] ) ) {
258            // @phan-suppress-next-line PhanTypeInvalidDimOffset false positive
259            $body['original']['renderid'] = $body['original']['etag'];
260        }
261
262        // Accept 'wikitext' as an alias for 'source'.
263        if ( isset( $body['original']['wikitext'] ) ) {
264            // @phan-suppress-next-line PhanTypeInvalidDimOffset false positive
265            $body['original']['source'] = $body['original']['wikitext'];
266            unset( $body['original']['wikitext'] );
267        }
268
269        // If 'from' is not set, we accept page bundle style input as well as full HTML.
270        // If 'from' is set, we only accept page bundle style input if it is set to FORMAT_PAGEBUNDLE.
271        if (
272            isset( $parameters['from'] ) && $parameters['from'] !== '' &&
273            $parameters['from'] !== ParsoidFormatHelper::FORMAT_PAGEBUNDLE
274        ) {
275            unset( $body['original']['data-parsoid']['body'] );
276            unset( $body['original']['data-mw']['body'] );
277            unset( $body['data-mw']['body'] );
278        }
279
280        // If 'from' is given, it must be html or pagebundle.
281        if (
282            isset( $parameters['from'] ) && $parameters['from'] !== '' &&
283            $parameters['from'] !== ParsoidFormatHelper::FORMAT_HTML &&
284            $parameters['from'] !== ParsoidFormatHelper::FORMAT_PAGEBUNDLE
285        ) {
286            throw new LocalizedHttpException(
287                new MessageValue( "rest-unsupported-transform-input", [ $parameters['from'] ] ), 400
288            );
289        }
290
291        if ( isset( $body['contentmodel'] ) && $body['contentmodel'] !== '' ) {
292            $parameters['contentmodel'] = $body['contentmodel'];
293        } elseif ( isset( $parameters['format'] ) && $parameters['format'] !== '' ) {
294            $parameters['contentmodel'] = $parameters['format'];
295        }
296    }
297
298    /**
299     * @param PageIdentity $page
300     * @param array|string $body Body structure, or an HTML string
301     * @param array $parameters
302     * @param RevisionRecord|null $originalRevision
303     * @param Bcp47Code|null $pageLanguage
304     *
305     * @throws HttpException
306     * @deprecated since 1.43; pass arguments to constructor instead
307     */
308    public function init(
309        PageIdentity $page,
310        $body,
311        array $parameters,
312        ?RevisionRecord $originalRevision = null,
313        ?Bcp47Code $pageLanguage = null
314    ) {
315        wfDeprecated( __METHOD__, '1.43' );
316        $this->initInternal( $page, $body, $parameters, $originalRevision, $pageLanguage );
317    }
318
319    /**
320     * @param PageIdentity $page
321     * @param array|string $body Body structure, or an HTML string
322     * @param array $parameters
323     * @param RevisionRecord|null $originalRevision
324     * @param Bcp47Code|null $pageLanguage
325     *
326     * @throws HttpException
327     */
328    private function initInternal(
329        PageIdentity $page,
330        $body,
331        array $parameters,
332        ?RevisionRecord $originalRevision = null,
333        ?Bcp47Code $pageLanguage = null
334    ) {
335        if ( is_string( $body ) ) {
336            $body = [ 'html' => $body ];
337        }
338
339        self::normalizeParameters( $body, $parameters );
340
341        $this->page = $page;
342
343        if ( !isset( $body['html'] ) ) {
344            throw new LocalizedHttpException( new MessageValue( "rest-missing-body-field", [ 'html' ] ) );
345        }
346
347        $html = is_array( $body['html'] ) ? $body['html']['body'] : $body['html'];
348
349        // TODO: validate $body against a proper schema.
350        $this->transform = $this->htmlTransformFactory->getHtmlToContentTransform(
351            $html,
352            $this->page
353        );
354
355        $this->transform->setMetrics( $this->statsFactory );
356
357        // NOTE: Env::getContentModel will fall back to the page's recorded content model
358        //       if none is set here.
359        $this->transform->setOptions( [
360            'contentmodel' => $parameters['contentmodel'] ?? null,
361            'offsetType' => $body['offsetType'] ?? $this->envOptions['offsetType'],
362        ] );
363
364        $original = $body['original'] ?? [];
365        $originalRendering = null;
366
367        if ( !isset( $original['html'] ) && !empty( $original['renderid'] ) ) {
368            $key = $original['renderid'];
369            if ( preg_match( '!^(W/)?".*"$!', $key ) ) {
370                $originalRendering = ParsoidRenderID::newFromETag( $key );
371
372                if ( !$originalRendering ) {
373                    throw new LocalizedHttpException( new MessageValue( "rest-bad-etag", [ $key ] ), 400 );
374                }
375            } else {
376                $originalRendering = ParsoidRenderID::newFromKey( $key );
377            }
378        } elseif ( !empty( $original['html'] ) || !empty( $original['data-parsoid'] ) ) {
379            // NOTE: We might have an incomplete PageBundle here, with no HTML but with data-parsoid!
380            // XXX: Do we need to support that, or can that just be a 400?
381            $originalRendering = new PageBundle(
382                $original['html']['body'] ?? '',
383                $original['data-parsoid']['body'] ?? null,
384                $original['data-mw']['body'] ?? null,
385                null, // will be derived from $original['html']['headers']['content-type']
386                $original['html']['headers'] ?? []
387            );
388        }
389
390        if ( !$originalRevision && !empty( $original['revid'] ) ) {
391            $originalRevision = (int)$original['revid'];
392        }
393
394        if ( $originalRevision || $originalRendering ) {
395            $this->setOriginal( $originalRevision, $originalRendering );
396        } else {
397            if ( $this->page->exists() ) {
398                $this->statsFactory
399                    ->getCounter( 'html_input_transform_total' )
400                    ->setLabel( 'original_html_given', 'false' )
401                    ->setLabel( 'page_exists', 'true' )
402                    ->setLabel( 'status', 'unknown' )
403                    ->copyToStatsdAt( 'html_input_transform.original_html.not_given.page_exists' )
404                    ->increment();
405            } else {
406                $this->statsFactory
407                    ->getCounter( 'html_input_transform_total' )
408                    ->setLabel( 'original_html_given', 'false' )
409                    ->setLabel( 'page_exists', 'false' )
410                    ->setLabel( 'status', 'unknown' )
411                    ->copyToStatsdAt( 'html_input_transform.original_html.not_given.page_not_exist' )
412                    ->increment();
413            }
414        }
415
416        if ( isset( $body['data-mw']['body'] ) ) {
417            $this->transform->setModifiedDataMW( $body['data-mw']['body'] );
418        }
419
420        if ( $pageLanguage ) {
421            $this->transform->setContentLanguage( $pageLanguage );
422        } elseif ( isset( $parameters['language'] ) && $parameters['language'] !== '' ) {
423            $pageLanguage = LanguageCode::normalizeNonstandardCodeAndWarn(
424                $parameters['language']
425            );
426            $this->transform->setContentLanguage( $pageLanguage );
427        }
428
429        if ( isset( $original['source']['body'] ) ) {
430            // XXX: do we really have to support wikitext overrides?
431            $this->transform->setOriginalText( $original['source']['body'] );
432        }
433    }
434
435    /**
436     * Return HTMLTransform object, so additional context can be provided by calling setters on it.
437     * @return HtmlToContentTransform
438     */
439    public function getTransform(): HtmlToContentTransform {
440        return $this->transform;
441    }
442
443    /**
444     * Set metrics sink.
445     */
446    public function setMetrics( StatsFactory $statsFactory ) {
447        $this->statsFactory = $statsFactory;
448
449        if ( $this->transform ) {
450            $this->transform->setMetrics( $statsFactory );
451        }
452    }
453
454    /**
455     * Supply information about the revision and rendering that was the original basis of
456     * the input HTML. This is used to apply selective serialization (selser), if possible.
457     *
458     * @param RevisionRecord|int|null $rev
459     * @param ParsoidRenderID|PageBundle|ParserOutput|null $originalRendering
460     */
461    public function setOriginal( $rev, $originalRendering ) {
462        if ( $originalRendering instanceof ParsoidRenderID ) {
463            $renderId = $originalRendering;
464
465            // If the client asked for a render ID, load original data from stash
466            try {
467                $selserContext = $this->fetchSelserContextFromStash( $renderId );
468            } catch ( InvalidArgumentException $ex ) {
469                $this->statsFactory
470                    ->getCounter( 'html_input_transform_total' )
471                    ->setLabel( 'original_html_given', 'as_renderid' )
472                    ->setLabel( 'page_exists', 'unknown' )
473                    ->setLabel( 'status', 'bad_renderid' )
474                    ->copyToStatsdAt( 'html_input_transform.original_html.given.as_renderid.bad' )
475                    ->increment();
476                throw new LocalizedHttpException( new MessageValue( "rest-bad-stash-key" ),
477                    400,
478                    [
479                        'reason' => $ex->getMessage(),
480                        'key' => "$renderId"
481                    ]
482                );
483            }
484
485            if ( !$selserContext ) {
486                // NOTE: When the client asked for a specific stash key (resp. etag),
487                //       we should fail with a 412 if we don't have the specific rendering.
488                //       On the other hand, of the client only provided a base revision ID,
489                //       we can re-parse and hope for the best.
490
491                throw new LocalizedHttpException(
492                    new MessageValue( "rest-no-stashed-content", [ $renderId->getKey() ] ), 412
493                );
494
495                // TODO: This class should provide getETag and getLastModified methods for use by
496                //       the REST endpoint, to provide proper support for conditionals.
497                //       However, that requires some refactoring of how HTTP conditional checks
498                //       work in the Handler base class.
499            }
500
501            if ( !$rev ) {
502                $rev = $renderId->getRevisionID();
503            }
504
505            $originalRendering = $selserContext->getPageBundle();
506            $content = $selserContext->getContent();
507
508            if ( $content ) {
509                $this->transform->setOriginalContent( $content );
510            }
511        } elseif ( !$originalRendering && $rev ) {
512            // The client provided a revision ID, but not stash key.
513            // Try to get a rendering for the given revision, and use it as the basis for selser.
514            // Chances are good that the resulting diff will be reasonably clean.
515            // NOTE: If we don't have a revision ID, we should not attempt selser!
516            $originalRendering = $this->fetchParserOutputFromParsoid( $this->page, $rev, true );
517
518            if ( $originalRendering ) {
519                $this->statsFactory->getCounter( 'html_input_transform_total' )
520                    ->setLabel( 'original_html_given', 'as_revid' )
521                    ->setLabel( 'page_exists', 'unknown' )
522                    ->setLabel( 'status', 'found' )
523                    ->copyToStatsdAt( 'html_input_transform.original_html.given.as_revid.found' )
524                    ->increment();
525            } else {
526                $this->statsFactory->getCounter( 'html_input_transform_total' )
527                    ->setLabel( 'original_html_given', 'as_revid' )
528                    ->setLabel( 'page_exists', 'unknown' )
529                    ->setLabel( 'status', 'not_found' )
530                    ->copyToStatsdAt( 'html_input_transform.original_html.given.as_revid.not_found' )
531                    ->increment();
532            }
533        } elseif ( $originalRendering ) {
534            $this->statsFactory->getCounter( 'html_input_transform_total' )
535                ->setLabel( 'original_html_given', 'true' )
536                ->setLabel( 'page_exists', 'unknown' )
537                ->setLabel( 'status', 'verbatim' )
538                ->copyToStatsdAt( 'html_input_transform.original_html.given.verbatim' )
539                ->increment();
540        }
541
542        if ( $originalRendering instanceof ParserOutput ) {
543            $originalRendering = PageBundleParserOutputConverter::pageBundleFromParserOutput( $originalRendering );
544
545            // NOTE: Use the default if we got a ParserOutput object.
546            //       Don't apply the default if we got passed a PageBundle,
547            //       in that case, we want to require the version to be explicit.
548            if ( $originalRendering->version === null && !isset( $originalRendering->headers['content-type'] ) ) {
549                $originalRendering->version = Parsoid::defaultHTMLVersion();
550            }
551        }
552
553        if ( !$originalRendering instanceof PageBundle ) {
554            return;
555        }
556
557        if ( $originalRendering->version !== null ) {
558            $this->transform->setOriginalSchemaVersion( $originalRendering->version );
559        } elseif ( !empty( $originalRendering->headers['content-type'] ) ) {
560            $vOriginal = ParsoidFormatHelper::parseContentTypeHeader(
561                // @phan-suppress-next-line PhanTypeArraySuspiciousNullable Silly Phan, we just checked.
562                $originalRendering->headers['content-type']
563            );
564
565            if ( $vOriginal ) {
566                $this->transform->setOriginalSchemaVersion( $vOriginal );
567            }
568        }
569
570        if ( $rev instanceof RevisionRecord ) {
571            $this->transform->setOriginalRevision( $rev );
572        } elseif ( $rev && is_int( $rev ) ) {
573            $this->transform->setOriginalRevisionId( $rev );
574        }
575
576        // NOTE: We might have an incomplete PageBundle here, with no HTML.
577        //       PageBundle::$html is declared to not be nullable, so it would be set to the empty
578        //       string if not given.
579        if ( $originalRendering->html !== '' ) {
580            $this->transform->setOriginalHtml( $originalRendering->html );
581        }
582
583        if ( $originalRendering->parsoid !== null ) {
584            $this->transform->setOriginalDataParsoid( $originalRendering->parsoid );
585        }
586
587        if ( $originalRendering->mw !== null ) {
588            $this->transform->setOriginalDataMW( $originalRendering->mw );
589        }
590    }
591
592    /**
593     * @return Content the content derived from the input HTML.
594     * @throws HttpException
595     */
596    public function getContent(): Content {
597        try {
598            return $this->transform->htmlToContent();
599        } catch ( ClientError $e ) {
600            throw new LocalizedHttpException(
601                new MessageValue( 'rest-html-backend-error', [ $e->getMessage() ] ),
602                400,
603                [ 'reason' => $e->getMessage() ]
604            );
605        } catch ( ResourceLimitExceededException $e ) {
606            throw new LocalizedHttpException(
607                new MessageValue( 'rest-resource-limit-exceeded' ),
608                413,
609                [ 'reason' => $e->getMessage() ]
610            );
611        } catch ( MWUnknownContentModelException $e ) {
612            throw new LocalizedHttpException(
613                new MessageValue( "rest-unknown-content-model", [ $e->getModelId() ] ),
614                400
615            );
616        }
617    }
618
619    /**
620     * Creates a response containing the content derived from the input HTML.
621     * This will set the appropriate Content-Type header.
622     *
623     * @param ResponseInterface $response
624     */
625    public function putContent( ResponseInterface $response ) {
626        $content = $this->getContent();
627        $data = $content->serialize();
628
629        try {
630            $contentType = ParsoidFormatHelper::getContentType(
631                $content->getModel(),
632                $this->envOptions['outputContentVersion']
633            );
634        } catch ( InvalidArgumentException $e ) {
635            // If Parsoid doesn't know the content type,
636            // ask the ContentHandler!
637            $contentType = $content->getDefaultFormat();
638        }
639
640        $response->setHeader( 'Content-Type', $contentType );
641        $response->getBody()->write( $data );
642    }
643
644    /**
645     * @param PageIdentity $page
646     * @param RevisionRecord|int $revision
647     * @param bool $mayParse
648     *
649     * @return ParserOutput|null
650     * @throws HttpException
651     */
652    private function fetchParserOutputFromParsoid( PageIdentity $page, $revision, bool $mayParse ): ?ParserOutput {
653        $parserOptions = ParserOptions::newFromAnon();
654        $parserOptions->setUseParsoid();
655
656        try {
657            if ( !$page instanceof PageRecord ) {
658                $name = "$page";
659                $page = $this->pageLookup->getPageByReference( $page );
660                if ( !$page ) {
661                    throw new RevisionAccessException( 'Page {name} not found',
662                        [ 'name' => $name ] );
663                }
664            }
665
666            if ( is_int( $revision ) ) {
667                $revId = $revision;
668                $revision = $this->revisionLookup->getRevisionById( $revId, 0, $page );
669
670                if ( !$revision ) {
671                    throw new RevisionAccessException( 'Revision {revId} not found',
672                        [ 'revId' => $revId ] );
673                }
674            }
675
676            if ( $page->getId() !== $revision->getPageId() ) {
677                throw new RevisionAccessException( 'Revision {revId} does not belong to page {name}',
678                    [ 'name' => $page->getDBkey(),
679                        'revId' => $revision->getId() ] );
680            }
681
682            if ( $mayParse ) {
683                try {
684                    $status = $this->parserOutputAccess->getParserOutput(
685                        $page, $parserOptions, $revision
686                    );
687                } catch ( ClientError $e ) {
688                    $status = Status::newFatal( 'parsoid-client-error', $e->getMessage() );
689                } catch ( ResourceLimitExceededException $e ) {
690                    $status = Status::newFatal( 'parsoid-resource-limit-exceeded', $e->getMessage() );
691                }
692
693                if ( !$status->isOK() ) {
694                    $this->throwHttpExceptionForStatus( $status );
695                }
696
697                $parserOutput = $status->getValue();
698            } else {
699                $parserOutput = $this->parserOutputAccess->getCachedParserOutput(
700                    $page, $parserOptions, $revision
701                );
702            }
703        } catch ( RevisionAccessException $e ) {
704            // The client supplied bad revision ID, or the revision was deleted or suppressed.
705            throw new LocalizedHttpException( new MessageValue( "rest-specified-revision-unavailable" ),
706                404,
707                [ 'reason' => $e->getMessage() ]
708            );
709        }
710
711        return $parserOutput;
712    }
713
714    /**
715     * @param ParsoidRenderID $renderID
716     *
717     * @return SelserContext|null
718     */
719    private function fetchSelserContextFromStash( $renderID ): ?SelserContext {
720        $selserContext = $this->parsoidOutputStash->get( $renderID );
721        $labels = [
722            'original_html_given' => 'as_renderid',
723            'page_exists' => 'unknown',
724            'status' => 'hit-stashed'
725        ];
726        $counter = $this->statsFactory->getCounter( 'html_input_transform_total' );
727        if ( $selserContext ) {
728            $counter->setLabels( $labels )
729                ->copyToStatsdAt( 'html_input_transform.original_html.given.as_renderid.stash_hit.found.hit' )
730                ->increment();
731            return $selserContext;
732        } else {
733            // Looks like the rendering is gone from stash (or the client send us a bogus key).
734            // Try to load it from the parser cache instead.
735            // On a wiki with low edit frequency, there is a good chance that it's still there.
736            try {
737                $parserOutput = $this->fetchParserOutputFromParsoid( $this->page, $renderID->getRevisionID(), false );
738
739                if ( !$parserOutput ) {
740                    $labels[ 'status' ] = 'miss-fallback_not_found';
741                    $counter->setLabels( $labels )->copyToStatsdAt(
742                        'html_input_transform.original_html.given.as_renderid.stash_miss_pc_fallback.not_found.miss'
743                    )->increment();
744                    return null;
745                }
746
747                $cachedRenderID = ParsoidRenderID::newFromParserOutput( $parserOutput );
748                if ( $cachedRenderID->getKey() !== $renderID->getKey() ) {
749                    $labels[ 'status' ] = 'mismatch-fallback_not_found';
750                    $counter->setLabels( $labels )
751                        ->copyToStatsdAt(
752                            'html_input_transform.original_html.given.as_renderid.' .
753                            'stash_miss_pc_fallback.not_found.mismatch'
754                        )
755                        ->increment();
756
757                    // It's not the correct rendering.
758                    return null;
759                }
760                $labels[ 'status' ] = 'hit-fallback_found';
761                $counter->setLabels( $labels )
762                    ->copyToStatsdAt(
763                        'html_input_transform.original_html.given.as_renderid.' .
764                        'stash_miss_pc_fallback.found.hit'
765                    )
766                    ->increment();
767
768                $pb = PageBundleParserOutputConverter::pageBundleFromParserOutput( $parserOutput );
769                return new SelserContext( $pb, $renderID->getRevisionID() );
770            } catch ( HttpException $e ) {
771                $labels[ 'status' ] = 'failed-fallback_not_found';
772                $counter->setLabels( $labels )
773                    ->copyToStatsdAt(
774                        'html_input_transform.original_html.given.as_renderid.' .
775                        'stash_miss_pc_fallback.not_found.failed'
776                    )
777                    ->increment();
778
779                // If the revision isn't found, don't trigger a 404. Return null to trigger a 412.
780                return null;
781            }
782        }
783    }
784
785    /**
786     * @param Status $status
787     *
788     * @return never
789     * @throws HttpException
790     */
791    private function throwHttpExceptionForStatus( Status $status ) {
792        // TODO: make this nicer.
793        if ( $status->hasMessage( 'parsoid-resource-limit-exceeded' ) ) {
794            throw new LocalizedHttpException( new MessageValue( "rest-parsoid-resource-exceeded" ),
795                413,
796                [ 'reason' => $status->getHTML() ]
797            );
798        } else {
799            throw new LocalizedHttpException( new MessageValue( "rest-parsoid-error" ),
800                400,
801                [ 'reason' => $status->getHTML() ]
802            );
803        }
804    }
805
806}