Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
39.88% covered (danger)
39.88%
195 / 489
12.00% covered (danger)
12.00%
3 / 25
CRAP
0.00% covered (danger)
0.00%
0 / 1
ParsoidHandler
39.88% covered (danger)
39.88%
195 / 489
12.00% covered (danger)
12.00%
3 / 25
3520.75
0.00% covered (danger)
0.00%
0 / 1
 factory
0.00% covered (danger)
0.00%
0 / 7
0.00% covered (danger)
0.00%
0 / 1
2
 __construct
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
1
 getSupportedRequestTypes
0.00% covered (danger)
0.00%
0 / 4
0.00% covered (danger)
0.00%
0 / 1
2
 assertDomainIsCorrect
0.00% covered (danger)
0.00%
0 / 24
0.00% covered (danger)
0.00%
0 / 1
42
 getParsedBody
0.00% covered (danger)
0.00%
0 / 15
0.00% covered (danger)
0.00%
0 / 1
42
 getOpts
0.00% covered (danger)
0.00%
0 / 4
0.00% covered (danger)
0.00%
0 / 1
2
 getRequestAttributes
0.00% covered (danger)
0.00%
0 / 87
0.00% covered (danger)
0.00%
0 / 1
462
 getHtmlOutputRendererHelper
95.00% covered (success)
95.00%
19 / 20
0.00% covered (danger)
0.00%
0 / 1
6
 getHtmlInputTransformHelper
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
1
 acceptable
0.00% covered (danger)
0.00%
0 / 30
0.00% covered (danger)
0.00%
0 / 1
272
 tryToCreatePageConfig
91.89% covered (success)
91.89%
34 / 37
0.00% covered (danger)
0.00%
0 / 1
7.03
 tryToCreatePageIdentity
55.56% covered (warning)
55.56%
5 / 9
0.00% covered (danger)
0.00%
0 / 1
3.79
 getTransformEndpoint
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getPageContentEndpoint
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
6
 getRevisionContentEndpoint
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
6
 wtLint
53.85% covered (warning)
53.85%
7 / 13
0.00% covered (danger)
0.00%
0 / 1
3.88
 wt2html
78.16% covered (warning)
78.16%
68 / 87
0.00% covered (danger)
0.00%
0 / 1
30.00
 newParsoid
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 parseHTML
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 html2wt
87.50% covered (warning)
87.50%
7 / 8
0.00% covered (danger)
0.00%
0 / 1
3.02
 pb2pb
0.00% covered (danger)
0.00%
0 / 46
0.00% covered (danger)
0.00%
0 / 1
90
 updateRedLinks
0.00% covered (danger)
0.00%
0 / 17
0.00% covered (danger)
0.00%
0 / 1
2
 languageConversion
81.08% covered (warning)
81.08%
30 / 37
0.00% covered (danger)
0.00%
0 / 1
4.11
 execute
n/a
0 / 0
n/a
0 / 0
0
 validatePb
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
6
 pageConfigToPageIdentity
44.44% covered (danger)
44.44%
4 / 9
0.00% covered (danger)
0.00%
0 / 1
2.69
1<?php
2/**
3 * Copyright (C) 2011-2020 Wikimedia Foundation and others.
4 *
5 * @license GPL-2.0-or-later
6 */
7
8namespace MediaWiki\Rest\Handler;
9
10use Composer\Semver\Semver;
11use InvalidArgumentException;
12use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface;
13use LogicException;
14use MediaWiki\Content\WikitextContent;
15use MediaWiki\Context\RequestContext;
16use MediaWiki\Language\LanguageCode;
17use MediaWiki\Logger\LoggerFactory;
18use MediaWiki\MainConfigNames;
19use MediaWiki\MediaWikiServices;
20use MediaWiki\Page\PageIdentity;
21use MediaWiki\Page\ProperPageIdentity;
22use MediaWiki\Parser\ParserOptions;
23use MediaWiki\Parser\ParserOutput;
24use MediaWiki\Parser\Parsoid\Config\SiteConfig;
25use MediaWiki\Registration\ExtensionRegistry;
26use MediaWiki\Rest\Handler;
27use MediaWiki\Rest\Handler\Helper\HtmlInputTransformHelper;
28use MediaWiki\Rest\Handler\Helper\HtmlOutputRendererHelper;
29use MediaWiki\Rest\Handler\Helper\ParsoidFormatHelper;
30use MediaWiki\Rest\HttpException;
31use MediaWiki\Rest\LocalizedHttpException;
32use MediaWiki\Rest\RequestInterface;
33use MediaWiki\Rest\Response;
34use MediaWiki\Rest\ResponseHeaders;
35use MediaWiki\Revision\MutableRevisionRecord;
36use MediaWiki\Revision\RevisionAccessException;
37use MediaWiki\Revision\RevisionLookup;
38use MediaWiki\Revision\SlotRecord;
39use MediaWiki\Revision\SuppressedDataException;
40use MediaWiki\Title\MalformedTitleException;
41use MediaWiki\Title\Title;
42use MediaWiki\WikiMap\WikiMap;
43use MobileContext;
44use Wikimedia\Http\HttpAcceptParser;
45use Wikimedia\Message\DataMessageValue;
46use Wikimedia\Message\MessageValue;
47use Wikimedia\Parsoid\Config\DataAccess;
48use Wikimedia\Parsoid\Config\PageConfig;
49use Wikimedia\Parsoid\Config\PageConfigFactory;
50use Wikimedia\Parsoid\Core\ClientError;
51use Wikimedia\Parsoid\Core\HtmlPageBundle;
52use Wikimedia\Parsoid\Core\ResourceLimitExceededException;
53use Wikimedia\Parsoid\DOM\Document;
54use Wikimedia\Parsoid\Parsoid;
55use Wikimedia\Parsoid\Utils\ContentUtils;
56use Wikimedia\Parsoid\Utils\DOMCompat;
57use Wikimedia\Parsoid\Utils\DOMUtils;
58use Wikimedia\Parsoid\Utils\Timing;
59
60// TODO logging, timeouts(?), CORS
61// TODO content negotiation (routes.js routes.acceptable)
62// TODO handle MaxConcurrentCallsError (pool counter?)
63
64/**
65 * Base class for Parsoid handlers.
66 * @internal For use by the Parsoid extension
67 */
68abstract class ParsoidHandler extends Handler {
69
70    private RevisionLookup $revisionLookup;
71    protected SiteConfig $siteConfig;
72    protected PageConfigFactory $pageConfigFactory;
73    protected DataAccess $dataAccess;
74
75    /** @var ExtensionRegistry */
76    protected $extensionRegistry;
77
78    /** @var ?StatsdDataFactoryInterface A statistics aggregator */
79    protected $metrics;
80
81    /** @var array */
82    private $requestAttributes;
83
/**
 * Create a handler instance, pulling the revision lookup and the Parsoid
 * site config, page config factory and data access objects from the
 * global service container.
 *
 * @return static
 */
public static function factory(): static {
    $container = MediaWikiServices::getInstance();
    $revisionLookup = $container->getRevisionLookup();
    $siteConfig = $container->getParsoidSiteConfig();
    $pageConfigFactory = $container->getParsoidPageConfigFactory();
    $dataAccess = $container->getParsoidDataAccess();

    // @phan-suppress-next-line PhanTypeInstantiateAbstractStatic
    return new static( $revisionLookup, $siteConfig, $pageConfigFactory, $dataAccess );
}
94
/**
 * @param RevisionLookup $revisionLookup Used to look up revisions by id
 * @param SiteConfig $siteConfig Parsoid site configuration; also the source of
 *   the metrics object stored on the handler
 * @param PageConfigFactory $pageConfigFactory Factory used to build PageConfig objects
 * @param DataAccess $dataAccess Parsoid data access object
 */
public function __construct(
    RevisionLookup $revisionLookup,
    SiteConfig $siteConfig,
    PageConfigFactory $pageConfigFactory,
    DataAccess $dataAccess
) {
    // Injected collaborators
    $this->dataAccess = $dataAccess;
    $this->pageConfigFactory = $pageConfigFactory;
    $this->siteConfig = $siteConfig;
    $this->revisionLookup = $revisionLookup;

    // Not injected: taken from the global registry and from the site config
    $this->extensionRegistry = ExtensionRegistry::getInstance();
    $this->metrics = $siteConfig->metrics();
}
108
/**
 * Extend the request content types supported by the parent handler with
 * the two form encodings.
 *
 * @return array
 */
public function getSupportedRequestTypes(): array {
    $formTypes = [
        'application/x-www-form-urlencoded',
        'multipart/form-data',
    ];

    return array_merge( parent::getSupportedRequestTypes(), $formTypes );
}
115
/**
 * Check the {domain} path parameter against the host of the configured server URL.
 *
 * The comparison is case-insensitive. If the MobileFrontend extension is loaded,
 * the host of the mobile URL derived from the server URL is accepted as well.
 *
 * @todo Remove this when we no longer need to support the {domain}
 *       parameter with backwards compatibility with the parsoid
 *       extension.
 * @param string $domain Domain name parameter to validate
 * @throws LogicException If no host can be extracted from the server URL
 * @throws LocalizedHttpException With status 400 if the domain matches neither host
 */
protected function assertDomainIsCorrect( $domain ): void {
    // We are cutting some corners here (IDN, non-ASCII casing)
    // since domain name support is provisional.
    // TODO use a proper validator instead
    $serverUrl = RequestContext::getMain()->getConfig()->get( MainConfigNames::Server );
    $wikiHost = parse_url( $serverUrl, PHP_URL_HOST );
    if ( !$wikiHost ) {
        throw new LogicException( 'Cannot parse $wgServer' );
    }

    $domainMatches = ( strcasecmp( $wikiHost, $domain ) === 0 );

    // TODO: This should really go away! It's only acceptable because
    //       this entire method is going to be removed once we no longer
    //       need the parsoid extension endpoints with the {domain} parameter.
    if ( !$domainMatches && $this->extensionRegistry->isLoaded( 'MobileFrontend' ) ) {
        // @phan-suppress-next-line PhanUndeclaredClassMethod
        $mobileUrl = MobileContext::singleton()->getMobileUrl( $serverUrl );
        $mobileHost = parse_url( $mobileUrl, PHP_URL_HOST );
        $domainMatches = ( $mobileHost && strcasecmp( $mobileHost, $domain ) === 0 );
    }

    if ( $domainMatches ) {
        return;
    }

    $failure = new DataMessageValue(
        'mwparsoid-invalid-domain',
        [],
        'invalid-domain',
        [ 'expected' => $wikiHost, 'actual' => $domain, ]
    );
    $errorData = [
        'error' => 'parameter-validation-failed',
        'name' => 'domain',
        'value' => $domain,
        'failureCode' => $failure->getCode(),
        'failureData' => $failure->getData(),
    ];

    throw new LocalizedHttpException( $failure, 400, $errorData );
}
163
/**
 * Get the parsed body by content-type.
 *
 * Only the media type of the first Content-Type header is considered; any
 * parameters after the first ";" (such as a charset) are ignored. The media
 * type is trimmed and lower-cased before matching, because media types are
 * case-insensitive and optional whitespace may precede the ";".
 *
 * @return array The POST parameters for form-encoded bodies, or the decoded
 *   JSON for application/json bodies
 * @throws LocalizedHttpException With status 400 if a JSON body does not decode
 *   to an array, or status 415 if the content type is missing or unsupported
 */
protected function getParsedBody(): array {
    $request = $this->getRequest();
    [ $rawType ] = explode( ';', $request->getHeader( 'Content-Type' )[0] ?? '', 2 );
    $contentType = strtolower( trim( $rawType ) );

    switch ( $contentType ) {
        case 'application/x-www-form-urlencoded':
        case 'multipart/form-data':
            return $request->getPostParams();
        case 'application/json':
            $json = json_decode( $request->getBody()->getContents(), true );
            if ( !is_array( $json ) ) {
                throw new LocalizedHttpException(
                    new MessageValue( "rest-json-body-parse-error", [ 'not a valid JSON object' ] ), 400 );
            }
            return $json;
        default:
            // explode() never yields null, so test for the empty string to
            // report a missing Content-Type header as "(null)".
            $reportedType = ( $contentType !== '' ) ? $contentType : '(null)';
            throw new LocalizedHttpException(
                new MessageValue( "rest-unsupported-content-type", [ $reportedType ] ),
                415
            );
    }
}
188
/**
 * Combine the parsed request body with the 'from' and 'format' path parameters.
 * On a key collision the path parameter wins over the body value.
 *
 * @param array $body Parsed request body
 * @param RequestInterface $request
 * @return array
 */
protected function getOpts( array $body, RequestInterface $request ): array {
    $allowedPathParams = [ 'from' => true, 'format' => true ];
    $pathOpts = array_intersect_key( $request->getPathParams(), $allowedPathParams );

    return array_merge( $body, $pathOpts );
}
195
/**
 * Rough equivalent of req.local from Parsoid-JS.
 * FIXME most of these should be replaced with more native ways of handling the request.
 *
 * The attributes are built from the path parameters, query parameters, headers
 * and (for POST requests) the parsed body. They are stored on the handler, and
 * a non-empty stored array is returned as-is on later calls.
 *
 * @return array The attribute array, returned by reference
 */
protected function &getRequestAttributes(): array {
    if ( $this->requestAttributes ) {
        return $this->requestAttributes;
    }

    $request = $this->getRequest();
    $isPost = ( $request->getMethod() === 'POST' );
    $body = $isPost ? $this->getParsedBody() : [];
    $opts = $this->getOpts( $body, $request );
    '@phan-var array<string,array|bool|string> $opts'; // @var array<string,array|bool|string> $opts
    $queryParams = $request->getQueryParams();

    $pageLanguage = $request->getHeaderLine( 'Content-Language' ) ?: null;
    if ( $pageLanguage ) {
        $pageLanguage = LanguageCode::normalizeNonstandardCodeAndWarn( $pageLanguage );
    }

    $attribs = [
        'pageName' => $request->getPathParam( 'title' ) ?? '',
        'oldid' => $request->getPathParam( 'revision' ),
        // "body_only" flag to return just the body (instead of the entire HTML doc)
        // We would like to deprecate use of this flag: T181657
        'body_only' => $queryParams['body_only'] ?? $body['body_only'] ?? null,
        'errorEnc' => ParsoidFormatHelper::ERROR_ENCODING[$opts['format']] ?? 'plain',
        'iwp' => WikiMap::getCurrentWikiId(), // PORT-FIXME verify
        'offsetType' => $body['offsetType']
            ?? $queryParams['offsetType']
            // Lint requests should return UCS2 offsets by default
            ?? ( $opts['format'] === ParsoidFormatHelper::FORMAT_LINT ? 'ucs2' : 'byte' ),
        'pagelanguage' => $pageLanguage,
    ];

    // For use in getHtmlOutputRendererHelper
    $opts['stash'] = $queryParams['stash'] ?? false;

    // For POST requests, the "original" section of the body overrides the path parameters
    if ( $isPost ) {
        $attribs['oldid'] = $opts['original']['revid'] ?? $attribs['oldid'];
        $attribs['pageName'] = $opts['original']['title'] ?? $attribs['pageName'];
    }

    // An empty revision id means "no revision"; anything else non-null is cast to int
    $rawOldid = $attribs['oldid'];
    if ( $rawOldid === '' ) {
        $attribs['oldid'] = null;
    } elseif ( $rawOldid !== null ) {
        $attribs['oldid'] = (int)$rawOldid;
    }

    // For use in getHtmlOutputRendererHelper
    $opts['accept-language'] = $request->getHeaderLine( 'Accept-Language' ) ?: null;

    $htmlVariantLanguage = ( $opts['accept-language'] === null )
        ? null
        : LanguageCode::normalizeNonstandardCodeAndWarn(
            HtmlOutputRendererHelper::getAcceptedTargetLanguage( $opts['accept-language'] )
        );

    // Init pageName if oldid is provided and is a valid revision
    if ( $attribs['pageName'] === '' && $attribs['oldid'] ) {
        $rev = $this->revisionLookup->getRevisionById( $attribs['oldid'] );
        if ( $rev ) {
            $attribs['pageName'] = $rev->getPage()->getDBkey();
        }
    }

    $attribs['envOptions'] = [
        // We use `prefix` but ought to use `domain` (T206764)
        'prefix' => $attribs['iwp'],
        // For the legacy "domain" path parameter used by the endpoints exposed
        // by the parsoid extension. Will be null for core endpoints.
        'domain' => $request->getPathParam( 'domain' ),
        'pageName' => $attribs['pageName'],
        'cookie' => $request->getHeaderLine( 'Cookie' ),
        'reqId' => $request->getHeaderLine( 'X-Request-Id' ),
        'userAgent' => $request->getHeaderLine( 'User-Agent' ),
        // Used in pb2pb variant updates and wtLint
        'htmlVariantLanguage' => $htmlVariantLanguage,
        // Semver::satisfies checks below expect a valid outputContentVersion value.
        // Better to set it here instead of adding the default value at every check.
        'outputContentVersion' => Parsoid::defaultHTMLVersion(),
    ];

    # Convert language codes in $opts['updates']['variant'] if present.
    # 'source' is folded into 'wikitext' and 'target' into 'html'; a value
    # already present under the latter key takes precedence.
    foreach ( [ 'wikitext' => 'source', 'html' => 'target' ] as $variantKey => $aliasKey ) {
        $variant = $opts['updates']['variant'][$variantKey] ??
            $opts['updates']['variant'][$aliasKey] ?? null;
        if ( $variant ) {
            $variant = LanguageCode::normalizeNonstandardCodeAndWarn( $variant );
            unset( $opts['updates']['variant'][$aliasKey] );
            $opts['updates']['variant'][$variantKey] = $variant;
        }
    }

    if ( isset( $opts['wikitext']['headers']['content-language'] ) ) {
        $opts['wikitext']['headers']['content-language'] =
            LanguageCode::normalizeNonstandardCodeAndWarn(
                $opts['wikitext']['headers']['content-language']
            );
    }
    if ( isset( $opts['original']['wikitext']['headers']['content-language'] ) ) {
        $opts['original']['wikitext']['headers']['content-language'] =
            LanguageCode::normalizeNonstandardCodeAndWarn(
                $opts['original']['wikitext']['headers']['content-language']
            );
    }

    $attribs['opts'] = $opts;

    // TODO: Remove assertDomainIsCorrect() once we no longer need to support the {domain}
    //       parameter for the endpoints exposed by the parsoid extension.
    $domain = $attribs['envOptions']['domain'];
    if ( $domain !== null ) {
        $this->assertDomainIsCorrect( $domain );
    }

    $this->requestAttributes = $attribs;
    return $this->requestAttributes;
}
332
/**
 * Create an HtmlOutputRendererHelper for the given page and revision and
 * configure it from the request attributes.
 *
 * @param array $attribs Request attributes from getRequestAttributes()
 * @param ?string $source Content to render, or null to leave the content source unset
 * @param PageIdentity $page
 * @param ?int $revId
 *
 * @return HtmlOutputRendererHelper
 */
private function getHtmlOutputRendererHelper(
    array $attribs,
    ?string $source,
    PageIdentity $page,
    ?int $revId
): HtmlOutputRendererHelper {
    $helperFactory = MediaWikiServices::getInstance()->getPageRestHelperFactory();

    $noParams = [];
    // The final argument requests lenient rev handling
    $helper = $helperFactory->newHtmlOutputRendererHelper(
        $page, $noParams, $this->getAuthority(), $revId, true
    );

    if ( $source !== null ) {
        // XXX: should default to the page's content model?
        $model = $attribs['opts']['contentmodel']
            ?? $attribs['envOptions']['contentmodel']
            ?? CONTENT_MODEL_WIKITEXT;
        $helper->setContentSource( $source, $model );
    }

    $stash = $attribs['opts']['stash'] ?? null;
    if ( $stash !== null ) {
        $helper->setStashingEnabled( $stash );
    }

    $outputVersion = $attribs['envOptions']['outputContentVersion'] ?? null;
    if ( $outputVersion !== null ) {
        $helper->setOutputProfileVersion( $outputVersion );
    }

    $pageLanguage = $attribs['pagelanguage'] ?? null;
    if ( $pageLanguage !== null ) {
        $helper->setPageLanguage( $pageLanguage );
    }

    $variantLanguage = $attribs['opts']['accept-language'] ?? null;
    if ( $variantLanguage !== null ) {
        $helper->setVariantConversionLanguage( $variantLanguage );
    }

    return $helper;
}
385
/**
 * Create an HtmlInputTransformHelper for the given HTML and page.
 *
 * The helper receives the environment options extended with the offset type,
 * the request options with 'html' set to $html as the body, and the request
 * options merged over the remaining attributes as the parameters.
 *
 * @param array $attribs Request attributes from getRequestAttributes()
 * @param string $html
 * @param PageIdentity $page
 *
 * @return HtmlInputTransformHelper
 */
protected function getHtmlInputTransformHelper(
    array $attribs,
    string $html,
    PageIdentity $page
): HtmlInputTransformHelper {
    $helperFactory = MediaWikiServices::getInstance()->getPageRestHelperFactory();

    $opts = $attribs['opts'];
    // Keys in the request options win over same-named attributes
    $parameters = $opts + $attribs;
    $body = array_replace( $opts, [ 'html' => $html ] );
    $envOptions = $attribs['envOptions'] + [
        'offsetType' => $attribs['offsetType'],
    ];

    $helper = $helperFactory->newHtmlInputTransformHelper(
        $envOptions,
        $page,
        $body,
        $parameters
    );
    $helper->setMetrics( $this->siteConfig->prefixedStatsFactory() );

    return $helper;
}
418
/**
 * Pattern for the profile URL of a versioned content spec. It captures the
 * spec name (HTML or pagebundle) and a three-part version number. The dots
 * of the host name are escaped so that only the literal host matches.
 *
 * FIXME: Combine with ParsoidFormatHelper::parseContentTypeHeader
 */
private const NEW_SPEC =
    '#^https://www\.mediawiki\.org/wiki/Specs/(HTML|pagebundle)/(\d+\.\d+\.\d+)$#D';

/**
 * This method checks if we support the requested content formats
 * As a side-effect, it updates $attribs to set outputContentVersion
 * that Parsoid should generate based on request headers.
 *
 * Requests for the wikitext format, requests without an Accept header and
 * requests whose Accept header yields no parsed types are always acceptable.
 * Otherwise the accepted types are tried in order and the first one that
 * fits the requested format decides.
 *
 * @param array &$attribs Request attributes from getRequestAttributes()
 * @return bool
 */
protected function acceptable( array &$attribs ): bool {
    $request = $this->getRequest();
    $format = $attribs['opts']['format'];

    if ( $format === ParsoidFormatHelper::FORMAT_WIKITEXT ) {
        return true;
    }

    $acceptHeader = $request->getHeader( 'Accept' );
    if ( !$acceptHeader ) {
        return true;
    }

    $parser = new HttpAcceptParser();
    $acceptableTypes = $parser->parseAccept( $acceptHeader[0] );  // FIXME: Multiple headers valid?
    if ( !$acceptableTypes ) {
        return true;
    }

    $isHtml = ( $format === ParsoidFormatHelper::FORMAT_HTML );
    $isPageBundle = ( $format === ParsoidFormatHelper::FORMAT_PAGEBUNDLE );

    // `acceptableTypes` is already sorted by quality.
    foreach ( $acceptableTypes as $t ) {
        $type = "{$t['type']}/{$t['subtype']}";
        $profile = $t['params']['profile'] ?? null;

        $exactTypeMatch = ( $isHtml && $type === 'text/html' ) ||
            ( $isPageBundle && $type === 'application/json' );

        if ( !$exactTypeMatch ) {
            // Wildcards are accepted without looking at the profile
            if ( $type === '*/*' || ( $isHtml && $type === 'text/*' ) ) {
                return true;
            }
            continue;
        }

        if ( !$profile ) {
            // No specific version requested
            return true;
        }

        // A profile must name the spec of the requested format and a
        // version that can be resolved; otherwise try the next type.
        if (
            preg_match( self::NEW_SPEC, $profile, $matches ) &&
            strtolower( $matches[1] ) === $format
        ) {
            $contentVersion = Parsoid::resolveContentVersion( $matches[2] );
            if ( $contentVersion ) {
                // $attribs mutated here!
                $attribs['envOptions']['outputContentVersion'] = $contentVersion;
                return true;
            }
        }
    }

    return false;
}
487
/**
 * Build a PageConfig for the page and revision named in the request attributes.
 * Failures to access the content are converted into HTTP exceptions for
 * callers to handle.
 *
 * @param array $attribs Request attributes; 'oldid', 'pagelanguage' and
 *   'pageName' are read. An empty page name selects the main page.
 * @param ?string $wikitextOverride
 *   Custom wikitext to use instead of the real content of the page.
 * @param bool $html2WtMode If true, accessible content is only insisted on
 *   when a revision id was given
 * @return PageConfig
 * @throws HttpException With status 400 for an invalid title, 403 for
 *   suppressed data and 404 for an otherwise inaccessible revision
 */
protected function tryToCreatePageConfig(
    array $attribs, ?string $wikitextOverride = null, bool $html2WtMode = false
): PageConfig {
    $revId = $attribs['oldid'];
    $pagelanguageOverride = $attribs['pagelanguage'];
    $pageName = $attribs['pageName'];

    $title = ( $pageName === '' ) ? Title::newMainPage() : Title::newFromText( $pageName );
    if ( !$title ) {
        throw new LocalizedHttpException(
            new MessageValue( "rest-invalid-title", [ 'pageName' ] ), 400
        );
    }
    $user = RequestContext::getMain()->getUser();

    $revisionRecord = null;
    if ( $wikitextOverride !== null ) {
        // Create a mutable revision record for the same page and give it
        // the desired wikitext.
        $revisionRecord = new MutableRevisionRecord( $title );
        // Don't set id to $revId if we have $wikitextOverride
        // A revision corresponds to specific wikitext, which $wikitextOverride
        // might not be.
        $revisionRecord->setId( 0 );
        $mainSlot = SlotRecord::newUnsaved(
            SlotRecord::MAIN,
            new WikitextContent( $wikitextOverride )
        );
        $revisionRecord->setSlot( $mainSlot );
    }

    $ensureAccessibleContent = ( $revId !== null ) || !$html2WtMode;

    try {
        // Note: Parsoid by design isn't supposed to use the user
        // context right now, and all user state is expected to be
        // introduced as a post-parse transform.  So although we pass a
        // User here, it only currently affects the output in obscure
        // corner cases; see PageConfigFactory::create() for more.
        // @phan-suppress-next-line PhanUndeclaredMethod method defined in subtype
        return $this->pageConfigFactory->createFromParserOptions(
            ParserOptions::newFromUser( $user ),
            $title,
            $revisionRecord ?? $revId,
            $pagelanguageOverride,
            $ensureAccessibleContent
        );
    } catch ( SuppressedDataException $e ) {
        $denied = new MessageValue( "rest-permission-denied-revision", [ $e->getMessage() ] );
        throw new LocalizedHttpException( $denied, 403 );
    } catch ( RevisionAccessException $e ) {
        $unavailable = new MessageValue( "rest-specified-revision-unavailable", [ $e->getMessage() ] );
        throw new LocalizedHttpException( $unavailable, 404 );
    }
}
563
/**
 * Resolve the page named in the request attributes to a PageIdentity.
 * An empty page name resolves to the wiki's main page. If the page store
 * returns no page for the name, an HTTP 400 error is thrown.
 *
 * @param array $attribs Request attributes; only 'pageName' is read
 * @return PageIdentity
 * @throws HttpException
 */
protected function tryToCreatePageIdentity( array $attribs ): PageIdentity {
    $pageName = $attribs['pageName'];
    if ( $pageName === '' ) {
        return Title::newMainPage();
    }

    // XXX: Should be injected, but the Parsoid extension relies on the
    //      constructor signature. Also, ParsoidHandler should go away soon anyway.
    $page = MediaWikiServices::getInstance()->getPageStore()->getPageByText( $pageName );
    if ( $page ) {
        return $page;
    }

    throw new LocalizedHttpException(
        new MessageValue( "rest-invalid-title", [ 'pageName' ] ), 400
    );
}
592
/**
 * Path template of the transform endpoint. Subclasses may override this to
 * use a different path.
 *
 * The parsoid extension does so, for backwards compatibility with the old
 * endpoint URLs.
 *
 * @stable to override
 *
 * @param string $format The format the endpoint is expected to return.
 *   This implementation returns the same path for every format.
 *
 * @return string
 */
protected function getTransformEndpoint( string $format = ParsoidFormatHelper::FORMAT_HTML ): string {
    return '/coredev/v0/transform/{from}/to/{format}/{title}/{revision}';
}
608
/**
 * Path template of the page content endpoint. Subclasses may override this
 * to use a different path.
 *
 * The parsoid extension does so, for backwards compatibility with the old
 * endpoint URLs.
 *
 * @stable to override
 *
 * @param string $format The format the endpoint is expected to return.
 *   This implementation only supports the HTML format.
 *
 * @return string
 * @throws InvalidArgumentException If $format is not the HTML format
 */
protected function getPageContentEndpoint( string $format = ParsoidFormatHelper::FORMAT_HTML ): string {
    if ( $format === ParsoidFormatHelper::FORMAT_HTML ) {
        return '/v1/page/{title}/html';
    }

    throw new InvalidArgumentException( 'Unsupported page content format: ' . $format );
}
627
/**
 * Path template of the revision content endpoint. Subclasses may override
 * this to use a different path.
 *
 * The parsoid extension does so, for backwards compatibility with the old
 * endpoint URLs.
 *
 * @stable to override
 *
 * @param string $format The format the endpoint is expected to return.
 *   This implementation only supports the HTML format.
 *
 * @return string
 * @throws InvalidArgumentException If $format is not the HTML format
 */
protected function getRevisionContentEndpoint( string $format = ParsoidFormatHelper::FORMAT_HTML ): string {
    if ( $format === ParsoidFormatHelper::FORMAT_HTML ) {
        return '/v1/revision/{revision}/html';
    }

    throw new InvalidArgumentException( 'Unsupported revision content format: ' . $format );
}
646
/**
 * Run Parsoid's wikitext linter on the given page.
 *
 * @param PageConfig $pageConfig
 * @param array $attribs Request attributes; 'envOptions' and 'offsetType' are read
 * @param ?array $linterOverrides Passed to Parsoid as the 'linterOverrides'
 *   option, unless the environment options already define that key
 * @return array The result of Parsoid::wikitext2lint()
 * @throws LocalizedHttpException With status 400 on a Parsoid client error,
 *   or status 413 if a resource limit was exceeded
 */
private function wtLint(
    PageConfig $pageConfig, array $attribs, ?array $linterOverrides = []
): array {
    $lintDefaults = [
        'linterOverrides' => $linterOverrides,
        'offsetType' => $attribs['offsetType'],
    ];
    // Keys already present in the environment options take precedence
    $lintOptions = $attribs['envOptions'] + $lintDefaults;

    try {
        return $this->newParsoid()->wikitext2lint( $pageConfig, $lintOptions, new ParserOutput() );
    } catch ( ClientError $e ) {
        $clientError = new MessageValue( "rest-parsoid-error", [ $e->getMessage() ] );
        throw new LocalizedHttpException( $clientError, 400 );
    } catch ( ResourceLimitExceededException $e ) {
        $limitError = new MessageValue( "rest-parsoid-resource-exceeded", [ $e->getMessage() ] );
        throw new LocalizedHttpException( $limitError, 413 );
    }
}
666
667    /**
668     * Wikitext -> HTML helper.
669     * Spec'd in https://phabricator.wikimedia.org/T75955 and the API tests.
670     *
671     * @param PageConfig $pageConfig
672     * @param array $attribs Request attributes from getRequestAttributes()
673     * @param ?string $wikitext Wikitext to transform (or null to use the
674     *   page specified in the request attributes).
675     *
676     * @return Response
677     */
678    protected function wt2html(
679        PageConfig $pageConfig, array $attribs, ?string $wikitext = null
680    ) {
681        $request = $this->getRequest();
682        $opts = $attribs['opts'];
683        $format = $opts['format'];
684        $oldid = $attribs['oldid'];
685        $stash = $opts['stash'] ?? false;
686
687        if ( $format === ParsoidFormatHelper::FORMAT_LINT ) {
688            $linterOverrides = [];
689            if ( $this->extensionRegistry->isLoaded( 'Linter' ) ) { // T360809
690                $disabled = [];
691                $services = MediaWikiServices::getInstance();
692                $linterCategories = $services->getMainConfig()->get( 'LinterCategories' );
693                foreach ( $linterCategories as $name => $cat ) {
694                    if ( $cat['priority'] === 'none' ) {
695                        $disabled[] = $name;
696                    }
697                }
698                $linterOverrides['disabled'] = $disabled;
699            }
700            $lints = $this->wtLint( $pageConfig, $attribs, $linterOverrides );
701            $response = $this->getResponseFactory()->createJson( $lints );
702            return $response;
703        }
704
705        // TODO: This method should take a PageIdentity + revId,
706        //       to reduce the usage of PageConfig in MW core.
707        $helper = $this->getHtmlOutputRendererHelper(
708            $attribs,
709            $wikitext,
710            $this->pageConfigToPageIdentity( $pageConfig ),
711            // Id will be 0 if we have $wikitext but that isn't valid
712            // to call $helper->setRevision with.  In any case, the revision
713            // will be reset when $helper->setContent is called with $wikitext.
714            // Ideally, the revision would be pass through here instead of
715            // the id and wikitext.
716            $pageConfig->getRevisionId() ?: null
717        );
718
719        $needsPageBundle = ( $format === ParsoidFormatHelper::FORMAT_PAGEBUNDLE );
720
721        if ( $attribs['body_only'] ) {
722            $helper->setFlavor( 'fragment' );
723        } elseif ( !$needsPageBundle ) {
724            // Inline data-parsoid. This will happen when no special params are set.
725            $helper->setFlavor( 'edit' );
726        }
727
728        if ( $wikitext === null && $oldid !== null ) {
729            $mstr = 'pageWithOldid';
730        } else {
731            $mstr = 'wt';
732        }
733
734        $parseTiming = Timing::start();
735
736        if ( $needsPageBundle ) {
737            $pb = $helper->getPageBundle();
738
739            // Handle custom offset requests as a pb2pb transform
740            if (
741                $helper->isParsoidContent() &&
742                ( $attribs['offsetType'] !== 'byte' )
743            ) {
744                $parsoid = $this->newParsoid();
745                $pb = $parsoid->pb2pb(
746                    $pageConfig,
747                    'convertoffsets',
748                    $pb,
749                    [
750                        'inputOffsetType' => 'byte',
751                        'outputOffsetType' => $attribs['offsetType']
752                    ]
753                );
754            }
755
756            $response = $this->getResponseFactory()->createJson( $pb->responseData() );
757            $helper->putHeaders( $response, false );
758
759            ParsoidFormatHelper::setContentType(
760                $response,
761                ParsoidFormatHelper::FORMAT_PAGEBUNDLE,
762                $pb->version
763            );
764        } else {
765            $out = $helper->getHtml();
766
767            // TODO: offsetType conversion isn't supported right now for non-pagebundle endpoints
768            // Once the OutputTransform framework lands, we might revisit this.
769
770            $response = $this->getResponseFactory()->create();
771            $response->getBody()->write( $out->getRawText() );
772
773            $helper->putHeaders( $response, true );
774
775            // Emit an ETag only if stashing is enabled. It's not reliably useful otherwise.
776            if ( $stash ) {
777                $eTag = $helper->getETag();
778                if ( $eTag ) {
779                    $response->setHeader( 'ETag', $eTag );
780                }
781            }
782        }
783
784        // XXX: For pagebundle requests, this can be somewhat inflated
785        // because of pagebundle json-encoding overheads
786        $outSize = $response->getBody()->getSize();
787        $parseTime = $parseTiming->end();
788
789        // Ignore slow parse metrics for non-oldid parses
790        if ( $mstr === 'pageWithOldid' ) {
791            if ( $parseTime > 3000 ) {
792                LoggerFactory::getInstance( 'slow-parsoid' )
793                    ->info( 'Parsing {title} was slow, took {time} seconds', [
794                        'time' => number_format( $parseTime / 1000, 2 ),
795                        'title' => Title::newFromLinkTarget( $pageConfig->getLinkTarget() )->getPrefixedText(),
796                    ] );
797            }
798
799            if ( $parseTime > 10 && $outSize > 100 ) {
800                // * Don't bother with this metric for really small parse times
801                //   p99 for initialization time is ~7ms according to grafana.
802                //   So, 10ms ensures that startup overheads don't skew the metrics
803                // * For body_only=false requests, <head> section isn't generated
804                //   and if the output is small, per-request overheads can skew
805                //   the timePerKB metrics.
806
807                // NOTE: This is slightly misleading since there are fixed costs
808                // for generating output like the <head> section and should be factored in,
809                // but this is good enough for now as a useful first degree of approxmation.
810                $timePerKB = $parseTime * 1024 / $outSize;
811                if ( $timePerKB > 500 ) {
812                    // At 100ms/KB, even a 100KB page which isn't that large will take 10s.
813                    // So, we probably want to shoot for a threshold under 100ms.
814                    // But, let's start with 500ms+ outliers first and see what we uncover.
815                    LoggerFactory::getInstance( 'slow-parsoid' )
816                        ->info( 'Parsing {title} was slow, timePerKB took {timePerKB} ms, total: {time} seconds', [
817                            'time' => number_format( $parseTime / 1000, 2 ),
818                            'timePerKB' => number_format( $timePerKB, 1 ),
819                            'title' => Title::newFromLinkTarget( $pageConfig->getLinkTarget() )->getPrefixedText(),
820                        ] );
821                }
822            }
823        }
824
825        if ( $wikitext !== null ) {
826            // Don't cache requests when wt is set in case somebody uses
827            // GET for wikitext parsing
828            // XXX: can we just refuse to do wikitext parsing in a GET request?
829            $response->setHeader( ResponseHeaders::CACHE_CONTROL, 'private,no-cache,s-maxage=0' );
830        } elseif ( $oldid !== null ) {
831            // XXX: can this go away? Parsoid's PageContent class doesn't expose supressed revision content.
832            if ( $request->getHeaderLine( 'Cookie' ) ||
833                $request->getHeaderLine( 'Authorization' ) ) {
834                // Don't cache requests with a session.
835                $response->setHeader( ResponseHeaders::CACHE_CONTROL, 'private,no-cache,s-maxage=0' );
836            }
837        }
838        return $response;
839    }
840
841    protected function newParsoid(): Parsoid {
842        return new Parsoid( $this->siteConfig, $this->dataAccess );
843    }
844
845    protected function parseHTML( string $html, bool $validateXMLNames = false ): Document {
846        return DOMUtils::parseHTML( $html, $validateXMLNames );
847    }
848
849    /**
850     * @param PageConfig|PageIdentity $page
851     * @param array $attribs Attributes gotten from requests
852     * @param string $html Original HTML
853     *
854     * @return Response
855     * @throws HttpException
856     */
857    protected function html2wt(
858        $page, array $attribs, string $html
859    ) {
860        if ( $page instanceof PageConfig ) {
861            // TODO: Deprecate passing a PageConfig.
862            //       Ideally, callers would use HtmlToContentTransform directly.
863            $page = Title::newFromLinkTarget( $page->getLinkTarget() );
864        }
865
866        try {
867            $transform = $this->getHtmlInputTransformHelper( $attribs, $html, $page );
868
869            $response = $this->getResponseFactory()->create();
870            $transform->putContent( $response );
871
872            return $response;
873        } catch ( ClientError $e ) {
874            throw new LocalizedHttpException( new MessageValue( "rest-parsoid-error", [ $e->getMessage() ] ), 400 );
875        }
876    }
877
878    /**
879     * Pagebundle -> pagebundle helper.
880     *
881     * @param array<string,array|string> $attribs
882     * @return Response
883     * @throws HttpException
884     */
885    protected function pb2pb( array $attribs ) {
886        $opts = $attribs['opts'];
887
888        $revision = $opts['previous'] ?? $opts['original'] ?? null;
889        if ( !isset( $revision['html'] ) ) {
890            throw new LocalizedHttpException( new MessageValue( "rest-missing-revision-html" ), 400 );
891        }
892
893        $vOriginal = ParsoidFormatHelper::parseContentTypeHeader(
894            $revision['html']['headers']['content-type'] ?? '' );
895        if ( $vOriginal === null ) {
896            throw new LocalizedHttpException( new MessageValue( "rest-missing-revision-html-content-type" ), 400 );
897        }
898        $attribs['envOptions']['inputContentVersion'] = $vOriginal;
899        '@phan-var array<string,array|string> $attribs'; // @var array<string,array|string> $attribs
900
901        $this->metrics->increment(
902            'pb2pb.original.version.' . $attribs['envOptions']['inputContentVersion']
903        );
904
905        if ( !empty( $opts['updates'] ) ) {
906            // FIXME: Handling missing revisions uniformly for all update types
907            // is not probably the right thing to do but probably okay for now.
908            // This might need revisiting as we add newer types.
909            $pageConfig = $this->tryToCreatePageConfig( $attribs, null, true );
910            // If we're only updating parts of the original version, it should
911            // satisfy the requested content version, since we'll be returning
912            // that same one.
913            // FIXME: Since this endpoint applies the acceptable middleware,
914            // `getOutputContentVersion` is not what's been passed in, but what
915            // can be produced.  Maybe that should be selectively applied so
916            // that we can update older versions where it makes sense?
917            // Uncommenting below implies that we can only update the latest
918            // version, since carrot semantics is applied in both directions.
919            // if ( !Semver::satisfies(
920            //     $attribs['envOptions']['inputContentVersion'],
921            //     "^{$attribs['envOptions']['outputContentVersion']}"
922            // ) ) {
923            //  throw new HttpException(
924            //         'We do not know how to do this conversion.', 415
925            //     );
926            // }
927            if ( !empty( $opts['updates']['redlinks'] ) ) {
928                // Q(arlolra): Should redlinks be more complex than a bool?
929                // See gwicke's proposal at T114413#2240381
930                return $this->updateRedLinks( $pageConfig, $attribs, $revision );
931            } elseif ( isset( $opts['updates']['variant'] ) ) {
932                return $this->languageConversion( $pageConfig, $attribs, $revision );
933            } else {
934                throw new LocalizedHttpException( new MessageValue( "rest-unknown-parsoid-transformation" ), 400 );
935            }
936        }
937
938        // TODO(arlolra): subbu has some sage advice in T114413#2365456 that
939        // we should probably be more explicit about the pb2pb conversion
940        // requested rather than this increasingly complex fallback logic.
941        $downgrade = Parsoid::findDowngrade(
942            $attribs['envOptions']['inputContentVersion'],
943            $attribs['envOptions']['outputContentVersion']
944        );
945        if ( $downgrade ) {
946            $pb = HtmlPageBundle::newFromJsonArray( [
947                'html' => $revision['html']['body'],
948                'parsoid' => $revision['data-parsoid']['body'] ?? null,
949                'mw' => $revision['data-mw']['body'] ?? null,
950                'counters' => $revision['counters']['body'] ?? null,
951            ] );
952            $this->validatePb( $pb, $attribs['envOptions']['inputContentVersion'] );
953            Parsoid::downgrade( $downgrade, $pb, $this->siteConfig );
954
955            if ( !empty( $attribs['body_only'] ) ) {
956                $doc = $this->parseHTML( $pb->html );
957                $body = DOMCompat::getBody( $doc );
958                $pb->html = ContentUtils::toXML( $body, [ 'innerXML' => true ] );
959            }
960
961            $response = $this->getResponseFactory()->createJson( $pb->responseData() );
962            ParsoidFormatHelper::setContentType(
963                $response, ParsoidFormatHelper::FORMAT_PAGEBUNDLE, $pb->version
964            );
965            return $response;
966            // Ensure we only reuse from semantically similar content versions.
967        } elseif ( Semver::satisfies( $attribs['envOptions']['outputContentVersion'],
968            '^' . $attribs['envOptions']['inputContentVersion'] ) ) {
969            $pageConfig = $this->tryToCreatePageConfig( $attribs );
970            return $this->wt2html( $pageConfig, $attribs );
971        } else {
972            throw new LocalizedHttpException( new MessageValue( "rest-unsupported-profile-conversion" ), 415 );
973        }
974    }
975
976    /**
977     * Update red links on a document.
978     *
979     * @param PageConfig $pageConfig
980     * @param array $attribs
981     * @param array $revision
982     * @return Response
983     */
984    protected function updateRedLinks(
985        PageConfig $pageConfig, array $attribs, array $revision
986    ) {
987        $parsoid = $this->newParsoid();
988
989        $pb = HtmlPageBundle::newFromJsonArray( [
990            'html' => $revision['html']['body'],
991            'parsoid' => $revision['data-parsoid']['body'] ?? null,
992            'mw' => $revision['data-mw']['body'] ?? null,
993            'counters' => $revision['counters']['body'] ?? null,
994            'version' => $attribs['envOptions']['inputContentVersion'],
995            'headers' => $revision['html']['headers'] ?? null,
996            'contentmodel' => $revision['contentmodel'] ?? null,
997        ] );
998
999        $out = $parsoid->pb2pb( $pageConfig, 'redlinks', $pb, [] );
1000
1001        $this->validatePb( $out, $attribs['envOptions']['inputContentVersion'] );
1002
1003        $response = $this->getResponseFactory()->createJson( $out->responseData() );
1004        ParsoidFormatHelper::setContentType(
1005            $response, ParsoidFormatHelper::FORMAT_PAGEBUNDLE, $out->version
1006        );
1007        return $response;
1008    }
1009
1010    /**
1011     * Do variant conversion on a document.
1012     *
1013     * @param PageConfig $pageConfig
1014     * @param array $attribs
1015     * @param array $revision
1016     * @return Response
1017     * @throws HttpException
1018     */
1019    protected function languageConversion(
1020        PageConfig $pageConfig, array $attribs, array $revision
1021    ) {
1022        $opts = $attribs['opts'];
1023        $target = $opts['updates']['variant']['html'] ??
1024            $opts['updates']['variant']['target'] ??
1025            $attribs['envOptions']['htmlVariantLanguage'];
1026        $source = $opts['updates']['variant']['wikitext'] ??
1027            $opts['updates']['variant']['source'] ?? null;
1028
1029        if ( !$target ) {
1030            throw new LocalizedHttpException( new MessageValue( "rest-target-variant-required" ), 400 );
1031        }
1032
1033        $pageIdentity = $this->tryToCreatePageIdentity( $attribs );
1034
1035        $pb = HtmlPageBundle::newFromJsonArray( [
1036            'html' => $revision['html']['body'],
1037            'parsoid' => $revision['data-parsoid']['body'] ?? null,
1038            'mw' => $revision['data-mw']['body'] ?? null,
1039            'counters' => $revision['counters']['body'] ?? null,
1040            'version' => $attribs['envOptions']['inputContentVersion'],
1041            'headers' => $revision['html']['headers'] ?? null,
1042            'contentmodel' => $revision['contentmodel'] ?? null,
1043        ] );
1044
1045        // XXX: DI should inject HtmlTransformFactory
1046        $languageVariantConverter = MediaWikiServices::getInstance()
1047            ->getHtmlTransformFactory()
1048            ->getLanguageVariantConverter( $pageIdentity );
1049        $languageVariantConverter->setPageConfig( $pageConfig );
1050        $httpContentLanguage = $attribs['pagelanguage' ] ?? null;
1051        if ( $httpContentLanguage ) {
1052            $languageVariantConverter->setPageLanguageOverride( $httpContentLanguage );
1053        }
1054
1055        try {
1056            $out = $languageVariantConverter->convertPageBundleVariant( $pb, $target, $source );
1057        } catch ( InvalidArgumentException $e ) {
1058            throw new LocalizedHttpException(
1059                new MessageValue( "rest-unsupported-language-conversion", [ $source ?? '(unspecified)', $target ] ),
1060                400,
1061                [ 'reason' => $e->getMessage() ]
1062            );
1063        }
1064
1065        $response = $this->getResponseFactory()->createJson( $out->responseData() );
1066        ParsoidFormatHelper::setContentType(
1067            $response, ParsoidFormatHelper::FORMAT_PAGEBUNDLE, $out->version
1068        );
1069        return $response;
1070    }
1071
1072    /** @inheritDoc */
1073    abstract public function execute(): Response;
1074
1075    /**
1076     * Validate a HtmlPageBundle against the given contentVersion, and throw
1077     * an HttpException if it does not match.
1078     * @param HtmlPageBundle $pb
1079     * @param string $contentVersion
1080     * @throws HttpException
1081     */
1082    private function validatePb( HtmlPageBundle $pb, string $contentVersion ): void {
1083        $errorMessage = '';
1084        if ( !$pb->validate( $contentVersion, $errorMessage ) ) {
1085            throw new LocalizedHttpException(
1086                new MessageValue( "rest-page-bundle-validation-error", [ $errorMessage ] ),
1087                400
1088            );
1089        }
1090    }
1091
1092    /**
1093     * @param PageConfig $page
1094     *
1095     * @return ProperPageIdentity
1096     * @throws HttpException
1097     */
1098    private function pageConfigToPageIdentity( PageConfig $page ): ProperPageIdentity {
1099        $services = MediaWikiServices::getInstance();
1100
1101        $title = $page->getLinkTarget();
1102        try {
1103            $page = $services->getPageStore()->getPageForLink( $title );
1104        } catch ( MalformedTitleException | InvalidArgumentException ) {
1105            // Note that even some well-formed links are still invalid
1106            // parameters for getPageForLink(), e.g. interwiki links or special pages.
1107            throw new HttpException(
1108                "Bad title: $title", # uses LinkTarget::__toString()
1109                400
1110            );
1111        }
1112
1113        return $page;
1114    }
1115}