Code Coverage
 
Lines
Functions and Methods
Classes and Traits
Total
41.15% covered (danger)
41.15%
200 / 486
12.00% covered (danger)
12.00%
3 / 25
CRAP
0.00% covered (danger)
0.00%
0 / 1
ParsoidHandler
41.15% covered (danger)
41.15%
200 / 486
12.00% covered (danger)
12.00%
3 / 25
3309.27
0.00% covered (danger)
0.00%
0 / 1
 factory
0.00% covered (danger)
0.00%
0 / 7
0.00% covered (danger)
0.00%
0 / 1
2
 __construct
100.00% covered (success)
100.00%
6 / 6
100.00% covered (success)
100.00%
1 / 1
1
 getSupportedRequestTypes
0.00% covered (danger)
0.00%
0 / 4
0.00% covered (danger)
0.00%
0 / 1
2
 assertDomainIsCorrect
0.00% covered (danger)
0.00%
0 / 24
0.00% covered (danger)
0.00%
0 / 1
42
 getParsedBody
0.00% covered (danger)
0.00%
0 / 15
0.00% covered (danger)
0.00%
0 / 1
42
 getOpts
0.00% covered (danger)
0.00%
0 / 4
0.00% covered (danger)
0.00%
0 / 1
2
 getRequestAttributes
0.00% covered (danger)
0.00%
0 / 87
0.00% covered (danger)
0.00%
0 / 1
462
 getHtmlOutputRendererHelper
95.00% covered (success)
95.00%
19 / 20
0.00% covered (danger)
0.00%
0 / 1
6
 getHtmlInputTransformHelper
100.00% covered (success)
100.00%
14 / 14
100.00% covered (success)
100.00%
1 / 1
1
 acceptable
0.00% covered (danger)
0.00%
0 / 30
0.00% covered (danger)
0.00%
0 / 1
272
 tryToCreatePageConfig
91.89% covered (success)
91.89%
34 / 37
0.00% covered (danger)
0.00%
0 / 1
7.03
 tryToCreatePageIdentity
55.56% covered (warning)
55.56%
5 / 9
0.00% covered (danger)
0.00%
0 / 1
3.79
 getTransformEndpoint
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 getPageContentEndpoint
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
6
 getRevisionContentEndpoint
0.00% covered (danger)
0.00%
0 / 3
0.00% covered (danger)
0.00%
0 / 1
6
 wtLint
53.85% covered (warning)
53.85%
7 / 13
0.00% covered (danger)
0.00%
0 / 1
3.88
 wt2html
85.06% covered (warning)
85.06%
74 / 87
0.00% covered (danger)
0.00%
0 / 1
25.92
 newParsoid
100.00% covered (success)
100.00%
1 / 1
100.00% covered (success)
100.00%
1 / 1
1
 parseHTML
0.00% covered (danger)
0.00%
0 / 1
0.00% covered (danger)
0.00%
0 / 1
2
 html2wt
87.50% covered (warning)
87.50%
7 / 8
0.00% covered (danger)
0.00%
0 / 1
3.02
 pb2pb
0.00% covered (danger)
0.00%
0 / 45
0.00% covered (danger)
0.00%
0 / 1
90
 updateRedLinks
0.00% covered (danger)
0.00%
0 / 16
0.00% covered (danger)
0.00%
0 / 1
2
 languageConversion
80.56% covered (warning)
80.56%
29 / 36
0.00% covered (danger)
0.00%
0 / 1
4.12
 execute
n/a
0 / 0
n/a
0 / 0
0
 validatePb
0.00% covered (danger)
0.00%
0 / 6
0.00% covered (danger)
0.00%
0 / 1
6
 pageConfigToPageIdentity
44.44% covered (danger)
44.44%
4 / 9
0.00% covered (danger)
0.00%
0 / 1
2.69
1<?php
2/**
3 * Copyright (C) 2011-2020 Wikimedia Foundation and others.
4 *
5 * @license GPL-2.0-or-later
6 */
7
8namespace MediaWiki\Rest\Handler;
9
10use Composer\Semver\Semver;
11use InvalidArgumentException;
12use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface;
13use LogicException;
14use MediaWiki\Content\WikitextContent;
15use MediaWiki\Context\RequestContext;
16use MediaWiki\Language\LanguageCode;
17use MediaWiki\Logger\LoggerFactory;
18use MediaWiki\MainConfigNames;
19use MediaWiki\MediaWikiServices;
20use MediaWiki\Page\PageIdentity;
21use MediaWiki\Page\ProperPageIdentity;
22use MediaWiki\Parser\ParserOptions;
23use MediaWiki\Parser\ParserOutput;
24use MediaWiki\Parser\Parsoid\Config\SiteConfig;
25use MediaWiki\Registration\ExtensionRegistry;
26use MediaWiki\Rest\Handler;
27use MediaWiki\Rest\Handler\Helper\HtmlInputTransformHelper;
28use MediaWiki\Rest\Handler\Helper\HtmlOutputRendererHelper;
29use MediaWiki\Rest\Handler\Helper\ParsoidFormatHelper;
30use MediaWiki\Rest\HttpException;
31use MediaWiki\Rest\LocalizedHttpException;
32use MediaWiki\Rest\RequestInterface;
33use MediaWiki\Rest\Response;
34use MediaWiki\Revision\MutableRevisionRecord;
35use MediaWiki\Revision\RevisionAccessException;
36use MediaWiki\Revision\RevisionLookup;
37use MediaWiki\Revision\SlotRecord;
38use MediaWiki\Revision\SuppressedDataException;
39use MediaWiki\Title\MalformedTitleException;
40use MediaWiki\Title\Title;
41use MediaWiki\WikiMap\WikiMap;
42use MobileContext;
43use Wikimedia\Http\HttpAcceptParser;
44use Wikimedia\Message\DataMessageValue;
45use Wikimedia\Message\MessageValue;
46use Wikimedia\Parsoid\Config\DataAccess;
47use Wikimedia\Parsoid\Config\PageConfig;
48use Wikimedia\Parsoid\Config\PageConfigFactory;
49use Wikimedia\Parsoid\Core\ClientError;
50use Wikimedia\Parsoid\Core\HtmlPageBundle;
51use Wikimedia\Parsoid\Core\ResourceLimitExceededException;
52use Wikimedia\Parsoid\DOM\Document;
53use Wikimedia\Parsoid\Parsoid;
54use Wikimedia\Parsoid\Utils\ContentUtils;
55use Wikimedia\Parsoid\Utils\DOMCompat;
56use Wikimedia\Parsoid\Utils\DOMUtils;
57use Wikimedia\Parsoid\Utils\Timing;
58
59// TODO logging, timeouts(?), CORS
60// TODO content negotiation (routes.js routes.acceptable)
61// TODO handle MaxConcurrentCallsError (pool counter?)
62
63/**
64 * Base class for Parsoid handlers.
65 * @internal For use by the Parsoid extension
66 */
67abstract class ParsoidHandler extends Handler {
68
69    private RevisionLookup $revisionLookup; // Resolves a revision from 'oldid' when no page name was supplied
70    protected SiteConfig $siteConfig; // Parsoid site configuration; also the source of the metrics objects
71    protected PageConfigFactory $pageConfigFactory; // Builds the PageConfig in tryToCreatePageConfig()
72    protected DataAccess $dataAccess; // Injected Parsoid data access service
73
74    /** @var ExtensionRegistry Queried with isLoaded() to detect optional extensions */
75    protected $extensionRegistry;
76
77    /** @var ?StatsdDataFactoryInterface A statistics aggregator, taken from SiteConfig::metrics() */
78    protected $metrics;
79
80    /** @var ?array Request attributes cached by getRequestAttributes(); unset until its first call */
81    private $requestAttributes;
82
/**
 * Create a handler of the called class, wired to the Parsoid-related
 * services of the global service container.
 *
 * @return static
 */
public static function factory(): static {
    $container = MediaWikiServices::getInstance();

    $revisionLookup = $container->getRevisionLookup();
    $siteConfig = $container->getParsoidSiteConfig();
    $pageConfigFactory = $container->getParsoidPageConfigFactory();
    $dataAccess = $container->getParsoidDataAccess();

    // @phan-suppress-next-line PhanTypeInstantiateAbstractStatic
    return new static( $revisionLookup, $siteConfig, $pageConfigFactory, $dataAccess );
}
93
/**
 * @param RevisionLookup $revisionLookup
 * @param SiteConfig $siteConfig Also supplies the metrics aggregator
 * @param PageConfigFactory $pageConfigFactory
 * @param DataAccess $dataAccess
 */
public function __construct(
    RevisionLookup $revisionLookup,
    SiteConfig $siteConfig,
    PageConfigFactory $pageConfigFactory,
    DataAccess $dataAccess
) {
    // Collaborators that are not injected but derived from globals or
    // from the injected site configuration.
    $this->extensionRegistry = ExtensionRegistry::getInstance();
    $this->metrics = $siteConfig->metrics();

    // Injected services.
    $this->dataAccess = $dataAccess;
    $this->pageConfigFactory = $pageConfigFactory;
    $this->siteConfig = $siteConfig;
    $this->revisionLookup = $revisionLookup;
}
107
/**
 * Accept form-encoded and multipart request bodies in addition to the
 * request types supported by the parent handler.
 *
 * @return array
 */
public function getSupportedRequestTypes(): array {
    $inherited = parent::getSupportedRequestTypes();
    $formTypes = [
        'application/x-www-form-urlencoded',
        'multipart/form-data'
    ];

    return array_merge( $inherited, $formTypes );
}
114
/**
 * Ensure the {domain} path parameter names this wiki.
 *
 * The parameter is accepted when it equals (ignoring ASCII case) the host
 * of $wgServer or, if MobileFrontend is loaded, the host of the
 * corresponding mobile URL.
 *
 * @todo Remove this when we no longer need to support the {domain}
 *       parameter with backwards compatibility with the parsoid
 *       extension.
 * @param string $domain Domain name parameter to validate
 * @throws LogicException If no host can be extracted from $wgServer
 * @throws LocalizedHttpException With status 400 if the domain does not match
 */
protected function assertDomainIsCorrect( $domain ): void {
    // We are cutting some corners here (IDN, non-ASCII casing)
    // since domain name support is provisional.
    // TODO use a proper validator instead
    $server = RequestContext::getMain()->getConfig()->get( MainConfigNames::Server );
    $canonicalHost = parse_url( $server, PHP_URL_HOST );
    if ( !$canonicalHost ) {
        throw new LogicException( 'Cannot parse $wgServer' );
    }

    $domainMatches = strcasecmp( $canonicalHost, $domain ) === 0;

    // TODO: This should really go away! It's only acceptable because
    //       this entire method is going to be removed once we no longer
    //       need the parsoid extension endpoints with the {domain} parameter.
    if ( !$domainMatches && $this->extensionRegistry->isLoaded( 'MobileFrontend' ) ) {
        // @phan-suppress-next-line PhanUndeclaredClassMethod
        $mobileUrl = MobileContext::singleton()->getMobileUrl( $server );
        $mobileHost = parse_url( $mobileUrl, PHP_URL_HOST );
        $domainMatches = $mobileHost && strcasecmp( $mobileHost, $domain ) === 0;
    }

    if ( $domainMatches ) {
        return;
    }

    $failure = new DataMessageValue(
        'mwparsoid-invalid-domain',
        [],
        'invalid-domain',
        [ 'expected' => $canonicalHost, 'actual' => $domain, ]
    );
    $errorData = [
        'error' => 'parameter-validation-failed',
        'name' => 'domain',
        'value' => $domain,
        'failureCode' => $failure->getCode(),
        'failureData' => $failure->getData(),
    ];

    throw new LocalizedHttpException( $failure, 400, $errorData );
}
162
/**
 * Get the parsed request body, decoded according to the Content-Type header.
 *
 * Only the media type is considered: parameters such as "; charset=utf-8"
 * are dropped, surrounding whitespace is trimmed, and the comparison is
 * case-insensitive.
 *
 * @return array The POST parameters for form-encoded and multipart bodies,
 *   or the decoded document for application/json bodies.
 * @throws LocalizedHttpException With status 400 if a JSON body does not
 *   decode to an array, or 415 if the content type is missing or unsupported.
 */
protected function getParsedBody(): array {
    $request = $this->getRequest();
    [ $rawType ] = explode( ';', $request->getHeader( 'Content-Type' )[0] ?? '', 2 );
    // Media types are case-insensitive and may be followed by whitespace
    // before the parameter separator, so normalize before comparing.
    $contentType = strtolower( trim( $rawType ) );

    switch ( $contentType ) {
        case 'application/x-www-form-urlencoded':
        case 'multipart/form-data':
            return $request->getPostParams();
        case 'application/json':
            $json = json_decode( $request->getBody()->getContents(), true );
            if ( !is_array( $json ) ) {
                throw new LocalizedHttpException(
                    new MessageValue( "rest-json-body-parse-error", [ 'not a valid JSON object' ] ), 400 );
            }
            return $json;
        default:
            // explode() never yields null, so test for the empty string to
            // report a missing header with the '(null)' placeholder.
            throw new LocalizedHttpException(
                new MessageValue(
                    "rest-unsupported-content-type",
                    [ $contentType !== '' ? $contentType : '(null)' ]
                ),
                415
            );
    }
}
187
/**
 * Combine the parsed body with the 'from' and 'format' path parameters.
 * Path parameters take precedence over same-named body entries.
 *
 * @param array $body Parsed request body
 * @param RequestInterface $request
 * @return array
 */
protected function getOpts( array $body, RequestInterface $request ): array {
    $pathOverrides = array_intersect_key(
        $request->getPathParams(),
        [ 'from' => true, 'format' => true ]
    );

    return array_merge( $body, $pathOverrides );
}
194
195    /**
196     * Rough equivalent of req.local from Parsoid-JS.
197     * FIXME most of these should be replaced with more native ways of handling the request.
        *
        * Collects path parameters, query parameters, selected headers and (for
        * POST requests) the parsed body into one array. The result is stored in
        * $this->requestAttributes and returned from there on later calls.
        *
        * Keys set here: 'pageName', 'oldid', 'body_only', 'errorEnc', 'iwp',
        * 'offsetType', 'pagelanguage', 'envOptions' and 'opts'.
        *
        * If the request carries a {domain} path parameter, it is validated with
        * assertDomainIsCorrect(), which throws on mismatch.
        *
198     * @return array Returned by reference to the cached attributes
199     */
200    protected function &getRequestAttributes(): array {
           // Serve the cached value once it has been computed.
201        if ( $this->requestAttributes ) {
202            return $this->requestAttributes;
203        }
204
205        $request = $this->getRequest();
           // The body is only parsed for POST requests.
206        $body = ( $request->getMethod() === 'POST' ) ? $this->getParsedBody() : [];
207        $opts = $this->getOpts( $body, $request );
208        '@phan-var array<string,array|bool|string> $opts'; // @var array<string,array|bool|string> $opts
209        $contentLanguage = $request->getHeaderLine( 'Content-Language' ) ?: null;
210        if ( $contentLanguage ) {
211            $contentLanguage = LanguageCode::normalizeNonstandardCodeAndWarn(
212                $contentLanguage
213            );
214        }
215        $attribs = [
216            'pageName' => $request->getPathParam( 'title' ) ?? '',
217            'oldid' => $request->getPathParam( 'revision' ),
218            // "body_only" flag to return just the body (instead of the entire HTML doc)
219            // We would like to deprecate use of this flag: T181657
220            'body_only' => $request->getQueryParams()['body_only'] ?? $body['body_only'] ?? null,
221            'errorEnc' => ParsoidFormatHelper::ERROR_ENCODING[$opts['format']] ?? 'plain',
222            'iwp' => WikiMap::getCurrentWikiId(), // PORT-FIXME verify
               // The body value wins over the query parameter for offsetType.
223            'offsetType' => $body['offsetType']
224                ?? $request->getQueryParams()['offsetType']
225                // Lint requests should return UCS2 offsets by default
226                ?? ( $opts['format'] === ParsoidFormatHelper::FORMAT_LINT ? 'ucs2' : 'byte' ),
227            'pagelanguage' => $contentLanguage,
228        ];
229
230        // For use in getHtmlOutputRendererHelper
231        $opts['stash'] = $request->getQueryParams()['stash'] ?? false;
232
           // For POST requests, 'original.revid' and 'original.title' from the
           // body override the revision and title taken from the path.
233        if ( $request->getMethod() === 'POST' ) {
234            if ( isset( $opts['original']['revid'] ) ) {
235                $attribs['oldid'] = $opts['original']['revid'];
236            }
237            if ( isset( $opts['original']['title'] ) ) {
238                $attribs['pageName'] = $opts['original']['title'];
239            }
240        }
           // Normalize oldid: the empty string becomes null, anything else an int.
241        if ( $attribs['oldid'] !== null ) {
242            if ( $attribs['oldid'] === '' ) {
243                $attribs['oldid'] = null;
244            } else {
245                $attribs['oldid'] = (int)$attribs['oldid'];
246            }
247        }
248
249        // For use in getHtmlOutputRendererHelper
250        $opts['accept-language'] = $request->getHeaderLine( 'Accept-Language' ) ?: null;
251
           // $opts keeps the raw header value; $acceptLanguage is the normalized
           // target language derived from it.
252        $acceptLanguage = null;
253        if ( $opts['accept-language'] !== null ) {
254            $acceptLanguage = LanguageCode::normalizeNonstandardCodeAndWarn(
255                HtmlOutputRendererHelper::getAcceptedTargetLanguage(
256                    $opts['accept-language']
257                )
258            );
259        }
260
261        // Init pageName if oldid is provided and is a valid revision
262        if ( ( $attribs['pageName'] === '' ) && $attribs['oldid'] ) {
263            $rev = $this->revisionLookup->getRevisionById( $attribs['oldid'] );
264            if ( $rev ) {
265                $attribs['pageName'] = $rev->getPage()->getDBkey();
266            }
267        }
268
269        $attribs['envOptions'] = [
270            // We use `prefix` but ought to use `domain` (T206764)
271            'prefix' => $attribs['iwp'],
272            // For the legacy "domain" path parameter used by the endpoints exposed
273            // by the parsoid extension. Will be null for core endpoints.
274            'domain' => $request->getPathParam( 'domain' ),
275            'pageName' => $attribs['pageName'],
276            'cookie' => $request->getHeaderLine( 'Cookie' ),
277            'reqId' => $request->getHeaderLine( 'X-Request-Id' ),
278            'userAgent' => $request->getHeaderLine( 'User-Agent' ),
279            // Used in pb2pb variant updates and wtLint
280            'htmlVariantLanguage' => $acceptLanguage,
281            // Semver::satisfies checks below expect a valid outputContentVersion value.
282            // Better to set it here instead of adding the default value at every check.
283            'outputContentVersion' => Parsoid::defaultHTMLVersion(),
284        ];
285
286        # Convert language codes in $opts['updates']['variant'] if present
           # The 'source' key is folded into 'wikitext' and the 'target' key into
           # 'html'; 'wikitext' / 'html' take precedence when both spellings are given.
287        $sourceVariant = $opts['updates']['variant']['wikitext'] ??
288            $opts['updates']['variant']['source'] ?? null;
289        if ( $sourceVariant ) {
290            $sourceVariant = LanguageCode::normalizeNonstandardCodeAndWarn(
291                $sourceVariant
292            );
293            unset( $opts['updates']['variant']['source'] );
294            $opts['updates']['variant']['wikitext'] = $sourceVariant;
295        }
296        $targetVariant = $opts['updates']['variant']['html'] ??
297            $opts['updates']['variant']['target'] ?? null;
298        if ( $targetVariant ) {
299            $targetVariant = LanguageCode::normalizeNonstandardCodeAndWarn(
300                $targetVariant
301            );
302            unset( $opts['updates']['variant']['target'] );
303            $opts['updates']['variant']['html'] = $targetVariant;
304        }
           // Normalize the content-language values embedded in the body, in place.
305        if ( isset( $opts['wikitext']['headers']['content-language'] ) ) {
306            $contentLanguage = $opts['wikitext']['headers']['content-language'];
307            $contentLanguage = LanguageCode::normalizeNonstandardCodeAndWarn(
308                $contentLanguage
309            );
310            $opts['wikitext']['headers']['content-language'] = $contentLanguage;
311        }
312        if ( isset( $opts['original']['wikitext']['headers']['content-language'] ) ) {
313            $contentLanguage = $opts['original']['wikitext']['headers']['content-language'];
314            $contentLanguage = LanguageCode::normalizeNonstandardCodeAndWarn(
315                $contentLanguage
316            );
317            $opts['original']['wikitext']['headers']['content-language'] = $contentLanguage;
318        }
319
320        $attribs['opts'] = $opts;
321
322        // TODO: Remove assertDomainIsCorrect() once we no longer need to support the {domain}
323        //       parameter for the endpoints exposed by the parsoid extension.
324        if ( $attribs['envOptions']['domain'] !== null ) {
325            $this->assertDomainIsCorrect( $attribs['envOptions']['domain'] );
326        }
327
           // Cache only after validation has passed, then return the cached copy.
328        $this->requestAttributes = $attribs;
329        return $this->requestAttributes;
330    }
331
/**
 * Build an HtmlOutputRendererHelper for the given page and configure it
 * from the request attributes.
 *
 * @param array $attribs Request attributes; the 'opts', 'envOptions' and
 *   'pagelanguage' entries are consulted.
 * @param ?string $source Content to render, or null to leave the helper's
 *   content source untouched
 * @param PageIdentity $page
 * @param ?int $revId
 *
 * @return HtmlOutputRendererHelper
 */
private function getHtmlOutputRendererHelper(
    array $attribs,
    ?string $source,
    PageIdentity $page,
    ?int $revId
): HtmlOutputRendererHelper {
    $services = MediaWikiServices::getInstance();
    $performer = $this->getAuthority();

    $helperParams = [];
    // Request lenient rev handling
    $lenient = true;
    $renderer = $services->getPageRestHelperFactory()->newHtmlOutputRendererHelper(
        $page, $helperParams, $performer, $revId, $lenient
    );

    $opts = $attribs['opts'] ?? [];
    $envOptions = $attribs['envOptions'] ?? [];

    // XXX: should default to the page's content model?
    $contentModel = $opts['contentmodel']
        ?? $envOptions['contentmodel']
        ?? CONTENT_MODEL_WIKITEXT;

    if ( $source !== null ) {
        $renderer->setContentSource( $source, $contentModel );
    }

    // Each optional setting is applied only when it is present and non-null.
    if ( isset( $opts['stash'] ) ) {
        $renderer->setStashingEnabled( $opts['stash'] );
    }
    if ( isset( $envOptions['outputContentVersion'] ) ) {
        $renderer->setOutputProfileVersion( $envOptions['outputContentVersion'] );
    }
    if ( isset( $attribs['pagelanguage'] ) ) {
        $renderer->setPageLanguage( $attribs['pagelanguage'] );
    }
    if ( isset( $opts['accept-language'] ) ) {
        $renderer->setVariantConversionLanguage( $opts['accept-language'] );
    }

    return $renderer;
}
384
/**
 * Build an HtmlInputTransformHelper for converting the given HTML.
 *
 * The helper receives the request's envOptions (plus the offset type), a
 * body made of the request opts with 'html' set to $html, and a parameter
 * array in which request opts take precedence over same-named attributes.
 *
 * @param array $attribs Request attributes from getRequestAttributes()
 * @param string $html
 * @param PageIdentity $page
 *
 * @return HtmlInputTransformHelper
 */
protected function getHtmlInputTransformHelper(
    array $attribs,
    string $html,
    PageIdentity $page
): HtmlInputTransformHelper {
    $services = MediaWikiServices::getInstance();

    $opts = $attribs['opts'];
    $mergedParams = $opts + $attribs;

    $requestBody = $opts;
    $requestBody['html'] = $html;

    $envOptions = $attribs['envOptions'] + [
        'offsetType' => $attribs['offsetType'],
    ];

    $transformer = $services->getPageRestHelperFactory()->newHtmlInputTransformHelper(
        $envOptions,
        $page,
        $requestBody,
        $mergedParams
    );
    $transformer->setMetrics( $this->siteConfig->prefixedStatsFactory() );

    return $transformer;
}
417
/**
 * Pattern for the "profile" parameter of an Accept media type.
 * Capture group 1 is the format name (HTML or pagebundle), group 2 the
 * requested x.y.z content version.
 *
 * The dots of the host name are escaped so that they only match a
 * literal "."; the D modifier keeps "$" from matching before a trailing
 * newline.
 *
 * FIXME: Combine with ParsoidFormatHelper::parseContentTypeHeader
 */
private const NEW_SPEC =
    '#^https://www\.mediawiki\.org/wiki/Specs/(HTML|pagebundle)/(\d+\.\d+\.\d+)$#D';
423
/**
 * Decide whether the requested output format is compatible with the
 * request's Accept header.
 *
 * Wikitext output, a missing Accept header and an Accept header that
 * yields no media types are always acceptable. Otherwise the media types
 * are tried in order; the first usable one wins.
 *
 * As a side-effect, a matching media type that carries a resolvable
 * "profile" version sets $attribs['envOptions']['outputContentVersion'].
 *
 * @param array &$attribs Request attributes from getRequestAttributes()
 * @return bool
 */
protected function acceptable( array &$attribs ): bool {
    $request = $this->getRequest();
    $format = $attribs['opts']['format'];

    if ( $format === ParsoidFormatHelper::FORMAT_WIKITEXT ) {
        return true;
    }

    $acceptHeader = $request->getHeader( 'Accept' );
    // FIXME: Multiple headers valid?
    $acceptableTypes = $acceptHeader
        ? ( new HttpAcceptParser() )->parseAccept( $acceptHeader[0] )
        : [];
    if ( !$acceptableTypes ) {
        return true;
    }

    $wantsHtml = ( $format === ParsoidFormatHelper::FORMAT_HTML );
    $wantsPageBundle = ( $format === ParsoidFormatHelper::FORMAT_PAGEBUNDLE );

    // `acceptableTypes` is already sorted by quality.
    foreach ( $acceptableTypes as $entry ) {
        $mediaType = "{$entry['type']}/{$entry['subtype']}";
        $isExactType = ( $wantsHtml && $mediaType === 'text/html' ) ||
            ( $wantsPageBundle && $mediaType === 'application/json' );

        if ( !$isExactType ) {
            // Wildcards are accepted as they are; anything else is skipped.
            if ( $mediaType === '*/*' || ( $wantsHtml && $mediaType === 'text/*' ) ) {
                return true;
            }
            continue;
        }

        $profile = $entry['params']['profile'] ?? null;
        if ( !$profile ) {
            return true;
        }

        preg_match( self::NEW_SPEC, $profile, $matches );
        if ( !$matches || strtolower( $matches[1] ) !== $format ) {
            continue;
        }

        $contentVersion = Parsoid::resolveContentVersion( $matches[2] );
        if ( $contentVersion ) {
            // $attribs mutated here!
            $attribs['envOptions']['outputContentVersion'] = $contentVersion;
            return true;
        }
    }

    return false;
}
486
/**
 * Create the PageConfig for the page named in the request attributes,
 * translating revision access failures into HTTP errors.
 *
 * An empty page name selects the wiki's main page.
 *
 * @param array $attribs Request attributes; 'oldid', 'pagelanguage' and
 *   'pageName' are used.
 * @param ?string $wikitextOverride
 *   Custom wikitext to use instead of the real content of the page.
 * @param bool $html2WtMode When true and no revision ID was requested,
 *   accessible content is not required.
 * @return PageConfig
 * @throws HttpException 400 for an invalid title, 403 for suppressed data,
 *   404 for an otherwise unavailable revision
 */
protected function tryToCreatePageConfig(
    array $attribs, ?string $wikitextOverride = null, bool $html2WtMode = false
): PageConfig {
    $oldid = $attribs['oldid'];
    $languageOverride = $attribs['pagelanguage'];
    $pageName = $attribs['pageName'];

    if ( $pageName === '' ) {
        $title = Title::newMainPage();
    } else {
        $title = Title::newFromText( $pageName );
    }
    if ( !$title ) {
        throw new LocalizedHttpException(
            new MessageValue( "rest-invalid-title", [ 'pageName' ] ), 400
        );
    }
    $user = RequestContext::getMain()->getUser();

    // Either the requested revision ID, or an unsaved revision that
    // carries the override wikitext.
    $revision = $oldid;
    if ( $wikitextOverride !== null ) {
        $unsaved = new MutableRevisionRecord( $title );
        // Don't set id to the requested revision if we have $wikitextOverride.
        // A revision corresponds to specific wikitext, which $wikitextOverride
        // might not be.
        $unsaved->setId( 0 );
        $unsaved->setSlot(
            SlotRecord::newUnsaved(
                SlotRecord::MAIN,
                new WikitextContent( $wikitextOverride )
            )
        );
        $revision = $unsaved;
    }

    // Content may only be inaccessible for html2wt without an explicit revision.
    $ensureAccessibleContent = !( $html2WtMode && $oldid === null );

    try {
        // Note: Parsoid by design isn't supposed to use the user
        // context right now, and all user state is expected to be
        // introduced as a post-parse transform.  So although we pass a
        // User here, it only currently affects the output in obscure
        // corner cases; see PageConfigFactory::create() for more.
        // @phan-suppress-next-line PhanUndeclaredMethod method defined in subtype
        return $this->pageConfigFactory->createFromParserOptions(
            ParserOptions::newFromUser( $user ),
            $title,
            $revision,
            $languageOverride,
            $ensureAccessibleContent
        );
    } catch ( SuppressedDataException $e ) {
        throw new LocalizedHttpException(
            new MessageValue( "rest-permission-denied-revision", [ $e->getMessage() ] ), 403
        );
    } catch ( RevisionAccessException $e ) {
        throw new LocalizedHttpException(
            new MessageValue( "rest-specified-revision-unavailable", [ $e->getMessage() ] ), 404
        );
    }
}
562
/**
 * Resolve the requested page name to a PageIdentity.
 * An empty page name yields the wiki's main page; a name that cannot be
 * resolved results in a 400 error.
 *
 * @param array $attribs Request attributes; only 'pageName' is used
 * @return PageIdentity
 * @throws HttpException
 */
protected function tryToCreatePageIdentity( array $attribs ): PageIdentity {
    $pageName = $attribs['pageName'];
    if ( $pageName === '' ) {
        return Title::newMainPage();
    }

    // XXX: Should be injected, but the Parsoid extension relies on the
    //      constructor signature. Also, ParsoidHandler should go away soon anyway.
    $identity = MediaWikiServices::getInstance()
        ->getPageStore()
        ->getPageByText( $pageName );

    if ( $identity ) {
        return $identity;
    }

    throw new LocalizedHttpException(
        new MessageValue( "rest-invalid-title", [ 'pageName' ] ), 400
    );
}
591
/**
 * Path template of the transform endpoint.
 *
 * Subclasses may override this; the parsoid extension does so to keep
 * the old endpoint URLs working.
 *
 * @stable to override
 *
 * @param string $format The format the endpoint is expected to return.
 *   Not used by this implementation.
 *
 * @return string
 */
protected function getTransformEndpoint( string $format = ParsoidFormatHelper::FORMAT_HTML ): string {
    $pathTemplate = '/coredev/v0/transform/{from}/to/{format}/{title}/{revision}';
    return $pathTemplate;
}
607
/**
 * Path template of the page content endpoint. Only HTML is supported.
 *
 * Subclasses may override this; the parsoid extension does so to keep
 * the old endpoint URLs working.
 *
 * @stable to override
 *
 * @param string $format The format the endpoint is expected to return.
 *
 * @return string
 * @throws InvalidArgumentException For any format other than HTML
 */
protected function getPageContentEndpoint( string $format = ParsoidFormatHelper::FORMAT_HTML ): string {
    if ( $format === ParsoidFormatHelper::FORMAT_HTML ) {
        return '/v1/page/{title}/html';
    }

    throw new InvalidArgumentException( 'Unsupported page content format: ' . $format );
}
626
/**
 * Path template of the revision content endpoint. Only HTML is supported.
 *
 * Subclasses may override this; the parsoid extension does so to keep
 * the old endpoint URLs working.
 *
 * @stable to override
 *
 * @param string $format The format the endpoint is expected to return.
 *
 * @return string
 * @throws InvalidArgumentException For any format other than HTML
 */
protected function getRevisionContentEndpoint( string $format = ParsoidFormatHelper::FORMAT_HTML ): string {
    if ( $format === ParsoidFormatHelper::FORMAT_HTML ) {
        return '/v1/revision/{revision}/html';
    }

    throw new InvalidArgumentException( 'Unsupported revision content format: ' . $format );
}
645
/**
 * Run Parsoid's linter over the page's wikitext.
 *
 * @param PageConfig $pageConfig
 * @param array $attribs Request attributes; 'envOptions' and 'offsetType' are used
 * @param ?array $linterOverrides Passed to Parsoid as the 'linterOverrides'
 *   option unless 'envOptions' already defines that key
 * @return array The lint results as returned by Parsoid
 * @throws LocalizedHttpException 400 on a Parsoid client error, 413 when a
 *   resource limit is exceeded
 */
private function wtLint(
    PageConfig $pageConfig, array $attribs, ?array $linterOverrides = []
): array {
    // Existing envOptions entries win over these lint-specific defaults.
    $lintDefaults = [
        'linterOverrides' => $linterOverrides,
        'offsetType' => $attribs['offsetType'],
    ];
    $envOptions = $attribs['envOptions'] + $lintDefaults;

    try {
        $engine = $this->newParsoid();
        $metadata = new ParserOutput();
        return $engine->wikitext2lint( $pageConfig, $envOptions, $metadata );
    } catch ( ClientError $e ) {
        throw new LocalizedHttpException( new MessageValue( "rest-parsoid-error", [ $e->getMessage() ] ), 400 );
    } catch ( ResourceLimitExceededException $e ) {
        throw new LocalizedHttpException(
            new MessageValue( "rest-parsoid-resource-exceeded", [ $e->getMessage() ] ), 413
        );
    }
}
665
666    /**
667     * Wikitext -> HTML helper.
668     * Spec'd in https://phabricator.wikimedia.org/T75955 and the API tests.
669     *
670     * @param PageConfig $pageConfig
671     * @param array $attribs Request attributes from getRequestAttributes()
672     * @param ?string $wikitext Wikitext to transform (or null to use the
673     *   page specified in the request attributes).
674     *
675     * @return Response
676     */
677    protected function wt2html(
678        PageConfig $pageConfig, array $attribs, ?string $wikitext = null
679    ) {
680        $request = $this->getRequest();
681        $opts = $attribs['opts'];
682        $format = $opts['format'];
683        $oldid = $attribs['oldid'];
684        $stash = $opts['stash'] ?? false;
685
686        if ( $format === ParsoidFormatHelper::FORMAT_LINT ) {
687            $linterOverrides = [];
688            if ( $this->extensionRegistry->isLoaded( 'Linter' ) ) { // T360809
689                $disabled = [];
690                $services = MediaWikiServices::getInstance();
691                $linterCategories = $services->getMainConfig()->get( 'LinterCategories' );
692                foreach ( $linterCategories as $name => $cat ) {
693                    if ( $cat['priority'] === 'none' ) {
694                        $disabled[] = $name;
695                    }
696                }
697                $linterOverrides['disabled'] = $disabled;
698            }
699            $lints = $this->wtLint( $pageConfig, $attribs, $linterOverrides );
700            $response = $this->getResponseFactory()->createJson( $lints );
701            return $response;
702        }
703
704        // TODO: This method should take a PageIdentity + revId,
705        //       to reduce the usage of PageConfig in MW core.
706        $helper = $this->getHtmlOutputRendererHelper(
707            $attribs,
708            $wikitext,
709            $this->pageConfigToPageIdentity( $pageConfig ),
710            // Id will be 0 if we have $wikitext but that isn't valid
711            // to call $helper->setRevision with.  In any case, the revision
712            // will be reset when $helper->setContent is called with $wikitext.
713            // Ideally, the revision would be pass through here instead of
714            // the id and wikitext.
715            $pageConfig->getRevisionId() ?: null
716        );
717
718        $needsPageBundle = ( $format === ParsoidFormatHelper::FORMAT_PAGEBUNDLE );
719
720        if ( $attribs['body_only'] ) {
721            $helper->setFlavor( 'fragment' );
722        } elseif ( !$needsPageBundle ) {
723            // Inline data-parsoid. This will happen when no special params are set.
724            $helper->setFlavor( 'edit' );
725        }
726
727        if ( $wikitext === null && $oldid !== null ) {
728            $mstr = 'pageWithOldid';
729        } else {
730            $mstr = 'wt';
731        }
732
733        $parseTiming = Timing::start();
734
735        if ( $needsPageBundle ) {
736            $pb = $helper->getPageBundle();
737
738            // Handle custom offset requests as a pb2pb transform
739            if (
740                $helper->isParsoidContent() &&
741                ( $attribs['offsetType'] !== 'byte' )
742            ) {
743                $parsoid = $this->newParsoid();
744                $pb = $parsoid->pb2pb(
745                    $pageConfig,
746                    'convertoffsets',
747                    $pb,
748                    [
749                        'inputOffsetType' => 'byte',
750                        'outputOffsetType' => $attribs['offsetType']
751                    ]
752                );
753            }
754
755            $response = $this->getResponseFactory()->createJson( $pb->responseData() );
756            $helper->putHeaders( $response, false );
757
758            ParsoidFormatHelper::setContentType(
759                $response,
760                ParsoidFormatHelper::FORMAT_PAGEBUNDLE,
761                $pb->version
762            );
763        } else {
764            $out = $helper->getHtml();
765
766            // TODO: offsetType conversion isn't supported right now for non-pagebundle endpoints
767            // Once the OutputTransform framework lands, we might revisit this.
768
769            $response = $this->getResponseFactory()->create();
770            $response->getBody()->write( $out->getRawText() );
771
772            $helper->putHeaders( $response, true );
773
774            // Emit an ETag only if stashing is enabled. It's not reliably useful otherwise.
775            if ( $stash ) {
776                $eTag = $helper->getETag();
777                if ( $eTag ) {
778                    $response->setHeader( 'ETag', $eTag );
779                }
780            }
781        }
782
783        // XXX: For pagebundle requests, this can be somewhat inflated
784        // because of pagebundle json-encoding overheads
785        $outSize = $response->getBody()->getSize();
786        $parseTime = $parseTiming->end();
787
788        // Ignore slow parse metrics for non-oldid parses
789        if ( $mstr === 'pageWithOldid' ) {
790            if ( $parseTime > 3000 ) {
791                LoggerFactory::getInstance( 'slow-parsoid' )
792                    ->info( 'Parsing {title} was slow, took {time} seconds', [
793                        'time' => number_format( $parseTime / 1000, 2 ),
794                        'title' => Title::newFromLinkTarget( $pageConfig->getLinkTarget() )->getPrefixedText(),
795                    ] );
796            }
797
798            if ( $parseTime > 10 && $outSize > 100 ) {
799                // * Don't bother with this metric for really small parse times
800                //   p99 for initialization time is ~7ms according to grafana.
801                //   So, 10ms ensures that startup overheads don't skew the metrics
802                // * For body_only=false requests, <head> section isn't generated
803                //   and if the output is small, per-request overheads can skew
804                //   the timePerKB metrics.
805
806                // NOTE: This is slightly misleading since there are fixed costs
807                // for generating output like the <head> section and should be factored in,
808                // but this is good enough for now as a useful first degree of approxmation.
809                $timePerKB = $parseTime * 1024 / $outSize;
810                if ( $timePerKB > 500 ) {
811                    // At 100ms/KB, even a 100KB page which isn't that large will take 10s.
812                    // So, we probably want to shoot for a threshold under 100ms.
813                    // But, let's start with 500ms+ outliers first and see what we uncover.
814                    LoggerFactory::getInstance( 'slow-parsoid' )
815                        ->info( 'Parsing {title} was slow, timePerKB took {timePerKB} ms, total: {time} seconds', [
816                            'time' => number_format( $parseTime / 1000, 2 ),
817                            'timePerKB' => number_format( $timePerKB, 1 ),
818                            'title' => Title::newFromLinkTarget( $pageConfig->getLinkTarget() )->getPrefixedText(),
819                        ] );
820                }
821            }
822        }
823
824        if ( $wikitext !== null ) {
825            // Don't cache requests when wt is set in case somebody uses
826            // GET for wikitext parsing
827            // XXX: can we just refuse to do wikitext parsing in a GET request?
828            $response->setHeader( 'Cache-Control', 'private,no-cache,s-maxage=0' );
829        } elseif ( $oldid !== null ) {
830            // XXX: can this go away? Parsoid's PageContent class doesn't expose supressed revision content.
831            if ( $request->getHeaderLine( 'Cookie' ) ||
832                $request->getHeaderLine( 'Authorization' ) ) {
833                // Don't cache requests with a session.
834                $response->setHeader( 'Cache-Control', 'private,no-cache,s-maxage=0' );
835            }
836        }
837        return $response;
838    }
839
840    protected function newParsoid(): Parsoid {
841        return new Parsoid( $this->siteConfig, $this->dataAccess );
842    }
843
844    protected function parseHTML( string $html, bool $validateXMLNames = false ): Document {
845        return DOMUtils::parseHTML( $html, $validateXMLNames );
846    }
847
848    /**
849     * @param PageConfig|PageIdentity $page
850     * @param array $attribs Attributes gotten from requests
851     * @param string $html Original HTML
852     *
853     * @return Response
854     * @throws HttpException
855     */
856    protected function html2wt(
857        $page, array $attribs, string $html
858    ) {
859        if ( $page instanceof PageConfig ) {
860            // TODO: Deprecate passing a PageConfig.
861            //       Ideally, callers would use HtmlToContentTransform directly.
862            $page = Title::newFromLinkTarget( $page->getLinkTarget() );
863        }
864
865        try {
866            $transform = $this->getHtmlInputTransformHelper( $attribs, $html, $page );
867
868            $response = $this->getResponseFactory()->create();
869            $transform->putContent( $response );
870
871            return $response;
872        } catch ( ClientError $e ) {
873            throw new LocalizedHttpException( new MessageValue( "rest-parsoid-error", [ $e->getMessage() ] ), 400 );
874        }
875    }
876
877    /**
878     * Pagebundle -> pagebundle helper.
879     *
880     * @param array<string,array|string> $attribs
881     * @return Response
882     * @throws HttpException
883     */
884    protected function pb2pb( array $attribs ) {
885        $opts = $attribs['opts'];
886
887        $revision = $opts['previous'] ?? $opts['original'] ?? null;
888        if ( !isset( $revision['html'] ) ) {
889            throw new LocalizedHttpException( new MessageValue( "rest-missing-revision-html" ), 400 );
890        }
891
892        $vOriginal = ParsoidFormatHelper::parseContentTypeHeader(
893            $revision['html']['headers']['content-type'] ?? '' );
894        if ( $vOriginal === null ) {
895            throw new LocalizedHttpException( new MessageValue( "rest-missing-revision-html-content-type" ), 400 );
896        }
897        $attribs['envOptions']['inputContentVersion'] = $vOriginal;
898        '@phan-var array<string,array|string> $attribs'; // @var array<string,array|string> $attribs
899
900        $this->metrics->increment(
901            'pb2pb.original.version.' . $attribs['envOptions']['inputContentVersion']
902        );
903
904        if ( !empty( $opts['updates'] ) ) {
905            // FIXME: Handling missing revisions uniformly for all update types
906            // is not probably the right thing to do but probably okay for now.
907            // This might need revisiting as we add newer types.
908            $pageConfig = $this->tryToCreatePageConfig( $attribs, null, true );
909            // If we're only updating parts of the original version, it should
910            // satisfy the requested content version, since we'll be returning
911            // that same one.
912            // FIXME: Since this endpoint applies the acceptable middleware,
913            // `getOutputContentVersion` is not what's been passed in, but what
914            // can be produced.  Maybe that should be selectively applied so
915            // that we can update older versions where it makes sense?
916            // Uncommenting below implies that we can only update the latest
917            // version, since carrot semantics is applied in both directions.
918            // if ( !Semver::satisfies(
919            //     $attribs['envOptions']['inputContentVersion'],
920            //     "^{$attribs['envOptions']['outputContentVersion']}"
921            // ) ) {
922            //  throw new HttpException(
923            //         'We do not know how to do this conversion.', 415
924            //     );
925            // }
926            if ( !empty( $opts['updates']['redlinks'] ) ) {
927                // Q(arlolra): Should redlinks be more complex than a bool?
928                // See gwicke's proposal at T114413#2240381
929                return $this->updateRedLinks( $pageConfig, $attribs, $revision );
930            } elseif ( isset( $opts['updates']['variant'] ) ) {
931                return $this->languageConversion( $pageConfig, $attribs, $revision );
932            } else {
933                throw new LocalizedHttpException( new MessageValue( "rest-unknown-parsoid-transformation" ), 400 );
934            }
935        }
936
937        // TODO(arlolra): subbu has some sage advice in T114413#2365456 that
938        // we should probably be more explicit about the pb2pb conversion
939        // requested rather than this increasingly complex fallback logic.
940        $downgrade = Parsoid::findDowngrade(
941            $attribs['envOptions']['inputContentVersion'],
942            $attribs['envOptions']['outputContentVersion']
943        );
944        if ( $downgrade ) {
945            $pb = new HtmlPageBundle(
946                $revision['html']['body'],
947                $revision['data-parsoid']['body'] ?? null,
948                $revision['data-mw']['body'] ?? null
949            );
950            $this->validatePb( $pb, $attribs['envOptions']['inputContentVersion'] );
951            Parsoid::downgrade( $downgrade, $pb, $this->siteConfig );
952
953            if ( !empty( $attribs['body_only'] ) ) {
954                $doc = $this->parseHTML( $pb->html );
955                $body = DOMCompat::getBody( $doc );
956                $pb->html = ContentUtils::toXML( $body, [ 'innerXML' => true ] );
957            }
958
959            $response = $this->getResponseFactory()->createJson( $pb->responseData() );
960            ParsoidFormatHelper::setContentType(
961                $response, ParsoidFormatHelper::FORMAT_PAGEBUNDLE, $pb->version
962            );
963            return $response;
964            // Ensure we only reuse from semantically similar content versions.
965        } elseif ( Semver::satisfies( $attribs['envOptions']['outputContentVersion'],
966            '^' . $attribs['envOptions']['inputContentVersion'] ) ) {
967            $pageConfig = $this->tryToCreatePageConfig( $attribs );
968            return $this->wt2html( $pageConfig, $attribs );
969        } else {
970            throw new LocalizedHttpException( new MessageValue( "rest-unsupported-profile-conversion" ), 415 );
971        }
972    }
973
974    /**
975     * Update red links on a document.
976     *
977     * @param PageConfig $pageConfig
978     * @param array $attribs
979     * @param array $revision
980     * @return Response
981     */
982    protected function updateRedLinks(
983        PageConfig $pageConfig, array $attribs, array $revision
984    ) {
985        $parsoid = $this->newParsoid();
986
987        $pb = new HtmlPageBundle(
988            $revision['html']['body'],
989            $revision['data-parsoid']['body'] ?? null,
990            $revision['data-mw']['body'] ?? null,
991            $attribs['envOptions']['inputContentVersion'],
992            $revision['html']['headers'] ?? null,
993            $revision['contentmodel'] ?? null
994        );
995
996        $out = $parsoid->pb2pb( $pageConfig, 'redlinks', $pb, [] );
997
998        $this->validatePb( $out, $attribs['envOptions']['inputContentVersion'] );
999
1000        $response = $this->getResponseFactory()->createJson( $out->responseData() );
1001        ParsoidFormatHelper::setContentType(
1002            $response, ParsoidFormatHelper::FORMAT_PAGEBUNDLE, $out->version
1003        );
1004        return $response;
1005    }
1006
1007    /**
1008     * Do variant conversion on a document.
1009     *
1010     * @param PageConfig $pageConfig
1011     * @param array $attribs
1012     * @param array $revision
1013     * @return Response
1014     * @throws HttpException
1015     */
1016    protected function languageConversion(
1017        PageConfig $pageConfig, array $attribs, array $revision
1018    ) {
1019        $opts = $attribs['opts'];
1020        $target = $opts['updates']['variant']['html'] ??
1021            $opts['updates']['variant']['target'] ??
1022            $attribs['envOptions']['htmlVariantLanguage'];
1023        $source = $opts['updates']['variant']['wikitext'] ??
1024            $opts['updates']['variant']['source'] ?? null;
1025
1026        if ( !$target ) {
1027            throw new LocalizedHttpException( new MessageValue( "rest-target-variant-required" ), 400 );
1028        }
1029
1030        $pageIdentity = $this->tryToCreatePageIdentity( $attribs );
1031
1032        $pb = new HtmlPageBundle(
1033            $revision['html']['body'],
1034            $revision['data-parsoid']['body'] ?? null,
1035            $revision['data-mw']['body'] ?? null,
1036            $attribs['envOptions']['inputContentVersion'],
1037            $revision['html']['headers'] ?? null,
1038            $revision['contentmodel'] ?? null
1039        );
1040
1041        // XXX: DI should inject HtmlTransformFactory
1042        $languageVariantConverter = MediaWikiServices::getInstance()
1043            ->getHtmlTransformFactory()
1044            ->getLanguageVariantConverter( $pageIdentity );
1045        $languageVariantConverter->setPageConfig( $pageConfig );
1046        $httpContentLanguage = $attribs['pagelanguage' ] ?? null;
1047        if ( $httpContentLanguage ) {
1048            $languageVariantConverter->setPageLanguageOverride( $httpContentLanguage );
1049        }
1050
1051        try {
1052            $out = $languageVariantConverter->convertPageBundleVariant( $pb, $target, $source );
1053        } catch ( InvalidArgumentException $e ) {
1054            throw new LocalizedHttpException(
1055                new MessageValue( "rest-unsupported-language-conversion", [ $source ?? '(unspecified)', $target ] ),
1056                400,
1057                [ 'reason' => $e->getMessage() ]
1058            );
1059        }
1060
1061        $response = $this->getResponseFactory()->createJson( $out->responseData() );
1062        ParsoidFormatHelper::setContentType(
1063            $response, ParsoidFormatHelper::FORMAT_PAGEBUNDLE, $out->version
1064        );
1065        return $response;
1066    }
1067
    /**
     * Handle the request and return the response.
     * Declared abstract here, so each concrete handler supplies its own implementation.
     *
     * @inheritDoc
     */
    abstract public function execute(): Response;
1070
1071    /**
1072     * Validate a HtmlPageBundle against the given contentVersion, and throw
1073     * an HttpException if it does not match.
1074     * @param HtmlPageBundle $pb
1075     * @param string $contentVersion
1076     * @throws HttpException
1077     */
1078    private function validatePb( HtmlPageBundle $pb, string $contentVersion ): void {
1079        $errorMessage = '';
1080        if ( !$pb->validate( $contentVersion, $errorMessage ) ) {
1081            throw new LocalizedHttpException(
1082                new MessageValue( "rest-page-bundle-validation-error", [ $errorMessage ] ),
1083                400
1084            );
1085        }
1086    }
1087
1088    /**
1089     * @param PageConfig $page
1090     *
1091     * @return ProperPageIdentity
1092     * @throws HttpException
1093     */
1094    private function pageConfigToPageIdentity( PageConfig $page ): ProperPageIdentity {
1095        $services = MediaWikiServices::getInstance();
1096
1097        $title = $page->getLinkTarget();
1098        try {
1099            $page = $services->getPageStore()->getPageForLink( $title );
1100        } catch ( MalformedTitleException | InvalidArgumentException ) {
1101            // Note that even some well-formed links are still invalid
1102            // parameters for getPageForLink(), e.g. interwiki links or special pages.
1103            throw new HttpException(
1104                "Bad title: $title", # uses LinkTarget::__toString()
1105                400
1106            );
1107        }
1108
1109        return $page;
1110    }
1111
1112}