Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
64.65% |
214 / 331 |
|
15.38% |
2 / 13 |
CRAP | |
0.00% |
0 / 1 |
| HtmlInputTransformHelper | |
64.65% |
214 / 331 |
|
15.38% |
2 / 13 |
456.73 | |
0.00% |
0 / 1 |
| __construct | |
92.31% |
12 / 13 |
|
0.00% |
0 / 1 |
2.00 | |||
| getParamSettings | |
0.00% |
0 / 44 |
|
0.00% |
0 / 1 |
2 | |||
| normalizeParameters | |
72.73% |
16 / 22 |
|
0.00% |
0 / 1 |
21.19 | |||
| init | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
| initInternal | |
79.10% |
53 / 67 |
|
0.00% |
0 / 1 |
25.02 | |||
| getTransform | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| setMetrics | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
| setOriginal | |
72.22% |
52 / 72 |
|
0.00% |
0 / 1 |
34.34 | |||
| getContent | |
72.22% |
13 / 18 |
|
0.00% |
0 / 1 |
4.34 | |||
| putContent | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
2 | |||
| fetchParserOutputFromParsoid | |
73.68% |
28 / 38 |
|
0.00% |
0 / 1 |
13.21 | |||
| fetchSelserContextFromStash | |
90.62% |
29 / 32 |
|
0.00% |
0 / 1 |
5.02 | |||
| throwHttpExceptionForStatus | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
6 | |||
| 1 | <?php |
| 2 | /** |
| 3 | * @license GPL-2.0-or-later |
| 4 | * @file |
| 5 | */ |
| 6 | namespace MediaWiki\Rest\Handler\Helper; |
| 7 | |
| 8 | use InvalidArgumentException; |
| 9 | use MediaWiki\Content\Content; |
| 10 | use MediaWiki\Edit\ParsoidOutputStash; |
| 11 | use MediaWiki\Edit\ParsoidRenderID; |
| 12 | use MediaWiki\Edit\SelserContext; |
| 13 | use MediaWiki\Exception\MWUnknownContentModelException; |
| 14 | use MediaWiki\Language\LanguageCode; |
| 15 | use MediaWiki\MainConfigNames; |
| 16 | use MediaWiki\Page\PageIdentity; |
| 17 | use MediaWiki\Page\PageLookup; |
| 18 | use MediaWiki\Page\PageRecord; |
| 19 | use MediaWiki\Page\ParserOutputAccess; |
| 20 | use MediaWiki\Parser\ParserOptions; |
| 21 | use MediaWiki\Parser\ParserOutput; |
| 22 | use MediaWiki\Parser\Parsoid\HtmlToContentTransform; |
| 23 | use MediaWiki\Parser\Parsoid\HtmlTransformFactory; |
| 24 | use MediaWiki\Parser\Parsoid\PageBundleParserOutputConverter; |
| 25 | use MediaWiki\Rest\Handler; |
| 26 | use MediaWiki\Rest\HttpException; |
| 27 | use MediaWiki\Rest\LocalizedHttpException; |
| 28 | use MediaWiki\Rest\ResponseInterface; |
| 29 | use MediaWiki\Revision\RevisionAccessException; |
| 30 | use MediaWiki\Revision\RevisionLookup; |
| 31 | use MediaWiki\Revision\RevisionRecord; |
| 32 | use MediaWiki\Status\Status; |
| 33 | use Wikimedia\Bcp47Code\Bcp47Code; |
| 34 | use Wikimedia\Message\MessageValue; |
| 35 | use Wikimedia\ParamValidator\ParamValidator; |
| 36 | use Wikimedia\Parsoid\Core\ClientError; |
| 37 | use Wikimedia\Parsoid\Core\HtmlPageBundle; |
| 38 | use Wikimedia\Parsoid\Core\ResourceLimitExceededException; |
| 39 | use Wikimedia\Parsoid\Parsoid; |
| 40 | use Wikimedia\Stats\StatsFactory; |
| 41 | |
| 42 | /** |
| 43 | * REST helper for converting HTML to page content source (e.g. wikitext). |
| 44 | * |
| 45 | * @since 1.40 |
| 46 | * |
| 47 | * @unstable Pending consolidation of the Parsoid extension with core code. |
| 48 | */ |
| 49 | class HtmlInputTransformHelper { |
| 50 | /** |
| 51 | * @internal |
| 52 | */ |
| 53 | public const CONSTRUCTOR_OPTIONS = [ |
| 54 | MainConfigNames::ParsoidCacheConfig |
| 55 | ]; |
| 56 | |
| 57 | /** @var PageIdentity|null */ |
| 58 | private $page = null; |
| 59 | |
| 60 | /** |
| 61 | * @var HtmlToContentTransform |
| 62 | */ |
| 63 | private $transform; |
| 64 | |
| 65 | /** |
| 66 | * @var array |
| 67 | */ |
| 68 | private $envOptions; |
| 69 | |
| 70 | private StatsFactory $statsFactory; |
| 71 | private HtmlTransformFactory $htmlTransformFactory; |
| 72 | private ParsoidOutputStash $parsoidOutputStash; |
| 73 | private ParserOutputAccess $parserOutputAccess; |
| 74 | private PageLookup $pageLookup; |
| 75 | private RevisionLookup $revisionLookup; |
| 76 | |
/**
 * Store the injected services and, if a page is supplied, immediately prepare
 * the HTML-to-content transform for it.
 *
 * Constructing the helper without $page is deprecated since 1.43. In that case
 * no transform is created here; $body, $parameters, $originalRevision and
 * $pageLanguage are ignored.
 *
 * @param StatsFactory $statsFactory
 * @param HtmlTransformFactory $htmlTransformFactory
 * @param ParsoidOutputStash $parsoidOutputStash
 * @param ParserOutputAccess $parserOutputAccess
 * @param PageLookup $pageLookup
 * @param RevisionLookup $revisionLookup
 * @param array $envOptions Overrides for the defaults 'outputContentVersion'
 *   and 'offsetType'; values given here take precedence.
 * @param ?PageIdentity $page
 * @param array|string $body Body structure, or an HTML string
 * @param array $parameters
 * @param RevisionRecord|null $originalRevision
 * @param Bcp47Code|null $pageLanguage
 *
 * @throws HttpException If initialization for the given page rejects the input
 */
public function __construct(
	StatsFactory $statsFactory,
	HtmlTransformFactory $htmlTransformFactory,
	ParsoidOutputStash $parsoidOutputStash,
	ParserOutputAccess $parserOutputAccess,
	PageLookup $pageLookup,
	RevisionLookup $revisionLookup,
	array $envOptions = [],
	?PageIdentity $page = null,
	$body = '',
	array $parameters = [],
	?RevisionRecord $originalRevision = null,
	?Bcp47Code $pageLanguage = null
) {
	// Injected services.
	$this->statsFactory = $statsFactory;
	$this->htmlTransformFactory = $htmlTransformFactory;
	$this->parsoidOutputStash = $parsoidOutputStash;
	$this->parserOutputAccess = $parserOutputAccess;
	$this->pageLookup = $pageLookup;
	$this->revisionLookup = $revisionLookup;

	// Array union: keys already present in $envOptions win over the defaults.
	$envDefaults = [
		'outputContentVersion' => Parsoid::defaultHTMLVersion(),
		'offsetType' => 'byte',
	];
	$this->envOptions = $envOptions + $envDefaults;

	if ( $page !== null ) {
		$this->initInternal( $page, $body, $parameters, $originalRevision, $pageLanguage );
		return;
	}

	wfDeprecated( __METHOD__ . ' without $page', '1.43' );
}
| 121 | |
/**
 * Describe the path and query parameters this helper understands.
 *
 * All parameters are optional. 'oldid' defaults to 0; every other
 * parameter defaults to the empty string.
 *
 * @return array[] Parameter settings, keyed by parameter name
 */
public function getParamSettings(): array {
	// JSON body schema:
	/*
	doc:
		properties:
			headers:
				type: array
				items:
					type: string
			body:
				type: [ string, object ]
		required: [ body ]

	body:
		properties:
			offsetType:
				type: string
			revid:
				type: integer
			renderid:
				type: string
			etag:
				type: string
			html:
				type: [ doc, string ]
			data-mw:
				type: doc
			original:
				properties:
					html:
						type: doc
					source:
						type: doc
					data-mw:
						type: doc
					data-parsoid:
						type: doc
		required: [ html ]
	*/

	// FUTURE: more params
	// - slot (for loading the base content)

	// Every parameter shares the same shape: optional, with a default and a description message.
	$optional = static fn ( string $source, string $type, $default, string $descriptionKey ): array => [
		Handler::PARAM_SOURCE => $source,
		ParamValidator::PARAM_TYPE => $type,
		ParamValidator::PARAM_DEFAULT => $default,
		ParamValidator::PARAM_REQUIRED => false,
		Handler::PARAM_DESCRIPTION => new MessageValue( $descriptionKey )
	];

	return [
		// XXX: should we really declare this here? Or should the endpoint do this?
		//      We are not reading this property...
		'title' => $optional( 'path', 'string', '', 'rest-param-desc-html-input-title' ),

		// XXX: Needed for compatibility with the parsoid transform endpoint.
		//      But revid should just be part of the info about the original data
		//      in the body.
		'oldid' => $optional( 'path', 'int', 0, 'rest-param-desc-html-input-oldid' ),

		// XXX: Supported for compatibility with the parsoid transform endpoint.
		//      If given, it should be 'html' or 'pagebundle'.
		'from' => $optional( 'path', 'string', '', 'rest-param-desc-html-input-from' ),

		// XXX: Supported for compatibility with the parsoid transform endpoint.
		//      Ignored.
		'format' => $optional( 'path', 'string', '', 'rest-param-desc-html-input-format' ),

		// XXX: get this from the Accept header?
		'contentmodel' => $optional( 'query', 'string', '', 'rest-param-desc-html-input-contentmodel' ),

		// TODO: get this from Accept-Language header?!
		'language' => $optional( 'query', 'string', '', 'rest-param-desc-html-input-language' )
	];
}
| 219 | |
/**
 * Rewrite the request body and parameters in place so that input in the style
 * of the legacy endpoints looks like the structure the rest of this class expects.
 *
 * - A positive 'oldid' parameter becomes $body['original']['revid'].
 * - A non-empty $body['original']['etag'] is copied to 'renderid'.
 * - $body['original']['wikitext'] is renamed to 'source'.
 * - If 'from' is given and is not the page bundle format, page bundle data
 *   (data-parsoid / data-mw bodies) is dropped from the body.
 * - 'contentmodel' is taken from the body, or else from the 'format' parameter.
 *
 * @see ParsoidHandler::getRequestAttributes
 *
 * @param array<string,mixed> &$body
 * @param array<string,mixed> &$parameters
 *
 * @throws HttpException With status 400 if 'from' is given but is neither the
 *   HTML nor the page bundle format.
 *
 * @return void
 */
private static function normalizeParameters( array &$body, array &$parameters ) {
	// If the revision ID is given in the path, pretend it was given in the body.
	$pathRevId = (int)( $parameters['oldid'] ?? 0 );
	if ( $pathRevId > 0 ) {
		$body['original']['revid'] = $pathRevId;
	}

	// If an etag is given in the body, use it as the render ID.
	// Note that we support ETag format in the renderid field.
	// @phan-suppress-next-line PhanRedundantCondition False positive
	if ( !empty( $body['original']['etag'] ) ) {
		// @phan-suppress-next-line PhanTypeInvalidDimOffset false positive
		$body['original']['renderid'] = $body['original']['etag'];
	}

	// Accept 'wikitext' as an alias for 'source'.
	// @phan-suppress-next-line PhanImpossibleCondition False positive
	if ( isset( $body['original']['wikitext'] ) ) {
		// @phan-suppress-next-line PhanTypeInvalidDimOffset false positive
		$body['original']['source'] = $body['original']['wikitext'];
		unset( $body['original']['wikitext'] );
	}

	// An unset or empty 'from' means: accept page bundle style input as well as full HTML.
	$from = $parameters['from'] ?? '';
	if ( $from !== '' && $from !== ParsoidFormatHelper::FORMAT_PAGEBUNDLE ) {
		// Page bundle style input is only accepted when 'from' is FORMAT_PAGEBUNDLE.
		unset( $body['original']['data-parsoid']['body'] );
		unset( $body['original']['data-mw']['body'] );
		unset( $body['data-mw']['body'] );

		// If 'from' is given, it must be html or pagebundle.
		if ( $from !== ParsoidFormatHelper::FORMAT_HTML ) {
			throw new LocalizedHttpException(
				new MessageValue( "rest-unsupported-transform-input", [ $from ] ), 400
			);
		}
	}

	// The content model in the body wins over the legacy 'format' path parameter.
	$bodyModel = $body['contentmodel'] ?? '';
	$pathFormat = $parameters['format'] ?? '';
	if ( $bodyModel !== '' ) {
		$parameters['contentmodel'] = $bodyModel;
	} elseif ( $pathFormat !== '' ) {
		$parameters['contentmodel'] = $pathFormat;
	}
}
| 282 | |
/**
 * Initialize the helper for the given page and request data.
 *
 * Emits a deprecation warning and then delegates to initInternal() with the
 * arguments passed through unchanged.
 *
 * @param PageIdentity $page
 * @param array|string $body Body structure, or an HTML string
 * @param array $parameters
 * @param RevisionRecord|null $originalRevision
 * @param Bcp47Code|null $pageLanguage
 *
 * @throws HttpException
 * @deprecated since 1.43; pass arguments to constructor instead
 */
public function init(
	PageIdentity $page,
	$body,
	array $parameters,
	?RevisionRecord $originalRevision = null,
	?Bcp47Code $pageLanguage = null
) {
	wfDeprecated( __METHOD__, '1.43' );
	$this->initInternal( $page, $body, $parameters, $originalRevision, $pageLanguage );
}
| 303 | |
/**
 * Create and configure the HTML-to-content transform for the given page
 * from the request body and parameters.
 *
 * The body is first normalized via normalizeParameters(). The modified HTML is
 * read from $body['html'], which may be a plain string or a structure with a
 * 'body' key. Information about the original rendering (render ID / ETag,
 * inline page bundle, revision ID, source text) is read from $body['original'].
 *
 * @param PageIdentity $page
 * @param array|string $body Body structure, or an HTML string
 * @param array $parameters Reads 'contentmodel' and 'language' directly;
 *   'oldid', 'from' and 'format' are consumed by normalizeParameters().
 * @param RevisionRecord|null $originalRevision If not given, $body['original']['revid'] is used.
 * @param Bcp47Code|null $pageLanguage If not given, the 'language' parameter is used.
 *
 * @throws HttpException With status 400 if the body contains no HTML, or if the
 *   render ID / ETag given in the body is malformed. May also propagate
 *   exceptions from normalizeParameters() and setOriginal().
 */
private function initInternal(
	PageIdentity $page,
	$body,
	array $parameters,
	?RevisionRecord $originalRevision = null,
	?Bcp47Code $pageLanguage = null
) {
	if ( is_string( $body ) ) {
		$body = [ 'html' => $body ];
	}

	self::normalizeParameters( $body, $parameters );

	$this->page = $page;

	// The HTML is either a plain string, or a structure carrying the string under 'body'.
	$html = $body['html'] ?? null;
	if ( is_array( $html ) ) {
		$html = $html['body'] ?? null;
	}

	if ( $html === null ) {
		// A missing field is the client's fault: pass 400 explicitly rather than
		// relying on the exception's default status code.
		throw new LocalizedHttpException(
			new MessageValue( "rest-missing-body-field", [ 'html' ] ), 400
		);
	}

	// TODO: validate $body against a proper schema.
	$this->transform = $this->htmlTransformFactory->getHtmlToContentTransform(
		$html,
		$this->page
	);

	$this->transform->setMetrics( $this->statsFactory );

	// NOTE: Env::getContentModel will fall back to the page's recorded content model
	//       if none is set here.
	$this->transform->setOptions( [
		'contentmodel' => $parameters['contentmodel'] ?? null,
		'offsetType' => $body['offsetType'] ?? $this->envOptions['offsetType'],
	] );

	$original = $body['original'] ?? [];
	$originalRendering = null;

	if ( !isset( $original['html'] ) && !empty( $original['renderid'] ) ) {
		// No inline HTML, but a reference to a stashed rendering.
		$key = $original['renderid'];
		if ( preg_match( '!^(W/)?".*"$!', $key ) ) {
			// The key is quoted (optionally with a weak-validator prefix): treat it as an ETag.
			$originalRendering = ParsoidRenderID::newFromETag( $key );

			if ( !$originalRendering ) {
				throw new LocalizedHttpException( new MessageValue( "rest-bad-etag", [ $key ] ), 400 );
			}
		} else {
			try {
				$originalRendering = ParsoidRenderID::newFromKey( $key );
			} catch ( InvalidArgumentException ) {
				throw new LocalizedHttpException(
					new MessageValue( 'rest-parsoid-bad-render-id', [ $key ] ),
					400
				);
			}
		}
	} elseif ( !empty( $original['html'] ) || !empty( $original['data-parsoid'] ) ) {
		// NOTE: We might have an incomplete HtmlPageBundle here, with no HTML but with data-parsoid!
		// XXX: Do we need to support that, or can that just be a 400?
		$originalRendering = HtmlPageBundle::newFromJsonArray( [
			'html' => $original['html']['body'] ?? '',
			'parsoid' => $original['data-parsoid']['body'] ?? null,
			'mw' => $original['data-mw']['body'] ?? null,
			'counters' => $original['counters']['body'] ?? null,
			'version' => null, // will be derived from $original['html']['headers']['content-type']
			'headers' => $original['html']['headers'] ?? []
		] );
	}

	if ( !$originalRevision && !empty( $original['revid'] ) ) {
		$originalRevision = (int)$original['revid'];
	}

	if ( $originalRevision || $originalRendering ) {
		$this->setOriginal( $originalRevision, $originalRendering );
	} else {
		// Nothing is known about the original; just record whether the page exists.
		$this->statsFactory
			->getCounter( 'html_input_transform_total' )
			->setLabel( 'original_html_given', 'false' )
			->setLabel( 'page_exists', $this->page->exists() ? 'true' : 'false' )
			->setLabel( 'status', 'unknown' )
			->increment();
	}

	if ( isset( $body['data-mw']['body'] ) ) {
		$this->transform->setModifiedDataMW( $body['data-mw']['body'] );
	}

	// An explicitly passed page language wins over the 'language' request parameter.
	if ( $pageLanguage ) {
		$this->transform->setContentLanguage( $pageLanguage );
	} elseif ( isset( $parameters['language'] ) && $parameters['language'] !== '' ) {
		$pageLanguage = LanguageCode::normalizeNonstandardCodeAndWarn(
			$parameters['language']
		);
		$this->transform->setContentLanguage( $pageLanguage );
	}

	if ( isset( $original['source']['body'] ) ) {
		// XXX: do we really have to support wikitext overrides?
		$this->transform->setOriginalText( $original['source']['body'] );
	}
}
| 425 | |
/**
 * Return the HtmlToContentTransform object, so additional context can be provided
 * by calling setters on it.
 *
 * NOTE: The transform is only assigned in initInternal(). If the helper was
 * constructed without a page and init() has not been called since, there is
 * no transform yet and the declared return type cannot be satisfied.
 *
 * @return HtmlToContentTransform
 */
public function getTransform(): HtmlToContentTransform {
	return $this->transform;
}
| 432 | |
/**
 * Replace the metrics sink used by this helper and, if a transform has
 * already been created, by that transform as well.
 *
 * @param StatsFactory $statsFactory
 */
public function setMetrics( StatsFactory $statsFactory ) {
	$this->statsFactory = $statsFactory;

	// The transform does not exist yet if the helper was constructed without a page.
	$this->transform?->setMetrics( $statsFactory );
}
| 443 | |
/**
 * Supply information about the revision and rendering that was the original basis of
 * the input HTML. This is used to apply selective serialization (selser), if possible.
 *
 * Depending on what is given:
 * - a ParsoidRenderID: the original rendering is loaded from the stash (with a
 *   parser cache fallback);
 * - no rendering but a revision: a rendering of that revision is requested;
 * - an HtmlPageBundle or ParserOutput: it is used as given.
 *
 * If no page bundle could be determined, the transform is left without
 * original data.
 *
 * @param RevisionRecord|int|null $rev
 * @param ParsoidRenderID|HtmlPageBundle|ParserOutput|null $originalRendering
 *
 * @throws HttpException With status 400 if the render ID is rejected by the stash
 *   lookup, with status 412 if the requested rendering is no longer available,
 *   or as thrown while fetching a rendering for the given revision.
 */
public function setOriginal( $rev, $originalRendering ) {
	if ( $originalRendering instanceof ParsoidRenderID ) {
		$renderId = $originalRendering;

		// If the client asked for a render ID, load original data from stash
		try {
			$selserContext = $this->fetchSelserContextFromStash( $renderId );
		} catch ( InvalidArgumentException $ex ) {
			$this->statsFactory
				->getCounter( 'html_input_transform_total' )
				->setLabel( 'original_html_given', 'as_renderid' )
				->setLabel( 'page_exists', 'unknown' )
				->setLabel( 'status', 'bad_renderid' )
				->increment();
			throw new LocalizedHttpException( new MessageValue( "rest-bad-stash-key" ),
				400,
				[
					'reason' => $ex->getMessage(),
					'key' => "$renderId"
				]
			);
		}

		if ( !$selserContext ) {
			// NOTE: When the client asked for a specific stash key (resp. etag),
			//       we should fail with a 412 if we don't have the specific rendering.
			//       On the other hand, if the client only provided a base revision ID,
			//       we can re-parse and hope for the best.

			// TODO: This class should provide getETag and getLastModified methods for use by
			//       the REST endpoint, to provide proper support for conditionals.
			//       However, that requires some refactoring of how HTTP conditional checks
			//       work in the Handler base class.

			throw new LocalizedHttpException(
				new MessageValue( "rest-no-stashed-content", [ $renderId->getKey() ] ), 412
			);
		}

		// Fall back to the revision recorded in the render ID if the caller gave none.
		$rev = $rev ?: $renderId->getRevisionID();

		$originalRendering = $selserContext->getPageBundle();

		$stashedContent = $selserContext->getContent();
		if ( $stashedContent ) {
			$this->transform->setOriginalContent( $stashedContent );
		}
	} elseif ( !$originalRendering && $rev ) {
		// The client provided a revision ID, but not stash key.
		// Try to get a rendering for the given revision, and use it as the basis for selser.
		// Chances are good that the resulting diff will be reasonably clean.
		// NOTE: If we don't have a revision ID, we should not attempt selser!
		$originalRendering = $this->fetchParserOutputFromParsoid( $this->page, $rev, true );

		$this->statsFactory->getCounter( 'html_input_transform_total' )
			->setLabel( 'original_html_given', 'as_revid' )
			->setLabel( 'page_exists', 'unknown' )
			->setLabel( 'status', $originalRendering ? 'found' : 'not_found' )
			->increment();
	} elseif ( $originalRendering ) {
		// The caller handed us the original rendering directly.
		$this->statsFactory->getCounter( 'html_input_transform_total' )
			->setLabel( 'original_html_given', 'true' )
			->setLabel( 'page_exists', 'unknown' )
			->setLabel( 'status', 'verbatim' )
			->increment();
	}

	if ( $originalRendering instanceof ParserOutput ) {
		$originalRendering = PageBundleParserOutputConverter::pageBundleFromParserOutput( $originalRendering );

		// NOTE: Use the default if we got a ParserOutput object.
		//       Don't apply the default if we got passed a HtmlPageBundle,
		//       in that case, we want to require the version to be explicit.
		$hasContentType = isset( $originalRendering->headers['content-type'] );
		if ( $originalRendering->version === null && !$hasContentType ) {
			$originalRendering->version = Parsoid::defaultHTMLVersion();
		}
	}

	if ( !$originalRendering instanceof HtmlPageBundle ) {
		// No usable original rendering; nothing more to tell the transform.
		return;
	}

	// Prefer the explicit version; otherwise try to derive it from the content-type header.
	if ( $originalRendering->version !== null ) {
		$this->transform->setOriginalSchemaVersion( $originalRendering->version );
	} elseif ( !empty( $originalRendering->headers['content-type'] ) ) {
		$headerVersion = ParsoidFormatHelper::parseContentTypeHeader(
			// @phan-suppress-next-line PhanTypeArraySuspiciousNullable Silly Phan, we just checked.
			$originalRendering->headers['content-type']
		);

		if ( $headerVersion ) {
			$this->transform->setOriginalSchemaVersion( $headerVersion );
		}
	}

	if ( $rev instanceof RevisionRecord ) {
		$this->transform->setOriginalRevision( $rev );
	} elseif ( is_int( $rev ) && $rev !== 0 ) {
		$this->transform->setOriginalRevisionId( $rev );
	}

	// NOTE: We might have an incomplete HtmlPageBundle here, with no HTML.
	//       HtmlPageBundle::$html is declared to not be nullable, so it would be set to the empty
	//       string if not given.
	if ( $originalRendering->html !== '' ) {
		$this->transform->setOriginalHtml( $originalRendering->html );
	}

	if ( $originalRendering->parsoid !== null ) {
		$this->transform->setOriginalDataParsoid( $originalRendering->parsoid );
	}

	if ( $originalRendering->mw !== null ) {
		$this->transform->setOriginalDataMW( $originalRendering->mw );
	}
}
| 579 | |
/**
 * Run the transform and return the resulting content.
 *
 * Failures of the transform are translated into HTTP errors:
 * ClientError and MWUnknownContentModelException become 400,
 * ResourceLimitExceededException becomes 413.
 *
 * @return Content the content derived from the input HTML.
 * @throws HttpException
 */
public function getContent(): Content {
	try {
		$content = $this->transform->htmlToContent();
	} catch ( ClientError $clientError ) {
		$reason = $clientError->getMessage();
		throw new LocalizedHttpException(
			new MessageValue( 'rest-html-backend-error', [ $reason ] ),
			400,
			[ 'reason' => $reason ]
		);
	} catch ( ResourceLimitExceededException $limitError ) {
		throw new LocalizedHttpException(
			new MessageValue( 'rest-resource-limit-exceeded' ),
			413,
			[ 'reason' => $limitError->getMessage() ]
		);
	} catch ( MWUnknownContentModelException $modelError ) {
		throw new LocalizedHttpException(
			new MessageValue( "rest-unknown-content-model", [ $modelError->getModelId() ] ),
			400
		);
	}

	return $content;
}
| 606 | |
/**
 * Write the content derived from the input HTML into the given response.
 * The serialized content becomes the response body, and the Content-Type
 * header is set to match it.
 *
 * @param ResponseInterface $response
 * @throws HttpException If the content cannot be derived, see getContent()
 */
public function putContent( ResponseInterface $response ) {
	$content = $this->getContent();
	$serialized = $content->serialize();

	try {
		$mimeType = ParsoidFormatHelper::getContentType(
			$content->getModel(),
			$this->envOptions['outputContentVersion']
		);
	} catch ( InvalidArgumentException ) {
		// If Parsoid doesn't know the content type,
		// ask the ContentHandler!
		$mimeType = $content->getDefaultFormat();
	}

	$response->setHeader( 'Content-Type', $mimeType );

	$stream = $response->getBody();
	$stream->write( $serialized );
}
| 629 | |
/**
 * Get a Parsoid rendering of the given revision of the given page.
 *
 * @param PageIdentity $page Resolved to a PageRecord via the page lookup if necessary.
 * @param RevisionRecord|int $revision A revision, or a revision ID to be looked up.
 * @param bool $mayParse If true, the output is requested via
 *   ParserOutputAccess::getParserOutput(); if false, only
 *   ParserOutputAccess::getCachedParserOutput() is consulted.
 *
 * @return ParserOutput|null
 * @throws HttpException With status 404 if the page or revision cannot be found
 *   or the revision does not belong to the page; otherwise as raised by
 *   throwHttpExceptionForStatus() when generating the output fails.
 */
private function fetchParserOutputFromParsoid( PageIdentity $page, $revision, bool $mayParse ): ?ParserOutput {
	$options = ParserOptions::newFromAnon();
	$options->setUseParsoid();

	try {
		if ( !$page instanceof PageRecord ) {
			// Remember the name for the error message before $page is overwritten.
			$pageName = "$page";
			$page = $this->pageLookup->getPageByReference( $page );
			if ( !$page ) {
				throw new RevisionAccessException( 'Page {name} not found',
					[ 'name' => $pageName ] );
			}
		}

		if ( is_int( $revision ) ) {
			$requestedId = $revision;
			$revision = $this->revisionLookup->getRevisionById( $requestedId, 0, $page );

			if ( !$revision ) {
				throw new RevisionAccessException( 'Revision {revId} not found',
					[ 'revId' => $requestedId ] );
			}
		}

		if ( $page->getId() !== $revision->getPageId() ) {
			throw new RevisionAccessException( 'Revision {revId} does not belong to page {name}',
				[ 'name' => $page->getDBkey(),
					'revId' => $revision->getId() ] );
		}

		if ( !$mayParse ) {
			// Cache-only lookup.
			return $this->parserOutputAccess->getCachedParserOutput(
				$page, $options, $revision
			);
		}

		try {
			$status = $this->parserOutputAccess->getParserOutput(
				$page, $options, $revision
			);
		} catch ( ClientError $e ) {
			$status = Status::newFatal( 'parsoid-client-error', $e->getMessage() );
		} catch ( ResourceLimitExceededException $e ) {
			$status = Status::newFatal( 'parsoid-resource-limit-exceeded', $e->getMessage() );
		}

		if ( !$status->isOK() ) {
			$this->throwHttpExceptionForStatus( $status );
		}

		return $status->getValue();
	} catch ( RevisionAccessException $e ) {
		// The client supplied bad revision ID, or the revision was deleted or suppressed.
		throw new LocalizedHttpException( new MessageValue( "rest-specified-revision-unavailable" ),
			404,
			[ 'reason' => $e->getMessage() ]
		);
	}
}
| 699 | |
/**
 * Load the selser context for the given render ID.
 *
 * The stash is consulted first. On a miss, the cached parser output for the
 * revision named by the render ID is used instead, provided its render key
 * matches the requested one. Every outcome is counted in the
 * 'html_input_transform_total' metric.
 *
 * @param ParsoidRenderID $renderID
 *
 * @return SelserContext|null Null if the requested rendering is not available.
 */
private function fetchSelserContextFromStash( $renderID ): ?SelserContext {
	$selserContext = $this->parsoidOutputStash->get( $renderID );

	$counter = $this->statsFactory->getCounter( 'html_input_transform_total' );
	// All outcomes share the same labels and differ only in 'status'.
	$record = static function ( string $status ) use ( $counter ): void {
		$counter->setLabels( [
			'original_html_given' => 'as_renderid',
			'page_exists' => 'unknown',
			'status' => $status
		] )->increment();
	};

	if ( $selserContext ) {
		$record( 'hit-stashed' );
		return $selserContext;
	}

	// Looks like the rendering is gone from stash (or the client sent us a bogus key).
	// Try to load it from the parser cache instead.
	// On a wiki with low edit frequency, there is a good chance that it's still there.
	try {
		$cachedOutput = $this->fetchParserOutputFromParsoid( $this->page, $renderID->getRevisionID(), false );

		if ( !$cachedOutput ) {
			$record( 'miss-fallback_not_found' );
			return null;
		}

		$cachedRenderID = ParsoidRenderID::newFromParserOutput( $cachedOutput );
		if ( $cachedRenderID->getKey() !== $renderID->getKey() ) {
			// It's not the correct rendering.
			$record( 'mismatch-fallback_not_found' );
			return null;
		}

		$record( 'hit-fallback_found' );

		$pageBundle = PageBundleParserOutputConverter::pageBundleFromParserOutput( $cachedOutput );
		return new SelserContext( $pageBundle, $renderID->getRevisionID() );
	} catch ( HttpException ) {
		// If the revision isn't found, don't trigger a 404. Return null to trigger a 412.
		$record( 'failed-fallback_not_found' );
		return null;
	}
}
| 755 | |
/**
 * Convert a failed Status into an HTTP error. Always throws.
 *
 * A status carrying the 'parsoid-resource-limit-exceeded' message becomes a 413;
 * anything else becomes a 400. The status rendered as HTML is attached as 'reason'.
 *
 * @param Status $status
 *
 * @return never
 * @throws HttpException
 */
private function throwHttpExceptionForStatus( Status $status ) {
	// TODO: make this nicer.
	[ $messageKey, $httpCode ] = $status->hasMessage( 'parsoid-resource-limit-exceeded' )
		? [ "rest-parsoid-resource-exceeded", 413 ]
		: [ "rest-parsoid-error", 400 ];

	throw new LocalizedHttpException( new MessageValue( $messageKey ),
		$httpCode,
		[ 'reason' => $status->getHTML() ]
	);
}
| 776 | |
| 777 | } |