Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
65.99% |
229 / 347 |
|
15.38% |
2 / 13 |
CRAP | |
0.00% |
0 / 1 |
HtmlInputTransformHelper | |
65.99% |
229 / 347 |
|
15.38% |
2 / 13 |
416.64 | |
0.00% |
0 / 1 |
__construct | |
92.31% |
12 / 13 |
|
0.00% |
0 / 1 |
2.00 | |||
getParamSettings | |
0.00% |
0 / 44 |
|
0.00% |
0 / 1 |
2 | |||
normalizeParameters | |
72.73% |
16 / 22 |
|
0.00% |
0 / 1 |
21.19 | |||
init | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
initInternal | |
77.78% |
49 / 63 |
|
0.00% |
0 / 1 |
24.39 | |||
getTransform | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
setMetrics | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
setOriginal | |
78.38% |
58 / 74 |
|
0.00% |
0 / 1 |
28.35 | |||
getContent | |
72.22% |
13 / 18 |
|
0.00% |
0 / 1 |
4.34 | |||
putContent | |
100.00% |
10 / 10 |
|
100.00% |
1 / 1 |
2 | |||
fetchParserOutputFromParsoid | |
73.68% |
28 / 38 |
|
0.00% |
0 / 1 |
13.21 | |||
fetchSelserContextFromStash | |
89.36% |
42 / 47 |
|
0.00% |
0 / 1 |
5.03 | |||
throwHttpExceptionForStatus | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
6 |
1 | <?php |
2 | /** |
3 | * This program is free software; you can redistribute it and/or modify |
4 | * it under the terms of the GNU General Public License as published by |
5 | * the Free Software Foundation; either version 2 of the License, or |
6 | * (at your option) any later version. |
7 | * |
8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | * GNU General Public License for more details. |
12 | * |
13 | * You should have received a copy of the GNU General Public License along |
14 | * with this program; if not, write to the Free Software Foundation, Inc., |
15 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
16 | * http://www.gnu.org/copyleft/gpl.html |
17 | * |
18 | * @file |
19 | */ |
20 | namespace MediaWiki\Rest\Handler\Helper; |
21 | |
22 | use InvalidArgumentException; |
23 | use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface; |
24 | use MediaWiki\Content\Content; |
25 | use MediaWiki\Edit\ParsoidOutputStash; |
26 | use MediaWiki\Edit\ParsoidRenderID; |
27 | use MediaWiki\Edit\SelserContext; |
28 | use MediaWiki\Language\LanguageCode; |
29 | use MediaWiki\MainConfigNames; |
30 | use MediaWiki\Page\PageIdentity; |
31 | use MediaWiki\Page\PageLookup; |
32 | use MediaWiki\Page\PageRecord; |
33 | use MediaWiki\Page\ParserOutputAccess; |
34 | use MediaWiki\Parser\ParserOptions; |
35 | use MediaWiki\Parser\ParserOutput; |
36 | use MediaWiki\Parser\Parsoid\HtmlToContentTransform; |
37 | use MediaWiki\Parser\Parsoid\HtmlTransformFactory; |
38 | use MediaWiki\Parser\Parsoid\PageBundleParserOutputConverter; |
39 | use MediaWiki\Rest\Handler; |
40 | use MediaWiki\Rest\HttpException; |
41 | use MediaWiki\Rest\LocalizedHttpException; |
42 | use MediaWiki\Rest\ResponseInterface; |
43 | use MediaWiki\Revision\RevisionAccessException; |
44 | use MediaWiki\Revision\RevisionLookup; |
45 | use MediaWiki\Revision\RevisionRecord; |
46 | use MediaWiki\Status\Status; |
47 | use MWUnknownContentModelException; |
48 | use Wikimedia\Bcp47Code\Bcp47Code; |
49 | use Wikimedia\Message\MessageValue; |
50 | use Wikimedia\ParamValidator\ParamValidator; |
51 | use Wikimedia\Parsoid\Core\ClientError; |
52 | use Wikimedia\Parsoid\Core\PageBundle; |
53 | use Wikimedia\Parsoid\Core\ResourceLimitExceededException; |
54 | use Wikimedia\Parsoid\Parsoid; |
55 | use Wikimedia\Stats\StatsFactory; |
56 | |
57 | /** |
58 | * REST helper for converting HTML to page content source (e.g. wikitext). |
59 | * |
60 | * @since 1.40 |
61 | * |
62 | * @unstable Pending consolidation of the Parsoid extension with core code. |
63 | */ |
64 | class HtmlInputTransformHelper { |
65 | /** |
66 | * @internal |
67 | */ |
68 | public const CONSTRUCTOR_OPTIONS = [ |
69 | MainConfigNames::ParsoidCacheConfig |
70 | ]; |
71 | |
72 | /** @var PageIdentity|null */ |
73 | private $page = null; |
74 | |
75 | /** |
76 | * @var HtmlToContentTransform |
77 | */ |
78 | private $transform; |
79 | |
80 | /** |
81 | * @var array |
82 | */ |
83 | private $envOptions; |
84 | |
85 | private StatsFactory $statsFactory; |
86 | private HtmlTransformFactory $htmlTransformFactory; |
87 | private ParsoidOutputStash $parsoidOutputStash; |
88 | private ParserOutputAccess $parserOutputAccess; |
89 | private PageLookup $pageLookup; |
90 | private RevisionLookup $revisionLookup; |
91 | |
/**
 * Store the injected services and, when a page is supplied, set up the
 * HTML-to-content transform immediately.
 *
 * Constructing the helper without a page is deprecated since 1.43; in that
 * case only the services and environment options are stored here.
 *
 * @param StatsFactory $statsFactory
 * @param HtmlTransformFactory $htmlTransformFactory
 * @param ParsoidOutputStash $parsoidOutputStash
 * @param ParserOutputAccess $parserOutputAccess
 * @param PageLookup $pageLookup
 * @param RevisionLookup $revisionLookup
 * @param array $envOptions Overrides for 'outputContentVersion' and 'offsetType'
 * @param ?PageIdentity $page
 * @param array|string $body Body structure, or an HTML string
 * @param array $parameters
 * @param RevisionRecord|null $originalRevision
 * @param Bcp47Code|null $pageLanguage
 */
public function __construct(
	StatsFactory $statsFactory,
	HtmlTransformFactory $htmlTransformFactory,
	ParsoidOutputStash $parsoidOutputStash,
	ParserOutputAccess $parserOutputAccess,
	PageLookup $pageLookup,
	RevisionLookup $revisionLookup,
	array $envOptions = [],
	?PageIdentity $page = null,
	$body = '',
	array $parameters = [],
	?RevisionRecord $originalRevision = null,
	?Bcp47Code $pageLanguage = null
) {
	// Injected services.
	$this->statsFactory = $statsFactory;
	$this->htmlTransformFactory = $htmlTransformFactory;
	$this->parsoidOutputStash = $parsoidOutputStash;
	$this->parserOutputAccess = $parserOutputAccess;
	$this->pageLookup = $pageLookup;
	$this->revisionLookup = $revisionLookup;

	// Caller-supplied options take precedence; the defaults only fill gaps.
	$fallbackOptions = [
		'outputContentVersion' => Parsoid::defaultHTMLVersion(),
		'offsetType' => 'byte',
	];
	$this->envOptions = $envOptions + $fallbackOptions;

	if ( $page !== null ) {
		$this->initInternal( $page, $body, $parameters, $originalRevision, $pageLanguage );
		return;
	}

	wfDeprecated( __METHOD__ . ' without $page', '1.43' );
}
136 | |
/**
 * Describe the path and query parameters this helper understands.
 *
 * Every parameter is optional and has a default value.
 *
 * @return array Map of parameter name to parameter settings
 */
public function getParamSettings(): array {
	// JSON body schema:
	/*
	doc:
	  properties:
	    headers:
	      type: array
	      items:
	        type: string
	    body:
	      type: [ string, object ]
	  required: [ body ]

	body:
	  properties:
	    offsetType:
	      type: string
	    revid:
	      type: integer
	    renderid:
	      type: string
	    etag:
	      type: string
	    html:
	      type: [ doc, string ]
	    data-mw:
	      type: doc
	    original:
	      properties:
	        html:
	          type: doc
	        source:
	          type: doc
	        data-mw:
	          type: doc
	        data-parsoid:
	          type: doc
	  required: [ html ]
	*/

	// FUTURE: more params
	// - slot (for loading the base content)

	// Builds the settings for one optional parameter.
	$optional = static function ( string $source, string $type, $default, string $descKey ): array {
		return [
			Handler::PARAM_SOURCE => $source,
			ParamValidator::PARAM_TYPE => $type,
			ParamValidator::PARAM_DEFAULT => $default,
			ParamValidator::PARAM_REQUIRED => false,
			Handler::PARAM_DESCRIPTION => new MessageValue( $descKey ),
		];
	};

	$settings = [];

	// XXX: should we really declare this here? Or should the endpoint do this?
	//      We are not reading this property...
	$settings['title'] = $optional( 'path', 'string', '', 'rest-param-desc-html-input-title' );

	// XXX: Needed for compatibility with the parsoid transform endpoint.
	//      But revid should just be part of the info about the original data
	//      in the body.
	$settings['oldid'] = $optional( 'path', 'int', 0, 'rest-param-desc-html-input-oldid' );

	// XXX: Supported for compatibility with the parsoid transform endpoint.
	//      If given, it should be 'html' or 'pagebundle'.
	$settings['from'] = $optional( 'path', 'string', '', 'rest-param-desc-html-input-from' );

	// XXX: Supported for compatibility with the parsoid transform endpoint.
	//      Ignored.
	$settings['format'] = $optional( 'path', 'string', '', 'rest-param-desc-html-input-format' );

	// XXX: get this from the Accept header?
	$settings['contentmodel'] = $optional(
		'query', 'string', '', 'rest-param-desc-html-input-contentmodel'
	);

	// TODO: get this from Accept-Language header?!
	$settings['language'] = $optional(
		'query', 'string', '', 'rest-param-desc-html-input-language'
	);

	return $settings;
}
237 | |
/**
 * Rewrite body and parameters in place so that input in the style of the
 * legacy endpoints ends up in the shape the rest of this class reads.
 *
 * @see ParsoidHandler::getRequestAttributes
 *
 * @param array<string,mixed> &$body
 * @param array<string,mixed> &$parameters
 *
 * @throws HttpException if 'from' is given but is neither html nor pagebundle
 *
 * @return void
 */
private static function normalizeParameters( array &$body, array &$parameters ) {
	// A positive revision ID from the path is treated as if it was given in the body.
	$pathRevId = (int)( $parameters['oldid'] ?? 0 );
	if ( $pathRevId > 0 ) {
		$body['original']['revid'] = $pathRevId;
	}

	// An etag given in the body doubles as the render ID.
	// Note that we support ETag format in the renderid field.
	if ( !empty( $body['original']['etag'] ) ) {
		// @phan-suppress-next-line PhanTypeInvalidDimOffset false positive
		$body['original']['renderid'] = $body['original']['etag'];
	}

	// 'wikitext' is accepted as an alias for 'source'.
	if ( isset( $body['original']['wikitext'] ) ) {
		// @phan-suppress-next-line PhanTypeInvalidDimOffset false positive
		$body['original']['source'] = $body['original']['wikitext'];
		unset( $body['original']['wikitext'] );
	}

	// Without 'from', page bundle style input is accepted as well as full HTML.
	// With 'from', page bundle style input is only accepted for FORMAT_PAGEBUNDLE.
	$from = $parameters['from'] ?? '';
	if ( $from !== '' && $from !== ParsoidFormatHelper::FORMAT_PAGEBUNDLE ) {
		unset(
			$body['original']['data-parsoid']['body'],
			$body['original']['data-mw']['body'],
			$body['data-mw']['body']
		);

		// If 'from' is given, it must be html or pagebundle.
		if ( $from !== ParsoidFormatHelper::FORMAT_HTML ) {
			throw new LocalizedHttpException(
				new MessageValue( "rest-unsupported-transform-input", [ $from ] ), 400
			);
		}
	}

	// A content model in the body wins over the 'format' parameter.
	$bodyModel = $body['contentmodel'] ?? '';
	$formatParam = $parameters['format'] ?? '';
	if ( $bodyModel !== '' ) {
		$parameters['contentmodel'] = $bodyModel;
	} elseif ( $formatParam !== '' ) {
		$parameters['contentmodel'] = $formatParam;
	}
}
298 | |
/**
 * Deprecated entry point for supplying the page, body and parameters after
 * construction. Emits a deprecation warning, then delegates to initInternal().
 *
 * @param PageIdentity $page
 * @param array|string $body Body structure, or an HTML string
 * @param array $parameters
 * @param RevisionRecord|null $originalRevision
 * @param Bcp47Code|null $pageLanguage
 *
 * @throws HttpException
 * @deprecated since 1.43; pass arguments to constructor instead
 */
public function init(
	PageIdentity $page,
	$body,
	array $parameters,
	?RevisionRecord $originalRevision = null,
	?Bcp47Code $pageLanguage = null
) {
	wfDeprecated( __METHOD__, '1.43' );

	$this->initInternal(
		$page,
		$body,
		$parameters,
		$originalRevision,
		$pageLanguage
	);
}
319 | |
/**
 * Set up the HTML-to-content transform from the request body and parameters.
 *
 * The body is normalized first (see normalizeParameters()), then the
 * transform is created for the modified HTML and configured with the content
 * model, offset type, original rendering/revision, modified data-mw,
 * content language and original source text, as far as they are present.
 *
 * @param PageIdentity $page
 * @param array|string $body Body structure, or an HTML string
 * @param array $parameters
 * @param RevisionRecord|null $originalRevision
 * @param Bcp47Code|null $pageLanguage
 *
 * @throws HttpException with status 400 if the body contains no HTML, or if
 *   the render ID looks like an ETag but cannot be parsed as one.
 */
private function initInternal(
	PageIdentity $page,
	$body,
	array $parameters,
	?RevisionRecord $originalRevision = null,
	?Bcp47Code $pageLanguage = null
) {
	if ( is_string( $body ) ) {
		$body = [ 'html' => $body ];
	}

	self::normalizeParameters( $body, $parameters );

	$this->page = $page;

	// The HTML may be given directly, or wrapped as [ 'body' => ... ].
	$html = $body['html'] ?? null;
	if ( is_array( $html ) ) {
		$html = $html['body'] ?? null;
	}

	if ( $html === null ) {
		// Missing input is a client error, so report 400 rather than relying
		// on the exception's default status code.
		throw new LocalizedHttpException(
			new MessageValue( "rest-missing-body-field", [ 'html' ] ), 400
		);
	}

	// TODO: validate $body against a proper schema.
	$this->transform = $this->htmlTransformFactory->getHtmlToContentTransform(
		$html,
		$this->page
	);

	$this->transform->setMetrics( $this->statsFactory );

	// NOTE: Env::getContentModel will fall back to the page's recorded content model
	//       if none is set here.
	$this->transform->setOptions( [
		'contentmodel' => $parameters['contentmodel'] ?? null,
		'offsetType' => $body['offsetType'] ?? $this->envOptions['offsetType'],
	] );

	$original = $body['original'] ?? [];
	$originalRendering = null;

	if ( !isset( $original['html'] ) && !empty( $original['renderid'] ) ) {
		// The render ID may be given as a plain key or in ETag format.
		$key = $original['renderid'];
		if ( preg_match( '!^(W/)?".*"$!', $key ) ) {
			$originalRendering = ParsoidRenderID::newFromETag( $key );

			if ( !$originalRendering ) {
				throw new LocalizedHttpException( new MessageValue( "rest-bad-etag", [ $key ] ), 400 );
			}
		} else {
			$originalRendering = ParsoidRenderID::newFromKey( $key );
		}
	} elseif ( !empty( $original['html'] ) || !empty( $original['data-parsoid'] ) ) {
		// NOTE: We might have an incomplete PageBundle here, with no HTML but with data-parsoid!
		// XXX: Do we need to support that, or can that just be a 400?
		$originalRendering = new PageBundle(
			$original['html']['body'] ?? '',
			$original['data-parsoid']['body'] ?? null,
			$original['data-mw']['body'] ?? null,
			null, // will be derived from $original['html']['headers']['content-type']
			$original['html']['headers'] ?? []
		);
	}

	// An explicitly passed revision wins over a revision ID from the body.
	if ( !$originalRevision && !empty( $original['revid'] ) ) {
		$originalRevision = (int)$original['revid'];
	}

	if ( $originalRevision || $originalRendering ) {
		$this->setOriginal( $originalRevision, $originalRendering );
	} else {
		// No original data at all: only record whether the page exists.
		$pageExists = $this->page->exists();
		$this->statsFactory
			->getCounter( 'html_input_transform_total' )
			->setLabel( 'original_html_given', 'false' )
			->setLabel( 'page_exists', $pageExists ? 'true' : 'false' )
			->setLabel( 'status', 'unknown' )
			->copyToStatsdAt(
				$pageExists
					? 'html_input_transform.original_html.not_given.page_exists'
					: 'html_input_transform.original_html.not_given.page_not_exist'
			)
			->increment();
	}

	if ( isset( $body['data-mw']['body'] ) ) {
		$this->transform->setModifiedDataMW( $body['data-mw']['body'] );
	}

	// An explicitly passed language wins over the 'language' parameter.
	if ( $pageLanguage ) {
		$this->transform->setContentLanguage( $pageLanguage );
	} elseif ( isset( $parameters['language'] ) && $parameters['language'] !== '' ) {
		$pageLanguage = LanguageCode::normalizeNonstandardCodeAndWarn(
			$parameters['language']
		);
		$this->transform->setContentLanguage( $pageLanguage );
	}

	if ( isset( $original['source']['body'] ) ) {
		// XXX: do we really have to support wikitext overrides?
		$this->transform->setOriginalText( $original['source']['body'] );
	}
}
435 | |
/**
 * Expose the underlying transform so callers can supply additional context
 * through its setters.
 *
 * @return HtmlToContentTransform
 */
public function getTransform(): HtmlToContentTransform {
	$transform = $this->transform;

	return $transform;
}
443 | |
/**
 * Replace the metrics sink used by this helper and, if a transform has
 * already been created, by that transform as well.
 *
 * @note Passing a StatsdDataFactoryInterface here has been deprecated
 *   since 1.43. Such a value triggers a deprecation warning and is
 *   otherwise ignored.
 *
 * @param StatsFactory|StatsdDataFactoryInterface $statsFactory
 */
public function setMetrics( $statsFactory ) {
	if ( !( $statsFactory instanceof StatsdDataFactoryInterface ) ) {
		$this->statsFactory = $statsFactory;

		if ( $this->transform !== null ) {
			$this->transform->setMetrics( $statsFactory );
		}

		return;
	}

	// Legacy statsd factories are no longer supported: warn and keep the
	// current sink.
	wfDeprecated( __METHOD__ . ' with StatsdDataFactoryInterface', '1.43' );
}
465 | |
/**
 * Tell the transform which revision and rendering the input HTML was
 * originally based on. This is used to apply selective serialization
 * (selser), if possible.
 *
 * A ParsoidRenderID is resolved via fetchSelserContextFromStash(). If only a
 * revision is given, a rendering is requested for it via
 * fetchParserOutputFromParsoid(). A rendering passed in directly is used as-is.
 *
 * @param RevisionRecord|int|null $rev
 * @param ParsoidRenderID|PageBundle|ParserOutput|null $originalRendering
 *
 * @throws HttpException 400 if the render ID is rejected as invalid, 412 if
 *   no rendering is available for the render ID.
 */
public function setOriginal( $rev, $originalRendering ) {
	// Bumps the shared counter; page_exists is always 'unknown' in this method.
	$count = function ( string $given, string $status, string $statsdKey ): void {
		$this->statsFactory
			->getCounter( 'html_input_transform_total' )
			->setLabel( 'original_html_given', $given )
			->setLabel( 'page_exists', 'unknown' )
			->setLabel( 'status', $status )
			->copyToStatsdAt( $statsdKey )
			->increment();
	};

	if ( $originalRendering instanceof ParsoidRenderID ) {
		$renderId = $originalRendering;

		// If the client asked for a render ID, load original data from stash
		try {
			$selserContext = $this->fetchSelserContextFromStash( $renderId );
		} catch ( InvalidArgumentException $ex ) {
			$count(
				'as_renderid',
				'bad_renderid',
				'html_input_transform.original_html.given.as_renderid.bad'
			);
			throw new LocalizedHttpException(
				new MessageValue( "rest-bad-stash-key" ),
				400,
				[
					'reason' => $ex->getMessage(),
					'key' => "$renderId"
				]
			);
		}

		if ( !$selserContext ) {
			// NOTE: When the client asked for a specific stash key (resp. etag),
			//       we should fail with a 412 if we don't have the specific rendering.
			//       On the other hand, if the client only provided a base revision ID,
			//       we can re-parse and hope for the best.

			// TODO: This class should provide getETag and getLastModified methods for use by
			//       the REST endpoint, to provide proper support for conditionals.
			//       However, that requires some refactoring of how HTTP conditional checks
			//       work in the Handler base class.

			throw new LocalizedHttpException(
				new MessageValue( "rest-no-stashed-content", [ $renderId->getKey() ] ), 412
			);
		}

		// Fall back to the revision recorded in the render ID.
		$rev = $rev ?: $renderId->getRevisionID();

		$originalRendering = $selserContext->getPageBundle();
		$content = $selserContext->getContent();

		if ( $content ) {
			$this->transform->setOriginalContent( $content );
		}
	} elseif ( $originalRendering ) {
		// The rendering was handed to us directly.
		$count(
			'true',
			'verbatim',
			'html_input_transform.original_html.given.verbatim'
		);
	} elseif ( $rev ) {
		// The client provided a revision ID, but not stash key.
		// Try to get a rendering for the given revision, and use it as the basis for selser.
		// Chances are good that the resulting diff will be reasonably clean.
		// NOTE: If we don't have a revision ID, we should not attempt selser!
		$originalRendering = $this->fetchParserOutputFromParsoid( $this->page, $rev, true );

		if ( $originalRendering ) {
			$count(
				'as_revid',
				'found',
				'html_input_transform.original_html.given.as_revid.found'
			);
		} else {
			$count(
				'as_revid',
				'not_found',
				'html_input_transform.original_html.given.as_revid.not_found'
			);
		}
	}

	if ( $originalRendering instanceof ParserOutput ) {
		$originalRendering = PageBundleParserOutputConverter::pageBundleFromParserOutput( $originalRendering );

		// NOTE: Use the default if we got a ParserOutput object.
		//       Don't apply the default if we got passed a PageBundle,
		//       in that case, we want to require the version to be explicit.
		$hasContentType = isset( $originalRendering->headers['content-type'] );
		if ( $originalRendering->version === null && !$hasContentType ) {
			$originalRendering->version = Parsoid::defaultHTMLVersion();
		}
	}

	// Nothing usable to hand to the transform.
	if ( !( $originalRendering instanceof PageBundle ) ) {
		return;
	}
	$bundle = $originalRendering;

	// Schema version: the explicit version wins, else derive it from the content type.
	if ( $bundle->version !== null ) {
		$this->transform->setOriginalSchemaVersion( $bundle->version );
	} elseif ( !empty( $bundle->headers['content-type'] ) ) {
		$vOriginal = ParsoidFormatHelper::parseContentTypeHeader(
			// @phan-suppress-next-line PhanTypeArraySuspiciousNullable Silly Phan, we just checked.
			$bundle->headers['content-type']
		);

		if ( $vOriginal ) {
			$this->transform->setOriginalSchemaVersion( $vOriginal );
		}
	}

	// Revision: either the full record or just a non-zero ID.
	if ( $rev instanceof RevisionRecord ) {
		$this->transform->setOriginalRevision( $rev );
	} elseif ( is_int( $rev ) && $rev !== 0 ) {
		$this->transform->setOriginalRevisionId( $rev );
	}

	// NOTE: We might have an incomplete PageBundle here, with no HTML.
	//       PageBundle::$html is declared to not be nullable, so it would be set to the empty
	//       string if not given.
	if ( $bundle->html !== '' ) {
		$this->transform->setOriginalHtml( $bundle->html );
	}

	if ( $bundle->parsoid !== null ) {
		$this->transform->setOriginalDataParsoid( $bundle->parsoid );
	}

	if ( $bundle->mw !== null ) {
		$this->transform->setOriginalDataMW( $bundle->mw );
	}
}
603 | |
/**
 * Run the transform and return the content derived from the input HTML.
 *
 * Failures raised by the transform are translated into HTTP errors:
 * client errors and unknown content models become 400, an exceeded
 * resource limit becomes 413.
 *
 * @return Content the content derived from the input HTML.
 * @throws HttpException
 */
public function getContent(): Content {
	try {
		return $this->transform->htmlToContent();
	} catch ( ClientError $e ) {
		$reason = $e->getMessage();
		$httpError = new LocalizedHttpException(
			new MessageValue( 'rest-html-backend-error', [ $reason ] ),
			400,
			[ 'reason' => $reason ]
		);
	} catch ( ResourceLimitExceededException $e ) {
		$httpError = new LocalizedHttpException(
			new MessageValue( 'rest-resource-limit-exceeded' ),
			413,
			[ 'reason' => $e->getMessage() ]
		);
	} catch ( MWUnknownContentModelException $e ) {
		$httpError = new LocalizedHttpException(
			new MessageValue( "rest-unknown-content-model", [ $e->getModelId() ] ),
			400
		);
	}

	// Only reached when one of the catch blocks above built an error.
	throw $httpError;
}
630 | |
/**
 * Write the content derived from the input HTML into the given response,
 * and set its Content-Type header accordingly.
 *
 * @param ResponseInterface $response
 */
public function putContent( ResponseInterface $response ) {
	$content = $this->getContent();
	$serialized = $content->serialize();

	try {
		$model = $content->getModel();
		$version = $this->envOptions['outputContentVersion'];
		$contentType = ParsoidFormatHelper::getContentType( $model, $version );
	} catch ( InvalidArgumentException $e ) {
		// Parsoid doesn't know this content type, so fall back to the
		// content's own default format.
		$contentType = $content->getDefaultFormat();
	}

	$response->setHeader( 'Content-Type', $contentType );

	$stream = $response->getBody();
	$stream->write( $serialized );
}
655 | |
/**
 * Obtain Parsoid parser output for a revision of a page.
 *
 * The page is resolved to a PageRecord and an integer revision ID to a
 * revision before the lookup. Depending on $mayParse, the output is taken
 * from getParserOutput() or from getCachedParserOutput().
 *
 * @param PageIdentity $page
 * @param RevisionRecord|int $revision
 * @param bool $mayParse Whether to use getParserOutput() instead of the
 *   cache-only lookup
 *
 * @return ParserOutput|null
 * @throws HttpException 404 if the page or revision is unavailable or they
 *   do not belong together; see throwHttpExceptionForStatus() for parse failures.
 */
private function fetchParserOutputFromParsoid( PageIdentity $page, $revision, bool $mayParse ): ?ParserOutput {
	$options = ParserOptions::newFromAnon();
	$options->setUseParsoid();

	try {
		// Resolve the page identity to a full page record.
		if ( !( $page instanceof PageRecord ) ) {
			$name = "$page";
			$page = $this->pageLookup->getPageByReference( $page );
			if ( !$page ) {
				throw new RevisionAccessException(
					'Page {name} not found',
					[ 'name' => $name ]
				);
			}
		}

		// Resolve a bare revision ID to a revision record.
		if ( is_int( $revision ) ) {
			$revId = $revision;
			$revision = $this->revisionLookup->getRevisionById( $revId, 0, $page );

			if ( !$revision ) {
				throw new RevisionAccessException(
					'Revision {revId} not found',
					[ 'revId' => $revId ]
				);
			}
		}

		if ( $page->getId() !== $revision->getPageId() ) {
			throw new RevisionAccessException(
				'Revision {revId} does not belong to page {name}',
				[
					'name' => $page->getDBkey(),
					'revId' => $revision->getId()
				]
			);
		}

		if ( !$mayParse ) {
			// Cache-only lookup.
			return $this->parserOutputAccess->getCachedParserOutput(
				$page, $options, $revision
			);
		}

		try {
			$status = $this->parserOutputAccess->getParserOutput(
				$page, $options, $revision
			);
		} catch ( ClientError $e ) {
			$status = Status::newFatal( 'parsoid-client-error', $e->getMessage() );
		} catch ( ResourceLimitExceededException $e ) {
			$status = Status::newFatal( 'parsoid-resource-limit-exceeded', $e->getMessage() );
		}

		if ( !$status->isOK() ) {
			$this->throwHttpExceptionForStatus( $status );
		}

		return $status->getValue();
	} catch ( RevisionAccessException $e ) {
		// The client supplied bad revision ID, or the revision was deleted or suppressed.
		throw new LocalizedHttpException(
			new MessageValue( "rest-specified-revision-unavailable" ),
			404,
			[ 'reason' => $e->getMessage() ]
		);
	}
}
725 | |
/**
 * Look up the selser context for a render ID.
 *
 * The stash is consulted first. On a miss, the cached parser output for the
 * render ID's revision is used instead, provided its render key matches.
 * Every outcome is recorded on the 'html_input_transform_total' counter.
 *
 * @param ParsoidRenderID $renderID
 *
 * @return SelserContext|null Null if no matching rendering is available
 */
private function fetchSelserContextFromStash( $renderID ): ?SelserContext {
	$stashed = $this->parsoidOutputStash->get( $renderID );

	$counter = $this->statsFactory->getCounter( 'html_input_transform_total' );
	$record = static function ( string $status, string $statsdKey ) use ( $counter ): void {
		$counter
			->setLabels( [
				'original_html_given' => 'as_renderid',
				'page_exists' => 'unknown',
				'status' => $status
			] )
			->copyToStatsdAt( $statsdKey )
			->increment();
	};

	if ( $stashed ) {
		$record(
			'hit-stashed',
			'html_input_transform.original_html.given.as_renderid.stash_hit.found.hit'
		);
		return $stashed;
	}

	// Looks like the rendering is gone from stash (or the client sent us a bogus key).
	// Try to load it from the parser cache instead.
	// On a wiki with low edit frequency, there is a good chance that it's still there.
	try {
		$revId = $renderID->getRevisionID();
		$cachedOutput = $this->fetchParserOutputFromParsoid( $this->page, $revId, false );

		if ( !$cachedOutput ) {
			$record(
				'miss-fallback_not_found',
				'html_input_transform.original_html.given.as_renderid.stash_miss_pc_fallback.not_found.miss'
			);
			return null;
		}

		$cachedRenderID = ParsoidRenderID::newFromParserOutput( $cachedOutput );
		if ( $cachedRenderID->getKey() !== $renderID->getKey() ) {
			// It's not the correct rendering.
			$record(
				'mismatch-fallback_not_found',
				'html_input_transform.original_html.given.as_renderid.' .
					'stash_miss_pc_fallback.not_found.mismatch'
			);
			return null;
		}

		$record(
			'hit-fallback_found',
			'html_input_transform.original_html.given.as_renderid.' .
				'stash_miss_pc_fallback.found.hit'
		);

		$bundle = PageBundleParserOutputConverter::pageBundleFromParserOutput( $cachedOutput );
		return new SelserContext( $bundle, $renderID->getRevisionID() );
	} catch ( HttpException $e ) {
		$record(
			'failed-fallback_not_found',
			'html_input_transform.original_html.given.as_renderid.' .
				'stash_miss_pc_fallback.not_found.failed'
		);

		// If the revision isn't found, don't trigger a 404. Return null to trigger a 412.
		return null;
	}
}
796 | |
/**
 * Convert a failed Status into an HTTP error: 413 if it carries the
 * 'parsoid-resource-limit-exceeded' message, 400 otherwise. The status
 * HTML is attached as the reason.
 *
 * @param Status $status
 *
 * @return never
 * @throws HttpException
 */
private function throwHttpExceptionForStatus( Status $status ) {
	// TODO: make this nicer.
	$limitExceeded = $status->hasMessage( 'parsoid-resource-limit-exceeded' );

	if ( $limitExceeded ) {
		$message = new MessageValue( "rest-parsoid-resource-exceeded" );
		$httpCode = 413;
	} else {
		$message = new MessageValue( "rest-parsoid-error" );
		$httpCode = 400;
	}

	throw new LocalizedHttpException(
		$message,
		$httpCode,
		[ 'reason' => $status->getHTML() ]
	);
}
817 | |
818 | } |