Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
86.98% |
167 / 192 |
|
72.97% |
27 / 37 |
CRAP | |
0.00% |
0 / 1 |
HtmlToContentTransform | |
86.98% |
167 / 192 |
|
72.97% |
27 / 37 |
106.49 | |
0.00% |
0 / 1 |
__construct | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
setMetrics | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
incrementMetrics | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
2 | |||
setOptions | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
setOriginalRevision | |
66.67% |
4 / 6 |
|
0.00% |
0 / 1 |
3.33 | |||
setOriginalRevisionId | |
60.00% |
3 / 5 |
|
0.00% |
0 / 1 |
3.58 | |||
setContentLanguage | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
setOriginalText | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
setOriginalContent | |
66.67% |
4 / 6 |
|
0.00% |
0 / 1 |
3.33 | |||
validatePageBundle | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
3 | |||
setModifiedDataMW | |
50.00% |
2 / 4 |
|
0.00% |
0 / 1 |
2.50 | |||
setOriginalSchemaVersion | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
setOriginalHtml | |
50.00% |
2 / 4 |
|
0.00% |
0 / 1 |
2.50 | |||
setOriginalDataMW | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
3.07 | |||
setOriginalDataParsoid | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
getPageConfig | |
100.00% |
22 / 22 |
|
100.00% |
1 / 1 |
6 | |||
getModifiedHtmlSize | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getModifiedDocumentRaw | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
getModifiedDocument | |
100.00% |
5 / 5 |
|
100.00% |
1 / 1 |
2 | |||
hasOriginalHtml | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
hasOriginalDataParsoid | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getOriginalHtml | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
6 | |||
parseHTML | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getOriginalBody | |
64.71% |
11 / 17 |
|
0.00% |
0 / 1 |
6.10 | |||
getOriginalSchemaVersion | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
getSchemaVersion | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
getOriginalRevisionId | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
knowsOriginalContent | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
3 | |||
getContentModel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
getOffsetType | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
needsDowngrade | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
downgradeOriginalData | |
83.33% |
15 / 18 |
|
0.00% |
0 / 1 |
5.12 | |||
applyPageBundle | |
100.00% |
11 / 11 |
|
100.00% |
1 / 1 |
5 | |||
getSelserData | |
80.00% |
8 / 10 |
|
0.00% |
0 / 1 |
5.20 | |||
getContentHandler | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
htmlToContent | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
htmlToText | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
3 |
1 | <?php |
2 | |
3 | namespace MediaWiki\Parser\Parsoid; |
4 | |
5 | use Composer\Semver\Semver; |
6 | use Content; |
7 | use ContentHandler; |
8 | use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface; |
9 | use LogicException; |
10 | use MediaWiki\Content\IContentHandlerFactory; |
11 | use MediaWiki\Page\PageIdentity; |
12 | use MediaWiki\Parser\Parsoid\Config\PageConfigFactory; |
13 | use MediaWiki\Rest\HttpException; |
14 | use MediaWiki\Rest\LocalizedHttpException; |
15 | use MediaWiki\Revision\MutableRevisionRecord; |
16 | use MediaWiki\Revision\RevisionAccessException; |
17 | use MediaWiki\Revision\RevisionRecord; |
18 | use MediaWiki\Revision\SlotRecord; |
19 | use Wikimedia\Bcp47Code\Bcp47Code; |
20 | use Wikimedia\Message\MessageValue; |
21 | use Wikimedia\Parsoid\Config\PageConfig; |
22 | use Wikimedia\Parsoid\Core\ClientError; |
23 | use Wikimedia\Parsoid\Core\PageBundle; |
24 | use Wikimedia\Parsoid\Core\ResourceLimitExceededException; |
25 | use Wikimedia\Parsoid\Core\SelserData; |
26 | use Wikimedia\Parsoid\DOM\Document; |
27 | use Wikimedia\Parsoid\DOM\Element; |
28 | use Wikimedia\Parsoid\Parsoid; |
29 | use Wikimedia\Parsoid\Utils\ContentUtils; |
30 | use Wikimedia\Parsoid\Utils\DOMCompat; |
31 | use Wikimedia\Parsoid\Utils\DOMUtils; |
32 | use Wikimedia\Parsoid\Utils\Timing; |
33 | |
34 | /** |
35 | * This class allows HTML to be transformed to a page content source format such as wikitext. |
36 | * |
37 | * @since 1.40 |
38 | * @unstable should be stable before 1.40 release |
39 | */ |
40 | class HtmlToContentTransform { |
41 | private array $options = []; |
42 | private ?int $oldid = null; |
43 | private ?Bcp47Code $contentLanguage = null; |
44 | private ?Content $originalContent = null; |
45 | private ?RevisionRecord $originalRevision = null; |
46 | /** |
47 | * Whether $this->doc has had any necessary processing applied, |
48 | * such as injecting data-parsoid attributes from a PageBundle. |
49 | */ |
50 | private bool $docHasBeenProcessed = false; |
51 | private ?Document $doc = null; |
52 | private ?Element $originalBody = null; |
53 | protected ?StatsdDataFactoryInterface $metrics = null; |
54 | private PageBundle $modifiedPageBundle; |
55 | private PageBundle $originalPageBundle; |
56 | private ?PageConfig $pageConfig = null; |
57 | private Parsoid $parsoid; |
58 | private array $parsoidSettings; |
59 | private PageIdentity $page; |
60 | private PageConfigFactory $pageConfigFactory; |
61 | private IContentHandlerFactory $contentHandlerFactory; |
62 | |
/**
 * Stores the injected collaborators and wraps the modified HTML in a
 * PageBundle. The original page bundle starts out with an empty HTML string.
 *
 * @param string $modifiedHTML HTML that is to be transformed
 * @param PageIdentity $page
 * @param Parsoid $parsoid
 * @param array $parsoidSettings
 * @param PageConfigFactory $pageConfigFactory
 * @param IContentHandlerFactory $contentHandlerFactory
 */
public function __construct(
	string $modifiedHTML,
	PageIdentity $page,
	Parsoid $parsoid,
	array $parsoidSettings,
	PageConfigFactory $pageConfigFactory,
	IContentHandlerFactory $contentHandlerFactory
) {
	// Injected services and settings.
	$this->page = $page;
	$this->parsoid = $parsoid;
	$this->parsoidSettings = $parsoidSettings;
	$this->pageConfigFactory = $pageConfigFactory;
	$this->contentHandlerFactory = $contentHandlerFactory;

	// Bundles holding the original and the modified rendering.
	$this->originalPageBundle = new PageBundle( '' );
	$this->modifiedPageBundle = new PageBundle( $modifiedHTML );
}
87 | |
/**
 * Sets the stats collector used by incrementMetrics() and for timing
 * downgrades.
 *
 * @param StatsdDataFactoryInterface $metrics
 */
public function setMetrics( StatsdDataFactoryInterface $metrics ): void {
	$this->metrics = $metrics;
}
94 | |
/**
 * Increments the counter named $key; does nothing when no metrics
 * collector has been set.
 *
 * @param string $key
 */
private function incrementMetrics( string $key ) {
	if ( !$this->metrics ) {
		return;
	}

	$this->metrics->increment( $key );
}
100 | |
/**
 * Replaces the whole options array. Within this class the keys
 * 'contentmodel' and 'offsetType' are read from it.
 *
 * @param array $options
 */
public function setOptions( array $options ) {
	$this->options = $options;
}
104 | |
/**
 * Sets the original revision and records its ID as the original revision ID.
 *
 * @param RevisionRecord $rev
 * @throws LogicException if the PageConfig has already been created, or
 *   if an original revision has already been set.
 */
public function setOriginalRevision( RevisionRecord $rev ): void {
	if ( $this->pageConfig ) {
		throw new LogicException( 'Cannot set revision after using the PageConfig' );
	} elseif ( $this->originalRevision ) {
		throw new LogicException( 'Cannot set revision again' );
	}

	$this->originalRevision = $rev;
	$this->oldid = $rev->getId();
}
119 | |
/**
 * Sets the ID of the original revision without supplying a RevisionRecord.
 *
 * @param int $oldid
 * @throws LogicException if the PageConfig has already been created, or
 *   if an original revision has already been set.
 */
public function setOriginalRevisionId( int $oldid ): void {
	if ( $this->pageConfig ) {
		throw new LogicException( 'Cannot set revision ID after using the PageConfig' );
	} elseif ( $this->originalRevision ) {
		throw new LogicException( 'Cannot set revision again' );
	}

	$this->oldid = $oldid;
}
133 | |
/**
 * Sets the content language that is passed on when the PageConfig is created.
 *
 * @param Bcp47Code $lang
 * @throws LogicException if the PageConfig has already been created.
 */
public function setContentLanguage( Bcp47Code $lang ): void {
	$configAlreadyBuilt = (bool)$this->pageConfig;
	if ( $configAlreadyBuilt ) {
		throw new LogicException( 'Cannot set content language after using the PageConfig' );
	}

	$this->contentLanguage = $lang;
}
144 | |
/**
 * Sets the original source text (usually wikitext). The text is
 * unserialized with the content handler for the current content model
 * and handed to setOriginalContent().
 *
 * @param string $text
 */
public function setOriginalText( string $text ): void {
	$this->setOriginalContent(
		$this->getContentHandler()->unserializeContent( $text )
	);
}
154 | |
/**
 * Sets the original content (such as wikitext).
 *
 * This also overrides the 'contentmodel' option with the model of the
 * given content.
 *
 * @param Content $content
 * @throws LogicException if the PageConfig has already been created, or
 *   if an original revision has already been set.
 */
public function setOriginalContent( Content $content ): void {
	if ( $this->pageConfig ) {
		throw new LogicException( 'Cannot set text after using the PageConfig' );
	}
	if ( $this->originalRevision ) {
		// This guard is about the revision, not the PageConfig; the message
		// used to be a copy of the one above and misreported the cause.
		throw new LogicException( 'Cannot set content after setting the original revision' );
	}

	$this->options['contentmodel'] = $content->getModel();
	$this->originalContent = $content;
}
171 | |
/**
 * Validates the page bundle against its own schema version. Bundles
 * without a version are not checked.
 *
 * @param PageBundle $pb
 * @throws ClientError with the message reported by PageBundle::validate()
 */
private function validatePageBundle( PageBundle $pb ) {
	if ( $pb->version ) {
		$errorMessage = '';
		$isValid = $pb->validate( $pb->version, $errorMessage );
		if ( !$isValid ) {
			throw new ClientError( $errorMessage );
		}
	}
}
182 | |
/**
 * Sets the data-mw of the modified page bundle.
 *
 * @note Call this after all original data has been set! The schema
 *   version check below relies on setOriginalSchemaVersion() having been
 *   called already.
 *
 * @param array $modifiedDataMW
 * @throws ClientError if the schema version does not satisfy ^999.0.0
 */
public function setModifiedDataMW( array $modifiedDataMW ): void {
	$supported = Semver::satisfies( $this->getSchemaVersion(), '^999.0.0' );

	if ( !$supported ) {
		$message = 'Modified data-mw is not supported by schema version '
			. $this->getSchemaVersion();
		throw new ClientError( $message );
	}

	$this->modifiedPageBundle->mw = $modifiedDataMW;
}
197 | |
/**
 * Records the schema version of the original page bundle.
 *
 * @param string $originalSchemaVeraion Schema version of the original HTML.
 *   NOTE(review): the parameter name is misspelled ("Veraion"); it is kept
 *   as is so that existing callers are not affected.
 */
public function setOriginalSchemaVersion( string $originalSchemaVeraion ): void {
	$bundle = $this->originalPageBundle;
	$bundle->version = $originalSchemaVeraion;
}
204 | |
/**
 * Sets the HTML of the original page bundle.
 *
 * @param string $originalHtml
 * @throws LogicException if the modified document has already been parsed.
 */
public function setOriginalHtml( string $originalHtml ): void {
	if ( $this->doc ) {
		throw new LogicException( __FUNCTION__ . ' cannot be called after getModifiedDocument()' );
	}

	$bundle = $this->originalPageBundle;
	$bundle->html = $originalHtml;
}
216 | |
/**
 * Sets the data-mw of the original page bundle. If the modified page
 * bundle has no data-mw yet, it receives the same value.
 *
 * @param array $originalDataMW
 * @throws LogicException if the modified document has already been parsed.
 */
public function setOriginalDataMW( array $originalDataMW ): void {
	if ( $this->doc ) {
		throw new LogicException( __FUNCTION__ . ' cannot be called after getModifiedDocument()' );
	}

	$this->originalPageBundle->mw = $originalDataMW;

	// Unless modified data-mw was specified explicitly, it is assumed to
	// be the same as the original data-mw.
	$this->modifiedPageBundle->mw ??= $originalDataMW;
}
233 | |
/**
 * Sets the data-parsoid of both the original and the modified page bundle.
 *
 * @param array $originalDataParsoid
 * @throws LogicException if the modified document has already been parsed.
 */
public function setOriginalDataParsoid( array $originalDataParsoid ): void {
	$documentAlreadyParsed = (bool)$this->doc;
	if ( $documentAlreadyParsed ) {
		throw new LogicException( __FUNCTION__ . ' cannot be called after getModifiedDocument()' );
	}

	// Original and modified HTML share the same data-parsoid.
	$this->originalPageBundle->parsoid = $originalDataParsoid;
	$this->modifiedPageBundle->parsoid = $originalDataParsoid;
}
246 | |
/**
 * Returns the PageConfig, creating and caching it on first use.
 *
 * If original content was supplied, the PageConfig is built from an
 * unsaved revision carrying that content in its main slot. Otherwise it
 * is built from the original revision, or from the original revision ID
 * if no revision object was set.
 *
 * @return PageConfig
 * @throws LocalizedHttpException with status 404 if the factory reports
 *   a RevisionAccessException.
 */
private function getPageConfig(): PageConfig {
	if ( $this->pageConfig ) {
		return $this->pageConfig;
	}

	if ( $this->originalContent === null ) {
		// NOTE: PageConfigFactory allows $revision to be an int ID or a RevisionRecord.
		$revision = $this->originalRevision ?: $this->oldid;
	} else {
		// XXX: do we even have to support wikitext overrides? What's the use case?
		// Build a mutable revision record pointing to the same revision,
		// but carrying the desired content.
		$revision = new MutableRevisionRecord( $this->page );
		if ( $this->oldid ) {
			$revision->setId( $this->oldid );
		}

		$mainSlot = SlotRecord::newUnsaved( SlotRecord::MAIN, $this->originalContent );
		$revision->setSlot( $mainSlot );
	}

	try {
		$this->pageConfig = $this->pageConfigFactory->create(
			$this->page,
			null,
			$revision,
			null,
			$this->contentLanguage
		);
	} catch ( RevisionAccessException $exception ) {
		// TODO: Throw a different exception, this class should not know
		//       about HTTP status codes.
		throw new LocalizedHttpException( new MessageValue( "rest-specified-revision-unavailable" ), 404 );
	}

	return $this->pageConfig;
}
290 | |
/**
 * The size of the modified HTML in characters, as counted by mb_strlen().
 *
 * @return int
 */
public function getModifiedHtmlSize(): int {
	$html = $this->modifiedPageBundle->html;

	return mb_strlen( $html );
}
299 | |
/**
 * Returns the parsed modified document without applying the page bundle.
 * On first use the modified HTML is parsed (with XML name validation) and
 * the inlined content version found in the document is stored as the
 * version of the modified page bundle.
 *
 * @return Document
 */
private function getModifiedDocumentRaw(): Document {
	if ( $this->doc ) {
		return $this->doc;
	}

	$this->doc = $this->parseHTML( $this->modifiedPageBundle->html, true );
	$this->modifiedPageBundle->version = DOMUtils::extractInlinedContentVersion( $this->doc );

	return $this->doc;
}
308 | |
/**
 * Returns the modified document with the modified page bundle applied.
 * The page bundle is applied only once, on the first call.
 *
 * @return Document
 */
public function getModifiedDocument(): Document {
	$doc = $this->getModifiedDocumentRaw();

	if ( $this->docHasBeenProcessed ) {
		return $doc;
	}

	$this->applyPageBundle( $doc, $this->modifiedPageBundle );
	$this->docHasBeenProcessed = true;

	return $doc;
}
320 | |
/**
 * Whether non-empty original HTML is available.
 *
 * NOTE: The return value of this method depends on
 * setOriginalHtml() having been called first.
 *
 * @return bool
 */
public function hasOriginalHtml(): bool {
	$html = $this->originalPageBundle->html;

	return $html !== null && $html !== '';
}
330 | |
/**
 * Whether data-parsoid is available for the original HTML.
 *
 * NOTE: The return value of this method depends on
 * setOriginalDataParsoid() having been called first.
 *
 * @return bool
 */
public function hasOriginalDataParsoid(): bool {
	$dataParsoid = $this->originalPageBundle->parsoid;

	return $dataParsoid !== null;
}
340 | |
/**
 * Returns the original HTML, with any necessary processing applied.
 * Returns an empty string if no original HTML is set.
 *
 * @todo Make this method redundant, nothing should operate on HTML strings.
 *
 * @return string
 * @throws ClientError if no original schema version has been set.
 */
public function getOriginalHtml(): string {
	$bundle = $this->originalPageBundle;

	// NOTE: The schema version must have been set explicitly. Do not use
	//       getOriginalSchemaVersion() here, since it silently falls back
	//       to a default.
	if ( !$bundle->version ) {
		throw new ClientError(
			'Content-type of original html is missing.'
		);
	}

	// NOTE: If the original body has not been built yet, build it when
	//       data-parsoid has to be injected or a downgrade is needed, so
	//       that the HTML stored in the bundle is updated accordingly
	//       before it is returned.
	if (
		!$this->originalBody &&
		( $this->hasOriginalDataParsoid() || $this->needsDowngrade( $bundle ) )
	) {
		$this->getOriginalBody();
	}

	return $bundle->html ?: '';
}
371 | |
/**
 * Parses an HTML string into a Document by delegating to
 * DOMUtils::parseHTML().
 *
 * @param string $html
 * @param bool $validateXMLNames Passed through to DOMUtils::parseHTML()
 *
 * @return Document
 * @throws ClientError
 */
protected function parseHTML( string $html, bool $validateXMLNames = false ): Document {
	$document = DOMUtils::parseHTML( $html, $validateXMLNames );

	return $document;
}
382 | |
/**
 * Returns the body element of the original HTML, building and caching it
 * on first use. Building it downgrades the original data if needed,
 * applies the original page bundle to the parsed document, and replaces
 * the HTML stored in the original page bundle with the serialized body.
 *
 * NOTE: The return value of this method depends on
 * setOriginalHtml() having been called first.
 *
 * @return Element
 * @throws LogicException if no original HTML has been supplied.
 * @throws ClientError if no original schema version has been set.
 */
public function getOriginalBody(): Element {
	if ( !$this->hasOriginalHtml() ) {
		throw new LogicException(
			'No original data supplied, call hasOriginalHtml() first.'
		);
	}

	if ( $this->originalBody ) {
		return $this->originalBody;
	}

	$bundle = $this->originalPageBundle;

	// NOTE: The schema version must have been set explicitly. Do not use
	//       getOriginalSchemaVersion() here, since it silently falls back
	//       to a default.
	if ( !$bundle->version ) {
		throw new ClientError(
			'Content-type of original html is missing.'
		);
	}

	if ( $this->needsDowngrade( $bundle ) ) {
		$this->downgradeOriginalData( $bundle, $this->getSchemaVersion() );
	}

	$originalDoc = $this->parseHTML( $bundle->html );
	$this->applyPageBundle( $originalDoc, $bundle );

	$body = DOMCompat::getBody( $originalDoc );
	$this->originalBody = $body;

	// XXX: use a separate field??
	$bundle->html = ContentUtils::toXML( $body );

	return $body;
}
425 | |
/**
 * Returns the schema version of the original page bundle, falling back
 * to getSchemaVersion() if none has been set.
 *
 * @return string
 */
public function getOriginalSchemaVersion(): string {
	$version = $this->originalPageBundle->version;

	return $version ?: $this->getSchemaVersion();
}
429 | |
/**
 * Returns the content version of the edited document if one is inlined
 * in it. Otherwise falls back to the original schema version, and then
 * to Parsoid's default HTML version.
 *
 * NOTE: The return value of this method depends on
 * setOriginalSchemaVersion() having been called first.
 *
 * @return string
 */
public function getSchemaVersion(): string {
	// Parsing the modified document initializes $this->modifiedPageBundle->version.
	$this->getModifiedDocumentRaw();

	$inlineVersion = $this->modifiedPageBundle->version;
	if ( $inlineVersion ) {
		return $inlineVersion;
	}

	$this->incrementMetrics( 'html2wt.original.version.notinline' );

	return $this->originalPageBundle->version ?: Parsoid::defaultHTMLVersion();
}
449 | |
/**
 * Returns the ID of the original revision, or null if none has been set.
 *
 * @return int|null
 */
public function getOriginalRevisionId(): ?int {
	$revisionId = $this->oldid;

	return $revisionId;
}
453 | |
/**
 * Whether an original revision, a (non-zero) original revision ID, or
 * original content has been supplied.
 *
 * @return bool
 */
public function knowsOriginalContent(): bool {
	if ( $this->originalRevision || $this->oldid ) {
		return true;
	}

	return $this->originalContent !== null;
}
457 | |
/**
 * Returns the 'contentmodel' option, defaulting to CONTENT_MODEL_WIKITEXT.
 *
 * @return string
 */
public function getContentModel(): string {
	$model = $this->options['contentmodel'] ?? CONTENT_MODEL_WIKITEXT;

	return $model;
}
461 | |
/**
 * Returns the 'offsetType' option, defaulting to 'byte'.
 *
 * @return string
 */
public function getOffsetType(): string {
	$offsetType = $this->options['offsetType'] ?? 'byte';

	return $offsetType;
}
465 | |
/**
 * Whether the version of the given page bundle fails to satisfy the
 * schema version of the edited document, so that a downgrade is needed.
 * A bundle without a version never needs a downgrade.
 *
 * @param PageBundle $pb
 * @return bool
 */
private function needsDowngrade( PageBundle $pb ): bool {
	$originalVersion = $pb->version;
	$editedVersion = $this->getSchemaVersion();

	if ( $originalVersion === null ) {
		return false;
	}

	// Downgrades are only expected to be between major versions.
	//
	// RESTBase was only expected to store the latest version. If a client
	// asked for a version not satisfied by the latest version, it would
	// downgrade the stored version where possible. So, it's the original
	// version that needs to satisfy the edited version, otherwise it needs
	// downgrading.
	//
	// There's also the case where an old version is not stored and a
	// re-parse must occur. Here again the original version generated will
	// be the latest, either satisfying the edited or needing downgrading.
	return !Semver::satisfies( $originalVersion, "^{$editedVersion}" );
}
482 | |
/**
 * Downgrades the given page bundle to the target schema version.
 * Does nothing if the bundle already has the target version, or if it
 * has no data-parsoid.
 *
 * @param PageBundle $pb
 * @param string $targetSchemaVersion
 * @throws ClientError if the bundle has no version, or if no downgrade
 *   to the target version is available.
 */
private function downgradeOriginalData( PageBundle $pb, string $targetSchemaVersion ) {
	if ( $pb->version === null ) {
		throw new ClientError( 'Missing schema version' );
	}

	// Already at the target version: nothing to do.
	// No data-parsoid: not supported.
	// XXX: Should we also support downgrades if $pb->html has everything inlined?
	// XXX: The downgrade should really be an operation on the DOM.
	if ( $targetSchemaVersion === $pb->version || !$pb->parsoid ) {
		return;
	}

	// The original has to be downgraded to match the edited doc's version.
	$downgrade = Parsoid::findDowngrade( $pb->version, $targetSchemaVersion );

	if ( !$downgrade ) {
		throw new ClientError(
			"No downgrade possible from schema version {$pb->version} to {$targetSchemaVersion}."
		);
	}

	$metricKey = 'downgrade.from.' . $downgrade['from'] . '.to.' . $downgrade['to'];
	$this->incrementMetrics( $metricKey );

	$timer = Timing::start( $this->metrics );
	Parsoid::downgrade( $downgrade, $pb );
	$timer->end( 'downgrade.time' );

	// NOTE: Reset $this->originalBody so getOriginalBody() will re-generate it.
	// XXX: Parsoid::downgrade operates on the parsed Document, would be nice
	//      if we could get that instead of getting back HTML which we have to
	//      parse again!
	$this->originalBody = null;
}
521 | |
/**
 * Validates the page bundle and applies it to the document.
 * Does nothing if the bundle has neither data-parsoid nor data-mw.
 *
 * @param Document $doc
 * @param PageBundle $pb
 *
 * @throws ClientError if the bundle declares an offsetType that differs
 *   from the requested one, or if the bundle fails validation.
 */
private function applyPageBundle( Document $doc, PageBundle $pb ): void {
	$hasNothingToApply = $pb->parsoid === null && $pb->mw === null;
	if ( $hasNothingToApply ) {
		return;
	}

	// The top-level parsoid object must either not contain an offsetType,
	// or its offsetType must match the conversion that has been
	// explicitly requested.
	$bundleOffsetType = $pb->parsoid['offsetType'] ?? null;
	if ( $bundleOffsetType !== null ) {
		$requestedOffsetType = $this->getOffsetType();
		if ( $bundleOffsetType !== $requestedOffsetType ) {
			throw new ClientError(
				'DSR offsetType mismatch: ' . $bundleOffsetType . ' vs ' . $requestedOffsetType
			);
		}
	}

	$this->validatePageBundle( $pb );
	PageBundle::apply( $doc, $pb );
}
549 | |
/**
 * Get a selective serialization (selser) data object. This is null
 * if the original content is not known or if the 'useSelser' Parsoid
 * setting is empty.
 *
 * @return SelserData|null
 * @throws HttpException with status 409 if the PageConfig has no
 *   revision content.
 */
private function getSelserData(): ?SelserData {
	$oldhtml = $this->hasOriginalHtml() ? $this->getOriginalHtml() : null;

	// Selser requires knowledge of the original wikitext.
	if ( !$this->knowsOriginalContent() || empty( $this->parsoidSettings['useSelser'] ) ) {
		return null;
	}

	$pageConfig = $this->getPageConfig();
	if ( !$pageConfig->getRevisionContent() ) {
		throw new LocalizedHttpException(
			new MessageValue( "rest-previous-revision-unavailable" ),
			409
		);
	}

	// TODO: T234548/T234549 - $pageConfig->getPageMainContent() is deprecated:
	//       should use $env->topFrame->getSrcText()
	return new SelserData( $pageConfig->getPageMainContent(), $oldhtml );
}
579 | |
/**
 * Returns the content handler for the current content model.
 *
 * @return ContentHandler
 */
private function getContentHandler(): ContentHandler {
	return $this->contentHandlerFactory->getContentHandler(
		$this->getContentModel()
	);
}
586 | |
/**
 * Returns a Content object derived from the supplied HTML: the HTML is
 * converted to text, which is then unserialized by the content handler
 * for the current content model.
 *
 * @return Content
 */
public function htmlToContent(): Content {
	$text = $this->htmlToText();

	return $this->getContentHandler()->unserializeContent( $text );
}
598 | |
/**
 * Converts the input HTML to source format, typically wikitext.
 *
 * @see Parsoid::dom2wikitext
 *
 * @return string
 * @throws LocalizedHttpException with status 400 on a ClientError and
 *   status 413 on a ResourceLimitExceededException.
 */
private function htmlToText(): string {
	$doc = $this->getModifiedDocument();
	$htmlSize = $this->getModifiedHtmlSize();
	$inputContentVersion = $this->getSchemaVersion();
	$selserData = $this->getSelserData();

	try {
		$pageConfig = $this->getPageConfig();
		$conversionOptions = [
			'inputContentVersion' => $inputContentVersion,
			'offsetType' => $this->getOffsetType(),
			'contentmodel' => $this->getContentModel(),
			// used to trigger status 413 if the input is too big
			'htmlSize' => $htmlSize,
		];

		return $this->parsoid->dom2wikitext( $pageConfig, $doc, $conversionOptions, $selserData );
	} catch ( ClientError $e ) {
		throw new LocalizedHttpException(
			new MessageValue( "rest-parsoid-error", [ $e->getMessage() ] ),
			400
		);
	} catch ( ResourceLimitExceededException $e ) {
		throw new LocalizedHttpException(
			new MessageValue( "rest-parsoid-resource-exceeded", [ $e->getMessage() ] ),
			413
		);
	}
}
629 | |
630 | } |