Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
87.37% |
173 / 198 |
|
72.97% |
27 / 37 |
CRAP | |
0.00% |
0 / 1 |
| HtmlToContentTransform | |
87.37% |
173 / 198 |
|
72.97% |
27 / 37 |
104.94 | |
0.00% |
0 / 1 |
| __construct | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| setMetrics | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| incrementMetrics | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
| setOptions | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| setOriginalRevision | |
66.67% |
4 / 6 |
|
0.00% |
0 / 1 |
3.33 | |||
| setOriginalRevisionId | |
60.00% |
3 / 5 |
|
0.00% |
0 / 1 |
3.58 | |||
| setContentLanguage | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
| setOriginalText | |
100.00% |
2 / 2 |
|
100.00% |
1 / 1 |
1 | |||
| setOriginalContent | |
66.67% |
4 / 6 |
|
0.00% |
0 / 1 |
3.33 | |||
| validatePageBundle | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
3 | |||
| setModifiedDataMW | |
50.00% |
2 / 4 |
|
0.00% |
0 / 1 |
2.50 | |||
| setOriginalSchemaVersion | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| setOriginalHtml | |
50.00% |
2 / 4 |
|
0.00% |
0 / 1 |
2.50 | |||
| setOriginalDataMW | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
3.07 | |||
| setOriginalDataParsoid | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
| getPageConfig | |
100.00% |
21 / 21 |
|
100.00% |
1 / 1 |
6 | |||
| getModifiedHtmlSize | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getModifiedDocumentRaw | |
100.00% |
4 / 4 |
|
100.00% |
1 / 1 |
2 | |||
| getModifiedDocument | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
2 | |||
| hasOriginalHtml | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| hasOriginalDataParsoid | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getOriginalHtml | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
6 | |||
| parseHTML | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getOriginalBody | |
64.71% |
11 / 17 |
|
0.00% |
0 / 1 |
6.10 | |||
| getOriginalSchemaVersion | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
2 | |||
| getSchemaVersion | |
100.00% |
9 / 9 |
|
100.00% |
1 / 1 |
3 | |||
| getOriginalRevisionId | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| knowsOriginalContent | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
3 | |||
| getContentModel | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| getOffsetType | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| needsDowngrade | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
2 | |||
| downgradeOriginalData | |
86.36% |
19 / 22 |
|
0.00% |
0 / 1 |
6.09 | |||
| applyPageBundle | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
5 | |||
| getSelserData | |
80.00% |
8 / 10 |
|
0.00% |
0 / 1 |
5.20 | |||
| getContentHandler | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| htmlToContent | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| htmlToText | |
100.00% |
17 / 17 |
|
100.00% |
1 / 1 |
3 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace MediaWiki\Parser\Parsoid; |
| 5 | |
| 6 | use Composer\Semver\Semver; |
| 7 | use LogicException; |
| 8 | use MediaWiki\Content\Content; |
| 9 | use MediaWiki\Content\ContentHandler; |
| 10 | use MediaWiki\Content\IContentHandlerFactory; |
| 11 | use MediaWiki\Page\PageIdentity; |
| 12 | use MediaWiki\Parser\ParserOptions; |
| 13 | use MediaWiki\Parser\Parsoid\Config\PageConfigFactory; |
| 14 | use MediaWiki\Rest\HttpException; |
| 15 | use MediaWiki\Rest\LocalizedHttpException; |
| 16 | use MediaWiki\Revision\MutableRevisionRecord; |
| 17 | use MediaWiki\Revision\RevisionAccessException; |
| 18 | use MediaWiki\Revision\RevisionRecord; |
| 19 | use MediaWiki\Revision\SlotRecord; |
| 20 | use Wikimedia\Bcp47Code\Bcp47Code; |
| 21 | use Wikimedia\Message\MessageValue; |
| 22 | use Wikimedia\Parsoid\Config\PageConfig; |
| 23 | use Wikimedia\Parsoid\Config\SiteConfig; |
| 24 | use Wikimedia\Parsoid\Core\BasePageBundle; |
| 25 | use Wikimedia\Parsoid\Core\ClientError; |
| 26 | use Wikimedia\Parsoid\Core\HtmlPageBundle; |
| 27 | use Wikimedia\Parsoid\Core\ResourceLimitExceededException; |
| 28 | use Wikimedia\Parsoid\Core\SelserData; |
| 29 | use Wikimedia\Parsoid\DOM\Document; |
| 30 | use Wikimedia\Parsoid\DOM\Element; |
| 31 | use Wikimedia\Parsoid\Parsoid; |
| 32 | use Wikimedia\Parsoid\Utils\ContentUtils; |
| 33 | use Wikimedia\Parsoid\Utils\DOMCompat; |
| 34 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 35 | use Wikimedia\Stats\StatsFactory; |
| 36 | |
| 37 | /** |
| 38 | * This class allows HTML to be transformed to a page content source format such as wikitext. |
| 39 | * |
| 40 | * @since 1.40 |
| 41 | * @unstable should be stable before 1.40 release |
| 42 | */ |
| 43 | class HtmlToContentTransform { |
| 44 | private array $options = []; |
| 45 | private ?int $oldid = null; |
| 46 | private ?Bcp47Code $contentLanguage = null; |
| 47 | private ?Content $originalContent = null; |
| 48 | private ?RevisionRecord $originalRevision = null; |
| 49 | /** |
| 50 | * Whether $this->doc has had any necessary processing applied, |
| 51 | * such as injecting data-parsoid attributes from a HtmlPageBundle. |
| 52 | */ |
| 53 | private bool $docHasBeenProcessed = false; |
| 54 | private ?Document $doc = null; |
| 55 | private ?Element $originalBody = null; |
| 56 | protected ?StatsFactory $metrics = null; |
| 57 | private HtmlPageBundle $modifiedPageBundle; |
| 58 | private HtmlPageBundle $originalPageBundle; |
| 59 | private ?PageConfig $pageConfig = null; |
| 60 | |
| 61 | public function __construct( |
| 62 | string $modifiedHTML, |
| 63 | private readonly PageIdentity $page, |
| 64 | private readonly Parsoid $parsoid, |
| 65 | private readonly array $parsoidSettings, |
| 66 | private readonly SiteConfig $siteConfig, |
| 67 | private readonly PageConfigFactory $pageConfigFactory, |
| 68 | private readonly IContentHandlerFactory $contentHandlerFactory, |
| 69 | ) { |
| 70 | $this->modifiedPageBundle = new HtmlPageBundle( $modifiedHTML ); |
| 71 | $this->originalPageBundle = new HtmlPageBundle( '' ); |
| 72 | } |
| 73 | |
| 74 | /** |
| 75 | * Set metrics sink. |
| 76 | */ |
| 77 | public function setMetrics( StatsFactory $metrics ): void { |
| 78 | $this->metrics = $metrics; |
| 79 | } |
| 80 | |
| 81 | private function incrementMetrics( string $key, array $labels ) { |
| 82 | if ( $this->metrics ) { |
| 83 | $counter = $this->metrics->getCounter( $key )->setLabels( $labels ); |
| 84 | $counter->increment(); |
| 85 | } |
| 86 | } |
| 87 | |
| 88 | public function setOptions( array $options ) { |
| 89 | $this->options = $options; |
| 90 | } |
| 91 | |
| 92 | public function setOriginalRevision( RevisionRecord $rev ): void { |
| 93 | if ( $this->pageConfig ) { |
| 94 | throw new LogicException( 'Cannot set revision after using the PageConfig' ); |
| 95 | } |
| 96 | if ( $this->originalRevision ) { |
| 97 | throw new LogicException( 'Cannot set revision again' ); |
| 98 | } |
| 99 | |
| 100 | $this->originalRevision = $rev; |
| 101 | $this->oldid = $rev->getId(); |
| 102 | } |
| 103 | |
| 104 | public function setOriginalRevisionId( int $oldid ): void { |
| 105 | if ( $this->pageConfig ) { |
| 106 | throw new LogicException( 'Cannot set revision ID after using the PageConfig' ); |
| 107 | } |
| 108 | if ( $this->originalRevision ) { |
| 109 | throw new LogicException( 'Cannot set revision again' ); |
| 110 | } |
| 111 | |
| 112 | $this->oldid = $oldid; |
| 113 | } |
| 114 | |
| 115 | public function setContentLanguage( Bcp47Code $lang ): void { |
| 116 | if ( $this->pageConfig ) { |
| 117 | throw new LogicException( 'Cannot set content language after using the PageConfig' ); |
| 118 | } |
| 119 | |
| 120 | $this->contentLanguage = $lang; |
| 121 | } |
| 122 | |
| 123 | /** |
| 124 | * Sets the original source text (usually wikitext). |
| 125 | */ |
| 126 | public function setOriginalText( string $text ): void { |
| 127 | $content = $this->getContentHandler()->unserializeContent( $text ); |
| 128 | $this->setOriginalContent( $content ); |
| 129 | } |
| 130 | |
| 131 | /**
| 132 | * Sets the original content (such as wikitext) and records its model as the 'contentmodel' option; throws a LogicException if the PageConfig has already been used or an original revision has already been set.
| 133 | */
| 134 | public function setOriginalContent( Content $content ): void {
| 135 | if ( $this->pageConfig ) {
| 136 | throw new LogicException( 'Cannot set text after using the PageConfig' );
| 137 | }
| 138 | if ( $this->originalRevision ) {
| 139 | throw new LogicException( 'Cannot set text after setting the original revision' );
| 140 | }
| 141 | 
| 142 | $this->options['contentmodel'] = $content->getModel();
| 143 | $this->originalContent = $content;
| 144 | }
| 145 | |
| 146 | /** @throws ClientError */ |
| 147 | private function validatePageBundle( BasePageBundle $pb ) { |
| 148 | $version = $pb->version; |
| 149 | if ( !$version ) { |
| 150 | return; |
| 151 | } |
| 152 | |
| 153 | $errorMessage = ''; |
| 154 | if ( !$pb->validate( $version, $errorMessage ) ) { |
| 155 | throw new ClientError( $errorMessage ); |
| 156 | } |
| 157 | } |
| 158 | |
| 159 | /** |
| 160 | * @note Call this after all original data has been set! |
| 161 | * |
| 162 | * @param array $modifiedDataMW |
| 163 | * @throws ClientError |
| 164 | */ |
| 165 | public function setModifiedDataMW( array $modifiedDataMW ): void { |
| 166 | // Relies on setOriginalSchemaVersion having been called already. |
| 167 | if ( !Semver::satisfies( $this->getSchemaVersion(), '^999.0.0' ) ) { |
| 168 | throw new ClientError( 'Modified data-mw is not supported by schema version ' |
| 169 | . $this->getSchemaVersion() ); |
| 170 | } |
| 171 | |
| 172 | $this->modifiedPageBundle->mw = $modifiedDataMW; |
| 173 | } |
| 174 | |
| 175 | public function setOriginalSchemaVersion( string $originalSchemaVeraion ): void { |
| 176 | $this->originalPageBundle->version = $originalSchemaVeraion; |
| 177 | } |
| 178 | |
| 179 | public function setOriginalHtml( string $originalHtml ): void { |
| 180 | if ( $this->doc ) { |
| 181 | throw new LogicException( __FUNCTION__ . ' cannot be called after' . |
| 182 | ' getModifiedDocument()' ); |
| 183 | } |
| 184 | |
| 185 | $this->originalPageBundle->html = $originalHtml; |
| 186 | } |
| 187 | |
| 188 | public function setOriginalDataMW( array $originalDataMW ): void { |
| 189 | if ( $this->doc ) { |
| 190 | throw new LogicException( __FUNCTION__ . ' cannot be called after getModifiedDocument()' ); |
| 191 | } |
| 192 | |
| 193 | $this->originalPageBundle->mw = $originalDataMW; |
| 194 | |
| 195 | // Modified data-mw is going to be the same as original data-mw, |
| 196 | // unless specified otherwise. |
| 197 | if ( $this->modifiedPageBundle->mw === null ) { |
| 198 | $this->modifiedPageBundle->mw = $originalDataMW; |
| 199 | } |
| 200 | } |
| 201 | |
| 202 | public function setOriginalDataParsoid( array $originalDataParsoid ): void { |
| 203 | if ( $this->doc ) { |
| 204 | throw new LogicException( __FUNCTION__ . ' cannot be called after getModifiedDocument()' ); |
| 205 | } |
| 206 | |
| 207 | // data-parsoid is going to be the same for original and modified. |
| 208 | $this->originalPageBundle->parsoid = $originalDataParsoid; |
| 209 | $this->modifiedPageBundle->parsoid = $originalDataParsoid; |
| 210 | } |
| 211 | |
| 212 | private function getPageConfig(): PageConfig { |
| 213 | if ( !$this->pageConfig ) { |
| 214 | |
| 215 | // XXX: do we even have to support wikitext overrides? What's the use case? |
| 216 | if ( $this->originalContent !== null ) { |
| 217 | // Create a mutable revision record pointing to the same revision
| 218 | // and set it to the desired content.
| 219 | $revision = new MutableRevisionRecord( $this->page ); |
| 220 | if ( $this->oldid ) { |
| 221 | $revision->setId( $this->oldid ); |
| 222 | } |
| 223 | |
| 224 | $revision->setSlot( |
| 225 | SlotRecord::newUnsaved( |
| 226 | SlotRecord::MAIN, |
| 227 | $this->originalContent |
| 228 | ) |
| 229 | ); |
| 230 | } else { |
| 231 | // NOTE: PageConfigFactory allows $revision to be an int ID or a RevisionRecord. |
| 232 | $revision = $this->originalRevision ?: $this->oldid; |
| 233 | } |
| 234 | |
| 235 | try { |
| 236 | $this->pageConfig = $this->pageConfigFactory->createFromParserOptions( |
| 237 | ParserOptions::newFromAnon(), |
| 238 | $this->page, |
| 239 | $revision, |
| 240 | $this->contentLanguage |
| 241 | ); |
| 242 | } catch ( RevisionAccessException ) { |
| 243 | // TODO: Throw a different exception, this class should not know |
| 244 | // about HTTP status codes. |
| 245 | throw new LocalizedHttpException( new MessageValue( "rest-specified-revision-unavailable" ), 404 ); |
| 246 | } |
| 247 | } |
| 248 | |
| 249 | return $this->pageConfig; |
| 250 | } |
| 251 | |
| 252 | /** |
| 253 | * The size of the modified HTML in characters. |
| 254 | */ |
| 255 | public function getModifiedHtmlSize(): int { |
| 256 | return mb_strlen( $this->modifiedPageBundle->html ); |
| 257 | } |
| 258 | |
| 259 | private function getModifiedDocumentRaw(): Document { |
| 260 | if ( !$this->doc ) { |
| 261 | $this->doc = $this->parseHTML( $this->modifiedPageBundle->html, true ); |
| 262 | $this->modifiedPageBundle->version = DOMUtils::extractInlinedContentVersion( $this->doc ); |
| 263 | } |
| 264 | |
| 265 | return $this->doc; |
| 266 | } |
| 267 | |
| 268 | public function getModifiedDocument(): Document { |
| 269 | $doc = $this->getModifiedDocumentRaw(); |
| 270 | |
| 271 | if ( !$this->docHasBeenProcessed ) { |
| 272 | $doc = $this->applyPageBundle( $doc, $this->modifiedPageBundle ); |
| 273 | |
| 274 | $this->doc = $doc; |
| 275 | $this->docHasBeenProcessed = true; |
| 276 | } |
| 277 | |
| 278 | return $doc; |
| 279 | } |
| 280 | |
| 281 | /** |
| 282 | * NOTE: The return value of this method depends on
| 283 | * setOriginalHtml() having been called first.
| 284 | */ |
| 285 | public function hasOriginalHtml(): bool { |
| 286 | return $this->originalPageBundle->html !== ''; |
| 287 | } |
| 288 | |
| 289 | /** |
| 290 | * NOTE: The return value of this method depends on
| 291 | * setOriginalDataParsoid() having been called first.
| 292 | */ |
| 293 | public function hasOriginalDataParsoid(): bool { |
| 294 | return $this->originalPageBundle->parsoid !== null; |
| 295 | } |
| 296 | |
| 297 | /** |
| 298 | * Returns the original HTML, with any necessary processing applied. |
| 299 | * |
| 300 | * @todo Make this method redundant, nothing should operate on HTML strings. |
| 301 | * |
| 302 | * @return string |
| 303 | * @throws ClientError |
| 304 | */ |
| 305 | public function getOriginalHtml(): string { |
| 306 | // NOTE: Schema version should have been set explicitly, |
| 307 | // so don't call getOriginalSchemaVersion, |
| 308 | // which will silently fall back to the default. |
| 309 | if ( !$this->originalPageBundle->version ) { |
| 310 | throw new ClientError( |
| 311 | 'Content-type of original html is missing.' |
| 312 | ); |
| 313 | } |
| 314 | |
| 315 | if ( !$this->originalBody ) { |
| 316 | // NOTE: Make sure we called getOriginalBody() at least once before we |
| 317 | // return the original HTML, so downgrades can be applied, |
| 318 | // data-parsoid can be injected, and $this->originalPageBundle->html |
| 319 | // is updated accordingly. |
| 320 | |
| 321 | if ( $this->hasOriginalDataParsoid() || $this->needsDowngrade( $this->originalPageBundle ) ) { |
| 322 | $this->getOriginalBody(); |
| 323 | } |
| 324 | } |
| 325 | |
| 326 | return $this->originalPageBundle->html ?: ''; |
| 327 | } |
| 328 | |
| 329 | /** |
| 330 | * @param string $html |
| 331 | * @param bool $validateXMLNames |
| 332 | * |
| 333 | * @return Document |
| 334 | * @throws ClientError |
| 335 | */ |
| 336 | protected function parseHTML( string $html, bool $validateXMLNames = false ): Document { |
| 337 | return DOMUtils::parseHTML( $html, $validateXMLNames ); |
| 338 | } |
| 339 | |
| 340 | /** |
| 341 | * NOTE: The return value of this method depends on setOriginalHtml()
| 342 | * and setOriginalSchemaVersion() having been called first.
| 343 | * |
| 344 | * @return Element |
| 345 | * @throws ClientError |
| 346 | */ |
| 347 | public function getOriginalBody(): Element { |
| 348 | if ( !$this->hasOriginalHtml() ) { |
| 349 | throw new LogicException( |
| 350 | 'No original data supplied, call hasOriginalHtml() first.' |
| 351 | ); |
| 352 | } |
| 353 | |
| 354 | if ( $this->originalBody ) { |
| 355 | return $this->originalBody; |
| 356 | } |
| 357 | |
| 358 | // NOTE: Schema version should have been set explicitly, |
| 359 | // so don't call getOriginalSchemaVersion, |
| 360 | // which will silently fall back to the default. |
| 361 | if ( !$this->originalPageBundle->version ) { |
| 362 | throw new ClientError( |
| 363 | 'Content-type of original html is missing.' |
| 364 | ); |
| 365 | } |
| 366 | |
| 367 | if ( $this->needsDowngrade( $this->originalPageBundle ) ) { |
| 368 | $this->downgradeOriginalData( $this->originalPageBundle, $this->getSchemaVersion() ); |
| 369 | } |
| 370 | |
| 371 | $doc = $this->parseHTML( $this->originalPageBundle->html ); |
| 372 | |
| 373 | $doc = $this->applyPageBundle( $doc, $this->originalPageBundle ); |
| 374 | |
| 375 | $this->originalBody = DOMCompat::getBody( $doc ); |
| 376 | |
| 377 | // XXX: use a separate field?? |
| 378 | $this->originalPageBundle->html = ContentUtils::toXML( $this->originalBody ); |
| 379 | |
| 380 | return $this->originalBody; |
| 381 | } |
| 382 | |
| 383 | public function getOriginalSchemaVersion(): string { |
| 384 | return $this->originalPageBundle->version ?: $this->getSchemaVersion(); |
| 385 | } |
| 386 | |
| 387 | /** |
| 388 | * NOTE: When the modified HTML carries no content version, the return
| 389 | * value depends on setOriginalSchemaVersion() having been called first.
| 390 | */ |
| 391 | public function getSchemaVersion(): string { |
| 392 | // Get the content version of the edited doc, if available. |
| 393 | // Make sure $this->modifiedPageBundle->version is initialized. |
| 394 | $this->getModifiedDocumentRaw(); |
| 395 | $inputContentVersion = $this->modifiedPageBundle->version; |
| 396 | |
| 397 | if ( !$inputContentVersion ) { |
| 398 | $this->incrementMetrics( |
| 399 | 'html2wt_original_version_total', |
| 400 | [ 'input_content_version' => 'none' ] |
| 401 | ); |
| 402 | $inputContentVersion = $this->originalPageBundle->version ?: Parsoid::defaultHTMLVersion(); |
| 403 | } |
| 404 | |
| 405 | return $inputContentVersion; |
| 406 | } |
| 407 | |
| 408 | public function getOriginalRevisionId(): ?int { |
| 409 | return $this->oldid; |
| 410 | } |
| 411 | |
| 412 | public function knowsOriginalContent(): bool { |
| 413 | return $this->originalRevision || $this->oldid || $this->originalContent !== null; |
| 414 | } |
| 415 | |
| 416 | public function getContentModel(): string { |
| 417 | return $this->options['contentmodel'] ?? CONTENT_MODEL_WIKITEXT; |
| 418 | } |
| 419 | |
| 420 | public function getOffsetType(): string { |
| 421 | return $this->options['offsetType'] ?? 'byte'; |
| 422 | } |
| 423 | |
| 424 | private function needsDowngrade( HtmlPageBundle $pb ): bool { |
| 425 | $vOriginal = $pb->version; |
| 426 | $vEdited = $this->getSchemaVersion(); |
| 427 | |
| 428 | // Downgrades are only expected to be between major versions.
| 429 | // |
| 430 | // RESTBase was only expected to store latest version. If a client asked for a version |
| 431 | // not satisfied by the latest version, it would downgrade the stored version where |
| 432 | // possible. So, it's the original version that needs to satisfy the edited version, |
| 433 | // otherwise it needs downgrading. |
| 434 | // |
| 435 | // There's also the case where an old version is not stored and a re-parse must occur. |
| 436 | // Here again the original version generated will be the latest, either satisfying |
| 437 | // the edited or needing downgrading. |
| 438 | return $vOriginal !== null && !Semver::satisfies( $vOriginal, "^{$vEdited}" ); |
| 439 | } |
| 440 | |
| 441 | /** @throws ClientError */ |
| 442 | private function downgradeOriginalData( HtmlPageBundle $pb, string $targetSchemaVersion ) { |
| 443 | if ( $pb->version === null ) { |
| 444 | throw new ClientError( 'Missing schema version' ); |
| 445 | } |
| 446 | |
| 447 | if ( $targetSchemaVersion === $pb->version ) { |
| 448 | // nothing to do. |
| 449 | return; |
| 450 | } |
| 451 | |
| 452 | if ( !$pb->parsoid ) { |
| 453 | // XXX: Should we also support downgrades if $pb->html has everything inlined? |
| 454 | // XXX: The downgrade should really be an operation on the DOM. |
| 455 | return; |
| 456 | } |
| 457 | |
| 458 | // We need to downgrade the original to match the edited doc's version. |
| 459 | $downgrade = Parsoid::findDowngrade( $pb->version, $targetSchemaVersion ); |
| 460 | |
| 461 | if ( !$downgrade ) { |
| 462 | throw new ClientError( |
| 463 | "No downgrade possible from schema version {$pb->version} to {$targetSchemaVersion}." |
| 464 | ); |
| 465 | } |
| 466 | |
| 467 | $this->incrementMetrics( |
| 468 | "downgrade_total", |
| 469 | [ 'from' => $downgrade['from'], 'to' => $downgrade['to'] ] |
| 470 | ); |
| 471 | |
| 472 | $downgradeTime = microtime( true ); |
| 473 | Parsoid::downgrade( $downgrade, $pb, $this->siteConfig ); |
| 474 | if ( $this->metrics ) { |
| 475 | $this->metrics |
| 476 | ->getTiming( 'downgrade_time_ms' ) |
| 477 | ->observe( ( microtime( true ) - $downgradeTime ) * 1000 ); |
| 478 | } |
| 479 | // NOTE: Set $this->originalBody to null so getOriginalBody() will re-generate it. |
| 480 | // XXX: Parsoid::downgrade operates on the parsed Document, would be nice |
| 481 | // if we could get that instead of getting back HTML which we have to |
| 482 | // parse again! |
| 483 | $this->originalBody = null; |
| 484 | } |
| 485 | |
| 486 | /** |
| 487 | * @param Document $doc |
| 488 | * @param BasePageBundle $pb |
| 489 | * @return Document $doc The Document with page bundle information in |
| 490 | * inline-attribute form |
| 491 | * |
| 492 | * @throws ClientError |
| 493 | */ |
| 494 | private function applyPageBundle( Document $doc, BasePageBundle $pb ): Document { |
| 495 | if ( $pb->parsoid === null && $pb->mw === null ) { |
| 496 | return $doc; |
| 497 | } |
| 498 | |
| 499 | // Verify that the top-level parsoid object either doesn't contain |
| 500 | // offsetType, or that it matches the conversion that has been |
| 501 | // explicitly requested. |
| 502 | if ( isset( $pb->parsoid['offsetType'] ) ) { |
| 503 | $offsetType = $this->getOffsetType(); |
| 504 | $origOffsetType = $pb->parsoid['offsetType'] ?? $offsetType; |
| 505 | if ( $origOffsetType !== $offsetType ) { |
| 506 | throw new ClientError( |
| 507 | 'DSR offsetType mismatch: ' . $origOffsetType . ' vs ' . $offsetType |
| 508 | ); |
| 509 | } |
| 510 | } |
| 511 | |
| 512 | $this->validatePageBundle( $pb ); |
| 513 | return $pb->withDocument( $doc )->toInlineAttributeDocument( |
| 514 | siteConfig: $this->siteConfig, |
| 515 | ); |
| 516 | } |
| 517 | |
| 518 | /** |
| 519 | * Get a selective serialization (selser) data object. This
| 520 | * can be null if selser is not enabled or the original content is not known.
| 521 | * |
| 522 | * @return SelserData|null |
| 523 | * @throws HttpException |
| 524 | */ |
| 525 | private function getSelserData(): ?SelserData { |
| 526 | $oldhtml = $this->hasOriginalHtml() ? $this->getOriginalHtml() : null; |
| 527 | |
| 528 | // Selser requires knowledge of the original wikitext. |
| 529 | $knowsOriginal = $this->knowsOriginalContent(); |
| 530 | |
| 531 | if ( $knowsOriginal && !empty( $this->parsoidSettings['useSelser'] ) ) { |
| 532 | if ( !$this->getPageConfig()->getRevisionContent() ) { |
| 533 | throw new LocalizedHttpException( new MessageValue( "rest-previous-revision-unavailable" ), |
| 534 | 409 ); |
| 535 | } |
| 536 | |
| 537 | // TODO: T234548/T234549 - $pageConfig->getPageMainContent() is deprecated: |
| 538 | // should use $env->topFrame->getSrcText() |
| 539 | $selserData = new SelserData( $this->getPageConfig()->getPageMainContent(), |
| 540 | $oldhtml ); |
| 541 | } else { |
| 542 | $selserData = null; |
| 543 | } |
| 544 | |
| 545 | return $selserData; |
| 546 | } |
| 547 | |
| 548 | private function getContentHandler(): ContentHandler { |
| 549 | $model = $this->getContentModel(); |
| 550 | |
| 551 | return $this->contentHandlerFactory |
| 552 | ->getContentHandler( $model ); |
| 553 | } |
| 554 | |
| 555 | /** |
| 556 | * Returns a Content object derived from the supplied HTML. |
| 557 | */ |
| 558 | public function htmlToContent(): Content { |
| 559 | $text = $this->htmlToText(); |
| 560 | $content = $this->getContentHandler()->unserializeContent( $text ); |
| 561 | |
| 562 | return $content; |
| 563 | } |
| 564 | |
| 565 | /** |
| 566 | * Converts the input HTML to source format, typically wikitext. |
| 567 | * |
| 568 | * @see Parsoid::dom2wikitext |
| 569 | * |
| 570 | * @return string |
| 571 | */ |
| 572 | private function htmlToText(): string { |
| 573 | $doc = $this->getModifiedDocument(); |
| 574 | $htmlSize = $this->getModifiedHtmlSize(); |
| 575 | $inputContentVersion = $this->getSchemaVersion(); |
| 576 | $selserData = $this->getSelserData(); |
| 577 | |
| 578 | try { |
| 579 | $text = $this->parsoid->dom2wikitext( $this->getPageConfig(), $doc, [ |
| 580 | 'inputContentVersion' => $inputContentVersion, |
| 581 | 'offsetType' => $this->getOffsetType(), |
| 582 | 'contentmodel' => $this->getContentModel(), |
| 583 | 'htmlSize' => $htmlSize, // used to trigger status 413 if the input is too big |
| 584 | ], $selserData ); |
| 585 | } catch ( ClientError $e ) { |
| 586 | throw new LocalizedHttpException( new MessageValue( "rest-parsoid-error", [ $e->getMessage() ] ), 400 ); |
| 587 | } catch ( ResourceLimitExceededException $e ) { |
| 588 | throw new LocalizedHttpException( |
| 589 | new MessageValue( "rest-parsoid-resource-exceeded", [ $e->getMessage() ] ), 413 |
| 590 | ); |
| 591 | } |
| 592 | |
| 593 | return $text; |
| 594 | } |
| 595 | |
| 596 | } |