Code Coverage |
||||||||||
Classes and Traits |
Functions and Methods |
Lines |
||||||||
Total | |
0.00% |
0 / 1 |
|
16.67% |
1 / 6 |
CRAP | |
40.43% |
19 / 47 |
ContentModelHandler | |
0.00% |
0 / 1 |
|
16.67% |
1 / 6 |
42.45 | |
40.43% |
19 / 47 |
__construct | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 2 |
|||
canonicalizeDOM | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 6 |
|||
setupSelser | |
0.00% |
0 / 1 |
6 | |
0.00% |
0 / 10 |
|||
toDOM | |
0.00% |
0 / 1 |
2 | |
0.00% |
0 / 2 |
|||
preprocessDOM | |
0.00% |
0 / 1 |
12 | |
0.00% |
0 / 8 |
|||
fromDOM | |
100.00% |
1 / 1 |
4 | |
100.00% |
19 / 19 |
<?php | |
declare( strict_types = 1 ); | |
namespace Wikimedia\Parsoid\Wikitext; | |
use Wikimedia\Parsoid\Config\Env; | |
use Wikimedia\Parsoid\Core\ContentModelHandler as IContentModelHandler; | |
use Wikimedia\Parsoid\Core\SelserData; | |
use Wikimedia\Parsoid\DOM\Document; | |
use Wikimedia\Parsoid\Ext\DOMProcessor as ExtDOMProcessor; | |
use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; | |
use Wikimedia\Parsoid\Html2Wt\SelectiveSerializer; | |
use Wikimedia\Parsoid\Html2Wt\WikitextSerializer; | |
use Wikimedia\Parsoid\Utils\ContentUtils; | |
use Wikimedia\Parsoid\Utils\DOMCompat; | |
use Wikimedia\Parsoid\Utils\DOMDataUtils; | |
use Wikimedia\Parsoid\Utils\Timing; | |
class ContentModelHandler extends IContentModelHandler { | |
/** @var Env */ | |
private $env; | |
/** | |
* Sneak an environment in here since it's not exposed as part of the | |
* ParsoidExtensionAPI | |
* | |
* @param Env $env | |
*/ | |
public function __construct( Env $env ) { | |
$this->env = $env; | |
} | |
/** | |
* Bring DOM to expected canonical form | |
* @param Env $env | |
* @param Document $doc | |
*/ | |
private function canonicalizeDOM( Env $env, Document $doc ): void { | |
$body = DOMCompat::getBody( $doc ); | |
// Convert DOM to internal canonical form | |
DOMDataUtils::visitAndLoadDataAttribs( $body, [ 'markNew' => true ] ); | |
// Update DSR offsets if necessary. | |
ContentUtils::convertOffsets( | |
$env, $doc, $env->getRequestOffsetType(), 'byte' | |
); | |
// Strip <section> and mw:FallbackId <span> tags, if present, | |
// as well as extended annotation wrappers. | |
// This ensures that we can accept HTML from CX / VE | |
// and other clients that might have stripped them. | |
ContentUtils::stripUnnecessaryWrappersAndFallbackIds( $body ); | |
} | |
/** | |
* Fetch prior DOM for selser. | |
* | |
* @param ParsoidExtensionAPI $extApi | |
* @param SelserData $selserData | |
*/ | |
private function setupSelser( ParsoidExtensionAPI $extApi, SelserData $selserData ) { | |
$env = $this->env; | |
// Why is it safe to use a reparsed dom for dom diff'ing? | |
// (Since that's the only use of `env.page.dom`) | |
// | |
// There are two types of non-determinism to discuss: | |
// | |
// * The first is from parsoid generated ids. At this point, | |
// data-attributes have already been applied so there's no chance | |
// that variability in the ids used to associate data-attributes | |
// will lead to data being applied to the wrong nodes. | |
// | |
// Further, although about ids will differ, they belong to the set | |
// of ignorable attributes in the dom differ. | |
// | |
// * Templates, and encapsulated content in general, are the second. | |
// Since that content can change in between parses, the resulting | |
// dom might not be the same. However, because dom diffing on | |
// on those regions only uses data-mw for comparision (which will | |
// remain constant between parses), this also shouldn't be an | |
// issue. | |
// | |
// There is one caveat. Because encapsulated content isn't | |
// guaranteed to be "balanced", the template affected regions | |
// may change between parses. This should be rare. | |
// | |
// We therefore consider this safe since it won't corrupt the page | |
// and, at worst, mixed up diff'ing annotations can end up with an | |
// unfaithful serialization of the edit. | |
// | |
// However, in cases where original content is not returned by the | |
// client / RESTBase, selective serialization cannot proceed and | |
// we're forced to fallback to normalizing the entire page. This has | |
// proved unacceptable to editors as is and, as we lean heavier on | |
// selser, will only get worse over time. | |
// | |
// So, we're forced to trade off the correctness for usability. | |
if ( $selserData->oldHTML === null ) { | |
// FIXME(T266838): Create a new Env for this parse? Something is | |
// needed to avoid this rigmarole. | |
$topLevelDoc = $env->topLevelDoc; | |
$env->setupTopLevelDoc(); | |
// This effectively parses $selserData->oldText for us because | |
// $selserData->oldText = $env->getPageconfig()->getPageMainContent() | |
$doc = $this->toDOM( $extApi ); | |
$env->topLevelDoc = $topLevelDoc; | |
} else { | |
$doc = ContentUtils::createDocument( $selserData->oldHTML, true ); | |
} | |
$this->canonicalizeDOM( $env, $doc ); | |
$selserData->oldDOM = $doc; | |
} | |
/** | |
* @inheritDoc | |
*/ | |
public function toDOM( ParsoidExtensionAPI $extApi ): Document { | |
return $this->env->getPipelineFactory()->parse( | |
$this->env->getPageConfig()->getPageMainContent() | |
); | |
} | |
/** | |
* Preprocess the edited DOM as required before attempting to convert it to wikitext | |
* 1. The edited DOM (represented by body) might not be in canonical form | |
* because Parsoid might be providing server-side management of global state | |
* for extensions. To address this and bring the DOM back to canonical form, | |
* we run extension-provided handlers. The original DOM isn't subject to this problem. | |
* FIXME: But, this is not the only reason an extension might register a preprocessor. | |
* How do we know when to run a preprocessor on both original & edited DOMs? | |
* 2. We need to do this after all data attributes have been loaded. | |
* 3. We need to do this before we run dom-diffs to eliminate spurious diffs. | |
* | |
* @param Env $env | |
* @param Document $doc | |
*/ | |
private function preprocessDOM( Env $env, Document $doc ): void { | |
$siteConfig = $env->getSiteConfig(); | |
// Run any registered DOM preprocessors | |
foreach ( $siteConfig->getExtDOMProcessors() as $extName => $domProcs ) { | |
foreach ( $domProcs as $i => $classNameOrSpec ) { | |
$c = $siteConfig->getObjectFactory()->createObject( $classNameOrSpec, [ | |
'allowClassName' => true, | |
'assertClass' => ExtDOMProcessor::class, | |
] ); | |
$c->htmlPreprocess( | |
new ParsoidExtensionAPI( $env ), DOMCompat::getBody( $doc ) | |
); | |
} | |
} | |
} | |
/** | |
* @inheritDoc | |
*/ | |
public function fromDOM( | |
ParsoidExtensionAPI $extApi, ?SelserData $selserData = null | |
): string { | |
$env = $this->env; | |
$metrics = $env->getSiteConfig()->metrics(); | |
$setupTiming = Timing::start( $metrics ); | |
$this->canonicalizeDOM( $env, $env->topLevelDoc ); | |
$serializerOpts = [ 'env' => $env, 'selserData' => $selserData ]; | |
if ( $selserData && $selserData->oldText !== null ) { | |
$serializer = new SelectiveSerializer( $serializerOpts ); | |
$this->setupSelser( $extApi, $selserData ); | |
$wtsType = 'selser'; | |
} else { | |
// Fallback | |
$serializer = new WikitextSerializer( $serializerOpts ); | |
$wtsType = 'noselser'; | |
} | |
$setupTiming->end( 'html2wt.setup' ); | |
$preprocTiming = Timing::start( $metrics ); | |
$this->preprocessDOM( $env, $env->topLevelDoc ); | |
$preprocTiming->end( 'html2wt.preprocess' ); | |
$serializeTiming = Timing::start( $metrics ); | |
$res = $serializer->serializeDOM( $env->topLevelDoc ); | |
$serializeTiming->end( "html2wt.{$wtsType}.serialize" ); | |
return $res; | |
} | |
} |