Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
24.47% |
23 / 94 |
|
14.29% |
1 / 7 |
CRAP | |
0.00% |
0 / 1 |
| ContentModelHandler | |
24.47% |
23 / 94 |
|
14.29% |
1 / 7 |
111.96 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| canonicalizeDOM | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
2 | |||
| setupSelser | |
0.00% |
0 / 13 |
|
0.00% |
0 / 1 |
6 | |||
| processIndicators | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
12 | |||
| toDOM | |
0.00% |
0 / 28 |
|
0.00% |
0 / 1 |
12 | |||
| preprocessEditedDOM | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
12 | |||
| fromDOM | |
100.00% |
23 / 23 |
|
100.00% |
1 / 1 |
2 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Wikitext; |
| 5 | |
| 6 | use Wikimedia\Assert\Assert; |
| 7 | use Wikimedia\Parsoid\Config\Env; |
| 8 | use Wikimedia\Parsoid\Core\ContentModelHandler as IContentModelHandler; |
| 9 | use Wikimedia\Parsoid\Core\DomPageBundle; |
| 10 | use Wikimedia\Parsoid\Core\SelectiveUpdateData; |
| 11 | use Wikimedia\Parsoid\DOM\Document; |
| 12 | use Wikimedia\Parsoid\Ext\DOMProcessor as ExtDOMProcessor; |
| 13 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
| 14 | use Wikimedia\Parsoid\Html2Wt\RemoveRedLinks; |
| 15 | use Wikimedia\Parsoid\Html2Wt\SelectiveSerializer; |
| 16 | use Wikimedia\Parsoid\Html2Wt\WikitextSerializer; |
| 17 | use Wikimedia\Parsoid\Utils\ContentUtils; |
| 18 | use Wikimedia\Parsoid\Utils\DOMCompat; |
| 19 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
| 20 | use Wikimedia\Parsoid\Utils\Timing; |
| 21 | |
| 22 | class ContentModelHandler extends IContentModelHandler { |
| 23 | |
| 24 | /** @var Env */ |
| 25 | private $env; |
| 26 | |
/**
 * Store the environment on the handler.
 *
 * The Env has to be injected through the constructor because it is not
 * exposed as part of the ParsoidExtensionAPI.
 *
 * @param Env $env Environment read by toDOM() and fromDOM()
 */
public function __construct( Env $env ) {
	$this->env = $env;
}
| 36 | |
/**
 * Bring DOM to expected canonical form.
 *
 * The document must already be prepared and loaded (this is asserted).
 * DSR offsets are converted from the request offset type to bytes,
 * unnecessary wrappers and synthetic nodes are stripped from the body,
 * and the red-link remover is run over the body.
 *
 * @param Env $env
 * @param Document $doc Document to canonicalize in place
 */
private function canonicalizeDOM(
	Env $env, Document $doc
): void {
	Assert::invariant(
		DOMDataUtils::isPreparedAndLoaded( $doc ),
		"doc should already be prepared and loaded"
	);
	$body = DOMCompat::getBody( $doc );

	// Update DSR offsets if necessary.
	ContentUtils::convertOffsets(
		$env, $doc, $env->getRequestOffsetType(), 'byte'
	);

	// Strip <section> and mw:FallbackId <span> tags, if present,
	// as well as extended annotation wrappers.
	// This ensures that we can accept HTML from CX / VE
	// and other clients that might have stripped them.
	ContentUtils::stripUnnecessaryWrappersAndSyntheticNodes( $body );

	// Use the $env argument (not $this->env) so that every step of the
	// canonicalization works against the same environment.
	$redLinkRemover = new RemoveRedLinks( $env );
	$redLinkRemover->run( $body );
}
| 63 | |
/**
 * Fetch prior DOM for selser.
 *
 * If the original HTML is available in $selserData->revHTML it is loaded
 * into a document; otherwise the page source is reparsed via toDOM().
 * The resulting document is canonicalized and stored in
 * $selserData->revDOM.
 *
 * @param ParsoidExtensionAPI $extApi
 * @param SelectiveUpdateData $selserData Updated in place (revDOM is set)
 */
private function setupSelser(
	ParsoidExtensionAPI $extApi, SelectiveUpdateData $selserData
): void {
	$env = $this->env;

	// Why is it safe to use a reparsed dom for dom diff'ing?
	// (Since that's the only use of `env.page.dom`)
	//
	// There are two types of non-determinism to discuss:
	//
	// * The first is from parsoid generated ids. At this point,
	// data-attributes have already been applied so there's no chance
	// that variability in the ids used to associate data-attributes
	// will lead to data being applied to the wrong nodes.
	//
	// Further, although about ids will differ, they belong to the set
	// of ignorable attributes in the dom differ.
	//
	// * Templates, and encapsulated content in general, are the second.
	// Since that content can change in between parses, the resulting
	// dom might not be the same. However, because dom diffing on
	// those regions only uses data-mw for comparison (which will
	// remain constant between parses), this also shouldn't be an
	// issue.
	//
	// There is one caveat. Because encapsulated content isn't
	// guaranteed to be "balanced", the template affected regions
	// may change between parses. This should be rare.
	//
	// We therefore consider this safe since it won't corrupt the page
	// and, at worst, mixed up diff'ing annotations can end up with an
	// unfaithful serialization of the edit.
	//
	// However, in cases where original content is not returned by the
	// client / RESTBase, selective serialization cannot proceed and
	// we're forced to fallback to normalizing the entire page. This has
	// proved unacceptable to editors as is and, as we lean heavier on
	// selser, will only get worse over time.
	//
	// So, we're forced to trade off the correctness for usability.
	if ( $selserData->revHTML === null ) {
		$env->log( "warn/html2wt", "Missing selserData->revHTML. Regenerating." );

		// FIXME(T266838): Create a new Env for this parse? Something is
		// needed to avoid this rigmarole.
		$topLevelDoc = $env->getTopLevelDoc();
		$env->setupTopLevelDoc();
		try {
			// This effectively parses $selserData->revText for us because
			// $selserData->revText = $env->getPageConfig()->getPageMainContent()
			$doc = $this->toDOM( $extApi );
		} finally {
			// Always put the edited document back, even if the reparse
			// throws, so the env is never left pointing at the scratch doc.
			$env->setupTopLevelDoc( $topLevelDoc );
		}
	} else {
		$doc = ContentUtils::createAndLoadDocument(
			$selserData->revHTML,
			[ 'markNew' => true, 'validateXMLNames' => true, ]
		);
	}

	$this->canonicalizeDOM( $env, $doc );
	$selserData->revDOM = $doc;
}
| 131 | |
/**
 * Collect the indicator <meta> nodes of a document and register their
 * HTML, keyed by indicator name, on the metadata object of $extApi.
 *
 * @param Document $doc
 * @param ParsoidExtensionAPI $extApi
 */
private function processIndicators( Document $doc, ParsoidExtensionAPI $extApi ): void {
	// Only <meta>s are matched: erroneous indicators without names
	// will be <span>s.
	$indicatorMetas = DOMCompat::querySelectorAll( $doc, 'meta[typeof~="mw:Extension/indicator"]' );

	// https://www.mediawiki.org/wiki/Help:Page_status_indicators#Adding_page_status_indicators
	// says that last one wins. But, that may just be documentation of the
	// implementation vs. being a deliberate strategy.
	//
	// The nodes arrive in depth-first pre-order, i.e. document textual
	// order, so overwriting the map entry on every hit gives
	// "last-one-wins" semantics for indicators sharing a name.
	$htmlByName = [];
	foreach ( $indicatorMetas as $indicatorMeta ) {
		$dataMw = DOMDataUtils::getDataMw( $indicatorMeta );
		$indicatorName = $dataMw->getExtAttrib( 'name' );
		$htmlByName[$indicatorName] = $dataMw->html;
	}

	// Publish exactly one indicator per unique name.
	foreach ( $htmlByName as $indicatorName => $indicatorHtml ) {
		$metadata = $extApi->getMetadata();
		$metadata->setIndicator( (string)$indicatorName, $indicatorHtml );
	}
}
| 156 | |
/**
 * Produce the DOM for the page, either by a full parse of the page's
 * main content or, when $selectiveUpdateData is given, by a selective
 * update of the previous revision's HTML.
 *
 * @inheritDoc
 */
public function toDOM(
	ParsoidExtensionAPI $extApi, ?SelectiveUpdateData $selectiveUpdateData = null
): Document {
	$env = $this->env;
	$pipelineFactory = $env->getPipelineFactory();

	if ( $selectiveUpdateData === null ) {
		// Full parse of the page source.
		$doc = $pipelineFactory->parse(
			// @phan-suppress-next-line PhanDeprecatedFunction not ready for topFrame yet
			$env->getPageConfig()->getPageMainContent()
		);

		// Hardcoded support for indicators
		// TODO: Eventually we'll want to apply this to selective updates as well
		$this->processIndicators( $doc, $extApi );
	} else {
		// Selective update: start from the previous revision's HTML.
		$loadOptions = [
			'markNew' => false, // !isSelectiveUpdate
			'validateXMLNames' => true,
		];
		$revDoc = ContentUtils::createAndLoadDocument(
			$selectiveUpdateData->revHTML,
			$loadOptions
		);
		Assert::invariant(
			!DomPageBundle::isSingleDocument( $revDoc ),
			"toplevelDoc should not be a single-document page bundle"
		);

		// Install the loaded doc as the top-level doc, then canonicalize
		// whatever the env now reports as its top-level doc.
		$env->setupTopLevelDoc( $revDoc );
		$this->canonicalizeDOM( $env, $env->getTopLevelDoc() );

		$selectiveUpdateData->revDOM = $revDoc;
		$doc = $pipelineFactory->selectiveDOMUpdate( $selectiveUpdateData );
	}

	Assert::invariant(
		DOMDataUtils::isPreparedAndLoaded( $doc ),
		"toDOM should return a prepared and loaded doc"
	);
	return $doc;
}
| 201 | |
/**
 * Run every registered extension DOM preprocessor over the edited DOM
 * before it is converted to wikitext.
 *
 * 1. The edited DOM might not be in canonical form because Parsoid might
 *    be providing server-side management of global state for extensions.
 *    Extension-provided handlers are run to bring the DOM back to
 *    canonical form. The original DOM isn't subject to this problem.
 *    FIXME: But, this is not the only reason an extension might register
 *    a preprocessor. How do we know when to run a preprocessor on both
 *    original & edited DOMs?
 * 2. This has to happen after all data attributes have been loaded.
 * 3. This has to happen before dom-diffs are run, to eliminate spurious
 *    diffs.
 *
 * @param Env $env
 * @param Document $doc
 */
private function preprocessEditedDOM( Env $env, Document $doc ): void {
	$siteConfig = $env->getSiteConfig();

	// Options handed to the object factory for each processor spec.
	$factoryOptions = [
		'allowClassName' => true,
		'assertClass' => ExtDOMProcessor::class,
	];

	// Processors are grouped per extension; walk every group in order.
	foreach ( $siteConfig->getExtDOMProcessors() as $processorSpecs ) {
		foreach ( $processorSpecs as $processorSpec ) {
			$processor = $siteConfig->getObjectFactory()->createObject(
				$processorSpec, $factoryOptions
			);
			// Each processor gets its own ParsoidExtensionAPI instance.
			$processor->htmlPreprocess(
				new ParsoidExtensionAPI( $env ), DOMCompat::getBody( $doc )
			);
		}
	}
}
| 232 | |
/**
 * Serialize the env's top-level document to wikitext, using selective
 * serialization when $selserData is supplied and full serialization
 * otherwise. Setup, preprocessing and serialization are timed separately.
 *
 * @inheritDoc
 */
public function fromDOM(
	ParsoidExtensionAPI $extApi, ?SelectiveUpdateData $selserData = null
): string {
	$env = $this->env;
	$siteConfig = $env->getSiteConfig();

	// Phase 1: setup (canonicalization + serializer construction)
	$setupTimer = Timing::start( $siteConfig );

	$this->canonicalizeDOM( $env, $env->getTopLevelDoc() );

	$options = [ 'selserData' => $selserData ];
	if ( $selserData === null ) {
		// Fallback
		$serializer = new WikitextSerializer( $env, $options );
		$wtsType = 'noselser';
	} else {
		// The serializer is built first; the prior DOM is fetched after.
		$serializer = new SelectiveSerializer( $env, $options );
		$this->setupSelser( $extApi, $selserData );
		$wtsType = 'selser';
	}

	$setupTimer->end( 'html2wt.setup', 'html2wt_setup_seconds', [] );

	// Phase 2: extension preprocessing of the edited DOM
	$preprocessTimer = Timing::start( $siteConfig );
	$this->preprocessEditedDOM( $env, $env->getTopLevelDoc() );
	$preprocessTimer->end( 'html2wt.preprocess', 'html2wt_preprocess_seconds', [] );

	// Phase 3: serialization
	$serializeTimer = Timing::start( $siteConfig );
	$wikitext = $serializer->serializeDOM( $env->getTopLevelDoc() );
	$serializeTimer->end(
		"html2wt.{$wtsType}.serialize",
		"html2wt_serialize_seconds",
		[ 'wts' => $wtsType ]
	);

	return $wikitext;
}
| 272 | |
| 273 | } |