Code Coverage for /src/src/Wikitext/ContentModelHandler.php

	Code Coverage
	Lines			Functions and Methods				Classes and Traits
Total	29.69% covered (danger)	29.69%	19 / 64	14.29% covered (danger)	14.29%	1 / 7	CRAP	0.00% covered (danger)	0.00%	0 / 1
ContentModelHandler	29.69% covered (danger)	29.69%	19 / 64	14.29% covered (danger)	14.29%	1 / 7	93.21	0.00% covered (danger)	0.00%	0 / 1
__construct	0.00% covered (danger)	0.00%	0 / 1	0.00% covered (danger)	0.00%	0 / 1	2
canonicalizeDOM	0.00% covered (danger)	0.00%	0 / 8	0.00% covered (danger)	0.00%	0 / 1	2
setupSelser	0.00% covered (danger)	0.00%	0 / 10	0.00% covered (danger)	0.00%	0 / 1	6
processIndicators	0.00% covered (danger)	0.00%	0 / 11	0.00% covered (danger)	0.00%	0 / 1	20
toDOM	0.00% covered (danger)	0.00%	0 / 5	0.00% covered (danger)	0.00%	0 / 1	2
preprocessEditedDOM	0.00% covered (danger)	0.00%	0 / 10	0.00% covered (danger)	0.00%	0 / 1	12
fromDOM	100.00% covered (success)	100.00%	19 / 19	100.00% covered (success)	100.00%	1 / 1	3

1	<?php
2	declare( strict_types = 1 );
3
4	namespace Wikimedia\Parsoid\Wikitext;
5
6	use Wikimedia\Parsoid\Config\Env;
7	use Wikimedia\Parsoid\Core\ContentModelHandler as IContentModelHandler;
8	use Wikimedia\Parsoid\Core\SelserData;
9	use Wikimedia\Parsoid\DOM\Document;
10	use Wikimedia\Parsoid\Ext\DOMProcessor as ExtDOMProcessor;
11	use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI;
12	use Wikimedia\Parsoid\Html2Wt\RemoveRedLinks;
13	use Wikimedia\Parsoid\Html2Wt\SelectiveSerializer;
14	use Wikimedia\Parsoid\Html2Wt\WikitextSerializer;
15	use Wikimedia\Parsoid\Utils\ContentUtils;
16	use Wikimedia\Parsoid\Utils\DOMCompat;
17	use Wikimedia\Parsoid\Utils\DOMDataUtils;
18	use Wikimedia\Parsoid\Utils\Timing;
19
class ContentModelHandler extends IContentModelHandler {

	/** @var Env */
	private $env;

	/**
	 * Sneak an environment in here since it's not exposed as part of the
	 * ParsoidExtensionAPI
	 *
	 * @param Env $env
	 */
	public function __construct( Env $env ) {
		$this->env = $env;
	}

	/**
	 * Bring DOM to expected canonical form.
	 *
	 * Loads the data attributes, converts DSR offsets from the request's
	 * offset type to bytes, strips wrappers / synthetic nodes that clients
	 * may or may not have preserved, and finally runs the red-link remover
	 * over the body.
	 *
	 * @param Env $env
	 * @param Document $doc
	 */
	private function canonicalizeDOM( Env $env, Document $doc ): void {
		$body = DOMCompat::getBody( $doc );

		// Convert DOM to internal canonical form
		DOMDataUtils::visitAndLoadDataAttribs( $body, [ 'markNew' => true ] );

		// Update DSR offsets if necessary.
		ContentUtils::convertOffsets(
			$env, $doc, $env->getRequestOffsetType(), 'byte'
		);

		// Strip <section> and mw:FallbackId <span> tags, if present,
		// as well as extended annotation wrappers.
		// This ensures that we can accept HTML from CX / VE
		// and other clients that might have stripped them.
		ContentUtils::stripUnnecessaryWrappersAndSyntheticNodes( $body );

		// Use the environment we were handed rather than $this->env so that
		// every step of the canonicalization works against the same Env.
		$redLinkRemover = new RemoveRedLinks( $env );
		$redLinkRemover->run( $body );
	}

	/**
	 * Fetch prior DOM for selser.
	 *
	 * Populates $selserData->oldDOM, either from the supplied old HTML or,
	 * when that is missing, by reparsing the page's main content.
	 *
	 * @param ParsoidExtensionAPI $extApi
	 * @param SelserData $selserData
	 */
	private function setupSelser( ParsoidExtensionAPI $extApi, SelserData $selserData ): void {
		$env = $this->env;

		// Why is it safe to use a reparsed dom for dom diff'ing?
		// (Since that's the only use of `env.page.dom`)
		//
		// There are two types of non-determinism to discuss:
		//
		//   * The first is from parsoid generated ids.  At this point,
		//     data-attributes have already been applied so there's no chance
		//     that variability in the ids used to associate data-attributes
		//     will lead to data being applied to the wrong nodes.
		//
		//     Further, although about ids will differ, they belong to the set
		//     of ignorable attributes in the dom differ.
		//
		//   * Templates, and encapsulated content in general, are the second.
		//     Since that content can change in between parses, the resulting
		//     dom might not be the same.  However, because dom diffing on
		//     those regions only uses data-mw for comparison (which will
		//     remain constant between parses), this also shouldn't be an
		//     issue.
		//
		//     There is one caveat.  Because encapsulated content isn't
		//     guaranteed to be "balanced", the template affected regions
		//     may change between parses.  This should be rare.
		//
		// We therefore consider this safe since it won't corrupt the page
		// and, at worst, mixed up diff'ing annotations can end up with an
		// unfaithful serialization of the edit.
		//
		// However, in cases where original content is not returned by the
		// client / RESTBase, selective serialization cannot proceed and
		// we're forced to fallback to normalizing the entire page.  This has
		// proved unacceptable to editors as is and, as we lean heavier on
		// selser, will only get worse over time.
		//
		// So, we're forced to trade off the correctness for usability.
		if ( $selserData->oldHTML === null ) {
			$env->log( "warn/html2wt", "Missing selserData->oldHTML. Regenerating." );

			// FIXME(T266838): Create a new Env for this parse?  Something is
			// needed to avoid this rigmarole.
			$topLevelDoc = $env->topLevelDoc;
			$env->setupTopLevelDoc();
			// This effectively parses $selserData->oldText for us because
			// $selserData->oldText = $env->getPageconfig()->getPageMainContent()
			$doc = $this->toDOM( $extApi );
			// Restore the edited document that the reparse displaced.
			$env->topLevelDoc = $topLevelDoc;
		} else {
			$doc = ContentUtils::createDocument( $selserData->oldHTML, true );
		}

		$this->canonicalizeDOM( $env, $doc );
		$selserData->oldDOM = $doc;
	}

	/**
	 * Collect the page status indicators present in the document and
	 * record them on the metadata object exposed by the extension API.
	 *
	 * @param Document $doc
	 * @param ParsoidExtensionAPI $extApi
	 */
	private function processIndicators( Document $doc, ParsoidExtensionAPI $extApi ): void {
		// Erroneous indicators without names will be <span>s
		$indicators = DOMCompat::querySelectorAll( $doc, 'meta[typeof~="mw:Extension/indicator"]' );
		$iData = [];

		// https://www.mediawiki.org/wiki/Help:Page_status_indicators#Adding_page_status_indicators
		// says that last one wins. But, that may just be documentation of the
		// implementation vs. being a deliberate strategy.
		//
		// The indicators are ordered by depth-first pre-order DOM traversal.
		// This ensures that the indicators are in document textual order.
		// Given that, the for-loop below implements "last-one-wins" semantics
		// for indicators that use the same name key.
		foreach ( $indicators as $meta ) {
			$dataMwAttr = DOMCompat::getAttribute( $meta, 'data-mw' );
			if ( $dataMwAttr === null ) {
				// Without data-mw there is neither a name nor any html to
				// record; dereferencing the missing payload would only emit
				// warnings and register a bogus indicator under an empty key.
				continue;
			}

			// Since the DOM is in "stored" state, we have to reparse data-mw here.
			$codec = DOMDataUtils::getCodec( $doc );
			$dmw = $codec->newFromJsonString( $dataMwAttr, DOMDataUtils::getCodecHints()['data-mw'] );
			$name = $dmw->attrs->name;
			$iData[$name] = $dmw->html;
		}

		// set indicator metadata for unique keys
		foreach ( $iData as $name => $html ) {
			// Array keys that look like integers come back as ints; cast
			// so that the indicator name is always passed as a string.
			$extApi->getMetadata()->setIndicator( (string)$name, $html );
		}
	}

	/**
	 * @inheritDoc
	 */
	public function toDOM( ParsoidExtensionAPI $extApi ): Document {
		$doc = $this->env->getPipelineFactory()->parse(
			// @phan-suppress-next-line PhanDeprecatedFunction not ready for topFrame yet
			$this->env->getPageConfig()->getPageMainContent()
		);

		// Hardcoded support for indicators
		$this->processIndicators( $doc, $extApi );

		return $doc;
	}

	/**
	 * Preprocess the edited DOM as required before attempting to convert it to wikitext
	 * 1. The edited DOM (represented by body) might not be in canonical form
	 *    because Parsoid might be providing server-side management of global state
	 *    for extensions. To address this and bring the DOM back to canonical form,
	 *    we run extension-provided handlers. The original DOM isn't subject to this problem.
	 *    FIXME: But, this is not the only reason an extension might register a preprocessor.
	 *    How do we know when to run a preprocessor on both original & edited DOMs?
	 * 2. We need to do this after all data attributes have been loaded.
	 * 3. We need to do this before we run dom-diffs to eliminate spurious diffs.
	 *
	 * @param Env $env
	 * @param Document $doc
	 */
	private function preprocessEditedDOM( Env $env, Document $doc ): void {
		$siteConfig = $env->getSiteConfig();

		// Run any registered DOM preprocessors
		foreach ( $siteConfig->getExtDOMProcessors() as $domProcs ) {
			foreach ( $domProcs as $classNameOrSpec ) {
				$c = $siteConfig->getObjectFactory()->createObject( $classNameOrSpec, [
					'allowClassName' => true,
					'assertClass' => ExtDOMProcessor::class,
				] );
				// Each processor is handed its own extension API instance.
				$c->htmlPreprocess(
					new ParsoidExtensionAPI( $env ), DOMCompat::getBody( $doc )
				);
			}
		}
	}

	/**
	 * @inheritDoc
	 */
	public function fromDOM(
		ParsoidExtensionAPI $extApi, ?SelserData $selserData = null
	): string {
		$env = $this->env;
		$metrics = $env->getSiteConfig()->metrics();
		$setupTiming = Timing::start( $metrics );

		$this->canonicalizeDOM( $env, $env->topLevelDoc );

		$serializerOpts = [ 'selserData' => $selserData ];
		if ( $selserData && $selserData->oldText !== null ) {
			// Selective serialization needs the prior DOM to diff against.
			$serializer = new SelectiveSerializer( $env, $serializerOpts );
			$this->setupSelser( $extApi, $selserData );
			$wtsType = 'selser';
		} else {
			// Fallback
			$serializer = new WikitextSerializer( $env, $serializerOpts );
			$wtsType = 'noselser';
		}

		$setupTiming->end( 'html2wt.setup' );

		$preprocTiming = Timing::start( $metrics );
		$this->preprocessEditedDOM( $env, $env->topLevelDoc );
		$preprocTiming->end( 'html2wt.preprocess' );

		$serializeTiming = Timing::start( $metrics );
		$res = $serializer->serializeDOM( $env->topLevelDoc );
		$serializeTiming->end( "html2wt.{$wtsType}.serialize" );

		return $res;
	}

}