Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
37.25% |
19 / 51 |
|
16.67% |
1 / 6 |
CRAP | |
0.00% |
0 / 1 |
ContentModelHandler | |
37.25% |
19 / 51 |
|
16.67% |
1 / 6 |
40.89 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
canonicalizeDOM | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
2 | |||
setupSelser | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
6 | |||
toDOM | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
2 | |||
preprocessEditedDOM | |
0.00% |
0 / 10 |
|
0.00% |
0 / 1 |
12 | |||
fromDOM | |
100.00% |
19 / 19 |
|
100.00% |
1 / 1 |
3 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Wikitext; |
5 | |
6 | use Wikimedia\Parsoid\Config\Env; |
7 | use Wikimedia\Parsoid\Core\ContentModelHandler as IContentModelHandler; |
8 | use Wikimedia\Parsoid\Core\SelserData; |
9 | use Wikimedia\Parsoid\DOM\Document; |
10 | use Wikimedia\Parsoid\Ext\DOMProcessor as ExtDOMProcessor; |
11 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
12 | use Wikimedia\Parsoid\Html2Wt\RemoveRedLinks; |
13 | use Wikimedia\Parsoid\Html2Wt\SelectiveSerializer; |
14 | use Wikimedia\Parsoid\Html2Wt\WikitextSerializer; |
15 | use Wikimedia\Parsoid\Utils\ContentUtils; |
16 | use Wikimedia\Parsoid\Utils\DOMCompat; |
17 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
18 | use Wikimedia\Parsoid\Utils\Timing; |
19 | |
class ContentModelHandler extends IContentModelHandler {

	/** @var Env Environment shared by every parse / serialize call made through this handler */
	private $env;

	/**
	 * Sneak an environment in here since it's not exposed as part of the
	 * ParsoidExtensionAPI
	 *
	 * @param Env $env
	 */
	public function __construct( Env $env ) {
		$this->env = $env;
	}

	/**
	 * Bring DOM to expected canonical form.
	 *
	 * Steps, in order: load data attributes, convert DSR offsets from the
	 * request offset type to bytes, strip wrappers / synthetic nodes, and
	 * run the red link remover over the body.
	 *
	 * @param Env $env
	 * @param Document $doc Document whose body is canonicalized
	 */
	private function canonicalizeDOM( Env $env, Document $doc ): void {
		$body = DOMCompat::getBody( $doc );

		// Convert DOM to internal canonical form
		DOMDataUtils::visitAndLoadDataAttribs( $body, [ 'markNew' => true ] );

		// Update DSR offsets if necessary.
		ContentUtils::convertOffsets(
			$env, $doc, $env->getRequestOffsetType(), 'byte'
		);

		// Strip <section> and mw:FallbackId <span> tags, if present,
		// as well as extended annotation wrappers.
		// This ensures that we can accept HTML from CX / VE
		// and other clients that might have stripped them.
		ContentUtils::stripUnnecessaryWrappersAndSyntheticNodes( $body );

		// Use the Env we were handed rather than $this->env so that this
		// method works off a single environment throughout. Both callers
		// in this class pass $this->env, so this is the same object.
		$redLinkRemover = new RemoveRedLinks( $env );
		$redLinkRemover->run( $body );
	}

	/**
	 * Fetch prior DOM for selser.
	 *
	 * Stores the canonicalized prior DOM in $selserData->oldDOM. The DOM is
	 * built from $selserData->oldHTML when available; otherwise the page's
	 * main content is re-parsed.
	 *
	 * @param ParsoidExtensionAPI $extApi
	 * @param SelserData $selserData
	 */
	private function setupSelser( ParsoidExtensionAPI $extApi, SelserData $selserData ): void {
		$env = $this->env;

		// Why is it safe to use a reparsed dom for dom diff'ing?
		// (Since that's the only use of `env.page.dom`)
		//
		// There are two types of non-determinism to discuss:
		//
		//   * The first is from parsoid generated ids.  At this point,
		//     data-attributes have already been applied so there's no chance
		//     that variability in the ids used to associate data-attributes
		//     will lead to data being applied to the wrong nodes.
		//
		//     Further, although about ids will differ, they belong to the set
		//     of ignorable attributes in the dom differ.
		//
		//   * Templates, and encapsulated content in general, are the second.
		//     Since that content can change in between parses, the resulting
		//     dom might not be the same.  However, because dom diffing on
		//     those regions only uses data-mw for comparison (which will
		//     remain constant between parses), this also shouldn't be an
		//     issue.
		//
		//     There is one caveat.  Because encapsulated content isn't
		//     guaranteed to be "balanced", the template affected regions
		//     may change between parses.  This should be rare.
		//
		// We therefore consider this safe since it won't corrupt the page
		// and, at worst, mixed up diff'ing annotations can end up with an
		// unfaithful serialization of the edit.
		//
		// However, in cases where original content is not returned by the
		// client / RESTBase, selective serialization cannot proceed and
		// we're forced to fallback to normalizing the entire page.  This has
		// proved unacceptable to editors as is and, as we lean heavier on
		// selser, will only get worse over time.
		//
		// So, we're forced to trade off the correctness for usability.
		if ( $selserData->oldHTML === null ) {
			$env->log( "warn/html2wt", "Missing selserData->oldHTML. Regenerating." );

			// FIXME(T266838): Create a new Env for this parse?  Something is
			// needed to avoid this rigmarole.
			$topLevelDoc = $env->topLevelDoc;
			try {
				$env->setupTopLevelDoc();
				// This effectively parses $selserData->oldText for us because
				// $selserData->oldText = $env->getPageConfig()->getPageMainContent()
				$doc = $this->toDOM( $extApi );
			} finally {
				// Always put the edited document back, even if the re-parse
				// throws, so the Env is never left pointing at the scratch doc.
				$env->topLevelDoc = $topLevelDoc;
			}
		} else {
			$doc = ContentUtils::createDocument( $selserData->oldHTML, true );
		}

		$this->canonicalizeDOM( $env, $doc );
		$selserData->oldDOM = $doc;
	}

	/**
	 * @inheritDoc
	 */
	public function toDOM( ParsoidExtensionAPI $extApi ): Document {
		// Parses the page's main content through this handler's Env;
		// $extApi is not consulted here.
		return $this->env->getPipelineFactory()->parse(
			// @phan-suppress-next-line PhanDeprecatedFunction not ready for topFrame yet
			$this->env->getPageConfig()->getPageMainContent()
		);
	}

	/**
	 * Preprocess the edited DOM as required before attempting to convert it to wikitext
	 * 1. The edited DOM (represented by body) might not be in canonical form
	 *    because Parsoid might be providing server-side management of global state
	 *    for extensions. To address this and bring the DOM back to canonical form,
	 *    we run extension-provided handlers. The original DOM isn't subject to this problem.
	 *    FIXME: But, this is not the only reason an extension might register a preprocessor.
	 *    How do we know when to run a preprocessor on both original & edited DOMs?
	 * 2. We need to do this after all data attributes have been loaded.
	 * 3. We need to do this before we run dom-diffs to eliminate spurious diffs.
	 *
	 * @param Env $env
	 * @param Document $doc
	 */
	private function preprocessEditedDOM( Env $env, Document $doc ): void {
		$siteConfig = $env->getSiteConfig();

		// Run any registered DOM preprocessors.
		// The outer collection groups processor specs per extension; only
		// the specs themselves are needed here, so the keys are not bound.
		foreach ( $siteConfig->getExtDOMProcessors() as $domProcs ) {
			foreach ( $domProcs as $classNameOrSpec ) {
				$c = $siteConfig->getObjectFactory()->createObject( $classNameOrSpec, [
					'allowClassName' => true,
					'assertClass' => ExtDOMProcessor::class,
				] );
				// Each processor gets its own extension API instance and a
				// freshly looked-up body, exactly as before.
				$c->htmlPreprocess(
					new ParsoidExtensionAPI( $env ), DOMCompat::getBody( $doc )
				);
			}
		}
	}

	/**
	 * @inheritDoc
	 */
	public function fromDOM(
		ParsoidExtensionAPI $extApi, ?SelserData $selserData = null
	): string {
		$env = $this->env;
		$metrics = $env->getSiteConfig()->metrics();
		$setupTiming = Timing::start( $metrics );

		// The edited document is the Env's top-level doc.
		$this->canonicalizeDOM( $env, $env->topLevelDoc );

		$serializerOpts = [ 'env' => $env, 'selserData' => $selserData ];
		if ( $selserData && $selserData->oldText !== null ) {
			// Selective serialization needs the prior DOM; setupSelser()
			// fills in $selserData->oldDOM on the same object that was
			// handed to the serializer above it.
			$serializer = new SelectiveSerializer( $serializerOpts );
			$this->setupSelser( $extApi, $selserData );
			$wtsType = 'selser';
		} else {
			// Fallback
			$serializer = new WikitextSerializer( $serializerOpts );
			$wtsType = 'noselser';
		}

		$setupTiming->end( 'html2wt.setup' );

		$preprocTiming = Timing::start( $metrics );
		$this->preprocessEditedDOM( $env, $env->topLevelDoc );
		$preprocTiming->end( 'html2wt.preprocess' );

		$serializeTiming = Timing::start( $metrics );
		$res = $serializer->serializeDOM( $env->topLevelDoc );
		$serializeTiming->end( "html2wt.{$wtsType}.serialize" );

		return $res;
	}

}