Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 83 |
|
0.00% |
0 / 4 |
CRAP | |
0.00% |
0 / 1 |
SelectiveSerializer | |
0.00% |
0 / 83 |
|
0.00% |
0 / 4 |
930 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
2 | |||
wrapTextChildrenOfNode | |
0.00% |
0 / 53 |
|
0.00% |
0 / 1 |
506 | |||
preprocessDOMForSelser | |
0.00% |
0 / 3 |
|
0.00% |
0 / 1 |
6 | |||
serializeDOM | |
0.00% |
0 / 23 |
|
0.00% |
0 / 1 |
30 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use Composer\Semver\Semver; |
7 | use Wikimedia\Parsoid\Config\Env; |
8 | use Wikimedia\Parsoid\Core\DomSourceRange; |
9 | use Wikimedia\Parsoid\Core\SelserData; |
10 | use Wikimedia\Parsoid\DOM\Comment; |
11 | use Wikimedia\Parsoid\DOM\Document; |
12 | use Wikimedia\Parsoid\DOM\Element; |
13 | use Wikimedia\Parsoid\DOM\Text; |
14 | use Wikimedia\Parsoid\Utils\ContentUtils; |
15 | use Wikimedia\Parsoid\Utils\DOMCompat; |
16 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
17 | use Wikimedia\Parsoid\Utils\DOMUtils; |
18 | use Wikimedia\Parsoid\Utils\Timing; |
19 | use Wikimedia\Parsoid\Utils\Utils; |
20 | use Wikimedia\Parsoid\Utils\WTUtils; |
21 | use Wikimedia\Parsoid\Wikitext\Consts; |
22 | |
/**
 * Serializer that compares an edited DOM against the original DOM and
 * re-uses the original wikitext for the regions that were not modified.
 *
 * Originally this relied on special change markers inserted by the
 * editor, but the markers are now generated here using DOMDiff.
 */
class SelectiveSerializer {

	/** @var Env */
	private $env;

	/** @var WikitextSerializer Serializer invoked once the DOM has been diff-marked */
	private $wts;

	/** @var bool Whether the 'selser' trace flag is set on the environment */
	private $trace;

	/** @var SelserData */
	private $selserData;

	/**
	 * @param Env $env
	 * @param array $options Passed through unchanged to WikitextSerializer;
	 *   must contain a 'selserData' entry holding the original DOM and text.
	 */
	public function __construct( Env $env, array $options ) {
		$this->env = $env;
		$this->wts = new WikitextSerializer( $env, $options );
		$this->selserData = $options['selserData'];

		// Debug options
		$this->trace = $this->env->hasTraceFlag( 'selser' );
	}

	/**
	 * Wrap text node children of nodes with name $nodeName in <span> tags and
	 * compute the DSR values for those span tags.
	 *
	 * This helps DOMDiff mark up diffs of content in these nodes at a more
	 * fine-grained level.
	 *
	 * These DSR values rely on availability of information about trimmed leading
	 * and trailing WS in these nodes in the wt->html direction. Given this info,
	 * on the original unedited DOM, the computed DSR values for span tags wrapping
	 * text nodes will be accurate.
	 *
	 * However, for the edited DOM with modified nodes, the computation is
	 * necessarily speculative and as such, the computed DSR values may be bogus.
	 * Given this, we rely on DOMDiff to diff the data-parsoid attribute and mark
	 * these nodes as modified because of the mismatched dsr values. If so, these
	 * span tags will never have selser reuse apply to them and the speculatively
	 * computed DSR values will be discarded.
	 *
	 * The DOM under $body is modified in place.
	 *
	 * @param Element $body
	 * @param string $nodeName
	 */
	private function wrapTextChildrenOfNode( Element $body, string $nodeName ): void {
		// Note that while it might seem that only the first and last child need to be
		// wrapped, when nested list items are added, the previously last child of
		// a list item become an intermediate child in the new DOM. Without the span
		// wrapper, trailing trimmed whitespace gets dropped.
		$inListItem = isset( Consts::$HTML['ListItemTags'][$nodeName] );
		$doc = $body->ownerDocument;

		foreach ( DOMCompat::querySelectorAll( $body, $nodeName ) as $elt ) {
			if ( WTUtils::isLiteralHTMLNode( $elt ) ) {
				continue;
			}

			// Skip items with about id => part of templates / extensions like Cite
			// CAVEAT: In some cases, this might be bailing out a little too early.
			// For example, where certain extensions might actually support nested DSR
			// values inside and where <li> items in them might benefit. But, given that
			// so far, such extensions are more the exception than the norm, we will take
			// the easy way out here and revisit this if dirty diffs for those <li> items
			// merit further action in the future.
			if ( $elt->hasAttribute( 'about' ) ) {
				continue;
			}

			// No point wrapping text nodes if there is no usable DSR
			$eltDSR = DOMDataUtils::getDataParsoid( $elt )->dsr ?? null;
			if ( !Utils::isValidDSR( $eltDSR ) ) {
				continue;
			}

			// Running source offset of the child currently being visited.
			$start = $eltDSR->innerStart();

			$child = $elt->firstChild;
			if ( !$child || $eltDSR->leadingWS < 0 ) {
				// Either there is nothing to wrap, or we don't have accurate
				// information about the length of trimmed leading WS, in which
				// case no offset inside this element can be trusted.
				continue;
			}
			// The trimmed leading WS sits between the opening tag and the first child.
			$start += $eltDSR->leadingWS;

			while ( $child ) {
				// Captured up front because wrapping re-parents $child below.
				$nextChild = $child->nextSibling;

				if ( $child instanceof Text ) {
					$text = $child->nodeValue;
					// NOTE(review): lengths here are byte counts (strlen); presumably
					// DSR offsets are byte-based as well -- confirm before ever
					// switching this to mb_strlen.
					$len = strlen( $text );
					$numOfNls = 0;

					if ( $len > 0 && $text[$len - 1] === "\n" ) {
						// Don't wrap newlines since single-line-context handling will
						// convert these newlines into spaces and introduce dirty-diffs.
						// Leaving nls outside the wrapped text lets it be handled as
						// separator text and emitted appropriately.
						$text = rtrim( $text, "\n" );
						$numOfNls = $len - strlen( $text );
						$len -= $numOfNls;
					} elseif (
						!$nextChild || (
							$inListItem && DOMUtils::isList( $nextChild ) && WTUtils::isNewElt( $nextChild )
						)
					) {
						// Detect last child of "original" item and tack on trailingWS
						// width to the contents of this text node. If this is a list
						// item and we added a nested list, that nested list will be
						// the last item.
						//
						// Note that trailingWS is only captured for the last line, so
						// if the text ends in a newline (the branch above), we
						// shouldn't need to do this.
						//
						// NOTE(review): unlike leadingWS, trailingWS is not checked for
						// a negative value here -- confirm it can never be negative
						// when leadingWS is non-negative.
						$len += $eltDSR->trailingWS;
					}

					$span = $doc->createElement( 'span' );
					$span->setAttribute( 'data-mw-selser-wrapper', '' );
					$dp = DOMDataUtils::getDataParsoid( $span );
					$dp->dsr = new DomSourceRange( $start, $start + $len, 0, 0 );
					$start += $len;

					if ( $numOfNls > 0 ) {
						// Wrap only the newline-free prefix; the original text node
						// stays in place after the span and keeps just the newlines.
						$elt->insertBefore( $span, $child );
						$span->appendChild( $doc->createTextNode( $text ) );
						$child->nodeValue = str_repeat( "\n", $numOfNls );
						$start += $numOfNls;
					} else {
						// Move the whole text node inside the new span.
						$elt->replaceChild( $span, $child );
						$span->appendChild( $child );
					}
				} elseif ( $child instanceof Comment ) {
					$start += WTUtils::decodedCommentLength( $child );
				} elseif ( $child instanceof Element ) {
					// No point wrapping following text nodes if there won't be any usable DSR
					$childDSR = DOMDataUtils::getDataParsoid( $child )->dsr ?? null;
					if ( !Utils::isValidDSR( $childDSR ) ) {
						break;
					}
					$start = $childDSR->end;
					if ( $child->hasAttribute( 'about' ) ) {
						// Encapsulated content may span several siblings; resume after it.
						$nextChild = WTUtils::skipOverEncapsulatedContent( $child );
					}
				}

				$child = $nextChild;
			}
		}
	}

	/**
	 * Apply the selser-specific DOM preprocessing to $body (in place).
	 *
	 * Nothing is done unless the input content version satisfies '>=2.1.2'.
	 *
	 * @param Element $body
	 */
	private function preprocessDOMForSelser( Element $body ): void {
		if ( !Semver::satisfies( $this->env->getInputContentVersion(), '>=2.1.2' ) ) {
			return;
		}

		// Wrap text node children of <li> and <dd> elements in dummy spans
		$this->wrapTextChildrenOfNode( $body, 'li' );
		$this->wrapTextChildrenOfNode( $body, 'dd' );
	}

	/**
	 * Selectively serialize an HTML DOM.
	 *
	 * WARNING: You probably want to use WikitextContentModelHandler::fromDOM instead.
	 *
	 * Both $doc and the original DOM held in the selser data are preprocessed
	 * in place before they are diffed.
	 *
	 * @param Document $doc
	 * @return string The original text when the diff is empty, otherwise the
	 *   output of the wrapped WikitextSerializer.
	 */
	public function serializeDOM( Document $doc ): string {
		$body = DOMCompat::getBody( $doc );
		$oldBody = DOMCompat::getBody( $this->selserData->oldDOM );

		// Preprocess DOMs - this is specific to selser
		$this->preprocessDOMForSelser( $oldBody );
		$this->preprocessDOMForSelser( $body );

		// Use provided diff-marked DOM (used during testing)
		// or generate one (used in production)
		if ( $this->env->getDOMDiff() ) {
			$diff = [ 'isEmpty' => false ];
			// NOTE(review): from here on $body is only read by the trace dump
			// below; the serializer call at the end still receives $doc.
			$body = DOMCompat::getBody( $this->env->getDOMDiff() );
		} else {
			$domDiffTiming = Timing::start( $this->env->getSiteConfig()->metrics() );
			$diff = ( new DOMDiff( $this->env ) )->diff( $oldBody, $body );
			$domDiffTiming->end( 'html2wt.selser.domDiff' );
		}

		if ( $diff['isEmpty'] ) {
			// Nothing was modified, just re-use the original source
			return $this->selserData->oldText;
		}

		if ( $this->trace || $this->env->hasDumpFlag( 'dom:post-dom-diff' ) ) {
			$options = [ 'storeDiffMark' => true ];
			$this->env->writeDump(
				ContentUtils::dumpDOM( $oldBody, 'OLD DOM ', $options ) . "\n" .
				ContentUtils::dumpDOM( $body, 'DOM after running DOMDiff', $options )
			);
		}

		// Call the WikitextSerializer to do our bidding
		return $this->wts->serializeDOM( $doc, true );
	}
}