Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
0.00% |
0 / 107 |
|
0.00% |
0 / 17 |
CRAP | |
0.00% |
0 / 1 |
DOMHandler | |
0.00% |
0 / 107 |
|
0.00% |
0 / 17 |
4830 | |
0.00% |
0 / 1 |
__construct | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
handle | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
before | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
after | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
firstChild | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
lastChild | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
forceSOL | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
wtListEOL | |
0.00% |
0 / 28 |
|
0.00% |
0 / 1 |
506 | |||
getListBullets | |
0.00% |
0 / 36 |
|
0.00% |
0 / 1 |
182 | |||
maxNLsInTable | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
12 | |||
serializeTableElement | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
serializeTableTag | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
stxInfoValidForTableCell | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
getLeadingSpace | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
30 | |||
getTrailingSpace | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
30 | |||
isBuilderInsertedElt | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
emitPlaceholderSrc | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt\DOMHandlers; |
5 | |
6 | use LogicException; |
7 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
8 | use Wikimedia\Parsoid\DOM\Element; |
9 | use Wikimedia\Parsoid\DOM\Node; |
10 | use Wikimedia\Parsoid\DOM\Text; |
11 | use Wikimedia\Parsoid\Html2Wt\SerializerState; |
12 | use Wikimedia\Parsoid\Html2Wt\WTSUtils; |
13 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
14 | use Wikimedia\Parsoid\Utils\DOMCompat; |
15 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
16 | use Wikimedia\Parsoid\Utils\DOMUtils; |
17 | use Wikimedia\Parsoid\Utils\WTUtils; |
18 | |
19 | /** |
20 | * HTML -> Wikitext serialization relies on walking the DOM and delegating |
21 | * the serialization requests to different DOM nodes. |
22 | * |
23 | * This class represents the interface that various DOM handlers are expected |
24 | * to implement. |
25 | * |
26 | * There is the core 'handle' method that deals with converting the content |
27 | * of the node into wikitext markup. |
28 | * |
29 | * Then there are 4 newline-constraint methods that specify the constraints |
30 | * that need to be satisfied for the markup to be valid. For example, list items |
31 | * should always start on a newline, but can only have a single newline separator. |
32 | * Paragraphs always start on a newline and need at least 2 newlines in wikitext |
33 | * for them to be recognized as paragraphs. |
34 | * |
35 | * Each of the 4 newline-constraint methods (before, after, firstChild, lastChild) |
36 | * returns an array with a 'min' and 'max' property. If a property is missing, it
37 | * means that the dom node doesn't have any newline constraints. Some DOM handlers |
38 | * might therefore choose to implement none, some, or all of these methods. |
39 | * |
40 | * The return values of each of these methods are treated as constraints and the
41 | * caller will have to resolve potentially conflicting constraints between a |
42 | * pair of nodes (siblings, parent-child). For example, if an after handler of |
43 | * a node wants 1 newline, but the before handler of its sibling wants none. |
44 | * |
45 | * Ideally, there should not be any incompatible constraints, but we haven't |
46 | * actually verified that this is the case. All constraint-handling code is in
47 | * the separators-handling methods. |
48 | */ |
49 | class DOMHandler { |
50 | |
51 | /** @var bool */ |
52 | private $forceSOL; |
53 | |
/**
 * @param bool $forceSOL Value that forceSOL() will report for this handler;
 *   defaults to false.
 */
public function __construct( bool $forceSOL = false ) {
	$this->forceSOL = $forceSOL;
}
57 | |
/**
 * Convert a DOM element into wikitext.
 *
 * Handlers are expected to hand the generated wikitext to
 * $state::emitChunk() instead of returning it. This base implementation
 * does no serialization at all: it unconditionally throws.
 *
 * @param Element $node
 * @param SerializerState $state
 * @param bool $wrapperUnmodified
 * @return Node|null The node to continue with (not necessarily an element)
 * @throws LogicException Always, in this base implementation.
 */
public function handle(
	Element $node, SerializerState $state, bool $wrapperUnmodified = false
): ?Node {
	throw new LogicException( 'Not implemented.' );
}
71 | |
/**
 * Newline constraints to apply *before* this node.
 *
 * The base implementation imposes none and returns an empty array.
 *
 * @param Element $node
 * @param Node $otherNode
 * @param SerializerState $state
 * @return array Optional 'min' / 'max' entries; empty means unconstrained.
 */
public function before( Element $node, Node $otherNode, SerializerState $state ): array {
	// No 'min' and no 'max': nothing to enforce by default.
	return [];
}
83 | |
/**
 * Newline constraints to apply *after* this node.
 *
 * The base implementation imposes none and returns an empty array.
 *
 * @param Element $node
 * @param Node $otherNode
 * @param SerializerState $state
 * @return array Optional 'min' / 'max' entries; empty means unconstrained.
 */
public function after( Element $node, Node $otherNode, SerializerState $state ): array {
	// No 'min' and no 'max': nothing to enforce by default.
	return [];
}
95 | |
/**
 * Newline constraints to apply before the first child of this node.
 *
 * The base implementation imposes none and returns an empty array.
 *
 * @param Element|DocumentFragment $node
 * @param Node $otherNode
 * @param SerializerState $state
 * @return array Optional 'min' / 'max' entries; empty means unconstrained.
 */
public function firstChild( Node $node, Node $otherNode, SerializerState $state ): array {
	// No 'min' and no 'max': nothing to enforce by default.
	return [];
}
107 | |
/**
 * Newline constraints to apply after the last child of this node.
 *
 * The base implementation imposes none and returns an empty array.
 *
 * @param Element|DocumentFragment $node
 * @param Node $otherNode
 * @param SerializerState $state
 * @return array Optional 'min' / 'max' entries; empty means unconstrained.
 */
public function lastChild( Node $node, Node $otherNode, SerializerState $state ): array {
	// No 'min' and no 'max': nothing to enforce by default.
	return [];
}
119 | |
/**
 * Whether this handler asks for start-of-line (SOL) mode.
 *
 * When true, the serializer is expected to be put in SOL mode before the
 * node is handled, with all non-newline whitespace found between HTML
 * nodes stripped so that SOL state is guaranteed.
 *
 * @return bool The flag that was passed to the constructor.
 */
public function forceSOL(): bool {
	return $this->forceSOL;
}
130 | |
/**
 * List helper: shared *after* newline handler for lists and list items.
 *
 * The checks below are order-sensitive; the first one that matches decides
 * the result.
 *
 * @param Element $node
 * @param Node $otherNode
 * @return array An array in the form [ 'min' => <int>, 'max' => <int> ] or an empty array.
 */
protected function wtListEOL( Element $node, Node $otherNode ): array {
	if ( !( $otherNode instanceof Element ) || DOMUtils::atTheTop( $otherNode ) ) {
		return [ 'min' => 0, 'max' => 2 ];
	}
	'@phan-var Element $otherNode';/** @var Element $otherNode */

	if ( WTUtils::isFirstEncapsulationWrapperNode( $otherNode ) ) {
		$minNLs = DOMUtils::isList( $node ) ? 1 : 0;
		return [ 'min' => $minNLs, 'max' => 2 ];
	}

	$nextSibling = DiffDOMUtils::nextNonSepSibling( $node );
	$dp = DOMDataUtils::getDataParsoid( $otherNode );
	$otherIsNext = ( $nextSibling === $otherNode );
	$otherHasHtmlStx = ( ( $dp->stx ?? null ) === 'html' );

	if ( ( $otherIsNext && $otherHasHtmlStx ) || isset( $dp->src ) ) {
		return [ 'min' => 0, 'max' => 2 ];
	}

	if ( $otherIsNext && DOMUtils::isListOrListItem( $otherNode ) ) {
		if ( DOMUtils::isList( $node ) &&
			DOMCompat::nodeName( $otherNode ) === DOMCompat::nodeName( $node )
		) {
			// Two adjacent lists of the same type need an extra newline
			return [ 'min' => 2, 'max' => 2 ];
		}

		$nestedInItem = DOMUtils::isListItem( $node )
			|| in_array( DOMCompat::nodeName( $node->parentNode ), [ 'li', 'dd' ], true );
		if ( $nestedInItem ) {
			// List items, and nodes whose parent is <li>/<dd>, get exactly one newline
			return [ 'min' => 1, 'max' => 1 ];
		}

		return [ 'min' => 1, 'max' => 2 ];
	}

	// $otherNode is known to be an Element here, so only its syntax matters.
	if ( DOMUtils::isList( $otherNode ) || $otherHasHtmlStx ) {
		// Last child in ul/ol (the list element is our parent): leave the
		// separator constraints to the list.
		return [];
	}

	$parent = $node->parentNode;
	if ( DOMUtils::isWikitextBlockNode( $parent ) &&
		DiffDOMUtils::lastNonSepChild( $parent ) === $node
	) {
		// A list inside a block node (<div>, <td>, etc) doesn't need a trailing
		// empty line when it is the last non-separator child
		// (ex: <div>..</ul></div>)
		return [ 'min' => 1, 'max' => 2 ];
	}

	if ( DOMUtils::isFormattingElt( $otherNode ) ) {
		return [ 'min' => 1, 'max' => 1 ];
	}

	$needsBlankLine = WTUtils::isNewElt( $node ) && !WTUtils::isMarkerAnnotation( $otherNode );
	return [
		'min' => $needsBlankLine ? 2 : 1,
		'max' => 2
	];
}
187 | |
/**
 * List helper: build the wikitext bullet prefix for a node by walking up
 * its ancestors in the DOM.
 *
 * @param SerializerState $state
 * @param Element $node
 * @return string The bullets (plus optional trailing space), or '' if there are none.
 */
protected function getListBullets( SerializerState $state, Element $node ): string {
	// Bullet contributed by the list that contains an <li>
	$itemContainers = [
		'ul' => '*',
		'ol' => '#'
	];
	// Bullet contributed by a list-related node itself
	$listMarkers = [
		'ul' => '',
		'ol' => '',
		'dl' => '',
		'li' => '',
		'dt' => ';',
		'dd' => ':'
	];

	// For new elements, for prettier wikitext serialization,
	// emit a space after the last bullet (if required)
	$padding = $this->getLeadingSpace( $state, $node, ' ' );

	$bullets = '';
	for ( ; !DOMUtils::atTheTop( $node ); $node = $node->parentNode ) {
		$dp = DOMDataUtils::getDataParsoid( $node );
		$name = DOMCompat::nodeName( $node );

		if ( !isset( $listMarkers[$name] ) ) {
			// Only literal HTML nodes flagged with both autoInsertedStart and
			// autoInsertedEnd are skipped over; anything else ends the walk.
			$skippable = WTUtils::isLiteralHTMLNode( $node )
				&& !empty( $dp->autoInsertedStart ) && !empty( $dp->autoInsertedEnd );
			if ( !$skippable ) {
				break;
			}
			continue;
		}

		if ( $name === 'li' ) {
			// The bullet for an <li> comes from its nearest <ul>/<ol> ancestor
			$ancestor = $node->parentNode;
			while ( $ancestor && !isset( $itemContainers[DOMCompat::nodeName( $ancestor )] ) ) {
				$ancestor = $ancestor->parentNode;
			}

			if ( !$ancestor ) {
				$state->getEnv()->log( 'error/html2wt', 'Input DOM is not well-formed.',
					"Top-level <li> found that is not nested in <ol>/<ul>\n LI-node:",
					DOMCompat::getOuterHTML( $node )
				);
			} elseif ( !WTUtils::isLiteralHTMLNode( $ancestor ) ) {
				$bullets = $itemContainers[DOMCompat::nodeName( $ancestor )] . $bullets;
			}
		} elseif ( !WTUtils::isLiteralHTMLNode( $node ) ) {
			$bullets = $listMarkers[$name] . $bullets;
		}
	}

	// Don't emit a space if we aren't returning any bullets.
	if ( $bullets === '' ) {
		return '';
	}
	return $bullets . $padding;
}
248 | |
/**
 * Helper: upper bound on newlines for table nodes.
 *
 * @param Node $node
 * @param Node $origNode
 * @return int 1 if either node is a new element, otherwise 2.
 */
protected function maxNLsInTable( Node $node, Node $origNode ): int {
	if ( WTUtils::isNewElt( $node ) || WTUtils::isNewElt( $origNode ) ) {
		return 1;
	}
	return 2;
}
258 | |
/**
 * Private helper: build the opening wikitext markup for a table node from
 * its start symbol, serialized attributes and end symbol.
 *
 * @param string $symbol
 * @param ?string $endSymbol
 * @param SerializerState $state
 * @param Element $node
 * @return string
 */
private function serializeTableElement(
	string $symbol, ?string $endSymbol, SerializerState $state, Element $node
): string {
	$attrs = $state->serializer->serializeAttributes( $node, WTSUtils::mkTagTk( $node ) );

	if ( $attrs === '' ) {
		// No attributes: append the end symbol as-is. Only null is mapped to
		// '', so a '0' end symbol is kept rather than being treated as empty.
		return $symbol . ( $endSymbol ?? '' );
	}

	// IMPORTANT: this must be ?? and not ?: because an explicitly empty
	// end symbol has to be preserved; only a null one falls back to ' |'.
	return $symbol . ' ' . $attrs . ( $endSymbol ?? ' |' );
}
280 | |
/**
 * Helper: produce the opening markup for a table node.
 *
 * If the wrapper is unmodified, the original source covered by the node's
 * DSR open range is reused ('' if that source is unavailable). Otherwise
 * the markup is rebuilt via serializeTableElement().
 *
 * @param string $symbol
 * @param ?string $endSymbol
 * @param SerializerState $state
 * @param Element $node
 * @param bool $wrapperUnmodified
 * @return string
 */
protected function serializeTableTag(
	string $symbol,
	?string $endSymbol,
	SerializerState $state,
	Element $node,
	bool $wrapperUnmodified
): string {
	if ( !$wrapperUnmodified ) {
		return $this->serializeTableElement( $symbol, $endSymbol, $state, $node );
	}

	$openRange = DOMDataUtils::getDataParsoid( $node )->dsr->openRange();
	return $state->getOrigSrc( $openRange ) ?? '';
}
304 | |
/**
 * Helper: decide whether the syntax information stored in data-parsoid can
 * still be trusted after table edits. For example "|" is no longer valid
 * table-cell markup if a table cell is added before this cell.
 *
 * @param SerializerState $state Not consulted by this implementation.
 * @param Element $node
 * @return bool
 */
protected function stxInfoValidForTableCell( SerializerState $state, Element $node ): bool {
	$stx = DOMDataUtils::getDataParsoid( $node )->stx ?? null;
	if ( $stx !== 'row' ) {
		// Row syntax is not in play, so there is nothing to invalidate
		return true;
	}

	// Row syntax stays valid only when a previous non-deleted sibling exists
	// and has the same node name as this node
	$prev = DiffDOMUtils::previousNonDeletedSibling( $node );
	if ( $prev === null ) {
		return false;
	}
	return DOMCompat::nodeName( $prev ) === DOMCompat::nodeName( $node );
}
324 | |
/**
 * Helper for several DOM handlers: whitespace to emit between a node's
 * markup and its content (ex: table cells, list items).
 *
 * Only new elements get any whitespace, and only when they have a first
 * non-deleted child that is not a text node already starting with
 * whitespace.
 *
 * @param SerializerState $state Not consulted by this implementation.
 * @param Element $node
 * @param string $newEltDefault Whitespace to use when padding is needed.
 * @return string Either $newEltDefault or ''.
 */
protected function getLeadingSpace(
	SerializerState $state, Element $node, string $newEltDefault
): string {
	if ( !WTUtils::isNewElt( $node ) ) {
		return '';
	}

	$first = DiffDOMUtils::firstNonDeletedChild( $node );
	if ( !$first ) {
		return '';
	}

	// PORT-FIXME are different \s semantics going to be a problem?
	if ( $first instanceof Text && preg_match( '/^\s/', $first->nodeValue ) ) {
		// The content already begins with whitespace
		return '';
	}

	return $newEltDefault;
}
348 | |
/**
 * Helper for several DOM handlers: whitespace to emit between a node's
 * markup and its next sibling.
 *
 * Only new elements get any whitespace, and only when they have a last
 * non-deleted child that is not a text node already ending with
 * whitespace.
 *
 * @param SerializerState $state Not consulted by this implementation.
 * @param Element $node
 * @param string $newEltDefault Whitespace to use when padding is needed.
 * @return string Either $newEltDefault or ''.
 */
protected function getTrailingSpace(
	SerializerState $state, Element $node, string $newEltDefault
): string {
	if ( !WTUtils::isNewElt( $node ) ) {
		return '';
	}

	$last = DiffDOMUtils::lastNonDeletedChild( $node );
	if ( !$last ) {
		return '';
	}

	// PORT-FIXME are different \s semantics going to be a problem?
	if ( $last instanceof Text && preg_match( '/\s$/D', $last->nodeValue ) ) {
		// The content already ends with whitespace
		return '';
	}

	return $newEltDefault;
}
372 | |
/**
 * Helper: Was this node auto-inserted by the HTML5 tree-builder during
 * wt->html? Determined by both autoInsertedStart and autoInsertedEnd being
 * set (non-empty) in the element's data-parsoid.
 *
 * @param Node $node
 * @return bool Always false for non-element nodes.
 */
protected function isBuilderInsertedElt( Node $node ): bool {
	if ( $node instanceof Element ) {
		$dp = DOMDataUtils::getDataParsoid( $node );
		return !( empty( $dp->autoInsertedStart ) || empty( $dp->autoInsertedEnd ) );
	}
	return false;
}
387 | |
/**
 * Uneditable forms wrapped with mw:Placeholder tags OR unedited nowikis
 * N.B. We no longer emit self-closed nowikis as placeholders, so remove this
 * once all our stored content is updated.
 *
 * The node's data-parsoid `src` is read once and treated as '' when absent,
 * so that the regex checks and the emit call all see the same string value
 * (a missing `src` previously reached emitWikitext() as null).
 *
 * @param Element $node
 * @param SerializerState $state
 */
protected function emitPlaceholderSrc( Element $node, SerializerState $state ) {
	$src = DOMDataUtils::getDataParsoid( $node )->src ?? '';

	// Record that a self-closing <nowiki/> was seen in the source
	if ( preg_match( '!<nowiki\s*/>!', $src ) ) {
		$state->hasSelfClosingNowikis = true;
	}

	// FIXME: Should this also check for tabs and plain space
	// chars interspersed with newlines?
	if ( preg_match( '/^\n+$/D', $src ) ) {
		// Source made up solely of newlines is recorded as a separator
		$state->appendSep( $src );
	} else {
		$state->serializer->emitWikitext( $src, $node );
	}
}
408 | |
409 | } |