Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
0.00% |
0 / 107 |
|
0.00% |
0 / 17 |
CRAP | |
0.00% |
0 / 1 |
| DOMHandler | |
0.00% |
0 / 107 |
|
0.00% |
0 / 17 |
4830 | |
0.00% |
0 / 1 |
| __construct | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| handle | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| before | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| after | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| firstChild | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| lastChild | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| forceSOL | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| wtListEOL | |
0.00% |
0 / 28 |
|
0.00% |
0 / 1 |
506 | |||
| getListBullets | |
0.00% |
0 / 36 |
|
0.00% |
0 / 1 |
182 | |||
| maxNLsInTable | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
12 | |||
| serializeTableElement | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
12 | |||
| serializeTableTag | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
6 | |||
| stxInfoValidForTableCell | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
| getLeadingSpace | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
30 | |||
| getTrailingSpace | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
30 | |||
| isBuilderInsertedElt | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
12 | |||
| emitPlaceholderSrc | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Html2Wt\DOMHandlers; |
| 5 | |
| 6 | use LogicException; |
| 7 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
| 8 | use Wikimedia\Parsoid\DOM\Element; |
| 9 | use Wikimedia\Parsoid\DOM\Node; |
| 10 | use Wikimedia\Parsoid\DOM\Text; |
| 11 | use Wikimedia\Parsoid\Html2Wt\SerializerState; |
| 12 | use Wikimedia\Parsoid\Html2Wt\WTSUtils; |
| 13 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
| 14 | use Wikimedia\Parsoid\Utils\DOMCompat; |
| 15 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
| 16 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 17 | use Wikimedia\Parsoid\Utils\WTUtils; |
| 18 | |
| 19 | /** |
| 20 | * HTML -> Wikitext serialization relies on walking the DOM and delegating |
| 21 | * the serialization requests to different DOM nodes. |
| 22 | * |
| 23 | * This class represents the interface that various DOM handlers are expected |
| 24 | * to implement. |
| 25 | * |
| 26 | * There is the core 'handle' method that deals with converting the content |
| 27 | * of the node into wikitext markup. |
| 28 | * |
| 29 | * Then there are 4 newline-constraint methods that specify the constraints |
| 30 | * that need to be satisfied for the markup to be valid. For example, list items |
| 31 | * should always start on a newline, but can only have a single newline separator. |
| 32 | * Paragraphs always start on a newline and need at least 2 newlines in wikitext |
| 33 | * for them to be recognized as paragraphs. |
| 34 | * |
| 35 | * Each of the 4 newline-constraint methods (before, after, firstChild, lastChild) |
| 36 | * returns an array with a 'min' and 'max' property. If a property is missing, it |
| 37 | * means that the dom node doesn't have any newline constraints. Some DOM handlers |
| 38 | * might therefore choose to implement none, some, or all of these methods. |
| 39 | * |
| 40 | * The return values of each of these methods are treated as constraints and the |
| 41 | * caller will have to resolve potentially conflicting constraints between a |
| 42 | * pair of nodes (siblings, parent-child). For example, if an after handler of |
| 43 | * a node wants 1 newline, but the before handler of its sibling wants none. |
| 44 | * |
| 45 | * Ideally, there should not be any incompatible constraints, but we haven't |
| 46 | * actually verified that this is the case. All constraint-handling code is in |
| 47 | * the separators-handling methods. |
| 48 | */ |
class DOMHandler {

	/**
	 * Bullet character contributed by the list element (<ul>/<ol>) that
	 * encloses an <li>. Used by getListBullets().
	 */
	private const LIST_PARENT_BULLETS = [
		'ul' => '*',
		'ol' => '#'
	];

	/**
	 * Bullet character contributed directly by a list-related node, keyed by
	 * node name. Nodes mapped to '' take part in the bullet walk but add no
	 * character themselves (an <li> gets its bullet from its list ancestor).
	 */
	private const LIST_NODE_BULLETS = [
		'ul' => '',
		'ol' => '',
		'dl' => '',
		'li' => '',
		'dt' => ';',
		'dd' => ':'
	];

	/** @var bool Value reported by forceSOL() */
	private $forceSOL;

	/**
	 * @param bool $forceSOL Whether forceSOL() should report true for this handler.
	 */
	public function __construct( bool $forceSOL = false ) {
		$this->forceSOL = $forceSOL;
	}

	/**
	 * Serialize a DOM node to wikitext.
	 * Serialized wikitext should be returned via $state::emitChunk().
	 *
	 * This base implementation always throws.
	 *
	 * @param Element $node
	 * @param SerializerState $state
	 * @param bool $wrapperUnmodified
	 * @return Node|null The node to continue with (need not be an element always)
	 * @throws LogicException Always, in this base implementation.
	 */
	public function handle(
		Element $node, SerializerState $state, bool $wrapperUnmodified = false
	): ?Node {
		throw new LogicException( 'Not implemented.' );
	}

	/**
	 * How many newlines should be emitted *before* this node?
	 *
	 * The base implementation imposes no constraint (empty array).
	 *
	 * @param Element $node
	 * @param Node $otherNode
	 * @param SerializerState $state
	 * @return array
	 */
	public function before( Element $node, Node $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * How many newlines should be emitted *after* this node?
	 *
	 * The base implementation imposes no constraint (empty array).
	 *
	 * @param Element $node
	 * @param Node $otherNode
	 * @param SerializerState $state
	 * @return array
	 */
	public function after( Element $node, Node $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * How many newlines should be emitted before the first child?
	 *
	 * The base implementation imposes no constraint (empty array).
	 *
	 * @param Element|DocumentFragment $node
	 * @param Node $otherNode
	 * @param SerializerState $state
	 * @return array
	 */
	public function firstChild( Node $node, Node $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * How many newlines should be emitted after the last child?
	 *
	 * The base implementation imposes no constraint (empty array).
	 *
	 * @param Element|DocumentFragment $node
	 * @param Node $otherNode
	 * @param SerializerState $state
	 * @return array
	 */
	public function lastChild( Node $node, Node $otherNode, SerializerState $state ): array {
		return [];
	}

	/**
	 * Put the serializer in start-of-line mode before it is handled.
	 * All non-newline whitespace found between HTML nodes is stripped
	 * to ensure SOL state is guaranteed.
	 *
	 * @return bool The flag passed to the constructor.
	 */
	public function forceSOL(): bool {
		return $this->forceSOL;
	}

	/**
	 * List helper: This is a shared *after* newline handler for list items.
	 *
	 * The checks below are order-dependent: the first matching rule wins.
	 *
	 * @param Element $node
	 * @param Node $otherNode
	 * @return array An array in the form [ 'min' => <int>, 'max' => <int> ] or an empty array.
	 */
	protected function wtListEOL( Element $node, Node $otherNode ): array {
		if ( !( $otherNode instanceof Element ) || DOMUtils::atTheTop( $otherNode ) ) {
			return [ 'min' => 0, 'max' => 2 ];
		}
		'@phan-var Element $otherNode';/** @var Element $otherNode */

		if ( WTUtils::isFirstEncapsulationWrapperNode( $otherNode ) ) {
			return [ 'min' => DOMUtils::isList( $node ) ? 1 : 0, 'max' => 2 ];
		}

		$dp = DOMDataUtils::getDataParsoid( $otherNode );
		$otherIsNextSibling = DiffDOMUtils::nextNonSepSibling( $node ) === $otherNode;
		$otherHasHtmlStx = ( $dp->stx ?? null ) === 'html';

		if ( ( $otherIsNextSibling && $otherHasHtmlStx ) || isset( $dp->src ) ) {
			return [ 'min' => 0, 'max' => 2 ];
		}

		if ( $otherIsNextSibling && DOMUtils::isListOrListItem( $otherNode ) ) {
			if ( DOMUtils::isList( $node )
				&& DOMCompat::nodeName( $otherNode ) === DOMCompat::nodeName( $node )
			) {
				// Adjacent lists of same type need extra newline
				return [ 'min' => 2, 'max' => 2 ];
			}
			if ( DOMUtils::isListItem( $node )
				|| in_array( DOMCompat::nodeName( $node->parentNode ), [ 'li', 'dd' ], true )
			) {
				// $node is a list item, or sits directly inside an <li>/<dd>:
				// exactly one newline.
				return [ 'min' => 1, 'max' => 1 ];
			}
			return [ 'min' => 1, 'max' => 2 ];
		}

		// $otherNode is known to be an Element here (see the guard at the top),
		// so only its syntax needs checking.
		if ( DOMUtils::isList( $otherNode ) || $otherHasHtmlStx ) {
			// last child in ul/ol (the list element is our parent), defer
			// separator constraints to the list.
			return [];
		}

		if (
			DOMUtils::isWikitextBlockNode( $node->parentNode ) &&
			DiffDOMUtils::lastNonSepChild( $node->parentNode ) === $node
		) {
			// A list in a block node (<div>, <td>, etc) doesn't need a trailing empty line
			// if it is the last non-separator child (ex: <div>..</ul></div>)
			return [ 'min' => 1, 'max' => 2 ];
		}

		if ( DOMUtils::isFormattingElt( $otherNode ) ) {
			return [ 'min' => 1, 'max' => 1 ];
		}

		$needsBlankLine = WTUtils::isNewElt( $node ) && !WTUtils::isMarkerAnnotation( $otherNode );
		return [
			'min' => $needsBlankLine ? 2 : 1,
			'max' => 2
		];
	}

	/**
	 * List helper: DOM-based list bullet construction.
	 *
	 * Walks from $node up through its ancestors, prepending one bullet
	 * character per list level, until the top of the DOM or a node that
	 * ends the walk is reached.
	 *
	 * @param SerializerState $state
	 * @param Element $node
	 * @return string The bullets followed by optional whitespace, or '' if
	 *   no bullets were collected.
	 */
	protected function getListBullets( SerializerState $state, Element $node ): string {
		// For new elements, for prettier wikitext serialization,
		// emit a space after the last bullet (if required)
		$space = $this->getLeadingSpace( $state, $node, ' ' );

		$res = '';
		while ( !DOMUtils::atTheTop( $node ) ) {
			$dp = DOMDataUtils::getDataParsoid( $node );
			$nodeName = DOMCompat::nodeName( $node );
			if ( isset( self::LIST_NODE_BULLETS[$nodeName] ) ) {
				if ( $nodeName === 'li' ) {
					// An <li> takes its bullet from the nearest <ul>/<ol> ancestor.
					$parentNode = $node->parentNode;
					while ( $parentNode &&
						!isset( self::LIST_PARENT_BULLETS[DOMCompat::nodeName( $parentNode )] )
					) {
						$parentNode = $parentNode->parentNode;
					}

					if ( $parentNode ) {
						if ( !WTUtils::isLiteralHTMLNode( $parentNode ) ) {
							$res = self::LIST_PARENT_BULLETS[DOMCompat::nodeName( $parentNode )] . $res;
						}
					} else {
						$state->getEnv()->log( 'error/html2wt', 'Input DOM is not well-formed.',
							"Top-level <li> found that is not nested in <ol>/<ul>\n LI-node:",
							DOMCompat::getOuterHTML( $node )
						);
					}
				} elseif ( !WTUtils::isLiteralHTMLNode( $node ) ) {
					$res = self::LIST_NODE_BULLETS[$nodeName] . $res;
				}
			} elseif ( !WTUtils::isLiteralHTMLNode( $node ) ||
				empty( $dp->autoInsertedStart ) || empty( $dp->autoInsertedEnd )
			) {
				// Non-list node: only a literal HTML node whose start and end
				// are both flagged auto-inserted is skipped over; anything
				// else ends the walk.
				break;
			}

			$node = $node->parentNode;
		}

		// Don't emit a space if we aren't returning any bullets.
		return $res !== '' ? $res . $space : '';
	}

	/**
	 * Helper: Newline constraint helper for table nodes
	 *
	 * @param Node $node
	 * @param Node $origNode
	 * @return int 1 if either node is a new element, 2 otherwise.
	 */
	protected function maxNLsInTable( Node $node, Node $origNode ): int {
		return ( WTUtils::isNewElt( $node ) || WTUtils::isNewElt( $origNode ) ) ? 1 : 2;
	}

	/**
	 * Private helper for serializing table nodes
	 *
	 * Builds the opening markup from $symbol, the serialized attributes of
	 * $node (if any) and $endSymbol.
	 *
	 * @param string $symbol
	 * @param ?string $endSymbol
	 * @param SerializerState $state
	 * @param Element $node
	 * @return string
	 */
	private function serializeTableElement(
		string $symbol, ?string $endSymbol, SerializerState $state, Element $node
	): string {
		$token = WTSUtils::mkTagTk( $node );
		$sAttribs = $state->serializer->serializeAttributes( $node, $token );
		if ( $sAttribs !== '' ) {
			// IMPORTANT: use ?? not ?: because an explicitly empty end symbol
			// must be preserved; only a null end symbol gets the ' |' default.
			return $symbol . ' ' . $sAttribs . ( $endSymbol ?? ' |' );
		}
		// Without attributes a null end symbol contributes nothing. ?? keeps
		// every non-null string, including '0', unchanged.
		return $symbol . ( $endSymbol ?? '' );
	}

	/**
	 * Helper: Handles content serialization for table nodes
	 *
	 * When the wrapper is unmodified, the original source for the node's
	 * opening range (taken from its data-parsoid dsr) is reused; otherwise
	 * the tag is serialized afresh.
	 *
	 * @param string $symbol
	 * @param ?string $endSymbol
	 * @param SerializerState $state
	 * @param Element $node
	 * @param bool $wrapperUnmodified
	 * @return string
	 */
	protected function serializeTableTag(
		string $symbol,
		?string $endSymbol,
		SerializerState $state,
		Element $node,
		bool $wrapperUnmodified
	): string {
		if ( !$wrapperUnmodified ) {
			return $this->serializeTableElement( $symbol, $endSymbol, $state, $node );
		}
		// NOTE(review): assumes dsr is set whenever $wrapperUnmodified is true;
		// confirm against callers.
		$dsr = DOMDataUtils::getDataParsoid( $node )->dsr;
		return $state->getOrigSrc( $dsr->openRange() ) ?? '';
	}

	/**
	 * Helper: Checks whether syntax information in data-parsoid is valid
	 * in the presence of table edits. For example "|" is no longer valid
	 * table-cell markup if a table cell is added before this cell.
	 *
	 * @param SerializerState $state Not read by this implementation.
	 * @param Element $node
	 * @return bool
	 */
	protected function stxInfoValidForTableCell( SerializerState $state, Element $node ): bool {
		// If row syntax is not set, nothing to worry about
		if ( ( DOMDataUtils::getDataParsoid( $node )->stx ?? null ) !== 'row' ) {
			return true;
		}

		// If we have an identical previous sibling, nothing to worry about
		$prev = DiffDOMUtils::previousNonDeletedSibling( $node );
		return $prev !== null && DOMCompat::nodeName( $prev ) === DOMCompat::nodeName( $node );
	}

	/**
	 * Helper for several DOM handlers: Returns whitespace that needs to be emitted
	 * between the markup for the node and its content (ex: table cells, list items).
	 *
	 * $newEltDefault is returned only when $node is a new element whose first
	 * non-deleted child exists and is not a text node that already starts with
	 * whitespace. In every other case the result is ''.
	 *
	 * @param SerializerState $state Not read by this implementation.
	 * @param Element $node
	 * @param string $newEltDefault
	 * @return string
	 */
	protected function getLeadingSpace(
		SerializerState $state, Element $node, string $newEltDefault
	): string {
		if ( !WTUtils::isNewElt( $node ) ) {
			return '';
		}
		$fc = DiffDOMUtils::firstNonDeletedChild( $node );
		if ( !$fc ) {
			return '';
		}
		// PORT-FIXME are different \s semantics going to be a problem?
		if ( $fc instanceof Text && preg_match( '/^\s/', $fc->nodeValue ) ) {
			return '';
		}
		return $newEltDefault;
	}

	/**
	 * Helper for several DOM handlers: Returns whitespace that needs to be emitted
	 * between the markup for the node and its next sibling.
	 *
	 * $newEltDefault is returned only when $node is a new element whose last
	 * non-deleted child exists and is not a text node that already ends with
	 * whitespace. In every other case the result is ''.
	 *
	 * @param SerializerState $state Not read by this implementation.
	 * @param Element $node
	 * @param string $newEltDefault
	 * @return string
	 */
	protected function getTrailingSpace(
		SerializerState $state, Element $node, string $newEltDefault
	): string {
		if ( !WTUtils::isNewElt( $node ) ) {
			return '';
		}
		$lc = DiffDOMUtils::lastNonDeletedChild( $node );
		if ( !$lc ) {
			return '';
		}
		// PORT-FIXME are different \s semantics going to be a problem?
		if ( $lc instanceof Text && preg_match( '/\s$/D', $lc->nodeValue ) ) {
			return '';
		}
		return $newEltDefault;
	}

	/**
	 * Helper: Is this node auto-inserted by the HTML5 tree-builder
	 * during wt->html?
	 *
	 * True only for elements whose data-parsoid flags both the start and the
	 * end tag as auto-inserted.
	 *
	 * @param Node $node
	 * @return bool
	 */
	protected function isBuilderInsertedElt( Node $node ): bool {
		if ( !( $node instanceof Element ) ) {
			return false;
		}
		'@phan-var Element $node';/** @var Element $node */
		$dp = DOMDataUtils::getDataParsoid( $node );
		return !empty( $dp->autoInsertedStart ) && !empty( $dp->autoInsertedEnd );
	}

	/**
	 * Uneditable forms wrapped with mw:Placeholder tags OR unedited nowikis
	 * N.B. We no longer emit self-closed nowikis as placeholders, so remove this
	 * once all our stored content is updated.
	 *
	 * Source made up solely of newlines is appended as a separator; anything
	 * else is emitted as wikitext.
	 *
	 * @param Element $node
	 * @param SerializerState $state
	 */
	protected function emitPlaceholderSrc( Element $node, SerializerState $state ) {
		$dp = DOMDataUtils::getDataParsoid( $node );
		// Normalize once so every use below receives a string even when
		// data-parsoid carries no src.
		$src = $dp->src ?? '';
		if ( preg_match( '!<nowiki\s*/>!', $src ) ) {
			$state->hasSelfClosingNowikis = true;
		}
		// FIXME: Should this also check for tabs and plain space
		// chars interspersed with newlines?
		if ( preg_match( '/^\n+$/D', $src ) ) {
			$state->appendSep( $src );
		} else {
			$state->serializer->emitWikitext( $src, $node );
		}
	}

}