Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
1.32% |
6 / 454 |
|
6.67% |
1 / 15 |
CRAP | |
0.00% |
0 / 1 |
| Separators | |
1.32% |
6 / 454 |
|
6.67% |
1 / 15 |
45046.54 | |
0.00% |
0 / 1 |
| loggableConstraints | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
6 | |||
| precedingSeparatorTextLen | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
30 | |||
| getSepNlConstraints | |
0.00% |
0 / 35 |
|
0.00% |
0 / 1 |
72 | |||
| makeSeparator | |
0.00% |
0 / 64 |
|
0.00% |
0 / 1 |
702 | |||
| mergeConstraints | |
0.00% |
0 / 15 |
|
0.00% |
0 / 1 |
6 | |||
| debugOut | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
| updateSeparatorConstraints | |
0.00% |
0 / 37 |
|
0.00% |
0 / 1 |
72 | |||
| __construct | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
| makeSepIndentPreSafe | |
0.00% |
0 / 60 |
|
0.00% |
0 / 1 |
1260 | |||
| handleAutoInserted | |
0.00% |
0 / 9 |
|
0.00% |
0 / 1 |
20 | |||
| fetchLeadingTrimmedSpace | |
0.00% |
0 / 31 |
|
0.00% |
0 / 1 |
342 | |||
| fetchTrailingTrimmedSpace | |
0.00% |
0 / 27 |
|
0.00% |
0 / 1 |
240 | |||
| recoverTrimmedWhitespace | |
100.00% |
6 / 6 |
|
100.00% |
1 / 1 |
4 | |||
| buildSep | |
0.00% |
0 / 132 |
|
0.00% |
0 / 1 |
6642 | |||
| needNewSep | |
0.00% |
0 / 4 |
|
0.00% |
0 / 1 |
20 | |||
| 1 | <?php |
| 2 | |
| 3 | declare( strict_types = 1 ); |
| 4 | |
| 5 | namespace Wikimedia\Parsoid\Html2Wt; |
| 6 | |
| 7 | use Wikimedia\Assert\Assert; |
| 8 | use Wikimedia\Parsoid\Config\Env; |
| 9 | use Wikimedia\Parsoid\Core\DOMCompat; |
| 10 | use Wikimedia\Parsoid\Core\DomSourceRange; |
| 11 | use Wikimedia\Parsoid\DOM\Comment; |
| 12 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
| 13 | use Wikimedia\Parsoid\DOM\Element; |
| 14 | use Wikimedia\Parsoid\DOM\Node; |
| 15 | use Wikimedia\Parsoid\Html2Wt\DOMHandlers\DOMHandler; |
| 16 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
| 17 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
| 18 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 19 | use Wikimedia\Parsoid\Utils\PHPUtils; |
| 20 | use Wikimedia\Parsoid\Utils\TokenUtils; |
| 21 | use Wikimedia\Parsoid\Utils\Utils; |
| 22 | use Wikimedia\Parsoid\Utils\WTUtils; |
| 23 | use Wikimedia\Parsoid\Wikitext\Consts; |
| 24 | |
| 25 | class Separators { |
| 26 | /* |
| 27 | * This regexp looks for leading whitespace on the last line of a separator string. |
| 28 | * So, only comments (single or multi-line) or other newlines can precede that |
| 29 | * whitespace-of-interest. But, also account for any whitespace preceding newlines |
| 30 | * since that needs to be skipped over (Ex: " \n "). |
| 31 | */ |
| 32 | private const INDENT_PRE_WS_IN_SEP_REGEXP = |
| 33 | '/^((?: *\n|(?:' . Utils::COMMENT_REGEXP_FRAGMENT . '))*)( +)([^\n]*)$/D'; |
| 34 | |
| 35 | /** |
| 36 | * @var SerializerState |
| 37 | */ |
| 38 | private $state; |
| 39 | |
| 40 | /** |
| 41 | * @var Env |
| 42 | */ |
| 43 | private $env; |
| 44 | |
/**
 * Reduce a constraints array to the few fields worth logging, so that log
 * files and test runs are not clogged with excessively verbose output.
 *
 * Only 'a', 'b', 'min' and 'max' are copied (null when absent). A non-empty
 * 'constraintInfo' entry is summarized, with node names standing in for the
 * node objects it holds.
 *
 * @param array $constraints
 * @return array
 */
private static function loggableConstraints( array $constraints ): array {
	$summary = [];
	foreach ( [ 'a', 'b', 'min', 'max' ] as $field ) {
		$summary[$field] = $constraints[$field] ?? null;
	}

	// Nothing more to report without constraint info
	if ( empty( $constraints['constraintInfo'] ) ) {
		return $summary;
	}

	$info = $constraints['constraintInfo'];
	$summary['constraintInfo'] = [
		'onSOL' => $info['onSOL'] ?? false,
		'sepType' => $info['sepType'] ?? null,
		'nodeA' => DOMUtils::nodeName( $info['nodeA'] ),
		'nodeB' => DOMUtils::nodeName( $info['nodeB'] ),
	];

	return $summary;
}
| 70 | |
/**
 * Add up the lengths of the IEW (per DOMUtils::isIEW) and comment nodes
 * found while walking backwards from $n through its previous siblings.
 *
 * @param Node $n Node to start from. It is counted when it is IEW or a
 *   comment, and simply skipped otherwise.
 * @return ?int The accumulated length, or null as soon as a sibling other
 *   than $n turns out to be neither IEW nor a comment.
 */
private static function precedingSeparatorTextLen( Node $n ): ?int {
	// Given the CSS white-space property (specifically its "pre" and
	// "pre-line" values), any sensible HTML editor would have to preserve
	// IEW in HTML documents to preserve rendering. An editor might still
	// change IEW drastically when the user explicitly asks for it
	// (Ex: pretty-printing of raw source code).
	//
	// For now, we exploit this. The result is only used to extrapolate
	// DSR values and extract a separator string from source, and is only
	// used locally. The extracted text is additionally verified for being
	// a valid separator.
	//
	// So, at worst, this creates a local dirty diff around separators
	// and at best, it gets us a clean diff.

	$total = 0;
	for ( $cur = $n; $cur; $cur = $cur->previousSibling ) {
		if ( DOMUtils::isIEW( $cur ) ) {
			$total += strlen( $cur->nodeValue );
			continue;
		}
		if ( $cur instanceof Comment ) {
			$total += WTUtils::decodedCommentLength( $cur );
			continue;
		}
		// The input node itself is allowed to be something else
		if ( $cur !== $n ) {
			return null;
		}
	}

	return $total;
}
| 103 | |
/**
 * Helper for updateSeparatorConstraints.
 *
 * Folds the newline requirements of both nodes into a single
 * [ 'min' => ..., 'max' => ..., 'constraintInfo' => [] ] structure.
 * $aCons seeds the result and $bCons is then applied on top of it. Whenever
 * the two contradict each other, the conflict is logged and $bCons wins.
 *
 * @param Node $nodeA
 * @param array $aCons
 * @param Node $nodeB
 * @param array $bCons
 * @return array
 */
private function getSepNlConstraints(
	Node $nodeA, array $aCons, Node $nodeB, array $bCons
): array {
	$logger = $this->state->getEnv();

	$lower = $aCons['min'] ?? null;
	$upper = $aCons['max'] ?? null;

	if ( isset( $bCons['min'] ) ) {
		$bMin = $bCons['min'];
		if ( $upper !== null && $upper < $bMin ) {
			// Conflict, warn and let nodeB win.
			$logger->log(
				'info/html2wt',
				'Incompatible constraints 1:',
				DOMUtils::nodeName( $nodeA ),
				DOMUtils::nodeName( $nodeB ),
				self::loggableConstraints(
					[ 'min' => $lower, 'max' => $upper, 'constraintInfo' => [] ]
				)
			);
			$lower = $bMin;
			$upper = $bMin;
		} else {
			$lower = max( $lower ?? 0, $bMin );
		}
	}

	if ( isset( $bCons['max'] ) ) {
		$bMax = $bCons['max'];
		if ( ( $lower ?? 0 ) > $bMax ) {
			// Conflict, warn and let nodeB win.
			$logger->log(
				'info/html2wt',
				'Incompatible constraints 2:',
				DOMUtils::nodeName( $nodeA ),
				DOMUtils::nodeName( $nodeB ),
				self::loggableConstraints(
					[ 'min' => $lower, 'max' => $upper, 'constraintInfo' => [] ]
				)
			);
			$lower = $bMax;
			$upper = $bMax;
		} else {
			$upper = min( $upper ?? $bMax, $bMax );
		}
	}

	// Anything more than two lines will trigger paragraphs, so default to
	// two if nothing is specified. (FIXME: This is a conservative strategy
	// since strictly speaking, this is not always true. This is more a
	// cautious fallback to handle cases where some DOM handler is missing
	// a necessary max constraint.)
	if ( $upper === null ) {
		$upper = 2;
	}

	// Never hand back a max that is below the min
	if ( ( $lower ?? 0 ) > $upper ) {
		$upper = $lower;
	}

	return [
		'min' => $lower,
		'max' => $upper,
		'constraintInfo' => [],
	];
}
| 176 | |
| 177 | /** |
| 178 | * Create a separator given a (potentially empty) separator text and newline constraints. |
| 179 | * |
| 180 | * @param Node $node |
| 181 | * @param string $sep |
| 182 | * @param array $nlConstraints |
| 183 | * @return string |
| 184 | */ |
| 185 | private function makeSeparator( Node $node, string $sep, array $nlConstraints ): string { |
// $origSep is only kept for the debug log at the end of this method.
| 186 | $origSep = $sep; |
| 187 | $sepType = $nlConstraints['constraintInfo']['sepType'] ?? null; |
| 188 | |
| 189 | // Split on comment/ws-only lines, consuming subsequent newlines since |
| 190 | // those lines are ignored by the PHP parser |
| 191 | // Ignore lines with ws and a single comment in them |
| 192 | $splitRe = implode( [ "#(?:\n(?:[ \t]*?", |
| 193 | Utils::COMMENT_REGEXP_FRAGMENT, |
| 194 | "[ \t]*?)+(?=\n))+|", |
| 195 | Utils::COMMENT_REGEXP_FRAGMENT, |
| 196 | "#" |
| 197 | ] ); |
// Count only the newlines left over once everything matched by $splitRe
// has been removed from the separator.
| 198 | $sepNlCount = substr_count( implode( preg_split( $splitRe, $sep ) ), "\n" ); |
| 199 | $minNls = $nlConstraints['min'] ?? 0; |
| 200 | |
| 201 | if ( $this->state->atStartOfOutput && $minNls > 0 ) { |
| 202 | // Skip first newline as we are in start-of-line context |
| 203 | $minNls--; |
| 204 | } |
| 205 | |
// Case 1: too few newlines. Pad the separator up to the (possibly
// decremented) minimum.
| 206 | if ( $minNls > 0 && $sepNlCount < $minNls ) { |
| 207 | // Append newlines |
| 208 | $nlBuf = []; |
| 209 | for ( $i = 0; $i < ( $minNls - $sepNlCount ); $i++ ) { |
| 210 | $nlBuf[] = "\n"; |
| 211 | } |
| 212 | |
| 213 | /* ------------------------------------------------------------------ |
| 214 | * The following two heuristics try to do a best-guess on where to |
| 215 | * add the newlines relative to nodeA and nodeB that best matches |
| 216 | * wikitext output expectations. |
| 217 | * |
| 218 | * 1. In a parent-child separator scenario, where the first child of |
| 219 | * nodeA is not an element, it could have contributed to the separator. |
| 220 | * In that case, the newlines should be prepended because they |
| 221 | * usually correspond to the parent's constraints, |
| 222 | * and the separator was plucked from the child. |
| 223 | * |
| 224 | * Try html2wt on this snippet: |
| 225 | * |
| 226 | * a<p><!--cmt-->b</p> |
| 227 | * |
| 228 | * 2. In a sibling scenario, if nodeB is a literal-HTML element, nodeA is |
| 229 | * forcing the newline and hence the newline should be emitted right |
| 230 | * after it. |
| 231 | * |
| 232 | * Try html2wt on this snippet: |
| 233 | * |
| 234 | * <p>foo</p> <p data-parsoid='{"stx":"html"}'>bar</p> |
| 235 | * -------------------------------------------------------------------- */ |
// NOTE: $sepType is re-read here from the same 'constraintInfo' entry that
// was already consulted at the top of the method.
| 236 | $constraintInfo = $nlConstraints['constraintInfo'] ?? []; |
| 237 | $sepType = $constraintInfo['sepType'] ?? null; |
| 238 | $nodeA = $constraintInfo['nodeA'] ?? null; |
| 239 | $nodeB = $constraintInfo['nodeB'] ?? null; |
// Both of the first two branches prepend the newlines; everything else
// appends them.
| 240 | if ( |
| 241 | $sepType === 'parent-child' && |
| 242 | !DiffDOMUtils::isContentNode( DiffDOMUtils::firstNonDeletedChild( $nodeA ) ) && |
| 243 | !( |
| 244 | isset( Consts::$HTML['ChildTableTags'][DOMUtils::nodeName( $nodeB )] ) && |
| 245 | !WTUtils::isLiteralHTMLNode( $nodeB ) |
| 246 | ) |
| 247 | ) { |
| 248 | $sep = implode( $nlBuf ) . $sep; |
| 249 | } elseif ( $sepType === 'sibling' && WTUtils::isLiteralHTMLNode( $nodeB ) ) { |
| 250 | $sep = implode( $nlBuf ) . $sep; |
| 251 | } else { |
| 252 | $sep .= implode( $nlBuf ); |
| 253 | } |
// Case 2: too many newlines. Trim down to 'max', unless all four selser
// conditions below hold, in which case the separator is left alone.
| 254 | } elseif ( isset( $nlConstraints['max'] ) && $sepNlCount > $nlConstraints['max'] && ( |
| 255 | // In selser mode, if the current node is an unmodified rendering-transparent node |
| 256 | // of a sibling pair, leave the separator alone since the excess newlines aren't |
| 257 | // going to change the semantics of how this node will be parsed in wt->html direction. |
| 258 | // This will instead eliminate a dirty diff on the page. |
| 259 | !$this->state->selserMode || |
| 260 | $sepType !== 'sibling' || |
| 261 | !$this->state->currNodeUnmodified || |
| 262 | !WTUtils::isRenderingTransparentNode( $node ) |
| 263 | ) ) { |
| 264 | // Strip some newlines outside of comments. |
| 265 | // |
| 266 | // Capture separators in a single array with a capturing version of |
| 267 | // the split regexp, so that we can work on the non-separator bits |
| 268 | // when stripping newlines. |
| 269 | // |
| 270 | // Dirty-diff minimizing heuristic: Strip newlines away from an unmodified node. |
| 271 | // If both nodes are unmodified, this dirties the separator before the current node. |
| 272 | // If both nodes are modified, this dirties the separator after the previous node. |
| 273 | $allBits = preg_split( '#(' . PHPUtils::reStrip( $splitRe, '#' ) . ')#', |
| 274 | $sep, -1, PREG_SPLIT_DELIM_CAPTURE ); |
| 275 | $newBits = []; |
// $n is the number of newlines that still have to be removed.
| 276 | $n = $sepNlCount - $nlConstraints['max']; |
| 277 | |
// Bits are taken from the end of the separator when the previous node is
// unmodified, and from the start otherwise.
| 278 | $stripAtEnd = $this->state->prevNodeUnmodified; |
// NOTE(review): this loop has no explicit guard for $allBits running empty.
// Presumably that cannot happen because $sepNlCount was counted over the
// same non-delimiter pieces, so at least $n newlines are available — confirm.
| 279 | while ( $n > 0 ) { |
| 280 | $bit = $stripAtEnd ? array_pop( $allBits ) : array_shift( $allBits ); |
| 281 | while ( $bit && preg_match( $splitRe, $bit ) ) { |
| 282 | // Retain comment-only lines as is |
| 283 | $newBits[] = $bit; |
| 284 | $bit = $stripAtEnd ? array_pop( $allBits ) : array_shift( $allBits ); |
| 285 | } |
// Each pass of the inner loop below removes the first newline found in $bit.
| 286 | // @phan-suppress-next-line PhanPluginLoopVariableReuse |
| 287 | while ( $n > 0 && str_contains( $bit, "\n" ) ) { |
| 288 | $bit = preg_replace( '/\n([^\n]*)/', '$1', $bit, 1 ); |
| 289 | $n--; |
| 290 | } |
| 291 | $newBits[] = $bit; |
| 292 | } |
// Re-assemble the separator. Bits taken from the end were collected in
// reverse order, so they are flipped back and placed after the untouched rest.
| 293 | if ( $stripAtEnd ) { |
| 294 | $newBits = array_merge( $allBits, array_reverse( $newBits ) ); |
| 295 | } else { |
| 296 | PHPUtils::pushArray( $newBits, $allBits ); |
| 297 | } |
| 298 | $sep = implode( $newBits ); |
| 299 | } |
| 300 | |
// The log message is built in a closure; 'constraintInfo' is removed from
// the logged copy of the constraints.
| 301 | $this->state->getEnv()->log( |
| 302 | 'debug/wts/sep', |
| 303 | 'make-new |', |
| 304 | static function () use ( $nlConstraints, $sepNlCount, $minNls, $sep, $origSep ) { |
| 305 | $constraints = Utils::cloneArray( $nlConstraints ); |
| 306 | unset( $constraints['constraintInfo'] ); |
| 307 | return PHPUtils::jsonEncode( $sep ) . ', ' . PHPUtils::jsonEncode( $origSep ) . ', ' . |
| 308 | $minNls . ', ' . $sepNlCount . ', ' . PHPUtils::jsonEncode( $constraints ); |
| 309 | } |
| 310 | ); |
| 311 | |
| 312 | return $sep; |
| 313 | } |
| 314 | |
/**
 * Merge two constraints.
 *
 * The result takes the larger of the two 'min' values (missing: 0) and the
 * smaller of the two 'max' values (missing: 2). If that leaves min above
 * max, max is raised to min and the incompatibility is logged.
 * 'constraintInfo' is always reset to an empty array.
 *
 * @param Env $env
 * @param array $oldConstraints
 * @param array $newConstraints
 * @return array
 */
private static function mergeConstraints(
	Env $env, array $oldConstraints, array $newConstraints
): array {
	$lower = max( $oldConstraints['min'] ?? 0, $newConstraints['min'] ?? 0 );
	$upper = min( $oldConstraints['max'] ?? 2, $newConstraints['max'] ?? 2 );
	$conflicting = $lower > $upper;

	$merged = [
		'min' => $lower,
		'max' => $conflicting ? $lower : $upper,
		'constraintInfo' => [],
	];

	if ( $conflicting ) {
		$env->log(
			'info/html2wt',
			'Incompatible constraints (merge):',
			$merged,
			self::loggableConstraints( $oldConstraints ),
			self::loggableConstraints( $newConstraints )
		);
	}

	return $merged;
}
| 344 | |
/**
 * Short, JSON-encoded description of a node for debug logs.
 *
 * Elements are described by their outer HTML; any other node (or an element
 * whose outer HTML is empty) by its nodeValue. The encoded string is cut
 * off after 40 characters.
 *
 * @param Node $node
 * @return string
 */
public static function debugOut( Node $node ): string {
	$text = $node instanceof Element ? DOMCompat::getOuterHTML( $node ) : '';
	if ( !$text ) {
		$text = $node->nodeValue;
	}

	$encoded = PHPUtils::jsonEncode( $text );
	return mb_substr( $encoded, 0, 40 );
}
| 355 | |
/**
 * Figure out separator constraints and merge them with existing constraints
 * in state so that they can be emitted when the next content emits source.
 *
 * The node pair is classified as 'parent-child' (nodeA is the parent of
 * nodeB), 'child-parent' (nodeB is the parent of nodeA) or 'sibling'
 * (anything else) and the handlers are queried accordingly. In the
 * before/after positions, non-element nodes contribute no constraints.
 *
 * Side effect: $state->sep->constraints is replaced, and its
 * 'constraintInfo' entry is overwritten with the details of this node pair.
 *
 * @param Node $nodeA
 * @param DOMHandler $sepHandlerA
 * @param Node $nodeB
 * @param DOMHandler $sepHandlerB
 */
public function updateSeparatorConstraints(
	Node $nodeA, DOMHandler $sepHandlerA, Node $nodeB, DOMHandler $sepHandlerB
): void {
	$state = $this->state;

	if ( $nodeB->parentNode === $nodeA ) {
		// parent-child separator, nodeA parent of nodeB
		'@phan-var Element|DocumentFragment $nodeA'; // @var Element|DocumentFragment $nodeA
		$sepType = 'parent-child';
		$aCons = $sepHandlerA->firstChild( $nodeA, $nodeB, $state );
		$bCons = $nodeB instanceof Element ? $sepHandlerB->before( $nodeB, $nodeA, $state ) : [];
	} elseif ( $nodeA->parentNode === $nodeB ) {
		// child-parent separator, nodeB parent of nodeA
		'@phan-var Element|DocumentFragment $nodeB'; // @var Element|DocumentFragment $nodeB
		$sepType = 'child-parent';
		$aCons = $nodeA instanceof Element ? $sepHandlerA->after( $nodeA, $nodeB, $state ) : [];
		$bCons = $sepHandlerB->lastChild( $nodeB, $nodeA, $state );
	} else {
		// sibling separator
		$sepType = 'sibling';
		$aCons = $nodeA instanceof Element ? $sepHandlerA->after( $nodeA, $nodeB, $state ) : [];
		$bCons = $nodeB instanceof Element ? $sepHandlerB->before( $nodeB, $nodeA, $state ) : [];
	}
	$nlConstraints = $this->getSepNlConstraints( $nodeA, $aCons, $nodeB, $bCons );

	if ( !empty( $state->sep->constraints ) ) {
		// Merge the constraints
		$state->sep->constraints = self::mergeConstraints(
			$this->env,
			$state->sep->constraints,
			$nlConstraints
		);
	} else {
		$state->sep->constraints = $nlConstraints;
	}

	// The closure does not touch $this, so it is declared static like the
	// other logging closures in this class.
	$this->env->log(
		'debug/wts/sep',
		static function () use ( $sepType, $nodeA, $nodeB, $state ) {
			return 'constraint' . ' | ' .
				$sepType . ' | ' .
				'<' . DOMUtils::nodeName( $nodeA ) . ',' . DOMUtils::nodeName( $nodeB ) .
				'>' . ' | ' . PHPUtils::jsonEncode( $state->sep->constraints ) . ' | ' .
				self::debugOut( $nodeA ) . ' | ' . self::debugOut( $nodeB );
		}
	);

	// Both the merged and the fresh constraints arrive here with an empty
	// 'constraintInfo'; record the details of the current pair.
	$state->sep->constraints['constraintInfo'] = [
		'onSOL' => $state->onSOL,
		// force SOL state when separator is built/emitted
		'forceSOL' => $sepHandlerB->forceSOL(),
		'sepType' => $sepType,
		'nodeA' => $nodeA,
		'nodeB' => $nodeB,
	];
}
| 421 | |
/**
 * @param Env $env Environment; used by this class for logging
 * @param SerializerState $state Serializer state this helper reads and updates
 */
public function __construct( Env $env, SerializerState $state ) {
	$this->state = $state;
	$this->env = $env;
}
| 426 | |
/**
 * Neutralize leading spaces on the last line of a separator when they could
 * otherwise trigger an indent-pre in wikitext.
 *
 * The separator is only inspected when $state->inPHPBlock and
 * $state->inIndentPre are both false, INDENT_PRE_WS_IN_SEP_REGEXP matches,
 * and the separator either contains a newline or is emitted in (forced)
 * start-of-line context. When a rewrite is needed, the spaces are either
 * wrapped in <nowiki> tags or dropped altogether.
 *
 * Side effects: when the nowiki wrapping is applied, $state->onSOL is set to
 * false and $state->hasIndentPreNowikis to true.
 *
 * @param string $sep Separator text
 * @param array $nlConstraints Constraints; only 'constraintInfo' is consulted
 *   for the decision, the rest is merely logged
 * @return string The (possibly rewritten) separator
 */
| 427 | private function makeSepIndentPreSafe( |
| 428 | string $sep, array $nlConstraints |
| 429 | ): string { |
| 430 | $state = $this->state; |
| 431 | $constraintInfo = $nlConstraints['constraintInfo'] ?? []; |
| 432 | $sepType = $constraintInfo['sepType'] ?? null; |
| 433 | $nodeA = $constraintInfo['nodeA'] ?? null; |
| 434 | $nodeB = $constraintInfo['nodeB'] ?? null; |
// A requested forceSOL is ignored for child-parent separators.
| 435 | $forceSOL = ( $constraintInfo['forceSOL'] ?? false ) && $sepType !== 'child-parent'; |
// Keep the original nodeB around: $nodeB itself may be advanced along its
// sibling chain further down.
| 436 | $origNodeB = $nodeB; |
| 437 | |
| 438 | // Ex: "<div>foo</div>\n <span>bar</span>" |
| 439 | // |
| 440 | // We also should test for onSOL state to deal with HTML like |
| 441 | // <ul> <li>foo</li></ul> |
| 442 | // and strip the leading space before non-indent-pre-safe tags |
| 443 | if ( |
| 444 | !$state->inPHPBlock && |
| 445 | !$state->inIndentPre && |
| 446 | preg_match( self::INDENT_PRE_WS_IN_SEP_REGEXP, $sep ) && ( |
| 447 | str_contains( $sep, "\n" ) || !empty( $constraintInfo['onSOL'] ) || $forceSOL |
| 448 | ) |
| 449 | ) { |
| 450 | // 'sep' is the separator before 'nodeB' and it has leading spaces on a newline. |
| 451 | // We have to decide whether that leading space will trigger indent-pres in wikitext. |
| 452 | // The decision depends on where this separator will be emitted relative |
| 453 | // to 'nodeA' and 'nodeB'. |
| 454 | |
| 455 | $isIndentPreSafe = false; |
| 456 | |
| 457 | // Example sepType scenarios: |
| 458 | // |
| 459 | // 1. sibling |
| 460 | // <div>foo</div> |
| 461 | // <span>bar</span> |
| 462 | // The span will be wrapped in an indent-pre if the leading space |
| 463 | // is not stripped since span is not a block tag |
| 464 | // |
| 465 | // 2. child-parent |
| 466 | // <span>foo |
| 467 | // </span>bar |
| 468 | // The " </span>bar" will be wrapped in an indent-pre if the |
| 469 | // leading space is not stripped since span is not a block tag |
| 470 | // |
| 471 | // 3. parent-child |
| 472 | // <div>foo |
| 473 | // <span>bar</span> |
| 474 | // </div> |
| 475 | // |
| 476 | // In all cases, only block-tags prevent indent-pres. |
| 477 | // (except for a special case for <br> nodes) |
// NOTE: WTSUtils and DiffUtils have no `use` statement in this file, so the
// names resolve relative to the current namespace.
| 478 | if ( $nodeB && WTSUtils::precedingSpaceSuppressesIndentPre( $nodeB, $origNodeB ) ) { |
| 479 | $isIndentPreSafe = true; |
| 480 | } elseif ( $sepType === 'sibling' || ( $nodeA && DOMUtils::atTheTop( $nodeA ) ) ) { |
| 481 | Assert::invariant( !DOMUtils::atTheTop( $nodeA ) || $sepType === 'parent-child', __METHOD__ ); |
| 482 | |
| 483 | // 'nodeB' is the first non-separator child of 'nodeA'. |
| 484 | // |
| 485 | // Walk past sol-transparent nodes in the right-sibling chain |
| 486 | // of 'nodeB' till we establish indent-pre safety. |
| 487 | while ( $nodeB && |
| 488 | ( DiffUtils::isDiffMarker( $nodeB ) || WTUtils::emitsSolTransparentSingleLineWT( $nodeB ) ) |
| 489 | ) { |
| 490 | $nodeB = $nodeB->nextSibling; |
| 491 | } |
| 492 | |
// Running off the end of the sibling chain counts as safe.
| 493 | $isIndentPreSafe = !$nodeB || WTSUtils::precedingSpaceSuppressesIndentPre( $nodeB, $origNodeB ); |
| 494 | } |
| 495 | |
| 496 | // Check whether nodeB is nested inside an element that suppresses |
| 497 | // indent-pres. |
| 498 | if ( $nodeB && !$isIndentPreSafe && !DOMUtils::atTheTop( $nodeB ) ) { |
| 499 | $parentB = $nodeB->parentNode; // could be nodeA |
| 500 | while ( WTUtils::isZeroWidthWikitextElt( $parentB ) ) { |
| 501 | $parentB = $parentB->parentNode; |
| 502 | } |
| 503 | |
| 504 | // The token stream paragraph wrapper (and legacy doBlockLevels) |
| 505 | // tracks this separately with $inBlockquote |
| 506 | $isIndentPreSafe = DOMUtils::hasNameOrHasAncestorOfName( |
| 507 | $parentB, 'blockquote' |
| 508 | ); |
| 509 | |
| 510 | // First scope wins |
| 511 | while ( !$isIndentPreSafe && !DOMUtils::atTheTop( $parentB ) ) { |
| 512 | if ( |
| 513 | TokenUtils::tagOpensBlockScope( DOMUtils::nodeName( $parentB ) ) && |
| 514 | // Only html p-tag is indent pre suppressing |
| 515 | ( DOMUtils::nodeName( $parentB ) !== 'p' || WTUtils::isLiteralHTMLNode( $parentB ) ) |
| 516 | ) { |
| 517 | $isIndentPreSafe = true; |
| 518 | break; |
| 519 | } elseif ( TokenUtils::tagClosesBlockScope( DOMUtils::nodeName( $parentB ) ) ) { |
| 520 | break; |
| 521 | } |
| 522 | $parentB = $parentB->parentNode; |
| 523 | } |
| 524 | } |
| 525 | |
// The leading space is dropped outright (rather than nowiki-wrapped) when we
// are in (forced) SOL context and nodeB is a non-literal-HTML node whose tag
// is listed in Consts::$HTMLTagsRequiringSOLContext.
| 526 | // @phan-suppress-next-line PhanUndeclaredVariable false positive |
| 527 | $stripLeadingSpace = ( !empty( $constraintInfo['onSOL'] ) || $forceSOL ) && |
| 528 | $nodeB && !WTUtils::isLiteralHTMLNode( $nodeB ) && |
| 529 | isset( Consts::$HTMLTagsRequiringSOLContext[DOMUtils::nodeName( $nodeB )] ); |
| 530 | if ( !$isIndentPreSafe || $stripLeadingSpace ) { |
| 531 | // Wrap non-nl ws from last line, but preserve comments. |
| 532 | // This avoids triggering indent-pres. |
// Per INDENT_PRE_WS_IN_SEP_REGEXP: $matches[1] holds the preceding
// newline/comment run, $matches[2] the spaces of interest and $matches[3]
// the remainder of the last line.
| 533 | $sep = preg_replace_callback( |
| 534 | self::INDENT_PRE_WS_IN_SEP_REGEXP, |
| 535 | static function ( $matches ) use ( $stripLeadingSpace, $state ) { |
| 536 | if ( !$stripLeadingSpace ) { |
| 537 | // Since we nowiki-ed, we are no longer in sol state |
| 538 | $state->onSOL = false; |
| 539 | $state->hasIndentPreNowikis = true; |
| 540 | $space = '<nowiki>' . $matches[2] . '</nowiki>'; |
| 541 | } |
// When $stripLeadingSpace is set, $space was never assigned, so the
// `?? ''` fallback below drops the spaces from the result.
| 542 | return ( $matches[1] ?? '' ) . ( $space ?? '' ) . ( $matches[3] ?? '' ); |
| 543 | }, |
| 544 | $sep |
| 545 | ); |
| 546 | } |
| 547 | } |
| 548 | |
// The log message is built in a closure; 'constraintInfo' is removed from
// the logged copy of the constraints.
| 549 | $state->getEnv()->log( |
| 550 | 'debug/wts/sep', |
| 551 | 'ipre-safe |', |
| 552 | static function () use ( $sep, $nlConstraints ) { |
| 553 | $constraints = Utils::cloneArray( $nlConstraints ); |
| 554 | unset( $constraints['constraintInfo'] ); |
| 555 | return PHPUtils::jsonEncode( $sep ) . ', ' . PHPUtils::jsonEncode( $constraints ); |
| 556 | } |
| 557 | ); |
| 558 | |
| 559 | return $sep; |
| 560 | } |
| 561 | |
/**
 * Serializing auto inserted content should invalidate the original separator.
 *
 * Works on a clone of the node's DSR: the open width is nulled when
 * data-parsoid has a non-empty autoInsertedStart, and the close width when
 * it has a non-empty autoInsertedEnd. The DSR stored on the node itself is
 * not modified.
 *
 * @param Element $node
 * @return DomSourceRange|null The adjusted copy, or null if the node has no DSR
 */
private static function handleAutoInserted( Element $node ): ?DomSourceRange {
	$dataParsoid = DOMDataUtils::getDataParsoid( $node );
	$sourceDsr = $dataParsoid->dsr ?? null;
	if ( $sourceDsr === null ) {
		return null;
	}

	$copy = clone $sourceDsr;
	if ( !empty( $dataParsoid->autoInsertedStart ) ) {
		$copy->openWidth = null;
	}
	if ( !empty( $dataParsoid->autoInsertedEnd ) ) {
		$copy->closeWidth = null;
	}

	return $copy;
}
| 582 | |
/**
 * $node is embedded inside a parent node that has its leading/trailing
 * whitespace trimmed in the wt->html direction. This method attempts to
 * recover the trimmed leading whitespace from the original source, using the
 * DSR information found on that parent.
 *
 * If the direct parent is a "data-mw-selser-wrapper" span (added by SelSer),
 * the wrapper is looked past and its own parent is used instead.
 *
 * The recovery is attempted in two different ways:
 * 1. If we have additional DSR fields about leading/trailing WS
 *    (represented by $state->haveTrimmedWsDSR), at most $dsr->leadingWS
 *    whitespace characters from the start of the inner source are returned.
 * 2. If not, the first character of the inner source is returned, provided
 *    it is a space or a tab.
 *
 * @param Node $node
 * @return ?string The recovered whitespace, or null if there is none to add
 */
private function fetchLeadingTrimmedSpace( Node $node ): ?string {
	$origNode = $node;
	$parentNode = $node->parentNode;

	// Skip past the artificial span wrapper
	if ( $parentNode instanceof Element && $parentNode->hasAttribute( 'data-mw-selser-wrapper' ) ) {
		$node = $parentNode;
		$parentNode = $parentNode->parentNode;
	}

	// Leading trimmed whitespace only makes sense for first child.
	// Ignore comments (which are part of separators) + deletion markers.
	if ( DiffDOMUtils::previousNonSepSibling( $node ) ) {
		return null;
	}

	'@phan-var Element|DocumentFragment $parentNode'; // @var Element|DocumentFragment $parentNode
	if ( !isset( Consts::$WikitextTagsWithTrimmableWS[DOMUtils::nodeName( $parentNode )] ) ) {
		return null;
	}

	// A non-element node whose value already starts with a space or tab
	// needs nothing added in front of it.
	if ( !( $origNode instanceof Element ) && preg_match( '/^[ \t]/', $origNode->nodeValue ) ) {
		return null;
	}

	// Don't reintroduce whitespace that's already been captured as a DisplaySpace
	if ( DOMUtils::hasTypeOf( $origNode, 'mw:DisplaySpace' ) ) {
		return null;
	}

	// FIXME: Is this complexity worth some minor dirty diff on this test?
	// ParserTest: "3. List embedded in a formatting tag in a misnested way"
	// I've not added an equivalent check in the trailing whitespace case.
	if ( $origNode instanceof Element &&
		isset( DOMDataUtils::getDataParsoid( $origNode )->autoInsertedStart ) &&
		strspn( $origNode->firstChild->textContent ?? '', " \t" ) >= 1
	) {
		return null;
	}

	$dsr = DOMDataUtils::getDataParsoid( $parentNode )->dsr ?? null;
	if ( !Utils::isValidDSR( $dsr, true ) ) {
		return null;
	}

	$state = $this->state;
	if ( $state->haveTrimmedWsDSR && $dsr->hasTrimmedWS() && $dsr->hasValidLeadingWS() ) {
		$innerSrc = $state->getOrigSrc( $dsr->innerRange() ) ?? '';
		if ( preg_match( '/^([ \t]*)/', $innerSrc, $matches ) ) {
			// $matches[1] is just spaces and tabs
			return substr( $matches[1], 0, $dsr->leadingWS );
		}
		return null;
	}

	if ( $dsr->innerStart() < $dsr->innerEnd() ) {
		$innerSrc = $state->getOrigSrc( $dsr->innerRange() ) ?? '';
		// return first character of inner range iff it is
		// tab or space
		return preg_match( '/^[ \t]/', $innerSrc ) ? $innerSrc[0] : null;
	}

	return null;
}
| 662 | |
/**
 * $node is embedded inside a parent node that has its leading/trailing
 * whitespace trimmed in the wt->html direction. This method attempts to
 * recover the trimmed trailing whitespace from the original source, using
 * the DSR information found on that parent.
 *
 * If the direct parent is a "data-mw-selser-wrapper" span (added by SelSer),
 * the wrapper is looked past and its own parent is used instead.
 *
 * The recovery is attempted in two different ways:
 * 1. If we have additional DSR fields about leading/trailing WS
 *    (represented by $state->haveTrimmedWsDSR), the last $dsr->trailingWS
 *    whitespace characters of the inner source are returned.
 * 2. If not, the last character of the inner source is returned, provided
 *    it is a space or a tab.
 *
 * @param Node $node
 * @return ?string The recovered whitespace, or null if there is none to add
 */
private function fetchTrailingTrimmedSpace( Node $node ): ?string {
	$origNode = $node;
	$parentNode = $node->parentNode;

	// Skip past the artificial span wrapper
	if ( $parentNode instanceof Element && $parentNode->hasAttribute( 'data-mw-selser-wrapper' ) ) {
		$node = $parentNode;
		$parentNode = $parentNode->parentNode;
	}

	// Trailing trimmed whitespace only makes sense for last child.
	// Ignore comments (which are part of separators) + deletion markers.
	if ( DiffDOMUtils::nextNonSepSibling( $node ) ) {
		return null;
	}

	'@phan-var Element|DocumentFragment $parentNode'; // @var Element|DocumentFragment $parentNode
	if ( !isset( Consts::$WikitextTagsWithTrimmableWS[DOMUtils::nodeName( $parentNode )] ) ) {
		return null;
	}

	// A non-element node whose value already ends with a space or tab
	// needs nothing added after it.
	if ( !( $origNode instanceof Element ) && preg_match( '/[ \t]$/', $origNode->nodeValue ) ) {
		return null;
	}

	// Don't reintroduce whitespace that's already been captured as a DisplaySpace
	if ( DOMUtils::hasTypeOf( $origNode, 'mw:DisplaySpace' ) ) {
		return null;
	}

	$dsr = DOMDataUtils::getDataParsoid( $parentNode )->dsr ?? null;
	if ( !Utils::isValidDSR( $dsr, true ) ) {
		return null;
	}

	$state = $this->state;
	if ( $state->haveTrimmedWsDSR && $dsr->hasTrimmedWS() && $dsr->hasValidTrailingWS() ) {
		$innerSrc = $state->getOrigSrc( $dsr->innerRange() ) ?? '';
		if ( preg_match( '/([ \t]*)$/', $innerSrc, $matches ) ) {
			// $matches[1] is just spaces and tabs
			// note that trailingWS can be zero
			$trailing = $matches[1];
			return substr( $trailing, strlen( $trailing ) - $dsr->trailingWS );
		}
		return null;
	}

	// The > instead of >= in the test below is to deal with an edge case
	// where that single space is captured by the leading whitespace
	// recovery in fetchLeadingTrimmedSpace.
	if ( ( $dsr->innerEnd() - 1 ) > $dsr->innerStart() ) {
		$innerSrc = $state->getOrigSrc( $dsr->innerRange() ) ?? '';
		// Return last character of the inner source iff it is space or tab
		return preg_match( '/[ \t]$/', $innerSrc ) ? substr( $innerSrc, -1 ) : null;
	}

	return null;
}
| 735 | |
/**
 * Deal with scenarios where leading / trailing whitespace was trimmed:
 * figure out whether any of it needs to be added back. Recovery is only
 * attempted in selser mode; otherwise null is returned.
 *
 * @param Node $node
 * @param bool $leading
 *   if true, trimmed leading whitespace is recovered, looking at $node itself
 *   if false, trimmed trailing whitespace is recovered, looking at the last
 *   non-deleted child of $node
 * @return string|null
 */
public function recoverTrimmedWhitespace( Node $node, bool $leading ): ?string {
	if ( !$this->state->selserMode ) {
		return null;
	}

	if ( $leading ) {
		return $this->fetchLeadingTrimmedSpace( $node );
	}

	$lastChild = DiffDOMUtils::lastNonDeletedChild( $node );
	if ( !$lastChild ) {
		return null;
	}
	return $this->fetchTrailingTrimmedSpace( $lastChild );
}
| 759 | |
/**
 * Build the separator to emit before $node from the collected (and merged)
 * constraints and the buffered separator text. Called when new output is
 * triggered.
 *
 * The separator is looked for in this order:
 * 1. In selser mode, when original source is usable for both the last
 *    source node and $node, DSR ranges are derived for the two nodes and
 *    the text between them is sliced out of the original source.
 * 2. Otherwise, in selser mode, when the last source node has no diff
 *    markers and is followed by an inserted node, the separator that
 *    originally followed it is recovered and buffered in $state->sep->src.
 * 3. If no separator was found, trimmed leading / trailing whitespace is
 *    recovered for 'parent-child' / 'child-parent' separators.
 * 4. If there still is no separator, or the buffered text differs from it,
 *    makeSeparator() is used. Any non-null result is finally passed
 *    through makeSepIndentPreSafe().
 *
 * @param Node $node
 * @return string|null
 */
public function buildSep( Node $node ): ?string {
	$state = $this->state;
	$prevNode = $state->sep->lastSourceNode;
	$sepType = $state->sep->constraints['constraintInfo']['sepType'] ?? null;
	// $node may be replaced by one of its ancestors below; keep the node
	// we were called with around for the debug output.
	$startNode = $node;
	$sepSrc = null;
	$prevDsr = null;
	$nodeDsr = null;

	/* ----------------------------------------------------------------------
	 * Separators can be extracted from original source via DSR offsets
	 * only if:
	 * - we are in selser mode AND
	 * - this node is not part of a newly inserted subtree (marked 'modified')
	 *   for which DSR isn't available
	 * - neither node is adjacent to a deleted block node
	 *   (see the long comment in SerializerState::emitChunk in the middle)
	 *
	 * In other scenarios, DSR values on "adjacent" nodes in the edited DOM
	 * may not reflect deleted content between them.
	 * ---------------------------------------------------------------------- */
	$origSepNeeded = ( $node !== $prevNode ) && $state->selserMode;
	$canUseOrigSep =
		$origSepNeeded &&
		!$state->inInsertedContent &&
		!WTSUtils::nextToDeletedBlockNodeInWT( $prevNode, true ) &&
		!WTSUtils::nextToDeletedBlockNodeInWT( $node, false ) &&
		WTSUtils::origSrcValidInEditedContext( $state, $prevNode ) &&
		WTSUtils::origSrcValidInEditedContext( $state, $node ) &&
		!$this->needNewSep( $prevNode );

	if ( $canUseOrigSep ) {
		// ---- Step 1: DSR for $prevNode ----
		if ( $prevNode instanceof Element ) {
			$prevDsr = self::handleAutoInserted( $prevNode );
		} elseif ( !( $prevNode instanceof DocumentFragment ) ) {
			$prevParent = $prevNode->parentNode;
			$prevSibling = $prevNode->previousSibling;
			if (
				!$prevNode->nextSibling &&
				$prevParent !== $node &&
				$prevParent instanceof Element &&
				( DOMDataUtils::getDataParsoid( $prevParent )->dsr->closeWidth ?? null ) === 0
			) {
				// $prevNode is the last child of an element with a zero-width
				// close tag: use the parent's DSR instead. Typical case: text in p.
				$prevDsr = self::handleAutoInserted( $prevParent );
			} elseif (
				$prevSibling instanceof Element &&
				!DiffUtils::directChildrenChanged( $prevParent )
			) {
				// The parent's children weren't edited, so $prevNode's DSR can be
				// extrapolated from where its previous sibling ends.
				$prevSiblingEnd = DOMDataUtils::getDataParsoid( $prevSibling )->dsr->end ?? null;
				if ( is_int( $prevSiblingEnd ) ) {
					if ( $prevNode instanceof Comment ) {
						'@phan-var Comment $prevNode'; // @var Comment $prevNode
						$prevNodeWidth = WTUtils::decodedCommentLength( $prevNode );
					} else {
						$prevNodeWidth = strlen( $prevNode->nodeValue );
					}
					$prevDsr = new DomSourceRange(
						$prevSiblingEnd,
						$prevSiblingEnd + $prevNodeWidth + WTUtils::indentPreDSRCorrection( $prevNode ),
						0,
						0
					);
				}
			}
		}

		// ---- Step 2: DSR for $node (pointless without a DSR for $prevNode) ----
		if ( $prevDsr ) {
			if ( $node instanceof Element ) {
				// $node is parent of $prevNode
				if ( $prevNode->parentNode === $node ) {
					'@phan-var Element|DocumentFragment $node'; // @var Element|DocumentFragment $node
					// FIXME: Maybe we shouldn't set dsr in the dsr pass if both aren't valid?
					//
					// In the lastChild sep scenario, when the parent doesn't have
					// useable dsr, walk up the ancestors (where possible) until a
					// dsr-bearing node is found.
					//
					// This fix is needed to handle trailing newlines in this wikitext:
					// [[File:foo.jpg|thumb|300px|foo\n{{1x|A}}\n{{1x|B}}\n{{1x|C}}\n\n]]
					while (
						!$node->nextSibling &&
						!DOMUtils::atTheTop( $node ) &&
						(
							empty( DOMDataUtils::getDataParsoid( $node )->dsr ) ||
							DOMDataUtils::getDataParsoid( $node )->dsr->start === null ||
							DOMDataUtils::getDataParsoid( $node )->dsr->end === null
						)
					) {
						$node = $node->parentNode;
					}
				}

				// The walk above may have ended on a document fragment,
				// which has no DSR to offer.
				if ( $node instanceof Element ) {
					$nodeDsr = self::handleAutoInserted( $node );
				}
			} elseif ( !( $node instanceof DocumentFragment ) ) {
				// $node is text/comment. Its DSR can be extrapolated from
				// $node->parentNode if it is the child of a zero-width element
				// and is only preceded by separator elements.
				//
				// 1. text in p.
				// 2. ws-only child of a node with auto-inserted start tag
				//    Ex: "<span> <s>x</span> </s>" --> <span> <s>x</s*></span><s*> </s>
				// 3. ws-only children of a node with auto-inserted start tag
				//    Ex: "{|\n|-\n <!--foo--> \n|}"
				$nodeParent = $node->parentNode;
				// phpcs:ignore Generic.Files.LineLength.TooLong
				'@phan-var Element|DocumentFragment $nodeParent'; // @var Element|DocumentFragment $nodeParent

				if (
					$nodeParent !== $prevNode &&
					$nodeParent instanceof Element &&
					( DOMDataUtils::getDataParsoid( $nodeParent )->dsr->openWidth ?? null ) === 0
				) {
					$precedingLen = self::precedingSeparatorTextLen( $node );
					if ( $precedingLen !== null ) {
						$nodeDsr = DOMDataUtils::getDataParsoid( $nodeParent )->dsr;
						if ( is_int( $nodeDsr->start ) && $precedingLen > 0 ) {
							// Work on a copy so that the parent's stored DSR
							// is left untouched.
							$nodeDsr = clone $nodeDsr;
							$nodeDsr->start += $precedingLen;
						}
					}
				}
			}
		}

		// ---- Step 3: slice the separator out of the original source ----
		// FIXME: Maybe we shouldn't set dsr in the dsr pass if both aren't valid?
		// NOTE: Synthetic DSR ranges may not necessarily have offsets that
		// correspond to valid UTF-8 characters. So $state->isValidDSR() is
		// used to ensure that all offsets land on valid UTF-8 characters
		// before substrings are constructed from relations between them.
		if ( $state->isValidDSR( $prevDsr ) && $state->isValidDSR( $nodeDsr ) ) {
			// Figure out the containment relationship between the two ranges
			if ( $prevDsr->start <= $nodeDsr->start ) {
				if ( $nodeDsr->end <= $prevDsr->end ) {
					$sameRange =
						$prevDsr->start === $nodeDsr->start &&
						$prevDsr->end === $nodeDsr->end;
					if ( $sameRange ) {
						// Identical ranges leave no room for a separator
						$sepSrc = '';
					} elseif ( $prevDsr->openWidth !== null && $state->isValidDSR( $prevDsr, true ) ) {
						// $node inside $prevNode: parent to child
						$sepSrc = $state->getOrigSrc( $prevDsr->openRange()->to( $nodeDsr ) );
					}
				} elseif ( $prevDsr->end <= $nodeDsr->start ) {
					// $node follows $prevNode (siblingish)
					$sepSrc = $state->getOrigSrc( $prevDsr->to( $nodeDsr ) );
				} elseif ( $nodeDsr->closeWidth !== null && $state->isValidDSR( $nodeDsr, true ) ) {
					// $prevNode inside $node: child to parent
					$sepSrc = $state->getOrigSrc( $prevDsr->to( $nodeDsr->closeRange() ) );
				}
			} elseif ( $prevDsr->end <= $nodeDsr->end ) {
				if ( $nodeDsr->closeWidth !== null && $state->isValidDSR( $nodeDsr, true ) ) {
					// $prevNode inside $node: child to parent
					$sepSrc = $state->getOrigSrc( $prevDsr->to( $nodeDsr->closeRange() ) );
				}
			} else {
				$this->env->log( 'info/html2wt', 'dsr backwards: should not happen!' );
			}

			// Discard whatever was extracted if it isn't a valid separator
			if ( $sepSrc && !WTSUtils::isValidSep( $sepSrc ) ) {
				$sepSrc = null;
			}
		}
	} elseif ( $origSepNeeded && !DiffUtils::hasDiffMarkers( $prevNode ) ) {
		// Given the following conditions:
		// - $prevNode has no diff markers. (checked above)
		// - $prevNode's next non-sep sibling ($inserted) was inserted.
		// - $inserted is $node or an ancestor of $node.
		// - all of the ancestor nodes from $node up to $inserted have
		//   zero-width wikitext (otherwise, the separator isn't usable)
		// try to extract the separator that, in original source, sat between
		// $prevNode and its original next sibling, or its parent
		// (if $prevNode was the last non-sep child).
		//
		// This minimizes dirty-diffs to that separator text from
		// the insertion of $inserted after $prevNode.
		$inserted = DiffDOMUtils::nextNonSepSibling( $prevNode );
		$insertionSepUsable = $inserted && DiffUtils::hasInsertedDiffMark( $inserted );

		// Verify that $inserted is an ancestor of $node and that every node
		// on the path between them has zero-width wikitext
		if ( $insertionSepUsable && $node !== $inserted ) {
			$ancestor = $node->parentNode;
			while ( $ancestor && $inserted !== $ancestor ) {
				if ( !WTUtils::isZeroWidthWikitextElt( $ancestor ) ) {
					$insertionSepUsable = false;
					break;
				}
				$ancestor = $ancestor->parentNode;
			}
			// Running off the top of the tree means $inserted was never reached
			$insertionSepUsable = $insertionSepUsable && $ancestor !== null;
		}

		// Extract separator from original source if possible
		if ( $insertionSepUsable ) {
			$origNext = DiffDOMUtils::nextNonSepSibling( $inserted );
			if ( !$origNext || !DiffUtils::hasDiffMarkers( $origNext ) ) {
				// We could work harder for text/comments and extrapolate, but skipping that here
				// FIXME: If we had a generic DSR extrapolation utility, that would be useful
				$fromDsr = $prevNode instanceof Element ?
					( DOMDataUtils::getDataParsoid( $prevNode )->dsr ?? null ) : null;
				if ( $fromDsr !== null ) {
					if ( !$origNext ) {
						// $prevNode was last non-sep child of its parent:
						// the separator runs up to the parent's close tag
						$parentDsr = DOMDataUtils::getDataParsoid( $prevNode->parentNode )->dsr ?? null;
						if ( $parentDsr !== null ) {
							$sepSrc = $state->getOrigSrc( $fromDsr->to( $parentDsr->closeRange() ) );
						}
					} else {
						// The separator runs up to the original next sibling
						$toDsr = $origNext instanceof Element ?
							( DOMDataUtils::getDataParsoid( $origNext )->dsr ?? null ) : null;
						if ( $toDsr !== null ) {
							$sepSrc = $state->getOrigSrc( $fromDsr->to( $toDsr ) );
						}
					}
				}
			}

			if ( $sepSrc !== null ) {
				// Since this is an inserted node, the recovered separator might
				// still have to be augmented with newline constraints, so it is
				// handed over to the buffered separator in state->sep->src
				// instead of being used directly.
				$state->sep->src = $sepSrc;
				$sepSrc = null;
			}
		}
	}

	// If all efforts failed, use special-purpose heuristics to recover
	// trimmed leading / trailing whitespace from lists, headings, table-cells
	if ( $sepSrc === null ) {
		if ( $sepType === 'parent-child' ) {
			$sepSrc = $this->recoverTrimmedWhitespace( $node, true );
			$state->sep->src = ( $sepSrc ?? '' ) . $state->sep->src;
		} elseif ( $sepType === 'child-parent' ) {
			$sepSrc = $this->recoverTrimmedWhitespace( $node, false );
			$state->sep->src .= $sepSrc ?? '';
		}
	}

	$this->env->log(
		'debug/wts/sep',
		static function () use ( $prevNode, $startNode, $sepSrc, $state ) {
			$prevName = $prevNode ? DOMUtils::nodeName( $prevNode ) : '--none--';
			return 'maybe-sep | ' .
				'prev:' . $prevName .
				', node:' . DOMUtils::nodeName( $startNode ) .
				', sep: ' . PHPUtils::jsonEncode( $sepSrc ) .
				', state.sep.src: ' . PHPUtils::jsonEncode( $state->sep->src ?? null );
		}
	);

	// If the separator is being emitted before a node that emits sol-transparent WT,
	// go through makeSeparator to verify indent-pre constraints are met.
	$constraints = $state->sep->constraints ?? [ 'max' => 0 ];
	$bufferedDiffers = $state->sep->src && $state->sep->src !== $sepSrc;
	if ( $sepSrc === null || $bufferedDiffers ) {
		// TODO: set modified flag if start or end node (but not both) are
		// modified / new so that the selser can use the separator
		$haveSomethingToEmit = !empty( $state->sep->constraints ) || !empty( $state->sep->src );
		$sepSrc = $haveSomethingToEmit
			? $this->makeSeparator( $node, $state->sep->src ?? '', $constraints )
			: null;
	}

	if ( $sepSrc === null ) {
		return null;
	}
	return self::makeSepIndentPreSafe( $sepSrc, $constraints );
}
| 1038 | |
/**
 * Check whether the existing separator around $node cannot be reused
 * and has to be generated afresh.
 *
 * This only ever applies to a tr element without any recorded start tag
 * source (an "empty start tag" tr) that carries a modified-wrapper diff
 * mark, e.g. because attributes were added to it: the existing ""
 * separator cannot be reused in that case.
 *
 * @param Node $node
 * @return bool
 */
private function needNewSep( Node $node ): bool {
	if ( !( $node instanceof Element ) ) {
		return false;
	}
	if ( DOMUtils::nodeName( $node ) !== 'tr' ) {
		return false;
	}
	if ( !empty( DOMDataUtils::getDataParsoid( $node )->startTagSrc ) ) {
		// The tr has explicit start tag source, so it isn't an
		// "empty start tag" tr.
		return false;
	}

	// "Empty start tag" tr: regeneration is forced iff the wrapper was modified
	return DiffUtils::hasDiffMark( $node, DiffMarkers::MODIFIED_WRAPPER );
}
| 1049 | } |