Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
| Total | |
23.11% |
58 / 251 |
|
37.04% |
10 / 27 |
CRAP | |
0.00% |
0 / 1 |
| SerializerState | |
23.11% |
58 / 251 |
|
37.04% |
10 / 27 |
4111.04 | |
0.00% |
0 / 1 |
| solWikitextRegexp | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
6 | |||
| solRegexp | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
| __construct | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
1 | |||
| getEnv | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| initMode | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| appendSep | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| updateSep | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| resetSep | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
| resetCurrLine | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
| flushLine | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
| getOrigSrc | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
3.07 | |||
| isValidDSR | |
0.00% |
0 / 45 |
|
0.00% |
0 / 1 |
342 | |||
| updateModificationFlags | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
| sepIntroducedSOL | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
| pushToCurrLine | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
| emitSep | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
| emitSepForNode | |
0.00% |
0 / 30 |
|
0.00% |
0 / 1 |
272 | |||
| recoverTrimmedWhitespace | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
2 | |||
| emitChunk | |
40.48% |
17 / 42 |
|
0.00% |
0 / 1 |
156.81 | |||
| serializeChildren | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
5 | |||
| kickOffSerialize | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
| serializeChildrenToString | |
0.00% |
0 / 33 |
|
0.00% |
0 / 1 |
2 | |||
| serializeLinkChildrenToString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| serializeCaptionChildrenToString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| serializeIndentPreChildrenToString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
| openAnnotationRange | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| closeAnnotationRange | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
| 1 | <?php |
| 2 | declare( strict_types = 1 ); |
| 3 | |
| 4 | namespace Wikimedia\Parsoid\Html2Wt; |
| 5 | |
| 6 | use Composer\Semver\Semver; |
| 7 | use stdClass; |
| 8 | use Wikimedia\Assert\Assert; |
| 9 | use Wikimedia\Parsoid\Config\Env; |
| 10 | use Wikimedia\Parsoid\Core\DomSourceRange; |
| 11 | use Wikimedia\Parsoid\Core\SelectiveUpdateData; |
| 12 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
| 13 | use Wikimedia\Parsoid\DOM\Element; |
| 14 | use Wikimedia\Parsoid\DOM\Node; |
| 15 | use Wikimedia\Parsoid\DOM\Text; |
| 16 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
| 17 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ConstrainedText; |
| 18 | use Wikimedia\Parsoid\Tokens\SourceRange; |
| 19 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
| 20 | use Wikimedia\Parsoid\Utils\DOMCompat; |
| 21 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
| 22 | use Wikimedia\Parsoid\Utils\DOMUtils; |
| 23 | use Wikimedia\Parsoid\Utils\PHPUtils; |
| 24 | use Wikimedia\Parsoid\Utils\Utils; |
| 25 | use Wikimedia\Parsoid\Utils\WTUtils; |
| 26 | |
| 27 | /** |
| 28 | * State object for the wikitext serializers. |
| 29 | */ |
| 30 | class SerializerState { |
| 31 | |
| 32 | /** |
| 33 | * Regexp for checking if what we have consumed is wikimarkup that has special meaning at the |
| 34 | * beginning of the line, and is indeed at the beginning of the line (modulo comments and |
| 35 | * other ignored elements). |
| 36 | * |
| 37 | * @return string |
| 38 | */ |
| 39 | private function solWikitextRegexp(): string { |
| 40 | static $solWikitextRegexp = null; |
| 41 | if ( $solWikitextRegexp === null ) { |
| 42 | $sol = PHPUtils::reStrip( |
| 43 | $this->env->getSiteConfig()->solTransparentWikitextNoWsRegexp( true ), |
| 44 | '@' |
| 45 | ); |
| 46 | $solWikitextRegexp = '@' . |
| 47 | '^(' . $sol . ')' . |
| 48 | '([\ \*#:;{\|!=].*)$' . |
| 49 | '@D'; |
| 50 | } |
| 51 | return $solWikitextRegexp; |
| 52 | } |
| 53 | |
| 54 | /** |
| 55 | * Regexp for checking whether we are at the start of the line (modulo comments and |
| 56 | * other ignored elements). |
| 57 | * |
| 58 | * @return string |
| 59 | */ |
| 60 | private function solRegexp(): string { |
| 61 | static $solRegexp = null; |
| 62 | if ( $solRegexp === null ) { |
| 63 | $sol = PHPUtils::reStrip( |
| 64 | $this->env->getSiteConfig()->solTransparentWikitextNoWsRegexp( true ), |
| 65 | '@' |
| 66 | ); |
| 67 | $solRegexp = '@(^|\n)' . $sol . '$@D'; |
| 68 | } |
| 69 | return $solRegexp; |
| 70 | } |
| 71 | |
| 72 | /** |
| 73 | * Separator information: |
| 74 | * - constraints (array<array|int>|null): min/max number of newlines |
| 75 | * - src (string|null): collected separator text from DOM text/comment nodes |
| 76 | * - lastSourceNode (?Node): Seems to be bookkeeping to make sure we don't reuse |
| 77 | * original separators when `emitChunk` is called |
| 78 | * consecutively on the same node. However, it also |
| 79 | * differs from `state.prevNode` in that it only gets |
| 80 | * updated when a node calls `emitChunk` so that nodes |
| 81 | * serializing `justChildren` don't mix up `buildSep`. |
| 82 | * FIXME: could use a dedicated class |
| 83 | * @var stdClass |
| 84 | */ |
| 85 | public $sep; |
| 86 | |
| 87 | /** |
| 88 | * Is the serializer at the start of a new wikitext line? |
| 89 | * @var bool |
| 90 | */ |
| 91 | public $onSOL = true; |
| 92 | |
| 93 | /** |
| 94 | * True when wts kicks off, false after the first char has been output |
| 95 | * SSS FIXME: Can this be done away with in some way? |
| 96 | * @var bool |
| 97 | */ |
| 98 | public $atStartOfOutput = true; |
| 99 | |
| 100 | /** |
| 101 | * Is the serializer currently handling link content (children of `<a>`)? |
| 102 | * @var bool |
| 103 | */ |
| 104 | public $inLink = false; |
| 105 | |
| 106 | /** |
| 107 | * Is the serializer currently handling caption content? |
| 108 | * @var bool |
| 109 | */ |
| 110 | public $inCaption = false; |
| 111 | |
| 112 | /** |
| 113 | * Is the serializer currently handling an indent-pre tag? |
| 114 | * @var bool |
| 115 | */ |
| 116 | public $inIndentPre = false; |
| 117 | |
| 118 | /** |
| 119 | * Is the serializer currently handling a html-pre tag? |
| 120 | * @var bool |
| 121 | */ |
| 122 | public $inHTMLPre = false; |
| 123 | |
| 124 | /** |
| 125 | * Is the serializer currently handling a tag that the PHP parser |
| 126 | * treats as a block tag? |
| 127 | * @var bool |
| 128 | */ |
| 129 | public $inPHPBlock = false; |
| 130 | |
| 131 | /** |
| 132 | * Is the serializer being invoked recursively to serialize a |
| 133 | * template-generated attribute (via `WSP.getAttributeValue`'s |
| 134 | * template handling). If so, we should suppress some |
| 135 | * serialization escapes, like autolink protection, since |
| 136 | * these are not valid for attribute values. |
| 137 | * @var bool |
| 138 | */ |
| 139 | public $inAttribute = false; |
| 140 | |
| 141 | /** |
| 142 | * Is the serializer currently processing a subtree that has been |
| 143 | * marked inserted compared to original content (ex: via VE / CX)? |
| 144 | * |
| 145 | * @var bool |
| 146 | */ |
| 147 | public $inInsertedContent; |
| 148 | |
| 149 | /** |
| 150 | * Did we introduce nowikis for indent-pre protection? |
| 151 | * If yes, we might run a post-pass to strip useless ones. |
| 152 | * @var bool |
| 153 | */ |
| 154 | public $hasIndentPreNowikis = false; |
| 155 | |
| 156 | /** |
| 157 | * Did we introduce nowikis to preserve quote semantics? |
| 158 | * If yes, we might run a post-pass to strip useless ones. |
| 159 | * @var bool |
| 160 | */ |
| 161 | public $hasQuoteNowikis = false; |
| 162 | |
| 163 | /** |
| 164 | * Did we introduce `<nowiki />`s? |
| 165 | * If yes, we do a postpass to remove unnecessary trailing ones. |
| 166 | * @var bool |
| 167 | */ |
| 168 | public $hasSelfClosingNowikis = false; |
| 169 | |
| 170 | /** |
| 171 | * Did we introduce nowikis around `=.*=` text? |
| 172 | * If yes, we do a postpass to remove unnecessary escapes. |
| 173 | * @var bool |
| 174 | */ |
| 175 | public $hasHeadingEscapes = false; |
| 176 | |
| 177 | /** |
| 178 | * Records the nesting level of wikitext tables |
| 179 | * @var int |
| 180 | */ |
| 181 | public $wikiTableNesting = 0; |
| 182 | |
| 183 | /** |
| 184 | * Stack of wikitext escaping handlers -- these handlers are responsible |
| 185 | * for smart escaping when the surrounding wikitext context is known. |
| 186 | * @var (callable|null)[] See {@link serializeChildren()} |
| 187 | */ |
| 188 | public $wteHandlerStack = []; |
| 189 | |
| 190 | /** |
| 191 | * This array is used by the wikitext escaping algorithm -- represents |
| 192 | * a "single line" of output wikitext as represented by a block node in |
| 193 | * the DOM. |
| 194 | * - firstNode (?Node): first DOM node processed on this line |
| 195 | * - text (string): output so far from all nodes on the current line |
| 196 | * - chunks (ConstrainedText[]): list of chunks comprising the current line |
| 197 | * @var stdClass |
| 198 | * XXX: replace with output buffering per line |
| 199 | * FIXME: could use a dedicated class |
| 200 | */ |
| 201 | public $currLine; |
| 202 | |
| 203 | /** |
| 204 | * Stack used to enforce single-line context |
| 205 | * @var SingleLineContext |
| 206 | */ |
| 207 | public $singleLineContext; |
| 208 | |
| 209 | /** |
| 210 | * Text to be emitted at the start of file, for redirects |
| 211 | * @var string|null |
| 212 | */ |
| 213 | public $redirectText = null; |
| 214 | |
| 215 | /** @var WikitextSerializer */ |
| 216 | public $serializer; |
| 217 | |
| 218 | /** @var ParsoidExtensionAPI */ |
| 219 | public $extApi; |
| 220 | |
| 221 | /** @var string The serialized output */ |
| 222 | public $out = ''; |
| 223 | |
| 224 | /** |
| 225 | * Are we in selective serialization mode? |
| 226 | * @see SelectiveSerializer |
| 227 | * @var bool |
| 228 | */ |
| 229 | public $selserMode; |
| 230 | |
| 231 | private ?SelectiveUpdateData $selserData; |
| 232 | |
| 233 | /** |
| 234 | * If in selser mode, while processing a node, do we know if |
| 235 | * its previous node has not been modified in an edit? |
| 236 | * @var bool |
| 237 | */ |
| 238 | public $prevNodeUnmodified; |
| 239 | |
| 240 | /** |
| 241 | * If in selser mode, while processing a node, do we know if |
| 242 | * it has not been modified in an edit? |
| 243 | * @var bool |
| 244 | */ |
| 245 | public $currNodeUnmodified; |
| 246 | |
| 247 | /** |
| 248 | * Should we run the wikitext escaping code on the wikitext chunk |
| 249 | * that will be emitted? |
| 250 | * @var bool |
| 251 | */ |
| 252 | public $needsEscaping = false; |
| 253 | |
| 254 | /** |
| 255 | * Used as fast patch for special protected characters in WikitextEscapeHandlers and |
| 256 | * comes from LanguageVariantHandler |
| 257 | * @var string|null |
| 258 | */ |
| 259 | public $protect; |
| 260 | |
| 261 | /** @var Separators */ |
| 262 | public $separators; |
| 263 | |
| 264 | /** @var Env */ |
| 265 | private $env; |
| 266 | |
| 267 | /** @var Element */ |
| 268 | public $currNode; |
| 269 | |
| 270 | /** @var Element */ |
| 271 | private $prevNode; |
| 272 | |
| 273 | /** @var array */ |
| 274 | public $openAnnotations; |
| 275 | |
| 276 | /** |
| 277 | * Log prefix to use in trace output |
| 278 | * @var string |
| 279 | */ |
| 280 | private $logPrefix = 'OUT:'; |
| 281 | |
| 282 | public bool $haveTrimmedWsDSR = false; |
| 283 | |
| 284 | /** |
| 285 | * @param WikitextSerializer $serializer |
| 286 | * @param array $options List of options for serialization: |
| 287 | * - onSOL: (bool) |
| 288 | * - inPHPBlock: (bool) |
| 289 | * - inAttribute: (bool) |
| 290 | * - protect: (string) |
| 291 | * - selserData: (SelectiveUpdateData) |
| 292 | */ |
| 293 | public function __construct( WikitextSerializer $serializer, array $options = [] ) { |
| 294 | $this->env = $serializer->env; |
| 295 | $this->serializer = $serializer; |
| 296 | $this->extApi = new ParsoidExtensionAPI( $this->env, [ 'html2wt' => [ 'state' => $this ] ] ); |
| 297 | $this->onSOL = $options['onSOL'] ?? $this->onSOL; |
| 298 | $this->inPHPBlock = $options['inPHPBlock'] ?? $this->inPHPBlock; |
| 299 | $this->inAttribute = $options['inAttribute'] ?? $this->inAttribute; |
| 300 | $this->protect = $options['protect'] ?? null; |
| 301 | $this->selserData = $options['selserData'] ?? null; |
| 302 | $this->resetCurrLine( null ); |
| 303 | $this->singleLineContext = new SingleLineContext(); |
| 304 | $this->resetSep(); |
| 305 | $this->haveTrimmedWsDSR = Semver::satisfies( $this->env->getInputContentVersion(), '>=2.1.1' ); |
| 306 | $this->separators = new Separators( $this->env, $this ); |
| 307 | } |
| 308 | |
| 309 | /** |
| 310 | * @note Porting note: this replaces direct access |
| 311 | * @return Env |
| 312 | */ |
| 313 | public function getEnv(): Env { |
| 314 | return $this->env; |
| 315 | } |
| 316 | |
| 317 | /** |
| 318 | * Initialize a few boolean flags based on serialization mode. |
| 319 | * FIXME: Ideally, this should be private. Requires shuffling around |
| 320 | * where SerializerState is constructed so that $selserMode is known |
| 321 | * at the time of construction. |
| 322 | * @private for use by WikitextSerializer only |
| 323 | * @param bool $selserMode Are we running selective serialization? |
| 324 | */ |
| 325 | public function initMode( bool $selserMode ): void { |
| 326 | $this->selserMode = $selserMode; |
| 327 | } |
| 328 | |
| 329 | /** |
| 330 | * Appends the separator source to the separator src buffer. |
| 331 | * Don't update $state->onSOL since this string hasn't been emitted yet. |
| 332 | * If content handlers change behavior based on whether this newline will |
| 333 | * be emitted or not, they should peek into this buffer (ex: see TDHandler |
| 334 | * and THHandler code). |
| 335 | * |
| 336 | * @param string $src |
| 337 | */ |
| 338 | public function appendSep( string $src ): void { |
| 339 | $this->sep->src = ( $this->sep->src ?? '' ) . $src; |
| 340 | } |
| 341 | |
| 342 | /** |
| 343 | * Cycle the state after processing a node. |
| 344 | * @param Node $node |
| 345 | */ |
| 346 | public function updateSep( Node $node ): void { |
| 347 | $this->sep->lastSourceNode = $node; |
| 348 | } |
| 349 | |
| 350 | private function resetSep() { |
| 351 | $this->sep = (object)[ |
| 352 | 'constraints' => null, |
| 353 | 'src' => null, |
| 354 | 'lastSourceNode' => null, |
| 355 | ]; |
| 356 | } |
| 357 | |
| 358 | /** |
| 359 | * Reset the current line state. |
| 360 | * @param ?Node $node |
| 361 | */ |
| 362 | private function resetCurrLine( ?Node $node ): void { |
| 363 | $this->currLine = (object)[ |
| 364 | 'text' => '', |
| 365 | 'chunks' => [], |
| 366 | 'firstNode' => $node |
| 367 | ]; |
| 368 | } |
| 369 | |
| 370 | /** |
| 371 | * Process and emit a line of ConstrainedText chunks, adjusting chunk boundaries as necessary. |
| 372 | * (Start of line and end of line are always safe for ConstrainedText chunks, so we don't need |
| 373 | * to buffer more than the last line.) |
| 374 | */ |
| 375 | private function flushLine(): void { |
| 376 | $this->out .= ConstrainedText::escapeLine( $this->currLine->chunks ); |
| 377 | $this->currLine->chunks = []; |
| 378 | } |
| 379 | |
| 380 | /** |
| 381 | * Extracts a subset of the page source bound by the supplied source range. |
| 382 | * @param SourceRange $sr |
| 383 | * @return string|null |
| 384 | */ |
| 385 | public function getOrigSrc( SourceRange $sr ): ?string { |
| 386 | Assert::invariant( $this->selserMode, 'SerializerState::$selserMode must be set' ); |
| 387 | if ( |
| 388 | $sr->start <= $sr->end && |
| 389 | // FIXME: Having a $start greater than the source length is |
| 390 | // probably a canary for corruption. Maybe we should be throwing |
| 391 | // here instead. See T240053. |
| 392 | // But, see comment in UnpackDOMFragments where we very very rarely |
| 393 | // can deliberately set DSR to point outside page source. |
| 394 | $sr->start <= strlen( $this->selserData->revText ) |
| 395 | ) { |
| 396 | // XXX should use $frame->getSrcText() like WTUtils::getWTSource |
| 397 | return $sr->substr( $this->selserData->revText ); |
| 398 | } else { |
| 399 | return null; |
| 400 | } |
| 401 | } |
| 402 | |
| 403 | /** |
| 404 | * Check the validity of a DSR in the context of the page source. |
| 405 | * |
| 406 | * Returns false if Utils::isValidDSR() would return false, but also |
| 407 | * returns false if the DSR offsets would create a bad UTF-8 string |
| 408 | * (ie, the start offsets don't point to a valid UTF-8 start character). |
| 409 | * @param ?DomSourceRange $dsr DSR source range values |
| 410 | * @param bool $all Also check the widths of the container tag |
| 411 | * @return bool |
| 412 | */ |
| 413 | public function isValidDSR( ?DomSourceRange $dsr, bool $all = false ) { |
| 414 | if ( !Utils::isValidDSR( $dsr, $all ) ) { |
| 415 | return false; |
| 416 | } |
| 417 | if ( !( $dsr->start <= $dsr->end && |
| 418 | $dsr->end <= strlen( $this->selserData->revText ) ) ) { |
| 419 | return false; |
| 420 | } |
| 421 | // check the UTF-8 ranges. |
| 422 | $src = $this->selserData->revText; |
| 423 | $check = static function ( $start, $end ) use ( $src ) { |
| 424 | if ( $start === $end ) { |
| 425 | // zero-length string is always ok |
| 426 | return true; |
| 427 | } |
| 428 | $firstChar = ord( $src[$start] ); |
| 429 | if ( ( $firstChar & 0xC0 ) === 0x80 ) { |
| 430 | return false; // bad UTF-8 at start of string |
| 431 | } |
| 432 | $i = 0; |
| 433 | // This next loop won't pass $start because we've already |
| 434 | // asserted that the first character isn't 10xx xxxx |
| 435 | do { |
| 436 | $i--; |
| 437 | if ( $i <= -5 ) { |
| 438 | return false; // bad UTF-8 at end of string (>4 byte sequence) |
| 439 | } |
| 440 | $lastChar = ord( $src[$end + $i] ); |
| 441 | } while ( ( $lastChar & 0xC0 ) === 0x80 ); |
| 442 | if ( ( $lastChar & 0x80 ) === 0 ) { |
| 443 | return $i === -1; |
| 444 | } elseif ( ( $lastChar & 0xE0 ) === 0xC0 ) { |
| 445 | return $i === -2; |
| 446 | } elseif ( ( $lastChar & 0xF0 ) === 0xE0 ) { |
| 447 | return $i === -3; |
| 448 | } elseif ( ( $lastChar & 0xF8 ) === 0xF0 ) { |
| 449 | return $i === -4; |
| 450 | } else { |
| 451 | return false; |
| 452 | } |
| 453 | }; |
| 454 | if ( !$all ) { |
| 455 | return $check( $dsr->start, $dsr->end ); |
| 456 | } |
| 457 | // Check each inner range. |
| 458 | $openEnd = $dsr->start + $dsr->openWidth; |
| 459 | if ( $openEnd > $dsr->end ) { |
| 460 | return false; |
| 461 | } |
| 462 | if ( !$check( $dsr->start, $openEnd ) ) { |
| 463 | return false; |
| 464 | } |
| 465 | $closeStart = $dsr->end - $dsr->closeWidth; |
| 466 | if ( $dsr->start > $closeStart ) { |
| 467 | return false; |
| 468 | } |
| 469 | if ( !$check( $closeStart, $dsr->end ) ) { |
| 470 | return false; |
| 471 | } |
| 472 | if ( $openEnd > $closeStart ) { |
| 473 | return false; |
| 474 | } |
| 475 | if ( !$check( $openEnd, $closeStart ) ) { |
| 476 | return false; |
| 477 | } |
| 478 | return true; |
| 479 | } |
| 480 | |
| 481 | /** |
| 482 | * Like it says on the tin. |
| 483 | * @param Node $node |
| 484 | */ |
| 485 | public function updateModificationFlags( Node $node ): void { |
| 486 | $this->prevNodeUnmodified = $this->currNodeUnmodified; |
| 487 | $this->currNodeUnmodified = false; |
| 488 | $this->prevNode = $node; |
| 489 | } |
| 490 | |
| 491 | /** |
| 492 | * Separators put us in SOL state. |
| 493 | * @param string $sep |
| 494 | * @param Node $node |
| 495 | */ |
| 496 | private function sepIntroducedSOL( string $sep, Node $node ): void { |
| 497 | // Don't get tripped by newlines in comments! Be wary of nowikis added |
| 498 | // by makeSepIndentPreSafe on the last line. |
| 499 | $nonCommentSep = preg_replace( Utils::COMMENT_REGEXP, '', $sep ); |
| 500 | if ( substr( $nonCommentSep, -1 ) === "\n" ) { |
| 501 | $this->onSOL = true; |
| 502 | } |
| 503 | |
| 504 | if ( str_contains( $nonCommentSep, "\n" ) ) { |
| 505 | // process escapes in our full line |
| 506 | $this->flushLine(); |
| 507 | $this->resetCurrLine( $node ); |
| 508 | } |
| 509 | } |
| 510 | |
| 511 | /** |
| 512 | * Accumulates chunks on the current line. |
| 513 | * @param ConstrainedText $chunk |
| 514 | * @param string $logPrefix |
| 515 | */ |
| 516 | private function pushToCurrLine( ConstrainedText $chunk, string $logPrefix ) { |
| 517 | // Emitting text that has not been escaped |
| 518 | $this->currLine->text .= $chunk->text; |
| 519 | |
| 520 | $this->currLine->chunks[] = $chunk; |
| 521 | |
| 522 | $this->env->trace( |
| 523 | $this->serializer->logType, |
| 524 | '--->', $logPrefix, static fn () =>PHPUtils::jsonEncode( $chunk->text ) |
| 525 | ); |
| 526 | } |
| 527 | |
| 528 | /** |
| 529 | * Pushes the separator to the current line and resets the separator state. |
| 530 | * @param string $sep |
| 531 | * @param Node $node |
| 532 | * @param string $debugPrefix |
| 533 | */ |
| 534 | private function emitSep( string $sep, Node $node, string $debugPrefix ): void { |
| 535 | $sep = ConstrainedText::cast( $sep, $node ); |
| 536 | |
| 537 | // Replace newlines if we're in a single-line context |
| 538 | if ( $this->singleLineContext->enforced() ) { |
| 539 | $sep->text = preg_replace( '/\n/', ' ', $sep->text ); |
| 540 | } |
| 541 | |
| 542 | $this->pushToCurrLine( $sep, $debugPrefix ); |
| 543 | $this->sepIntroducedSOL( $sep->text, $node ); |
| 544 | |
| 545 | // Reset separator state |
| 546 | $this->resetSep(); |
| 547 | $this->updateSep( $node ); |
| 548 | } |
| 549 | |
| 550 | /** |
| 551 | * Determines if we can use the original separator for this node or if we |
| 552 | * need to build one based on its constraints, and then emits it. |
| 553 | * |
| 554 | * @param Node $node |
| 555 | */ |
| 556 | private function emitSepForNode( Node $node ): void { |
| 557 | /* When block nodes are deleted, the deletion affects whether unmodified |
| 558 | * newline separators between a pair of unmodified P tags can be reused. |
| 559 | * |
| 560 | * Example: |
| 561 | * ``` |
| 562 | * Original WT : "<div>x</div>foo\nbar" |
| 563 | * Original HTML: "<div>x</div><p>foo</p>\n<p>bar</p>" |
| 564 | * Edited HTML : "<p>foo</p>\n<p>bar</p>" |
| 565 | * Annotated DOM: "<mw:DiffMarker is-block><p>foo</p>\n<p>bar</p>" |
| 566 | * Expected WT : "foo\n\nbar" |
| 567 | * ``` |
| 568 | * |
| 569 | * Note the additional newline between "foo" and "bar" even though originally, |
| 570 | * there was just a single newline. |
| 571 | * |
| 572 | * So, even though the two P tags and the separator between them is |
| 573 | * unmodified, it is insufficient to rely on just that. We have to look at |
| 574 | * what has happened on the two wikitext lines onto which the two P tags |
| 575 | * will get serialized. |
| 576 | * |
| 577 | * Now, if you check the code for `nextToDeletedBlockNodeInWT`, that code is |
| 578 | * not really looking at ALL the nodes before/after the nodes that could |
| 579 | * serialize onto the wikitext lines. It is looking at the immediately |
| 580 | * adjacent nodes, i.e. it is not necessary to look if a block-tag was |
| 581 | * deleted 2 or 5 siblings away. If we had to actually examine all of those |
| 582 | * nodes, this would get very complex, and it would be much simpler to just |
| 583 | * discard the original separators => potentially lots of dirty diffs. |
| 584 | * |
| 585 | * To understand why it is sufficient (for correctness) to examine just |
| 586 | * the immediately adjacent nodes, let us look at an additional example. |
| 587 | * ``` |
| 588 | * Original WT : "a<div>b</div>c<div>d</div>e\nf" |
| 589 | * Original HTML: "<p>a</p><div>b</div><p>c</p><div>d</div><p>e</p>\n<p>f</p>" |
| 590 | * ``` |
| 591 | * Note how `<block>` tags and `<p>` tags interleave in the HTML. This would be |
| 592 | * the case always no matter how much inline content showed up between the |
| 593 | * block tags in wikitext. If the b-`<div>` was deleted, we don't care |
| 594 | * about it, since we still have the d-`<div>` before the P tag that preserves |
| 595 | * the correctness of the single `"\n"` separator. If the d-`<div>` was deleted, |
| 596 | * we conservatively ignore the original separator and let normal P-P constraints |
| 597 | * take care of it. At worst, we might generate a dirty diff in this scenario. */ |
| 598 | $origSepNeeded = ( $node !== $this->sep->lastSourceNode ); |
| 599 | $origSepUsable = $origSepNeeded && |
| 600 | ( |
| 601 | // first-content-node of <body> ($this->prevNode) |
| 602 | ( |
| 603 | DOMUtils::isBody( $this->prevNode ) && |
| 604 | $node->parentNode === $this->prevNode |
| 605 | ) |
| 606 | || |
| 607 | // unmodified sibling node of $this->prevNode |
| 608 | ( |
| 609 | $this->prevNode && $this->prevNodeUnmodified && |
| 610 | $node->parentNode === $this->prevNode->parentNode && |
| 611 | !WTSUtils::nextToDeletedBlockNodeInWT( $this->prevNode, true ) |
| 612 | ) |
| 613 | ) && |
| 614 | $this->currNodeUnmodified && !WTSUtils::nextToDeletedBlockNodeInWT( $node, false ); |
| 615 | |
| 616 | $origSep = null; |
| 617 | if ( $origSepUsable ) { |
| 618 | if ( $this->prevNode instanceof Element && $node instanceof Element ) { |
| 619 | '@phan-var Element $node';/** @var Element $node */ |
| 620 | if ( DOMUtils::isBody( $this->prevNode ) ) { |
| 621 | // <body> won't have DSR in body_only scenarios |
| 622 | $sr = new SourceRange( 0, 0 ); |
| 623 | } else { |
| 624 | $sr = DOMDataUtils::getDataParsoid( $this->prevNode )->dsr; |
| 625 | } |
| 626 | $sr = $sr->to( DOMDataUtils::getDataParsoid( $node )->dsr ); |
| 627 | $origSep = $this->getOrigSrc( $sr ); |
| 628 | } elseif ( $this->sep->src && WTSUtils::isValidSep( $this->sep->src ) ) { |
| 629 | // We don't know where '$this->sep->src' comes from. So, reuse it |
| 630 | // only if it is a valid separator string. |
| 631 | $origSep = $this->sep->src; |
| 632 | } |
| 633 | } |
| 634 | |
| 635 | if ( $origSep !== null ) { |
| 636 | $this->emitSep( $origSep, $node, 'ORIG-SEP:' ); |
| 637 | } else { |
| 638 | $sep = $this->separators->buildSep( $node ); |
| 639 | $this->emitSep( $sep ?? '', $node, 'SEP:' ); |
| 640 | } |
| 641 | } |
| 642 | |
| 643 | /** |
| 644 | * Recovers and emits any trimmed whitespace for $node |
| 645 | * @param Node $node |
| 646 | * @param bool $leading |
| 647 | * if true, trimmed leading whitespace is emitted |
| 648 | * if false, trimmed trailing whitespace is emitted |
| 649 | * @return string|null |
| 650 | */ |
| 651 | public function recoverTrimmedWhitespace( Node $node, bool $leading ): ?string { |
| 652 | $sep = $this->separators->recoverTrimmedWhitespace( $node, $leading ); |
| 653 | $this->env->trace( |
| 654 | $this->serializer->logType, |
| 655 | '--->', "TRIMMED-SEP:", static fn () => PHPUtils::jsonEncode( $sep ) |
| 656 | ); |
| 657 | return $sep; |
| 658 | } |
| 659 | |
| 660 | /** |
| 661 | * Pushes the chunk to the current line. |
| 662 | * @param ConstrainedText|string $res |
| 663 | * @param Node $node |
| 664 | */ |
| 665 | public function emitChunk( $res, Node $node ): void { |
| 666 | $res = ConstrainedText::cast( $res, $node ); |
| 667 | |
| 668 | // Replace newlines if we're in a single-line context |
| 669 | if ( $this->singleLineContext->enforced() ) { |
| 670 | $res->text = str_replace( "\n", ' ', $res->text ); |
| 671 | } |
| 672 | |
| 673 | // Emit separator first |
| 674 | if ( $res->noSep ) { |
| 675 | /* skip separators for internal tokens from SelSer */ |
| 676 | if ( $this->onSOL ) { |
| 677 | // process escapes in our full line |
| 678 | $this->flushLine(); |
| 679 | $this->resetCurrLine( $node ); |
| 680 | } |
| 681 | } else { |
| 682 | $this->emitSepForNode( $node ); |
| 683 | } |
| 684 | |
| 685 | $needsEscaping = $this->needsEscaping; |
| 686 | if ( $needsEscaping && $this->currNode instanceof Text ) { |
| 687 | $needsEscaping = !$this->inHTMLPre && ( $this->onSOL || !$this->currNodeUnmodified ); |
| 688 | } |
| 689 | |
| 690 | // Escape 'res' if necessary |
| 691 | if ( $needsEscaping ) { |
| 692 | $res = new ConstrainedText( [ |
| 693 | 'text' => $this->serializer->escapeWikitext( $this, $res->text, [ |
| 694 | 'node' => $node, |
| 695 | 'isLastChild' => DiffDOMUtils::nextNonDeletedSibling( $node ) === null, |
| 696 | ] ), |
| 697 | 'prefix' => $res->prefix, |
| 698 | 'suffix' => $res->suffix, |
| 699 | 'node' => $res->node, |
| 700 | ] ); |
| 701 | $this->needsEscaping = false; |
| 702 | } else { |
| 703 | // If 'res' is coming from selser and the current node is a paragraph tag, |
| 704 | // check if 'res' might need some leading chars nowiki-escaped before being output. |
| 705 | // Because of block-tag p-wrapping behavior, sol-sensitive characters that used to |
| 706 | // be in non-sol positions, but yet wrapped in p-tags, could end up in sol-position |
| 707 | // if those block tags get deleted during edits. |
| 708 | // |
| 709 | // Ex: a<div>foo</div>*b |
| 710 | // -- wt2html --> <p>a</p><div>foo</div><p>*b</p> |
| 711 | // -- EDIT --> <p>a</p><p>*b</p> |
| 712 | // -- html2wt --> a\n\n<nowiki>*</nowiki>b |
| 713 | // |
| 714 | // In this scenario, both <p>a</p> and <p>*b</p> |
| 715 | // will be marked unmodified and will be processed below. |
| 716 | if ( $this->selserMode |
| 717 | && $this->onSOL |
| 718 | && $this->currNodeUnmodified |
| 719 | // 'node' came from original Parsoid HTML unmodified. So, if its content |
| 720 | // needs nowiki-escaping, we know that the reason it didn't parse into |
| 721 | // lists/headings/whatever is because it didn't occur at the start of the |
| 722 | // line => it had a block-tag in the original wikitext. So if the previous |
| 723 | // node was also unmodified (and since it also came from original Parsoid |
| 724 | // HTML), we can safely infer that it couldn't have been an inline node or |
| 725 | // a P-tag (if it were, the p-wrapping code would have swallowed that content |
| 726 | // into 'node'). So, it would have to be some sort of block tag => this.onSOL |
| 727 | // couldn't have been true (because we could have serialized 'node' on the |
| 728 | // same line as the block tag) => we can save some effort by eliminating |
| 729 | // scenarios where 'this.prevNodeUnmodified' is true. |
| 730 | && !$this->prevNodeUnmodified |
| 731 | && DOMCompat::nodeName( $node ) === 'p' && !WTUtils::isLiteralHTMLNode( $node ) |
| 732 | ) { |
| 733 | $pChild = DiffDOMUtils::firstNonSepChild( $node ); |
| 734 | // If a text node, we have to make sure that the text doesn't |
| 735 | // get reparsed as non-text in the wt2html pipeline. |
| 736 | if ( $pChild instanceof Text ) { |
| 737 | $match = $res->matches( $this->solWikitextRegexp(), $this->env ); |
| 738 | if ( $match && isset( $match[2] ) ) { |
| 739 | if ( preg_match( '/^([\*#:;]|{\||.*=$)/D', $match[2] ) |
| 740 | // ! and | chars are harmless outside tables |
| 741 | || ( strspn( $match[2], '|!' ) && $this->wikiTableNesting > 0 ) |
| 742 | // indent-pres are suppressed inside <blockquote> |
| 743 | || ( preg_match( '/^ \S/', $match[2] ) |
| 744 | && !DOMUtils::hasNameOrHasAncestorOfName( $node, 'blockquote' ) ) |
| 745 | ) { |
| 746 | $res = ConstrainedText::cast( ( $match[1] ?: '' ) |
| 747 | . '<nowiki>' . substr( $match[2], 0, 1 ) . '</nowiki>' |
| 748 | . substr( $match[2], 1 ), $node ); |
| 749 | } |
| 750 | } |
| 751 | } |
| 752 | } |
| 753 | } |
| 754 | |
| 755 | // Output res |
| 756 | $this->pushToCurrLine( $res, $this->logPrefix ); |
| 757 | |
| 758 | // Update sol flag. Test for newlines followed by optional includeonly or comments |
| 759 | if ( !$res->matches( $this->solRegexp(), $this->env ) ) { |
| 760 | $this->onSOL = false; |
| 761 | } |
| 762 | |
| 763 | // We've emitted something, so we're no longer at the start of output (SOO). |
| 764 | $this->atStartOfOutput = false; |
| 765 | } |
| 766 | |
| 767 | /** |
| 768 | * Serializes every child of a DOM node through the shared serializer state. |
| 769 | * DOM-based handlers usually call this to keep descending into their children. |
| 770 | * @param Element|DocumentFragment $node Parent whose children get serialized |
| 771 | * @param ?callable $wtEscaper ( $state, $text, $opts ); kept on wteHandlerStack during the walk |
| 772 | * PORT-FIXME document better; should this be done via WikitextEscapeHandlers somehow? |
| 773 | * @param ?Node $firstChild Child to begin with; falls back to $node->firstChild |
| 774 | */ |
| 775 | public function serializeChildren( |
| 776 | Node $node, ?callable $wtEscaper = null, ?Node $firstChild = null |
| 777 | ): void { |
| 778 | // SSS FIXME: Unsure if this is the right thing always |
| 779 | $pushedEscaper = (bool)$wtEscaper; |
| 780 | if ( $pushedEscaper ) { |
| 781 | $this->wteHandlerStack[] = $wtEscaper; |
| 782 | } |
| 783 | |
| 784 | // serializeNode() returns the next child to process; the walk ends on null. |
| 785 | for ( $next = $firstChild ?: $node->firstChild; $next !== null; ) { |
| 786 | $next = $this->serializer->serializeNode( $next ); |
| 787 | } |
| 788 | |
| 789 | if ( $pushedEscaper ) { |
| 790 | array_pop( $this->wteHandlerStack ); |
| 791 | } |
| 792 | |
| 793 | // Children were serialized explicitly, which only happens |
| 794 | // while processing a modified node. |
| 795 | $this->currNodeUnmodified = false; |
| 796 | } |
| 797 | |
| 798 | /** |
| 799 | * Abstracts some steps taken in `serializeChildrenToString` and `serializeDOM`: |
| 800 | * prepares sep / modification / current-line state for $node, serializes its children, emits the trailing sep and flushes. |
| 801 | * @param Element|DocumentFragment $node Node whose children are serialized |
| 802 | * @param ?callable $wtEscaper See {@link serializeChildren()} |
| 803 | * @internal For use by WikitextSerializer only |
| 804 | */ |
| 805 | public function kickOffSerialize( |
| 806 | Node $node, ?callable $wtEscaper = null |
| 807 | ): void { |
| 808 | $this->updateSep( $node ); |
| 809 | $this->currNodeUnmodified = false; |
| 810 | $this->updateModificationFlags( $node ); |
| 811 | $this->resetCurrLine( $node->firstChild ); |
| 812 | $this->serializeChildren( $node, $wtEscaper ); |
| 813 | // Emit child-parent seps. |
| 814 | $this->emitSepForNode( $node ); |
| 815 | // We've reached EOF, flush the remaining buffered text. |
| 816 | $this->flushLine(); |
| 817 | } |
| 818 | |
| 819 | /** |
| 820 | * Serialize children to a string |
| 821 | * |
| 822 | * FIXME(arlorla): Shouldn't affect the separator state, but accidents have |
| 823 | * been known to happen. T109793 suggests using its own wts / state. |
| 824 | * |
| 825 | * @param Element|DocumentFragment $node |
| 826 | * @param ?callable $wtEscaper See {@link serializeChildren()} |
| 827 | * @param string $inState Name of the state flag raised while serializing; restored to its prior value afterwards |
| 828 | * @return string Output accumulated while serializing the children |
| 829 | */ |
| 830 | private function serializeChildrenToString( |
| 831 | Node $node, ?callable $wtEscaper, string $inState |
| 832 | ): string { |
| 833 | $states = [ 'inLink', 'inCaption', 'inIndentPre', 'inHTMLPre', 'inPHPBlock', 'inAttribute' ]; |
| 834 | Assert::parameter( in_array( $inState, $states, true ), '$inState', 'Must be one of: ' |
| 835 | . implode( ', ', $states ) ); |
| 836 | // FIXME: Make sure that the separators emitted here conform to the |
| 837 | // syntactic constraints of syntactic context. |
| 838 | $oldSep = $this->sep; |
| 839 | $oldSOL = $this->onSOL; |
| 840 | $oldOut = $this->out; |
| 841 | $oldStart = $this->atStartOfOutput; |
| 842 | $oldCurrLine = $this->currLine; |
| 843 | $oldLogPrefix = $this->logPrefix; |
| 844 | $oldInState = $this->$inState; // may already be set by an enclosing call |
| 845 | // Modification flags |
| 846 | $oldPrevNodeUnmodified = $this->prevNodeUnmodified; |
| 847 | $oldCurrNodeUnmodified = $this->currNodeUnmodified; |
| 848 | $oldPrevNode = $this->prevNode; |
| 849 | |
| 850 | $this->out = ''; |
| 851 | $this->logPrefix = 'OUT(C):'; |
| 852 | $this->resetSep(); |
| 853 | $this->onSOL = false; |
| 854 | $this->atStartOfOutput = false; |
| 855 | $this->$inState = true; |
| 856 | $this->singleLineContext->disable(); |
| 857 | $this->kickOffSerialize( $node, $wtEscaper ); |
| 858 | $this->singleLineContext->pop(); |
| 859 | |
| 860 | // restore the state; $inState returns to its previous value rather than false |
| 861 | $bits = $this->out; |
| 862 | $this->out = $oldOut; |
| 863 | $this->$inState = $oldInState; |
| 864 | $this->sep = $oldSep; |
| 865 | $this->onSOL = $oldSOL; |
| 866 | $this->atStartOfOutput = $oldStart; |
| 867 | $this->currLine = $oldCurrLine; |
| 868 | $this->logPrefix = $oldLogPrefix; |
| 869 | // Modification flags |
| 870 | $this->prevNodeUnmodified = $oldPrevNodeUnmodified; |
| 871 | $this->currNodeUnmodified = $oldCurrNodeUnmodified; |
| 872 | $this->prevNode = $oldPrevNode; |
| 873 | return $bits; |
| 874 | } |
| 875 | |
| 876 | /** |
| 877 | * Serialize children of a link to a string, with the `inLink` state flag set meanwhile |
| 878 | * @param Element|DocumentFragment $node Node whose children are serialized |
| 879 | * @param ?callable $wtEscaper See {@link serializeChildren()} |
| 880 | * @return string Result of serializeChildrenToString() for these children |
| 881 | */ |
| 882 | public function serializeLinkChildrenToString( |
| 883 | Node $node, ?callable $wtEscaper = null |
| 884 | ): string { |
| 885 | return $this->serializeChildrenToString( $node, $wtEscaper, 'inLink' ); |
| 886 | } |
| 887 | |
| 888 | /** |
| 889 | * Serialize children of a caption to a string, with the `inCaption` state flag set meanwhile |
| 890 | * @param Element|DocumentFragment $node Node whose children are serialized |
| 891 | * @param ?callable $wtEscaper See {@link serializeChildren()} |
| 892 | * @return string Result of serializeChildrenToString() for these children |
| 893 | */ |
| 894 | public function serializeCaptionChildrenToString( |
| 895 | Node $node, ?callable $wtEscaper = null |
| 896 | ): string { |
| 897 | return $this->serializeChildrenToString( $node, $wtEscaper, 'inCaption' ); |
| 898 | } |
| 899 | |
| 900 | /** |
| 901 | * Serialize children of an indent-pre to a string, with the `inIndentPre` state flag set meanwhile |
| 902 | * @param Element|DocumentFragment $node Node whose children are serialized |
| 903 | * @param ?callable $wtEscaper See {@link serializeChildren()} |
| 904 | * @return string Result of serializeChildrenToString() for these children |
| 905 | */ |
| 906 | public function serializeIndentPreChildrenToString( |
| 907 | Node $node, ?callable $wtEscaper = null |
| 908 | ): string { |
| 909 | return $this->serializeChildrenToString( $node, $wtEscaper, 'inIndentPre' ); |
| 910 | } |
| 911 | |
| 912 | /** |
| 913 | * Record an open annotation range and whether it has been extended; a repeated $ann overwrites the stored flag. |
| 914 | * @param string $ann Key under which the open range is recorded |
| 915 | * @param bool $extended Whether the range has been extended |
| 916 | */ |
| 917 | public function openAnnotationRange( string $ann, bool $extended ): void { |
| 918 | $this->openAnnotations[$ann] = $extended; |
| 919 | } |
| 920 | |
| 921 | /** |
| 922 | * Removes the annotation range recorded for $ann from the open ranges; a no-op if none is recorded. |
| 923 | * @param string $ann Key of the range to close |
| 924 | */ |
| 925 | public function closeAnnotationRange( string $ann ): void { |
| 926 | unset( $this->openAnnotations[$ann] ); |
| 927 | } |
| 928 | |
| 929 | } |