Code Coverage |
||||||||||
Lines |
Functions and Methods |
Classes and Traits |
||||||||
Total | |
23.29% |
58 / 249 |
|
37.04% |
10 / 27 |
CRAP | |
0.00% |
0 / 1 |
SerializerState | |
23.29% |
58 / 249 |
|
37.04% |
10 / 27 |
4082.02 | |
0.00% |
0 / 1 |
solWikitextRegexp | |
0.00% |
0 / 11 |
|
0.00% |
0 / 1 |
6 | |||
solRegexp | |
0.00% |
0 / 8 |
|
0.00% |
0 / 1 |
6 | |||
__construct | |
100.00% |
13 / 13 |
|
100.00% |
1 / 1 |
1 | |||
getEnv | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
initMode | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
appendSep | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
updateSep | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
resetSep | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
resetCurrLine | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
flushLine | |
0.00% |
0 / 2 |
|
0.00% |
0 / 1 |
2 | |||
getOrigSrc | |
80.00% |
4 / 5 |
|
0.00% |
0 / 1 |
3.07 | |||
isValidDSR | |
0.00% |
0 / 45 |
|
0.00% |
0 / 1 |
342 | |||
updateModificationFlags | |
100.00% |
3 / 3 |
|
100.00% |
1 / 1 |
1 | |||
sepIntroducedSOL | |
0.00% |
0 / 6 |
|
0.00% |
0 / 1 |
12 | |||
pushToCurrLine | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
emitSep | |
0.00% |
0 / 7 |
|
0.00% |
0 / 1 |
6 | |||
emitSepForNode | |
0.00% |
0 / 30 |
|
0.00% |
0 / 1 |
272 | |||
recoverTrimmedWhitespace | |
0.00% |
0 / 5 |
|
0.00% |
0 / 1 |
2 | |||
emitChunk | |
40.48% |
17 / 42 |
|
0.00% |
0 / 1 |
156.81 | |||
serializeChildren | |
100.00% |
8 / 8 |
|
100.00% |
1 / 1 |
5 | |||
kickOffSerialize | |
100.00% |
7 / 7 |
|
100.00% |
1 / 1 |
1 | |||
serializeChildrenToString | |
0.00% |
0 / 33 |
|
0.00% |
0 / 1 |
2 | |||
serializeLinkChildrenToString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
serializeCaptionChildrenToString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
serializeIndentPreChildrenToString | |
100.00% |
1 / 1 |
|
100.00% |
1 / 1 |
1 | |||
openAnnotationRange | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 | |||
closeAnnotationRange | |
0.00% |
0 / 1 |
|
0.00% |
0 / 1 |
2 |
1 | <?php |
2 | declare( strict_types = 1 ); |
3 | |
4 | namespace Wikimedia\Parsoid\Html2Wt; |
5 | |
6 | use Composer\Semver\Semver; |
7 | use stdClass; |
8 | use Wikimedia\Assert\Assert; |
9 | use Wikimedia\Parsoid\Config\Env; |
10 | use Wikimedia\Parsoid\Core\DomSourceRange; |
11 | use Wikimedia\Parsoid\Core\SelectiveUpdateData; |
12 | use Wikimedia\Parsoid\DOM\DocumentFragment; |
13 | use Wikimedia\Parsoid\DOM\Element; |
14 | use Wikimedia\Parsoid\DOM\Node; |
15 | use Wikimedia\Parsoid\DOM\Text; |
16 | use Wikimedia\Parsoid\Ext\ParsoidExtensionAPI; |
17 | use Wikimedia\Parsoid\Html2Wt\ConstrainedText\ConstrainedText; |
18 | use Wikimedia\Parsoid\Tokens\SourceRange; |
19 | use Wikimedia\Parsoid\Utils\DiffDOMUtils; |
20 | use Wikimedia\Parsoid\Utils\DOMCompat; |
21 | use Wikimedia\Parsoid\Utils\DOMDataUtils; |
22 | use Wikimedia\Parsoid\Utils\DOMUtils; |
23 | use Wikimedia\Parsoid\Utils\PHPUtils; |
24 | use Wikimedia\Parsoid\Utils\Utils; |
25 | use Wikimedia\Parsoid\Utils\WTUtils; |
26 | |
27 | /** |
28 | * State object for the wikitext serializers. |
29 | */ |
30 | class SerializerState { |
31 | |
/**
 * Build (once) the regexp used to detect wikimarkup that has special meaning
 * at the start of a line and really does sit at the start of the line, modulo
 * comments and other ignored (SOL-transparent) elements.
 *
 * Capture group 1 is the SOL-transparent prefix; capture group 2 is the rest
 * of the line, starting with one of the SOL-sensitive characters
 * (space, `*`, `#`, `:`, `;`, `{`, `|`, `!`, `=`).
 *
 * The pattern is memoized in a function-level static, so it is derived from
 * the site config of whichever instance calls this first.
 *
 * @return string
 */
private function solWikitextRegexp(): string {
	static $cached = null;
	if ( $cached !== null ) {
		return $cached;
	}

	// Strip the '@' delimiters so the pattern can be embedded in a larger one.
	$transparent = PHPUtils::reStrip(
		$this->env->getSiteConfig()->solTransparentWikitextNoWsRegexp( true ),
		'@'
	);
	$cached = '@^(' . $transparent . ')([\ \*#:;{\|!=].*)$@D';

	return $cached;
}
53 | |
/**
 * Build (once) the regexp used to check whether a string ends at the start of
 * a line, modulo comments and other ignored (SOL-transparent) elements.
 *
 * The pattern is memoized in a function-level static, so it is derived from
 * the site config of whichever instance calls this first.
 *
 * @return string
 */
private function solRegexp(): string {
	static $cached = null;
	if ( $cached !== null ) {
		return $cached;
	}

	// Strip the '@' delimiters so the pattern can be embedded in a larger one.
	$transparent = PHPUtils::reStrip(
		$this->env->getSiteConfig()->solTransparentWikitextNoWsRegexp( true ),
		'@'
	);
	$cached = '@(^|\n)' . $transparent . '$@D';

	return $cached;
}
71 | |
72 | /** |
73 | * Separator information: |
74 | * - constraints (array<array|int>|null): min/max number of newlines |
75 | * - src (string|null): collected separator text from DOM text/comment nodes |
76 | * - lastSourceNode (?Node): Seems to be bookkeeping to make sure we don't reuse |
77 | * original separators when `emitChunk` is called |
78 | * consecutively on the same node. However, it also |
79 | * differs from `state.prevNode` in that it only gets |
80 | * updated when a node calls `emitChunk` so that nodes |
81 | * serializing `justChildren` don't mix up `buildSep`. |
82 | * FIXME: could use a dedicated class |
83 | * @var stdClass |
84 | */ |
85 | public $sep; |
86 | |
87 | /** |
88 | * Is the serializer at the start of a new wikitext line? |
89 | * @var bool |
90 | */ |
91 | public $onSOL = true; |
92 | |
93 | /** |
94 | * True when wts kicks off, false after the first char has been output |
95 | * SSS FIXME: Can this be done away with in some way? |
96 | * @var bool |
97 | */ |
98 | public $atStartOfOutput = true; |
99 | |
100 | /** |
101 | * Is the serializer currently handling link content (children of `<a>`)? |
102 | * @var bool |
103 | */ |
104 | public $inLink = false; |
105 | |
106 | /** |
107 | * Is the serializer currently handling caption content? |
108 | * @var bool |
109 | */ |
110 | public $inCaption = false; |
111 | |
112 | /** |
113 | * Is the serializer currently handling an indent-pre tag? |
114 | * @var bool |
115 | */ |
116 | public $inIndentPre = false; |
117 | |
118 | /** |
119 | * Is the serializer currently handling a html-pre tag? |
120 | * @var bool |
121 | */ |
122 | public $inHTMLPre = false; |
123 | |
124 | /** |
125 | * Is the serializer currently handling a tag that the PHP parser |
126 | * treats as a block tag? |
127 | * @var bool |
128 | */ |
129 | public $inPHPBlock = false; |
130 | |
131 | /** |
132 | * Is the serializer being invoked recursively to serialize a |
133 | * template-generated attribute (via `WSP.getAttributeValue`'s |
134 | * template handling). If so, we should suppress some |
135 | * serialization escapes, like autolink protection, since |
136 | * these are not valid for attribute values. |
137 | * @var bool |
138 | */ |
139 | public $inAttribute = false; |
140 | |
141 | /** |
142 | * Is the serializer currently processing a subtree that has been |
143 | * marked inserted compared to original content (ex: via VE / CX)? |
144 | * |
145 | * @var bool |
146 | */ |
147 | public $inInsertedContent; |
148 | |
149 | /** |
150 | * Did we introduce nowikis for indent-pre protection? |
151 | * If yes, we might run a post-pass to strip useless ones. |
152 | * @var bool |
153 | */ |
154 | public $hasIndentPreNowikis = false; |
155 | |
156 | /** |
157 | * Did we introduce nowikis to preserve quote semantics? |
158 | * If yes, we might run a post-pass to strip useless ones. |
159 | * @var bool |
160 | */ |
161 | public $hasQuoteNowikis = false; |
162 | |
163 | /** |
164 | * Did we introduce `<nowiki />`s? |
165 | * If yes, we do a postpass to remove unnecessary trailing ones. |
166 | * @var bool |
167 | */ |
168 | public $hasSelfClosingNowikis = false; |
169 | |
170 | /** |
171 | * Did we introduce nowikis around `=.*=` text? |
172 | * If yes, we do a postpass to remove unnecessary escapes. |
173 | * @var bool |
174 | */ |
175 | public $hasHeadingEscapes = false; |
176 | |
177 | /** |
178 | * Records the nesting level of wikitext tables |
179 | * @var int |
180 | */ |
181 | public $wikiTableNesting = 0; |
182 | |
183 | /** |
184 | * Stack of wikitext escaping handlers -- these handlers are responsible |
185 | * for smart escaping when the surrounding wikitext context is known. |
186 | * @var callable[] See {@link serializeChildren()} |
187 | */ |
188 | public $wteHandlerStack = []; |
189 | |
190 | /** |
191 | * This array is used by the wikitext escaping algorithm -- represents |
192 | * a "single line" of output wikitext as represented by a block node in |
193 | * the DOM. |
194 | * - firstNode (?Node): first DOM node processed on this line |
195 | * - text (string): output so far from all nodes on the current line |
196 | * - chunks (ConstrainedText[]): list of chunks comprising the current line |
197 | * @var stdClass |
198 | * XXX: replace with output buffering per line |
199 | * FIXME: could use a dedicated class |
200 | */ |
201 | public $currLine; |
202 | |
203 | /** |
204 | * Stack used to enforce single-line context |
205 | * @var SingleLineContext |
206 | */ |
207 | public $singleLineContext; |
208 | |
209 | /** |
210 | * Text to be emitted at the start of file, for redirects |
211 | * @var string|null |
212 | */ |
213 | public $redirectText = null; |
214 | |
215 | /** @var WikitextSerializer */ |
216 | public $serializer; |
217 | |
218 | /** @var ParsoidExtensionAPI */ |
219 | public $extApi; |
220 | |
221 | /** @var string The serialized output */ |
222 | public $out = ''; |
223 | |
224 | /** |
225 | * Are we in selective serialization mode? |
226 | * @see SelectiveSerializer |
227 | * @var bool |
228 | */ |
229 | public $selserMode; |
230 | |
231 | private ?SelectiveUpdateData $selserData; |
232 | |
233 | /** |
234 | * If in selser mode, while processing a node, do we know if |
235 | * its previous node has not been modified in an edit? |
236 | * @var bool |
237 | */ |
238 | public $prevNodeUnmodified; |
239 | |
240 | /** |
241 | * If in selser mode, while processing a node, do we know if |
242 | * it has not been modified in an edit? |
243 | * @var bool |
244 | */ |
245 | public $currNodeUnmodified; |
246 | |
247 | /** |
248 | * Should we run the wikitext escaping code on the wikitext chunk |
249 | * that will be emitted? |
250 | * @var bool |
251 | */ |
252 | public $needsEscaping = false; |
253 | |
254 | /** |
255 | * Used as fast patch for special protected characters in WikitextEscapeHandlers and |
256 | * comes from LanguageVariantHandler |
257 | * @var string|null |
258 | */ |
259 | public $protect; |
260 | |
261 | /** @var Separators */ |
262 | public $separators; |
263 | |
264 | /** @var Env */ |
265 | private $env; |
266 | |
267 | /** @var Element */ |
268 | public $currNode; |
269 | |
270 | /** @var Element */ |
271 | private $prevNode; |
272 | |
273 | /** @var array */ |
274 | public $openAnnotations; |
275 | |
276 | /** |
277 | * Log prefix to use in trace output |
278 | * @var string |
279 | */ |
280 | private $logPrefix = 'OUT:'; |
281 | |
282 | public $haveTrimmedWsDSR = false; |
283 | |
/**
 * Set up a fresh serializer state bound to the given serializer.
 *
 * @param WikitextSerializer $serializer
 * @param array $options List of options for serialization:
 *   - onSOL: (bool)
 *   - inPHPBlock: (bool)
 *   - inAttribute: (bool)
 *   - protect: (string)
 *   - selserData: (SelectiveUpdateData)
 */
public function __construct( WikitextSerializer $serializer, array $options = [] ) {
	$this->env = $serializer->env;
	$this->serializer = $serializer;

	// The extension API gets a back-reference to this state object.
	$extApiOptions = [ 'html2wt' => [ 'state' => $this ] ];
	$this->extApi = new ParsoidExtensionAPI( $this->env, $extApiOptions );

	// Caller-supplied options override the property defaults.
	$this->onSOL = $options['onSOL'] ?? $this->onSOL;
	$this->inPHPBlock = $options['inPHPBlock'] ?? $this->inPHPBlock;
	$this->inAttribute = $options['inAttribute'] ?? $this->inAttribute;
	$this->protect = $options['protect'] ?? null;
	$this->selserData = $options['selserData'] ?? null;

	// Start with an empty line buffer and empty separator state.
	$this->resetCurrLine( null );
	$this->singleLineContext = new SingleLineContext();
	$this->resetSep();

	$inputVersion = $this->env->getInputContentVersion();
	$this->haveTrimmedWsDSR = Semver::satisfies( $inputVersion, '>=2.1.1' );

	$this->separators = new Separators( $this->env, $this );
}
308 | |
/**
 * Accessor for the environment this state was constructed with.
 *
 * @note Porting note: this replaces direct access
 * @return Env
 */
public function getEnv(): Env {
	return $this->env;
}
316 | |
/**
 * Record which serialization mode we are running in.
 *
 * FIXME: Ideally, this should be private. That requires shuffling around
 * where SerializerState is constructed so that $selserMode is known
 * at the time of construction.
 *
 * @private for use by WikitextSerializer only
 * @param bool $selserMode Are we running selective serialization?
 */
public function initMode( bool $selserMode ): void {
	$this->selserMode = $selserMode;
}
328 | |
/**
 * Append separator source to the pending separator buffer.
 *
 * $state->onSOL is deliberately not updated here because this string has not
 * been emitted yet. Content handlers whose behavior depends on whether this
 * newline will be emitted should peek into the buffer instead (ex: see
 * TDHandler and THHandler code).
 *
 * @param string $src
 */
public function appendSep( string $src ): void {
	// A null buffer is treated as the empty string.
	$pending = $this->sep->src ?? '';
	$this->sep->src = $pending . $src;
}
341 | |
/**
 * Cycle the separator state after processing a node: remember $node as the
 * last node that was a separator source.
 *
 * @param Node $node
 */
public function updateSep( Node $node ): void {
	$this->sep->lastSourceNode = $node;
}
349 | |
/**
 * Reset the separator state: no constraints, no collected separator source,
 * and no last source node.
 */
private function resetSep(): void {
	$this->sep = (object)[
		'constraints' => null,
		'src' => null,
		'lastSourceNode' => null,
	];
}
357 | |
/**
 * Start a new, empty current-line record.
 *
 * @param ?Node $node First DOM node processed on the new line (may be null)
 */
private function resetCurrLine( ?Node $node ): void {
	$line = new stdClass;
	$line->text = '';
	$line->chunks = [];
	$line->firstNode = $node;
	$this->currLine = $line;
}
369 | |
/**
 * Escape the buffered ConstrainedText chunks of the current line, adjusting
 * chunk boundaries as necessary, append the result to the output, and empty
 * the chunk buffer.
 *
 * (Start of line and end of line are always safe for ConstrainedText chunks,
 * so we never need to buffer more than the last line.)
 */
private function flushLine(): void {
	$escapedLine = ConstrainedText::escapeLine( $this->currLine->chunks );
	$this->out .= $escapedLine;
	$this->currLine->chunks = [];
}
379 | |
/**
 * Extract the part of the original page source covered by a source range.
 *
 * Requires selser mode (asserted).
 *
 * @param SourceRange $sr
 * @return string|null Null if the range is inverted or starts beyond the end
 *   of the source
 */
public function getOrigSrc( SourceRange $sr ): ?string {
	Assert::invariant( $this->selserMode, 'SerializerState::$selserMode must be set' );

	// FIXME: Having a $start greater than the source length is
	// probably a canary for corruption. Maybe we should be throwing
	// here instead. See T240053.
	// But, see comment in UnpackDOMFragments where we very very rarely
	// can deliberately set DSR to point outside page source.
	$rangeIsUsable = $sr->start <= $sr->end &&
		$sr->start <= strlen( $this->selserData->revText );
	if ( !$rangeIsUsable ) {
		return null;
	}

	// XXX should use $frame->getSrcText() like WTUtils::getWTSource
	return $sr->substr( $this->selserData->revText );
}
402 | |
/**
 * Check the validity of a DSR in the context of the page source.
 *
 * Returns false whenever Utils::isValidDSR() returns false, when the range
 * is inverted or ends beyond the page source, and also when the DSR offsets
 * would carve out a bad UTF-8 string (ie, an offset does not fall on a UTF-8
 * character boundary).
 *
 * @param ?DomSourceRange $dsr DSR source range values
 * @param bool $all Also check the widths of the container tag
 * @return bool
 */
public function isValidDSR( ?DomSourceRange $dsr, bool $all = false ) {
	if ( !Utils::isValidDSR( $dsr, $all ) ) {
		return false;
	}

	$src = $this->selserData->revText;
	if ( !( $dsr->start <= $dsr->end && $dsr->end <= strlen( $src ) ) ) {
		return false;
	}

	// Does the byte range [$from, $to) of $src begin and end on UTF-8
	// character boundaries?
	$isUtf8Aligned = static function ( $from, $to ) use ( $src ) {
		if ( $from === $to ) {
			// zero-length string is always ok
			return true;
		}
		if ( ( ord( $src[$from] ) & 0xC0 ) === 0x80 ) {
			// bad UTF-8 at start of string (continuation byte)
			return false;
		}
		// Walk backwards from the end over continuation bytes (10xx xxxx)
		// to the lead byte of the final character. This won't pass $from
		// because we've already established that the first byte isn't a
		// continuation byte.
		for ( $back = 1; ; $back++ ) {
			if ( $back >= 5 ) {
				// bad UTF-8 at end of string (>4 byte sequence)
				return false;
			}
			$byte = ord( $src[$to - $back] );
			if ( ( $byte & 0xC0 ) !== 0x80 ) {
				break;
			}
		}
		// The lead byte must announce exactly as many bytes as we walked.
		if ( ( $byte & 0x80 ) === 0 ) {
			return $back === 1;
		}
		if ( ( $byte & 0xE0 ) === 0xC0 ) {
			return $back === 2;
		}
		if ( ( $byte & 0xF0 ) === 0xE0 ) {
			return $back === 3;
		}
		if ( ( $byte & 0xF8 ) === 0xF0 ) {
			return $back === 4;
		}
		return false;
	};

	if ( !$all ) {
		return $isUtf8Aligned( $dsr->start, $dsr->end );
	}

	// Check each of the inner ranges: open tag, close tag, then content.
	$openEnd = $dsr->start + $dsr->openWidth;
	$closeStart = $dsr->end - $dsr->closeWidth;

	return $openEnd <= $dsr->end
		&& $isUtf8Aligned( $dsr->start, $openEnd )
		&& $dsr->start <= $closeStart
		&& $isUtf8Aligned( $closeStart, $dsr->end )
		&& $openEnd <= $closeStart
		&& $isUtf8Aligned( $openEnd, $closeStart );
}
480 | |
/**
 * Shift the modification flags after a node has been processed: the current
 * node's flag becomes the previous node's flag, the current flag is cleared,
 * and $node is recorded as the previous node.
 *
 * @param Node $node
 */
public function updateModificationFlags( Node $node ): void {
	$this->prevNode = $node;
	$this->prevNodeUnmodified = $this->currNodeUnmodified;
	$this->currNodeUnmodified = false;
}
490 | |
/**
 * Update line state for a separator that has just been pushed: a separator
 * ending in a newline puts us in SOL state, and any newline in it ends the
 * current line.
 *
 * @param string $sep
 * @param Node $node
 */
private function sepIntroducedSOL( string $sep, Node $node ): void {
	// Don't get tripped by newlines in comments! Be wary of nowikis added
	// by makeSepIndentPreSafe on the last line.
	$withoutComments = preg_replace( Utils::COMMENT_REGEXP, '', $sep );

	if ( str_ends_with( $withoutComments, "\n" ) ) {
		$this->onSOL = true;
	}

	if ( !str_contains( $withoutComments, "\n" ) ) {
		return;
	}

	// The separator ended the line: process escapes in our full line
	// and start a new one at $node.
	$this->flushLine();
	$this->resetCurrLine( $node );
}
510 | |
/**
 * Accumulates a chunk on the current line: appends its raw text to the
 * line's text, queues the chunk itself for later escaping in flushLine(),
 * and traces it.
 *
 * @param ConstrainedText $chunk
 * @param string $logPrefix Prefix used in the trace output
 */
private function pushToCurrLine( ConstrainedText $chunk, string $logPrefix ): void {
	// Emitting text that has not been escaped
	$this->currLine->text .= $chunk->text;

	$this->currLine->chunks[] = $chunk;

	// The trace message is built lazily via the closure.
	$this->serializer->trace( '--->', $logPrefix, static function () use ( $chunk ) {
		return PHPUtils::jsonEncode( $chunk->text );
	} );
}
526 | |
/**
 * Pushes the separator to the current line and resets the separator state.
 *
 * @param string $sep Separator text to emit
 * @param Node $node Node the separator is emitted for; it becomes the
 *   separator state's last source node
 * @param string $debugPrefix Prefix used in the trace output
 */
private function emitSep( string $sep, Node $node, string $debugPrefix ): void {
	$sep = ConstrainedText::cast( $sep, $node );

	// Replace newlines if we're in a single-line context.
	// A plain string replacement (as in emitChunk) is enough for a literal
	// "\n" and, unlike preg_replace(), can never yield null.
	if ( $this->singleLineContext->enforced() ) {
		$sep->text = str_replace( "\n", ' ', $sep->text );
	}

	$this->pushToCurrLine( $sep, $debugPrefix );
	$this->sepIntroducedSOL( $sep->text, $node );

	// Reset separator state
	$this->resetSep();
	$this->updateSep( $node );
}
548 | |
/**
 * Decide whether the original separator before $node can be reused or whether
 * one has to be built from the separator constraints, and then emit it.
 *
 * @param Node $node
 */
private function emitSepForNode( Node $node ): void {
	/* Deleting block nodes affects whether an unmodified newline separator
	 * between a pair of unmodified P tags can be reused.
	 *
	 * Example:
	 * ```
	 * Original WT  : "<div>x</div>foo\nbar"
	 * Original HTML: "<div>x</div><p>foo</p>\n<p>bar</p>"
	 * Edited HTML  : "<p>foo</p>\n<p>bar</p>"
	 * Annotated DOM: "<mw:DiffMarker is-block><p>foo</p>\n<p>bar</p>"
	 * Expected WT  : "foo\n\nbar"
	 * ```
	 *
	 * Note the extra newline between "foo" and "bar" even though there was
	 * originally just a single newline.
	 *
	 * So the fact that the two P tags and the separator between them are
	 * unmodified is not sufficient on its own. We have to look at what has
	 * happened on the two wikitext lines onto which the two P tags will get
	 * serialized.
	 *
	 * Now, `nextToDeletedBlockNodeInWT` does not really look at ALL the nodes
	 * before/after the nodes that could serialize onto those wikitext lines.
	 * It looks at the immediately adjacent nodes only, i.e. it does not check
	 * whether a block tag was deleted 2 or 5 siblings away. Examining all of
	 * those nodes would get very complex, and it would be much simpler to
	 * just discard the original separators => potentially lots of dirty diffs.
	 *
	 * To see why examining just the immediately adjacent nodes is sufficient
	 * (for correctness), consider one more example.
	 * ```
	 * Original WT  : "a<div>b</div>c<div>d</div>e\nf"
	 * Original HTML: "<p>a</p><div>b</div><p>c</p><div>d</div><p>e</p>\n<p>f</p>"
	 * ```
	 * Note how `<block>` tags and `<p>` tags interleave in the HTML. That is
	 * always the case, no matter how much inline content shows up between
	 * the block tags in wikitext. If the b-`<div>` was deleted, we don't care
	 * about it, since the d-`<div>` before the P tag still preserves the
	 * correctness of the single `"\n"` separator. If the d-`<div>` was
	 * deleted, we conservatively ignore the original separator and let the
	 * normal P-P constraints take care of it. At worst, we might generate a
	 * dirty diff in this scenario. */
	$prev = $this->prevNode;

	$reusable = false;
	// An original separator is only needed if we haven't already emitted
	// one for this very node.
	if ( $node !== $this->sep->lastSourceNode ) {
		// first-content-node of <body> ($prev is the <body>)
		$isFirstBodyChild = DOMUtils::isBody( $prev ) && $node->parentNode === $prev;
		// ... or $node follows an unmodified sibling ($prev)
		$hasUsablePrev = $isFirstBodyChild || (
			$prev && $this->prevNodeUnmodified &&
			$node->parentNode === $prev->parentNode &&
			!WTSUtils::nextToDeletedBlockNodeInWT( $prev, true )
		);
		$reusable = $hasUsablePrev &&
			$this->currNodeUnmodified &&
			!WTSUtils::nextToDeletedBlockNodeInWT( $node, false );
	}

	$origSep = null;
	if ( $reusable ) {
		if ( $prev instanceof Element && $node instanceof Element ) {
			'@phan-var Element $node';/** @var Element $node */
			// <body> won't have DSR in body_only scenarios
			$sr = DOMUtils::isBody( $prev )
				? new SourceRange( 0, 0 )
				: DOMDataUtils::getDataParsoid( $prev )->dsr;
			$sr = $sr->to( DOMDataUtils::getDataParsoid( $node )->dsr );
			$origSep = $this->getOrigSrc( $sr );
		} elseif ( $this->sep->src && WTSUtils::isValidSep( $this->sep->src ) ) {
			// We don't know where '$this->sep->src' comes from. So, reuse it
			// only if it is a valid separator string.
			$origSep = $this->sep->src;
		}
	}

	if ( $origSep === null ) {
		// No reusable original separator: build one from the constraints.
		$builtSep = $this->separators->buildSep( $node );
		$this->emitSep( $builtSep ?? '', $node, 'SEP:' );
		return;
	}

	$this->emitSep( $origSep, $node, 'ORIG-SEP:' );
}
641 | |
/**
 * Recovers any trimmed whitespace for $node (via Separators), traces it and
 * returns it.
 *
 * @param Node $node
 * @param bool $leading
 *   if true, trimmed leading whitespace is recovered
 *   if false, trimmed trailing whitespace is recovered
 * @return string|null
 */
public function recoverTrimmedWhitespace( Node $node, bool $leading ): ?string {
	$recovered = $this->separators->recoverTrimmedWhitespace( $node, $leading );

	$this->serializer->trace( '--->', "TRIMMED-SEP:", static function () use ( $recovered ) {
		return PHPUtils::jsonEncode( $recovered );
	} );

	return $recovered;
}
657 | |
/**
 * Pushes a chunk of serialized output to the current line, emitting the
 * pending separator first and escaping the chunk where required.
 *
 * @param ConstrainedText|string $res
 * @param Node $node
 */
public function emitChunk( $res, Node $node ): void {
	$res = ConstrainedText::cast( $res, $node );

	// Replace newlines if we're in a single-line context
	if ( $this->singleLineContext->enforced() ) {
		$res->text = str_replace( "\n", ' ', $res->text );
	}

	// Emit separator first
	if ( !$res->noSep ) {
		$this->emitSepForNode( $node );
	} elseif ( $this->onSOL ) {
		/* separators are skipped for internal tokens from SelSer */
		// process escapes in our full line
		$this->flushLine();
		$this->resetCurrLine( $node );
	}

	$escape = $this->needsEscaping;
	if ( $escape && $this->currNode instanceof Text ) {
		$escape = !$this->inHTMLPre && ( $this->onSOL || !$this->currNodeUnmodified );
	}

	if ( $escape ) {
		// Escape 'res' and rebuild the chunk around the escaped text
		$escapedText = $this->serializer->escapeWikitext( $this, $res->text, [
			'node' => $node,
			'isLastChild' => DiffDOMUtils::nextNonDeletedSibling( $node ) === null,
		] );
		$res = new ConstrainedText( [
			'text' => $escapedText,
			'prefix' => $res->prefix,
			'suffix' => $res->suffix,
			'node' => $res->node,
		] );
		$this->needsEscaping = false;
	} elseif (
		// If 'res' is coming from selser and the current node is a paragraph tag,
		// check if 'res' might need some leading chars nowiki-escaped before being
		// output. Because of block-tag p-wrapping behavior, sol-sensitive characters
		// that used to be in non-sol positions, but yet wrapped in p-tags, could end
		// up in sol-position if those block tags get deleted during edits.
		//
		// Ex: a<div>foo</div>*b
		// -- wt2html --> <p>a</p><div>foo</div><p>*b</p>
		// -- EDIT --> <p>a</p><p>*b</p>
		// -- html2wt --> a\n\n<nowiki>*</nowiki>b
		//
		// In this scenario, the surviving <p> nodes (<p>a</p> and <p>*b</p>)
		// are marked unmodified and are processed here.
		$this->selserMode
		&& $this->onSOL
		&& $this->currNodeUnmodified
		// 'node' came from original Parsoid HTML unmodified. So, if its content
		// needs nowiki-escaping, we know that the reason it didn't parse into
		// lists/headings/whatever is because it didn't occur at the start of the
		// line => it had a block-tag in the original wikitext. So if the previous
		// node was also unmodified (and since it also came from original Parsoid
		// HTML), we can safely infer that it couldn't have been an inline node or
		// a P-tag (if it were, the p-wrapping code would have swallowed that content
		// into 'node'). So, it would have to be some sort of block tag => this.onSOL
		// couldn't have been true (because we could have serialized 'node' on the
		// same line as the block tag) => we can save some effort by eliminating
		// scenarios where 'this.prevNodeUnmodified' is true.
		&& !$this->prevNodeUnmodified
		&& DOMCompat::nodeName( $node ) === 'p' && !WTUtils::isLiteralHTMLNode( $node )
	) {
		// If the first child is a text node, we have to make sure that the text
		// doesn't get reparsed as non-text in the wt2html pipeline.
		$firstChild = DiffDOMUtils::firstNonSepChild( $node );
		$match = $firstChild instanceof Text
			? $res->matches( $this->solWikitextRegexp(), $this->env )
			: null;
		if ( $match && isset( $match[2] ) ) {
			$rest = $match[2];
			$needsNowiki = preg_match( '/^([\*#:;]|{\||.*=$)/D', $rest )
				// ! and | chars are harmless outside tables
				|| ( strspn( $rest, '|!' ) && $this->wikiTableNesting > 0 )
				// indent-pres are suppressed inside <blockquote>
				|| ( preg_match( '/^ \S/', $rest )
					&& !DOMUtils::hasNameOrHasAncestorOfName( $node, 'blockquote' ) );
			if ( $needsNowiki ) {
				// Wrap just the first (sol-sensitive) character in a nowiki.
				$res = ConstrainedText::cast(
					( $match[1] ?: '' )
						. '<nowiki>' . substr( $rest, 0, 1 ) . '</nowiki>'
						. substr( $rest, 1 ),
					$node
				);
			}
		}
	}

	// Output res
	$this->pushToCurrLine( $res, $this->logPrefix );

	// Update sol flag. Test for newlines followed by optional includeonly or comments
	$stillOnSOL = $res->matches( $this->solRegexp(), $this->env );
	if ( !$stillOnSOL ) {
		$this->onSOL = false;
	}

	// We've emitted something so we're no longer at SOO.
	$this->atStartOfOutput = false;
}
764 | |
/**
 * Serialize the children of a DOM node, sharing the global serializer state.
 * Typically called by a DOM-based handler to continue handling its children.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper ( $state, $text, $opts )
 *   Pushed on the wikitext escaping handler stack while the children are
 *   being serialized.
 *   PORT-FIXME document better; should this be done via WikitextEscapeHandlers somehow?
 * @param ?Node $firstChild Child to start from; defaults to the node's first child
 */
public function serializeChildren(
	Node $node, ?callable $wtEscaper = null, ?Node $firstChild = null
): void {
	// SSS FIXME: Unsure if this is the right thing always
	$hasEscaper = (bool)$wtEscaper;
	if ( $hasEscaper ) {
		$this->wteHandlerStack[] = $wtEscaper;
	}

	// serializeNode() always hands back the next child to process
	// (null once we are done).
	$next = $firstChild ?? $node->firstChild;
	while ( $next !== null ) {
		$next = $this->serializer->serializeNode( $next );
	}

	if ( $hasEscaper ) {
		array_pop( $this->wteHandlerStack );
	}

	// If we serialized children explicitly,
	// we were obviously processing a modified node.
	$this->currNodeUnmodified = false;
}
795 | |
/**
 * Shared driver for the steps taken in `serializeChildrenToString` and
 * `serializeDOM`: prime the separator, modification and line state for
 * $node, serialize its children, emit the trailing separator and flush the
 * buffered line.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper See {@link serializeChildren()}
 * @internal For use by WikitextSerializer only
 */
public function kickOffSerialize(
	Node $node, ?callable $wtEscaper = null
): void {
	// Prime the state: $node counts as the last separator source and as
	// the (modified) previous node.
	$this->updateSep( $node );
	$this->currNodeUnmodified = false;
	$this->updateModificationFlags( $node );

	// The first line starts at the node's first child.
	$this->resetCurrLine( $node->firstChild );

	$this->serializeChildren( $node, $wtEscaper );

	// Emit child-parent seps.
	$this->emitSepForNode( $node );

	// We've reached EOF, flush the remaining buffered text.
	$this->flushLine();
}
816 | |
/**
 * Serialize children to a string
 *
 * The children are serialized into a temporary, empty output buffer with
 * the requested context flag set. The previous output, separator, line,
 * log-prefix and modification-tracking state is put back before returning,
 * including when serialization throws.
 *
 * FIXME(arlorla): Shouldn't affect the separator state, but accidents
 * have been known to happen. T109793 suggests using its own wts / state.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper See {@link serializeChildren()}
 * @param string $inState Name of the context flag to raise while
 *   serializing; one of inLink, inCaption, inIndentPre, inHTMLPre,
 *   inPHPBlock, inAttribute.
 * @return string The text accumulated in the temporary output buffer.
 */
private function serializeChildrenToString(
	Node $node, ?callable $wtEscaper, string $inState
): string {
	$states = [ 'inLink', 'inCaption', 'inIndentPre', 'inHTMLPre', 'inPHPBlock', 'inAttribute' ];
	Assert::parameter( in_array( $inState, $states, true ), '$inState', 'Must be one of: '
		. implode( ', ', $states ) );
	// FIXME: Make sure that the separators emitted here conform to the
	// syntactic constraints of syntactic context.
	$oldSep = $this->sep;
	$oldSOL = $this->onSOL;
	$oldOut = $this->out;
	$oldStart = $this->atStartOfOutput;
	$oldCurrLine = $this->currLine;
	$oldLogPrefix = $this->logPrefix;
	// Modification flags
	$oldPrevNodeUnmodified = $this->prevNodeUnmodified;
	$oldCurrNodeUnmodified = $this->currNodeUnmodified;
	$oldPrevNode = $this->prevNode;
	// Remember the context flag as well: a nested call for the same context
	// must not clear the flag for the enclosing call when it returns.
	$oldInState = $this->$inState ?? false;

	$this->out = '';
	$this->logPrefix = 'OUT(C):';
	$this->resetSep();
	$this->onSOL = false;
	$this->atStartOfOutput = false;
	$this->$inState = true;

	$this->singleLineContext->disable();
	try {
		$this->kickOffSerialize( $node, $wtEscaper );
	} finally {
		// Undo the disable() above, whether or not serialization succeeded.
		$this->singleLineContext->pop();

		// restore the state
		$bits = $this->out;
		$this->out = $oldOut;
		$this->$inState = $oldInState;
		$this->sep = $oldSep;
		$this->onSOL = $oldSOL;
		$this->atStartOfOutput = $oldStart;
		$this->currLine = $oldCurrLine;
		$this->logPrefix = $oldLogPrefix;
		// Modification flags
		$this->prevNodeUnmodified = $oldPrevNodeUnmodified;
		$this->currNodeUnmodified = $oldCurrNodeUnmodified;
		$this->prevNode = $oldPrevNode;
	}
	return $bits;
}
873 | |
/**
 * Serialize the children of a link into a string.
 *
 * Delegates to serializeChildrenToString() with the 'inLink' context.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper See {@link serializeChildren()}
 * @return string
 */
public function serializeLinkChildrenToString(
	Node $node, ?callable $wtEscaper = null
): string {
	$context = 'inLink';
	$serialized = $this->serializeChildrenToString( $node, $wtEscaper, $context );
	return $serialized;
}
885 | |
/**
 * Serialize the children of a caption into a string.
 *
 * Delegates to serializeChildrenToString() with the 'inCaption' context.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper See {@link serializeChildren()}
 * @return string
 */
public function serializeCaptionChildrenToString(
	Node $node, ?callable $wtEscaper = null
): string {
	$context = 'inCaption';
	$serialized = $this->serializeChildrenToString( $node, $wtEscaper, $context );
	return $serialized;
}
897 | |
/**
 * Serialize the children of an indent-pre into a string.
 *
 * Delegates to serializeChildrenToString() with the 'inIndentPre' context.
 *
 * @param Element|DocumentFragment $node
 * @param ?callable $wtEscaper See {@link serializeChildren()}
 * @return string
 */
public function serializeIndentPreChildrenToString(
	Node $node, ?callable $wtEscaper = null
): string {
	$context = 'inIndentPre';
	$serialized = $this->serializeChildrenToString( $node, $wtEscaper, $context );
	return $serialized;
}
909 | |
/**
 * Take note of an open annotation range and whether it has been extended.
 *
 * The entry is stored under $ann in the map of open annotations; recording
 * the same key again overwrites the earlier value.
 *
 * @param string $ann Key under which the open range is recorded.
 * @param bool $extended Whether the range has been extended.
 */
public function openAnnotationRange( string $ann, bool $extended ): void {
	$this->openAnnotations[$ann] = $extended;
}
918 | |
/**
 * Removes the corresponding annotation range from the list of open ranges.
 *
 * Closing a range that is not currently recorded is a no-op.
 *
 * @param string $ann Key under which the open range was recorded.
 */
public function closeAnnotationRange( string $ann ): void {
	unset( $this->openAnnotations[$ann] );
}
926 | |
927 | } |